Diffstat (limited to 'xmrstak/backend/nvidia/nvcc_code')
-rw-r--r-- | xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp |  3
-rw-r--r-- | xmrstak/backend/nvidia/nvcc_code/cuda_core.cu    | 30
-rw-r--r-- | xmrstak/backend/nvidia/nvcc_code/cuda_device.hpp | 37
-rw-r--r-- | xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu   | 32
4 files changed, 76 insertions, 26 deletions
diff --git a/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp b/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp
index 1b63379..afbdbaf 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp
+++ b/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp
@@ -11,7 +11,8 @@ typedef struct {
 	int device_blocks;
 	int device_threads;
 	int device_bfactor;
-	int device_bsleep;
+	int device_bsleep;
+	int syncMode;
 
 	uint32_t *d_input;
 	uint32_t inputlen;
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
index a92fa8c..0b175b5 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
@@ -167,10 +167,10 @@ __forceinline__ __device__ uint32_t shuffle(volatile uint32_t* ptr,const uint32_
 #endif
 }
 
+template<size_t ITERATIONS, uint32_t THREAD_SHIFT, uint32_t MASK>
 #ifdef XMR_STAK_THREADS
 __launch_bounds__( XMR_STAK_THREADS * 4 )
 #endif
-template<size_t ITERATIONS, uint32_t THREAD_SHIFT, uint32_t MASK>
 __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b )
 {
 	__shared__ uint32_t sharedMemory[1024];
@@ -327,18 +327,22 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx)
 	for ( int i = 0; i < partcount; i++ )
 	{
-		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_core_gpu_phase2<ITERATIONS,THREAD_SHIFT,MASK><<<
-			grid,
-			block4,
-			block4.x * sizeof(uint32_t) * static_cast< int >( ctx->device_arch[0] < 3 )
-		>>>(
-			ctx->device_blocks*ctx->device_threads,
-			ctx->device_bfactor,
-			i,
-			ctx->d_long_state,
-			ctx->d_ctx_a,
-			ctx->d_ctx_b
-		));
+		CUDA_CHECK_MSG_KERNEL(
+			ctx->device_id,
+			"\n**suggestion: Try to increase the value of the attribute 'bfactor' or \nreduce 'threads' in the NVIDIA config file.**",
+			cryptonight_core_gpu_phase2<ITERATIONS,THREAD_SHIFT,MASK><<<
+				grid,
+				block4,
+				block4.x * sizeof(uint32_t) * static_cast< int >( ctx->device_arch[0] < 3 )
+			>>>(
+				ctx->device_blocks*ctx->device_threads,
+				ctx->device_bfactor,
+				i,
+				ctx->d_long_state,
+				ctx->d_ctx_a,
+				ctx->d_ctx_b
+			)
+		);
 
 		if ( partcount > 1 && ctx->device_bsleep > 0) compat_usleep( ctx->device_bsleep );
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_device.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_device.hpp
index 078c165..563bb3b 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_device.hpp
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_device.hpp
@@ -9,22 +9,41 @@
 /** execute and check a CUDA api command
  *
  * @param id gpu id (thread id)
+ * @param msg message string which should be added to the error message
  * @param ... CUDA api command
  */
-#define CUDA_CHECK(id, ...) { \
-	cudaError_t error = __VA_ARGS__; \
-	if(error!=cudaSuccess){ \
-		std::cerr << "[CUDA] Error gpu " << id << ": <" << __FILE__ << ">:" << __LINE__ << std::endl; \
-		throw std::runtime_error(std::string("[CUDA] Error: ") + std::string(cudaGetErrorString(error))); \
-	} \
-} \
+#define CUDA_CHECK_MSG(id, msg, ...) { \
+	cudaError_t error = __VA_ARGS__; \
+	if(error!=cudaSuccess){ \
+		std::cerr << "[CUDA] Error gpu " << id << ": <" << __FILE__ << ">:" << __LINE__; \
+		std::cerr << msg << std::endl; \
+		throw std::runtime_error(std::string("[CUDA] Error: ") + std::string(cudaGetErrorString(error))); \
+	} \
+} \
 ( (void) 0 )
 
+/** execute and check a CUDA api command
+ *
+ * @param id gpu id (thread id)
+ * @param ... CUDA api command
+ */
+#define CUDA_CHECK(id, ...) CUDA_CHECK_MSG(id, "", __VA_ARGS__)
+
 /** execute and check a CUDA kernel
  *
  * @param id gpu id (thread id)
  * @param ... CUDA kernel call
  */
-#define CUDA_CHECK_KERNEL(id, ...) \
-	__VA_ARGS__; \
+#define CUDA_CHECK_KERNEL(id, ...) \
+	__VA_ARGS__; \
 	CUDA_CHECK(id, cudaGetLastError())
+
+/** execute and check a CUDA kernel
+ *
+ * @param id gpu id (thread id)
+ * @param msg message string which should be added to the error message
+ * @param ... CUDA kernel call
+ */
+#define CUDA_CHECK_MSG_KERNEL(id, msg, ...) \
+	__VA_ARGS__; \
+	CUDA_CHECK_MSG(id, msg, cudaGetLastError())
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
index 5501d8d..492201d 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
@@ -189,7 +189,22 @@ extern "C" int cryptonight_extra_cpu_init(nvid_ctx* ctx)
 	}
 
 	cudaDeviceReset();
-	cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
+	switch(ctx->syncMode)
+	{
+	case 0:
+		cudaSetDeviceFlags(cudaDeviceScheduleAuto);
+		break;
+	case 1:
+		cudaSetDeviceFlags(cudaDeviceScheduleSpin);
+		break;
+	case 2:
+		cudaSetDeviceFlags(cudaDeviceScheduleYield);
+		break;
+	case 3:
+		cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
+		break;
+
+	};
 	cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
 
 	size_t hashMemSize;
@@ -203,7 +218,6 @@ extern "C" int cryptonight_extra_cpu_init(nvid_ctx* ctx)
 	}
 
 	size_t wsize = ctx->device_blocks * ctx->device_threads;
-	CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_long_state, hashMemSize * wsize));
 	CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_ctx_state, 50 * sizeof(uint32_t) * wsize));
 	CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_ctx_key1, 40 * sizeof(uint32_t) * wsize));
 	CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_ctx_key2, 40 * sizeof(uint32_t) * wsize));
@@ -213,6 +227,10 @@ extern "C" int cryptonight_extra_cpu_init(nvid_ctx* ctx)
 	CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_input, 21 * sizeof (uint32_t ) ));
 	CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_result_count, sizeof (uint32_t ) ));
 	CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_result_nonce, 10 * sizeof (uint32_t ) ));
+	CUDA_CHECK_MSG(
+		ctx->device_id,
+		"\n**suggestion: Try to reduce the value of the attribute 'threads' in the NVIDIA config file.**",
+		cudaMalloc(&ctx->d_long_state, hashMemSize * wsize));
 	return 1;
 }
 
@@ -239,7 +257,11 @@ extern "C" void cryptonight_extra_cpu_final(nvid_ctx* ctx, uint32_t startNonce,
 	CUDA_CHECK(ctx->device_id, cudaMemset( ctx->d_result_nonce, 0xFF, 10 * sizeof (uint32_t ) ));
 	CUDA_CHECK(ctx->device_id, cudaMemset( ctx->d_result_count, 0, sizeof (uint32_t ) ));
 
-	CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_final<<<grid, block >>>( wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state ));
+	CUDA_CHECK_MSG_KERNEL(
+		ctx->device_id,
+		"\n**suggestion: Try to increase the value of the attribute 'bfactor' in the NVIDIA config file.**",
+		cryptonight_extra_gpu_final<<<grid, block >>>( wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state )
+	);
 
 	CUDA_CHECK(ctx->device_id, cudaMemcpy( rescount, ctx->d_result_count, sizeof (uint32_t ), cudaMemcpyDeviceToHost ));
 	CUDA_CHECK(ctx->device_id, cudaMemcpy( resnonce, ctx->d_result_nonce, 10 * sizeof (uint32_t ), cudaMemcpyDeviceToHost ));
@@ -380,6 +402,10 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
 		*/
 		ctx->device_blocks = props.multiProcessorCount * ( props.major < 3 ? 2 : 3 );
+
+		// increase bfactor for low end devices to avoid that the miner is killed by the OS
+		if(props.multiProcessorCount < 6)
+			ctx->device_bfactor += 2;
 	}
 
 	if(ctx->device_threads == -1)
 	{
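
Usage sketch for the refactored error checks (not part of the commit; the include path and dummy_kernel below are illustrative assumptions): CUDA_CHECK keeps its old behaviour by forwarding an empty message, while CUDA_CHECK_MSG and CUDA_CHECK_MSG_KERNEL append a hint to the error output only when the call or kernel launch fails.

	// illustrative only: shows how the macros from cuda_device.hpp are intended to be called
	#include <cstdint>
	#include <iostream>
	#include <stdexcept>
	#include <string>
	#include <cuda_runtime.h>
	#include "xmrstak/backend/nvidia/nvcc_code/cuda_device.hpp" // assumed include path

	// hypothetical kernel, stands in for the real mining kernels
	__global__ void dummy_kernel( uint32_t * out ) { out[0] = 42u; }

	void example( int device_id )
	{
		uint32_t * d_out = nullptr;

		// old behaviour: CUDA_CHECK now forwards an empty message to CUDA_CHECK_MSG
		CUDA_CHECK(device_id, cudaMalloc(&d_out, sizeof(uint32_t)));

		// kernel launch check with an extra hint that is printed only on failure
		CUDA_CHECK_MSG_KERNEL(
			device_id,
			"\n**suggestion: example hint shown when the launch fails**",
			dummy_kernel<<<1, 1>>>( d_out )
		);

		CUDA_CHECK(device_id, cudaFree(d_out));
	}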
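
For reference, the new 'syncMode' attribute replaces the previously hard-coded cudaDeviceScheduleBlockingSync: values 0-3 select auto, spin, yield, or blocking synchronisation. A simplified sketch of that mapping (the apply_sync_mode helper and its fallback are assumptions for illustration; the committed switch has no default case):

	#include <cuda_runtime.h>

	// illustrative helper: maps the config value 0..3 onto the CUDA scheduling flag
	static cudaError_t apply_sync_mode( int syncMode )
	{
		unsigned int flags = cudaDeviceScheduleBlockingSync; // assumed fallback, matches the old behaviour
		switch( syncMode )
		{
		case 0: flags = cudaDeviceScheduleAuto;         break; // let the driver choose
		case 1: flags = cudaDeviceScheduleSpin;         break; // busy-wait, lowest latency, highest CPU load
		case 2: flags = cudaDeviceScheduleYield;        break; // yield the CPU thread while waiting
		case 3: flags = cudaDeviceScheduleBlockingSync; break; // block the thread, lowest CPU load
		}
		return cudaSetDeviceFlags( flags ); // must run before the device context is created
	}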