option to controll gpu synchronization

- add option `sync_mode` - update auto suggestion and jconf
author: psychocrypt <psychocrypt@users.noreply.github.com> 2017-12-01 21:37:11 +0100
committer: psychocrypt <psychocrypt@users.noreply.github.com> 2017-12-01 21:37:11 +0100
commit: 4dca64c1a9ffcb506ad81720951f7536b70e394e (patch)
tree: 9b19e206ceaf43dc9ec7cfac0facc0636d9e9f4e
parent: 2920e9a3227da307b04ee23ecc5c63ecee4a224c (diff)
download: xmr-stak-4dca64c1a9ffcb506ad81720951f7536b70e394e.zip
xmr-stak-4dca64c1a9ffcb506ad81720951f7536b70e394e.tar.gz
7 files changed, 39 insertions, 6 deletions
diff --git a/xmrstak/backend/nvidia/autoAdjust.hpp b/xmrstak/backend/nvidia/autoAdjust.hpp
index c6a7dca..be7d1ce 100644
--- a/xmrstak/backend/nvidia/autoAdjust.hpp
+++ b/xmrstak/backend/nvidia/autoAdjust.hpp
@@ -95,7 +95,7 @@ private:
 				conf += std::string("  { \"index\" : ") + std::to_string(ctx.device_id) + ",\n" +
 					"    \"threads\" : " + std::to_string(ctx.device_threads) + ", \"blocks\" : " + std::to_string(ctx.device_blocks) + ",\n" +
 					"    \"bfactor\" : " + std::to_string(ctx.device_bfactor) + ", \"bsleep\" :  " + std::to_string(ctx.device_bsleep) + ",\n" +
-					"    \"affine_to_cpu\" : false,\n" +
+					"    \"affine_to_cpu\" : false, \"sync_mode\" : 3,\n" +
 					"  },\n";
 			}
 		}
diff --git a/xmrstak/backend/nvidia/config.tpl b/xmrstak/backend/nvidia/config.tpl
index 99dc023..5479172 100644
--- a/xmrstak/backend/nvidia/config.tpl
+++ b/xmrstak/backend/nvidia/config.tpl
@@ -9,6 +9,12 @@ R"===(
  * bsleep        - Insert a delay of X microseconds between kernel launches.
  *                 Increase if you want to reduce GPU lag. Recommended setting on GUI systems - 100
  * affine_to_cpu - This will affine the thread to a CPU. This can make a GPU miner play along nicer with a CPU miner.
+ * sync_mode     - method used to synchronize the device
+ *                 documentation: http://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__DEVICE.html#group__CUDART__DEVICE_1g69e73c7dda3fc05306ae7c811a690fac
+ *                 0 = cudaDeviceScheduleAuto
+ *                 1 = cudaDeviceScheduleSpin - create a high load on one cpu thread per gpu
+ *                 2 = cudaDeviceScheduleYield
+ *                 3 = cudaDeviceScheduleBlockingSync (default)
  *
  * On the first run the miner will look at your system and suggest a basic configuration that will work,
  * you can try to tweak it from there to get the best performance.
@@ -16,7 +22,9 @@ R"===(
  * A filled out configuration should look like this:
  * "gpu_threads_conf" :
  * [
- *     { "index" : 0, "threads" : 17, "blocks" : 60, "bfactor" : 0, "bsleep" :  0, "affine_to_cpu" : false},
+ *     { "index" : 0, "threads" : 17, "blocks" : 60, "bfactor" : 0, "bsleep" :  0, 
+ *       "affine_to_cpu" : false, "sync_mode" : 3,
+ *     },
  * ],
  */
 
diff --git a/xmrstak/backend/nvidia/jconf.cpp b/xmrstak/backend/nvidia/jconf.cpp
index 4208145..46c5726 100644
--- a/xmrstak/backend/nvidia/jconf.cpp
+++ b/xmrstak/backend/nvidia/jconf.cpp
@@ -123,16 +123,17 @@ bool jconf::GetGPUThreadConfig(size_t id, thd_cfg &cfg)
 	if(!oThdConf.IsObject())
 		return false;
 
-	const Value *gid, *blocks, *threads, *bfactor, *bsleep, *aff;
+	const Value *gid, *blocks, *threads, *bfactor, *bsleep, *aff, *syncMode;
 	gid = GetObjectMember(oThdConf, "index");
 	blocks = GetObjectMember(oThdConf, "blocks");
 	threads = GetObjectMember(oThdConf, "threads");
 	bfactor = GetObjectMember(oThdConf, "bfactor");
 	bsleep = GetObjectMember(oThdConf, "bsleep");
 	aff = GetObjectMember(oThdConf, "affine_to_cpu");
+	syncMode = GetObjectMember(oThdConf, "sync_mode");
 
 	if(gid == nullptr || blocks == nullptr || threads == nullptr ||
-		bfactor == nullptr || bsleep == nullptr || aff == nullptr)
+		bfactor == nullptr || bsleep == nullptr || aff == nullptr || syncMode == nullptr)
 	{
 		return false;
 	}
@@ -155,11 +156,17 @@ bool jconf::GetGPUThreadConfig(size_t id, thd_cfg &cfg)
 	if(!aff->IsUint64() && !aff->IsBool())
 		return false;
 
+	if(!syncMode->IsNumber() || syncMode->GetInt() < 0 || syncMode->GetInt() > 3)
+	{
+		printer::inst()->print_msg(L0, "Error NVIDIA: sync_mode out of range or no number. ( range: 0 <= sync_mode < 4.)");
+		return false;
+	}
 	cfg.id = gid->GetInt();
 	cfg.blocks = blocks->GetInt();
 	cfg.threads = threads->GetInt();
 	cfg.bfactor = bfactor->GetInt();
 	cfg.bsleep = bsleep->GetInt();
+	cfg.syncMode = syncMode->GetInt();
 
 	if(aff->IsNumber())
 		cfg.cpu_aff = aff->GetInt();
diff --git a/xmrstak/backend/nvidia/jconf.hpp b/xmrstak/backend/nvidia/jconf.hpp
index b09a162..7f60f1d 100644
--- a/xmrstak/backend/nvidia/jconf.hpp
+++ b/xmrstak/backend/nvidia/jconf.hpp
@@ -28,6 +28,7 @@ public:
 		bool bDoubleMode;
 		bool bNoPrefetch;
 		int32_t cpu_aff;
+		int syncMode;
 
 		long long iCpuAff;
 	};
diff --git a/xmrstak/backend/nvidia/minethd.cpp b/xmrstak/backend/nvidia/minethd.cpp
index 9eab1c0..6e628fd 100644
--- a/xmrstak/backend/nvidia/minethd.cpp
+++ b/xmrstak/backend/nvidia/minethd.cpp
@@ -77,6 +77,7 @@ minethd::minethd(miner_work& pWork, size_t iNo, const jconf::thd_cfg& cfg)
 	ctx.device_threads = (int)cfg.threads;
 	ctx.device_bfactor = (int)cfg.bfactor;
 	ctx.device_bsleep = (int)cfg.bsleep;
+	ctx.syncMode = cfg.syncMode;
 	this->affinity = cfg.cpu_aff;
 
 	std::unique_lock<std::mutex> lck(thd_aff_set);
diff --git a/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp b/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp
index 1b63379..afbdbaf 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp
+++ b/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp
@@ -11,7 +11,8 @@ typedef struct {
 	int device_blocks;
 	int device_threads;
 	int device_bfactor;
-	int device_bsleep;
+	int device_bsleep;	
+	int syncMode;
 
 	uint32_t *d_input;
 	uint32_t inputlen;
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
index 333ae73..0fc99a4 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
@@ -189,7 +189,22 @@ extern "C" int cryptonight_extra_cpu_init(nvid_ctx* ctx)
 	}
 
 	cudaDeviceReset();
-	cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
+	switch(ctx->syncMode)
+	{
+	case 0:
+		cudaSetDeviceFlags(cudaDeviceScheduleAuto);
+		break;
+	case 1:
+		cudaSetDeviceFlags(cudaDeviceScheduleSpin);
+		break;
+	case 2:
+		cudaSetDeviceFlags(cudaDeviceScheduleYield);
+		break;
+	case 3:
+		cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
+		break;
+
+	};
 	cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
 
 	size_t hashMemSize;
author	psychocrypt <psychocrypt@users.noreply.github.com>	2017-12-01 21:37:11 +0100
committer	psychocrypt <psychocrypt@users.noreply.github.com>	2017-12-01 21:37:11 +0100
commit	4dca64c1a9ffcb506ad81720951f7536b70e394e (patch)
tree	9b19e206ceaf43dc9ec7cfac0facc0636d9e9f4e
parent	2920e9a3227da307b04ee23ecc5c63ecee4a224c (diff)
download	xmr-stak-4dca64c1a9ffcb506ad81720951f7536b70e394e.zip xmr-stak-4dca64c1a9ffcb506ad81720951f7536b70e394e.tar.gz