author     psychocrypt <psychocryptHPC@gmail.com>  2018-02-12 20:39:49 +0100
committer  psychocrypt <psychocryptHPC@gmail.com>  2018-02-19 22:13:49 +0100
commit     cff6b6cbfbb3da44d85753885466de5122e20472 (patch)
tree       f08c352ae4261825dca47635c88b8e4a08d67bb7
parent     f785481bb7c1fb887a65a0b19c3e453904bb5474 (diff)
add OpenCL compatibility mode
- add new option `comp_mode` to the amd config
- disable the `if` guards within the OpenCL kernel if `comp_mode : false`
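For illustration, a thread entry in the amd config using the new option could look like the following sketch (all values are placeholders taken from the template changed below; only `comp_mode` is new):

"gpu_threads_conf" :
[
  { "index" : 0, "intensity" : 1000, "worksize" : 8,
    "affine_to_cpu" : false, "strided_index" : 1, "mem_chunk" : 4,
    "comp_mode" : true
  },
],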
-rw-r--r--xmrstak/backend/amd/amd_gpu/gpu.cpp17
-rw-r--r--xmrstak/backend/amd/amd_gpu/gpu.hpp1
-rw-r--r--xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl19
-rw-r--r--xmrstak/backend/amd/autoAdjust.hpp3
-rw-r--r--xmrstak/backend/amd/config.tpl12
-rw-r--r--xmrstak/backend/amd/jconf.cpp10
-rw-r--r--xmrstak/backend/amd/jconf.hpp1
-rw-r--r--xmrstak/backend/amd/minethd.cpp1
8 files changed, 48 insertions, 16 deletions
diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp
index 054ffc4..2f16b67 100644
--- a/xmrstak/backend/amd/amd_gpu/gpu.cpp
+++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp
@@ -332,8 +332,8 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
char options[256];
snprintf(options, sizeof(options),
- "-DITERATIONS=%d -DMASK=%d -DWORKSIZE=%llu -DSTRIDED_INDEX=%d -DMEM_CHUNK=%d",
- hasIterations, threadMemMask, int_port(ctx->workSize), ctx->stridedIndex, int(1u<<ctx->memChunk));
+ "-DITERATIONS=%d -DMASK=%d -DWORKSIZE=%llu -DSTRIDED_INDEX=%d -DMEM_CHUNK=%d -DCOMP_MODE=%d",
+ hasIterations, threadMemMask, int_port(ctx->workSize), ctx->stridedIndex, int(1u<<ctx->memChunk), ctx->compMode ? 1 : 0);
ret = clBuildProgram(ctx->Program, 1, &ctx->DeviceID, options, NULL, NULL);
if(ret != CL_SUCCESS)
{
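With the added define, the option string handed to clBuildProgram now carries the compatibility flag. As an illustration (the ITERATIONS and MASK values are assumptions for the classic cryptonight scratchpad, not taken from this commit), a thread with worksize 8, strided_index 1, mem_chunk 4 and comp_mode enabled would be compiled with:

-DITERATIONS=524288 -DMASK=2097136 -DWORKSIZE=8 -DSTRIDED_INDEX=1 -DMEM_CHUNK=16 -DCOMP_MODE=1

Note that MEM_CHUNK receives the element count int(1u<<memChunk), so a mem_chunk of 4 arrives in the kernel as 16.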
@@ -873,10 +873,15 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput)
size_t g_intensity = ctx->rawIntensity;
size_t w_size = ctx->workSize;
- // round up to next multiple of w_size
- size_t g_thd = ((g_intensity + w_size - 1u) / w_size) * w_size;
- // number of global threads must be a multiple of the work group size (w_size)
- assert(g_thd%w_size == 0);
+ size_t g_thd = g_intensity;
+
+ if(ctx->compMode)
+ {
+ // round up to next multiple of w_size
+ g_thd = ((g_intensity + w_size - 1u) / w_size) * w_size;
+ // number of global threads must be a multiple of the work group size (w_size)
+ assert(g_thd%w_size == 0);
+ }
for(int i = 2; i < 6; ++i)
{
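The compatibility path rounds the global thread count up with plain ceiling arithmetic. A minimal standalone C++ sketch of the same formula (hypothetical numbers, not part of the commit):

#include <cassert>
#include <cstddef>

// round g_intensity up to the next multiple of w_size
size_t round_up(size_t g_intensity, size_t w_size)
{
    return ((g_intensity + w_size - 1u) / w_size) * w_size;
}

int main()
{
    assert(round_up(1000u, 8u) == 1000u);  // already a multiple: unchanged
    assert(round_up(1000u, 16u) == 1008u); // padded up to 63 * 16
    return 0;
}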
diff --git a/xmrstak/backend/amd/amd_gpu/gpu.hpp b/xmrstak/backend/amd/amd_gpu/gpu.hpp
index abfad5c..8fb7168 100644
--- a/xmrstak/backend/amd/amd_gpu/gpu.hpp
+++ b/xmrstak/backend/amd/amd_gpu/gpu.hpp
@@ -26,6 +26,7 @@ struct GpuContext
size_t workSize;
int stridedIndex;
int memChunk;
+ int compMode;
/*Output vars*/
cl_device_id DeviceID;
diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
index 53299ec..4bac68c 100644
--- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
+++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
@@ -451,8 +451,10 @@ __kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ul
barrier(CLK_LOCAL_MEM_FENCE);
+#if(COMP_MODE==1)
// do not use early return here
if(gIdx < Threads)
+#endif
{
states += 25 * gIdx;
@@ -483,9 +485,10 @@ __kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ul
}
mem_fence(CLK_GLOBAL_MEM_FENCE);
-
+#if(COMP_MODE==1)
// do not use early return here
if(gIdx < Threads)
+#endif
{
#pragma unroll
for(int i = 0; i < 25; ++i) states[i] = State[i];
@@ -499,9 +502,10 @@ __kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ul
}
mem_fence(CLK_LOCAL_MEM_FENCE);
-
+#if(COMP_MODE==1)
// do not use early return here
if(gIdx < Threads)
+#endif
{
#pragma unroll 2
for(int i = 0; i < (ITERATIONS >> 5); ++i)
@@ -536,9 +540,10 @@ __kernel void cn1(__global uint4 *Scratchpad, __global ulong *states, ulong Thre
barrier(CLK_LOCAL_MEM_FENCE);
uint4 b_x;
-
+#if(COMP_MODE==1)
// do not use early return here
if(gIdx < Threads)
+#endif
{
states += 25 * gIdx;
#if(STRIDED_INDEX==0)
@@ -559,8 +564,10 @@ __kernel void cn1(__global uint4 *Scratchpad, __global ulong *states, ulong Thre
mem_fence(CLK_LOCAL_MEM_FENCE);
+#if(COMP_MODE==1)
// do not use early return here
if(gIdx < Threads)
+#endif
{
#pragma unroll 8
for(int i = 0; i < ITERATIONS; ++i)
@@ -612,8 +619,10 @@ __kernel void cn2(__global uint4 *Scratchpad, __global ulong *states, __global u
barrier(CLK_LOCAL_MEM_FENCE);
+#if(COMP_MODE==1)
// do not use early return here
if(gIdx < Threads)
+#endif
{
states += 25 * gIdx;
#if(STRIDED_INDEX==0)
@@ -641,8 +650,10 @@ __kernel void cn2(__global uint4 *Scratchpad, __global ulong *states, __global u
barrier(CLK_LOCAL_MEM_FENCE);
+#if(COMP_MODE==1)
// do not use early return here
if(gIdx < Threads)
+#endif
{
#pragma unroll 2
for(int i = 0; i < (ITERATIONS >> 5); ++i)
@@ -659,8 +670,10 @@ __kernel void cn2(__global uint4 *Scratchpad, __global ulong *states, __global u
barrier(CLK_GLOBAL_MEM_FENCE);
+#if(COMP_MODE==1)
// do not use early return here
if(gIdx < Threads)
+#endif
{
if(!get_local_id(1))
{
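To make the guard mechanics concrete: with COMP_MODE==1 the preprocessor keeps the range check, with COMP_MODE==0 the block that follows becomes unconditional. A simplified sketch of the two expansions the OpenCL compiler sees (body abbreviated):

// COMP_MODE == 1: guard kept; the extra work-items created by rounding
// the global size up simply skip the body instead of touching memory
// that was never allocated for them
if(gIdx < Threads)
{
    states += 25 * gIdx;
    /* ... */
}

// COMP_MODE == 0: guard compiled out; every launched work-item executes
// the body, so the intensity must already be a multiple of the worksize
{
    states += 25 * gIdx;
    /* ... */
}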
diff --git a/xmrstak/backend/amd/autoAdjust.hpp b/xmrstak/backend/amd/autoAdjust.hpp
index b88d3ee..8d60b94 100644
--- a/xmrstak/backend/amd/autoAdjust.hpp
+++ b/xmrstak/backend/amd/autoAdjust.hpp
@@ -143,7 +143,8 @@ private:
// set 8 threads per block (this is a good value for the most gpus)
conf += std::string(" { \"index\" : ") + std::to_string(ctx.deviceIdx) + ",\n" +
" \"intensity\" : " + std::to_string(intensity) + ", \"worksize\" : " + std::to_string(8) + ",\n" +
- " \"affine_to_cpu\" : false, \"strided_index\" : 1, \"mem_chunk\" : 4\n"
+ " \"affine_to_cpu\" : false, \"strided_index\" : 1, \"mem_chunk\" : 4,\n"
+ " \"comp_mode\" : true\n" +
" },\n";
}
else
diff --git a/xmrstak/backend/amd/config.tpl b/xmrstak/backend/amd/config.tpl
index 8914130..84251c7 100644
--- a/xmrstak/backend/amd/config.tpl
+++ b/xmrstak/backend/amd/config.tpl
@@ -1,9 +1,9 @@
R"===(
/*
* GPU configuration. You should play around with intensity and worksize as the fastest settings will vary.
- * index - GPU index number usually starts from 0
- * intensity - Number of parallel GPU threads (nothing to do with CPU threads)
- * worksize - Number of local GPU threads (nothing to do with CPU threads)
+ * index         - GPU index number usually starts from 0
+ * intensity     - Number of parallel GPU threads (nothing to do with CPU threads)
+ * worksize      - Number of local GPU threads (nothing to do with CPU threads)
* affine_to_cpu - This will affine the thread to a CPU. This can make a GPU miner play along nicer with a CPU miner.
* strided_index - switch memory pattern used for the scratch pad memory
* 2 = chunked memory, chunk size is controlled by 'mem_chunk'
@@ -13,9 +13,13 @@ R"===(
* mem_chunk - range 0 to 18: set the number of elements (16byte) per chunk
* this value is only used if 'strided_index' == 2
* element count is computed with the equation: 2 to the power of 'mem_chunk' e.g. 4 means a chunk of 16 elements(256byte)
+ * comp_mode     - Compatibility mode: enables/disables the automatic guard around the compute kernel.
+ *                 The guard allows using an intensity that is not a multiple of the worksize.
+ *                 If set to false and the intensity is not a multiple of the worksize, the miner can crash:
+ *                 in that case set the intensity to a multiple of the worksize or enable comp_mode.
* "gpu_threads_conf" :
* [
- * { "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false, "strided_index" : true, "mem_chunk" : 4 },
+ * { "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false, "strided_index" : true, "mem_chunk" : 4, "comp_mode" : true },
* ],
* If you do not wish to mine with your AMD GPU(s) then use:
* "gpu_threads_conf" :
diff --git a/xmrstak/backend/amd/jconf.cpp b/xmrstak/backend/amd/jconf.cpp
index 22381e1..93ba709 100644
--- a/xmrstak/backend/amd/jconf.cpp
+++ b/xmrstak/backend/amd/jconf.cpp
@@ -106,15 +106,17 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg)
if(!oThdConf.IsObject())
return false;
- const Value *idx, *intensity, *w_size, *aff, *stridedIndex, *memChunk;
+ const Value *idx, *intensity, *w_size, *aff, *stridedIndex, *memChunk, *compMode;
idx = GetObjectMember(oThdConf, "index");
intensity = GetObjectMember(oThdConf, "intensity");
w_size = GetObjectMember(oThdConf, "worksize");
aff = GetObjectMember(oThdConf, "affine_to_cpu");
stridedIndex = GetObjectMember(oThdConf, "strided_index");
memChunk = GetObjectMember(oThdConf, "mem_chunk");
+ compMode = GetObjectMember(oThdConf, "comp_mode");
- if(idx == nullptr || intensity == nullptr || w_size == nullptr || aff == nullptr || stridedIndex == nullptr || memChunk == nullptr)
+ if(idx == nullptr || intensity == nullptr || w_size == nullptr || aff == nullptr || memChunk == nullptr ||
+ stridedIndex == nullptr || compMode == nullptr)
return false;
if(!idx->IsUint64() || !intensity->IsUint64() || !w_size->IsUint64())
@@ -148,9 +150,13 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg)
return false;
}
+ if(!compMode->IsBool())
+ return false;
+
cfg.index = idx->GetUint64();
cfg.w_size = w_size->GetUint64();
cfg.intensity = intensity->GetUint64();
+ cfg.compMode = compMode->GetBool();
if(aff->IsNumber())
cfg.cpu_aff = aff->GetInt64();
diff --git a/xmrstak/backend/amd/jconf.hpp b/xmrstak/backend/amd/jconf.hpp
index 91e5d0d..580b69f 100644
--- a/xmrstak/backend/amd/jconf.hpp
+++ b/xmrstak/backend/amd/jconf.hpp
@@ -28,6 +28,7 @@ public:
long long cpu_aff;
int stridedIndex;
int memChunk;
+ bool compMode;
};
size_t GetThreadCount();
diff --git a/xmrstak/backend/amd/minethd.cpp b/xmrstak/backend/amd/minethd.cpp
index ca5e163..8dfbce5 100644
--- a/xmrstak/backend/amd/minethd.cpp
+++ b/xmrstak/backend/amd/minethd.cpp
@@ -98,6 +98,7 @@ bool minethd::init_gpus()
vGpuData[i].workSize = cfg.w_size;
vGpuData[i].stridedIndex = cfg.stridedIndex;
vGpuData[i].memChunk = cfg.memChunk;
+ vGpuData[i].compMode = cfg.compMode;
}
return InitOpenCL(vGpuData.data(), n, jconf::inst()->GetPlatformIdx()) == ERR_SUCCESS;