From 737185ee82bae05953680b1f4c4cdf8646c51b5a Mon Sep 17 00:00:00 2001
From: psychocrypt
Date: Sat, 17 Feb 2018 20:51:55 +0100
Subject: AMD: new `mem_chunk` option and new `strided_index` mode

- add a new value (2) for `strided_index`
- add the additional option `mem_chunk` to control the memory chunk width if `strided_index == 2`
---
 xmrstak/backend/amd/amd_gpu/gpu.cpp               | 11 +++++--
 xmrstak/backend/amd/amd_gpu/gpu.hpp               |  1 +
 xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl | 35 +++++++++++++++++------
 xmrstak/backend/amd/autoAdjust.hpp                |  2 +-
 xmrstak/backend/amd/config.tpl                    | 11 +++++--
 xmrstak/backend/amd/jconf.cpp                     | 32 +++++++++++++++++----
 xmrstak/backend/amd/jconf.hpp                     |  3 +-
 xmrstak/backend/amd/minethd.cpp                   |  1 +
 8 files changed, 75 insertions(+), 21 deletions(-)

diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp
index c39c567..054ffc4 100644
--- a/xmrstak/backend/amd/amd_gpu/gpu.cpp
+++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp
@@ -332,8 +332,8 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 
 	char options[256];
 	snprintf(options, sizeof(options),
-		"-DITERATIONS=%d -DMASK=%d -DWORKSIZE=%llu -DSTRIDED_INDEX=%d",
-		hasIterations, threadMemMask, int_port(ctx->workSize), ctx->stridedIndex ? 1 : 0);
+		"-DITERATIONS=%d -DMASK=%d -DWORKSIZE=%llu -DSTRIDED_INDEX=%d -DMEM_CHUNK=%d",
+		hasIterations, threadMemMask, int_port(ctx->workSize), ctx->stridedIndex, int(1u<<ctx->memChunk));
 	ret = clBuildProgram(ctx->Program, 1, &ctx->DeviceID, options, NULL, NULL);
 	if(ret != CL_SUCCESS)
 	{
@@ -696,6 +696,13 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx)
 
 	for(int i = 0; i < num_gpus; ++i)
 	{
+		if(ctx[i].stridedIndex == 2 && (ctx[i].rawIntensity % ctx[i].workSize) != 0)
+		{
+			size_t reduced_intensity = (ctx[i].rawIntensity / ctx[i].workSize) * ctx[i].workSize;
+			ctx[i].rawIntensity = reduced_intensity;
+			printer::inst()->print_msg(L0, "WARNING AMD: gpu %d intensity is not a multiple of 'worksize', auto reduce intensity to %d", ctx[i].deviceIdx, int(reduced_intensity));
+		}
+
 		if((ret = InitOpenCLGpu(opencl_ctx, &ctx[i], source_code.c_str())) != ERR_SUCCESS)
 		{
 			return ret;
diff --git a/xmrstak/backend/amd/amd_gpu/gpu.hpp b/xmrstak/backend/amd/amd_gpu/gpu.hpp
index c17bac1..abfad5c 100644
--- a/xmrstak/backend/amd/amd_gpu/gpu.hpp
+++ b/xmrstak/backend/amd/amd_gpu/gpu.hpp
@@ -25,6 +25,7 @@ struct GpuContext
 	size_t rawIntensity;
 	size_t workSize;
 	int stridedIndex;
+	int memChunk;
 
 	/*Output vars*/
 	cl_device_id DeviceID;
diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
index ec05712..2514092 100644
--- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
+++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
@@ -411,12 +411,23 @@ void AESExpandKey256(uint *keybuf)
 	}
 }
 
+#define MEM_CHUNK (1<<4)
+
 #if(STRIDED_INDEX==0)
 #	define IDX(x)	(x)
-#else
+#elif(STRIDED_INDEX==1)
 #	define IDX(x)	((x) * (Threads))
+#elif(STRIDED_INDEX==2)
+#	define IDX(x)	(((x) % MEM_CHUNK) + ((x) / MEM_CHUNK) * WORKSIZE * MEM_CHUNK)
 #endif
 
+inline ulong getIdx()
+{
+#if(STRIDED_INDEX==0 || STRIDED_INDEX==1 || STRIDED_INDEX==2)
+	return get_global_id(0) - get_global_offset(0);
+#endif
+}
+
 __attribute__((reqd_work_group_size(WORKSIZE, 8, 1)))
 __kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ulong *states, ulong Threads)
 {
@@ -425,7 +436,7 @@ __kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ul
 	__local uint AES0[256], AES1[256], AES2[256], AES3[256];
 	uint4 text;
 
-	const ulong gIdx = get_global_id(0) - get_global_offset(0);
+	const ulong gIdx = getIdx();
 
 	for(int i = get_local_id(1) * WORKSIZE + get_local_id(0);
 		i < 256;
@@ -439,7 +450,7 @@ __kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ul
 	}
 
 	barrier(CLK_LOCAL_MEM_FENCE);
-
+	// do not use early return here
 	if(gIdx < Threads)
 	{
@@ -447,8 +458,10 @@ __kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ul
 
 #if(STRIDED_INDEX==0)
 		Scratchpad += gIdx * (ITERATIONS >> 2);
-#else
+#elif(STRIDED_INDEX==1)
 		Scratchpad += gIdx;
+#elif(STRIDED_INDEX==2)
+		Scratchpad += get_group_id(0) * (ITERATIONS >> 2) * WORKSIZE + MEM_CHUNK * get_local_id(0);
 #endif
 
 		((ulong8 *)State)[0] = vload8(0, input);
@@ -509,7 +522,7 @@ __kernel void cn1(__global uint4 *Scratchpad, __global ulong *states, ulong Thre
 	ulong a[2], b[2];
 	__local uint AES0[256], AES1[256], AES2[256], AES3[256];
 
-	const ulong gIdx = get_global_id(0) - get_global_offset(0);
+	const ulong gIdx = getIdx();
 
 	for(int i = get_local_id(0); i < 256; i += WORKSIZE)
 	{
@@ -523,15 +536,17 @@ __kernel void cn1(__global uint4 *Scratchpad, __global ulong *states, ulong Thre
 
 	barrier(CLK_LOCAL_MEM_FENCE);
 
 	uint4 b_x;
-
+	// do not use early return here
 	if(gIdx < Threads)
 	{
 		states += 25 * gIdx;
 #if(STRIDED_INDEX==0)
 		Scratchpad += gIdx * (ITERATIONS >> 2);
-#else
+#elif(STRIDED_INDEX==1)
 		Scratchpad += gIdx;
+#elif(STRIDED_INDEX==2)
+		Scratchpad += get_group_id(0) * (ITERATIONS >> 2) * WORKSIZE + MEM_CHUNK * get_local_id(0);
 #endif
 
 		a[0] = states[0] ^ states[4];
@@ -582,7 +597,7 @@ __kernel void cn2(__global uint4 *Scratchpad, __global ulong *states, __global u
 	ulong State[25];
 	uint4 text;
 
-	const ulong gIdx = get_global_id(0) - get_global_offset(0);
+	const ulong gIdx = getIdx();
 
 	for(int i = get_local_id(1) * WORKSIZE + get_local_id(0);
 		i < 256;
@@ -603,8 +618,10 @@ __kernel void cn2(__global uint4 *Scratchpad, __global ulong *states, __global u
 		states += 25 * gIdx;
 #if(STRIDED_INDEX==0)
 		Scratchpad += gIdx * (ITERATIONS >> 2);
-#else
+#elif(STRIDED_INDEX==1)
 		Scratchpad += gIdx;
+#elif(STRIDED_INDEX==2)
+		Scratchpad += get_group_id(0) * (ITERATIONS >> 2) * WORKSIZE + MEM_CHUNK * get_local_id(0);
 #endif
 
 #if defined(__Tahiti__) || defined(__Pitcairn__)
diff --git a/xmrstak/backend/amd/autoAdjust.hpp b/xmrstak/backend/amd/autoAdjust.hpp
index afedb5c..b88d3ee 100644
--- a/xmrstak/backend/amd/autoAdjust.hpp
+++ b/xmrstak/backend/amd/autoAdjust.hpp
@@ -143,7 +143,7 @@ private:
 			// set 8 threads per block (this is a good value for the most gpus)
 			conf += std::string("  { \"index\" : ") + std::to_string(ctx.deviceIdx) + ",\n" +
 				"    \"intensity\" : " + std::to_string(intensity) + ", \"worksize\" : " + std::to_string(8) + ",\n" +
-				"    \"affine_to_cpu\" : false, \"strided_index\" : true\n"
+				"    \"affine_to_cpu\" : false, \"strided_index\" : 1, \"mem_chunk\" : 4\n"
 				"  },\n";
 		}
 		else
diff --git a/xmrstak/backend/amd/config.tpl b/xmrstak/backend/amd/config.tpl
index 25b75a1..8914130 100644
--- a/xmrstak/backend/amd/config.tpl
+++ b/xmrstak/backend/amd/config.tpl
@@ -6,11 +6,16 @@ R"===(
  * worksize - Number of local GPU threads (nothing to do with CPU threads)
  * affine_to_cpu - This will affine the thread to a CPU. This can make a GPU miner play along nicer with a CPU miner.
 * strided_index - switch memory pattern used for the scratch pad memory
- *                 true = use 16byte contiguous memory per thread, the next memory block has offset of intensity blocks
- *                 false = use a contiguous block of memory per thread
+ *                 2 = chunked memory, chunk size is controlled by 'mem_chunk'
+ *                     required: intensity must be a multiple of worksize
+ *                 1 or true = use 16byte contiguous memory per thread, the next memory block has offset of intensity blocks
+ *                 0 or false = use a contiguous block of memory per thread
+ * mem_chunk - range 0 to 18: set the number of elements (16byte) per chunk
+ *             this value is only used if 'strided_index' == 2
+ *             element count is computed with the equation: 2 to the power of 'mem_chunk' e.g. 4 means a chunk of 16 elements(256byte)
 * "gpu_threads_conf" :
 * [
- *	{ "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false, "strided_index" : true },
+ *	{ "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false, "strided_index" : true, "mem_chunk" : 4 },
 * ],
 * If you do not wish to mine with your AMD GPU(s) then use:
 * "gpu_threads_conf" :
diff --git a/xmrstak/backend/amd/jconf.cpp b/xmrstak/backend/amd/jconf.cpp
index f126342..22381e1 100644
--- a/xmrstak/backend/amd/jconf.cpp
+++ b/xmrstak/backend/amd/jconf.cpp
@@ -106,14 +106,15 @@
 	if(!oThdConf.IsObject())
 		return false;
 
-	const Value *idx, *intensity, *w_size, *aff, *stridedIndex;
+	const Value *idx, *intensity, *w_size, *aff, *stridedIndex, *memChunk;
 	idx = GetObjectMember(oThdConf, "index");
 	intensity = GetObjectMember(oThdConf, "intensity");
 	w_size = GetObjectMember(oThdConf, "worksize");
 	aff = GetObjectMember(oThdConf, "affine_to_cpu");
 	stridedIndex = GetObjectMember(oThdConf, "strided_index");
+	memChunk = GetObjectMember(oThdConf, "mem_chunk");
 
-	if(idx == nullptr || intensity == nullptr || w_size == nullptr || aff == nullptr || stridedIndex == nullptr)
+	if(idx == nullptr || intensity == nullptr || w_size == nullptr || aff == nullptr || stridedIndex == nullptr || memChunk == nullptr)
 		return false;
 
 	if(!idx->IsUint64() || !intensity->IsUint64() || !w_size->IsUint64())
@@ -122,13 +123,34 @@
 	if(!aff->IsUint64() && !aff->IsBool())
 		return false;
 
-	if(!stridedIndex->IsBool())
+	if(!stridedIndex->IsBool() && !stridedIndex->IsNumber())
+	{
+		printer::inst()->print_msg(L0, "ERROR: strided_index must be a bool or a number");
+		return false;
+	}
+
+	if(stridedIndex->IsBool())
+		cfg.stridedIndex = stridedIndex->GetBool() ? 1 : 0;
+	else
+		cfg.stridedIndex = (int)stridedIndex->GetInt64();
+
+	if(cfg.stridedIndex > 2)
+	{
+		printer::inst()->print_msg(L0, "ERROR: strided_index must be 0, 1 or 2");
 		return false;
+	}
+
+	if(!memChunk->IsUint64() || memChunk->GetUint64() > 18)
+	{
+		printer::inst()->print_msg(L0, "ERROR: mem_chunk must be an integer in the range 0 to 18");
+		return false;
+	}
+
+	cfg.memChunk = (int)memChunk->GetInt64();
 
 	cfg.index = idx->GetUint64();
-	cfg.intensity = intensity->GetUint64();
 	cfg.w_size = w_size->GetUint64();
-	cfg.stridedIndex = stridedIndex->GetBool();
+	cfg.intensity = intensity->GetUint64();
 
 	if(aff->IsNumber())
 		cfg.cpu_aff = aff->GetInt64();
diff --git a/xmrstak/backend/amd/jconf.hpp b/xmrstak/backend/amd/jconf.hpp
index ee1882a..91e5d0d 100644
--- a/xmrstak/backend/amd/jconf.hpp
+++ b/xmrstak/backend/amd/jconf.hpp
@@ -26,7 +26,8 @@ public:
 		size_t intensity;
 		size_t w_size;
 		long long cpu_aff;
-		bool stridedIndex;
+		int stridedIndex;
+		int memChunk;
 	};
 
 	size_t GetThreadCount();
diff --git a/xmrstak/backend/amd/minethd.cpp b/xmrstak/backend/amd/minethd.cpp
index 422c28c..ca5e163 100644
--- a/xmrstak/backend/amd/minethd.cpp
+++ b/xmrstak/backend/amd/minethd.cpp
@@ -97,6 +97,7 @@ bool minethd::init_gpus()
 		vGpuData[i].rawIntensity = cfg.intensity;
 		vGpuData[i].workSize = cfg.w_size;
 		vGpuData[i].stridedIndex = cfg.stridedIndex;
+		vGpuData[i].memChunk = cfg.memChunk;
 	}
 
 	return InitOpenCL(vGpuData.data(), n, jconf::inst()->GetPlatformIdx()) == ERR_SUCCESS;
-- 
cgit v1.1
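
Example usage (illustrative only, not part of the patch): an amd.txt thread entry that
exercises the new chunked layout could look like the snippet below. The values are just
an example; with "strided_index" : 2 the intensity has to be a multiple of the worksize,
otherwise InitOpenCL() reduces it as shown above.

    "gpu_threads_conf" :
    [
        { "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false, "strided_index" : 2, "mem_chunk" : 4 },
    ],

The small host-side C sketch below mirrors the STRIDED_INDEX==2 address computation from
cryptonight.cl (the per-kernel base offset plus IDX(x)) to show how the per-thread
scratchpads of one work-group are interleaved in chunks of MEM_CHUNK 16-byte elements.
WORKSIZE, MEM_CHUNK and the shortened scratchpad size are assumed example values, not
taken from the miner.

    #include <stdio.h>

    #define WORKSIZE      8          /* local work size, as in the example config above */
    #define MEM_CHUNK     (1 << 4)   /* 16 elements per chunk, i.e. "mem_chunk" : 4 */
    #define SCRATCH_ELEMS 64         /* shortened stand-in for ITERATIONS >> 2 */

    /* IDX(x) for STRIDED_INDEX==2: where the x-th element of a thread's scratchpad lands */
    static size_t idx_chunked(size_t x)
    {
        return (x % MEM_CHUNK) + (x / MEM_CHUNK) * WORKSIZE * MEM_CHUNK;
    }

    int main(void)
    {
        size_t group_id = 0; /* first work-group */

        for (size_t lid = 0; lid < 3; ++lid)
        {
            /* base offset per work-item, as in cn0/cn1/cn2:
             * group_id * (ITERATIONS >> 2) * WORKSIZE + MEM_CHUNK * local_id */
            size_t base = group_id * SCRATCH_ELEMS * WORKSIZE + MEM_CHUNK * lid;

            printf("local id %zu: elements %zu-%zu, then %zu-%zu, ...\n",
                   lid,
                   base + idx_chunked(0), base + idx_chunked(MEM_CHUNK - 1),
                   base + idx_chunked(MEM_CHUNK), base + idx_chunked(2 * MEM_CHUNK - 1));
        }
        return 0;
    }

For "mem_chunk" : 4 and a worksize of 8 this prints elements 0-15 and then 128-143 for
local id 0, 16-31 and then 144-159 for local id 1, and so on: each thread owns 256-byte
chunks and consecutive threads of a work-group own adjacent chunks, which is the access
pattern the new strided_index mode 2 is meant to produce.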