From 3216b47fdacf701942457b23341f483aadbe5eb7 Mon Sep 17 00:00:00 2001 From: dangrabbits Date: Mon, 25 Dec 2017 09:32:26 +0800 Subject: Updated config.tpl comments to include how to exclude CPU/GPUs --- xmrstak/backend/amd/config.tpl | 3 +++ 1 file changed, 3 insertions(+) (limited to 'xmrstak/backend/amd') diff --git a/xmrstak/backend/amd/config.tpl b/xmrstak/backend/amd/config.tpl index af662f8..25b75a1 100644 --- a/xmrstak/backend/amd/config.tpl +++ b/xmrstak/backend/amd/config.tpl @@ -12,6 +12,9 @@ R"===( * [ * { "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false, "strided_index" : true }, * ], + * If you do not wish to mine with your AMD GPU(s) then use: + * "gpu_threads_conf" : + * null, */ "gpu_threads_conf" : [ -- cgit v1.1 From 58db6082a33a1233eff0b33ce9fba9cc5a9f5de8 Mon Sep 17 00:00:00 2001 From: psychocrypt Date: Sat, 30 Dec 2017 21:16:35 +0100 Subject: differgence in OpenCL code remove branch differgences in AMD OpenCl code based on #454 a Please enter the commit message for your changes. Lines starting --- xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl | 104 ++++++++++------------ 1 file changed, 48 insertions(+), 56 deletions(-) (limited to 'xmrstak/backend/amd') diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl index 255fcbb..ec05712 100644 --- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl +++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl @@ -653,21 +653,11 @@ __kernel void cn2(__global uint4 *Scratchpad, __global ulong *states, __global u for(int i = 0; i < 25; ++i) states[i] = State[i]; - switch(State[0] & 3) - { - case 0: - Branch0[atomic_inc(Branch0 + Threads)] = get_global_id(0) - get_global_offset(0); - break; - case 1: - Branch1[atomic_inc(Branch1 + Threads)] = get_global_id(0) - get_global_offset(0); - break; - case 2: - Branch2[atomic_inc(Branch2 + Threads)] = get_global_id(0) - get_global_offset(0); - break; - case 3: - Branch3[atomic_inc(Branch3 + Threads)] = get_global_id(0) - get_global_offset(0); - break; - } + ulong StateSwitch = State[0] & 3; + __global uint *destinationBranch1 = StateSwitch == 0 ? Branch0 : Branch1; + __global uint *destinationBranch2 = StateSwitch == 2 ? Branch2 : Branch3; + __global uint *destinationBranch = StateSwitch < 2 ? destinationBranch1 : destinationBranch2; + destinationBranch[atomic_inc(destinationBranch + Threads)] = gIdx; } } mem_fence(CLK_GLOBAL_MEM_FENCE); @@ -704,8 +694,7 @@ __kernel void Skein(__global ulong *states, __global uint *BranchBuf, __global u for(uint i = 0; i < 4; ++i) { - if(i < 3) t[0] += 0x40UL; - else t[0] += 0x08UL; + t[0] += i < 3 ? 0x40UL : 0x08UL; t[2] = t[0] ^ t[1]; @@ -715,8 +704,7 @@ __kernel void Skein(__global ulong *states, __global uint *BranchBuf, __global u h = m ^ p; - if(i < 2) t[1] = 0x3000000000000000UL; - else t[1] = 0xB000000000000000UL; + t[1] = i < 2 ? 0x3000000000000000UL : 0xB000000000000000UL; } t[0] = 0x08UL; @@ -744,6 +732,27 @@ __kernel void Skein(__global ulong *states, __global uint *BranchBuf, __global u #define SWAP8(x) as_ulong(as_uchar8(x).s76543210) +#define JHXOR \ + h0h ^= input[0]; \ + h0l ^= input[1]; \ + h1h ^= input[2]; \ + h1l ^= input[3]; \ + h2h ^= input[4]; \ + h2l ^= input[5]; \ + h3h ^= input[6]; \ + h3l ^= input[7]; \ +\ + E8; \ +\ + h4h ^= input[0]; \ + h4l ^= input[1]; \ + h5h ^= input[2]; \ + h5l ^= input[3]; \ + h6h ^= input[4]; \ + h6l ^= input[5]; \ + h7h ^= input[6]; \ + h7l ^= input[7] + __kernel void JH(__global ulong *states, __global uint *BranchBuf, __global uint *output, ulong Target, ulong Threads) { const uint idx = get_global_id(0) - get_global_offset(0); @@ -757,46 +766,27 @@ __kernel void JH(__global ulong *states, __global uint *BranchBuf, __global uint sph_u64 h4h = 0x754D2E7F8996A371UL, h4l = 0x62E27DF70849141DUL, h5h = 0x948F2476F7957627UL, h5l = 0x6C29804757B6D587UL, h6h = 0x6C0D8EAC2D275E5CUL, h6l = 0x0F7A0557C6508451UL, h7h = 0xEA12247067D3E47BUL, h7l = 0x69D71CD313ABE389UL; sph_u64 tmp; - for(int i = 0; i < 5; ++i) + for(int i = 0; i < 3; ++i) { ulong input[8]; - if(i < 3) - { - for(int x = 0; x < 8; ++x) input[x] = (states[(i << 3) + x]); - } - else if(i == 3) - { - input[0] = (states[24]); - input[1] = 0x80UL; - for(int x = 2; x < 8; ++x) input[x] = 0x00UL; - } - else - { - input[7] = 0x4006000000000000UL; - - for(int x = 0; x < 7; ++x) input[x] = 0x00UL; - } - - h0h ^= input[0]; - h0l ^= input[1]; - h1h ^= input[2]; - h1l ^= input[3]; - h2h ^= input[4]; - h2l ^= input[5]; - h3h ^= input[6]; - h3l ^= input[7]; - - E8; - - h4h ^= input[0]; - h4l ^= input[1]; - h5h ^= input[2]; - h5l ^= input[3]; - h6h ^= input[4]; - h6l ^= input[5]; - h7h ^= input[6]; - h7l ^= input[7]; + const int shifted = i << 3; + for(int x = 0; x < 8; ++x) input[x] = (states[shifted + x]); + JHXOR; + } + { + ulong input[8]; + input[0] = (states[24]); + input[1] = 0x80UL; + #pragma unroll 6 + for(int x = 2; x < 8; ++x) input[x] = 0x00UL; + JHXOR; + } + { + ulong input[8]; + for(int x = 0; x < 7; ++x) input[x] = 0x00UL; + input[7] = 0x4006000000000000UL; + JHXOR; } //output[0] = h6h; @@ -832,6 +822,7 @@ __kernel void Blake(__global ulong *states, __global uint *BranchBuf, __global u ((uint8 *)h)[0] = vload8(0U, c_IV256); + #pragma unroll 4 for(uint i = 0, bitlen = 0; i < 4; ++i) { if(i < 3) @@ -907,6 +898,7 @@ __kernel void Groestl(__global ulong *states, __global uint *BranchBuf, __global State[7] = 0x0001000000000000UL; + #pragma unroll 4 for(uint i = 0; i < 4; ++i) { ulong H[8], M[8]; -- cgit v1.1 From 16759bc35357e9e981544273c42125de443c18f3 Mon Sep 17 00:00:00 2001 From: Doug Johnson Date: Sat, 30 Dec 2017 16:49:51 -0700 Subject: Modify invalid result report to show GPU id --- xmrstak/backend/amd/minethd.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'xmrstak/backend/amd') diff --git a/xmrstak/backend/amd/minethd.cpp b/xmrstak/backend/amd/minethd.cpp index e83527c..85a48d3 100644 --- a/xmrstak/backend/amd/minethd.cpp +++ b/xmrstak/backend/amd/minethd.cpp @@ -245,7 +245,7 @@ void minethd::work_main() if ( (*((uint64_t*)(bResult + 24))) < oWork.iTarget) executor::inst()->push_event(ex_event(job_result(oWork.sJobID, results[i], bResult, iThreadNo), oWork.iPoolId)); else - executor::inst()->push_event(ex_event("AMD Invalid Result", oWork.iPoolId)); + executor::inst()->push_event(ex_event("AMD Invalid Result", pGpuCtx->deviceIdx, oWork.iPoolId)); } iCount += pGpuCtx->rawIntensity; -- cgit v1.1 From 0c845b3569f0a2c9524f98d4ca9b6866288fe3d0 Mon Sep 17 00:00:00 2001 From: Doug Johnson Date: Sat, 30 Dec 2017 23:59:03 -0700 Subject: Add warning and fallback when auto intensity is 0 Occassionally the auto adjust doesn't find enough memory and the intensity is detected too low and aligned to 0 with the compute units. This patch fixes this situation by issuing a warning with a suggestion to set environment vars and then ignoring the alignment to 0 Per several issues: Principally: https://github.com/fireice-uk/xmr-stak/issues/81 Related: https://github.com/fireice-uk/xmr-stak/issues/490 https://github.com/fireice-uk/xmr-stak/issues/472 --- xmrstak/backend/amd/autoAdjust.hpp | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'xmrstak/backend/amd') diff --git a/xmrstak/backend/amd/autoAdjust.hpp b/xmrstak/backend/amd/autoAdjust.hpp index 0bc5239..4673613 100644 --- a/xmrstak/backend/amd/autoAdjust.hpp +++ b/xmrstak/backend/amd/autoAdjust.hpp @@ -118,6 +118,19 @@ private: size_t possibleIntensity = std::min( maxThreads , maxIntensity ); // map intensity to a multiple of the compute unit count, 8 is the number of threads per work group size_t intensity = (possibleIntensity / (8 * ctx.computeUnits)) * ctx.computeUnits * 8; + //If the intensity is 0, then it's because the multiple of the unit count is greater than intensity + if (intensity == 0) { + /* See Issues: + * https://github.com/fireice-uk/xmr-stak/issues/81 + * https://github.com/fireice-uk/xmr-stak/issues/472 + * https://github.com/fireice-uk/xmr-stak/issues/490 + * Note that it appears that Northern Islands GPUs (HD 6XXX) are unaffected by + * these environment variables, according to my testing (dougvj) + */ + printer::inst()->print_msg(L0, "WARNING: Autodetected intensity unexpectedly low. Try setting GPU_SINGLE_ALLOC_PERCENT and etc."); + intensity = possibleIntensity; + + } conf += std::string(" // gpu: ") + ctx.name + " memory:" + std::to_string(availableMem / byteToMiB) + "\n"; conf += std::string(" // compute units: ") + std::to_string(ctx.computeUnits) + "\n"; // set 8 threads per block (this is a good value for the most gpus) -- cgit v1.1 From d01bab0cd73181353cbc8ae61ec5712b06fcb775 Mon Sep 17 00:00:00 2001 From: Brian Recchia Date: Tue, 2 Jan 2018 16:09:40 -0500 Subject: Update minethd.cpp Changed capitalization of "macOS" Squashed the commit --- xmrstak/backend/amd/minethd.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'xmrstak/backend/amd') diff --git a/xmrstak/backend/amd/minethd.cpp b/xmrstak/backend/amd/minethd.cpp index e83527c..0ee3f8e 100644 --- a/xmrstak/backend/amd/minethd.cpp +++ b/xmrstak/backend/amd/minethd.cpp @@ -139,7 +139,7 @@ std::vector* minethd::thread_starter(uint32_t threadOffset, miner_wor if(cfg.cpu_aff >= 0) { #if defined(__APPLE__) - printer::inst()->print_msg(L1, "WARNING on MacOS thread affinity is only advisory."); + printer::inst()->print_msg(L1, "WARNING on macOS thread affinity is only advisory."); #endif printer::inst()->print_msg(L1, "Starting AMD GPU thread %d, affinity: %d.", i, (int)cfg.cpu_aff); -- cgit v1.1 From eb4967b5bff3a909796e1783f18e579639becde7 Mon Sep 17 00:00:00 2001 From: psychocrypt Date: Tue, 9 Jan 2018 21:37:06 +0100 Subject: update VEGA names for auto suggestion update VEGA names --- xmrstak/backend/amd/autoAdjust.hpp | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'xmrstak/backend/amd') diff --git a/xmrstak/backend/amd/autoAdjust.hpp b/xmrstak/backend/amd/autoAdjust.hpp index 0bc5239..511a712 100644 --- a/xmrstak/backend/amd/autoAdjust.hpp +++ b/xmrstak/backend/amd/autoAdjust.hpp @@ -101,7 +101,16 @@ private: * sowing down the memory performance because of TLB cache misses */ size_t maxThreads = 1000u; - if(ctx.name.compare("gfx901") == 0) + if( + ctx.name.compare("gfx901") == 0 || + ctx.name.compare("gfx904") == 0 || + // APU + ctx.name.compare("gfx902") == 0 || + // UNKNOWN + ctx.name.compare("gfx900") == 0 || + ctx.name.compare("gfx903") == 0 || + ctx.name.compare("gfx905") == 0 + ) { /* Increase the number of threads for AMD VEGA gpus. * Limit the number of threads based on the issue: https://github.com/fireice-uk/xmr-stak/issues/5#issuecomment-339425089 -- cgit v1.1 From 14f60635915f545fce2f61117ccf87143c7629cc Mon Sep 17 00:00:00 2001 From: psychocrypt Date: Sat, 13 Jan 2018 20:27:02 +0100 Subject: ignore gpu with intensity zero - if the intensity is zero than do not suggest a config - remove the links to old issues --- xmrstak/backend/amd/autoAdjust.hpp | 35 +++++++++++++++++------------------ 1 file changed, 17 insertions(+), 18 deletions(-) (limited to 'xmrstak/backend/amd') diff --git a/xmrstak/backend/amd/autoAdjust.hpp b/xmrstak/backend/amd/autoAdjust.hpp index 4673613..c16edac 100644 --- a/xmrstak/backend/amd/autoAdjust.hpp +++ b/xmrstak/backend/amd/autoAdjust.hpp @@ -94,7 +94,6 @@ private: } std::string conf; - int i = 0; for(auto& ctx : devVec) { /* 1000 is a magic selected limit, the reason is that more than 2GiB memory @@ -119,26 +118,26 @@ private: // map intensity to a multiple of the compute unit count, 8 is the number of threads per work group size_t intensity = (possibleIntensity / (8 * ctx.computeUnits)) * ctx.computeUnits * 8; //If the intensity is 0, then it's because the multiple of the unit count is greater than intensity - if (intensity == 0) { - /* See Issues: - * https://github.com/fireice-uk/xmr-stak/issues/81 - * https://github.com/fireice-uk/xmr-stak/issues/472 - * https://github.com/fireice-uk/xmr-stak/issues/490 - * Note that it appears that Northern Islands GPUs (HD 6XXX) are unaffected by - * these environment variables, according to my testing (dougvj) - */ - printer::inst()->print_msg(L0, "WARNING: Autodetected intensity unexpectedly low. Try setting GPU_SINGLE_ALLOC_PERCENT and etc."); + if (intensity == 0) + { + printer::inst()->print_msg(L0, "WARNING: Auto detected intensity unexpectedly low. Try to set the environment variable GPU_SINGLE_ALLOC_PERCENT."); intensity = possibleIntensity; } - conf += std::string(" // gpu: ") + ctx.name + " memory:" + std::to_string(availableMem / byteToMiB) + "\n"; - conf += std::string(" // compute units: ") + std::to_string(ctx.computeUnits) + "\n"; - // set 8 threads per block (this is a good value for the most gpus) - conf += std::string(" { \"index\" : ") + std::to_string(ctx.deviceIdx) + ",\n" + - " \"intensity\" : " + std::to_string(intensity) + ", \"worksize\" : " + std::to_string(8) + ",\n" + - " \"affine_to_cpu\" : false, \"strided_index\" : true\n" - " },\n"; - ++i; + if (intensity != 0) + { + conf += std::string(" // gpu: ") + ctx.name + " memory:" + std::to_string(availableMem / byteToMiB) + "\n"; + conf += std::string(" // compute units: ") + std::to_string(ctx.computeUnits) + "\n"; + // set 8 threads per block (this is a good value for the most gpus) + conf += std::string(" { \"index\" : ") + std::to_string(ctx.deviceIdx) + ",\n" + + " \"intensity\" : " + std::to_string(intensity) + ", \"worksize\" : " + std::to_string(8) + ",\n" + + " \"affine_to_cpu\" : false, \"strided_index\" : true\n" + " },\n"; + } + else + { + printer::inst()->print_msg(L0, "WARNING: Ignore gpu %s, %s MiB free memory is not enough to suggest settings.", ctx.name.c_str(), std::to_string(availableMem / byteToMiB).c_str()); + } } configTpl.replace("PLATFORMINDEX",std::to_string(platformIndex)); -- cgit v1.1 From 617af4b301582e1373a94c0f34bad754a1f5bc76 Mon Sep 17 00:00:00 2001 From: Grant Galitz Date: Sun, 7 Jan 2018 23:41:34 -0500 Subject: Optimize Skein - Eliminate modulus math (It runs slow inside microcode). - Convert whatever the hell was going on into a rotate op. Like... Someone kinda reinvented the wheel in order to do a simple rotate. --- xmrstak/backend/amd/amd_gpu/opencl/wolf-skein.cl | 78 +++++++++++++++--------- 1 file changed, 48 insertions(+), 30 deletions(-) (limited to 'xmrstak/backend/amd') diff --git a/xmrstak/backend/amd/amd_gpu/opencl/wolf-skein.cl b/xmrstak/backend/amd/amd_gpu/opencl/wolf-skein.cl index 868757b..bebc2ab 100644 --- a/xmrstak/backend/amd/amd_gpu/opencl/wolf-skein.cl +++ b/xmrstak/backend/amd/amd_gpu/opencl/wolf-skein.cl @@ -22,68 +22,59 @@ static const __constant ulong SKEIN512_256_IV[8] = 0xC36FBAF9393AD185UL, 0x3EEDBA1833EDFC13UL }; -#define SKEIN_INJECT_KEY(p, s) do { \ +#define SKEIN_INJECT_KEY(p, s, q) do { \ p += h; \ - p.s5 += t[s % 3]; \ - p.s6 += t[(s + 1) % 3]; \ - p.s7 += s; \ + p.s5 += t[s]; \ + p.s6 += t[select(s + 1U, 0U, s == 2U)]; \ + p.s7 += q; \ } while(0) -ulong SKEIN_ROT(const uint2 x, const uint y) -{ - if(y < 32) return(as_ulong(amd_bitalign(x, x.s10, 32 - y))); - else return(as_ulong(amd_bitalign(x.s10, x, 32 - (y - 32)))); -} - -void SkeinMix8(ulong4 *pv0, ulong4 *pv1, const uint rc0, const uint rc1, const uint rc2, const uint rc3) +void SkeinMix8(ulong4 *pv0, ulong4 *pv1, const ulong4 rc) { *pv0 += *pv1; - (*pv1).s0 = SKEIN_ROT(as_uint2((*pv1).s0), rc0); - (*pv1).s1 = SKEIN_ROT(as_uint2((*pv1).s1), rc1); - (*pv1).s2 = SKEIN_ROT(as_uint2((*pv1).s2), rc2); - (*pv1).s3 = SKEIN_ROT(as_uint2((*pv1).s3), rc3); + *pv1 = rotate(*pv1, (ulong4)rc); *pv1 ^= *pv0; } -ulong8 SkeinEvenRound(ulong8 p, const ulong8 h, const ulong *t, const uint s) +ulong8 SkeinEvenRound(ulong8 p, const ulong8 h, const ulong *t, const uint s, const uint q) { - SKEIN_INJECT_KEY(p, s); + SKEIN_INJECT_KEY(p, s, q); ulong4 pv0 = p.even, pv1 = p.odd; - SkeinMix8(&pv0, &pv1, 46, 36, 19, 37); + SkeinMix8(&pv0, &pv1, (ulong4)(46, 36, 19, 37)); pv0 = shuffle(pv0, (ulong4)(1, 2, 3, 0)); pv1 = shuffle(pv1, (ulong4)(0, 3, 2, 1)); - SkeinMix8(&pv0, &pv1, 33, 27, 14, 42); + SkeinMix8(&pv0, &pv1, (ulong4)(33, 27, 14, 42)); pv0 = shuffle(pv0, (ulong4)(1, 2, 3, 0)); pv1 = shuffle(pv1, (ulong4)(0, 3, 2, 1)); - SkeinMix8(&pv0, &pv1, 17, 49, 36, 39); + SkeinMix8(&pv0, &pv1, (ulong4)(17, 49, 36, 39)); pv0 = shuffle(pv0, (ulong4)(1, 2, 3, 0)); pv1 = shuffle(pv1, (ulong4)(0, 3, 2, 1)); - SkeinMix8(&pv0, &pv1, 44, 9, 54, 56); + SkeinMix8(&pv0, &pv1, (ulong4)(44, 9, 54, 56)); return(shuffle2(pv0, pv1, (ulong8)(1, 4, 2, 7, 3, 6, 0, 5))); } -ulong8 SkeinOddRound(ulong8 p, const ulong8 h, const ulong *t, const uint s) +ulong8 SkeinOddRound(ulong8 p, const ulong8 h, const ulong *t, const uint s, const uint q) { - SKEIN_INJECT_KEY(p, s); + SKEIN_INJECT_KEY(p, s, q); ulong4 pv0 = p.even, pv1 = p.odd; - SkeinMix8(&pv0, &pv1, 39, 30, 34, 24); + SkeinMix8(&pv0, &pv1, (ulong4)(39, 30, 34, 24)); pv0 = shuffle(pv0, (ulong4)(1, 2, 3, 0)); pv1 = shuffle(pv1, (ulong4)(0, 3, 2, 1)); - SkeinMix8(&pv0, &pv1, 13, 50, 10, 17); + SkeinMix8(&pv0, &pv1, (ulong4)(13, 50, 10, 17)); pv0 = shuffle(pv0, (ulong4)(1, 2, 3, 0)); pv1 = shuffle(pv1, (ulong4)(0, 3, 2, 1)); - SkeinMix8(&pv0, &pv1, 25, 29, 39, 43); + SkeinMix8(&pv0, &pv1, (ulong4)(25, 29, 39, 43)); pv0 = shuffle(pv0, (ulong4)(1, 2, 3, 0)); pv1 = shuffle(pv1, (ulong4)(0, 3, 2, 1)); - SkeinMix8(&pv0, &pv1, 8, 35, 56, 22); + SkeinMix8(&pv0, &pv1, (ulong4)(8, 35, 56, 22)); return(shuffle2(pv0, pv1, (ulong8)(1, 4, 2, 7, 3, 6, 0, 5))); } @@ -92,20 +83,47 @@ ulong8 Skein512Block(ulong8 p, ulong8 h, ulong h8, const ulong *t) #pragma unroll for(int i = 0; i < 18; ++i) { - p = SkeinEvenRound(p, h, t, i); + p = SkeinEvenRound(p, h, t, 0U, i); ++i; ulong tmp = h.s0; h = shuffle(h, (ulong8)(1, 2, 3, 4, 5, 6, 7, 0)); h.s7 = h8; h8 = tmp; - p = SkeinOddRound(p, h, t, i); + p = SkeinOddRound(p, h, t, 1U, i); + ++i; + tmp = h.s0; + h = shuffle(h, (ulong8)(1, 2, 3, 4, 5, 6, 7, 0)); + h.s7 = h8; + h8 = tmp; + p = SkeinEvenRound(p, h, t, 2U, i); + ++i; + tmp = h.s0; + h = shuffle(h, (ulong8)(1, 2, 3, 4, 5, 6, 7, 0)); + h.s7 = h8; + h8 = tmp; + p = SkeinOddRound(p, h, t, 0U, i); + ++i; + tmp = h.s0; + h = shuffle(h, (ulong8)(1, 2, 3, 4, 5, 6, 7, 0)); + h.s7 = h8; + h8 = tmp; + p = SkeinEvenRound(p, h, t, 1U, i); + ++i; + tmp = h.s0; + h = shuffle(h, (ulong8)(1, 2, 3, 4, 5, 6, 7, 0)); + h.s7 = h8; + h8 = tmp; + p = SkeinOddRound(p, h, t, 2U, i); tmp = h.s0; h = shuffle(h, (ulong8)(1, 2, 3, 4, 5, 6, 7, 0)); h.s7 = h8; h8 = tmp; } - SKEIN_INJECT_KEY(p, 18); + p += h; + p.s5 += t[0]; + p.s6 += t[1]; + p.s7 += 18; return(p); } -- cgit v1.1 From b9fb744a104cc8eceb9196a676bea2c4f0e14d51 Mon Sep 17 00:00:00 2001 From: Grant Galitz Date: Sun, 7 Jan 2018 23:45:41 -0500 Subject: author update --- xmrstak/backend/amd/amd_gpu/opencl/wolf-skein.cl | 1 + 1 file changed, 1 insertion(+) (limited to 'xmrstak/backend/amd') diff --git a/xmrstak/backend/amd/amd_gpu/opencl/wolf-skein.cl b/xmrstak/backend/amd/amd_gpu/opencl/wolf-skein.cl index bebc2ab..e2a867d 100644 --- a/xmrstak/backend/amd/amd_gpu/opencl/wolf-skein.cl +++ b/xmrstak/backend/amd/amd_gpu/opencl/wolf-skein.cl @@ -3,6 +3,7 @@ R"===( #define WOLF_SKEIN_CL // Vectorized Skein implementation macros and functions by Wolf +// Updated by taisel #define SKEIN_KS_PARITY 0x1BD11BDAA9FC1A22 -- cgit v1.1 From ac474caa8b9392881736ddaae349d4cb649683a9 Mon Sep 17 00:00:00 2001 From: psychocrypt Date: Wed, 24 Jan 2018 22:08:48 +0100 Subject: remove usage of `rotate` revert the change that the OpenCl function `rotate` is used instead of `SKEIN_ROT` --- xmrstak/backend/amd/amd_gpu/opencl/wolf-skein.cl | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'xmrstak/backend/amd') diff --git a/xmrstak/backend/amd/amd_gpu/opencl/wolf-skein.cl b/xmrstak/backend/amd/amd_gpu/opencl/wolf-skein.cl index e2a867d..279b652 100644 --- a/xmrstak/backend/amd/amd_gpu/opencl/wolf-skein.cl +++ b/xmrstak/backend/amd/amd_gpu/opencl/wolf-skein.cl @@ -30,10 +30,19 @@ static const __constant ulong SKEIN512_256_IV[8] = p.s7 += q; \ } while(0) +ulong SKEIN_ROT(const uint2 x, const uint y) +{ + if(y < 32) return(as_ulong(amd_bitalign(x, x.s10, 32 - y))); + else return(as_ulong(amd_bitalign(x.s10, x, 32 - (y - 32)))); +} + void SkeinMix8(ulong4 *pv0, ulong4 *pv1, const ulong4 rc) { *pv0 += *pv1; - *pv1 = rotate(*pv1, (ulong4)rc); + (*pv1).s0 = SKEIN_ROT(as_uint2((*pv1).s0), rc.s0); + (*pv1).s1 = SKEIN_ROT(as_uint2((*pv1).s1), rc.s1); + (*pv1).s2 = SKEIN_ROT(as_uint2((*pv1).s2), rc.s2); + (*pv1).s3 = SKEIN_ROT(as_uint2((*pv1).s3), rc.s3); *pv1 ^= *pv0; } -- cgit v1.1 From 2bc5a055e1416a852b23eb33ac7ad0a0d96d8de5 Mon Sep 17 00:00:00 2001 From: psychocrypt Date: Tue, 30 Jan 2018 20:32:09 +0100 Subject: improve AMD auto suggestion for AEON increase the intensity limit for AEON --- xmrstak/backend/amd/autoAdjust.hpp | 3 +++ 1 file changed, 3 insertions(+) (limited to 'xmrstak/backend/amd') diff --git a/xmrstak/backend/amd/autoAdjust.hpp b/xmrstak/backend/amd/autoAdjust.hpp index 93b71ba..afedb5c 100644 --- a/xmrstak/backend/amd/autoAdjust.hpp +++ b/xmrstak/backend/amd/autoAdjust.hpp @@ -117,6 +117,9 @@ private: */ maxThreads = 2024u; } + // increase all intensity limits by two for aeon + if(!::jconf::inst()->IsCurrencyMonero()) + maxThreads *= 2u; // keep 128MiB memory free (value is randomly chosen) size_t availableMem = ctx.freeMem - (128u * byteToMiB); -- cgit v1.1 From 1ea14c8d23d8cbbb97ecf99b0a7673a031151ebe Mon Sep 17 00:00:00 2001 From: psychocrypt Date: Tue, 30 Jan 2018 21:33:30 +0100 Subject: fix output of gpu name fix that the GPU name is printed before the name is querried from OpenCL --- xmrstak/backend/amd/amd_gpu/gpu.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'xmrstak/backend/amd') diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp index d9bc962..c39c567 100644 --- a/xmrstak/backend/amd/amd_gpu/gpu.cpp +++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp @@ -518,13 +518,13 @@ std::vector getAMDDevices(int index) printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get CL_DEVICE_NAME for device %u.", err_to_str(clStatus), k); continue; } - printer::inst()->print_msg(L0,"Found OpenCL GPU %s.",ctx.name.c_str()); // if environment variable GPU_SINGLE_ALLOC_PERCENT is not set we can not allocate the full memory ctx.deviceIdx = k; ctx.freeMem = std::min(ctx.freeMem, maxMem); ctx.name = std::string(devNameVec.data()); ctx.DeviceID = device_list[k]; + printer::inst()->print_msg(L0,"Found OpenCL GPU %s.",ctx.name.c_str()); ctxVec.push_back(ctx); } } -- cgit v1.1 From 064804bd17be1216dba42fa55c820294c5e763a3 Mon Sep 17 00:00:00 2001 From: Ryan Date: Thu, 1 Feb 2018 20:37:50 +1030 Subject: Fix Disabling AMD GPUs The AMD jconf.cpp would only accept an array. The config sample, and the nvidia and cpu config work with the value 'null', as they accept a 'kNullType'. This means at current, AMD GPUs could not be turned off, the config file wouldn't parse on load. This change makes AMD consistent with the others, and can now be disabled. --- xmrstak/backend/amd/jconf.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'xmrstak/backend/amd') diff --git a/xmrstak/backend/amd/jconf.cpp b/xmrstak/backend/amd/jconf.cpp index 07afb19..f126342 100644 --- a/xmrstak/backend/amd/jconf.cpp +++ b/xmrstak/backend/amd/jconf.cpp @@ -56,9 +56,10 @@ struct configVal { Type iType; }; -//Same order as in configEnum, as per comment above +// Same order as in configEnum, as per comment above +// kNullType means any type configVal oConfigValues[] = { - { aGpuThreadsConf, "gpu_threads_conf", kArrayType }, + { aGpuThreadsConf, "gpu_threads_conf", kNullType }, { iPlatformIdx, "platform_index", kNumberType } }; @@ -68,6 +69,8 @@ inline bool checkType(Type have, Type want) { if(want == have) return true; + else if(want == kNullType) + return true; else if(want == kTrueType && have == kFalseType) return true; else if(want == kFalseType && have == kTrueType) -- cgit v1.1 From b0d03b3302549b27866bb978d495c4051bc50371 Mon Sep 17 00:00:00 2001 From: psychocrypt Date: Wed, 14 Feb 2018 21:21:51 +0100 Subject: AMD: reduce register usage reduce usage of registers: based on the suggestion of @enerc77 --- xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'xmrstak/backend/amd') diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl index ec05712..c0b6529 100644 --- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl +++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl @@ -399,7 +399,7 @@ static const __constant uchar rcon[8] = { 0x8d, 0x01, 0x02, 0x04, 0x08, 0x10, 0x void AESExpandKey256(uint *keybuf) { //#pragma unroll 4 - for(uint c = 8, i = 1; c < 60; ++c) + for(uint c = 8, i = 1; c < 40; ++c) { // For 256-bit keys, an sbox permutation is done every other 4th uint generated, AND every 8th uint t = ((!(c & 7)) || ((c & 7) == 4)) ? SubWord(keybuf[c - 1]) : keybuf[c - 1]; @@ -421,7 +421,7 @@ __attribute__((reqd_work_group_size(WORKSIZE, 8, 1))) __kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ulong *states, ulong Threads) { ulong State[25]; - uint ExpandedKey1[256]; + uint ExpandedKey1[40]; __local uint AES0[256], AES1[256], AES2[256], AES3[256]; uint4 text; @@ -578,7 +578,7 @@ __attribute__((reqd_work_group_size(WORKSIZE, 8, 1))) __kernel void cn2(__global uint4 *Scratchpad, __global ulong *states, __global uint *Branch0, __global uint *Branch1, __global uint *Branch2, __global uint *Branch3, ulong Threads) { __local uint AES0[256], AES1[256], AES2[256], AES3[256]; - uint ExpandedKey2[256]; + uint ExpandedKey2[40]; ulong State[25]; uint4 text; @@ -632,7 +632,7 @@ __kernel void cn2(__global uint4 *Scratchpad, __global ulong *states, __global u { text ^= Scratchpad[IDX((i << 3) + get_local_id(1))]; - #pragma unroll + #pragma unroll 10 for(int j = 0; j < 10; ++j) text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]); } -- cgit v1.1 From 737185ee82bae05953680b1f4c4cdf8646c51b5a Mon Sep 17 00:00:00 2001 From: psychocrypt Date: Sat, 17 Feb 2018 20:51:55 +0100 Subject: AMD: `mem_chunk`and new `strided_index` - add new option for `strided_index` - add additional option if `strided_index == 2` to controll the memory chunk with --- xmrstak/backend/amd/amd_gpu/gpu.cpp | 11 +++++-- xmrstak/backend/amd/amd_gpu/gpu.hpp | 1 + xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl | 35 +++++++++++++++++------ xmrstak/backend/amd/autoAdjust.hpp | 2 +- xmrstak/backend/amd/config.tpl | 11 +++++-- xmrstak/backend/amd/jconf.cpp | 32 +++++++++++++++++---- xmrstak/backend/amd/jconf.hpp | 3 +- xmrstak/backend/amd/minethd.cpp | 1 + 8 files changed, 75 insertions(+), 21 deletions(-) (limited to 'xmrstak/backend/amd') diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp index c39c567..054ffc4 100644 --- a/xmrstak/backend/amd/amd_gpu/gpu.cpp +++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp @@ -332,8 +332,8 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ char options[256]; snprintf(options, sizeof(options), - "-DITERATIONS=%d -DMASK=%d -DWORKSIZE=%llu -DSTRIDED_INDEX=%d", - hasIterations, threadMemMask, int_port(ctx->workSize), ctx->stridedIndex ? 1 : 0); + "-DITERATIONS=%d -DMASK=%d -DWORKSIZE=%llu -DSTRIDED_INDEX=%d -DMEM_CHUNK=%d", + hasIterations, threadMemMask, int_port(ctx->workSize), ctx->stridedIndex, int(1u<memChunk)); ret = clBuildProgram(ctx->Program, 1, &ctx->DeviceID, options, NULL, NULL); if(ret != CL_SUCCESS) { @@ -696,6 +696,13 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx) for(int i = 0; i < num_gpus; ++i) { + if(ctx[i].stridedIndex == 2 && (ctx[i].rawIntensity % ctx[i].workSize) != 0) + { + size_t reduced_intensity = (ctx[i].rawIntensity / ctx[i].workSize) * ctx[i].workSize; + ctx[i].rawIntensity = reduced_intensity; + printer::inst()->print_msg(L0, "WARNING AMD: gpu %d intensity is not a multiple of 'worksize', auto reduce intensity to %d", ctx[i].deviceIdx, int(reduced_intensity)); + } + if((ret = InitOpenCLGpu(opencl_ctx, &ctx[i], source_code.c_str())) != ERR_SUCCESS) { return ret; diff --git a/xmrstak/backend/amd/amd_gpu/gpu.hpp b/xmrstak/backend/amd/amd_gpu/gpu.hpp index c17bac1..abfad5c 100644 --- a/xmrstak/backend/amd/amd_gpu/gpu.hpp +++ b/xmrstak/backend/amd/amd_gpu/gpu.hpp @@ -25,6 +25,7 @@ struct GpuContext size_t rawIntensity; size_t workSize; int stridedIndex; + int memChunk; /*Output vars*/ cl_device_id DeviceID; diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl index ec05712..2514092 100644 --- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl +++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl @@ -411,12 +411,23 @@ void AESExpandKey256(uint *keybuf) } } +#define MEM_CHUNK (1<<4) + #if(STRIDED_INDEX==0) # define IDX(x) (x) -#else +#elif(STRIDED_INDEX==1) # define IDX(x) ((x) * (Threads)) +#elif(STRIDED_INDEX==2) +# define IDX(x) (((x) % MEM_CHUNK) + ((x) / MEM_CHUNK) * WORKSIZE * MEM_CHUNK) #endif +inline ulong getIdx() +{ +#if(STRIDED_INDEX==0 || STRIDED_INDEX==1 || STRIDED_INDEX==2) + return get_global_id(0) - get_global_offset(0); +#endif +} + __attribute__((reqd_work_group_size(WORKSIZE, 8, 1))) __kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ulong *states, ulong Threads) { @@ -425,7 +436,7 @@ __kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ul __local uint AES0[256], AES1[256], AES2[256], AES3[256]; uint4 text; - const ulong gIdx = get_global_id(0) - get_global_offset(0); + const ulong gIdx = getIdx(); for(int i = get_local_id(1) * WORKSIZE + get_local_id(0); i < 256; @@ -439,7 +450,7 @@ __kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ul } barrier(CLK_LOCAL_MEM_FENCE); - + // do not use early return here if(gIdx < Threads) { @@ -447,8 +458,10 @@ __kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ul #if(STRIDED_INDEX==0) Scratchpad += gIdx * (ITERATIONS >> 2); -#else +#elif(STRIDED_INDEX==1) Scratchpad += gIdx; +#elif(STRIDED_INDEX==2) + Scratchpad += get_group_id(0) * (ITERATIONS >> 2) * WORKSIZE + MEM_CHUNK * get_local_id(0); #endif ((ulong8 *)State)[0] = vload8(0, input); @@ -509,7 +522,7 @@ __kernel void cn1(__global uint4 *Scratchpad, __global ulong *states, ulong Thre ulong a[2], b[2]; __local uint AES0[256], AES1[256], AES2[256], AES3[256]; - const ulong gIdx = get_global_id(0) - get_global_offset(0); + const ulong gIdx = getIdx(); for(int i = get_local_id(0); i < 256; i += WORKSIZE) { @@ -523,15 +536,17 @@ __kernel void cn1(__global uint4 *Scratchpad, __global ulong *states, ulong Thre barrier(CLK_LOCAL_MEM_FENCE); uint4 b_x; - + // do not use early return here if(gIdx < Threads) { states += 25 * gIdx; #if(STRIDED_INDEX==0) Scratchpad += gIdx * (ITERATIONS >> 2); -#else +#elif(STRIDED_INDEX==1) Scratchpad += gIdx; +#elif(STRIDED_INDEX==2) + Scratchpad += get_group_id(0) * (ITERATIONS >> 2) * WORKSIZE + MEM_CHUNK * get_local_id(0); #endif a[0] = states[0] ^ states[4]; @@ -582,7 +597,7 @@ __kernel void cn2(__global uint4 *Scratchpad, __global ulong *states, __global u ulong State[25]; uint4 text; - const ulong gIdx = get_global_id(0) - get_global_offset(0); + const ulong gIdx = getIdx(); for(int i = get_local_id(1) * WORKSIZE + get_local_id(0); i < 256; @@ -603,8 +618,10 @@ __kernel void cn2(__global uint4 *Scratchpad, __global ulong *states, __global u states += 25 * gIdx; #if(STRIDED_INDEX==0) Scratchpad += gIdx * (ITERATIONS >> 2); -#else +#elif(STRIDED_INDEX==1) Scratchpad += gIdx; +#elif(STRIDED_INDEX==2) + Scratchpad += get_group_id(0) * (ITERATIONS >> 2) * WORKSIZE + MEM_CHUNK * get_local_id(0); #endif #if defined(__Tahiti__) || defined(__Pitcairn__) diff --git a/xmrstak/backend/amd/autoAdjust.hpp b/xmrstak/backend/amd/autoAdjust.hpp index afedb5c..b88d3ee 100644 --- a/xmrstak/backend/amd/autoAdjust.hpp +++ b/xmrstak/backend/amd/autoAdjust.hpp @@ -143,7 +143,7 @@ private: // set 8 threads per block (this is a good value for the most gpus) conf += std::string(" { \"index\" : ") + std::to_string(ctx.deviceIdx) + ",\n" + " \"intensity\" : " + std::to_string(intensity) + ", \"worksize\" : " + std::to_string(8) + ",\n" + - " \"affine_to_cpu\" : false, \"strided_index\" : true\n" + " \"affine_to_cpu\" : false, \"strided_index\" : 1, \"mem_chunk\" : 4\n" " },\n"; } else diff --git a/xmrstak/backend/amd/config.tpl b/xmrstak/backend/amd/config.tpl index 25b75a1..8914130 100644 --- a/xmrstak/backend/amd/config.tpl +++ b/xmrstak/backend/amd/config.tpl @@ -6,11 +6,16 @@ R"===( * worksize - Number of local GPU threads (nothing to do with CPU threads) * affine_to_cpu - This will affine the thread to a CPU. This can make a GPU miner play along nicer with a CPU miner. * strided_index - switch memory pattern used for the scratch pad memory - * true = use 16byte contiguous memory per thread, the next memory block has offset of intensity blocks - * false = use a contiguous block of memory per thread + * 2 = chunked memory, chunk size is controlled by 'mem_chunk' + * required: intensity must be a multiple of worksize + * 1 or true = use 16byte contiguous memory per thread, the next memory block has offset of intensity blocks + * 0 or false = use a contiguous block of memory per thread + * mem_chunk - range 0 to 18: set the number of elements (16byte) per chunk + * this value is only used if 'strided_index' == 2 + * element count is computed with the equation: 2 to the power of 'mem_chunk' e.g. 4 means a chunk of 16 elements(256byte) * "gpu_threads_conf" : * [ - * { "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false, "strided_index" : true }, + * { "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false, "strided_index" : true, "mem_chunk" : 4 }, * ], * If you do not wish to mine with your AMD GPU(s) then use: * "gpu_threads_conf" : diff --git a/xmrstak/backend/amd/jconf.cpp b/xmrstak/backend/amd/jconf.cpp index f126342..22381e1 100644 --- a/xmrstak/backend/amd/jconf.cpp +++ b/xmrstak/backend/amd/jconf.cpp @@ -106,14 +106,15 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg) if(!oThdConf.IsObject()) return false; - const Value *idx, *intensity, *w_size, *aff, *stridedIndex; + const Value *idx, *intensity, *w_size, *aff, *stridedIndex, *memChunk; idx = GetObjectMember(oThdConf, "index"); intensity = GetObjectMember(oThdConf, "intensity"); w_size = GetObjectMember(oThdConf, "worksize"); aff = GetObjectMember(oThdConf, "affine_to_cpu"); stridedIndex = GetObjectMember(oThdConf, "strided_index"); + memChunk = GetObjectMember(oThdConf, "mem_chunk"); - if(idx == nullptr || intensity == nullptr || w_size == nullptr || aff == nullptr || stridedIndex == nullptr) + if(idx == nullptr || intensity == nullptr || w_size == nullptr || aff == nullptr || stridedIndex == nullptr || memChunk == nullptr) return false; if(!idx->IsUint64() || !intensity->IsUint64() || !w_size->IsUint64()) @@ -122,13 +123,34 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg) if(!aff->IsUint64() && !aff->IsBool()) return false; - if(!stridedIndex->IsBool()) + if(!stridedIndex->IsBool() && !stridedIndex->IsNumber()) + { + printer::inst()->print_msg(L0, "ERROR: strided_index must be a bool or a number"); + return false; + } + + if(stridedIndex->IsBool()) + cfg.stridedIndex = stridedIndex->GetBool() ? 1 : 0; + else + cfg.stridedIndex = (int)stridedIndex->GetInt64(); + + if(cfg.stridedIndex > 2) + { + printer::inst()->print_msg(L0, "ERROR: strided_index must be smaller than 2"); return false; + } + + cfg.memChunk = (int)memChunk->GetInt64(); + + if(!idx->IsUint64() || cfg.memChunk > 18 ) + { + printer::inst()->print_msg(L0, "ERROR: mem_chunk must be smaller than 18"); + return false; + } cfg.index = idx->GetUint64(); - cfg.intensity = intensity->GetUint64(); cfg.w_size = w_size->GetUint64(); - cfg.stridedIndex = stridedIndex->GetBool(); + cfg.intensity = intensity->GetUint64(); if(aff->IsNumber()) cfg.cpu_aff = aff->GetInt64(); diff --git a/xmrstak/backend/amd/jconf.hpp b/xmrstak/backend/amd/jconf.hpp index ee1882a..91e5d0d 100644 --- a/xmrstak/backend/amd/jconf.hpp +++ b/xmrstak/backend/amd/jconf.hpp @@ -26,7 +26,8 @@ public: size_t intensity; size_t w_size; long long cpu_aff; - bool stridedIndex; + int stridedIndex; + int memChunk; }; size_t GetThreadCount(); diff --git a/xmrstak/backend/amd/minethd.cpp b/xmrstak/backend/amd/minethd.cpp index 422c28c..ca5e163 100644 --- a/xmrstak/backend/amd/minethd.cpp +++ b/xmrstak/backend/amd/minethd.cpp @@ -97,6 +97,7 @@ bool minethd::init_gpus() vGpuData[i].rawIntensity = cfg.intensity; vGpuData[i].workSize = cfg.w_size; vGpuData[i].stridedIndex = cfg.stridedIndex; + vGpuData[i].memChunk = cfg.memChunk; } return InitOpenCL(vGpuData.data(), n, jconf::inst()->GetPlatformIdx()) == ERR_SUCCESS; -- cgit v1.1 From 7b3929dbeff5254a77dcd2c6be89324a11adf4c8 Mon Sep 17 00:00:00 2001 From: psychocrypt Date: Mon, 19 Feb 2018 21:52:18 +0100 Subject: add Mesa OpenCL support allow usage of Mesa OpenCl --- xmrstak/backend/amd/amd_gpu/gpu.cpp | 24 ++++++++++++++++++++--- xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl | 5 +++++ 2 files changed, 26 insertions(+), 3 deletions(-) (limited to 'xmrstak/backend/amd') diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp index 054ffc4..af20dce 100644 --- a/xmrstak/backend/amd/amd_gpu/gpu.cpp +++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp @@ -549,6 +549,8 @@ int getAMDPlatformIdx() clStatus = clGetPlatformIDs(numPlatforms, platforms, NULL); int platformIndex = -1; + // Mesa OpenCL is the fallback if no AMD or Apple OpenCL is found + int mesaPlatform = -1; if(clStatus == CL_SUCCESS) { @@ -559,13 +561,29 @@ int getAMDPlatformIdx() clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, infoSize, platformNameVec.data(), NULL); std::string platformName(platformNameVec.data()); - if( platformName.find("Advanced Micro Devices") != std::string::npos || platformName.find("Apple") != std::string::npos) + if( platformName.find("Advanced Micro Devices") != std::string::npos || + platformName.find("Apple") != std::string::npos || + platformName.find("Mesa") != std::string::npos + ) { - platformIndex = i; + printer::inst()->print_msg(L0,"Found AMD platform index id = %i, name = %s",i , platformName.c_str()); - break; + if(platformName.find("Mesa") != std::string::npos) + mesaPlatform = i; + else + { + // exit if AMD or Apple platform is found + platformIndex = i; + break; + } } } + // fall back to Mesa OpenCL + if(platformIndex == -1 && mesaPlatform != -1) + { + printer::inst()->print_msg(L0,"No AMD platform found select Mesa as OpenCL platform"); + platformIndex = mesaPlatform; + } } else printer::inst()->print_msg(L1,"WARNING: %s when calling clGetPlatformIDs for platform information.", err_to_str(clStatus)); diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl index 53299ec..9ff5bf7 100644 --- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl +++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl @@ -14,6 +14,11 @@ R"===( * along with this program. If not, see . */ +/* For Mesa clover support */ +#ifdef cl_clang_storage_class_specifiers +# pragma OPENCL EXTENSION cl_clang_storage_class_specifiers : enable +#endif + #ifdef cl_amd_media_ops #pragma OPENCL EXTENSION cl_amd_media_ops : enable #else -- cgit v1.1 From cff6b6cbfbb3da44d85753885466de5122e20472 Mon Sep 17 00:00:00 2001 From: psychocrypt Date: Mon, 12 Feb 2018 20:39:49 +0100 Subject: add OpenCL compatibility mode - add new option `comp_mode` to the amd config - disable `if guards` within opencl kernel if `comp_mode : false` --- xmrstak/backend/amd/amd_gpu/gpu.cpp | 17 +++++++++++------ xmrstak/backend/amd/amd_gpu/gpu.hpp | 1 + xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl | 19 ++++++++++++++++--- xmrstak/backend/amd/autoAdjust.hpp | 3 ++- xmrstak/backend/amd/config.tpl | 12 ++++++++---- xmrstak/backend/amd/jconf.cpp | 10 ++++++++-- xmrstak/backend/amd/jconf.hpp | 1 + xmrstak/backend/amd/minethd.cpp | 1 + 8 files changed, 48 insertions(+), 16 deletions(-) (limited to 'xmrstak/backend/amd') diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp index 054ffc4..2f16b67 100644 --- a/xmrstak/backend/amd/amd_gpu/gpu.cpp +++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp @@ -332,8 +332,8 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ char options[256]; snprintf(options, sizeof(options), - "-DITERATIONS=%d -DMASK=%d -DWORKSIZE=%llu -DSTRIDED_INDEX=%d -DMEM_CHUNK=%d", - hasIterations, threadMemMask, int_port(ctx->workSize), ctx->stridedIndex, int(1u<memChunk)); + "-DITERATIONS=%d -DMASK=%d -DWORKSIZE=%llu -DSTRIDED_INDEX=%d -DMEM_CHUNK=%d -DCOMP_MODE=%d", + hasIterations, threadMemMask, int_port(ctx->workSize), ctx->stridedIndex, int(1u<memChunk), ctx->compMode ? 1 : 0); ret = clBuildProgram(ctx->Program, 1, &ctx->DeviceID, options, NULL, NULL); if(ret != CL_SUCCESS) { @@ -873,10 +873,15 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput) size_t g_intensity = ctx->rawIntensity; size_t w_size = ctx->workSize; - // round up to next multiple of w_size - size_t g_thd = ((g_intensity + w_size - 1u) / w_size) * w_size; - // number of global threads must be a multiple of the work group size (w_size) - assert(g_thd%w_size == 0); + size_t g_thd = g_intensity; + + if(ctx->compMode) + { + // round up to next multiple of w_size + size_t g_thd = ((g_intensity + w_size - 1u) / w_size) * w_size; + // number of global threads must be a multiple of the work group size (w_size) + assert(g_thd%w_size == 0); + } for(int i = 2; i < 6; ++i) { diff --git a/xmrstak/backend/amd/amd_gpu/gpu.hpp b/xmrstak/backend/amd/amd_gpu/gpu.hpp index abfad5c..8fb7168 100644 --- a/xmrstak/backend/amd/amd_gpu/gpu.hpp +++ b/xmrstak/backend/amd/amd_gpu/gpu.hpp @@ -26,6 +26,7 @@ struct GpuContext size_t workSize; int stridedIndex; int memChunk; + int compMode; /*Output vars*/ cl_device_id DeviceID; diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl index 53299ec..4bac68c 100644 --- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl +++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl @@ -451,8 +451,10 @@ __kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ul barrier(CLK_LOCAL_MEM_FENCE); +#if(COMP_MODE==1) // do not use early return here if(gIdx < Threads) +#endif { states += 25 * gIdx; @@ -483,9 +485,10 @@ __kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ul } mem_fence(CLK_GLOBAL_MEM_FENCE); - +#if(COMP_MODE==1) // do not use early return here if(gIdx < Threads) +#endif { #pragma unroll for(int i = 0; i < 25; ++i) states[i] = State[i]; @@ -499,9 +502,10 @@ __kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ul } mem_fence(CLK_LOCAL_MEM_FENCE); - +#if(COMP_MODE==1) // do not use early return here if(gIdx < Threads) +#endif { #pragma unroll 2 for(int i = 0; i < (ITERATIONS >> 5); ++i) @@ -536,9 +540,10 @@ __kernel void cn1(__global uint4 *Scratchpad, __global ulong *states, ulong Thre barrier(CLK_LOCAL_MEM_FENCE); uint4 b_x; - +#if(COMP_MODE==1) // do not use early return here if(gIdx < Threads) +#endif { states += 25 * gIdx; #if(STRIDED_INDEX==0) @@ -559,8 +564,10 @@ __kernel void cn1(__global uint4 *Scratchpad, __global ulong *states, ulong Thre mem_fence(CLK_LOCAL_MEM_FENCE); +#if(COMP_MODE==1) // do not use early return here if(gIdx < Threads) +#endif { #pragma unroll 8 for(int i = 0; i < ITERATIONS; ++i) @@ -612,8 +619,10 @@ __kernel void cn2(__global uint4 *Scratchpad, __global ulong *states, __global u barrier(CLK_LOCAL_MEM_FENCE); +#if(COMP_MODE==1) // do not use early return here if(gIdx < Threads) +#endif { states += 25 * gIdx; #if(STRIDED_INDEX==0) @@ -641,8 +650,10 @@ __kernel void cn2(__global uint4 *Scratchpad, __global ulong *states, __global u barrier(CLK_LOCAL_MEM_FENCE); +#if(COMP_MODE==1) // do not use early return here if(gIdx < Threads) +#endif { #pragma unroll 2 for(int i = 0; i < (ITERATIONS >> 5); ++i) @@ -659,8 +670,10 @@ __kernel void cn2(__global uint4 *Scratchpad, __global ulong *states, __global u barrier(CLK_GLOBAL_MEM_FENCE); +#if(COMP_MODE==1) // do not use early return here if(gIdx < Threads) +#endif { if(!get_local_id(1)) { diff --git a/xmrstak/backend/amd/autoAdjust.hpp b/xmrstak/backend/amd/autoAdjust.hpp index b88d3ee..8d60b94 100644 --- a/xmrstak/backend/amd/autoAdjust.hpp +++ b/xmrstak/backend/amd/autoAdjust.hpp @@ -143,7 +143,8 @@ private: // set 8 threads per block (this is a good value for the most gpus) conf += std::string(" { \"index\" : ") + std::to_string(ctx.deviceIdx) + ",\n" + " \"intensity\" : " + std::to_string(intensity) + ", \"worksize\" : " + std::to_string(8) + ",\n" + - " \"affine_to_cpu\" : false, \"strided_index\" : 1, \"mem_chunk\" : 4\n" + " \"affine_to_cpu\" : false, \"strided_index\" : 1, \"mem_chunk\" : 4,\n" + " \"comp_mode\" : true\n" + " },\n"; } else diff --git a/xmrstak/backend/amd/config.tpl b/xmrstak/backend/amd/config.tpl index 8914130..84251c7 100644 --- a/xmrstak/backend/amd/config.tpl +++ b/xmrstak/backend/amd/config.tpl @@ -1,9 +1,9 @@ R"===( /* * GPU configuration. You should play around with intensity and worksize as the fastest settings will vary. - * index - GPU index number usually starts from 0 - * intensity - Number of parallel GPU threads (nothing to do with CPU threads) - * worksize - Number of local GPU threads (nothing to do with CPU threads) + * index - GPU index number usually starts from 0 + * intensity - Number of parallel GPU threads (nothing to do with CPU threads) + * worksize - Number of local GPU threads (nothing to do with CPU threads) * affine_to_cpu - This will affine the thread to a CPU. This can make a GPU miner play along nicer with a CPU miner. * strided_index - switch memory pattern used for the scratch pad memory * 2 = chunked memory, chunk size is controlled by 'mem_chunk' @@ -13,9 +13,13 @@ R"===( * mem_chunk - range 0 to 18: set the number of elements (16byte) per chunk * this value is only used if 'strided_index' == 2 * element count is computed with the equation: 2 to the power of 'mem_chunk' e.g. 4 means a chunk of 16 elements(256byte) + * comp_mode - Compatibility enable/disable the automatic guard around compute kernel which allows + * to use a intensity which is not the multiple of the worksize. + * If you set false and the intensity is not multiple of the worksize the miner can crash: + * in this case set the intensity to a multiple of the worksize or activate comp_mode. * "gpu_threads_conf" : * [ - * { "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false, "strided_index" : true, "mem_chunk" : 4 }, + * { "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false, "strided_index" : true, "mem_chunk" : 4, "comp_mode" : true }, * ], * If you do not wish to mine with your AMD GPU(s) then use: * "gpu_threads_conf" : diff --git a/xmrstak/backend/amd/jconf.cpp b/xmrstak/backend/amd/jconf.cpp index 22381e1..93ba709 100644 --- a/xmrstak/backend/amd/jconf.cpp +++ b/xmrstak/backend/amd/jconf.cpp @@ -106,15 +106,17 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg) if(!oThdConf.IsObject()) return false; - const Value *idx, *intensity, *w_size, *aff, *stridedIndex, *memChunk; + const Value *idx, *intensity, *w_size, *aff, *stridedIndex, *memChunk, *compMode; idx = GetObjectMember(oThdConf, "index"); intensity = GetObjectMember(oThdConf, "intensity"); w_size = GetObjectMember(oThdConf, "worksize"); aff = GetObjectMember(oThdConf, "affine_to_cpu"); stridedIndex = GetObjectMember(oThdConf, "strided_index"); memChunk = GetObjectMember(oThdConf, "mem_chunk"); + compMode = GetObjectMember(oThdConf, "comp_mode"); - if(idx == nullptr || intensity == nullptr || w_size == nullptr || aff == nullptr || stridedIndex == nullptr || memChunk == nullptr) + if(idx == nullptr || intensity == nullptr || w_size == nullptr || aff == nullptr || memChunk == nullptr || + stridedIndex == nullptr || compMode == nullptr) return false; if(!idx->IsUint64() || !intensity->IsUint64() || !w_size->IsUint64()) @@ -148,9 +150,13 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg) return false; } + if(!compMode->IsBool()) + return false; + cfg.index = idx->GetUint64(); cfg.w_size = w_size->GetUint64(); cfg.intensity = intensity->GetUint64(); + cfg.compMode = compMode->GetBool(); if(aff->IsNumber()) cfg.cpu_aff = aff->GetInt64(); diff --git a/xmrstak/backend/amd/jconf.hpp b/xmrstak/backend/amd/jconf.hpp index 91e5d0d..580b69f 100644 --- a/xmrstak/backend/amd/jconf.hpp +++ b/xmrstak/backend/amd/jconf.hpp @@ -28,6 +28,7 @@ public: long long cpu_aff; int stridedIndex; int memChunk; + bool compMode; }; size_t GetThreadCount(); diff --git a/xmrstak/backend/amd/minethd.cpp b/xmrstak/backend/amd/minethd.cpp index ca5e163..8dfbce5 100644 --- a/xmrstak/backend/amd/minethd.cpp +++ b/xmrstak/backend/amd/minethd.cpp @@ -98,6 +98,7 @@ bool minethd::init_gpus() vGpuData[i].workSize = cfg.w_size; vGpuData[i].stridedIndex = cfg.stridedIndex; vGpuData[i].memChunk = cfg.memChunk; + vGpuData[i].compMode = cfg.compMode; } return InitOpenCL(vGpuData.data(), n, jconf::inst()->GetPlatformIdx()) == ERR_SUCCESS; -- cgit v1.1 From c975def43e6e8f5a776b872ec089326fd319c0d7 Mon Sep 17 00:00:00 2001 From: psychocrypt Date: Mon, 19 Feb 2018 22:41:08 +0100 Subject: fix compile with OpenCL 1.1 guard error types those are only defined in OpenCL >1.1 --- xmrstak/backend/amd/amd_gpu/gpu.cpp | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'xmrstak/backend/amd') diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp index 054ffc4..f8f8a6e 100644 --- a/xmrstak/backend/amd/amd_gpu/gpu.cpp +++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp @@ -84,6 +84,7 @@ const char* err_to_str(cl_int ret) return "CL_MISALIGNED_SUB_BUFFER_OFFSET"; case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST: return "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST"; +#ifdef CL_VERSION_1_2 case CL_COMPILE_PROGRAM_FAILURE: return "CL_COMPILE_PROGRAM_FAILURE"; case CL_LINKER_NOT_AVAILABLE: @@ -94,6 +95,7 @@ const char* err_to_str(cl_int ret) return "CL_DEVICE_PARTITION_FAILED"; case CL_KERNEL_ARG_INFO_NOT_AVAILABLE: return "CL_KERNEL_ARG_INFO_NOT_AVAILABLE"; +#endif case CL_INVALID_VALUE: return "CL_INVALID_VALUE"; case CL_INVALID_DEVICE_TYPE: @@ -164,6 +166,7 @@ const char* err_to_str(cl_int ret) return "CL_INVALID_GLOBAL_WORK_SIZE"; case CL_INVALID_PROPERTY: return "CL_INVALID_PROPERTY"; +#ifdef CL_VERSION_1_2 case CL_INVALID_IMAGE_DESCRIPTOR: return "CL_INVALID_IMAGE_DESCRIPTOR"; case CL_INVALID_COMPILER_OPTIONS: @@ -172,6 +175,7 @@ const char* err_to_str(cl_int ret) return "CL_INVALID_LINKER_OPTIONS"; case CL_INVALID_DEVICE_PARTITION_COUNT: return "CL_INVALID_DEVICE_PARTITION_COUNT"; +#endif #if defined(CL_VERSION_2_0) && !defined(CONF_ENFORCE_OpenCL_1_2) case CL_INVALID_PIPE_SIZE: return "CL_INVALID_PIPE_SIZE"; -- cgit v1.1 From dc4e3793454a8ac7ae85704e7997d878c378b0aa Mon Sep 17 00:00:00 2001 From: psychocrypt Date: Wed, 21 Feb 2018 22:23:00 +0100 Subject: fix broken memchunk feature fix double definition of define `MEM_CHUNK` --- xmrstak/backend/amd/amd_gpu/gpu.cpp | 2 +- xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl | 2 +- xmrstak/backend/amd/autoAdjust.hpp | 2 +- xmrstak/backend/amd/config.tpl | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) (limited to 'xmrstak/backend/amd') diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp index 8c4a40d..95d30f7 100644 --- a/xmrstak/backend/amd/amd_gpu/gpu.cpp +++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp @@ -336,7 +336,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ char options[256]; snprintf(options, sizeof(options), - "-DITERATIONS=%d -DMASK=%d -DWORKSIZE=%llu -DSTRIDED_INDEX=%d -DMEM_CHUNK=%d -DCOMP_MODE=%d", + "-DITERATIONS=%d -DMASK=%d -DWORKSIZE=%llu -DSTRIDED_INDEX=%d -DMEM_CHUNK_EXPONENT=%d -DCOMP_MODE=%d", hasIterations, threadMemMask, int_port(ctx->workSize), ctx->stridedIndex, int(1u<memChunk), ctx->compMode ? 1 : 0); ret = clBuildProgram(ctx->Program, 1, &ctx->DeviceID, options, NULL, NULL); if(ret != CL_SUCCESS) diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl index dbe8991..9383b04 100644 --- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl +++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl @@ -416,7 +416,7 @@ void AESExpandKey256(uint *keybuf) } } -#define MEM_CHUNK (1<<4) +#define MEM_CHUNK (1< Date: Sat, 3 Mar 2018 22:39:14 +0100 Subject: add OpenCL compiler cache Reduce OpenCL start time by using a self made compiler cache. - store compiled OpenCL binary - load OpenCl binary if available --- xmrstak/backend/amd/amd_gpu/gpu.cpp | 211 +++++++++++++++++++++++++++++++----- 1 file changed, 182 insertions(+), 29 deletions(-) (limited to 'xmrstak/backend/amd') diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp index 95d30f7..79afa00 100644 --- a/xmrstak/backend/amd/amd_gpu/gpu.cpp +++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp @@ -15,6 +15,7 @@ #include "xmrstak/backend/cryptonight.hpp" #include "xmrstak/jconf.hpp" +#include "xmrstak/picosha2/picosha2.hpp" #include #include @@ -25,8 +26,41 @@ #include #include +#include +#include +#include +#include +#include + +#if defined _MSC_VER +#include +#elif defined __GNUC__ +#include +#include +#endif + + + #ifdef _WIN32 #include +#include + +static inline void create_directory(std::string dirname) +{ + _mkdir(dirname.data()); +} + +static inline std::string get_home() +{ + char path[MAX_PATH + 1]; + // get folder "appdata\local" + if (SHGetSpecialFolderPathA(HWND_DESKTOP, path, CSIDL_LOCAL_APPDATA, FALSE)) + { + return path; + } + else + return "."; +} static inline void port_sleep(size_t sec) { @@ -34,6 +68,22 @@ static inline void port_sleep(size_t sec) } #else #include +#include + +static inline void create_directory(std::string dirname) +{ + mkdir(dirname.data(), 0744); +} + +static inline std::string get_home() +{ + const char *home = "."; + + if ((home = getenv("HOME")) == nullptr) + home = getpwuid(getuid())->pw_dir; + + return home; +} static inline void port_sleep(size_t sec) { @@ -327,57 +377,157 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ return ERR_OCL_API; } - ctx->Program = clCreateProgramWithSource(opencl_ctx, 1, (const char**)&source_code, NULL, &ret); - if(ret != CL_SUCCESS) + std::vector devNameVec(1024); + if((ret = clGetDeviceInfo(ctx->DeviceID, CL_DEVICE_NAME, devNameVec.size(), devNameVec.data(), NULL)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clCreateProgramWithSource on the contents of cryptonight.cl", err_to_str(ret)); + printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get CL_DEVICE_NAME for device %u.", err_to_str(ret),ctx->deviceIdx ); return ERR_OCL_API; } char options[256]; - snprintf(options, sizeof(options), + snprintf(options, sizeof(options), "-DITERATIONS=%d -DMASK=%d -DWORKSIZE=%llu -DSTRIDED_INDEX=%d -DMEM_CHUNK_EXPONENT=%d -DCOMP_MODE=%d", hasIterations, threadMemMask, int_port(ctx->workSize), ctx->stridedIndex, int(1u<memChunk), ctx->compMode ? 1 : 0); - ret = clBuildProgram(ctx->Program, 1, &ctx->DeviceID, options, NULL, NULL); - if(ret != CL_SUCCESS) - { - size_t len; - printer::inst()->print_msg(L1,"Error %s when calling clBuildProgram.", err_to_str(ret)); - if((ret = clGetProgramBuildInfo(ctx->Program, ctx->DeviceID, CL_PROGRAM_BUILD_LOG, 0, NULL, &len)) != CL_SUCCESS) + /* create a hash for the compile time cache + * used data: + * - source code + * - device name + * - compile paramater + */ + std::string src_str(source_code); + src_str += options; + src_str += devNameVec.data(); + std::string hash_hex_str; + picosha2::hash256_hex_string(src_str, hash_hex_str); + + std::string cache_file = get_home() + "/.openclcache/" + hash_hex_str + ".openclbin"; + std::ifstream clBinFile(cache_file, std::ofstream::in | std::ofstream::binary); + if(!clBinFile.good()) + { + printer::inst()->print_msg(L1,"WARNING: OpenCL device %u - OpenCL binary %s not found.",ctx->deviceIdx, cache_file.c_str()); + ctx->Program = clCreateProgramWithSource(opencl_ctx, 1, (const char**)&source_code, NULL, &ret); + if(ret != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clGetProgramBuildInfo for length of build log output.", err_to_str(ret)); + printer::inst()->print_msg(L1,"Error %s when calling clCreateProgramWithSource on the OpenCL miner code", err_to_str(ret)); return ERR_OCL_API; } - char* BuildLog = (char*)malloc(len + 1); - BuildLog[0] = '\0'; - - if((ret = clGetProgramBuildInfo(ctx->Program, ctx->DeviceID, CL_PROGRAM_BUILD_LOG, len, BuildLog, NULL)) != CL_SUCCESS) + ret = clBuildProgram(ctx->Program, 1, &ctx->DeviceID, options, NULL, NULL); + if(ret != CL_SUCCESS) { + size_t len; + printer::inst()->print_msg(L1,"Error %s when calling clBuildProgram.", err_to_str(ret)); + + if((ret = clGetProgramBuildInfo(ctx->Program, ctx->DeviceID, CL_PROGRAM_BUILD_LOG, 0, NULL, &len)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clGetProgramBuildInfo for length of build log output.", err_to_str(ret)); + return ERR_OCL_API; + } + + char* BuildLog = (char*)malloc(len + 1); + BuildLog[0] = '\0'; + + if((ret = clGetProgramBuildInfo(ctx->Program, ctx->DeviceID, CL_PROGRAM_BUILD_LOG, len, BuildLog, NULL)) != CL_SUCCESS) + { + free(BuildLog); + printer::inst()->print_msg(L1,"Error %s when calling clGetProgramBuildInfo for build log.", err_to_str(ret)); + return ERR_OCL_API; + } + + printer::inst()->print_str("Build log:\n"); + std::cerr<print_msg(L1,"Error %s when calling clGetProgramBuildInfo for build log.", err_to_str(ret)); return ERR_OCL_API; } - - printer::inst()->print_str("Build log:\n"); - std::cerr<Program, CL_PROGRAM_NUM_DEVICES, sizeof(cl_uint), &num_devices,NULL); + + + std::vector devices_ids(num_devices); + clGetProgramInfo(ctx->Program, CL_PROGRAM_DEVICES, sizeof(cl_device_id)* devices_ids.size(), devices_ids.data(),NULL); + int dev_id = 0; + /* Search for the gpu within the program context. + * The id can be different to ctx->DeviceID. + */ + for(auto & ocl_device : devices_ids) + { + if(ocl_device == ctx->DeviceID) + break; + dev_id++; + } + + cl_build_status status; + do + { + if((ret = clGetProgramBuildInfo(ctx->Program, ctx->DeviceID, CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &status, NULL)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clGetProgramBuildInfo for status of build.", err_to_str(ret)); + return ERR_OCL_API; + } + port_sleep(1); + } + while(status == CL_BUILD_IN_PROGRESS); + + std::vector binary_sizes(num_devices); + clGetProgramInfo (ctx->Program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t) * binary_sizes.size(), binary_sizes.data(), NULL); + + std::vector all_programs(num_devices); + std::vector> program_storage; - cl_build_status status; - do + int p_id = 0; + size_t mem_size = 0; + // create memory structure to query all OpenCL program binaries + for(auto & p : all_programs) + { + program_storage.emplace_back(std::vector(binary_sizes[p_id])); + all_programs[p_id] = program_storage[p_id].data(); + mem_size += binary_sizes[p_id]; + p_id++; + } + + if( ret = clGetProgramInfo(ctx->Program, CL_PROGRAM_BINARIES, num_devices * sizeof(char*), all_programs.data(),NULL) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clGetProgramInfo.", err_to_str(ret)); + return ERR_OCL_API; + } + + std::ofstream file_stream; + std::cout<print_msg(L1, "OpenCL device %u - OpenCL binary file stored in file %s.",ctx->deviceIdx, cache_file.c_str()); + } + else { - if((ret = clGetProgramBuildInfo(ctx->Program, ctx->DeviceID, CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &status, NULL)) != CL_SUCCESS) + printer::inst()->print_msg(L1, "OpenCL device %u - Load OpenCL binary file %s",ctx->deviceIdx, cache_file.c_str()); + std::ostringstream ss; + ss << clBinFile.rdbuf(); + std::string s = ss.str(); + + size_t bin_size = s.size(); + auto data_ptr = s.data(); + + cl_int clStatus; + ctx->Program = clCreateProgramWithBinary( + opencl_ctx, 1, &ctx->DeviceID, &bin_size, + (const unsigned char **)&data_ptr, &clStatus, &ret + ); + if(ret != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clGetProgramBuildInfo for status of build.", err_to_str(ret)); + printer::inst()->print_msg(L1,"Error %s when calling clCreateProgramWithBinary. Try to delete file %s", err_to_str(ret), cache_file.c_str()); + return ERR_OCL_API; + } + ret = clBuildProgram(ctx->Program, 1, &ctx->DeviceID, NULL, NULL, NULL); + if(ret != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clBuildProgram. Try to delete file %s", err_to_str(ret), cache_file.c_str()); return ERR_OCL_API; } - port_sleep(1); } - while(status == CL_BUILD_IN_PROGRESS); const char *KernelNames[] = { "cn0", "cn1", "cn2", "Blake", "Groestl", "JH", "Skein" }; for(int i = 0; i < 7; ++i) @@ -491,7 +641,7 @@ std::vector getAMDDevices(int index) printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get the device vendor name for device %u.", err_to_str(clStatus), k); continue; } - + std::string devVendor(devVendorVec.data()); if( devVendor.find("Advanced Micro Devices") != std::string::npos || devVendor.find("AMD") != std::string::npos) { @@ -716,6 +866,9 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx) source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_BLAKE256"), blake256CL); source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_GROESTL256"), groestl256CL); + // create a directory for the OpenCL compile cache + create_directory(get_home() + "/.openclcache"); + for(int i = 0; i < num_gpus; ++i) { if(ctx[i].stridedIndex == 2 && (ctx[i].rawIntensity % ctx[i].workSize) != 0) -- cgit v1.1 From 702ab5669912dd1bc9c15f9dd6854889cc09af5e Mon Sep 17 00:00:00 2001 From: psychocrypt Date: Tue, 13 Mar 2018 20:32:07 +0100 Subject: fix shadowed variable A redefinition of a variable in a local scope avoid that the intensity is rounded to a multiple of the work size. --- xmrstak/backend/amd/amd_gpu/gpu.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'xmrstak/backend/amd') diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp index 79afa00..c45f211 100644 --- a/xmrstak/backend/amd/amd_gpu/gpu.cpp +++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp @@ -1053,7 +1053,7 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput) if(ctx->compMode) { // round up to next multiple of w_size - size_t g_thd = ((g_intensity + w_size - 1u) / w_size) * w_size; + g_thd = ((g_intensity + w_size - 1u) / w_size) * w_size; // number of global threads must be a multiple of the work group size (w_size) assert(g_thd%w_size == 0); } -- cgit v1.1 From 1e7911e653a267ffd71199cdf7afaf1cfed5bad0 Mon Sep 17 00:00:00 2001 From: xmr-stak-devs Date: Sun, 25 Mar 2018 13:21:57 +0100 Subject: XMR-Stak 2.3.0 RC Co-authored-by: psychocrypt Co-authored-by: fireice-uk Co-authored-by: Lee Clagett Co-authored-by: curie-kief --- xmrstak/backend/amd/amd_gpu/gpu.cpp | 98 +++++--- xmrstak/backend/amd/amd_gpu/gpu.hpp | 7 +- xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl | 273 ++++++++++++++++++++-- xmrstak/backend/amd/autoAdjust.hpp | 12 +- xmrstak/backend/amd/minethd.cpp | 38 ++- 5 files changed, 371 insertions(+), 57 deletions(-) (limited to 'xmrstak/backend/amd') diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp index c45f211..7547083 100644 --- a/xmrstak/backend/amd/amd_gpu/gpu.cpp +++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp @@ -306,21 +306,9 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ return ERR_OCL_API; } - size_t hashMemSize; - int threadMemMask; - int hasIterations; - if(::jconf::inst()->IsCurrencyMonero()) - { - hashMemSize = MONERO_MEMORY; - threadMemMask = MONERO_MASK; - hasIterations = MONERO_ITER; - } - else - { - hashMemSize = AEON_MEMORY; - threadMemMask = AEON_MASK; - hasIterations = AEON_ITER; - } + size_t hashMemSize = cn_select_memory(::jconf::inst()->GetMiningAlgo()); + int threadMemMask = cn_select_mask(::jconf::inst()->GetMiningAlgo()); + int hashIterations = cn_select_iter(::jconf::inst()->GetMiningAlgo()); size_t g_thd = ctx->rawIntensity; ctx->ExtraBuffers[0] = clCreateBuffer(opencl_ctx, CL_MEM_READ_WRITE, hashMemSize * g_thd, NULL, &ret); @@ -384,11 +372,13 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ return ERR_OCL_API; } - char options[256]; - snprintf(options, sizeof(options), - "-DITERATIONS=%d -DMASK=%d -DWORKSIZE=%llu -DSTRIDED_INDEX=%d -DMEM_CHUNK_EXPONENT=%d -DCOMP_MODE=%d", - hasIterations, threadMemMask, int_port(ctx->workSize), ctx->stridedIndex, int(1u<memChunk), ctx->compMode ? 1 : 0); + auto miner_algo = ::jconf::inst()->GetMiningAlgo(); + char options[512]; + snprintf(options, sizeof(options), + "-DITERATIONS=%d -DMASK=%d -DWORKSIZE=%llu -DSTRIDED_INDEX=%d -DMEM_CHUNK_EXPONENT=%d -DCOMP_MODE=%d -DMEMORY=%llu -DALGO=%d", + hashIterations, threadMemMask, int_port(ctx->workSize), ctx->stridedIndex, int(1u<memChunk), ctx->compMode ? 1 : 0, + int_port(hashMemSize), int(miner_algo)); /* create a hash for the compile time cache * used data: * - source code @@ -529,8 +519,8 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ } } - const char *KernelNames[] = { "cn0", "cn1", "cn2", "Blake", "Groestl", "JH", "Skein" }; - for(int i = 0; i < 7; ++i) + const char *KernelNames[] = { "cn0", "cn1", "cn2", "Blake", "Groestl", "JH", "Skein", "cn1_monero" }; + for(int i = 0; i < 8; ++i) { ctx->Kernels[i] = clCreateKernel(ctx->Program, KernelNames[i], &ret); if(ret != CL_SUCCESS) @@ -887,7 +877,7 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx) return ERR_SUCCESS; } -size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t target) +size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t target, xmrstak_algo miner_algo, uint32_t version) { cl_int ret; @@ -932,29 +922,65 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar return(ERR_OCL_API); } - // CN2 Kernel + if(miner_algo == cryptonight_heavy) + { + // version + if ((ret = clSetKernelArg(ctx->Kernels[0], 4, sizeof(cl_uint), &version)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 0, argument 4.", err_to_str(ret)); + return ERR_OCL_API; + } + } + + // CN1 Kernel + + /// @todo only activate if currency is monero + int cn_kernel_offset = 0; + if(miner_algo == cryptonight_monero && version >= 7) + { + cn_kernel_offset = 6; + } // Scratchpads - if((ret = clSetKernelArg(ctx->Kernels[1], 0, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS) + if((ret = clSetKernelArg(ctx->Kernels[1 + cn_kernel_offset], 0, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 1, argument 0.", err_to_str(ret)); return ERR_OCL_API; } // States - if((ret = clSetKernelArg(ctx->Kernels[1], 1, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS) + if((ret = clSetKernelArg(ctx->Kernels[1 + cn_kernel_offset], 1, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 1, argument 1.", err_to_str(ret)); return ERR_OCL_API; } // Threads - if((ret = clSetKernelArg(ctx->Kernels[1], 2, sizeof(cl_ulong), &numThreads)) != CL_SUCCESS) + if((ret = clSetKernelArg(ctx->Kernels[1 + cn_kernel_offset], 2, sizeof(cl_ulong), &numThreads)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 1, argument 2.", err_to_str(ret)); return(ERR_OCL_API); } + if(miner_algo == cryptonight_monero && version >= 7) + { + // Input + if ((ret = clSetKernelArg(ctx->Kernels[1 + cn_kernel_offset], 3, sizeof(cl_mem), &ctx->InputBuffer)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 1, arugment 4(input buffer).", err_to_str(ret)); + return ERR_OCL_API; + } + } + else if(miner_algo == cryptonight_heavy) + { + // version + if ((ret = clSetKernelArg(ctx->Kernels[1], 3, sizeof(cl_uint), &version)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 1, argument 3 (version).", err_to_str(ret)); + return ERR_OCL_API; + } + } + // CN3 Kernel // Scratchpads if((ret = clSetKernelArg(ctx->Kernels[2], 0, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS) @@ -1005,6 +1031,16 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar return(ERR_OCL_API); } + if(miner_algo == cryptonight_heavy) + { + // version + if ((ret = clSetKernelArg(ctx->Kernels[2], 7, sizeof(cl_uint), &version)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 2, argument 7.", err_to_str(ret)); + return ERR_OCL_API; + } + } + for(int i = 0; i < 4; ++i) { // States @@ -1039,7 +1075,7 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar return ERR_SUCCESS; } -size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput) +size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo, uint32_t version) { cl_int ret; cl_uint zero = 0; @@ -1092,7 +1128,13 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput) }*/ size_t tmpNonce = ctx->Nonce; - if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[1], 1, &tmpNonce, &g_thd, &w_size, 0, NULL, NULL)) != CL_SUCCESS) + /// @todo only activate if currency is monero + int cn_kernel_offset = 0; + if(miner_algo == cryptonight_monero && version >= 7) + { + cn_kernel_offset = 6; + } + if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[1 + cn_kernel_offset], 1, &tmpNonce, &g_thd, &w_size, 0, NULL, NULL)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 1); return ERR_OCL_API; diff --git a/xmrstak/backend/amd/amd_gpu/gpu.hpp b/xmrstak/backend/amd/amd_gpu/gpu.hpp index 8fb7168..a387b15 100644 --- a/xmrstak/backend/amd/amd_gpu/gpu.hpp +++ b/xmrstak/backend/amd/amd_gpu/gpu.hpp @@ -1,6 +1,7 @@ #pragma once #include "xmrstak/misc/console.hpp" +#include "xmrstak/jconf.hpp" #if defined(__APPLE__) #include @@ -35,7 +36,7 @@ struct GpuContext cl_mem OutputBuffer; cl_mem ExtraBuffers[6]; cl_program Program; - cl_kernel Kernels[7]; + cl_kernel Kernels[8]; size_t freeMem; int computeUnits; std::string name; @@ -49,7 +50,7 @@ int getAMDPlatformIdx(); std::vector getAMDDevices(int index); size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx); -size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t target); -size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput); +size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t target, xmrstak_algo miner_algo, uint32_t version); +size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo, uint32_t version); diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl index 9383b04..7a36357 100644 --- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl +++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl @@ -433,8 +433,18 @@ inline ulong getIdx() #endif } +inline uint4 mix_and_propagate(__local uint4 xin[8][WORKSIZE]) +{ + return xin[(get_local_id(1)) % 8][get_local_id(0)] ^ xin[(get_local_id(1) + 1) % 8][get_local_id(0)]; +} + __attribute__((reqd_work_group_size(WORKSIZE, 8, 1))) -__kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ulong *states, ulong Threads) +__kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ulong *states, ulong Threads +// cryptonight_heavy +#if (ALGO == 4) + , uint version +#endif +) { ulong State[25]; uint ExpandedKey1[40]; @@ -464,11 +474,11 @@ __kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ul states += 25 * gIdx; #if(STRIDED_INDEX==0) - Scratchpad += gIdx * (ITERATIONS >> 2); + Scratchpad += gIdx * (MEMORY >> 4); #elif(STRIDED_INDEX==1) Scratchpad += gIdx; #elif(STRIDED_INDEX==2) - Scratchpad += get_group_id(0) * (ITERATIONS >> 2) * WORKSIZE + MEM_CHUNK * get_local_id(0); + Scratchpad += get_group_id(0) * (MEMORY >> 4) * WORKSIZE + MEM_CHUNK * get_local_id(0); #endif ((ulong8 *)State)[0] = vload8(0, input); @@ -507,13 +517,41 @@ __kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ul } mem_fence(CLK_LOCAL_MEM_FENCE); + +// cryptonight_heavy +#if (ALGO == 4) + if(version >= 3) + { + __local uint4 xin[8][WORKSIZE]; + + /* Also left over threads performe this loop. + * The left over thread results will be ignored + */ + for(size_t i=0; i < 16; i++) + { + #pragma unroll + for(int j = 0; j < 10; ++j) + text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey1)[j]); + barrier(CLK_LOCAL_MEM_FENCE); + xin[get_local_id(1)][get_local_id(0)] = text; + barrier(CLK_LOCAL_MEM_FENCE); + text = mix_and_propagate(xin); + } + } +#endif + #if(COMP_MODE==1) // do not use early return here if(gIdx < Threads) #endif { + int iterations = MEMORY >> 7; +#if (ALGO == 4) + if(version < 3) + iterations >>= 1; +#endif #pragma unroll 2 - for(int i = 0; i < (ITERATIONS >> 5); ++i) + for(int i = 0; i < iterations; ++i) { #pragma unroll for(int j = 0; j < 10; ++j) @@ -525,8 +563,22 @@ __kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ul mem_fence(CLK_GLOBAL_MEM_FENCE); } +#define VARIANT1_1(p) \ + uint table = 0x75310U; \ + uint index = (((p).s2 >> 26) & 12) | (((p).s2 >> 23) & 2); \ + (p).s2 ^= ((table >> index) & 0x30U) << 24 + +#define VARIANT1_2(p) ((uint2 *)&(p))[0] ^= tweak1_2 + +#define VARIANT1_INIT() \ + tweak1_2 = as_uint2(input[4]); \ + tweak1_2.s0 >>= 24; \ + tweak1_2.s0 |= tweak1_2.s1 << 8; \ + tweak1_2.s1 = get_global_id(0); \ + tweak1_2 ^= as_uint2(states[24]) + __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) -__kernel void cn1(__global uint4 *Scratchpad, __global ulong *states, ulong Threads) +__kernel void cn1_monero(__global uint4 *Scratchpad, __global ulong *states, ulong Threads, __global ulong *input) { ulong a[2], b[2]; __local uint AES0[256], AES1[256], AES2[256], AES3[256]; @@ -544,6 +596,7 @@ __kernel void cn1(__global uint4 *Scratchpad, __global ulong *states, ulong Thre barrier(CLK_LOCAL_MEM_FENCE); + uint2 tweak1_2; uint4 b_x; #if(COMP_MODE==1) // do not use early return here @@ -552,11 +605,11 @@ __kernel void cn1(__global uint4 *Scratchpad, __global ulong *states, ulong Thre { states += 25 * gIdx; #if(STRIDED_INDEX==0) - Scratchpad += gIdx * (ITERATIONS >> 2); + Scratchpad += gIdx * (MEMORY >> 4); #elif(STRIDED_INDEX==1) Scratchpad += gIdx; #elif(STRIDED_INDEX==2) - Scratchpad += get_group_id(0) * (ITERATIONS >> 2) * WORKSIZE + MEM_CHUNK * get_local_id(0); + Scratchpad += get_group_id(0) * (MEMORY >> 4) * WORKSIZE + MEM_CHUNK * get_local_id(0); #endif a[0] = states[0] ^ states[4]; @@ -565,6 +618,7 @@ __kernel void cn1(__global uint4 *Scratchpad, __global ulong *states, ulong Thre b[1] = states[3] ^ states[7]; b_x = ((uint4 *)b)[0]; + VARIANT1_INIT(); } mem_fence(CLK_LOCAL_MEM_FENCE); @@ -581,9 +635,10 @@ __kernel void cn1(__global uint4 *Scratchpad, __global ulong *states, ulong Thre ((uint4 *)c)[0] = Scratchpad[IDX((a[0] & MASK) >> 4)]; ((uint4 *)c)[0] = AES_Round(AES0, AES1, AES2, AES3, ((uint4 *)c)[0], ((uint4 *)a)[0]); - //b_x ^= ((uint4 *)c)[0]; - Scratchpad[IDX((a[0] & MASK) >> 4)] = b_x ^ ((uint4 *)c)[0]; + b_x ^= ((uint4 *)c)[0]; + VARIANT1_1(b_x); + Scratchpad[IDX((a[0] & MASK) >> 4)] = b_x; uint4 tmp; tmp = Scratchpad[IDX((c[0] & MASK) >> 4)]; @@ -591,18 +646,129 @@ __kernel void cn1(__global uint4 *Scratchpad, __global ulong *states, ulong Thre a[1] += c[0] * as_ulong2(tmp).s0; a[0] += mul_hi(c[0], as_ulong2(tmp).s0); + VARIANT1_2(a[1]); Scratchpad[IDX((c[0] & MASK) >> 4)] = ((uint4 *)a)[0]; + VARIANT1_2(a[1]); + + ((uint4 *)a)[0] ^= tmp; + + b_x = ((uint4 *)c)[0]; + } + } + mem_fence(CLK_GLOBAL_MEM_FENCE); +} + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void cn1(__global uint4 *Scratchpad, __global ulong *states, ulong Threads +// cryptonight_heavy +#if (ALGO == 4) + , uint version +#endif +) +{ + ulong a[2], b[2]; + __local uint AES0[256], AES1[256], AES2[256], AES3[256]; + + const ulong gIdx = getIdx(); + + for(int i = get_local_id(0); i < 256; i += WORKSIZE) + { + const uint tmp = AES0_C[i]; + AES0[i] = tmp; + AES1[i] = rotate(tmp, 8U); + AES2[i] = rotate(tmp, 16U); + AES3[i] = rotate(tmp, 24U); + } + + barrier(CLK_LOCAL_MEM_FENCE); + + uint4 b_x; +#if(COMP_MODE==1) + // do not use early return here + if(gIdx < Threads) +#endif + { + states += 25 * gIdx; +#if(STRIDED_INDEX==0) + Scratchpad += gIdx * (MEMORY >> 4); +#elif(STRIDED_INDEX==1) + Scratchpad += gIdx; +#elif(STRIDED_INDEX==2) + Scratchpad += get_group_id(0) * (MEMORY >> 4) * WORKSIZE + MEM_CHUNK * get_local_id(0); +#endif + + a[0] = states[0] ^ states[4]; + b[0] = states[2] ^ states[6]; + a[1] = states[1] ^ states[5]; + b[1] = states[3] ^ states[7]; + + b_x = ((uint4 *)b)[0]; + } + + mem_fence(CLK_LOCAL_MEM_FENCE); + +#if(COMP_MODE==1) + // do not use early return here + if(gIdx < Threads) +#endif + { + ulong idx0 = a[0]; + ulong mask = MASK; + + int iterations = ITERATIONS; +#if (ALGO == 4) + if(version < 3) + { + iterations <<= 1; + mask -= 0x200000; + } +#endif + #pragma unroll 8 + for(int i = 0; i < iterations; ++i) + { + ulong c[2]; + + ((uint4 *)c)[0] = Scratchpad[IDX((idx0 & mask) >> 4)]; + ((uint4 *)c)[0] = AES_Round(AES0, AES1, AES2, AES3, ((uint4 *)c)[0], ((uint4 *)a)[0]); + //b_x ^= ((uint4 *)c)[0]; + + Scratchpad[IDX((idx0 & mask) >> 4)] = b_x ^ ((uint4 *)c)[0]; + + uint4 tmp; + tmp = Scratchpad[IDX((c[0] & mask) >> 4)]; + + a[1] += c[0] * as_ulong2(tmp).s0; + a[0] += mul_hi(c[0], as_ulong2(tmp).s0); + + Scratchpad[IDX((c[0] & mask) >> 4)] = ((uint4 *)a)[0]; ((uint4 *)a)[0] ^= tmp; + idx0 = a[0]; b_x = ((uint4 *)c)[0]; +// cryptonight_heavy +#if (ALGO == 4) + if(version >= 3) + { + long n = *((__global long*)(Scratchpad + (IDX((idx0 & mask) >> 4)))); + int d = ((__global int*)(Scratchpad + (IDX((idx0 & mask) >> 4))))[2]; + long q = n / (d | 0x5); + *((__global long*)(Scratchpad + (IDX((idx0 & mask) >> 4)))) = n ^ q; + idx0 = d ^ q; + } +#endif } } mem_fence(CLK_GLOBAL_MEM_FENCE); } __attribute__((reqd_work_group_size(WORKSIZE, 8, 1))) -__kernel void cn2(__global uint4 *Scratchpad, __global ulong *states, __global uint *Branch0, __global uint *Branch1, __global uint *Branch2, __global uint *Branch3, ulong Threads) +__kernel void cn2(__global uint4 *Scratchpad, __global ulong *states, __global uint *Branch0, __global uint *Branch1, __global uint *Branch2, __global uint *Branch3, ulong Threads +// cryptonight_heavy +#if (ALGO == 4) + , uint version +#endif + ) { __local uint AES0[256], AES1[256], AES2[256], AES3[256]; uint ExpandedKey2[40]; @@ -631,11 +797,11 @@ __kernel void cn2(__global uint4 *Scratchpad, __global ulong *states, __global u { states += 25 * gIdx; #if(STRIDED_INDEX==0) - Scratchpad += gIdx * (ITERATIONS >> 2); + Scratchpad += gIdx * (MEMORY >> 4); #elif(STRIDED_INDEX==1) Scratchpad += gIdx; #elif(STRIDED_INDEX==2) - Scratchpad += get_group_id(0) * (ITERATIONS >> 2) * WORKSIZE + MEM_CHUNK * get_local_id(0); + Scratchpad += get_group_id(0) * (MEMORY >> 4) * WORKSIZE + MEM_CHUNK * get_local_id(0); #endif #if defined(__Tahiti__) || defined(__Pitcairn__) @@ -655,13 +821,67 @@ __kernel void cn2(__global uint4 *Scratchpad, __global ulong *states, __global u barrier(CLK_LOCAL_MEM_FENCE); +#if (ALGO == 4) + __local uint4 xin[8][WORKSIZE]; +#endif + #if(COMP_MODE==1) // do not use early return here if(gIdx < Threads) #endif { + int iterations = MEMORY >> 7; +#if (ALGO == 4) + if(version < 3) + { + iterations >>= 1; + #pragma unroll 2 + for(int i = 0; i < iterations; ++i) + { + text ^= Scratchpad[IDX((i << 3) + get_local_id(1))]; + + #pragma unroll 10 + for(int j = 0; j < 10; ++j) + text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]); + } + } + else + { + #pragma unroll 2 + for(int i = 0; i < iterations; ++i) + { + text ^= Scratchpad[IDX((i << 3) + get_local_id(1))]; + + #pragma unroll 10 + for(int j = 0; j < 10; ++j) + text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]); + + + barrier(CLK_LOCAL_MEM_FENCE); + xin[get_local_id(1)][get_local_id(0)] = text; + barrier(CLK_LOCAL_MEM_FENCE); + text = mix_and_propagate(xin); + } + + #pragma unroll 2 + for(int i = 0; i < iterations; ++i) + { + text ^= Scratchpad[IDX((i << 3) + get_local_id(1))]; + + #pragma unroll 10 + for(int j = 0; j < 10; ++j) + text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]); + + + barrier(CLK_LOCAL_MEM_FENCE); + xin[get_local_id(1)][get_local_id(0)] = text; + barrier(CLK_LOCAL_MEM_FENCE); + text = mix_and_propagate(xin); + } + } +#else #pragma unroll 2 - for(int i = 0; i < (ITERATIONS >> 5); ++i) + for(int i = 0; i < iterations; ++i) { text ^= Scratchpad[IDX((i << 3) + get_local_id(1))]; @@ -669,7 +889,34 @@ __kernel void cn2(__global uint4 *Scratchpad, __global ulong *states, __global u for(int j = 0; j < 10; ++j) text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]); } +#endif + } + +// cryptonight_heavy +#if (ALGO == 4) + if(version >= 3) + { + /* Also left over threads performe this loop. + * The left over thread results will be ignored + */ + for(size_t i=0; i < 16; i++) + { + #pragma unroll + for(int j = 0; j < 10; ++j) + text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]); + barrier(CLK_LOCAL_MEM_FENCE); + xin[get_local_id(1)][get_local_id(0)] = text; + barrier(CLK_LOCAL_MEM_FENCE); + text = mix_and_propagate(xin); + } + } +#endif +#if(COMP_MODE==1) + // do not use early return here + if(gIdx < Threads) +#endif + { vstore2(as_ulong2(text), get_local_id(1) + 4, states); } diff --git a/xmrstak/backend/amd/autoAdjust.hpp b/xmrstak/backend/amd/autoAdjust.hpp index 8950105..ea057a0 100644 --- a/xmrstak/backend/amd/autoAdjust.hpp +++ b/xmrstak/backend/amd/autoAdjust.hpp @@ -83,15 +83,7 @@ private: constexpr size_t byteToMiB = 1024u * 1024u; - size_t hashMemSize; - if(::jconf::inst()->IsCurrencyMonero()) - { - hashMemSize = MONERO_MEMORY; - } - else - { - hashMemSize = AEON_MEMORY; - } + size_t hashMemSize = cn_select_memory(::jconf::inst()->GetMiningAlgo()); std::string conf; for(auto& ctx : devVec) @@ -118,7 +110,7 @@ private: maxThreads = 2024u; } // increase all intensity limits by two for aeon - if(!::jconf::inst()->IsCurrencyMonero()) + if(::jconf::inst()->GetMiningAlgo() == cryptonight_lite) maxThreads *= 2u; // keep 128MiB memory free (value is randomly chosen) diff --git a/xmrstak/backend/amd/minethd.cpp b/xmrstak/backend/amd/minethd.cpp index 8dfbce5..46a04d5 100644 --- a/xmrstak/backend/amd/minethd.cpp +++ b/xmrstak/backend/amd/minethd.cpp @@ -191,9 +191,20 @@ void minethd::work_main() uint64_t iCount = 0; cryptonight_ctx* cpu_ctx; cpu_ctx = cpu::minethd::minethd_alloc_ctx(); - cn_hash_fun hash_fun = cpu::minethd::func_selector(::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/, ::jconf::inst()->IsCurrencyMonero()); + auto miner_algo = ::jconf::inst()->GetMiningAlgo(); + cn_hash_fun hash_fun; + if(miner_algo == cryptonight_monero || miner_algo == cryptonight_heavy) + { + // start with cryptonight and switch later if fork version is reached + hash_fun = cpu::minethd::func_selector(::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/, cryptonight); + } + else + hash_fun = cpu::minethd::func_selector(::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/, miner_algo); + globalStates::inst().iConsumeCnt++; + uint8_t version = 0; + while (bQuit == 0) { if (oWork.bStall) @@ -207,6 +218,16 @@ void minethd::work_main() std::this_thread::sleep_for(std::chrono::milliseconds(100)); consume_work(); + uint8_t new_version = oWork.getVersion(); + if(miner_algo == cryptonight_monero && version < 7 && new_version >= 7) + { + hash_fun = cpu::minethd::func_selector(::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/, cryptonight_monero); + } + else if(miner_algo == cryptonight_heavy && version < 3 && new_version >= 3) + { + hash_fun = cpu::minethd::func_selector(::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/, cryptonight_heavy); + } + version = new_version; continue; } @@ -215,7 +236,8 @@ void minethd::work_main() assert(sizeof(job_result::sJobID) == sizeof(pool_job::sJobID)); uint64_t target = oWork.iTarget; - XMRSetJob(pGpuCtx, oWork.bWorkBlob, oWork.iWorkSize, target); + /// \todo add monero hard for version + XMRSetJob(pGpuCtx, oWork.bWorkBlob, oWork.iWorkSize, target, miner_algo, version); if(oWork.bNiceHash) pGpuCtx->Nonce = *(uint32_t*)(oWork.bWorkBlob + 39); @@ -231,7 +253,7 @@ void minethd::work_main() cl_uint results[0x100]; memset(results,0,sizeof(cl_uint)*(0x100)); - XMRRunJob(pGpuCtx, results); + XMRRunJob(pGpuCtx, results, miner_algo, version); for(size_t i = 0; i < results[0xFF]; i++) { @@ -258,6 +280,16 @@ void minethd::work_main() } consume_work(); + uint8_t new_version = oWork.getVersion(); + if(miner_algo == cryptonight_monero && version < 7 && new_version >= 7) + { + hash_fun = cpu::minethd::func_selector(::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/, cryptonight_monero); + } + else if(miner_algo == cryptonight_heavy && version < 3 && new_version >= 3) + { + hash_fun = cpu::minethd::func_selector(::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/, cryptonight_heavy); + } + version = new_version; } } -- cgit v1.1 From 0f9392f1171b33981b98b493b401a524ad68a756 Mon Sep 17 00:00:00 2001 From: psychocrypt Date: Sun, 25 Mar 2018 20:09:39 +0200 Subject: OpenCL precompiled code message change warning when precompiled OpenCL code is not found --- xmrstak/backend/amd/amd_gpu/gpu.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'xmrstak/backend/amd') diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp index 7547083..8d0fd32 100644 --- a/xmrstak/backend/amd/amd_gpu/gpu.cpp +++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp @@ -395,7 +395,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ std::ifstream clBinFile(cache_file, std::ofstream::in | std::ofstream::binary); if(!clBinFile.good()) { - printer::inst()->print_msg(L1,"WARNING: OpenCL device %u - OpenCL binary %s not found.",ctx->deviceIdx, cache_file.c_str()); + printer::inst()->print_msg(L1,"OpenCL device %u - Precompiled code %s not found. Compiling ...",ctx->deviceIdx, cache_file.c_str()); ctx->Program = clCreateProgramWithSource(opencl_ctx, 1, (const char**)&source_code, NULL, &ret); if(ret != CL_SUCCESS) { @@ -489,11 +489,11 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ file_stream.open(cache_file, std::ofstream::out | std::ofstream::binary); file_stream.write(all_programs[dev_id], binary_sizes[dev_id]); file_stream.close(); - printer::inst()->print_msg(L1, "OpenCL device %u - OpenCL binary file stored in file %s.",ctx->deviceIdx, cache_file.c_str()); + printer::inst()->print_msg(L1, "OpenCL device %u - Precompiled code stored in file %s",ctx->deviceIdx, cache_file.c_str()); } else { - printer::inst()->print_msg(L1, "OpenCL device %u - Load OpenCL binary file %s",ctx->deviceIdx, cache_file.c_str()); + printer::inst()->print_msg(L1, "OpenCL device %u - Load precompiled cod from file %s",ctx->deviceIdx, cache_file.c_str()); std::ostringstream ss; ss << clBinFile.rdbuf(); std::string s = ss.str(); -- cgit v1.1