-rw-r--r--  doc/compile_Windows.md                            |  2
-rw-r--r--  xmrstak/backend/amd/amd_gpu/opencl/wolf-skein.cl  | 76
-rw-r--r--  xmrstak/backend/amd/autoAdjust.hpp                | 30
-rw-r--r--  xmrstak/backend/cpu/minethd.cpp                   |  7
-rw-r--r--  xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu    |  6
5 files changed, 84 insertions, 37 deletions
diff --git a/doc/compile_Windows.md b/doc/compile_Windows.md
index e32906c..a512373 100644
--- a/doc/compile_Windows.md
+++ b/doc/compile_Windows.md
@@ -25,7 +25,7 @@
 - Download and install https://developer.nvidia.com/cuda-downloads
   - For minimal install choose `Custom installation options` during the install and select
-    - CUDA/Develpment
+    - CUDA/Development
     - CUDA/Visual Studio Integration (ignore the warning during the install that VS2017 is not supported)
     - CUDA/Runtime
     - Driver components
diff --git a/xmrstak/backend/amd/amd_gpu/opencl/wolf-skein.cl b/xmrstak/backend/amd/amd_gpu/opencl/wolf-skein.cl
index 868757b..279b652 100644
--- a/xmrstak/backend/amd/amd_gpu/opencl/wolf-skein.cl
+++ b/xmrstak/backend/amd/amd_gpu/opencl/wolf-skein.cl
@@ -3,6 +3,7 @@ R"===(
 #define WOLF_SKEIN_CL
 
 // Vectorized Skein implementation macros and functions by Wolf
+// Updated by taisel
 
 #define SKEIN_KS_PARITY	0x1BD11BDAA9FC1A22
 
@@ -22,11 +23,11 @@ static const __constant ulong SKEIN512_256_IV[8] =
 	0xC36FBAF9393AD185UL, 0x3EEDBA1833EDFC13UL
 };
 
-#define SKEIN_INJECT_KEY(p, s)	do { \
+#define SKEIN_INJECT_KEY(p, s, q)	do { \
 	p += h; \
-	p.s5 += t[s % 3]; \
-	p.s6 += t[(s + 1) % 3]; \
-	p.s7 += s; \
+	p.s5 += t[s]; \
+	p.s6 += t[select(s + 1U, 0U, s == 2U)]; \
+	p.s7 += q; \
 } while(0)
 
 ulong SKEIN_ROT(const uint2 x, const uint y)
@@ -35,55 +36,55 @@ ulong SKEIN_ROT(const uint2 x, const uint y)
 	else return(as_ulong(amd_bitalign(x.s10, x, 32 - (y - 32))));
 }
 
-void SkeinMix8(ulong4 *pv0, ulong4 *pv1, const uint rc0, const uint rc1, const uint rc2, const uint rc3)
+void SkeinMix8(ulong4 *pv0, ulong4 *pv1, const ulong4 rc)
 {
 	*pv0 += *pv1;
-	(*pv1).s0 = SKEIN_ROT(as_uint2((*pv1).s0), rc0);
-	(*pv1).s1 = SKEIN_ROT(as_uint2((*pv1).s1), rc1);
-	(*pv1).s2 = SKEIN_ROT(as_uint2((*pv1).s2), rc2);
-	(*pv1).s3 = SKEIN_ROT(as_uint2((*pv1).s3), rc3);
+	(*pv1).s0 = SKEIN_ROT(as_uint2((*pv1).s0), rc.s0);
+	(*pv1).s1 = SKEIN_ROT(as_uint2((*pv1).s1), rc.s1);
+	(*pv1).s2 = SKEIN_ROT(as_uint2((*pv1).s2), rc.s2);
+	(*pv1).s3 = SKEIN_ROT(as_uint2((*pv1).s3), rc.s3);
 	*pv1 ^= *pv0;
 }
 
-ulong8 SkeinEvenRound(ulong8 p, const ulong8 h, const ulong *t, const uint s)
+ulong8 SkeinEvenRound(ulong8 p, const ulong8 h, const ulong *t, const uint s, const uint q)
 {
-	SKEIN_INJECT_KEY(p, s);
+	SKEIN_INJECT_KEY(p, s, q);
 	ulong4 pv0 = p.even, pv1 = p.odd;
 
-	SkeinMix8(&pv0, &pv1, 46, 36, 19, 37);
+	SkeinMix8(&pv0, &pv1, (ulong4)(46, 36, 19, 37));
 	pv0 = shuffle(pv0, (ulong4)(1, 2, 3, 0));
 	pv1 = shuffle(pv1, (ulong4)(0, 3, 2, 1));
 
-	SkeinMix8(&pv0, &pv1, 33, 27, 14, 42);
+	SkeinMix8(&pv0, &pv1, (ulong4)(33, 27, 14, 42));
 	pv0 = shuffle(pv0, (ulong4)(1, 2, 3, 0));
 	pv1 = shuffle(pv1, (ulong4)(0, 3, 2, 1));
 
-	SkeinMix8(&pv0, &pv1, 17, 49, 36, 39);
+	SkeinMix8(&pv0, &pv1, (ulong4)(17, 49, 36, 39));
 	pv0 = shuffle(pv0, (ulong4)(1, 2, 3, 0));
 	pv1 = shuffle(pv1, (ulong4)(0, 3, 2, 1));
 
-	SkeinMix8(&pv0, &pv1, 44, 9, 54, 56);
+	SkeinMix8(&pv0, &pv1, (ulong4)(44, 9, 54, 56));
 	return(shuffle2(pv0, pv1, (ulong8)(1, 4, 2, 7, 3, 6, 0, 5)));
 }
 
-ulong8 SkeinOddRound(ulong8 p, const ulong8 h, const ulong *t, const uint s)
+ulong8 SkeinOddRound(ulong8 p, const ulong8 h, const ulong *t, const uint s, const uint q)
 {
-	SKEIN_INJECT_KEY(p, s);
+	SKEIN_INJECT_KEY(p, s, q);
 	ulong4 pv0 = p.even, pv1 = p.odd;
 
-	SkeinMix8(&pv0, &pv1, 39, 30, 34, 24);
+	SkeinMix8(&pv0, &pv1, (ulong4)(39, 30, 34, 24));
 	pv0 = shuffle(pv0, (ulong4)(1, 2, 3, 0));
 	pv1 = shuffle(pv1, (ulong4)(0, 3, 2, 1));
 
-	SkeinMix8(&pv0, &pv1, 13, 50, 10, 17);
+	SkeinMix8(&pv0, &pv1, (ulong4)(13, 50, 10, 17));
 	pv0 = shuffle(pv0, (ulong4)(1, 2, 3, 0));
 	pv1 = shuffle(pv1, (ulong4)(0, 3, 2, 1));
 
-	SkeinMix8(&pv0, &pv1, 25, 29, 39, 43);
+	SkeinMix8(&pv0, &pv1, (ulong4)(25, 29, 39, 43));
 	pv0 = shuffle(pv0, (ulong4)(1, 2, 3, 0));
 	pv1 = shuffle(pv1, (ulong4)(0, 3, 2, 1));
 
-	SkeinMix8(&pv0, &pv1, 8, 35, 56, 22);
+	SkeinMix8(&pv0, &pv1, (ulong4)(8, 35, 56, 22));
 	return(shuffle2(pv0, pv1, (ulong8)(1, 4, 2, 7, 3, 6, 0, 5)));
 }
 
@@ -92,20 +93,47 @@ ulong8 Skein512Block(ulong8 p, ulong8 h, ulong h8, const ulong *t)
 	#pragma unroll
 	for(int i = 0; i < 18; ++i)
 	{
-		p = SkeinEvenRound(p, h, t, i);
+		p = SkeinEvenRound(p, h, t, 0U, i);
 		++i;
 		ulong tmp = h.s0;
 		h = shuffle(h, (ulong8)(1, 2, 3, 4, 5, 6, 7, 0));
 		h.s7 = h8;
 		h8 = tmp;
-		p = SkeinOddRound(p, h, t, i);
+		p = SkeinOddRound(p, h, t, 1U, i);
+		++i;
+		tmp = h.s0;
+		h = shuffle(h, (ulong8)(1, 2, 3, 4, 5, 6, 7, 0));
+		h.s7 = h8;
+		h8 = tmp;
+		p = SkeinEvenRound(p, h, t, 2U, i);
+		++i;
+		tmp = h.s0;
+		h = shuffle(h, (ulong8)(1, 2, 3, 4, 5, 6, 7, 0));
+		h.s7 = h8;
+		h8 = tmp;
+		p = SkeinOddRound(p, h, t, 0U, i);
+		++i;
+		tmp = h.s0;
+		h = shuffle(h, (ulong8)(1, 2, 3, 4, 5, 6, 7, 0));
+		h.s7 = h8;
+		h8 = tmp;
+		p = SkeinEvenRound(p, h, t, 1U, i);
+		++i;
+		tmp = h.s0;
+		h = shuffle(h, (ulong8)(1, 2, 3, 4, 5, 6, 7, 0));
+		h.s7 = h8;
+		h8 = tmp;
+		p = SkeinOddRound(p, h, t, 2U, i);
 		tmp = h.s0;
 		h = shuffle(h, (ulong8)(1, 2, 3, 4, 5, 6, 7, 0));
 		h.s7 = h8;
 		h8 = tmp;
 	}
 
-	SKEIN_INJECT_KEY(p, 18);
+	p += h;
+	p.s5 += t[0];
+	p.s6 += t[1];
+	p.s7 += 18;
 	return(p);
 }
diff --git a/xmrstak/backend/amd/autoAdjust.hpp b/xmrstak/backend/amd/autoAdjust.hpp
index 511a712..93b71ba 100644
--- a/xmrstak/backend/amd/autoAdjust.hpp
+++ b/xmrstak/backend/amd/autoAdjust.hpp
@@ -94,7 +94,6 @@ private:
 		}
 
 		std::string conf;
-		int i = 0;
 		for(auto& ctx : devVec)
 		{
 			/* 1000 is a magic selected limit, the reason is that more than 2GiB memory
@@ -127,14 +126,27 @@ private:
 			size_t possibleIntensity = std::min( maxThreads , maxIntensity );
 			// map intensity to a multiple of the compute unit count, 8 is the number of threads per work group
 			size_t intensity = (possibleIntensity / (8 * ctx.computeUnits)) * ctx.computeUnits * 8;
-			conf += std::string("  // gpu: ") + ctx.name + " memory:" + std::to_string(availableMem / byteToMiB) + "\n";
-			conf += std::string("  // compute units: ") + std::to_string(ctx.computeUnits) + "\n";
-			// set 8 threads per block (this is a good value for the most gpus)
-			conf += std::string("  { \"index\" : ") + std::to_string(ctx.deviceIdx) + ",\n" +
-				"    \"intensity\" : " + std::to_string(intensity) + ", \"worksize\" : " + std::to_string(8) + ",\n" +
-				"    \"affine_to_cpu\" : false, \"strided_index\" : true\n"
-				"  },\n";
-			++i;
+			//If the intensity is 0, then it's because the multiple of the unit count is greater than intensity
+			if (intensity == 0)
+			{
+				printer::inst()->print_msg(L0, "WARNING: Auto detected intensity unexpectedly low. Try to set the environment variable GPU_SINGLE_ALLOC_PERCENT.");
+				intensity = possibleIntensity;
+
+			}
+			if (intensity != 0)
+			{
+				conf += std::string("  // gpu: ") + ctx.name + " memory:" + std::to_string(availableMem / byteToMiB) + "\n";
+				conf += std::string("  // compute units: ") + std::to_string(ctx.computeUnits) + "\n";
+				// set 8 threads per block (this is a good value for the most gpus)
+				conf += std::string("  { \"index\" : ") + std::to_string(ctx.deviceIdx) + ",\n" +
+					"    \"intensity\" : " + std::to_string(intensity) + ", \"worksize\" : " + std::to_string(8) + ",\n" +
+					"    \"affine_to_cpu\" : false, \"strided_index\" : true\n"
+					"  },\n";
+			}
+			else
+			{
+				printer::inst()->print_msg(L0, "WARNING: Ignore gpu %s, %s MiB free memory is not enough to suggest settings.", ctx.name.c_str(), std::to_string(availableMem / byteToMiB).c_str());
+			}
 		}
 
 		configTpl.replace("PLATFORMINDEX",std::to_string(platformIndex));
diff --git a/xmrstak/backend/cpu/minethd.cpp b/xmrstak/backend/cpu/minethd.cpp
index 1d9165e..cef4f8e 100644
--- a/xmrstak/backend/cpu/minethd.cpp
+++ b/xmrstak/backend/cpu/minethd.cpp
@@ -448,12 +448,13 @@ void minethd::work_main()
 			globalStates::inst().calc_start_nonce(result.iNonce, oWork.bNiceHash, nonce_chunk);
 		}
 
-		*piNonce = ++result.iNonce;
+		*piNonce = result.iNonce;
 
 		hash_fun(oWork.bWorkBlob, oWork.iWorkSize, result.bResult, ctx);
 
 		if (*piHashVal < oWork.iTarget)
 			executor::inst()->push_event(ex_event(result, oWork.iPoolId));
+		result.iNonce++;
 
 		std::this_thread::yield();
 	}
@@ -637,7 +638,7 @@ void minethd::multiway_work_main(cn_hash_fun_multi hash_fun_multi)
 		}
 
 		for (size_t i = 0; i < N; i++)
-			*piNonce[i] = ++iNonce;
+			*piNonce[i] = iNonce++;
 
 		hash_fun_multi(bWorkBlob, oWork.iWorkSize, bHashOut, ctx);
 
@@ -645,7 +646,7 @@
 		{
 			if (*piHashVal[i] < oWork.iTarget)
 			{
-				executor::inst()->push_event(ex_event(job_result(oWork.sJobID, iNonce - N + 1 + i, bHashOut + 32 * i, iThreadNo), oWork.iPoolId));
+				executor::inst()->push_event(ex_event(job_result(oWork.sJobID, iNonce - N + i, bHashOut + 32 * i, iThreadNo), oWork.iPoolId));
 			}
 		}
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
index d865e13..d5d0039 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
@@ -441,6 +441,12 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
 		maxMemUsage = size_t(1024u) * byteToMiB;
 	}
 
+	if(props.multiProcessorCount <= 6)
+	{
+		// limit memory usage for low end devices to reduce the number of threads
+		maxMemUsage = size_t(1024u) * byteToMiB;
+	}
+
 	int* tmp;
 	cudaError_t err;
 	// a device must be selected to get the right memory usage later on
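Notes on the hunks above, each with a small illustrative sketch in plain C++. These sketches are written for this page only; none of this code exists in the repository, and any names or values in them are placeholders.

wolf-skein.cl: SKEIN_INJECT_KEY used to derive the tweak indices from the subkey counter with t[s % 3] and t[(s + 1) % 3]. The rewritten Skein512Block unrolls six rounds per loop iteration, so those indices repeat as the fixed sequence 0, 1, 2 and can be passed as the constant `s` argument, while the real subkey counter moves to the new `q` argument. A tiny sketch of the index pattern behind those constants:

#include <cstdio>

int main()
{
	// subkey s uses t[s % 3] and t[(s + 1) % 3]; the pattern has period 3,
	// which is why the unrolled loop can hard-code 0U, 1U, 2U, 0U, 1U, 2U
	for(unsigned s = 0; s < 6; ++s)
		std::printf("subkey %u -> t[%u], t[%u]\n", s, s % 3, (s + 1) % 3);
	return 0;
}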
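autoAdjust.hpp: the suggested intensity is rounded down to a multiple of 8 * computeUnits, so on a card with little free memory (or a large compute-unit count) the rounding can collapse to zero; that is the case the new warning and the fallback to possibleIntensity handle. A standalone sketch with made-up numbers:

#include <cstddef>
#include <cstdio>

int main()
{
	// illustrative values only, not taken from a real device
	const size_t computeUnits = 14;
	const size_t possibleIntensity = 96;   // e.g. limited by available memory

	// same rounding as autoAdjust.hpp: a multiple of 8 threads per compute unit
	size_t intensity = (possibleIntensity / (8 * computeUnits)) * computeUnits * 8;

	if(intensity == 0)
	{
		// 96 / (8 * 14) == 0, so the rounded value collapses to zero;
		// the patch warns and falls back to possibleIntensity instead
		intensity = possibleIntensity;
	}
	std::printf("suggested intensity: %zu\n", intensity);
	return 0;
}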
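minethd.cpp: the old code wrote ++result.iNonce (resp. ++iNonce) into the work blob, so the nonce counter was advanced before hashing and the multiway path needed the `- N + 1 + i` correction when reporting a share. After the change the current nonce is hashed and reported, and the counter advances afterwards, so way i of an N-way hash corresponds to iNonce - N + i. A minimal sketch of that bookkeeping, using a placeholder hash in place of the CryptoNight kernel:

#include <cstdint>
#include <cstdio>

// placeholder for the real multi-way hash function
static uint64_t fake_hash(uint32_t nonce) { return nonce * 2654435761u; }

int main()
{
	constexpr size_t N = 3;              // ways per multi-hash call
	uint32_t iNonce = 5000;              // next nonce to assign
	const uint64_t iTarget = 1ull << 30;

	uint32_t assigned[N];
	uint64_t hashOut[N];
	for(size_t i = 0; i < N; i++)
	{
		assigned[i] = iNonce++;          // post-increment, as in the patched loop
		hashOut[i] = fake_hash(assigned[i]);
	}

	// after the loop iNonce has advanced by N, so way i hashed nonce iNonce - N + i
	for(size_t i = 0; i < N; i++)
	{
		if(hashOut[i] < iTarget)
			std::printf("way %zu found a share for nonce %u\n",
				i, (unsigned)(iNonce - N + i));
	}
	return 0;
}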
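cuda_extra.cu: devices with at most six multiprocessors now have their usable memory capped at 1 GiB, which (per the comment in the hunk) limits the number of threads the NVIDIA auto-config suggests for low-end cards. The gist of the check, with placeholder values instead of a real cudaGetDeviceProperties() query:

#include <cstddef>
#include <cstdio>

int main()
{
	const size_t byteToMiB = 1024u * 1024u;

	// placeholder values; in cuda_extra.cu they come from cudaGetDeviceProperties()
	const int multiProcessorCount = 5;
	size_t maxMemUsage = size_t(2048u) * byteToMiB;

	if(multiProcessorCount <= 6)
	{
		// same cap as the patch: low end devices get at most 1 GiB,
		// which reduces the number of threads the auto config will pick
		maxMemUsage = size_t(1024u) * byteToMiB;
	}

	std::printf("maxMemUsage: %zu MiB\n", maxMemUsage / byteToMiB);
	return 0;
}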