author      xmr-stak-devs <email@example.com>                    2018-03-25 13:21:57 +0100
committer   fireice-uk <fireice-uk@users.noreply.github.com>     2018-03-25 13:28:40 +0100
commit      1e7911e653a267ffd71199cdf7afaf1cfed5bad0 (patch)
tree        29efb953d9851b352298369104bf777fa3bf34e1 /xmrstak/backend/nvidia
parent      5014bdda628f64ab780d02de371bac4997573d10 (diff)
download    xmr-stak-1e7911e653a267ffd71199cdf7afaf1cfed5bad0.zip
            xmr-stak-1e7911e653a267ffd71199cdf7afaf1cfed5bad0.tar.gz
XMR-Stak 2.3.0 RC
Co-authored-by: psychocrypt <psychocryptHPC@gmail.com>
Co-authored-by: fireice-uk <fireice-uk@users.noreply.github.com>
Co-authored-by: Lee Clagett <code@leeclagett.com>
Co-authored-by: curie-kief <curie-kief@users.noreply.github.com>
Diffstat (limited to 'xmrstak/backend/nvidia')
-rw-r--r--   xmrstak/backend/nvidia/minethd.cpp                |  38
-rw-r--r--   xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp  |  10
-rw-r--r--   xmrstak/backend/nvidia/nvcc_code/cuda_core.cu     | 231
-rw-r--r--   xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu    | 156
4 files changed, 338 insertions(+), 97 deletions(-)
diff --git a/xmrstak/backend/nvidia/minethd.cpp b/xmrstak/backend/nvidia/minethd.cpp
index 867a998..153e4e3 100644
--- a/xmrstak/backend/nvidia/minethd.cpp
+++ b/xmrstak/backend/nvidia/minethd.cpp
@@ -237,12 +237,20 @@ void minethd::work_main()
     uint64_t iCount = 0;
     cryptonight_ctx* cpu_ctx;
     cpu_ctx = cpu::minethd::minethd_alloc_ctx();
-    cn_hash_fun hash_fun = cpu::minethd::func_selector(::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/, ::jconf::inst()->IsCurrencyMonero());
+    auto miner_algo = ::jconf::inst()->GetMiningAlgo();
+    cn_hash_fun hash_fun;
+    if(miner_algo == cryptonight_monero || miner_algo == cryptonight_heavy)
+    {
+        // start with cryptonight and switch later if fork version is reached
+        hash_fun = cpu::minethd::func_selector(::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/, cryptonight);
+    }
+    else
+        hash_fun = cpu::minethd::func_selector(::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/, miner_algo);
     uint32_t iNonce;
 
     globalStates::inst().iConsumeCnt++;
 
-    bool mineMonero = strcmp_i(::jconf::inst()->GetCurrency(), "monero");
+    uint8_t version = 0;
 
     while (bQuit == 0)
     {
@@ -257,6 +265,16 @@ void minethd::work_main()
             std::this_thread::sleep_for(std::chrono::milliseconds(100));
 
             consume_work();
+            uint8_t new_version = oWork.getVersion();
+            if(miner_algo == cryptonight_monero && version < 7 && new_version >= 7)
+            {
+                hash_fun = cpu::minethd::func_selector(::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/, cryptonight_monero);
+            }
+            else if(miner_algo == cryptonight_heavy && version < 3 && new_version >= 3)
+            {
+                hash_fun = cpu::minethd::func_selector(::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/, cryptonight_heavy);
+            }
+            version = new_version;
             continue;
         }
 
@@ -281,11 +299,11 @@ void minethd::work_main()
         uint32_t foundNonce[10];
         uint32_t foundCount;
 
-        cryptonight_extra_cpu_prepare(&ctx, iNonce);
+        cryptonight_extra_cpu_prepare(&ctx, iNonce, miner_algo, version);
 
-        cryptonight_core_cpu_hash(&ctx, mineMonero);
+        cryptonight_core_cpu_hash(&ctx, miner_algo, iNonce, version);
 
-        cryptonight_extra_cpu_final(&ctx, iNonce, oWork.iTarget, &foundCount, foundNonce);
+        cryptonight_extra_cpu_final(&ctx, iNonce, oWork.iTarget, &foundCount, foundNonce, miner_algo, version);
 
         for(size_t i = 0; i < foundCount; i++)
         {
@@ -316,6 +334,16 @@ void minethd::work_main()
             }
 
             consume_work();
+            uint8_t new_version = oWork.getVersion();
+            if(miner_algo == cryptonight_monero && version < 7 && new_version >= 7)
+            {
+                hash_fun = cpu::minethd::func_selector(::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/, cryptonight_monero);
+            }
+            else if(miner_algo == cryptonight_heavy && version < 3 && new_version >= 3)
+            {
+                hash_fun = cpu::minethd::func_selector(::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/, cryptonight_heavy);
+            }
+            version = new_version;
         }
     }
 }
diff --git a/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp b/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp
index afbdbaf..29a3523 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp
+++ b/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp
@@ -3,6 +3,9 @@
 #include <stdint.h>
 #include <string>
 
+#include "xmrstak/jconf.hpp"
+#include "xmrstak/backend/cryptonight.hpp"
+
 typedef struct {
     int device_id;
     const char *device_name;
@@ -20,6 +23,7 @@ typedef struct {
     uint32_t *d_result_nonce;
     uint32_t *d_long_state;
     uint32_t *d_ctx_state;
+    uint32_t *d_ctx_state2;
     uint32_t *d_ctx_a;
     uint32_t *d_ctx_b;
     uint32_t *d_ctx_key1;
@@ -41,8 +45,8 @@ int cuda_get_devicecount( int* deviceCount);
 int cuda_get_deviceinfo(nvid_ctx *ctx);
 int cryptonight_extra_cpu_init(nvid_ctx *ctx);
 void cryptonight_extra_cpu_set_data( nvid_ctx* ctx, const void *data, uint32_t len);
-void cryptonight_extra_cpu_prepare(nvid_ctx* ctx, uint32_t startNonce);
-void cryptonight_extra_cpu_final(nvid_ctx* ctx, uint32_t startNonce, uint64_t target, uint32_t* rescount, uint32_t *resnonce);
+void cryptonight_extra_cpu_prepare(nvid_ctx* ctx, uint32_t startNonce, xmrstak_algo miner_algo, uint8_t version);
+void cryptonight_extra_cpu_final(nvid_ctx* ctx, uint32_t startNonce, uint64_t target, uint32_t* rescount, uint32_t *resnonce,xmrstak_algo miner_algo, uint8_t version);
 }
 
-void cryptonight_core_cpu_hash(nvid_ctx* ctx, bool mineMonero);
+void cryptonight_core_cpu_hash(nvid_ctx* ctx, xmrstak_algo miner_algo, uint32_t startNonce, uint8_t version);
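The minethd.cpp change above boils down to a small decision rule: start on the legacy CryptoNight variant and switch to the fork variant only once the observed block major version crosses the threshold (7 for cryptonight_monero, 3 for cryptonight_heavy). A minimal host-side sketch of that rule, with an illustrative enum standing in for the real xmrstak_algo values:

#include <cstdint>

// illustrative stand-in for the real xmrstak_algo enum
enum algo_sketch { e_cryptonight, e_cryptonight_monero, e_cryptonight_heavy };

// returns the variant that should be active for a given block version;
// the thresholds 7 and 3 are the fork versions used in the diff above
algo_sketch active_algo(algo_sketch configured, uint8_t block_version)
{
    if(configured == e_cryptonight_monero && block_version < 7)
        return e_cryptonight; // pre-fork: plain cryptonight
    if(configured == e_cryptonight_heavy && block_version < 3)
        return e_cryptonight;
    return configured;
}

Because work_main() re-runs func_selector only when the version crosses the threshold, the hash function pointer is swapped at most once per fork.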
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
index cc97274..ede578f 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
@@ -6,6 +6,8 @@
 #include <cuda.h>
 #include <cuda_runtime.h>
 
+#include "xmrstak/jconf.hpp"
+
 #ifdef _WIN32
 #include <windows.h>
 extern "C" void compat_usleep(uint64_t waitTime)
@@ -106,8 +108,18 @@ __device__ __forceinline__ void storeGlobal32( T* addr, T const & val )
 #endif
 }
 
-template<size_t ITERATIONS, uint32_t THREAD_SHIFT>
-__global__ void cryptonight_core_gpu_phase1( int threads, int bfactor, int partidx, uint32_t * __restrict__ long_state, uint32_t * __restrict__ ctx_state, uint32_t * __restrict__ ctx_key1 )
+template< typename T >
+__device__ __forceinline__ void storeGlobal64( T* addr, T const & val )
+{
+#if (__CUDA_ARCH__ < 700)
+    asm volatile( "st.global.cg.u64 [%0], %1;" : : "l"( addr ), "l"( val ) );
+#else
+    *addr = val;
+#endif
+}
+
+template<size_t ITERATIONS, uint32_t MEMORY>
+__global__ void cryptonight_core_gpu_phase1( int threads, int bfactor, int partidx, uint32_t * __restrict__ long_state, uint32_t * __restrict__ ctx_state2, uint32_t * __restrict__ ctx_key1 )
 {
     __shared__ uint32_t sharedMemory[1024];
 
@@ -117,7 +129,7 @@ __global__ void cryptonight_core_gpu_phase1( int threads, int bfactor, int parti
     const int thread = ( blockDim.x * blockIdx.x + threadIdx.x ) >> 3;
     const int sub = ( threadIdx.x & 7 ) << 2;
 
-    const int batchsize = ITERATIONS >> bfactor;
+    const int batchsize = MEMORY >> bfactor;
     const int start = partidx * batchsize;
     const int end = start + batchsize;
 
@@ -131,18 +143,18 @@ __global__ void cryptonight_core_gpu_phase1( int threads, int bfactor, int parti
     if( partidx == 0 )
     {
         // first round
-        MEMCPY8( text, ctx_state + thread * 50 + sub + 16, 2 );
+        MEMCPY8( text, ctx_state2 + thread * 50 + sub + 16, 2 );
     }
     else
     {
         // load previous text data
-        MEMCPY8( text, &long_state[( (uint64_t) thread << THREAD_SHIFT ) + sub + start - 32], 2 );
+        MEMCPY8( text, &long_state[( (uint64_t) thread * MEMORY ) + sub + start - 32], 2 );
     }
     __syncthreads( );
     for ( int i = start; i < end; i += 32 )
     {
         cn_aes_pseudo_round_mut( sharedMemory, text, key );
-        MEMCPY8(&long_state[((uint64_t) thread << THREAD_SHIFT) + (sub + i)], text, 2);
+        MEMCPY8(&long_state[((uint64_t) thread * MEMORY) + (sub + i)], text, 2);
     }
 }
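The recurring mechanical change in phase1 (and in phase2/phase3 below) replaces `thread << THREAD_SHIFT` with `thread * MEMORY`, where MEMORY is the per-hash scratchpad size in uint32_t words. A small sketch of the equivalence, assuming the classic 2 MiB CryptoNight scratchpad (so MEMORY = 2^19 words and the old shift was 19):

#include <cassert>
#include <cstdint>

int main()
{
    const uint64_t MEMORY = (2u * 1024u * 1024u) / 4; // 2 MiB scratchpad in words
    const uint32_t THREAD_SHIFT = 19;                 // old compile-time constant
    for(uint64_t thread = 0; thread < 4096; ++thread)
        assert((thread << THREAD_SHIFT) == thread * MEMORY);
    return 0;
}

The multiply costs nothing extra on the GPU, and it lets a single template parameter cover cryptonight, cryptonight_lite and cryptonight_heavy without a per-algorithm shift constant.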
@@ -157,33 +169,37 @@ __forceinline__ __device__ void unusedVar( const T& )
  * - this method can be used with all compute architectures
  * - for <sm_30 shared memory is needed
  *
+ * group_n - must be a power of 2!
+ *
  * @param ptr pointer to shared memory, size must be `threadIdx.x * sizeof(uint32_t)`
  *            value can be NULL for compute architecture >=sm_30
- * @param sub thread number within the group, range [0;4)
+ * @param sub thread number within the group, range [0:group_n]
  * @param value value to share with other threads within the group
- * @param src thread number within the group from where the data is read, range [0;4)
+ * @param src thread number within the group from where the data is read, range [0:group_n]
  */
+template<size_t group_n>
 __forceinline__ __device__ uint32_t shuffle(volatile uint32_t* ptr,const uint32_t sub,const int val,const uint32_t src)
 {
 #if( __CUDA_ARCH__ < 300 )
     ptr[sub] = val;
-    return ptr[src&3];
+    return ptr[src & (group_n-1)];
 #else
     unusedVar( ptr );
     unusedVar( sub );
 #   if(__CUDACC_VER_MAJOR__ >= 9)
-    return __shfl_sync(0xFFFFFFFF, val, src, 4 );
+    return __shfl_sync(0xFFFFFFFF, val, src, group_n );
 #   else
-    return __shfl( val, src, 4 );
+    return __shfl( val, src, group_n );
 #   endif
 #endif
 }
 
-template<size_t ITERATIONS, uint32_t THREAD_SHIFT, uint32_t MASK>
+template<size_t ITERATIONS, uint32_t MEMORY, uint32_t MASK, xmrstak_algo ALGO>
 #ifdef XMR_STAK_THREADS
 __launch_bounds__( XMR_STAK_THREADS * 4 )
 #endif
-__global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b )
+__global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b, uint32_t * d_ctx_state,
+    uint32_t startNonce, uint32_t * __restrict__ d_input )
 {
     __shared__ uint32_t sharedMemory[1024];
 
@@ -192,6 +208,7 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti
     __syncthreads( );
 
     const int thread = ( blockDim.x * blockIdx.x + threadIdx.x ) >> 2;
+    const uint32_t nonce = startNonce + thread;
     const int sub = threadIdx.x & 3;
     const int sub2 = sub & 2;
 
@@ -205,30 +222,48 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti
         return;
 
     int i, k;
-    uint32_t j;
+    uint32_t j;
     const int batchsize = (ITERATIONS * 2) >> ( 2 + bfactor );
     const int start = partidx * batchsize;
     const int end = start + batchsize;
-    uint32_t * long_state = &d_long_state[(IndexType) thread << THREAD_SHIFT];
-    uint32_t * ctx_a = d_ctx_a + thread * 4;
-    uint32_t * ctx_b = d_ctx_b + thread * 4;
-    uint32_t a, d[2];
+    uint32_t * long_state = &d_long_state[(IndexType) thread * MEMORY];
+    uint32_t a, d[2], idx0;
     uint32_t t1[2], t2[2], res;
 
-    a = ctx_a[sub];
-    d[1] = ctx_b[sub];
+    uint32_t tweak1_2[2];
+    if (ALGO == cryptonight_monero)
+    {
+        uint32_t * state = d_ctx_state + thread * 50;
+        tweak1_2[0] = (d_input[8] >> 24) | (d_input[9] << 8);
+        tweak1_2[0] ^= state[48];
+        tweak1_2[1] = nonce;
+        tweak1_2[1] ^= state[49];
+    }
+
+    a = (d_ctx_a + thread * 4)[sub];
+    idx0 = shuffle<4>(sPtr,sub, a, 0);
+    if(ALGO == cryptonight_heavy)
+    {
+        if(partidx != 0)
+        {
+            // state is stored after all ctx_b states
+            idx0 = *(d_ctx_b + threads * 4 + thread);
+        }
+    }
+    d[1] = (d_ctx_b + thread * 4)[sub];
+
     #pragma unroll 2
     for ( i = start; i < end; ++i )
     {
         #pragma unroll 2
         for ( int x = 0; x < 2; ++x )
         {
-            j = ( ( shuffle(sPtr,sub, a, 0) & MASK ) >> 2 ) + sub;
+            j = ( ( idx0 & MASK ) >> 2 ) + sub;
 
             const uint32_t x_0 = loadGlobal32<uint32_t>( long_state + j );
-            const uint32_t x_1 = shuffle(sPtr,sub, x_0, sub + 1);
-            const uint32_t x_2 = shuffle(sPtr,sub, x_0, sub + 2);
-            const uint32_t x_3 = shuffle(sPtr,sub, x_0, sub + 3);
+            const uint32_t x_1 = shuffle<4>(sPtr,sub, x_0, sub + 1);
+            const uint32_t x_2 = shuffle<4>(sPtr,sub, x_0, sub + 2);
+            const uint32_t x_3 = shuffle<4>(sPtr,sub, x_0, sub + 3);
             d[x] = a ^
                 t_fn0( x_0 & 0xff ) ^
                 t_fn1( (x_1 >> 8) & 0xff ) ^
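The tweak1_2 words computed at the top of phase2 are the CryptoNight v7 tweak: 8 bytes taken from blob offset 35 (of which bytes 39..42 are the nonce, hence the per-thread `nonce` term) XORed with the last 8 bytes of the 200-byte Keccak state (state words 48 and 49). A host-side sketch of the same derivation, assuming the standard 76-byte hashing blob layout:

#include <cstdint>
#include <cstring>

// keccak_state points at the 200-byte Keccak state produced for this nonce
uint64_t make_tweak1_2(const uint8_t* blob76, uint32_t nonce, const uint8_t* keccak_state)
{
    uint8_t work[76];
    std::memcpy(work, blob76, sizeof(work));
    std::memcpy(work + 39, &nonce, sizeof(nonce));   // nonce sits at blob offset 39

    uint64_t tweak, state_tail;
    std::memcpy(&tweak, work + 35, 8);               // 8 tweak bytes at offset 35
    std::memcpy(&state_tail, keccak_state + 192, 8); // state words 48..49
    return tweak ^ state_tail;
}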
@@ -237,41 +272,74 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti
 
             //XOR_BLOCKS_DST(c, b, &long_state[j]);
-            t1[0] = shuffle(sPtr,sub, d[x], 0);
-            //long_state[j] = d[0] ^ d[1];
-            storeGlobal32( long_state + j, d[0] ^ d[1] );
-
+            t1[0] = shuffle<4>(sPtr,sub, d[x], 0);
+
+            const uint32_t z = d[0] ^ d[1];
+            if(ALGO == cryptonight_monero)
+            {
+                const uint32_t table = 0x75310U;
+                const uint32_t index = ((z >> 26) & 12) | ((z >> 23) & 2);
+                const uint32_t fork_7 = z ^ ((table >> index) & 0x30U) << 24;
+                storeGlobal32( long_state + j, sub == 2 ? fork_7 : z );
+            }
+            else
+                storeGlobal32( long_state + j, z );
+
             //MUL_SUM_XOR_DST(c, a, &long_state[((uint32_t *)c)[0] & MASK]);
             j = ( ( *t1 & MASK ) >> 2 ) + sub;
 
             uint32_t yy[2];
             *( (uint64_t*) yy ) = loadGlobal64<uint64_t>( ( (uint64_t *) long_state )+( j >> 1 ) );
             uint32_t zz[2];
-            zz[0] = shuffle(sPtr,sub, yy[0], 0);
-            zz[1] = shuffle(sPtr,sub, yy[1], 0);
+            zz[0] = shuffle<4>(sPtr,sub, yy[0], 0);
+            zz[1] = shuffle<4>(sPtr,sub, yy[1], 0);
 
-            t1[1] = shuffle(sPtr,sub, d[x], 1);
+            t1[1] = shuffle<4>(sPtr,sub, d[x], 1);
             #pragma unroll
             for ( k = 0; k < 2; k++ )
-                t2[k] = shuffle(sPtr,sub, a, k + sub2);
+                t2[k] = shuffle<4>(sPtr,sub, a, k + sub2);
 
             *( (uint64_t *) t2 ) += sub2 ? ( *( (uint64_t *) t1 ) * *( (uint64_t*) zz ) ) : __umul64hi( *( (uint64_t *) t1 ), *( (uint64_t*) zz ) );
 
             res = *( (uint64_t *) t2 ) >> ( sub & 1 ? 32 : 0 );
 
-            storeGlobal32( long_state + j, res );
+
+            if(ALGO == cryptonight_monero)
+            {
+                const uint32_t tweaked_res = tweak1_2[sub & 1] ^ res;
+                const uint32_t long_state_update = sub2 ? tweaked_res : res;
+                storeGlobal32( long_state + j, long_state_update );
+            }
+            else
+                storeGlobal32( long_state + j, res );
+
             a = ( sub & 1 ? yy[1] : yy[0] ) ^ res;
+            idx0 = shuffle<4>(sPtr,sub, a, 0);
+            if(ALGO == cryptonight_heavy)
+            {
+                int64_t n = loadGlobal64<uint64_t>( ( (uint64_t *) long_state ) + (( idx0 & MASK ) >> 3));
+                int32_t d = loadGlobal32<uint32_t>( (uint32_t*)(( (uint64_t *) long_state ) + (( idx0 & MASK) >> 3) + 1u ));
+                int64_t q = n / (d | 0x5);
+
+                if(sub&1)
+                    storeGlobal64<uint64_t>( ( (uint64_t *) long_state ) + (( idx0 & MASK ) >> 3), n ^ q );
+
+                idx0 = d ^ q;
+            }
         }
     }
 
     if ( bfactor > 0 )
     {
-        ctx_a[sub] = a;
-        ctx_b[sub] = d[1];
+        (d_ctx_a + thread * 4)[sub] = a;
+        (d_ctx_b + thread * 4)[sub] = d[1];
+        if(ALGO == cryptonight_heavy)
+            if(sub&1)
+                *(d_ctx_b + threads * 4 + thread) = idx0;
     }
 }
 
-template<size_t ITERATIONS, uint32_t THREAD_SHIFT>
+template<size_t ITERATIONS, uint32_t MEMORY, xmrstak_algo ALGO>
 __global__ void cryptonight_core_gpu_phase3( int threads, int bfactor, int partidx, const uint32_t * __restrict__ long_state, uint32_t * __restrict__ d_ctx_state, uint32_t * __restrict__ d_ctx_key2 )
 {
     __shared__ uint32_t sharedMemory[1024];
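The `table = 0x75310U` branch above is the v7 one-byte tweak applied to byte 11 of each stored 16-byte block; only the sub == 2 lane carries that byte, hence the `sub == 2 ? fork_7 : z` select. A scalar reference of the same transformation, expressed on the high 64-bit half of a block (a sketch, not the kernel's code path):

#include <cstdint>

uint64_t monero_v7_tweak(uint64_t hi) // hi = bytes 8..15 of the 16-byte block
{
    const uint32_t table = 0x75310U;
    const uint8_t tmp = (hi >> 24) & 0xff;                  // block byte 11
    const uint32_t index = (((tmp >> 3) & 6) | (tmp & 1)) << 1;
    return hi ^ ((uint64_t)((table >> index) & 0x30U) << 24);
}

Only two bits of the stored word can flip, which matches the variant-1 tweak introduced with the Monero v7 fork.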
@@ -280,9 +348,10 @@ __global__ void cryptonight_core_gpu_phase3( int threads, int bfactor, int parti
     __syncthreads( );
 
     int thread = ( blockDim.x * blockIdx.x + threadIdx.x ) >> 3;
-    int sub = ( threadIdx.x & 7 ) << 2;
+    int subv = ( threadIdx.x & 7 );
+    int sub = subv << 2;
 
-    const int batchsize = ITERATIONS >> bfactor;
+    const int batchsize = MEMORY >> bfactor;
     const int start = partidx * batchsize;
     const int end = start + batchsize;
 
@@ -294,20 +363,53 @@
     MEMCPY8( text, d_ctx_state + thread * 50 + sub + 16, 2 );
 
     __syncthreads( );
+
+#if( __CUDA_ARCH__ < 300 )
+    extern __shared__ uint32_t shuffleMem[];
+    volatile uint32_t* sPtr = (volatile uint32_t*)(shuffleMem + (threadIdx.x& 0xFFFFFFFC));
+#else
+    volatile uint32_t* sPtr = NULL;
+#endif
+
     for ( int i = start; i < end; i += 32 )
     {
#pragma unroll
         for ( int j = 0; j < 4; ++j )
-            text[j] ^= long_state[((IndexType) thread << THREAD_SHIFT) + (sub + i + j)];
+            text[j] ^= long_state[((IndexType) thread * MEMORY) + ( sub + i + j)];
 
         cn_aes_pseudo_round_mut( sharedMemory, text, key );
+
+        if(ALGO == cryptonight_heavy)
+        {
+#pragma unroll
+            for ( int j = 0; j < 4; ++j )
+                text[j] ^= shuffle<8>(sPtr, subv, text[j], (subv+1)&7);
+        }
+    }
+
+    if(ALGO == cryptonight_heavy)
+    {
+        __syncthreads( );
+
+        for ( int i = start; i < end; i += 32 )
+        {
+#pragma unroll
+            for ( int j = 0; j < 4; ++j )
+                text[j] ^= long_state[((IndexType) thread * MEMORY) + ( sub + i + j)];
+
+            cn_aes_pseudo_round_mut( sharedMemory, text, key );
+
+#pragma unroll
+            for ( int j = 0; j < 4; ++j )
+                text[j] ^= shuffle<8>(sPtr, subv, text[j], (subv+1)&7);
+        }
     }
 
     MEMCPY8( d_ctx_state + thread * 50 + sub + 16, text, 2 );
 }
 
-template<size_t ITERATIONS, uint32_t MASK, uint32_t THREAD_SHIFT>
-void cryptonight_core_gpu_hash(nvid_ctx* ctx)
+template<size_t ITERATIONS, uint32_t MASK, uint32_t MEMORY, xmrstak_algo ALGO>
+void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce)
 {
     dim3 grid( ctx->device_blocks );
     dim3 block( ctx->device_threads );
@@ -329,9 +431,11 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx)
 
     for ( int i = 0; i < partcountOneThree; i++ )
     {
-        CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_core_gpu_phase1<ITERATIONS,THREAD_SHIFT><<< grid, block8 >>>( ctx->device_blocks*ctx->device_threads,
+        CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_core_gpu_phase1<ITERATIONS,MEMORY><<< grid, block8 >>>( ctx->device_blocks*ctx->device_threads,
             bfactorOneThree, i,
-            ctx->d_long_state, ctx->d_ctx_state, ctx->d_ctx_key1 ));
+            ctx->d_long_state,
+            (ALGO == cryptonight_heavy ? ctx->d_ctx_state2 : ctx->d_ctx_state),
+            ctx->d_ctx_key1 ));
 
         if ( partcount > 1 && ctx->device_bsleep > 0) compat_usleep( ctx->device_bsleep );
     }
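In phase3, cryptonight_heavy inserts a cross-lane mix after every AES round: each of the eight sub-threads XORs its 16-byte text with the text of the next lane, and the whole implode pass runs twice. A scalar model of one such mixing step, with eight lanes of four words each (matching shuffle<8> with source (subv+1)&7):

#include <cstdint>

void heavy_phase3_mix(uint32_t text[8][4])
{
    uint32_t snapshot[8][4];
    for(int lane = 0; lane < 8; ++lane)
        for(int j = 0; j < 4; ++j)
            snapshot[lane][j] = text[lane][j]; // the shuffle reads pre-XOR values

    for(int lane = 0; lane < 8; ++lane)
        for(int j = 0; j < 4; ++j)
            text[lane][j] ^= snapshot[(lane + 1) & 7][j];
}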
@@ -342,7 +446,7 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx)
         CUDA_CHECK_MSG_KERNEL(
             ctx->device_id,
             "\n**suggestion: Try to increase the value of the attribute 'bfactor' or \nreduce 'threads' in the NVIDIA config file.**",
-            cryptonight_core_gpu_phase2<ITERATIONS,THREAD_SHIFT,MASK><<<
+            cryptonight_core_gpu_phase2<ITERATIONS,MEMORY,MASK,ALGO><<<
                 grid,
                 block4,
                 block4.x * sizeof(uint32_t) * static_cast< int >( ctx->device_arch[0] < 3 )
@@ -352,7 +456,10 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx)
                 i,
                 ctx->d_long_state,
                 ctx->d_ctx_a,
-                ctx->d_ctx_b
+                ctx->d_ctx_b,
+                ctx->d_ctx_state,
+                nonce,
+                ctx->d_input
             )
         );
 
@@ -361,25 +468,39 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx)
 
     for ( int i = 0; i < partcountOneThree; i++ )
     {
-        CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_core_gpu_phase3<ITERATIONS,THREAD_SHIFT><<< grid, block8 >>>( ctx->device_blocks*ctx->device_threads,
+        CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_core_gpu_phase3<ITERATIONS,MEMORY, ALGO><<< grid, block8 >>>( ctx->device_blocks*ctx->device_threads,
             bfactorOneThree, i,
             ctx->d_long_state, ctx->d_ctx_state, ctx->d_ctx_key2 ));
     }
 }
 
-void cryptonight_core_cpu_hash(nvid_ctx* ctx, bool mineMonero)
+void cryptonight_core_cpu_hash(nvid_ctx* ctx, xmrstak_algo miner_algo, uint32_t startNonce, uint8_t version)
 {
-#ifndef CONF_NO_MONERO
-    if(mineMonero)
+
+    if(miner_algo == cryptonight_monero)
     {
-        cryptonight_core_gpu_hash<MONERO_ITER, MONERO_MASK, 19u>(ctx);
+        if(version >= 7)
+            cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_monero>(ctx, startNonce);
+        else
+            cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight>(ctx, startNonce);
     }
-#endif
-#ifndef CONF_NO_AEON
-    if(!mineMonero)
+    else if(miner_algo == cryptonight_heavy)
     {
-        cryptonight_core_gpu_hash<AEON_ITER, AEON_MASK, 18u>(ctx);
+        if(version >= 3)
+            cryptonight_core_gpu_hash<CRYPTONIGHT_HEAVY_ITER, CRYPTONIGHT_HEAVY_MASK, CRYPTONIGHT_HEAVY_MEMORY/4, cryptonight_heavy>(ctx, startNonce);
+        else
+        {
+            cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight>(ctx, startNonce);
+        }
     }
-#endif
+    else if(miner_algo == cryptonight)
+    {
+        cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight>(ctx, startNonce);
+    }
+    else if(miner_algo == cryptonight_lite)
+    {
+        cryptonight_core_gpu_hash<CRYPTONIGHT_LITE_ITER, CRYPTONIGHT_LITE_MASK, CRYPTONIGHT_LITE_MEMORY/4, cryptonight_lite>(ctx, startNonce);
+    }
+
 }
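cryptonight_core_cpu_hash above is a runtime-to-compile-time dispatch: each (algorithm, fork state) pair maps to its own template instantiation, so ITERATIONS, MASK and MEMORY stay compile-time constants inside the kernels. A reduced sketch of the pattern with illustrative constants (the real values live in xmrstak/backend/cryptonight.hpp):

#include <cstdint>
#include <cstdio>

enum algo_sketch { e_cn, e_cn_monero, e_cn_lite };

template<uint32_t ITER, uint32_t MASK, uint32_t MEM, algo_sketch ALGO>
void core_hash(uint32_t startNonce)
{
    // a real kernel would use ITER/MASK/MEM as loop bound, index mask and
    // scratchpad stride; here we only show that they arrive as constants
    std::printf("iter=%x mask=%x mem=%x nonce=%u\n", ITER, MASK, MEM, startNonce);
}

void dispatch(algo_sketch a, uint8_t version, uint32_t startNonce)
{
    if(a == e_cn_monero && version >= 7)
        core_hash<0x80000, 0x1FFFF0, 0x200000 / 4, e_cn_monero>(startNonce);
    else if(a == e_cn_lite)
        core_hash<0x40000, 0xFFFF0, 0x100000 / 4, e_cn_lite>(startNonce);
    else
        core_hash<0x80000, 0x1FFFF0, 0x200000 / 4, e_cn>(startNonce);
}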
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
index 92259db..2f08a1a 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
@@ -28,6 +28,7 @@ uint64_t keccakf_rndc[24] ={
 typedef unsigned char BitSequence;
 typedef unsigned long long DataLength;
 
+#include "xmrstak/backend/cryptonight.hpp"
 #include "cryptonight.hpp"
 #include "cuda_extra.hpp"
 #include "cuda_keccak.hpp"
@@ -36,6 +37,7 @@ typedef unsigned long long DataLength;
 #include "cuda_jh.hpp"
 #include "cuda_skein.hpp"
 #include "cuda_device.hpp"
+#include "cuda_aes.hpp"
 
 __constant__ uint8_t d_sub_byte[16][16] ={
     {0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76 },
@@ -90,10 +92,33 @@ __device__ __forceinline__ void cryptonight_aes_set_key( uint32_t * __restrict__
     }
 }
 
-__global__ void cryptonight_extra_gpu_prepare( int threads, uint32_t * __restrict__ d_input, uint32_t len, uint32_t startNonce, uint32_t * __restrict__ d_ctx_state, uint32_t * __restrict__ d_ctx_a, uint32_t * __restrict__ d_ctx_b, uint32_t * __restrict__ d_ctx_key1, uint32_t * __restrict__ d_ctx_key2 )
+__device__ __forceinline__ void mix_and_propagate( uint32_t* state )
+{
+    uint32_t tmp0[4];
+    for(size_t x = 0; x < 4; ++x)
+        tmp0[x] = (state)[x];
+
+    // set destination [0,6]
+    for(size_t t = 0; t < 7; ++t)
+        for(size_t x = 0; x < 4; ++x)
+            (state + 4 * t)[x] = (state + 4 * t)[x] ^ (state + 4 * (t + 1))[x];
+
+    // set destination 7
+    for(size_t x = 0; x < 4; ++x)
+        (state + 4 * 7)[x] = (state + 4 * 7)[x] ^ tmp0[x];
+}
+
+template<xmrstak_algo ALGO>
+__global__ void cryptonight_extra_gpu_prepare( int threads, uint32_t * __restrict__ d_input, uint32_t len, uint32_t startNonce, uint32_t * __restrict__ d_ctx_state, uint32_t * __restrict__ d_ctx_state2, uint32_t * __restrict__ d_ctx_a, uint32_t * __restrict__ d_ctx_b, uint32_t * __restrict__ d_ctx_key1, uint32_t * __restrict__ d_ctx_key2 )
 {
     int thread = ( blockDim.x * blockIdx.x + threadIdx.x );
 
+    __shared__ uint32_t sharedMemory[1024];
+    if(ALGO == cryptonight_heavy)
+    {
+        cn_aes_gpu_init( sharedMemory );
+        __syncthreads( );
+    }
+
     if ( thread >= threads )
         return;
 
@@ -113,20 +138,45 @@ __global__ void cryptonight_extra_gpu_prepare( int threads, uint32_t * __restric
     cn_keccak( (uint8_t *) input, len, (uint8_t *) ctx_state );
     cryptonight_aes_set_key( ctx_key1, ctx_state );
     cryptonight_aes_set_key( ctx_key2, ctx_state + 8 );
+
     XOR_BLOCKS_DST( ctx_state, ctx_state + 8, ctx_a );
     XOR_BLOCKS_DST( ctx_state + 4, ctx_state + 12, ctx_b );
-
-    memcpy( d_ctx_state + thread * 50, ctx_state, 50 * 4 );
     memcpy( d_ctx_a + thread * 4, ctx_a, 4 * 4 );
     memcpy( d_ctx_b + thread * 4, ctx_b, 4 * 4 );
+
     memcpy( d_ctx_key1 + thread * 40, ctx_key1, 40 * 4 );
     memcpy( d_ctx_key2 + thread * 40, ctx_key2, 40 * 4 );
+    memcpy( d_ctx_state + thread * 50, ctx_state, 50 * 4 );
+
+    if(ALGO == cryptonight_heavy)
+    {
+
+        for(int i=0; i < 16; i++)
+        {
+            for(size_t t = 4; t < 12; ++t)
+            {
+                cn_aes_pseudo_round_mut( sharedMemory, ctx_state + 4u * t, ctx_key1 );
+            }
+            // scipt first 4 * 128bit blocks = 4 * 4 uint32_t values
+            mix_and_propagate(ctx_state + 4 * 4);
+        }
+        // double buffer to move manipulated state into phase1
+        memcpy( d_ctx_state2 + thread * 50, ctx_state, 50 * 4 );
+    }
 }
 
-__global__ void cryptonight_extra_gpu_final( int threads, uint64_t target, uint32_t* __restrict__ d_res_count, uint32_t * __restrict__ d_res_nonce, uint32_t * __restrict__ d_ctx_state )
+template<xmrstak_algo ALGO>
+__global__ void cryptonight_extra_gpu_final( int threads, uint64_t target, uint32_t* __restrict__ d_res_count, uint32_t * __restrict__ d_res_nonce, uint32_t * __restrict__ d_ctx_state,uint32_t * __restrict__ d_ctx_key2 )
 {
     const int thread = blockDim.x * blockIdx.x + threadIdx.x;
+    __shared__ uint32_t sharedMemory[1024];
+
+    if(ALGO == cryptonight_heavy)
+    {
+        cn_aes_gpu_init( sharedMemory );
+        __syncthreads( );
+    }
 
     if ( thread >= threads )
         return;
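mix_and_propagate is the cross-block diffusion step of the cryptonight_heavy state setup: the first four 16-byte blocks of the 200-byte state are skipped, the remaining eight are repeatedly AES-rounded, and after each round every block is XORed with its cyclic right neighbour. A host-side reference of the mixing itself, equivalent to the device function above (eight blocks of four uint32_t):

#include <cstdint>
#include <cstring>

void mix_and_propagate_ref(uint32_t state[8 * 4])
{
    uint32_t first[4];
    std::memcpy(first, state, sizeof(first));   // block 0, pre-mix

    for(int t = 0; t < 7; ++t)                  // blocks 0..6 ^= following block
        for(int x = 0; x < 4; ++x)
            state[4 * t + x] ^= state[4 * (t + 1) + x];

    for(int x = 0; x < 4; ++x)                  // block 7 ^= saved block 0
        state[4 * 7 + x] ^= first[x];
}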
@@ -134,11 +184,28 @@
     uint32_t * __restrict__ ctx_state = d_ctx_state + thread * 50;
     uint64_t hash[4];
     uint32_t state[50];
-
-#pragma unroll
+
+    #pragma unroll
     for ( i = 0; i < 50; i++ )
         state[i] = ctx_state[i];
 
+    if(ALGO == cryptonight_heavy)
+    {
+        uint32_t key[40];
+
+        // load keys
+        MEMCPY8( key, d_ctx_key2 + thread * 40, 20 );
+
+        for(int i=0; i < 16; i++)
+        {
+            for(size_t t = 4; t < 12; ++t)
+            {
+                cn_aes_pseudo_round_mut( sharedMemory, state + 4u * t, key );
+            }
+            // scipt first 4 * 128bit blocks = 4 * 4 uint32_t values
+            mix_and_propagate(state + 4 * 4);
+        }
+    }
+
     cn_keccakf2( (uint64_t *) state );
 
     switch ( ( (uint8_t *) state )[0] & 0x03 )
@@ -212,23 +279,26 @@ extern "C" int cryptonight_extra_cpu_init(nvid_ctx* ctx)
     if(gpuArch < 70)
         CUDA_CHECK(ctx->device_id, cudaDeviceSetCacheConfig(cudaFuncCachePreferL1));
 
-    size_t hashMemSize;
-    if(::jconf::inst()->IsCurrencyMonero())
-    {
-        hashMemSize = MONERO_MEMORY;
-    }
-    else
-    {
-        hashMemSize = AEON_MEMORY;
-    }
+    size_t hashMemSize = cn_select_memory(::jconf::inst()->GetMiningAlgo());
 
     size_t wsize = ctx->device_blocks * ctx->device_threads;
     CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_ctx_state, 50 * sizeof(uint32_t) * wsize));
+    size_t ctx_b_size = 4 * sizeof(uint32_t) * wsize;
+    if(cryptonight_heavy == ::jconf::inst()->GetMiningAlgo())
+    {
+        // extent ctx_b to hold the state of idx0
+        ctx_b_size += sizeof(uint32_t) * wsize;
+        // create a double buffer for the state to exchange the mixed state to phase1
+        CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_ctx_state2, 50 * sizeof(uint32_t) * wsize));
+    }
+    else
+        ctx->d_ctx_state2 = ctx->d_ctx_state;
+
     CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_ctx_key1, 40 * sizeof(uint32_t) * wsize));
     CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_ctx_key2, 40 * sizeof(uint32_t) * wsize));
     CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_ctx_text, 32 * sizeof(uint32_t) * wsize));
     CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_ctx_a, 4 * sizeof(uint32_t) * wsize));
-    CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_ctx_b, 4 * sizeof(uint32_t) * wsize));
+    CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_ctx_b, ctx_b_size));
     CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_input, 21 * sizeof (uint32_t ) ));
     CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_result_count, sizeof (uint32_t ) ));
     CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_result_nonce, 10 * sizeof (uint32_t ) ));
@@ -239,7 +309,7 @@ extern "C" int cryptonight_extra_cpu_init(nvid_ctx* ctx)
     return 1;
 }
 
-extern "C" void cryptonight_extra_cpu_prepare(nvid_ctx* ctx, uint32_t startNonce)
+extern "C" void cryptonight_extra_cpu_prepare(nvid_ctx* ctx, uint32_t startNonce, xmrstak_algo miner_algo, uint8_t version)
 {
     int threadsperblock = 128;
     uint32_t wsize = ctx->device_blocks * ctx->device_threads;
@@ -247,11 +317,22 @@
     dim3 grid( ( wsize + threadsperblock - 1 ) / threadsperblock );
     dim3 block( threadsperblock );
 
-    CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<<<grid, block >>>( wsize, ctx->d_input, ctx->inputlen, startNonce,
-        ctx->d_ctx_state, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 ));
+    if(miner_algo == cryptonight_heavy && version >= 3)
+    {
+        CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<cryptonight_heavy><<<grid, block >>>( wsize, ctx->d_input, ctx->inputlen, startNonce,
+            ctx->d_ctx_state,ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 ));
+    }
+    else
+    {
+        /* pass two times d_ctx_state because the second state is used later in phase1,
+         * the first is used than in phase3
+         */
+        CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<invalid_algo><<<grid, block >>>( wsize, ctx->d_input, ctx->inputlen, startNonce,
+            ctx->d_ctx_state, ctx->d_ctx_state, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 ));
+    }
 }
 
-extern "C" void cryptonight_extra_cpu_final(nvid_ctx* ctx, uint32_t startNonce, uint64_t target, uint32_t* rescount, uint32_t *resnonce)
+extern "C" void cryptonight_extra_cpu_final(nvid_ctx* ctx, uint32_t startNonce, uint64_t target, uint32_t* rescount, uint32_t *resnonce,xmrstak_algo miner_algo, uint8_t version)
 {
     int threadsperblock = 128;
     uint32_t wsize = ctx->device_blocks * ctx->device_threads;
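The allocation changes in cryptonight_extra_cpu_init come down to two extra buffers for cryptonight_heavy: ctx_b grows by one uint32_t per hash to persist idx0 across bfactor-split kernel launches, and a second 200-byte state copy (d_ctx_state2) carries the mixed state into phase1. A sketch of the arithmetic, with wsize = device_blocks * device_threads:

#include <cstddef>
#include <cstdint>

struct heavy_sizes
{
    size_t ctx_b_bytes;  // a/b block per hash, plus stored idx0 for heavy
    size_t state2_bytes; // double buffer for the mixed state, heavy only
};

heavy_sizes device_buffer_sizes(size_t wsize, bool is_heavy)
{
    heavy_sizes s;
    s.ctx_b_bytes = 4 * sizeof(uint32_t) * wsize;
    if(is_heavy)
        s.ctx_b_bytes += sizeof(uint32_t) * wsize;   // one idx0 per hash
    s.state2_bytes = is_heavy ? 50 * sizeof(uint32_t) * wsize : 0;
    return s;
}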
@@ -262,11 +343,23 @@ extern "C" void cryptonight_extra_cpu_final(nvid_ctx* ctx, uint32_t startNonce,
     CUDA_CHECK(ctx->device_id, cudaMemset( ctx->d_result_nonce, 0xFF, 10 * sizeof (uint32_t ) ));
     CUDA_CHECK(ctx->device_id, cudaMemset( ctx->d_result_count, 0, sizeof (uint32_t ) ));
 
-    CUDA_CHECK_MSG_KERNEL(
-        ctx->device_id,
-        "\n**suggestion: Try to increase the value of the attribute 'bfactor' in the NVIDIA config file.**",
-        cryptonight_extra_gpu_final<<<grid, block >>>( wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state )
-    );
+    if(miner_algo == cryptonight_heavy && version >= 3)
+    {
+        CUDA_CHECK_MSG_KERNEL(
+            ctx->device_id,
+            "\n**suggestion: Try to increase the value of the attribute 'bfactor' in the NVIDIA config file.**",
+            cryptonight_extra_gpu_final<cryptonight_heavy><<<grid, block >>>( wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state,ctx->d_ctx_key2 )
+        );
+    }
+    else
+    {
+        // fallback for all other algorithms
+        CUDA_CHECK_MSG_KERNEL(
+            ctx->device_id,
+            "\n**suggestion: Try to increase the value of the attribute 'bfactor' in the NVIDIA config file.**",
+            cryptonight_extra_gpu_final<invalid_algo><<<grid, block >>>( wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state,ctx->d_ctx_key2 )
+        );
+    }
 
     CUDA_CHECK(ctx->device_id, cudaMemcpy( rescount, ctx->d_result_count, sizeof (uint32_t ), cudaMemcpyDeviceToHost ));
     CUDA_CHECK(ctx->device_id, cudaMemcpy( resnonce, ctx->d_result_nonce, 10 * sizeof (uint32_t ), cudaMemcpyDeviceToHost ));
@@ -482,15 +575,7 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
     ctx->total_device_memory = totalMemory;
     ctx->free_device_memory = freeMemory;
 
-    size_t hashMemSize;
-    if(::jconf::inst()->IsCurrencyMonero())
-    {
-        hashMemSize = MONERO_MEMORY;
-    }
-    else
-    {
-        hashMemSize = AEON_MEMORY;
-    }
+    size_t hashMemSize = cn_select_memory(::jconf::inst()->GetMiningAlgo());
 
 #ifdef WIN32
     /* We use in windows bfactor (split slow kernel into smaller parts) to avoid
@@ -520,6 +605,9 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
         // up to 16kibyte extra memory is used per thread for some kernel (lmem/local memory)
         // 680bytes are extra meta data memory per hash
         size_t perThread = hashMemSize + 16192u + 680u;
+        if(cryptonight_heavy == ::jconf::inst()->GetMiningAlgo())
+            perThread += 50 * 4; // state double buffer
+
         size_t max_intensity = limitedMemory / perThread;
         ctx->device_threads = max_intensity / ctx->device_blocks;
         // use only odd number of threads
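Finally, the auto-tuning in cuda_get_deviceinfo estimates how many hashes fit into the remaining device memory: per hash it budgets the scratchpad plus a fixed 16192-byte local-memory and 680-byte metadata overhead, plus the 200-byte state double buffer for cryptonight_heavy. The arithmetic in isolation (limitedMemory stands for the usable free memory after reservations):

#include <cstddef>

size_t suggest_device_threads(size_t limitedMemory, size_t hashMemSize,
                              size_t device_blocks, bool is_heavy)
{
    size_t perThread = hashMemSize + 16192u + 680u; // scratchpad + lmem + meta
    if(is_heavy)
        perThread += 50 * 4;                        // state double buffer

    const size_t max_intensity = limitedMemory / perThread;
    return max_intensity / device_blocks;           // threads per block
}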