#include "xmrstak/backend/cryptonight.hpp" #include #include #include #include #include #include "xmrstak/jconf.hpp" #ifdef _WIN32 #include extern "C" void compat_usleep(uint64_t waitTime) { if (waitTime > 0) { if (waitTime > 100) { // use a waitable timer for larger intervals > 0.1ms HANDLE timer; LARGE_INTEGER ft; ft.QuadPart = -10ll * int64_t(waitTime); // Convert to 100 nanosecond interval, negative value indicates relative time timer = CreateWaitableTimer(NULL, TRUE, NULL); SetWaitableTimer(timer, &ft, 0, NULL, NULL, 0); WaitForSingleObject(timer, INFINITE); CloseHandle(timer); } else { // use a polling loop for short intervals <= 100ms LARGE_INTEGER perfCnt, start, now; __int64 elapsed; QueryPerformanceFrequency(&perfCnt); QueryPerformanceCounter(&start); do { SwitchToThread(); QueryPerformanceCounter((LARGE_INTEGER*) &now); elapsed = (__int64)((now.QuadPart - start.QuadPart) / (float)perfCnt.QuadPart * 1000 * 1000); } while ( elapsed < waitTime ); } } } #else #include extern "C" void compat_usleep(uint64_t waitTime) { usleep(waitTime); } #endif #include "cryptonight.hpp" #include "cuda_extra.hpp" #include "cuda_aes.hpp" #include "cuda_device.hpp" /* sm_2X is limited to 2GB due to the small TLB * therefore we never use 64bit indices */ #if defined(XMR_STAK_LARGEGRID) && (__CUDA_ARCH__ >= 300) typedef uint64_t IndexType; #else typedef int IndexType; #endif __device__ __forceinline__ uint64_t cuda_mul128( uint64_t multiplier, uint64_t multiplicand, uint64_t* product_hi ) { *product_hi = __umul64hi( multiplier, multiplicand ); return (multiplier * multiplicand ); } template< typename T > __device__ __forceinline__ T loadGlobal64( T * const addr ) { #if (__CUDA_ARCH__ < 700) T x; asm volatile( "ld.global.cg.u64 %0, [%1];" : "=l"( x ) : "l"( addr ) ); return x; #else return *addr; #endif } template< typename T > __device__ __forceinline__ T loadGlobal32( T * const addr ) { #if (__CUDA_ARCH__ < 700) T x; asm volatile( "ld.global.cg.u32 %0, [%1];" : "=r"( x ) : "l"( addr ) ); return x; #else return *addr; #endif } template< typename T > __device__ __forceinline__ void storeGlobal32( T* addr, T const & val ) { #if (__CUDA_ARCH__ < 700) asm volatile( "st.global.cg.u32 [%0], %1;" : : "l"( addr ), "r"( val ) ); #else *addr = val; #endif } template< typename T > __device__ __forceinline__ void storeGlobal64( T* addr, T const & val ) { #if (__CUDA_ARCH__ < 700) asm volatile( "st.global.cg.u64 [%0], %1;" : : "l"( addr ), "l"( val ) ); #else *addr = val; #endif } template __global__ void cryptonight_core_gpu_phase1( int threads, int bfactor, int partidx, uint32_t * __restrict__ long_state, uint32_t * __restrict__ ctx_state2, uint32_t * __restrict__ ctx_key1 ) { __shared__ uint32_t sharedMemory[1024]; cn_aes_gpu_init( sharedMemory ); __syncthreads( ); const int thread = ( blockDim.x * blockIdx.x + threadIdx.x ) >> 3; const int sub = ( threadIdx.x & 7 ) << 2; const int batchsize = MEMORY >> bfactor; const int start = partidx * batchsize; const int end = start + batchsize; if ( thread >= threads ) return; uint32_t key[40], text[4]; MEMCPY8( key, ctx_key1 + thread * 40, 20 ); if( partidx == 0 ) { // first round MEMCPY8( text, ctx_state2 + thread * 50 + sub + 16, 2 ); } else { // load previous text data MEMCPY8( text, &long_state[( (uint64_t) thread * MEMORY ) + sub + start - 32], 2 ); } __syncthreads( ); for ( int i = start; i < end; i += 32 ) { cn_aes_pseudo_round_mut( sharedMemory, text, key ); MEMCPY8(&long_state[((uint64_t) thread * MEMORY) + (sub + i)], text, 2); } } /** avoid warning `unused 
/** avoid warning `unused parameter` */
template< typename T >
__forceinline__ __device__ void unusedVar( const T& )
{
}

/** shuffle data within a thread group
 *
 * - this method can be used with all compute architectures
 * - for architectures < sm_30 the data is exchanged via shared memory instead of the warp shuffle instruction
 *
 * @param ptr pointer to shared memory, only used for architectures < sm_30
 * @param sub thread number within the group, range [0;group_n]
 * @param val value to share with other threads within the group
 * @param src thread number within the group from where the data is read, range [0;group_n]
 */
template<size_t group_n>
__forceinline__ __device__ uint32_t shuffle(volatile uint32_t* ptr, const uint32_t sub, const int val, const uint32_t src)
{
#if( __CUDA_ARCH__ < 300 )
	ptr[sub] = val;
	return ptr[src & (group_n-1)];
#else
	unusedVar( ptr );
	unusedVar( sub );
#	if(__CUDACC_VER_MAJOR__ >= 9)
	return __shfl_sync(0xFFFFFFFF, val, src, group_n );
#	else
	return __shfl( val, src, group_n );
#	endif
#endif
}

template<size_t ITERATIONS, uint32_t MASK, uint32_t MEMORY, xmrstak_algo ALGO>
#ifdef XMR_STAK_THREADS
__launch_bounds__( XMR_STAK_THREADS * 4 )
#endif
__global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b, uint32_t * d_ctx_state,
		uint32_t startNonce, uint32_t * __restrict__ d_input )
{
	__shared__ uint32_t sharedMemory[1024];

	cn_aes_gpu_init( sharedMemory );

	__syncthreads( );

	const int thread = ( blockDim.x * blockIdx.x + threadIdx.x ) >> 2;
	const uint32_t nonce = startNonce + thread;
	const int sub = threadIdx.x & 3;
	const int sub2 = sub & 2;

#if( __CUDA_ARCH__ < 300 )
	extern __shared__ uint32_t shuffleMem[];
	volatile uint32_t* sPtr = (volatile uint32_t*)(shuffleMem + (threadIdx.x & 0xFFFFFFFC));
#else
	volatile uint32_t* sPtr = NULL;
#endif
	if ( thread >= threads )
		return;

	int i, k;
	uint32_t j;
	const int batchsize = (ITERATIONS * 2) >> ( 2 + bfactor );
	const int start = partidx * batchsize;
	const int end = start + batchsize;
	uint32_t * long_state = &d_long_state[(IndexType) thread * MEMORY];
	uint32_t a, d[2], idx0;
	uint32_t t1[2], t2[2], res;

	uint32_t tweak1_2[2];
	if (ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite)
	{
		uint32_t * state = d_ctx_state + thread * 50;
		tweak1_2[0] = (d_input[8] >> 24) | (d_input[9] << 8);
		tweak1_2[0] ^= state[48];
		tweak1_2[1] = nonce;
		tweak1_2[1] ^= state[49];
	}

	a = (d_ctx_a + thread * 4)[sub];
	idx0 = shuffle<4>(sPtr, sub, a, 0);
	if(ALGO == cryptonight_heavy)
	{
		if(partidx != 0)
		{
			// state is stored after all ctx_b states
			idx0 = *(d_ctx_b + threads * 4 + thread);
		}
	}
	d[1] = (d_ctx_b + thread * 4)[sub];

	#pragma unroll 2
	for ( i = start; i < end; ++i )
	{
		#pragma unroll 2
		for ( int x = 0; x < 2; ++x )
		{
			j = ( ( idx0 & MASK ) >> 2 ) + sub;

			const uint32_t x_0 = loadGlobal32( long_state + j );
			const uint32_t x_1 = shuffle<4>(sPtr, sub, x_0, sub + 1);
			const uint32_t x_2 = shuffle<4>(sPtr, sub, x_0, sub + 2);
			const uint32_t x_3 = shuffle<4>(sPtr, sub, x_0, sub + 3);
			d[x] = a ^
				t_fn0( x_0 & 0xff ) ^
				t_fn1( (x_1 >> 8) & 0xff ) ^
				t_fn2( (x_2 >> 16) & 0xff ) ^
				t_fn3( ( x_3 >> 24 ) );

			//XOR_BLOCKS_DST(c, b, &long_state[j]);
			t1[0] = shuffle<4>(sPtr, sub, d[x], 0);

			const uint32_t z = d[0] ^ d[1];
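			/* For the tweaked forks (cryptonight_monero, _aeon, _ipbc, _stellite) the
			 * block is not stored verbatim: three high bits of z pick a nibble from
			 * the constant table 0x75310, and that nibble, masked with 0x30 and
			 * shifted into the high byte, is XORed into the word held by the thread
			 * with sub == 2, i.e. into byte 11 of the 16-byte block. Stellite only
			 * differs in which bits of z select the table index.
			 */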
			if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite)
			{
				const uint32_t table = 0x75310U;
				if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc)
				{
					const uint32_t index = ((z >> 26) & 12) | ((z >> 23) & 2);
					const uint32_t fork_7 = z ^ ((table >> index) & 0x30U) << 24;
					storeGlobal32( long_state + j, sub == 2 ? fork_7 : z );
				}
				else if(ALGO == cryptonight_stellite)
				{
					const uint32_t index = ((z >> 27) & 12) | ((z >> 23) & 2);
					const uint32_t fork_7 = z ^ ((table >> index) & 0x30U) << 24;
					storeGlobal32( long_state + j, sub == 2 ? fork_7 : z );
				}
			}
			else
				storeGlobal32( long_state + j, z );

			//MUL_SUM_XOR_DST(c, a, &long_state[((uint32_t *)c)[0] & MASK]);
			j = ( ( *t1 & MASK ) >> 2 ) + sub;

			uint32_t yy[2];
			*( (uint64_t*) yy ) = loadGlobal64( ( (uint64_t *) long_state ) + ( j >> 1 ) );
			uint32_t zz[2];
			zz[0] = shuffle<4>(sPtr, sub, yy[0], 0);
			zz[1] = shuffle<4>(sPtr, sub, yy[1], 0);

			t1[1] = shuffle<4>(sPtr, sub, d[x], 1);
			#pragma unroll
			for ( k = 0; k < 2; k++ )
				t2[k] = shuffle<4>(sPtr, sub, a, k + sub2);

			*( (uint64_t *) t2 ) += sub2 ? ( *( (uint64_t *) t1 ) * *( (uint64_t*) zz ) ) : __umul64hi( *( (uint64_t *) t1 ), *( (uint64_t*) zz ) );

			res = *( (uint64_t *) t2 ) >> ( sub & 1 ? 32 : 0 );

			if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite)
			{
				const uint32_t tweaked_res = tweak1_2[sub & 1] ^ res;
				uint32_t long_state_update = sub2 ? tweaked_res : res;

				if (ALGO == cryptonight_ipbc)
				{
					uint32_t value = shuffle<4>(sPtr, sub, long_state_update, sub & 1) ^ long_state_update;
					long_state_update = sub >= 2 ? value : long_state_update;
				}

				storeGlobal32( long_state + j, long_state_update );
			}
			else
				storeGlobal32( long_state + j, res );

			a = ( sub & 1 ? yy[1] : yy[0] ) ^ res;
			idx0 = shuffle<4>(sPtr, sub, a, 0);
			if(ALGO == cryptonight_heavy)
			{
				int64_t n = loadGlobal64<uint64_t>( ( (uint64_t *) long_state ) + (( idx0 & MASK ) >> 3));
				int32_t d = loadGlobal32<uint32_t>( (uint32_t*)(( (uint64_t *) long_state ) + (( idx0 & MASK) >> 3) + 1u ));
				int64_t q = n / (d | 0x5);

				if(sub & 1)
					storeGlobal64<uint64_t>( ( (uint64_t *) long_state ) + (( idx0 & MASK ) >> 3), n ^ q );

				idx0 = d ^ q;
			}
		}
	}

	if ( bfactor > 0 )
	{
		(d_ctx_a + thread * 4)[sub] = a;
		(d_ctx_b + thread * 4)[sub] = d[1];
		if(ALGO == cryptonight_heavy)
			if(sub & 1)
				*(d_ctx_b + threads * 4 + thread) = idx0;
	}
}

template<size_t ITERATIONS, uint32_t MEMORY, xmrstak_algo ALGO>
__global__ void cryptonight_core_gpu_phase3( int threads, int bfactor, int partidx, const uint32_t * __restrict__ long_state, uint32_t * __restrict__ d_ctx_state, uint32_t * __restrict__ d_ctx_key2 )
{
	__shared__ uint32_t sharedMemory[1024];

	cn_aes_gpu_init( sharedMemory );
	__syncthreads( );

	int thread = ( blockDim.x * blockIdx.x + threadIdx.x ) >> 3;
	int subv = ( threadIdx.x & 7 );
	int sub = subv << 2;

	const int batchsize = MEMORY >> bfactor;
	const int start = (partidx % (1 << bfactor)) * batchsize;
	const int end = start + batchsize;

	if ( thread >= threads )
		return;

	uint32_t key[40], text[4];
	MEMCPY8( key, d_ctx_key2 + thread * 40, 20 );
	MEMCPY8( text, d_ctx_state + thread * 50 + sub + 16, 2 );

	__syncthreads( );

#if( __CUDA_ARCH__ < 300 )
	extern __shared__ uint32_t shuffleMem[];
	volatile uint32_t* sPtr = (volatile uint32_t*)(shuffleMem + (threadIdx.x & 0xFFFFFFF8));
#else
	volatile uint32_t* sPtr = NULL;
#endif

	for ( int i = start; i < end; i += 32 )
	{
		#pragma unroll
		for ( int j = 0; j < 4; ++j )
			text[j] ^= long_state[((IndexType) thread * MEMORY) + ( sub + i + j)];

		cn_aes_pseudo_round_mut( sharedMemory, text, key );

		if(ALGO == cryptonight_heavy)
		{
			#pragma unroll
			for ( int j = 0; j < 4; ++j )
				text[j] ^= shuffle<8>(sPtr, subv, text[j], (subv+1) & 7);
		}
	}

	MEMCPY8( d_ctx_state + thread * 50 + sub + 16, text, 2 );
}

template<size_t ITERATIONS, uint32_t MASK, uint32_t MEMORY, xmrstak_algo ALGO>
void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce)
{
	dim3 grid( ctx->device_blocks );
	dim3 block( ctx->device_threads );
	dim3 block4( ctx->device_threads << 2 );
	dim3 block8( ctx->device_threads << 3 );
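	/* Thread grouping: phase 2 runs four CUDA threads per hash (block4), while
	 * phases 1 and 3 run eight threads per hash (block8). With, for example,
	 * device_threads = 32, a phase-2 block launches 128 CUDA threads and a
	 * phase-1/3 block launches 256, all working on the same 32 hashes.
	 */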
	int partcount = 1 << ctx->device_bfactor;

	/* bfactor for phase 1 and 3
	 *
	 * phases 1 and 3 consume less time than phase 2, therefore we only begin
	 * splitting those kernels if the user defined a `bfactor >= 5`
	 */
	int bfactorOneThree = ctx->device_bfactor - 4;
	if( bfactorOneThree < 0 )
		bfactorOneThree = 0;

	int partcountOneThree = 1 << bfactorOneThree;

	for ( int i = 0; i < partcountOneThree; i++ )
	{
		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_core_gpu_phase1<ITERATIONS,MEMORY><<< grid, block8 >>>( ctx->device_blocks*ctx->device_threads,
			bfactorOneThree, i,
			ctx->d_long_state,
			(ALGO == cryptonight_heavy ? ctx->d_ctx_state2 : ctx->d_ctx_state),
			ctx->d_ctx_key1 ));

		if ( partcount > 1 && ctx->device_bsleep > 0) compat_usleep( ctx->device_bsleep );
	}
	if ( partcount > 1 && ctx->device_bsleep > 0) compat_usleep( ctx->device_bsleep );

	for ( int i = 0; i < partcount; i++ )
	{
		CUDA_CHECK_MSG_KERNEL(
			ctx->device_id,
			"\n**suggestion: Try to increase the value of the attribute 'bfactor' or \nreduce 'threads' in the NVIDIA config file.**",
			cryptonight_core_gpu_phase2<ITERATIONS,MASK,MEMORY,ALGO><<<
				grid,
				block4,
				block4.x * sizeof(uint32_t) * static_cast< int >( ctx->device_arch[0] < 3 )
			>>>(
				ctx->device_blocks*ctx->device_threads,
				ctx->device_bfactor,
				i,
				ctx->d_long_state,
				ctx->d_ctx_a,
				ctx->d_ctx_b,
				ctx->d_ctx_state,
				nonce,
				ctx->d_input
			)
		);

		if ( partcount > 1 && ctx->device_bsleep > 0) compat_usleep( ctx->device_bsleep );
	}

	int roundsPhase3 = partcountOneThree;

	if(ALGO == cryptonight_heavy)
	{
		// cryptonight_heavy does two full rounds over the scratchpad memory
		roundsPhase3 *= 2;
	}

	for ( int i = 0; i < roundsPhase3; i++ )
	{
		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_core_gpu_phase3<ITERATIONS,MEMORY,ALGO><<< grid, block8, block8.x * sizeof(uint32_t) * static_cast< int >( ctx->device_arch[0] < 3 ) >>>(
			ctx->device_blocks*ctx->device_threads,
			bfactorOneThree, i,
			ctx->d_long_state,
			ctx->d_ctx_state, ctx->d_ctx_key2 ));
	}
}

void cryptonight_core_cpu_hash(nvid_ctx* ctx, xmrstak_algo miner_algo, uint32_t startNonce)
{
	if(miner_algo == cryptonight_monero)
	{
		cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_monero>(ctx, startNonce);
	}
	else if(miner_algo == cryptonight_heavy)
	{
		cryptonight_core_gpu_hash<CRYPTONIGHT_HEAVY_ITER, CRYPTONIGHT_HEAVY_MASK, CRYPTONIGHT_HEAVY_MEMORY/4, cryptonight_heavy>(ctx, startNonce);
	}
	else if(miner_algo == cryptonight)
	{
		cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight>(ctx, startNonce);
	}
	else if(miner_algo == cryptonight_lite)
	{
		cryptonight_core_gpu_hash<CRYPTONIGHT_LITE_ITER, CRYPTONIGHT_LITE_MASK, CRYPTONIGHT_LITE_MEMORY/4, cryptonight_lite>(ctx, startNonce);
	}
	else if(miner_algo == cryptonight_aeon)
	{
		cryptonight_core_gpu_hash<CRYPTONIGHT_LITE_ITER, CRYPTONIGHT_LITE_MASK, CRYPTONIGHT_LITE_MEMORY/4, cryptonight_aeon>(ctx, startNonce);
	}
	else if(miner_algo == cryptonight_ipbc)
	{
		cryptonight_core_gpu_hash<CRYPTONIGHT_LITE_ITER, CRYPTONIGHT_LITE_MASK, CRYPTONIGHT_LITE_MEMORY/4, cryptonight_ipbc>(ctx, startNonce);
	}
	else if(miner_algo == cryptonight_stellite)
	{
		cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_stellite>(ctx, startNonce);
	}
}
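/* Note on the dispatch above: the template arguments are the per-algorithm
 * iteration count, scratchpad address mask and scratchpad size in 32-bit words.
 * The CRYPTONIGHT_* macros are assumed to be provided by
 * "xmrstak/backend/cryptonight.hpp"; with the usual definitions they correspond
 * to 0x80000 iterations / mask 0x1FFFF0 / 2 MiB for CryptoNight and its Monero
 * and Stellite variants, 0x40000 / 0xFFFF0 / 1 MiB for the Lite family (Aeon,
 * IPBC), and 0x40000 / 0x3FFFF0 / 4 MiB for CryptoNight-Heavy.
 */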