diff options
Diffstat (limited to 'xmrstak')
-rw-r--r-- | xmrstak/backend/nvidia/nvcc_code/cuda_core.cu | 46 | ||||
-rw-r--r-- | xmrstak/version.cpp | 2 |
2 files changed, 21 insertions, 27 deletions
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu index 7aa44e8..43740d2 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu @@ -352,7 +352,7 @@ __global__ void cryptonight_core_gpu_phase3( int threads, int bfactor, int parti int sub = subv << 2; const int batchsize = MEMORY >> bfactor; - const int start = partidx * batchsize; + const int start = (partidx % (1 << bfactor)) * batchsize; const int end = start + batchsize; if ( thread >= threads ) @@ -365,15 +365,15 @@ __global__ void cryptonight_core_gpu_phase3( int threads, int bfactor, int parti __syncthreads( ); #if( __CUDA_ARCH__ < 300 ) - extern __shared__ uint32_t shuffleMem[]; - volatile uint32_t* sPtr = (volatile uint32_t*)(shuffleMem + (threadIdx.x& 0xFFFFFFFC)); + extern __shared__ uint32_t shuffleMem[]; + volatile uint32_t* sPtr = (volatile uint32_t*)(shuffleMem + (threadIdx.x& 0xFFFFFFF8)); #else - volatile uint32_t* sPtr = NULL; + volatile uint32_t* sPtr = NULL; #endif for ( int i = start; i < end; i += 32 ) { -#pragma unroll + #pragma unroll for ( int j = 0; j < 4; ++j ) text[j] ^= long_state[((IndexType) thread * MEMORY) + ( sub + i + j)]; @@ -381,25 +381,7 @@ __global__ void cryptonight_core_gpu_phase3( int threads, int bfactor, int parti if(ALGO == cryptonight_heavy) { -#pragma unroll - for ( int j = 0; j < 4; ++j ) - text[j] ^= shuffle<8>(sPtr, subv, text[j], (subv+1)&7); - } - } - - if(ALGO == cryptonight_heavy) - { - __syncthreads( ); - - for ( int i = start; i < end; i += 32 ) - { -#pragma unroll - for ( int j = 0; j < 4; ++j ) - text[j] ^= long_state[((IndexType) thread * MEMORY) + ( sub + i + j)]; - - cn_aes_pseudo_round_mut( sharedMemory, text, key ); - -#pragma unroll + #pragma unroll for ( int j = 0; j < 4; ++j ) text[j] ^= shuffle<8>(sPtr, subv, text[j], (subv+1)&7); } @@ -466,9 +448,21 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce) if ( partcount > 1 && ctx->device_bsleep > 0) compat_usleep( ctx->device_bsleep ); } - for ( int i = 0; i < partcountOneThree; i++ ) + int roundsPhase3 = partcountOneThree; + + if(ALGO == cryptonight_heavy) + { + // cryptonight_heavy used two full rounds over the scratchpad memory + roundsPhase3 *= 2; + } + + for ( int i = 0; i < roundsPhase3; i++ ) { - CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_core_gpu_phase3<ITERATIONS,MEMORY, ALGO><<< grid, block8 >>>( ctx->device_blocks*ctx->device_threads, + CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_core_gpu_phase3<ITERATIONS,MEMORY, ALGO><<< + grid, + block8, + block8.x * sizeof(uint32_t) * static_cast< int >( ctx->device_arch[0] < 3 ) + >>>( ctx->device_blocks*ctx->device_threads, bfactorOneThree, i, ctx->d_long_state, ctx->d_ctx_state, ctx->d_ctx_key2 )); diff --git a/xmrstak/version.cpp b/xmrstak/version.cpp index 04948d1..579c27a 100644 --- a/xmrstak/version.cpp +++ b/xmrstak/version.cpp @@ -18,7 +18,7 @@ #endif #define XMR_STAK_NAME "xmr-stak" -#define XMR_STAK_VERSION "2.4.1" +#define XMR_STAK_VERSION "2.4.2" #if defined(_WIN32) #define OS_TYPE "win" |