diff options
author | fireice-uk <fireice-uk@users.noreply.github.com> | 2018-04-04 22:35:50 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2018-04-04 22:35:50 +0100 |
commit | e10e8e67492cf3118af8b7d7609937e85e572305 (patch) | |
tree | 9d2301933ad71b4c83e35e18a206d3e69d443a4b | |
parent | 737383a5a5cac85577024aeba1da16572893cc91 (diff) | |
parent | 945524b3805e022a7ecaa41a557bee3828619ecd (diff) | |
download | xmr-stak-e10e8e67492cf3118af8b7d7609937e85e572305.zip xmr-stak-e10e8e67492cf3118af8b7d7609937e85e572305.tar.gz |
Merge pull request #1301 from fireice-uk/dev2.4.2
Bug fix release 2.4.2
-rw-r--r-- | xmrstak/backend/nvidia/nvcc_code/cuda_core.cu | 46 | ||||
-rw-r--r-- | xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu | 2 | ||||
-rw-r--r-- | xmrstak/version.cpp | 2 |
3 files changed, 22 insertions, 28 deletions
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu index 7aa44e8..43740d2 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu @@ -352,7 +352,7 @@ __global__ void cryptonight_core_gpu_phase3( int threads, int bfactor, int parti int sub = subv << 2; const int batchsize = MEMORY >> bfactor; - const int start = partidx * batchsize; + const int start = (partidx % (1 << bfactor)) * batchsize; const int end = start + batchsize; if ( thread >= threads ) @@ -365,15 +365,15 @@ __global__ void cryptonight_core_gpu_phase3( int threads, int bfactor, int parti __syncthreads( ); #if( __CUDA_ARCH__ < 300 ) - extern __shared__ uint32_t shuffleMem[]; - volatile uint32_t* sPtr = (volatile uint32_t*)(shuffleMem + (threadIdx.x& 0xFFFFFFFC)); + extern __shared__ uint32_t shuffleMem[]; + volatile uint32_t* sPtr = (volatile uint32_t*)(shuffleMem + (threadIdx.x& 0xFFFFFFF8)); #else - volatile uint32_t* sPtr = NULL; + volatile uint32_t* sPtr = NULL; #endif for ( int i = start; i < end; i += 32 ) { -#pragma unroll + #pragma unroll for ( int j = 0; j < 4; ++j ) text[j] ^= long_state[((IndexType) thread * MEMORY) + ( sub + i + j)]; @@ -381,25 +381,7 @@ __global__ void cryptonight_core_gpu_phase3( int threads, int bfactor, int parti if(ALGO == cryptonight_heavy) { -#pragma unroll - for ( int j = 0; j < 4; ++j ) - text[j] ^= shuffle<8>(sPtr, subv, text[j], (subv+1)&7); - } - } - - if(ALGO == cryptonight_heavy) - { - __syncthreads( ); - - for ( int i = start; i < end; i += 32 ) - { -#pragma unroll - for ( int j = 0; j < 4; ++j ) - text[j] ^= long_state[((IndexType) thread * MEMORY) + ( sub + i + j)]; - - cn_aes_pseudo_round_mut( sharedMemory, text, key ); - -#pragma unroll + #pragma unroll for ( int j = 0; j < 4; ++j ) text[j] ^= shuffle<8>(sPtr, subv, text[j], (subv+1)&7); } @@ -466,9 +448,21 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce) if ( partcount > 1 && ctx->device_bsleep > 0) compat_usleep( ctx->device_bsleep ); } - for ( int i = 0; i < partcountOneThree; i++ ) + int roundsPhase3 = partcountOneThree; + + if(ALGO == cryptonight_heavy) + { + // cryptonight_heavy used two full rounds over the scratchpad memory + roundsPhase3 *= 2; + } + + for ( int i = 0; i < roundsPhase3; i++ ) { - CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_core_gpu_phase3<ITERATIONS,MEMORY, ALGO><<< grid, block8 >>>( ctx->device_blocks*ctx->device_threads, + CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_core_gpu_phase3<ITERATIONS,MEMORY, ALGO><<< + grid, + block8, + block8.x * sizeof(uint32_t) * static_cast< int >( ctx->device_arch[0] < 3 ) + >>>( ctx->device_blocks*ctx->device_threads, bfactorOneThree, i, ctx->d_long_state, ctx->d_ctx_state, ctx->d_ctx_key2 )); diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu index e2f0b2d..02c157e 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu @@ -483,7 +483,7 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx) * with a sm_20 only compiled binary */ for(int i = 0; i < arch.size(); ++i) - if(minSupportedArch == 0 || (arch[i] >= 30 && arch[i] < minSupportedArch)) + if(arch[i] >= 30 && (minSupportedArch == 0 || arch[i] < minSupportedArch)) minSupportedArch = arch[i]; if(minSupportedArch < 30 || gpuArch < minSupportedArch) { diff --git a/xmrstak/version.cpp b/xmrstak/version.cpp index 04948d1..579c27a 100644 --- a/xmrstak/version.cpp +++ b/xmrstak/version.cpp @@ -18,7 +18,7 @@ #endif #define XMR_STAK_NAME "xmr-stak" -#define XMR_STAK_VERSION "2.4.1" +#define XMR_STAK_VERSION "2.4.2" #if defined(_WIN32) #define OS_TYPE "win" |