summary refs log tree commit diff stats
diff options
context:
space:
mode:
authorfireice-uk <fireice-uk@users.noreply.github.com>2018-04-04 22:35:50 +0100
committerGitHub <noreply@github.com>2018-04-04 22:35:50 +0100
commite10e8e67492cf3118af8b7d7609937e85e572305 (patch)
tree9d2301933ad71b4c83e35e18a206d3e69d443a4b
parent737383a5a5cac85577024aeba1da16572893cc91 (diff)
parent945524b3805e022a7ecaa41a557bee3828619ecd (diff)
downloadxmr-stak-e10e8e67492cf3118af8b7d7609937e85e572305.zip
xmr-stak-e10e8e67492cf3118af8b7d7609937e85e572305.tar.gz
Merge pull request #1301 from fireice-uk/dev2.4.2
Bug fix release 2.4.2
-rw-r--r--xmrstak/backend/nvidia/nvcc_code/cuda_core.cu46
-rw-r--r--xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu2
-rw-r--r--xmrstak/version.cpp2
3 files changed, 22 insertions, 28 deletions
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
index 7aa44e8..43740d2 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
@@ -352,7 +352,7 @@ __global__ void cryptonight_core_gpu_phase3( int threads, int bfactor, int parti
int sub = subv << 2;
const int batchsize = MEMORY >> bfactor;
- const int start = partidx * batchsize;
+ const int start = (partidx % (1 << bfactor)) * batchsize;
const int end = start + batchsize;
if ( thread >= threads )
@@ -365,15 +365,15 @@ __global__ void cryptonight_core_gpu_phase3( int threads, int bfactor, int parti
__syncthreads( );
#if( __CUDA_ARCH__ < 300 )
- extern __shared__ uint32_t shuffleMem[];
- volatile uint32_t* sPtr = (volatile uint32_t*)(shuffleMem + (threadIdx.x& 0xFFFFFFFC));
+ extern __shared__ uint32_t shuffleMem[];
+ volatile uint32_t* sPtr = (volatile uint32_t*)(shuffleMem + (threadIdx.x& 0xFFFFFFF8));
#else
- volatile uint32_t* sPtr = NULL;
+ volatile uint32_t* sPtr = NULL;
#endif
for ( int i = start; i < end; i += 32 )
{
-#pragma unroll
+ #pragma unroll
for ( int j = 0; j < 4; ++j )
text[j] ^= long_state[((IndexType) thread * MEMORY) + ( sub + i + j)];
@@ -381,25 +381,7 @@ __global__ void cryptonight_core_gpu_phase3( int threads, int bfactor, int parti
if(ALGO == cryptonight_heavy)
{
-#pragma unroll
- for ( int j = 0; j < 4; ++j )
- text[j] ^= shuffle<8>(sPtr, subv, text[j], (subv+1)&7);
- }
- }
-
- if(ALGO == cryptonight_heavy)
- {
- __syncthreads( );
-
- for ( int i = start; i < end; i += 32 )
- {
-#pragma unroll
- for ( int j = 0; j < 4; ++j )
- text[j] ^= long_state[((IndexType) thread * MEMORY) + ( sub + i + j)];
-
- cn_aes_pseudo_round_mut( sharedMemory, text, key );
-
-#pragma unroll
+ #pragma unroll
for ( int j = 0; j < 4; ++j )
text[j] ^= shuffle<8>(sPtr, subv, text[j], (subv+1)&7);
}
@@ -466,9 +448,21 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce)
if ( partcount > 1 && ctx->device_bsleep > 0) compat_usleep( ctx->device_bsleep );
}
- for ( int i = 0; i < partcountOneThree; i++ )
+ int roundsPhase3 = partcountOneThree;
+
+ if(ALGO == cryptonight_heavy)
+ {
+ // cryptonight_heavy uses two full rounds over the scratchpad memory
+ roundsPhase3 *= 2;
+ }
+
+ for ( int i = 0; i < roundsPhase3; i++ )
{
- CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_core_gpu_phase3<ITERATIONS,MEMORY, ALGO><<< grid, block8 >>>( ctx->device_blocks*ctx->device_threads,
+ CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_core_gpu_phase3<ITERATIONS,MEMORY, ALGO><<<
+ grid,
+ block8,
+ block8.x * sizeof(uint32_t) * static_cast< int >( ctx->device_arch[0] < 3 )
+ >>>( ctx->device_blocks*ctx->device_threads,
bfactorOneThree, i,
ctx->d_long_state,
ctx->d_ctx_state, ctx->d_ctx_key2 ));
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
index e2f0b2d..02c157e 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
@@ -483,7 +483,7 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
* with a sm_20 only compiled binary
*/
for(int i = 0; i < arch.size(); ++i)
- if(minSupportedArch == 0 || (arch[i] >= 30 && arch[i] < minSupportedArch))
+ if(arch[i] >= 30 && (minSupportedArch == 0 || arch[i] < minSupportedArch))
minSupportedArch = arch[i];
if(minSupportedArch < 30 || gpuArch < minSupportedArch)
{
diff --git a/xmrstak/version.cpp b/xmrstak/version.cpp
index 04948d1..579c27a 100644
--- a/xmrstak/version.cpp
+++ b/xmrstak/version.cpp
@@ -18,7 +18,7 @@
#endif
#define XMR_STAK_NAME "xmr-stak"
-#define XMR_STAK_VERSION "2.4.1"
+#define XMR_STAK_VERSION "2.4.2"
#if defined(_WIN32)
#define OS_TYPE "win"
OpenPOWER on IntegriCloud