author    fireice-uk <fireice-uk@users.noreply.github.com>  2018-03-25 22:40:01 +0100
committer GitHub <noreply@github.com>  2018-03-25 22:40:01 +0100
commit    a036cd81592e3b3de804ba88bb8f94729ab60b7d (patch)
tree      f835fc9823d80e43bdbb65023b2aed5718ee1627 /xmrstak/backend/nvidia/nvcc_code
parent    2ae7260b90fe3dbe835ba2489519510f0e57d770 (diff)
parent    09a5dcce2c51d87d77244970d2c09bea3207da7a (diff)
download  xmr-stak-a036cd81592e3b3de804ba88bb8f94729ab60b7d.zip
          xmr-stak-a036cd81592e3b3de804ba88bb8f94729ab60b7d.tar.gz
Merge pull request #1208 from fireice-uk/dev2.3.0
release 2.3.0
Diffstat (limited to 'xmrstak/backend/nvidia/nvcc_code')
-rw-r--r--  xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp  |  10
-rw-r--r--  xmrstak/backend/nvidia/nvcc_code/cuda_core.cu     | 243
-rw-r--r--  xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu    | 171
3 files changed, 331 insertions, 93 deletions
diff --git a/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp b/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp
index afbdbaf..29a3523 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp
+++ b/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp
@@ -3,6 +3,9 @@
#include <stdint.h>
#include <string>
+#include "xmrstak/jconf.hpp"
+#include "xmrstak/backend/cryptonight.hpp"
+
typedef struct {
int device_id;
const char *device_name;
@@ -20,6 +23,7 @@ typedef struct {
uint32_t *d_result_nonce;
uint32_t *d_long_state;
uint32_t *d_ctx_state;
+ uint32_t *d_ctx_state2;
uint32_t *d_ctx_a;
uint32_t *d_ctx_b;
uint32_t *d_ctx_key1;
@@ -41,8 +45,8 @@ int cuda_get_devicecount( int* deviceCount);
int cuda_get_deviceinfo(nvid_ctx *ctx);
int cryptonight_extra_cpu_init(nvid_ctx *ctx);
void cryptonight_extra_cpu_set_data( nvid_ctx* ctx, const void *data, uint32_t len);
-void cryptonight_extra_cpu_prepare(nvid_ctx* ctx, uint32_t startNonce);
-void cryptonight_extra_cpu_final(nvid_ctx* ctx, uint32_t startNonce, uint64_t target, uint32_t* rescount, uint32_t *resnonce);
+void cryptonight_extra_cpu_prepare(nvid_ctx* ctx, uint32_t startNonce, xmrstak_algo miner_algo, uint8_t version);
+void cryptonight_extra_cpu_final(nvid_ctx* ctx, uint32_t startNonce, uint64_t target, uint32_t* rescount, uint32_t *resnonce,xmrstak_algo miner_algo, uint8_t version);
}
-void cryptonight_core_cpu_hash(nvid_ctx* ctx, bool mineMonero);
+void cryptonight_core_cpu_hash(nvid_ctx* ctx, xmrstak_algo miner_algo, uint32_t startNonce, uint8_t version);
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
index 15a6f36..ede578f 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
@@ -6,6 +6,8 @@
#include <cuda.h>
#include <cuda_runtime.h>
+#include "xmrstak/jconf.hpp"
+
#ifdef _WIN32
#include <windows.h>
extern "C" void compat_usleep(uint64_t waitTime)
@@ -74,28 +76,50 @@ __device__ __forceinline__ uint64_t cuda_mul128( uint64_t multiplier, uint64_t m
template< typename T >
__device__ __forceinline__ T loadGlobal64( T * const addr )
{
+#if (__CUDA_ARCH__ < 700)
T x;
asm volatile( "ld.global.cg.u64 %0, [%1];" : "=l"( x ) : "l"( addr ) );
return x;
+#else
+ return *addr;
+#endif
}
template< typename T >
__device__ __forceinline__ T loadGlobal32( T * const addr )
{
+#if (__CUDA_ARCH__ < 700)
T x;
asm volatile( "ld.global.cg.u32 %0, [%1];" : "=r"( x ) : "l"( addr ) );
return x;
+#else
+ return *addr;
+#endif
}
template< typename T >
__device__ __forceinline__ void storeGlobal32( T* addr, T const & val )
{
+#if (__CUDA_ARCH__ < 700)
asm volatile( "st.global.cg.u32 [%0], %1;" : : "l"( addr ), "r"( val ) );
+#else
+ *addr = val;
+#endif
+}
+
+template< typename T >
+__device__ __forceinline__ void storeGlobal64( T* addr, T const & val )
+{
+#if (__CUDA_ARCH__ < 700)
+ asm volatile( "st.global.cg.u64 [%0], %1;" : : "l"( addr ), "l"( val ) );
+#else
+ *addr = val;
+#endif
}
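
The guards above keep the inline-PTX `.cg` paths (cache-global: cached in L2 only, bypassing L1) for pre-Volta GPUs and fall back to plain dereferences from sm_70 on, where the ordinary load/store path performs at least as well. A minimal, self-contained sketch of the same pattern (ours, not part of the patch; compile with e.g. nvcc -arch=sm_60 demo.cu):

#include <cstdint>
#include <cstdio>
#include <cuda_runtime.h>

// Pre-Volta: bypass L1 via ld.global.cg; Volta and newer: a plain load.
__device__ __forceinline__ uint64_t load_cg( const uint64_t* addr )
{
#if (__CUDA_ARCH__ < 700)
    uint64_t x;
    asm volatile( "ld.global.cg.u64 %0, [%1];" : "=l"( x ) : "l"( addr ) );
    return x;
#else
    return *addr;
#endif
}

__global__ void copy64( const uint64_t* in, uint64_t* out, int n )
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if( i < n )
        out[i] = load_cg( in + i );
}

int main()
{
    const int n = 4;
    uint64_t h[n] = {1, 2, 3, 4}, r[n] = {0};
    uint64_t *d_in, *d_out;
    cudaMalloc( &d_in, sizeof(h) );
    cudaMalloc( &d_out, sizeof(h) );
    cudaMemcpy( d_in, h, sizeof(h), cudaMemcpyHostToDevice );
    copy64<<<1, n>>>( d_in, d_out, n );
    cudaMemcpy( r, d_out, sizeof(h), cudaMemcpyDeviceToHost );
    printf( "%llu %llu %llu %llu\n", (unsigned long long)r[0], (unsigned long long)r[1],
            (unsigned long long)r[2], (unsigned long long)r[3] );
    return 0;
}
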
-template<size_t ITERATIONS, uint32_t THREAD_SHIFT>
-__global__ void cryptonight_core_gpu_phase1( int threads, int bfactor, int partidx, uint32_t * __restrict__ long_state, uint32_t * __restrict__ ctx_state, uint32_t * __restrict__ ctx_key1 )
+template<size_t ITERATIONS, uint32_t MEMORY>
+__global__ void cryptonight_core_gpu_phase1( int threads, int bfactor, int partidx, uint32_t * __restrict__ long_state, uint32_t * __restrict__ ctx_state2, uint32_t * __restrict__ ctx_key1 )
{
__shared__ uint32_t sharedMemory[1024];
@@ -105,7 +129,7 @@ __global__ void cryptonight_core_gpu_phase1( int threads, int bfactor, int parti
const int thread = ( blockDim.x * blockIdx.x + threadIdx.x ) >> 3;
const int sub = ( threadIdx.x & 7 ) << 2;
- const int batchsize = ITERATIONS >> bfactor;
+ const int batchsize = MEMORY >> bfactor;
const int start = partidx * batchsize;
const int end = start + batchsize;
@@ -119,18 +143,18 @@ __global__ void cryptonight_core_gpu_phase1( int threads, int bfactor, int parti
if( partidx == 0 )
{
// first round
- MEMCPY8( text, ctx_state + thread * 50 + sub + 16, 2 );
+ MEMCPY8( text, ctx_state2 + thread * 50 + sub + 16, 2 );
}
else
{
// load previous text data
- MEMCPY8( text, &long_state[( (uint64_t) thread << THREAD_SHIFT ) + sub + start - 32], 2 );
+ MEMCPY8( text, &long_state[( (uint64_t) thread * MEMORY ) + sub + start - 32], 2 );
}
__syncthreads( );
for ( int i = start; i < end; i += 32 )
{
cn_aes_pseudo_round_mut( sharedMemory, text, key );
- MEMCPY8(&long_state[((uint64_t) thread << THREAD_SHIFT) + (sub + i)], text, 2);
+ MEMCPY8(&long_state[((uint64_t) thread * MEMORY) + (sub + i)], text, 2);
}
}
@@ -145,33 +169,37 @@ __forceinline__ __device__ void unusedVar( const T& )
* - this method can be used with all compute architectures
* - for <sm_30 shared memory is needed
*
+ * group_n - must be a power of 2!
+ *
* @param ptr pointer to shared memory, size must be `threadIdx.x * sizeof(uint32_t)`
* value can be NULL for compute architecture >=sm_30
- * @param sub thread number within the group, range [0;4)
+ * @param sub thread number within the group, range [0;group_n)
* @param value value to share with other threads within the group
- * @param src thread number within the group from where the data is read, range [0;4)
+ * @param src thread number within the group from where the data is read, range [0;group_n)
*/
+template<size_t group_n>
__forceinline__ __device__ uint32_t shuffle(volatile uint32_t* ptr,const uint32_t sub,const int val,const uint32_t src)
{
#if( __CUDA_ARCH__ < 300 )
ptr[sub] = val;
- return ptr[src&3];
+ return ptr[src & (group_n-1)];
#else
unusedVar( ptr );
unusedVar( sub );
# if(__CUDACC_VER_MAJOR__ >= 9)
- return __shfl_sync(0xFFFFFFFF, val, src, 4 );
+ return __shfl_sync(0xFFFFFFFF, val, src, group_n );
# else
- return __shfl( val, src, 4 );
+ return __shfl( val, src, group_n );
# endif
#endif
}
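
The templated shuffle now exchanges values inside a group of group_n threads instead of the hard-coded 4. A standalone sketch of the width-4 case (ours; assumes sm_30+ and CUDA 9 so that __shfl_sync exists). Note that a src index equal to group_n wraps to lane 0 of the group, which is exactly what the sub + 1 rotations below rely on:

#include <cstdio>
#include <cuda_runtime.h>

// Each 4-thread group rotates its values by one lane, mirroring
// shuffle<4>( sPtr, sub, val, sub + 1 ) in the patch.
__global__ void rotate4()
{
    const unsigned sub = threadIdx.x & 3;   // lane inside the 4-thread group
    const int val = (int)threadIdx.x;
    // with width 4, an out-of-range src lane is taken modulo 4 within the group
    const int got = __shfl_sync( 0xFFFFFFFFu, val, sub + 1, 4 );
    printf( "thread %u received %d\n", threadIdx.x, got );
}

int main()
{
    rotate4<<<1, 8>>>();
    cudaDeviceSynchronize();
    return 0;
}
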
-template<size_t ITERATIONS, uint32_t THREAD_SHIFT, uint32_t MASK>
+template<size_t ITERATIONS, uint32_t MEMORY, uint32_t MASK, xmrstak_algo ALGO>
#ifdef XMR_STAK_THREADS
__launch_bounds__( XMR_STAK_THREADS * 4 )
#endif
-__global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b )
+__global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b, uint32_t * d_ctx_state,
+ uint32_t startNonce, uint32_t * __restrict__ d_input )
{
__shared__ uint32_t sharedMemory[1024];
@@ -180,6 +208,7 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti
__syncthreads( );
const int thread = ( blockDim.x * blockIdx.x + threadIdx.x ) >> 2;
+ const uint32_t nonce = startNonce + thread;
const int sub = threadIdx.x & 3;
const int sub2 = sub & 2;
@@ -193,30 +222,48 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti
return;
int i, k;
- uint32_t j;
+ uint32_t j;
const int batchsize = (ITERATIONS * 2) >> ( 2 + bfactor );
const int start = partidx * batchsize;
const int end = start + batchsize;
- uint32_t * long_state = &d_long_state[(IndexType) thread << THREAD_SHIFT];
- uint32_t * ctx_a = d_ctx_a + thread * 4;
- uint32_t * ctx_b = d_ctx_b + thread * 4;
- uint32_t a, d[2];
+ uint32_t * long_state = &d_long_state[(IndexType) thread * MEMORY];
+ uint32_t a, d[2], idx0;
uint32_t t1[2], t2[2], res;
- a = ctx_a[sub];
- d[1] = ctx_b[sub];
+ uint32_t tweak1_2[2];
+ if (ALGO == cryptonight_monero)
+ {
+ uint32_t * state = d_ctx_state + thread * 50;
+ tweak1_2[0] = (d_input[8] >> 24) | (d_input[9] << 8);
+ tweak1_2[0] ^= state[48];
+ tweak1_2[1] = nonce;
+ tweak1_2[1] ^= state[49];
+ }
+
+ a = (d_ctx_a + thread * 4)[sub];
+ idx0 = shuffle<4>(sPtr,sub, a, 0);
+ if(ALGO == cryptonight_heavy)
+ {
+ if(partidx != 0)
+ {
+ // state is stored after all ctx_b states
+ idx0 = *(d_ctx_b + threads * 4 + thread);
+ }
+ }
+ d[1] = (d_ctx_b + thread * 4)[sub];
+
#pragma unroll 2
for ( i = start; i < end; ++i )
{
#pragma unroll 2
for ( int x = 0; x < 2; ++x )
{
- j = ( ( shuffle(sPtr,sub, a, 0) & MASK ) >> 2 ) + sub;
+ j = ( ( idx0 & MASK ) >> 2 ) + sub;
const uint32_t x_0 = loadGlobal32<uint32_t>( long_state + j );
- const uint32_t x_1 = shuffle(sPtr,sub, x_0, sub + 1);
- const uint32_t x_2 = shuffle(sPtr,sub, x_0, sub + 2);
- const uint32_t x_3 = shuffle(sPtr,sub, x_0, sub + 3);
+ const uint32_t x_1 = shuffle<4>(sPtr,sub, x_0, sub + 1);
+ const uint32_t x_2 = shuffle<4>(sPtr,sub, x_0, sub + 2);
+ const uint32_t x_3 = shuffle<4>(sPtr,sub, x_0, sub + 3);
d[x] = a ^
t_fn0( x_0 & 0xff ) ^
t_fn1( (x_1 >> 8) & 0xff ) ^
@@ -225,41 +272,74 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti
//XOR_BLOCKS_DST(c, b, &long_state[j]);
- t1[0] = shuffle(sPtr,sub, d[x], 0);
- //long_state[j] = d[0] ^ d[1];
- storeGlobal32( long_state + j, d[0] ^ d[1] );
-
+ t1[0] = shuffle<4>(sPtr,sub, d[x], 0);
+
+ const uint32_t z = d[0] ^ d[1];
+ if(ALGO == cryptonight_monero)
+ {
+ const uint32_t table = 0x75310U;
+ const uint32_t index = ((z >> 26) & 12) | ((z >> 23) & 2);
+ const uint32_t fork_7 = z ^ ((table >> index) & 0x30U) << 24;
+ storeGlobal32( long_state + j, sub == 2 ? fork_7 : z );
+ }
+ else
+ storeGlobal32( long_state + j, z );
+
//MUL_SUM_XOR_DST(c, a, &long_state[((uint32_t *)c)[0] & MASK]);
j = ( ( *t1 & MASK ) >> 2 ) + sub;
uint32_t yy[2];
*( (uint64_t*) yy ) = loadGlobal64<uint64_t>( ( (uint64_t *) long_state )+( j >> 1 ) );
uint32_t zz[2];
- zz[0] = shuffle(sPtr,sub, yy[0], 0);
- zz[1] = shuffle(sPtr,sub, yy[1], 0);
+ zz[0] = shuffle<4>(sPtr,sub, yy[0], 0);
+ zz[1] = shuffle<4>(sPtr,sub, yy[1], 0);
- t1[1] = shuffle(sPtr,sub, d[x], 1);
+ t1[1] = shuffle<4>(sPtr,sub, d[x], 1);
#pragma unroll
for ( k = 0; k < 2; k++ )
- t2[k] = shuffle(sPtr,sub, a, k + sub2);
+ t2[k] = shuffle<4>(sPtr,sub, a, k + sub2);
*( (uint64_t *) t2 ) += sub2 ? ( *( (uint64_t *) t1 ) * *( (uint64_t*) zz ) ) : __umul64hi( *( (uint64_t *) t1 ), *( (uint64_t*) zz ) );
res = *( (uint64_t *) t2 ) >> ( sub & 1 ? 32 : 0 );
- storeGlobal32( long_state + j, res );
+
+ if(ALGO == cryptonight_monero)
+ {
+ const uint32_t tweaked_res = tweak1_2[sub & 1] ^ res;
+ const uint32_t long_state_update = sub2 ? tweaked_res : res;
+ storeGlobal32( long_state + j, long_state_update );
+ }
+ else
+ storeGlobal32( long_state + j, res );
+
a = ( sub & 1 ? yy[1] : yy[0] ) ^ res;
+ idx0 = shuffle<4>(sPtr,sub, a, 0);
+ if(ALGO == cryptonight_heavy)
+ {
+ int64_t n = loadGlobal64<uint64_t>( ( (uint64_t *) long_state ) + (( idx0 & MASK ) >> 3));
+ int32_t d = loadGlobal32<uint32_t>( (uint32_t*)(( (uint64_t *) long_state ) + (( idx0 & MASK) >> 3) + 1u ));
+ int64_t q = n / (d | 0x5);
+
+ if(sub&1)
+ storeGlobal64<uint64_t>( ( (uint64_t *) long_state ) + (( idx0 & MASK ) >> 3), n ^ q );
+
+ idx0 = d ^ q;
+ }
}
}
if ( bfactor > 0 )
{
- ctx_a[sub] = a;
- ctx_b[sub] = d[1];
+ (d_ctx_a + thread * 4)[sub] = a;
+ (d_ctx_b + thread * 4)[sub] = d[1];
+ if(ALGO == cryptonight_heavy)
+ if(sub&1)
+ *(d_ctx_b + threads * 4 + thread) = idx0;
}
}
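
The 0x75310 constant above implements the monero v7 tweak: bits 29..28 and bit 24 of the word being stored select a nibble of the table, and the two selected bits are XOR-ed back into bits 29..28. It is applied only by the thread with sub == 2, i.e. on the 32-bit word containing byte 11 of the 16-byte line, the byte the CPU variant-1 code tweaks. A host-side restatement for checking against a reference implementation (function name and harness are ours, the arithmetic is the patch's):

#include <cstdint>
#include <cstdio>

// Host copy of the fork_7 transform from cryptonight_core_gpu_phase2.
static uint32_t fork7( uint32_t z )
{
    const uint32_t table = 0x75310U;
    const uint32_t index = ((z >> 26) & 12) | ((z >> 23) & 2); // bits 29..28 and bit 24
    return z ^ ((table >> index) & 0x30U) << 24;               // flips bits 29..28
}

int main()
{
    const uint32_t samples[] = { 0x00000000u, 0x01000000u, 0x30000000u, 0xffffffffu };
    for( uint32_t z : samples )
        printf( "%08x -> %08x\n", z, fork7( z ) );
    return 0;
}
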
-template<size_t ITERATIONS, uint32_t THREAD_SHIFT>
+template<size_t ITERATIONS, uint32_t MEMORY, xmrstak_algo ALGO>
__global__ void cryptonight_core_gpu_phase3( int threads, int bfactor, int partidx, const uint32_t * __restrict__ long_state, uint32_t * __restrict__ d_ctx_state, uint32_t * __restrict__ d_ctx_key2 )
{
__shared__ uint32_t sharedMemory[1024];
@@ -268,9 +348,10 @@ __global__ void cryptonight_core_gpu_phase3( int threads, int bfactor, int parti
__syncthreads( );
int thread = ( blockDim.x * blockIdx.x + threadIdx.x ) >> 3;
- int sub = ( threadIdx.x & 7 ) << 2;
+ int subv = ( threadIdx.x & 7 );
+ int sub = subv << 2;
- const int batchsize = ITERATIONS >> bfactor;
+ const int batchsize = MEMORY >> bfactor;
const int start = partidx * batchsize;
const int end = start + batchsize;
@@ -282,20 +363,53 @@ __global__ void cryptonight_core_gpu_phase3( int threads, int bfactor, int parti
MEMCPY8( text, d_ctx_state + thread * 50 + sub + 16, 2 );
__syncthreads( );
+
+#if( __CUDA_ARCH__ < 300 )
+ extern __shared__ uint32_t shuffleMem[];
+ volatile uint32_t* sPtr = (volatile uint32_t*)(shuffleMem + (threadIdx.x& 0xFFFFFFFC));
+#else
+ volatile uint32_t* sPtr = NULL;
+#endif
+
for ( int i = start; i < end; i += 32 )
{
#pragma unroll
for ( int j = 0; j < 4; ++j )
- text[j] ^= long_state[((IndexType) thread << THREAD_SHIFT) + (sub + i + j)];
+ text[j] ^= long_state[((IndexType) thread * MEMORY) + ( sub + i + j)];
cn_aes_pseudo_round_mut( sharedMemory, text, key );
+
+ if(ALGO == cryptonight_heavy)
+ {
+#pragma unroll
+ for ( int j = 0; j < 4; ++j )
+ text[j] ^= shuffle<8>(sPtr, subv, text[j], (subv+1)&7);
+ }
+ }
+
+ if(ALGO == cryptonight_heavy)
+ {
+ __syncthreads( );
+
+ for ( int i = start; i < end; i += 32 )
+ {
+#pragma unroll
+ for ( int j = 0; j < 4; ++j )
+ text[j] ^= long_state[((IndexType) thread * MEMORY) + ( sub + i + j)];
+
+ cn_aes_pseudo_round_mut( sharedMemory, text, key );
+
+#pragma unroll
+ for ( int j = 0; j < 4; ++j )
+ text[j] ^= shuffle<8>(sPtr, subv, text[j], (subv+1)&7);
+ }
}
MEMCPY8( d_ctx_state + thread * 50 + sub + 16, text, 2 );
}
-template<size_t ITERATIONS, uint32_t MASK, uint32_t THREAD_SHIFT>
-void cryptonight_core_gpu_hash(nvid_ctx* ctx)
+template<size_t ITERATIONS, uint32_t MASK, uint32_t MEMORY, xmrstak_algo ALGO>
+void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce)
{
dim3 grid( ctx->device_blocks );
dim3 block( ctx->device_threads );
@@ -317,9 +431,11 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx)
for ( int i = 0; i < partcountOneThree; i++ )
{
- CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_core_gpu_phase1<ITERATIONS,THREAD_SHIFT><<< grid, block8 >>>( ctx->device_blocks*ctx->device_threads,
+ CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_core_gpu_phase1<ITERATIONS,MEMORY><<< grid, block8 >>>( ctx->device_blocks*ctx->device_threads,
bfactorOneThree, i,
- ctx->d_long_state, ctx->d_ctx_state, ctx->d_ctx_key1 ));
+ ctx->d_long_state,
+ (ALGO == cryptonight_heavy ? ctx->d_ctx_state2 : ctx->d_ctx_state),
+ ctx->d_ctx_key1 ));
if ( partcount > 1 && ctx->device_bsleep > 0) compat_usleep( ctx->device_bsleep );
}
@@ -330,7 +446,7 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx)
CUDA_CHECK_MSG_KERNEL(
ctx->device_id,
"\n**suggestion: Try to increase the value of the attribute 'bfactor' or \nreduce 'threads' in the NVIDIA config file.**",
- cryptonight_core_gpu_phase2<ITERATIONS,THREAD_SHIFT,MASK><<<
+ cryptonight_core_gpu_phase2<ITERATIONS,MEMORY,MASK,ALGO><<<
grid,
block4,
block4.x * sizeof(uint32_t) * static_cast< int >( ctx->device_arch[0] < 3 )
@@ -340,7 +456,10 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx)
i,
ctx->d_long_state,
ctx->d_ctx_a,
- ctx->d_ctx_b
+ ctx->d_ctx_b,
+ ctx->d_ctx_state,
+ nonce,
+ ctx->d_input
)
);
@@ -349,25 +468,39 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx)
for ( int i = 0; i < partcountOneThree; i++ )
{
- CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_core_gpu_phase3<ITERATIONS,THREAD_SHIFT><<< grid, block8 >>>( ctx->device_blocks*ctx->device_threads,
+ CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_core_gpu_phase3<ITERATIONS,MEMORY, ALGO><<< grid, block8 >>>( ctx->device_blocks*ctx->device_threads,
bfactorOneThree, i,
ctx->d_long_state,
ctx->d_ctx_state, ctx->d_ctx_key2 ));
}
}
-void cryptonight_core_cpu_hash(nvid_ctx* ctx, bool mineMonero)
+void cryptonight_core_cpu_hash(nvid_ctx* ctx, xmrstak_algo miner_algo, uint32_t startNonce, uint8_t version)
{
-#ifndef CONF_NO_MONERO
- if(mineMonero)
+
+ if(miner_algo == cryptonight_monero)
{
- cryptonight_core_gpu_hash<MONERO_ITER, MONERO_MASK, 19u>(ctx);
+ if(version >= 7)
+ cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_monero>(ctx, startNonce);
+ else
+ cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight>(ctx, startNonce);
}
-#endif
-#ifndef CONF_NO_AEON
- if(!mineMonero)
+ else if(miner_algo == cryptonight_heavy)
+ {
+ if(version >= 3)
+ cryptonight_core_gpu_hash<CRYPTONIGHT_HEAVY_ITER, CRYPTONIGHT_HEAVY_MASK, CRYPTONIGHT_HEAVY_MEMORY/4, cryptonight_heavy>(ctx, startNonce);
+ else
+ {
+ cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight>(ctx, startNonce);
+ }
+ }
+ else if(miner_algo == cryptonight)
{
- cryptonight_core_gpu_hash<AEON_ITER, AEON_MASK, 18u>(ctx);
+ cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight>(ctx, startNonce);
}
-#endif
+ else if(miner_algo == cryptonight_lite)
+ {
+ cryptonight_core_gpu_hash<CRYPTONIGHT_LITE_ITER, CRYPTONIGHT_LITE_MASK, CRYPTONIGHT_LITE_MEMORY/4, cryptonight_lite>(ctx, startNonce);
+ }
+
}
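
The dispatcher above replaces the old CONF_NO_MONERO/CONF_NO_AEON compile-time switches: each (iterations, mask, memory, algorithm) tuple becomes its own template instantiation, so every ALGO comparison inside the kernels is folded away by the compiler. A toy restatement of the pattern (enum and names are ours):

#include <cstdio>

enum sketch_algo { sk_classic, sk_monero };  // stand-ins for xmrstak_algo

template< sketch_algo ALGO >
void round_kernel( unsigned nonce )
{
    // ALGO is a template constant: the dead branch is removed per instantiation
    if( ALGO == sk_monero )
        printf( "v7 tweaked round, nonce %u\n", nonce );
    else
        printf( "classic round, nonce %u\n", nonce );
}

void dispatch( sketch_algo algo, int version, unsigned nonce )
{
    if( algo == sk_monero && version >= 7 )
        round_kernel<sk_monero>( nonce );
    else
        round_kernel<sk_classic>( nonce );  // pre-fork blocks use the classic path
}

int main()
{
    dispatch( sk_monero, 7, 0 );
    dispatch( sk_monero, 6, 1 );
    return 0;
}
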
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
index d865e13..c2a1f87 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
@@ -28,6 +28,7 @@ uint64_t keccakf_rndc[24] ={
typedef unsigned char BitSequence;
typedef unsigned long long DataLength;
+#include "xmrstak/backend/cryptonight.hpp"
#include "cryptonight.hpp"
#include "cuda_extra.hpp"
#include "cuda_keccak.hpp"
@@ -36,6 +37,7 @@ typedef unsigned long long DataLength;
#include "cuda_jh.hpp"
#include "cuda_skein.hpp"
#include "cuda_device.hpp"
+#include "cuda_aes.hpp"
__constant__ uint8_t d_sub_byte[16][16] ={
{0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76 },
@@ -90,10 +92,33 @@ __device__ __forceinline__ void cryptonight_aes_set_key( uint32_t * __restrict__
}
}
-__global__ void cryptonight_extra_gpu_prepare( int threads, uint32_t * __restrict__ d_input, uint32_t len, uint32_t startNonce, uint32_t * __restrict__ d_ctx_state, uint32_t * __restrict__ d_ctx_a, uint32_t * __restrict__ d_ctx_b, uint32_t * __restrict__ d_ctx_key1, uint32_t * __restrict__ d_ctx_key2 )
+__device__ __forceinline__ void mix_and_propagate( uint32_t* state )
+{
+ uint32_t tmp0[4];
+ for(size_t x = 0; x < 4; ++x)
+ tmp0[x] = (state)[x];
+
+ // set destination [0,6]
+ for(size_t t = 0; t < 7; ++t)
+ for(size_t x = 0; x < 4; ++x)
+ (state + 4 * t)[x] = (state + 4 * t)[x] ^ (state + 4 * (t + 1))[x];
+
+ // set destination 7
+ for(size_t x = 0; x < 4; ++x)
+ (state + 4 * 7)[x] = (state + 4 * 7)[x] ^ tmp0[x];
+}
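
mix_and_propagate treats the 32 uint32_t values as eight 16-byte blocks and XORs each block with its right neighbour, wrapping block 7 back onto the saved copy of block 0. A host-side restatement usable as a test harness (ours, not xmr-stak code):

#include <cstdint>
#include <cstdio>
#include <cstring>

// state holds 8 blocks of 4 uint32_t; block t becomes t XOR (t+1),
// block 7 uses the pre-loop copy of block 0
static void mix_and_propagate_host( uint32_t* state )
{
    uint32_t tmp0[4];
    memcpy( tmp0, state, sizeof(tmp0) );
    for( size_t t = 0; t < 7; ++t )
        for( size_t x = 0; x < 4; ++x )
            state[4 * t + x] ^= state[4 * (t + 1) + x];
    for( size_t x = 0; x < 4; ++x )
        state[4 * 7 + x] ^= tmp0[x];
}

int main()
{
    uint32_t state[32];
    for( uint32_t i = 0; i < 32; ++i )
        state[i] = i;
    mix_and_propagate_host( state );
    for( int t = 0; t < 8; ++t )
        printf( "block %d starts with %08x\n", t, state[4 * t] );
    return 0;
}
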
+
+template<xmrstak_algo ALGO>
+__global__ void cryptonight_extra_gpu_prepare( int threads, uint32_t * __restrict__ d_input, uint32_t len, uint32_t startNonce, uint32_t * __restrict__ d_ctx_state, uint32_t * __restrict__ d_ctx_state2, uint32_t * __restrict__ d_ctx_a, uint32_t * __restrict__ d_ctx_b, uint32_t * __restrict__ d_ctx_key1, uint32_t * __restrict__ d_ctx_key2 )
{
int thread = ( blockDim.x * blockIdx.x + threadIdx.x );
+ __shared__ uint32_t sharedMemory[1024];
+ if(ALGO == cryptonight_heavy)
+ {
+ cn_aes_gpu_init( sharedMemory );
+ __syncthreads( );
+ }
if ( thread >= threads )
return;
@@ -113,20 +138,45 @@ __global__ void cryptonight_extra_gpu_prepare( int threads, uint32_t * __restric
cn_keccak( (uint8_t *) input, len, (uint8_t *) ctx_state );
cryptonight_aes_set_key( ctx_key1, ctx_state );
cryptonight_aes_set_key( ctx_key2, ctx_state + 8 );
+
XOR_BLOCKS_DST( ctx_state, ctx_state + 8, ctx_a );
XOR_BLOCKS_DST( ctx_state + 4, ctx_state + 12, ctx_b );
-
- memcpy( d_ctx_state + thread * 50, ctx_state, 50 * 4 );
memcpy( d_ctx_a + thread * 4, ctx_a, 4 * 4 );
memcpy( d_ctx_b + thread * 4, ctx_b, 4 * 4 );
+
memcpy( d_ctx_key1 + thread * 40, ctx_key1, 40 * 4 );
memcpy( d_ctx_key2 + thread * 40, ctx_key2, 40 * 4 );
+ memcpy( d_ctx_state + thread * 50, ctx_state, 50 * 4 );
+
+ if(ALGO == cryptonight_heavy)
+ {
+
+ for(int i=0; i < 16; i++)
+ {
+ for(size_t t = 4; t < 12; ++t)
+ {
+ cn_aes_pseudo_round_mut( sharedMemory, ctx_state + 4u * t, ctx_key1 );
+ }
+ // skip the first 4 * 128bit blocks = 4 * 4 uint32_t values
+ mix_and_propagate(ctx_state + 4 * 4);
+ }
+ // double buffer to move the manipulated state into phase1
+ memcpy( d_ctx_state2 + thread * 50, ctx_state, 50 * 4 );
+ }
}
-__global__ void cryptonight_extra_gpu_final( int threads, uint64_t target, uint32_t* __restrict__ d_res_count, uint32_t * __restrict__ d_res_nonce, uint32_t * __restrict__ d_ctx_state )
+template<xmrstak_algo ALGO>
+__global__ void cryptonight_extra_gpu_final( int threads, uint64_t target, uint32_t* __restrict__ d_res_count, uint32_t * __restrict__ d_res_nonce, uint32_t * __restrict__ d_ctx_state,uint32_t * __restrict__ d_ctx_key2 )
{
const int thread = blockDim.x * blockIdx.x + threadIdx.x;
+ __shared__ uint32_t sharedMemory[1024];
+
+ if(ALGO == cryptonight_heavy)
+ {
+ cn_aes_gpu_init( sharedMemory );
+ __syncthreads( );
+ }
if ( thread >= threads )
return;
@@ -134,11 +184,28 @@ __global__ void cryptonight_extra_gpu_final( int threads, uint64_t target, uint3
uint32_t * __restrict__ ctx_state = d_ctx_state + thread * 50;
uint64_t hash[4];
uint32_t state[50];
-
-#pragma unroll
+
+ #pragma unroll
for ( i = 0; i < 50; i++ )
state[i] = ctx_state[i];
+ if(ALGO == cryptonight_heavy)
+ {
+ uint32_t key[40];
+
+ // load keys
+ MEMCPY8( key, d_ctx_key2 + thread * 40, 20 );
+
+ for(int i=0; i < 16; i++)
+ {
+ for(size_t t = 4; t < 12; ++t)
+ {
+ cn_aes_pseudo_round_mut( sharedMemory, state + 4u * t, key );
+ }
+ // skip the first 4 * 128bit blocks = 4 * 4 uint32_t values
+ mix_and_propagate(state + 4 * 4);
+ }
+ }
cn_keccakf2( (uint64_t *) state );
switch ( ( (uint8_t *) state )[0] & 0x03 )
@@ -204,25 +271,35 @@ extern "C" int cryptonight_extra_cpu_init(nvid_ctx* ctx)
break;
};
- CUDA_CHECK(ctx->device_id, cudaDeviceSetCacheConfig(cudaFuncCachePreferL1));
+ const int gpuArch = ctx->device_arch[0] * 10 + ctx->device_arch[1];
- size_t hashMemSize;
- if(::jconf::inst()->IsCurrencyMonero())
- {
- hashMemSize = MONERO_MEMORY;
- }
- else
- {
- hashMemSize = AEON_MEMORY;
- }
+ /* Disable L1 cache for GPUs before Volta.
+ * L1 speed is increased and latency reduced with Volta.
+ */
+ if(gpuArch < 70)
+ CUDA_CHECK(ctx->device_id, cudaDeviceSetCacheConfig(cudaFuncCachePreferL1));
+
+ size_t hashMemSize = cn_select_memory(::jconf::inst()->GetMiningAlgo());
size_t wsize = ctx->device_blocks * ctx->device_threads;
CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_ctx_state, 50 * sizeof(uint32_t) * wsize));
+ size_t ctx_b_size = 4 * sizeof(uint32_t) * wsize;
+ if(cryptonight_heavy == ::jconf::inst()->GetMiningAlgo())
+ {
+ // extend ctx_b to hold the state of idx0
+ ctx_b_size += sizeof(uint32_t) * wsize;
+ // create a double buffer for the state to pass the mixed state to phase1
+ CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_ctx_state2, 50 * sizeof(uint32_t) * wsize));
+ }
+ else
+ ctx->d_ctx_state2 = ctx->d_ctx_state;
+
CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_ctx_key1, 40 * sizeof(uint32_t) * wsize));
CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_ctx_key2, 40 * sizeof(uint32_t) * wsize));
CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_ctx_text, 32 * sizeof(uint32_t) * wsize));
CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_ctx_a, 4 * sizeof(uint32_t) * wsize));
- CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_ctx_b, 4 * sizeof(uint32_t) * wsize));
+ CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_ctx_b, ctx_b_size));
+ // POW block format http://monero.wikia.com/wiki/PoW_Block_Header_Format
CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_input, 21 * sizeof (uint32_t ) ));
CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_result_count, sizeof (uint32_t ) ));
CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_result_nonce, 10 * sizeof (uint32_t ) ));
@@ -233,7 +310,7 @@ extern "C" int cryptonight_extra_cpu_init(nvid_ctx* ctx)
return 1;
}
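
Besides the scratchpad, the context allocations above amount to a fixed per-hash footprint; the tally below (ours) also explains the 680-byte metadata constant used in cuda_get_deviceinfo further down:

#include <cstddef>
#include <cstdio>

// per-hash context footprint implied by the cudaMalloc calls above,
// excluding the d_long_state scratchpad (sizes in bytes)
int main()
{
    const size_t state = 50 * 4;      // d_ctx_state
    const size_t keys  = 2 * 40 * 4;  // d_ctx_key1 + d_ctx_key2
    const size_t text  = 32 * 4;      // d_ctx_text
    const size_t ab    = 2 * 4 * 4;   // d_ctx_a + d_ctx_b
    const size_t base  = state + keys + text + ab;  // 680
    const size_t heavy = base + 50 * 4 + 4;         // + d_ctx_state2 + idx0 slot
    printf( "per hash: %zu bytes, cryptonight-heavy: %zu bytes\n", base, heavy );
    return 0;
}
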
-extern "C" void cryptonight_extra_cpu_prepare(nvid_ctx* ctx, uint32_t startNonce)
+extern "C" void cryptonight_extra_cpu_prepare(nvid_ctx* ctx, uint32_t startNonce, xmrstak_algo miner_algo, uint8_t version)
{
int threadsperblock = 128;
uint32_t wsize = ctx->device_blocks * ctx->device_threads;
@@ -241,11 +318,22 @@ extern "C" void cryptonight_extra_cpu_prepare(nvid_ctx* ctx, uint32_t startNonce
dim3 grid( ( wsize + threadsperblock - 1 ) / threadsperblock );
dim3 block( threadsperblock );
- CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<<<grid, block >>>( wsize, ctx->d_input, ctx->inputlen, startNonce,
- ctx->d_ctx_state, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 ));
+ if(miner_algo == cryptonight_heavy && version >= 3)
+ {
+ CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<cryptonight_heavy><<<grid, block >>>( wsize, ctx->d_input, ctx->inputlen, startNonce,
+ ctx->d_ctx_state,ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 ));
+ }
+ else
+ {
+ /* pass d_ctx_state twice: the second copy is used later in phase1,
+ * the first is then used in phase3
+ */
+ CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<invalid_algo><<<grid, block >>>( wsize, ctx->d_input, ctx->inputlen, startNonce,
+ ctx->d_ctx_state, ctx->d_ctx_state, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 ));
+ }
}
-extern "C" void cryptonight_extra_cpu_final(nvid_ctx* ctx, uint32_t startNonce, uint64_t target, uint32_t* rescount, uint32_t *resnonce)
+extern "C" void cryptonight_extra_cpu_final(nvid_ctx* ctx, uint32_t startNonce, uint64_t target, uint32_t* rescount, uint32_t *resnonce,xmrstak_algo miner_algo, uint8_t version)
{
int threadsperblock = 128;
uint32_t wsize = ctx->device_blocks * ctx->device_threads;
@@ -256,11 +344,23 @@ extern "C" void cryptonight_extra_cpu_final(nvid_ctx* ctx, uint32_t startNonce,
CUDA_CHECK(ctx->device_id, cudaMemset( ctx->d_result_nonce, 0xFF, 10 * sizeof (uint32_t ) ));
CUDA_CHECK(ctx->device_id, cudaMemset( ctx->d_result_count, 0, sizeof (uint32_t ) ));
- CUDA_CHECK_MSG_KERNEL(
- ctx->device_id,
- "\n**suggestion: Try to increase the value of the attribute 'bfactor' in the NVIDIA config file.**",
- cryptonight_extra_gpu_final<<<grid, block >>>( wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state )
- );
+ if(miner_algo == cryptonight_heavy && version >= 3)
+ {
+ CUDA_CHECK_MSG_KERNEL(
+ ctx->device_id,
+ "\n**suggestion: Try to increase the value of the attribute 'bfactor' in the NVIDIA config file.**",
+ cryptonight_extra_gpu_final<cryptonight_heavy><<<grid, block >>>( wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state,ctx->d_ctx_key2 )
+ );
+ }
+ else
+ {
+ // fallback for all other algorithms
+ CUDA_CHECK_MSG_KERNEL(
+ ctx->device_id,
+ "\n**suggestion: Try to increase the value of the attribute 'bfactor' in the NVIDIA config file.**",
+ cryptonight_extra_gpu_final<invalid_algo><<<grid, block >>>( wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state,ctx->d_ctx_key2 )
+ );
+ }
CUDA_CHECK(ctx->device_id, cudaMemcpy( rescount, ctx->d_result_count, sizeof (uint32_t ), cudaMemcpyDeviceToHost ));
CUDA_CHECK(ctx->device_id, cudaMemcpy( resnonce, ctx->d_result_nonce, 10 * sizeof (uint32_t ), cudaMemcpyDeviceToHost ));
@@ -441,6 +541,12 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
maxMemUsage = size_t(1024u) * byteToMiB;
}
+ if(props.multiProcessorCount <= 6)
+ {
+ // limit memory usage for low end devices to reduce the number of threads
+ maxMemUsage = size_t(1024u) * byteToMiB;
+ }
+
int* tmp;
cudaError_t err;
// a device must be selected to get the right memory usage later on
@@ -470,15 +576,7 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
ctx->total_device_memory = totalMemory;
ctx->free_device_memory = freeMemory;
- size_t hashMemSize;
- if(::jconf::inst()->IsCurrencyMonero())
- {
- hashMemSize = MONERO_MEMORY;
- }
- else
- {
- hashMemSize = AEON_MEMORY;
- }
+ size_t hashMemSize = cn_select_memory(::jconf::inst()->GetMiningAlgo());
#ifdef WIN32
/* We use in windows bfactor (split slow kernel into smaller parts) to avoid
@@ -508,6 +606,9 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
// up to 16 KiB of extra memory is used per thread by some kernels (lmem/local memory)
// 680 bytes of extra metadata memory are used per hash
size_t perThread = hashMemSize + 16192u + 680u;
+ if(cryptonight_heavy == ::jconf::inst()->GetMiningAlgo())
+ perThread += 50 * 4; // state double buffer
+
size_t max_intensity = limitedMemory / perThread;
ctx->device_threads = max_intensity / ctx->device_blocks;
// use only odd number of threads
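
As a worked instance of the sizing formula (all inputs assumed, not measured): with cryptonight's 2 MiB scratchpad, perThread comes to 2,114,024 bytes, so a 1 GiB budget allows roughly 507 hashes in flight before the odd-thread rounding is applied:

#include <cstddef>
#include <cstdio>

// worked example with assumed inputs: 2 MiB scratchpad, 1 GiB budget, 20 blocks
int main()
{
    const size_t hashMemSize   = 2u * 1024u * 1024u;
    const size_t perThread     = hashMemSize + 16192u + 680u;   // 2,114,024
    const size_t limitedMemory = size_t(1024u) * 1024u * 1024u;
    const size_t max_intensity = limitedMemory / perThread;     // 507
    const size_t device_blocks = 20;                            // assumption
    printf( "max_intensity=%zu, threads per block=%zu\n",
            max_intensity, max_intensity / device_blocks );
    return 0;
}
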