-rw-r--r--  xmrstak/backend/nvidia/minethd.cpp                19
-rw-r--r--  xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp   5
-rw-r--r--  xmrstak/backend/nvidia/nvcc_code/cuda_core.cu     32
-rw-r--r--  xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu    25
-rw-r--r--  xmrstak/backend/nvidia/nvcc_code/cuda_extra.hpp    4
5 files changed, 63 insertions, 22 deletions
diff --git a/xmrstak/backend/nvidia/minethd.cpp b/xmrstak/backend/nvidia/minethd.cpp
index cf58a21..237dcaf 100644
--- a/xmrstak/backend/nvidia/minethd.cpp
+++ b/xmrstak/backend/nvidia/minethd.cpp
@@ -32,6 +32,7 @@
#include "xmrstak/jconf.hpp"
#include "xmrstak/misc/environment.hpp"
#include "xmrstak/backend/cpu/hwlocMemory.hpp"
+#include "../cryptonight.hpp"
#include <assert.h>
#include <cmath>
@@ -208,7 +209,7 @@ void minethd::work_main()
uint64_t iCount = 0;
cryptonight_ctx* cpu_ctx;
cpu_ctx = cpu::minethd::minethd_alloc_ctx();
- cn_hash_fun hash_fun = cpu::minethd::func_selector(::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/);
+ cn_hash_fun hash_fun = cpu::minethd::func_selector(::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/, ::jconf::inst()->IsCurrencyXMR());
uint32_t iNonce;
globalStates::inst().iConsumeCnt++;
@@ -218,6 +219,9 @@ void minethd::work_main()
printer::inst()->print_msg(L0, "Setup failed for GPU %d. Exitting.\n", (int)iThreadNo);
std::exit(0);
}
+
+ bool useXMR = ::jconf::inst()->GetCurrency().compare("xmr") == 0;
+ bool useAEON = ::jconf::inst()->GetCurrency().compare("aeon") == 0;
while (bQuit == 0)
{
@@ -256,7 +260,18 @@ void minethd::work_main()
uint32_t foundCount;
cryptonight_extra_cpu_prepare(&ctx, iNonce);
- cryptonight_core_cpu_hash(&ctx);
+#ifndef CONF_NO_XMR
+ if(useXMR)
+ {
+ cryptonight_core_cpu_hash<XMR_ITER, XMR_MASK, 19>(&ctx);
+ }
+#endif
+#ifndef CONF_NO_AEON
+ if(useAEON)
+ {
+ cryptonight_core_cpu_hash<AEON_ITER, AEON_MASK, 18>(&ctx);
+ }
+#endif
cryptonight_extra_cpu_final(&ctx, iNonce, oWork.iTarget, &foundCount, foundNonce);
for(size_t i = 0; i < foundCount; i++)
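
The two template instantiations above bind each currency's scratchpad geometry at compile time. A minimal sketch of the constants this patch assumes (defined in xmrstak/backend/cryptonight.hpp, pulled in by the new "../cryptonight.hpp" include; the values follow the CryptoNight and CryptoNight-Lite specifications):

    // Assumed definitions from xmrstak/backend/cryptonight.hpp:
    #define XMR_MEMORY   (1 << 21)  // 2 MiB scratchpad per hash
    #define XMR_ITER     (1 << 19)  // 0x80000 main-loop iterations
    #define XMR_MASK     0x1FFFF0   // 16-byte-aligned offset within the scratchpad
    #define AEON_MEMORY  (1 << 20)  // 1 MiB scratchpad (CryptoNight-Lite)
    #define AEON_ITER    (1 << 18)  // 0x40000 iterations, half of XMR
    #define AEON_MASK    0xFFFF0

The third argument (19 for XMR, 18 for AEON) is the per-thread shift into d_long_state: each thread owns MEMORY / sizeof(uint32_t) 32-bit words, i.e. 1 << 19 or 1 << 18.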
diff --git a/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp b/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp
index 784c38d..4e0ace7 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp
+++ b/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp
@@ -41,8 +41,9 @@ int cuda_get_deviceinfo(nvid_ctx *ctx);
int cryptonight_extra_cpu_init(nvid_ctx *ctx);
void cryptonight_extra_cpu_set_data( nvid_ctx* ctx, const void *data, uint32_t len);
void cryptonight_extra_cpu_prepare(nvid_ctx* ctx, uint32_t startNonce);
-void cryptonight_core_cpu_hash(nvid_ctx* ctx);
void cryptonight_extra_cpu_final(nvid_ctx* ctx, uint32_t startNonce, uint64_t target, uint32_t* rescount, uint32_t *resnonce);
-
}
+template<size_t ITERATIONS, uint32_t MASK, size_t THREAD_SHIFT>
+void cryptonight_core_cpu_hash(nvid_ctx* ctx);
+
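
The declaration leaves the extern "C" block because templates cannot have C linkage; nvcc emits one specialization per currency from the explicit arguments at the call sites shown above, along the lines of:

    // Implicit instantiation at each call site in minethd.cpp:
    cryptonight_core_cpu_hash<XMR_ITER, XMR_MASK, 19>(&ctx);    // 2 MiB / XMR path
    cryptonight_core_cpu_hash<AEON_ITER, AEON_MASK, 18>(&ctx);  // 1 MiB / AEON path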
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
index e1b78ce..5bea230 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
@@ -92,6 +92,7 @@ __device__ __forceinline__ void storeGlobal32( T* addr, T const & val )
asm volatile( "st.global.cg.u32 [%0], %1;" : : "l"( addr ), "r"( val ) );
}
+template<size_t ITERATIONS, size_t THREAD_SHIFT>
__global__ void cryptonight_core_gpu_phase1( int threads, int bfactor, int partidx, uint32_t * __restrict__ long_state, uint32_t * __restrict__ ctx_state, uint32_t * __restrict__ ctx_key1 )
{
__shared__ uint32_t sharedMemory[1024];
@@ -102,7 +103,7 @@ __global__ void cryptonight_core_gpu_phase1( int threads, int bfactor, int parti
const int thread = ( blockDim.x * blockIdx.x + threadIdx.x ) >> 3;
const int sub = ( threadIdx.x & 7 ) << 2;
- const int batchsize = 0x80000 >> bfactor;
+ const int batchsize = ITERATIONS >> bfactor;
const int start = partidx * batchsize;
const int end = start + batchsize;
@@ -121,13 +122,13 @@ __global__ void cryptonight_core_gpu_phase1( int threads, int bfactor, int parti
else
{
// load previous text data
- MEMCPY8( text, &long_state[( (uint64_t) thread << 19 ) + sub + start - 32], 2 );
+ MEMCPY8( text, &long_state[( (uint64_t) thread << THREAD_SHIFT ) + sub + start - 32], 2 );
}
__syncthreads( );
for ( int i = start; i < end; i += 32 )
{
cn_aes_pseudo_round_mut( sharedMemory, text, key );
- MEMCPY8(&long_state[((uint64_t) thread << 19) + (sub + i)], text, 2);
+ MEMCPY8(&long_state[((uint64_t) thread << THREAD_SHIFT) + (sub + i)], text, 2);
}
}
@@ -167,6 +168,7 @@ __forceinline__ __device__ uint32_t shuffle(volatile uint32_t* ptr,const uint32_
#ifdef XMR_STAK_THREADS
__launch_bounds__( XMR_STAK_THREADS * 4 )
#endif
+template<size_t ITERATIONS, size_t THREAD_SHIFT, uint32_t MASK>
__global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b )
{
__shared__ uint32_t sharedMemory[1024];
@@ -190,10 +192,10 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti
int i, k;
uint32_t j;
- const int batchsize = ITER >> ( 2 + bfactor );
+ const int batchsize = (ITERATIONS * 2) >> ( 2 + bfactor );
const int start = partidx * batchsize;
const int end = start + batchsize;
- uint32_t * long_state = &d_long_state[(IndexType) thread << 19];
+ uint32_t * long_state = &d_long_state[(IndexType) thread << THREAD_SHIFT];
uint32_t * ctx_a = d_ctx_a + thread * 4;
uint32_t * ctx_b = d_ctx_b + thread * 4;
uint32_t a, d[2];
@@ -207,7 +209,7 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti
#pragma unroll 2
for ( int x = 0; x < 2; ++x )
{
- j = ( ( shuffle(sPtr,sub, a, 0) & 0x1FFFF0 ) >> 2 ) + sub;
+ j = ( ( shuffle(sPtr,sub, a, 0) & MASK ) >> 2 ) + sub;
const uint32_t x_0 = loadGlobal32<uint32_t>( long_state + j );
const uint32_t x_1 = shuffle(sPtr,sub, x_0, sub + 1);
@@ -225,8 +227,8 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti
//long_state[j] = d[0] ^ d[1];
storeGlobal32( long_state + j, d[0] ^ d[1] );
- //MUL_SUM_XOR_DST(c, a, &long_state[((uint32_t *)c)[0] & 0x1FFFF0]);
- j = ( ( *t1 & 0x1FFFF0 ) >> 2 ) + sub;
+ //MUL_SUM_XOR_DST(c, a, &long_state[((uint32_t *)c)[0] & MASK]);
+ j = ( ( *t1 & MASK ) >> 2 ) + sub;
uint32_t yy[2];
*( (uint64_t*) yy ) = loadGlobal64<uint64_t>( ( (uint64_t *) long_state )+( j >> 1 ) );
@@ -255,6 +257,7 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti
}
}
+template<size_t ITERATIONS, size_t THREAD_SHIFT>
__global__ void cryptonight_core_gpu_phase3( int threads, int bfactor, int partidx, const uint32_t * __restrict__ long_state, uint32_t * __restrict__ d_ctx_state, uint32_t * __restrict__ d_ctx_key2 )
{
__shared__ uint32_t sharedMemory[1024];
@@ -265,7 +268,7 @@ __global__ void cryptonight_core_gpu_phase3( int threads, int bfactor, int parti
int thread = ( blockDim.x * blockIdx.x + threadIdx.x ) >> 3;
int sub = ( threadIdx.x & 7 ) << 2;
- const int batchsize = 0x80000 >> bfactor;
+ const int batchsize = ITERATIONS >> bfactor;
const int start = partidx * batchsize;
const int end = start + batchsize;
@@ -281,7 +284,7 @@ __global__ void cryptonight_core_gpu_phase3( int threads, int bfactor, int parti
{
#pragma unroll
for ( int j = 0; j < 4; ++j )
- text[j] ^= long_state[((IndexType) thread << 19) + (sub + i + j)];
+ text[j] ^= long_state[((IndexType) thread << THREAD_SHIFT) + (sub + i + j)];
cn_aes_pseudo_round_mut( sharedMemory, text, key );
}
@@ -289,7 +292,8 @@ __global__ void cryptonight_core_gpu_phase3( int threads, int bfactor, int parti
MEMCPY8( d_ctx_state + thread * 50 + sub + 16, text, 2 );
}
-extern "C" void cryptonight_core_cpu_hash(nvid_ctx* ctx)
+template<size_t ITERATIONS, uint32_t MASK, size_t THREAD_SHIFT>
+void cryptonight_core_cpu_hash(nvid_ctx* ctx)
{
dim3 grid( ctx->device_blocks );
dim3 block( ctx->device_threads );
@@ -311,7 +315,7 @@ extern "C" void cryptonight_core_cpu_hash(nvid_ctx* ctx)
for ( int i = 0; i < partcountOneThree; i++ )
{
- CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_core_gpu_phase1<<< grid, block8 >>>( ctx->device_blocks*ctx->device_threads,
+ CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_core_gpu_phase1<ITERATIONS,THREAD_SHIFT><<< grid, block8 >>>( ctx->device_blocks*ctx->device_threads,
bfactorOneThree, i,
ctx->d_long_state, ctx->d_ctx_state, ctx->d_ctx_key1 ));
@@ -321,7 +325,7 @@ extern "C" void cryptonight_core_cpu_hash(nvid_ctx* ctx)
for ( int i = 0; i < partcount; i++ )
{
- CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_core_gpu_phase2<<<
+ CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_core_gpu_phase2<ITERATIONS,THREAD_SHIFT,MASK><<<
grid,
block4,
block4.x * sizeof(uint32_t) * static_cast< int >( ctx->device_arch[0] < 3 )
@@ -339,7 +343,7 @@ extern "C" void cryptonight_core_cpu_hash(nvid_ctx* ctx)
for ( int i = 0; i < partcountOneThree; i++ )
{
- CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_core_gpu_phase3<<< grid, block8 >>>( ctx->device_blocks*ctx->device_threads,
+ CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_core_gpu_phase3<ITERATIONS,THREAD_SHIFT><<< grid, block8 >>>( ctx->device_blocks*ctx->device_threads,
bfactorOneThree, i,
ctx->d_long_state,
ctx->d_ctx_state, ctx->d_ctx_key2 ));
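
The three parameters are not independent: MASK and THREAD_SHIFT both follow from the scratchpad size. A host-side sanity sketch of the invariants, under the constant values assumed earlier (not part of the patch):

    #include <cstdint>

    // A thread's scratchpad is MEMORY bytes = MEMORY / 4 uint32_t words,
    // so thread i starts (i << THREAD_SHIFT) words into d_long_state:
    static_assert((1u << 21) / sizeof(uint32_t) == (1u << 19), "XMR: shift 19");
    static_assert((1u << 20) / sizeof(uint32_t) == (1u << 18), "AEON: shift 18");
    // MASK keeps the byte offset inside one scratchpad, 16-byte aligned:
    static_assert(0x1FFFF0u == (((1u << 21) - 1) & ~0xFu), "XMR mask");
    static_assert(0x0FFFF0u == (((1u << 20) - 1) & ~0xFu), "AEON mask");

Phase 2 uses ITERATIONS * 2 because the removed ITER macro (1 << 20) was twice the per-hash iteration count; in phases 1 and 3 the same ITERATIONS value doubles as the scratchpad length in 32-bit words (0x80000 for XMR, 0x40000 for AEON), which is why plain ITERATIONS >> bfactor replaces the hard-coded 0x80000 >> bfactor.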
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
index 7734473..40dcc7e 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
@@ -5,6 +5,7 @@
#include <cuda_runtime.h>
#include <device_functions.hpp>
#include <algorithm>
+#include "../../../jconf.hpp"
#ifdef __CUDACC__
__constant__
@@ -188,8 +189,18 @@ extern "C" int cryptonight_extra_cpu_init(nvid_ctx* ctx)
cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
+ size_t hashMemSize;
+ if(::jconf::inst()->IsCurrencyXMR())
+ {
+ hashMemSize = XMR_MEMORY;
+ }
+ else
+ {
+ hashMemSize = AEON_MEMORY;
+ }
+
size_t wsize = ctx->device_blocks * ctx->device_threads;
- CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_long_state, (size_t)MEMORY * wsize));
+ CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_long_state, hashMemSize * wsize));
CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_ctx_state, 50 * sizeof(uint32_t) * wsize));
CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_ctx_key1, 40 * sizeof(uint32_t) * wsize));
CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_ctx_key2, 40 * sizeof(uint32_t) * wsize));
@@ -343,13 +354,23 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
ctx->total_device_memory = totalMemory;
ctx->free_device_memory = freeMemory;
+ size_t hashMemSize;
+ if(::jconf::inst()->IsCurrencyXMR())
+ {
+ hashMemSize = XMR_MEMORY;
+ }
+ else
+ {
+ hashMemSize = AEON_MEMORY;
+ }
+
// keep 64MiB memory free (value is randomly chosen)
// 200byte are meta data memory (result nonce, ...)
size_t availableMem = freeMemory - (64u * 1024 * 1024) - 200u;
size_t limitedMemory = std::min(availableMem, maxMemUsage);
// up to 920bytes extra memory is used per thread for some kernel (lmem/local memory)
// 680bytes are extra meta data memory per hash
- size_t perThread = size_t(MEMORY) + 740u + 680u;
+ size_t perThread = hashMemSize + 740u + 680u;
size_t max_intensity = limitedMemory / perThread;
ctx->device_threads = max_intensity / ctx->device_blocks;
// use only odd number of threads
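
Because hashMemSize now differs per currency, the auto-intensity heuristic above yields roughly twice the thread count for AEON. A hypothetical worked example (4 GiB reported free, maxMemUsage clamp omitted):

    #include <cstddef>

    size_t freeMemory   = 4096ull << 20;                    // 4 GiB free on the device
    size_t hashMemSize  = 2ull << 20;                       // XMR_MEMORY, 2 MiB
    size_t availableMem = freeMemory - (64u << 20) - 200u;  // keep 64 MiB + metadata free
    size_t perThread    = hashMemSize + 740u + 680u;        // scratchpad + lmem + meta data
    size_t maxIntensity = availableMem / perThread;         // ~2014 hashes; ~4026 for AEON's 1 MiB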
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.hpp
index 3ccdcd6..c622110 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.hpp
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.hpp
@@ -1,5 +1,7 @@
#pragma once
+#include "../../cryptonight.hpp"
+
#ifdef __INTELLISENSE__
#define __CUDA_ARCH__ 520
/* avoid red underlining */
@@ -18,8 +20,6 @@ struct uint3 blockDim;
#define __shfl(a,b,c) 1
#endif
-#define MEMORY (1 << 21) // 2 MiB / 2097152 B
-#define ITER (1 << 20) // 1048576
#define AES_BLOCK_SIZE 16
#define AES_KEY_SIZE 32
#define INIT_SIZE_BLK 8
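
The fixed MEMORY and ITER macros removed here are superseded by the per-currency XMR_*/AEON_* constants coming in through ../../cryptonight.hpp; leaving them in place would have pinned every kernel to the 2 MiB Monero layout regardless of the selected coin.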