From 9ef2a83be3ddcc36546277f996c9387c38090014 Mon Sep 17 00:00:00 2001
From: Takeshi Suzuki <takeshibaconsuzuki@gmail.com>
Date: Thu, 5 Apr 2018 20:58:14 -0500
Subject: Fix spelling mistake in gpu.cpp

---
 xmrstak/backend/amd/amd_gpu/gpu.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'xmrstak/backend/amd')
diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp
index 006a7ed..13f018e 100644
--- a/xmrstak/backend/amd/amd_gpu/gpu.cpp
+++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp
@@ -378,7 +378,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 	char options[512];
 	snprintf(options, sizeof(options),
 		"-DITERATIONS=%d -DMASK=%d -DWORKSIZE=%llu -DSTRIDED_INDEX=%d -DMEM_CHUNK_EXPONENT=%d  -DCOMP_MODE=%d -DMEMORY=%llu -DALGO=%d",
-		hashIterations, threadMemMask, int_port(ctx->workSize), ctx->stridedIndex, int(1u<<ctx->memChunk), ctx->compMode ? 1 : 0, 
+		hashIterations, threadMemMask, int_port(ctx->workSize), ctx->stridedIndex, int(1u<<ctx->memChunk), ctx->compMode ? 1 : 0,
 		int_port(hashMemSize), int(miner_algo));
 	/* create a hash for the compile time cache
 	 * used data:
@@ -497,7 +497,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 	}
 	else
 	{
-		printer::inst()->print_msg(L1, "OpenCL device %u - Load precompiled cod from file %s",ctx->deviceIdx, cache_file.c_str());
+		printer::inst()->print_msg(L1, "OpenCL device %u - Load precompiled code from file %s",ctx->deviceIdx, cache_file.c_str());
 		std::ostringstream ss;
 		ss << clBinFile.rdbuf();
 		std::string s = ss.str();
-- 
cgit v1.1


From 75ae1dd60165141700c36ab699656e9e073e186d Mon Sep 17 00:00:00 2001
From: Tony Butler <spudz76@gmail.com>
Date: Thu, 5 Apr 2018 23:19:48 -0600
Subject: Repair all 'namepsace' to 'namespace' (all within comments)

---
 xmrstak/backend/amd/autoAdjust.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'xmrstak/backend/amd')

diff --git a/xmrstak/backend/amd/autoAdjust.hpp b/xmrstak/backend/amd/autoAdjust.hpp
index ea057a0..c798cf3 100644
--- a/xmrstak/backend/amd/autoAdjust.hpp
+++ b/xmrstak/backend/amd/autoAdjust.hpp
@@ -155,4 +155,4 @@ private:
 };
 
 } // namespace amd
-} // namepsace xmrstak
+} // namespace xmrstak
-- 
cgit v1.1


From a57976439b1322eecb5e29198f3d9676d6f06909 Mon Sep 17 00:00:00 2001
From: psychocrypt <psychocryptHPC@gmail.com>
Date: Sat, 7 Apr 2018 22:53:43 +0200
Subject: amd simplify kernel for different algorithms

- remove version numbers within the kernel
- create seperate program context for each mining algorithm
- remove kernel `cn1_monero` is now integrated in `cn1`
- remname `cnX` kernel in `cnX + algorithmNumber`
---
 xmrstak/backend/amd/amd_gpu/gpu.cpp               | 413 +++++++++++-----------
 xmrstak/backend/amd/amd_gpu/gpu.hpp               |   8 +-
 xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl | 301 +++++-----------
 xmrstak/backend/amd/minethd.cpp                   |   4 +-
 4 files changed, 295 insertions(+), 431 deletions(-)

(limited to 'xmrstak/backend/amd')

diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp
index 13f018e..b9cc9b6 100644
--- a/xmrstak/backend/amd/amd_gpu/gpu.cpp
+++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp
@@ -26,6 +26,7 @@
 #include <algorithm>
 #include <regex>
 #include <cassert>
+#include <algorithm> 
 
 #include <fstream>
 #include <sstream>
@@ -307,12 +308,12 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 		return ERR_OCL_API;
 	}
 
-	size_t hashMemSize = cn_select_memory(::jconf::inst()->GetMiningAlgo());
+	size_t scratchPadSize = cn_select_memory(::jconf::inst()->GetMiningAlgo());
 	int threadMemMask = cn_select_mask(::jconf::inst()->GetMiningAlgo());
 	int hashIterations = cn_select_iter(::jconf::inst()->GetMiningAlgo());
 
 	size_t g_thd = ctx->rawIntensity;
-	ctx->ExtraBuffers[0] = clCreateBuffer(opencl_ctx, CL_MEM_READ_WRITE, hashMemSize * g_thd, NULL, &ret);
+	ctx->ExtraBuffers[0] = clCreateBuffer(opencl_ctx, CL_MEM_READ_WRITE, scratchPadSize * g_thd, NULL, &ret);
 	if(ret != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clCreateBuffer to create hash scratchpads buffer.", err_to_str(ret));
@@ -373,167 +374,202 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 		return ERR_OCL_API;
 	}
 
-	auto miner_algo = ::jconf::inst()->GetMiningAlgo();
+	xmrstak_algo miner_algo[2] = {
+		::jconf::inst()->GetMiningAlgo(),
+		::jconf::inst()->GetMiningAlgoRoot()
+	};
+	int num_algos = miner_algo[0] == miner_algo[1] ? 1 : 2;
 
-	char options[512];
-	snprintf(options, sizeof(options),
-		"-DITERATIONS=%d -DMASK=%d -DWORKSIZE=%llu -DSTRIDED_INDEX=%d -DMEM_CHUNK_EXPONENT=%d  -DCOMP_MODE=%d -DMEMORY=%llu -DALGO=%d",
-		hashIterations, threadMemMask, int_port(ctx->workSize), ctx->stridedIndex, int(1u<<ctx->memChunk), ctx->compMode ? 1 : 0,
-		int_port(hashMemSize), int(miner_algo));
-	/* create a hash for the compile time cache
-	 * used data:
-	 *   - source code
-	 *   - device name
-	 *   - compile paramater
-	 */
-	std::string src_str(source_code);
-	src_str += options;
-	src_str += devNameVec.data();
-	std::string hash_hex_str;
-	picosha2::hash256_hex_string(src_str, hash_hex_str);
-
-	std::string cache_file = get_home() + "/.openclcache/" + hash_hex_str + ".openclbin";
-	std::ifstream clBinFile(cache_file, std::ofstream::in | std::ofstream::binary);
-	if(xmrstak::params::inst().AMDCache == false || !clBinFile.good())
+	for(int ii = 0; ii < num_algos; ++ii)
 	{
-		if(xmrstak::params::inst().AMDCache)
-			printer::inst()->print_msg(L1,"OpenCL device %u - Precompiled code %s not found. Compiling ...",ctx->deviceIdx, cache_file.c_str());
-		ctx->Program = clCreateProgramWithSource(opencl_ctx, 1, (const char**)&source_code, NULL, &ret);
-		if(ret != CL_SUCCESS)
-		{
-			printer::inst()->print_msg(L1,"Error %s when calling clCreateProgramWithSource on the OpenCL miner code", err_to_str(ret));
-			return ERR_OCL_API;
-		}
+		size_t hashMemSize = cn_select_memory(miner_algo[ii]);
+		int threadMemMask = cn_select_mask(miner_algo[ii]);
+		int hashIterations = cn_select_iter(miner_algo[ii]);
 
-		ret = clBuildProgram(ctx->Program, 1, &ctx->DeviceID, options, NULL, NULL);
-		if(ret != CL_SUCCESS)
+		char options[512];
+		snprintf(options, sizeof(options),
+			"-DITERATIONS=%d -DMASK=%d -DWORKSIZE=%llu -DSTRIDED_INDEX=%d -DMEM_CHUNK_EXPONENT=%d  -DCOMP_MODE=%d -DMEMORY=%llu -DALGO=%d",
+		hashIterations, threadMemMask, int_port(ctx->workSize), ctx->stridedIndex, int(1u<<ctx->memChunk), ctx->compMode ? 1 : 0,
+			int_port(hashMemSize), int(miner_algo[ii]));
+		/* create a hash for the compile time cache
+		 * used data:
+		 *   - source code
+		 *   - device name
+		 *   - compile paramater
+		 */
+		std::string src_str(source_code);
+		src_str += options;
+		src_str += devNameVec.data();
+		std::string hash_hex_str;
+		picosha2::hash256_hex_string(src_str, hash_hex_str);
+
+		std::string cache_file = get_home() + "/.openclcache/" + hash_hex_str + ".openclbin";
+		std::ifstream clBinFile(cache_file, std::ofstream::in | std::ofstream::binary);
+		if(xmrstak::params::inst().AMDCache == false || !clBinFile.good())
 		{
-			size_t len;
-			printer::inst()->print_msg(L1,"Error %s when calling clBuildProgram.", err_to_str(ret));
-
-			if((ret = clGetProgramBuildInfo(ctx->Program, ctx->DeviceID, CL_PROGRAM_BUILD_LOG, 0, NULL, &len)) != CL_SUCCESS)
+			if(xmrstak::params::inst().AMDCache)
+				printer::inst()->print_msg(L1,"OpenCL device %u - Precompiled code %s not found. Compiling ...",ctx->deviceIdx, cache_file.c_str());
+			ctx->Program[ii] = clCreateProgramWithSource(opencl_ctx, 1, (const char**)&source_code, NULL, &ret);
+			if(ret != CL_SUCCESS)
 			{
-				printer::inst()->print_msg(L1,"Error %s when calling clGetProgramBuildInfo for length of build log output.", err_to_str(ret));
+				printer::inst()->print_msg(L1,"Error %s when calling clCreateProgramWithSource on the OpenCL miner code", err_to_str(ret));
 				return ERR_OCL_API;
 			}
 
-			char* BuildLog = (char*)malloc(len + 1);
-			BuildLog[0] = '\0';
-
-			if((ret = clGetProgramBuildInfo(ctx->Program, ctx->DeviceID, CL_PROGRAM_BUILD_LOG, len, BuildLog, NULL)) != CL_SUCCESS)
+			ret = clBuildProgram(ctx->Program[ii], 1, &ctx->DeviceID, options, NULL, NULL);
+			if(ret != CL_SUCCESS)
 			{
+				size_t len;
+				printer::inst()->print_msg(L1,"Error %s when calling clBuildProgram.", err_to_str(ret));
+
+				if((ret = clGetProgramBuildInfo(ctx->Program[ii], ctx->DeviceID, CL_PROGRAM_BUILD_LOG, 0, NULL, &len)) != CL_SUCCESS)
+				{
+					printer::inst()->print_msg(L1,"Error %s when calling clGetProgramBuildInfo for length of build log output.", err_to_str(ret));
+					return ERR_OCL_API;
+				}
+
+				char* BuildLog = (char*)malloc(len + 1);
+				BuildLog[0] = '\0';
+
+				if((ret = clGetProgramBuildInfo(ctx->Program[ii], ctx->DeviceID, CL_PROGRAM_BUILD_LOG, len, BuildLog, NULL)) != CL_SUCCESS)
+				{
+					free(BuildLog);
+					printer::inst()->print_msg(L1,"Error %s when calling clGetProgramBuildInfo for build log.", err_to_str(ret));
+					return ERR_OCL_API;
+				}
+
+				printer::inst()->print_str("Build log:\n");
+				std::cerr<<BuildLog<<std::endl;
+
 				free(BuildLog);
-				printer::inst()->print_msg(L1,"Error %s when calling clGetProgramBuildInfo for build log.", err_to_str(ret));
 				return ERR_OCL_API;
 			}
 
-			printer::inst()->print_str("Build log:\n");
-			std::cerr<<BuildLog<<std::endl;
-
-			free(BuildLog);
-			return ERR_OCL_API;
-		}
-
-		cl_uint num_devices;
-		clGetProgramInfo(ctx->Program, CL_PROGRAM_NUM_DEVICES, sizeof(cl_uint), &num_devices,NULL);
+			cl_uint num_devices;
+			clGetProgramInfo(ctx->Program[ii], CL_PROGRAM_NUM_DEVICES, sizeof(cl_uint), &num_devices,NULL);
 
 
-		std::vector<cl_device_id> devices_ids(num_devices);
-		clGetProgramInfo(ctx->Program, CL_PROGRAM_DEVICES, sizeof(cl_device_id)* devices_ids.size(), devices_ids.data(),NULL);
-		int dev_id = 0;
-		/* Search for the gpu within the program context.
-		 * The id can be different to  ctx->DeviceID.
-		 */
-		for(auto & ocl_device : devices_ids)
-		{
-			if(ocl_device == ctx->DeviceID)
-				break;
-			dev_id++;
-		}
+			std::vector<cl_device_id> devices_ids(num_devices);
+			clGetProgramInfo(ctx->Program[ii], CL_PROGRAM_DEVICES, sizeof(cl_device_id)* devices_ids.size(), devices_ids.data(),NULL);
+			int dev_id = 0;
+			/* Search for the gpu within the program context.
+			 * The id can be different to  ctx->DeviceID.
+			 */
+			for(auto & ocl_device : devices_ids)
+			{
+				if(ocl_device == ctx->DeviceID)
+					break;
+				dev_id++;
+			}
 
-		cl_build_status status;
-		do
-		{
-			if((ret = clGetProgramBuildInfo(ctx->Program, ctx->DeviceID, CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &status, NULL)) != CL_SUCCESS)
+			cl_build_status status;
+			do
 			{
-				printer::inst()->print_msg(L1,"Error %s when calling clGetProgramBuildInfo for status of build.", err_to_str(ret));
-				return ERR_OCL_API;
+				if((ret = clGetProgramBuildInfo(ctx->Program[ii], ctx->DeviceID, CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &status, NULL)) != CL_SUCCESS)
+				{
+					printer::inst()->print_msg(L1,"Error %s when calling clGetProgramBuildInfo for status of build.", err_to_str(ret));
+					return ERR_OCL_API;
+				}
+				port_sleep(1);
 			}
-			port_sleep(1);
-		}
-		while(status == CL_BUILD_IN_PROGRESS);
+			while(status == CL_BUILD_IN_PROGRESS);
+
+			std::vector<size_t> binary_sizes(num_devices);
+			clGetProgramInfo (ctx->Program[ii], CL_PROGRAM_BINARY_SIZES, sizeof(size_t) * binary_sizes.size(), binary_sizes.data(), NULL);
 
-		std::vector<size_t> binary_sizes(num_devices);
-		clGetProgramInfo (ctx->Program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t) * binary_sizes.size(), binary_sizes.data(), NULL);
+			std::vector<char*> all_programs(num_devices);
+			std::vector<std::vector<char>> program_storage;
 
-		std::vector<char*> all_programs(num_devices);
-		std::vector<std::vector<char>> program_storage;
+			if(xmrstak::params::inst().AMDCache)
+			{
+				int p_id = 0;
+				size_t mem_size = 0;
+				// create memory  structure to query all OpenCL program binaries
+				for(auto & p : all_programs)
+				{
+					program_storage.emplace_back(std::vector<char>(binary_sizes[p_id]));
+					all_programs[p_id] = program_storage[p_id].data();
+					mem_size += binary_sizes[p_id];
+					p_id++;
+				}
 
-		if(xmrstak::params::inst().AMDCache)
+			if((ret = clGetProgramInfo(ctx->Program[ii], CL_PROGRAM_BINARIES, num_devices * sizeof(char*), all_programs.data(),NULL)) != CL_SUCCESS)
+				{
+					printer::inst()->print_msg(L1,"Error %s when calling clGetProgramInfo.", err_to_str(ret));
+					return ERR_OCL_API;
+				}
+
+				std::ofstream file_stream;
+				file_stream.open(cache_file, std::ofstream::out | std::ofstream::binary);
+				file_stream.write(all_programs[dev_id], binary_sizes[dev_id]);
+				file_stream.close();
+				printer::inst()->print_msg(L1, "OpenCL device %u - Precompiled code stored in file %s",ctx->deviceIdx, cache_file.c_str());
+			}
+		}
+		else
 		{
-			int p_id = 0;
-			size_t mem_size = 0;
-			// create memory  structure to query all OpenCL program binaries
-			for(auto & p : all_programs)
+			printer::inst()->print_msg(L1, "OpenCL device %u - Load precompiled code from file %s",ctx->deviceIdx, cache_file.c_str());
+			std::ostringstream ss;
+			ss << clBinFile.rdbuf();
+			std::string s = ss.str();
+
+			size_t bin_size = s.size();
+			auto data_ptr = s.data();
+
+			cl_int clStatus;
+			ctx->Program[ii] = clCreateProgramWithBinary(
+				opencl_ctx, 1, &ctx->DeviceID, &bin_size,
+				(const unsigned char **)&data_ptr, &clStatus, &ret
+			);
+			if(ret != CL_SUCCESS)
 			{
-				program_storage.emplace_back(std::vector<char>(binary_sizes[p_id]));
-				all_programs[p_id] = program_storage[p_id].data();
-				mem_size += binary_sizes[p_id];
-				p_id++;
+				printer::inst()->print_msg(L1,"Error %s when calling clCreateProgramWithBinary. Try to delete file %s", err_to_str(ret), cache_file.c_str());
+				return ERR_OCL_API;
 			}
-
-		if((ret = clGetProgramInfo(ctx->Program, CL_PROGRAM_BINARIES, num_devices * sizeof(char*), all_programs.data(),NULL)) != CL_SUCCESS)
+			ret = clBuildProgram(ctx->Program[ii], 1, &ctx->DeviceID, NULL, NULL, NULL);
+			if(ret != CL_SUCCESS)
 			{
-				printer::inst()->print_msg(L1,"Error %s when calling clGetProgramInfo.", err_to_str(ret));
+				printer::inst()->print_msg(L1,"Error %s when calling clBuildProgram. Try to delete file %s", err_to_str(ret), cache_file.c_str());
 				return ERR_OCL_API;
 			}
-
-			std::ofstream file_stream;
-			file_stream.open(cache_file, std::ofstream::out | std::ofstream::binary);
-			file_stream.write(all_programs[dev_id], binary_sizes[dev_id]);
-			file_stream.close();
-			printer::inst()->print_msg(L1, "OpenCL device %u - Precompiled code stored in file %s",ctx->deviceIdx, cache_file.c_str());
 		}
-	}
-	else
-	{
-		printer::inst()->print_msg(L1, "OpenCL device %u - Load precompiled code from file %s",ctx->deviceIdx, cache_file.c_str());
-		std::ostringstream ss;
-		ss << clBinFile.rdbuf();
-		std::string s = ss.str();
-
-		size_t bin_size = s.size();
-		auto data_ptr = s.data();
-
-		cl_int clStatus;
-		ctx->Program = clCreateProgramWithBinary(
-			opencl_ctx, 1, &ctx->DeviceID, &bin_size,
-			(const unsigned char **)&data_ptr, &clStatus, &ret
-		);
-		if(ret != CL_SUCCESS)
+
+		std::vector<std::string> KernelNames = { "cn0", "cn1", "cn2", "Blake", "Groestl", "JH", "Skein" };
+		// append algorithm number to kernel name
+		for(int k = 0; k < 3; k++)
+			KernelNames[k] += std::to_string(miner_algo[ii]);
+
+		if(ii == 0)
 		{
-			printer::inst()->print_msg(L1,"Error %s when calling clCreateProgramWithBinary. Try to delete file %s", err_to_str(ret), cache_file.c_str());
-			return ERR_OCL_API;
+			for(int i = 0; i < 7; ++i)
+			{
+				ctx->Kernels[ii][i] = clCreateKernel(ctx->Program[ii], KernelNames[i].c_str(), &ret);
+				if(ret != CL_SUCCESS)
+				{
+					printer::inst()->print_msg(L1,"Error %s when calling clCreateKernel for kernel_0 %s.", err_to_str(ret), KernelNames[i].c_str());
+					return ERR_OCL_API;
+				}
+			}
 		}
-		ret = clBuildProgram(ctx->Program, 1, &ctx->DeviceID, NULL, NULL, NULL);
-		if(ret != CL_SUCCESS)
+		else
 		{
-			printer::inst()->print_msg(L1,"Error %s when calling clBuildProgram. Try to delete file %s", err_to_str(ret), cache_file.c_str());
-			return ERR_OCL_API;
-		}
-	}
+			for(int i = 0; i < 3; ++i)
+			{
+				ctx->Kernels[ii][i] = clCreateKernel(ctx->Program[ii], KernelNames[i].c_str(), &ret);
+				if(ret != CL_SUCCESS)
+				{
+					printer::inst()->print_msg(L1,"Error %s when calling clCreateKernel for kernel_1 %s.", err_to_str(ret), KernelNames[i].c_str());
+					return ERR_OCL_API;
+				}
+			}
+			// move kernel from the main algorithm into the root algorithm kernel space
+			for(int i = 3; i < 7; ++i)
+			{
+				ctx->Kernels[ii][i] = ctx->Kernels[0][i];
+			}
 
-	const char *KernelNames[] = { "cn0", "cn1", "cn2", "Blake", "Groestl", "JH", "Skein", "cn1_monero" };
-	for(int i = 0; i < 8; ++i)
-	{
-		ctx->Kernels[i] = clCreateKernel(ctx->Program, KernelNames[i], &ret);
-		if(ret != CL_SUCCESS)
-		{
-			printer::inst()->print_msg(L1,"Error %s when calling clCreateKernel for kernel %s.", err_to_str(ret), KernelNames[i]);
-			return ERR_OCL_API;
 		}
 	}
-
 	ctx->Nonce = 0;
 	return 0;
 }
@@ -881,8 +917,11 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx)
 	return ERR_SUCCESS;
 }
 
-size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t target, xmrstak_algo miner_algo, uint32_t version)
+size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t target, xmrstak_algo miner_algo)
 {
+	// switch to the kernel storage 
+	int kernel_storage = miner_algo == ::jconf::inst()->GetMiningAlgo() ? 0 : 1;
+
 	cl_int ret;
 
 	if(input_len > 84)
@@ -899,71 +938,51 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar
 		return ERR_OCL_API;
 	}
 
-	if((ret = clSetKernelArg(ctx->Kernels[0], 0, sizeof(cl_mem), &ctx->InputBuffer)) != CL_SUCCESS)
+	if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][0], 0, sizeof(cl_mem), &ctx->InputBuffer)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 0.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
 	// Scratchpads
-	if((ret = clSetKernelArg(ctx->Kernels[0], 1, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS)
+	if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][0], 1, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 1.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
 	// States
-	if((ret = clSetKernelArg(ctx->Kernels[0], 2, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS)
+	if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][0], 2, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 2.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
 	// Threads
-	if((ret = clSetKernelArg(ctx->Kernels[0], 3, sizeof(cl_ulong), &numThreads)) != CL_SUCCESS)
+	if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][0], 3, sizeof(cl_ulong), &numThreads)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 3.", err_to_str(ret));
 		return(ERR_OCL_API);
 	}
 
-	/* ATTENTION: if we miner cryptonight_heavy the kernel needs an additional parameter version.
-	 * Do NOT use the variable `miner_algo` because this variable is changed dynamicly
-	 */
-	if(::jconf::inst()->GetMiningAlgo() == cryptonight_heavy)
-	{
-		// version
-		if ((ret = clSetKernelArg(ctx->Kernels[0], 4, sizeof(cl_uint), &version)) != CL_SUCCESS)
-		{
-			printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 0, argument 4.", err_to_str(ret));
-			return ERR_OCL_API;
-		}
-	}
-
 	// CN1 Kernel
 
-	/// @todo only activate if currency is monero
-	int cn_kernel_offset = 0;
-	if(miner_algo == cryptonight_monero || miner_algo == cryptonight_aeon)
-	{
-		cn_kernel_offset = 6;
-	}
-
 	// Scratchpads
-	if((ret = clSetKernelArg(ctx->Kernels[1 + cn_kernel_offset], 0, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS)
+	if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][1], 0, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 1, argument 0.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
 	// States
-	if((ret = clSetKernelArg(ctx->Kernels[1 + cn_kernel_offset], 1, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS)
+	if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][1], 1, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 1, argument 1.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
 	// Threads
-	if((ret = clSetKernelArg(ctx->Kernels[1 + cn_kernel_offset], 2, sizeof(cl_ulong), &numThreads)) != CL_SUCCESS)
+	if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][1], 2, sizeof(cl_ulong), &numThreads)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 1, argument 2.", err_to_str(ret));
 		return(ERR_OCL_API);
@@ -972,113 +991,88 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar
 	if(miner_algo == cryptonight_monero || miner_algo == cryptonight_aeon )
 	{
 		// Input
-		if ((ret = clSetKernelArg(ctx->Kernels[1 + cn_kernel_offset], 3, sizeof(cl_mem), &ctx->InputBuffer)) != CL_SUCCESS)
+		if ((ret = clSetKernelArg(ctx->Kernels[kernel_storage][1], 3, sizeof(cl_mem), &ctx->InputBuffer)) != CL_SUCCESS)
 		{
 			printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 1, arugment 4(input buffer).", err_to_str(ret));
 			return ERR_OCL_API;
 		}
 	}
-	/* ATTENTION: if we miner cryptonight_heavy the kernel needs an additional parameter version.
-	 * Do NOT use the variable `miner_algo` because this variable is changed dynamicly
-	 */
-	else if(::jconf::inst()->GetMiningAlgo() == cryptonight_heavy)
-	{
-		// version
-		if ((ret = clSetKernelArg(ctx->Kernels[1], 3, sizeof(cl_uint), &version)) != CL_SUCCESS)
-		{
-			printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 1, argument 3 (version).", err_to_str(ret));
-			return ERR_OCL_API;
-		}
-	}
 
 	// CN3 Kernel
 	// Scratchpads
-	if((ret = clSetKernelArg(ctx->Kernels[2], 0, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS)
+	if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][2], 0, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 0.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
 	// States
-	if((ret = clSetKernelArg(ctx->Kernels[2], 1, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS)
+	if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][2], 1, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 1.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
 	// Branch 0
-	if((ret = clSetKernelArg(ctx->Kernels[2], 2, sizeof(cl_mem), ctx->ExtraBuffers + 2)) != CL_SUCCESS)
+	if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][2], 2, sizeof(cl_mem), ctx->ExtraBuffers + 2)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 2.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
 	// Branch 1
-	if((ret = clSetKernelArg(ctx->Kernels[2], 3, sizeof(cl_mem), ctx->ExtraBuffers + 3)) != CL_SUCCESS)
+	if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][2], 3, sizeof(cl_mem), ctx->ExtraBuffers + 3)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 3.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
 	// Branch 2
-	if((ret = clSetKernelArg(ctx->Kernels[2], 4, sizeof(cl_mem), ctx->ExtraBuffers + 4)) != CL_SUCCESS)
+	if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][2], 4, sizeof(cl_mem), ctx->ExtraBuffers + 4)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 4.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
 	// Branch 3
-	if((ret = clSetKernelArg(ctx->Kernels[2], 5, sizeof(cl_mem), ctx->ExtraBuffers + 5)) != CL_SUCCESS)
+	if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][2], 5, sizeof(cl_mem), ctx->ExtraBuffers + 5)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 5.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
 	// Threads
-	if((ret = clSetKernelArg(ctx->Kernels[2], 6, sizeof(cl_ulong), &numThreads)) != CL_SUCCESS)
+	if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][2], 6, sizeof(cl_ulong), &numThreads)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 6.", err_to_str(ret));
 		return(ERR_OCL_API);
 	}
 
-    /* ATTENTION: if we miner cryptonight_heavy the kernel needs an additional parameter version.
-	 * Do NOT use the variable `miner_algo` because this variable is changed dynamicly
-	 */
-	if(::jconf::inst()->GetMiningAlgo() == cryptonight_heavy)
-	{
-		// version
-		if ((ret = clSetKernelArg(ctx->Kernels[2], 7, sizeof(cl_uint), &version)) != CL_SUCCESS)
-		{
-			printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 2, argument 7.", err_to_str(ret));
-			return ERR_OCL_API;
-		}
-	}
-
 	for(int i = 0; i < 4; ++i)
 	{
 		// States
-		if((ret = clSetKernelArg(ctx->Kernels[i + 3], 0, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS)
+		if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][i + 3], 0, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS)
 		{
 			printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 0);
 			return ERR_OCL_API;
 		}
 
 		// Nonce buffer
-		if((ret = clSetKernelArg(ctx->Kernels[i + 3], 1, sizeof(cl_mem), ctx->ExtraBuffers + (i + 2))) != CL_SUCCESS)
+		if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][i + 3], 1, sizeof(cl_mem), ctx->ExtraBuffers + (i + 2))) != CL_SUCCESS)
 		{
 			printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 1);
 			return ERR_OCL_API;
 		}
 
 		// Output
-		if((ret = clSetKernelArg(ctx->Kernels[i + 3], 2, sizeof(cl_mem), &ctx->OutputBuffer)) != CL_SUCCESS)
+		if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][i + 3], 2, sizeof(cl_mem), &ctx->OutputBuffer)) != CL_SUCCESS)
 		{
 			printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 2);
 			return ERR_OCL_API;
 		}
 
 		// Target
-		if((ret = clSetKernelArg(ctx->Kernels[i + 3], 3, sizeof(cl_ulong), &target)) != CL_SUCCESS)
+		if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][i + 3], 3, sizeof(cl_ulong), &target)) != CL_SUCCESS)
 		{
 			printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 3);
 			return ERR_OCL_API;
@@ -1088,8 +1082,11 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar
 	return ERR_SUCCESS;
 }
 
-size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo, uint32_t version)
+size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo)
 {
+	// switch to the kernel storage
+	int kernel_storage = miner_algo == ::jconf::inst()->GetMiningAlgo() ? 0 : 1;
+	
 	cl_int ret;
 	cl_uint zero = 0;
 	size_t BranchNonces[4];
@@ -1125,35 +1122,21 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo,
 	clFinish(ctx->CommandQueues);
 
 	size_t Nonce[2] = {ctx->Nonce, 1}, gthreads[2] = { g_thd, 8 }, lthreads[2] = { w_size, 8 };
-	if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[0], 2, Nonce, gthreads, lthreads, 0, NULL, NULL)) != CL_SUCCESS)
+	if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[kernel_storage][0], 2, Nonce, gthreads, lthreads, 0, NULL, NULL)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 0);
 		return ERR_OCL_API;
 	}
 
-	/*for(int i = 1; i < 3; ++i)
-	{
-		if((ret = clEnqueueNDRangeKernel(*ctx->CommandQueues, ctx->Kernels[i], 1, &ctx->Nonce, &g_thd, &w_size, 0, NULL, NULL)) != CL_SUCCESS)
-		{
-			Log(LOG_CRITICAL, "Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), i);
-			return(ERR_OCL_API);
-		}
-	}*/
-
 	size_t tmpNonce = ctx->Nonce;
-	/// @todo only activate if currency is monero
-	int cn_kernel_offset = 0;
-	if(miner_algo == cryptonight_monero || miner_algo == cryptonight_aeon)
-	{
-		cn_kernel_offset = 6;
-	}
-	if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[1 + cn_kernel_offset], 1, &tmpNonce, &g_thd, &w_size, 0, NULL, NULL)) != CL_SUCCESS)
+
+	if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[kernel_storage][1], 1, &tmpNonce, &g_thd, &w_size, 0, NULL, NULL)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 1);
 		return ERR_OCL_API;
 	}
 
-	if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[2], 2, Nonce, gthreads, lthreads, 0, NULL, NULL)) != CL_SUCCESS)
+	if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[kernel_storage][2], 2, Nonce, gthreads, lthreads, 0, NULL, NULL)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 2);
 		return ERR_OCL_API;
@@ -1190,7 +1173,7 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo,
 		if(BranchNonces[i])
 		{
 			// Threads
-			if((clSetKernelArg(ctx->Kernels[i + 3], 4, sizeof(cl_ulong), BranchNonces + i)) != CL_SUCCESS)
+			if((clSetKernelArg(ctx->Kernels[kernel_storage][i + 3], 4, sizeof(cl_ulong), BranchNonces + i)) != CL_SUCCESS)
 			{
 				printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 4);
 				return(ERR_OCL_API);
@@ -1201,7 +1184,7 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo,
 			// number of global threads must be a multiple of the work group size (w_size)
 			assert(BranchNonces[i]%w_size == 0);
 			size_t tmpNonce = ctx->Nonce;
-			if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[i + 3], 1, &tmpNonce, BranchNonces + i, &w_size, 0, NULL, NULL)) != CL_SUCCESS)
+			if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[kernel_storage][i + 3], 1, &tmpNonce, BranchNonces + i, &w_size, 0, NULL, NULL)) != CL_SUCCESS)
 			{
 				printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), i + 3);
 				return ERR_OCL_API;
diff --git a/xmrstak/backend/amd/amd_gpu/gpu.hpp b/xmrstak/backend/amd/amd_gpu/gpu.hpp
index a387b15..0db6c90 100644
--- a/xmrstak/backend/amd/amd_gpu/gpu.hpp
+++ b/xmrstak/backend/amd/amd_gpu/gpu.hpp
@@ -35,8 +35,8 @@ struct GpuContext
 	cl_mem InputBuffer;
 	cl_mem OutputBuffer;
 	cl_mem ExtraBuffers[6];
-	cl_program Program;
-	cl_kernel Kernels[8];
+	cl_program Program[2];
+	cl_kernel Kernels[2][8];
 	size_t freeMem;
 	int computeUnits;
 	std::string name;
@@ -50,7 +50,7 @@ int getAMDPlatformIdx();
 std::vector<GpuContext> getAMDDevices(int index);
 
 size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx);
-size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t target, xmrstak_algo miner_algo, uint32_t version);
-size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo, uint32_t version);
+size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t target, xmrstak_algo miner_algo);
+size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo);
 
 
diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
index 5d4e66c..d2ae1a7 100644
--- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
+++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
@@ -433,15 +433,13 @@ inline ulong getIdx()
 #endif
 }
 
-#define  mix_and_propagate(xin) (xin)[(get_local_id(1)) % 8][get_local_id(0)] ^ (xin)[(get_local_id(1) + 1) % 8][get_local_id(0)]
+#define mix_and_propagate(xin) (xin)[(get_local_id(1)) % 8][get_local_id(0)] ^ (xin)[(get_local_id(1) + 1) % 8][get_local_id(0)]
+		
+#define JOIN_DO(x,y) x##y
+#define JOIN(x,y) JOIN_DO(x,y)
 
 __attribute__((reqd_work_group_size(WORKSIZE, 8, 1)))
-__kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ulong *states, ulong Threads
-// cryptonight_heavy
-#if (ALGO == 4)
-		, uint version
-#endif
-)
+__kernel void JOIN(cn0,ALGO)(__global ulong *input, __global uint4 *Scratchpad, __global ulong *states, ulong Threads)
 {
 	ulong State[25];
 	uint ExpandedKey1[40];
@@ -517,23 +515,20 @@ __kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ul
 		
 // cryptonight_heavy
 #if (ALGO == 4)
-	if(version >= 3)
-	{
-		__local uint4 xin[8][WORKSIZE];
+	__local uint4 xin[8][WORKSIZE];
 
-		/* Also left over threads performe this loop.
-		 * The left over thread results will be ignored
-		 */
-		for(size_t i=0; i < 16; i++)
-		{
-			#pragma unroll
-			for(int j = 0; j < 10; ++j)
-				text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey1)[j]);
-			barrier(CLK_LOCAL_MEM_FENCE);
-			xin[get_local_id(1)][get_local_id(0)] = text;
-			barrier(CLK_LOCAL_MEM_FENCE);
-			text = mix_and_propagate(xin);
-		}
+	/* Also left over threads performe this loop.
+	 * The left over thread results will be ignored
+	 */
+	for(size_t i=0; i < 16; i++)
+	{
+		#pragma unroll
+		for(int j = 0; j < 10; ++j)
+			text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey1)[j]);
+		barrier(CLK_LOCAL_MEM_FENCE);
+		xin[get_local_id(1)][get_local_id(0)] = text;
+		barrier(CLK_LOCAL_MEM_FENCE);
+		text = mix_and_propagate(xin);
 	}
 #endif
 
@@ -542,13 +537,9 @@ __kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ul
 	if(gIdx < Threads)
 #endif
 	{
-		int iterations = MEMORY >> 7;
-#if (ALGO == 4)
-		if(version < 3)
-			iterations >>= 1;
-#endif
+
 		#pragma unroll 2
-		for(int i = 0; i < iterations; ++i)
+		for(int i = 0; i < (MEMORY >> 7); ++i)
 		{
 			#pragma unroll
 			for(int j = 0; j < 10; ++j)
@@ -560,22 +551,13 @@ __kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ul
 	mem_fence(CLK_GLOBAL_MEM_FENCE);
 }
 
-#define VARIANT1_1(p) \
-	uint table = 0x75310U; \
-	uint index = (((p).s2 >> 26) & 12) | (((p).s2 >> 23) & 2); \
-	(p).s2 ^= ((table >> index) & 0x30U) << 24
-
-#define VARIANT1_2(p) ((uint2 *)&(p))[0] ^= tweak1_2
-
-#define VARIANT1_INIT() \
-	tweak1_2 = as_uint2(input[4]); \
-	tweak1_2.s0 >>= 24; \
-	tweak1_2.s0 |= tweak1_2.s1 << 8; \
-	tweak1_2.s1 = get_global_id(0); \
-	tweak1_2 ^= as_uint2(states[24])
-
 __attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
-__kernel void cn1_monero(__global uint4 *Scratchpad, __global ulong *states, ulong Threads, __global ulong *input)
+__kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states, ulong Threads
+// cryptonight_monero || cryptonight_aeon
+#if(ALGO == 3 || ALGO == 5)
+, __global ulong *input
+#endif
+)
 {
 	ulong a[2], b[2];
 	__local uint AES0[256], AES1[256], AES2[256], AES3[256];
@@ -592,8 +574,9 @@ __kernel void cn1_monero(__global uint4 *Scratchpad, __global ulong *states, ulo
 	}
 
 	barrier(CLK_LOCAL_MEM_FENCE);
-
+#if(ALGO == 3 || ALGO == 5)
     uint2 tweak1_2;
+#endif
 	uint4 b_x;
 #if(COMP_MODE==1)
 	// do not use early return here
@@ -615,7 +598,13 @@ __kernel void cn1_monero(__global uint4 *Scratchpad, __global ulong *states, ulo
 		b[1] = states[3] ^ states[7];
 
 		b_x = ((uint4 *)b)[0];
-	    VARIANT1_INIT();
+#if(ALGO == 3 || ALGO == 5)
+		tweak1_2 = as_uint2(input[4]);
+		tweak1_2.s0 >>= 24;
+		tweak1_2.s0 |= tweak1_2.s1 << 8;
+		tweak1_2.s1 = get_global_id(0);
+		tweak1_2 ^= as_uint2(states[24]);
+#endif
 	}
 
 	mem_fence(CLK_LOCAL_MEM_FENCE);
@@ -625,17 +614,23 @@ __kernel void cn1_monero(__global uint4 *Scratchpad, __global ulong *states, ulo
 	if(gIdx < Threads)
 #endif
 	{
+		ulong idx0 = a[0];
+
 		#pragma unroll 8
 		for(int i = 0; i < ITERATIONS; ++i)
 		{
 			ulong c[2];
 
-			((uint4 *)c)[0] = Scratchpad[IDX((a[0] & MASK) >> 4)];
+			((uint4 *)c)[0] = Scratchpad[IDX((idx0 & MASK) >> 4)];
 			((uint4 *)c)[0] = AES_Round(AES0, AES1, AES2, AES3, ((uint4 *)c)[0], ((uint4 *)a)[0]);
 
 			b_x ^= ((uint4 *)c)[0];
-			VARIANT1_1(b_x);
-			Scratchpad[IDX((a[0] & MASK) >> 4)] = b_x;
+#if(ALGO == 3 || ALGO == 5)
+			uint table = 0x75310U;
+			uint index = ((b_x.s2 >> 26) & 12) | ((b_x.s2 >> 23) & 2);
+			b_x.s2 ^= ((table >> index) & 0x30U) << 24;
+#endif
+			Scratchpad[IDX((idx0 & MASK) >> 4)] = b_x;
 
 			uint4 tmp;
 			tmp = Scratchpad[IDX((c[0] & MASK) >> 4)];
@@ -643,101 +638,14 @@ __kernel void cn1_monero(__global uint4 *Scratchpad, __global ulong *states, ulo
 			a[1] += c[0] * as_ulong2(tmp).s0;
 			a[0] += mul_hi(c[0], as_ulong2(tmp).s0);
 
-			VARIANT1_2(a[1]);
-			Scratchpad[IDX((c[0] & MASK) >> 4)] = ((uint4 *)a)[0];
-			VARIANT1_2(a[1]);
-
-			((uint4 *)a)[0] ^= tmp;
-
-			b_x = ((uint4 *)c)[0];
-		}
-	}
-	mem_fence(CLK_GLOBAL_MEM_FENCE);
-}
-
-__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
-__kernel void cn1(__global uint4 *Scratchpad, __global ulong *states, ulong Threads
-// cryptonight_heavy
-#if (ALGO == 4)
-		, uint version
-#endif
-)
-{
-	ulong a[2], b[2];
-	__local uint AES0[256], AES1[256], AES2[256], AES3[256];
-
-	const ulong gIdx = getIdx();
-
-	for(int i = get_local_id(0); i < 256; i += WORKSIZE)
-	{
-		const uint tmp = AES0_C[i];
-		AES0[i] = tmp;
-		AES1[i] = rotate(tmp, 8U);
-		AES2[i] = rotate(tmp, 16U);
-		AES3[i] = rotate(tmp, 24U);
-	}
-
-	barrier(CLK_LOCAL_MEM_FENCE);
-
-	uint4 b_x;
-#if(COMP_MODE==1)
-	// do not use early return here
-	if(gIdx < Threads)
-#endif
-	{
-		states += 25 * gIdx;
-#if(STRIDED_INDEX==0)
-		Scratchpad += gIdx * (MEMORY >> 4);
-#elif(STRIDED_INDEX==1)
-		Scratchpad += gIdx;
-#elif(STRIDED_INDEX==2)
-		Scratchpad += get_group_id(0) * (MEMORY >> 4) * WORKSIZE + MEM_CHUNK * get_local_id(0);
-#endif
-
-		a[0] = states[0] ^ states[4];
-		b[0] = states[2] ^ states[6];
-		a[1] = states[1] ^ states[5];
-		b[1] = states[3] ^ states[7];
-
-		b_x = ((uint4 *)b)[0];
-	}
-
-	mem_fence(CLK_LOCAL_MEM_FENCE);
-
-#if(COMP_MODE==1)
-	// do not use early return here
-	if(gIdx < Threads)
-#endif
-	{
-		ulong idx0 = a[0];
-		ulong mask = MASK;
 
-		int iterations = ITERATIONS;
-#if (ALGO == 4)
-		if(version < 3)
-		{
-			iterations <<= 1;
-			mask -= 0x200000;
-		}
+#if(ALGO == 3 || ALGO == 5)
+			((uint2 *)&(a[1]))[0] ^= tweak1_2;
+			Scratchpad[IDX((c[0] & MASK) >> 4)] = ((uint4 *)a)[0];
+			((uint2 *)&(a[1]))[0] ^= tweak1_2;
+#else
+			Scratchpad[IDX((c[0] & MASK) >> 4)] = ((uint4 *)a)[0];
 #endif
-		#pragma unroll 8
-		for(int i = 0; i < iterations; ++i)
-		{
-			ulong c[2];
-
-			((uint4 *)c)[0] = Scratchpad[IDX((idx0 & mask) >> 4)];
-			((uint4 *)c)[0] = AES_Round(AES0, AES1, AES2, AES3, ((uint4 *)c)[0], ((uint4 *)a)[0]);
-			//b_x ^= ((uint4 *)c)[0];
-
-			Scratchpad[IDX((idx0 & mask) >> 4)] = b_x ^ ((uint4 *)c)[0];
-
-			uint4 tmp;
-			tmp = Scratchpad[IDX((c[0] & mask) >> 4)];
-
-			a[1] += c[0] * as_ulong2(tmp).s0;
-			a[0] += mul_hi(c[0], as_ulong2(tmp).s0);
-
-			Scratchpad[IDX((c[0] & mask) >> 4)] = ((uint4 *)a)[0];
 
 			((uint4 *)a)[0] ^= tmp;
 			idx0 = a[0];
@@ -745,14 +653,11 @@ __kernel void cn1(__global uint4 *Scratchpad, __global ulong *states, ulong Thre
 			b_x = ((uint4 *)c)[0];
 // cryptonight_heavy
 #if (ALGO == 4)
-			if(version >= 3)
-			{
-				long n = *((__global long*)(Scratchpad + (IDX((idx0 & mask) >> 4))));
-				int d = ((__global int*)(Scratchpad + (IDX((idx0 & mask) >> 4))))[2];
-				long q = n / (d | 0x5);
-				*((__global long*)(Scratchpad + (IDX((idx0 & mask) >> 4)))) = n ^ q;
-				idx0 = d ^ q;
-			}
+			long n = *((__global long*)(Scratchpad + (IDX((idx0 & MASK) >> 4))));
+			int d = ((__global int*)(Scratchpad + (IDX((idx0 & MASK) >> 4))))[2];
+			long q = n / (d | 0x5);
+			*((__global long*)(Scratchpad + (IDX((idx0 & MASK) >> 4)))) = n ^ q;
+			idx0 = d ^ q;
 #endif
 		}
 	}
@@ -760,12 +665,7 @@ __kernel void cn1(__global uint4 *Scratchpad, __global ulong *states, ulong Thre
 }
 
 __attribute__((reqd_work_group_size(WORKSIZE, 8, 1)))
-__kernel void cn2(__global uint4 *Scratchpad, __global ulong *states, __global uint *Branch0, __global uint *Branch1, __global uint *Branch2, __global uint *Branch3, ulong Threads
-// cryptonight_heavy
-#if (ALGO == 4)
-	, uint version
-#endif
-		)
+__kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states, __global uint *Branch0, __global uint *Branch1, __global uint *Branch2, __global uint *Branch3, ulong Threads)
 {
 	__local uint AES0[256], AES1[256], AES2[256], AES3[256];
 	uint ExpandedKey2[40];
@@ -827,58 +727,42 @@ __kernel void cn2(__global uint4 *Scratchpad, __global ulong *states, __global u
 	if(gIdx < Threads)
 #endif
 	{
-		int iterations = MEMORY >> 7;
 #if (ALGO == 4)
-		if(version < 3)
-		{
-			iterations >>= 1;
-			#pragma unroll 2
-			for(int i = 0; i < iterations; ++i)
-			{
-				text ^= Scratchpad[IDX((i << 3) + get_local_id(1))];
-
-				#pragma unroll 10
-				for(int j = 0; j < 10; ++j)
-					text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]);
-			}
-		}
-		else
+		#pragma unroll 2
+		for(int i = 0; i < (MEMORY >> 7); ++i)
 		{
-			#pragma unroll 2
-			for(int i = 0; i < iterations; ++i)
-			{
-				text ^= Scratchpad[IDX((i << 3) + get_local_id(1))];
+			text ^= Scratchpad[IDX((i << 3) + get_local_id(1))];
 
-				#pragma unroll 10
-				for(int j = 0; j < 10; ++j)
-					text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]);
+			#pragma unroll 10
+			for(int j = 0; j < 10; ++j)
+				text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]);
 
 
-				barrier(CLK_LOCAL_MEM_FENCE);
-				xin[get_local_id(1)][get_local_id(0)] = text;
-				barrier(CLK_LOCAL_MEM_FENCE);
-				text = mix_and_propagate(xin);
-			}
+			barrier(CLK_LOCAL_MEM_FENCE);
+			xin[get_local_id(1)][get_local_id(0)] = text;
+			barrier(CLK_LOCAL_MEM_FENCE);
+			text = mix_and_propagate(xin);
+		}
 
-			#pragma unroll 2
-			for(int i = 0; i < iterations; ++i)
-			{
-				text ^= Scratchpad[IDX((i << 3) + get_local_id(1))];
+		#pragma unroll 2
+		for(int i = 0; i < (MEMORY >> 7); ++i)
+		{
+			text ^= Scratchpad[IDX((i << 3) + get_local_id(1))];
 
-				#pragma unroll 10
-				for(int j = 0; j < 10; ++j)
-					text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]);
+			#pragma unroll 10
+			for(int j = 0; j < 10; ++j)
+				text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]);
 
 
-				barrier(CLK_LOCAL_MEM_FENCE);
-				xin[get_local_id(1)][get_local_id(0)] = text;
-				barrier(CLK_LOCAL_MEM_FENCE);
-				text = mix_and_propagate(xin);
-			}
+			barrier(CLK_LOCAL_MEM_FENCE);
+			xin[get_local_id(1)][get_local_id(0)] = text;
+			barrier(CLK_LOCAL_MEM_FENCE);
+			text = mix_and_propagate(xin);
 		}
+		
 #else
 		#pragma unroll 2
-		for(int i = 0; i < iterations; ++i)
+		for(int i = 0; i < (MEMORY >> 7); ++i)
 		{
 			text ^= Scratchpad[IDX((i << 3) + get_local_id(1))];
 
@@ -891,21 +775,18 @@ __kernel void cn2(__global uint4 *Scratchpad, __global ulong *states, __global u
 
 // cryptonight_heavy
 #if (ALGO == 4)
-	if(version >= 3)
+	/* Also left over threads perform this loop.
+	 * The left over thread results will be ignored
+	 */
+	for(size_t i=0; i < 16; i++)
 	{
-		/* Also left over threads performe this loop.
-		 * The left over thread results will be ignored
-		 */
-		for(size_t i=0; i < 16; i++)
-		{
-			#pragma unroll
-			for(int j = 0; j < 10; ++j)
-				text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]);
-			barrier(CLK_LOCAL_MEM_FENCE);
-			xin[get_local_id(1)][get_local_id(0)] = text;
-			barrier(CLK_LOCAL_MEM_FENCE);
-			text = mix_and_propagate(xin);
-		}
+		#pragma unroll
+		for(int j = 0; j < 10; ++j)
+			text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]);
+		barrier(CLK_LOCAL_MEM_FENCE);
+		xin[get_local_id(1)][get_local_id(0)] = text;
+		barrier(CLK_LOCAL_MEM_FENCE);
+		text = mix_and_propagate(xin);
 	}
 #endif
 
diff --git a/xmrstak/backend/amd/minethd.cpp b/xmrstak/backend/amd/minethd.cpp
index cab5ad9..f15b480 100644
--- a/xmrstak/backend/amd/minethd.cpp
+++ b/xmrstak/backend/amd/minethd.cpp
@@ -233,7 +233,7 @@ void minethd::work_main()
 		assert(sizeof(job_result::sJobID) == sizeof(pool_job::sJobID));
 		uint64_t target = oWork.iTarget;
 		
-		XMRSetJob(pGpuCtx, oWork.bWorkBlob, oWork.iWorkSize, target, miner_algo, version);
+		XMRSetJob(pGpuCtx, oWork.bWorkBlob, oWork.iWorkSize, target, miner_algo);
 
 		if(oWork.bNiceHash)
 			pGpuCtx->Nonce = *(uint32_t*)(oWork.bWorkBlob + 39);
@@ -249,7 +249,7 @@ void minethd::work_main()
 			cl_uint results[0x100];
 			memset(results,0,sizeof(cl_uint)*(0x100));
 
-			XMRRunJob(pGpuCtx, results, miner_algo, version);
+			XMRRunJob(pGpuCtx, results, miner_algo);
 
 			for(size_t i = 0; i < results[0xFF]; i++)
 			{
-- 
cgit v1.1


From 1b18f598aa1190a0e6126ed2c70e052e9403d180 Mon Sep 17 00:00:00 2001
From: psychocrypt <psychocryptHPC@gmail.com>
Date: Sun, 8 Apr 2018 20:40:54 +0200
Subject: refactor scratchpad creation

Use the maximum scratchpad size from before and after the fork.
---
 xmrstak/backend/amd/amd_gpu/gpu.cpp | 10 ++++++----
 xmrstak/backend/amd/autoAdjust.hpp  |  5 ++++-
 2 files changed, 10 insertions(+), 5 deletions(-)

(limited to 'xmrstak/backend/amd')

diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp
index b9cc9b6..79e80bd 100644
--- a/xmrstak/backend/amd/amd_gpu/gpu.cpp
+++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp
@@ -308,9 +308,10 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 		return ERR_OCL_API;
 	}
 
-	size_t scratchPadSize = cn_select_memory(::jconf::inst()->GetMiningAlgo());
-	int threadMemMask = cn_select_mask(::jconf::inst()->GetMiningAlgo());
-	int hashIterations = cn_select_iter(::jconf::inst()->GetMiningAlgo());
+	size_t scratchPadSize = std::max(
+		cn_select_memory(::jconf::inst()->GetMiningAlgo()),
+		cn_select_memory(::jconf::inst()->GetMiningAlgoRoot())
+	);
 
 	size_t g_thd = ctx->rawIntensity;
 	ctx->ExtraBuffers[0] = clCreateBuffer(opencl_ctx, CL_MEM_READ_WRITE, scratchPadSize * g_thd, NULL, &ret);
@@ -382,6 +383,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 
 	for(int ii = 0; ii < num_algos; ++ii)
 	{
+		// scratchpad size for the selected mining algorithm
 		size_t hashMemSize = cn_select_memory(miner_algo[ii]);
 		int threadMemMask = cn_select_mask(miner_algo[ii]);
 		int hashIterations = cn_select_iter(miner_algo[ii]);
@@ -493,7 +495,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 					p_id++;
 				}
 
-			if((ret = clGetProgramInfo(ctx->Program[ii], CL_PROGRAM_BINARIES, num_devices * sizeof(char*), all_programs.data(),NULL)) != CL_SUCCESS)
+				if((ret = clGetProgramInfo(ctx->Program[ii], CL_PROGRAM_BINARIES, num_devices * sizeof(char*), all_programs.data(),NULL)) != CL_SUCCESS)
 				{
 					printer::inst()->print_msg(L1,"Error %s when calling clGetProgramInfo.", err_to_str(ret));
 					return ERR_OCL_API;
diff --git a/xmrstak/backend/amd/autoAdjust.hpp b/xmrstak/backend/amd/autoAdjust.hpp
index c798cf3..6df0eea 100644
--- a/xmrstak/backend/amd/autoAdjust.hpp
+++ b/xmrstak/backend/amd/autoAdjust.hpp
@@ -83,7 +83,10 @@ private:
 
 		constexpr size_t byteToMiB = 1024u * 1024u;
 
-		size_t hashMemSize = cn_select_memory(::jconf::inst()->GetMiningAlgo());
+		size_t hashMemSize = std::max(
+			cn_select_memory(::jconf::inst()->GetMiningAlgo()),
+			cn_select_memory(::jconf::inst()->GetMiningAlgoRoot())
+		);
 
 		std::string conf;
 		for(auto& ctx : devVec)
-- 
cgit v1.1


From 4682b28a5d304436ca20469e5089f97814f3f4ab Mon Sep 17 00:00:00 2001
From: psychocrypt <psychocryptHPC@gmail.com>
Date: Sat, 14 Apr 2018 23:17:33 +0200
Subject: allow non AMD OpenCL driver and devices

- add CLI flag to explicitly use non AMD OpenCL and devices
- adjust OpenCL output (use OpenCL instead of AMD if --altOpenCL is sued)
- optimize NVIDIA OpenCL auto suggestion
---
 xmrstak/backend/amd/amd_gpu/gpu.cpp | 30 ++++++++++++++++++++++--------
 xmrstak/backend/amd/amd_gpu/gpu.hpp |  1 +
 xmrstak/backend/amd/autoAdjust.hpp  | 23 ++++++++++++++++++++---
 xmrstak/backend/amd/minethd.cpp     |  6 ++++--
 4 files changed, 47 insertions(+), 13 deletions(-)

(limited to 'xmrstak/backend/amd')

diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp
index 79e80bd..9a4ba73 100644
--- a/xmrstak/backend/amd/amd_gpu/gpu.cpp
+++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp
@@ -675,11 +675,18 @@ std::vector<GpuContext> getAMDDevices(int index)
 		}
 
 		std::string devVendor(devVendorVec.data());
-		if( devVendor.find("Advanced Micro Devices") != std::string::npos || devVendor.find("AMD") != std::string::npos)
+
+		bool isAMDDevice = devVendor.find("Advanced Micro Devices") != std::string::npos || devVendor.find("AMD") != std::string::npos;
+		bool isNVIDIADevice = devVendor.find("NVIDIA Corporation") != std::string::npos || devVendor.find("NVIDIA") != std::string::npos;
+
+		std::string selectedOpenCLVendor = xmrstak::params::inst().openCLVendor;
+		if((isAMDDevice && selectedOpenCLVendor == "AMD") || (isNVIDIADevice && selectedOpenCLVendor == "NVIDIA"))
 		{
 			GpuContext ctx;
 			std::vector<char> devNameVec(1024);
 			size_t maxMem;
+			if( devVendor.find("NVIDIA Corporation") != std::string::npos)
+				ctx.isNVIDIA = true;
 
 			if((clStatus = clGetDeviceInfo(device_list[k], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(int), &(ctx.computeUnits), NULL)) != CL_SUCCESS)
 			{
@@ -699,6 +706,10 @@ std::vector<GpuContext> getAMDDevices(int index)
 				continue;
 			}
 
+			// the allocation for NVIDIA OpenCL is not limited to 1/4 of the GPU memory per allocation
+			if(ctx.isNVIDIA)
+				maxMem = ctx.freeMem;
+
 			if((clStatus = clGetDeviceInfo(device_list[k], CL_DEVICE_NAME, devNameVec.size(), devNameVec.data(), NULL)) != CL_SUCCESS)
 			{
 				printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get CL_DEVICE_NAME for device %u.", err_to_str(clStatus), k);
@@ -747,13 +758,15 @@ int getAMDPlatformIdx()
 
 			clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, infoSize, platformNameVec.data(), NULL);
 			std::string platformName(platformNameVec.data());
-			if( platformName.find("Advanced Micro Devices") != std::string::npos ||
+
+			bool isAMDOpenCL = platformName.find("Advanced Micro Devices") != std::string::npos ||
 				platformName.find("Apple") != std::string::npos ||
-				platformName.find("Mesa") != std::string::npos
-			)
+				platformName.find("Mesa") != std::string::npos;
+			bool isNVIDIADevice = platformName.find("NVIDIA Corporation") != std::string::npos || platformName.find("NVIDIA") != std::string::npos;
+			std::string selectedOpenCLVendor = xmrstak::params::inst().openCLVendor;
+			if((isAMDOpenCL && selectedOpenCLVendor == "AMD") || (isNVIDIADevice && selectedOpenCLVendor == "NVIDIA"))
 			{
-
-				printer::inst()->print_msg(L0,"Found AMD platform index id = %i, name = %s",i , platformName.c_str());
+				printer::inst()->print_msg(L0,"Found %s platform index id = %i, name = %s", selectedOpenCLVendor.c_str(), i , platformName.c_str());
 				if(platformName.find("Mesa") != std::string::npos)
 					mesaPlatform = i;
 				else
@@ -819,7 +832,7 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx)
 	std::vector<char> platformNameVec(infoSize);
 	clGetPlatformInfo(PlatformIDList[platform_idx], CL_PLATFORM_VENDOR, infoSize, platformNameVec.data(), NULL);
 	std::string platformName(platformNameVec.data());
-	if( platformName.find("Advanced Micro Devices") == std::string::npos)
+	if(xmrstak::params::inst().openCLVendor == "AMD" && platformName.find("Advanced Micro Devices") == std::string::npos)
 	{
 		printer::inst()->print_msg(L1,"WARNING: using non AMD device: %s", platformName.c_str());
 	}
@@ -907,7 +920,8 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx)
 		{
 			size_t reduced_intensity = (ctx[i].rawIntensity / ctx[i].workSize) * ctx[i].workSize;
 			ctx[i].rawIntensity = reduced_intensity;
-			printer::inst()->print_msg(L0, "WARNING AMD: gpu %d intensity is not a multiple of 'worksize', auto reduce intensity to %d", ctx[i].deviceIdx, int(reduced_intensity));
+			const std::string backendName = xmrstak::params::inst().openCLVendor;
+			printer::inst()->print_msg(L0, "WARNING %s: gpu %d intensity is not a multiple of 'worksize', auto reduce intensity to %d", backendName.c_str(), ctx[i].deviceIdx, int(reduced_intensity));
 		}
 
 		if((ret = InitOpenCLGpu(opencl_ctx, &ctx[i], source_code.c_str())) != ERR_SUCCESS)
diff --git a/xmrstak/backend/amd/amd_gpu/gpu.hpp b/xmrstak/backend/amd/amd_gpu/gpu.hpp
index 0db6c90..5ab80b8 100644
--- a/xmrstak/backend/amd/amd_gpu/gpu.hpp
+++ b/xmrstak/backend/amd/amd_gpu/gpu.hpp
@@ -27,6 +27,7 @@ struct GpuContext
 	size_t workSize;
 	int stridedIndex;
 	int memChunk;
+	bool isNVIDIA = false;
 	int compMode;
 
 	/*Output vars*/
diff --git a/xmrstak/backend/amd/autoAdjust.hpp b/xmrstak/backend/amd/autoAdjust.hpp
index 6df0eea..e7e98d4 100644
--- a/xmrstak/backend/amd/autoAdjust.hpp
+++ b/xmrstak/backend/amd/autoAdjust.hpp
@@ -91,6 +91,7 @@ private:
 		std::string conf;
 		for(auto& ctx : devVec)
 		{
+			size_t minFreeMem = 128u * byteToMiB;
 			/* 1000 is a magic selected limit, the reason is that more than 2GiB memory
 			 * sowing down the memory performance because of TLB cache misses
 			 */
@@ -112,12 +113,26 @@ private:
 				 */
 				maxThreads = 2024u;
 			}
+
+			// NVIDIA optimizations
+			if(
+				ctx.isNVIDIA && (
+					ctx.name.find("P100") != std::string::npos ||
+				    ctx.name.find("V100") != std::string::npos
+				)
+			)
+			{
+				// do not limit the number of threads
+				maxThreads = 40000u;
+				minFreeMem = 512u * byteToMiB;
+			}
+
 			// increase all intensity limits by two for aeon
 			if(::jconf::inst()->GetMiningAlgo() == cryptonight_lite)
 				maxThreads *= 2u;
 
 			// keep 128MiB memory free (value is randomly chosen)
-			size_t availableMem = ctx.freeMem - (128u * byteToMiB);
+			size_t availableMem = ctx.freeMem - minFreeMem;
 			// 224byte extra memory is used per thread for meta data
 			size_t perThread = hashMemSize + 224u;
 			size_t maxIntensity = availableMem / perThread;
@@ -138,7 +153,7 @@ private:
 				// set 8 threads per block (this is a good value for the most gpus)
 				conf += std::string("  { \"index\" : ") + std::to_string(ctx.deviceIdx) + ",\n" +
 					"    \"intensity\" : " + std::to_string(intensity) + ", \"worksize\" : " + std::to_string(8) + ",\n" +
-					"    \"affine_to_cpu\" : false, \"strided_index\" : 1, \"mem_chunk\" : 2,\n"
+					"    \"affine_to_cpu\" : false, \"strided_index\" : " + (ctx.isNVIDIA ? "0" : "1") + ", \"mem_chunk\" : 2,\n"
 					"    \"comp_mode\" : true\n" +
 					"  },\n";
 			}
@@ -151,7 +166,9 @@ private:
 		configTpl.replace("PLATFORMINDEX",std::to_string(platformIndex));
 		configTpl.replace("GPUCONFIG",conf);
 		configTpl.write(params::inst().configFileAMD);
-		printer::inst()->print_msg(L0, "AMD: GPU configuration stored in file '%s'", params::inst().configFileAMD.c_str());
+
+		const std::string backendName = xmrstak::params::inst().openCLVendor;
+		printer::inst()->print_msg(L0, "%s: GPU (OpenCL) configuration stored in file '%s'", backendName.c_str(), params::inst().configFileAMD.c_str());
 	}
 
 	std::vector<GpuContext> devVec;
diff --git a/xmrstak/backend/amd/minethd.cpp b/xmrstak/backend/amd/minethd.cpp
index f15b480..9bc3676 100644
--- a/xmrstak/backend/amd/minethd.cpp
+++ b/xmrstak/backend/amd/minethd.cpp
@@ -137,6 +137,8 @@ std::vector<iBackend*>* minethd::thread_starter(uint32_t threadOffset, miner_wor
 	for (i = 0; i < n; i++)
 	{
 		jconf::inst()->GetThreadConfig(i, cfg);
+
+		const std::string backendName = xmrstak::params::inst().openCLVendor;
 		
 		if(cfg.cpu_aff >= 0)
 		{
@@ -144,10 +146,10 @@ std::vector<iBackend*>* minethd::thread_starter(uint32_t threadOffset, miner_wor
 			printer::inst()->print_msg(L1, "WARNING on macOS thread affinity is only advisory.");
 #endif
 
-			printer::inst()->print_msg(L1, "Starting AMD GPU thread %d, affinity: %d.", i, (int)cfg.cpu_aff);
+			printer::inst()->print_msg(L1, "Starting %s GPU (OpenCL) thread %d, affinity: %d.", backendName.c_str(), i, (int)cfg.cpu_aff);
 		}
 		else
-			printer::inst()->print_msg(L1, "Starting AMD GPU thread %d, no affinity.", i);
+			printer::inst()->print_msg(L1, "Starting %s GPU (OpenCL) thread %d, no affinity.", backendName.c_str(), i);
 
 		minethd* thd = new minethd(pWork, i + threadOffset, &vGpuData[i], cfg);
 		pvThreads->push_back(thd);
-- 
cgit v1.1


From a5ddd040a6eeb0609e1d1b0f16c1d271d31c7377 Mon Sep 17 00:00:00 2001
From: psychocrypt <psychocryptHPC@gmail.com>
Date: Mon, 16 Apr 2018 17:52:42 +0200
Subject: fix wrong algo selection

In the case where the dev pool mines on a higher version than a monero fork coin the miner is not resetting the algorithm.
This PR select the correct algorithm each time the block version hash changed.
---
 xmrstak/backend/amd/minethd.cpp | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'xmrstak/backend/amd')

diff --git a/xmrstak/backend/amd/minethd.cpp b/xmrstak/backend/amd/minethd.cpp
index 9bc3676..a3acf9d 100644
--- a/xmrstak/backend/amd/minethd.cpp
+++ b/xmrstak/backend/amd/minethd.cpp
@@ -201,6 +201,7 @@ void minethd::work_main()
 	globalStates::inst().iConsumeCnt++;
 
 	uint8_t version = 0;
+	size_t lastPoolId = 0;
 
 	while (bQuit == 0)
 	{
@@ -219,13 +220,19 @@ void minethd::work_main()
 		}
 
 		uint8_t new_version = oWork.getVersion();
-		if(new_version != version)
+		if(new_version != version || oWork.iPoolId != lastPoolId)
 		{
-			if(new_version >= ::jconf::inst()->GetMiningForkVersion())
+			if(new_version >= ::jconf::inst()->GetMiningForkVersion() || oWork.iPoolId == 0)
 			{
 				miner_algo = ::jconf::inst()->GetMiningAlgo();
 				hash_fun = cpu::minethd::func_selector(::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/, miner_algo);
 			}
+			else
+			{
+				miner_algo = ::jconf::inst()->GetMiningAlgoRoot();
+				hash_fun = cpu::minethd::func_selector(::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/, miner_algo);
+			}
+			lastPoolId = oWork.iPoolId;
 			version = new_version;
 		}
 
-- 
cgit v1.1


From 0877e2f654b203c5145bb8154fcfb1ad46ba8265 Mon Sep 17 00:00:00 2001
From: psychocrypt <psychocryptHPC@gmail.com>
Date: Mon, 16 Apr 2018 21:46:33 +0200
Subject: add independent dev pool coin description

- allow the dev pool to fork on a different block version than the user descriped coin

All algorithm are centered around the user coin description.
It is allowed to have two two different coin algorithms in the user coin description.
It is only allowed to use algorithms for the dev pool coin description those are used in the user coin description.
There are two ways to define a non forking coin.
  - set both user coin algorithm descriptions to the same algorithm and set version to zero
  - set the first algorithm in the user coin description to something you like to use in the dev pool
    and set the second algorithm to the correct representation of the coin. Set the version to 255.
    This will allow that the dev pool can mine on a different coin algorithm than the not forking user coin.
    Do not use an algorithm with different scratchpad size for the dev pool.
---
 xmrstak/backend/amd/amd_gpu/gpu.cpp | 12 ++++++------
 xmrstak/backend/amd/autoAdjust.hpp  |  6 +++---
 xmrstak/backend/amd/minethd.cpp     |  9 +++++----
 3 files changed, 14 insertions(+), 13 deletions(-)

(limited to 'xmrstak/backend/amd')

diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp
index 9a4ba73..b3d36e7 100644
--- a/xmrstak/backend/amd/amd_gpu/gpu.cpp
+++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp
@@ -309,8 +309,8 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 	}
 
 	size_t scratchPadSize = std::max(
-		cn_select_memory(::jconf::inst()->GetMiningAlgo()),
-		cn_select_memory(::jconf::inst()->GetMiningAlgoRoot())
+		cn_select_memory(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo()),
+		cn_select_memory(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot())
 	);
 
 	size_t g_thd = ctx->rawIntensity;
@@ -376,8 +376,8 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 	}
 
 	xmrstak_algo miner_algo[2] = {
-		::jconf::inst()->GetMiningAlgo(),
-		::jconf::inst()->GetMiningAlgoRoot()
+		::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo(),
+		::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot()
 	};
 	int num_algos = miner_algo[0] == miner_algo[1] ? 1 : 2;
 
@@ -936,7 +936,7 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx)
 size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t target, xmrstak_algo miner_algo)
 {
 	// switch to the kernel storage 
-	int kernel_storage = miner_algo == ::jconf::inst()->GetMiningAlgo() ? 0 : 1;
+	int kernel_storage = miner_algo == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() ? 0 : 1;
 
 	cl_int ret;
 
@@ -1101,7 +1101,7 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar
 size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo)
 {
 	// switch to the kernel storage
-	int kernel_storage = miner_algo == ::jconf::inst()->GetMiningAlgo() ? 0 : 1;
+	int kernel_storage = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() ? 0 : 1;
 	
 	cl_int ret;
 	cl_uint zero = 0;
diff --git a/xmrstak/backend/amd/autoAdjust.hpp b/xmrstak/backend/amd/autoAdjust.hpp
index e7e98d4..685890b 100644
--- a/xmrstak/backend/amd/autoAdjust.hpp
+++ b/xmrstak/backend/amd/autoAdjust.hpp
@@ -84,8 +84,8 @@ private:
 		constexpr size_t byteToMiB = 1024u * 1024u;
 
 		size_t hashMemSize = std::max(
-			cn_select_memory(::jconf::inst()->GetMiningAlgo()),
-			cn_select_memory(::jconf::inst()->GetMiningAlgoRoot())
+			cn_select_memory(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo()),
+			cn_select_memory(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot())
 		);
 
 		std::string conf;
@@ -128,7 +128,7 @@ private:
 			}
 
 			// increase all intensity limits by two for aeon
-			if(::jconf::inst()->GetMiningAlgo() == cryptonight_lite)
+			if(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_lite)
 				maxThreads *= 2u;
 
 			// keep 128MiB memory free (value is randomly chosen)
diff --git a/xmrstak/backend/amd/minethd.cpp b/xmrstak/backend/amd/minethd.cpp
index a3acf9d..4353e3d 100644
--- a/xmrstak/backend/amd/minethd.cpp
+++ b/xmrstak/backend/amd/minethd.cpp
@@ -195,7 +195,7 @@ void minethd::work_main()
 	cpu_ctx = cpu::minethd::minethd_alloc_ctx();
 	
 	// start with root algorithm and switch later if fork version is reached
-	auto miner_algo = ::jconf::inst()->GetMiningAlgoRoot();
+	auto miner_algo = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot();
 	cn_hash_fun hash_fun = cpu::minethd::func_selector(::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/, miner_algo);
 
 	globalStates::inst().iConsumeCnt++;
@@ -222,14 +222,15 @@ void minethd::work_main()
 		uint8_t new_version = oWork.getVersion();
 		if(new_version != version || oWork.iPoolId != lastPoolId)
 		{
-			if(new_version >= ::jconf::inst()->GetMiningForkVersion() || oWork.iPoolId == 0)
+			coinDescription coinDesc = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(oWork.iPoolId);
+			if(new_version >= coinDesc.GetMiningForkVersion())
 			{
-				miner_algo = ::jconf::inst()->GetMiningAlgo();
+				miner_algo = coinDesc.GetMiningAlgo();
 				hash_fun = cpu::minethd::func_selector(::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/, miner_algo);
 			}
 			else
 			{
-				miner_algo = ::jconf::inst()->GetMiningAlgoRoot();
+				miner_algo = coinDesc.GetMiningAlgoRoot();
 				hash_fun = cpu::minethd::func_selector(::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/, miner_algo);
 			}
 			lastPoolId = oWork.iPoolId;
-- 
cgit v1.1


From 053dcb01d814daab686d66ba82da96ce2aec7747 Mon Sep 17 00:00:00 2001
From: psychocrypt <psychocryptHPC@gmail.com>
Date: Mon, 16 Apr 2018 22:06:38 +0200
Subject: remove fork for sumokoin and monero

remove fork version for sumokoin and monero7
---
 xmrstak/backend/amd/amd_gpu/gpu.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'xmrstak/backend/amd')

diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp
index b3d36e7..03100d0 100644
--- a/xmrstak/backend/amd/amd_gpu/gpu.cpp
+++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp
@@ -1101,7 +1101,7 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar
 size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo)
 {
 	// switch to the kernel storage
-	int kernel_storage = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() ? 0 : 1;
+	int kernel_storage = miner_algo == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() ? 0 : 1;
 	
 	cl_int ret;
 	cl_uint zero = 0;
-- 
cgit v1.1