summaryrefslogtreecommitdiffstats
path: root/xmrstak/backend/amd
diff options
context:
space:
mode:
authorpsychocrypt <psychocryptHPC@gmail.com>2018-04-07 22:53:43 +0200
committerTimothy Pearson <tpearson@raptorengineering.com>2018-06-04 21:07:11 +0000
commit571e37a9269f71bc5f1c14ad963dee9f89a71ae5 (patch)
treec8bb21c7d086f4e018d9479a2dc66c71cfb4083a /xmrstak/backend/amd
parentbf6dfc61038ab98a5102cd1eaeb3befc56836642 (diff)
downloadxmr-stak-571e37a9269f71bc5f1c14ad963dee9f89a71ae5.zip
xmr-stak-571e37a9269f71bc5f1c14ad963dee9f89a71ae5.tar.gz
amd simplify kernel for different algorithms
- remove version numbers within the kernel - create seperate program context for each mining algorithm - remove kernel `cn1_monero` is now integrated in `cn1` - remname `cnX` kernel in `cnX + algorithmNumber`
Diffstat (limited to 'xmrstak/backend/amd')
-rw-r--r--xmrstak/backend/amd/amd_gpu/gpu.cpp413
-rw-r--r--xmrstak/backend/amd/amd_gpu/gpu.hpp8
-rw-r--r--xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl301
-rw-r--r--xmrstak/backend/amd/minethd.cpp4
4 files changed, 295 insertions, 431 deletions
diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp
index 13f018e..b9cc9b6 100644
--- a/xmrstak/backend/amd/amd_gpu/gpu.cpp
+++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp
@@ -26,6 +26,7 @@
#include <algorithm>
#include <regex>
#include <cassert>
+#include <algorithm>
#include <fstream>
#include <sstream>
@@ -307,12 +308,12 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
return ERR_OCL_API;
}
- size_t hashMemSize = cn_select_memory(::jconf::inst()->GetMiningAlgo());
+ size_t scratchPadSize = cn_select_memory(::jconf::inst()->GetMiningAlgo());
int threadMemMask = cn_select_mask(::jconf::inst()->GetMiningAlgo());
int hashIterations = cn_select_iter(::jconf::inst()->GetMiningAlgo());
size_t g_thd = ctx->rawIntensity;
- ctx->ExtraBuffers[0] = clCreateBuffer(opencl_ctx, CL_MEM_READ_WRITE, hashMemSize * g_thd, NULL, &ret);
+ ctx->ExtraBuffers[0] = clCreateBuffer(opencl_ctx, CL_MEM_READ_WRITE, scratchPadSize * g_thd, NULL, &ret);
if(ret != CL_SUCCESS)
{
printer::inst()->print_msg(L1,"Error %s when calling clCreateBuffer to create hash scratchpads buffer.", err_to_str(ret));
@@ -373,167 +374,202 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
return ERR_OCL_API;
}
- auto miner_algo = ::jconf::inst()->GetMiningAlgo();
+ xmrstak_algo miner_algo[2] = {
+ ::jconf::inst()->GetMiningAlgo(),
+ ::jconf::inst()->GetMiningAlgoRoot()
+ };
+ int num_algos = miner_algo[0] == miner_algo[1] ? 1 : 2;
- char options[512];
- snprintf(options, sizeof(options),
- "-DITERATIONS=%d -DMASK=%d -DWORKSIZE=%llu -DSTRIDED_INDEX=%d -DMEM_CHUNK_EXPONENT=%d -DCOMP_MODE=%d -DMEMORY=%llu -DALGO=%d",
- hashIterations, threadMemMask, int_port(ctx->workSize), ctx->stridedIndex, int(1u<<ctx->memChunk), ctx->compMode ? 1 : 0,
- int_port(hashMemSize), int(miner_algo));
- /* create a hash for the compile time cache
- * used data:
- * - source code
- * - device name
- * - compile paramater
- */
- std::string src_str(source_code);
- src_str += options;
- src_str += devNameVec.data();
- std::string hash_hex_str;
- picosha2::hash256_hex_string(src_str, hash_hex_str);
-
- std::string cache_file = get_home() + "/.openclcache/" + hash_hex_str + ".openclbin";
- std::ifstream clBinFile(cache_file, std::ofstream::in | std::ofstream::binary);
- if(xmrstak::params::inst().AMDCache == false || !clBinFile.good())
+ for(int ii = 0; ii < num_algos; ++ii)
{
- if(xmrstak::params::inst().AMDCache)
- printer::inst()->print_msg(L1,"OpenCL device %u - Precompiled code %s not found. Compiling ...",ctx->deviceIdx, cache_file.c_str());
- ctx->Program = clCreateProgramWithSource(opencl_ctx, 1, (const char**)&source_code, NULL, &ret);
- if(ret != CL_SUCCESS)
- {
- printer::inst()->print_msg(L1,"Error %s when calling clCreateProgramWithSource on the OpenCL miner code", err_to_str(ret));
- return ERR_OCL_API;
- }
+ size_t hashMemSize = cn_select_memory(miner_algo[ii]);
+ int threadMemMask = cn_select_mask(miner_algo[ii]);
+ int hashIterations = cn_select_iter(miner_algo[ii]);
- ret = clBuildProgram(ctx->Program, 1, &ctx->DeviceID, options, NULL, NULL);
- if(ret != CL_SUCCESS)
+ char options[512];
+ snprintf(options, sizeof(options),
+ "-DITERATIONS=%d -DMASK=%d -DWORKSIZE=%llu -DSTRIDED_INDEX=%d -DMEM_CHUNK_EXPONENT=%d -DCOMP_MODE=%d -DMEMORY=%llu -DALGO=%d",
+ hashIterations, threadMemMask, int_port(ctx->workSize), ctx->stridedIndex, int(1u<<ctx->memChunk), ctx->compMode ? 1 : 0,
+ int_port(hashMemSize), int(miner_algo[ii]));
+ /* create a hash for the compile time cache
+ * used data:
+ * - source code
+ * - device name
+ * - compile paramater
+ */
+ std::string src_str(source_code);
+ src_str += options;
+ src_str += devNameVec.data();
+ std::string hash_hex_str;
+ picosha2::hash256_hex_string(src_str, hash_hex_str);
+
+ std::string cache_file = get_home() + "/.openclcache/" + hash_hex_str + ".openclbin";
+ std::ifstream clBinFile(cache_file, std::ofstream::in | std::ofstream::binary);
+ if(xmrstak::params::inst().AMDCache == false || !clBinFile.good())
{
- size_t len;
- printer::inst()->print_msg(L1,"Error %s when calling clBuildProgram.", err_to_str(ret));
-
- if((ret = clGetProgramBuildInfo(ctx->Program, ctx->DeviceID, CL_PROGRAM_BUILD_LOG, 0, NULL, &len)) != CL_SUCCESS)
+ if(xmrstak::params::inst().AMDCache)
+ printer::inst()->print_msg(L1,"OpenCL device %u - Precompiled code %s not found. Compiling ...",ctx->deviceIdx, cache_file.c_str());
+ ctx->Program[ii] = clCreateProgramWithSource(opencl_ctx, 1, (const char**)&source_code, NULL, &ret);
+ if(ret != CL_SUCCESS)
{
- printer::inst()->print_msg(L1,"Error %s when calling clGetProgramBuildInfo for length of build log output.", err_to_str(ret));
+ printer::inst()->print_msg(L1,"Error %s when calling clCreateProgramWithSource on the OpenCL miner code", err_to_str(ret));
return ERR_OCL_API;
}
- char* BuildLog = (char*)malloc(len + 1);
- BuildLog[0] = '\0';
-
- if((ret = clGetProgramBuildInfo(ctx->Program, ctx->DeviceID, CL_PROGRAM_BUILD_LOG, len, BuildLog, NULL)) != CL_SUCCESS)
+ ret = clBuildProgram(ctx->Program[ii], 1, &ctx->DeviceID, options, NULL, NULL);
+ if(ret != CL_SUCCESS)
{
+ size_t len;
+ printer::inst()->print_msg(L1,"Error %s when calling clBuildProgram.", err_to_str(ret));
+
+ if((ret = clGetProgramBuildInfo(ctx->Program[ii], ctx->DeviceID, CL_PROGRAM_BUILD_LOG, 0, NULL, &len)) != CL_SUCCESS)
+ {
+ printer::inst()->print_msg(L1,"Error %s when calling clGetProgramBuildInfo for length of build log output.", err_to_str(ret));
+ return ERR_OCL_API;
+ }
+
+ char* BuildLog = (char*)malloc(len + 1);
+ BuildLog[0] = '\0';
+
+ if((ret = clGetProgramBuildInfo(ctx->Program[ii], ctx->DeviceID, CL_PROGRAM_BUILD_LOG, len, BuildLog, NULL)) != CL_SUCCESS)
+ {
+ free(BuildLog);
+ printer::inst()->print_msg(L1,"Error %s when calling clGetProgramBuildInfo for build log.", err_to_str(ret));
+ return ERR_OCL_API;
+ }
+
+ printer::inst()->print_str("Build log:\n");
+ std::cerr<<BuildLog<<std::endl;
+
free(BuildLog);
- printer::inst()->print_msg(L1,"Error %s when calling clGetProgramBuildInfo for build log.", err_to_str(ret));
return ERR_OCL_API;
}
- printer::inst()->print_str("Build log:\n");
- std::cerr<<BuildLog<<std::endl;
-
- free(BuildLog);
- return ERR_OCL_API;
- }
-
- cl_uint num_devices;
- clGetProgramInfo(ctx->Program, CL_PROGRAM_NUM_DEVICES, sizeof(cl_uint), &num_devices,NULL);
+ cl_uint num_devices;
+ clGetProgramInfo(ctx->Program[ii], CL_PROGRAM_NUM_DEVICES, sizeof(cl_uint), &num_devices,NULL);
- std::vector<cl_device_id> devices_ids(num_devices);
- clGetProgramInfo(ctx->Program, CL_PROGRAM_DEVICES, sizeof(cl_device_id)* devices_ids.size(), devices_ids.data(),NULL);
- int dev_id = 0;
- /* Search for the gpu within the program context.
- * The id can be different to ctx->DeviceID.
- */
- for(auto & ocl_device : devices_ids)
- {
- if(ocl_device == ctx->DeviceID)
- break;
- dev_id++;
- }
+ std::vector<cl_device_id> devices_ids(num_devices);
+ clGetProgramInfo(ctx->Program[ii], CL_PROGRAM_DEVICES, sizeof(cl_device_id)* devices_ids.size(), devices_ids.data(),NULL);
+ int dev_id = 0;
+ /* Search for the gpu within the program context.
+ * The id can be different to ctx->DeviceID.
+ */
+ for(auto & ocl_device : devices_ids)
+ {
+ if(ocl_device == ctx->DeviceID)
+ break;
+ dev_id++;
+ }
- cl_build_status status;
- do
- {
- if((ret = clGetProgramBuildInfo(ctx->Program, ctx->DeviceID, CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &status, NULL)) != CL_SUCCESS)
+ cl_build_status status;
+ do
{
- printer::inst()->print_msg(L1,"Error %s when calling clGetProgramBuildInfo for status of build.", err_to_str(ret));
- return ERR_OCL_API;
+ if((ret = clGetProgramBuildInfo(ctx->Program[ii], ctx->DeviceID, CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &status, NULL)) != CL_SUCCESS)
+ {
+ printer::inst()->print_msg(L1,"Error %s when calling clGetProgramBuildInfo for status of build.", err_to_str(ret));
+ return ERR_OCL_API;
+ }
+ port_sleep(1);
}
- port_sleep(1);
- }
- while(status == CL_BUILD_IN_PROGRESS);
+ while(status == CL_BUILD_IN_PROGRESS);
+
+ std::vector<size_t> binary_sizes(num_devices);
+ clGetProgramInfo (ctx->Program[ii], CL_PROGRAM_BINARY_SIZES, sizeof(size_t) * binary_sizes.size(), binary_sizes.data(), NULL);
- std::vector<size_t> binary_sizes(num_devices);
- clGetProgramInfo (ctx->Program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t) * binary_sizes.size(), binary_sizes.data(), NULL);
+ std::vector<char*> all_programs(num_devices);
+ std::vector<std::vector<char>> program_storage;
- std::vector<char*> all_programs(num_devices);
- std::vector<std::vector<char>> program_storage;
+ if(xmrstak::params::inst().AMDCache)
+ {
+ int p_id = 0;
+ size_t mem_size = 0;
+ // create memory structure to query all OpenCL program binaries
+ for(auto & p : all_programs)
+ {
+ program_storage.emplace_back(std::vector<char>(binary_sizes[p_id]));
+ all_programs[p_id] = program_storage[p_id].data();
+ mem_size += binary_sizes[p_id];
+ p_id++;
+ }
- if(xmrstak::params::inst().AMDCache)
+ if((ret = clGetProgramInfo(ctx->Program[ii], CL_PROGRAM_BINARIES, num_devices * sizeof(char*), all_programs.data(),NULL)) != CL_SUCCESS)
+ {
+ printer::inst()->print_msg(L1,"Error %s when calling clGetProgramInfo.", err_to_str(ret));
+ return ERR_OCL_API;
+ }
+
+ std::ofstream file_stream;
+ file_stream.open(cache_file, std::ofstream::out | std::ofstream::binary);
+ file_stream.write(all_programs[dev_id], binary_sizes[dev_id]);
+ file_stream.close();
+ printer::inst()->print_msg(L1, "OpenCL device %u - Precompiled code stored in file %s",ctx->deviceIdx, cache_file.c_str());
+ }
+ }
+ else
{
- int p_id = 0;
- size_t mem_size = 0;
- // create memory structure to query all OpenCL program binaries
- for(auto & p : all_programs)
+ printer::inst()->print_msg(L1, "OpenCL device %u - Load precompiled code from file %s",ctx->deviceIdx, cache_file.c_str());
+ std::ostringstream ss;
+ ss << clBinFile.rdbuf();
+ std::string s = ss.str();
+
+ size_t bin_size = s.size();
+ auto data_ptr = s.data();
+
+ cl_int clStatus;
+ ctx->Program[ii] = clCreateProgramWithBinary(
+ opencl_ctx, 1, &ctx->DeviceID, &bin_size,
+ (const unsigned char **)&data_ptr, &clStatus, &ret
+ );
+ if(ret != CL_SUCCESS)
{
- program_storage.emplace_back(std::vector<char>(binary_sizes[p_id]));
- all_programs[p_id] = program_storage[p_id].data();
- mem_size += binary_sizes[p_id];
- p_id++;
+ printer::inst()->print_msg(L1,"Error %s when calling clCreateProgramWithBinary. Try to delete file %s", err_to_str(ret), cache_file.c_str());
+ return ERR_OCL_API;
}
-
- if((ret = clGetProgramInfo(ctx->Program, CL_PROGRAM_BINARIES, num_devices * sizeof(char*), all_programs.data(),NULL)) != CL_SUCCESS)
+ ret = clBuildProgram(ctx->Program[ii], 1, &ctx->DeviceID, NULL, NULL, NULL);
+ if(ret != CL_SUCCESS)
{
- printer::inst()->print_msg(L1,"Error %s when calling clGetProgramInfo.", err_to_str(ret));
+ printer::inst()->print_msg(L1,"Error %s when calling clBuildProgram. Try to delete file %s", err_to_str(ret), cache_file.c_str());
return ERR_OCL_API;
}
-
- std::ofstream file_stream;
- file_stream.open(cache_file, std::ofstream::out | std::ofstream::binary);
- file_stream.write(all_programs[dev_id], binary_sizes[dev_id]);
- file_stream.close();
- printer::inst()->print_msg(L1, "OpenCL device %u - Precompiled code stored in file %s",ctx->deviceIdx, cache_file.c_str());
}
- }
- else
- {
- printer::inst()->print_msg(L1, "OpenCL device %u - Load precompiled code from file %s",ctx->deviceIdx, cache_file.c_str());
- std::ostringstream ss;
- ss << clBinFile.rdbuf();
- std::string s = ss.str();
-
- size_t bin_size = s.size();
- auto data_ptr = s.data();
-
- cl_int clStatus;
- ctx->Program = clCreateProgramWithBinary(
- opencl_ctx, 1, &ctx->DeviceID, &bin_size,
- (const unsigned char **)&data_ptr, &clStatus, &ret
- );
- if(ret != CL_SUCCESS)
+
+ std::vector<std::string> KernelNames = { "cn0", "cn1", "cn2", "Blake", "Groestl", "JH", "Skein" };
+ // append algorithm number to kernel name
+ for(int k = 0; k < 3; k++)
+ KernelNames[k] += std::to_string(miner_algo[ii]);
+
+ if(ii == 0)
{
- printer::inst()->print_msg(L1,"Error %s when calling clCreateProgramWithBinary. Try to delete file %s", err_to_str(ret), cache_file.c_str());
- return ERR_OCL_API;
+ for(int i = 0; i < 7; ++i)
+ {
+ ctx->Kernels[ii][i] = clCreateKernel(ctx->Program[ii], KernelNames[i].c_str(), &ret);
+ if(ret != CL_SUCCESS)
+ {
+ printer::inst()->print_msg(L1,"Error %s when calling clCreateKernel for kernel_0 %s.", err_to_str(ret), KernelNames[i].c_str());
+ return ERR_OCL_API;
+ }
+ }
}
- ret = clBuildProgram(ctx->Program, 1, &ctx->DeviceID, NULL, NULL, NULL);
- if(ret != CL_SUCCESS)
+ else
{
- printer::inst()->print_msg(L1,"Error %s when calling clBuildProgram. Try to delete file %s", err_to_str(ret), cache_file.c_str());
- return ERR_OCL_API;
- }
- }
+ for(int i = 0; i < 3; ++i)
+ {
+ ctx->Kernels[ii][i] = clCreateKernel(ctx->Program[ii], KernelNames[i].c_str(), &ret);
+ if(ret != CL_SUCCESS)
+ {
+ printer::inst()->print_msg(L1,"Error %s when calling clCreateKernel for kernel_1 %s.", err_to_str(ret), KernelNames[i].c_str());
+ return ERR_OCL_API;
+ }
+ }
+ // move kernel from the main algorithm into the root algorithm kernel space
+ for(int i = 3; i < 7; ++i)
+ {
+ ctx->Kernels[ii][i] = ctx->Kernels[0][i];
+ }
- const char *KernelNames[] = { "cn0", "cn1", "cn2", "Blake", "Groestl", "JH", "Skein", "cn1_monero" };
- for(int i = 0; i < 8; ++i)
- {
- ctx->Kernels[i] = clCreateKernel(ctx->Program, KernelNames[i], &ret);
- if(ret != CL_SUCCESS)
- {
- printer::inst()->print_msg(L1,"Error %s when calling clCreateKernel for kernel %s.", err_to_str(ret), KernelNames[i]);
- return ERR_OCL_API;
}
}
-
ctx->Nonce = 0;
return 0;
}
@@ -881,8 +917,11 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx)
return ERR_SUCCESS;
}
-size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t target, xmrstak_algo miner_algo, uint32_t version)
+size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t target, xmrstak_algo miner_algo)
{
+ // switch to the kernel storage
+ int kernel_storage = miner_algo == ::jconf::inst()->GetMiningAlgo() ? 0 : 1;
+
cl_int ret;
if(input_len > 84)
@@ -899,71 +938,51 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar
return ERR_OCL_API;
}
- if((ret = clSetKernelArg(ctx->Kernels[0], 0, sizeof(cl_mem), &ctx->InputBuffer)) != CL_SUCCESS)
+ if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][0], 0, sizeof(cl_mem), &ctx->InputBuffer)) != CL_SUCCESS)
{
printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 0.", err_to_str(ret));
return ERR_OCL_API;
}
// Scratchpads
- if((ret = clSetKernelArg(ctx->Kernels[0], 1, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS)
+ if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][0], 1, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS)
{
printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 1.", err_to_str(ret));
return ERR_OCL_API;
}
// States
- if((ret = clSetKernelArg(ctx->Kernels[0], 2, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS)
+ if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][0], 2, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS)
{
printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 2.", err_to_str(ret));
return ERR_OCL_API;
}
// Threads
- if((ret = clSetKernelArg(ctx->Kernels[0], 3, sizeof(cl_ulong), &numThreads)) != CL_SUCCESS)
+ if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][0], 3, sizeof(cl_ulong), &numThreads)) != CL_SUCCESS)
{
printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 3.", err_to_str(ret));
return(ERR_OCL_API);
}
- /* ATTENTION: if we miner cryptonight_heavy the kernel needs an additional parameter version.
- * Do NOT use the variable `miner_algo` because this variable is changed dynamicly
- */
- if(::jconf::inst()->GetMiningAlgo() == cryptonight_heavy)
- {
- // version
- if ((ret = clSetKernelArg(ctx->Kernels[0], 4, sizeof(cl_uint), &version)) != CL_SUCCESS)
- {
- printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 0, argument 4.", err_to_str(ret));
- return ERR_OCL_API;
- }
- }
-
// CN1 Kernel
- /// @todo only activate if currency is monero
- int cn_kernel_offset = 0;
- if(miner_algo == cryptonight_monero || miner_algo == cryptonight_aeon)
- {
- cn_kernel_offset = 6;
- }
-
// Scratchpads
- if((ret = clSetKernelArg(ctx->Kernels[1 + cn_kernel_offset], 0, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS)
+ if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][1], 0, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS)
{
printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 1, argument 0.", err_to_str(ret));
return ERR_OCL_API;
}
// States
- if((ret = clSetKernelArg(ctx->Kernels[1 + cn_kernel_offset], 1, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS)
+ if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][1], 1, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS)
{
printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 1, argument 1.", err_to_str(ret));
return ERR_OCL_API;
}
// Threads
- if((ret = clSetKernelArg(ctx->Kernels[1 + cn_kernel_offset], 2, sizeof(cl_ulong), &numThreads)) != CL_SUCCESS)
+ if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][1], 2, sizeof(cl_ulong), &numThreads)) != CL_SUCCESS)
{
printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 1, argument 2.", err_to_str(ret));
return(ERR_OCL_API);
@@ -972,113 +991,88 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar
if(miner_algo == cryptonight_monero || miner_algo == cryptonight_aeon )
{
// Input
- if ((ret = clSetKernelArg(ctx->Kernels[1 + cn_kernel_offset], 3, sizeof(cl_mem), &ctx->InputBuffer)) != CL_SUCCESS)
+ if ((ret = clSetKernelArg(ctx->Kernels[kernel_storage][1], 3, sizeof(cl_mem), &ctx->InputBuffer)) != CL_SUCCESS)
{
printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 1, arugment 4(input buffer).", err_to_str(ret));
return ERR_OCL_API;
}
}
- /* ATTENTION: if we miner cryptonight_heavy the kernel needs an additional parameter version.
- * Do NOT use the variable `miner_algo` because this variable is changed dynamicly
- */
- else if(::jconf::inst()->GetMiningAlgo() == cryptonight_heavy)
- {
- // version
- if ((ret = clSetKernelArg(ctx->Kernels[1], 3, sizeof(cl_uint), &version)) != CL_SUCCESS)
- {
- printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 1, argument 3 (version).", err_to_str(ret));
- return ERR_OCL_API;
- }
- }
// CN3 Kernel
// Scratchpads
- if((ret = clSetKernelArg(ctx->Kernels[2], 0, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS)
+ if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][2], 0, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS)
{
printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 0.", err_to_str(ret));
return ERR_OCL_API;
}
// States
- if((ret = clSetKernelArg(ctx->Kernels[2], 1, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS)
+ if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][2], 1, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS)
{
printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 1.", err_to_str(ret));
return ERR_OCL_API;
}
// Branch 0
- if((ret = clSetKernelArg(ctx->Kernels[2], 2, sizeof(cl_mem), ctx->ExtraBuffers + 2)) != CL_SUCCESS)
+ if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][2], 2, sizeof(cl_mem), ctx->ExtraBuffers + 2)) != CL_SUCCESS)
{
printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 2.", err_to_str(ret));
return ERR_OCL_API;
}
// Branch 1
- if((ret = clSetKernelArg(ctx->Kernels[2], 3, sizeof(cl_mem), ctx->ExtraBuffers + 3)) != CL_SUCCESS)
+ if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][2], 3, sizeof(cl_mem), ctx->ExtraBuffers + 3)) != CL_SUCCESS)
{
printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 3.", err_to_str(ret));
return ERR_OCL_API;
}
// Branch 2
- if((ret = clSetKernelArg(ctx->Kernels[2], 4, sizeof(cl_mem), ctx->ExtraBuffers + 4)) != CL_SUCCESS)
+ if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][2], 4, sizeof(cl_mem), ctx->ExtraBuffers + 4)) != CL_SUCCESS)
{
printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 4.", err_to_str(ret));
return ERR_OCL_API;
}
// Branch 3
- if((ret = clSetKernelArg(ctx->Kernels[2], 5, sizeof(cl_mem), ctx->ExtraBuffers + 5)) != CL_SUCCESS)
+ if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][2], 5, sizeof(cl_mem), ctx->ExtraBuffers + 5)) != CL_SUCCESS)
{
printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 5.", err_to_str(ret));
return ERR_OCL_API;
}
// Threads
- if((ret = clSetKernelArg(ctx->Kernels[2], 6, sizeof(cl_ulong), &numThreads)) != CL_SUCCESS)
+ if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][2], 6, sizeof(cl_ulong), &numThreads)) != CL_SUCCESS)
{
printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 6.", err_to_str(ret));
return(ERR_OCL_API);
}
- /* ATTENTION: if we miner cryptonight_heavy the kernel needs an additional parameter version.
- * Do NOT use the variable `miner_algo` because this variable is changed dynamicly
- */
- if(::jconf::inst()->GetMiningAlgo() == cryptonight_heavy)
- {
- // version
- if ((ret = clSetKernelArg(ctx->Kernels[2], 7, sizeof(cl_uint), &version)) != CL_SUCCESS)
- {
- printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 2, argument 7.", err_to_str(ret));
- return ERR_OCL_API;
- }
- }
-
for(int i = 0; i < 4; ++i)
{
// States
- if((ret = clSetKernelArg(ctx->Kernels[i + 3], 0, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS)
+ if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][i + 3], 0, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS)
{
printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 0);
return ERR_OCL_API;
}
// Nonce buffer
- if((ret = clSetKernelArg(ctx->Kernels[i + 3], 1, sizeof(cl_mem), ctx->ExtraBuffers + (i + 2))) != CL_SUCCESS)
+ if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][i + 3], 1, sizeof(cl_mem), ctx->ExtraBuffers + (i + 2))) != CL_SUCCESS)
{
printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 1);
return ERR_OCL_API;
}
// Output
- if((ret = clSetKernelArg(ctx->Kernels[i + 3], 2, sizeof(cl_mem), &ctx->OutputBuffer)) != CL_SUCCESS)
+ if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][i + 3], 2, sizeof(cl_mem), &ctx->OutputBuffer)) != CL_SUCCESS)
{
printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 2);
return ERR_OCL_API;
}
// Target
- if((ret = clSetKernelArg(ctx->Kernels[i + 3], 3, sizeof(cl_ulong), &target)) != CL_SUCCESS)
+ if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][i + 3], 3, sizeof(cl_ulong), &target)) != CL_SUCCESS)
{
printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 3);
return ERR_OCL_API;
@@ -1088,8 +1082,11 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar
return ERR_SUCCESS;
}
-size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo, uint32_t version)
+size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo)
{
+ // switch to the kernel storage
+ int kernel_storage = miner_algo == ::jconf::inst()->GetMiningAlgo() ? 0 : 1;
+
cl_int ret;
cl_uint zero = 0;
size_t BranchNonces[4];
@@ -1125,35 +1122,21 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo,
clFinish(ctx->CommandQueues);
size_t Nonce[2] = {ctx->Nonce, 1}, gthreads[2] = { g_thd, 8 }, lthreads[2] = { w_size, 8 };
- if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[0], 2, Nonce, gthreads, lthreads, 0, NULL, NULL)) != CL_SUCCESS)
+ if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[kernel_storage][0], 2, Nonce, gthreads, lthreads, 0, NULL, NULL)) != CL_SUCCESS)
{
printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 0);
return ERR_OCL_API;
}
- /*for(int i = 1; i < 3; ++i)
- {
- if((ret = clEnqueueNDRangeKernel(*ctx->CommandQueues, ctx->Kernels[i], 1, &ctx->Nonce, &g_thd, &w_size, 0, NULL, NULL)) != CL_SUCCESS)
- {
- Log(LOG_CRITICAL, "Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), i);
- return(ERR_OCL_API);
- }
- }*/
-
size_t tmpNonce = ctx->Nonce;
- /// @todo only activate if currency is monero
- int cn_kernel_offset = 0;
- if(miner_algo == cryptonight_monero || miner_algo == cryptonight_aeon)
- {
- cn_kernel_offset = 6;
- }
- if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[1 + cn_kernel_offset], 1, &tmpNonce, &g_thd, &w_size, 0, NULL, NULL)) != CL_SUCCESS)
+
+ if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[kernel_storage][1], 1, &tmpNonce, &g_thd, &w_size, 0, NULL, NULL)) != CL_SUCCESS)
{
printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 1);
return ERR_OCL_API;
}
- if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[2], 2, Nonce, gthreads, lthreads, 0, NULL, NULL)) != CL_SUCCESS)
+ if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[kernel_storage][2], 2, Nonce, gthreads, lthreads, 0, NULL, NULL)) != CL_SUCCESS)
{
printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 2);
return ERR_OCL_API;
@@ -1190,7 +1173,7 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo,
if(BranchNonces[i])
{
// Threads
- if((clSetKernelArg(ctx->Kernels[i + 3], 4, sizeof(cl_ulong), BranchNonces + i)) != CL_SUCCESS)
+ if((clSetKernelArg(ctx->Kernels[kernel_storage][i + 3], 4, sizeof(cl_ulong), BranchNonces + i)) != CL_SUCCESS)
{
printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 4);
return(ERR_OCL_API);
@@ -1201,7 +1184,7 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo,
// number of global threads must be a multiple of the work group size (w_size)
assert(BranchNonces[i]%w_size == 0);
size_t tmpNonce = ctx->Nonce;
- if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[i + 3], 1, &tmpNonce, BranchNonces + i, &w_size, 0, NULL, NULL)) != CL_SUCCESS)
+ if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[kernel_storage][i + 3], 1, &tmpNonce, BranchNonces + i, &w_size, 0, NULL, NULL)) != CL_SUCCESS)
{
printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), i + 3);
return ERR_OCL_API;
diff --git a/xmrstak/backend/amd/amd_gpu/gpu.hpp b/xmrstak/backend/amd/amd_gpu/gpu.hpp
index a387b15..0db6c90 100644
--- a/xmrstak/backend/amd/amd_gpu/gpu.hpp
+++ b/xmrstak/backend/amd/amd_gpu/gpu.hpp
@@ -35,8 +35,8 @@ struct GpuContext
cl_mem InputBuffer;
cl_mem OutputBuffer;
cl_mem ExtraBuffers[6];
- cl_program Program;
- cl_kernel Kernels[8];
+ cl_program Program[2];
+ cl_kernel Kernels[2][8];
size_t freeMem;
int computeUnits;
std::string name;
@@ -50,7 +50,7 @@ int getAMDPlatformIdx();
std::vector<GpuContext> getAMDDevices(int index);
size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx);
-size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t target, xmrstak_algo miner_algo, uint32_t version);
-size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo, uint32_t version);
+size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t target, xmrstak_algo miner_algo);
+size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo);
diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
index 5d4e66c..d2ae1a7 100644
--- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
+++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
@@ -433,15 +433,13 @@ inline ulong getIdx()
#endif
}
-#define mix_and_propagate(xin) (xin)[(get_local_id(1)) % 8][get_local_id(0)] ^ (xin)[(get_local_id(1) + 1) % 8][get_local_id(0)]
+#define mix_and_propagate(xin) (xin)[(get_local_id(1)) % 8][get_local_id(0)] ^ (xin)[(get_local_id(1) + 1) % 8][get_local_id(0)]
+
+#define JOIN_DO(x,y) x##y
+#define JOIN(x,y) JOIN_DO(x,y)
__attribute__((reqd_work_group_size(WORKSIZE, 8, 1)))
-__kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ulong *states, ulong Threads
-// cryptonight_heavy
-#if (ALGO == 4)
- , uint version
-#endif
-)
+__kernel void JOIN(cn0,ALGO)(__global ulong *input, __global uint4 *Scratchpad, __global ulong *states, ulong Threads)
{
ulong State[25];
uint ExpandedKey1[40];
@@ -517,23 +515,20 @@ __kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ul
// cryptonight_heavy
#if (ALGO == 4)
- if(version >= 3)
- {
- __local uint4 xin[8][WORKSIZE];
+ __local uint4 xin[8][WORKSIZE];
- /* Also left over threads performe this loop.
- * The left over thread results will be ignored
- */
- for(size_t i=0; i < 16; i++)
- {
- #pragma unroll
- for(int j = 0; j < 10; ++j)
- text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey1)[j]);
- barrier(CLK_LOCAL_MEM_FENCE);
- xin[get_local_id(1)][get_local_id(0)] = text;
- barrier(CLK_LOCAL_MEM_FENCE);
- text = mix_and_propagate(xin);
- }
+ /* Also left over threads performe this loop.
+ * The left over thread results will be ignored
+ */
+ for(size_t i=0; i < 16; i++)
+ {
+ #pragma unroll
+ for(int j = 0; j < 10; ++j)
+ text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey1)[j]);
+ barrier(CLK_LOCAL_MEM_FENCE);
+ xin[get_local_id(1)][get_local_id(0)] = text;
+ barrier(CLK_LOCAL_MEM_FENCE);
+ text = mix_and_propagate(xin);
}
#endif
@@ -542,13 +537,9 @@ __kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ul
if(gIdx < Threads)
#endif
{
- int iterations = MEMORY >> 7;
-#if (ALGO == 4)
- if(version < 3)
- iterations >>= 1;
-#endif
+
#pragma unroll 2
- for(int i = 0; i < iterations; ++i)
+ for(int i = 0; i < (MEMORY >> 7); ++i)
{
#pragma unroll
for(int j = 0; j < 10; ++j)
@@ -560,22 +551,13 @@ __kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ul
mem_fence(CLK_GLOBAL_MEM_FENCE);
}
-#define VARIANT1_1(p) \
- uint table = 0x75310U; \
- uint index = (((p).s2 >> 26) & 12) | (((p).s2 >> 23) & 2); \
- (p).s2 ^= ((table >> index) & 0x30U) << 24
-
-#define VARIANT1_2(p) ((uint2 *)&(p))[0] ^= tweak1_2
-
-#define VARIANT1_INIT() \
- tweak1_2 = as_uint2(input[4]); \
- tweak1_2.s0 >>= 24; \
- tweak1_2.s0 |= tweak1_2.s1 << 8; \
- tweak1_2.s1 = get_global_id(0); \
- tweak1_2 ^= as_uint2(states[24])
-
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
-__kernel void cn1_monero(__global uint4 *Scratchpad, __global ulong *states, ulong Threads, __global ulong *input)
+__kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states, ulong Threads
+// cryptonight_monero || cryptonight_aeon
+#if(ALGO == 3 || ALGO == 5)
+, __global ulong *input
+#endif
+)
{
ulong a[2], b[2];
__local uint AES0[256], AES1[256], AES2[256], AES3[256];
@@ -592,8 +574,9 @@ __kernel void cn1_monero(__global uint4 *Scratchpad, __global ulong *states, ulo
}
barrier(CLK_LOCAL_MEM_FENCE);
-
+#if(ALGO == 3 || ALGO == 5)
uint2 tweak1_2;
+#endif
uint4 b_x;
#if(COMP_MODE==1)
// do not use early return here
@@ -615,7 +598,13 @@ __kernel void cn1_monero(__global uint4 *Scratchpad, __global ulong *states, ulo
b[1] = states[3] ^ states[7];
b_x = ((uint4 *)b)[0];
- VARIANT1_INIT();
+#if(ALGO == 3 || ALGO == 5)
+ tweak1_2 = as_uint2(input[4]);
+ tweak1_2.s0 >>= 24;
+ tweak1_2.s0 |= tweak1_2.s1 << 8;
+ tweak1_2.s1 = get_global_id(0);
+ tweak1_2 ^= as_uint2(states[24]);
+#endif
}
mem_fence(CLK_LOCAL_MEM_FENCE);
@@ -625,17 +614,23 @@ __kernel void cn1_monero(__global uint4 *Scratchpad, __global ulong *states, ulo
if(gIdx < Threads)
#endif
{
+ ulong idx0 = a[0];
+
#pragma unroll 8
for(int i = 0; i < ITERATIONS; ++i)
{
ulong c[2];
- ((uint4 *)c)[0] = Scratchpad[IDX((a[0] & MASK) >> 4)];
+ ((uint4 *)c)[0] = Scratchpad[IDX((idx0 & MASK) >> 4)];
((uint4 *)c)[0] = AES_Round(AES0, AES1, AES2, AES3, ((uint4 *)c)[0], ((uint4 *)a)[0]);
b_x ^= ((uint4 *)c)[0];
- VARIANT1_1(b_x);
- Scratchpad[IDX((a[0] & MASK) >> 4)] = b_x;
+#if(ALGO == 3 || ALGO == 5)
+ uint table = 0x75310U;
+ uint index = ((b_x.s2 >> 26) & 12) | ((b_x.s2 >> 23) & 2);
+ b_x.s2 ^= ((table >> index) & 0x30U) << 24;
+#endif
+ Scratchpad[IDX((idx0 & MASK) >> 4)] = b_x;
uint4 tmp;
tmp = Scratchpad[IDX((c[0] & MASK) >> 4)];
@@ -643,101 +638,14 @@ __kernel void cn1_monero(__global uint4 *Scratchpad, __global ulong *states, ulo
a[1] += c[0] * as_ulong2(tmp).s0;
a[0] += mul_hi(c[0], as_ulong2(tmp).s0);
- VARIANT1_2(a[1]);
- Scratchpad[IDX((c[0] & MASK) >> 4)] = ((uint4 *)a)[0];
- VARIANT1_2(a[1]);
-
- ((uint4 *)a)[0] ^= tmp;
-
- b_x = ((uint4 *)c)[0];
- }
- }
- mem_fence(CLK_GLOBAL_MEM_FENCE);
-}
-
-__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
-__kernel void cn1(__global uint4 *Scratchpad, __global ulong *states, ulong Threads
-// cryptonight_heavy
-#if (ALGO == 4)
- , uint version
-#endif
-)
-{
- ulong a[2], b[2];
- __local uint AES0[256], AES1[256], AES2[256], AES3[256];
-
- const ulong gIdx = getIdx();
-
- for(int i = get_local_id(0); i < 256; i += WORKSIZE)
- {
- const uint tmp = AES0_C[i];
- AES0[i] = tmp;
- AES1[i] = rotate(tmp, 8U);
- AES2[i] = rotate(tmp, 16U);
- AES3[i] = rotate(tmp, 24U);
- }
-
- barrier(CLK_LOCAL_MEM_FENCE);
-
- uint4 b_x;
-#if(COMP_MODE==1)
- // do not use early return here
- if(gIdx < Threads)
-#endif
- {
- states += 25 * gIdx;
-#if(STRIDED_INDEX==0)
- Scratchpad += gIdx * (MEMORY >> 4);
-#elif(STRIDED_INDEX==1)
- Scratchpad += gIdx;
-#elif(STRIDED_INDEX==2)
- Scratchpad += get_group_id(0) * (MEMORY >> 4) * WORKSIZE + MEM_CHUNK * get_local_id(0);
-#endif
-
- a[0] = states[0] ^ states[4];
- b[0] = states[2] ^ states[6];
- a[1] = states[1] ^ states[5];
- b[1] = states[3] ^ states[7];
-
- b_x = ((uint4 *)b)[0];
- }
-
- mem_fence(CLK_LOCAL_MEM_FENCE);
-
-#if(COMP_MODE==1)
- // do not use early return here
- if(gIdx < Threads)
-#endif
- {
- ulong idx0 = a[0];
- ulong mask = MASK;
- int iterations = ITERATIONS;
-#if (ALGO == 4)
- if(version < 3)
- {
- iterations <<= 1;
- mask -= 0x200000;
- }
+#if(ALGO == 3 || ALGO == 5)
+ ((uint2 *)&(a[1]))[0] ^= tweak1_2;
+ Scratchpad[IDX((c[0] & MASK) >> 4)] = ((uint4 *)a)[0];
+ ((uint2 *)&(a[1]))[0] ^= tweak1_2;
+#else
+ Scratchpad[IDX((c[0] & MASK) >> 4)] = ((uint4 *)a)[0];
#endif
- #pragma unroll 8
- for(int i = 0; i < iterations; ++i)
- {
- ulong c[2];
-
- ((uint4 *)c)[0] = Scratchpad[IDX((idx0 & mask) >> 4)];
- ((uint4 *)c)[0] = AES_Round(AES0, AES1, AES2, AES3, ((uint4 *)c)[0], ((uint4 *)a)[0]);
- //b_x ^= ((uint4 *)c)[0];
-
- Scratchpad[IDX((idx0 & mask) >> 4)] = b_x ^ ((uint4 *)c)[0];
-
- uint4 tmp;
- tmp = Scratchpad[IDX((c[0] & mask) >> 4)];
-
- a[1] += c[0] * as_ulong2(tmp).s0;
- a[0] += mul_hi(c[0], as_ulong2(tmp).s0);
-
- Scratchpad[IDX((c[0] & mask) >> 4)] = ((uint4 *)a)[0];
((uint4 *)a)[0] ^= tmp;
idx0 = a[0];
@@ -745,14 +653,11 @@ __kernel void cn1(__global uint4 *Scratchpad, __global ulong *states, ulong Thre
b_x = ((uint4 *)c)[0];
// cryptonight_heavy
#if (ALGO == 4)
- if(version >= 3)
- {
- long n = *((__global long*)(Scratchpad + (IDX((idx0 & mask) >> 4))));
- int d = ((__global int*)(Scratchpad + (IDX((idx0 & mask) >> 4))))[2];
- long q = n / (d | 0x5);
- *((__global long*)(Scratchpad + (IDX((idx0 & mask) >> 4)))) = n ^ q;
- idx0 = d ^ q;
- }
+ long n = *((__global long*)(Scratchpad + (IDX((idx0 & MASK) >> 4))));
+ int d = ((__global int*)(Scratchpad + (IDX((idx0 & MASK) >> 4))))[2];
+ long q = n / (d | 0x5);
+ *((__global long*)(Scratchpad + (IDX((idx0 & MASK) >> 4)))) = n ^ q;
+ idx0 = d ^ q;
#endif
}
}
@@ -760,12 +665,7 @@ __kernel void cn1(__global uint4 *Scratchpad, __global ulong *states, ulong Thre
}
__attribute__((reqd_work_group_size(WORKSIZE, 8, 1)))
-__kernel void cn2(__global uint4 *Scratchpad, __global ulong *states, __global uint *Branch0, __global uint *Branch1, __global uint *Branch2, __global uint *Branch3, ulong Threads
-// cryptonight_heavy
-#if (ALGO == 4)
- , uint version
-#endif
- )
+__kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states, __global uint *Branch0, __global uint *Branch1, __global uint *Branch2, __global uint *Branch3, ulong Threads)
{
__local uint AES0[256], AES1[256], AES2[256], AES3[256];
uint ExpandedKey2[40];
@@ -827,58 +727,42 @@ __kernel void cn2(__global uint4 *Scratchpad, __global ulong *states, __global u
if(gIdx < Threads)
#endif
{
- int iterations = MEMORY >> 7;
#if (ALGO == 4)
- if(version < 3)
- {
- iterations >>= 1;
- #pragma unroll 2
- for(int i = 0; i < iterations; ++i)
- {
- text ^= Scratchpad[IDX((i << 3) + get_local_id(1))];
-
- #pragma unroll 10
- for(int j = 0; j < 10; ++j)
- text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]);
- }
- }
- else
+ #pragma unroll 2
+ for(int i = 0; i < (MEMORY >> 7); ++i)
{
- #pragma unroll 2
- for(int i = 0; i < iterations; ++i)
- {
- text ^= Scratchpad[IDX((i << 3) + get_local_id(1))];
+ text ^= Scratchpad[IDX((i << 3) + get_local_id(1))];
- #pragma unroll 10
- for(int j = 0; j < 10; ++j)
- text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]);
+ #pragma unroll 10
+ for(int j = 0; j < 10; ++j)
+ text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]);
- barrier(CLK_LOCAL_MEM_FENCE);
- xin[get_local_id(1)][get_local_id(0)] = text;
- barrier(CLK_LOCAL_MEM_FENCE);
- text = mix_and_propagate(xin);
- }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ xin[get_local_id(1)][get_local_id(0)] = text;
+ barrier(CLK_LOCAL_MEM_FENCE);
+ text = mix_and_propagate(xin);
+ }
- #pragma unroll 2
- for(int i = 0; i < iterations; ++i)
- {
- text ^= Scratchpad[IDX((i << 3) + get_local_id(1))];
+ #pragma unroll 2
+ for(int i = 0; i < (MEMORY >> 7); ++i)
+ {
+ text ^= Scratchpad[IDX((i << 3) + get_local_id(1))];
- #pragma unroll 10
- for(int j = 0; j < 10; ++j)
- text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]);
+ #pragma unroll 10
+ for(int j = 0; j < 10; ++j)
+ text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]);
- barrier(CLK_LOCAL_MEM_FENCE);
- xin[get_local_id(1)][get_local_id(0)] = text;
- barrier(CLK_LOCAL_MEM_FENCE);
- text = mix_and_propagate(xin);
- }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ xin[get_local_id(1)][get_local_id(0)] = text;
+ barrier(CLK_LOCAL_MEM_FENCE);
+ text = mix_and_propagate(xin);
}
+
#else
#pragma unroll 2
- for(int i = 0; i < iterations; ++i)
+ for(int i = 0; i < (MEMORY >> 7); ++i)
{
text ^= Scratchpad[IDX((i << 3) + get_local_id(1))];
@@ -891,21 +775,18 @@ __kernel void cn2(__global uint4 *Scratchpad, __global ulong *states, __global u
// cryptonight_heavy
#if (ALGO == 4)
- if(version >= 3)
+ /* Also left over threads perform this loop.
+ * The left over thread results will be ignored
+ */
+ for(size_t i=0; i < 16; i++)
{
- /* Also left over threads performe this loop.
- * The left over thread results will be ignored
- */
- for(size_t i=0; i < 16; i++)
- {
- #pragma unroll
- for(int j = 0; j < 10; ++j)
- text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]);
- barrier(CLK_LOCAL_MEM_FENCE);
- xin[get_local_id(1)][get_local_id(0)] = text;
- barrier(CLK_LOCAL_MEM_FENCE);
- text = mix_and_propagate(xin);
- }
+ #pragma unroll
+ for(int j = 0; j < 10; ++j)
+ text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]);
+ barrier(CLK_LOCAL_MEM_FENCE);
+ xin[get_local_id(1)][get_local_id(0)] = text;
+ barrier(CLK_LOCAL_MEM_FENCE);
+ text = mix_and_propagate(xin);
}
#endif
diff --git a/xmrstak/backend/amd/minethd.cpp b/xmrstak/backend/amd/minethd.cpp
index 2d6d2ba..ac21607 100644
--- a/xmrstak/backend/amd/minethd.cpp
+++ b/xmrstak/backend/amd/minethd.cpp
@@ -233,7 +233,7 @@ void minethd::work_main()
assert(sizeof(job_result::sJobID) == sizeof(pool_job::sJobID));
uint64_t target = oWork.iTarget;
- XMRSetJob(pGpuCtx, oWork.bWorkBlob, oWork.iWorkSize, target, miner_algo, version);
+ XMRSetJob(pGpuCtx, oWork.bWorkBlob, oWork.iWorkSize, target, miner_algo);
if(oWork.bNiceHash)
pGpuCtx->Nonce = *(uint32_t*)(oWork.bWorkBlob + 39);
@@ -249,7 +249,7 @@ void minethd::work_main()
cl_uint results[0x100];
memset(results,0,sizeof(cl_uint)*(0x100));
- XMRRunJob(pGpuCtx, results, miner_algo, version);
+ XMRRunJob(pGpuCtx, results, miner_algo);
for(size_t i = 0; i < results[0xFF]; i++)
{
OpenPOWER on IntegriCloud