diff options
Diffstat (limited to 'xmrstak/backend/amd/amd_gpu')
-rw-r--r-- | xmrstak/backend/amd/amd_gpu/gpu.cpp | 413 | ||||
-rw-r--r-- | xmrstak/backend/amd/amd_gpu/gpu.hpp | 8 | ||||
-rw-r--r-- | xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl | 301 |
3 files changed, 293 insertions, 429 deletions
diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp index 13f018e..b9cc9b6 100644 --- a/xmrstak/backend/amd/amd_gpu/gpu.cpp +++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp @@ -26,6 +26,7 @@ #include <algorithm> #include <regex> #include <cassert> +#include <algorithm> #include <fstream> #include <sstream> @@ -307,12 +308,12 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ return ERR_OCL_API; } - size_t hashMemSize = cn_select_memory(::jconf::inst()->GetMiningAlgo()); + size_t scratchPadSize = cn_select_memory(::jconf::inst()->GetMiningAlgo()); int threadMemMask = cn_select_mask(::jconf::inst()->GetMiningAlgo()); int hashIterations = cn_select_iter(::jconf::inst()->GetMiningAlgo()); size_t g_thd = ctx->rawIntensity; - ctx->ExtraBuffers[0] = clCreateBuffer(opencl_ctx, CL_MEM_READ_WRITE, hashMemSize * g_thd, NULL, &ret); + ctx->ExtraBuffers[0] = clCreateBuffer(opencl_ctx, CL_MEM_READ_WRITE, scratchPadSize * g_thd, NULL, &ret); if(ret != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clCreateBuffer to create hash scratchpads buffer.", err_to_str(ret)); @@ -373,167 +374,202 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ return ERR_OCL_API; } - auto miner_algo = ::jconf::inst()->GetMiningAlgo(); + xmrstak_algo miner_algo[2] = { + ::jconf::inst()->GetMiningAlgo(), + ::jconf::inst()->GetMiningAlgoRoot() + }; + int num_algos = miner_algo[0] == miner_algo[1] ? 1 : 2; - char options[512]; - snprintf(options, sizeof(options), - "-DITERATIONS=%d -DMASK=%d -DWORKSIZE=%llu -DSTRIDED_INDEX=%d -DMEM_CHUNK_EXPONENT=%d -DCOMP_MODE=%d -DMEMORY=%llu -DALGO=%d", - hashIterations, threadMemMask, int_port(ctx->workSize), ctx->stridedIndex, int(1u<<ctx->memChunk), ctx->compMode ? 1 : 0, - int_port(hashMemSize), int(miner_algo)); - /* create a hash for the compile time cache - * used data: - * - source code - * - device name - * - compile paramater - */ - std::string src_str(source_code); - src_str += options; - src_str += devNameVec.data(); - std::string hash_hex_str; - picosha2::hash256_hex_string(src_str, hash_hex_str); - - std::string cache_file = get_home() + "/.openclcache/" + hash_hex_str + ".openclbin"; - std::ifstream clBinFile(cache_file, std::ofstream::in | std::ofstream::binary); - if(xmrstak::params::inst().AMDCache == false || !clBinFile.good()) + for(int ii = 0; ii < num_algos; ++ii) { - if(xmrstak::params::inst().AMDCache) - printer::inst()->print_msg(L1,"OpenCL device %u - Precompiled code %s not found. Compiling ...",ctx->deviceIdx, cache_file.c_str()); - ctx->Program = clCreateProgramWithSource(opencl_ctx, 1, (const char**)&source_code, NULL, &ret); - if(ret != CL_SUCCESS) - { - printer::inst()->print_msg(L1,"Error %s when calling clCreateProgramWithSource on the OpenCL miner code", err_to_str(ret)); - return ERR_OCL_API; - } + size_t hashMemSize = cn_select_memory(miner_algo[ii]); + int threadMemMask = cn_select_mask(miner_algo[ii]); + int hashIterations = cn_select_iter(miner_algo[ii]); - ret = clBuildProgram(ctx->Program, 1, &ctx->DeviceID, options, NULL, NULL); - if(ret != CL_SUCCESS) + char options[512]; + snprintf(options, sizeof(options), + "-DITERATIONS=%d -DMASK=%d -DWORKSIZE=%llu -DSTRIDED_INDEX=%d -DMEM_CHUNK_EXPONENT=%d -DCOMP_MODE=%d -DMEMORY=%llu -DALGO=%d", + hashIterations, threadMemMask, int_port(ctx->workSize), ctx->stridedIndex, int(1u<<ctx->memChunk), ctx->compMode ? 1 : 0, + int_port(hashMemSize), int(miner_algo[ii])); + /* create a hash for the compile time cache + * used data: + * - source code + * - device name + * - compile paramater + */ + std::string src_str(source_code); + src_str += options; + src_str += devNameVec.data(); + std::string hash_hex_str; + picosha2::hash256_hex_string(src_str, hash_hex_str); + + std::string cache_file = get_home() + "/.openclcache/" + hash_hex_str + ".openclbin"; + std::ifstream clBinFile(cache_file, std::ofstream::in | std::ofstream::binary); + if(xmrstak::params::inst().AMDCache == false || !clBinFile.good()) { - size_t len; - printer::inst()->print_msg(L1,"Error %s when calling clBuildProgram.", err_to_str(ret)); - - if((ret = clGetProgramBuildInfo(ctx->Program, ctx->DeviceID, CL_PROGRAM_BUILD_LOG, 0, NULL, &len)) != CL_SUCCESS) + if(xmrstak::params::inst().AMDCache) + printer::inst()->print_msg(L1,"OpenCL device %u - Precompiled code %s not found. Compiling ...",ctx->deviceIdx, cache_file.c_str()); + ctx->Program[ii] = clCreateProgramWithSource(opencl_ctx, 1, (const char**)&source_code, NULL, &ret); + if(ret != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clGetProgramBuildInfo for length of build log output.", err_to_str(ret)); + printer::inst()->print_msg(L1,"Error %s when calling clCreateProgramWithSource on the OpenCL miner code", err_to_str(ret)); return ERR_OCL_API; } - char* BuildLog = (char*)malloc(len + 1); - BuildLog[0] = '\0'; - - if((ret = clGetProgramBuildInfo(ctx->Program, ctx->DeviceID, CL_PROGRAM_BUILD_LOG, len, BuildLog, NULL)) != CL_SUCCESS) + ret = clBuildProgram(ctx->Program[ii], 1, &ctx->DeviceID, options, NULL, NULL); + if(ret != CL_SUCCESS) { + size_t len; + printer::inst()->print_msg(L1,"Error %s when calling clBuildProgram.", err_to_str(ret)); + + if((ret = clGetProgramBuildInfo(ctx->Program[ii], ctx->DeviceID, CL_PROGRAM_BUILD_LOG, 0, NULL, &len)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clGetProgramBuildInfo for length of build log output.", err_to_str(ret)); + return ERR_OCL_API; + } + + char* BuildLog = (char*)malloc(len + 1); + BuildLog[0] = '\0'; + + if((ret = clGetProgramBuildInfo(ctx->Program[ii], ctx->DeviceID, CL_PROGRAM_BUILD_LOG, len, BuildLog, NULL)) != CL_SUCCESS) + { + free(BuildLog); + printer::inst()->print_msg(L1,"Error %s when calling clGetProgramBuildInfo for build log.", err_to_str(ret)); + return ERR_OCL_API; + } + + printer::inst()->print_str("Build log:\n"); + std::cerr<<BuildLog<<std::endl; + free(BuildLog); - printer::inst()->print_msg(L1,"Error %s when calling clGetProgramBuildInfo for build log.", err_to_str(ret)); return ERR_OCL_API; } - printer::inst()->print_str("Build log:\n"); - std::cerr<<BuildLog<<std::endl; - - free(BuildLog); - return ERR_OCL_API; - } - - cl_uint num_devices; - clGetProgramInfo(ctx->Program, CL_PROGRAM_NUM_DEVICES, sizeof(cl_uint), &num_devices,NULL); + cl_uint num_devices; + clGetProgramInfo(ctx->Program[ii], CL_PROGRAM_NUM_DEVICES, sizeof(cl_uint), &num_devices,NULL); - std::vector<cl_device_id> devices_ids(num_devices); - clGetProgramInfo(ctx->Program, CL_PROGRAM_DEVICES, sizeof(cl_device_id)* devices_ids.size(), devices_ids.data(),NULL); - int dev_id = 0; - /* Search for the gpu within the program context. - * The id can be different to ctx->DeviceID. - */ - for(auto & ocl_device : devices_ids) - { - if(ocl_device == ctx->DeviceID) - break; - dev_id++; - } + std::vector<cl_device_id> devices_ids(num_devices); + clGetProgramInfo(ctx->Program[ii], CL_PROGRAM_DEVICES, sizeof(cl_device_id)* devices_ids.size(), devices_ids.data(),NULL); + int dev_id = 0; + /* Search for the gpu within the program context. + * The id can be different to ctx->DeviceID. + */ + for(auto & ocl_device : devices_ids) + { + if(ocl_device == ctx->DeviceID) + break; + dev_id++; + } - cl_build_status status; - do - { - if((ret = clGetProgramBuildInfo(ctx->Program, ctx->DeviceID, CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &status, NULL)) != CL_SUCCESS) + cl_build_status status; + do { - printer::inst()->print_msg(L1,"Error %s when calling clGetProgramBuildInfo for status of build.", err_to_str(ret)); - return ERR_OCL_API; + if((ret = clGetProgramBuildInfo(ctx->Program[ii], ctx->DeviceID, CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &status, NULL)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clGetProgramBuildInfo for status of build.", err_to_str(ret)); + return ERR_OCL_API; + } + port_sleep(1); } - port_sleep(1); - } - while(status == CL_BUILD_IN_PROGRESS); + while(status == CL_BUILD_IN_PROGRESS); + + std::vector<size_t> binary_sizes(num_devices); + clGetProgramInfo (ctx->Program[ii], CL_PROGRAM_BINARY_SIZES, sizeof(size_t) * binary_sizes.size(), binary_sizes.data(), NULL); - std::vector<size_t> binary_sizes(num_devices); - clGetProgramInfo (ctx->Program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t) * binary_sizes.size(), binary_sizes.data(), NULL); + std::vector<char*> all_programs(num_devices); + std::vector<std::vector<char>> program_storage; - std::vector<char*> all_programs(num_devices); - std::vector<std::vector<char>> program_storage; + if(xmrstak::params::inst().AMDCache) + { + int p_id = 0; + size_t mem_size = 0; + // create memory structure to query all OpenCL program binaries + for(auto & p : all_programs) + { + program_storage.emplace_back(std::vector<char>(binary_sizes[p_id])); + all_programs[p_id] = program_storage[p_id].data(); + mem_size += binary_sizes[p_id]; + p_id++; + } - if(xmrstak::params::inst().AMDCache) + if((ret = clGetProgramInfo(ctx->Program[ii], CL_PROGRAM_BINARIES, num_devices * sizeof(char*), all_programs.data(),NULL)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clGetProgramInfo.", err_to_str(ret)); + return ERR_OCL_API; + } + + std::ofstream file_stream; + file_stream.open(cache_file, std::ofstream::out | std::ofstream::binary); + file_stream.write(all_programs[dev_id], binary_sizes[dev_id]); + file_stream.close(); + printer::inst()->print_msg(L1, "OpenCL device %u - Precompiled code stored in file %s",ctx->deviceIdx, cache_file.c_str()); + } + } + else { - int p_id = 0; - size_t mem_size = 0; - // create memory structure to query all OpenCL program binaries - for(auto & p : all_programs) + printer::inst()->print_msg(L1, "OpenCL device %u - Load precompiled code from file %s",ctx->deviceIdx, cache_file.c_str()); + std::ostringstream ss; + ss << clBinFile.rdbuf(); + std::string s = ss.str(); + + size_t bin_size = s.size(); + auto data_ptr = s.data(); + + cl_int clStatus; + ctx->Program[ii] = clCreateProgramWithBinary( + opencl_ctx, 1, &ctx->DeviceID, &bin_size, + (const unsigned char **)&data_ptr, &clStatus, &ret + ); + if(ret != CL_SUCCESS) { - program_storage.emplace_back(std::vector<char>(binary_sizes[p_id])); - all_programs[p_id] = program_storage[p_id].data(); - mem_size += binary_sizes[p_id]; - p_id++; + printer::inst()->print_msg(L1,"Error %s when calling clCreateProgramWithBinary. Try to delete file %s", err_to_str(ret), cache_file.c_str()); + return ERR_OCL_API; } - - if((ret = clGetProgramInfo(ctx->Program, CL_PROGRAM_BINARIES, num_devices * sizeof(char*), all_programs.data(),NULL)) != CL_SUCCESS) + ret = clBuildProgram(ctx->Program[ii], 1, &ctx->DeviceID, NULL, NULL, NULL); + if(ret != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clGetProgramInfo.", err_to_str(ret)); + printer::inst()->print_msg(L1,"Error %s when calling clBuildProgram. Try to delete file %s", err_to_str(ret), cache_file.c_str()); return ERR_OCL_API; } - - std::ofstream file_stream; - file_stream.open(cache_file, std::ofstream::out | std::ofstream::binary); - file_stream.write(all_programs[dev_id], binary_sizes[dev_id]); - file_stream.close(); - printer::inst()->print_msg(L1, "OpenCL device %u - Precompiled code stored in file %s",ctx->deviceIdx, cache_file.c_str()); } - } - else - { - printer::inst()->print_msg(L1, "OpenCL device %u - Load precompiled code from file %s",ctx->deviceIdx, cache_file.c_str()); - std::ostringstream ss; - ss << clBinFile.rdbuf(); - std::string s = ss.str(); - - size_t bin_size = s.size(); - auto data_ptr = s.data(); - - cl_int clStatus; - ctx->Program = clCreateProgramWithBinary( - opencl_ctx, 1, &ctx->DeviceID, &bin_size, - (const unsigned char **)&data_ptr, &clStatus, &ret - ); - if(ret != CL_SUCCESS) + + std::vector<std::string> KernelNames = { "cn0", "cn1", "cn2", "Blake", "Groestl", "JH", "Skein" }; + // append algorithm number to kernel name + for(int k = 0; k < 3; k++) + KernelNames[k] += std::to_string(miner_algo[ii]); + + if(ii == 0) { - printer::inst()->print_msg(L1,"Error %s when calling clCreateProgramWithBinary. Try to delete file %s", err_to_str(ret), cache_file.c_str()); - return ERR_OCL_API; + for(int i = 0; i < 7; ++i) + { + ctx->Kernels[ii][i] = clCreateKernel(ctx->Program[ii], KernelNames[i].c_str(), &ret); + if(ret != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clCreateKernel for kernel_0 %s.", err_to_str(ret), KernelNames[i].c_str()); + return ERR_OCL_API; + } + } } - ret = clBuildProgram(ctx->Program, 1, &ctx->DeviceID, NULL, NULL, NULL); - if(ret != CL_SUCCESS) + else { - printer::inst()->print_msg(L1,"Error %s when calling clBuildProgram. Try to delete file %s", err_to_str(ret), cache_file.c_str()); - return ERR_OCL_API; - } - } + for(int i = 0; i < 3; ++i) + { + ctx->Kernels[ii][i] = clCreateKernel(ctx->Program[ii], KernelNames[i].c_str(), &ret); + if(ret != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clCreateKernel for kernel_1 %s.", err_to_str(ret), KernelNames[i].c_str()); + return ERR_OCL_API; + } + } + // move kernel from the main algorithm into the root algorithm kernel space + for(int i = 3; i < 7; ++i) + { + ctx->Kernels[ii][i] = ctx->Kernels[0][i]; + } - const char *KernelNames[] = { "cn0", "cn1", "cn2", "Blake", "Groestl", "JH", "Skein", "cn1_monero" }; - for(int i = 0; i < 8; ++i) - { - ctx->Kernels[i] = clCreateKernel(ctx->Program, KernelNames[i], &ret); - if(ret != CL_SUCCESS) - { - printer::inst()->print_msg(L1,"Error %s when calling clCreateKernel for kernel %s.", err_to_str(ret), KernelNames[i]); - return ERR_OCL_API; } } - ctx->Nonce = 0; return 0; } @@ -881,8 +917,11 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx) return ERR_SUCCESS; } -size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t target, xmrstak_algo miner_algo, uint32_t version) +size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t target, xmrstak_algo miner_algo) { + // switch to the kernel storage + int kernel_storage = miner_algo == ::jconf::inst()->GetMiningAlgo() ? 0 : 1; + cl_int ret; if(input_len > 84) @@ -899,71 +938,51 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar return ERR_OCL_API; } - if((ret = clSetKernelArg(ctx->Kernels[0], 0, sizeof(cl_mem), &ctx->InputBuffer)) != CL_SUCCESS) + if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][0], 0, sizeof(cl_mem), &ctx->InputBuffer)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 0.", err_to_str(ret)); return ERR_OCL_API; } // Scratchpads - if((ret = clSetKernelArg(ctx->Kernels[0], 1, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS) + if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][0], 1, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 1.", err_to_str(ret)); return ERR_OCL_API; } // States - if((ret = clSetKernelArg(ctx->Kernels[0], 2, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS) + if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][0], 2, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 2.", err_to_str(ret)); return ERR_OCL_API; } // Threads - if((ret = clSetKernelArg(ctx->Kernels[0], 3, sizeof(cl_ulong), &numThreads)) != CL_SUCCESS) + if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][0], 3, sizeof(cl_ulong), &numThreads)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 3.", err_to_str(ret)); return(ERR_OCL_API); } - /* ATTENTION: if we miner cryptonight_heavy the kernel needs an additional parameter version. - * Do NOT use the variable `miner_algo` because this variable is changed dynamicly - */ - if(::jconf::inst()->GetMiningAlgo() == cryptonight_heavy) - { - // version - if ((ret = clSetKernelArg(ctx->Kernels[0], 4, sizeof(cl_uint), &version)) != CL_SUCCESS) - { - printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 0, argument 4.", err_to_str(ret)); - return ERR_OCL_API; - } - } - // CN1 Kernel - /// @todo only activate if currency is monero - int cn_kernel_offset = 0; - if(miner_algo == cryptonight_monero || miner_algo == cryptonight_aeon) - { - cn_kernel_offset = 6; - } - // Scratchpads - if((ret = clSetKernelArg(ctx->Kernels[1 + cn_kernel_offset], 0, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS) + if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][1], 0, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 1, argument 0.", err_to_str(ret)); return ERR_OCL_API; } // States - if((ret = clSetKernelArg(ctx->Kernels[1 + cn_kernel_offset], 1, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS) + if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][1], 1, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 1, argument 1.", err_to_str(ret)); return ERR_OCL_API; } // Threads - if((ret = clSetKernelArg(ctx->Kernels[1 + cn_kernel_offset], 2, sizeof(cl_ulong), &numThreads)) != CL_SUCCESS) + if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][1], 2, sizeof(cl_ulong), &numThreads)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 1, argument 2.", err_to_str(ret)); return(ERR_OCL_API); @@ -972,113 +991,88 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar if(miner_algo == cryptonight_monero || miner_algo == cryptonight_aeon ) { // Input - if ((ret = clSetKernelArg(ctx->Kernels[1 + cn_kernel_offset], 3, sizeof(cl_mem), &ctx->InputBuffer)) != CL_SUCCESS) + if ((ret = clSetKernelArg(ctx->Kernels[kernel_storage][1], 3, sizeof(cl_mem), &ctx->InputBuffer)) != CL_SUCCESS) { printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 1, arugment 4(input buffer).", err_to_str(ret)); return ERR_OCL_API; } } - /* ATTENTION: if we miner cryptonight_heavy the kernel needs an additional parameter version. - * Do NOT use the variable `miner_algo` because this variable is changed dynamicly - */ - else if(::jconf::inst()->GetMiningAlgo() == cryptonight_heavy) - { - // version - if ((ret = clSetKernelArg(ctx->Kernels[1], 3, sizeof(cl_uint), &version)) != CL_SUCCESS) - { - printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 1, argument 3 (version).", err_to_str(ret)); - return ERR_OCL_API; - } - } // CN3 Kernel // Scratchpads - if((ret = clSetKernelArg(ctx->Kernels[2], 0, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS) + if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][2], 0, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 0.", err_to_str(ret)); return ERR_OCL_API; } // States - if((ret = clSetKernelArg(ctx->Kernels[2], 1, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS) + if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][2], 1, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 1.", err_to_str(ret)); return ERR_OCL_API; } // Branch 0 - if((ret = clSetKernelArg(ctx->Kernels[2], 2, sizeof(cl_mem), ctx->ExtraBuffers + 2)) != CL_SUCCESS) + if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][2], 2, sizeof(cl_mem), ctx->ExtraBuffers + 2)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 2.", err_to_str(ret)); return ERR_OCL_API; } // Branch 1 - if((ret = clSetKernelArg(ctx->Kernels[2], 3, sizeof(cl_mem), ctx->ExtraBuffers + 3)) != CL_SUCCESS) + if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][2], 3, sizeof(cl_mem), ctx->ExtraBuffers + 3)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 3.", err_to_str(ret)); return ERR_OCL_API; } // Branch 2 - if((ret = clSetKernelArg(ctx->Kernels[2], 4, sizeof(cl_mem), ctx->ExtraBuffers + 4)) != CL_SUCCESS) + if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][2], 4, sizeof(cl_mem), ctx->ExtraBuffers + 4)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 4.", err_to_str(ret)); return ERR_OCL_API; } // Branch 3 - if((ret = clSetKernelArg(ctx->Kernels[2], 5, sizeof(cl_mem), ctx->ExtraBuffers + 5)) != CL_SUCCESS) + if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][2], 5, sizeof(cl_mem), ctx->ExtraBuffers + 5)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 5.", err_to_str(ret)); return ERR_OCL_API; } // Threads - if((ret = clSetKernelArg(ctx->Kernels[2], 6, sizeof(cl_ulong), &numThreads)) != CL_SUCCESS) + if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][2], 6, sizeof(cl_ulong), &numThreads)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 6.", err_to_str(ret)); return(ERR_OCL_API); } - /* ATTENTION: if we miner cryptonight_heavy the kernel needs an additional parameter version. - * Do NOT use the variable `miner_algo` because this variable is changed dynamicly - */ - if(::jconf::inst()->GetMiningAlgo() == cryptonight_heavy) - { - // version - if ((ret = clSetKernelArg(ctx->Kernels[2], 7, sizeof(cl_uint), &version)) != CL_SUCCESS) - { - printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 2, argument 7.", err_to_str(ret)); - return ERR_OCL_API; - } - } - for(int i = 0; i < 4; ++i) { // States - if((ret = clSetKernelArg(ctx->Kernels[i + 3], 0, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS) + if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][i + 3], 0, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 0); return ERR_OCL_API; } // Nonce buffer - if((ret = clSetKernelArg(ctx->Kernels[i + 3], 1, sizeof(cl_mem), ctx->ExtraBuffers + (i + 2))) != CL_SUCCESS) + if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][i + 3], 1, sizeof(cl_mem), ctx->ExtraBuffers + (i + 2))) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 1); return ERR_OCL_API; } // Output - if((ret = clSetKernelArg(ctx->Kernels[i + 3], 2, sizeof(cl_mem), &ctx->OutputBuffer)) != CL_SUCCESS) + if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][i + 3], 2, sizeof(cl_mem), &ctx->OutputBuffer)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 2); return ERR_OCL_API; } // Target - if((ret = clSetKernelArg(ctx->Kernels[i + 3], 3, sizeof(cl_ulong), &target)) != CL_SUCCESS) + if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][i + 3], 3, sizeof(cl_ulong), &target)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 3); return ERR_OCL_API; @@ -1088,8 +1082,11 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar return ERR_SUCCESS; } -size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo, uint32_t version) +size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo) { + // switch to the kernel storage + int kernel_storage = miner_algo == ::jconf::inst()->GetMiningAlgo() ? 0 : 1; + cl_int ret; cl_uint zero = 0; size_t BranchNonces[4]; @@ -1125,35 +1122,21 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo, clFinish(ctx->CommandQueues); size_t Nonce[2] = {ctx->Nonce, 1}, gthreads[2] = { g_thd, 8 }, lthreads[2] = { w_size, 8 }; - if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[0], 2, Nonce, gthreads, lthreads, 0, NULL, NULL)) != CL_SUCCESS) + if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[kernel_storage][0], 2, Nonce, gthreads, lthreads, 0, NULL, NULL)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 0); return ERR_OCL_API; } - /*for(int i = 1; i < 3; ++i) - { - if((ret = clEnqueueNDRangeKernel(*ctx->CommandQueues, ctx->Kernels[i], 1, &ctx->Nonce, &g_thd, &w_size, 0, NULL, NULL)) != CL_SUCCESS) - { - Log(LOG_CRITICAL, "Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), i); - return(ERR_OCL_API); - } - }*/ - size_t tmpNonce = ctx->Nonce; - /// @todo only activate if currency is monero - int cn_kernel_offset = 0; - if(miner_algo == cryptonight_monero || miner_algo == cryptonight_aeon) - { - cn_kernel_offset = 6; - } - if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[1 + cn_kernel_offset], 1, &tmpNonce, &g_thd, &w_size, 0, NULL, NULL)) != CL_SUCCESS) + + if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[kernel_storage][1], 1, &tmpNonce, &g_thd, &w_size, 0, NULL, NULL)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 1); return ERR_OCL_API; } - if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[2], 2, Nonce, gthreads, lthreads, 0, NULL, NULL)) != CL_SUCCESS) + if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[kernel_storage][2], 2, Nonce, gthreads, lthreads, 0, NULL, NULL)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 2); return ERR_OCL_API; @@ -1190,7 +1173,7 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo, if(BranchNonces[i]) { // Threads - if((clSetKernelArg(ctx->Kernels[i + 3], 4, sizeof(cl_ulong), BranchNonces + i)) != CL_SUCCESS) + if((clSetKernelArg(ctx->Kernels[kernel_storage][i + 3], 4, sizeof(cl_ulong), BranchNonces + i)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 4); return(ERR_OCL_API); @@ -1201,7 +1184,7 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo, // number of global threads must be a multiple of the work group size (w_size) assert(BranchNonces[i]%w_size == 0); size_t tmpNonce = ctx->Nonce; - if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[i + 3], 1, &tmpNonce, BranchNonces + i, &w_size, 0, NULL, NULL)) != CL_SUCCESS) + if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[kernel_storage][i + 3], 1, &tmpNonce, BranchNonces + i, &w_size, 0, NULL, NULL)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), i + 3); return ERR_OCL_API; diff --git a/xmrstak/backend/amd/amd_gpu/gpu.hpp b/xmrstak/backend/amd/amd_gpu/gpu.hpp index a387b15..0db6c90 100644 --- a/xmrstak/backend/amd/amd_gpu/gpu.hpp +++ b/xmrstak/backend/amd/amd_gpu/gpu.hpp @@ -35,8 +35,8 @@ struct GpuContext cl_mem InputBuffer; cl_mem OutputBuffer; cl_mem ExtraBuffers[6]; - cl_program Program; - cl_kernel Kernels[8]; + cl_program Program[2]; + cl_kernel Kernels[2][8]; size_t freeMem; int computeUnits; std::string name; @@ -50,7 +50,7 @@ int getAMDPlatformIdx(); std::vector<GpuContext> getAMDDevices(int index); size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx); -size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t target, xmrstak_algo miner_algo, uint32_t version); -size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo, uint32_t version); +size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t target, xmrstak_algo miner_algo); +size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo); diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl index 5d4e66c..d2ae1a7 100644 --- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl +++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl @@ -433,15 +433,13 @@ inline ulong getIdx() #endif } -#define mix_and_propagate(xin) (xin)[(get_local_id(1)) % 8][get_local_id(0)] ^ (xin)[(get_local_id(1) + 1) % 8][get_local_id(0)] +#define mix_and_propagate(xin) (xin)[(get_local_id(1)) % 8][get_local_id(0)] ^ (xin)[(get_local_id(1) + 1) % 8][get_local_id(0)] + +#define JOIN_DO(x,y) x##y +#define JOIN(x,y) JOIN_DO(x,y) __attribute__((reqd_work_group_size(WORKSIZE, 8, 1))) -__kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ulong *states, ulong Threads -// cryptonight_heavy -#if (ALGO == 4) - , uint version -#endif -) +__kernel void JOIN(cn0,ALGO)(__global ulong *input, __global uint4 *Scratchpad, __global ulong *states, ulong Threads) { ulong State[25]; uint ExpandedKey1[40]; @@ -517,23 +515,20 @@ __kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ul // cryptonight_heavy #if (ALGO == 4) - if(version >= 3) - { - __local uint4 xin[8][WORKSIZE]; + __local uint4 xin[8][WORKSIZE]; - /* Also left over threads performe this loop. - * The left over thread results will be ignored - */ - for(size_t i=0; i < 16; i++) - { - #pragma unroll - for(int j = 0; j < 10; ++j) - text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey1)[j]); - barrier(CLK_LOCAL_MEM_FENCE); - xin[get_local_id(1)][get_local_id(0)] = text; - barrier(CLK_LOCAL_MEM_FENCE); - text = mix_and_propagate(xin); - } + /* Also left over threads performe this loop. + * The left over thread results will be ignored + */ + for(size_t i=0; i < 16; i++) + { + #pragma unroll + for(int j = 0; j < 10; ++j) + text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey1)[j]); + barrier(CLK_LOCAL_MEM_FENCE); + xin[get_local_id(1)][get_local_id(0)] = text; + barrier(CLK_LOCAL_MEM_FENCE); + text = mix_and_propagate(xin); } #endif @@ -542,13 +537,9 @@ __kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ul if(gIdx < Threads) #endif { - int iterations = MEMORY >> 7; -#if (ALGO == 4) - if(version < 3) - iterations >>= 1; -#endif + #pragma unroll 2 - for(int i = 0; i < iterations; ++i) + for(int i = 0; i < (MEMORY >> 7); ++i) { #pragma unroll for(int j = 0; j < 10; ++j) @@ -560,22 +551,13 @@ __kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ul mem_fence(CLK_GLOBAL_MEM_FENCE); } -#define VARIANT1_1(p) \ - uint table = 0x75310U; \ - uint index = (((p).s2 >> 26) & 12) | (((p).s2 >> 23) & 2); \ - (p).s2 ^= ((table >> index) & 0x30U) << 24 - -#define VARIANT1_2(p) ((uint2 *)&(p))[0] ^= tweak1_2 - -#define VARIANT1_INIT() \ - tweak1_2 = as_uint2(input[4]); \ - tweak1_2.s0 >>= 24; \ - tweak1_2.s0 |= tweak1_2.s1 << 8; \ - tweak1_2.s1 = get_global_id(0); \ - tweak1_2 ^= as_uint2(states[24]) - __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) -__kernel void cn1_monero(__global uint4 *Scratchpad, __global ulong *states, ulong Threads, __global ulong *input) +__kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states, ulong Threads +// cryptonight_monero || cryptonight_aeon +#if(ALGO == 3 || ALGO == 5) +, __global ulong *input +#endif +) { ulong a[2], b[2]; __local uint AES0[256], AES1[256], AES2[256], AES3[256]; @@ -592,8 +574,9 @@ __kernel void cn1_monero(__global uint4 *Scratchpad, __global ulong *states, ulo } barrier(CLK_LOCAL_MEM_FENCE); - +#if(ALGO == 3 || ALGO == 5) uint2 tweak1_2; +#endif uint4 b_x; #if(COMP_MODE==1) // do not use early return here @@ -615,7 +598,13 @@ __kernel void cn1_monero(__global uint4 *Scratchpad, __global ulong *states, ulo b[1] = states[3] ^ states[7]; b_x = ((uint4 *)b)[0]; - VARIANT1_INIT(); +#if(ALGO == 3 || ALGO == 5) + tweak1_2 = as_uint2(input[4]); + tweak1_2.s0 >>= 24; + tweak1_2.s0 |= tweak1_2.s1 << 8; + tweak1_2.s1 = get_global_id(0); + tweak1_2 ^= as_uint2(states[24]); +#endif } mem_fence(CLK_LOCAL_MEM_FENCE); @@ -625,17 +614,23 @@ __kernel void cn1_monero(__global uint4 *Scratchpad, __global ulong *states, ulo if(gIdx < Threads) #endif { + ulong idx0 = a[0]; + #pragma unroll 8 for(int i = 0; i < ITERATIONS; ++i) { ulong c[2]; - ((uint4 *)c)[0] = Scratchpad[IDX((a[0] & MASK) >> 4)]; + ((uint4 *)c)[0] = Scratchpad[IDX((idx0 & MASK) >> 4)]; ((uint4 *)c)[0] = AES_Round(AES0, AES1, AES2, AES3, ((uint4 *)c)[0], ((uint4 *)a)[0]); b_x ^= ((uint4 *)c)[0]; - VARIANT1_1(b_x); - Scratchpad[IDX((a[0] & MASK) >> 4)] = b_x; +#if(ALGO == 3 || ALGO == 5) + uint table = 0x75310U; + uint index = ((b_x.s2 >> 26) & 12) | ((b_x.s2 >> 23) & 2); + b_x.s2 ^= ((table >> index) & 0x30U) << 24; +#endif + Scratchpad[IDX((idx0 & MASK) >> 4)] = b_x; uint4 tmp; tmp = Scratchpad[IDX((c[0] & MASK) >> 4)]; @@ -643,101 +638,14 @@ __kernel void cn1_monero(__global uint4 *Scratchpad, __global ulong *states, ulo a[1] += c[0] * as_ulong2(tmp).s0; a[0] += mul_hi(c[0], as_ulong2(tmp).s0); - VARIANT1_2(a[1]); - Scratchpad[IDX((c[0] & MASK) >> 4)] = ((uint4 *)a)[0]; - VARIANT1_2(a[1]); - - ((uint4 *)a)[0] ^= tmp; - - b_x = ((uint4 *)c)[0]; - } - } - mem_fence(CLK_GLOBAL_MEM_FENCE); -} - -__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) -__kernel void cn1(__global uint4 *Scratchpad, __global ulong *states, ulong Threads -// cryptonight_heavy -#if (ALGO == 4) - , uint version -#endif -) -{ - ulong a[2], b[2]; - __local uint AES0[256], AES1[256], AES2[256], AES3[256]; - - const ulong gIdx = getIdx(); - - for(int i = get_local_id(0); i < 256; i += WORKSIZE) - { - const uint tmp = AES0_C[i]; - AES0[i] = tmp; - AES1[i] = rotate(tmp, 8U); - AES2[i] = rotate(tmp, 16U); - AES3[i] = rotate(tmp, 24U); - } - - barrier(CLK_LOCAL_MEM_FENCE); - - uint4 b_x; -#if(COMP_MODE==1) - // do not use early return here - if(gIdx < Threads) -#endif - { - states += 25 * gIdx; -#if(STRIDED_INDEX==0) - Scratchpad += gIdx * (MEMORY >> 4); -#elif(STRIDED_INDEX==1) - Scratchpad += gIdx; -#elif(STRIDED_INDEX==2) - Scratchpad += get_group_id(0) * (MEMORY >> 4) * WORKSIZE + MEM_CHUNK * get_local_id(0); -#endif - - a[0] = states[0] ^ states[4]; - b[0] = states[2] ^ states[6]; - a[1] = states[1] ^ states[5]; - b[1] = states[3] ^ states[7]; - - b_x = ((uint4 *)b)[0]; - } - - mem_fence(CLK_LOCAL_MEM_FENCE); - -#if(COMP_MODE==1) - // do not use early return here - if(gIdx < Threads) -#endif - { - ulong idx0 = a[0]; - ulong mask = MASK; - int iterations = ITERATIONS; -#if (ALGO == 4) - if(version < 3) - { - iterations <<= 1; - mask -= 0x200000; - } +#if(ALGO == 3 || ALGO == 5) + ((uint2 *)&(a[1]))[0] ^= tweak1_2; + Scratchpad[IDX((c[0] & MASK) >> 4)] = ((uint4 *)a)[0]; + ((uint2 *)&(a[1]))[0] ^= tweak1_2; +#else + Scratchpad[IDX((c[0] & MASK) >> 4)] = ((uint4 *)a)[0]; #endif - #pragma unroll 8 - for(int i = 0; i < iterations; ++i) - { - ulong c[2]; - - ((uint4 *)c)[0] = Scratchpad[IDX((idx0 & mask) >> 4)]; - ((uint4 *)c)[0] = AES_Round(AES0, AES1, AES2, AES3, ((uint4 *)c)[0], ((uint4 *)a)[0]); - //b_x ^= ((uint4 *)c)[0]; - - Scratchpad[IDX((idx0 & mask) >> 4)] = b_x ^ ((uint4 *)c)[0]; - - uint4 tmp; - tmp = Scratchpad[IDX((c[0] & mask) >> 4)]; - - a[1] += c[0] * as_ulong2(tmp).s0; - a[0] += mul_hi(c[0], as_ulong2(tmp).s0); - - Scratchpad[IDX((c[0] & mask) >> 4)] = ((uint4 *)a)[0]; ((uint4 *)a)[0] ^= tmp; idx0 = a[0]; @@ -745,14 +653,11 @@ __kernel void cn1(__global uint4 *Scratchpad, __global ulong *states, ulong Thre b_x = ((uint4 *)c)[0]; // cryptonight_heavy #if (ALGO == 4) - if(version >= 3) - { - long n = *((__global long*)(Scratchpad + (IDX((idx0 & mask) >> 4)))); - int d = ((__global int*)(Scratchpad + (IDX((idx0 & mask) >> 4))))[2]; - long q = n / (d | 0x5); - *((__global long*)(Scratchpad + (IDX((idx0 & mask) >> 4)))) = n ^ q; - idx0 = d ^ q; - } + long n = *((__global long*)(Scratchpad + (IDX((idx0 & MASK) >> 4)))); + int d = ((__global int*)(Scratchpad + (IDX((idx0 & MASK) >> 4))))[2]; + long q = n / (d | 0x5); + *((__global long*)(Scratchpad + (IDX((idx0 & MASK) >> 4)))) = n ^ q; + idx0 = d ^ q; #endif } } @@ -760,12 +665,7 @@ __kernel void cn1(__global uint4 *Scratchpad, __global ulong *states, ulong Thre } __attribute__((reqd_work_group_size(WORKSIZE, 8, 1))) -__kernel void cn2(__global uint4 *Scratchpad, __global ulong *states, __global uint *Branch0, __global uint *Branch1, __global uint *Branch2, __global uint *Branch3, ulong Threads -// cryptonight_heavy -#if (ALGO == 4) - , uint version -#endif - ) +__kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states, __global uint *Branch0, __global uint *Branch1, __global uint *Branch2, __global uint *Branch3, ulong Threads) { __local uint AES0[256], AES1[256], AES2[256], AES3[256]; uint ExpandedKey2[40]; @@ -827,58 +727,42 @@ __kernel void cn2(__global uint4 *Scratchpad, __global ulong *states, __global u if(gIdx < Threads) #endif { - int iterations = MEMORY >> 7; #if (ALGO == 4) - if(version < 3) - { - iterations >>= 1; - #pragma unroll 2 - for(int i = 0; i < iterations; ++i) - { - text ^= Scratchpad[IDX((i << 3) + get_local_id(1))]; - - #pragma unroll 10 - for(int j = 0; j < 10; ++j) - text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]); - } - } - else + #pragma unroll 2 + for(int i = 0; i < (MEMORY >> 7); ++i) { - #pragma unroll 2 - for(int i = 0; i < iterations; ++i) - { - text ^= Scratchpad[IDX((i << 3) + get_local_id(1))]; + text ^= Scratchpad[IDX((i << 3) + get_local_id(1))]; - #pragma unroll 10 - for(int j = 0; j < 10; ++j) - text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]); + #pragma unroll 10 + for(int j = 0; j < 10; ++j) + text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]); - barrier(CLK_LOCAL_MEM_FENCE); - xin[get_local_id(1)][get_local_id(0)] = text; - barrier(CLK_LOCAL_MEM_FENCE); - text = mix_and_propagate(xin); - } + barrier(CLK_LOCAL_MEM_FENCE); + xin[get_local_id(1)][get_local_id(0)] = text; + barrier(CLK_LOCAL_MEM_FENCE); + text = mix_and_propagate(xin); + } - #pragma unroll 2 - for(int i = 0; i < iterations; ++i) - { - text ^= Scratchpad[IDX((i << 3) + get_local_id(1))]; + #pragma unroll 2 + for(int i = 0; i < (MEMORY >> 7); ++i) + { + text ^= Scratchpad[IDX((i << 3) + get_local_id(1))]; - #pragma unroll 10 - for(int j = 0; j < 10; ++j) - text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]); + #pragma unroll 10 + for(int j = 0; j < 10; ++j) + text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]); - barrier(CLK_LOCAL_MEM_FENCE); - xin[get_local_id(1)][get_local_id(0)] = text; - barrier(CLK_LOCAL_MEM_FENCE); - text = mix_and_propagate(xin); - } + barrier(CLK_LOCAL_MEM_FENCE); + xin[get_local_id(1)][get_local_id(0)] = text; + barrier(CLK_LOCAL_MEM_FENCE); + text = mix_and_propagate(xin); } + #else #pragma unroll 2 - for(int i = 0; i < iterations; ++i) + for(int i = 0; i < (MEMORY >> 7); ++i) { text ^= Scratchpad[IDX((i << 3) + get_local_id(1))]; @@ -891,21 +775,18 @@ __kernel void cn2(__global uint4 *Scratchpad, __global ulong *states, __global u // cryptonight_heavy #if (ALGO == 4) - if(version >= 3) + /* Also left over threads perform this loop. + * The left over thread results will be ignored + */ + for(size_t i=0; i < 16; i++) { - /* Also left over threads performe this loop. - * The left over thread results will be ignored - */ - for(size_t i=0; i < 16; i++) - { - #pragma unroll - for(int j = 0; j < 10; ++j) - text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]); - barrier(CLK_LOCAL_MEM_FENCE); - xin[get_local_id(1)][get_local_id(0)] = text; - barrier(CLK_LOCAL_MEM_FENCE); - text = mix_and_propagate(xin); - } + #pragma unroll + for(int j = 0; j < 10; ++j) + text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]); + barrier(CLK_LOCAL_MEM_FENCE); + xin[get_local_id(1)][get_local_id(0)] = text; + barrier(CLK_LOCAL_MEM_FENCE); + text = mix_and_propagate(xin); } #endif |