From 9ef2a83be3ddcc36546277f996c9387c38090014 Mon Sep 17 00:00:00 2001 From: Takeshi Suzuki Date: Thu, 5 Apr 2018 20:58:14 -0500 Subject: Fix spelling mistake in gpu.cpp --- xmrstak/backend/amd/amd_gpu/gpu.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'xmrstak/backend/amd/amd_gpu') diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp index 006a7ed..13f018e 100644 --- a/xmrstak/backend/amd/amd_gpu/gpu.cpp +++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp @@ -378,7 +378,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ char options[512]; snprintf(options, sizeof(options), "-DITERATIONS=%d -DMASK=%d -DWORKSIZE=%llu -DSTRIDED_INDEX=%d -DMEM_CHUNK_EXPONENT=%d -DCOMP_MODE=%d -DMEMORY=%llu -DALGO=%d", - hashIterations, threadMemMask, int_port(ctx->workSize), ctx->stridedIndex, int(1u<memChunk), ctx->compMode ? 1 : 0, + hashIterations, threadMemMask, int_port(ctx->workSize), ctx->stridedIndex, int(1u<memChunk), ctx->compMode ? 1 : 0, int_port(hashMemSize), int(miner_algo)); /* create a hash for the compile time cache * used data: @@ -497,7 +497,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ } else { - printer::inst()->print_msg(L1, "OpenCL device %u - Load precompiled cod from file %s",ctx->deviceIdx, cache_file.c_str()); + printer::inst()->print_msg(L1, "OpenCL device %u - Load precompiled code from file %s",ctx->deviceIdx, cache_file.c_str()); std::ostringstream ss; ss << clBinFile.rdbuf(); std::string s = ss.str(); -- cgit v1.1 From a57976439b1322eecb5e29198f3d9676d6f06909 Mon Sep 17 00:00:00 2001 From: psychocrypt Date: Sat, 7 Apr 2018 22:53:43 +0200 Subject: amd simplify kernel for different algorithms - remove version numbers within the kernel - create seperate program context for each mining algorithm - remove kernel `cn1_monero` is now integrated in `cn1` - remname `cnX` kernel in `cnX + algorithmNumber` --- xmrstak/backend/amd/amd_gpu/gpu.cpp | 413 +++++++++++----------- xmrstak/backend/amd/amd_gpu/gpu.hpp | 8 +- xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl | 301 +++++----------- 3 files changed, 293 insertions(+), 429 deletions(-) (limited to 'xmrstak/backend/amd/amd_gpu') diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp index 13f018e..b9cc9b6 100644 --- a/xmrstak/backend/amd/amd_gpu/gpu.cpp +++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp @@ -26,6 +26,7 @@ #include #include #include +#include #include #include @@ -307,12 +308,12 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ return ERR_OCL_API; } - size_t hashMemSize = cn_select_memory(::jconf::inst()->GetMiningAlgo()); + size_t scratchPadSize = cn_select_memory(::jconf::inst()->GetMiningAlgo()); int threadMemMask = cn_select_mask(::jconf::inst()->GetMiningAlgo()); int hashIterations = cn_select_iter(::jconf::inst()->GetMiningAlgo()); size_t g_thd = ctx->rawIntensity; - ctx->ExtraBuffers[0] = clCreateBuffer(opencl_ctx, CL_MEM_READ_WRITE, hashMemSize * g_thd, NULL, &ret); + ctx->ExtraBuffers[0] = clCreateBuffer(opencl_ctx, CL_MEM_READ_WRITE, scratchPadSize * g_thd, NULL, &ret); if(ret != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clCreateBuffer to create hash scratchpads buffer.", err_to_str(ret)); @@ -373,167 +374,202 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ return ERR_OCL_API; } - auto miner_algo = ::jconf::inst()->GetMiningAlgo(); + xmrstak_algo miner_algo[2] = { + ::jconf::inst()->GetMiningAlgo(), + ::jconf::inst()->GetMiningAlgoRoot() + }; + int num_algos = miner_algo[0] == miner_algo[1] ? 1 : 2; - char options[512]; - snprintf(options, sizeof(options), - "-DITERATIONS=%d -DMASK=%d -DWORKSIZE=%llu -DSTRIDED_INDEX=%d -DMEM_CHUNK_EXPONENT=%d -DCOMP_MODE=%d -DMEMORY=%llu -DALGO=%d", - hashIterations, threadMemMask, int_port(ctx->workSize), ctx->stridedIndex, int(1u<memChunk), ctx->compMode ? 1 : 0, - int_port(hashMemSize), int(miner_algo)); - /* create a hash for the compile time cache - * used data: - * - source code - * - device name - * - compile paramater - */ - std::string src_str(source_code); - src_str += options; - src_str += devNameVec.data(); - std::string hash_hex_str; - picosha2::hash256_hex_string(src_str, hash_hex_str); - - std::string cache_file = get_home() + "/.openclcache/" + hash_hex_str + ".openclbin"; - std::ifstream clBinFile(cache_file, std::ofstream::in | std::ofstream::binary); - if(xmrstak::params::inst().AMDCache == false || !clBinFile.good()) + for(int ii = 0; ii < num_algos; ++ii) { - if(xmrstak::params::inst().AMDCache) - printer::inst()->print_msg(L1,"OpenCL device %u - Precompiled code %s not found. Compiling ...",ctx->deviceIdx, cache_file.c_str()); - ctx->Program = clCreateProgramWithSource(opencl_ctx, 1, (const char**)&source_code, NULL, &ret); - if(ret != CL_SUCCESS) - { - printer::inst()->print_msg(L1,"Error %s when calling clCreateProgramWithSource on the OpenCL miner code", err_to_str(ret)); - return ERR_OCL_API; - } + size_t hashMemSize = cn_select_memory(miner_algo[ii]); + int threadMemMask = cn_select_mask(miner_algo[ii]); + int hashIterations = cn_select_iter(miner_algo[ii]); - ret = clBuildProgram(ctx->Program, 1, &ctx->DeviceID, options, NULL, NULL); - if(ret != CL_SUCCESS) + char options[512]; + snprintf(options, sizeof(options), + "-DITERATIONS=%d -DMASK=%d -DWORKSIZE=%llu -DSTRIDED_INDEX=%d -DMEM_CHUNK_EXPONENT=%d -DCOMP_MODE=%d -DMEMORY=%llu -DALGO=%d", + hashIterations, threadMemMask, int_port(ctx->workSize), ctx->stridedIndex, int(1u<memChunk), ctx->compMode ? 1 : 0, + int_port(hashMemSize), int(miner_algo[ii])); + /* create a hash for the compile time cache + * used data: + * - source code + * - device name + * - compile paramater + */ + std::string src_str(source_code); + src_str += options; + src_str += devNameVec.data(); + std::string hash_hex_str; + picosha2::hash256_hex_string(src_str, hash_hex_str); + + std::string cache_file = get_home() + "/.openclcache/" + hash_hex_str + ".openclbin"; + std::ifstream clBinFile(cache_file, std::ofstream::in | std::ofstream::binary); + if(xmrstak::params::inst().AMDCache == false || !clBinFile.good()) { - size_t len; - printer::inst()->print_msg(L1,"Error %s when calling clBuildProgram.", err_to_str(ret)); - - if((ret = clGetProgramBuildInfo(ctx->Program, ctx->DeviceID, CL_PROGRAM_BUILD_LOG, 0, NULL, &len)) != CL_SUCCESS) + if(xmrstak::params::inst().AMDCache) + printer::inst()->print_msg(L1,"OpenCL device %u - Precompiled code %s not found. Compiling ...",ctx->deviceIdx, cache_file.c_str()); + ctx->Program[ii] = clCreateProgramWithSource(opencl_ctx, 1, (const char**)&source_code, NULL, &ret); + if(ret != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clGetProgramBuildInfo for length of build log output.", err_to_str(ret)); + printer::inst()->print_msg(L1,"Error %s when calling clCreateProgramWithSource on the OpenCL miner code", err_to_str(ret)); return ERR_OCL_API; } - char* BuildLog = (char*)malloc(len + 1); - BuildLog[0] = '\0'; - - if((ret = clGetProgramBuildInfo(ctx->Program, ctx->DeviceID, CL_PROGRAM_BUILD_LOG, len, BuildLog, NULL)) != CL_SUCCESS) + ret = clBuildProgram(ctx->Program[ii], 1, &ctx->DeviceID, options, NULL, NULL); + if(ret != CL_SUCCESS) { + size_t len; + printer::inst()->print_msg(L1,"Error %s when calling clBuildProgram.", err_to_str(ret)); + + if((ret = clGetProgramBuildInfo(ctx->Program[ii], ctx->DeviceID, CL_PROGRAM_BUILD_LOG, 0, NULL, &len)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clGetProgramBuildInfo for length of build log output.", err_to_str(ret)); + return ERR_OCL_API; + } + + char* BuildLog = (char*)malloc(len + 1); + BuildLog[0] = '\0'; + + if((ret = clGetProgramBuildInfo(ctx->Program[ii], ctx->DeviceID, CL_PROGRAM_BUILD_LOG, len, BuildLog, NULL)) != CL_SUCCESS) + { + free(BuildLog); + printer::inst()->print_msg(L1,"Error %s when calling clGetProgramBuildInfo for build log.", err_to_str(ret)); + return ERR_OCL_API; + } + + printer::inst()->print_str("Build log:\n"); + std::cerr<print_msg(L1,"Error %s when calling clGetProgramBuildInfo for build log.", err_to_str(ret)); return ERR_OCL_API; } - printer::inst()->print_str("Build log:\n"); - std::cerr<Program, CL_PROGRAM_NUM_DEVICES, sizeof(cl_uint), &num_devices,NULL); + cl_uint num_devices; + clGetProgramInfo(ctx->Program[ii], CL_PROGRAM_NUM_DEVICES, sizeof(cl_uint), &num_devices,NULL); - std::vector devices_ids(num_devices); - clGetProgramInfo(ctx->Program, CL_PROGRAM_DEVICES, sizeof(cl_device_id)* devices_ids.size(), devices_ids.data(),NULL); - int dev_id = 0; - /* Search for the gpu within the program context. - * The id can be different to ctx->DeviceID. - */ - for(auto & ocl_device : devices_ids) - { - if(ocl_device == ctx->DeviceID) - break; - dev_id++; - } + std::vector devices_ids(num_devices); + clGetProgramInfo(ctx->Program[ii], CL_PROGRAM_DEVICES, sizeof(cl_device_id)* devices_ids.size(), devices_ids.data(),NULL); + int dev_id = 0; + /* Search for the gpu within the program context. + * The id can be different to ctx->DeviceID. + */ + for(auto & ocl_device : devices_ids) + { + if(ocl_device == ctx->DeviceID) + break; + dev_id++; + } - cl_build_status status; - do - { - if((ret = clGetProgramBuildInfo(ctx->Program, ctx->DeviceID, CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &status, NULL)) != CL_SUCCESS) + cl_build_status status; + do { - printer::inst()->print_msg(L1,"Error %s when calling clGetProgramBuildInfo for status of build.", err_to_str(ret)); - return ERR_OCL_API; + if((ret = clGetProgramBuildInfo(ctx->Program[ii], ctx->DeviceID, CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &status, NULL)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clGetProgramBuildInfo for status of build.", err_to_str(ret)); + return ERR_OCL_API; + } + port_sleep(1); } - port_sleep(1); - } - while(status == CL_BUILD_IN_PROGRESS); + while(status == CL_BUILD_IN_PROGRESS); + + std::vector binary_sizes(num_devices); + clGetProgramInfo (ctx->Program[ii], CL_PROGRAM_BINARY_SIZES, sizeof(size_t) * binary_sizes.size(), binary_sizes.data(), NULL); - std::vector binary_sizes(num_devices); - clGetProgramInfo (ctx->Program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t) * binary_sizes.size(), binary_sizes.data(), NULL); + std::vector all_programs(num_devices); + std::vector> program_storage; - std::vector all_programs(num_devices); - std::vector> program_storage; + if(xmrstak::params::inst().AMDCache) + { + int p_id = 0; + size_t mem_size = 0; + // create memory structure to query all OpenCL program binaries + for(auto & p : all_programs) + { + program_storage.emplace_back(std::vector(binary_sizes[p_id])); + all_programs[p_id] = program_storage[p_id].data(); + mem_size += binary_sizes[p_id]; + p_id++; + } - if(xmrstak::params::inst().AMDCache) + if((ret = clGetProgramInfo(ctx->Program[ii], CL_PROGRAM_BINARIES, num_devices * sizeof(char*), all_programs.data(),NULL)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clGetProgramInfo.", err_to_str(ret)); + return ERR_OCL_API; + } + + std::ofstream file_stream; + file_stream.open(cache_file, std::ofstream::out | std::ofstream::binary); + file_stream.write(all_programs[dev_id], binary_sizes[dev_id]); + file_stream.close(); + printer::inst()->print_msg(L1, "OpenCL device %u - Precompiled code stored in file %s",ctx->deviceIdx, cache_file.c_str()); + } + } + else { - int p_id = 0; - size_t mem_size = 0; - // create memory structure to query all OpenCL program binaries - for(auto & p : all_programs) + printer::inst()->print_msg(L1, "OpenCL device %u - Load precompiled code from file %s",ctx->deviceIdx, cache_file.c_str()); + std::ostringstream ss; + ss << clBinFile.rdbuf(); + std::string s = ss.str(); + + size_t bin_size = s.size(); + auto data_ptr = s.data(); + + cl_int clStatus; + ctx->Program[ii] = clCreateProgramWithBinary( + opencl_ctx, 1, &ctx->DeviceID, &bin_size, + (const unsigned char **)&data_ptr, &clStatus, &ret + ); + if(ret != CL_SUCCESS) { - program_storage.emplace_back(std::vector(binary_sizes[p_id])); - all_programs[p_id] = program_storage[p_id].data(); - mem_size += binary_sizes[p_id]; - p_id++; + printer::inst()->print_msg(L1,"Error %s when calling clCreateProgramWithBinary. Try to delete file %s", err_to_str(ret), cache_file.c_str()); + return ERR_OCL_API; } - - if((ret = clGetProgramInfo(ctx->Program, CL_PROGRAM_BINARIES, num_devices * sizeof(char*), all_programs.data(),NULL)) != CL_SUCCESS) + ret = clBuildProgram(ctx->Program[ii], 1, &ctx->DeviceID, NULL, NULL, NULL); + if(ret != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clGetProgramInfo.", err_to_str(ret)); + printer::inst()->print_msg(L1,"Error %s when calling clBuildProgram. Try to delete file %s", err_to_str(ret), cache_file.c_str()); return ERR_OCL_API; } - - std::ofstream file_stream; - file_stream.open(cache_file, std::ofstream::out | std::ofstream::binary); - file_stream.write(all_programs[dev_id], binary_sizes[dev_id]); - file_stream.close(); - printer::inst()->print_msg(L1, "OpenCL device %u - Precompiled code stored in file %s",ctx->deviceIdx, cache_file.c_str()); } - } - else - { - printer::inst()->print_msg(L1, "OpenCL device %u - Load precompiled code from file %s",ctx->deviceIdx, cache_file.c_str()); - std::ostringstream ss; - ss << clBinFile.rdbuf(); - std::string s = ss.str(); - - size_t bin_size = s.size(); - auto data_ptr = s.data(); - - cl_int clStatus; - ctx->Program = clCreateProgramWithBinary( - opencl_ctx, 1, &ctx->DeviceID, &bin_size, - (const unsigned char **)&data_ptr, &clStatus, &ret - ); - if(ret != CL_SUCCESS) + + std::vector KernelNames = { "cn0", "cn1", "cn2", "Blake", "Groestl", "JH", "Skein" }; + // append algorithm number to kernel name + for(int k = 0; k < 3; k++) + KernelNames[k] += std::to_string(miner_algo[ii]); + + if(ii == 0) { - printer::inst()->print_msg(L1,"Error %s when calling clCreateProgramWithBinary. Try to delete file %s", err_to_str(ret), cache_file.c_str()); - return ERR_OCL_API; + for(int i = 0; i < 7; ++i) + { + ctx->Kernels[ii][i] = clCreateKernel(ctx->Program[ii], KernelNames[i].c_str(), &ret); + if(ret != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clCreateKernel for kernel_0 %s.", err_to_str(ret), KernelNames[i].c_str()); + return ERR_OCL_API; + } + } } - ret = clBuildProgram(ctx->Program, 1, &ctx->DeviceID, NULL, NULL, NULL); - if(ret != CL_SUCCESS) + else { - printer::inst()->print_msg(L1,"Error %s when calling clBuildProgram. Try to delete file %s", err_to_str(ret), cache_file.c_str()); - return ERR_OCL_API; - } - } + for(int i = 0; i < 3; ++i) + { + ctx->Kernels[ii][i] = clCreateKernel(ctx->Program[ii], KernelNames[i].c_str(), &ret); + if(ret != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clCreateKernel for kernel_1 %s.", err_to_str(ret), KernelNames[i].c_str()); + return ERR_OCL_API; + } + } + // move kernel from the main algorithm into the root algorithm kernel space + for(int i = 3; i < 7; ++i) + { + ctx->Kernels[ii][i] = ctx->Kernels[0][i]; + } - const char *KernelNames[] = { "cn0", "cn1", "cn2", "Blake", "Groestl", "JH", "Skein", "cn1_monero" }; - for(int i = 0; i < 8; ++i) - { - ctx->Kernels[i] = clCreateKernel(ctx->Program, KernelNames[i], &ret); - if(ret != CL_SUCCESS) - { - printer::inst()->print_msg(L1,"Error %s when calling clCreateKernel for kernel %s.", err_to_str(ret), KernelNames[i]); - return ERR_OCL_API; } } - ctx->Nonce = 0; return 0; } @@ -881,8 +917,11 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx) return ERR_SUCCESS; } -size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t target, xmrstak_algo miner_algo, uint32_t version) +size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t target, xmrstak_algo miner_algo) { + // switch to the kernel storage + int kernel_storage = miner_algo == ::jconf::inst()->GetMiningAlgo() ? 0 : 1; + cl_int ret; if(input_len > 84) @@ -899,71 +938,51 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar return ERR_OCL_API; } - if((ret = clSetKernelArg(ctx->Kernels[0], 0, sizeof(cl_mem), &ctx->InputBuffer)) != CL_SUCCESS) + if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][0], 0, sizeof(cl_mem), &ctx->InputBuffer)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 0.", err_to_str(ret)); return ERR_OCL_API; } // Scratchpads - if((ret = clSetKernelArg(ctx->Kernels[0], 1, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS) + if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][0], 1, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 1.", err_to_str(ret)); return ERR_OCL_API; } // States - if((ret = clSetKernelArg(ctx->Kernels[0], 2, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS) + if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][0], 2, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 2.", err_to_str(ret)); return ERR_OCL_API; } // Threads - if((ret = clSetKernelArg(ctx->Kernels[0], 3, sizeof(cl_ulong), &numThreads)) != CL_SUCCESS) + if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][0], 3, sizeof(cl_ulong), &numThreads)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 3.", err_to_str(ret)); return(ERR_OCL_API); } - /* ATTENTION: if we miner cryptonight_heavy the kernel needs an additional parameter version. - * Do NOT use the variable `miner_algo` because this variable is changed dynamicly - */ - if(::jconf::inst()->GetMiningAlgo() == cryptonight_heavy) - { - // version - if ((ret = clSetKernelArg(ctx->Kernels[0], 4, sizeof(cl_uint), &version)) != CL_SUCCESS) - { - printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 0, argument 4.", err_to_str(ret)); - return ERR_OCL_API; - } - } - // CN1 Kernel - /// @todo only activate if currency is monero - int cn_kernel_offset = 0; - if(miner_algo == cryptonight_monero || miner_algo == cryptonight_aeon) - { - cn_kernel_offset = 6; - } - // Scratchpads - if((ret = clSetKernelArg(ctx->Kernels[1 + cn_kernel_offset], 0, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS) + if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][1], 0, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 1, argument 0.", err_to_str(ret)); return ERR_OCL_API; } // States - if((ret = clSetKernelArg(ctx->Kernels[1 + cn_kernel_offset], 1, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS) + if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][1], 1, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 1, argument 1.", err_to_str(ret)); return ERR_OCL_API; } // Threads - if((ret = clSetKernelArg(ctx->Kernels[1 + cn_kernel_offset], 2, sizeof(cl_ulong), &numThreads)) != CL_SUCCESS) + if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][1], 2, sizeof(cl_ulong), &numThreads)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 1, argument 2.", err_to_str(ret)); return(ERR_OCL_API); @@ -972,113 +991,88 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar if(miner_algo == cryptonight_monero || miner_algo == cryptonight_aeon ) { // Input - if ((ret = clSetKernelArg(ctx->Kernels[1 + cn_kernel_offset], 3, sizeof(cl_mem), &ctx->InputBuffer)) != CL_SUCCESS) + if ((ret = clSetKernelArg(ctx->Kernels[kernel_storage][1], 3, sizeof(cl_mem), &ctx->InputBuffer)) != CL_SUCCESS) { printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 1, arugment 4(input buffer).", err_to_str(ret)); return ERR_OCL_API; } } - /* ATTENTION: if we miner cryptonight_heavy the kernel needs an additional parameter version. - * Do NOT use the variable `miner_algo` because this variable is changed dynamicly - */ - else if(::jconf::inst()->GetMiningAlgo() == cryptonight_heavy) - { - // version - if ((ret = clSetKernelArg(ctx->Kernels[1], 3, sizeof(cl_uint), &version)) != CL_SUCCESS) - { - printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 1, argument 3 (version).", err_to_str(ret)); - return ERR_OCL_API; - } - } // CN3 Kernel // Scratchpads - if((ret = clSetKernelArg(ctx->Kernels[2], 0, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS) + if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][2], 0, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 0.", err_to_str(ret)); return ERR_OCL_API; } // States - if((ret = clSetKernelArg(ctx->Kernels[2], 1, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS) + if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][2], 1, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 1.", err_to_str(ret)); return ERR_OCL_API; } // Branch 0 - if((ret = clSetKernelArg(ctx->Kernels[2], 2, sizeof(cl_mem), ctx->ExtraBuffers + 2)) != CL_SUCCESS) + if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][2], 2, sizeof(cl_mem), ctx->ExtraBuffers + 2)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 2.", err_to_str(ret)); return ERR_OCL_API; } // Branch 1 - if((ret = clSetKernelArg(ctx->Kernels[2], 3, sizeof(cl_mem), ctx->ExtraBuffers + 3)) != CL_SUCCESS) + if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][2], 3, sizeof(cl_mem), ctx->ExtraBuffers + 3)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 3.", err_to_str(ret)); return ERR_OCL_API; } // Branch 2 - if((ret = clSetKernelArg(ctx->Kernels[2], 4, sizeof(cl_mem), ctx->ExtraBuffers + 4)) != CL_SUCCESS) + if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][2], 4, sizeof(cl_mem), ctx->ExtraBuffers + 4)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 4.", err_to_str(ret)); return ERR_OCL_API; } // Branch 3 - if((ret = clSetKernelArg(ctx->Kernels[2], 5, sizeof(cl_mem), ctx->ExtraBuffers + 5)) != CL_SUCCESS) + if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][2], 5, sizeof(cl_mem), ctx->ExtraBuffers + 5)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 5.", err_to_str(ret)); return ERR_OCL_API; } // Threads - if((ret = clSetKernelArg(ctx->Kernels[2], 6, sizeof(cl_ulong), &numThreads)) != CL_SUCCESS) + if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][2], 6, sizeof(cl_ulong), &numThreads)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 6.", err_to_str(ret)); return(ERR_OCL_API); } - /* ATTENTION: if we miner cryptonight_heavy the kernel needs an additional parameter version. - * Do NOT use the variable `miner_algo` because this variable is changed dynamicly - */ - if(::jconf::inst()->GetMiningAlgo() == cryptonight_heavy) - { - // version - if ((ret = clSetKernelArg(ctx->Kernels[2], 7, sizeof(cl_uint), &version)) != CL_SUCCESS) - { - printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 2, argument 7.", err_to_str(ret)); - return ERR_OCL_API; - } - } - for(int i = 0; i < 4; ++i) { // States - if((ret = clSetKernelArg(ctx->Kernels[i + 3], 0, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS) + if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][i + 3], 0, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 0); return ERR_OCL_API; } // Nonce buffer - if((ret = clSetKernelArg(ctx->Kernels[i + 3], 1, sizeof(cl_mem), ctx->ExtraBuffers + (i + 2))) != CL_SUCCESS) + if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][i + 3], 1, sizeof(cl_mem), ctx->ExtraBuffers + (i + 2))) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 1); return ERR_OCL_API; } // Output - if((ret = clSetKernelArg(ctx->Kernels[i + 3], 2, sizeof(cl_mem), &ctx->OutputBuffer)) != CL_SUCCESS) + if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][i + 3], 2, sizeof(cl_mem), &ctx->OutputBuffer)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 2); return ERR_OCL_API; } // Target - if((ret = clSetKernelArg(ctx->Kernels[i + 3], 3, sizeof(cl_ulong), &target)) != CL_SUCCESS) + if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][i + 3], 3, sizeof(cl_ulong), &target)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 3); return ERR_OCL_API; @@ -1088,8 +1082,11 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar return ERR_SUCCESS; } -size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo, uint32_t version) +size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo) { + // switch to the kernel storage + int kernel_storage = miner_algo == ::jconf::inst()->GetMiningAlgo() ? 0 : 1; + cl_int ret; cl_uint zero = 0; size_t BranchNonces[4]; @@ -1125,35 +1122,21 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo, clFinish(ctx->CommandQueues); size_t Nonce[2] = {ctx->Nonce, 1}, gthreads[2] = { g_thd, 8 }, lthreads[2] = { w_size, 8 }; - if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[0], 2, Nonce, gthreads, lthreads, 0, NULL, NULL)) != CL_SUCCESS) + if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[kernel_storage][0], 2, Nonce, gthreads, lthreads, 0, NULL, NULL)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 0); return ERR_OCL_API; } - /*for(int i = 1; i < 3; ++i) - { - if((ret = clEnqueueNDRangeKernel(*ctx->CommandQueues, ctx->Kernels[i], 1, &ctx->Nonce, &g_thd, &w_size, 0, NULL, NULL)) != CL_SUCCESS) - { - Log(LOG_CRITICAL, "Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), i); - return(ERR_OCL_API); - } - }*/ - size_t tmpNonce = ctx->Nonce; - /// @todo only activate if currency is monero - int cn_kernel_offset = 0; - if(miner_algo == cryptonight_monero || miner_algo == cryptonight_aeon) - { - cn_kernel_offset = 6; - } - if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[1 + cn_kernel_offset], 1, &tmpNonce, &g_thd, &w_size, 0, NULL, NULL)) != CL_SUCCESS) + + if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[kernel_storage][1], 1, &tmpNonce, &g_thd, &w_size, 0, NULL, NULL)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 1); return ERR_OCL_API; } - if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[2], 2, Nonce, gthreads, lthreads, 0, NULL, NULL)) != CL_SUCCESS) + if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[kernel_storage][2], 2, Nonce, gthreads, lthreads, 0, NULL, NULL)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 2); return ERR_OCL_API; @@ -1190,7 +1173,7 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo, if(BranchNonces[i]) { // Threads - if((clSetKernelArg(ctx->Kernels[i + 3], 4, sizeof(cl_ulong), BranchNonces + i)) != CL_SUCCESS) + if((clSetKernelArg(ctx->Kernels[kernel_storage][i + 3], 4, sizeof(cl_ulong), BranchNonces + i)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 4); return(ERR_OCL_API); @@ -1201,7 +1184,7 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo, // number of global threads must be a multiple of the work group size (w_size) assert(BranchNonces[i]%w_size == 0); size_t tmpNonce = ctx->Nonce; - if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[i + 3], 1, &tmpNonce, BranchNonces + i, &w_size, 0, NULL, NULL)) != CL_SUCCESS) + if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[kernel_storage][i + 3], 1, &tmpNonce, BranchNonces + i, &w_size, 0, NULL, NULL)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), i + 3); return ERR_OCL_API; diff --git a/xmrstak/backend/amd/amd_gpu/gpu.hpp b/xmrstak/backend/amd/amd_gpu/gpu.hpp index a387b15..0db6c90 100644 --- a/xmrstak/backend/amd/amd_gpu/gpu.hpp +++ b/xmrstak/backend/amd/amd_gpu/gpu.hpp @@ -35,8 +35,8 @@ struct GpuContext cl_mem InputBuffer; cl_mem OutputBuffer; cl_mem ExtraBuffers[6]; - cl_program Program; - cl_kernel Kernels[8]; + cl_program Program[2]; + cl_kernel Kernels[2][8]; size_t freeMem; int computeUnits; std::string name; @@ -50,7 +50,7 @@ int getAMDPlatformIdx(); std::vector getAMDDevices(int index); size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx); -size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t target, xmrstak_algo miner_algo, uint32_t version); -size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo, uint32_t version); +size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t target, xmrstak_algo miner_algo); +size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo); diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl index 5d4e66c..d2ae1a7 100644 --- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl +++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl @@ -433,15 +433,13 @@ inline ulong getIdx() #endif } -#define mix_and_propagate(xin) (xin)[(get_local_id(1)) % 8][get_local_id(0)] ^ (xin)[(get_local_id(1) + 1) % 8][get_local_id(0)] +#define mix_and_propagate(xin) (xin)[(get_local_id(1)) % 8][get_local_id(0)] ^ (xin)[(get_local_id(1) + 1) % 8][get_local_id(0)] + +#define JOIN_DO(x,y) x##y +#define JOIN(x,y) JOIN_DO(x,y) __attribute__((reqd_work_group_size(WORKSIZE, 8, 1))) -__kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ulong *states, ulong Threads -// cryptonight_heavy -#if (ALGO == 4) - , uint version -#endif -) +__kernel void JOIN(cn0,ALGO)(__global ulong *input, __global uint4 *Scratchpad, __global ulong *states, ulong Threads) { ulong State[25]; uint ExpandedKey1[40]; @@ -517,23 +515,20 @@ __kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ul // cryptonight_heavy #if (ALGO == 4) - if(version >= 3) - { - __local uint4 xin[8][WORKSIZE]; + __local uint4 xin[8][WORKSIZE]; - /* Also left over threads performe this loop. - * The left over thread results will be ignored - */ - for(size_t i=0; i < 16; i++) - { - #pragma unroll - for(int j = 0; j < 10; ++j) - text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey1)[j]); - barrier(CLK_LOCAL_MEM_FENCE); - xin[get_local_id(1)][get_local_id(0)] = text; - barrier(CLK_LOCAL_MEM_FENCE); - text = mix_and_propagate(xin); - } + /* Also left over threads performe this loop. + * The left over thread results will be ignored + */ + for(size_t i=0; i < 16; i++) + { + #pragma unroll + for(int j = 0; j < 10; ++j) + text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey1)[j]); + barrier(CLK_LOCAL_MEM_FENCE); + xin[get_local_id(1)][get_local_id(0)] = text; + barrier(CLK_LOCAL_MEM_FENCE); + text = mix_and_propagate(xin); } #endif @@ -542,13 +537,9 @@ __kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ul if(gIdx < Threads) #endif { - int iterations = MEMORY >> 7; -#if (ALGO == 4) - if(version < 3) - iterations >>= 1; -#endif + #pragma unroll 2 - for(int i = 0; i < iterations; ++i) + for(int i = 0; i < (MEMORY >> 7); ++i) { #pragma unroll for(int j = 0; j < 10; ++j) @@ -560,22 +551,13 @@ __kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ul mem_fence(CLK_GLOBAL_MEM_FENCE); } -#define VARIANT1_1(p) \ - uint table = 0x75310U; \ - uint index = (((p).s2 >> 26) & 12) | (((p).s2 >> 23) & 2); \ - (p).s2 ^= ((table >> index) & 0x30U) << 24 - -#define VARIANT1_2(p) ((uint2 *)&(p))[0] ^= tweak1_2 - -#define VARIANT1_INIT() \ - tweak1_2 = as_uint2(input[4]); \ - tweak1_2.s0 >>= 24; \ - tweak1_2.s0 |= tweak1_2.s1 << 8; \ - tweak1_2.s1 = get_global_id(0); \ - tweak1_2 ^= as_uint2(states[24]) - __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) -__kernel void cn1_monero(__global uint4 *Scratchpad, __global ulong *states, ulong Threads, __global ulong *input) +__kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states, ulong Threads +// cryptonight_monero || cryptonight_aeon +#if(ALGO == 3 || ALGO == 5) +, __global ulong *input +#endif +) { ulong a[2], b[2]; __local uint AES0[256], AES1[256], AES2[256], AES3[256]; @@ -592,8 +574,9 @@ __kernel void cn1_monero(__global uint4 *Scratchpad, __global ulong *states, ulo } barrier(CLK_LOCAL_MEM_FENCE); - +#if(ALGO == 3 || ALGO == 5) uint2 tweak1_2; +#endif uint4 b_x; #if(COMP_MODE==1) // do not use early return here @@ -615,7 +598,13 @@ __kernel void cn1_monero(__global uint4 *Scratchpad, __global ulong *states, ulo b[1] = states[3] ^ states[7]; b_x = ((uint4 *)b)[0]; - VARIANT1_INIT(); +#if(ALGO == 3 || ALGO == 5) + tweak1_2 = as_uint2(input[4]); + tweak1_2.s0 >>= 24; + tweak1_2.s0 |= tweak1_2.s1 << 8; + tweak1_2.s1 = get_global_id(0); + tweak1_2 ^= as_uint2(states[24]); +#endif } mem_fence(CLK_LOCAL_MEM_FENCE); @@ -625,17 +614,23 @@ __kernel void cn1_monero(__global uint4 *Scratchpad, __global ulong *states, ulo if(gIdx < Threads) #endif { + ulong idx0 = a[0]; + #pragma unroll 8 for(int i = 0; i < ITERATIONS; ++i) { ulong c[2]; - ((uint4 *)c)[0] = Scratchpad[IDX((a[0] & MASK) >> 4)]; + ((uint4 *)c)[0] = Scratchpad[IDX((idx0 & MASK) >> 4)]; ((uint4 *)c)[0] = AES_Round(AES0, AES1, AES2, AES3, ((uint4 *)c)[0], ((uint4 *)a)[0]); b_x ^= ((uint4 *)c)[0]; - VARIANT1_1(b_x); - Scratchpad[IDX((a[0] & MASK) >> 4)] = b_x; +#if(ALGO == 3 || ALGO == 5) + uint table = 0x75310U; + uint index = ((b_x.s2 >> 26) & 12) | ((b_x.s2 >> 23) & 2); + b_x.s2 ^= ((table >> index) & 0x30U) << 24; +#endif + Scratchpad[IDX((idx0 & MASK) >> 4)] = b_x; uint4 tmp; tmp = Scratchpad[IDX((c[0] & MASK) >> 4)]; @@ -643,101 +638,14 @@ __kernel void cn1_monero(__global uint4 *Scratchpad, __global ulong *states, ulo a[1] += c[0] * as_ulong2(tmp).s0; a[0] += mul_hi(c[0], as_ulong2(tmp).s0); - VARIANT1_2(a[1]); - Scratchpad[IDX((c[0] & MASK) >> 4)] = ((uint4 *)a)[0]; - VARIANT1_2(a[1]); - - ((uint4 *)a)[0] ^= tmp; - - b_x = ((uint4 *)c)[0]; - } - } - mem_fence(CLK_GLOBAL_MEM_FENCE); -} - -__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) -__kernel void cn1(__global uint4 *Scratchpad, __global ulong *states, ulong Threads -// cryptonight_heavy -#if (ALGO == 4) - , uint version -#endif -) -{ - ulong a[2], b[2]; - __local uint AES0[256], AES1[256], AES2[256], AES3[256]; - - const ulong gIdx = getIdx(); - - for(int i = get_local_id(0); i < 256; i += WORKSIZE) - { - const uint tmp = AES0_C[i]; - AES0[i] = tmp; - AES1[i] = rotate(tmp, 8U); - AES2[i] = rotate(tmp, 16U); - AES3[i] = rotate(tmp, 24U); - } - - barrier(CLK_LOCAL_MEM_FENCE); - - uint4 b_x; -#if(COMP_MODE==1) - // do not use early return here - if(gIdx < Threads) -#endif - { - states += 25 * gIdx; -#if(STRIDED_INDEX==0) - Scratchpad += gIdx * (MEMORY >> 4); -#elif(STRIDED_INDEX==1) - Scratchpad += gIdx; -#elif(STRIDED_INDEX==2) - Scratchpad += get_group_id(0) * (MEMORY >> 4) * WORKSIZE + MEM_CHUNK * get_local_id(0); -#endif - - a[0] = states[0] ^ states[4]; - b[0] = states[2] ^ states[6]; - a[1] = states[1] ^ states[5]; - b[1] = states[3] ^ states[7]; - - b_x = ((uint4 *)b)[0]; - } - - mem_fence(CLK_LOCAL_MEM_FENCE); - -#if(COMP_MODE==1) - // do not use early return here - if(gIdx < Threads) -#endif - { - ulong idx0 = a[0]; - ulong mask = MASK; - int iterations = ITERATIONS; -#if (ALGO == 4) - if(version < 3) - { - iterations <<= 1; - mask -= 0x200000; - } +#if(ALGO == 3 || ALGO == 5) + ((uint2 *)&(a[1]))[0] ^= tweak1_2; + Scratchpad[IDX((c[0] & MASK) >> 4)] = ((uint4 *)a)[0]; + ((uint2 *)&(a[1]))[0] ^= tweak1_2; +#else + Scratchpad[IDX((c[0] & MASK) >> 4)] = ((uint4 *)a)[0]; #endif - #pragma unroll 8 - for(int i = 0; i < iterations; ++i) - { - ulong c[2]; - - ((uint4 *)c)[0] = Scratchpad[IDX((idx0 & mask) >> 4)]; - ((uint4 *)c)[0] = AES_Round(AES0, AES1, AES2, AES3, ((uint4 *)c)[0], ((uint4 *)a)[0]); - //b_x ^= ((uint4 *)c)[0]; - - Scratchpad[IDX((idx0 & mask) >> 4)] = b_x ^ ((uint4 *)c)[0]; - - uint4 tmp; - tmp = Scratchpad[IDX((c[0] & mask) >> 4)]; - - a[1] += c[0] * as_ulong2(tmp).s0; - a[0] += mul_hi(c[0], as_ulong2(tmp).s0); - - Scratchpad[IDX((c[0] & mask) >> 4)] = ((uint4 *)a)[0]; ((uint4 *)a)[0] ^= tmp; idx0 = a[0]; @@ -745,14 +653,11 @@ __kernel void cn1(__global uint4 *Scratchpad, __global ulong *states, ulong Thre b_x = ((uint4 *)c)[0]; // cryptonight_heavy #if (ALGO == 4) - if(version >= 3) - { - long n = *((__global long*)(Scratchpad + (IDX((idx0 & mask) >> 4)))); - int d = ((__global int*)(Scratchpad + (IDX((idx0 & mask) >> 4))))[2]; - long q = n / (d | 0x5); - *((__global long*)(Scratchpad + (IDX((idx0 & mask) >> 4)))) = n ^ q; - idx0 = d ^ q; - } + long n = *((__global long*)(Scratchpad + (IDX((idx0 & MASK) >> 4)))); + int d = ((__global int*)(Scratchpad + (IDX((idx0 & MASK) >> 4))))[2]; + long q = n / (d | 0x5); + *((__global long*)(Scratchpad + (IDX((idx0 & MASK) >> 4)))) = n ^ q; + idx0 = d ^ q; #endif } } @@ -760,12 +665,7 @@ __kernel void cn1(__global uint4 *Scratchpad, __global ulong *states, ulong Thre } __attribute__((reqd_work_group_size(WORKSIZE, 8, 1))) -__kernel void cn2(__global uint4 *Scratchpad, __global ulong *states, __global uint *Branch0, __global uint *Branch1, __global uint *Branch2, __global uint *Branch3, ulong Threads -// cryptonight_heavy -#if (ALGO == 4) - , uint version -#endif - ) +__kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states, __global uint *Branch0, __global uint *Branch1, __global uint *Branch2, __global uint *Branch3, ulong Threads) { __local uint AES0[256], AES1[256], AES2[256], AES3[256]; uint ExpandedKey2[40]; @@ -827,58 +727,42 @@ __kernel void cn2(__global uint4 *Scratchpad, __global ulong *states, __global u if(gIdx < Threads) #endif { - int iterations = MEMORY >> 7; #if (ALGO == 4) - if(version < 3) - { - iterations >>= 1; - #pragma unroll 2 - for(int i = 0; i < iterations; ++i) - { - text ^= Scratchpad[IDX((i << 3) + get_local_id(1))]; - - #pragma unroll 10 - for(int j = 0; j < 10; ++j) - text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]); - } - } - else + #pragma unroll 2 + for(int i = 0; i < (MEMORY >> 7); ++i) { - #pragma unroll 2 - for(int i = 0; i < iterations; ++i) - { - text ^= Scratchpad[IDX((i << 3) + get_local_id(1))]; + text ^= Scratchpad[IDX((i << 3) + get_local_id(1))]; - #pragma unroll 10 - for(int j = 0; j < 10; ++j) - text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]); + #pragma unroll 10 + for(int j = 0; j < 10; ++j) + text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]); - barrier(CLK_LOCAL_MEM_FENCE); - xin[get_local_id(1)][get_local_id(0)] = text; - barrier(CLK_LOCAL_MEM_FENCE); - text = mix_and_propagate(xin); - } + barrier(CLK_LOCAL_MEM_FENCE); + xin[get_local_id(1)][get_local_id(0)] = text; + barrier(CLK_LOCAL_MEM_FENCE); + text = mix_and_propagate(xin); + } - #pragma unroll 2 - for(int i = 0; i < iterations; ++i) - { - text ^= Scratchpad[IDX((i << 3) + get_local_id(1))]; + #pragma unroll 2 + for(int i = 0; i < (MEMORY >> 7); ++i) + { + text ^= Scratchpad[IDX((i << 3) + get_local_id(1))]; - #pragma unroll 10 - for(int j = 0; j < 10; ++j) - text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]); + #pragma unroll 10 + for(int j = 0; j < 10; ++j) + text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]); - barrier(CLK_LOCAL_MEM_FENCE); - xin[get_local_id(1)][get_local_id(0)] = text; - barrier(CLK_LOCAL_MEM_FENCE); - text = mix_and_propagate(xin); - } + barrier(CLK_LOCAL_MEM_FENCE); + xin[get_local_id(1)][get_local_id(0)] = text; + barrier(CLK_LOCAL_MEM_FENCE); + text = mix_and_propagate(xin); } + #else #pragma unroll 2 - for(int i = 0; i < iterations; ++i) + for(int i = 0; i < (MEMORY >> 7); ++i) { text ^= Scratchpad[IDX((i << 3) + get_local_id(1))]; @@ -891,21 +775,18 @@ __kernel void cn2(__global uint4 *Scratchpad, __global ulong *states, __global u // cryptonight_heavy #if (ALGO == 4) - if(version >= 3) + /* Also left over threads perform this loop. + * The left over thread results will be ignored + */ + for(size_t i=0; i < 16; i++) { - /* Also left over threads performe this loop. - * The left over thread results will be ignored - */ - for(size_t i=0; i < 16; i++) - { - #pragma unroll - for(int j = 0; j < 10; ++j) - text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]); - barrier(CLK_LOCAL_MEM_FENCE); - xin[get_local_id(1)][get_local_id(0)] = text; - barrier(CLK_LOCAL_MEM_FENCE); - text = mix_and_propagate(xin); - } + #pragma unroll + for(int j = 0; j < 10; ++j) + text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]); + barrier(CLK_LOCAL_MEM_FENCE); + xin[get_local_id(1)][get_local_id(0)] = text; + barrier(CLK_LOCAL_MEM_FENCE); + text = mix_and_propagate(xin); } #endif -- cgit v1.1 From 1b18f598aa1190a0e6126ed2c70e052e9403d180 Mon Sep 17 00:00:00 2001 From: psychocrypt Date: Sun, 8 Apr 2018 20:40:54 +0200 Subject: refactor scratchpad creation Use the maximum scratchpad size from before and after the fork. --- xmrstak/backend/amd/amd_gpu/gpu.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'xmrstak/backend/amd/amd_gpu') diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp index b9cc9b6..79e80bd 100644 --- a/xmrstak/backend/amd/amd_gpu/gpu.cpp +++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp @@ -308,9 +308,10 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ return ERR_OCL_API; } - size_t scratchPadSize = cn_select_memory(::jconf::inst()->GetMiningAlgo()); - int threadMemMask = cn_select_mask(::jconf::inst()->GetMiningAlgo()); - int hashIterations = cn_select_iter(::jconf::inst()->GetMiningAlgo()); + size_t scratchPadSize = std::max( + cn_select_memory(::jconf::inst()->GetMiningAlgo()), + cn_select_memory(::jconf::inst()->GetMiningAlgoRoot()) + ); size_t g_thd = ctx->rawIntensity; ctx->ExtraBuffers[0] = clCreateBuffer(opencl_ctx, CL_MEM_READ_WRITE, scratchPadSize * g_thd, NULL, &ret); @@ -382,6 +383,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ for(int ii = 0; ii < num_algos; ++ii) { + // scratchpad size for the selected mining algorithm size_t hashMemSize = cn_select_memory(miner_algo[ii]); int threadMemMask = cn_select_mask(miner_algo[ii]); int hashIterations = cn_select_iter(miner_algo[ii]); @@ -493,7 +495,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ p_id++; } - if((ret = clGetProgramInfo(ctx->Program[ii], CL_PROGRAM_BINARIES, num_devices * sizeof(char*), all_programs.data(),NULL)) != CL_SUCCESS) + if((ret = clGetProgramInfo(ctx->Program[ii], CL_PROGRAM_BINARIES, num_devices * sizeof(char*), all_programs.data(),NULL)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clGetProgramInfo.", err_to_str(ret)); return ERR_OCL_API; -- cgit v1.1 From 4682b28a5d304436ca20469e5089f97814f3f4ab Mon Sep 17 00:00:00 2001 From: psychocrypt Date: Sat, 14 Apr 2018 23:17:33 +0200 Subject: allow non AMD OpenCL driver and devices - add CLI flag to explicitly use non AMD OpenCL and devices - adjust OpenCL output (use OpenCL instead of AMD if --altOpenCL is sued) - optimize NVIDIA OpenCL auto suggestion --- xmrstak/backend/amd/amd_gpu/gpu.cpp | 30 ++++++++++++++++++++++-------- xmrstak/backend/amd/amd_gpu/gpu.hpp | 1 + 2 files changed, 23 insertions(+), 8 deletions(-) (limited to 'xmrstak/backend/amd/amd_gpu') diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp index 79e80bd..9a4ba73 100644 --- a/xmrstak/backend/amd/amd_gpu/gpu.cpp +++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp @@ -675,11 +675,18 @@ std::vector getAMDDevices(int index) } std::string devVendor(devVendorVec.data()); - if( devVendor.find("Advanced Micro Devices") != std::string::npos || devVendor.find("AMD") != std::string::npos) + + bool isAMDDevice = devVendor.find("Advanced Micro Devices") != std::string::npos || devVendor.find("AMD") != std::string::npos; + bool isNVIDIADevice = devVendor.find("NVIDIA Corporation") != std::string::npos || devVendor.find("NVIDIA") != std::string::npos; + + std::string selectedOpenCLVendor = xmrstak::params::inst().openCLVendor; + if((isAMDDevice && selectedOpenCLVendor == "AMD") || (isNVIDIADevice && selectedOpenCLVendor == "NVIDIA")) { GpuContext ctx; std::vector devNameVec(1024); size_t maxMem; + if( devVendor.find("NVIDIA Corporation") != std::string::npos) + ctx.isNVIDIA = true; if((clStatus = clGetDeviceInfo(device_list[k], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(int), &(ctx.computeUnits), NULL)) != CL_SUCCESS) { @@ -699,6 +706,10 @@ std::vector getAMDDevices(int index) continue; } + // the allocation for NVIDIA OpenCL is not limited to 1/4 of the GPU memory per allocation + if(ctx.isNVIDIA) + maxMem = ctx.freeMem; + if((clStatus = clGetDeviceInfo(device_list[k], CL_DEVICE_NAME, devNameVec.size(), devNameVec.data(), NULL)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get CL_DEVICE_NAME for device %u.", err_to_str(clStatus), k); @@ -747,13 +758,15 @@ int getAMDPlatformIdx() clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, infoSize, platformNameVec.data(), NULL); std::string platformName(platformNameVec.data()); - if( platformName.find("Advanced Micro Devices") != std::string::npos || + + bool isAMDOpenCL = platformName.find("Advanced Micro Devices") != std::string::npos || platformName.find("Apple") != std::string::npos || - platformName.find("Mesa") != std::string::npos - ) + platformName.find("Mesa") != std::string::npos; + bool isNVIDIADevice = platformName.find("NVIDIA Corporation") != std::string::npos || platformName.find("NVIDIA") != std::string::npos; + std::string selectedOpenCLVendor = xmrstak::params::inst().openCLVendor; + if((isAMDOpenCL && selectedOpenCLVendor == "AMD") || (isNVIDIADevice && selectedOpenCLVendor == "NVIDIA")) { - - printer::inst()->print_msg(L0,"Found AMD platform index id = %i, name = %s",i , platformName.c_str()); + printer::inst()->print_msg(L0,"Found %s platform index id = %i, name = %s", selectedOpenCLVendor.c_str(), i , platformName.c_str()); if(platformName.find("Mesa") != std::string::npos) mesaPlatform = i; else @@ -819,7 +832,7 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx) std::vector platformNameVec(infoSize); clGetPlatformInfo(PlatformIDList[platform_idx], CL_PLATFORM_VENDOR, infoSize, platformNameVec.data(), NULL); std::string platformName(platformNameVec.data()); - if( platformName.find("Advanced Micro Devices") == std::string::npos) + if(xmrstak::params::inst().openCLVendor == "AMD" && platformName.find("Advanced Micro Devices") == std::string::npos) { printer::inst()->print_msg(L1,"WARNING: using non AMD device: %s", platformName.c_str()); } @@ -907,7 +920,8 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx) { size_t reduced_intensity = (ctx[i].rawIntensity / ctx[i].workSize) * ctx[i].workSize; ctx[i].rawIntensity = reduced_intensity; - printer::inst()->print_msg(L0, "WARNING AMD: gpu %d intensity is not a multiple of 'worksize', auto reduce intensity to %d", ctx[i].deviceIdx, int(reduced_intensity)); + const std::string backendName = xmrstak::params::inst().openCLVendor; + printer::inst()->print_msg(L0, "WARNING %s: gpu %d intensity is not a multiple of 'worksize', auto reduce intensity to %d", backendName.c_str(), ctx[i].deviceIdx, int(reduced_intensity)); } if((ret = InitOpenCLGpu(opencl_ctx, &ctx[i], source_code.c_str())) != ERR_SUCCESS) diff --git a/xmrstak/backend/amd/amd_gpu/gpu.hpp b/xmrstak/backend/amd/amd_gpu/gpu.hpp index 0db6c90..5ab80b8 100644 --- a/xmrstak/backend/amd/amd_gpu/gpu.hpp +++ b/xmrstak/backend/amd/amd_gpu/gpu.hpp @@ -27,6 +27,7 @@ struct GpuContext size_t workSize; int stridedIndex; int memChunk; + bool isNVIDIA = false; int compMode; /*Output vars*/ -- cgit v1.1 From 0877e2f654b203c5145bb8154fcfb1ad46ba8265 Mon Sep 17 00:00:00 2001 From: psychocrypt Date: Mon, 16 Apr 2018 21:46:33 +0200 Subject: add independent dev pool coin description - allow the dev pool to fork on a different block version than the user descriped coin All algorithm are centered around the user coin description. It is allowed to have two two different coin algorithms in the user coin description. It is only allowed to use algorithms for the dev pool coin description those are used in the user coin description. There are two ways to define a non forking coin. - set both user coin algorithm descriptions to the same algorithm and set version to zero - set the first algorithm in the user coin description to something you like to use in the dev pool and set the second algorithm to the correct representation of the coin. Set the version to 255. This will allow that the dev pool can mine on a different coin algorithm than the not forking user coin. Do not use an algorithm with different scratchpad size for the dev pool. --- xmrstak/backend/amd/amd_gpu/gpu.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'xmrstak/backend/amd/amd_gpu') diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp index 9a4ba73..b3d36e7 100644 --- a/xmrstak/backend/amd/amd_gpu/gpu.cpp +++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp @@ -309,8 +309,8 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ } size_t scratchPadSize = std::max( - cn_select_memory(::jconf::inst()->GetMiningAlgo()), - cn_select_memory(::jconf::inst()->GetMiningAlgoRoot()) + cn_select_memory(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo()), + cn_select_memory(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot()) ); size_t g_thd = ctx->rawIntensity; @@ -376,8 +376,8 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ } xmrstak_algo miner_algo[2] = { - ::jconf::inst()->GetMiningAlgo(), - ::jconf::inst()->GetMiningAlgoRoot() + ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo(), + ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot() }; int num_algos = miner_algo[0] == miner_algo[1] ? 1 : 2; @@ -936,7 +936,7 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx) size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t target, xmrstak_algo miner_algo) { // switch to the kernel storage - int kernel_storage = miner_algo == ::jconf::inst()->GetMiningAlgo() ? 0 : 1; + int kernel_storage = miner_algo == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() ? 0 : 1; cl_int ret; @@ -1101,7 +1101,7 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo) { // switch to the kernel storage - int kernel_storage = miner_algo == ::jconf::inst()->GetMiningAlgo() ? 0 : 1; + int kernel_storage = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() ? 0 : 1; cl_int ret; cl_uint zero = 0; -- cgit v1.1 From 053dcb01d814daab686d66ba82da96ce2aec7747 Mon Sep 17 00:00:00 2001 From: psychocrypt Date: Mon, 16 Apr 2018 22:06:38 +0200 Subject: remove fork for sumokoin and monero remove fork version for sumokoin and monero7 --- xmrstak/backend/amd/amd_gpu/gpu.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'xmrstak/backend/amd/amd_gpu') diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp index b3d36e7..03100d0 100644 --- a/xmrstak/backend/amd/amd_gpu/gpu.cpp +++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp @@ -1101,7 +1101,7 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo) { // switch to the kernel storage - int kernel_storage = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() ? 0 : 1; + int kernel_storage = miner_algo == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() ? 0 : 1; cl_int ret; cl_uint zero = 0; -- cgit v1.1