diff options
Diffstat (limited to 'xmrstak/backend/amd')
-rw-r--r-- | xmrstak/backend/amd/amd_gpu/gpu.cpp | 126 | ||||
-rw-r--r-- | xmrstak/backend/amd/amd_gpu/gpu.hpp | 2 | ||||
-rw-r--r-- | xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl | 19 | ||||
-rw-r--r-- | xmrstak/backend/amd/amd_gpu/opencl/wolf-aes.cl | 30 | ||||
-rw-r--r-- | xmrstak/backend/amd/autoAdjust.hpp | 2 | ||||
-rw-r--r-- | xmrstak/backend/amd/config.tpl | 5 | ||||
-rw-r--r-- | xmrstak/backend/amd/jconf.cpp | 9 | ||||
-rw-r--r-- | xmrstak/backend/amd/jconf.hpp | 1 | ||||
-rw-r--r-- | xmrstak/backend/amd/minethd.cpp | 1 |
9 files changed, 128 insertions, 67 deletions
diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp index 15b8457..d9bc962 100644 --- a/xmrstak/backend/amd/amd_gpu/gpu.cpp +++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp @@ -332,7 +332,8 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ char options[256]; snprintf(options, sizeof(options), - "-DITERATIONS=%d -DMASK=%d -DWORKSIZE=%llu", hasIterations, threadMemMask, int_port(ctx->workSize)); + "-DITERATIONS=%d -DMASK=%d -DWORKSIZE=%llu -DSTRIDED_INDEX=%d", + hasIterations, threadMemMask, int_port(ctx->workSize), ctx->stridedIndex ? 1 : 0); ret = clBuildProgram(ctx->Program, 1, &ctx->DeviceID, options, NULL, NULL); if(ret != CL_SUCCESS) { @@ -448,68 +449,85 @@ uint32_t getNumPlatforms() std::vector<GpuContext> getAMDDevices(int index) { std::vector<GpuContext> ctxVec; - cl_platform_id * platforms = NULL; + std::vector<cl_platform_id> platforms; + std::vector<cl_device_id> device_list; + cl_int clStatus; cl_uint num_devices; - cl_device_id *device_list = NULL; - uint32_t numPlatforms = getNumPlatforms(); - if(numPlatforms) + if(numPlatforms == 0) + return ctxVec; + + platforms.resize(numPlatforms); + if((clStatus = clGetPlatformIDs(numPlatforms, platforms.data(), NULL)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"WARNING: %s when calling clGetPlatformIDs for platform information.", err_to_str(clStatus)); + return ctxVec; + } + + if((clStatus = clGetDeviceIDs( platforms[index], CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceIDs for of devices.", err_to_str(clStatus)); + return ctxVec; + } + + device_list.resize(num_devices); + if((clStatus = clGetDeviceIDs( platforms[index], CL_DEVICE_TYPE_GPU, num_devices, device_list.data(), NULL)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceIDs for device information.", err_to_str(clStatus)); + return ctxVec; + } + + for (size_t k = 0; k < num_devices; k++) { - platforms = (cl_platform_id *) malloc(sizeof(cl_platform_id) * numPlatforms); - clStatus = clGetPlatformIDs(numPlatforms, platforms, NULL); - if(clStatus == CL_SUCCESS) + std::vector<char> devVendorVec(1024); + if((clStatus = clGetDeviceInfo(device_list[k], CL_DEVICE_VENDOR, devVendorVec.size(), devVendorVec.data(), NULL)) != CL_SUCCESS) { - clStatus = clGetDeviceIDs( platforms[index], CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices); - if(clStatus == CL_SUCCESS) + printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get the device vendor name for device %u.", err_to_str(clStatus), k); + continue; + } + + std::string devVendor(devVendorVec.data()); + if( devVendor.find("Advanced Micro Devices") != std::string::npos || devVendor.find("AMD") != std::string::npos) + { + GpuContext ctx; + std::vector<char> devNameVec(1024); + size_t maxMem; + + if((clStatus = clGetDeviceInfo(device_list[k], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(int), &(ctx.computeUnits), NULL)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get CL_DEVICE_MAX_COMPUTE_UNITS for device %u.", err_to_str(clStatus), k); + continue; + } + + if((clStatus = clGetDeviceInfo(device_list[k], CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &(maxMem), NULL)) != CL_SUCCESS) { - device_list = (cl_device_id *) malloc(sizeof(cl_device_id)*num_devices); - clStatus = clGetDeviceIDs( platforms[index], CL_DEVICE_TYPE_GPU, num_devices, device_list, NULL); - if(clStatus == CL_SUCCESS) - { - for (int k = 0; k < num_devices; k++) - { - cl_int clError; - std::vector<char> devVendorVec(1024); - clError = clGetDeviceInfo(device_list[k], CL_DEVICE_VENDOR, devVendorVec.size(), devVendorVec.data(), NULL); - if(clStatus == CL_SUCCESS) - { - std::string devVendor(devVendorVec.data()); - if( devVendor.find("Advanced Micro Devices") != std::string::npos) - { - GpuContext ctx; - ctx.deviceIdx = k; - clError = clGetDeviceInfo(device_list[k], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(int), &(ctx.computeUnits), NULL); - size_t maxMem; - clError = clGetDeviceInfo(device_list[k], CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &(maxMem), NULL); - clError = clGetDeviceInfo(device_list[k], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(size_t), &(ctx.freeMem), NULL); - // if environment variable GPU_SINGLE_ALLOC_PERCENT is not set we can not allocate the full memory - ctx.freeMem = std::min(ctx.freeMem, maxMem); - std::vector<char> devNameVec(1024); - clError = clGetDeviceInfo(device_list[k], CL_DEVICE_NAME, devNameVec.size(), devNameVec.data(), NULL); - ctx.name = std::string(devNameVec.data()); - printer::inst()->print_msg(L0,"Found OpenCL GPU %s.",ctx.name.c_str()); - ctx.DeviceID = device_list[k]; - ctxVec.push_back(ctx); - } - } - else - printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get the device vendor name.", err_to_str(clStatus)); - } - } - else - printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceIDs for device information.", err_to_str(clStatus)); - free(device_list); + printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get CL_DEVICE_MAX_MEM_ALLOC_SIZE for device %u.", err_to_str(clStatus), k); + continue; } - else - printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceIDs for of devices.", err_to_str(clStatus)); + + if((clStatus = clGetDeviceInfo(device_list[k], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(size_t), &(ctx.freeMem), NULL)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get CL_DEVICE_GLOBAL_MEM_SIZE for device %u.", err_to_str(clStatus), k); + continue; + } + + if((clStatus = clGetDeviceInfo(device_list[k], CL_DEVICE_NAME, devNameVec.size(), devNameVec.data(), NULL)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get CL_DEVICE_NAME for device %u.", err_to_str(clStatus), k); + continue; + } + printer::inst()->print_msg(L0,"Found OpenCL GPU %s.",ctx.name.c_str()); + + // if environment variable GPU_SINGLE_ALLOC_PERCENT is not set we can not allocate the full memory + ctx.deviceIdx = k; + ctx.freeMem = std::min(ctx.freeMem, maxMem); + ctx.name = std::string(devNameVec.data()); + ctx.DeviceID = device_list[k]; + ctxVec.push_back(ctx); } - else - printer::inst()->print_msg(L1,"WARNING: %s when calling clGetPlatformIDs for platform information.", err_to_str(clStatus)); } - - free(platforms); return ctxVec; } @@ -541,7 +559,7 @@ int getAMDPlatformIdx() clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, infoSize, platformNameVec.data(), NULL); std::string platformName(platformNameVec.data()); - if( platformName.find("Advanced Micro Devices") != std::string::npos) + if( platformName.find("Advanced Micro Devices") != std::string::npos || platformName.find("Apple") != std::string::npos) { platformIndex = i; printer::inst()->print_msg(L0,"Found AMD platform index id = %i, name = %s",i , platformName.c_str()); diff --git a/xmrstak/backend/amd/amd_gpu/gpu.hpp b/xmrstak/backend/amd/amd_gpu/gpu.hpp index 123de01..c17bac1 100644 --- a/xmrstak/backend/amd/amd_gpu/gpu.hpp +++ b/xmrstak/backend/amd/amd_gpu/gpu.hpp @@ -9,6 +9,7 @@ #endif #include <stdint.h> +#include <string> #include <vector> #define ERR_SUCCESS (0) @@ -23,6 +24,7 @@ struct GpuContext size_t deviceIdx; size_t rawIntensity; size_t workSize; + int stridedIndex; /*Output vars*/ cl_device_id DeviceID; diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl index 966199b..255fcbb 100644 --- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl +++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl @@ -411,7 +411,11 @@ void AESExpandKey256(uint *keybuf) } } -#define IDX(x) (x) +#if(STRIDED_INDEX==0) +# define IDX(x) (x) +#else +# define IDX(x) ((x) * (Threads)) +#endif __attribute__((reqd_work_group_size(WORKSIZE, 8, 1))) __kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ulong *states, ulong Threads) @@ -440,7 +444,12 @@ __kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ul if(gIdx < Threads) { states += 25 * gIdx; + +#if(STRIDED_INDEX==0) Scratchpad += gIdx * (ITERATIONS >> 2); +#else + Scratchpad += gIdx; +#endif ((ulong8 *)State)[0] = vload8(0, input); State[8] = input[8]; @@ -519,7 +528,11 @@ __kernel void cn1(__global uint4 *Scratchpad, __global ulong *states, ulong Thre if(gIdx < Threads) { states += 25 * gIdx; +#if(STRIDED_INDEX==0) Scratchpad += gIdx * (ITERATIONS >> 2); +#else + Scratchpad += gIdx; +#endif a[0] = states[0] ^ states[4]; b[0] = states[2] ^ states[6]; @@ -588,7 +601,11 @@ __kernel void cn2(__global uint4 *Scratchpad, __global ulong *states, __global u if(gIdx < Threads) { states += 25 * gIdx; +#if(STRIDED_INDEX==0) Scratchpad += gIdx * (ITERATIONS >> 2); +#else + Scratchpad += gIdx; +#endif #if defined(__Tahiti__) || defined(__Pitcairn__) diff --git a/xmrstak/backend/amd/amd_gpu/opencl/wolf-aes.cl b/xmrstak/backend/amd/amd_gpu/opencl/wolf-aes.cl index 996944b..81e1644 100644 --- a/xmrstak/backend/amd/amd_gpu/opencl/wolf-aes.cl +++ b/xmrstak/backend/amd/amd_gpu/opencl/wolf-aes.cl @@ -74,15 +74,29 @@ static const __constant uint AES0_C[256] = #define BYTE(x, y) (amd_bfe((x), (y) << 3U, 8U)) -uint4 AES_Round(const __local uint *AES0, const __local uint *AES1, const __local uint *AES2, const __local uint *AES3, const uint4 X, const uint4 key) +uint4 AES_Round(const __local uint *AES0, const __local uint *AES1, const __local uint *AES2, const __local uint *AES3, const uint4 X, uint4 key) { - uint4 Y; - Y.s0 = AES0[BYTE(X.s0, 0)] ^ AES1[BYTE(X.s1, 1)] ^ AES2[BYTE(X.s2, 2)] ^ AES3[BYTE(X.s3, 3)]; - Y.s1 = AES0[BYTE(X.s1, 0)] ^ AES1[BYTE(X.s2, 1)] ^ AES2[BYTE(X.s3, 2)] ^ AES3[BYTE(X.s0, 3)]; - Y.s2 = AES0[BYTE(X.s2, 0)] ^ AES1[BYTE(X.s3, 1)] ^ AES2[BYTE(X.s0, 2)] ^ AES3[BYTE(X.s1, 3)]; - Y.s3 = AES0[BYTE(X.s3, 0)] ^ AES1[BYTE(X.s0, 1)] ^ AES2[BYTE(X.s1, 2)] ^ AES3[BYTE(X.s2, 3)]; - Y ^= key; - return(Y); + key.s0 ^= AES0[BYTE(X.s0, 0)]; + key.s1 ^= AES0[BYTE(X.s1, 0)]; + key.s2 ^= AES0[BYTE(X.s2, 0)]; + key.s3 ^= AES0[BYTE(X.s3, 0)]; + + key.s0 ^= AES2[BYTE(X.s2, 2)]; + key.s1 ^= AES2[BYTE(X.s3, 2)]; + key.s2 ^= AES2[BYTE(X.s0, 2)]; + key.s3 ^= AES2[BYTE(X.s1, 2)]; + + key.s0 ^= AES1[BYTE(X.s1, 1)]; + key.s1 ^= AES1[BYTE(X.s2, 1)]; + key.s2 ^= AES1[BYTE(X.s3, 1)]; + key.s3 ^= AES1[BYTE(X.s0, 1)]; + + key.s0 ^= AES3[BYTE(X.s3, 3)]; + key.s1 ^= AES3[BYTE(X.s0, 3)]; + key.s2 ^= AES3[BYTE(X.s1, 3)]; + key.s3 ^= AES3[BYTE(X.s2, 3)]; + + return key; } #endif diff --git a/xmrstak/backend/amd/autoAdjust.hpp b/xmrstak/backend/amd/autoAdjust.hpp index 0b91212..0bc5239 100644 --- a/xmrstak/backend/amd/autoAdjust.hpp +++ b/xmrstak/backend/amd/autoAdjust.hpp @@ -123,7 +123,7 @@ private: // set 8 threads per block (this is a good value for the most gpus) conf += std::string(" { \"index\" : ") + std::to_string(ctx.deviceIdx) + ",\n" + " \"intensity\" : " + std::to_string(intensity) + ", \"worksize\" : " + std::to_string(8) + ",\n" + - " \"affine_to_cpu\" : false, \n" + " \"affine_to_cpu\" : false, \"strided_index\" : true\n" " },\n"; ++i; } diff --git a/xmrstak/backend/amd/config.tpl b/xmrstak/backend/amd/config.tpl index a93859c..af662f8 100644 --- a/xmrstak/backend/amd/config.tpl +++ b/xmrstak/backend/amd/config.tpl @@ -5,9 +5,12 @@ R"===( * intensity - Number of parallel GPU threads (nothing to do with CPU threads) * worksize - Number of local GPU threads (nothing to do with CPU threads) * affine_to_cpu - This will affine the thread to a CPU. This can make a GPU miner play along nicer with a CPU miner. + * strided_index - switch memory pattern used for the scratch pad memory + * true = use 16byte contiguous memory per thread, the next memory block has offset of intensity blocks + * false = use a contiguous block of memory per thread * "gpu_threads_conf" : * [ - * { "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false }, + * { "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false, "strided_index" : true }, * ], */ diff --git a/xmrstak/backend/amd/jconf.cpp b/xmrstak/backend/amd/jconf.cpp index 0617aeb..07afb19 100644 --- a/xmrstak/backend/amd/jconf.cpp +++ b/xmrstak/backend/amd/jconf.cpp @@ -103,13 +103,14 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg) if(!oThdConf.IsObject()) return false; - const Value *idx, *intensity, *w_size, *aff; + const Value *idx, *intensity, *w_size, *aff, *stridedIndex; idx = GetObjectMember(oThdConf, "index"); intensity = GetObjectMember(oThdConf, "intensity"); w_size = GetObjectMember(oThdConf, "worksize"); aff = GetObjectMember(oThdConf, "affine_to_cpu"); + stridedIndex = GetObjectMember(oThdConf, "strided_index"); - if(idx == nullptr || intensity == nullptr || w_size == nullptr || aff == nullptr) + if(idx == nullptr || intensity == nullptr || w_size == nullptr || aff == nullptr || stridedIndex == nullptr) return false; if(!idx->IsUint64() || !intensity->IsUint64() || !w_size->IsUint64()) @@ -118,9 +119,13 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg) if(!aff->IsUint64() && !aff->IsBool()) return false; + if(!stridedIndex->IsBool()) + return false; + cfg.index = idx->GetUint64(); cfg.intensity = intensity->GetUint64(); cfg.w_size = w_size->GetUint64(); + cfg.stridedIndex = stridedIndex->GetBool(); if(aff->IsNumber()) cfg.cpu_aff = aff->GetInt64(); diff --git a/xmrstak/backend/amd/jconf.hpp b/xmrstak/backend/amd/jconf.hpp index da024a4..ee1882a 100644 --- a/xmrstak/backend/amd/jconf.hpp +++ b/xmrstak/backend/amd/jconf.hpp @@ -26,6 +26,7 @@ public: size_t intensity; size_t w_size; long long cpu_aff; + bool stridedIndex; }; size_t GetThreadCount(); diff --git a/xmrstak/backend/amd/minethd.cpp b/xmrstak/backend/amd/minethd.cpp index c1399e0..103688f 100644 --- a/xmrstak/backend/amd/minethd.cpp +++ b/xmrstak/backend/amd/minethd.cpp @@ -96,6 +96,7 @@ bool minethd::init_gpus() vGpuData[i].deviceIdx = cfg.index; vGpuData[i].rawIntensity = cfg.intensity; vGpuData[i].workSize = cfg.w_size; + vGpuData[i].stridedIndex = cfg.stridedIndex; } return InitOpenCL(vGpuData.data(), n, jconf::inst()->GetPlatformIdx()) == ERR_SUCCESS; |