summaryrefslogtreecommitdiffstats
path: root/xmrstak/backend/amd
diff options
context:
space:
mode:
Diffstat (limited to 'xmrstak/backend/amd')
-rw-r--r--xmrstak/backend/amd/amd_gpu/gpu.cpp126
-rw-r--r--xmrstak/backend/amd/amd_gpu/gpu.hpp2
-rw-r--r--xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl19
-rw-r--r--xmrstak/backend/amd/amd_gpu/opencl/wolf-aes.cl30
-rw-r--r--xmrstak/backend/amd/autoAdjust.hpp2
-rw-r--r--xmrstak/backend/amd/config.tpl5
-rw-r--r--xmrstak/backend/amd/jconf.cpp9
-rw-r--r--xmrstak/backend/amd/jconf.hpp1
-rw-r--r--xmrstak/backend/amd/minethd.cpp1
9 files changed, 128 insertions, 67 deletions
diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp
index 15b8457..d9bc962 100644
--- a/xmrstak/backend/amd/amd_gpu/gpu.cpp
+++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp
@@ -332,7 +332,8 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
char options[256];
snprintf(options, sizeof(options),
- "-DITERATIONS=%d -DMASK=%d -DWORKSIZE=%llu", hasIterations, threadMemMask, int_port(ctx->workSize));
+ "-DITERATIONS=%d -DMASK=%d -DWORKSIZE=%llu -DSTRIDED_INDEX=%d",
+ hasIterations, threadMemMask, int_port(ctx->workSize), ctx->stridedIndex ? 1 : 0);
ret = clBuildProgram(ctx->Program, 1, &ctx->DeviceID, options, NULL, NULL);
if(ret != CL_SUCCESS)
{
@@ -448,68 +449,85 @@ uint32_t getNumPlatforms()
std::vector<GpuContext> getAMDDevices(int index)
{
std::vector<GpuContext> ctxVec;
- cl_platform_id * platforms = NULL;
+ std::vector<cl_platform_id> platforms;
+ std::vector<cl_device_id> device_list;
+
cl_int clStatus;
cl_uint num_devices;
- cl_device_id *device_list = NULL;
-
uint32_t numPlatforms = getNumPlatforms();
- if(numPlatforms)
+ if(numPlatforms == 0)
+ return ctxVec;
+
+ platforms.resize(numPlatforms);
+ if((clStatus = clGetPlatformIDs(numPlatforms, platforms.data(), NULL)) != CL_SUCCESS)
+ {
+ printer::inst()->print_msg(L1,"WARNING: %s when calling clGetPlatformIDs for platform information.", err_to_str(clStatus));
+ return ctxVec;
+ }
+
+ if((clStatus = clGetDeviceIDs( platforms[index], CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices)) != CL_SUCCESS)
+ {
+ printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceIDs for of devices.", err_to_str(clStatus));
+ return ctxVec;
+ }
+
+ device_list.resize(num_devices);
+ if((clStatus = clGetDeviceIDs( platforms[index], CL_DEVICE_TYPE_GPU, num_devices, device_list.data(), NULL)) != CL_SUCCESS)
+ {
+ printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceIDs for device information.", err_to_str(clStatus));
+ return ctxVec;
+ }
+
+ for (size_t k = 0; k < num_devices; k++)
{
- platforms = (cl_platform_id *) malloc(sizeof(cl_platform_id) * numPlatforms);
- clStatus = clGetPlatformIDs(numPlatforms, platforms, NULL);
- if(clStatus == CL_SUCCESS)
+ std::vector<char> devVendorVec(1024);
+ if((clStatus = clGetDeviceInfo(device_list[k], CL_DEVICE_VENDOR, devVendorVec.size(), devVendorVec.data(), NULL)) != CL_SUCCESS)
{
- clStatus = clGetDeviceIDs( platforms[index], CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices);
- if(clStatus == CL_SUCCESS)
+ printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get the device vendor name for device %u.", err_to_str(clStatus), k);
+ continue;
+ }
+
+ std::string devVendor(devVendorVec.data());
+ if( devVendor.find("Advanced Micro Devices") != std::string::npos || devVendor.find("AMD") != std::string::npos)
+ {
+ GpuContext ctx;
+ std::vector<char> devNameVec(1024);
+ size_t maxMem;
+
+ if((clStatus = clGetDeviceInfo(device_list[k], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(int), &(ctx.computeUnits), NULL)) != CL_SUCCESS)
+ {
+ printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get CL_DEVICE_MAX_COMPUTE_UNITS for device %u.", err_to_str(clStatus), k);
+ continue;
+ }
+
+ if((clStatus = clGetDeviceInfo(device_list[k], CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &(maxMem), NULL)) != CL_SUCCESS)
{
- device_list = (cl_device_id *) malloc(sizeof(cl_device_id)*num_devices);
- clStatus = clGetDeviceIDs( platforms[index], CL_DEVICE_TYPE_GPU, num_devices, device_list, NULL);
- if(clStatus == CL_SUCCESS)
- {
- for (int k = 0; k < num_devices; k++)
- {
- cl_int clError;
- std::vector<char> devVendorVec(1024);
- clError = clGetDeviceInfo(device_list[k], CL_DEVICE_VENDOR, devVendorVec.size(), devVendorVec.data(), NULL);
- if(clStatus == CL_SUCCESS)
- {
- std::string devVendor(devVendorVec.data());
- if( devVendor.find("Advanced Micro Devices") != std::string::npos)
- {
- GpuContext ctx;
- ctx.deviceIdx = k;
- clError = clGetDeviceInfo(device_list[k], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(int), &(ctx.computeUnits), NULL);
- size_t maxMem;
- clError = clGetDeviceInfo(device_list[k], CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &(maxMem), NULL);
- clError = clGetDeviceInfo(device_list[k], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(size_t), &(ctx.freeMem), NULL);
- // if environment variable GPU_SINGLE_ALLOC_PERCENT is not set we can not allocate the full memory
- ctx.freeMem = std::min(ctx.freeMem, maxMem);
- std::vector<char> devNameVec(1024);
- clError = clGetDeviceInfo(device_list[k], CL_DEVICE_NAME, devNameVec.size(), devNameVec.data(), NULL);
- ctx.name = std::string(devNameVec.data());
- printer::inst()->print_msg(L0,"Found OpenCL GPU %s.",ctx.name.c_str());
- ctx.DeviceID = device_list[k];
- ctxVec.push_back(ctx);
- }
- }
- else
- printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get the device vendor name.", err_to_str(clStatus));
- }
- }
- else
- printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceIDs for device information.", err_to_str(clStatus));
- free(device_list);
+ printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get CL_DEVICE_MAX_MEM_ALLOC_SIZE for device %u.", err_to_str(clStatus), k);
+ continue;
}
- else
- printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceIDs for of devices.", err_to_str(clStatus));
+
+ if((clStatus = clGetDeviceInfo(device_list[k], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(size_t), &(ctx.freeMem), NULL)) != CL_SUCCESS)
+ {
+ printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get CL_DEVICE_GLOBAL_MEM_SIZE for device %u.", err_to_str(clStatus), k);
+ continue;
+ }
+
+ if((clStatus = clGetDeviceInfo(device_list[k], CL_DEVICE_NAME, devNameVec.size(), devNameVec.data(), NULL)) != CL_SUCCESS)
+ {
+ printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get CL_DEVICE_NAME for device %u.", err_to_str(clStatus), k);
+ continue;
+ }
+ printer::inst()->print_msg(L0,"Found OpenCL GPU %s.",ctx.name.c_str());
+
+ // if environment variable GPU_SINGLE_ALLOC_PERCENT is not set we can not allocate the full memory
+ ctx.deviceIdx = k;
+ ctx.freeMem = std::min(ctx.freeMem, maxMem);
+ ctx.name = std::string(devNameVec.data());
+ ctx.DeviceID = device_list[k];
+ ctxVec.push_back(ctx);
}
- else
- printer::inst()->print_msg(L1,"WARNING: %s when calling clGetPlatformIDs for platform information.", err_to_str(clStatus));
}
-
- free(platforms);
return ctxVec;
}
@@ -541,7 +559,7 @@ int getAMDPlatformIdx()
clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, infoSize, platformNameVec.data(), NULL);
std::string platformName(platformNameVec.data());
- if( platformName.find("Advanced Micro Devices") != std::string::npos)
+ if( platformName.find("Advanced Micro Devices") != std::string::npos || platformName.find("Apple") != std::string::npos)
{
platformIndex = i;
printer::inst()->print_msg(L0,"Found AMD platform index id = %i, name = %s",i , platformName.c_str());
diff --git a/xmrstak/backend/amd/amd_gpu/gpu.hpp b/xmrstak/backend/amd/amd_gpu/gpu.hpp
index 123de01..c17bac1 100644
--- a/xmrstak/backend/amd/amd_gpu/gpu.hpp
+++ b/xmrstak/backend/amd/amd_gpu/gpu.hpp
@@ -9,6 +9,7 @@
#endif
#include <stdint.h>
+#include <string>
#include <vector>
#define ERR_SUCCESS (0)
@@ -23,6 +24,7 @@ struct GpuContext
size_t deviceIdx;
size_t rawIntensity;
size_t workSize;
+ int stridedIndex;
/*Output vars*/
cl_device_id DeviceID;
diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
index 966199b..255fcbb 100644
--- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
+++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
@@ -411,7 +411,11 @@ void AESExpandKey256(uint *keybuf)
}
}
-#define IDX(x) (x)
+#if(STRIDED_INDEX==0)
+# define IDX(x) (x)
+#else
+# define IDX(x) ((x) * (Threads))
+#endif
__attribute__((reqd_work_group_size(WORKSIZE, 8, 1)))
__kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ulong *states, ulong Threads)
@@ -440,7 +444,12 @@ __kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ul
if(gIdx < Threads)
{
states += 25 * gIdx;
+
+#if(STRIDED_INDEX==0)
Scratchpad += gIdx * (ITERATIONS >> 2);
+#else
+ Scratchpad += gIdx;
+#endif
((ulong8 *)State)[0] = vload8(0, input);
State[8] = input[8];
@@ -519,7 +528,11 @@ __kernel void cn1(__global uint4 *Scratchpad, __global ulong *states, ulong Thre
if(gIdx < Threads)
{
states += 25 * gIdx;
+#if(STRIDED_INDEX==0)
Scratchpad += gIdx * (ITERATIONS >> 2);
+#else
+ Scratchpad += gIdx;
+#endif
a[0] = states[0] ^ states[4];
b[0] = states[2] ^ states[6];
@@ -588,7 +601,11 @@ __kernel void cn2(__global uint4 *Scratchpad, __global ulong *states, __global u
if(gIdx < Threads)
{
states += 25 * gIdx;
+#if(STRIDED_INDEX==0)
Scratchpad += gIdx * (ITERATIONS >> 2);
+#else
+ Scratchpad += gIdx;
+#endif
#if defined(__Tahiti__) || defined(__Pitcairn__)
diff --git a/xmrstak/backend/amd/amd_gpu/opencl/wolf-aes.cl b/xmrstak/backend/amd/amd_gpu/opencl/wolf-aes.cl
index 996944b..81e1644 100644
--- a/xmrstak/backend/amd/amd_gpu/opencl/wolf-aes.cl
+++ b/xmrstak/backend/amd/amd_gpu/opencl/wolf-aes.cl
@@ -74,15 +74,29 @@ static const __constant uint AES0_C[256] =
#define BYTE(x, y) (amd_bfe((x), (y) << 3U, 8U))
-uint4 AES_Round(const __local uint *AES0, const __local uint *AES1, const __local uint *AES2, const __local uint *AES3, const uint4 X, const uint4 key)
+uint4 AES_Round(const __local uint *AES0, const __local uint *AES1, const __local uint *AES2, const __local uint *AES3, const uint4 X, uint4 key)
{
- uint4 Y;
- Y.s0 = AES0[BYTE(X.s0, 0)] ^ AES1[BYTE(X.s1, 1)] ^ AES2[BYTE(X.s2, 2)] ^ AES3[BYTE(X.s3, 3)];
- Y.s1 = AES0[BYTE(X.s1, 0)] ^ AES1[BYTE(X.s2, 1)] ^ AES2[BYTE(X.s3, 2)] ^ AES3[BYTE(X.s0, 3)];
- Y.s2 = AES0[BYTE(X.s2, 0)] ^ AES1[BYTE(X.s3, 1)] ^ AES2[BYTE(X.s0, 2)] ^ AES3[BYTE(X.s1, 3)];
- Y.s3 = AES0[BYTE(X.s3, 0)] ^ AES1[BYTE(X.s0, 1)] ^ AES2[BYTE(X.s1, 2)] ^ AES3[BYTE(X.s2, 3)];
- Y ^= key;
- return(Y);
+ key.s0 ^= AES0[BYTE(X.s0, 0)];
+ key.s1 ^= AES0[BYTE(X.s1, 0)];
+ key.s2 ^= AES0[BYTE(X.s2, 0)];
+ key.s3 ^= AES0[BYTE(X.s3, 0)];
+
+ key.s0 ^= AES2[BYTE(X.s2, 2)];
+ key.s1 ^= AES2[BYTE(X.s3, 2)];
+ key.s2 ^= AES2[BYTE(X.s0, 2)];
+ key.s3 ^= AES2[BYTE(X.s1, 2)];
+
+ key.s0 ^= AES1[BYTE(X.s1, 1)];
+ key.s1 ^= AES1[BYTE(X.s2, 1)];
+ key.s2 ^= AES1[BYTE(X.s3, 1)];
+ key.s3 ^= AES1[BYTE(X.s0, 1)];
+
+ key.s0 ^= AES3[BYTE(X.s3, 3)];
+ key.s1 ^= AES3[BYTE(X.s0, 3)];
+ key.s2 ^= AES3[BYTE(X.s1, 3)];
+ key.s3 ^= AES3[BYTE(X.s2, 3)];
+
+ return key;
}
#endif
diff --git a/xmrstak/backend/amd/autoAdjust.hpp b/xmrstak/backend/amd/autoAdjust.hpp
index 0b91212..0bc5239 100644
--- a/xmrstak/backend/amd/autoAdjust.hpp
+++ b/xmrstak/backend/amd/autoAdjust.hpp
@@ -123,7 +123,7 @@ private:
// set 8 threads per block (this is a good value for the most gpus)
conf += std::string(" { \"index\" : ") + std::to_string(ctx.deviceIdx) + ",\n" +
" \"intensity\" : " + std::to_string(intensity) + ", \"worksize\" : " + std::to_string(8) + ",\n" +
- " \"affine_to_cpu\" : false, \n"
+ " \"affine_to_cpu\" : false, \"strided_index\" : true\n"
" },\n";
++i;
}
diff --git a/xmrstak/backend/amd/config.tpl b/xmrstak/backend/amd/config.tpl
index a93859c..af662f8 100644
--- a/xmrstak/backend/amd/config.tpl
+++ b/xmrstak/backend/amd/config.tpl
@@ -5,9 +5,12 @@ R"===(
* intensity - Number of parallel GPU threads (nothing to do with CPU threads)
* worksize - Number of local GPU threads (nothing to do with CPU threads)
* affine_to_cpu - This will affine the thread to a CPU. This can make a GPU miner play along nicer with a CPU miner.
+ * strided_index - switch memory pattern used for the scratch pad memory
+ * true = use 16byte contiguous memory per thread, the next memory block has offset of intensity blocks
+ * false = use a contiguous block of memory per thread
* "gpu_threads_conf" :
* [
- * { "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false },
+ * { "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false, "strided_index" : true },
* ],
*/
diff --git a/xmrstak/backend/amd/jconf.cpp b/xmrstak/backend/amd/jconf.cpp
index 0617aeb..07afb19 100644
--- a/xmrstak/backend/amd/jconf.cpp
+++ b/xmrstak/backend/amd/jconf.cpp
@@ -103,13 +103,14 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg)
if(!oThdConf.IsObject())
return false;
- const Value *idx, *intensity, *w_size, *aff;
+ const Value *idx, *intensity, *w_size, *aff, *stridedIndex;
idx = GetObjectMember(oThdConf, "index");
intensity = GetObjectMember(oThdConf, "intensity");
w_size = GetObjectMember(oThdConf, "worksize");
aff = GetObjectMember(oThdConf, "affine_to_cpu");
+ stridedIndex = GetObjectMember(oThdConf, "strided_index");
- if(idx == nullptr || intensity == nullptr || w_size == nullptr || aff == nullptr)
+ if(idx == nullptr || intensity == nullptr || w_size == nullptr || aff == nullptr || stridedIndex == nullptr)
return false;
if(!idx->IsUint64() || !intensity->IsUint64() || !w_size->IsUint64())
@@ -118,9 +119,13 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg)
if(!aff->IsUint64() && !aff->IsBool())
return false;
+ if(!stridedIndex->IsBool())
+ return false;
+
cfg.index = idx->GetUint64();
cfg.intensity = intensity->GetUint64();
cfg.w_size = w_size->GetUint64();
+ cfg.stridedIndex = stridedIndex->GetBool();
if(aff->IsNumber())
cfg.cpu_aff = aff->GetInt64();
diff --git a/xmrstak/backend/amd/jconf.hpp b/xmrstak/backend/amd/jconf.hpp
index da024a4..ee1882a 100644
--- a/xmrstak/backend/amd/jconf.hpp
+++ b/xmrstak/backend/amd/jconf.hpp
@@ -26,6 +26,7 @@ public:
size_t intensity;
size_t w_size;
long long cpu_aff;
+ bool stridedIndex;
};
size_t GetThreadCount();
diff --git a/xmrstak/backend/amd/minethd.cpp b/xmrstak/backend/amd/minethd.cpp
index c1399e0..103688f 100644
--- a/xmrstak/backend/amd/minethd.cpp
+++ b/xmrstak/backend/amd/minethd.cpp
@@ -96,6 +96,7 @@ bool minethd::init_gpus()
vGpuData[i].deviceIdx = cfg.index;
vGpuData[i].rawIntensity = cfg.intensity;
vGpuData[i].workSize = cfg.w_size;
+ vGpuData[i].stridedIndex = cfg.stridedIndex;
}
return InitOpenCL(vGpuData.data(), n, jconf::inst()->GetPlatformIdx()) == ERR_SUCCESS;
OpenPOWER on IntegriCloud