4 files changed, 114 insertions, 63 deletions
diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp
index 15b8457..d9bc962 100644
--- a/xmrstak/backend/amd/amd_gpu/gpu.cpp
+++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp
@@ -332,7 +332,8 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 
 	char options[256];
 	snprintf(options, sizeof(options), 
-		"-DITERATIONS=%d -DMASK=%d -DWORKSIZE=%llu", hasIterations, threadMemMask, int_port(ctx->workSize));
+		"-DITERATIONS=%d -DMASK=%d -DWORKSIZE=%llu -DSTRIDED_INDEX=%d", 
+		hasIterations, threadMemMask, int_port(ctx->workSize), ctx->stridedIndex ? 1 : 0);
 	ret = clBuildProgram(ctx->Program, 1, &ctx->DeviceID, options, NULL, NULL);
 	if(ret != CL_SUCCESS)
 	{
@@ -448,68 +449,85 @@ uint32_t getNumPlatforms()
 std::vector<GpuContext> getAMDDevices(int index)
 {
 	std::vector<GpuContext> ctxVec;
-	cl_platform_id * platforms = NULL;
+	std::vector<cl_platform_id> platforms;
+	std::vector<cl_device_id> device_list;
+
 	cl_int clStatus;
 	cl_uint num_devices;
-	cl_device_id *device_list = NULL;
-
 	uint32_t numPlatforms = getNumPlatforms();
 
-	if(numPlatforms)
+	if(numPlatforms == 0)
+		return ctxVec;
+
+	platforms.resize(numPlatforms);
+	if((clStatus = clGetPlatformIDs(numPlatforms, platforms.data(), NULL)) != CL_SUCCESS)
+	{
+		printer::inst()->print_msg(L1,"WARNING: %s when calling clGetPlatformIDs for platform information.", err_to_str(clStatus));
+		return ctxVec;
+	}
+
+	if((clStatus = clGetDeviceIDs( platforms[index], CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices)) != CL_SUCCESS)
+	{
+		printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceIDs for of devices.", err_to_str(clStatus));
+		return ctxVec;
+	}
+
+	device_list.resize(num_devices);
+	if((clStatus = clGetDeviceIDs( platforms[index], CL_DEVICE_TYPE_GPU, num_devices, device_list.data(), NULL)) != CL_SUCCESS)
+	{
+		printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceIDs for device information.", err_to_str(clStatus));
+		return ctxVec;
+	}
+
+	for (size_t k = 0; k < num_devices; k++)
 	{
-		platforms = (cl_platform_id *) malloc(sizeof(cl_platform_id) * numPlatforms);
-		clStatus = clGetPlatformIDs(numPlatforms, platforms, NULL);
-		if(clStatus == CL_SUCCESS)
+		std::vector<char> devVendorVec(1024);
+		if((clStatus = clGetDeviceInfo(device_list[k], CL_DEVICE_VENDOR, devVendorVec.size(), devVendorVec.data(), NULL)) != CL_SUCCESS)
 		{
-			clStatus = clGetDeviceIDs( platforms[index], CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices);
-			if(clStatus == CL_SUCCESS)
+			printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get the device vendor name for device %u.", err_to_str(clStatus), k);
+			continue;
+		}
+		
+		std::string devVendor(devVendorVec.data());
+		if( devVendor.find("Advanced Micro Devices") != std::string::npos || devVendor.find("AMD") != std::string::npos)
+		{
+			GpuContext ctx;
+			std::vector<char> devNameVec(1024);
+			size_t maxMem;
+
+			if((clStatus = clGetDeviceInfo(device_list[k], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(int), &(ctx.computeUnits), NULL)) != CL_SUCCESS)
+			{
+				printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get CL_DEVICE_MAX_COMPUTE_UNITS for device %u.", err_to_str(clStatus), k);
+				continue;
+			}
+
+			if((clStatus = clGetDeviceInfo(device_list[k], CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &(maxMem), NULL)) != CL_SUCCESS)
 			{
-				device_list = (cl_device_id *) malloc(sizeof(cl_device_id)*num_devices);
-				clStatus = clGetDeviceIDs( platforms[index], CL_DEVICE_TYPE_GPU, num_devices, device_list, NULL);
-				if(clStatus == CL_SUCCESS)
-				{
-					for (int k = 0; k < num_devices; k++)
-					{
-						cl_int clError;
-						std::vector<char> devVendorVec(1024);
-						clError = clGetDeviceInfo(device_list[k], CL_DEVICE_VENDOR, devVendorVec.size(), devVendorVec.data(), NULL);
-						if(clStatus == CL_SUCCESS)
-						{
-							std::string devVendor(devVendorVec.data());
-							if( devVendor.find("Advanced Micro Devices") != std::string::npos)
-							{
-								GpuContext ctx;
-								ctx.deviceIdx = k;
-								clError = clGetDeviceInfo(device_list[k], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(int), &(ctx.computeUnits), NULL);
-								size_t maxMem;
-								clError = clGetDeviceInfo(device_list[k], CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &(maxMem), NULL);
-								clError = clGetDeviceInfo(device_list[k], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(size_t), &(ctx.freeMem), NULL);
-								// if environment variable GPU_SINGLE_ALLOC_PERCENT is not set we can not allocate the full memory
-								ctx.freeMem = std::min(ctx.freeMem, maxMem);
-								std::vector<char> devNameVec(1024);
-								clError = clGetDeviceInfo(device_list[k], CL_DEVICE_NAME, devNameVec.size(), devNameVec.data(), NULL);
-								ctx.name = std::string(devNameVec.data());
-								printer::inst()->print_msg(L0,"Found OpenCL GPU %s.",ctx.name.c_str());
-								ctx.DeviceID = device_list[k];
-								ctxVec.push_back(ctx);
-							}
-						}
-						else
-							printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get the device vendor name.", err_to_str(clStatus));
-					}
-				}
-				else
-					printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceIDs for device information.", err_to_str(clStatus));
-				free(device_list);
+				printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get CL_DEVICE_MAX_MEM_ALLOC_SIZE for device %u.", err_to_str(clStatus), k);
+				continue;
 			}
-			else
-				printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceIDs for of devices.", err_to_str(clStatus));
+
+			if((clStatus = clGetDeviceInfo(device_list[k], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(size_t), &(ctx.freeMem), NULL)) != CL_SUCCESS)
+			{
+				printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get CL_DEVICE_GLOBAL_MEM_SIZE for device %u.", err_to_str(clStatus), k);
+				continue;
+			}
+
+			if((clStatus = clGetDeviceInfo(device_list[k], CL_DEVICE_NAME, devNameVec.size(), devNameVec.data(), NULL)) != CL_SUCCESS)
+			{
+				printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get CL_DEVICE_NAME for device %u.", err_to_str(clStatus), k);
+				continue;
+			}
+			printer::inst()->print_msg(L0,"Found OpenCL GPU %s.",ctx.name.c_str());
+
+			// if environment variable GPU_SINGLE_ALLOC_PERCENT is not set we can not allocate the full memory
+			ctx.deviceIdx = k;
+			ctx.freeMem = std::min(ctx.freeMem, maxMem);
+			ctx.name = std::string(devNameVec.data());
+			ctx.DeviceID = device_list[k];
+			ctxVec.push_back(ctx);
 		}
-		else
-			printer::inst()->print_msg(L1,"WARNING: %s when calling clGetPlatformIDs for platform information.", err_to_str(clStatus));
 	}
-	
-	free(platforms);
 
 	return ctxVec;
 }
@@ -541,7 +559,7 @@ int getAMDPlatformIdx()
 
 			clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, infoSize, platformNameVec.data(), NULL);
 			std::string platformName(platformNameVec.data());
-			if( platformName.find("Advanced Micro Devices") != std::string::npos)
+			if( platformName.find("Advanced Micro Devices") != std::string::npos || platformName.find("Apple") != std::string::npos)
 			{
 				platformIndex = i;
 				printer::inst()->print_msg(L0,"Found AMD platform index id = %i, name = %s",i , platformName.c_str());
diff --git a/xmrstak/backend/amd/amd_gpu/gpu.hpp b/xmrstak/backend/amd/amd_gpu/gpu.hpp
index 123de01..c17bac1 100644
--- a/xmrstak/backend/amd/amd_gpu/gpu.hpp
+++ b/xmrstak/backend/amd/amd_gpu/gpu.hpp
@@ -9,6 +9,7 @@
 #endif
 
 #include <stdint.h>
+#include <string>
 #include <vector>
 
 #define ERR_SUCCESS (0)
@@ -23,6 +24,7 @@ struct GpuContext
 	size_t deviceIdx;
 	size_t rawIntensity;
 	size_t workSize;
+	int stridedIndex;
 
 	/*Output vars*/
 	cl_device_id DeviceID;
diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
index 966199b..255fcbb 100644
--- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
+++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
@@ -411,7 +411,11 @@ void AESExpandKey256(uint *keybuf)
 	}
 }
 
-#define IDX(x)	(x)
+#if(STRIDED_INDEX==0)
+#   define IDX(x)	(x)
+#else
+#   define IDX(x)	((x) * (Threads))
+#endif
 
 __attribute__((reqd_work_group_size(WORKSIZE, 8, 1)))
 __kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ulong *states, ulong Threads)
@@ -440,7 +444,12 @@ __kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ul
 	if(gIdx < Threads)
 	{
 		states += 25 * gIdx;
+
+#if(STRIDED_INDEX==0)
 		Scratchpad += gIdx * (ITERATIONS >> 2);
+#else
+		Scratchpad += gIdx;
+#endif
 
 		((ulong8 *)State)[0] = vload8(0, input);
 		State[8] = input[8];
@@ -519,7 +528,11 @@ __kernel void cn1(__global uint4 *Scratchpad, __global ulong *states, ulong Thre
 	if(gIdx < Threads)
 	{
 		states += 25 * gIdx;
+#if(STRIDED_INDEX==0)
 		Scratchpad += gIdx * (ITERATIONS >> 2);
+#else
+		Scratchpad += gIdx;
+#endif
 
 		a[0] = states[0] ^ states[4];
 		b[0] = states[2] ^ states[6];
@@ -588,7 +601,11 @@ __kernel void cn2(__global uint4 *Scratchpad, __global ulong *states, __global u
 	if(gIdx < Threads)
 	{
 		states += 25 * gIdx;
+#if(STRIDED_INDEX==0)
 		Scratchpad += gIdx * (ITERATIONS >> 2);
+#else
+		Scratchpad += gIdx;
+#endif
 
 		#if defined(__Tahiti__) || defined(__Pitcairn__)
 
diff --git a/xmrstak/backend/amd/amd_gpu/opencl/wolf-aes.cl b/xmrstak/backend/amd/amd_gpu/opencl/wolf-aes.cl
index 996944b..81e1644 100644
--- a/xmrstak/backend/amd/amd_gpu/opencl/wolf-aes.cl
+++ b/xmrstak/backend/amd/amd_gpu/opencl/wolf-aes.cl
@@ -74,15 +74,29 @@ static const __constant uint AES0_C[256] =
 
 #define BYTE(x, y)	(amd_bfe((x), (y) << 3U, 8U))
 
-uint4 AES_Round(const __local uint *AES0, const __local uint *AES1, const __local uint *AES2, const __local uint *AES3, const uint4 X, const uint4 key)
+uint4 AES_Round(const __local uint *AES0, const __local uint *AES1, const __local uint *AES2, const __local uint *AES3, const uint4 X, uint4 key)
 {
-	uint4 Y;
-	Y.s0 = AES0[BYTE(X.s0, 0)] ^ AES1[BYTE(X.s1, 1)] ^ AES2[BYTE(X.s2, 2)] ^ AES3[BYTE(X.s3, 3)];
-    Y.s1 = AES0[BYTE(X.s1, 0)] ^ AES1[BYTE(X.s2, 1)] ^ AES2[BYTE(X.s3, 2)] ^ AES3[BYTE(X.s0, 3)];
-    Y.s2 = AES0[BYTE(X.s2, 0)] ^ AES1[BYTE(X.s3, 1)] ^ AES2[BYTE(X.s0, 2)] ^ AES3[BYTE(X.s1, 3)];
-    Y.s3 = AES0[BYTE(X.s3, 0)] ^ AES1[BYTE(X.s0, 1)] ^ AES2[BYTE(X.s1, 2)] ^ AES3[BYTE(X.s2, 3)];
-    Y ^= key;
-    return(Y);
+	key.s0 ^= AES0[BYTE(X.s0, 0)];
+    key.s1 ^= AES0[BYTE(X.s1, 0)];
+    key.s2 ^= AES0[BYTE(X.s2, 0)];
+    key.s3 ^= AES0[BYTE(X.s3, 0)];
+
+	key.s0 ^= AES2[BYTE(X.s2, 2)];
+    key.s1 ^= AES2[BYTE(X.s3, 2)];
+    key.s2 ^= AES2[BYTE(X.s0, 2)];
+    key.s3 ^= AES2[BYTE(X.s1, 2)];
+
+	key.s0 ^= AES1[BYTE(X.s1, 1)];
+    key.s1 ^= AES1[BYTE(X.s2, 1)];
+    key.s2 ^= AES1[BYTE(X.s3, 1)];
+    key.s3 ^= AES1[BYTE(X.s0, 1)];
+
+	key.s0 ^= AES3[BYTE(X.s3, 3)];
+    key.s1 ^= AES3[BYTE(X.s0, 3)];
+    key.s2 ^= AES3[BYTE(X.s1, 3)];
+    key.s3 ^= AES3[BYTE(X.s2, 3)];
+
+    return key;
 }
 
 #endif