Diffstat (limited to 'xmrstak/backend/amd')
-rw-r--r-- | xmrstak/backend/amd/amd_gpu/gpu.cpp               | 42
-rw-r--r-- | xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl | 41
2 files changed, 59 insertions, 24 deletions
diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp
index deb0fc7..2bd3688 100644
--- a/xmrstak/backend/amd/amd_gpu/gpu.cpp
+++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp
@@ -653,6 +653,8 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint32_t tar
 
 	input[input_len] = 0x01;
 	memset(input + input_len + 1, 0, 88 - input_len - 1);
+
+	size_t numThreads = ctx->rawIntensity;
 
 	if((ret = clEnqueueWriteBuffer(ctx->CommandQueues, ctx->InputBuffer, CL_TRUE, 0, 88, input, 0, NULL, NULL)) != CL_SUCCESS)
 	{
@@ -680,6 +682,13 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint32_t tar
 		return ERR_OCL_API;
 	}
 
+	// Threads
+	if((ret = clSetKernelArg(ctx->Kernels[0], 3, sizeof(cl_ulong), &numThreads)) != CL_SUCCESS)
+	{
+		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 3.", err_to_str(ret));
+		return(ERR_OCL_API);
+	}
+
 	// CN2 Kernel
 
 	// Scratchpads
@@ -696,6 +705,13 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint32_t tar
 		return ERR_OCL_API;
 	}
 
+	// Threads
+	if((ret = clSetKernelArg(ctx->Kernels[1], 2, sizeof(cl_ulong), &numThreads)) != CL_SUCCESS)
+	{
+		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 1, argument 2.", err_to_str(ret));
+		return(ERR_OCL_API);
+	}
+
 	// CN3 Kernel
 	// Scratchpads
 	if((ret = clSetKernelArg(ctx->Kernels[2], 0, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS)
@@ -739,6 +755,13 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint32_t tar
 		return ERR_OCL_API;
 	}
 
+	// Threads
+	if((ret = clSetKernelArg(ctx->Kernels[2], 6, sizeof(cl_ulong), &numThreads)) != CL_SUCCESS)
+	{
+		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 6.", err_to_str(ret));
+		return(ERR_OCL_API);
+	}
+
 	for(int i = 0; i < 4; ++i)
 	{
 		// States
@@ -780,12 +803,14 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput)
 	size_t BranchNonces[4];
 	memset(BranchNonces,0,sizeof(size_t)*4);
 
-	size_t g_thd = ctx->rawIntensity;
+	size_t g_intensity = ctx->rawIntensity;
 	size_t w_size = ctx->workSize;
+	// round up to next multiple of w_size
+	size_t g_thd = ((g_intensity + w_size - 1u) / w_size) * w_size;
 
 	for(int i = 2; i < 6; ++i)
 	{
-		if((ret = clEnqueueWriteBuffer(ctx->CommandQueues, ctx->ExtraBuffers[i], CL_FALSE, sizeof(cl_uint) * g_thd, sizeof(cl_uint), &zero, 0, NULL, NULL)) != CL_SUCCESS)
+		if((ret = clEnqueueWriteBuffer(ctx->CommandQueues, ctx->ExtraBuffers[i], CL_FALSE, sizeof(cl_uint) * g_intensity, sizeof(cl_uint), &zero, 0, NULL, NULL)) != CL_SUCCESS)
 		{
 			printer::inst()->print_msg(L1,"Error %s when calling clEnqueueWriteBuffer to zero branch buffer counter %d.", err_to_str(ret), i - 2);
 			return ERR_OCL_API;
@@ -828,25 +853,25 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput)
 		return ERR_OCL_API;
 	}
 
-	if((ret = clEnqueueReadBuffer(ctx->CommandQueues, ctx->ExtraBuffers[2], CL_FALSE, sizeof(cl_uint) * g_thd, sizeof(cl_uint), BranchNonces, 0, NULL, NULL)) != CL_SUCCESS)
+	if((ret = clEnqueueReadBuffer(ctx->CommandQueues, ctx->ExtraBuffers[2], CL_FALSE, sizeof(cl_uint) * g_intensity, sizeof(cl_uint), BranchNonces, 0, NULL, NULL)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clEnqueueReadBuffer to fetch results.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
-	if((ret = clEnqueueReadBuffer(ctx->CommandQueues, ctx->ExtraBuffers[3], CL_FALSE, sizeof(cl_uint) * g_thd, sizeof(cl_uint), BranchNonces + 1, 0, NULL, NULL)) != CL_SUCCESS)
+	if((ret = clEnqueueReadBuffer(ctx->CommandQueues, ctx->ExtraBuffers[3], CL_FALSE, sizeof(cl_uint) * g_intensity, sizeof(cl_uint), BranchNonces + 1, 0, NULL, NULL)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clEnqueueReadBuffer to fetch results.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
-	if((ret = clEnqueueReadBuffer(ctx->CommandQueues, ctx->ExtraBuffers[4], CL_FALSE, sizeof(cl_uint) * g_thd, sizeof(cl_uint), BranchNonces + 2, 0, NULL, NULL)) != CL_SUCCESS)
+	if((ret = clEnqueueReadBuffer(ctx->CommandQueues, ctx->ExtraBuffers[4], CL_FALSE, sizeof(cl_uint) * g_intensity, sizeof(cl_uint), BranchNonces + 2, 0, NULL, NULL)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clEnqueueReadBuffer to fetch results.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
-	if((ret = clEnqueueReadBuffer(ctx->CommandQueues, ctx->ExtraBuffers[5], CL_FALSE, sizeof(cl_uint) * g_thd, sizeof(cl_uint), BranchNonces + 3, 0, NULL, NULL)) != CL_SUCCESS)
+	if((ret = clEnqueueReadBuffer(ctx->CommandQueues, ctx->ExtraBuffers[5], CL_FALSE, sizeof(cl_uint) * g_intensity, sizeof(cl_uint), BranchNonces + 3, 0, NULL, NULL)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clEnqueueReadBuffer to fetch results.", err_to_str(ret));
 		return ERR_OCL_API;
@@ -865,7 +890,8 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput)
 			return(ERR_OCL_API);
 		}
 
-		BranchNonces[i] = ((size_t)ceil( (double)BranchNonces[i] / (double)w_size) ) * w_size;
+		// round up to next multiple of w_size
+		BranchNonces[i] = ((BranchNonces[i] + w_size - 1u) / w_size) * w_size;
 		if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[i + 3], 1, &ctx->Nonce, BranchNonces + i, &w_size, 0, NULL, NULL)) != CL_SUCCESS)
 		{
 			printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), i + 3);
@@ -881,7 +907,7 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput)
 	}
 	clFinish(ctx->CommandQueues);
 
-	ctx->Nonce += g_thd;
+	ctx->Nonce += g_intensity;
 
 	return ERR_SUCCESS;
 }
diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
index dd0ebcb..4fb8b0d 100644
--- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
+++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
@@ -414,16 +414,19 @@ void AESExpandKey256(uint *keybuf)
 #define IDX(x) (x)
 
 __attribute__((reqd_work_group_size(WORKSIZE, 8, 1)))
-__kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ulong *states)
+__kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ulong *states, ulong Threads)
 {
 	ulong State[25];
 	uint ExpandedKey1[256];
 	__local uint AES0[256], AES1[256], AES2[256], AES3[256];
 	uint4 text;
-
-	states += (25 * (get_global_id(0) - get_global_offset(0)));
-	Scratchpad += ((get_global_id(0) - get_global_offset(0))) * (0x80000 >> 2);
-
+
+	const ulong gIdx = get_global_id(0) - get_global_offset(0);
+	if(gIdx >= Threads) return;
+
+	states += 25 * gIdx;
+	Scratchpad += gIdx * (0x80000 >> 2);
+
 	for(int i = get_local_id(0); i < 256; i += WORKSIZE)
 	{
 		const uint tmp = AES0_C[i];
@@ -479,13 +482,16 @@ __kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ul
 }
 
 __attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
-__kernel void cn1(__global uint4 *Scratchpad, __global ulong *states)
+__kernel void cn1(__global uint4 *Scratchpad, __global ulong *states, ulong Threads)
 {
 	ulong a[2], b[2];
 	__local uint AES0[256], AES1[256], AES2[256], AES3[256];
-
-	Scratchpad += ((get_global_id(0) - get_global_offset(0))) * (0x80000 >> 2);
-	states += (25 * (get_global_id(0) - get_global_offset(0)));
+
+	const ulong gIdx = get_global_id(0) - get_global_offset(0);
+	if(gIdx >= Threads) return;
+
+	states += 25 * gIdx;
+	Scratchpad += gIdx * (0x80000 >> 2);
 
 	for(int i = get_local_id(0); i < 256; i += WORKSIZE)
 	{
@@ -534,15 +540,18 @@ __kernel void cn1(__global uint4 *Scratchpad, __global ulong *states)
 }
 
 __attribute__((reqd_work_group_size(WORKSIZE, 8, 1)))
-__kernel void cn2(__global uint4 *Scratchpad, __global ulong *states, __global uint *Branch0, __global uint *Branch1, __global uint *Branch2, __global uint *Branch3)
+__kernel void cn2(__global uint4 *Scratchpad, __global ulong *states, __global uint *Branch0, __global uint *Branch1, __global uint *Branch2, __global uint *Branch3, ulong Threads)
 {
 	__local uint AES0[256], AES1[256], AES2[256], AES3[256];
 	uint ExpandedKey2[256];
 	ulong State[25];
 	uint4 text;
 
-	Scratchpad += ((get_global_id(0) - get_global_offset(0))) * (0x80000 >> 2);
-	states += (25 * (get_global_id(0) - get_global_offset(0)));
+	const ulong gIdx = get_global_id(0) - get_global_offset(0);
+	if(gIdx >= Threads) return;
+
+	states += 25 * gIdx;
+	Scratchpad += gIdx * (0x80000 >> 2);
 
 	for(int i = get_local_id(0); i < 256; i += WORKSIZE)
 	{
@@ -595,16 +604,16 @@ __kernel void cn2(__global uint4 *Scratchpad, __global ulong *states, __global u
 	switch(State[0] & 3)
 	{
 		case 0:
-			Branch0[atomic_inc(Branch0 + get_global_size(0))] = get_global_id(0) - get_global_offset(0);
+			Branch0[atomic_inc(Branch0 + Threads)] = get_global_id(0) - get_global_offset(0);
 			break;
 		case 1:
-			Branch1[atomic_inc(Branch1 + get_global_size(0))] = get_global_id(0) - get_global_offset(0);
+			Branch1[atomic_inc(Branch1 + Threads)] = get_global_id(0) - get_global_offset(0);
 			break;
 		case 2:
-			Branch2[atomic_inc(Branch2 + get_global_size(0))] = get_global_id(0) - get_global_offset(0);
+			Branch2[atomic_inc(Branch2 + Threads)] = get_global_id(0) - get_global_offset(0);
 			break;
 		case 3:
-			Branch3[atomic_inc(Branch3 + get_global_size(0))] = get_global_id(0) - get_global_offset(0);
+			Branch3[atomic_inc(Branch3 + Threads)] = get_global_id(0) - get_global_offset(0);
 			break;
 	}
 }
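Note on the change: the host now passes the real thread count (ctx->rawIntensity) to the kernels as a Threads argument, pads the launched global work size up to the next multiple of the work size, and lets the surplus work-items exit early through the new if(gIdx >= Threads) return; guard, so the configured intensity no longer has to be a multiple of the work size. A minimal standalone sketch of the integer round-up now used for g_thd and BranchNonces[i] (the values below are hypothetical, chosen only for illustration):

	// Standalone sketch, not part of the patch: round a thread count up to the
	// next multiple of the work-group size with integer arithmetic, replacing
	// the earlier ceil()-based floating-point computation.
	#include <cstddef>
	#include <cstdio>

	int main()
	{
		std::size_t g_intensity = 1000; // hypothetical rawIntensity, not a multiple of w_size
		std::size_t w_size = 64;        // hypothetical workSize
		std::size_t g_thd = ((g_intensity + w_size - 1u) / w_size) * w_size;
		std::printf("launch %zu work-items, %zu of them do real work\n", g_thd, g_intensity);
		return 0;
	}

With these values g_thd becomes 1024; the extra 24 work-items return immediately in cn0/cn1/cn2, the branch counters are still read at offset sizeof(cl_uint) * g_intensity, and the nonce counter advances by g_intensity rather than the padded g_thd.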