Merge pull request #1087 from psychocrypt/topic-blockedStride3

AMD: option `mem_chunk` and new `strided_index`
author: fireice-uk <fireice-uk@users.noreply.github.com> 2018-02-19 14:10:36 +0000
committer: GitHub <noreply@github.com> 2018-02-19 14:10:36 +0000
commit: f785481bb7c1fb887a65a0b19c3e453904bb5474 (patch)
tree: f0ac2fc281c47ce4965189f75e29e7b71002bc85 /xmrstak/backend/amd/amd_gpu
parent: 84febdf63edc3cf702ae6747c8b071d5302249de (diff)
parent: 737185ee82bae05953680b1f4c4cdf8646c51b5a (diff)
download: xmr-stak-f785481bb7c1fb887a65a0b19c3e453904bb5474.zip
xmr-stak-f785481bb7c1fb887a65a0b19c3e453904bb5474.tar.gz
3 files changed, 36 insertions, 11 deletions
diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp
index c39c567..054ffc4 100644
--- a/xmrstak/backend/amd/amd_gpu/gpu.cpp
+++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp
@@ -332,8 +332,8 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 
 	char options[256];
 	snprintf(options, sizeof(options), 
-		"-DITERATIONS=%d -DMASK=%d -DWORKSIZE=%llu -DSTRIDED_INDEX=%d", 
-		hasIterations, threadMemMask, int_port(ctx->workSize), ctx->stridedIndex ? 1 : 0);
+		"-DITERATIONS=%d -DMASK=%d -DWORKSIZE=%llu -DSTRIDED_INDEX=%d -DMEM_CHUNK=%d",
+		hasIterations, threadMemMask, int_port(ctx->workSize), ctx->stridedIndex, int(1u<<ctx->memChunk));
 	ret = clBuildProgram(ctx->Program, 1, &ctx->DeviceID, options, NULL, NULL);
 	if(ret != CL_SUCCESS)
 	{
@@ -696,6 +696,13 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx)
 
 	for(int i = 0; i < num_gpus; ++i)
 	{
+		if(ctx[i].stridedIndex == 2 && (ctx[i].rawIntensity % ctx[i].workSize) != 0)
+		{
+			size_t reduced_intensity = (ctx[i].rawIntensity / ctx[i].workSize) * ctx[i].workSize;
+			ctx[i].rawIntensity = reduced_intensity;
+			printer::inst()->print_msg(L0, "WARNING AMD: gpu %d intensity is not a multiple of 'worksize', auto reduce intensity to %d", ctx[i].deviceIdx, int(reduced_intensity));
+		}
+
 		if((ret = InitOpenCLGpu(opencl_ctx, &ctx[i], source_code.c_str())) != ERR_SUCCESS)
 		{
 			return ret;
diff --git a/xmrstak/backend/amd/amd_gpu/gpu.hpp b/xmrstak/backend/amd/amd_gpu/gpu.hpp
index c17bac1..abfad5c 100644
--- a/xmrstak/backend/amd/amd_gpu/gpu.hpp
+++ b/xmrstak/backend/amd/amd_gpu/gpu.hpp
@@ -25,6 +25,7 @@ struct GpuContext
 	size_t rawIntensity;
 	size_t workSize;
 	int stridedIndex;
+	int memChunk;
 
 	/*Output vars*/
 	cl_device_id DeviceID;
diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
index c0b6529..53299ec 100644
--- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
+++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
@@ -411,12 +411,23 @@ void AESExpandKey256(uint *keybuf)
 	}
 }
 
+#define MEM_CHUNK (1<<4)
+
 #if(STRIDED_INDEX==0)
 #   define IDX(x)	(x)
-#else
+#elif(STRIDED_INDEX==1)
 #   define IDX(x)	((x) * (Threads))
+#elif(STRIDED_INDEX==2)
+#   define IDX(x)	(((x) % MEM_CHUNK) + ((x) / MEM_CHUNK) * WORKSIZE * MEM_CHUNK)
 #endif
 
+inline ulong getIdx()
+{
+#if(STRIDED_INDEX==0 || STRIDED_INDEX==1 || STRIDED_INDEX==2)
+	return get_global_id(0) - get_global_offset(0);
+#endif
+}
+
 __attribute__((reqd_work_group_size(WORKSIZE, 8, 1)))
 __kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ulong *states, ulong Threads)
 {
@@ -425,7 +436,7 @@ __kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ul
 	__local uint AES0[256], AES1[256], AES2[256], AES3[256];
 	uint4 text;
 
-	const ulong gIdx = get_global_id(0) - get_global_offset(0);
+	const ulong gIdx = getIdx();
 
 	for(int i = get_local_id(1) * WORKSIZE + get_local_id(0);
 		i < 256;
@@ -439,7 +450,7 @@ __kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ul
 	}
 
 	barrier(CLK_LOCAL_MEM_FENCE);
-
+		
 	// do not use early return here
 	if(gIdx < Threads)
 	{
@@ -447,8 +458,10 @@ __kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ul
 
 #if(STRIDED_INDEX==0)
 		Scratchpad += gIdx * (ITERATIONS >> 2);
-#else
+#elif(STRIDED_INDEX==1)
 		Scratchpad += gIdx;
+#elif(STRIDED_INDEX==2)
+		Scratchpad += get_group_id(0) * (ITERATIONS >> 2) * WORKSIZE + MEM_CHUNK * get_local_id(0);
 #endif
 
 		((ulong8 *)State)[0] = vload8(0, input);
@@ -509,7 +522,7 @@ __kernel void cn1(__global uint4 *Scratchpad, __global ulong *states, ulong Thre
 	ulong a[2], b[2];
 	__local uint AES0[256], AES1[256], AES2[256], AES3[256];
 
-	const ulong gIdx = get_global_id(0) - get_global_offset(0);
+	const ulong gIdx = getIdx();
 
 	for(int i = get_local_id(0); i < 256; i += WORKSIZE)
 	{
@@ -523,15 +536,17 @@ __kernel void cn1(__global uint4 *Scratchpad, __global ulong *states, ulong Thre
 	barrier(CLK_LOCAL_MEM_FENCE);
 
 	uint4 b_x;
-		
+
 	// do not use early return here
 	if(gIdx < Threads)
 	{
 		states += 25 * gIdx;
 #if(STRIDED_INDEX==0)
 		Scratchpad += gIdx * (ITERATIONS >> 2);
-#else
+#elif(STRIDED_INDEX==1)
 		Scratchpad += gIdx;
+#elif(STRIDED_INDEX==2)
+		Scratchpad += get_group_id(0) * (ITERATIONS >> 2) * WORKSIZE + MEM_CHUNK * get_local_id(0);
 #endif
 
 		a[0] = states[0] ^ states[4];
@@ -582,7 +597,7 @@ __kernel void cn2(__global uint4 *Scratchpad, __global ulong *states, __global u
 	ulong State[25];
 	uint4 text;
 	
-	const ulong gIdx = get_global_id(0) - get_global_offset(0);
+	const ulong gIdx = getIdx();
 
 	for(int i = get_local_id(1) * WORKSIZE + get_local_id(0);
 		i < 256;
@@ -603,8 +618,10 @@ __kernel void cn2(__global uint4 *Scratchpad, __global ulong *states, __global u
 		states += 25 * gIdx;
 #if(STRIDED_INDEX==0)
 		Scratchpad += gIdx * (ITERATIONS >> 2);
-#else
+#elif(STRIDED_INDEX==1)
 		Scratchpad += gIdx;
+#elif(STRIDED_INDEX==2)
+		Scratchpad += get_group_id(0) * (ITERATIONS >> 2) * WORKSIZE + MEM_CHUNK * get_local_id(0);
 #endif
 
 		#if defined(__Tahiti__) || defined(__Pitcairn__)
author	fireice-uk <fireice-uk@users.noreply.github.com>	2018-02-19 14:10:36 +0000
committer	GitHub <noreply@github.com>	2018-02-19 14:10:36 +0000
commit	f785481bb7c1fb887a65a0b19c3e453904bb5474 (patch)
tree	f0ac2fc281c47ce4965189f75e29e7b71002bc85 /xmrstak/backend/amd/amd_gpu
parent	84febdf63edc3cf702ae6747c8b071d5302249de (diff)
parent	737185ee82bae05953680b1f4c4cdf8646c51b5a (diff)
download	xmr-stak-f785481bb7c1fb887a65a0b19c3e453904bb5474.zip xmr-stak-f785481bb7c1fb887a65a0b19c3e453904bb5474.tar.gz