1 files changed, 376 insertions, 335 deletions
diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
index 9c65425..a1642c4 100644
--- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
+++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
@@ -422,12 +422,10 @@ __kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ul
 	uint4 text;
 
 	const ulong gIdx = get_global_id(0) - get_global_offset(0);
-	if(gIdx >= Threads) return;
 
-	states += 25 * gIdx;
-	Scratchpad += gIdx * (0x80000 >> 2);
-
-	for(int i = get_local_id(0); i < 256; i += WORKSIZE)
+	for(int i = get_local_id(1) * WORKSIZE + get_local_id(0);
+		i < 256;
+		i += WORKSIZE * 8)
 	{
 		const uint tmp = AES0_C[i];
 		AES0[i] = tmp;
@@ -435,49 +433,64 @@ __kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ul
 		AES2[i] = rotate(tmp, 16U);
 		AES3[i] = rotate(tmp, 24U);
 	}
+
 	barrier(CLK_LOCAL_MEM_FENCE);
-	
-	((ulong8 *)State)[0] = vload8(0, input);
-	State[8] = input[8];
-	State[9] = input[9];
-	State[10] = input[10];
-
-	((uint *)State)[9] &= 0x00FFFFFFU;
-	((uint *)State)[9] |= ((get_global_id(0)) & 0xFF) << 24;
-	((uint *)State)[10] &= 0xFF000000U;
-	((uint *)State)[10] |= ((get_global_id(0) >> 8));
-	
-	for(int i = 11; i < 25; ++i) State[i] = 0x00UL;
-	
-	// Last bit of padding
-	State[16] = 0x8000000000000000UL;
-	
-	keccakf1600_2(State);
-	
+
+	// do not use early return here
+	if(gIdx < Threads)
+	{
+		states += 25 * gIdx;
+		Scratchpad += gIdx * (0x80000 >> 2);
+
+		((ulong8 *)State)[0] = vload8(0, input);
+		State[8] = input[8];
+		State[9] = input[9];
+		State[10] = input[10];
+
+		((uint *)State)[9] &= 0x00FFFFFFU;
+		((uint *)State)[9] |= ((get_global_id(0)) & 0xFF) << 24;
+		((uint *)State)[10] &= 0xFF000000U;
+		((uint *)State)[10] |= ((get_global_id(0) >> 8));
+
+		for(int i = 11; i < 25; ++i) State[i] = 0x00UL;
+
+		// Last bit of padding
+		State[16] = 0x8000000000000000UL;
+
+		keccakf1600_2(State);
+	}
+
 	mem_fence(CLK_GLOBAL_MEM_FENCE);
-	
-	#pragma unroll
-	for(int i = 0; i < 25; ++i) states[i] = State[i];
-	
-	text = vload4(get_local_id(1) + 4, (__global uint *)(states));
-	
-	#pragma unroll
-	for(int i = 0; i < 4; ++i) ((ulong *)ExpandedKey1)[i] = states[i];
-	
-	AESExpandKey256(ExpandedKey1);
-	
-	mem_fence(CLK_LOCAL_MEM_FENCE);
-	
-	#pragma unroll 2
-	for(int i = 0; i < 0x4000; ++i)
+
+	// do not use early return here
+	if(gIdx < Threads)
 	{
 		#pragma unroll
-		for(int j = 0; j < 10; ++j)
-			text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey1)[j]);
-		
-		Scratchpad[IDX((i << 3) + get_local_id(1))] = text;
+		for(int i = 0; i < 25; ++i) states[i] = State[i];
+
+		text = vload4(get_local_id(1) + 4, (__global uint *)(states));
+
+		#pragma unroll
+		for(int i = 0; i < 4; ++i) ((ulong *)ExpandedKey1)[i] = states[i];
+
+		AESExpandKey256(ExpandedKey1);
+	}
+
+	mem_fence(CLK_LOCAL_MEM_FENCE);
+
+	// do not use early return here
+	if(gIdx < Threads)
+	{
+		#pragma unroll 2
+		for(int i = 0; i < 0x4000; ++i)
+		{
+			#pragma unroll
+			for(int j = 0; j < 10; ++j)
+				text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey1)[j]);
+
+			Scratchpad[IDX((i << 3) + get_local_id(1))] = text;
+		}
 	}
-	
 	mem_fence(CLK_GLOBAL_MEM_FENCE);
 }
 
@@ -488,11 +501,7 @@ __kernel void cn1(__global uint4 *Scratchpad, __global ulong *states, ulong Thre
 	__local uint AES0[256], AES1[256], AES2[256], AES3[256];
 
 	const ulong gIdx = get_global_id(0) - get_global_offset(0);
-	if(gIdx >= Threads) return;
 
-	states += 25 * gIdx;
-	Scratchpad += gIdx * (0x80000 >> 2);
-	
 	for(int i = get_local_id(0); i < 256; i += WORKSIZE)
 	{
 		const uint tmp = AES0_C[i];
@@ -501,41 +510,54 @@ __kernel void cn1(__global uint4 *Scratchpad, __global ulong *states, ulong Thre
 		AES2[i] = rotate(tmp, 16U);
 		AES3[i] = rotate(tmp, 24U);
 	}
+
 	barrier(CLK_LOCAL_MEM_FENCE);
-	
-	a[0] = states[0] ^ states[4];
-	b[0] = states[2] ^ states[6];
-	a[1] = states[1] ^ states[5];
-	b[1] = states[3] ^ states[7];
-	
-	uint4 b_x = ((uint4 *)b)[0];
-	
+
+	uint4 b_x;
+		
+	// do not use early return here
+	if(gIdx < Threads)
+	{
+		states += 25 * gIdx;
+		Scratchpad += gIdx * (0x80000 >> 2);
+
+		a[0] = states[0] ^ states[4];
+		b[0] = states[2] ^ states[6];
+		a[1] = states[1] ^ states[5];
+		b[1] = states[3] ^ states[7];
+
+		b_x = ((uint4 *)b)[0];
+	}
+
 	mem_fence(CLK_LOCAL_MEM_FENCE);
-	
-	#pragma unroll 8
-	for(int i = 0; i < 0x80000; ++i)
+
+	// do not use early return here
+	if(gIdx < Threads)
 	{
-		ulong c[2];
-		
-		((uint4 *)c)[0] = Scratchpad[IDX((a[0] & 0x1FFFF0) >> 4)];
-		((uint4 *)c)[0] = AES_Round(AES0, AES1, AES2, AES3, ((uint4 *)c)[0], ((uint4 *)a)[0]);
-		//b_x ^= ((uint4 *)c)[0];
-		
-		Scratchpad[IDX((a[0] & 0x1FFFF0) >> 4)] = b_x ^ ((uint4 *)c)[0];
-		
-		uint4 tmp;
-		tmp = Scratchpad[IDX((c[0] & 0x1FFFF0) >> 4)];
-		
-		a[1] += c[0] * as_ulong2(tmp).s0;
-		a[0] += mul_hi(c[0], as_ulong2(tmp).s0);
-		
-		Scratchpad[IDX((c[0] & 0x1FFFF0) >> 4)] = ((uint4 *)a)[0];
-		
-		((uint4 *)a)[0] ^= tmp;
-		
-		b_x = ((uint4 *)c)[0];
+		#pragma unroll 8
+		for(int i = 0; i < 0x80000; ++i)
+		{
+			ulong c[2];
+
+			((uint4 *)c)[0] = Scratchpad[IDX((a[0] & 0x1FFFF0) >> 4)];
+			((uint4 *)c)[0] = AES_Round(AES0, AES1, AES2, AES3, ((uint4 *)c)[0], ((uint4 *)a)[0]);
+			//b_x ^= ((uint4 *)c)[0];
+
+			Scratchpad[IDX((a[0] & 0x1FFFF0) >> 4)] = b_x ^ ((uint4 *)c)[0];
+
+			uint4 tmp;
+			tmp = Scratchpad[IDX((c[0] & 0x1FFFF0) >> 4)];
+
+			a[1] += c[0] * as_ulong2(tmp).s0;
+			a[0] += mul_hi(c[0], as_ulong2(tmp).s0);
+
+			Scratchpad[IDX((c[0] & 0x1FFFF0) >> 4)] = ((uint4 *)a)[0];
+
+			((uint4 *)a)[0] ^= tmp;
+
+			b_x = ((uint4 *)c)[0];
+		}
 	}
-	
 	mem_fence(CLK_GLOBAL_MEM_FENCE);
 }
 
@@ -548,12 +570,10 @@ __kernel void cn2(__global uint4 *Scratchpad, __global ulong *states, __global u
 	uint4 text;
 	
 	const ulong gIdx = get_global_id(0) - get_global_offset(0);
-	if(gIdx >= Threads) return;
-
-	states += 25 * gIdx;
-	Scratchpad += gIdx * (0x80000 >> 2);
 
-	for(int i = get_local_id(0); i < 256; i += WORKSIZE)
+	for(int i = get_local_id(1) * WORKSIZE + get_local_id(0);
+		i < 256;
+		i += WORKSIZE * 8)
 	{
 		const uint tmp = AES0_C[i];
 		AES0[i] = tmp;
@@ -561,63 +581,78 @@ __kernel void cn2(__global uint4 *Scratchpad, __global ulong *states, __global u
 		AES2[i] = rotate(tmp, 16U);
 		AES3[i] = rotate(tmp, 24U);
 	}
+
 	barrier(CLK_LOCAL_MEM_FENCE);
 
-	#if defined(__Tahiti__) || defined(__Pitcairn__)
-	
-	for(int i = 0; i < 4; ++i) ((ulong *)ExpandedKey2)[i] = states[i + 4];
-	text = vload4(get_local_id(1) + 4, (__global uint *)states);
-	
-	#else
-	
-	text = vload4(get_local_id(1) + 4, (__global uint *)states);
-	((uint8 *)ExpandedKey2)[0] = vload8(1, (__global uint *)states);
-	
-	#endif
-	
-	AESExpandKey256(ExpandedKey2);
-	
+	// do not use early return here
+	if(gIdx < Threads)
+	{
+		states += 25 * gIdx;
+		Scratchpad += gIdx * (0x80000 >> 2);
+
+		#if defined(__Tahiti__) || defined(__Pitcairn__)
+
+		for(int i = 0; i < 4; ++i) ((ulong *)ExpandedKey2)[i] = states[i + 4];
+		text = vload4(get_local_id(1) + 4, (__global uint *)states);
+
+		#else
+
+		text = vload4(get_local_id(1) + 4, (__global uint *)states);
+		((uint8 *)ExpandedKey2)[0] = vload8(1, (__global uint *)states);
+
+		#endif
+
+		AESExpandKey256(ExpandedKey2);
+	}
+
 	barrier(CLK_LOCAL_MEM_FENCE);
-	
-	#pragma unroll 2
-	for(int i = 0; i < 0x4000; ++i)
-	{		
-		text ^= Scratchpad[IDX((i << 3) + get_local_id(1))];
-		
-		#pragma unroll
-		for(int j = 0; j < 10; ++j)
-			text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]);
+
+	// do not use early return here
+	if(gIdx < Threads)
+	{
+		#pragma unroll 2
+		for(int i = 0; i < 0x4000; ++i)
+		{
+			text ^= Scratchpad[IDX((i << 3) + get_local_id(1))];
+
+			#pragma unroll
+			for(int j = 0; j < 10; ++j)
+				text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]);
+		}
+
+		vstore2(as_ulong2(text), get_local_id(1) + 4, states);
 	}
-	
-	vstore2(as_ulong2(text), get_local_id(1) + 4, states);
-	
+
 	barrier(CLK_GLOBAL_MEM_FENCE);
-	
-	if(!get_local_id(1))
+
+	// do not use early return here
+	if(gIdx < Threads)
 	{
-		for(int i = 0; i < 25; ++i) State[i] = states[i];
-		
-		keccakf1600_2(State);
-		
-		for(int i = 0; i < 25; ++i) states[i] = State[i];
-		
-		switch(State[0] & 3)
+		if(!get_local_id(1))
 		{
-			case 0:
-				Branch0[atomic_inc(Branch0 + Threads)] = get_global_id(0) - get_global_offset(0);
-				break;
-			case 1:
-				Branch1[atomic_inc(Branch1 + Threads)] = get_global_id(0) - get_global_offset(0);
-				break;
-			case 2:
-				Branch2[atomic_inc(Branch2 + Threads)] = get_global_id(0) - get_global_offset(0);
-				break;
-			case 3:
-				Branch3[atomic_inc(Branch3 + Threads)] = get_global_id(0) - get_global_offset(0);
-				break;
+			for(int i = 0; i < 25; ++i) State[i] = states[i];
+
+			keccakf1600_2(State);
+
+			for(int i = 0; i < 25; ++i) states[i] = State[i];
+
+			switch(State[0] & 3)
+			{
+				case 0:
+					Branch0[atomic_inc(Branch0 + Threads)] = get_global_id(0) - get_global_offset(0);
+					break;
+				case 1:
+					Branch1[atomic_inc(Branch1 + Threads)] = get_global_id(0) - get_global_offset(0);
+					break;
+				case 2:
+					Branch2[atomic_inc(Branch2 + Threads)] = get_global_id(0) - get_global_offset(0);
+					break;
+				case 3:
+					Branch3[atomic_inc(Branch3 + Threads)] = get_global_id(0) - get_global_offset(0);
+					break;
+			}
 		}
 	}
-	
 	mem_fence(CLK_GLOBAL_MEM_FENCE);
 }
 
@@ -634,54 +669,56 @@ __kernel void Skein(__global ulong *states, __global uint *BranchBuf, __global u
 {
 	const ulong idx = get_global_id(0) - get_global_offset(0);
 	
-	if(idx >= Threads) return;
-	
-	states += 25 * BranchBuf[idx];
-	
-	// skein
-	ulong8 h = vload8(0, SKEIN512_256_IV);
-	
-	// Type field begins with final bit, first bit, then six bits of type; the last 96
-	// bits are input processed (including in the block to be processed with that tweak)
-	// The output transform is only one run of UBI, since we need only 256 bits of output
-	// The tweak for the output transform is Type = Output with the Final bit set
-	// T[0] for the output is 8, and I don't know why - should be message size...
-	ulong t[3] = { 0x00UL, 0x7000000000000000UL, 0x00UL };
-	ulong8 p, m;
-	
-	for(uint i = 0; i < 4; ++i)
+	// do not use early return here
+	if(idx < Threads)
 	{
-		if(i < 3) t[0] += 0x40UL;
-		else t[0] += 0x08UL;
-		
+		states += 25 * BranchBuf[idx];
+
+		// skein
+		ulong8 h = vload8(0, SKEIN512_256_IV);
+
+		// Type field begins with final bit, first bit, then six bits of type; the last 96
+		// bits are input processed (including in the block to be processed with that tweak)
+		// The output transform is only one run of UBI, since we need only 256 bits of output
+		// The tweak for the output transform is Type = Output with the Final bit set
+		// T[0] for the output is 8, and I don't know why - should be message size...
+		ulong t[3] = { 0x00UL, 0x7000000000000000UL, 0x00UL };
+		ulong8 p, m;
+
+		for(uint i = 0; i < 4; ++i)
+		{
+			if(i < 3) t[0] += 0x40UL;
+			else t[0] += 0x08UL;
+
+			t[2] = t[0] ^ t[1];
+
+			m = (i < 3) ? vload8(i, states) : (ulong8)(states[24], 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL);
+			const ulong h8 = h.s0 ^ h.s1 ^ h.s2 ^ h.s3 ^ h.s4 ^ h.s5 ^ h.s6 ^ h.s7 ^ SKEIN_KS_PARITY;
+			p = Skein512Block(m, h, h8, t);
+
+			h = m ^ p;
+
+			if(i < 2) t[1] = 0x3000000000000000UL;
+			else t[1] = 0xB000000000000000UL;
+		}
+
+		t[0] = 0x08UL;
+		t[1] = 0xFF00000000000000UL;
 		t[2] = t[0] ^ t[1];
-		
-		m = (i < 3) ? vload8(i, states) : (ulong8)(states[24], 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL);
+
+		p = (ulong8)(0);
 		const ulong h8 = h.s0 ^ h.s1 ^ h.s2 ^ h.s3 ^ h.s4 ^ h.s5 ^ h.s6 ^ h.s7 ^ SKEIN_KS_PARITY;
-		p = Skein512Block(m, h, h8, t);
-		
-		h = m ^ p;
-		
-		if(i < 2) t[1] = 0x3000000000000000UL;
-		else t[1] = 0xB000000000000000UL;
-	}
-	
-	t[0] = 0x08UL;
-	t[1] = 0xFF00000000000000UL;
-	t[2] = t[0] ^ t[1];
-	
-	p = (ulong8)(0);
-	const ulong h8 = h.s0 ^ h.s1 ^ h.s2 ^ h.s3 ^ h.s4 ^ h.s5 ^ h.s6 ^ h.s7 ^ SKEIN_KS_PARITY;
-	
-	p = Skein512Block(p, h, h8, t);
-	
-	//vstore8(p, 0, output);
-	
-	if(as_uint16(p).s7 <= Target)
-	{
-		ulong outIdx = atomic_inc(output + 0xFF);
-		if(outIdx < 0xFF)
-			output[outIdx] = BranchBuf[idx] + get_global_offset(0);
+
+		p = Skein512Block(p, h, h8, t);
+
+		//vstore8(p, 0, output);
+
+		if(as_uint16(p).s7 <= Target)
+		{
+			ulong outIdx = atomic_inc(output + 0xFF);
+			if(outIdx < 0xFF)
+				output[outIdx] = BranchBuf[idx] + get_global_offset(0);
+		}
 	}
 	mem_fence(CLK_GLOBAL_MEM_FENCE);	
 }
@@ -692,66 +729,68 @@ __kernel void JH(__global ulong *states, __global uint *BranchBuf, __global uint
 {
 	const uint idx = get_global_id(0) - get_global_offset(0);
 	
-	if(idx >= Threads) return;
-	
-	states += 25 * BranchBuf[idx];
-	
-	sph_u64 h0h = 0xEBD3202C41A398EBUL, h0l = 0xC145B29C7BBECD92UL, h1h = 0xFAC7D4609151931CUL, h1l = 0x038A507ED6820026UL, h2h = 0x45B92677269E23A4UL, h2l = 0x77941AD4481AFBE0UL, h3h = 0x7A176B0226ABB5CDUL, h3l = 0xA82FFF0F4224F056UL;
-	sph_u64 h4h = 0x754D2E7F8996A371UL, h4l = 0x62E27DF70849141DUL, h5h = 0x948F2476F7957627UL, h5l = 0x6C29804757B6D587UL, h6h = 0x6C0D8EAC2D275E5CUL, h6l = 0x0F7A0557C6508451UL, h7h = 0xEA12247067D3E47BUL, h7l = 0x69D71CD313ABE389UL;
-	sph_u64 tmp;
-	
-	for(int i = 0; i < 5; ++i)
+	// do not use early return here
+	if(idx < Threads)
 	{
-		ulong input[8];
-		
-		if(i < 3)
-		{
-			for(int x = 0; x < 8; ++x) input[x] = (states[(i << 3) + x]);
-		}
-		else if(i == 3)
+		states += 25 * BranchBuf[idx];
+
+		sph_u64 h0h = 0xEBD3202C41A398EBUL, h0l = 0xC145B29C7BBECD92UL, h1h = 0xFAC7D4609151931CUL, h1l = 0x038A507ED6820026UL, h2h = 0x45B92677269E23A4UL, h2l = 0x77941AD4481AFBE0UL, h3h = 0x7A176B0226ABB5CDUL, h3l = 0xA82FFF0F4224F056UL;
+		sph_u64 h4h = 0x754D2E7F8996A371UL, h4l = 0x62E27DF70849141DUL, h5h = 0x948F2476F7957627UL, h5l = 0x6C29804757B6D587UL, h6h = 0x6C0D8EAC2D275E5CUL, h6l = 0x0F7A0557C6508451UL, h7h = 0xEA12247067D3E47BUL, h7l = 0x69D71CD313ABE389UL;
+		sph_u64 tmp;
+
+		for(int i = 0; i < 5; ++i)
 		{
-			input[0] = (states[24]);
-			input[1] = 0x80UL;
-			for(int x = 2; x < 8; ++x) input[x] = 0x00UL;
+			ulong input[8];
+
+			if(i < 3)
+			{
+				for(int x = 0; x < 8; ++x) input[x] = (states[(i << 3) + x]);
+			}
+			else if(i == 3)
+			{
+				input[0] = (states[24]);
+				input[1] = 0x80UL;
+				for(int x = 2; x < 8; ++x) input[x] = 0x00UL;
+			}
+			else
+			{
+				input[7] = 0x4006000000000000UL;
+
+				for(int x = 0; x < 7; ++x) input[x] = 0x00UL;
+			}
+
+			h0h ^= input[0];
+			h0l ^= input[1];
+			h1h ^= input[2];
+			h1l ^= input[3];
+			h2h ^= input[4];
+			h2l ^= input[5];
+			h3h ^= input[6];
+			h3l ^= input[7];
+
+			E8;
+
+			h4h ^= input[0];
+			h4l ^= input[1];
+			h5h ^= input[2];
+			h5l ^= input[3];
+			h6h ^= input[4];
+			h6l ^= input[5];
+			h7h ^= input[6];
+			h7l ^= input[7];
 		}
-		else
+
+		//output[0] = h6h;
+		//output[1] = h6l;
+		//output[2] = h7h;
+		//output[3] = h7l;
+
+		if(as_uint2(h7l).s1 <= Target)
 		{
-			input[7] = 0x4006000000000000UL;
-						
-			for(int x = 0; x < 7; ++x) input[x] = 0x00UL;
+			ulong outIdx = atomic_inc(output + 0xFF);
+			if(outIdx < 0xFF)
+				output[outIdx] = BranchBuf[idx] + get_global_offset(0);
 		}
-		
-		h0h ^= input[0];
-		h0l ^= input[1];
-		h1h ^= input[2];
-		h1l ^= input[3];
-		h2h ^= input[4];
-		h2l ^= input[5];
-		h3h ^= input[6];
-		h3l ^= input[7];
-		
-		E8;
-		
-		h4h ^= input[0];
-		h4l ^= input[1];
-		h5h ^= input[2];
-		h5l ^= input[3];
-		h6h ^= input[4];
-		h6l ^= input[5];
-		h7h ^= input[6];
-		h7l ^= input[7];
-	}
-	
-	//output[0] = h6h;
-	//output[1] = h6l;
-	//output[2] = h7h;
-	//output[3] = h7l;
-	
-	if(as_uint2(h7l).s1 <= Target)
-	{
-		ulong outIdx = atomic_inc(output + 0xFF);
-		if(outIdx < 0xFF)
-			output[outIdx] = BranchBuf[idx] + get_global_offset(0);
 	}
 }
 
@@ -761,70 +800,71 @@ __kernel void Blake(__global ulong *states, __global uint *BranchBuf, __global u
 {
 	const uint idx = get_global_id(0) - get_global_offset(0);
 	
-	if(idx >= Threads) return;
-	
-	states += 25 * BranchBuf[idx];
-	
-	unsigned int m[16];
-	unsigned int v[16];
-	uint h[8];
-	
-	((uint8 *)h)[0] = vload8(0U, c_IV256);
-	
-	for(uint i = 0, bitlen = 0; i < 4; ++i)
+	// do not use early return here
+	if(idx < Threads)
 	{
-		if(i < 3)
+		states += 25 * BranchBuf[idx];
+	
+		unsigned int m[16];
+		unsigned int v[16];
+		uint h[8];
+
+		((uint8 *)h)[0] = vload8(0U, c_IV256);
+
+		for(uint i = 0, bitlen = 0; i < 4; ++i)
 		{
-			((uint16 *)m)[0] = vload16(i, (__global uint *)states);
-			for(int i = 0; i < 16; ++i) m[i] = SWAP4(m[i]);
-			bitlen += 512;
+			if(i < 3)
+			{
+				((uint16 *)m)[0] = vload16(i, (__global uint *)states);
+				for(int i = 0; i < 16; ++i) m[i] = SWAP4(m[i]);
+				bitlen += 512;
+			}
+			else
+			{
+				m[0] = SWAP4(((__global uint *)states)[48]);
+				m[1] = SWAP4(((__global uint *)states)[49]);
+				m[2] = 0x80000000U;
+
+				for(int i = 3; i < 13; ++i) m[i] = 0x00U;
+
+				m[13] = 1U;
+				m[14] = 0U;
+				m[15] = 0x640;
+				bitlen += 64;
+			}
+
+			((uint16 *)v)[0].lo = ((uint8 *)h)[0];
+			((uint16 *)v)[0].hi = vload8(0U, c_u256);
+
+			//v[12] ^= (i < 3) ? (i + 1) << 9 : 1600U;
+			//v[13] ^= (i < 3) ? (i + 1) << 9 : 1600U;
+
+			v[12] ^= bitlen;
+			v[13] ^= bitlen;
+
+			for(int r = 0; r < 14; r++)
+			{
+				GS(0, 4, 0x8, 0xC, 0x0);
+				GS(1, 5, 0x9, 0xD, 0x2);
+				GS(2, 6, 0xA, 0xE, 0x4);
+				GS(3, 7, 0xB, 0xF, 0x6);
+				GS(0, 5, 0xA, 0xF, 0x8);
+				GS(1, 6, 0xB, 0xC, 0xA);
+				GS(2, 7, 0x8, 0xD, 0xC);
+				GS(3, 4, 0x9, 0xE, 0xE);
+			}
+
+			((uint8 *)h)[0] ^= ((uint8 *)v)[0] ^ ((uint8 *)v)[1];
 		}
-		else
+
+		for(int i = 0; i < 8; ++i) h[i] = SWAP4(h[i]);
+
+		if(h[7] <= Target)
 		{
-			m[0] = SWAP4(((__global uint *)states)[48]);
-			m[1] = SWAP4(((__global uint *)states)[49]);
-			m[2] = 0x80000000U;
-			
-			for(int i = 3; i < 13; ++i) m[i] = 0x00U;
-			
-			m[13] = 1U;
-			m[14] = 0U;
-			m[15] = 0x640;
-			bitlen += 64;
+			ulong outIdx = atomic_inc(output + 0xFF);
+			if(outIdx < 0xFF)
+				output[outIdx] = BranchBuf[idx] + get_global_offset(0);
 		}
-		
-		((uint16 *)v)[0].lo = ((uint8 *)h)[0];
-		((uint16 *)v)[0].hi = vload8(0U, c_u256);
-		
-		//v[12] ^= (i < 3) ? (i + 1) << 9 : 1600U;
-		//v[13] ^= (i < 3) ? (i + 1) << 9 : 1600U;
-		
-		v[12] ^= bitlen;
-		v[13] ^= bitlen;
-		
-		for(int r = 0; r < 14; r++)
-		{	
-			GS(0, 4, 0x8, 0xC, 0x0);
-			GS(1, 5, 0x9, 0xD, 0x2);
-			GS(2, 6, 0xA, 0xE, 0x4);
-			GS(3, 7, 0xB, 0xF, 0x6);
-			GS(0, 5, 0xA, 0xF, 0x8);
-			GS(1, 6, 0xB, 0xC, 0xA);
-			GS(2, 7, 0x8, 0xD, 0xC);
-			GS(3, 4, 0x9, 0xE, 0xE);
-		}
-		
-		((uint8 *)h)[0] ^= ((uint8 *)v)[0] ^ ((uint8 *)v)[1];
-	}
-	
-	for(int i = 0; i < 8; ++i) h[i] = SWAP4(h[i]);
-	
-	//for(int i = 0; i < 4; ++i) output[i] = ((ulong *)h)[i];
-	if(h[7] <= Target)
-	{
-		ulong outIdx = atomic_inc(output + 0xFF);
-		if(outIdx < 0xFF)
-			output[outIdx] = BranchBuf[idx] + get_global_offset(0);
 	}
 }
 
@@ -832,56 +872,57 @@ __kernel void Groestl(__global ulong *states, __global uint *BranchBuf, __global
 {
 	const uint idx = get_global_id(0) - get_global_offset(0);
 	
-	if(idx >= Threads) return;
-	
-	states += 25 * BranchBuf[idx];
-	
-	ulong State[8];
-	
-	for(int i = 0; i < 7; ++i) State[i] = 0UL;
-	
-	State[7] = 0x0001000000000000UL;
-	
-	for(uint i = 0; i < 4; ++i)
+	// do not use early return here
+	if(idx < Threads)
 	{
-		ulong H[8], M[8];
-		
-		if(i < 3)
+		states += 25 * BranchBuf[idx];
+
+		ulong State[8];
+
+		for(int i = 0; i < 7; ++i) State[i] = 0UL;
+
+		State[7] = 0x0001000000000000UL;
+
+		for(uint i = 0; i < 4; ++i)
 		{
-			((ulong8 *)M)[0] = vload8(i, states);
+			ulong H[8], M[8];
+
+			if(i < 3)
+			{
+				((ulong8 *)M)[0] = vload8(i, states);
+			}
+			else
+			{
+				M[0] = states[24];
+				M[1] = 0x80UL;
+
+				for(int x = 2; x < 7; ++x) M[x] = 0UL;
+
+				M[7] = 0x0400000000000000UL;
+			}
+
+			for(int x = 0; x < 8; ++x) H[x] = M[x] ^ State[x];
+
+			PERM_SMALL_P(H);
+			PERM_SMALL_Q(M);
+
+			for(int x = 0; x < 8; ++x) State[x] ^= H[x] ^ M[x];
 		}
-		else
+
+		ulong tmp[8];
+
+		for(int i = 0; i < 8; ++i) tmp[i] = State[i];
+
+		PERM_SMALL_P(State);
+
+		for(int i = 0; i < 8; ++i) State[i] ^= tmp[i];
+
+		if(as_uint2(State[7]).s1 <= Target)
 		{
-			M[0] = states[24];
-			M[1] = 0x80UL;
-						
-			for(int x = 2; x < 7; ++x) M[x] = 0UL;
-			
-			M[7] = 0x0400000000000000UL;
+			ulong outIdx = atomic_inc(output + 0xFF);
+			if(outIdx < 0xFF)
+				output[outIdx] = BranchBuf[idx] + get_global_offset(0);
 		}
-		
-		for(int x = 0; x < 8; ++x) H[x] = M[x] ^ State[x];
-		
-		PERM_SMALL_P(H);
-		PERM_SMALL_Q(M);
-		
-		for(int x = 0; x < 8; ++x) State[x] ^= H[x] ^ M[x];
-	}
-	
-	ulong tmp[8];
-	
-	for(int i = 0; i < 8; ++i) tmp[i] = State[i];
-	
-	PERM_SMALL_P(State);
-	
-	for(int i = 0; i < 8; ++i) State[i] ^= tmp[i];
-	
-	//for(int i = 0; i < 4; ++i) output[i] = State[i + 4];
-	if(as_uint2(State[7]).s1 <= Target)
-	{
-		ulong outIdx = atomic_inc(output + 0xFF);
-		if(outIdx < 0xFF)
-			output[outIdx] = BranchBuf[idx] + get_global_offset(0);
 	}
 }