summaryrefslogtreecommitdiffstats
path: root/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
diff options
context:
space:
mode:
Diffstat (limited to 'xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl')
-rw-r--r--xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl112
1 files changed, 56 insertions, 56 deletions
diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
index 9e2f03c..c925c87 100644
--- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
+++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
@@ -89,7 +89,7 @@ XMRSTAK_INCLUDE_BLAKE256
//#include "opencl/groestl256.cl"
XMRSTAK_INCLUDE_GROESTL256
-static const __constant ulong keccakf_rndc[24] =
+static const __constant ulong keccakf_rndc[24] =
{
0x0000000000000001, 0x0000000000008082, 0x800000000000808a,
0x8000000080008000, 0x000000000000808b, 0x0000000080000001,
@@ -101,7 +101,7 @@ static const __constant ulong keccakf_rndc[24] =
0x8000000000008080, 0x0000000080000001, 0x8000000080008008
};
-static const __constant uchar sbox[256] =
+static const __constant uchar sbox[256] =
{
0x63, 0x7C, 0x77, 0x7B, 0xF2, 0x6B, 0x6F, 0xC5, 0x30, 0x01, 0x67, 0x2B, 0xFE, 0xD7, 0xAB, 0x76,
0xCA, 0x82, 0xC9, 0x7D, 0xFA, 0x59, 0x47, 0xF0, 0xAD, 0xD4, 0xA2, 0xAF, 0x9C, 0xA4, 0x72, 0xC0,
@@ -124,7 +124,7 @@ static const __constant uchar sbox[256] =
void keccakf1600(ulong *s)
{
- for(int i = 0; i < 24; ++i)
+ for(int i = 0; i < 24; ++i)
{
ulong bc[5], tmp1, tmp2;
bc[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20] ^ rotate(s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22], 1UL);
@@ -132,9 +132,9 @@ void keccakf1600(ulong *s)
bc[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22] ^ rotate(s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24], 1UL);
bc[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23] ^ rotate(s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20], 1UL);
bc[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24] ^ rotate(s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21], 1UL);
-
+
tmp1 = s[1] ^ bc[0];
-
+
s[0] ^= bc[4];
s[1] = rotate(s[6] ^ bc[0], 44UL);
s[6] = rotate(s[9] ^ bc[3], 20UL);
@@ -160,7 +160,7 @@ void keccakf1600(ulong *s)
s[11] = rotate(s[7] ^ bc[1], 6UL);
s[7] = rotate(s[10] ^ bc[4], 3UL);
s[10] = rotate(tmp1, 1UL);
-
+
tmp1 = s[0]; tmp2 = s[1]; s[0] = bitselect(s[0] ^ s[2], s[0], s[1]); s[1] = bitselect(s[1] ^ s[3], s[1], s[2]); s[2] = bitselect(s[2] ^ s[4], s[2], s[3]); s[3] = bitselect(s[3] ^ tmp1, s[3], s[4]); s[4] = bitselect(s[4] ^ tmp2, s[4], tmp1);
tmp1 = s[5]; tmp2 = s[6]; s[5] = bitselect(s[5] ^ s[7], s[5], s[6]); s[6] = bitselect(s[6] ^ s[8], s[6], s[7]); s[7] = bitselect(s[7] ^ s[9], s[7], s[8]); s[8] = bitselect(s[8] ^ tmp1, s[8], s[9]); s[9] = bitselect(s[9] ^ tmp2, s[9], tmp1);
tmp1 = s[10]; tmp2 = s[11]; s[10] = bitselect(s[10] ^ s[12], s[10], s[11]); s[11] = bitselect(s[11] ^ s[13], s[11], s[12]); s[12] = bitselect(s[12] ^ s[14], s[12], s[13]); s[13] = bitselect(s[13] ^ tmp1, s[13], s[14]); s[14] = bitselect(s[14] ^ tmp2, s[14], tmp1);
@@ -170,23 +170,23 @@ void keccakf1600(ulong *s)
}
}
-static const __constant uint keccakf_rotc[24] =
+static const __constant uint keccakf_rotc[24] =
{
- 1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 2, 14,
+ 1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 2, 14,
27, 41, 56, 8, 25, 43, 62, 18, 39, 61, 20, 44
};
-static const __constant uint keccakf_piln[24] =
+static const __constant uint keccakf_piln[24] =
{
- 10, 7, 11, 17, 18, 3, 5, 16, 8, 21, 24, 4,
- 15, 23, 19, 13, 12, 2, 20, 14, 22, 9, 6, 1
+ 10, 7, 11, 17, 18, 3, 5, 16, 8, 21, 24, 4,
+ 15, 23, 19, 13, 12, 2, 20, 14, 22, 9, 6, 1
};
void keccakf1600_1(ulong *st)
{
int i, round;
ulong t, bc[5];
-
+
#pragma unroll 1
for(round = 0; round < 24; ++round)
{
@@ -197,7 +197,7 @@ void keccakf1600_1(ulong *st)
bc[2] = st[2] ^ st[7] ^ st[12] ^ st[17] ^ st[22];
bc[3] = st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23];
bc[4] = st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24];
-
+
#pragma unroll 1
for (i = 0; i < 5; ++i) {
t = bc[(i + 4) % 5] ^ rotate(bc[(i + 1) % 5], 1UL);
@@ -222,20 +222,20 @@ void keccakf1600_1(ulong *st)
//tmp1 = st[10]; tmp2 = st[11]; st[10] = bitselect(st[10] ^ st[12], st[10], st[11]); st[11] = bitselect(st[11] ^ st[13], st[11], st[12]); st[12] = bitselect(st[12] ^ st[14], st[12], st[13]); st[13] = bitselect(st[13] ^ tmp1, st[13], st[14]); st[14] = bitselect(st[14] ^ tmp2, st[14], tmp1);
//tmp1 = st[15]; tmp2 = st[16]; st[15] = bitselect(st[15] ^ st[17], st[15], st[16]); st[16] = bitselect(st[16] ^ st[18], st[16], st[17]); st[17] = bitselect(st[17] ^ st[19], st[17], st[18]); st[18] = bitselect(st[18] ^ tmp1, st[18], st[19]); st[19] = bitselect(st[19] ^ tmp2, st[19], tmp1);
//tmp1 = st[20]; tmp2 = st[21]; st[20] = bitselect(st[20] ^ st[22], st[20], st[21]); st[21] = bitselect(st[21] ^ st[23], st[21], st[22]); st[22] = bitselect(st[22] ^ st[24], st[22], st[23]); st[23] = bitselect(st[23] ^ tmp1, st[23], st[24]); st[24] = bitselect(st[24] ^ tmp2, st[24], tmp1);
-
+
#pragma unroll 1
for(int i = 0; i < 25; i += 5)
- {
+ {
ulong tmp[5];
-
+
#pragma unroll 1
for(int x = 0; x < 5; ++x)
tmp[x] = bitselect(st[i + x] ^ st[i + ((x + 2) % 5)], st[i + x], st[i + ((x + 1) % 5)]);
-
+
#pragma unroll 1
for(int x = 0; x < 5; ++x) st[i + x] = tmp[x];
}
-
+
// Iota
st[0] ^= keccakf_rndc[round];
}
@@ -246,7 +246,7 @@ void keccakf1600_2(ulong *st)
{
int i, round;
ulong t, bc[5];
-
+
#pragma unroll 1
for(round = 0; round < 24; ++round)
{
@@ -257,7 +257,7 @@ void keccakf1600_2(ulong *st)
//bc[2] = st[2] ^ st[7] ^ st[12] ^ st[17] ^ st[22];
//bc[3] = st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23];
//bc[4] = st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24];
-
+
/*
#pragma unroll
for (i = 0; i < 5; ++i) {
@@ -269,43 +269,43 @@ void keccakf1600_2(ulong *st)
st[i + 20] ^= t;
}
*/
-
+
bc[0] = st[0] ^ st[5] ^ st[10] ^ st[15] ^ st[20] ^ rotate(st[2] ^ st[7] ^ st[12] ^ st[17] ^ st[22], 1UL);
bc[1] = st[1] ^ st[6] ^ st[11] ^ st[16] ^ st[21] ^ rotate(st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23], 1UL);
bc[2] = st[2] ^ st[7] ^ st[12] ^ st[17] ^ st[22] ^ rotate(st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24], 1UL);
bc[3] = st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23] ^ rotate(st[0] ^ st[5] ^ st[10] ^ st[15] ^ st[20], 1UL);
bc[4] = st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24] ^ rotate(st[1] ^ st[6] ^ st[11] ^ st[16] ^ st[21], 1UL);
-
+
st[0] ^= bc[4];
st[5] ^= bc[4];
st[10] ^= bc[4];
st[15] ^= bc[4];
st[20] ^= bc[4];
-
+
st[1] ^= bc[0];
st[6] ^= bc[0];
st[11] ^= bc[0];
st[16] ^= bc[0];
st[21] ^= bc[0];
-
+
st[2] ^= bc[1];
st[7] ^= bc[1];
st[12] ^= bc[1];
st[17] ^= bc[1];
st[22] ^= bc[1];
-
+
st[3] ^= bc[2];
st[8] ^= bc[2];
st[13] ^= bc[2];
st[18] ^= bc[2];
st[23] ^= bc[2];
-
+
st[4] ^= bc[3];
st[9] ^= bc[3];
st[14] ^= bc[3];
st[19] ^= bc[3];
st[24] ^= bc[3];
-
+
// Rho Pi
t = st[1];
#pragma unroll
@@ -314,11 +314,11 @@ void keccakf1600_2(ulong *st)
st[keccakf_piln[i]] = rotate(t, (ulong)keccakf_rotc[i]);
t = bc[0];
}
-
-
-
+
+
+
/*ulong tmp1 = st[1] ^ bc[0];
-
+
st[0] ^= bc[4];
st[1] = rotate(st[6] ^ bc[0], 44UL);
st[6] = rotate(st[9] ^ bc[3], 20UL);
@@ -345,26 +345,26 @@ void keccakf1600_2(ulong *st)
st[7] = rotate(st[10] ^ bc[4], 3UL);
st[10] = rotate(tmp1, 1UL);
*/
-
-
+
+
//ulong tmp1 = st[0]; ulong tmp2 = st[1]; st[0] = bitselect(st[0] ^ st[2], st[0], st[1]); st[1] = bitselect(st[1] ^ st[3], st[1], st[2]); st[2] = bitselect(st[2] ^ st[4], st[2], st[3]); st[3] = bitselect(st[3] ^ tmp1, st[3], st[4]); st[4] = bitselect(st[4] ^ tmp2, st[4], tmp1);
//tmp1 = st[5]; tmp2 = st[6]; st[5] = bitselect(st[5] ^ st[7], st[5], st[6]); st[6] = bitselect(st[6] ^ st[8], st[6], st[7]); st[7] = bitselect(st[7] ^ st[9], st[7], st[8]); st[8] = bitselect(st[8] ^ tmp1, st[8], st[9]); st[9] = bitselect(st[9] ^ tmp2, st[9], tmp1);
//tmp1 = st[10]; tmp2 = st[11]; st[10] = bitselect(st[10] ^ st[12], st[10], st[11]); st[11] = bitselect(st[11] ^ st[13], st[11], st[12]); st[12] = bitselect(st[12] ^ st[14], st[12], st[13]); st[13] = bitselect(st[13] ^ tmp1, st[13], st[14]); st[14] = bitselect(st[14] ^ tmp2, st[14], tmp1);
//tmp1 = st[15]; tmp2 = st[16]; st[15] = bitselect(st[15] ^ st[17], st[15], st[16]); st[16] = bitselect(st[16] ^ st[18], st[16], st[17]); st[17] = bitselect(st[17] ^ st[19], st[17], st[18]); st[18] = bitselect(st[18] ^ tmp1, st[18], st[19]); st[19] = bitselect(st[19] ^ tmp2, st[19], tmp1);
//tmp1 = st[20]; tmp2 = st[21]; st[20] = bitselect(st[20] ^ st[22], st[20], st[21]); st[21] = bitselect(st[21] ^ st[23], st[21], st[22]); st[22] = bitselect(st[22] ^ st[24], st[22], st[23]); st[23] = bitselect(st[23] ^ tmp1, st[23], st[24]); st[24] = bitselect(st[24] ^ tmp2, st[24], tmp1);
-
+
#pragma unroll
for(int i = 0; i < 25; i += 5)
{
ulong tmp1 = st[i], tmp2 = st[i + 1];
-
+
st[i] = bitselect(st[i] ^ st[i + 2], st[i], st[i + 1]);
st[i + 1] = bitselect(st[i + 1] ^ st[i + 3], st[i + 1], st[i + 2]);
st[i + 2] = bitselect(st[i + 2] ^ st[i + 4], st[i + 2], st[i + 3]);
st[i + 3] = bitselect(st[i + 3] ^ tmp1, st[i + 3], st[i + 4]);
st[i + 4] = bitselect(st[i + 4] ^ tmp2, st[i + 4], tmp1);
}
-
+
// Iota
st[0] ^= keccakf_rndc[round];
}
@@ -376,22 +376,22 @@ R"===(
void CNKeccak(ulong *output, ulong *input)
{
ulong st[25];
-
+
// Copy 72 bytes
for(int i = 0; i < 9; ++i) st[i] = input[i];
-
+
// Last four and '1' bit for padding
//st[9] = as_ulong((uint2)(((uint *)input)[18], 0x00000001U));
-
+
st[9] = (input[9] & 0x00000000FFFFFFFFUL) | 0x0000000100000000UL;
-
+
for(int i = 10; i < 25; ++i) st[i] = 0x00UL;
-
+
// Last bit of padding
st[16] = 0x8000000000000000UL;
-
+
keccakf1600_1(st);
-
+
for(int i = 0; i < 25; ++i) output[i] = st[i];
}
@@ -408,7 +408,7 @@ void AESExpandKey256(uint *keybuf)
{
// For 256-bit keys, an sbox permutation is done every other 4th uint generated, AND every 8th
uint t = ((!(c & 7)) || ((c & 7) == 4)) ? SubWord(keybuf[c - 1]) : keybuf[c - 1];
-
+
// If the uint we're generating has an index that is a multiple of 8, rotate and XOR with the round constant,
// then XOR this with previously generated uint. If it's 4 after a multiple of 8, only the sbox permutation
// is done, followed by the XOR. If neither are true, only the XOR with the previously generated uint is done.
@@ -434,7 +434,7 @@ inline ulong getIdx()
}
#define mix_and_propagate(xin) (xin)[(get_local_id(1)) % 8][get_local_id(0)] ^ (xin)[(get_local_id(1) + 1) % 8][get_local_id(0)]
-
+
#define JOIN_DO(x,y) x##y
#define JOIN(x,y) JOIN_DO(x,y)
@@ -460,7 +460,7 @@ __kernel void JOIN(cn0,ALGO)(__global ulong *input, __global uint4 *Scratchpad,
}
barrier(CLK_LOCAL_MEM_FENCE);
-
+
#if(COMP_MODE==1)
// do not use early return here
if(gIdx < Threads)
@@ -512,12 +512,12 @@ __kernel void JOIN(cn0,ALGO)(__global ulong *input, __global uint4 *Scratchpad,
}
mem_fence(CLK_LOCAL_MEM_FENCE);
-
+
// cryptonight_heavy
#if (ALGO == 4)
__local uint4 xin[8][WORKSIZE];
- /* Also left over threads performe this loop.
+ /* Also left over threads perform this loop.
* The left over thread results will be ignored
*/
for(size_t i=0; i < 16; i++)
@@ -688,7 +688,7 @@ __kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states
uint ExpandedKey2[40];
ulong State[25];
uint4 text;
-
+
const ulong gIdx = getIdx();
for(int i = get_local_id(1) * WORKSIZE + get_local_id(0);
@@ -776,7 +776,7 @@ __kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states
barrier(CLK_LOCAL_MEM_FENCE);
text = mix_and_propagate(xin);
}
-
+
#else
#pragma unroll 2
for(int i = 0; i < (MEMORY >> 7); ++i)
@@ -852,7 +852,7 @@ R"===(
__kernel void Skein(__global ulong *states, __global uint *BranchBuf, __global uint *output, ulong Target, ulong Threads)
{
const ulong idx = get_global_id(0) - get_global_offset(0);
-
+
// do not use early return here
if(idx < Threads)
{
@@ -904,7 +904,7 @@ __kernel void Skein(__global ulong *states, __global uint *BranchBuf, __global u
output[outIdx] = BranchBuf[idx] + get_global_offset(0);
}
}
- mem_fence(CLK_GLOBAL_MEM_FENCE);
+ mem_fence(CLK_GLOBAL_MEM_FENCE);
}
#define SWAP8(x) as_ulong(as_uchar8(x).s76543210)
@@ -933,7 +933,7 @@ __kernel void Skein(__global ulong *states, __global uint *BranchBuf, __global u
__kernel void JH(__global ulong *states, __global uint *BranchBuf, __global uint *output, ulong Target, ulong Threads)
{
const uint idx = get_global_id(0) - get_global_offset(0);
-
+
// do not use early return here
if(idx < Threads)
{
@@ -987,12 +987,12 @@ __kernel void JH(__global ulong *states, __global uint *BranchBuf, __global uint
__kernel void Blake(__global ulong *states, __global uint *BranchBuf, __global uint *output, ulong Target, ulong Threads)
{
const uint idx = get_global_id(0) - get_global_offset(0);
-
+
// do not use early return here
if(idx < Threads)
{
states += 25 * BranchBuf[idx];
-
+
unsigned int m[16];
unsigned int v[16];
uint h[8];
@@ -1063,7 +1063,7 @@ __kernel void Blake(__global ulong *states, __global uint *BranchBuf, __global u
__kernel void Groestl(__global ulong *states, __global uint *BranchBuf, __global uint *output, ulong Target, ulong Threads)
{
const uint idx = get_global_id(0) - get_global_offset(0);
-
+
// do not use early return here
if(idx < Threads)
{
OpenPOWER on IntegriCloud