diff options
author | psychocrypt <psychocrypt@users.noreply.github.com> | 2017-09-29 20:32:31 +0200 |
---|---|---|
committer | psychocrypt <psychocrypt@users.noreply.github.com> | 2017-09-30 23:46:08 +0200 |
commit | cc429b68fadc502b981fd0acd64a5ff6e2ae1d15 (patch) | |
tree | 3fb23fc4db15dbdd08af4c7ea20134b9d82e58fd /xmrstak/backend/nvidia/nvcc_code/cuda_skein.hpp | |
parent | e5b0319d5a9f58762fa934ad700113908940cb31 (diff) | |
download | xmr-stak-cc429b68fadc502b981fd0acd64a5ff6e2ae1d15.zip xmr-stak-cc429b68fadc502b981fd0acd64a5ff6e2ae1d15.tar.gz |
group files
- move source code to `src`
- categorize files and move to group folder
- change upper case class files to lower case
- change C++ header to `*.hpp`
Diffstat (limited to 'xmrstak/backend/nvidia/nvcc_code/cuda_skein.hpp')
-rw-r--r-- | xmrstak/backend/nvidia/nvcc_code/cuda_skein.hpp | 347 |
1 files changed, 347 insertions, 0 deletions
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_skein.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_skein.hpp new file mode 100644 index 0000000..041a593 --- /dev/null +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_skein.hpp @@ -0,0 +1,347 @@ +#pragma once + +typedef unsigned int uint_t; /* native unsigned integer */ + +#define SKEIN_MODIFIER_WORDS ( 2) /* number of modifier (tweak) words */ + +#define SKEIN_256_STATE_WORDS ( 4) +#define SKEIN_512_STATE_WORDS ( 8) +#define SKEIN1024_STATE_WORDS (16) + +#define SKEIN_256_STATE_BYTES ( 8*SKEIN_256_STATE_WORDS) +#define SKEIN_512_STATE_BYTES ( 8*SKEIN_512_STATE_WORDS) +#define SKEIN1024_STATE_BYTES ( 8*SKEIN1024_STATE_WORDS) + +#define SKEIN_256_STATE_BITS (64*SKEIN_256_STATE_WORDS) +#define SKEIN_512_STATE_BITS (64*SKEIN_512_STATE_WORDS) +#define SKEIN1024_STATE_BITS (64*SKEIN1024_STATE_WORDS) + +#define SKEIN_256_BLOCK_BYTES ( 8*SKEIN_256_STATE_WORDS) +#define SKEIN_512_BLOCK_BYTES ( 8*SKEIN_512_STATE_WORDS) +#define SKEIN1024_BLOCK_BYTES ( 8*SKEIN1024_STATE_WORDS) + +#define SKEIN_MK_64(hi32,lo32) ((lo32) + (((uint64_t) (hi32)) << 32)) +#define SKEIN_KS_PARITY SKEIN_MK_64(0x1BD11BDA,0xA9FC1A22) + +#define SKEIN_T1_BIT(BIT) ((BIT) - 64) /* offset 64 because it's the second word */ + +#define SKEIN_T1_POS_FIRST SKEIN_T1_BIT(126) /* bits 126 : first block flag */ +#define SKEIN_T1_POS_BIT_PAD SKEIN_T1_BIT(119) /* bit 119 : partial final input byte */ +#define SKEIN_T1_POS_FINAL SKEIN_T1_BIT(127) /* bit 127 : final block flag */ +#define SKEIN_T1_POS_BLK_TYPE SKEIN_T1_BIT(120) /* bits 120..125: type field */ + +#define SKEIN_T1_FLAG_FIRST (((uint64_t) 1 ) << SKEIN_T1_POS_FIRST) +#define SKEIN_T1_FLAG_BIT_PAD (((uint64_t) 1 ) << SKEIN_T1_POS_BIT_PAD) +#define SKEIN_T1_FLAG_FINAL (((uint64_t) 1 ) << SKEIN_T1_POS_FINAL) + +#define SKEIN_BLK_TYPE_MSG (48) /* message processing */ +#define SKEIN_BLK_TYPE_OUT (63) /* output stage */ + +#define SKEIN_T1_BLK_TYPE(T) (((uint64_t) (SKEIN_BLK_TYPE_##T)) << SKEIN_T1_POS_BLK_TYPE) + +#define SKEIN_T1_BLK_TYPE_MSG SKEIN_T1_BLK_TYPE(MSG) /* message processing */ +#define SKEIN_T1_BLK_TYPE_OUT SKEIN_T1_BLK_TYPE(OUT) /* output stage */ + +#define SKEIN_T1_BLK_TYPE_OUT_FINAL (SKEIN_T1_BLK_TYPE_OUT | SKEIN_T1_FLAG_FINAL) + +#define Skein_Set_Tweak(ctxPtr,TWK_NUM,tVal) {(ctxPtr)->h.T[TWK_NUM] = (tVal);} + +#define Skein_Set_T0(ctxPtr,T0) Skein_Set_Tweak(ctxPtr,0,T0) +#define Skein_Set_T1(ctxPtr,T1) Skein_Set_Tweak(ctxPtr,1,T1) + +#define Skein_Set_T0_T1(ctxPtr,T0,T1) { \ + Skein_Set_T0(ctxPtr,(T0)); \ + Skein_Set_T1(ctxPtr,(T1)); } + +#define Skein_Start_New_Type(ctxPtr,BLK_TYPE) \ +{ Skein_Set_T0_T1(ctxPtr,0,SKEIN_T1_FLAG_FIRST | SKEIN_T1_BLK_TYPE_##BLK_TYPE); (ctxPtr)->h.bCnt=0; } + +#define Skein_Set_Bit_Pad_Flag(hdr) { (hdr).T[1] |= SKEIN_T1_FLAG_BIT_PAD; } + +#define KW_TWK_BASE (0) +#define KW_KEY_BASE (3) +#define ks (kw + KW_KEY_BASE) +#define ts (kw + KW_TWK_BASE) + +#define R512(p0,p1,p2,p3,p4,p5,p6,p7,R512ROT,rNum) \ + X##p0 += X##p1; X##p1 = ROTL64(X##p1,R512ROT##_0); X##p1 ^= X##p0; \ + X##p2 += X##p3; X##p3 = ROTL64(X##p3,R512ROT##_1); X##p3 ^= X##p2; \ + X##p4 += X##p5; X##p5 = ROTL64(X##p5,R512ROT##_2); X##p5 ^= X##p4; \ + X##p6 += X##p7; X##p7 = ROTL64(X##p7,R512ROT##_3); X##p7 ^= X##p6; + +#define I512(R) \ + X0 += ks[((R)+1) % 9]; \ + X1 += ks[((R)+2) % 9]; \ + X2 += ks[((R)+3) % 9]; \ + X3 += ks[((R)+4) % 9]; \ + X4 += ks[((R)+5) % 9]; \ + X5 += ks[((R)+6) % 9] + ts[((R)+1) % 3]; \ + X6 += ks[((R)+7) % 9] + ts[((R)+2) % 3]; \ + X7 += ks[((R)+8) % 9] + (R)+1; + + +#define R512_8_rounds(R) \ + R512(0,1,2,3,4,5,6,7,R_512_0,8*(R)+ 1); \ + R512(2,1,4,7,6,5,0,3,R_512_1,8*(R)+ 2); \ + R512(4,1,6,3,0,5,2,7,R_512_2,8*(R)+ 3); \ + R512(6,1,0,7,2,5,4,3,R_512_3,8*(R)+ 4); \ + I512(2*(R)); \ + R512(0,1,2,3,4,5,6,7,R_512_4,8*(R)+ 5); \ + R512(2,1,4,7,6,5,0,3,R_512_5,8*(R)+ 6); \ + R512(4,1,6,3,0,5,2,7,R_512_6,8*(R)+ 7); \ + R512(6,1,0,7,2,5,4,3,R_512_7,8*(R)+ 8); \ + I512(2*(R)+1); + +typedef struct +{ + size_t hashBitLen; + size_t bCnt; + uint64_t T[SKEIN_MODIFIER_WORDS]; +} Skein_Ctxt_Hdr_t; + +typedef struct { + Skein_Ctxt_Hdr_t h; + uint64_t X[SKEIN_256_STATE_WORDS]; + uint8_t b[SKEIN_256_BLOCK_BYTES]; +} Skein_256_Ctxt_t; + +typedef struct { + Skein_Ctxt_Hdr_t h; + uint64_t X[SKEIN_512_STATE_WORDS]; + uint8_t b[SKEIN_512_BLOCK_BYTES]; +} Skein_512_Ctxt_t; + +typedef struct { + Skein_Ctxt_Hdr_t h; + uint64_t X[SKEIN1024_STATE_WORDS]; + uint8_t b[SKEIN1024_BLOCK_BYTES]; +} Skein1024_Ctxt_t; + +typedef struct { + uint_t statebits; + union { + Skein_Ctxt_Hdr_t h; + Skein_256_Ctxt_t ctx_256; + Skein_512_Ctxt_t ctx_512; + Skein1024_Ctxt_t ctx1024; + } u; +} skeinHashState; + +__device__ void cn_skein_init(skeinHashState *state, size_t hashBitLen) +{ + const uint64_t SKEIN_512_IV_256[] = + { + SKEIN_MK_64(0xCCD044A1,0x2FDB3E13), + SKEIN_MK_64(0xE8359030,0x1A79A9EB), + SKEIN_MK_64(0x55AEA061,0x4F816E6F), + SKEIN_MK_64(0x2A2767A4,0xAE9B94DB), + SKEIN_MK_64(0xEC06025E,0x74DD7683), + SKEIN_MK_64(0xE7A436CD,0xC4746251), + SKEIN_MK_64(0xC36FBAF9,0x393AD185), + SKEIN_MK_64(0x3EEDBA18,0x33EDFC13) + }; + + Skein_512_Ctxt_t *ctx = &state->u.ctx_512; + + ctx->h.hashBitLen = hashBitLen; + + memcpy(ctx->X, SKEIN_512_IV_256, sizeof(ctx->X)); + + Skein_Start_New_Type(ctx, MSG); +} + +__device__ void cn_skein512_processblock(Skein_512_Ctxt_t * __restrict__ ctx, const uint8_t * __restrict__ blkPtr, size_t blkCnt, size_t byteCntAdd) +{ + enum { + R_512_0_0=46, R_512_0_1=36, R_512_0_2=19, R_512_0_3=37, + R_512_1_0=33, R_512_1_1=27, R_512_1_2=14, R_512_1_3=42, + R_512_2_0=17, R_512_2_1=49, R_512_2_2=36, R_512_2_3=39, + R_512_3_0=44, R_512_3_1= 9, R_512_3_2=54, R_512_3_3=56, + R_512_4_0=39, R_512_4_1=30, R_512_4_2=34, R_512_4_3=24, + R_512_5_0=13, R_512_5_1=50, R_512_5_2=10, R_512_5_3=17, + R_512_6_0=25, R_512_6_1=29, R_512_6_2=39, R_512_6_3=43, + R_512_7_0= 8, R_512_7_1=35, R_512_7_2=56, R_512_7_3=22 + }; + + uint64_t X0,X1,X2,X3,X4,X5,X6,X7; + uint64_t w[SKEIN_512_STATE_WORDS]; + uint64_t kw[SKEIN_512_STATE_WORDS+4]; + + ts[0] = ctx->h.T[0]; + ts[1] = ctx->h.T[1]; + + do + { + + ts[0] += byteCntAdd; + + ks[0] = ctx->X[0]; + ks[1] = ctx->X[1]; + ks[2] = ctx->X[2]; + ks[3] = ctx->X[3]; + ks[4] = ctx->X[4]; + ks[5] = ctx->X[5]; + ks[6] = ctx->X[6]; + ks[7] = ctx->X[7]; + ks[8] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^ + ks[4] ^ ks[5] ^ ks[6] ^ ks[7] ^ SKEIN_KS_PARITY; + + ts[2] = ts[0] ^ ts[1]; + + memcpy(w, blkPtr, SKEIN_512_STATE_WORDS << 3); + + X0 = w[0] + ks[0]; + X1 = w[1] + ks[1]; + X2 = w[2] + ks[2]; + X3 = w[3] + ks[3]; + X4 = w[4] + ks[4]; + X5 = w[5] + ks[5] + ts[0]; + X6 = w[6] + ks[6] + ts[1]; + X7 = w[7] + ks[7]; + + blkPtr += SKEIN_512_BLOCK_BYTES; + + R512_8_rounds( 0); + R512_8_rounds( 1); + R512_8_rounds( 2); + R512_8_rounds( 3); + R512_8_rounds( 4); + R512_8_rounds( 5); + R512_8_rounds( 6); + R512_8_rounds( 7); + R512_8_rounds( 8); + + ctx->X[0] = X0 ^ w[0]; + ctx->X[1] = X1 ^ w[1]; + ctx->X[2] = X2 ^ w[2]; + ctx->X[3] = X3 ^ w[3]; + ctx->X[4] = X4 ^ w[4]; + ctx->X[5] = X5 ^ w[5]; + ctx->X[6] = X6 ^ w[6]; + ctx->X[7] = X7 ^ w[7]; + + ts[1] &= ~SKEIN_T1_FLAG_FIRST; + } + while (--blkCnt); + + ctx->h.T[0] = ts[0]; + ctx->h.T[1] = ts[1]; +} + +__device__ void cn_skein_final(skeinHashState * __restrict__ state, uint8_t * __restrict__ hashVal) +{ + size_t i,n,byteCnt; + uint64_t X[SKEIN_512_STATE_WORDS]; + Skein_512_Ctxt_t *ctx = (Skein_512_Ctxt_t *)&state->u.ctx_512; + //size_t tmp; + //uint8_t *p8; + //uint64_t *p64; + + ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL; + + if (ctx->h.bCnt < SKEIN_512_BLOCK_BYTES) + { + memset(&ctx->b[ctx->h.bCnt],0,SKEIN_512_BLOCK_BYTES - ctx->h.bCnt); + //p8 = &ctx->b[ctx->h.bCnt]; + //tmp = SKEIN_512_BLOCK_BYTES - ctx->h.bCnt; + //for( i = 0; i < tmp; i++ ) *(p8+i) = 0; + } + + cn_skein512_processblock(ctx,ctx->b,1,ctx->h.bCnt); + + byteCnt = (ctx->h.hashBitLen + 7) >> 3; + + //uint8_t b[SKEIN_512_BLOCK_BYTES] == 64 + memset(ctx->b,0,sizeof(ctx->b)); + //p64 = (uint64_t *)ctx->b; + //for( i = 0; i < 8; i++ ) *(p64+i) = 0; + + memcpy(X,ctx->X,sizeof(X)); + + for (i=0;i*SKEIN_512_BLOCK_BYTES < byteCnt;i++) + { + ((uint64_t *)ctx->b)[0]= (uint64_t)i; + Skein_Start_New_Type(ctx,OUT_FINAL); + cn_skein512_processblock(ctx,ctx->b,1,sizeof(uint64_t)); + n = byteCnt - i*SKEIN_512_BLOCK_BYTES; + if (n >= SKEIN_512_BLOCK_BYTES) + n = SKEIN_512_BLOCK_BYTES; + memcpy(hashVal+i*SKEIN_512_BLOCK_BYTES,ctx->X,n); + memcpy(ctx->X,X,sizeof(X)); /* restore the counter mode key for next time */ + } +} + +__device__ void cn_skein512_update(Skein_512_Ctxt_t * __restrict__ ctx, const uint8_t * __restrict__ msg, size_t msgByteCnt) +{ + size_t n; + + if (msgByteCnt + ctx->h.bCnt > SKEIN_512_BLOCK_BYTES) + { + + if (ctx->h.bCnt) + { + + n = SKEIN_512_BLOCK_BYTES - ctx->h.bCnt; + + if (n) + { + memcpy(&ctx->b[ctx->h.bCnt],msg,n); + msgByteCnt -= n; + msg += n; + ctx->h.bCnt += n; + } + + cn_skein512_processblock(ctx,ctx->b,1,SKEIN_512_BLOCK_BYTES); + ctx->h.bCnt = 0; + } + + if (msgByteCnt > SKEIN_512_BLOCK_BYTES) + { + n = (msgByteCnt-1) / SKEIN_512_BLOCK_BYTES; + cn_skein512_processblock(ctx,msg,n,SKEIN_512_BLOCK_BYTES); + msgByteCnt -= n * SKEIN_512_BLOCK_BYTES; + msg += n * SKEIN_512_BLOCK_BYTES; + } + } + + if (msgByteCnt) + { + memcpy(&ctx->b[ctx->h.bCnt],msg,msgByteCnt); + ctx->h.bCnt += msgByteCnt; + } +} + +__device__ void cn_skein_update(skeinHashState * __restrict__ state, const BitSequence * __restrict__ data, DataLength databitlen) +{ + if ((databitlen & 7) == 0) + { + cn_skein512_update(&state->u.ctx_512,data,databitlen >> 3); + } + else + { + + size_t bCnt = (databitlen >> 3) + 1; + uint8_t b,mask; + + mask = (uint8_t) (1u << (7 - (databitlen & 7))); + b = (uint8_t) ((data[bCnt-1] & (0-mask)) | mask); + + cn_skein512_update(&state->u.ctx_512,data,bCnt-1); + cn_skein512_update(&state->u.ctx_512,&b , 1 ); + + Skein_Set_Bit_Pad_Flag(state->u.h); + } +} + +__device__ void cn_skein(const BitSequence * __restrict__ data, DataLength len, BitSequence * __restrict__ hashval) +{ + int hashbitlen = 256; + DataLength databitlen = len << 3; + skeinHashState state; + + state.statebits = 64*SKEIN_512_STATE_WORDS; + + cn_skein_init(&state, hashbitlen); + cn_skein_update(&state, data, databitlen); + cn_skein_final(&state, hashval); +} |