diff options
author | Grzegorz Hasse <grzegorz.hasse@gmail.com> | 2017-11-19 22:43:24 -0800 |
---|---|---|
committer | Grzegorz Hasse <grzegorz.hasse@gmail.com> | 2017-12-06 16:33:41 -0800 |
commit | 74d4b2d1c3e528a39253c8236f681d29580e001d (patch) | |
tree | 0fbcbcd1dba8594120bbd0c0bf78567c2b146fff /xmrstak/backend/cpu/crypto | |
parent | fd94d37aef5808fd33b4958c32139d2554e9cc75 (diff) | |
download | xmr-stak-74d4b2d1c3e528a39253c8236f681d29580e001d.zip xmr-stak-74d4b2d1c3e528a39253c8236f681d29580e001d.tar.gz |
Extend low_power_mode to do up to 5 cn hashes at a time.
The "low_power_mode" option in config.txt can be set to integer values
between 1 and 5. A value of 5 seems optimal on certain processors
with large L4 cache.
Diffstat (limited to 'xmrstak/backend/cpu/crypto')
-rw-r--r-- | xmrstak/backend/cpu/crypto/cryptonight_aesni.h | 322 |
1 files changed, 307 insertions, 15 deletions
diff --git a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h index 2a6a769..9b6e1dc 100644 --- a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h +++ b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h @@ -353,19 +353,19 @@ void cryptonight_hash(const void* input, size_t len, void* output, cryptonight_c // to fit temporary vars for two contexts. Function will read len*2 from input and write 64 bytes to output // We are still limited by L3 cache, so doubling will only work with CPUs where we have more than 2MB to core (Xeons) template<size_t MASK, size_t ITERATIONS, size_t MEM, bool SOFT_AES, bool PREFETCH> -void cryptonight_double_hash(const void* input, size_t len, void* output, cryptonight_ctx* __restrict ctx0, cryptonight_ctx* __restrict ctx1) +void cryptonight_double_hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx) { - keccak((const uint8_t *)input, len, ctx0->hash_state, 200); - keccak((const uint8_t *)input+len, len, ctx1->hash_state, 200); + keccak((const uint8_t *)input, len, ctx[0]->hash_state, 200); + keccak((const uint8_t *)input+len, len, ctx[1]->hash_state, 200); // Optim - 99% time boundary - cn_explode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx0->hash_state, (__m128i*)ctx0->long_state); - cn_explode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx1->hash_state, (__m128i*)ctx1->long_state); + cn_explode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx[0]->hash_state, (__m128i*)ctx[0]->long_state); + cn_explode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx[1]->hash_state, (__m128i*)ctx[1]->long_state); - uint8_t* l0 = ctx0->long_state; - uint64_t* h0 = (uint64_t*)ctx0->hash_state; - uint8_t* l1 = ctx1->long_state; - uint64_t* h1 = (uint64_t*)ctx1->hash_state; + uint8_t* l0 = ctx[0]->long_state; + uint64_t* h0 = (uint64_t*)ctx[0]->hash_state; + uint8_t* l1 = ctx[1]->long_state; + uint64_t* h1 = (uint64_t*)ctx[1]->hash_state; uint64_t axl0 = h0[0] ^ h0[4]; uint64_t axh0 = 
h0[1] ^ h0[5]; @@ -444,13 +444,305 @@ void cryptonight_double_hash(const void* input, size_t len, void* output, crypto } // Optim - 90% time boundary - cn_implode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx0->long_state, (__m128i*)ctx0->hash_state); - cn_implode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx1->long_state, (__m128i*)ctx1->hash_state); + cn_implode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx[0]->long_state, (__m128i*)ctx[0]->hash_state); + cn_implode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx[1]->long_state, (__m128i*)ctx[1]->hash_state); // Optim - 99% time boundary - keccakf((uint64_t*)ctx0->hash_state, 24); - extra_hashes[ctx0->hash_state[0] & 3](ctx0->hash_state, 200, (char*)output); - keccakf((uint64_t*)ctx1->hash_state, 24); - extra_hashes[ctx1->hash_state[0] & 3](ctx1->hash_state, 200, (char*)output + 32); + keccakf((uint64_t*)ctx[0]->hash_state, 24); + extra_hashes[ctx[0]->hash_state[0] & 3](ctx[0]->hash_state, 200, (char*)output); + keccakf((uint64_t*)ctx[1]->hash_state, 24); + extra_hashes[ctx[1]->hash_state[0] & 3](ctx[1]->hash_state, 200, (char*)output + 32); +} + +#define CN_STEP1(a, b, c, l, ptr, idx) \ + a = _mm_xor_si128(a, c); \ + idx = _mm_cvtsi128_si64(a); \ + ptr = (__m128i *)&l[idx & MASK]; \ + if(PREFETCH) \ + _mm_prefetch((const char*)ptr, _MM_HINT_T0); \ + c = _mm_load_si128(ptr) + +#define CN_STEP2(a, b, c, l, ptr, idx) \ + if(SOFT_AES) \ + c = soft_aesenc(c, a); \ + else \ + c = _mm_aesenc_si128(c, a); \ + b = _mm_xor_si128(b, c); \ + _mm_store_si128(ptr, b) + +#define CN_STEP3(a, b, c, l, ptr, idx) \ + idx = _mm_cvtsi128_si64(c); \ + ptr = (__m128i *)&l[idx & MASK]; \ + if(PREFETCH) \ + _mm_prefetch((const char*)ptr, _MM_HINT_T0); \ + b = _mm_load_si128(ptr) + +#define CN_STEP4(a, b, c, l, ptr, idx) \ + lo = _umul128(idx, _mm_cvtsi128_si64(b), &hi); \ + a = _mm_add_epi64(a, _mm_set_epi64x(lo, hi)); \ + _mm_store_si128(ptr, a) + +// This lovelier creation will do 3 cn hashes at a time. 
+template<size_t MASK, size_t ITERATIONS, size_t MEM, bool SOFT_AES, bool PREFETCH> +void cryptonight_triple_hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx) +{ + for (size_t i = 0; i < 3; i++) + { + keccak((const uint8_t *)input + len * i, len, ctx[i]->hash_state, 200); + cn_explode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx[i]->hash_state, (__m128i*)ctx[i]->long_state); + } + + uint8_t* l0 = ctx[0]->long_state; + uint64_t* h0 = (uint64_t*)ctx[0]->hash_state; + uint8_t* l1 = ctx[1]->long_state; + uint64_t* h1 = (uint64_t*)ctx[1]->hash_state; + uint8_t* l2 = ctx[2]->long_state; + uint64_t* h2 = (uint64_t*)ctx[2]->hash_state; + + __m128i ax0 = _mm_set_epi64x(h0[1] ^ h0[5], h0[0] ^ h0[4]); + __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); + __m128i ax1 = _mm_set_epi64x(h1[1] ^ h1[5], h1[0] ^ h1[4]); + __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); + __m128i ax2 = _mm_set_epi64x(h2[1] ^ h2[5], h2[0] ^ h2[4]); + __m128i bx2 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]); + __m128i cx0 = _mm_set_epi64x(0, 0); + __m128i cx1 = _mm_set_epi64x(0, 0); + __m128i cx2 = _mm_set_epi64x(0, 0); + + for (size_t i = 0; i < ITERATIONS/2; i++) + { + uint64_t idx0, idx1, idx2, hi, lo; + __m128i *ptr0, *ptr1, *ptr2; + + // EVEN ROUND + CN_STEP1(ax0, bx0, cx0, l0, ptr0, idx0); + CN_STEP1(ax1, bx1, cx1, l1, ptr1, idx1); + CN_STEP1(ax2, bx2, cx2, l2, ptr2, idx2); + + CN_STEP2(ax0, bx0, cx0, l0, ptr0, idx0); + CN_STEP2(ax1, bx1, cx1, l1, ptr1, idx1); + CN_STEP2(ax2, bx2, cx2, l2, ptr2, idx2); + + CN_STEP3(ax0, bx0, cx0, l0, ptr0, idx0); + CN_STEP3(ax1, bx1, cx1, l1, ptr1, idx1); + CN_STEP3(ax2, bx2, cx2, l2, ptr2, idx2); + + CN_STEP4(ax0, bx0, cx0, l0, ptr0, idx0); + CN_STEP4(ax1, bx1, cx1, l1, ptr1, idx1); + CN_STEP4(ax2, bx2, cx2, l2, ptr2, idx2); + + // ODD ROUND + CN_STEP1(ax0, cx0, bx0, l0, ptr0, idx0); + CN_STEP1(ax1, cx1, bx1, l1, ptr1, idx1); + CN_STEP1(ax2, cx2, bx2, l2, ptr2, idx2); + + CN_STEP2(ax0, cx0, bx0, l0, ptr0, 
idx0); + CN_STEP2(ax1, cx1, bx1, l1, ptr1, idx1); + CN_STEP2(ax2, cx2, bx2, l2, ptr2, idx2); + + CN_STEP3(ax0, cx0, bx0, l0, ptr0, idx0); + CN_STEP3(ax1, cx1, bx1, l1, ptr1, idx1); + CN_STEP3(ax2, cx2, bx2, l2, ptr2, idx2); + + CN_STEP4(ax0, cx0, bx0, l0, ptr0, idx0); + CN_STEP4(ax1, cx1, bx1, l1, ptr1, idx1); + CN_STEP4(ax2, cx2, bx2, l2, ptr2, idx2); + } + + for (size_t i = 0; i < 3; i++) + { + cn_implode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx[i]->long_state, (__m128i*)ctx[i]->hash_state); + keccakf((uint64_t*)ctx[i]->hash_state, 24); + extra_hashes[ctx[i]->hash_state[0] & 3](ctx[i]->hash_state, 200, (char*)output + 32 * i); + } +} + +// This even lovelier creation will do 4 cn hashes at a time. +template<size_t MASK, size_t ITERATIONS, size_t MEM, bool SOFT_AES, bool PREFETCH> +void cryptonight_quad_hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx) +{ + for (size_t i = 0; i < 4; i++) + { + keccak((const uint8_t *)input + len * i, len, ctx[i]->hash_state, 200); + cn_explode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx[i]->hash_state, (__m128i*)ctx[i]->long_state); + } + + uint8_t* l0 = ctx[0]->long_state; + uint64_t* h0 = (uint64_t*)ctx[0]->hash_state; + uint8_t* l1 = ctx[1]->long_state; + uint64_t* h1 = (uint64_t*)ctx[1]->hash_state; + uint8_t* l2 = ctx[2]->long_state; + uint64_t* h2 = (uint64_t*)ctx[2]->hash_state; + uint8_t* l3 = ctx[3]->long_state; + uint64_t* h3 = (uint64_t*)ctx[3]->hash_state; + + __m128i ax0 = _mm_set_epi64x(h0[1] ^ h0[5], h0[0] ^ h0[4]); + __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); + __m128i ax1 = _mm_set_epi64x(h1[1] ^ h1[5], h1[0] ^ h1[4]); + __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); + __m128i ax2 = _mm_set_epi64x(h2[1] ^ h2[5], h2[0] ^ h2[4]); + __m128i bx2 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]); + __m128i ax3 = _mm_set_epi64x(h3[1] ^ h3[5], h3[0] ^ h3[4]); + __m128i bx3 = _mm_set_epi64x(h3[3] ^ h3[7], h3[2] ^ h3[6]); + __m128i cx0 = _mm_set_epi64x(0, 0); 
+ __m128i cx1 = _mm_set_epi64x(0, 0); + __m128i cx2 = _mm_set_epi64x(0, 0); + __m128i cx3 = _mm_set_epi64x(0, 0); + + for (size_t i = 0; i < ITERATIONS/2; i++) + { + uint64_t idx0, idx1, idx2, idx3, hi, lo; + __m128i *ptr0, *ptr1, *ptr2, *ptr3; + + // EVEN ROUND + CN_STEP1(ax0, bx0, cx0, l0, ptr0, idx0); + CN_STEP1(ax1, bx1, cx1, l1, ptr1, idx1); + CN_STEP1(ax2, bx2, cx2, l2, ptr2, idx2); + CN_STEP1(ax3, bx3, cx3, l3, ptr3, idx3); + + CN_STEP2(ax0, bx0, cx0, l0, ptr0, idx0); + CN_STEP2(ax1, bx1, cx1, l1, ptr1, idx1); + CN_STEP2(ax2, bx2, cx2, l2, ptr2, idx2); + CN_STEP2(ax3, bx3, cx3, l3, ptr3, idx3); + + CN_STEP3(ax0, bx0, cx0, l0, ptr0, idx0); + CN_STEP3(ax1, bx1, cx1, l1, ptr1, idx1); + CN_STEP3(ax2, bx2, cx2, l2, ptr2, idx2); + CN_STEP3(ax3, bx3, cx3, l3, ptr3, idx3); + + CN_STEP4(ax0, bx0, cx0, l0, ptr0, idx0); + CN_STEP4(ax1, bx1, cx1, l1, ptr1, idx1); + CN_STEP4(ax2, bx2, cx2, l2, ptr2, idx2); + CN_STEP4(ax3, bx3, cx3, l3, ptr3, idx3); + + // ODD ROUND + CN_STEP1(ax0, cx0, bx0, l0, ptr0, idx0); + CN_STEP1(ax1, cx1, bx1, l1, ptr1, idx1); + CN_STEP1(ax2, cx2, bx2, l2, ptr2, idx2); + CN_STEP1(ax3, cx3, bx3, l3, ptr3, idx3); + + CN_STEP2(ax0, cx0, bx0, l0, ptr0, idx0); + CN_STEP2(ax1, cx1, bx1, l1, ptr1, idx1); + CN_STEP2(ax2, cx2, bx2, l2, ptr2, idx2); + CN_STEP2(ax3, cx3, bx3, l3, ptr3, idx3); + + CN_STEP3(ax0, cx0, bx0, l0, ptr0, idx0); + CN_STEP3(ax1, cx1, bx1, l1, ptr1, idx1); + CN_STEP3(ax2, cx2, bx2, l2, ptr2, idx2); + CN_STEP3(ax3, cx3, bx3, l3, ptr3, idx3); + + CN_STEP4(ax0, cx0, bx0, l0, ptr0, idx0); + CN_STEP4(ax1, cx1, bx1, l1, ptr1, idx1); + CN_STEP4(ax2, cx2, bx2, l2, ptr2, idx2); + CN_STEP4(ax3, cx3, bx3, l3, ptr3, idx3); + } + + for (size_t i = 0; i < 4; i++) + { + cn_implode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx[i]->long_state, (__m128i*)ctx[i]->hash_state); + keccakf((uint64_t*)ctx[i]->hash_state, 24); + extra_hashes[ctx[i]->hash_state[0] & 3](ctx[i]->hash_state, 200, (char*)output + 32 * i); + } +} + +// This most lovely creation 
will do 5 cn hashes at a time. +template<size_t MASK, size_t ITERATIONS, size_t MEM, bool SOFT_AES, bool PREFETCH> +void cryptonight_penta_hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx) +{ + for (size_t i = 0; i < 5; i++) + { + keccak((const uint8_t *)input + len * i, len, ctx[i]->hash_state, 200); + cn_explode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx[i]->hash_state, (__m128i*)ctx[i]->long_state); + } + + uint8_t* l0 = ctx[0]->long_state; + uint64_t* h0 = (uint64_t*)ctx[0]->hash_state; + uint8_t* l1 = ctx[1]->long_state; + uint64_t* h1 = (uint64_t*)ctx[1]->hash_state; + uint8_t* l2 = ctx[2]->long_state; + uint64_t* h2 = (uint64_t*)ctx[2]->hash_state; + uint8_t* l3 = ctx[3]->long_state; + uint64_t* h3 = (uint64_t*)ctx[3]->hash_state; + uint8_t* l4 = ctx[4]->long_state; + uint64_t* h4 = (uint64_t*)ctx[4]->hash_state; + + __m128i ax0 = _mm_set_epi64x(h0[1] ^ h0[5], h0[0] ^ h0[4]); + __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); + __m128i ax1 = _mm_set_epi64x(h1[1] ^ h1[5], h1[0] ^ h1[4]); + __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); + __m128i ax2 = _mm_set_epi64x(h2[1] ^ h2[5], h2[0] ^ h2[4]); + __m128i bx2 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]); + __m128i ax3 = _mm_set_epi64x(h3[1] ^ h3[5], h3[0] ^ h3[4]); + __m128i bx3 = _mm_set_epi64x(h3[3] ^ h3[7], h3[2] ^ h3[6]); + __m128i ax4 = _mm_set_epi64x(h4[1] ^ h4[5], h4[0] ^ h4[4]); + __m128i bx4 = _mm_set_epi64x(h4[3] ^ h4[7], h4[2] ^ h4[6]); + __m128i cx0 = _mm_set_epi64x(0, 0); + __m128i cx1 = _mm_set_epi64x(0, 0); + __m128i cx2 = _mm_set_epi64x(0, 0); + __m128i cx3 = _mm_set_epi64x(0, 0); + __m128i cx4 = _mm_set_epi64x(0, 0); + + for (size_t i = 0; i < ITERATIONS/2; i++) + { + uint64_t idx0, idx1, idx2, idx3, idx4, hi, lo; + __m128i *ptr0, *ptr1, *ptr2, *ptr3, *ptr4; + + // EVEN ROUND + CN_STEP1(ax0, bx0, cx0, l0, ptr0, idx0); + CN_STEP1(ax1, bx1, cx1, l1, ptr1, idx1); + CN_STEP1(ax2, bx2, cx2, l2, ptr2, idx2); + CN_STEP1(ax3, bx3, cx3, l3, 
ptr3, idx3); + CN_STEP1(ax4, bx4, cx4, l4, ptr4, idx4); + + CN_STEP2(ax0, bx0, cx0, l0, ptr0, idx0); + CN_STEP2(ax1, bx1, cx1, l1, ptr1, idx1); + CN_STEP2(ax2, bx2, cx2, l2, ptr2, idx2); + CN_STEP2(ax3, bx3, cx3, l3, ptr3, idx3); + CN_STEP2(ax4, bx4, cx4, l4, ptr4, idx4); + + CN_STEP3(ax0, bx0, cx0, l0, ptr0, idx0); + CN_STEP3(ax1, bx1, cx1, l1, ptr1, idx1); + CN_STEP3(ax2, bx2, cx2, l2, ptr2, idx2); + CN_STEP3(ax3, bx3, cx3, l3, ptr3, idx3); + CN_STEP3(ax4, bx4, cx4, l4, ptr4, idx4); + + CN_STEP4(ax0, bx0, cx0, l0, ptr0, idx0); + CN_STEP4(ax1, bx1, cx1, l1, ptr1, idx1); + CN_STEP4(ax2, bx2, cx2, l2, ptr2, idx2); + CN_STEP4(ax3, bx3, cx3, l3, ptr3, idx3); + CN_STEP4(ax4, bx4, cx4, l4, ptr4, idx4); + + // ODD ROUND + CN_STEP1(ax0, cx0, bx0, l0, ptr0, idx0); + CN_STEP1(ax1, cx1, bx1, l1, ptr1, idx1); + CN_STEP1(ax2, cx2, bx2, l2, ptr2, idx2); + CN_STEP1(ax3, cx3, bx3, l3, ptr3, idx3); + CN_STEP1(ax4, cx4, bx4, l4, ptr4, idx4); + + CN_STEP2(ax0, cx0, bx0, l0, ptr0, idx0); + CN_STEP2(ax1, cx1, bx1, l1, ptr1, idx1); + CN_STEP2(ax2, cx2, bx2, l2, ptr2, idx2); + CN_STEP2(ax3, cx3, bx3, l3, ptr3, idx3); + CN_STEP2(ax4, cx4, bx4, l4, ptr4, idx4); + + CN_STEP3(ax0, cx0, bx0, l0, ptr0, idx0); + CN_STEP3(ax1, cx1, bx1, l1, ptr1, idx1); + CN_STEP3(ax2, cx2, bx2, l2, ptr2, idx2); + CN_STEP3(ax3, cx3, bx3, l3, ptr3, idx3); + CN_STEP3(ax4, cx4, bx4, l4, ptr4, idx4); + + CN_STEP4(ax0, cx0, bx0, l0, ptr0, idx0); + CN_STEP4(ax1, cx1, bx1, l1, ptr1, idx1); + CN_STEP4(ax2, cx2, bx2, l2, ptr2, idx2); + CN_STEP4(ax3, cx3, bx3, l3, ptr3, idx3); + CN_STEP4(ax4, cx4, bx4, l4, ptr4, idx4); + } + + for (size_t i = 0; i < 5; i++) + { + cn_implode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx[i]->long_state, (__m128i*)ctx[i]->hash_state); + keccakf((uint64_t*)ctx[i]->hash_state, 24); + extra_hashes[ctx[i]->hash_state[0] & 3](ctx[i]->hash_state, 200, (char*)output + 32 * i); + } } |