author     Grzegorz Hasse <grzegorz.hasse@gmail.com>  2017-11-19 22:43:24 -0800
committer  Grzegorz Hasse <grzegorz.hasse@gmail.com>  2017-12-06 16:33:41 -0800
commit     74d4b2d1c3e528a39253c8236f681d29580e001d (patch)
tree       0fbcbcd1dba8594120bbd0c0bf78567c2b146fff /xmrstak/backend/cpu/crypto
parent     fd94d37aef5808fd33b4958c32139d2554e9cc75 (diff)
download   xmr-stak-74d4b2d1c3e528a39253c8236f681d29580e001d.zip
           xmr-stak-74d4b2d1c3e528a39253c8236f681d29580e001d.tar.gz
Extend low_power_mode to do up to 5 cn hashes at a time.
The "low_power_mode" option in config.txt can be set to numeral values between 1 and 5. A value of 5 seems optimal on certain processors with large L4 cache.
Diffstat (limited to 'xmrstak/backend/cpu/crypto')
-rw-r--r--  xmrstak/backend/cpu/crypto/cryptonight_aesni.h | 322
1 file changed, 307 insertions(+), 15 deletions(-)
diff --git a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
index 2a6a769..9b6e1dc 100644
--- a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
+++ b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
@@ -353,19 +353,19 @@ void cryptonight_hash(const void* input, size_t len, void* output, cryptonight_c
// to fit temporary vars for two contexts. Function will read len*2 from input and write 64 bytes to output
// We are still limited by L3 cache, so doubling will only work with CPUs where we have more than 2MB to core (Xeons)
template<size_t MASK, size_t ITERATIONS, size_t MEM, bool SOFT_AES, bool PREFETCH>
-void cryptonight_double_hash(const void* input, size_t len, void* output, cryptonight_ctx* __restrict ctx0, cryptonight_ctx* __restrict ctx1)
+void cryptonight_double_hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx)
{
- keccak((const uint8_t *)input, len, ctx0->hash_state, 200);
- keccak((const uint8_t *)input+len, len, ctx1->hash_state, 200);
+ keccak((const uint8_t *)input, len, ctx[0]->hash_state, 200);
+ keccak((const uint8_t *)input+len, len, ctx[1]->hash_state, 200);
// Optim - 99% time boundary
- cn_explode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx0->hash_state, (__m128i*)ctx0->long_state);
- cn_explode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx1->hash_state, (__m128i*)ctx1->long_state);
+ cn_explode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx[0]->hash_state, (__m128i*)ctx[0]->long_state);
+ cn_explode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx[1]->hash_state, (__m128i*)ctx[1]->long_state);
- uint8_t* l0 = ctx0->long_state;
- uint64_t* h0 = (uint64_t*)ctx0->hash_state;
- uint8_t* l1 = ctx1->long_state;
- uint64_t* h1 = (uint64_t*)ctx1->hash_state;
+ uint8_t* l0 = ctx[0]->long_state;
+ uint64_t* h0 = (uint64_t*)ctx[0]->hash_state;
+ uint8_t* l1 = ctx[1]->long_state;
+ uint64_t* h1 = (uint64_t*)ctx[1]->hash_state;
uint64_t axl0 = h0[0] ^ h0[4];
uint64_t axh0 = h0[1] ^ h0[5];
@@ -444,13 +444,305 @@ void cryptonight_double_hash(const void* input, size_t len, void* output, crypto
}
// Optim - 90% time boundary
- cn_implode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx0->long_state, (__m128i*)ctx0->hash_state);
- cn_implode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx1->long_state, (__m128i*)ctx1->hash_state);
+ cn_implode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx[0]->long_state, (__m128i*)ctx[0]->hash_state);
+ cn_implode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx[1]->long_state, (__m128i*)ctx[1]->hash_state);
// Optim - 99% time boundary
- keccakf((uint64_t*)ctx0->hash_state, 24);
- extra_hashes[ctx0->hash_state[0] & 3](ctx0->hash_state, 200, (char*)output);
- keccakf((uint64_t*)ctx1->hash_state, 24);
- extra_hashes[ctx1->hash_state[0] & 3](ctx1->hash_state, 200, (char*)output + 32);
+ keccakf((uint64_t*)ctx[0]->hash_state, 24);
+ extra_hashes[ctx[0]->hash_state[0] & 3](ctx[0]->hash_state, 200, (char*)output);
+ keccakf((uint64_t*)ctx[1]->hash_state, 24);
+ extra_hashes[ctx[1]->hash_state[0] & 3](ctx[1]->hash_state, 200, (char*)output + 32);
+}
+
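+// The CN_STEP macros below split one iteration of the CryptoNight inner loop
+// into four stages so that several scratchpads can be interleaved, hiding the
+// memory latency of each load behind work on the other contexts:
+//   CN_STEP1: a ^= c, derive the scratchpad offset from a, prefetch, load c
+//   CN_STEP2: AES round on c keyed by a (hardware or soft), b ^= c, store b
+//   CN_STEP3: derive the next offset from c, prefetch, load b
+//   CN_STEP4: 64x64-bit multiply of idx by b's low qword, add into a, store a
+// Even rounds pass the registers as (a, b, c); odd rounds swap b and c so the
+// two alternate roles between rounds.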
+#define CN_STEP1(a, b, c, l, ptr, idx) \
+ a = _mm_xor_si128(a, c); \
+ idx = _mm_cvtsi128_si64(a); \
+ ptr = (__m128i *)&l[idx & MASK]; \
+ if(PREFETCH) \
+ _mm_prefetch((const char*)ptr, _MM_HINT_T0); \
+ c = _mm_load_si128(ptr)
+
+#define CN_STEP2(a, b, c, l, ptr, idx) \
+ if(SOFT_AES) \
+ c = soft_aesenc(c, a); \
+ else \
+ c = _mm_aesenc_si128(c, a); \
+ b = _mm_xor_si128(b, c); \
+ _mm_store_si128(ptr, b)
+
+#define CN_STEP3(a, b, c, l, ptr, idx) \
+ idx = _mm_cvtsi128_si64(c); \
+ ptr = (__m128i *)&l[idx & MASK]; \
+ if(PREFETCH) \
+ _mm_prefetch((const char*)ptr, _MM_HINT_T0); \
+ b = _mm_load_si128(ptr)
+
+#define CN_STEP4(a, b, c, l, ptr, idx) \
+ lo = _umul128(idx, _mm_cvtsi128_si64(b), &hi); \
+ a = _mm_add_epi64(a, _mm_set_epi64x(lo, hi)); \
+ _mm_store_si128(ptr, a)
+
+// This lovelier creation will do 3 cn hashes at a time.
+template<size_t MASK, size_t ITERATIONS, size_t MEM, bool SOFT_AES, bool PREFETCH>
+void cryptonight_triple_hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx)
+{
+ for (size_t i = 0; i < 3; i++)
+ {
+ keccak((const uint8_t *)input + len * i, len, ctx[i]->hash_state, 200);
+ cn_explode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx[i]->hash_state, (__m128i*)ctx[i]->long_state);
+ }
+
+ uint8_t* l0 = ctx[0]->long_state;
+ uint64_t* h0 = (uint64_t*)ctx[0]->hash_state;
+ uint8_t* l1 = ctx[1]->long_state;
+ uint64_t* h1 = (uint64_t*)ctx[1]->hash_state;
+ uint8_t* l2 = ctx[2]->long_state;
+ uint64_t* h2 = (uint64_t*)ctx[2]->hash_state;
+
+ __m128i ax0 = _mm_set_epi64x(h0[1] ^ h0[5], h0[0] ^ h0[4]);
+ __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
+ __m128i ax1 = _mm_set_epi64x(h1[1] ^ h1[5], h1[0] ^ h1[4]);
+ __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
+ __m128i ax2 = _mm_set_epi64x(h2[1] ^ h2[5], h2[0] ^ h2[4]);
+ __m128i bx2 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]);
+ __m128i cx0 = _mm_set_epi64x(0, 0);
+ __m128i cx1 = _mm_set_epi64x(0, 0);
+ __m128i cx2 = _mm_set_epi64x(0, 0);
+
+ for (size_t i = 0; i < ITERATIONS/2; i++)
+ {
+ uint64_t idx0, idx1, idx2, hi, lo;
+ __m128i *ptr0, *ptr1, *ptr2;
+
+ // EVEN ROUND
+ CN_STEP1(ax0, bx0, cx0, l0, ptr0, idx0);
+ CN_STEP1(ax1, bx1, cx1, l1, ptr1, idx1);
+ CN_STEP1(ax2, bx2, cx2, l2, ptr2, idx2);
+
+ CN_STEP2(ax0, bx0, cx0, l0, ptr0, idx0);
+ CN_STEP2(ax1, bx1, cx1, l1, ptr1, idx1);
+ CN_STEP2(ax2, bx2, cx2, l2, ptr2, idx2);
+
+ CN_STEP3(ax0, bx0, cx0, l0, ptr0, idx0);
+ CN_STEP3(ax1, bx1, cx1, l1, ptr1, idx1);
+ CN_STEP3(ax2, bx2, cx2, l2, ptr2, idx2);
+
+ CN_STEP4(ax0, bx0, cx0, l0, ptr0, idx0);
+ CN_STEP4(ax1, bx1, cx1, l1, ptr1, idx1);
+ CN_STEP4(ax2, bx2, cx2, l2, ptr2, idx2);
+
+ // ODD ROUND
+ CN_STEP1(ax0, cx0, bx0, l0, ptr0, idx0);
+ CN_STEP1(ax1, cx1, bx1, l1, ptr1, idx1);
+ CN_STEP1(ax2, cx2, bx2, l2, ptr2, idx2);
+
+ CN_STEP2(ax0, cx0, bx0, l0, ptr0, idx0);
+ CN_STEP2(ax1, cx1, bx1, l1, ptr1, idx1);
+ CN_STEP2(ax2, cx2, bx2, l2, ptr2, idx2);
+
+ CN_STEP3(ax0, cx0, bx0, l0, ptr0, idx0);
+ CN_STEP3(ax1, cx1, bx1, l1, ptr1, idx1);
+ CN_STEP3(ax2, cx2, bx2, l2, ptr2, idx2);
+
+ CN_STEP4(ax0, cx0, bx0, l0, ptr0, idx0);
+ CN_STEP4(ax1, cx1, bx1, l1, ptr1, idx1);
+ CN_STEP4(ax2, cx2, bx2, l2, ptr2, idx2);
+ }
+
+ for (size_t i = 0; i < 3; i++)
+ {
+ cn_implode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx[i]->long_state, (__m128i*)ctx[i]->hash_state);
+ keccakf((uint64_t*)ctx[i]->hash_state, 24);
+ extra_hashes[ctx[i]->hash_state[0] & 3](ctx[i]->hash_state, 200, (char*)output + 32 * i);
+ }
+}
+
+// This even lovelier creation will do 4 cn hashes at a time.
+template<size_t MASK, size_t ITERATIONS, size_t MEM, bool SOFT_AES, bool PREFETCH>
+void cryptonight_quad_hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx)
+{
+ for (size_t i = 0; i < 4; i++)
+ {
+ keccak((const uint8_t *)input + len * i, len, ctx[i]->hash_state, 200);
+ cn_explode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx[i]->hash_state, (__m128i*)ctx[i]->long_state);
+ }
+
+ uint8_t* l0 = ctx[0]->long_state;
+ uint64_t* h0 = (uint64_t*)ctx[0]->hash_state;
+ uint8_t* l1 = ctx[1]->long_state;
+ uint64_t* h1 = (uint64_t*)ctx[1]->hash_state;
+ uint8_t* l2 = ctx[2]->long_state;
+ uint64_t* h2 = (uint64_t*)ctx[2]->hash_state;
+ uint8_t* l3 = ctx[3]->long_state;
+ uint64_t* h3 = (uint64_t*)ctx[3]->hash_state;
+
+ __m128i ax0 = _mm_set_epi64x(h0[1] ^ h0[5], h0[0] ^ h0[4]);
+ __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
+ __m128i ax1 = _mm_set_epi64x(h1[1] ^ h1[5], h1[0] ^ h1[4]);
+ __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
+ __m128i ax2 = _mm_set_epi64x(h2[1] ^ h2[5], h2[0] ^ h2[4]);
+ __m128i bx2 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]);
+ __m128i ax3 = _mm_set_epi64x(h3[1] ^ h3[5], h3[0] ^ h3[4]);
+ __m128i bx3 = _mm_set_epi64x(h3[3] ^ h3[7], h3[2] ^ h3[6]);
+ __m128i cx0 = _mm_set_epi64x(0, 0);
+ __m128i cx1 = _mm_set_epi64x(0, 0);
+ __m128i cx2 = _mm_set_epi64x(0, 0);
+ __m128i cx3 = _mm_set_epi64x(0, 0);
+
+ for (size_t i = 0; i < ITERATIONS/2; i++)
+ {
+ uint64_t idx0, idx1, idx2, idx3, hi, lo;
+ __m128i *ptr0, *ptr1, *ptr2, *ptr3;
+
+ // EVEN ROUND
+ CN_STEP1(ax0, bx0, cx0, l0, ptr0, idx0);
+ CN_STEP1(ax1, bx1, cx1, l1, ptr1, idx1);
+ CN_STEP1(ax2, bx2, cx2, l2, ptr2, idx2);
+ CN_STEP1(ax3, bx3, cx3, l3, ptr3, idx3);
+
+ CN_STEP2(ax0, bx0, cx0, l0, ptr0, idx0);
+ CN_STEP2(ax1, bx1, cx1, l1, ptr1, idx1);
+ CN_STEP2(ax2, bx2, cx2, l2, ptr2, idx2);
+ CN_STEP2(ax3, bx3, cx3, l3, ptr3, idx3);
+
+ CN_STEP3(ax0, bx0, cx0, l0, ptr0, idx0);
+ CN_STEP3(ax1, bx1, cx1, l1, ptr1, idx1);
+ CN_STEP3(ax2, bx2, cx2, l2, ptr2, idx2);
+ CN_STEP3(ax3, bx3, cx3, l3, ptr3, idx3);
+
+ CN_STEP4(ax0, bx0, cx0, l0, ptr0, idx0);
+ CN_STEP4(ax1, bx1, cx1, l1, ptr1, idx1);
+ CN_STEP4(ax2, bx2, cx2, l2, ptr2, idx2);
+ CN_STEP4(ax3, bx3, cx3, l3, ptr3, idx3);
+
+ // ODD ROUND
+ CN_STEP1(ax0, cx0, bx0, l0, ptr0, idx0);
+ CN_STEP1(ax1, cx1, bx1, l1, ptr1, idx1);
+ CN_STEP1(ax2, cx2, bx2, l2, ptr2, idx2);
+ CN_STEP1(ax3, cx3, bx3, l3, ptr3, idx3);
+
+ CN_STEP2(ax0, cx0, bx0, l0, ptr0, idx0);
+ CN_STEP2(ax1, cx1, bx1, l1, ptr1, idx1);
+ CN_STEP2(ax2, cx2, bx2, l2, ptr2, idx2);
+ CN_STEP2(ax3, cx3, bx3, l3, ptr3, idx3);
+
+ CN_STEP3(ax0, cx0, bx0, l0, ptr0, idx0);
+ CN_STEP3(ax1, cx1, bx1, l1, ptr1, idx1);
+ CN_STEP3(ax2, cx2, bx2, l2, ptr2, idx2);
+ CN_STEP3(ax3, cx3, bx3, l3, ptr3, idx3);
+
+ CN_STEP4(ax0, cx0, bx0, l0, ptr0, idx0);
+ CN_STEP4(ax1, cx1, bx1, l1, ptr1, idx1);
+ CN_STEP4(ax2, cx2, bx2, l2, ptr2, idx2);
+ CN_STEP4(ax3, cx3, bx3, l3, ptr3, idx3);
+ }
+
+ for (size_t i = 0; i < 4; i++)
+ {
+ cn_implode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx[i]->long_state, (__m128i*)ctx[i]->hash_state);
+ keccakf((uint64_t*)ctx[i]->hash_state, 24);
+ extra_hashes[ctx[i]->hash_state[0] & 3](ctx[i]->hash_state, 200, (char*)output + 32 * i);
+ }
+}
+
+// This most lovely creation will do 5 cn hashes at a time.
+template<size_t MASK, size_t ITERATIONS, size_t MEM, bool SOFT_AES, bool PREFETCH>
+void cryptonight_penta_hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx)
+{
+ for (size_t i = 0; i < 5; i++)
+ {
+ keccak((const uint8_t *)input + len * i, len, ctx[i]->hash_state, 200);
+ cn_explode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx[i]->hash_state, (__m128i*)ctx[i]->long_state);
+ }
+
+ uint8_t* l0 = ctx[0]->long_state;
+ uint64_t* h0 = (uint64_t*)ctx[0]->hash_state;
+ uint8_t* l1 = ctx[1]->long_state;
+ uint64_t* h1 = (uint64_t*)ctx[1]->hash_state;
+ uint8_t* l2 = ctx[2]->long_state;
+ uint64_t* h2 = (uint64_t*)ctx[2]->hash_state;
+ uint8_t* l3 = ctx[3]->long_state;
+ uint64_t* h3 = (uint64_t*)ctx[3]->hash_state;
+ uint8_t* l4 = ctx[4]->long_state;
+ uint64_t* h4 = (uint64_t*)ctx[4]->hash_state;
+
+ __m128i ax0 = _mm_set_epi64x(h0[1] ^ h0[5], h0[0] ^ h0[4]);
+ __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
+ __m128i ax1 = _mm_set_epi64x(h1[1] ^ h1[5], h1[0] ^ h1[4]);
+ __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
+ __m128i ax2 = _mm_set_epi64x(h2[1] ^ h2[5], h2[0] ^ h2[4]);
+ __m128i bx2 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]);
+ __m128i ax3 = _mm_set_epi64x(h3[1] ^ h3[5], h3[0] ^ h3[4]);
+ __m128i bx3 = _mm_set_epi64x(h3[3] ^ h3[7], h3[2] ^ h3[6]);
+ __m128i ax4 = _mm_set_epi64x(h4[1] ^ h4[5], h4[0] ^ h4[4]);
+ __m128i bx4 = _mm_set_epi64x(h4[3] ^ h4[7], h4[2] ^ h4[6]);
+ __m128i cx0 = _mm_set_epi64x(0, 0);
+ __m128i cx1 = _mm_set_epi64x(0, 0);
+ __m128i cx2 = _mm_set_epi64x(0, 0);
+ __m128i cx3 = _mm_set_epi64x(0, 0);
+ __m128i cx4 = _mm_set_epi64x(0, 0);
+
+ for (size_t i = 0; i < ITERATIONS/2; i++)
+ {
+ uint64_t idx0, idx1, idx2, idx3, idx4, hi, lo;
+ __m128i *ptr0, *ptr1, *ptr2, *ptr3, *ptr4;
+
+ // EVEN ROUND
+ CN_STEP1(ax0, bx0, cx0, l0, ptr0, idx0);
+ CN_STEP1(ax1, bx1, cx1, l1, ptr1, idx1);
+ CN_STEP1(ax2, bx2, cx2, l2, ptr2, idx2);
+ CN_STEP1(ax3, bx3, cx3, l3, ptr3, idx3);
+ CN_STEP1(ax4, bx4, cx4, l4, ptr4, idx4);
+
+ CN_STEP2(ax0, bx0, cx0, l0, ptr0, idx0);
+ CN_STEP2(ax1, bx1, cx1, l1, ptr1, idx1);
+ CN_STEP2(ax2, bx2, cx2, l2, ptr2, idx2);
+ CN_STEP2(ax3, bx3, cx3, l3, ptr3, idx3);
+ CN_STEP2(ax4, bx4, cx4, l4, ptr4, idx4);
+
+ CN_STEP3(ax0, bx0, cx0, l0, ptr0, idx0);
+ CN_STEP3(ax1, bx1, cx1, l1, ptr1, idx1);
+ CN_STEP3(ax2, bx2, cx2, l2, ptr2, idx2);
+ CN_STEP3(ax3, bx3, cx3, l3, ptr3, idx3);
+ CN_STEP3(ax4, bx4, cx4, l4, ptr4, idx4);
+
+ CN_STEP4(ax0, bx0, cx0, l0, ptr0, idx0);
+ CN_STEP4(ax1, bx1, cx1, l1, ptr1, idx1);
+ CN_STEP4(ax2, bx2, cx2, l2, ptr2, idx2);
+ CN_STEP4(ax3, bx3, cx3, l3, ptr3, idx3);
+ CN_STEP4(ax4, bx4, cx4, l4, ptr4, idx4);
+
+ // ODD ROUND
+ CN_STEP1(ax0, cx0, bx0, l0, ptr0, idx0);
+ CN_STEP1(ax1, cx1, bx1, l1, ptr1, idx1);
+ CN_STEP1(ax2, cx2, bx2, l2, ptr2, idx2);
+ CN_STEP1(ax3, cx3, bx3, l3, ptr3, idx3);
+ CN_STEP1(ax4, cx4, bx4, l4, ptr4, idx4);
+
+ CN_STEP2(ax0, cx0, bx0, l0, ptr0, idx0);
+ CN_STEP2(ax1, cx1, bx1, l1, ptr1, idx1);
+ CN_STEP2(ax2, cx2, bx2, l2, ptr2, idx2);
+ CN_STEP2(ax3, cx3, bx3, l3, ptr3, idx3);
+ CN_STEP2(ax4, cx4, bx4, l4, ptr4, idx4);
+
+ CN_STEP3(ax0, cx0, bx0, l0, ptr0, idx0);
+ CN_STEP3(ax1, cx1, bx1, l1, ptr1, idx1);
+ CN_STEP3(ax2, cx2, bx2, l2, ptr2, idx2);
+ CN_STEP3(ax3, cx3, bx3, l3, ptr3, idx3);
+ CN_STEP3(ax4, cx4, bx4, l4, ptr4, idx4);
+
+ CN_STEP4(ax0, cx0, bx0, l0, ptr0, idx0);
+ CN_STEP4(ax1, cx1, bx1, l1, ptr1, idx1);
+ CN_STEP4(ax2, cx2, bx2, l2, ptr2, idx2);
+ CN_STEP4(ax3, cx3, bx3, l3, ptr3, idx3);
+ CN_STEP4(ax4, cx4, bx4, l4, ptr4, idx4);
+ }
+
+ for (size_t i = 0; i < 5; i++)
+ {
+ cn_implode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx[i]->long_state, (__m128i*)ctx[i]->hash_state);
+ keccakf((uint64_t*)ctx[i]->hash_state, 24);
+ extra_hashes[ctx[i]->hash_state[0] & 3](ctx[i]->hash_state, 200, (char*)output + 32 * i);
+ }
}
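
The diff above only adds the multi-hash kernels; a caller still has to select one from the configured low_power_mode value. Below is a minimal dispatch sketch, not part of this commit: the names cn_hash_fun, select_cn_hash_fun, and cryptonight_single_hash_wrapper are hypothetical, the real selector lives outside xmrstak/backend/cpu/crypto, and the single-context signature of cryptonight_hash is assumed from the hunk header above.

    // Hypothetical sketch: unify all kernels behind one function-pointer type.
    typedef void (*cn_hash_fun)(const void*, size_t, void*, cryptonight_ctx**);

    // cryptonight_hash takes a single context, so adapt it to the array form.
    template<size_t MASK, size_t ITERATIONS, size_t MEM, bool SOFT_AES, bool PREFETCH>
    void cryptonight_single_hash_wrapper(const void* input, size_t len, void* output, cryptonight_ctx** ctx)
    {
        cryptonight_hash<MASK, ITERATIONS, MEM, SOFT_AES, PREFETCH>(input, len, output, ctx[0]);
    }

    template<size_t MASK, size_t ITERATIONS, size_t MEM, bool SOFT_AES, bool PREFETCH>
    cn_hash_fun select_cn_hash_fun(size_t low_power_mode)
    {
        switch(low_power_mode)
        {
        case 2:  return cryptonight_double_hash<MASK, ITERATIONS, MEM, SOFT_AES, PREFETCH>;
        case 3:  return cryptonight_triple_hash<MASK, ITERATIONS, MEM, SOFT_AES, PREFETCH>;
        case 4:  return cryptonight_quad_hash<MASK, ITERATIONS, MEM, SOFT_AES, PREFETCH>;
        case 5:  return cryptonight_penta_hash<MASK, ITERATIONS, MEM, SOFT_AES, PREFETCH>;
        default: return cryptonight_single_hash_wrapper<MASK, ITERATIONS, MEM, SOFT_AES, PREFETCH>;
        }
    }

In this sketch any value outside 2 to 5 falls back to the single-hash kernel.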