summaryrefslogtreecommitdiffstats
path: root/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
diff options
context:
space:
mode:
Diffstat (limited to 'xmrstak/backend/cpu/crypto/cryptonight_aesni.h')
-rw-r--r--xmrstak/backend/cpu/crypto/cryptonight_aesni.h453
1 files changed, 388 insertions, 65 deletions
diff --git a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
index 9b6e1dc..85373e8 100644
--- a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
+++ b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
@@ -16,6 +16,7 @@
#pragma once
#include "cryptonight.h"
+#include "xmrstak/backend/cryptonight.hpp"
#include <memory.h>
#include <stdio.h>
@@ -148,7 +149,20 @@ static inline void soft_aes_round(__m128i key, __m128i* x0, __m128i* x1, __m128i
*x7 = soft_aesenc(*x7, key);
}
-template<size_t MEM, bool SOFT_AES, bool PREFETCH>
+inline void mix_and_propagate(__m128i& x0, __m128i& x1, __m128i& x2, __m128i& x3, __m128i& x4, __m128i& x5, __m128i& x6, __m128i& x7)
+{
+ __m128i tmp0 = x0;
+ x0 = _mm_xor_si128(x0, x1);
+ x1 = _mm_xor_si128(x1, x2);
+ x2 = _mm_xor_si128(x2, x3);
+ x3 = _mm_xor_si128(x3, x4);
+ x4 = _mm_xor_si128(x4, x5);
+ x5 = _mm_xor_si128(x5, x6);
+ x6 = _mm_xor_si128(x6, x7);
+ x7 = _mm_xor_si128(x7, tmp0);
+}
+
+template<size_t MEM, bool SOFT_AES, bool PREFETCH, xmrstak_algo ALGO>
void cn_explode_scratchpad(const __m128i* input, __m128i* output)
{
// This is more than we have registers, compiler will assign 2 keys on the stack
@@ -166,6 +180,40 @@ void cn_explode_scratchpad(const __m128i* input, __m128i* output)
xin6 = _mm_load_si128(input + 10);
xin7 = _mm_load_si128(input + 11);
+ if(ALGO == cryptonight_heavy)
+ {
+ for(size_t i=0; i < 16; i++)
+ {
+ if(SOFT_AES)
+ {
+ soft_aes_round(k0, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+ soft_aes_round(k1, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+ soft_aes_round(k2, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+ soft_aes_round(k3, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+ soft_aes_round(k4, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+ soft_aes_round(k5, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+ soft_aes_round(k6, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+ soft_aes_round(k7, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+ soft_aes_round(k8, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+ soft_aes_round(k9, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+ }
+ else
+ {
+ aes_round(k0, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+ aes_round(k1, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+ aes_round(k2, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+ aes_round(k3, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+ aes_round(k4, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+ aes_round(k5, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+ aes_round(k6, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+ aes_round(k7, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+ aes_round(k8, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+ aes_round(k9, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+ }
+ mix_and_propagate(xin0, xin1, xin2, xin3, xin4, xin5, xin6, xin7);
+ }
+ }
+
for (size_t i = 0; i < MEM / sizeof(__m128i); i += 8)
{
if(SOFT_AES)
@@ -213,7 +261,7 @@ void cn_explode_scratchpad(const __m128i* input, __m128i* output)
}
}
-template<size_t MEM, bool SOFT_AES, bool PREFETCH>
+template<size_t MEM, bool SOFT_AES, bool PREFETCH, xmrstak_algo ALGO>
void cn_implode_scratchpad(const __m128i* input, __m128i* output)
{
// This is more than we have registers, compiler will assign 2 keys on the stack
@@ -275,6 +323,93 @@ void cn_implode_scratchpad(const __m128i* input, __m128i* output)
aes_round(k8, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
aes_round(k9, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
}
+
+ if(ALGO == cryptonight_heavy)
+ mix_and_propagate(xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7);
+ }
+
+ if(ALGO == cryptonight_heavy)
+ {
+ for (size_t i = 0; i < MEM / sizeof(__m128i); i += 8)
+ {
+ if(PREFETCH)
+ _mm_prefetch((const char*)input + i + 0, _MM_HINT_NTA);
+
+ xout0 = _mm_xor_si128(_mm_load_si128(input + i + 0), xout0);
+ xout1 = _mm_xor_si128(_mm_load_si128(input + i + 1), xout1);
+ xout2 = _mm_xor_si128(_mm_load_si128(input + i + 2), xout2);
+ xout3 = _mm_xor_si128(_mm_load_si128(input + i + 3), xout3);
+
+ if(PREFETCH)
+ _mm_prefetch((const char*)input + i + 4, _MM_HINT_NTA);
+
+ xout4 = _mm_xor_si128(_mm_load_si128(input + i + 4), xout4);
+ xout5 = _mm_xor_si128(_mm_load_si128(input + i + 5), xout5);
+ xout6 = _mm_xor_si128(_mm_load_si128(input + i + 6), xout6);
+ xout7 = _mm_xor_si128(_mm_load_si128(input + i + 7), xout7);
+
+ if(SOFT_AES)
+ {
+ soft_aes_round(k0, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ soft_aes_round(k1, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ soft_aes_round(k2, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ soft_aes_round(k3, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ soft_aes_round(k4, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ soft_aes_round(k5, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ soft_aes_round(k6, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ soft_aes_round(k7, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ soft_aes_round(k8, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ soft_aes_round(k9, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ }
+ else
+ {
+ aes_round(k0, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ aes_round(k1, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ aes_round(k2, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ aes_round(k3, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ aes_round(k4, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ aes_round(k5, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ aes_round(k6, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ aes_round(k7, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ aes_round(k8, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ aes_round(k9, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ }
+
+ if(ALGO == cryptonight_heavy)
+ mix_and_propagate(xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7);
+ }
+
+ for(size_t i=0; i < 16; i++)
+ {
+ if(SOFT_AES)
+ {
+ soft_aes_round(k0, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ soft_aes_round(k1, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ soft_aes_round(k2, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ soft_aes_round(k3, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ soft_aes_round(k4, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ soft_aes_round(k5, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ soft_aes_round(k6, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ soft_aes_round(k7, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ soft_aes_round(k8, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ soft_aes_round(k9, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ }
+ else
+ {
+ aes_round(k0, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ aes_round(k1, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ aes_round(k2, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ aes_round(k3, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ aes_round(k4, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ aes_round(k5, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ aes_round(k6, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ aes_round(k7, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ aes_round(k8, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ aes_round(k9, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ }
+
+ mix_and_propagate(xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7);
+ }
}
_mm_store_si128(output + 4, xout0);
@@ -287,13 +422,45 @@ void cn_implode_scratchpad(const __m128i* input, __m128i* output)
_mm_store_si128(output + 11, xout7);
}
-template<size_t MASK, size_t ITERATIONS, size_t MEM, bool SOFT_AES, bool PREFETCH>
+inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp)
+{
+ mem_out[0] = _mm_cvtsi128_si64(tmp);
+
+ tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
+ uint64_t vh = _mm_cvtsi128_si64(tmp);
+
+ uint8_t x = vh >> 24;
+ static const uint16_t table = 0x7531;
+ const uint8_t index = (((x >> 3) & 6) | (x & 1)) << 1;
+ vh ^= ((table >> index) & 0x3) << 28;
+
+ mem_out[1] = vh;
+}
+
+template<xmrstak_algo ALGO, bool SOFT_AES, bool PREFETCH>
void cryptonight_hash(const void* input, size_t len, void* output, cryptonight_ctx* ctx0)
{
+ constexpr size_t MASK = cn_select_mask<ALGO>();
+ constexpr size_t ITERATIONS = cn_select_iter<ALGO>();
+ constexpr size_t MEM = cn_select_memory<ALGO>();
+
+ if(ALGO == cryptonight_monero && len < 43)
+ {
+ memset(output, 0, 32);
+ return;
+ }
+
keccak((const uint8_t *)input, len, ctx0->hash_state, 200);
+ uint64_t monero_const;
+ if(ALGO == cryptonight_monero)
+ {
+ monero_const = *reinterpret_cast<const uint64_t*>(reinterpret_cast<const uint8_t*>(input) + 35);
+ monero_const ^= *(reinterpret_cast<const uint64_t*>(ctx0->hash_state) + 24);
+ }
+
// Optim - 99% time boundary
- cn_explode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx0->hash_state, (__m128i*)ctx0->long_state);
+ cn_explode_scratchpad<MEM, SOFT_AES, PREFETCH, ALGO>((__m128i*)ctx0->hash_state, (__m128i*)ctx0->long_state);
uint8_t* l0 = ctx0->long_state;
uint64_t* h0 = (uint64_t*)ctx0->hash_state;
@@ -315,12 +482,16 @@ void cryptonight_hash(const void* input, size_t len, void* output, cryptonight_c
else
cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah0, al0));
- _mm_store_si128((__m128i *)&l0[idx0 & MASK], _mm_xor_si128(bx0, cx));
+ if(ALGO == cryptonight_monero)
+ cryptonight_monero_tweak((uint64_t*)&l0[idx0 & MASK], _mm_xor_si128(bx0, cx));
+ else
+ _mm_store_si128((__m128i *)&l0[idx0 & MASK], _mm_xor_si128(bx0, cx));
+
idx0 = _mm_cvtsi128_si64(cx);
- bx0 = cx;
if(PREFETCH)
_mm_prefetch((const char*)&l0[idx0 & MASK], _MM_HINT_T0);
+ bx0 = cx;
uint64_t hi, lo, cl, ch;
cl = ((uint64_t*)&l0[idx0 & MASK])[0];
@@ -329,19 +500,33 @@ void cryptonight_hash(const void* input, size_t len, void* output, cryptonight_c
lo = _umul128(idx0, cl, &hi);
al0 += hi;
- ah0 += lo;
((uint64_t*)&l0[idx0 & MASK])[0] = al0;
- ((uint64_t*)&l0[idx0 & MASK])[1] = ah0;
- ah0 ^= ch;
al0 ^= cl;
+ if(PREFETCH)
+ _mm_prefetch((const char*)&l0[al0 & MASK], _MM_HINT_T0);
+ ah0 += lo;
+
+ if(ALGO == cryptonight_monero)
+ ((uint64_t*)&l0[idx0 & MASK])[1] = ah0 ^ monero_const;
+ else
+ ((uint64_t*)&l0[idx0 & MASK])[1] = ah0;
+ ah0 ^= ch;
+
idx0 = al0;
- if(PREFETCH)
- _mm_prefetch((const char*)&l0[idx0 & MASK], _MM_HINT_T0);
+ if(ALGO == cryptonight_heavy)
+ {
+ int64_t n = ((int64_t*)&l0[idx0 & MASK])[0];
+ int32_t d = ((int32_t*)&l0[idx0 & MASK])[2];
+ int64_t q = n / (d | 0x5);
+
+ ((int64_t*)&l0[idx0 & MASK])[0] = n ^ q;
+ idx0 = d ^ q;
+ }
}
// Optim - 90% time boundary
- cn_implode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx0->long_state, (__m128i*)ctx0->hash_state);
+ cn_implode_scratchpad<MEM, SOFT_AES, PREFETCH, ALGO>((__m128i*)ctx0->long_state, (__m128i*)ctx0->hash_state);
// Optim - 99% time boundary
@@ -352,15 +537,34 @@ void cryptonight_hash(const void* input, size_t len, void* output, cryptonight_c
// This lovely creation will do 2 cn hashes at a time. We have plenty of space on silicon
// to fit temporary vars for two contexts. Function will read len*2 from input and write 64 bytes to output
// We are still limited by L3 cache, so doubling will only work with CPUs where we have more than 2MB to core (Xeons)
-template<size_t MASK, size_t ITERATIONS, size_t MEM, bool SOFT_AES, bool PREFETCH>
+template<xmrstak_algo ALGO, bool SOFT_AES, bool PREFETCH>
void cryptonight_double_hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx)
{
+ constexpr size_t MASK = cn_select_mask<ALGO>();
+ constexpr size_t ITERATIONS = cn_select_iter<ALGO>();
+ constexpr size_t MEM = cn_select_memory<ALGO>();
+
+ if(ALGO == cryptonight_monero && len < 43)
+ {
+ memset(output, 0, 64);
+ return;
+ }
+
keccak((const uint8_t *)input, len, ctx[0]->hash_state, 200);
keccak((const uint8_t *)input+len, len, ctx[1]->hash_state, 200);
+ uint64_t monero_const_0, monero_const_1;
+ if(ALGO == cryptonight_monero)
+ {
+ monero_const_0 = *reinterpret_cast<const uint64_t*>(reinterpret_cast<const uint8_t*>(input) + 35);
+ monero_const_0 ^= *(reinterpret_cast<const uint64_t*>(ctx[0]->hash_state) + 24);
+ monero_const_1 = *reinterpret_cast<const uint64_t*>(reinterpret_cast<const uint8_t*>(input) + len + 35);
+ monero_const_1 ^= *(reinterpret_cast<const uint64_t*>(ctx[1]->hash_state) + 24);
+ }
+
// Optim - 99% time boundary
- cn_explode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx[0]->hash_state, (__m128i*)ctx[0]->long_state);
- cn_explode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx[1]->hash_state, (__m128i*)ctx[1]->long_state);
+ cn_explode_scratchpad<MEM, SOFT_AES, PREFETCH, ALGO>((__m128i*)ctx[0]->hash_state, (__m128i*)ctx[0]->long_state);
+ cn_explode_scratchpad<MEM, SOFT_AES, PREFETCH, ALGO>((__m128i*)ctx[1]->hash_state, (__m128i*)ctx[1]->long_state);
uint8_t* l0 = ctx[0]->long_state;
uint64_t* h0 = (uint64_t*)ctx[0]->hash_state;
@@ -388,7 +592,11 @@ void cryptonight_double_hash(const void* input, size_t len, void* output, crypto
else
cx = _mm_aesenc_si128(cx, _mm_set_epi64x(axh0, axl0));
- _mm_store_si128((__m128i *)&l0[idx0 & MASK], _mm_xor_si128(bx0, cx));
+ if(ALGO == cryptonight_monero)
+ cryptonight_monero_tweak((uint64_t*)&l0[idx0 & MASK], _mm_xor_si128(bx0, cx));
+ else
+ _mm_store_si128((__m128i *)&l0[idx0 & MASK], _mm_xor_si128(bx0, cx));
+
idx0 = _mm_cvtsi128_si64(cx);
bx0 = cx;
@@ -402,7 +610,11 @@ void cryptonight_double_hash(const void* input, size_t len, void* output, crypto
else
cx = _mm_aesenc_si128(cx, _mm_set_epi64x(axh1, axl1));
- _mm_store_si128((__m128i *)&l1[idx1 & MASK], _mm_xor_si128(bx1, cx));
+ if(ALGO == cryptonight_monero)
+ cryptonight_monero_tweak((uint64_t*)&l1[idx1 & MASK], _mm_xor_si128(bx1, cx));
+ else
+ _mm_store_si128((__m128i *)&l1[idx1 & MASK], _mm_xor_si128(bx1, cx));
+
idx1 = _mm_cvtsi128_si64(cx);
bx1 = cx;
@@ -418,11 +630,26 @@ void cryptonight_double_hash(const void* input, size_t len, void* output, crypto
axl0 += hi;
axh0 += lo;
((uint64_t*)&l0[idx0 & MASK])[0] = axl0;
- ((uint64_t*)&l0[idx0 & MASK])[1] = axh0;
+
+ if(ALGO == cryptonight_monero)
+ ((uint64_t*)&l0[idx0 & MASK])[1] = axh0 ^ monero_const_0;
+ else
+ ((uint64_t*)&l0[idx0 & MASK])[1] = axh0;
+
axh0 ^= ch;
axl0 ^= cl;
idx0 = axl0;
+ if(ALGO == cryptonight_heavy)
+ {
+ int64_t n = ((int64_t*)&l0[idx0 & MASK])[0];
+ int32_t d = ((int32_t*)&l0[idx0 & MASK])[2];
+ int64_t q = n / (d | 0x5);
+
+ ((int64_t*)&l0[idx0 & MASK])[0] = n ^ q;
+ idx0 = d ^ q;
+ }
+
if(PREFETCH)
_mm_prefetch((const char*)&l0[idx0 & MASK], _MM_HINT_T0);
@@ -434,18 +661,33 @@ void cryptonight_double_hash(const void* input, size_t len, void* output, crypto
axl1 += hi;
axh1 += lo;
((uint64_t*)&l1[idx1 & MASK])[0] = axl1;
- ((uint64_t*)&l1[idx1 & MASK])[1] = axh1;
+
+ if(ALGO == cryptonight_monero)
+ ((uint64_t*)&l1[idx1 & MASK])[1] = axh1 ^ monero_const_1;
+ else
+ ((uint64_t*)&l1[idx1 & MASK])[1] = axh1;
+
axh1 ^= ch;
axl1 ^= cl;
idx1 = axl1;
+ if(ALGO == cryptonight_heavy)
+ {
+ int64_t n = ((int64_t*)&l1[idx1 & MASK])[0];
+ int32_t d = ((int32_t*)&l1[idx1 & MASK])[2];
+ int64_t q = n / (d | 0x5);
+
+ ((int64_t*)&l1[idx1 & MASK])[0] = n ^ q;
+ idx1 = d ^ q;
+ }
+
if(PREFETCH)
_mm_prefetch((const char*)&l1[idx1 & MASK], _MM_HINT_T0);
}
// Optim - 90% time boundary
- cn_implode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx[0]->long_state, (__m128i*)ctx[0]->hash_state);
- cn_implode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx[1]->long_state, (__m128i*)ctx[1]->hash_state);
+ cn_implode_scratchpad<MEM, SOFT_AES, PREFETCH, ALGO>((__m128i*)ctx[0]->long_state, (__m128i*)ctx[0]->hash_state);
+ cn_implode_scratchpad<MEM, SOFT_AES, PREFETCH, ALGO>((__m128i*)ctx[1]->long_state, (__m128i*)ctx[1]->hash_state);
// Optim - 99% time boundary
@@ -456,12 +698,10 @@ void cryptonight_double_hash(const void* input, size_t len, void* output, crypto
}
#define CN_STEP1(a, b, c, l, ptr, idx) \
- a = _mm_xor_si128(a, c); \
- idx = _mm_cvtsi128_si64(a); \
ptr = (__m128i *)&l[idx & MASK]; \
if(PREFETCH) \
_mm_prefetch((const char*)ptr, _MM_HINT_T0); \
- c = _mm_load_si128(ptr)
+ c = _mm_load_si128(ptr);
#define CN_STEP2(a, b, c, l, ptr, idx) \
if(SOFT_AES) \
@@ -469,30 +709,64 @@ void cryptonight_double_hash(const void* input, size_t len, void* output, crypto
else \
c = _mm_aesenc_si128(c, a); \
b = _mm_xor_si128(b, c); \
- _mm_store_si128(ptr, b)
+ if(ALGO == cryptonight_monero) \
+ cryptonight_monero_tweak((uint64_t*)ptr, b); \
+ else \
+ _mm_store_si128(ptr, b);\
#define CN_STEP3(a, b, c, l, ptr, idx) \
idx = _mm_cvtsi128_si64(c); \
ptr = (__m128i *)&l[idx & MASK]; \
if(PREFETCH) \
_mm_prefetch((const char*)ptr, _MM_HINT_T0); \
- b = _mm_load_si128(ptr)
+ b = _mm_load_si128(ptr);
-#define CN_STEP4(a, b, c, l, ptr, idx) \
+#define CN_STEP4(a, b, c, l, mc, ptr, idx) \
lo = _umul128(idx, _mm_cvtsi128_si64(b), &hi); \
a = _mm_add_epi64(a, _mm_set_epi64x(lo, hi)); \
- _mm_store_si128(ptr, a)
+ if(ALGO == cryptonight_monero) \
+ _mm_store_si128(ptr, _mm_xor_si128(a, mc)); \
+ else \
+ _mm_store_si128(ptr, a);\
+ a = _mm_xor_si128(a, b); \
+ idx = _mm_cvtsi128_si64(a); \
+ if(ALGO == cryptonight_heavy) \
+ { \
+ int64_t n = ((int64_t*)&l[idx & MASK])[0]; \
+ int32_t d = ((int32_t*)&l[idx & MASK])[2]; \
+ int64_t q = n / (d | 0x5); \
+ ((int64_t*)&l[idx & MASK])[0] = n ^ q; \
+ idx = d ^ q; \
+ }
+
+#define CONST_INIT(ctx, n) \
+ __m128i mc##n = _mm_set_epi64x(*reinterpret_cast<const uint64_t*>(reinterpret_cast<const uint8_t*>(input) + n * len + 35) ^ \
+ *(reinterpret_cast<const uint64_t*>((ctx)->hash_state) + 24), 0);
// This lovelier creation will do 3 cn hashes at a time.
-template<size_t MASK, size_t ITERATIONS, size_t MEM, bool SOFT_AES, bool PREFETCH>
+template<xmrstak_algo ALGO, bool SOFT_AES, bool PREFETCH>
void cryptonight_triple_hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx)
{
+ constexpr size_t MASK = cn_select_mask<ALGO>();
+ constexpr size_t ITERATIONS = cn_select_iter<ALGO>();
+ constexpr size_t MEM = cn_select_memory<ALGO>();
+
+ if(ALGO == cryptonight_monero && len < 43)
+ {
+ memset(output, 0, 32 * 3);
+ return;
+ }
+
for (size_t i = 0; i < 3; i++)
{
keccak((const uint8_t *)input + len * i, len, ctx[i]->hash_state, 200);
- cn_explode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx[i]->hash_state, (__m128i*)ctx[i]->long_state);
+ cn_explode_scratchpad<MEM, SOFT_AES, PREFETCH, ALGO>((__m128i*)ctx[i]->hash_state, (__m128i*)ctx[i]->long_state);
}
+ CONST_INIT(ctx[0], 0);
+ CONST_INIT(ctx[1], 1);
+ CONST_INIT(ctx[2], 2);
+
uint8_t* l0 = ctx[0]->long_state;
uint64_t* h0 = (uint64_t*)ctx[0]->hash_state;
uint8_t* l1 = ctx[1]->long_state;
@@ -510,9 +784,14 @@ void cryptonight_triple_hash(const void* input, size_t len, void* output, crypto
__m128i cx1 = _mm_set_epi64x(0, 0);
__m128i cx2 = _mm_set_epi64x(0, 0);
+ uint64_t idx0, idx1, idx2;
+ idx0 = _mm_cvtsi128_si64(ax0);
+ idx1 = _mm_cvtsi128_si64(ax1);
+ idx2 = _mm_cvtsi128_si64(ax2);
+
for (size_t i = 0; i < ITERATIONS/2; i++)
{
- uint64_t idx0, idx1, idx2, hi, lo;
+ uint64_t hi, lo;
__m128i *ptr0, *ptr1, *ptr2;
// EVEN ROUND
@@ -528,9 +807,9 @@ void cryptonight_triple_hash(const void* input, size_t len, void* output, crypto
CN_STEP3(ax1, bx1, cx1, l1, ptr1, idx1);
CN_STEP3(ax2, bx2, cx2, l2, ptr2, idx2);
- CN_STEP4(ax0, bx0, cx0, l0, ptr0, idx0);
- CN_STEP4(ax1, bx1, cx1, l1, ptr1, idx1);
- CN_STEP4(ax2, bx2, cx2, l2, ptr2, idx2);
+ CN_STEP4(ax0, bx0, cx0, l0, mc0, ptr0, idx0);
+ CN_STEP4(ax1, bx1, cx1, l1, mc1, ptr1, idx1);
+ CN_STEP4(ax2, bx2, cx2, l2, mc2, ptr2, idx2);
// ODD ROUND
CN_STEP1(ax0, cx0, bx0, l0, ptr0, idx0);
@@ -545,29 +824,44 @@ void cryptonight_triple_hash(const void* input, size_t len, void* output, crypto
CN_STEP3(ax1, cx1, bx1, l1, ptr1, idx1);
CN_STEP3(ax2, cx2, bx2, l2, ptr2, idx2);
- CN_STEP4(ax0, cx0, bx0, l0, ptr0, idx0);
- CN_STEP4(ax1, cx1, bx1, l1, ptr1, idx1);
- CN_STEP4(ax2, cx2, bx2, l2, ptr2, idx2);
+ CN_STEP4(ax0, cx0, bx0, l0, mc0, ptr0, idx0);
+ CN_STEP4(ax1, cx1, bx1, l1, mc1, ptr1, idx1);
+ CN_STEP4(ax2, cx2, bx2, l2, mc2, ptr2, idx2);
}
for (size_t i = 0; i < 3; i++)
{
- cn_implode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx[i]->long_state, (__m128i*)ctx[i]->hash_state);
+ cn_implode_scratchpad<MEM, SOFT_AES, PREFETCH, ALGO>((__m128i*)ctx[i]->long_state, (__m128i*)ctx[i]->hash_state);
keccakf((uint64_t*)ctx[i]->hash_state, 24);
extra_hashes[ctx[i]->hash_state[0] & 3](ctx[i]->hash_state, 200, (char*)output + 32 * i);
}
}
// This even lovelier creation will do 4 cn hashes at a time.
-template<size_t MASK, size_t ITERATIONS, size_t MEM, bool SOFT_AES, bool PREFETCH>
+template<xmrstak_algo ALGO, bool SOFT_AES, bool PREFETCH>
void cryptonight_quad_hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx)
{
+ constexpr size_t MASK = cn_select_mask<ALGO>();
+ constexpr size_t ITERATIONS = cn_select_iter<ALGO>();
+ constexpr size_t MEM = cn_select_memory<ALGO>();
+
+ if(ALGO == cryptonight_monero && len < 43)
+ {
+ memset(output, 0, 32 * 4);
+ return;
+ }
+
for (size_t i = 0; i < 4; i++)
{
keccak((const uint8_t *)input + len * i, len, ctx[i]->hash_state, 200);
- cn_explode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx[i]->hash_state, (__m128i*)ctx[i]->long_state);
+ cn_explode_scratchpad<MEM, SOFT_AES, PREFETCH, ALGO>((__m128i*)ctx[i]->hash_state, (__m128i*)ctx[i]->long_state);
}
+ CONST_INIT(ctx[0], 0);
+ CONST_INIT(ctx[1], 1);
+ CONST_INIT(ctx[2], 2);
+ CONST_INIT(ctx[3], 3);
+
uint8_t* l0 = ctx[0]->long_state;
uint64_t* h0 = (uint64_t*)ctx[0]->hash_state;
uint8_t* l1 = ctx[1]->long_state;
@@ -589,10 +883,16 @@ void cryptonight_quad_hash(const void* input, size_t len, void* output, cryptoni
__m128i cx1 = _mm_set_epi64x(0, 0);
__m128i cx2 = _mm_set_epi64x(0, 0);
__m128i cx3 = _mm_set_epi64x(0, 0);
-
+
+ uint64_t idx0, idx1, idx2, idx3;
+ idx0 = _mm_cvtsi128_si64(ax0);
+ idx1 = _mm_cvtsi128_si64(ax1);
+ idx2 = _mm_cvtsi128_si64(ax2);
+ idx3 = _mm_cvtsi128_si64(ax3);
+
for (size_t i = 0; i < ITERATIONS/2; i++)
{
- uint64_t idx0, idx1, idx2, idx3, hi, lo;
+ uint64_t hi, lo;
__m128i *ptr0, *ptr1, *ptr2, *ptr3;
// EVEN ROUND
@@ -611,10 +911,10 @@ void cryptonight_quad_hash(const void* input, size_t len, void* output, cryptoni
CN_STEP3(ax2, bx2, cx2, l2, ptr2, idx2);
CN_STEP3(ax3, bx3, cx3, l3, ptr3, idx3);
- CN_STEP4(ax0, bx0, cx0, l0, ptr0, idx0);
- CN_STEP4(ax1, bx1, cx1, l1, ptr1, idx1);
- CN_STEP4(ax2, bx2, cx2, l2, ptr2, idx2);
- CN_STEP4(ax3, bx3, cx3, l3, ptr3, idx3);
+ CN_STEP4(ax0, bx0, cx0, l0, mc0, ptr0, idx0);
+ CN_STEP4(ax1, bx1, cx1, l1, mc1, ptr1, idx1);
+ CN_STEP4(ax2, bx2, cx2, l2, mc2, ptr2, idx2);
+ CN_STEP4(ax3, bx3, cx3, l3, mc3, ptr3, idx3);
// ODD ROUND
CN_STEP1(ax0, cx0, bx0, l0, ptr0, idx0);
@@ -632,30 +932,46 @@ void cryptonight_quad_hash(const void* input, size_t len, void* output, cryptoni
CN_STEP3(ax2, cx2, bx2, l2, ptr2, idx2);
CN_STEP3(ax3, cx3, bx3, l3, ptr3, idx3);
- CN_STEP4(ax0, cx0, bx0, l0, ptr0, idx0);
- CN_STEP4(ax1, cx1, bx1, l1, ptr1, idx1);
- CN_STEP4(ax2, cx2, bx2, l2, ptr2, idx2);
- CN_STEP4(ax3, cx3, bx3, l3, ptr3, idx3);
+ CN_STEP4(ax0, cx0, bx0, l0, mc0, ptr0, idx0);
+ CN_STEP4(ax1, cx1, bx1, l1, mc1, ptr1, idx1);
+ CN_STEP4(ax2, cx2, bx2, l2, mc2, ptr2, idx2);
+ CN_STEP4(ax3, cx3, bx3, l3, mc3, ptr3, idx3);
}
for (size_t i = 0; i < 4; i++)
{
- cn_implode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx[i]->long_state, (__m128i*)ctx[i]->hash_state);
+ cn_implode_scratchpad<MEM, SOFT_AES, PREFETCH, ALGO>((__m128i*)ctx[i]->long_state, (__m128i*)ctx[i]->hash_state);
keccakf((uint64_t*)ctx[i]->hash_state, 24);
extra_hashes[ctx[i]->hash_state[0] & 3](ctx[i]->hash_state, 200, (char*)output + 32 * i);
}
}
// This most lovely creation will do 5 cn hashes at a time.
-template<size_t MASK, size_t ITERATIONS, size_t MEM, bool SOFT_AES, bool PREFETCH>
+template<xmrstak_algo ALGO, bool SOFT_AES, bool PREFETCH>
void cryptonight_penta_hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx)
{
+ constexpr size_t MASK = cn_select_mask<ALGO>();
+ constexpr size_t ITERATIONS = cn_select_iter<ALGO>();
+ constexpr size_t MEM = cn_select_memory<ALGO>();
+
+ if(ALGO == cryptonight_monero && len < 43)
+ {
+ memset(output, 0, 32 * 5);
+ return;
+ }
+
for (size_t i = 0; i < 5; i++)
{
keccak((const uint8_t *)input + len * i, len, ctx[i]->hash_state, 200);
- cn_explode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx[i]->hash_state, (__m128i*)ctx[i]->long_state);
+ cn_explode_scratchpad<MEM, SOFT_AES, PREFETCH, ALGO>((__m128i*)ctx[i]->hash_state, (__m128i*)ctx[i]->long_state);
}
+ CONST_INIT(ctx[0], 0);
+ CONST_INIT(ctx[1], 1);
+ CONST_INIT(ctx[2], 2);
+ CONST_INIT(ctx[3], 3);
+ CONST_INIT(ctx[4], 4);
+
uint8_t* l0 = ctx[0]->long_state;
uint64_t* h0 = (uint64_t*)ctx[0]->hash_state;
uint8_t* l1 = ctx[1]->long_state;
@@ -683,9 +999,16 @@ void cryptonight_penta_hash(const void* input, size_t len, void* output, crypton
__m128i cx3 = _mm_set_epi64x(0, 0);
__m128i cx4 = _mm_set_epi64x(0, 0);
+ uint64_t idx0, idx1, idx2, idx3, idx4;
+ idx0 = _mm_cvtsi128_si64(ax0);
+ idx1 = _mm_cvtsi128_si64(ax1);
+ idx2 = _mm_cvtsi128_si64(ax2);
+ idx3 = _mm_cvtsi128_si64(ax3);
+ idx4 = _mm_cvtsi128_si64(ax4);
+
for (size_t i = 0; i < ITERATIONS/2; i++)
{
- uint64_t idx0, idx1, idx2, idx3, idx4, hi, lo;
+ uint64_t hi, lo;
__m128i *ptr0, *ptr1, *ptr2, *ptr3, *ptr4;
// EVEN ROUND
@@ -707,11 +1030,11 @@ void cryptonight_penta_hash(const void* input, size_t len, void* output, crypton
CN_STEP3(ax3, bx3, cx3, l3, ptr3, idx3);
CN_STEP3(ax4, bx4, cx4, l4, ptr4, idx4);
- CN_STEP4(ax0, bx0, cx0, l0, ptr0, idx0);
- CN_STEP4(ax1, bx1, cx1, l1, ptr1, idx1);
- CN_STEP4(ax2, bx2, cx2, l2, ptr2, idx2);
- CN_STEP4(ax3, bx3, cx3, l3, ptr3, idx3);
- CN_STEP4(ax4, bx4, cx4, l4, ptr4, idx4);
+ CN_STEP4(ax0, bx0, cx0, l0, mc0, ptr0, idx0);
+ CN_STEP4(ax1, bx1, cx1, l1, mc1, ptr1, idx1);
+ CN_STEP4(ax2, bx2, cx2, l2, mc2, ptr2, idx2);
+ CN_STEP4(ax3, bx3, cx3, l3, mc3, ptr3, idx3);
+ CN_STEP4(ax4, bx4, cx4, l4, mc4, ptr4, idx4);
// ODD ROUND
CN_STEP1(ax0, cx0, bx0, l0, ptr0, idx0);
@@ -732,16 +1055,16 @@ void cryptonight_penta_hash(const void* input, size_t len, void* output, crypton
CN_STEP3(ax3, cx3, bx3, l3, ptr3, idx3);
CN_STEP3(ax4, cx4, bx4, l4, ptr4, idx4);
- CN_STEP4(ax0, cx0, bx0, l0, ptr0, idx0);
- CN_STEP4(ax1, cx1, bx1, l1, ptr1, idx1);
- CN_STEP4(ax2, cx2, bx2, l2, ptr2, idx2);
- CN_STEP4(ax3, cx3, bx3, l3, ptr3, idx3);
- CN_STEP4(ax4, cx4, bx4, l4, ptr4, idx4);
+ CN_STEP4(ax0, cx0, bx0, l0, mc0, ptr0, idx0);
+ CN_STEP4(ax1, cx1, bx1, l1, mc1, ptr1, idx1);
+ CN_STEP4(ax2, cx2, bx2, l2, mc2, ptr2, idx2);
+ CN_STEP4(ax3, cx3, bx3, l3, mc3, ptr3, idx3);
+ CN_STEP4(ax4, cx4, bx4, l4, mc4, ptr4, idx4);
}
for (size_t i = 0; i < 5; i++)
{
- cn_implode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx[i]->long_state, (__m128i*)ctx[i]->hash_state);
+ cn_implode_scratchpad<MEM, SOFT_AES, PREFETCH, ALGO>((__m128i*)ctx[i]->long_state, (__m128i*)ctx[i]->hash_state);
keccakf((uint64_t*)ctx[i]->hash_state, 24);
extra_hashes[ctx[i]->hash_state[0] & 3](ctx[i]->hash_state, 200, (char*)output + 32 * i);
}
OpenPOWER on IntegriCloud