summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--crypto/cryptonight.h1
-rw-r--r--crypto/cryptonight_aesni.h207
-rw-r--r--crypto/cryptonight_common.cpp13
-rw-r--r--crypto/soft_aes.c140
-rw-r--r--jconf.cpp35
-rw-r--r--jconf.h5
-rw-r--r--minethd.cpp36
-rw-r--r--xmr-stak-cpu.cbp4
8 files changed, 337 insertions, 104 deletions
diff --git a/crypto/cryptonight.h b/crypto/cryptonight.h
index acc5fd1..f4ec843 100644
--- a/crypto/cryptonight.h
+++ b/crypto/cryptonight.h
@@ -25,6 +25,7 @@ cryptonight_ctx* cryptonight_alloc_ctx(size_t use_fast_mem, size_t use_mlock, al
void cryptonight_free_ctx(cryptonight_ctx* ctx);
void cryptonight_hash_ctx(const void* input, size_t len, void* output, cryptonight_ctx* ctx);
+void cryptonight_hash_ctx_soft(const void* input, size_t len, void* output, cryptonight_ctx* ctx);
void cryptonight_hash_ctx_np(const void* input, size_t len, void* output, cryptonight_ctx* ctx);
void cryptonight_double_hash_ctx(const void* input, size_t len, void* output, cryptonight_ctx* __restrict ctx0, cryptonight_ctx* __restrict ctx1);
diff --git a/crypto/cryptonight_aesni.h b/crypto/cryptonight_aesni.h
index 4bf099e..4ae7d6e 100644
--- a/crypto/cryptonight_aesni.h
+++ b/crypto/cryptonight_aesni.h
@@ -42,6 +42,9 @@ extern "C"
void keccak(const uint8_t *in, int inlen, uint8_t *md, int mdlen);
void keccakf(uint64_t st[25], int rounds);
extern void(*const extra_hashes[4])(const void *, size_t, char *);
+
+ __m128i soft_aesenc(__m128i in, __m128i key);
+ __m128i soft_aeskeygenassist(__m128i key, uint8_t rcon);
}
// This will shift and xor tmp1 into itself as 4 32-bit vals such as
@@ -58,61 +61,66 @@ static inline __m128i sl_xor(__m128i tmp1)
return tmp1;
}
+// One AES-NI key-expansion step: updates the key-state pair (*xout0, *xout2) in place.
+// *xout0 is mixed with the key-gen assist of the current *xout2, then *xout2 is mixed
+// with the key-gen assist of the freshly updated *xout0.
+// NOTE(review): rcon is forwarded to an intrinsic whose round constant is an immediate
+// operand; this presumably relies on the call being inlined with a literal rcon - confirm
+// that it still builds with optimisation disabled.
+static inline void aes_genkey_sub(__m128i* xout0, __m128i* xout2, uint8_t rcon)
+{
+    // see PSHUFD: 0xFF sets all elems to the 4th elem of the assist value
+    const __m128i assist_hi = _mm_shuffle_epi32(_mm_aeskeygenassist_si128(*xout2, rcon), 0xFF);
+    *xout0 = _mm_xor_si128(sl_xor(*xout0), assist_hi);
+
+    // see PSHUFD: 0xAA sets all elems to the 3rd elem of the assist value
+    const __m128i assist_lo = _mm_shuffle_epi32(_mm_aeskeygenassist_si128(*xout0, 0x00), 0xAA);
+    *xout2 = _mm_xor_si128(sl_xor(*xout2), assist_lo);
+}
+
+// Same key-expansion step as aes_genkey_sub, but the assist values come from the
+// table-driven soft_aeskeygenassist() instead of the AES-NI instruction.
+// Updates *xout0 and *xout2 in place; rcon is the round constant for the first assist.
+static inline void soft_aes_genkey_sub(__m128i* xout0, __m128i* xout2, uint8_t rcon)
+{
+    // see PSHUFD: 0xFF sets all elems to the 4th elem of the assist value
+    const __m128i assist_hi = _mm_shuffle_epi32(soft_aeskeygenassist(*xout2, rcon), 0xFF);
+    *xout0 = _mm_xor_si128(sl_xor(*xout0), assist_hi);
+
+    // see PSHUFD: 0xAA sets all elems to the 3rd elem of the assist value
+    const __m128i assist_lo = _mm_shuffle_epi32(soft_aeskeygenassist(*xout0, 0x00), 0xAA);
+    *xout2 = _mm_xor_si128(sl_xor(*xout2), assist_lo);
+}
+
+template<bool SOFT_AES>
static inline void aes_genkey(const __m128i* memory, __m128i* k0, __m128i* k1, __m128i* k2, __m128i* k3,
__m128i* k4, __m128i* k5, __m128i* k6, __m128i* k7, __m128i* k8, __m128i* k9)
{
- __m128i xout0, xout1, xout2;
+ __m128i xout0, xout2;
xout0 = _mm_load_si128(memory);
xout2 = _mm_load_si128(memory+1);
*k0 = xout0;
*k1 = xout2;
- xout1 = _mm_aeskeygenassist_si128(xout2, 0x01);
- xout1 = _mm_shuffle_epi32(xout1, 0xFF); // see PSHUFD, set all elems to 4th elem
- xout0 = sl_xor(xout0);
- xout0 = _mm_xor_si128(xout0, xout1);
-
- xout1 = _mm_aeskeygenassist_si128(xout0, 0x00);
- xout1 = _mm_shuffle_epi32(xout1, 0xAA); // see PSHUFD, set all elems to 3rd elem
- xout2 = sl_xor(xout2);
- xout2 = _mm_xor_si128(xout2, xout1);
+ if(SOFT_AES)
+ soft_aes_genkey_sub(&xout0, &xout2, 0x01);
+ else
+ aes_genkey_sub(&xout0, &xout2, 0x01);
*k2 = xout0;
*k3 = xout2;
- xout1 = _mm_aeskeygenassist_si128(xout2, 0x02);
- xout1 = _mm_shuffle_epi32(xout1, 0xFF);
- xout0 = sl_xor(xout0);
- xout0 = _mm_xor_si128(xout0, xout1);
-
- xout1 = _mm_aeskeygenassist_si128(xout0, 0x00);
- xout1 = _mm_shuffle_epi32(xout1, 0xAA);
- xout2 = sl_xor(xout2);
- xout2 = _mm_xor_si128(xout2, xout1);
+ if(SOFT_AES)
+ soft_aes_genkey_sub(&xout0, &xout2, 0x02);
+ else
+ aes_genkey_sub(&xout0, &xout2, 0x02);
*k4 = xout0;
*k5 = xout2;
- xout1 = _mm_aeskeygenassist_si128(xout2, 0x04);
- xout1 = _mm_shuffle_epi32(xout1, 0xFF);
- xout0 = sl_xor(xout0);
- xout0 = _mm_xor_si128(xout0, xout1);
-
- xout1 = _mm_aeskeygenassist_si128(xout0, 0x00);
- xout1 = _mm_shuffle_epi32(xout1, 0xAA);
- xout2 = sl_xor(xout2);
- xout2 = _mm_xor_si128(xout2, xout1);
+ if(SOFT_AES)
+ soft_aes_genkey_sub(&xout0, &xout2, 0x04);
+ else
+ aes_genkey_sub(&xout0, &xout2, 0x04);
*k6 = xout0;
*k7 = xout2;
- xout1 = _mm_aeskeygenassist_si128(xout2, 0x08);
- xout1 = _mm_shuffle_epi32(xout1, 0xFF);
- xout0 = sl_xor(xout0);
- xout0 = _mm_xor_si128(xout0, xout1);
-
- xout1 = _mm_aeskeygenassist_si128(xout0, 0x00);
- xout1 = _mm_shuffle_epi32(xout1, 0xAA);
- xout2 = sl_xor(xout2);
- xout2 = _mm_xor_si128(xout2, xout1);
+ if(SOFT_AES)
+ soft_aes_genkey_sub(&xout0, &xout2, 0x08);
+ else
+ aes_genkey_sub(&xout0, &xout2, 0x08);
*k8 = xout0;
*k9 = xout2;
}
@@ -129,14 +137,26 @@ static inline void aes_round(__m128i key, __m128i* x0, __m128i* x1, __m128i* x2,
*x7 = _mm_aesenc_si128(*x7, key);
}
-template<size_t MEM>
+// Software counterpart of aes_round: runs soft_aesenc with the same round key over the
+// eight blocks x0..x7 (in that order) and writes each result back through its pointer.
+static inline void soft_aes_round(__m128i key, __m128i* x0, __m128i* x1, __m128i* x2, __m128i* x3, __m128i* x4, __m128i* x5, __m128i* x6, __m128i* x7)
+{
+    __m128i* const blocks[8] = { x0, x1, x2, x3, x4, x5, x6, x7 };
+
+    for (size_t i = 0; i < 8; i++)
+        *blocks[i] = soft_aesenc(*blocks[i], key);
+}
+
+template<size_t MEM, bool SOFT_AES>
void cn_explode_scratchpad(const __m128i* input, __m128i* output)
{
// This is more than we have registers, compiler will assign 2 keys on the stack
__m128i xin0, xin1, xin2, xin3, xin4, xin5, xin6, xin7;
__m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9;
- aes_genkey(input, &k0, &k1, &k2, &k3, &k4, &k5, &k6, &k7, &k8, &k9);
+ aes_genkey<SOFT_AES>(input, &k0, &k1, &k2, &k3, &k4, &k5, &k6, &k7, &k8, &k9);
xin0 = _mm_load_si128(input + 4);
xin1 = _mm_load_si128(input + 5);
@@ -149,16 +169,32 @@ void cn_explode_scratchpad(const __m128i* input, __m128i* output)
for (size_t i = 0; i < MEM / sizeof(__m128i); i += 8)
{
- aes_round(k0, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
- aes_round(k1, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
- aes_round(k2, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
- aes_round(k3, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
- aes_round(k4, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
- aes_round(k5, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
- aes_round(k6, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
- aes_round(k7, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
- aes_round(k8, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
- aes_round(k9, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+ if(SOFT_AES)
+ {
+ soft_aes_round(k0, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+ soft_aes_round(k1, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+ soft_aes_round(k2, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+ soft_aes_round(k3, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+ soft_aes_round(k4, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+ soft_aes_round(k5, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+ soft_aes_round(k6, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+ soft_aes_round(k7, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+ soft_aes_round(k8, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+ soft_aes_round(k9, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+ }
+ else
+ {
+ aes_round(k0, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+ aes_round(k1, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+ aes_round(k2, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+ aes_round(k3, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+ aes_round(k4, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+ aes_round(k5, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+ aes_round(k6, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+ aes_round(k7, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+ aes_round(k8, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+ aes_round(k9, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+ }
_mm_store_si128(output + i + 0, xin0);
_mm_store_si128(output + i + 1, xin1);
@@ -173,14 +209,14 @@ void cn_explode_scratchpad(const __m128i* input, __m128i* output)
}
}
-template<size_t MEM>
+template<size_t MEM, bool SOFT_AES>
void cn_implode_scratchpad(const __m128i* input, __m128i* output)
{
// This is more than we have registers, compiler will assign 2 keys on the stack
__m128i xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7;
__m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9;
- aes_genkey(output + 2, &k0, &k1, &k2, &k3, &k4, &k5, &k6, &k7, &k8, &k9);
+ aes_genkey<SOFT_AES>(output + 2, &k0, &k1, &k2, &k3, &k4, &k5, &k6, &k7, &k8, &k9);
xout0 = _mm_load_si128(output + 4);
xout1 = _mm_load_si128(output + 5);
@@ -204,16 +240,32 @@ void cn_implode_scratchpad(const __m128i* input, __m128i* output)
xout6 = _mm_xor_si128(_mm_load_si128(input + i + 6), xout6);
xout7 = _mm_xor_si128(_mm_load_si128(input + i + 7), xout7);
- aes_round(k0, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
- aes_round(k1, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
- aes_round(k2, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
- aes_round(k3, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
- aes_round(k4, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
- aes_round(k5, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
- aes_round(k6, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
- aes_round(k7, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
- aes_round(k8, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
- aes_round(k9, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ if(SOFT_AES)
+ {
+ soft_aes_round(k0, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ soft_aes_round(k1, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ soft_aes_round(k2, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ soft_aes_round(k3, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ soft_aes_round(k4, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ soft_aes_round(k5, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ soft_aes_round(k6, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ soft_aes_round(k7, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ soft_aes_round(k8, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ soft_aes_round(k9, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ }
+ else
+ {
+ aes_round(k0, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ aes_round(k1, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ aes_round(k2, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ aes_round(k3, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ aes_round(k4, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ aes_round(k5, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ aes_round(k6, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ aes_round(k7, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ aes_round(k8, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ aes_round(k9, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ }
}
_mm_store_si128(output + 4, xout0);
@@ -226,13 +278,13 @@ void cn_implode_scratchpad(const __m128i* input, __m128i* output)
_mm_store_si128(output + 11, xout7);
}
-template<size_t ITERATIONS, size_t MEM, bool PREFETCH>
+template<size_t ITERATIONS, size_t MEM, bool PREFETCH, bool SOFT_AES>
void cryptonight_hash(const void* input, size_t len, void* output, cryptonight_ctx* ctx0)
{
keccak((const uint8_t *)input, len, ctx0->hash_state, 200);
// Optim - 99% time boundary
- cn_explode_scratchpad<MEM>((__m128i*)ctx0->hash_state, (__m128i*)ctx0->long_state);
+ cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*)ctx0->hash_state, (__m128i*)ctx0->long_state);
uint8_t* l0 = ctx0->long_state;
uint64_t* h0 = (uint64_t*)ctx0->hash_state;
@@ -248,7 +300,10 @@ void cryptonight_hash(const void* input, size_t len, void* output, cryptonight_c
{
__m128i cx;
cx = _mm_load_si128((__m128i *)&l0[idx0 & 0x1FFFF0]);
- cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah0, al0));
+ if(SOFT_AES)
+ cx = soft_aesenc(cx, _mm_set_epi64x(ah0, al0));
+ else
+ cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah0, al0));
_mm_store_si128((__m128i *)&l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx));
idx0 = _mm_cvtsi128_si64(cx);
bx0 = cx;
@@ -271,7 +326,7 @@ void cryptonight_hash(const void* input, size_t len, void* output, cryptonight_c
}
// Optim - 90% time boundary
- cn_implode_scratchpad<MEM>((__m128i*)ctx0->long_state, (__m128i*)ctx0->hash_state);
+ cn_implode_scratchpad<MEM, SOFT_AES>((__m128i*)ctx0->long_state, (__m128i*)ctx0->hash_state);
// Optim - 99% time boundary
@@ -282,15 +337,15 @@ void cryptonight_hash(const void* input, size_t len, void* output, cryptonight_c
// This lovely creation will do 2 cn hashes at a time. We have plenty of space on silicon
// to fit temporary vars for two contexts. Function will read len*2 from input and write 64 bytes to output
// We are still limited by L3 cache, so doubling will only work with CPUs where we have more than 2MB to core (Xeons)
-template<size_t ITERATIONS, size_t MEM, bool PREFETCH>
+template<size_t ITERATIONS, size_t MEM, bool PREFETCH, bool SOFT_AES>
void cryptonight_double_hash(const void* input, size_t len, void* output, cryptonight_ctx* __restrict ctx0, cryptonight_ctx* __restrict ctx1)
{
keccak((const uint8_t *)input, len, ctx0->hash_state, 200);
keccak((const uint8_t *)input+len, len, ctx1->hash_state, 200);
// Optim - 99% time boundary
- cn_explode_scratchpad<MEM>((__m128i*)ctx0->hash_state, (__m128i*)ctx0->long_state);
- cn_explode_scratchpad<MEM>((__m128i*)ctx1->hash_state, (__m128i*)ctx1->long_state);
+ cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*)ctx0->hash_state, (__m128i*)ctx0->long_state);
+ cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*)ctx1->hash_state, (__m128i*)ctx1->long_state);
uint8_t* l0 = ctx0->long_state;
uint64_t* h0 = (uint64_t*)ctx0->hash_state;
@@ -310,7 +365,10 @@ void cryptonight_double_hash(const void* input, size_t len, void* output, crypto
{
__m128i cx;
cx = _mm_load_si128((__m128i *)&l0[idx0 & 0x1FFFF0]);
- cx = _mm_aesenc_si128(cx, ax0);
+ if(SOFT_AES)
+ cx = soft_aesenc(cx, ax0);
+ else
+ cx = _mm_aesenc_si128(cx, ax0);
_mm_store_si128((__m128i *)&l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx));
idx0 = _mm_cvtsi128_si64(cx);
bx0 = cx;
@@ -318,7 +376,10 @@ void cryptonight_double_hash(const void* input, size_t len, void* output, crypto
_mm_prefetch((const char*)&l0[idx0 & 0x1FFFF0], _MM_HINT_T0);
cx = _mm_load_si128((__m128i *)&l1[idx1 & 0x1FFFF0]);
- cx = _mm_aesenc_si128(cx, ax1);
+ if(SOFT_AES)
+ cx = soft_aesenc(cx, ax1);
+ else
+ cx = _mm_aesenc_si128(cx, ax1);
_mm_store_si128((__m128i *)&l1[idx1 & 0x1FFFF0], _mm_xor_si128(bx1, cx));
idx1 = _mm_cvtsi128_si64(cx);
bx1 = cx;
@@ -346,8 +407,8 @@ void cryptonight_double_hash(const void* input, size_t len, void* output, crypto
}
// Optim - 90% time boundary
- cn_implode_scratchpad<MEM>((__m128i*)ctx0->long_state, (__m128i*)ctx0->hash_state);
- cn_implode_scratchpad<MEM>((__m128i*)ctx1->long_state, (__m128i*)ctx1->hash_state);
+ cn_implode_scratchpad<MEM, SOFT_AES>((__m128i*)ctx0->long_state, (__m128i*)ctx0->hash_state);
+ cn_implode_scratchpad<MEM, SOFT_AES>((__m128i*)ctx1->long_state, (__m128i*)ctx1->hash_state);
// Optim - 99% time boundary
@@ -355,4 +416,4 @@ void cryptonight_double_hash(const void* input, size_t len, void* output, crypto
extra_hashes[ctx0->hash_state[0] & 3](ctx0->hash_state, 200, (char*)output);
keccakf((uint64_t*)ctx1->hash_state, 24);
extra_hashes[ctx1->hash_state[0] & 3](ctx1->hash_state, 200, (char*)output + 32);
-} \ No newline at end of file
+}
diff --git a/crypto/cryptonight_common.cpp b/crypto/cryptonight_common.cpp
index bc0a922..57f23e3 100644
--- a/crypto/cryptonight_common.cpp
+++ b/crypto/cryptonight_common.cpp
@@ -174,15 +174,20 @@ void cryptonight_free_ctx(cryptonight_ctx* ctx)
void cryptonight_hash_ctx(const void* input, size_t len, void* output, cryptonight_ctx* ctx)
{
- cryptonight_hash<0x80000, MEMORY, true>(input, len, output, ctx);
+ cryptonight_hash<0x80000, MEMORY, true, false>(input, len, output, ctx);
+}
+
+void cryptonight_hash_ctx_soft(const void* input, size_t len, void* output, cryptonight_ctx* ctx)
+{
+ cryptonight_hash<0x80000, MEMORY, true, true>(input, len, output, ctx);
}
void cryptonight_hash_ctx_np(const void* input, size_t len, void* output, cryptonight_ctx* ctx)
{
- cryptonight_hash<0x80000, MEMORY, false>(input, len, output, ctx);
+ cryptonight_hash<0x80000, MEMORY, false, false>(input, len, output, ctx);
}
void cryptonight_double_hash_ctx(const void* input, size_t len, void* output, cryptonight_ctx* __restrict ctx0, cryptonight_ctx* __restrict ctx1)
{
- cryptonight_double_hash<0x80000, MEMORY, false>(input, len, output, ctx0, ctx1);
-} \ No newline at end of file
+ cryptonight_double_hash<0x80000, MEMORY, false, false>(input, len, output, ctx0, ctx1);
+}
diff --git a/crypto/soft_aes.c b/crypto/soft_aes.c
new file mode 100644
index 0000000..10dfcba
--- /dev/null
+++ b/crypto/soft_aes.c
@@ -0,0 +1,140 @@
+/*
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * The original author of this AES implementation is Karl Malbrain.
+ */
+
+#ifdef __GNUC__
+#include <x86intrin.h>
+#else
+#include <intrin.h>
+#endif // __GNUC__
+
+#include <inttypes.h>
+
+uint8_t Sbox[256] = { // forward s-box
+0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
+0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0,
+0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
+0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75,
+0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84,
+0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
+0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8,
+0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2,
+0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
+0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb,
+0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79,
+0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
+0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a,
+0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e,
+0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
+0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16};
+
+// combined Xtimes2[Sbox[]]
+uint8_t Xtime2Sbox[256] = {
+0xc6, 0xf8, 0xee, 0xf6, 0xff, 0xd6, 0xde, 0x91, 0x60, 0x02, 0xce, 0x56, 0xe7, 0xb5, 0x4d, 0xec,
+0x8f, 0x1f, 0x89, 0xfa, 0xef, 0xb2, 0x8e, 0xfb, 0x41, 0xb3, 0x5f, 0x45, 0x23, 0x53, 0xe4, 0x9b,
+0x75, 0xe1, 0x3d, 0x4c, 0x6c, 0x7e, 0xf5, 0x83, 0x68, 0x51, 0xd1, 0xf9, 0xe2, 0xab, 0x62, 0x2a,
+0x08, 0x95, 0x46, 0x9d, 0x30, 0x37, 0x0a, 0x2f, 0x0e, 0x24, 0x1b, 0xdf, 0xcd, 0x4e, 0x7f, 0xea,
+0x12, 0x1d, 0x58, 0x34, 0x36, 0xdc, 0xb4, 0x5b, 0xa4, 0x76, 0xb7, 0x7d, 0x52, 0xdd, 0x5e, 0x13,
+0xa6, 0xb9, 0x00, 0xc1, 0x40, 0xe3, 0x79, 0xb6, 0xd4, 0x8d, 0x67, 0x72, 0x94, 0x98, 0xb0, 0x85,
+0xbb, 0xc5, 0x4f, 0xed, 0x86, 0x9a, 0x66, 0x11, 0x8a, 0xe9, 0x04, 0xfe, 0xa0, 0x78, 0x25, 0x4b,
+0xa2, 0x5d, 0x80, 0x05, 0x3f, 0x21, 0x70, 0xf1, 0x63, 0x77, 0xaf, 0x42, 0x20, 0xe5, 0xfd, 0xbf,
+0x81, 0x18, 0x26, 0xc3, 0xbe, 0x35, 0x88, 0x2e, 0x93, 0x55, 0xfc, 0x7a, 0xc8, 0xba, 0x32, 0xe6,
+0xc0, 0x19, 0x9e, 0xa3, 0x44, 0x54, 0x3b, 0x0b, 0x8c, 0xc7, 0x6b, 0x28, 0xa7, 0xbc, 0x16, 0xad,
+0xdb, 0x64, 0x74, 0x14, 0x92, 0x0c, 0x48, 0xb8, 0x9f, 0xbd, 0x43, 0xc4, 0x39, 0x31, 0xd3, 0xf2,
+0xd5, 0x8b, 0x6e, 0xda, 0x01, 0xb1, 0x9c, 0x49, 0xd8, 0xac, 0xf3, 0xcf, 0xca, 0xf4, 0x47, 0x10,
+0x6f, 0xf0, 0x4a, 0x5c, 0x38, 0x57, 0x73, 0x97, 0xcb, 0xa1, 0xe8, 0x3e, 0x96, 0x61, 0x0d, 0x0f,
+0xe0, 0x7c, 0x71, 0xcc, 0x90, 0x06, 0xf7, 0x1c, 0xc2, 0x6a, 0xae, 0x69, 0x17, 0x99, 0x3a, 0x27,
+0xd9, 0xeb, 0x2b, 0x22, 0xd2, 0xa9, 0x07, 0x33, 0x2d, 0x3c, 0x15, 0xc9, 0x87, 0xaa, 0x50, 0xa5,
+0x03, 0x59, 0x09, 0x1a, 0x65, 0xd7, 0x84, 0xd0, 0x82, 0x29, 0x5a, 0x1e, 0x7b, 0xa8, 0x6d, 0x2c
+};
+
+// combined Xtimes3[Sbox[]]
+uint8_t Xtime3Sbox[256] = {
+0xa5, 0x84, 0x99, 0x8d, 0x0d, 0xbd, 0xb1, 0x54, 0x50, 0x03, 0xa9, 0x7d, 0x19, 0x62, 0xe6, 0x9a,
+0x45, 0x9d, 0x40, 0x87, 0x15, 0xeb, 0xc9, 0x0b, 0xec, 0x67, 0xfd, 0xea, 0xbf, 0xf7, 0x96, 0x5b,
+0xc2, 0x1c, 0xae, 0x6a, 0x5a, 0x41, 0x02, 0x4f, 0x5c, 0xf4, 0x34, 0x08, 0x93, 0x73, 0x53, 0x3f,
+0x0c, 0x52, 0x65, 0x5e, 0x28, 0xa1, 0x0f, 0xb5, 0x09, 0x36, 0x9b, 0x3d, 0x26, 0x69, 0xcd, 0x9f,
+0x1b, 0x9e, 0x74, 0x2e, 0x2d, 0xb2, 0xee, 0xfb, 0xf6, 0x4d, 0x61, 0xce, 0x7b, 0x3e, 0x71, 0x97,
+0xf5, 0x68, 0x00, 0x2c, 0x60, 0x1f, 0xc8, 0xed, 0xbe, 0x46, 0xd9, 0x4b, 0xde, 0xd4, 0xe8, 0x4a,
+0x6b, 0x2a, 0xe5, 0x16, 0xc5, 0xd7, 0x55, 0x94, 0xcf, 0x10, 0x06, 0x81, 0xf0, 0x44, 0xba, 0xe3,
+0xf3, 0xfe, 0xc0, 0x8a, 0xad, 0xbc, 0x48, 0x04, 0xdf, 0xc1, 0x75, 0x63, 0x30, 0x1a, 0x0e, 0x6d,
+0x4c, 0x14, 0x35, 0x2f, 0xe1, 0xa2, 0xcc, 0x39, 0x57, 0xf2, 0x82, 0x47, 0xac, 0xe7, 0x2b, 0x95,
+0xa0, 0x98, 0xd1, 0x7f, 0x66, 0x7e, 0xab, 0x83, 0xca, 0x29, 0xd3, 0x3c, 0x79, 0xe2, 0x1d, 0x76,
+0x3b, 0x56, 0x4e, 0x1e, 0xdb, 0x0a, 0x6c, 0xe4, 0x5d, 0x6e, 0xef, 0xa6, 0xa8, 0xa4, 0x37, 0x8b,
+0x32, 0x43, 0x59, 0xb7, 0x8c, 0x64, 0xd2, 0xe0, 0xb4, 0xfa, 0x07, 0x25, 0xaf, 0x8e, 0xe9, 0x18,
+0xd5, 0x88, 0x6f, 0x72, 0x24, 0xf1, 0xc7, 0x51, 0x23, 0x7c, 0x9c, 0x21, 0xdd, 0xdc, 0x86, 0x85,
+0x90, 0x42, 0xc4, 0xaa, 0xd8, 0x05, 0x01, 0x12, 0xa3, 0x5f, 0xf9, 0xd0, 0x91, 0x58, 0x27, 0xb9,
+0x38, 0x13, 0xb3, 0x33, 0xbb, 0x70, 0x89, 0xa7, 0xb6, 0x22, 0x92, 0x20, 0x49, 0xff, 0x78, 0x7a,
+0x8f, 0xf8, 0x80, 0x17, 0xda, 0x31, 0xc6, 0xb8, 0xc3, 0xb0, 0x77, 0x11, 0xcb, 0xfc, 0xd6, 0x3a
+};
+
+// recombine and mix each row in a column
+static inline __m128i MixSubColumns (uint8_t *state)
+{
+ uint8_t tmp[16];
+ // mixing column 0
+ tmp[0] = Xtime2Sbox[state[0]] ^ Xtime3Sbox[state[5]] ^ Sbox[state[10]] ^ Sbox[state[15]];
+ tmp[1] = Sbox[state[0]] ^ Xtime2Sbox[state[5]] ^ Xtime3Sbox[state[10]] ^ Sbox[state[15]];
+ tmp[2] = Sbox[state[0]] ^ Sbox[state[5]] ^ Xtime2Sbox[state[10]] ^ Xtime3Sbox[state[15]];
+ tmp[3] = Xtime3Sbox[state[0]] ^ Sbox[state[5]] ^ Sbox[state[10]] ^ Xtime2Sbox[state[15]];
+
+ // mixing column 1
+ tmp[4] = Xtime2Sbox[state[4]] ^ Xtime3Sbox[state[9]] ^ Sbox[state[14]] ^ Sbox[state[3]];
+ tmp[5] = Sbox[state[4]] ^ Xtime2Sbox[state[9]] ^ Xtime3Sbox[state[14]] ^ Sbox[state[3]];
+ tmp[6] = Sbox[state[4]] ^ Sbox[state[9]] ^ Xtime2Sbox[state[14]] ^ Xtime3Sbox[state[3]];
+ tmp[7] = Xtime3Sbox[state[4]] ^ Sbox[state[9]] ^ Sbox[state[14]] ^ Xtime2Sbox[state[3]];
+
+ // mixing column 2
+ tmp[8] = Xtime2Sbox[state[8]] ^ Xtime3Sbox[state[13]] ^ Sbox[state[2]] ^ Sbox[state[7]];
+ tmp[9] = Sbox[state[8]] ^ Xtime2Sbox[state[13]] ^ Xtime3Sbox[state[2]] ^ Sbox[state[7]];
+ tmp[10] = Sbox[state[8]] ^ Sbox[state[13]] ^ Xtime2Sbox[state[2]] ^ Xtime3Sbox[state[7]];
+ tmp[11] = Xtime3Sbox[state[8]] ^ Sbox[state[13]] ^ Sbox[state[2]] ^ Xtime2Sbox[state[7]];
+
+ // mixing column 3
+ tmp[12] = Xtime2Sbox[state[12]] ^ Xtime3Sbox[state[1]] ^ Sbox[state[6]] ^ Sbox[state[11]];
+ tmp[13] = Sbox[state[12]] ^ Xtime2Sbox[state[1]] ^ Xtime3Sbox[state[6]] ^ Sbox[state[11]];
+ tmp[14] = Sbox[state[12]] ^ Sbox[state[1]] ^ Xtime2Sbox[state[6]] ^ Xtime3Sbox[state[11]];
+ tmp[15] = Xtime3Sbox[state[12]] ^ Sbox[state[1]] ^ Sbox[state[6]] ^ Xtime2Sbox[state[11]];
+
+ return _mm_load_si128((__m128i*)tmp);
+}
+
+// Table-driven stand-in for _mm_aesenc_si128: transforms the 16 bytes of `in` with
+// MixSubColumns() and XORs the result with the round key `key`.
+//   in:  128-bit input block.
+//   key: 128-bit round key.
+//   returns: the transformed block.
+__m128i soft_aesenc(__m128i in, __m128i key)
+{
+    uint8_t state[16];
+    // state is a plain byte array without a 16-byte alignment guarantee, so spill
+    // the vector with the unaligned store rather than the aligned one.
+    _mm_storeu_si128((__m128i*)state, in);
+    return _mm_xor_si128(MixSubColumns(state), key);
+}
+
+// Substitutes the four bytes starting at `key` with their forward S-box values, in place.
+static inline void sub_word(uint8_t* key)
+{
+    int i;
+
+    for (i = 0; i < 4; i++)
+        key[i] = Sbox[key[i]];
+}
+
+// Table-driven stand-in for _mm_aeskeygenassist_si128.
+// Takes 32-bit elements 1 and 3 of `key`, runs both through the S-box, and returns
+// (from lowest to highest element): sub(elem1), rotr(sub(elem1), 8) ^ rcon,
+// sub(elem3), rotr(sub(elem3), 8) ^ rcon.
+__m128i soft_aeskeygenassist(__m128i key, uint8_t rcon)
+{
+    // shuffle 0x55 / 0xFF broadcasts element 1 / element 3; the low element is then extracted
+    uint32_t word1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0x55));
+    uint32_t word3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0xFF));
+    uint32_t rot1, rot3;
+
+    sub_word((uint8_t*)&word1);
+    sub_word((uint8_t*)&word3);
+
+    rot1 = _rotr(word1, 8) ^ rcon;
+    rot3 = _rotr(word3, 8) ^ rcon;
+
+    // _mm_set_epi32 lists elements from highest to lowest
+    return _mm_set_epi32(rot3, word3, rot1, word1);
+}
diff --git a/jconf.cpp b/jconf.cpp
index a14691e..a944075 100644
--- a/jconf.cpp
+++ b/jconf.cpp
@@ -22,6 +22,9 @@
#ifdef _WIN32
#define strcasecmp _stricmp
+#include <intrin.h>
+#else
+#include <cpuid.h>
#endif
#include "rapidjson/document.h"
@@ -120,6 +123,12 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg)
cfg.bDoubleMode = mode->GetBool();
cfg.bNoPrefetch = no_prefetch->GetBool();
+ if(!bHaveAes && (cfg.bDoubleMode || cfg.bNoPrefetch))
+ {
+ printer::inst()->print_msg(L0, "Invalid thread confg - low_power_mode and no_prefetch are unsupported on CPUs without AES-NI.");
+ return false;
+ }
+
if(aff->IsNumber())
cfg.iCpuAff = aff->GetInt64();
else
@@ -194,12 +203,38 @@ uint16_t jconf::GetHttpdPort()
return prv->configValues[iHttpdPort]->GetUint();
}
+// Runs CPUID leaf 1, stores whether the AES-NI flag is set in bHaveAes and prints a
+// warning when it is not. Returns true when the SSE2 flag is set, false otherwise.
+bool jconf::check_cpu_features()
+{
+    // Masks tested against the third and fourth CPUID result words respectively.
+    constexpr int AESNI_MASK = 1 << 25;
+    constexpr int SSE2_MASK = 1 << 26;
+
+    int regs[4];
+#ifdef _WIN32
+    __cpuid(regs, 1);
+#else
+    __cpuid(1, regs[0], regs[1], regs[2], regs[3]);
+#endif
+
+    const bool bHaveSse2 = (regs[3] & SSE2_MASK) != 0;
+    bHaveAes = (regs[2] & AESNI_MASK) != 0;
+
+    if(!bHaveAes)
+        printer::inst()->print_msg(L0, "Your CPU doesn't support hardware AES. Don't expect high hashrates.");
+
+    return bHaveSse2;
+}
+
bool jconf::parse_config(const char* sFilename)
{
FILE * pFile;
char * buffer;
size_t flen;
+ if(!check_cpu_features())
+ {
+ printer::inst()->print_msg(L0, "CPU support of SSE2 is required.");
+ return false;
+ }
+
pFile = fopen(sFilename, "rb");
if (pFile == NULL)
{
diff --git a/jconf.h b/jconf.h
index 1f3a497..73f6870 100644
--- a/jconf.h
+++ b/jconf.h
@@ -46,10 +46,15 @@ public:
bool PreferIpv4();
+ inline bool HaveHardwareAes() { return bHaveAes; }
+
private:
jconf();
static jconf* oInst;
+ bool check_cpu_features();
struct opaque_private;
opaque_private* prv;
+
+ bool bHaveAes;
};
diff --git a/minethd.cpp b/minethd.cpp
index 6390bed..065b3f8 100644
--- a/minethd.cpp
+++ b/minethd.cpp
@@ -21,16 +21,13 @@
#ifdef _WIN32
#include <windows.h>
-#include <intrin.h>
void thd_setaffinity(std::thread::native_handle_type h, uint64_t cpu_id)
{
SetThreadAffinityMask(h, 1 << cpu_id);
}
-
#else
#include <pthread.h>
-#include <cpuid.h>
void thd_setaffinity(std::thread::native_handle_type h, uint64_t cpu_id)
{
@@ -175,29 +172,8 @@ cryptonight_ctx* minethd_alloc_ctx()
return nullptr; //Should never happen
}
-static bool check_cpu_features()
-{
- constexpr int AESNI_BIT = 1 << 25;
- constexpr int SSE2_BIT = 1 << 26;
-
- int cpu_info[4];
-#ifdef _WIN32
- __cpuid(cpu_info, 1);
-#else
- __cpuid(1, cpu_info[0], cpu_info[1], cpu_info[2], cpu_info[3]);
-#endif
- return (cpu_info[2] & AESNI_BIT) != 0 &&
- (cpu_info[3] & SSE2_BIT) != 0;
-}
-
bool minethd::self_test()
{
- if (!check_cpu_features())
- {
- printer::inst()->print_msg(L0, "This application requires CPU support of AES-NI and SSE2 instructions.");
- return false;
- }
-
alloc_msg msg = { 0 };
size_t res;
bool fatal = false;
@@ -350,6 +326,7 @@ void minethd::work_main()
piNonce = (uint32_t*)(oWork.bWorkBlob + 39);
iConsumeCnt++;
+ bool bHaveAes = jconf::inst()->HaveHardwareAes();
while (bQuit == 0)
{
if (oWork.bStall)
@@ -383,10 +360,15 @@ void minethd::work_main()
*piNonce = ++result.iNonce;
- if(bNoPrefetch)
- cryptonight_hash_ctx_np(oWork.bWorkBlob, oWork.iWorkSize, result.bResult, ctx);
+ if(bHaveAes)
+ {
+ if(bNoPrefetch)
+ cryptonight_hash_ctx_np(oWork.bWorkBlob, oWork.iWorkSize, result.bResult, ctx);
+ else
+ cryptonight_hash_ctx(oWork.bWorkBlob, oWork.iWorkSize, result.bResult, ctx);
+ }
else
- cryptonight_hash_ctx(oWork.bWorkBlob, oWork.iWorkSize, result.bResult, ctx);
+ cryptonight_hash_ctx_soft(oWork.bWorkBlob, oWork.iWorkSize, result.bResult, ctx);
if (*piHashVal < oWork.iTarget)
executor::inst()->push_event(ex_event(result, oWork.iPoolId));
diff --git a/xmr-stak-cpu.cbp b/xmr-stak-cpu.cbp
index 0246e37..c544a10 100644
--- a/xmr-stak-cpu.cbp
+++ b/xmr-stak-cpu.cbp
@@ -92,11 +92,15 @@
</Unit>
<Unit filename="crypto/c_skein.h" />
<Unit filename="crypto/cryptonight.h" />
+ <Unit filename="crypto/cryptonight_aesni.h" />
<Unit filename="crypto/cryptonight_common.cpp" />
<Unit filename="crypto/groestl_tables.h" />
<Unit filename="crypto/hash.h" />
<Unit filename="crypto/int-util.h" />
<Unit filename="crypto/skein_port.h" />
+ <Unit filename="crypto/soft_aes.c">
+ <Option compilerVar="CC" />
+ </Unit>
<Unit filename="donate-level.h" />
<Unit filename="executor.cpp" />
<Unit filename="executor.h" />
OpenPOWER on IntegriCloud