diff options
-rw-r--r-- | crypto/cryptonight.h | 1 | ||||
-rw-r--r-- | crypto/cryptonight_aesni.h | 207 | ||||
-rw-r--r-- | crypto/cryptonight_common.cpp | 13 | ||||
-rw-r--r-- | crypto/soft_aes.c | 140 | ||||
-rw-r--r-- | jconf.cpp | 35 | ||||
-rw-r--r-- | jconf.h | 5 | ||||
-rw-r--r-- | minethd.cpp | 36 | ||||
-rw-r--r-- | xmr-stak-cpu.cbp | 4 |
8 files changed, 337 insertions, 104 deletions
diff --git a/crypto/cryptonight.h b/crypto/cryptonight.h index acc5fd1..f4ec843 100644 --- a/crypto/cryptonight.h +++ b/crypto/cryptonight.h @@ -25,6 +25,7 @@ cryptonight_ctx* cryptonight_alloc_ctx(size_t use_fast_mem, size_t use_mlock, al void cryptonight_free_ctx(cryptonight_ctx* ctx); void cryptonight_hash_ctx(const void* input, size_t len, void* output, cryptonight_ctx* ctx); +void cryptonight_hash_ctx_soft(const void* input, size_t len, void* output, cryptonight_ctx* ctx); void cryptonight_hash_ctx_np(const void* input, size_t len, void* output, cryptonight_ctx* ctx); void cryptonight_double_hash_ctx(const void* input, size_t len, void* output, cryptonight_ctx* __restrict ctx0, cryptonight_ctx* __restrict ctx1); diff --git a/crypto/cryptonight_aesni.h b/crypto/cryptonight_aesni.h index 4bf099e..4ae7d6e 100644 --- a/crypto/cryptonight_aesni.h +++ b/crypto/cryptonight_aesni.h @@ -42,6 +42,9 @@ extern "C" void keccak(const uint8_t *in, int inlen, uint8_t *md, int mdlen); void keccakf(uint64_t st[25], int rounds); extern void(*const extra_hashes[4])(const void *, size_t, char *); + + __m128i soft_aesenc(__m128i in, __m128i key); + __m128i soft_aeskeygenassist(__m128i key, uint8_t rcon); } // This will shift and xor tmp1 into itself as 4 32-bit vals such as @@ -58,61 +61,66 @@ static inline __m128i sl_xor(__m128i tmp1) return tmp1; } +static inline void aes_genkey_sub(__m128i* xout0, __m128i* xout2, uint8_t rcon) +{ + __m128i xout1 = _mm_aeskeygenassist_si128(*xout2, rcon); + xout1 = _mm_shuffle_epi32(xout1, 0xFF); // see PSHUFD, set all elems to 4th elem + *xout0 = sl_xor(*xout0); + *xout0 = _mm_xor_si128(*xout0, xout1); + xout1 = _mm_aeskeygenassist_si128(*xout0, 0x00); + xout1 = _mm_shuffle_epi32(xout1, 0xAA); // see PSHUFD, set all elems to 3rd elem + *xout2 = sl_xor(*xout2); + *xout2 = _mm_xor_si128(*xout2, xout1); +} + +static inline void soft_aes_genkey_sub(__m128i* xout0, __m128i* xout2, uint8_t rcon) +{ + __m128i xout1 = soft_aeskeygenassist(*xout2, rcon); + xout1 = _mm_shuffle_epi32(xout1, 0xFF); // see PSHUFD, set all elems to 4th elem + *xout0 = sl_xor(*xout0); + *xout0 = _mm_xor_si128(*xout0, xout1); + xout1 = soft_aeskeygenassist(*xout0, 0x00); + xout1 = _mm_shuffle_epi32(xout1, 0xAA); // see PSHUFD, set all elems to 3rd elem + *xout2 = sl_xor(*xout2); + *xout2 = _mm_xor_si128(*xout2, xout1); +} + +template<bool SOFT_AES> static inline void aes_genkey(const __m128i* memory, __m128i* k0, __m128i* k1, __m128i* k2, __m128i* k3, __m128i* k4, __m128i* k5, __m128i* k6, __m128i* k7, __m128i* k8, __m128i* k9) { - __m128i xout0, xout1, xout2; + __m128i xout0, xout2; xout0 = _mm_load_si128(memory); xout2 = _mm_load_si128(memory+1); *k0 = xout0; *k1 = xout2; - xout1 = _mm_aeskeygenassist_si128(xout2, 0x01); - xout1 = _mm_shuffle_epi32(xout1, 0xFF); // see PSHUFD, set all elems to 4th elem - xout0 = sl_xor(xout0); - xout0 = _mm_xor_si128(xout0, xout1); - - xout1 = _mm_aeskeygenassist_si128(xout0, 0x00); - xout1 = _mm_shuffle_epi32(xout1, 0xAA); // see PSHUFD, set all elems to 3rd elem - xout2 = sl_xor(xout2); - xout2 = _mm_xor_si128(xout2, xout1); + if(SOFT_AES) + soft_aes_genkey_sub(&xout0, &xout2, 0x01); + else + aes_genkey_sub(&xout0, &xout2, 0x01); *k2 = xout0; *k3 = xout2; - xout1 = _mm_aeskeygenassist_si128(xout2, 0x02); - xout1 = _mm_shuffle_epi32(xout1, 0xFF); - xout0 = sl_xor(xout0); - xout0 = _mm_xor_si128(xout0, xout1); - - xout1 = _mm_aeskeygenassist_si128(xout0, 0x00); - xout1 = _mm_shuffle_epi32(xout1, 0xAA); - xout2 = sl_xor(xout2); - xout2 = _mm_xor_si128(xout2, xout1); + if(SOFT_AES) + soft_aes_genkey_sub(&xout0, &xout2, 0x02); + else + aes_genkey_sub(&xout0, &xout2, 0x02); *k4 = xout0; *k5 = xout2; - xout1 = _mm_aeskeygenassist_si128(xout2, 0x04); - xout1 = _mm_shuffle_epi32(xout1, 0xFF); - xout0 = sl_xor(xout0); - xout0 = _mm_xor_si128(xout0, xout1); - - xout1 = _mm_aeskeygenassist_si128(xout0, 0x00); - xout1 = _mm_shuffle_epi32(xout1, 0xAA); - xout2 = sl_xor(xout2); - xout2 = _mm_xor_si128(xout2, xout1); + if(SOFT_AES) + soft_aes_genkey_sub(&xout0, &xout2, 0x04); + else + aes_genkey_sub(&xout0, &xout2, 0x04); *k6 = xout0; *k7 = xout2; - xout1 = _mm_aeskeygenassist_si128(xout2, 0x08); - xout1 = _mm_shuffle_epi32(xout1, 0xFF); - xout0 = sl_xor(xout0); - xout0 = _mm_xor_si128(xout0, xout1); - - xout1 = _mm_aeskeygenassist_si128(xout0, 0x00); - xout1 = _mm_shuffle_epi32(xout1, 0xAA); - xout2 = sl_xor(xout2); - xout2 = _mm_xor_si128(xout2, xout1); + if(SOFT_AES) + soft_aes_genkey_sub(&xout0, &xout2, 0x08); + else + aes_genkey_sub(&xout0, &xout2, 0x08); *k8 = xout0; *k9 = xout2; } @@ -129,14 +137,26 @@ static inline void aes_round(__m128i key, __m128i* x0, __m128i* x1, __m128i* x2, *x7 = _mm_aesenc_si128(*x7, key); } -template<size_t MEM> +static inline void soft_aes_round(__m128i key, __m128i* x0, __m128i* x1, __m128i* x2, __m128i* x3, __m128i* x4, __m128i* x5, __m128i* x6, __m128i* x7) +{ + *x0 = soft_aesenc(*x0, key); + *x1 = soft_aesenc(*x1, key); + *x2 = soft_aesenc(*x2, key); + *x3 = soft_aesenc(*x3, key); + *x4 = soft_aesenc(*x4, key); + *x5 = soft_aesenc(*x5, key); + *x6 = soft_aesenc(*x6, key); + *x7 = soft_aesenc(*x7, key); +} + +template<size_t MEM, bool SOFT_AES> void cn_explode_scratchpad(const __m128i* input, __m128i* output) { // This is more than we have registers, compiler will assign 2 keys on the stack __m128i xin0, xin1, xin2, xin3, xin4, xin5, xin6, xin7; __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9; - aes_genkey(input, &k0, &k1, &k2, &k3, &k4, &k5, &k6, &k7, &k8, &k9); + aes_genkey<SOFT_AES>(input, &k0, &k1, &k2, &k3, &k4, &k5, &k6, &k7, &k8, &k9); xin0 = _mm_load_si128(input + 4); xin1 = _mm_load_si128(input + 5); @@ -149,16 +169,32 @@ void cn_explode_scratchpad(const __m128i* input, __m128i* output) for (size_t i = 0; i < MEM / sizeof(__m128i); i += 8) { - aes_round(k0, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); - aes_round(k1, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); - aes_round(k2, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); - aes_round(k3, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); - aes_round(k4, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); - aes_round(k5, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); - aes_round(k6, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); - aes_round(k7, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); - aes_round(k8, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); - aes_round(k9, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + if(SOFT_AES) + { + soft_aes_round(k0, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + soft_aes_round(k1, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + soft_aes_round(k2, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + soft_aes_round(k3, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + soft_aes_round(k4, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + soft_aes_round(k5, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + soft_aes_round(k6, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + soft_aes_round(k7, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + soft_aes_round(k8, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + soft_aes_round(k9, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + } + else + { + aes_round(k0, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + aes_round(k1, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + aes_round(k2, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + aes_round(k3, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + aes_round(k4, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + aes_round(k5, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + aes_round(k6, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + aes_round(k7, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + aes_round(k8, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + aes_round(k9, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + } _mm_store_si128(output + i + 0, xin0); _mm_store_si128(output + i + 1, xin1); @@ -173,14 +209,14 @@ void cn_explode_scratchpad(const __m128i* input, __m128i* output) } } -template<size_t MEM> +template<size_t MEM, bool SOFT_AES> void cn_implode_scratchpad(const __m128i* input, __m128i* output) { // This is more than we have registers, compiler will assign 2 keys on the stack __m128i xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7; __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9; - aes_genkey(output + 2, &k0, &k1, &k2, &k3, &k4, &k5, &k6, &k7, &k8, &k9); + aes_genkey<SOFT_AES>(output + 2, &k0, &k1, &k2, &k3, &k4, &k5, &k6, &k7, &k8, &k9); xout0 = _mm_load_si128(output + 4); xout1 = _mm_load_si128(output + 5); @@ -204,16 +240,32 @@ void cn_implode_scratchpad(const __m128i* input, __m128i* output) xout6 = _mm_xor_si128(_mm_load_si128(input + i + 6), xout6); xout7 = _mm_xor_si128(_mm_load_si128(input + i + 7), xout7); - aes_round(k0, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); - aes_round(k1, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); - aes_round(k2, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); - aes_round(k3, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); - aes_round(k4, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); - aes_round(k5, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); - aes_round(k6, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); - aes_round(k7, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); - aes_round(k8, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); - aes_round(k9, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + if(SOFT_AES) + { + soft_aes_round(k0, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + soft_aes_round(k1, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + soft_aes_round(k2, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + soft_aes_round(k3, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + soft_aes_round(k4, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + soft_aes_round(k5, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + soft_aes_round(k6, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + soft_aes_round(k7, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + soft_aes_round(k8, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + soft_aes_round(k9, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + } + else + { + aes_round(k0, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k1, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k2, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k3, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k4, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k5, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k6, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k7, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k8, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k9, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + } } _mm_store_si128(output + 4, xout0); @@ -226,13 +278,13 @@ void cn_implode_scratchpad(const __m128i* input, __m128i* output) _mm_store_si128(output + 11, xout7); } -template<size_t ITERATIONS, size_t MEM, bool PREFETCH> +template<size_t ITERATIONS, size_t MEM, bool PREFETCH, bool SOFT_AES> void cryptonight_hash(const void* input, size_t len, void* output, cryptonight_ctx* ctx0) { keccak((const uint8_t *)input, len, ctx0->hash_state, 200); // Optim - 99% time boundary - cn_explode_scratchpad<MEM>((__m128i*)ctx0->hash_state, (__m128i*)ctx0->long_state); + cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*)ctx0->hash_state, (__m128i*)ctx0->long_state); uint8_t* l0 = ctx0->long_state; uint64_t* h0 = (uint64_t*)ctx0->hash_state; @@ -248,7 +300,10 @@ void cryptonight_hash(const void* input, size_t len, void* output, cryptonight_c { __m128i cx; cx = _mm_load_si128((__m128i *)&l0[idx0 & 0x1FFFF0]); - cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah0, al0)); + if(SOFT_AES) + cx = soft_aesenc(cx, _mm_set_epi64x(ah0, al0)); + else + cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah0, al0)); _mm_store_si128((__m128i *)&l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx)); idx0 = _mm_cvtsi128_si64(cx); bx0 = cx; @@ -271,7 +326,7 @@ void cryptonight_hash(const void* input, size_t len, void* output, cryptonight_c } // Optim - 90% time boundary - cn_implode_scratchpad<MEM>((__m128i*)ctx0->long_state, (__m128i*)ctx0->hash_state); + cn_implode_scratchpad<MEM, SOFT_AES>((__m128i*)ctx0->long_state, (__m128i*)ctx0->hash_state); // Optim - 99% time boundary @@ -282,15 +337,15 @@ void cryptonight_hash(const void* input, size_t len, void* output, cryptonight_c // This lovely creation will do 2 cn hashes at a time. We have plenty of space on silicon // to fit temporary vars for two contexts. Function will read len*2 from input and write 64 bytes to output // We are still limited by L3 cache, so doubling will only work with CPUs where we have more than 2MB to core (Xeons) -template<size_t ITERATIONS, size_t MEM, bool PREFETCH> +template<size_t ITERATIONS, size_t MEM, bool PREFETCH, bool SOFT_AES> void cryptonight_double_hash(const void* input, size_t len, void* output, cryptonight_ctx* __restrict ctx0, cryptonight_ctx* __restrict ctx1) { keccak((const uint8_t *)input, len, ctx0->hash_state, 200); keccak((const uint8_t *)input+len, len, ctx1->hash_state, 200); // Optim - 99% time boundary - cn_explode_scratchpad<MEM>((__m128i*)ctx0->hash_state, (__m128i*)ctx0->long_state); - cn_explode_scratchpad<MEM>((__m128i*)ctx1->hash_state, (__m128i*)ctx1->long_state); + cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*)ctx0->hash_state, (__m128i*)ctx0->long_state); + cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*)ctx1->hash_state, (__m128i*)ctx1->long_state); uint8_t* l0 = ctx0->long_state; uint64_t* h0 = (uint64_t*)ctx0->hash_state; @@ -310,7 +365,10 @@ void cryptonight_double_hash(const void* input, size_t len, void* output, crypto { __m128i cx; cx = _mm_load_si128((__m128i *)&l0[idx0 & 0x1FFFF0]); - cx = _mm_aesenc_si128(cx, ax0); + if(SOFT_AES) + cx = soft_aesenc(cx, ax0); + else + cx = _mm_aesenc_si128(cx, ax0); _mm_store_si128((__m128i *)&l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx)); idx0 = _mm_cvtsi128_si64(cx); bx0 = cx; @@ -318,7 +376,10 @@ void cryptonight_double_hash(const void* input, size_t len, void* output, crypto _mm_prefetch((const char*)&l0[idx0 & 0x1FFFF0], _MM_HINT_T0); cx = _mm_load_si128((__m128i *)&l1[idx1 & 0x1FFFF0]); - cx = _mm_aesenc_si128(cx, ax1); + if(SOFT_AES) + cx = soft_aesenc(cx, ax1); + else + cx = _mm_aesenc_si128(cx, ax1); _mm_store_si128((__m128i *)&l1[idx1 & 0x1FFFF0], _mm_xor_si128(bx1, cx)); idx1 = _mm_cvtsi128_si64(cx); bx1 = cx; @@ -346,8 +407,8 @@ void cryptonight_double_hash(const void* input, size_t len, void* output, crypto } // Optim - 90% time boundary - cn_implode_scratchpad<MEM>((__m128i*)ctx0->long_state, (__m128i*)ctx0->hash_state); - cn_implode_scratchpad<MEM>((__m128i*)ctx1->long_state, (__m128i*)ctx1->hash_state); + cn_implode_scratchpad<MEM, SOFT_AES>((__m128i*)ctx0->long_state, (__m128i*)ctx0->hash_state); + cn_implode_scratchpad<MEM, SOFT_AES>((__m128i*)ctx1->long_state, (__m128i*)ctx1->hash_state); // Optim - 99% time boundary @@ -355,4 +416,4 @@ void cryptonight_double_hash(const void* input, size_t len, void* output, crypto extra_hashes[ctx0->hash_state[0] & 3](ctx0->hash_state, 200, (char*)output); keccakf((uint64_t*)ctx1->hash_state, 24); extra_hashes[ctx1->hash_state[0] & 3](ctx1->hash_state, 200, (char*)output + 32); -}
\ No newline at end of file +} diff --git a/crypto/cryptonight_common.cpp b/crypto/cryptonight_common.cpp index bc0a922..57f23e3 100644 --- a/crypto/cryptonight_common.cpp +++ b/crypto/cryptonight_common.cpp @@ -174,15 +174,20 @@ void cryptonight_free_ctx(cryptonight_ctx* ctx) void cryptonight_hash_ctx(const void* input, size_t len, void* output, cryptonight_ctx* ctx) { - cryptonight_hash<0x80000, MEMORY, true>(input, len, output, ctx); + cryptonight_hash<0x80000, MEMORY, true, false>(input, len, output, ctx); +} + +void cryptonight_hash_ctx_soft(const void* input, size_t len, void* output, cryptonight_ctx* ctx) +{ + cryptonight_hash<0x80000, MEMORY, true, true>(input, len, output, ctx); } void cryptonight_hash_ctx_np(const void* input, size_t len, void* output, cryptonight_ctx* ctx) { - cryptonight_hash<0x80000, MEMORY, false>(input, len, output, ctx); + cryptonight_hash<0x80000, MEMORY, false, false>(input, len, output, ctx); } void cryptonight_double_hash_ctx(const void* input, size_t len, void* output, cryptonight_ctx* __restrict ctx0, cryptonight_ctx* __restrict ctx1) { - cryptonight_double_hash<0x80000, MEMORY, false>(input, len, output, ctx0, ctx1); -}
\ No newline at end of file + cryptonight_double_hash<0x80000, MEMORY, false, false>(input, len, output, ctx0, ctx1); +} diff --git a/crypto/soft_aes.c b/crypto/soft_aes.c new file mode 100644 index 0000000..10dfcba --- /dev/null +++ b/crypto/soft_aes.c @@ -0,0 +1,140 @@ +/* + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +/* + * The orginal author of this AES implementation is Karl Malbrain. + */ + +#ifdef __GNUC__ +#include <x86intrin.h> +#else +#include <intrin.h> +#endif // __GNUC__ + +#include <inttypes.h> + +uint8_t Sbox[256] = { // forward s-box +0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76, +0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0, +0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15, +0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75, +0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84, +0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf, +0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8, +0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2, +0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73, +0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb, +0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79, +0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08, +0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a, +0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e, +0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf, +0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16}; + +// combined Xtimes2[Sbox[]] +uint8_t Xtime2Sbox[256] = { +0xc6, 0xf8, 0xee, 0xf6, 0xff, 0xd6, 0xde, 0x91, 0x60, 0x02, 0xce, 0x56, 0xe7, 0xb5, 0x4d, 0xec, +0x8f, 0x1f, 0x89, 0xfa, 0xef, 0xb2, 0x8e, 0xfb, 0x41, 0xb3, 0x5f, 0x45, 0x23, 0x53, 0xe4, 0x9b, +0x75, 0xe1, 0x3d, 0x4c, 0x6c, 0x7e, 0xf5, 0x83, 0x68, 0x51, 0xd1, 0xf9, 0xe2, 0xab, 0x62, 0x2a, +0x08, 0x95, 0x46, 0x9d, 0x30, 0x37, 0x0a, 0x2f, 0x0e, 0x24, 0x1b, 0xdf, 0xcd, 0x4e, 0x7f, 0xea, +0x12, 0x1d, 0x58, 0x34, 0x36, 0xdc, 0xb4, 0x5b, 0xa4, 0x76, 0xb7, 0x7d, 0x52, 0xdd, 0x5e, 0x13, +0xa6, 0xb9, 0x00, 0xc1, 0x40, 0xe3, 0x79, 0xb6, 0xd4, 0x8d, 0x67, 0x72, 0x94, 0x98, 0xb0, 0x85, +0xbb, 0xc5, 0x4f, 0xed, 0x86, 0x9a, 0x66, 0x11, 0x8a, 0xe9, 0x04, 0xfe, 0xa0, 0x78, 0x25, 0x4b, +0xa2, 0x5d, 0x80, 0x05, 0x3f, 0x21, 0x70, 0xf1, 0x63, 0x77, 0xaf, 0x42, 0x20, 0xe5, 0xfd, 0xbf, +0x81, 0x18, 0x26, 0xc3, 0xbe, 0x35, 0x88, 0x2e, 0x93, 0x55, 0xfc, 0x7a, 0xc8, 0xba, 0x32, 0xe6, +0xc0, 0x19, 0x9e, 0xa3, 0x44, 0x54, 0x3b, 0x0b, 0x8c, 0xc7, 0x6b, 0x28, 0xa7, 0xbc, 0x16, 0xad, +0xdb, 0x64, 0x74, 0x14, 0x92, 0x0c, 0x48, 0xb8, 0x9f, 0xbd, 0x43, 0xc4, 0x39, 0x31, 0xd3, 0xf2, +0xd5, 0x8b, 0x6e, 0xda, 0x01, 0xb1, 0x9c, 0x49, 0xd8, 0xac, 0xf3, 0xcf, 0xca, 0xf4, 0x47, 0x10, +0x6f, 0xf0, 0x4a, 0x5c, 0x38, 0x57, 0x73, 0x97, 0xcb, 0xa1, 0xe8, 0x3e, 0x96, 0x61, 0x0d, 0x0f, +0xe0, 0x7c, 0x71, 0xcc, 0x90, 0x06, 0xf7, 0x1c, 0xc2, 0x6a, 0xae, 0x69, 0x17, 0x99, 0x3a, 0x27, +0xd9, 0xeb, 0x2b, 0x22, 0xd2, 0xa9, 0x07, 0x33, 0x2d, 0x3c, 0x15, 0xc9, 0x87, 0xaa, 0x50, 0xa5, +0x03, 0x59, 0x09, 0x1a, 0x65, 0xd7, 0x84, 0xd0, 0x82, 0x29, 0x5a, 0x1e, 0x7b, 0xa8, 0x6d, 0x2c +}; + +// combined Xtimes3[Sbox[]] +uint8_t Xtime3Sbox[256] = { +0xa5, 0x84, 0x99, 0x8d, 0x0d, 0xbd, 0xb1, 0x54, 0x50, 0x03, 0xa9, 0x7d, 0x19, 0x62, 0xe6, 0x9a, +0x45, 0x9d, 0x40, 0x87, 0x15, 0xeb, 0xc9, 0x0b, 0xec, 0x67, 0xfd, 0xea, 0xbf, 0xf7, 0x96, 0x5b, +0xc2, 0x1c, 0xae, 0x6a, 0x5a, 0x41, 0x02, 0x4f, 0x5c, 0xf4, 0x34, 0x08, 0x93, 0x73, 0x53, 0x3f, +0x0c, 0x52, 0x65, 0x5e, 0x28, 0xa1, 0x0f, 0xb5, 0x09, 0x36, 0x9b, 0x3d, 0x26, 0x69, 0xcd, 0x9f, +0x1b, 0x9e, 0x74, 0x2e, 0x2d, 0xb2, 0xee, 0xfb, 0xf6, 0x4d, 0x61, 0xce, 0x7b, 0x3e, 0x71, 0x97, +0xf5, 0x68, 0x00, 0x2c, 0x60, 0x1f, 0xc8, 0xed, 0xbe, 0x46, 0xd9, 0x4b, 0xde, 0xd4, 0xe8, 0x4a, +0x6b, 0x2a, 0xe5, 0x16, 0xc5, 0xd7, 0x55, 0x94, 0xcf, 0x10, 0x06, 0x81, 0xf0, 0x44, 0xba, 0xe3, +0xf3, 0xfe, 0xc0, 0x8a, 0xad, 0xbc, 0x48, 0x04, 0xdf, 0xc1, 0x75, 0x63, 0x30, 0x1a, 0x0e, 0x6d, +0x4c, 0x14, 0x35, 0x2f, 0xe1, 0xa2, 0xcc, 0x39, 0x57, 0xf2, 0x82, 0x47, 0xac, 0xe7, 0x2b, 0x95, +0xa0, 0x98, 0xd1, 0x7f, 0x66, 0x7e, 0xab, 0x83, 0xca, 0x29, 0xd3, 0x3c, 0x79, 0xe2, 0x1d, 0x76, +0x3b, 0x56, 0x4e, 0x1e, 0xdb, 0x0a, 0x6c, 0xe4, 0x5d, 0x6e, 0xef, 0xa6, 0xa8, 0xa4, 0x37, 0x8b, +0x32, 0x43, 0x59, 0xb7, 0x8c, 0x64, 0xd2, 0xe0, 0xb4, 0xfa, 0x07, 0x25, 0xaf, 0x8e, 0xe9, 0x18, +0xd5, 0x88, 0x6f, 0x72, 0x24, 0xf1, 0xc7, 0x51, 0x23, 0x7c, 0x9c, 0x21, 0xdd, 0xdc, 0x86, 0x85, +0x90, 0x42, 0xc4, 0xaa, 0xd8, 0x05, 0x01, 0x12, 0xa3, 0x5f, 0xf9, 0xd0, 0x91, 0x58, 0x27, 0xb9, +0x38, 0x13, 0xb3, 0x33, 0xbb, 0x70, 0x89, 0xa7, 0xb6, 0x22, 0x92, 0x20, 0x49, 0xff, 0x78, 0x7a, +0x8f, 0xf8, 0x80, 0x17, 0xda, 0x31, 0xc6, 0xb8, 0xc3, 0xb0, 0x77, 0x11, 0xcb, 0xfc, 0xd6, 0x3a +}; + +// recombine and mix each row in a column +static inline __m128i MixSubColumns (uint8_t *state) +{ + uint8_t tmp[16]; + // mixing column 0 + tmp[0] = Xtime2Sbox[state[0]] ^ Xtime3Sbox[state[5]] ^ Sbox[state[10]] ^ Sbox[state[15]]; + tmp[1] = Sbox[state[0]] ^ Xtime2Sbox[state[5]] ^ Xtime3Sbox[state[10]] ^ Sbox[state[15]]; + tmp[2] = Sbox[state[0]] ^ Sbox[state[5]] ^ Xtime2Sbox[state[10]] ^ Xtime3Sbox[state[15]]; + tmp[3] = Xtime3Sbox[state[0]] ^ Sbox[state[5]] ^ Sbox[state[10]] ^ Xtime2Sbox[state[15]]; + + // mixing column 1 + tmp[4] = Xtime2Sbox[state[4]] ^ Xtime3Sbox[state[9]] ^ Sbox[state[14]] ^ Sbox[state[3]]; + tmp[5] = Sbox[state[4]] ^ Xtime2Sbox[state[9]] ^ Xtime3Sbox[state[14]] ^ Sbox[state[3]]; + tmp[6] = Sbox[state[4]] ^ Sbox[state[9]] ^ Xtime2Sbox[state[14]] ^ Xtime3Sbox[state[3]]; + tmp[7] = Xtime3Sbox[state[4]] ^ Sbox[state[9]] ^ Sbox[state[14]] ^ Xtime2Sbox[state[3]]; + + // mixing column 2 + tmp[8] = Xtime2Sbox[state[8]] ^ Xtime3Sbox[state[13]] ^ Sbox[state[2]] ^ Sbox[state[7]]; + tmp[9] = Sbox[state[8]] ^ Xtime2Sbox[state[13]] ^ Xtime3Sbox[state[2]] ^ Sbox[state[7]]; + tmp[10] = Sbox[state[8]] ^ Sbox[state[13]] ^ Xtime2Sbox[state[2]] ^ Xtime3Sbox[state[7]]; + tmp[11] = Xtime3Sbox[state[8]] ^ Sbox[state[13]] ^ Sbox[state[2]] ^ Xtime2Sbox[state[7]]; + + // mixing column 3 + tmp[12] = Xtime2Sbox[state[12]] ^ Xtime3Sbox[state[1]] ^ Sbox[state[6]] ^ Sbox[state[11]]; + tmp[13] = Sbox[state[12]] ^ Xtime2Sbox[state[1]] ^ Xtime3Sbox[state[6]] ^ Sbox[state[11]]; + tmp[14] = Sbox[state[12]] ^ Sbox[state[1]] ^ Xtime2Sbox[state[6]] ^ Xtime3Sbox[state[11]]; + tmp[15] = Xtime3Sbox[state[12]] ^ Sbox[state[1]] ^ Sbox[state[6]] ^ Xtime2Sbox[state[11]]; + + return _mm_load_si128((__m128i*)tmp); +} + +__m128i soft_aesenc(__m128i in, __m128i key) +{ + uint8_t state[16]; + _mm_store_si128((__m128i*)state, in); + __m128i out = MixSubColumns(state); + return _mm_xor_si128(out, key); +} + +static inline void sub_word(uint8_t* key) +{ + key[0] = Sbox[key[0]]; + key[1] = Sbox[key[1]]; + key[2] = Sbox[key[2]]; + key[3] = Sbox[key[3]]; +} + +__m128i soft_aeskeygenassist(__m128i key, uint8_t rcon) +{ + uint32_t X1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0x55)); + uint32_t X3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0xFF)); + sub_word((uint8_t*)&X1); + sub_word((uint8_t*)&X3); + return _mm_set_epi32(_rotr(X3, 8) ^ rcon, X3,_rotr(X1, 8) ^ rcon, X1); +} @@ -22,6 +22,9 @@ #ifdef _WIN32 #define strcasecmp _stricmp +#include <intrin.h> +#else +#include <cpuid.h> #endif #include "rapidjson/document.h" @@ -120,6 +123,12 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg) cfg.bDoubleMode = mode->GetBool(); cfg.bNoPrefetch = no_prefetch->GetBool(); + if(!bHaveAes && (cfg.bDoubleMode || cfg.bNoPrefetch)) + { + printer::inst()->print_msg(L0, "Invalid thread confg - low_power_mode and no_prefetch are unsupported on CPUs without AES-NI."); + return false; + } + if(aff->IsNumber()) cfg.iCpuAff = aff->GetInt64(); else @@ -194,12 +203,38 @@ uint16_t jconf::GetHttpdPort() return prv->configValues[iHttpdPort]->GetUint(); } +bool jconf::check_cpu_features() +{ + constexpr int AESNI_BIT = 1 << 25; + constexpr int SSE2_BIT = 1 << 26; + + int cpu_info[4]; +#ifdef _WIN32 + __cpuid(cpu_info, 1); +#else + __cpuid(1, cpu_info[0], cpu_info[1], cpu_info[2], cpu_info[3]); +#endif + + bHaveAes = (cpu_info[2] & AESNI_BIT) != 0; + + if(!bHaveAes) + printer::inst()->print_msg(L0, "Your CPU doesn't support hardware AES. Don't expect high hashrates."); + + return (cpu_info[3] & SSE2_BIT) != 0; +} + bool jconf::parse_config(const char* sFilename) { FILE * pFile; char * buffer; size_t flen; + if(!check_cpu_features()) + { + printer::inst()->print_msg(L0, "CPU support of SSE2 is required."); + return false; + } + pFile = fopen(sFilename, "rb"); if (pFile == NULL) { @@ -46,10 +46,15 @@ public: bool PreferIpv4(); + inline bool HaveHardwareAes() { return bHaveAes; } + private: jconf(); static jconf* oInst; + bool check_cpu_features(); struct opaque_private; opaque_private* prv; + + bool bHaveAes; }; diff --git a/minethd.cpp b/minethd.cpp index 6390bed..065b3f8 100644 --- a/minethd.cpp +++ b/minethd.cpp @@ -21,16 +21,13 @@ #ifdef _WIN32 #include <windows.h> -#include <intrin.h> void thd_setaffinity(std::thread::native_handle_type h, uint64_t cpu_id) { SetThreadAffinityMask(h, 1 << cpu_id); } - #else #include <pthread.h> -#include <cpuid.h> void thd_setaffinity(std::thread::native_handle_type h, uint64_t cpu_id) { @@ -175,29 +172,8 @@ cryptonight_ctx* minethd_alloc_ctx() return nullptr; //Should never happen } -static bool check_cpu_features() -{ - constexpr int AESNI_BIT = 1 << 25; - constexpr int SSE2_BIT = 1 << 26; - - int cpu_info[4]; -#ifdef _WIN32 - __cpuid(cpu_info, 1); -#else - __cpuid(1, cpu_info[0], cpu_info[1], cpu_info[2], cpu_info[3]); -#endif - return (cpu_info[2] & AESNI_BIT) != 0 && - (cpu_info[3] & SSE2_BIT) != 0; -} - bool minethd::self_test() { - if (!check_cpu_features()) - { - printer::inst()->print_msg(L0, "This application requires CPU support of AES-NI and SSE2 instructions."); - return false; - } - alloc_msg msg = { 0 }; size_t res; bool fatal = false; @@ -350,6 +326,7 @@ void minethd::work_main() piNonce = (uint32_t*)(oWork.bWorkBlob + 39); iConsumeCnt++; + bool bHaveAes = jconf::inst()->HaveHardwareAes(); while (bQuit == 0) { if (oWork.bStall) @@ -383,10 +360,15 @@ void minethd::work_main() *piNonce = ++result.iNonce; - if(bNoPrefetch) - cryptonight_hash_ctx_np(oWork.bWorkBlob, oWork.iWorkSize, result.bResult, ctx); + if(bHaveAes) + { + if(bNoPrefetch) + cryptonight_hash_ctx_np(oWork.bWorkBlob, oWork.iWorkSize, result.bResult, ctx); + else + cryptonight_hash_ctx(oWork.bWorkBlob, oWork.iWorkSize, result.bResult, ctx); + } else - cryptonight_hash_ctx(oWork.bWorkBlob, oWork.iWorkSize, result.bResult, ctx); + cryptonight_hash_ctx_soft(oWork.bWorkBlob, oWork.iWorkSize, result.bResult, ctx); if (*piHashVal < oWork.iTarget) executor::inst()->push_event(ex_event(result, oWork.iPoolId)); diff --git a/xmr-stak-cpu.cbp b/xmr-stak-cpu.cbp index 0246e37..c544a10 100644 --- a/xmr-stak-cpu.cbp +++ b/xmr-stak-cpu.cbp @@ -92,11 +92,15 @@ </Unit> <Unit filename="crypto/c_skein.h" /> <Unit filename="crypto/cryptonight.h" /> + <Unit filename="crypto/cryptonight_aesni.h" /> <Unit filename="crypto/cryptonight_common.cpp" /> <Unit filename="crypto/groestl_tables.h" /> <Unit filename="crypto/hash.h" /> <Unit filename="crypto/int-util.h" /> <Unit filename="crypto/skein_port.h" /> + <Unit filename="crypto/soft_aes.c"> + <Option compilerVar="CC" /> + </Unit> <Unit filename="donate-level.h" /> <Unit filename="executor.cpp" /> <Unit filename="executor.h" /> |