diff options
-rw-r--r-- | crypto/cryptonight.h | 5 | ||||
-rw-r--r-- | crypto/cryptonight_aesni.h | 37 | ||||
-rw-r--r-- | crypto/cryptonight_common.cpp | 20 | ||||
-rw-r--r-- | jconf.cpp | 17 | ||||
-rw-r--r-- | jconf.h | 2 | ||||
-rw-r--r-- | minethd.cpp | 122 | ||||
-rw-r--r-- | minethd.h | 7 | ||||
-rw-r--r-- | xmr-stak-cpu.cbp | 2 |
8 files changed, 137 insertions, 75 deletions
diff --git a/crypto/cryptonight.h b/crypto/cryptonight.h index f4ec843..978c798 100644 --- a/crypto/cryptonight.h +++ b/crypto/cryptonight.h @@ -24,11 +24,6 @@ size_t cryptonight_init(size_t use_fast_mem, size_t use_mlock, alloc_msg* msg); cryptonight_ctx* cryptonight_alloc_ctx(size_t use_fast_mem, size_t use_mlock, alloc_msg* msg); void cryptonight_free_ctx(cryptonight_ctx* ctx); -void cryptonight_hash_ctx(const void* input, size_t len, void* output, cryptonight_ctx* ctx); -void cryptonight_hash_ctx_soft(const void* input, size_t len, void* output, cryptonight_ctx* ctx); -void cryptonight_hash_ctx_np(const void* input, size_t len, void* output, cryptonight_ctx* ctx); -void cryptonight_double_hash_ctx(const void* input, size_t len, void* output, cryptonight_ctx* __restrict ctx0, cryptonight_ctx* __restrict ctx1); - #ifdef __cplusplus } #endif diff --git a/crypto/cryptonight_aesni.h b/crypto/cryptonight_aesni.h index 37672c6..de0f186 100644 --- a/crypto/cryptonight_aesni.h +++ b/crypto/cryptonight_aesni.h @@ -279,7 +279,7 @@ void cn_implode_scratchpad(const __m128i* input, __m128i* output) _mm_store_si128(output + 11, xout7); } -template<size_t ITERATIONS, size_t MEM, bool PREFETCH, bool SOFT_AES> +template<size_t ITERATIONS, size_t MEM, bool SOFT_AES, bool PREFETCH, bool MULX> void cryptonight_hash(const void* input, size_t len, void* output, cryptonight_ctx* ctx0) { keccak((const uint8_t *)input, len, ctx0->hash_state, 200); @@ -301,20 +301,28 @@ void cryptonight_hash(const void* input, size_t len, void* output, cryptonight_c { __m128i cx; cx = _mm_load_si128((__m128i *)&l0[idx0 & 0x1FFFF0]); + if(SOFT_AES) cx = soft_aesenc(cx, _mm_set_epi64x(ah0, al0)); else cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah0, al0)); + _mm_store_si128((__m128i *)&l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx)); idx0 = _mm_cvtsi128_si64(cx); bx0 = cx; + if(PREFETCH) _mm_prefetch((const char*)&l0[idx0 & 0x1FFFF0], _MM_HINT_T0); uint64_t hi, lo, cl, ch; cl = ((uint64_t*)&l0[idx0 & 0x1FFFF0])[0]; ch = ((uint64_t*)&l0[idx0 & 0x1FFFF0])[1]; - lo = _umul128(idx0, cl, &hi); + + if(MULX) + lo = _mulx_u64(idx0, cl, (long long unsigned int*)&hi); + else + lo = _umul128(idx0, cl, &hi); + al0 += hi; ah0 += lo; ((uint64_t*)&l0[idx0 & 0x1FFFF0])[0] = al0; @@ -322,6 +330,7 @@ void cryptonight_hash(const void* input, size_t len, void* output, cryptonight_c ah0 ^= ch; al0 ^= cl; idx0 = al0; + if(PREFETCH) _mm_prefetch((const char*)&l0[idx0 & 0x1FFFF0], _MM_HINT_T0); } @@ -338,7 +347,7 @@ void cryptonight_hash(const void* input, size_t len, void* output, cryptonight_c // This lovely creation will do 2 cn hashes at a time. We have plenty of space on silicon // to fit temporary vars for two contexts. Function will read len*2 from input and write 64 bytes to output // We are still limited by L3 cache, so doubling will only work with CPUs where we have more than 2MB to core (Xeons) -template<size_t ITERATIONS, size_t MEM, bool PREFETCH, bool SOFT_AES> +template<size_t ITERATIONS, size_t MEM, bool SOFT_AES, bool PREFETCH, bool MULX> void cryptonight_double_hash(const void* input, size_t len, void* output, cryptonight_ctx* __restrict ctx0, cryptonight_ctx* __restrict ctx1) { keccak((const uint8_t *)input, len, ctx0->hash_state, 200); @@ -366,43 +375,61 @@ void cryptonight_double_hash(const void* input, size_t len, void* output, crypto { __m128i cx; cx = _mm_load_si128((__m128i *)&l0[idx0 & 0x1FFFF0]); + if(SOFT_AES) cx = soft_aesenc(cx, ax0); else cx = _mm_aesenc_si128(cx, ax0); + _mm_store_si128((__m128i *)&l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx)); idx0 = _mm_cvtsi128_si64(cx); bx0 = cx; + if(PREFETCH) _mm_prefetch((const char*)&l0[idx0 & 0x1FFFF0], _MM_HINT_T0); cx = _mm_load_si128((__m128i *)&l1[idx1 & 0x1FFFF0]); + if(SOFT_AES) cx = soft_aesenc(cx, ax1); else cx = _mm_aesenc_si128(cx, ax1); + _mm_store_si128((__m128i *)&l1[idx1 & 0x1FFFF0], _mm_xor_si128(bx1, cx)); idx1 = _mm_cvtsi128_si64(cx); bx1 = cx; + if(PREFETCH) _mm_prefetch((const char*)&l1[idx1 & 0x1FFFF0], _MM_HINT_T0); uint64_t hi, lo; cx = _mm_load_si128((__m128i *)&l0[idx0 & 0x1FFFF0]); - lo = _umul128(idx0, _mm_cvtsi128_si64(cx), &hi); + + if(MULX) + lo = _mulx_u64(idx0, _mm_cvtsi128_si64(cx), (long long unsigned int*)&hi); + else + lo = _umul128(idx0, _mm_cvtsi128_si64(cx), &hi); + ax0 = _mm_add_epi64(ax0, _mm_set_epi64x(lo, hi)); _mm_store_si128((__m128i*)&l0[idx0 & 0x1FFFF0], ax0); ax0 = _mm_xor_si128(ax0, cx); idx0 = _mm_cvtsi128_si64(ax0); + if(PREFETCH) _mm_prefetch((const char*)&l0[idx0 & 0x1FFFF0], _MM_HINT_T0); cx = _mm_load_si128((__m128i *)&l1[idx1 & 0x1FFFF0]); - lo = _umul128(idx1, _mm_cvtsi128_si64(cx), &hi); + + if(MULX) + lo = _mulx_u64(idx1, _mm_cvtsi128_si64(cx), (long long unsigned int*)&hi); + else + lo = _umul128(idx1, _mm_cvtsi128_si64(cx), &hi); + ax1 = _mm_add_epi64(ax1, _mm_set_epi64x(lo, hi)); _mm_store_si128((__m128i*)&l1[idx1 & 0x1FFFF0], ax1); ax1 = _mm_xor_si128(ax1, cx); idx1 = _mm_cvtsi128_si64(ax1); + if(PREFETCH) _mm_prefetch((const char*)&l1[idx1 & 0x1FFFF0], _MM_HINT_T0); } diff --git a/crypto/cryptonight_common.cpp b/crypto/cryptonight_common.cpp index bbcff34..63ce3a4 100644 --- a/crypto/cryptonight_common.cpp +++ b/crypto/cryptonight_common.cpp @@ -190,23 +190,3 @@ void cryptonight_free_ctx(cryptonight_ctx* ctx) _mm_free(ctx); } - -void cryptonight_hash_ctx(const void* input, size_t len, void* output, cryptonight_ctx* ctx) -{ - cryptonight_hash<0x80000, MEMORY, true, false>(input, len, output, ctx); -} - -void cryptonight_hash_ctx_soft(const void* input, size_t len, void* output, cryptonight_ctx* ctx) -{ - cryptonight_hash<0x80000, MEMORY, true, true>(input, len, output, ctx); -} - -void cryptonight_hash_ctx_np(const void* input, size_t len, void* output, cryptonight_ctx* ctx) -{ - cryptonight_hash<0x80000, MEMORY, false, false>(input, len, output, ctx); -} - -void cryptonight_double_hash_ctx(const void* input, size_t len, void* output, cryptonight_ctx* __restrict ctx0, cryptonight_ctx* __restrict ctx1) -{ - cryptonight_double_hash<0x80000, MEMORY, false, false>(input, len, output, ctx0, ctx1); -} @@ -253,8 +253,11 @@ bool jconf::check_cpu_features() { constexpr int AESNI_BIT = 1 << 25; constexpr int SSE2_BIT = 1 << 26; + constexpr int BMI2_BIT = 1 << 8; int cpu_info[4]; + bool bHaveSse2; + #ifdef _WIN32 __cpuid(cpu_info, 1); #else @@ -262,11 +265,23 @@ bool jconf::check_cpu_features() #endif bHaveAes = (cpu_info[2] & AESNI_BIT) != 0; + bHaveSse2 = (cpu_info[3] & SSE2_BIT) != 0; + +#ifdef _WIN32 + __cpuidex(cpu_info, 7, 0); +#else + __cpuid_count(7, 0, cpu_info[0], cpu_info[1], cpu_info[2], cpu_info[3]); +#endif + + bHaveBmi2 = (cpu_info[1] & BMI2_BIT) != 0; if(!bHaveAes) printer::inst()->print_msg(L0, "Your CPU doesn't support hardware AES. Don't expect high hashrates."); - return (cpu_info[3] & SSE2_BIT) != 0; + if(bHaveBmi2) + printer::inst()->print_msg(L0, "CPU supports BMI2 instructions. Faster multiplication enabled."); + + return bHaveSse2; } bool jconf::parse_config(const char* sFilename) @@ -56,6 +56,7 @@ public: bool PreferIpv4(); inline bool HaveHardwareAes() { return bHaveAes; } + inline bool HaveMulx() { return bHaveBmi2; } private: jconf(); @@ -66,4 +67,5 @@ private: opaque_private* prv; bool bHaveAes; + bool bHaveBmi2; }; diff --git a/minethd.cpp b/minethd.cpp index 90a6f31..cf26037 100644 --- a/minethd.cpp +++ b/minethd.cpp @@ -25,6 +25,7 @@ #include <cmath> #include <chrono> #include <thread> +#include <bitset> #include "console.h" #ifdef _WIN32 @@ -62,7 +63,7 @@ void thd_setaffinity(std::thread::native_handle_type h, uint64_t cpu_id) #include "executor.h" #include "minethd.h" #include "jconf.h" -#include "crypto/cryptonight.h" +#include "crypto/cryptonight_aesni.h" telemetry::telemetry(size_t iThd) { @@ -244,42 +245,29 @@ bool minethd::self_test() return false; } - bool bHasLp = ctx0->ctx_info[0] == 1 && ctx1->ctx_info[0] == 1; - size_t n = jconf::inst()->GetThreadCount(); - jconf::thd_cfg cfg; - for (size_t i = 0; i < n; i++) - { - jconf::inst()->GetThreadConfig(i, cfg); - - if(!bHasLp && cfg.bNoPrefetch) - { - printer::inst()->print_msg(L0, "Wrong config. You are running in slow memory mode with no_prefetch."); - cryptonight_free_ctx(ctx0); - cryptonight_free_ctx(ctx1); - return false; - } - } - unsigned char out[64]; bool bResult; - if(jconf::inst()->HaveHardwareAes()) - { - cryptonight_hash_ctx("This is a test", 14, out, ctx0); - bResult = memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0; + cn_hash_fun hashf; + cn_hash_fun_dbl hashdf; - cryptonight_hash_ctx_np("This is a test", 14, out, ctx0); - bResult &= memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0; + hashf = func_selector(jconf::inst()->HaveHardwareAes(), false, jconf::inst()->HaveMulx()); + hashf("This is a test", 14, out, ctx0); + bResult = memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0; - cryptonight_double_hash_ctx("The quick brown fox jumps over the lazy dogThe quick brown fox jumps over the lazy log", 43, out, ctx0, ctx1); - bResult &= memcmp(out, "\x3e\xbb\x7f\x9f\x7d\x27\x3d\x7c\x31\x8d\x86\x94\x77\x55\x0c\xc8\x00\xcf\xb1\x1b\x0c\xad\xb7\xff\xbd\xf6\xf8\x9f\x3a\x47\x1c\x59" - "\xb4\x77\xd5\x02\xe4\xd8\x48\x7f\x42\xdf\xe3\x8e\xed\x73\x81\x7a\xda\x91\xb7\xe2\x63\xd2\x91\x71\xb6\x5c\x44\x3a\x01\x2a\x41\x22", 64) == 0; - } - else - { - cryptonight_hash_ctx_soft("This is a test", 14, out, ctx0); - bResult = memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0; - } + hashf = func_selector(jconf::inst()->HaveHardwareAes(), true, jconf::inst()->HaveMulx()); + hashf("This is a test", 14, out, ctx0); + bResult &= memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0; + + hashdf = func_dbl_selector(jconf::inst()->HaveHardwareAes(), false, jconf::inst()->HaveMulx()); + hashdf("The quick brown fox jumps over the lazy dogThe quick brown fox jumps over the lazy log", 43, out, ctx0, ctx1); + bResult &= memcmp(out, "\x3e\xbb\x7f\x9f\x7d\x27\x3d\x7c\x31\x8d\x86\x94\x77\x55\x0c\xc8\x00\xcf\xb1\x1b\x0c\xad\xb7\xff\xbd\xf6\xf8\x9f\x3a\x47\x1c\x59" + "\xb4\x77\xd5\x02\xe4\xd8\x48\x7f\x42\xdf\xe3\x8e\xed\x73\x81\x7a\xda\x91\xb7\xe2\x63\xd2\x91\x71\xb6\x5c\x44\x3a\x01\x2a\x41\x22", 64) == 0; + + hashdf = func_dbl_selector(jconf::inst()->HaveHardwareAes(), true, jconf::inst()->HaveMulx()); + hashdf("The quick brown fox jumps over the lazy dogThe quick brown fox jumps over the lazy log", 43, out, ctx0, ctx1); + bResult &= memcmp(out, "\x3e\xbb\x7f\x9f\x7d\x27\x3d\x7c\x31\x8d\x86\x94\x77\x55\x0c\xc8\x00\xcf\xb1\x1b\x0c\xad\xb7\xff\xbd\xf6\xf8\x9f\x3a\x47\x1c\x59" + "\xb4\x77\xd5\x02\xe4\xd8\x48\x7f\x42\xdf\xe3\x8e\xed\x73\x81\x7a\xda\x91\xb7\xe2\x63\xd2\x91\x71\xb6\x5c\x44\x3a\x01\x2a\x41\x22", 64) == 0; cryptonight_free_ctx(ctx0); cryptonight_free_ctx(ctx1); @@ -350,21 +338,48 @@ void minethd::consume_work() iConsumeCnt++; } +minethd::cn_hash_fun minethd::func_selector(bool bHaveAes, bool bNoPrefetch, bool bMulx) +{ + // We have three independent flag bits in the functions + // therefore we will build a binary digit and select the + // function as a three digit binary + // Digit order SOFT_AES, NO_PREFETCH, MULX + + static const cn_hash_fun func_table[8] = { + cryptonight_hash<0x80000, MEMORY, false, false, false>, + cryptonight_hash<0x80000, MEMORY, false, false, true>, + cryptonight_hash<0x80000, MEMORY, false, true, false>, + cryptonight_hash<0x80000, MEMORY, false, true, true>, + cryptonight_hash<0x80000, MEMORY, true, false, false>, + cryptonight_hash<0x80000, MEMORY, true, false, true>, + cryptonight_hash<0x80000, MEMORY, true, true, false>, + cryptonight_hash<0x80000, MEMORY, true, true, true> + }; + + std::bitset<3> digit; + digit.set(0, bMulx); + digit.set(1, bNoPrefetch); + digit.set(2, !bHaveAes); + + return func_table[digit.to_ulong()]; +} + void minethd::work_main() { + cn_hash_fun hash_fun; cryptonight_ctx* ctx; uint64_t iCount = 0; uint64_t* piHashVal; uint32_t* piNonce; job_result result; + hash_fun = func_selector(jconf::inst()->HaveHardwareAes(), bNoPrefetch, jconf::inst()->HaveMulx()); ctx = minethd_alloc_ctx(); piHashVal = (uint64_t*)(result.bResult + 24); piNonce = (uint32_t*)(oWork.bWorkBlob + 39); iConsumeCnt++; - bool bHaveAes = jconf::inst()->HaveHardwareAes(); while (bQuit == 0) { if (oWork.bStall) @@ -401,15 +416,7 @@ void minethd::work_main() *piNonce = ++result.iNonce; - if(bHaveAes) - { - if(bNoPrefetch) - cryptonight_hash_ctx_np(oWork.bWorkBlob, oWork.iWorkSize, result.bResult, ctx); - else - cryptonight_hash_ctx(oWork.bWorkBlob, oWork.iWorkSize, result.bResult, ctx); - } - else - cryptonight_hash_ctx_soft(oWork.bWorkBlob, oWork.iWorkSize, result.bResult, ctx); + hash_fun(oWork.bWorkBlob, oWork.iWorkSize, result.bResult, ctx); if (*piHashVal < oWork.iTarget) executor::inst()->push_event(ex_event(result, oWork.iPoolId)); @@ -423,8 +430,35 @@ void minethd::work_main() cryptonight_free_ctx(ctx); } +minethd::cn_hash_fun_dbl minethd::func_dbl_selector(bool bHaveAes, bool bNoPrefetch, bool bMulx) +{ + // We have three independent flag bits in the functions + // therefore we will build a binary digit and select the + // function as a three digit binary + // Digit order SOFT_AES, NO_PREFETCH, MULX + + static const cn_hash_fun_dbl func_table[8] = { + cryptonight_double_hash<0x80000, MEMORY, false, false, false>, + cryptonight_double_hash<0x80000, MEMORY, false, false, true>, + cryptonight_double_hash<0x80000, MEMORY, false, true, false>, + cryptonight_double_hash<0x80000, MEMORY, false, true, true>, + cryptonight_double_hash<0x80000, MEMORY, true, false, false>, + cryptonight_double_hash<0x80000, MEMORY, true, false, true>, + cryptonight_double_hash<0x80000, MEMORY, true, true, false>, + cryptonight_double_hash<0x80000, MEMORY, true, true, true> + }; + + std::bitset<3> digit; + digit.set(0, bMulx); + digit.set(1, bNoPrefetch); + digit.set(2, !bHaveAes); + + return func_table[digit.to_ulong()]; +} + void minethd::double_work_main() { + cn_hash_fun_dbl hash_fun; cryptonight_ctx* ctx0; cryptonight_ctx* ctx1; uint64_t iCount = 0; @@ -435,6 +469,7 @@ void minethd::double_work_main() uint32_t iNonce; job_result res; + hash_fun = func_dbl_selector(jconf::inst()->HaveHardwareAes(), bNoPrefetch, jconf::inst()->HaveMulx()); ctx0 = minethd_alloc_ctx(); ctx1 = minethd_alloc_ctx(); @@ -484,7 +519,8 @@ void minethd::double_work_main() *piNonce0 = ++iNonce; *piNonce1 = ++iNonce; - cryptonight_double_hash_ctx(bDoubleWorkBlob, oWork.iWorkSize, bDoubleHashOut, ctx0, ctx1); + + hash_fun(bDoubleWorkBlob, oWork.iWorkSize, bDoubleHashOut, ctx0, ctx1); if (*piHashVal0 < oWork.iTarget) executor::inst()->push_event(ex_event(job_result(oWork.sJobID, iNonce-1, bDoubleHashOut), oWork.iPoolId)); @@ -1,6 +1,7 @@ #pragma once #include <thread> #include <atomic> +#include "crypto/cryptonight.h" class telemetry { @@ -97,6 +98,9 @@ public: std::atomic<uint64_t> iTimestamp; private: + typedef void (*cn_hash_fun)(const void*, size_t, void*, cryptonight_ctx*); + typedef void (*cn_hash_fun_dbl)(const void*, size_t, void*, __restrict cryptonight_ctx*, __restrict cryptonight_ctx*); + minethd(miner_work& pWork, size_t iNo, bool double_work, bool no_prefetch); // We use the top 10 bits of the nonce for thread and resume @@ -110,6 +114,9 @@ private: inline uint32_t calc_nicehash_nonce(uint32_t start, uint32_t resume) { return start | (resume * iThreadCount + iThreadNo) << 18; } + static cn_hash_fun func_selector(bool bHaveAes, bool bNoPrefetch, bool bMulx); + static cn_hash_fun_dbl func_dbl_selector(bool bHaveAes, bool bNoPrefetch, bool bMulx); + void work_main(); void double_work_main(); void consume_work(); diff --git a/xmr-stak-cpu.cbp b/xmr-stak-cpu.cbp index cc606fd..8df8044 100644 --- a/xmr-stak-cpu.cbp +++ b/xmr-stak-cpu.cbp @@ -59,7 +59,7 @@ </Target> </Build> <Compiler> - <Add option="-march=westmere" /> + <Add option="-march=haswell" /> <Add option="-Wall" /> </Compiler> <Linker> |