summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--crypto/cryptonight.h5
-rw-r--r--crypto/cryptonight_aesni.h37
-rw-r--r--crypto/cryptonight_common.cpp20
-rw-r--r--jconf.cpp17
-rw-r--r--jconf.h2
-rw-r--r--minethd.cpp122
-rw-r--r--minethd.h7
-rw-r--r--xmr-stak-cpu.cbp2
8 files changed, 137 insertions, 75 deletions
diff --git a/crypto/cryptonight.h b/crypto/cryptonight.h
index f4ec843..978c798 100644
--- a/crypto/cryptonight.h
+++ b/crypto/cryptonight.h
@@ -24,11 +24,6 @@ size_t cryptonight_init(size_t use_fast_mem, size_t use_mlock, alloc_msg* msg);
cryptonight_ctx* cryptonight_alloc_ctx(size_t use_fast_mem, size_t use_mlock, alloc_msg* msg);
void cryptonight_free_ctx(cryptonight_ctx* ctx);
-void cryptonight_hash_ctx(const void* input, size_t len, void* output, cryptonight_ctx* ctx);
-void cryptonight_hash_ctx_soft(const void* input, size_t len, void* output, cryptonight_ctx* ctx);
-void cryptonight_hash_ctx_np(const void* input, size_t len, void* output, cryptonight_ctx* ctx);
-void cryptonight_double_hash_ctx(const void* input, size_t len, void* output, cryptonight_ctx* __restrict ctx0, cryptonight_ctx* __restrict ctx1);
-
#ifdef __cplusplus
}
#endif
diff --git a/crypto/cryptonight_aesni.h b/crypto/cryptonight_aesni.h
index 37672c6..de0f186 100644
--- a/crypto/cryptonight_aesni.h
+++ b/crypto/cryptonight_aesni.h
@@ -279,7 +279,7 @@ void cn_implode_scratchpad(const __m128i* input, __m128i* output)
_mm_store_si128(output + 11, xout7);
}
-template<size_t ITERATIONS, size_t MEM, bool PREFETCH, bool SOFT_AES>
+template<size_t ITERATIONS, size_t MEM, bool SOFT_AES, bool PREFETCH, bool MULX>
void cryptonight_hash(const void* input, size_t len, void* output, cryptonight_ctx* ctx0)
{
keccak((const uint8_t *)input, len, ctx0->hash_state, 200);
@@ -301,20 +301,28 @@ void cryptonight_hash(const void* input, size_t len, void* output, cryptonight_c
{
__m128i cx;
cx = _mm_load_si128((__m128i *)&l0[idx0 & 0x1FFFF0]);
+
if(SOFT_AES)
cx = soft_aesenc(cx, _mm_set_epi64x(ah0, al0));
else
cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah0, al0));
+
_mm_store_si128((__m128i *)&l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx));
idx0 = _mm_cvtsi128_si64(cx);
bx0 = cx;
+
if(PREFETCH)
_mm_prefetch((const char*)&l0[idx0 & 0x1FFFF0], _MM_HINT_T0);
uint64_t hi, lo, cl, ch;
cl = ((uint64_t*)&l0[idx0 & 0x1FFFF0])[0];
ch = ((uint64_t*)&l0[idx0 & 0x1FFFF0])[1];
- lo = _umul128(idx0, cl, &hi);
+
+ if(MULX)
+ lo = _mulx_u64(idx0, cl, (long long unsigned int*)&hi);
+ else
+ lo = _umul128(idx0, cl, &hi);
+
al0 += hi;
ah0 += lo;
((uint64_t*)&l0[idx0 & 0x1FFFF0])[0] = al0;
@@ -322,6 +330,7 @@ void cryptonight_hash(const void* input, size_t len, void* output, cryptonight_c
ah0 ^= ch;
al0 ^= cl;
idx0 = al0;
+
if(PREFETCH)
_mm_prefetch((const char*)&l0[idx0 & 0x1FFFF0], _MM_HINT_T0);
}
@@ -338,7 +347,7 @@ void cryptonight_hash(const void* input, size_t len, void* output, cryptonight_c
// This lovely creation will do 2 cn hashes at a time. We have plenty of space on silicon
// to fit temporary vars for two contexts. Function will read len*2 from input and write 64 bytes to output
// We are still limited by L3 cache, so doubling will only work with CPUs where we have more than 2MB to core (Xeons)
-template<size_t ITERATIONS, size_t MEM, bool PREFETCH, bool SOFT_AES>
+template<size_t ITERATIONS, size_t MEM, bool SOFT_AES, bool PREFETCH, bool MULX>
void cryptonight_double_hash(const void* input, size_t len, void* output, cryptonight_ctx* __restrict ctx0, cryptonight_ctx* __restrict ctx1)
{
keccak((const uint8_t *)input, len, ctx0->hash_state, 200);
@@ -366,43 +375,61 @@ void cryptonight_double_hash(const void* input, size_t len, void* output, crypto
{
__m128i cx;
cx = _mm_load_si128((__m128i *)&l0[idx0 & 0x1FFFF0]);
+
if(SOFT_AES)
cx = soft_aesenc(cx, ax0);
else
cx = _mm_aesenc_si128(cx, ax0);
+
_mm_store_si128((__m128i *)&l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx));
idx0 = _mm_cvtsi128_si64(cx);
bx0 = cx;
+
if(PREFETCH)
_mm_prefetch((const char*)&l0[idx0 & 0x1FFFF0], _MM_HINT_T0);
cx = _mm_load_si128((__m128i *)&l1[idx1 & 0x1FFFF0]);
+
if(SOFT_AES)
cx = soft_aesenc(cx, ax1);
else
cx = _mm_aesenc_si128(cx, ax1);
+
_mm_store_si128((__m128i *)&l1[idx1 & 0x1FFFF0], _mm_xor_si128(bx1, cx));
idx1 = _mm_cvtsi128_si64(cx);
bx1 = cx;
+
if(PREFETCH)
_mm_prefetch((const char*)&l1[idx1 & 0x1FFFF0], _MM_HINT_T0);
uint64_t hi, lo;
cx = _mm_load_si128((__m128i *)&l0[idx0 & 0x1FFFF0]);
- lo = _umul128(idx0, _mm_cvtsi128_si64(cx), &hi);
+
+ if(MULX)
+ lo = _mulx_u64(idx0, _mm_cvtsi128_si64(cx), (long long unsigned int*)&hi);
+ else
+ lo = _umul128(idx0, _mm_cvtsi128_si64(cx), &hi);
+
ax0 = _mm_add_epi64(ax0, _mm_set_epi64x(lo, hi));
_mm_store_si128((__m128i*)&l0[idx0 & 0x1FFFF0], ax0);
ax0 = _mm_xor_si128(ax0, cx);
idx0 = _mm_cvtsi128_si64(ax0);
+
if(PREFETCH)
_mm_prefetch((const char*)&l0[idx0 & 0x1FFFF0], _MM_HINT_T0);
cx = _mm_load_si128((__m128i *)&l1[idx1 & 0x1FFFF0]);
- lo = _umul128(idx1, _mm_cvtsi128_si64(cx), &hi);
+
+ if(MULX)
+ lo = _mulx_u64(idx1, _mm_cvtsi128_si64(cx), (long long unsigned int*)&hi);
+ else
+ lo = _umul128(idx1, _mm_cvtsi128_si64(cx), &hi);
+
ax1 = _mm_add_epi64(ax1, _mm_set_epi64x(lo, hi));
_mm_store_si128((__m128i*)&l1[idx1 & 0x1FFFF0], ax1);
ax1 = _mm_xor_si128(ax1, cx);
idx1 = _mm_cvtsi128_si64(ax1);
+
if(PREFETCH)
_mm_prefetch((const char*)&l1[idx1 & 0x1FFFF0], _MM_HINT_T0);
}
diff --git a/crypto/cryptonight_common.cpp b/crypto/cryptonight_common.cpp
index bbcff34..63ce3a4 100644
--- a/crypto/cryptonight_common.cpp
+++ b/crypto/cryptonight_common.cpp
@@ -190,23 +190,3 @@ void cryptonight_free_ctx(cryptonight_ctx* ctx)
_mm_free(ctx);
}
-
-void cryptonight_hash_ctx(const void* input, size_t len, void* output, cryptonight_ctx* ctx)
-{
- cryptonight_hash<0x80000, MEMORY, true, false>(input, len, output, ctx);
-}
-
-void cryptonight_hash_ctx_soft(const void* input, size_t len, void* output, cryptonight_ctx* ctx)
-{
- cryptonight_hash<0x80000, MEMORY, true, true>(input, len, output, ctx);
-}
-
-void cryptonight_hash_ctx_np(const void* input, size_t len, void* output, cryptonight_ctx* ctx)
-{
- cryptonight_hash<0x80000, MEMORY, false, false>(input, len, output, ctx);
-}
-
-void cryptonight_double_hash_ctx(const void* input, size_t len, void* output, cryptonight_ctx* __restrict ctx0, cryptonight_ctx* __restrict ctx1)
-{
- cryptonight_double_hash<0x80000, MEMORY, false, false>(input, len, output, ctx0, ctx1);
-}
diff --git a/jconf.cpp b/jconf.cpp
index 612065e..ce82525 100644
--- a/jconf.cpp
+++ b/jconf.cpp
@@ -253,8 +253,11 @@ bool jconf::check_cpu_features()
{
constexpr int AESNI_BIT = 1 << 25;
constexpr int SSE2_BIT = 1 << 26;
+ constexpr int BMI2_BIT = 1 << 8;
int cpu_info[4];
+ bool bHaveSse2;
+
#ifdef _WIN32
__cpuid(cpu_info, 1);
#else
@@ -262,11 +265,23 @@ bool jconf::check_cpu_features()
#endif
bHaveAes = (cpu_info[2] & AESNI_BIT) != 0;
+ bHaveSse2 = (cpu_info[3] & SSE2_BIT) != 0;
+
+#ifdef _WIN32
+ __cpuidex(cpu_info, 7, 0);
+#else
+ __cpuid_count(7, 0, cpu_info[0], cpu_info[1], cpu_info[2], cpu_info[3]);
+#endif
+
+ bHaveBmi2 = (cpu_info[1] & BMI2_BIT) != 0;
if(!bHaveAes)
printer::inst()->print_msg(L0, "Your CPU doesn't support hardware AES. Don't expect high hashrates.");
- return (cpu_info[3] & SSE2_BIT) != 0;
+ if(bHaveBmi2)
+ printer::inst()->print_msg(L0, "CPU supports BMI2 instructions. Faster multiplication enabled.");
+
+ return bHaveSse2;
}
bool jconf::parse_config(const char* sFilename)
diff --git a/jconf.h b/jconf.h
index 6c3954b..7444a01 100644
--- a/jconf.h
+++ b/jconf.h
@@ -56,6 +56,7 @@ public:
bool PreferIpv4();
inline bool HaveHardwareAes() { return bHaveAes; }
+ inline bool HaveMulx() { return bHaveBmi2; }
private:
jconf();
@@ -66,4 +67,5 @@ private:
opaque_private* prv;
bool bHaveAes;
+ bool bHaveBmi2;
};
diff --git a/minethd.cpp b/minethd.cpp
index 90a6f31..cf26037 100644
--- a/minethd.cpp
+++ b/minethd.cpp
@@ -25,6 +25,7 @@
#include <cmath>
#include <chrono>
#include <thread>
+#include <bitset>
#include "console.h"
#ifdef _WIN32
@@ -62,7 +63,7 @@ void thd_setaffinity(std::thread::native_handle_type h, uint64_t cpu_id)
#include "executor.h"
#include "minethd.h"
#include "jconf.h"
-#include "crypto/cryptonight.h"
+#include "crypto/cryptonight_aesni.h"
telemetry::telemetry(size_t iThd)
{
@@ -244,42 +245,29 @@ bool minethd::self_test()
return false;
}
- bool bHasLp = ctx0->ctx_info[0] == 1 && ctx1->ctx_info[0] == 1;
- size_t n = jconf::inst()->GetThreadCount();
- jconf::thd_cfg cfg;
- for (size_t i = 0; i < n; i++)
- {
- jconf::inst()->GetThreadConfig(i, cfg);
-
- if(!bHasLp && cfg.bNoPrefetch)
- {
- printer::inst()->print_msg(L0, "Wrong config. You are running in slow memory mode with no_prefetch.");
- cryptonight_free_ctx(ctx0);
- cryptonight_free_ctx(ctx1);
- return false;
- }
- }
-
unsigned char out[64];
bool bResult;
- if(jconf::inst()->HaveHardwareAes())
- {
- cryptonight_hash_ctx("This is a test", 14, out, ctx0);
- bResult = memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0;
+ cn_hash_fun hashf;
+ cn_hash_fun_dbl hashdf;
- cryptonight_hash_ctx_np("This is a test", 14, out, ctx0);
- bResult &= memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0;
+ hashf = func_selector(jconf::inst()->HaveHardwareAes(), false, jconf::inst()->HaveMulx());
+ hashf("This is a test", 14, out, ctx0);
+ bResult = memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0;
- cryptonight_double_hash_ctx("The quick brown fox jumps over the lazy dogThe quick brown fox jumps over the lazy log", 43, out, ctx0, ctx1);
- bResult &= memcmp(out, "\x3e\xbb\x7f\x9f\x7d\x27\x3d\x7c\x31\x8d\x86\x94\x77\x55\x0c\xc8\x00\xcf\xb1\x1b\x0c\xad\xb7\xff\xbd\xf6\xf8\x9f\x3a\x47\x1c\x59"
- "\xb4\x77\xd5\x02\xe4\xd8\x48\x7f\x42\xdf\xe3\x8e\xed\x73\x81\x7a\xda\x91\xb7\xe2\x63\xd2\x91\x71\xb6\x5c\x44\x3a\x01\x2a\x41\x22", 64) == 0;
- }
- else
- {
- cryptonight_hash_ctx_soft("This is a test", 14, out, ctx0);
- bResult = memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0;
- }
+ hashf = func_selector(jconf::inst()->HaveHardwareAes(), true, jconf::inst()->HaveMulx());
+ hashf("This is a test", 14, out, ctx0);
+ bResult &= memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0;
+
+ hashdf = func_dbl_selector(jconf::inst()->HaveHardwareAes(), false, jconf::inst()->HaveMulx());
+ hashdf("The quick brown fox jumps over the lazy dogThe quick brown fox jumps over the lazy log", 43, out, ctx0, ctx1);
+ bResult &= memcmp(out, "\x3e\xbb\x7f\x9f\x7d\x27\x3d\x7c\x31\x8d\x86\x94\x77\x55\x0c\xc8\x00\xcf\xb1\x1b\x0c\xad\xb7\xff\xbd\xf6\xf8\x9f\x3a\x47\x1c\x59"
+ "\xb4\x77\xd5\x02\xe4\xd8\x48\x7f\x42\xdf\xe3\x8e\xed\x73\x81\x7a\xda\x91\xb7\xe2\x63\xd2\x91\x71\xb6\x5c\x44\x3a\x01\x2a\x41\x22", 64) == 0;
+
+ hashdf = func_dbl_selector(jconf::inst()->HaveHardwareAes(), true, jconf::inst()->HaveMulx());
+ hashdf("The quick brown fox jumps over the lazy dogThe quick brown fox jumps over the lazy log", 43, out, ctx0, ctx1);
+ bResult &= memcmp(out, "\x3e\xbb\x7f\x9f\x7d\x27\x3d\x7c\x31\x8d\x86\x94\x77\x55\x0c\xc8\x00\xcf\xb1\x1b\x0c\xad\xb7\xff\xbd\xf6\xf8\x9f\x3a\x47\x1c\x59"
+ "\xb4\x77\xd5\x02\xe4\xd8\x48\x7f\x42\xdf\xe3\x8e\xed\x73\x81\x7a\xda\x91\xb7\xe2\x63\xd2\x91\x71\xb6\x5c\x44\x3a\x01\x2a\x41\x22", 64) == 0;
cryptonight_free_ctx(ctx0);
cryptonight_free_ctx(ctx1);
@@ -350,21 +338,48 @@ void minethd::consume_work()
iConsumeCnt++;
}
+minethd::cn_hash_fun minethd::func_selector(bool bHaveAes, bool bNoPrefetch, bool bMulx)
+{
+ // Pick the cryptonight_hash instantiation matching the three runtime flags.
+ // The table is ordered as a 3-bit number whose bits, high to low, are the
+ // template arguments SOFT_AES, PREFETCH, MULX. The template flag is PREFETCH,
+ // so bNoPrefetch has to be inverted, just as bHaveAes is for SOFT_AES.
+
+ static const cn_hash_fun func_table[8] = {
+ cryptonight_hash<0x80000, MEMORY, false, false, false>,
+ cryptonight_hash<0x80000, MEMORY, false, false, true>,
+ cryptonight_hash<0x80000, MEMORY, false, true, false>,
+ cryptonight_hash<0x80000, MEMORY, false, true, true>,
+ cryptonight_hash<0x80000, MEMORY, true, false, false>,
+ cryptonight_hash<0x80000, MEMORY, true, false, true>,
+ cryptonight_hash<0x80000, MEMORY, true, true, false>,
+ cryptonight_hash<0x80000, MEMORY, true, true, true>
+ };
+
+ std::bitset<3> digit;
+ digit.set(0, bMulx);
+ digit.set(1, !bNoPrefetch);
+ digit.set(2, !bHaveAes);
+
+ return func_table[digit.to_ulong()];
+}
+
void minethd::work_main()
{
+ cn_hash_fun hash_fun;
cryptonight_ctx* ctx;
uint64_t iCount = 0;
uint64_t* piHashVal;
uint32_t* piNonce;
job_result result;
+ hash_fun = func_selector(jconf::inst()->HaveHardwareAes(), bNoPrefetch, jconf::inst()->HaveMulx());
ctx = minethd_alloc_ctx();
piHashVal = (uint64_t*)(result.bResult + 24);
piNonce = (uint32_t*)(oWork.bWorkBlob + 39);
iConsumeCnt++;
- bool bHaveAes = jconf::inst()->HaveHardwareAes();
while (bQuit == 0)
{
if (oWork.bStall)
@@ -401,15 +416,7 @@ void minethd::work_main()
*piNonce = ++result.iNonce;
- if(bHaveAes)
- {
- if(bNoPrefetch)
- cryptonight_hash_ctx_np(oWork.bWorkBlob, oWork.iWorkSize, result.bResult, ctx);
- else
- cryptonight_hash_ctx(oWork.bWorkBlob, oWork.iWorkSize, result.bResult, ctx);
- }
- else
- cryptonight_hash_ctx_soft(oWork.bWorkBlob, oWork.iWorkSize, result.bResult, ctx);
+ hash_fun(oWork.bWorkBlob, oWork.iWorkSize, result.bResult, ctx);
if (*piHashVal < oWork.iTarget)
executor::inst()->push_event(ex_event(result, oWork.iPoolId));
@@ -423,8 +430,35 @@ void minethd::work_main()
cryptonight_free_ctx(ctx);
}
+minethd::cn_hash_fun_dbl minethd::func_dbl_selector(bool bHaveAes, bool bNoPrefetch, bool bMulx)
+{
+ // Pick the cryptonight_double_hash instantiation matching the runtime flags.
+ // The table is ordered as a 3-bit number whose bits, high to low, are the
+ // template arguments SOFT_AES, PREFETCH, MULX. The template flag is PREFETCH,
+ // so bNoPrefetch has to be inverted, just as bHaveAes is for SOFT_AES.
+
+ static const cn_hash_fun_dbl func_table[8] = {
+ cryptonight_double_hash<0x80000, MEMORY, false, false, false>,
+ cryptonight_double_hash<0x80000, MEMORY, false, false, true>,
+ cryptonight_double_hash<0x80000, MEMORY, false, true, false>,
+ cryptonight_double_hash<0x80000, MEMORY, false, true, true>,
+ cryptonight_double_hash<0x80000, MEMORY, true, false, false>,
+ cryptonight_double_hash<0x80000, MEMORY, true, false, true>,
+ cryptonight_double_hash<0x80000, MEMORY, true, true, false>,
+ cryptonight_double_hash<0x80000, MEMORY, true, true, true>
+ };
+
+ std::bitset<3> digit;
+ digit.set(0, bMulx);
+ digit.set(1, !bNoPrefetch);
+ digit.set(2, !bHaveAes);
+
+ return func_table[digit.to_ulong()];
+}
+
void minethd::double_work_main()
{
+ cn_hash_fun_dbl hash_fun;
cryptonight_ctx* ctx0;
cryptonight_ctx* ctx1;
uint64_t iCount = 0;
@@ -435,6 +469,7 @@ void minethd::double_work_main()
uint32_t iNonce;
job_result res;
+ hash_fun = func_dbl_selector(jconf::inst()->HaveHardwareAes(), bNoPrefetch, jconf::inst()->HaveMulx());
ctx0 = minethd_alloc_ctx();
ctx1 = minethd_alloc_ctx();
@@ -484,7 +519,8 @@ void minethd::double_work_main()
*piNonce0 = ++iNonce;
*piNonce1 = ++iNonce;
- cryptonight_double_hash_ctx(bDoubleWorkBlob, oWork.iWorkSize, bDoubleHashOut, ctx0, ctx1);
+
+ hash_fun(bDoubleWorkBlob, oWork.iWorkSize, bDoubleHashOut, ctx0, ctx1);
if (*piHashVal0 < oWork.iTarget)
executor::inst()->push_event(ex_event(job_result(oWork.sJobID, iNonce-1, bDoubleHashOut), oWork.iPoolId));
diff --git a/minethd.h b/minethd.h
index d4470b1..aa40344 100644
--- a/minethd.h
+++ b/minethd.h
@@ -1,6 +1,7 @@
#pragma once
#include <thread>
#include <atomic>
+#include "crypto/cryptonight.h"
class telemetry
{
@@ -97,6 +98,9 @@ public:
std::atomic<uint64_t> iTimestamp;
private:
+ typedef void (*cn_hash_fun)(const void*, size_t, void*, cryptonight_ctx*);
+ typedef void (*cn_hash_fun_dbl)(const void*, size_t, void*, __restrict cryptonight_ctx*, __restrict cryptonight_ctx*);
+
minethd(miner_work& pWork, size_t iNo, bool double_work, bool no_prefetch);
// We use the top 10 bits of the nonce for thread and resume
@@ -110,6 +114,9 @@ private:
inline uint32_t calc_nicehash_nonce(uint32_t start, uint32_t resume)
{ return start | (resume * iThreadCount + iThreadNo) << 18; }
+ static cn_hash_fun func_selector(bool bHaveAes, bool bNoPrefetch, bool bMulx);
+ static cn_hash_fun_dbl func_dbl_selector(bool bHaveAes, bool bNoPrefetch, bool bMulx);
+
void work_main();
void double_work_main();
void consume_work();
diff --git a/xmr-stak-cpu.cbp b/xmr-stak-cpu.cbp
index cc606fd..8df8044 100644
--- a/xmr-stak-cpu.cbp
+++ b/xmr-stak-cpu.cbp
@@ -59,7 +59,7 @@
</Target>
</Build>
<Compiler>
- <Add option="-march=westmere" />
+ <Add option="-march=haswell" />
<Add option="-Wall" />
</Compiler>
<Linker>
OpenPOWER on IntegriCloud