author     fireice-uk <fireice2@o2.pl>    2017-01-15 00:58:12 +0000
committer  fireice-uk <fireice2@o2.pl>    2017-01-15 00:58:12 +0000
commit     f59c4d1776209fb87efcea75501af52fa2f487fa (patch)
tree       f849c04a27920b1142fbb3024b9909ac1cceb495
parent     e3bda576251367f4c5e48b7830d525d3708e08c5 (diff)
download   xmr-stak-f59c4d1776209fb87efcea75501af52fa2f487fa.zip
           xmr-stak-f59c4d1776209fb87efcea75501af52fa2f487fa.tar.gz
No prefetch option
-rw-r--r--  config.txt                  23
-rw-r--r--  crypto/cryptonight.h         2
-rw-r--r--  crypto/cryptonight_aesni.c  48
-rw-r--r--  jconf.cpp                    9
-rw-r--r--  jconf.h                      1
-rw-r--r--  minethd.cpp                 30
-rw-r--r--  minethd.h                    3
7 files changed, 101 insertions, 15 deletions
diff --git a/config.txt b/config.txt
index dba5723..e78a909 100644
--- a/config.txt
+++ b/config.txt
@@ -7,16 +7,23 @@
/*
* Thread configuration for each thread. Make sure it matches the number above.
- * low_power_mode will double the cache usage, and double the single thread performance. It will consume much
- * less power (as less cores are working), but will max out at around 80-85% of the maximum performance.
- * affine_to_cpu can be either false (no affinity), or the CPU core number. Note that on hyperthreading systems
- * it is better to assign threads to physical cores. On Windows this usually means selecting even or odd numbered
- * cpu numbers. For Linux it will be usually the lower CPU numbers, so for a 4 physical core CPU you should select
- * cpu numbers 0-3.
+ * low_power_mode - This mode will double the cache usage and double the single thread performance. It will
+ * consume much less power (as fewer cores are working), but will max out at around 80-85% of
+ * the maximum performance.
+ *
+ * no_prefetch - This mode is meant for large pages only. It will generate an error if running on slow memory.
+ * Some systems can gain up to an extra 5% here, but sometimes it will have no effect or make
+ * things slower.
+ *
+ * affine_to_cpu - This can be either false (no affinity), or the CPU core number. Note that on hyperthreading
+ * systems it is better to assign threads to physical cores. On Windows this usually means selecting
+ * even or odd numbered CPU numbers. For Linux it will usually be the lower CPU numbers, so for a 4
+ * physical core CPU you should select CPU numbers 0-3.
+ *
*/
"cpu_threads_conf" : [
- { "low_power_mode" : false, "affine_to_cpu" : 0 },
- { "low_power_mode" : false, "affine_to_cpu" : 1 },
+ { "low_power_mode" : false, "no_prefetch" : false, "affine_to_cpu" : 0 },
+ { "low_power_mode" : false, "no_prefetch" : false, "affine_to_cpu" : 1 },
],
/*
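As an illustration of the options documented in the comment block above (not part of the patch), a thread configuration for a hypothetical 4-physical-core CPU with working large pages might look as follows; the core numbers and the choice to enable no_prefetch are assumptions for the example:

    "cpu_threads_conf" : [
        { "low_power_mode" : false, "no_prefetch" : true, "affine_to_cpu" : 0 },
        { "low_power_mode" : false, "no_prefetch" : true, "affine_to_cpu" : 1 },
        { "low_power_mode" : false, "no_prefetch" : true, "affine_to_cpu" : 2 },
        { "low_power_mode" : false, "no_prefetch" : true, "affine_to_cpu" : 3 },
    ],

The thread count declared above this block would then be 4. On a machine where large pages are not available, no_prefetch should stay false: the self-test added to minethd.cpp further down rejects the slow-memory + no_prefetch combination with an error.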
diff --git a/crypto/cryptonight.h b/crypto/cryptonight.h
index bf0c413..acc5fd1 100644
--- a/crypto/cryptonight.h
+++ b/crypto/cryptonight.h
@@ -23,7 +23,9 @@ typedef struct {
size_t cryptonight_init(size_t use_fast_mem, size_t use_mlock, alloc_msg* msg);
cryptonight_ctx* cryptonight_alloc_ctx(size_t use_fast_mem, size_t use_mlock, alloc_msg* msg);
void cryptonight_free_ctx(cryptonight_ctx* ctx);
+
void cryptonight_hash_ctx(const void* input, size_t len, void* output, cryptonight_ctx* ctx);
+void cryptonight_hash_ctx_np(const void* input, size_t len, void* output, cryptonight_ctx* ctx);
void cryptonight_double_hash_ctx(const void* input, size_t len, void* output, cryptonight_ctx* __restrict ctx0, cryptonight_ctx* __restrict ctx1);
#ifdef __cplusplus
diff --git a/crypto/cryptonight_aesni.c b/crypto/cryptonight_aesni.c
index 1d91adb..0351931 100644
--- a/crypto/cryptonight_aesni.c
+++ b/crypto/cryptonight_aesni.c
@@ -270,6 +270,54 @@ void cryptonight_hash_ctx(const void* input, size_t len, void* output, cryptonig
extra_hashes[ctx0->hash_state[0] & 3](ctx0->hash_state, 200, output);
}
+void cryptonight_hash_ctx_np(const void* input, size_t len, void* output, cryptonight_ctx* ctx0)
+{
+ keccak((const uint8_t *)input, len, ctx0->hash_state, 200);
+
+ // Optim - 99% time boundary
+ cn_explode_scratchpad((__m128i*)ctx0->hash_state, (__m128i*)ctx0->long_state);
+
+ uint8_t* l0 = ctx0->long_state;
+ uint64_t* h0 = (uint64_t*)ctx0->hash_state;
+
+ uint64_t al0 = h0[0] ^ h0[4];
+ uint64_t ah0 = h0[1] ^ h0[5];
+ __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
+
+ uint64_t idx0 = h0[0] ^ h0[4];
+
+ // Optim - 90% time boundary
+ for(size_t i = 0; i < 0x80000; i++)
+ {
+ __m128i cx;
+ cx = _mm_load_si128((__m128i *)&l0[idx0 & 0x1FFFF0]);
+ cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah0, al0));
+ _mm_store_si128((__m128i *)&l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx));
+ idx0 = _mm_cvtsi128_si64(cx);
+ bx0 = cx;
+
+ uint64_t hi, lo, cl, ch;
+ cl = ((uint64_t*)&l0[idx0 & 0x1FFFF0])[0];
+ ch = ((uint64_t*)&l0[idx0 & 0x1FFFF0])[1];
+ lo = _umul128(idx0, cl, &hi);
+ al0 += hi;
+ ah0 += lo;
+ ((uint64_t*)&l0[idx0 & 0x1FFFF0])[0] = al0;
+ ((uint64_t*)&l0[idx0 & 0x1FFFF0])[1] = ah0;
+ ah0 ^= ch;
+ al0 ^= cl;
+ idx0 = al0;
+ }
+
+ // Optim - 90% time boundary
+ cn_implode_scratchpad((__m128i*)ctx0->long_state, (__m128i*)ctx0->hash_state);
+
+ // Optim - 99% time boundary
+
+ keccakf((uint64_t*)ctx0->hash_state, 24);
+ extra_hashes[ctx0->hash_state[0] & 3](ctx0->hash_state, 200, output);
+}
+
// This lovely creation will do 2 cn hashes at a time. We have plenty of space on silicon
// to fit temporary vars for two contexts. Function will read len*2 from input and write 64 bytes to output
// We are still limited by L3 cache, so doubling will only work with CPUs where we have more than 2MB to core (Xeons)
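The new cryptonight_hash_ctx_np above mirrors the single-hash main loop but, per the commit title, with the software prefetch hints left out. Below is a minimal, self-contained sketch of the underlying idea; it is not code from this patch, the access pattern is simplified, and the _mm_prefetch usage is only assumed to reflect what the prefetching variant does:

    /* Toy sketch only. Walks a 2 MiB scratchpad with data-dependent indices,
     * similar to the loop above, and optionally issues a software prefetch
     * for the cache line that the next iteration will touch. */
    #include <stdint.h>
    #include <stddef.h>
    #include <xmmintrin.h>               /* _mm_prefetch, _MM_HINT_T0 */

    #define PAD_MASK 0x1FFFF0            /* 16-byte aligned offset inside 2 MiB */

    uint64_t walk_scratchpad(uint8_t* pad, int use_prefetch)
    {
        uint64_t idx = 0, acc = 0;

        for(size_t i = 0; i < 0x80000; i++)
        {
            uint64_t* p = (uint64_t*)&pad[idx & PAD_MASK];
            acc ^= p[0];                 /* dependent load */
            idx = acc + p[1];            /* next index depends on loaded data */

            if(use_prefetch)             /* hint the next line into cache */
                _mm_prefetch((const char*)&pad[idx & PAD_MASK], _MM_HINT_T0);
        }
        return acc;
    }

With large pages most TLB misses on the scratchpad disappear, and on some CPUs dropping the explicit hints (the _np path) leaves the work to the hardware prefetcher, which is presumably the "up to an extra 5%" mentioned in config.txt; on regular pages the hints tend to help, hence the self-test's refusal to run no_prefetch in slow memory mode.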
diff --git a/jconf.cpp b/jconf.cpp
index b5ce836..c32f1d4 100644
--- a/jconf.cpp
+++ b/jconf.cpp
@@ -99,14 +99,15 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg)
if(!oThdConf.IsObject())
return false;
- const Value *mode, *aff;
+ const Value *mode, *no_prefetch, *aff;
mode = GetObjectMember(oThdConf, "low_power_mode");
+ no_prefetch = GetObjectMember(oThdConf, "no_prefetch");
aff = GetObjectMember(oThdConf, "affine_to_cpu");
- if(mode == nullptr || aff == nullptr)
+ if(mode == nullptr || no_prefetch == nullptr || aff == nullptr)
return false;
- if(!mode->IsBool())
+ if(!mode->IsBool() || !no_prefetch->IsBool())
return false;
if(!aff->IsNumber() && !aff->IsBool())
@@ -116,6 +117,8 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg)
return false;
cfg.bDoubleMode = mode->GetBool();
+ cfg.bNoPrefetch = no_prefetch->GetBool();
+
if(aff->IsNumber())
cfg.iCpuAff = aff->GetInt64();
else
diff --git a/jconf.h b/jconf.h
index 0a2a7f8..c04bc1d 100644
--- a/jconf.h
+++ b/jconf.h
@@ -15,6 +15,7 @@ public:
struct thd_cfg {
bool bDoubleMode;
+ bool bNoPrefetch;
long long iCpuAff;
};
diff --git a/minethd.cpp b/minethd.cpp
index c74883c..6300b5c 100644
--- a/minethd.cpp
+++ b/minethd.cpp
@@ -117,7 +117,7 @@ void telemetry::push_perf_value(size_t iThd, uint64_t iHashCount, uint64_t iTime
iBucketTop[iThd] = (iTop + 1) & iBucketMask;
}
-minethd::minethd(miner_work& pWork, size_t iNo, bool double_work)
+minethd::minethd(miner_work& pWork, size_t iNo, bool double_work, bool no_prefetch)
{
oWork = pWork;
bQuit = 0;
@@ -125,6 +125,7 @@ minethd::minethd(miner_work& pWork, size_t iNo, bool double_work)
iJobNo = 0;
iHashCount = 0;
iTimestamp = 0;
+ bNoPrefetch = no_prefetch;
if(double_work)
oWorkThd = std::thread(&minethd::double_work_main, this);
@@ -242,12 +243,31 @@ bool minethd::self_test()
return false;
}
+ bool bHasLp = ctx0->ctx_info[0] == 1 && ctx1->ctx_info[1];
+ size_t n = jconf::inst()->GetThreadCount();
+ jconf::thd_cfg cfg;
+ for (size_t i = 0; i < n; i++)
+ {
+ jconf::inst()->GetThreadConfig(i, cfg);
+
+ if(!bHasLp && cfg.bNoPrefetch)
+ {
+ printer::inst()->print_msg(L0, "Wrong config. You are running in slow memory mode with no_prefetch.");
+ cryptonight_free_ctx(ctx0);
+ cryptonight_free_ctx(ctx1);
+ return false;
+ }
+ }
+
unsigned char out[64];
bool bResult;
cryptonight_hash_ctx("This is a test", 14, out, ctx0);
bResult = memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0;
+ cryptonight_hash_ctx_np("This is a test", 14, out, ctx0);
+ bResult &= memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0;
+
cryptonight_double_hash_ctx("The quick brown fox jumps over the lazy dogThe quick brown fox jumps over the lazy log", 43, out, ctx0, ctx1);
bResult &= memcmp(out, "\x3e\xbb\x7f\x9f\x7d\x27\x3d\x7c\x31\x8d\x86\x94\x77\x55\x0c\xc8\x00\xcf\xb1\x1b\x0c\xad\xb7\xff\xbd\xf6\xf8\x9f\x3a\x47\x1c\x59"
"\xb4\x77\xd5\x02\xe4\xd8\x48\x7f\x42\xdf\xe3\x8e\xed\x73\x81\x7a\xda\x91\xb7\xe2\x63\xd2\x91\x71\xb6\x5c\x44\x3a\x01\x2a\x41\x22", 64) == 0;
@@ -278,7 +298,7 @@ std::vector<minethd*>* minethd::thread_starter(miner_work& pWork)
{
jconf::inst()->GetThreadConfig(i, cfg);
- minethd* thd = new minethd(pWork, i, cfg.bDoubleMode);
+ minethd* thd = new minethd(pWork, i, cfg.bDoubleMode, cfg.bNoPrefetch);
if(cfg.iCpuAff >= 0)
thd_setaffinity(thd->oWorkThd.native_handle(), cfg.iCpuAff);
@@ -362,7 +382,11 @@ void minethd::work_main()
iCount++;
*piNonce = ++result.iNonce;
- cryptonight_hash_ctx(oWork.bWorkBlob, oWork.iWorkSize, result.bResult, ctx);
+
+ if(bNoPrefetch)
+ cryptonight_hash_ctx_np(oWork.bWorkBlob, oWork.iWorkSize, result.bResult, ctx);
+ else
+ cryptonight_hash_ctx(oWork.bWorkBlob, oWork.iWorkSize, result.bResult, ctx);
if (*piHashVal < oWork.iTarget)
executor::inst()->push_event(ex_event(result, oWork.iPoolId));
diff --git a/minethd.h b/minethd.h
index e130a21..6d6104f 100644
--- a/minethd.h
+++ b/minethd.h
@@ -94,7 +94,7 @@ public:
std::atomic<uint64_t> iTimestamp;
private:
- minethd(miner_work& pWork, size_t iNo, bool double_work);
+ minethd(miner_work& pWork, size_t iNo, bool double_work, bool no_prefetch);
// We use the top 10 bits of the nonce for thread and resume
// This allows us to resume up to 128 threads 4 times before
@@ -119,5 +119,6 @@ private:
uint8_t iThreadNo;
bool bQuit;
+ bool bNoPrefetch;
};