diff options
author | fireice-uk <fireice2@o2.pl> | 2017-01-15 00:58:12 +0000 |
---|---|---|
committer | fireice-uk <fireice2@o2.pl> | 2017-01-15 00:58:12 +0000 |
commit | f59c4d1776209fb87efcea75501af52fa2f487fa (patch) | |
tree | f849c04a27920b1142fbb3024b9909ac1cceb495 | |
parent | e3bda576251367f4c5e48b7830d525d3708e08c5 (diff) | |
download | xmr-stak-f59c4d1776209fb87efcea75501af52fa2f487fa.zip xmr-stak-f59c4d1776209fb87efcea75501af52fa2f487fa.tar.gz |
No prefetch option
-rw-r--r-- | config.txt | 23 | ||||
-rw-r--r-- | crypto/cryptonight.h | 2 | ||||
-rw-r--r-- | crypto/cryptonight_aesni.c | 48 | ||||
-rw-r--r-- | jconf.cpp | 9 | ||||
-rw-r--r-- | jconf.h | 1 | ||||
-rw-r--r-- | minethd.cpp | 30 | ||||
-rw-r--r-- | minethd.h | 3 |
7 files changed, 101 insertions, 15 deletions
@@ -7,16 +7,23 @@ /*
* Thread configuration for each thread. Make sure it matches the number above.
- * low_power_mode will double the cache usage, and double the single thread performance. It will consume much
- * less power (as less cores are working), but will max out at around 80-85% of the maximum performance.
- * affine_to_cpu can be either false (no affinity), or the CPU core number. Note that on hyperthreading systems
- * it is better to assign threads to physical cores. On Windows this usually means selecting even or odd numbered
- * cpu numbers. For Linux it will be usually the lower CPU numbers, so for a 4 physical core CPU you should select
- * cpu numbers 0-3.
+ * low_power_mode - This mode will double the cache usage, and double the single thread performance. It will
+ * consume much less power (as fewer cores are working), but will max out at around 80-85% of
+ * the maximum performance.
+ *
+ * no_prefetch - This mode is meant for large pages only. It will generate an error if running in slow memory mode (i.e. without large pages).
+ * Some systems can gain up to an extra 5% here, but sometimes it will make no difference or make
+ * things slower.
+ *
+ * affine_to_cpu - This can be either false (no affinity), or the CPU core number. Note that on hyperthreading
+ * systems it is better to assign threads to physical cores. On Windows this usually means selecting
+ * even or odd numbered cpu numbers. For Linux it will usually be the lower CPU numbers, so for a 4
+ * physical core CPU you should select cpu numbers 0-3.
+ *
*/
"cpu_threads_conf" : [
- { "low_power_mode" : false, "affine_to_cpu" : 0 },
- { "low_power_mode" : false, "affine_to_cpu" : 1 },
+ { "low_power_mode" : false, "no_prefetch" : false, "affine_to_cpu" : 0 },
+ { "low_power_mode" : false, "no_prefetch" : false, "affine_to_cpu" : 1 },
],
/*
diff --git a/crypto/cryptonight.h b/crypto/cryptonight.h index bf0c413..acc5fd1 100644 --- a/crypto/cryptonight.h +++ b/crypto/cryptonight.h @@ -23,7 +23,9 @@ typedef struct { size_t cryptonight_init(size_t use_fast_mem, size_t use_mlock, alloc_msg* msg); cryptonight_ctx* cryptonight_alloc_ctx(size_t use_fast_mem, size_t use_mlock, alloc_msg* msg); void cryptonight_free_ctx(cryptonight_ctx* ctx); + void cryptonight_hash_ctx(const void* input, size_t len, void* output, cryptonight_ctx* ctx); +void cryptonight_hash_ctx_np(const void* input, size_t len, void* output, cryptonight_ctx* ctx); void cryptonight_double_hash_ctx(const void* input, size_t len, void* output, cryptonight_ctx* __restrict ctx0, cryptonight_ctx* __restrict ctx1); #ifdef __cplusplus diff --git a/crypto/cryptonight_aesni.c b/crypto/cryptonight_aesni.c index 1d91adb..0351931 100644 --- a/crypto/cryptonight_aesni.c +++ b/crypto/cryptonight_aesni.c @@ -270,6 +270,54 @@ void cryptonight_hash_ctx(const void* input, size_t len, void* output, cryptonig extra_hashes[ctx0->hash_state[0] & 3](ctx0->hash_state, 200, output); } +void cryptonight_hash_ctx_np(const void* input, size_t len, void* output, cryptonight_ctx* ctx0) +{ + keccak((const uint8_t *)input, len, ctx0->hash_state, 200); + + // Optim - 99% time boundary + cn_explode_scratchpad((__m128i*)ctx0->hash_state, (__m128i*)ctx0->long_state); + + uint8_t* l0 = ctx0->long_state; + uint64_t* h0 = (uint64_t*)ctx0->hash_state; + + uint64_t al0 = h0[0] ^ h0[4]; + uint64_t ah0 = h0[1] ^ h0[5]; + __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); + + uint64_t idx0 = h0[0] ^ h0[4]; + + // Optim - 90% time boundary + for(size_t i = 0; i < 0x80000; i++) + { + __m128i cx; + cx = _mm_load_si128((__m128i *)&l0[idx0 & 0x1FFFF0]); + cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah0, al0)); + _mm_store_si128((__m128i *)&l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx)); + idx0 = _mm_cvtsi128_si64(cx); + bx0 = cx; + + uint64_t hi, lo, cl, ch; + cl = ((uint64_t*)&l0[idx0 & 
0x1FFFF0])[0]; + ch = ((uint64_t*)&l0[idx0 & 0x1FFFF0])[1]; + lo = _umul128(idx0, cl, &hi); + al0 += hi; + ah0 += lo; + ((uint64_t*)&l0[idx0 & 0x1FFFF0])[0] = al0; + ((uint64_t*)&l0[idx0 & 0x1FFFF0])[1] = ah0; + ah0 ^= ch; + al0 ^= cl; + idx0 = al0; + } + + // Optim - 90% time boundary + cn_implode_scratchpad((__m128i*)ctx0->long_state, (__m128i*)ctx0->hash_state); + + // Optim - 99% time boundary + + keccakf((uint64_t*)ctx0->hash_state, 24); + extra_hashes[ctx0->hash_state[0] & 3](ctx0->hash_state, 200, output); +} + // This lovely creation will do 2 cn hashes at a time. We have plenty of space on silicon // to fit temporary vars for two contexts. Function will read len*2 from input and write 64 bytes to output // We are still limited by L3 cache, so doubling will only work with CPUs where we have more than 2MB to core (Xeons) @@ -99,14 +99,15 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg) if(!oThdConf.IsObject()) return false; - const Value *mode, *aff; + const Value *mode, *no_prefetch, *aff; mode = GetObjectMember(oThdConf, "low_power_mode"); + no_prefetch = GetObjectMember(oThdConf, "no_prefetch"); aff = GetObjectMember(oThdConf, "affine_to_cpu"); - if(mode == nullptr || aff == nullptr) + if(mode == nullptr || no_prefetch == nullptr || aff == nullptr) return false; - if(!mode->IsBool()) + if(!mode->IsBool() || !no_prefetch->IsBool()) return false; if(!aff->IsNumber() && !aff->IsBool()) @@ -116,6 +117,8 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg) return false; cfg.bDoubleMode = mode->GetBool(); + cfg.bNoPrefetch = no_prefetch->GetBool(); + if(aff->IsNumber()) cfg.iCpuAff = aff->GetInt64(); else @@ -15,6 +15,7 @@ public: struct thd_cfg { bool bDoubleMode; + bool bNoPrefetch; long long iCpuAff; }; diff --git a/minethd.cpp b/minethd.cpp index c74883c..6300b5c 100644 --- a/minethd.cpp +++ b/minethd.cpp @@ -117,7 +117,7 @@ void telemetry::push_perf_value(size_t iThd, uint64_t iHashCount, uint64_t iTime iBucketTop[iThd] = (iTop + 1) & 
iBucketMask; } -minethd::minethd(miner_work& pWork, size_t iNo, bool double_work) +minethd::minethd(miner_work& pWork, size_t iNo, bool double_work, bool no_prefetch) { oWork = pWork; bQuit = 0; @@ -125,6 +125,7 @@ minethd::minethd(miner_work& pWork, size_t iNo, bool double_work) iJobNo = 0; iHashCount = 0; iTimestamp = 0; + bNoPrefetch = no_prefetch; if(double_work) oWorkThd = std::thread(&minethd::double_work_main, this); @@ -242,12 +243,31 @@ bool minethd::self_test() return false; } + bool bHasLp = ctx0->ctx_info[0] == 1 && ctx1->ctx_info[1]; + size_t n = jconf::inst()->GetThreadCount(); + jconf::thd_cfg cfg; + for (size_t i = 0; i < n; i++) + { + jconf::inst()->GetThreadConfig(i, cfg); + + if(!bHasLp && cfg.bNoPrefetch) + { + printer::inst()->print_msg(L0, "Wrong config. You are running in slow memory mode with no_prefetch."); + cryptonight_free_ctx(ctx0); + cryptonight_free_ctx(ctx1); + return false; + } + } + unsigned char out[64]; bool bResult; cryptonight_hash_ctx("This is a test", 14, out, ctx0); bResult = memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0; + cryptonight_hash_ctx_np("This is a test", 14, out, ctx0); + bResult &= memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0; + cryptonight_double_hash_ctx("The quick brown fox jumps over the lazy dogThe quick brown fox jumps over the lazy log", 43, out, ctx0, ctx1); bResult &= memcmp(out, "\x3e\xbb\x7f\x9f\x7d\x27\x3d\x7c\x31\x8d\x86\x94\x77\x55\x0c\xc8\x00\xcf\xb1\x1b\x0c\xad\xb7\xff\xbd\xf6\xf8\x9f\x3a\x47\x1c\x59" "\xb4\x77\xd5\x02\xe4\xd8\x48\x7f\x42\xdf\xe3\x8e\xed\x73\x81\x7a\xda\x91\xb7\xe2\x63\xd2\x91\x71\xb6\x5c\x44\x3a\x01\x2a\x41\x22", 64) == 0; @@ -278,7 +298,7 @@ std::vector<minethd*>* minethd::thread_starter(miner_work& pWork) { jconf::inst()->GetThreadConfig(i, cfg); - minethd* thd = new 
minethd(pWork, i, cfg.bDoubleMode); + minethd* thd = new minethd(pWork, i, cfg.bDoubleMode, cfg.bNoPrefetch); if(cfg.iCpuAff >= 0) thd_setaffinity(thd->oWorkThd.native_handle(), cfg.iCpuAff); @@ -362,7 +382,11 @@ void minethd::work_main() iCount++; *piNonce = ++result.iNonce; - cryptonight_hash_ctx(oWork.bWorkBlob, oWork.iWorkSize, result.bResult, ctx); + + if(bNoPrefetch) + cryptonight_hash_ctx_np(oWork.bWorkBlob, oWork.iWorkSize, result.bResult, ctx); + else + cryptonight_hash_ctx(oWork.bWorkBlob, oWork.iWorkSize, result.bResult, ctx); if (*piHashVal < oWork.iTarget) executor::inst()->push_event(ex_event(result, oWork.iPoolId)); @@ -94,7 +94,7 @@ public: std::atomic<uint64_t> iTimestamp; private: - minethd(miner_work& pWork, size_t iNo, bool double_work); + minethd(miner_work& pWork, size_t iNo, bool double_work, bool no_prefetch); // We use the top 10 bits of the nonce for thread and resume // This allows us to resume up to 128 threads 4 times before @@ -119,5 +119,6 @@ private: uint8_t iThreadNo; bool bQuit; + bool bNoPrefetch; }; |