No prefetch option

author: fireice-uk <fireice2@o2.pl> 2017-01-15 00:58:12 +0000
committer: fireice-uk <fireice2@o2.pl> 2017-01-15 00:58:12 +0000
commit: f59c4d1776209fb87efcea75501af52fa2f487fa (patch)
tree: f849c04a27920b1142fbb3024b9909ac1cceb495
parent: e3bda576251367f4c5e48b7830d525d3708e08c5 (diff)
download: xmr-stak-f59c4d1776209fb87efcea75501af52fa2f487fa.zip
xmr-stak-f59c4d1776209fb87efcea75501af52fa2f487fa.tar.gz
7 files changed, 101 insertions, 15 deletions
diff --git a/config.txt b/config.txt
index dba5723..e78a909 100644
--- a/config.txt
+++ b/config.txt
@@ -7,16 +7,23 @@
 
 /*
  * Thread configuration for each thread. Make sure it matches the number above.
- * low_power_mode will double the cache usage, and double the single thread performance. It will consume much 
- * less power (as less cores are working), but will max out at around 80-85% of the maximum performance.
- * affine_to_cpu can be either false (no affinity), or the CPU core number. Note that on hyperthreading systems
- * it is better to assign threads to physical cores. On Windows this usually means selecting even or odd numbered
- * cpu numbers. For Linux it will be usually the lower CPU numbers, so for a 4 physical core CPU you should select
- * cpu numbers 0-3.
+ * low_power_mode - This mode will double the cache usage, and double the single thread performance. It will 
+ *                  consume much less power (as fewer cores are working), but will max out at around 80-85% of 
+ *                  the maximum performance.
+ *
+ * no_prefetch -    This mode is meant for large pages only. It will generate an error if running on slow memory.
+ *                  Some systems can gain up to an extra 5% here, but sometimes it will make no difference or make
+ *                  things slower.
+ *
+ * affine_to_cpu -  This can be either false (no affinity), or the CPU core number. Note that on hyperthreading 
+ *                  systems it is better to assign threads to physical cores. On Windows this usually means selecting 
+ *                  even or odd numbered cpu numbers. On Linux these will usually be the lower CPU numbers, so for a 4 
+ *                  physical core CPU you should select cpu numbers 0-3.
+ *
  */
 "cpu_threads_conf" : [ 
-	{ "low_power_mode" : false, "affine_to_cpu" : 0 },
-	{ "low_power_mode" : false, "affine_to_cpu" : 1 },
+	{ "low_power_mode" : false, "no_prefetch" : false, "affine_to_cpu" : 0 },
+	{ "low_power_mode" : false, "no_prefetch" : false, "affine_to_cpu" : 1 },
 ],
 
 /*
diff --git a/crypto/cryptonight.h b/crypto/cryptonight.h
index bf0c413..acc5fd1 100644
--- a/crypto/cryptonight.h
+++ b/crypto/cryptonight.h
@@ -23,7 +23,9 @@ typedef struct {
 size_t cryptonight_init(size_t use_fast_mem, size_t use_mlock, alloc_msg* msg);
 cryptonight_ctx* cryptonight_alloc_ctx(size_t use_fast_mem, size_t use_mlock, alloc_msg* msg);
 void cryptonight_free_ctx(cryptonight_ctx* ctx);
+
 void cryptonight_hash_ctx(const void* input, size_t len, void* output, cryptonight_ctx* ctx);
+void cryptonight_hash_ctx_np(const void* input, size_t len, void* output, cryptonight_ctx* ctx);
 void cryptonight_double_hash_ctx(const void*  input, size_t len, void* output, cryptonight_ctx* __restrict ctx0, cryptonight_ctx* __restrict ctx1);
 
 #ifdef __cplusplus
diff --git a/crypto/cryptonight_aesni.c b/crypto/cryptonight_aesni.c
index 1d91adb..0351931 100644
--- a/crypto/cryptonight_aesni.c
+++ b/crypto/cryptonight_aesni.c
@@ -270,6 +270,54 @@ void cryptonight_hash_ctx(const void* input, size_t len, void* output, cryptonig
 	extra_hashes[ctx0->hash_state[0] & 3](ctx0->hash_state, 200, output);
 }
 
+void cryptonight_hash_ctx_np(const void* input, size_t len, void* output, cryptonight_ctx* ctx0)
+{
+	keccak((const uint8_t *)input, len, ctx0->hash_state, 200);
+
+	// Optim - 99% time boundary
+	cn_explode_scratchpad((__m128i*)ctx0->hash_state, (__m128i*)ctx0->long_state);
+
+	uint8_t* l0 = ctx0->long_state;
+	uint64_t* h0 = (uint64_t*)ctx0->hash_state;
+
+	uint64_t al0 = h0[0] ^ h0[4];
+	uint64_t ah0 = h0[1] ^ h0[5];
+	__m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
+
+	uint64_t idx0 = h0[0] ^ h0[4];
+
+	// Optim - 90% time boundary
+	for(size_t i = 0; i < 0x80000; i++)
+	{
+		__m128i cx;
+		cx = _mm_load_si128((__m128i *)&l0[idx0 & 0x1FFFF0]);
+		cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah0, al0));
+		_mm_store_si128((__m128i *)&l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx));
+		idx0 = _mm_cvtsi128_si64(cx);
+		bx0 = cx;
+
+		uint64_t hi, lo, cl, ch;
+		cl = ((uint64_t*)&l0[idx0 & 0x1FFFF0])[0];
+		ch = ((uint64_t*)&l0[idx0 & 0x1FFFF0])[1];
+		lo = _umul128(idx0, cl, &hi);
+		al0 += hi;
+		ah0 += lo;
+		((uint64_t*)&l0[idx0 & 0x1FFFF0])[0] = al0;
+		((uint64_t*)&l0[idx0 & 0x1FFFF0])[1] = ah0;
+		ah0 ^= ch;
+		al0 ^= cl;
+		idx0 = al0;
+	}
+
+	// Optim - 90% time boundary
+	cn_implode_scratchpad((__m128i*)ctx0->long_state, (__m128i*)ctx0->hash_state);
+
+	// Optim - 99% time boundary
+
+	keccakf((uint64_t*)ctx0->hash_state, 24);
+	extra_hashes[ctx0->hash_state[0] & 3](ctx0->hash_state, 200, output);
+}
+
 // This lovely creation will do 2 cn hashes at a time. We have plenty of space on silicon
 // to fit temporary vars for two contexts. Function will read len*2 from input and write 64 bytes to output
 // We are still limited by L3 cache, so doubling will only work with CPUs where we have more than 2MB to core (Xeons)
diff --git a/jconf.cpp b/jconf.cpp
index b5ce836..c32f1d4 100644
--- a/jconf.cpp
+++ b/jconf.cpp
@@ -99,14 +99,15 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg)
 	if(!oThdConf.IsObject())
 		return false;
 
-	const Value *mode, *aff;
+	const Value *mode, *no_prefetch, *aff;
 	mode = GetObjectMember(oThdConf, "low_power_mode");
+	no_prefetch = GetObjectMember(oThdConf, "no_prefetch");
 	aff = GetObjectMember(oThdConf, "affine_to_cpu");
 
-	if(mode == nullptr || aff == nullptr)
+	if(mode == nullptr || no_prefetch == nullptr || aff == nullptr)
 		return false;
 
-	if(!mode->IsBool())
+	if(!mode->IsBool() || !no_prefetch->IsBool())
 		return false;
 
 	if(!aff->IsNumber() && !aff->IsBool())
@@ -116,6 +117,8 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg)
 		return false;
 
 	cfg.bDoubleMode = mode->GetBool();
+	cfg.bNoPrefetch = no_prefetch->GetBool();
+
 	if(aff->IsNumber())
 		cfg.iCpuAff = aff->GetInt64();
 	else
diff --git a/jconf.h b/jconf.h
index 0a2a7f8..c04bc1d 100644
--- a/jconf.h
+++ b/jconf.h
@@ -15,6 +15,7 @@ public:
 
 	struct thd_cfg {
 		bool bDoubleMode;
+		bool bNoPrefetch;
 		long long iCpuAff;
 	};
 
diff --git a/minethd.cpp b/minethd.cpp
index c74883c..6300b5c 100644
--- a/minethd.cpp
+++ b/minethd.cpp
@@ -117,7 +117,7 @@ void telemetry::push_perf_value(size_t iThd, uint64_t iHashCount, uint64_t iTime
 	iBucketTop[iThd] = (iTop + 1) & iBucketMask;
 }
 
-minethd::minethd(miner_work& pWork, size_t iNo, bool double_work)
+minethd::minethd(miner_work& pWork, size_t iNo, bool double_work, bool no_prefetch)
 {
 	oWork = pWork;
 	bQuit = 0;
@@ -125,6 +125,7 @@ minethd::minethd(miner_work& pWork, size_t iNo, bool double_work)
 	iJobNo = 0;
 	iHashCount = 0;
 	iTimestamp = 0;
+	bNoPrefetch = no_prefetch;
 
 	if(double_work)
 		oWorkThd = std::thread(&minethd::double_work_main, this);
@@ -242,12 +243,31 @@ bool minethd::self_test()
 		return false;
 	}
 
+	bool bHasLp = ctx0->ctx_info[0] == 1 && ctx1->ctx_info[1];
+	size_t n = jconf::inst()->GetThreadCount();
+	jconf::thd_cfg cfg;
+	for (size_t i = 0; i < n; i++)
+	{
+		jconf::inst()->GetThreadConfig(i, cfg);
+
+		if(!bHasLp && cfg.bNoPrefetch)
+		{
+			printer::inst()->print_msg(L0, "Wrong config. You are running in slow memory mode with no_prefetch.");
+			cryptonight_free_ctx(ctx0);
+			cryptonight_free_ctx(ctx1);
+			return false;
+		}
+	}
+
 	unsigned char out[64];
 	bool bResult;
 
 	cryptonight_hash_ctx("This is a test", 14, out, ctx0);
 	bResult = memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0;
 
+	cryptonight_hash_ctx_np("This is a test", 14, out, ctx0);
+	bResult &= memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0;
+
 	cryptonight_double_hash_ctx("The quick brown fox jumps over the lazy dogThe quick brown fox jumps over the lazy log", 43, out, ctx0, ctx1);
 	bResult &= memcmp(out, "\x3e\xbb\x7f\x9f\x7d\x27\x3d\x7c\x31\x8d\x86\x94\x77\x55\x0c\xc8\x00\xcf\xb1\x1b\x0c\xad\xb7\xff\xbd\xf6\xf8\x9f\x3a\x47\x1c\x59"
 	                       "\xb4\x77\xd5\x02\xe4\xd8\x48\x7f\x42\xdf\xe3\x8e\xed\x73\x81\x7a\xda\x91\xb7\xe2\x63\xd2\x91\x71\xb6\x5c\x44\x3a\x01\x2a\x41\x22", 64) == 0;
@@ -278,7 +298,7 @@ std::vector<minethd*>* minethd::thread_starter(miner_work& pWork)
 	{
 		jconf::inst()->GetThreadConfig(i, cfg);
 
-		minethd* thd = new minethd(pWork, i, cfg.bDoubleMode);
+		minethd* thd = new minethd(pWork, i, cfg.bDoubleMode, cfg.bNoPrefetch);
 
 		if(cfg.iCpuAff >= 0)
 			thd_setaffinity(thd->oWorkThd.native_handle(), cfg.iCpuAff);
@@ -362,7 +382,11 @@ void minethd::work_main()
 			iCount++;
 
 			*piNonce = ++result.iNonce;
-			cryptonight_hash_ctx(oWork.bWorkBlob, oWork.iWorkSize, result.bResult, ctx);
+
+			if(bNoPrefetch)
+				cryptonight_hash_ctx_np(oWork.bWorkBlob, oWork.iWorkSize, result.bResult, ctx);
+			else
+				cryptonight_hash_ctx(oWork.bWorkBlob, oWork.iWorkSize, result.bResult, ctx);
 
 			if (*piHashVal < oWork.iTarget)
 				executor::inst()->push_event(ex_event(result, oWork.iPoolId));
diff --git a/minethd.h b/minethd.h
index e130a21..6d6104f 100644
--- a/minethd.h
+++ b/minethd.h
@@ -94,7 +94,7 @@ public:
 	std::atomic<uint64_t> iTimestamp;
 
 private:
-	minethd(miner_work& pWork, size_t iNo, bool double_work);
+	minethd(miner_work& pWork, size_t iNo, bool double_work, bool no_prefetch);
 
 	// We use the top 10 bits of the nonce for thread and resume
 	// This allows us to resume up to 128 threads 4 times before
@@ -119,5 +119,6 @@ private:
 	uint8_t iThreadNo;
 
 	bool bQuit;
+	bool bNoPrefetch;
 };
author	fireice-uk <fireice2@o2.pl>	2017-01-15 00:58:12 +0000
committer	fireice-uk <fireice2@o2.pl>	2017-01-15 00:58:12 +0000
commit	f59c4d1776209fb87efcea75501af52fa2f487fa (patch)
tree	f849c04a27920b1142fbb3024b9909ac1cceb495
parent	e3bda576251367f4c5e48b7830d525d3708e08c5 (diff)
download	xmr-stak-f59c4d1776209fb87efcea75501af52fa2f487fa.zip xmr-stak-f59c4d1776209fb87efcea75501af52fa2f487fa.tar.gz