Diffstat (limited to 'xmrstak/backend')
-rw-r--r--   xmrstak/backend/amd/amd_gpu/gpu.cpp  |  6
-rw-r--r--   xmrstak/backend/amd/amd_gpu/gpu.hpp  |  2
-rw-r--r--   xmrstak/backend/amd/minethd.cpp      | 18
-rw-r--r--   xmrstak/backend/cpu/minethd.cpp      | 42
-rw-r--r--   xmrstak/backend/globalStates.cpp     |  7
-rw-r--r--   xmrstak/backend/globalStates.hpp     | 29
-rw-r--r--   xmrstak/backend/iBackend.hpp         | 30
-rw-r--r--   xmrstak/backend/miner_work.hpp       |  7
-rw-r--r--   xmrstak/backend/nvidia/minethd.cpp   | 24
9 files changed, 90 insertions, 75 deletions
diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp
index f9908cb..37adc5b 100644
--- a/xmrstak/backend/amd/amd_gpu/gpu.cpp
+++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp
@@ -844,7 +844,8 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput)
 		}
 	}*/
 
-	if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[1], 1, &ctx->Nonce, &g_thd, &w_size, 0, NULL, NULL)) != CL_SUCCESS)
+	size_t tmpNonce = ctx->Nonce;
+	if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[1], 1, &tmpNonce, &g_thd, &w_size, 0, NULL, NULL)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 1);
 		return ERR_OCL_API;
@@ -897,7 +898,8 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput)
 		BranchNonces[i] = ((BranchNonces[i] + w_size - 1u) / w_size) * w_size;
 		// number of global threads must be a multiple of the work group size (w_size)
 		assert(BranchNonces[i]%w_size == 0);
-		if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[i + 3], 1, &ctx->Nonce, BranchNonces + i, &w_size, 0, NULL, NULL)) != CL_SUCCESS)
+		size_t tmpNonce = ctx->Nonce;
+		if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[i + 3], 1, &tmpNonce, BranchNonces + i, &w_size, 0, NULL, NULL)) != CL_SUCCESS)
 		{
 			printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), i + 3);
 			return ERR_OCL_API;
diff --git a/xmrstak/backend/amd/amd_gpu/gpu.hpp b/xmrstak/backend/amd/amd_gpu/gpu.hpp
index 5ff7ea1..c2d708d 100644
--- a/xmrstak/backend/amd/amd_gpu/gpu.hpp
+++ b/xmrstak/backend/amd/amd_gpu/gpu.hpp
@@ -36,7 +36,7 @@ struct GpuContext
 	int computeUnits;
 	std::string name;
 
-	size_t Nonce;
+	uint32_t Nonce;
 
 };
diff --git a/xmrstak/backend/amd/minethd.cpp b/xmrstak/backend/amd/minethd.cpp
index 5ca10d3..9d18860 100644
--- a/xmrstak/backend/amd/minethd.cpp
+++ b/xmrstak/backend/amd/minethd.cpp
@@ -167,13 +167,10 @@ void minethd::consume_work()
 void minethd::work_main()
 {
 	uint64_t iCount = 0;
-
 	cryptonight_ctx* cpu_ctx;
 	cpu_ctx = cpu::minethd::minethd_alloc_ctx();
 	cn_hash_fun hash_fun = cpu::minethd::func_selector(::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/);
-
 	globalStates::inst().iConsumeCnt++;
-	uint32_t* piNonce = (uint32_t*)(oWork.bWorkBlob + 39);
 
 	while (bQuit == 0)
 	{
@@ -190,17 +187,24 @@ void minethd::work_main()
 			continue;
 		}
 
-		if(oWork.bNiceHash)
-			pGpuCtx->Nonce = calc_nicehash_nonce(*piNonce, oWork.iResumeCnt);
-		else
-			pGpuCtx->Nonce = calc_start_nonce(oWork.iResumeCnt);
+		uint32_t h_per_round = pGpuCtx->rawIntensity;
+		size_t round_ctr = 0;
 
 		assert(sizeof(job_result::sJobID) == sizeof(pool_job::sJobID));
 		uint32_t target = oWork.iTarget32;
 		XMRSetJob(pGpuCtx, oWork.bWorkBlob, oWork.iWorkSize, target);
 
+		if(oWork.bNiceHash)
+			pGpuCtx->Nonce = *(uint32_t*)(oWork.bWorkBlob + 39);
+
 		while(globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo)
 		{
+			//Allocate a new nonce every 16 rounds
+			if((round_ctr++ & 0xF) == 0)
+			{
+				globalStates::inst().calc_start_nonce(pGpuCtx->Nonce, oWork.bNiceHash, h_per_round * 16);
+			}
+
 			cl_uint results[0x100];
 			memset(results,0,sizeof(cl_uint)*(0x100));
diff --git a/xmrstak/backend/cpu/minethd.cpp b/xmrstak/backend/cpu/minethd.cpp
index 625fbe4..ac80cc2 100644
--- a/xmrstak/backend/cpu/minethd.cpp
+++ b/xmrstak/backend/cpu/minethd.cpp
@@ -335,31 +335,36 @@ void minethd::work_main()
 			either because of network latency, or a socket problem. 
 			Since we are raison d'etre of this software it us sensible to just wait until we have something*/
 
-			while (globalStates::inst().inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo)
+			while (globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo)
 				std::this_thread::sleep_for(std::chrono::milliseconds(100));
 
 			consume_work();
 			continue;
 		}
 
-		if(oWork.bNiceHash)
-			result.iNonce = calc_nicehash_nonce(*piNonce, oWork.iResumeCnt);
-		else
-			result.iNonce = calc_start_nonce(oWork.iResumeCnt);
+		size_t nonce_ctr = 0;
+		constexpr size_t nonce_chunk = 4096; // Needs to be a power of 2
 
 		assert(sizeof(job_result::sJobID) == sizeof(pool_job::sJobID));
 		memcpy(result.sJobID, oWork.sJobID, sizeof(job_result::sJobID));
 
-		while(globalStates::inst().inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo)
+		if(oWork.bNiceHash)
+			result.iNonce = *piNonce;
+
+		while(globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo)
 		{
-			if ((iCount & 0xF) == 0) //Store stats every 16 hashes
+			if ((iCount++ & 0xF) == 0) //Store stats every 16 hashes
 			{
 				using namespace std::chrono;
 				uint64_t iStamp = time_point_cast<milliseconds>(high_resolution_clock::now()).time_since_epoch().count();
 				iHashCount.store(iCount, std::memory_order_relaxed);
 				iTimestamp.store(iStamp, std::memory_order_relaxed);
 			}
-			iCount++;
+
+			if((nonce_ctr++ & (nonce_chunk-1)) == 0)
+			{
+				globalStates::inst().calc_start_nonce(result.iNonce, oWork.bNiceHash, nonce_chunk);
+			}
 
 			*piNonce = ++result.iNonce;
@@ -446,7 +451,7 @@ void minethd::double_work_main()
 			either because of network latency, or a socket problem. 
 			Since we are raison d'etre of this software it us sensible to just wait until we have something*/
 
-			while (globalStates::inst().inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo)
+			while (globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo)
 				std::this_thread::sleep_for(std::chrono::milliseconds(100));
 
 			consume_work();
@@ -456,14 +461,15 @@ void minethd::double_work_main()
 			continue;
 		}
 
-		if(oWork.bNiceHash)
-			iNonce = calc_nicehash_nonce(*piNonce0, oWork.iResumeCnt);
-		else
-			iNonce = calc_start_nonce(oWork.iResumeCnt);
+		size_t nonce_ctr = 0;
+		constexpr size_t nonce_chunk = 4096; //Needs to be a power of 2
 
 		assert(sizeof(job_result::sJobID) == sizeof(pool_job::sJobID));
 
-		while (globalStates::inst().inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo)
+		if(oWork.bNiceHash)
+			iNonce = *piNonce0;
+
+		while (globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo)
 		{
 			if ((iCount & 0x7) == 0) //Store stats every 16 hashes
 			{
@@ -472,8 +478,14 @@ void minethd::double_work_main()
 				iHashCount.store(iCount, std::memory_order_relaxed);
 				iTimestamp.store(iStamp, std::memory_order_relaxed);
 			}
-
 			iCount += 2;
+
+
+			if((nonce_ctr++ & (nonce_chunk/2 - 1)) == 0)
+			{
+				globalStates::inst().calc_start_nonce(iNonce, oWork.bNiceHash, nonce_chunk);
+			}
+
 			*piNonce0 = ++iNonce;
 			*piNonce1 = ++iNonce;
diff --git a/xmrstak/backend/globalStates.cpp b/xmrstak/backend/globalStates.cpp
index 9104040..78823c5 100644
--- a/xmrstak/backend/globalStates.cpp
+++ b/xmrstak/backend/globalStates.cpp
@@ -34,7 +34,7 @@ namespace xmrstak
 {
 
-void globalStates::switch_work(miner_work& pWork)
+void globalStates::switch_work(miner_work& pWork, pool_data& dat)
 {
 	// iConsumeCnt is a basic lock-like polling mechanism just in case we happen to push work
 	// faster than threads can consume them. This should never happen in real life.
@@ -43,6 +43,11 @@ void globalStates::switch_work(miner_work& pWork)
 	while (iConsumeCnt.load(std::memory_order_seq_cst) < iThreadCount)
 		std::this_thread::sleep_for(std::chrono::milliseconds(100));
 
+	size_t xid = dat.pool_id;
+	dat.pool_id = pool_id;
+	pool_id = xid;
+
+	dat.iSavedNonce = iGlobalNonce.exchange(dat.iSavedNonce, std::memory_order_seq_cst);
 	oGlobalWork = pWork;
 	iConsumeCnt.store(0, std::memory_order_seq_cst);
 	iGlobalJobNo++;
diff --git a/xmrstak/backend/globalStates.hpp b/xmrstak/backend/globalStates.hpp
index 73ccf74..1c28d5c 100644
--- a/xmrstak/backend/globalStates.hpp
+++ b/xmrstak/backend/globalStates.hpp
@@ -2,6 +2,7 @@
 
 #include "miner_work.hpp"
 #include "xmrstak/misc/environment.hpp"
+#include "xmrstak/misc/console.hpp"
 
 #include <atomic>
 
@@ -9,9 +10,18 @@ namespace xmrstak
 {
 
-struct globalStates
+struct pool_data
 {
+	uint32_t iSavedNonce;
+	size_t pool_id;
+
+	pool_data() : iSavedNonce(0), pool_id(0)
+	{
+	}
+};
+struct globalStates
+{
 	static inline globalStates& inst()
 	{
 		auto& env = environment::inst();
@@ -20,19 +30,28 @@ struct globalStates
 		return *env.pglobalStates;
 	}
 
-	void switch_work(miner_work& pWork);
+	//pool_data is in-out winapi style
+	void switch_work(miner_work& pWork, pool_data& dat);
+
+	inline void calc_start_nonce(uint32_t& nonce, bool use_nicehash, uint32_t reserve_count)
+	{
+		if(use_nicehash)
+			nonce = (nonce & 0xFF000000) | iGlobalNonce.fetch_add(reserve_count);
+		else
+			nonce = iGlobalNonce.fetch_add(reserve_count);
+	}
 
 	miner_work oGlobalWork;
 	std::atomic<uint64_t> iGlobalJobNo;
 	std::atomic<uint64_t> iConsumeCnt;
+	std::atomic<uint32_t> iGlobalNonce;
 	uint64_t iThreadCount;
+	size_t pool_id;
 
-	private:
-
+private:
 	globalStates() : iThreadCount(0)
 	{
 	}
-
 };
 
 } // namepsace xmrstak
diff --git a/xmrstak/backend/iBackend.hpp b/xmrstak/backend/iBackend.hpp
index 0be8f0a..ab964ce 100644
--- a/xmrstak/backend/iBackend.hpp
+++ b/xmrstak/backend/iBackend.hpp
@@ -9,38 +9,8 @@
 namespace xmrstak
 {
-	// only allowed for unsigned value \todo add static assert
-	template<typename T>
-	T reverseBits(T value)
-	{
-		/* init with value (to get LSB) */
-		T result = value;
-		/* extra shift needed at end */
-		int s = sizeof(T) * CHAR_BIT - 1;
-		for (value >>= 1; value; value >>= 1)
-		{
-			result <<= 1;
-			result |= value & 1;
-			s--;
-		}
-		/* shift when values highest bits are zero */
-		result <<= s;
-		return result;
-	}
-
 	struct iBackend
 	{
-		inline uint32_t calc_start_nonce(uint32_t resume)
-		{
-			return reverseBits<uint32_t>(static_cast<uint32_t>(iThreadNo + globalStates::inst().iThreadCount * resume));
-		}
-
-		// Limited version of the nonce calc above
-		inline uint32_t calc_nicehash_nonce(uint32_t start, uint32_t resume)
-		{
-			return start | ( calc_start_nonce(resume) >> 8u );
-		}
-
 		std::atomic<uint64_t> iHashCount;
 		std::atomic<uint64_t> iTimestamp;
 		uint32_t iThreadNo;
diff --git a/xmrstak/backend/miner_work.hpp b/xmrstak/backend/miner_work.hpp
index aecbd70..6b5720c 100644
--- a/xmrstak/backend/miner_work.hpp
+++ b/xmrstak/backend/miner_work.hpp
@@ -15,7 +15,6 @@ namespace xmrstak
 		char sJobID[64];
 		uint8_t bWorkBlob[112];
 		uint32_t iWorkSize;
-		uint32_t iResumeCnt;
 		uint64_t iTarget;
 		// \todo remove workaround needed for amd
 		uint32_t iTarget32;
@@ -25,8 +24,8 @@ namespace xmrstak
 
 		miner_work() : iWorkSize(0), bNiceHash(false), bStall(true), iPoolId(0) { }
 
-		miner_work(const char* sJobID, const uint8_t* bWork, uint32_t iWorkSize, uint32_t iResumeCnt,
-			uint64_t iTarget, bool bNiceHash, size_t iPoolId) : iWorkSize(iWorkSize), iResumeCnt(iResumeCnt),
+		miner_work(const char* sJobID, const uint8_t* bWork, uint32_t iWorkSize,
+			uint64_t iTarget, bool bNiceHash, size_t iPoolId) : iWorkSize(iWorkSize),
 			iTarget(iTarget), bNiceHash(bNiceHash), bStall(false), iPoolId(iPoolId)
 		{
 			assert(iWorkSize <= sizeof(bWorkBlob));
@@ -41,7 +40,6 @@ namespace xmrstak
 			assert(this != &from);
 
 			iWorkSize = from.iWorkSize;
-			iResumeCnt = from.iResumeCnt;
 			iTarget = from.iTarget;
 			iTarget32 = from.iTarget32;
 			bNiceHash = from.bNiceHash;
@@ -68,7 +66,6 @@ namespace xmrstak
 			assert(this != &from);
 
 			iWorkSize = from.iWorkSize;
-			iResumeCnt = from.iResumeCnt;
 			iTarget = from.iTarget;
 			iTarget32 = from.iTarget32;
 			bNiceHash = from.bNiceHash;
diff --git a/xmrstak/backend/nvidia/minethd.cpp b/xmrstak/backend/nvidia/minethd.cpp
index fcd01cd..7549c86 100644
--- a/xmrstak/backend/nvidia/minethd.cpp
+++ b/xmrstak/backend/nvidia/minethd.cpp
@@ -192,11 +192,10 @@ void minethd::consume_work()
 void minethd::work_main()
 {
 	uint64_t iCount = 0;
-	uint32_t iNonce;
 	cryptonight_ctx* cpu_ctx;
 	cpu_ctx = cpu::minethd::minethd_alloc_ctx();
 	cn_hash_fun hash_fun = cpu::minethd::func_selector(::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/);
-	uint32_t* piNonce = (uint32_t*)(oWork.bWorkBlob + 39);
+	uint32_t iNonce;
 
 	globalStates::inst().iConsumeCnt++;
 
@@ -222,16 +221,23 @@ void minethd::work_main()
 		}
 		cryptonight_extra_cpu_set_data(&ctx, oWork.bWorkBlob, oWork.iWorkSize);
 
-		if(oWork.bNiceHash)
-			iNonce = calc_nicehash_nonce(*piNonce, oWork.iResumeCnt);
-		else
-			iNonce = calc_start_nonce(oWork.iResumeCnt);
+
+		uint32_t h_per_round = ctx.device_blocks * ctx.device_threads;
+		size_t round_ctr = 0;
 
 		assert(sizeof(job_result::sJobID) == sizeof(pool_job::sJobID));
 
+		if(oWork.bNiceHash)
+			iNonce = *(uint32_t*)(oWork.bWorkBlob + 39);
+
 		while(globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo)
 		{
-
+			//Allocate a new nonce every 16 rounds
+			if((round_ctr++ & 0xF) == 0)
+			{
+				globalStates::inst().calc_start_nonce(iNonce, oWork.bNiceHash, h_per_round * 16);
+			}
+
 			uint32_t foundNonce[10];
 			uint32_t foundCount;
@@ -257,8 +263,8 @@ void minethd::work_main()
 				executor::inst()->log_result_error("NVIDIA Invalid Result");
 			}
 
-			iCount += ctx.device_blocks * ctx.device_threads;
-			iNonce += ctx.device_blocks * ctx.device_threads;
+			iCount += h_per_round;
+			iNonce += h_per_round;
 
 			using namespace std::chrono;
 			uint64_t iStamp = time_point_cast<milliseconds>(high_resolution_clock::now()).time_since_epoch().count();
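
Note on the scheme this commit introduces: the per-thread reverse-bit nonce ranges (reverseBits/calc_start_nonce in iBackend.hpp) are replaced by one shared atomic counter in globalStates. Each backend thread now reserves a chunk of nonces with a fetch_add and only comes back for more once the chunk is used up (every 4096 hashes on CPU, every 16 kernel rounds on GPU), and switch_work exchanges the counter with pool_data::iSavedNonce so a later switch back to the interrupted pool resumes where it left off. The following standalone sketch only illustrates the reservation idea: calc_start_nonce mirrors the diff, but the nonce_pool struct, worker loop, thread count and chunk size are assumptions made for the example, not xmr-stak code.

    #include <atomic>
    #include <cstdint>
    #include <cstdio>
    #include <thread>
    #include <vector>

    // Shared nonce pool, modeled on the globalStates change in this diff.
    struct nonce_pool
    {
        std::atomic<uint32_t> iGlobalNonce{0};

        // Same logic as globalStates::calc_start_nonce: in NiceHash mode the
        // pool-assigned top byte of the nonce is preserved and the shared
        // counter supplies the remaining bits.
        void calc_start_nonce(uint32_t& nonce, bool use_nicehash, uint32_t reserve_count)
        {
            if(use_nicehash)
                nonce = (nonce & 0xFF000000) | iGlobalNonce.fetch_add(reserve_count);
            else
                nonce = iGlobalNonce.fetch_add(reserve_count);
        }
    };

    // Illustrative worker: reserve a fresh chunk whenever the previous one is
    // exhausted (cf. the 4096-nonce CPU chunk and h_per_round * 16 GPU chunk).
    void worker(nonce_pool& pool, bool nicehash, uint32_t start_nonce, int rounds)
    {
        constexpr uint32_t chunk = 4096; // must be a power of two, as in the diff
        uint32_t nonce = start_nonce;    // NiceHash start nonce from the work blob

        for(int r = 0; r < rounds; ++r)
        {
            pool.calc_start_nonce(nonce, nicehash, chunk);
            for(uint32_t i = 0; i < chunk; ++i)
                ++nonce; // the real code writes this into the blob and hashes it
        }
    }

    int main()
    {
        nonce_pool pool;
        std::vector<std::thread> threads;

        // Several threads share one counter, so their reserved ranges never overlap.
        for(int i = 0; i < 4; ++i)
            threads.emplace_back(worker, std::ref(pool), /*nicehash*/ false, 0u, 4);

        for(auto& t : threads)
            t.join();

        printf("nonces handed out: %u\n", pool.iGlobalNonce.load());
        return 0;
    }

Compared with the old reverse-bit split, chunked reservation keeps every thread's range contiguous and lets threads of very different speeds (CPU cores vs. whole GPUs) draw nonces from the same 32-bit space without colliding.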