summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--doc/tuning.md14
-rw-r--r--xmrstak/backend/cpu/config.tpl6
-rw-r--r--xmrstak/backend/cpu/crypto/cryptonight_aesni.h322
-rw-r--r--xmrstak/backend/cpu/jconf.cpp11
-rw-r--r--xmrstak/backend/cpu/jconf.hpp2
-rw-r--r--xmrstak/backend/cpu/minethd.cpp251
-rw-r--r--xmrstak/backend/cpu/minethd.hpp16
7 files changed, 515 insertions, 107 deletions
diff --git a/doc/tuning.md b/doc/tuning.md
index 8b28a43..53e682b 100644
--- a/doc/tuning.md
+++ b/doc/tuning.md
@@ -10,6 +10,8 @@
* [Add more GPUs](#add-more-gpus)
* [Increase Memory Pool](#increase-memory-pool)
* [Scratchpad Indexing](#scratchpad-indexing)
+* [CPU Backend](#cpu-backend)
+ * [Choose Value for `low_power_mode`](#choose-value-for-low_power_mode)
## Windows
"Run As Administrator" prompt (UAC) confirmation is needed to use large pages on Windows 7.
@@ -93,3 +95,15 @@ export GPU_SINGLE_ALLOC_PERCENT=99
The layout of the hash scratchpad memory can be changed for each GPU with the option `strided_index` in `amd.txt`.
Try to change the value from the default `true` to `false`.
+
+## CPU Backend
+
+The CPU backend is tuned through its config file, which by default is `cpu.txt`.
+
+### Choose Value for `low_power_mode`
+
+The optimal value for `low_power_mode` depends on the cache size of your CPU, and the number of threads.
+
+The `low_power_mode` option can be set to a number between `1` and `5`. When set to a value `N` greater than `1`, this mode increases the single-thread performance by `N` times, but also requires at least `2*N` MB of cache per thread. It can also be set to `false` or `true`. The value `false` is equivalent to `1`, and `true` is equivalent to `2`.
+
+This setting is particularly useful for CPUs with a very large cache. For example, the Intel Crystal Well processors are equipped with 128 MB of L4 cache, enough to run 8 threads at an optimal `low_power_mode` value of `5`.
diff --git a/xmrstak/backend/cpu/config.tpl b/xmrstak/backend/cpu/config.tpl
index 990a31d..b21a22d 100644
--- a/xmrstak/backend/cpu/config.tpl
+++ b/xmrstak/backend/cpu/config.tpl
@@ -1,9 +1,11 @@
R"===(
/*
* Thread configuration for each thread. Make sure it matches the number above.
- * low_power_mode - This mode will double the cache usage, and double the single thread performance. It will
+ * low_power_mode - This can either be a boolean (true or false), or a number between 1 and 5. When set to true,
+ * this mode will double the cache usage, and double the single thread performance. It will
* consume much less power (as less cores are working), but will max out at around 80-85% of
- * the maximum performance.
+ * the maximum performance. When set to a number N greater than 1, this mode will increase the
+ * cache usage and single thread performance by N times.
*
* no_prefetch - Some sytems can gain up to extra 5% here, but sometimes it will have no difference or make
* things slower.
diff --git a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
index 2a6a769..9b6e1dc 100644
--- a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
+++ b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
@@ -353,19 +353,19 @@ void cryptonight_hash(const void* input, size_t len, void* output, cryptonight_c
// to fit temporary vars for two contexts. Function will read len*2 from input and write 64 bytes to output
// We are still limited by L3 cache, so doubling will only work with CPUs where we have more than 2MB to core (Xeons)
template<size_t MASK, size_t ITERATIONS, size_t MEM, bool SOFT_AES, bool PREFETCH>
-void cryptonight_double_hash(const void* input, size_t len, void* output, cryptonight_ctx* __restrict ctx0, cryptonight_ctx* __restrict ctx1)
+void cryptonight_double_hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx)
{
- keccak((const uint8_t *)input, len, ctx0->hash_state, 200);
- keccak((const uint8_t *)input+len, len, ctx1->hash_state, 200);
+ keccak((const uint8_t *)input, len, ctx[0]->hash_state, 200);
+ keccak((const uint8_t *)input+len, len, ctx[1]->hash_state, 200);
// Optim - 99% time boundary
- cn_explode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx0->hash_state, (__m128i*)ctx0->long_state);
- cn_explode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx1->hash_state, (__m128i*)ctx1->long_state);
+ cn_explode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx[0]->hash_state, (__m128i*)ctx[0]->long_state);
+ cn_explode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx[1]->hash_state, (__m128i*)ctx[1]->long_state);
- uint8_t* l0 = ctx0->long_state;
- uint64_t* h0 = (uint64_t*)ctx0->hash_state;
- uint8_t* l1 = ctx1->long_state;
- uint64_t* h1 = (uint64_t*)ctx1->hash_state;
+ uint8_t* l0 = ctx[0]->long_state;
+ uint64_t* h0 = (uint64_t*)ctx[0]->hash_state;
+ uint8_t* l1 = ctx[1]->long_state;
+ uint64_t* h1 = (uint64_t*)ctx[1]->hash_state;
uint64_t axl0 = h0[0] ^ h0[4];
uint64_t axh0 = h0[1] ^ h0[5];
@@ -444,13 +444,305 @@ void cryptonight_double_hash(const void* input, size_t len, void* output, crypto
}
// Optim - 90% time boundary
- cn_implode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx0->long_state, (__m128i*)ctx0->hash_state);
- cn_implode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx1->long_state, (__m128i*)ctx1->hash_state);
+ cn_implode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx[0]->long_state, (__m128i*)ctx[0]->hash_state);
+ cn_implode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx[1]->long_state, (__m128i*)ctx[1]->hash_state);
// Optim - 99% time boundary
- keccakf((uint64_t*)ctx0->hash_state, 24);
- extra_hashes[ctx0->hash_state[0] & 3](ctx0->hash_state, 200, (char*)output);
- keccakf((uint64_t*)ctx1->hash_state, 24);
- extra_hashes[ctx1->hash_state[0] & 3](ctx1->hash_state, 200, (char*)output + 32);
+ keccakf((uint64_t*)ctx[0]->hash_state, 24);
+ extra_hashes[ctx[0]->hash_state[0] & 3](ctx[0]->hash_state, 200, (char*)output);
+ keccakf((uint64_t*)ctx[1]->hash_state, 24);
+ extra_hashes[ctx[1]->hash_state[0] & 3](ctx[1]->hash_state, 200, (char*)output + 32);
+}
+// CN_STEP1: a ^= c, then load into c the 16-byte entry of scratchpad l addressed by the low 64 bits of a (masked with MASK), prefetching it first when PREFETCH is set. MASK/PREFETCH come from the enclosing template; b is unused.
+#define CN_STEP1(a, b, c, l, ptr, idx) \
+ a = _mm_xor_si128(a, c); \
+ idx = _mm_cvtsi128_si64(a); \
+ ptr = (__m128i *)&l[idx & MASK]; \
+ if(PREFETCH) \
+ _mm_prefetch((const char*)ptr, _MM_HINT_T0); \
+ c = _mm_load_si128(ptr)
+// CN_STEP2: one AES round on c keyed with a (soft_aesenc when SOFT_AES, else _mm_aesenc_si128), then b ^= c and b is stored to ptr. Expands to several statements with an unbraced if/else, so use it only as a full statement inside a braced block.
+#define CN_STEP2(a, b, c, l, ptr, idx) \
+ if(SOFT_AES) \
+ c = soft_aesenc(c, a); \
+ else \
+ c = _mm_aesenc_si128(c, a); \
+ b = _mm_xor_si128(b, c); \
+ _mm_store_si128(ptr, b)
+// CN_STEP3: take the low 64 bits of c as idx and load into b the 16-byte entry of scratchpad l at idx & MASK, prefetching it first when PREFETCH is set. a is unused; MASK/PREFETCH come from the enclosing template.
+#define CN_STEP3(a, b, c, l, ptr, idx) \
+ idx = _mm_cvtsi128_si64(c); \
+ ptr = (__m128i *)&l[idx & MASK]; \
+ if(PREFETCH) \
+ _mm_prefetch((const char*)ptr, _MM_HINT_T0); \
+ b = _mm_load_si128(ptr)
+// CN_STEP4: multiplies idx by the low 64 bits of b via _umul128 (lo = returned value, hi = written through &hi), adds hi to the low lane and lo to the high lane of a, then stores a to ptr. The caller must declare uint64_t hi, lo; c and l are unused.
+#define CN_STEP4(a, b, c, l, ptr, idx) \
+ lo = _umul128(idx, _mm_cvtsi128_si64(b), &hi); \
+ a = _mm_add_epi64(a, _mm_set_epi64x(lo, hi)); \
+ _mm_store_si128(ptr, a)
+// Interleaved 3-way cn hash: hashes three len-byte inputs stored back to back in input, one per context ctx[0..2].
+// Result i is written at output + 32*i. The main loop advances all three lanes step by step in lockstep.
+template<size_t MASK, size_t ITERATIONS, size_t MEM, bool SOFT_AES, bool PREFETCH>
+void cryptonight_triple_hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx)
+{
+ for (size_t i = 0; i < 3; i++) // per lane: keccak input i into the hash state, then explode it into the scratchpad
+ {
+ keccak((const uint8_t *)input + len * i, len, ctx[i]->hash_state, 200);
+ cn_explode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx[i]->hash_state, (__m128i*)ctx[i]->long_state);
+ }
+ // Shorthands per lane N: lN is the scratchpad, hN the hash state viewed as 64-bit words.
+ uint8_t* l0 = ctx[0]->long_state;
+ uint64_t* h0 = (uint64_t*)ctx[0]->hash_state;
+ uint8_t* l1 = ctx[1]->long_state;
+ uint64_t* h1 = (uint64_t*)ctx[1]->hash_state;
+ uint8_t* l2 = ctx[2]->long_state;
+ uint64_t* h2 = (uint64_t*)ctx[2]->hash_state;
+ // a/b start as XORs of hash-state words; c starts at zero, so the first CN_STEP1 XOR leaves a unchanged.
+ __m128i ax0 = _mm_set_epi64x(h0[1] ^ h0[5], h0[0] ^ h0[4]);
+ __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
+ __m128i ax1 = _mm_set_epi64x(h1[1] ^ h1[5], h1[0] ^ h1[4]);
+ __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
+ __m128i ax2 = _mm_set_epi64x(h2[1] ^ h2[5], h2[0] ^ h2[4]);
+ __m128i bx2 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]);
+ __m128i cx0 = _mm_set_epi64x(0, 0);
+ __m128i cx1 = _mm_set_epi64x(0, 0);
+ __m128i cx2 = _mm_set_epi64x(0, 0);
+ // Each pass runs an even and an odd round for every lane, hence ITERATIONS/2 passes.
+ for (size_t i = 0; i < ITERATIONS/2; i++)
+ {
+ uint64_t idx0, idx1, idx2, hi, lo; // hi/lo are scratch outputs required by CN_STEP4
+ __m128i *ptr0, *ptr1, *ptr2;
+
+ // EVEN ROUND
+ CN_STEP1(ax0, bx0, cx0, l0, ptr0, idx0);
+ CN_STEP1(ax1, bx1, cx1, l1, ptr1, idx1);
+ CN_STEP1(ax2, bx2, cx2, l2, ptr2, idx2);
+
+ CN_STEP2(ax0, bx0, cx0, l0, ptr0, idx0);
+ CN_STEP2(ax1, bx1, cx1, l1, ptr1, idx1);
+ CN_STEP2(ax2, bx2, cx2, l2, ptr2, idx2);
+
+ CN_STEP3(ax0, bx0, cx0, l0, ptr0, idx0);
+ CN_STEP3(ax1, bx1, cx1, l1, ptr1, idx1);
+ CN_STEP3(ax2, bx2, cx2, l2, ptr2, idx2);
+
+ CN_STEP4(ax0, bx0, cx0, l0, ptr0, idx0);
+ CN_STEP4(ax1, bx1, cx1, l1, ptr1, idx1);
+ CN_STEP4(ax2, bx2, cx2, l2, ptr2, idx2);
+ // Same four steps again, with the b and c arguments swapped.
+ // ODD ROUND
+ CN_STEP1(ax0, cx0, bx0, l0, ptr0, idx0);
+ CN_STEP1(ax1, cx1, bx1, l1, ptr1, idx1);
+ CN_STEP1(ax2, cx2, bx2, l2, ptr2, idx2);
+
+ CN_STEP2(ax0, cx0, bx0, l0, ptr0, idx0);
+ CN_STEP2(ax1, cx1, bx1, l1, ptr1, idx1);
+ CN_STEP2(ax2, cx2, bx2, l2, ptr2, idx2);
+
+ CN_STEP3(ax0, cx0, bx0, l0, ptr0, idx0);
+ CN_STEP3(ax1, cx1, bx1, l1, ptr1, idx1);
+ CN_STEP3(ax2, cx2, bx2, l2, ptr2, idx2);
+
+ CN_STEP4(ax0, cx0, bx0, l0, ptr0, idx0);
+ CN_STEP4(ax1, cx1, bx1, l1, ptr1, idx1);
+ CN_STEP4(ax2, cx2, bx2, l2, ptr2, idx2);
+ }
+ // Finalize each lane: implode the scratchpad into the state, run keccakf, then the extra hash picked by hash_state[0] & 3.
+ for (size_t i = 0; i < 3; i++)
+ {
+ cn_implode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx[i]->long_state, (__m128i*)ctx[i]->hash_state);
+ keccakf((uint64_t*)ctx[i]->hash_state, 24);
+ extra_hashes[ctx[i]->hash_state[0] & 3](ctx[i]->hash_state, 200, (char*)output + 32 * i);
+ }
+}
+// Interleaved 4-way cn hash: hashes four len-byte inputs stored back to back in input, one per context ctx[0..3].
+// Result i is written at output + 32*i. The main loop advances all four lanes step by step in lockstep.
+template<size_t MASK, size_t ITERATIONS, size_t MEM, bool SOFT_AES, bool PREFETCH>
+void cryptonight_quad_hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx)
+{
+ for (size_t i = 0; i < 4; i++) // per lane: keccak input i into the hash state, then explode it into the scratchpad
+ {
+ keccak((const uint8_t *)input + len * i, len, ctx[i]->hash_state, 200);
+ cn_explode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx[i]->hash_state, (__m128i*)ctx[i]->long_state);
+ }
+ // Shorthands per lane N: lN is the scratchpad, hN the hash state viewed as 64-bit words.
+ uint8_t* l0 = ctx[0]->long_state;
+ uint64_t* h0 = (uint64_t*)ctx[0]->hash_state;
+ uint8_t* l1 = ctx[1]->long_state;
+ uint64_t* h1 = (uint64_t*)ctx[1]->hash_state;
+ uint8_t* l2 = ctx[2]->long_state;
+ uint64_t* h2 = (uint64_t*)ctx[2]->hash_state;
+ uint8_t* l3 = ctx[3]->long_state;
+ uint64_t* h3 = (uint64_t*)ctx[3]->hash_state;
+ // a/b start as XORs of hash-state words; c starts at zero, so the first CN_STEP1 XOR leaves a unchanged.
+ __m128i ax0 = _mm_set_epi64x(h0[1] ^ h0[5], h0[0] ^ h0[4]);
+ __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
+ __m128i ax1 = _mm_set_epi64x(h1[1] ^ h1[5], h1[0] ^ h1[4]);
+ __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
+ __m128i ax2 = _mm_set_epi64x(h2[1] ^ h2[5], h2[0] ^ h2[4]);
+ __m128i bx2 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]);
+ __m128i ax3 = _mm_set_epi64x(h3[1] ^ h3[5], h3[0] ^ h3[4]);
+ __m128i bx3 = _mm_set_epi64x(h3[3] ^ h3[7], h3[2] ^ h3[6]);
+ __m128i cx0 = _mm_set_epi64x(0, 0);
+ __m128i cx1 = _mm_set_epi64x(0, 0);
+ __m128i cx2 = _mm_set_epi64x(0, 0);
+ __m128i cx3 = _mm_set_epi64x(0, 0);
+ // Each pass runs an even and an odd round for every lane, hence ITERATIONS/2 passes.
+ for (size_t i = 0; i < ITERATIONS/2; i++)
+ {
+ uint64_t idx0, idx1, idx2, idx3, hi, lo; // hi/lo are scratch outputs required by CN_STEP4
+ __m128i *ptr0, *ptr1, *ptr2, *ptr3;
+
+ // EVEN ROUND
+ CN_STEP1(ax0, bx0, cx0, l0, ptr0, idx0);
+ CN_STEP1(ax1, bx1, cx1, l1, ptr1, idx1);
+ CN_STEP1(ax2, bx2, cx2, l2, ptr2, idx2);
+ CN_STEP1(ax3, bx3, cx3, l3, ptr3, idx3);
+
+ CN_STEP2(ax0, bx0, cx0, l0, ptr0, idx0);
+ CN_STEP2(ax1, bx1, cx1, l1, ptr1, idx1);
+ CN_STEP2(ax2, bx2, cx2, l2, ptr2, idx2);
+ CN_STEP2(ax3, bx3, cx3, l3, ptr3, idx3);
+
+ CN_STEP3(ax0, bx0, cx0, l0, ptr0, idx0);
+ CN_STEP3(ax1, bx1, cx1, l1, ptr1, idx1);
+ CN_STEP3(ax2, bx2, cx2, l2, ptr2, idx2);
+ CN_STEP3(ax3, bx3, cx3, l3, ptr3, idx3);
+
+ CN_STEP4(ax0, bx0, cx0, l0, ptr0, idx0);
+ CN_STEP4(ax1, bx1, cx1, l1, ptr1, idx1);
+ CN_STEP4(ax2, bx2, cx2, l2, ptr2, idx2);
+ CN_STEP4(ax3, bx3, cx3, l3, ptr3, idx3);
+ // Same four steps again, with the b and c arguments swapped.
+ // ODD ROUND
+ CN_STEP1(ax0, cx0, bx0, l0, ptr0, idx0);
+ CN_STEP1(ax1, cx1, bx1, l1, ptr1, idx1);
+ CN_STEP1(ax2, cx2, bx2, l2, ptr2, idx2);
+ CN_STEP1(ax3, cx3, bx3, l3, ptr3, idx3);
+
+ CN_STEP2(ax0, cx0, bx0, l0, ptr0, idx0);
+ CN_STEP2(ax1, cx1, bx1, l1, ptr1, idx1);
+ CN_STEP2(ax2, cx2, bx2, l2, ptr2, idx2);
+ CN_STEP2(ax3, cx3, bx3, l3, ptr3, idx3);
+
+ CN_STEP3(ax0, cx0, bx0, l0, ptr0, idx0);
+ CN_STEP3(ax1, cx1, bx1, l1, ptr1, idx1);
+ CN_STEP3(ax2, cx2, bx2, l2, ptr2, idx2);
+ CN_STEP3(ax3, cx3, bx3, l3, ptr3, idx3);
+
+ CN_STEP4(ax0, cx0, bx0, l0, ptr0, idx0);
+ CN_STEP4(ax1, cx1, bx1, l1, ptr1, idx1);
+ CN_STEP4(ax2, cx2, bx2, l2, ptr2, idx2);
+ CN_STEP4(ax3, cx3, bx3, l3, ptr3, idx3);
+ }
+ // Finalize each lane: implode the scratchpad into the state, run keccakf, then the extra hash picked by hash_state[0] & 3.
+ for (size_t i = 0; i < 4; i++)
+ {
+ cn_implode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx[i]->long_state, (__m128i*)ctx[i]->hash_state);
+ keccakf((uint64_t*)ctx[i]->hash_state, 24);
+ extra_hashes[ctx[i]->hash_state[0] & 3](ctx[i]->hash_state, 200, (char*)output + 32 * i);
+ }
+}
+// Interleaved 5-way cn hash: hashes five len-byte inputs stored back to back in input, one per context ctx[0..4].
+// Result i is written at output + 32*i. The main loop advances all five lanes step by step in lockstep.
+template<size_t MASK, size_t ITERATIONS, size_t MEM, bool SOFT_AES, bool PREFETCH>
+void cryptonight_penta_hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx)
+{
+ for (size_t i = 0; i < 5; i++) // per lane: keccak input i into the hash state, then explode it into the scratchpad
+ {
+ keccak((const uint8_t *)input + len * i, len, ctx[i]->hash_state, 200);
+ cn_explode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx[i]->hash_state, (__m128i*)ctx[i]->long_state);
+ }
+ // Shorthands per lane N: lN is the scratchpad, hN the hash state viewed as 64-bit words.
+ uint8_t* l0 = ctx[0]->long_state;
+ uint64_t* h0 = (uint64_t*)ctx[0]->hash_state;
+ uint8_t* l1 = ctx[1]->long_state;
+ uint64_t* h1 = (uint64_t*)ctx[1]->hash_state;
+ uint8_t* l2 = ctx[2]->long_state;
+ uint64_t* h2 = (uint64_t*)ctx[2]->hash_state;
+ uint8_t* l3 = ctx[3]->long_state;
+ uint64_t* h3 = (uint64_t*)ctx[3]->hash_state;
+ uint8_t* l4 = ctx[4]->long_state;
+ uint64_t* h4 = (uint64_t*)ctx[4]->hash_state;
+ // a/b start as XORs of hash-state words; c starts at zero, so the first CN_STEP1 XOR leaves a unchanged.
+ __m128i ax0 = _mm_set_epi64x(h0[1] ^ h0[5], h0[0] ^ h0[4]);
+ __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
+ __m128i ax1 = _mm_set_epi64x(h1[1] ^ h1[5], h1[0] ^ h1[4]);
+ __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
+ __m128i ax2 = _mm_set_epi64x(h2[1] ^ h2[5], h2[0] ^ h2[4]);
+ __m128i bx2 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]);
+ __m128i ax3 = _mm_set_epi64x(h3[1] ^ h3[5], h3[0] ^ h3[4]);
+ __m128i bx3 = _mm_set_epi64x(h3[3] ^ h3[7], h3[2] ^ h3[6]);
+ __m128i ax4 = _mm_set_epi64x(h4[1] ^ h4[5], h4[0] ^ h4[4]);
+ __m128i bx4 = _mm_set_epi64x(h4[3] ^ h4[7], h4[2] ^ h4[6]);
+ __m128i cx0 = _mm_set_epi64x(0, 0);
+ __m128i cx1 = _mm_set_epi64x(0, 0);
+ __m128i cx2 = _mm_set_epi64x(0, 0);
+ __m128i cx3 = _mm_set_epi64x(0, 0);
+ __m128i cx4 = _mm_set_epi64x(0, 0);
+ // Each pass runs an even and an odd round for every lane, hence ITERATIONS/2 passes.
+ for (size_t i = 0; i < ITERATIONS/2; i++)
+ {
+ uint64_t idx0, idx1, idx2, idx3, idx4, hi, lo; // hi/lo are scratch outputs required by CN_STEP4
+ __m128i *ptr0, *ptr1, *ptr2, *ptr3, *ptr4;
+
+ // EVEN ROUND
+ CN_STEP1(ax0, bx0, cx0, l0, ptr0, idx0);
+ CN_STEP1(ax1, bx1, cx1, l1, ptr1, idx1);
+ CN_STEP1(ax2, bx2, cx2, l2, ptr2, idx2);
+ CN_STEP1(ax3, bx3, cx3, l3, ptr3, idx3);
+ CN_STEP1(ax4, bx4, cx4, l4, ptr4, idx4);
+
+ CN_STEP2(ax0, bx0, cx0, l0, ptr0, idx0);
+ CN_STEP2(ax1, bx1, cx1, l1, ptr1, idx1);
+ CN_STEP2(ax2, bx2, cx2, l2, ptr2, idx2);
+ CN_STEP2(ax3, bx3, cx3, l3, ptr3, idx3);
+ CN_STEP2(ax4, bx4, cx4, l4, ptr4, idx4);
+
+ CN_STEP3(ax0, bx0, cx0, l0, ptr0, idx0);
+ CN_STEP3(ax1, bx1, cx1, l1, ptr1, idx1);
+ CN_STEP3(ax2, bx2, cx2, l2, ptr2, idx2);
+ CN_STEP3(ax3, bx3, cx3, l3, ptr3, idx3);
+ CN_STEP3(ax4, bx4, cx4, l4, ptr4, idx4);
+
+ CN_STEP4(ax0, bx0, cx0, l0, ptr0, idx0);
+ CN_STEP4(ax1, bx1, cx1, l1, ptr1, idx1);
+ CN_STEP4(ax2, bx2, cx2, l2, ptr2, idx2);
+ CN_STEP4(ax3, bx3, cx3, l3, ptr3, idx3);
+ CN_STEP4(ax4, bx4, cx4, l4, ptr4, idx4);
+ // Same four steps again, with the b and c arguments swapped.
+ // ODD ROUND
+ CN_STEP1(ax0, cx0, bx0, l0, ptr0, idx0);
+ CN_STEP1(ax1, cx1, bx1, l1, ptr1, idx1);
+ CN_STEP1(ax2, cx2, bx2, l2, ptr2, idx2);
+ CN_STEP1(ax3, cx3, bx3, l3, ptr3, idx3);
+ CN_STEP1(ax4, cx4, bx4, l4, ptr4, idx4);
+
+ CN_STEP2(ax0, cx0, bx0, l0, ptr0, idx0);
+ CN_STEP2(ax1, cx1, bx1, l1, ptr1, idx1);
+ CN_STEP2(ax2, cx2, bx2, l2, ptr2, idx2);
+ CN_STEP2(ax3, cx3, bx3, l3, ptr3, idx3);
+ CN_STEP2(ax4, cx4, bx4, l4, ptr4, idx4);
+
+ CN_STEP3(ax0, cx0, bx0, l0, ptr0, idx0);
+ CN_STEP3(ax1, cx1, bx1, l1, ptr1, idx1);
+ CN_STEP3(ax2, cx2, bx2, l2, ptr2, idx2);
+ CN_STEP3(ax3, cx3, bx3, l3, ptr3, idx3);
+ CN_STEP3(ax4, cx4, bx4, l4, ptr4, idx4);
+
+ CN_STEP4(ax0, cx0, bx0, l0, ptr0, idx0);
+ CN_STEP4(ax1, cx1, bx1, l1, ptr1, idx1);
+ CN_STEP4(ax2, cx2, bx2, l2, ptr2, idx2);
+ CN_STEP4(ax3, cx3, bx3, l3, ptr3, idx3);
+ CN_STEP4(ax4, cx4, bx4, l4, ptr4, idx4);
+ }
+ // Finalize each lane: implode the scratchpad into the state, run keccakf, then the extra hash picked by hash_state[0] & 3.
+ for (size_t i = 0; i < 5; i++)
+ {
+ cn_implode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx[i]->long_state, (__m128i*)ctx[i]->hash_state);
+ keccakf((uint64_t*)ctx[i]->hash_state, 24);
+ extra_hashes[ctx[i]->hash_state[0] & 3](ctx[i]->hash_state, 200, (char*)output + 32 * i);
+ }
}
diff --git a/xmrstak/backend/cpu/jconf.cpp b/xmrstak/backend/cpu/jconf.cpp
index 2ded8c0..6e709bd 100644
--- a/xmrstak/backend/cpu/jconf.cpp
+++ b/xmrstak/backend/cpu/jconf.cpp
@@ -116,7 +116,10 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg)
if(mode == nullptr || no_prefetch == nullptr || aff == nullptr)
return false;
- if(!mode->IsBool() || !no_prefetch->IsBool())
+ if(!mode->IsBool() && !mode->IsNumber())
+ return false;
+
+ if(!no_prefetch->IsBool())
return false;
if(!aff->IsNumber() && !aff->IsBool())
@@ -125,7 +128,11 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg)
if(aff->IsNumber() && aff->GetInt64() < 0)
return false;
- cfg.bDoubleMode = mode->GetBool();
+ if(mode->IsNumber())
+ cfg.iMultiway = (int)mode->GetInt64();
+ else
+ cfg.iMultiway = mode->GetBool() ? 2 : 1;
+
cfg.bNoPrefetch = no_prefetch->GetBool();
if(aff->IsNumber())
diff --git a/xmrstak/backend/cpu/jconf.hpp b/xmrstak/backend/cpu/jconf.hpp
index f843ed4..e98ed16 100644
--- a/xmrstak/backend/cpu/jconf.hpp
+++ b/xmrstak/backend/cpu/jconf.hpp
@@ -22,7 +22,7 @@ public:
bool parse_config(const char* sFilename = params::inst().configFileCPU.c_str());
struct thd_cfg {
- bool bDoubleMode;
+ int iMultiway;
bool bNoPrefetch;
long long iCpuAff;
};
diff --git a/xmrstak/backend/cpu/minethd.cpp b/xmrstak/backend/cpu/minethd.cpp
index cbb01f9..1c0e491 100644
--- a/xmrstak/backend/cpu/minethd.cpp
+++ b/xmrstak/backend/cpu/minethd.cpp
@@ -92,7 +92,7 @@ bool minethd::thd_setaffinity(std::thread::native_handle_type h, uint64_t cpu_id
#endif
}
-minethd::minethd(miner_work& pWork, size_t iNo, bool double_work, bool no_prefetch, int64_t affinity)
+minethd::minethd(miner_work& pWork, size_t iNo, int iMultiway, bool no_prefetch, int64_t affinity)
{
this->backendType = iBackend::CPU;
oWork = pWork;
@@ -105,10 +105,25 @@ minethd::minethd(miner_work& pWork, size_t iNo, bool double_work, bool no_prefet
std::unique_lock<std::mutex> lck(thd_aff_set);
std::future<void> order_guard = order_fix.get_future();
- if(double_work)
+ switch (iMultiway)
+ {
+ case 5:
+ oWorkThd = std::thread(&minethd::penta_work_main, this);
+ break;
+ case 4:
+ oWorkThd = std::thread(&minethd::quad_work_main, this);
+ break;
+ case 3:
+ oWorkThd = std::thread(&minethd::triple_work_main, this);
+ break;
+ case 2:
oWorkThd = std::thread(&minethd::double_work_main, this);
- else
+ break;
+ case 1:
+ default:
oWorkThd = std::thread(&minethd::work_main, this);
+ break;
+ }
order_guard.wait();
@@ -154,6 +169,7 @@ cryptonight_ctx* minethd::minethd_alloc_ctx()
return nullptr; //Should never happen
}
+static constexpr size_t MAX_N = 5;
bool minethd::self_test()
{
alloc_msg msg = { 0 };
@@ -191,14 +207,15 @@ bool minethd::self_test()
if(res == 0 && fatal)
return false;
- cryptonight_ctx *ctx0, *ctx1;
- if((ctx0 = minethd_alloc_ctx()) == nullptr)
- return false;
-
- if((ctx1 = minethd_alloc_ctx()) == nullptr)
+ cryptonight_ctx *ctx[MAX_N] = {0};
+ for (int i = 0; i < MAX_N; i++)
{
- cryptonight_free_ctx(ctx0);
- return false;
+ if ((ctx[i] = minethd_alloc_ctx()) == nullptr)
+ {
+ for (int j = 0; j < i; j++)
+ cryptonight_free_ctx(ctx[j]);
+ return false;
+ }
}
bool bResult = true;
@@ -206,31 +223,52 @@ bool minethd::self_test()
bool mineMonero = ::jconf::inst()->IsCurrencyMonero();
if(mineMonero)
{
- unsigned char out[64];
+ unsigned char out[32 * MAX_N];
cn_hash_fun hashf;
- cn_hash_fun_dbl hashdf;
-
+ cn_hash_fun_multi hashf_multi;
hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, mineMonero);
- hashf("This is a test", 14, out, ctx0);
+ hashf("This is a test", 14, out, ctx[0]);
bResult = memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0;
hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, mineMonero);
- hashf("This is a test", 14, out, ctx0);
+ hashf("This is a test", 14, out, ctx[0]);
bResult &= memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0;
- hashdf = func_dbl_selector(::jconf::inst()->HaveHardwareAes(), false, mineMonero);
- hashdf("The quick brown fox jumps over the lazy dogThe quick brown fox jumps over the lazy log", 43, out, ctx0, ctx1);
+ hashf_multi = func_multi_selector(2, ::jconf::inst()->HaveHardwareAes(), false, mineMonero);
+ hashf_multi("The quick brown fox jumps over the lazy dogThe quick brown fox jumps over the lazy log", 43, out, ctx);
bResult &= memcmp(out, "\x3e\xbb\x7f\x9f\x7d\x27\x3d\x7c\x31\x8d\x86\x94\x77\x55\x0c\xc8\x00\xcf\xb1\x1b\x0c\xad\xb7\xff\xbd\xf6\xf8\x9f\x3a\x47\x1c\x59"
- "\xb4\x77\xd5\x02\xe4\xd8\x48\x7f\x42\xdf\xe3\x8e\xed\x73\x81\x7a\xda\x91\xb7\xe2\x63\xd2\x91\x71\xb6\x5c\x44\x3a\x01\x2a\x41\x22", 64) == 0;
+ "\xb4\x77\xd5\x02\xe4\xd8\x48\x7f\x42\xdf\xe3\x8e\xed\x73\x81\x7a\xda\x91\xb7\xe2\x63\xd2\x91\x71\xb6\x5c\x44\x3a\x01\x2a\x41\x22", 64) == 0;
- hashdf = func_dbl_selector(::jconf::inst()->HaveHardwareAes(), true, mineMonero);
- hashdf("The quick brown fox jumps over the lazy dogThe quick brown fox jumps over the lazy log", 43, out, ctx0, ctx1);
+ hashf_multi = func_multi_selector(2, ::jconf::inst()->HaveHardwareAes(), true, mineMonero);
+ hashf_multi("The quick brown fox jumps over the lazy dogThe quick brown fox jumps over the lazy log", 43, out, ctx);
bResult &= memcmp(out, "\x3e\xbb\x7f\x9f\x7d\x27\x3d\x7c\x31\x8d\x86\x94\x77\x55\x0c\xc8\x00\xcf\xb1\x1b\x0c\xad\xb7\xff\xbd\xf6\xf8\x9f\x3a\x47\x1c\x59"
- "\xb4\x77\xd5\x02\xe4\xd8\x48\x7f\x42\xdf\xe3\x8e\xed\x73\x81\x7a\xda\x91\xb7\xe2\x63\xd2\x91\x71\xb6\x5c\x44\x3a\x01\x2a\x41\x22", 64) == 0;
+ "\xb4\x77\xd5\x02\xe4\xd8\x48\x7f\x42\xdf\xe3\x8e\xed\x73\x81\x7a\xda\x91\xb7\xe2\x63\xd2\x91\x71\xb6\x5c\x44\x3a\x01\x2a\x41\x22", 64) == 0;
+
+ hashf_multi = func_multi_selector(3, ::jconf::inst()->HaveHardwareAes(), false, mineMonero);
+ hashf_multi("This is a testThis is a testThis is a test", 14, out, ctx);
+ bResult &= memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
+ "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
+ "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 96) == 0;
+
+ hashf_multi = func_multi_selector(4, ::jconf::inst()->HaveHardwareAes(), false, mineMonero);
+ hashf_multi("This is a testThis is a testThis is a testThis is a test", 14, out, ctx);
+ bResult &= memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
+ "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
+ "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
+ "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 128) == 0;
+
+ hashf_multi = func_multi_selector(5, ::jconf::inst()->HaveHardwareAes(), false, mineMonero);
+ hashf_multi("This is a testThis is a testThis is a testThis is a testThis is a test", 14, out, ctx);
+ bResult &= memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
+ "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
+ "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
+ "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
+ "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 160) == 0;
}
- cryptonight_free_ctx(ctx0);
- cryptonight_free_ctx(ctx1);
+
+ for (int i = 0; i < MAX_N; i++)
+ cryptonight_free_ctx(ctx[i]);
if(!bResult)
printer::inst()->print_msg(L0,
@@ -272,12 +310,12 @@ std::vector<iBackend*> minethd::thread_starter(uint32_t threadOffset, miner_work
printer::inst()->print_msg(L1, "WARNING on MacOS thread affinity is only advisory.");
#endif
- printer::inst()->print_msg(L1, "Starting %s thread, affinity: %d.", cfg.bDoubleMode ? "double" : "single", (int)cfg.iCpuAff);
+ printer::inst()->print_msg(L1, "Starting %dx thread, affinity: %d.", cfg.iMultiway, (int)cfg.iCpuAff);
}
else
- printer::inst()->print_msg(L1, "Starting %s thread, no affinity.", cfg.bDoubleMode ? "double" : "single");
+ printer::inst()->print_msg(L1, "Starting %dx thread, no affinity.", cfg.iMultiway);
- minethd* thd = new minethd(pWork, i + threadOffset, cfg.bDoubleMode, cfg.bNoPrefetch, cfg.iCpuAff);
+ minethd* thd = new minethd(pWork, i + threadOffset, cfg.iMultiway, cfg.bNoPrefetch, cfg.iCpuAff);
pvThreads.push_back(thd);
}
@@ -326,7 +364,7 @@ minethd::cn_hash_fun minethd::func_selector(bool bHaveAes, bool bNoPrefetch, boo
// define aeon settings
#if defined(CONF_NO_AEON) || defined(CONF_NO_MONERO)
- // ignore 3rd bit if only on currency is active
+ // ignore 3rd bit if only one currency is active
digit.set(2, 0);
#else
digit.set(2, !mineMonero);
@@ -416,22 +454,34 @@ void minethd::work_main()
cryptonight_free_ctx(ctx);
}
-minethd::cn_hash_fun_dbl minethd::func_dbl_selector(bool bHaveAes, bool bNoPrefetch, bool mineMonero)
+minethd::cn_hash_fun_multi minethd::func_multi_selector(size_t N, bool bHaveAes, bool bNoPrefetch, bool mineMonero)
{
// We have two independent flag bits in the functions
// therefore we will build a binary digit and select the
// function as a two digit binary
- // Digit order SOFT_AES, NO_PREFETCH, MINER_ALGO
+ // Digit order SOFT_AES, NO_PREFETCH
- static const cn_hash_fun_dbl func_table[] = {
- /* there will be 8 function entries if `CONF_NO_MONERO` and `CONF_NO_AEON`
- * is not defined. If one is defined there will be 4 entries.
+ static const cn_hash_fun_multi func_table[] = {
+ /* there will be 8*(MAX_N-1) function entries if neither `CONF_NO_MONERO` nor `CONF_NO_AEON`
+ * is defined. If one is defined there will be 4*(MAX_N-1) entries.
*/
#ifndef CONF_NO_MONERO
cryptonight_double_hash<MONERO_MASK, MONERO_ITER, MONERO_MEMORY, false, false>,
cryptonight_double_hash<MONERO_MASK, MONERO_ITER, MONERO_MEMORY, false, true>,
cryptonight_double_hash<MONERO_MASK, MONERO_ITER, MONERO_MEMORY, true, false>,
- cryptonight_double_hash<MONERO_MASK, MONERO_ITER, MONERO_MEMORY, true, true>
+ cryptonight_double_hash<MONERO_MASK, MONERO_ITER, MONERO_MEMORY, true, true>,
+ cryptonight_triple_hash<MONERO_MASK, MONERO_ITER, MONERO_MEMORY, false, false>,
+ cryptonight_triple_hash<MONERO_MASK, MONERO_ITER, MONERO_MEMORY, false, true>,
+ cryptonight_triple_hash<MONERO_MASK, MONERO_ITER, MONERO_MEMORY, true, false>,
+ cryptonight_triple_hash<MONERO_MASK, MONERO_ITER, MONERO_MEMORY, true, true>,
+ cryptonight_quad_hash<MONERO_MASK, MONERO_ITER, MONERO_MEMORY, false, false>,
+ cryptonight_quad_hash<MONERO_MASK, MONERO_ITER, MONERO_MEMORY, false, true>,
+ cryptonight_quad_hash<MONERO_MASK, MONERO_ITER, MONERO_MEMORY, true, false>,
+ cryptonight_quad_hash<MONERO_MASK, MONERO_ITER, MONERO_MEMORY, true, true>,
+ cryptonight_penta_hash<MONERO_MASK, MONERO_ITER, MONERO_MEMORY, false, false>,
+ cryptonight_penta_hash<MONERO_MASK, MONERO_ITER, MONERO_MEMORY, false, true>,
+ cryptonight_penta_hash<MONERO_MASK, MONERO_ITER, MONERO_MEMORY, true, false>,
+ cryptonight_penta_hash<MONERO_MASK, MONERO_ITER, MONERO_MEMORY, true, true>
#endif
#if (!defined(CONF_NO_AEON)) && (!defined(CONF_NO_MONERO))
// comma will be added only if Monero and Aeon is build
@@ -441,33 +491,71 @@ minethd::cn_hash_fun_dbl minethd::func_dbl_selector(bool bHaveAes, bool bNoPrefe
cryptonight_double_hash<AEON_MASK, AEON_ITER, AEON_MEMORY, false, false>,
cryptonight_double_hash<AEON_MASK, AEON_ITER, AEON_MEMORY, false, true>,
cryptonight_double_hash<AEON_MASK, AEON_ITER, AEON_MEMORY, true, false>,
- cryptonight_double_hash<AEON_MASK, AEON_ITER, AEON_MEMORY, true, true>
+ cryptonight_double_hash<AEON_MASK, AEON_ITER, AEON_MEMORY, true, true>,
+ cryptonight_triple_hash<AEON_MASK, AEON_ITER, AEON_MEMORY, false, false>,
+ cryptonight_triple_hash<AEON_MASK, AEON_ITER, AEON_MEMORY, false, true>,
+ cryptonight_triple_hash<AEON_MASK, AEON_ITER, AEON_MEMORY, true, false>,
+ cryptonight_triple_hash<AEON_MASK, AEON_ITER, AEON_MEMORY, true, true>,
+ cryptonight_quad_hash<AEON_MASK, AEON_ITER, AEON_MEMORY, false, false>,
+ cryptonight_quad_hash<AEON_MASK, AEON_ITER, AEON_MEMORY, false, true>,
+ cryptonight_quad_hash<AEON_MASK, AEON_ITER, AEON_MEMORY, true, false>,
+ cryptonight_quad_hash<AEON_MASK, AEON_ITER, AEON_MEMORY, true, true>,
+ cryptonight_penta_hash<AEON_MASK, AEON_ITER, AEON_MEMORY, false, false>,
+ cryptonight_penta_hash<AEON_MASK, AEON_ITER, AEON_MEMORY, false, true>,
+ cryptonight_penta_hash<AEON_MASK, AEON_ITER, AEON_MEMORY, true, false>,
+ cryptonight_penta_hash<AEON_MASK, AEON_ITER, AEON_MEMORY, true, true>
#endif
};
- std::bitset<3> digit;
+ std::bitset<2> digit;
digit.set(0, !bNoPrefetch);
digit.set(1, !bHaveAes);
// define aeon settings
#if defined(CONF_NO_AEON) || defined(CONF_NO_MONERO)
- // ignore 3rd bit if only on currency is active
- digit.set(2, 0);
+ // ignore miner algo if only one currency is active
+ size_t miner_algo_base = 0;
#else
- digit.set(2, !mineMonero);
+ size_t miner_algo_base = mineMonero ? 0 : 4*(MAX_N-1);
#endif
- return func_table[digit.to_ulong()];
+ N = (N<2) ? 2 : (N>MAX_N) ? MAX_N : N;
+ return func_table[miner_algo_base + 4*(N-2) + digit.to_ulong()];
}
-uint32_t* minethd::prep_double_work(uint8_t bDoubleWorkBlob[sizeof(miner_work::bWorkBlob) * 2])
+void minethd::double_work_main()
{
- memcpy(bDoubleWorkBlob, oWork.bWorkBlob, oWork.iWorkSize);
- memcpy(bDoubleWorkBlob + oWork.iWorkSize, oWork.bWorkBlob, oWork.iWorkSize);
- return (uint32_t*)(bDoubleWorkBlob + oWork.iWorkSize + 39);
+ multiway_work_main<2>(func_multi_selector(2, ::jconf::inst()->HaveHardwareAes(), bNoPrefetch, ::jconf::inst()->IsCurrencyMonero()));
}
-void minethd::double_work_main()
+void minethd::triple_work_main() // worker body for 3-way mode: runs the N-way mining loop with N = 3
+{
+ multiway_work_main<3>(func_multi_selector(3, ::jconf::inst()->HaveHardwareAes(), bNoPrefetch, ::jconf::inst()->IsCurrencyMonero())); // hash function picked from AES support, this thread's prefetch flag and the configured currency
+}
+// Worker body for 4-way mode: runs the N-way mining loop with N = 4 and the matching 4-way hash function.
+void minethd::quad_work_main()
+{
+ multiway_work_main<4>(func_multi_selector(4, ::jconf::inst()->HaveHardwareAes(), bNoPrefetch, ::jconf::inst()->IsCurrencyMonero())); // hash function picked from AES support, this thread's prefetch flag and the configured currency
+}
+// Worker body for 5-way mode: runs the N-way mining loop with N = 5 and the matching 5-way hash function.
+void minethd::penta_work_main()
+{
+ multiway_work_main<5>(func_multi_selector(5, ::jconf::inst()->HaveHardwareAes(), bNoPrefetch, ::jconf::inst()->IsCurrencyMonero())); // hash function picked from AES support, this thread's prefetch flag and the configured currency
+}
+// Lays out N back-to-back copies of oWork.bWorkBlob (oWork.iWorkSize bytes each) in bWorkBlob and, for copies 1..N-1, points piNonce[i] at byte offset 39 of copy i. piNonce[0] is not written here.
+template<size_t N>
+void minethd::prep_multiway_work(uint8_t *bWorkBlob, uint32_t **piNonce)
+{
+ for (size_t i = 0; i < N; i++)
+ {
+ memcpy(bWorkBlob + oWork.iWorkSize * i, oWork.bWorkBlob, oWork.iWorkSize); // copy i starts at i * iWorkSize
+ if (i > 0) // slot 0 is left as the caller set it
+ piNonce[i] = (uint32_t*)(bWorkBlob + oWork.iWorkSize * i + 39);
+ }
+}
+
+template<size_t N>
+void minethd::multiway_work_main(cn_hash_fun_multi hash_fun_multi)
{
if(affinity >= 0) //-1 means no affinity
bindMemoryToNUMANode(affinity);
@@ -477,31 +565,26 @@ void minethd::double_work_main()
lck.release();
std::this_thread::yield();
- cn_hash_fun_dbl hash_fun;
- cryptonight_ctx* ctx0;
- cryptonight_ctx* ctx1;
+ cryptonight_ctx *ctx[MAX_N];
uint64_t iCount = 0;
- uint64_t *piHashVal0, *piHashVal1;
- uint32_t *piNonce0, *piNonce1;
- uint8_t bDoubleHashOut[64];
- uint8_t bDoubleWorkBlob[sizeof(miner_work::bWorkBlob) * 2];
+ uint64_t *piHashVal[MAX_N];
+ uint32_t *piNonce[MAX_N];
+ uint8_t bHashOut[MAX_N * 32];
+ uint8_t bWorkBlob[sizeof(miner_work::bWorkBlob) * MAX_N];
uint32_t iNonce;
job_result res;
- hash_fun = func_dbl_selector(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, ::jconf::inst()->IsCurrencyMonero());
- ctx0 = minethd_alloc_ctx();
- ctx1 = minethd_alloc_ctx();
-
- piHashVal0 = (uint64_t*)(bDoubleHashOut + 24);
- piHashVal1 = (uint64_t*)(bDoubleHashOut + 32 + 24);
- piNonce0 = (uint32_t*)(bDoubleWorkBlob + 39);
+ for (size_t i = 0; i < N; i++)
+ {
+ ctx[i] = minethd_alloc_ctx();
+ piHashVal[i] = (uint64_t*)(bHashOut + 32 * i + 24);
+ piNonce[i] = (i == 0) ? (uint32_t*)(bWorkBlob + 39) : nullptr;
+ }
if(!oWork.bStall)
- piNonce1 = prep_double_work(bDoubleWorkBlob);
- else
- piNonce1 = nullptr;
+ prep_multiway_work<N>(bWorkBlob, piNonce);
- globalStates::inst().inst().iConsumeCnt++;
+ globalStates::inst().iConsumeCnt++;
while (bQuit == 0)
{
@@ -515,55 +598,57 @@ void minethd::double_work_main()
std::this_thread::sleep_for(std::chrono::milliseconds(100));
consume_work();
- piNonce1 = prep_double_work(bDoubleWorkBlob);
+ prep_multiway_work<N>(bWorkBlob, piNonce);
continue;
}
- size_t nonce_ctr = 0;
- constexpr size_t nonce_chunk = 4096; //Needs to be a power of 2
+ constexpr uint32_t nonce_chunk = 4096;
+ int64_t nonce_ctr = 0;
assert(sizeof(job_result::sJobID) == sizeof(pool_job::sJobID));
if(oWork.bNiceHash)
- iNonce = *piNonce0;
+ iNonce = *piNonce[0];
while (globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo)
{
- if ((iCount & 0x7) == 0) //Store stats every 16 hashes
+ if ((iCount++ & 0x7) == 0) //Store stats every 8*N hashes
{
using namespace std::chrono;
uint64_t iStamp = time_point_cast<milliseconds>(high_resolution_clock::now()).time_since_epoch().count();
- iHashCount.store(iCount, std::memory_order_relaxed);
+ iHashCount.store(iCount * N, std::memory_order_relaxed);
iTimestamp.store(iStamp, std::memory_order_relaxed);
}
- iCount += 2;
-
-
- if((nonce_ctr++ & (nonce_chunk/2 - 1)) == 0)
+
+ nonce_ctr -= N;
+ if(nonce_ctr <= 0)
{
globalStates::inst().calc_start_nonce(iNonce, oWork.bNiceHash, nonce_chunk);
+ nonce_ctr = nonce_chunk;
}
- *piNonce0 = ++iNonce;
- *piNonce1 = ++iNonce;
+ for (size_t i = 0; i < N; i++)
+ *piNonce[i] = ++iNonce;
- hash_fun(bDoubleWorkBlob, oWork.iWorkSize, bDoubleHashOut, ctx0, ctx1);
+ hash_fun_multi(bWorkBlob, oWork.iWorkSize, bHashOut, ctx);
- if (*piHashVal0 < oWork.iTarget)
- executor::inst()->push_event(ex_event(job_result(oWork.sJobID, iNonce-1, bDoubleHashOut, iThreadNo), oWork.iPoolId));
-
- if (*piHashVal1 < oWork.iTarget)
- executor::inst()->push_event(ex_event(job_result(oWork.sJobID, iNonce, bDoubleHashOut + 32, iThreadNo), oWork.iPoolId));
+ for (size_t i = 0; i < N; i++)
+ {
+ if (*piHashVal[i] < oWork.iTarget)
+ {
+ executor::inst()->push_event(ex_event(job_result(oWork.sJobID, iNonce - N + 1 + i, bHashOut + 32 * i, iThreadNo), oWork.iPoolId));
+ }
+ }
std::this_thread::yield();
}
consume_work();
- piNonce1 = prep_double_work(bDoubleWorkBlob);
+ prep_multiway_work<N>(bWorkBlob, piNonce);
}
- cryptonight_free_ctx(ctx0);
- cryptonight_free_ctx(ctx1);
+ for (int i = 0; i < N; i++)
+ cryptonight_free_ctx(ctx[i]);
}
} // namespace cpu
diff --git a/xmrstak/backend/cpu/minethd.hpp b/xmrstak/backend/cpu/minethd.hpp
index 5520d9e..0433d0d 100644
--- a/xmrstak/backend/cpu/minethd.hpp
+++ b/xmrstak/backend/cpu/minethd.hpp
@@ -29,16 +29,24 @@ public:
static cryptonight_ctx* minethd_alloc_ctx();
private:
+ typedef void (*cn_hash_fun_multi)(const void*, size_t, void*, cryptonight_ctx**); // invoked as (bWorkBlob, oWork.iWorkSize, bHashOut, ctx) by multiway_work_main
+ static cn_hash_fun_multi func_multi_selector(size_t N, bool bHaveAes, bool bNoPrefetch, bool mineMonero); // N is clamped to [2, MAX_N] before the table lookup
- typedef void (*cn_hash_fun_dbl)(const void*, size_t, void*, cryptonight_ctx* __restrict, cryptonight_ctx* __restrict);
- static cn_hash_fun_dbl func_dbl_selector(bool bHaveAes, bool bNoPrefetch, bool mineMonero);
+ minethd(miner_work& pWork, size_t iNo, int iMultiway, bool no_prefetch, int64_t affinity); // NOTE(review): iMultiway presumably chooses which N-way work loop the thread runs — confirm in minethd.cpp
- minethd(miner_work& pWork, size_t iNo, bool double_work, bool no_prefetch, int64_t affinity);
+ template<size_t N>
+ void multiway_work_main(cn_hash_fun_multi hash_fun_multi); // mining loop: each iteration assigns N nonces and calls hash_fun_multi once
+
+ template<size_t N>
+ void prep_multiway_work(uint8_t *bWorkBlob, uint32_t **piNonce); // writes N copies of oWork.bWorkBlob into bWorkBlob and sets piNonce[1..N-1]
void work_main();
void double_work_main(); // forwards to multiway_work_main<2>
+ void triple_work_main(); // forwards to multiway_work_main<3>
+ void quad_work_main(); // forwards to multiway_work_main<4>
+ void penta_work_main(); // forwards to multiway_work_main<5>
+
void consume_work();
- uint32_t* prep_double_work(uint8_t bDoubleWorkBlob[sizeof(miner_work::bWorkBlob) * 2]);
uint64_t iJobNo;
OpenPOWER on IntegriCloud