diff options
-rw-r--r-- | CMakeLists.txt | 2 | ||||
-rw-r--r-- | README.md | 40 | ||||
-rw-r--r-- | doc/FAQ.md | 6 | ||||
-rw-r--r-- | doc/compile_Windows.md | 2 | ||||
-rw-r--r-- | doc/pgp_keys.md | 69 | ||||
-rw-r--r-- | doc/tuning.md | 14 | ||||
-rw-r--r-- | xmrstak/backend/cpu/config.tpl | 6 | ||||
-rw-r--r-- | xmrstak/backend/cpu/crypto/cryptonight_aesni.h | 322 | ||||
-rw-r--r-- | xmrstak/backend/cpu/jconf.cpp | 11 | ||||
-rw-r--r-- | xmrstak/backend/cpu/jconf.hpp | 2 | ||||
-rw-r--r-- | xmrstak/backend/cpu/minethd.cpp | 251 | ||||
-rw-r--r-- | xmrstak/backend/cpu/minethd.hpp | 16 |
12 files changed, 591 insertions, 150 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt index 9540d7a..b8f1eef 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -348,7 +348,7 @@ if(HWLOC_ENABLE) /usr/local /usr ENV "PROGRAMFILES(X86)" - ENV "MICROHTTPD_ROOT" + ENV "HWLOC_ROOT" PATH_SUFFIXES include) @@ -14,8 +14,8 @@ XMR-Stak is a universal Stratum pool miner. This miner supports CPUs, AMD and NV * [HowTo Compile](doc/compile.md) * [FAQ](doc/FAQ.md) * [Developer Donation](#default-developer-donation) -* [Cheksums](#checksums) -* [PGP Key](#pgp-key) +* [Release Cheksums](#release-checksums) +* [Developer PGP Key's](doc/pgp_keys.md) ## Features @@ -51,7 +51,7 @@ psychocrypt: 43NoJVEXo21hGZ6tDG6Z3g4qimiGdJPE6GRxAmiWwm26gwr62Lqo7zRiCJFSBmbkwTGNuuES9ES5TgaVHceuYc4Y75txCTU ``` -## Checksums +## Release Checksums ``` -----BEGIN PGP SIGNED MESSAGE----- Hash: SHA256 @@ -87,37 +87,3 @@ cvX6gTCsFYfLw/p+sz+DN7kh7zJlCvIFga3HaFByxCSuyMY08qerXS/0862ZMdo= -----END PGP SIGNATURE----- ``` - -## PGP Key -``` ------BEGIN PGP PUBLIC KEY BLOCK----- -Version: GnuPG v2 - -mQENBFhYUmUBCAC6493W5y1MMs38ApRbI11jWUqNdFm686XLkZWGDfYImzL6pEYk -RdWkyt9ziCyA6NUeWFQYniv/z10RxYKq8ulVVJaKb9qPGMU0ESfdxlFNJkU/pf28 -sEVBagGvGw8uFxjQONnBJ7y7iNRWMN7qSRS636wN5ryTHNsmqI4ClXPHkXkDCDUX -QvhXZpG9RRM6jsE3jBGz/LJi3FyZLo/vB60OZBODJ2IA0wSR41RRiOq01OqDueva -9jPoAokNglJfn/CniQ+lqUEXj1vjAZ1D5Mn9fISzA/UPen5Z7Sipaa9aAtsDBOfP -K9iPKOsWa2uTafoyXgiwEVXCCeMMUjCGaoFBABEBAAG0ImZpcmVpY2VfdWsgPGZp -cmVpY2UueG1yQGdtYWlsLmNvbT6JATcEEwEIACEFAlhYUmUCGwMFCwkIBwIGFQgJ -CgsCBBYCAwECHgECF4AACgkQ+yT3mn7UHDTEcQf8CMhqaZ0IOBxeBnsq5HZr2X6z -E5bODp5cPs6ha1tjH3CWpk1AFeykNtXH7kPW9hcDt/e4UQtcHs+lu6YU59X7xLJQ -udOkpWdmooJMXRWS/zeeon4ivT9d69jNnwubh8EJOyw8xm/se6n48BcewfHekW/6 -mVrbhLbF1dnuUGXzRN1WxsUZx3uJd2UvrkJhAtHtX92/qIVhT0+3PXV0bmpHURlK -YKhhm8dPLV9jPX8QVRHQXCOHSMqy/KoWEe6CnT0Isbkq3JtS3K4VBVeTX9gkySRc -IFxrNJdXsI9BxKv4O8yajP8DohpoGLMDKZKSO0yq0BRMgMh0cw6Lk22uyulGALkB -DQRYWFJlAQgAqikfViOmIccCZKVMZfNHjnigKtQqNrbJpYZCOImql4FqbZu9F7TD 
-9HIXA43SPcwziWlyazSy8Pa9nCpc6PuPPO1wxAaNIc5nt+w/x2EGGTIFGjRoubmP -3i5jZzOFYsvR2W3PgVa3/ujeYYJYo1oeVeuGmmJRejs0rp1mbvBSKw1Cq6C4cI0x -GTY1yXFGLIgdfYNMmiLsTy1Qwq8YStbFKeUYAMMG3128SAIaT3Eet911f5Jx4tC8 -6kWUr6PX1rQ0LQJqyIsLq9U53XybUksRfJC9IEfgvgBxRBHSD8WfqEhHjhW1VsZG -dcYgr7A1PIneWsCEY+5VUnqTlt2HPaKweQARAQABiQEfBBgBCAAJBQJYWFJlAhsM -AAoJEPsk95p+1Bw0Pr8H/0vZ6U2zaih03jOHOvsrYxRfDXSmgudOp1VS45aHIREd -2nrJ+drleeFVyb14UQqO/6iX9GuDX2yBEHdCg2aljeP98AaMU//RiEtebE6CUWsL -HPVXHIkxwBCBe0YkJINHUQqLz/5f6qLsNUp1uTH2++zhdBWvg+gErTYbx8aFMFYH -0GoOtqE5rtlAh5MTvDZm+UcDwKJCxhrLaN3R3dDoyrDNRTgHQQuX5/opJBiUnVNK -d+vugnxzpMIJQP11yCZkz/KxV8zQ2QPMuZdAoh3znd/vGCJcp0rWphn4pqxA4vDp -c4hC0Yg9Dha1OoE5CJCqVL+ic4vAyB1urAwBlsd/wH8= -=B5I+ ------END PGP PUBLIC KEY BLOCK----- -``` @@ -56,10 +56,8 @@ This typically means you are trying to run it on a CPU that does not have [AES]( ## Virus Protection Alert -Some Virus protection software flag the miner binary as *Male Ware*. -In this case the binary is moved to the quarantine area of the protection software. -This is a wrong alert and not avoid by use. -Add the binary to to protection software white list to solve this issue.s +Some virus protection software flags the miner binary as *malware*. This is a false positive — the software does not contain any malware (and since it is open source, you can verify that yourself!) +If your antivirus software flags **xmr-stak**, it will likely move it to its quarantine area. You may have to whitelist **xmr-stak** in your antivirus. 
## Change Currency to Mine diff --git a/doc/compile_Windows.md b/doc/compile_Windows.md index 0ce4a0c..c9a8ff7 100644 --- a/doc/compile_Windows.md +++ b/doc/compile_Windows.md @@ -12,7 +12,7 @@ - download VS2017 Community and install from [https://www.visualstudio.com/downloads/](https://www.visualstudio.com/downloads/) - during the install chose the components - `Desktop development with C++` (left side) - - `Toolset for Visual Studio C++ 2015.3 v140...` (right side) + - `VC++ 2015.3 v140 toolset for desktop` (right side) ### CMake for Win64 diff --git a/doc/pgp_keys.md b/doc/pgp_keys.md new file mode 100644 index 0000000..69ab41a --- /dev/null +++ b/doc/pgp_keys.md @@ -0,0 +1,69 @@ +# Developer PGP Key's + +## Key @fireice-uk +``` +-----BEGIN PGP PUBLIC KEY BLOCK----- +Version: GnuPG v2 + +mQENBFhYUmUBCAC6493W5y1MMs38ApRbI11jWUqNdFm686XLkZWGDfYImzL6pEYk +RdWkyt9ziCyA6NUeWFQYniv/z10RxYKq8ulVVJaKb9qPGMU0ESfdxlFNJkU/pf28 +sEVBagGvGw8uFxjQONnBJ7y7iNRWMN7qSRS636wN5ryTHNsmqI4ClXPHkXkDCDUX +QvhXZpG9RRM6jsE3jBGz/LJi3FyZLo/vB60OZBODJ2IA0wSR41RRiOq01OqDueva +9jPoAokNglJfn/CniQ+lqUEXj1vjAZ1D5Mn9fISzA/UPen5Z7Sipaa9aAtsDBOfP +K9iPKOsWa2uTafoyXgiwEVXCCeMMUjCGaoFBABEBAAG0ImZpcmVpY2VfdWsgPGZp +cmVpY2UueG1yQGdtYWlsLmNvbT6JATcEEwEIACEFAlhYUmUCGwMFCwkIBwIGFQgJ +CgsCBBYCAwECHgECF4AACgkQ+yT3mn7UHDTEcQf8CMhqaZ0IOBxeBnsq5HZr2X6z +E5bODp5cPs6ha1tjH3CWpk1AFeykNtXH7kPW9hcDt/e4UQtcHs+lu6YU59X7xLJQ +udOkpWdmooJMXRWS/zeeon4ivT9d69jNnwubh8EJOyw8xm/se6n48BcewfHekW/6 +mVrbhLbF1dnuUGXzRN1WxsUZx3uJd2UvrkJhAtHtX92/qIVhT0+3PXV0bmpHURlK +YKhhm8dPLV9jPX8QVRHQXCOHSMqy/KoWEe6CnT0Isbkq3JtS3K4VBVeTX9gkySRc +IFxrNJdXsI9BxKv4O8yajP8DohpoGLMDKZKSO0yq0BRMgMh0cw6Lk22uyulGALkB +DQRYWFJlAQgAqikfViOmIccCZKVMZfNHjnigKtQqNrbJpYZCOImql4FqbZu9F7TD +9HIXA43SPcwziWlyazSy8Pa9nCpc6PuPPO1wxAaNIc5nt+w/x2EGGTIFGjRoubmP +3i5jZzOFYsvR2W3PgVa3/ujeYYJYo1oeVeuGmmJRejs0rp1mbvBSKw1Cq6C4cI0x +GTY1yXFGLIgdfYNMmiLsTy1Qwq8YStbFKeUYAMMG3128SAIaT3Eet911f5Jx4tC8 +6kWUr6PX1rQ0LQJqyIsLq9U53XybUksRfJC9IEfgvgBxRBHSD8WfqEhHjhW1VsZG 
+dcYgr7A1PIneWsCEY+5VUnqTlt2HPaKweQARAQABiQEfBBgBCAAJBQJYWFJlAhsM +AAoJEPsk95p+1Bw0Pr8H/0vZ6U2zaih03jOHOvsrYxRfDXSmgudOp1VS45aHIREd +2nrJ+drleeFVyb14UQqO/6iX9GuDX2yBEHdCg2aljeP98AaMU//RiEtebE6CUWsL +HPVXHIkxwBCBe0YkJINHUQqLz/5f6qLsNUp1uTH2++zhdBWvg+gErTYbx8aFMFYH +0GoOtqE5rtlAh5MTvDZm+UcDwKJCxhrLaN3R3dDoyrDNRTgHQQuX5/opJBiUnVNK +d+vugnxzpMIJQP11yCZkz/KxV8zQ2QPMuZdAoh3znd/vGCJcp0rWphn4pqxA4vDp +c4hC0Yg9Dha1OoE5CJCqVL+ic4vAyB1urAwBlsd/wH8= +=B5I+ +-----END PGP PUBLIC KEY BLOCK----- +``` + +## Key @psychocrypt +``` +-----BEGIN PGP PUBLIC KEY BLOCK----- +Version: GnuPG v2 + +mQENBFoq84cBCACm4moXhW5kLRByCh7q1lZz/RP88fINfX2jsxS3QyagjsAD7yxy +CZPlwF+NsZTX/jVbMTUTHDO1KEejWO4peYYeJT50BcTS7MOF+O3EVI3j/G45v5L5 +yI0MOgsQFVM4k5A9n0W+oULZK7ejiBSolzSG7PbwMpUjUSMEl4boUd5zFPNq/kpo +OMmJR/Q17LOqvnQsbkQDUprl+qvbD5q2xEFPNKt3KmUUEOF8a1dIDkUZmXEklLFp +cUjLTviscgP1+Mfyasz6cAjfaN+7IwYS+vGnFcwXx93sIq4J3wkpgpyMNdtoK1hY +ALJxzk8TF7NRFU68uIqDGrROEDa5asW9L75DABEBAAG0JnBzeWNob2NyeXB0IDxw +c3ljaG9jcnlwdGhwY0BnbWFpbC5jb20+iQE3BBMBCAAhBQJaKvOHAhsDBQsJCAcC +BhUICQoLAgQWAgMBAh4BAheAAAoJEAUWOMCIZelDeLoH/j+nZE3E636tKvHoP2Uv +7PG1dP9F+fTHhru53iVIxR+UXubobgAYH8lOo7yBuO+JGWDf4KjyNRrRf/To1xD5 +udpU6BrJ8iS3MTPk0jQ1delk+7jaFLXaJbQPdOVRR7dddi32j3Vw6wPaCWhK8xt/ +xDIEJJ6TlSwNBiYIriLa2uB5q0DhwdWBsZqna5xhp2jihxhtEUXs4IkkIETIVs8e +FIzqxNALUNWRit2Bm1Etm4KId9GV5N6eyjekYUk3zGLivsyTHbl6XhNFmQk8UzIP +N5OjcJb1UFr7Q43kRvUGDZEh08l+k5P2qQ1y3g1WypcPsfSh/+XMeCe27DaLeOwZ +SD25AQ0EWirzhwEIANUkGzShhAscwJt5L/huftn/TQYaBIcYtKLYyIyQsG7c3/bO +aNB4t5ZkYBPrVRyqRcnaJffIvi4oq+wSnCUn++jXZbH1OrSCZhcPBsdvgHz0KV9D +71KpJ2p9cdjqO6MWM7DrKy30QNSf5eiDzhqTB4NMKLidgGIDCz7ahFZDH2vONaOn +1A8WFXMy06lFWsYTe4TjpPOG5ZFHhSNsTthYp7sUgLwvThKhXRl0nM5C9mwShw8I +9r5/gbzEvyJJx0anJLHyka2AYFtrCcK9WGx4I6PsTfj318T1bVzWfjnq9FXZ1p3c +UeTYX8f92EqAs0dcvC0hrNWyhNr2D4G+YF6KAysAEQEAAYkBHwQYAQgACQUCWirz +hwIbDAAKCRAFFjjAiGXpQ5CPCACCuhM8SbtnG83oQJbxWA63DdXZPTyuFGbiMRS4 +9EJkx7hVu94NJEaJMFGETn+qKLjaV+QtsBK/ZtZBRbKoqBhFzbRt8NOfC26JHEx0 
+tdrBb4Ct8SAPhEhZDZFJt3kac038E3mBeXDoDAqdoltqG8C24uk99QHJwAhjWNb9 +uOMTGcm/j7ieyGF87bMKCdnTDXWABTAUbeBTD+MCfyEJgeMa6G3LWSsoj2cOwj0K +Nla4ixBctXWPewAyobNaN+EGJj99TMuz/3EMtxSzh//u2czenic3IUzoG1jSWwi1 ++5AETDxKdVzpZYolUBYZGmnsStLvyh/+n6Xt19LM1+NBos1y +=JTYP +-----END PGP PUBLIC KEY BLOCK----- +``` diff --git a/doc/tuning.md b/doc/tuning.md index 8b28a43..53e682b 100644 --- a/doc/tuning.md +++ b/doc/tuning.md @@ -10,6 +10,8 @@ * [Add more GPUs](#add-more-gpus) * [Increase Memory Pool](#increase-memory-pool) * [Scratchpad Indexing](#scratchpad-indexing) +* [CPU Backend](#cpu-backend) + * [Choose Value for `low_power_mode`](#choose-value-for-low_power_mode) ## Windows "Run As Administrator" prompt (UAC) confirmation is needed to use large pages on Windows 7. @@ -93,3 +95,15 @@ export GPU_SINGLE_ALLOC_PERCENT=99 The layout of the hash scratchpad memory can be changed for each GPU with the option `strided_index` in `amd.txt`. Try to change the value from the default `true` to `false`. + +## CPU Backend + +The CPU backend can be tuned in the config file `cpu.txt`. + +### Choose Value for `low_power_mode` + +The optimal value for `low_power_mode` depends on the cache size of your CPU, and the number of threads. + +The `low_power_mode` can be set to a number between `1` and `5`. When set to a value `N` greater than `1`, this mode increases the single thread performance by `N` times, but also requires at least `2*N` MB of cache per thread. It can also be set to `false` or `true`. The value `false` is equivalent to `1`, and `true` is equivalent to `2`. + +This setting is particularly useful for CPUs with a very large cache. For example, the Intel Crystal Well processors are equipped with 128 MB of L4 cache, enough to run 8 threads at an optimal `low_power_mode` value of `5`.
diff --git a/xmrstak/backend/cpu/config.tpl b/xmrstak/backend/cpu/config.tpl index 990a31d..b21a22d 100644 --- a/xmrstak/backend/cpu/config.tpl +++ b/xmrstak/backend/cpu/config.tpl @@ -1,9 +1,11 @@ R"===( /* * Thread configuration for each thread. Make sure it matches the number above. - * low_power_mode - This mode will double the cache usage, and double the single thread performance. It will + * low_power_mode - This can either be a boolean (true or false), or a number between 1 and 5. When set to true, + * this mode will double the cache usage, and double the single thread performance. It will * consume much less power (as less cores are working), but will max out at around 80-85% of - * the maximum performance. + * the maximum performance. When set to a number N greater than 1, this mode will increase the + * cache usage and single thread performance by N times. * * no_prefetch - Some sytems can gain up to extra 5% here, but sometimes it will have no difference or make * things slower. diff --git a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h index 2a6a769..9b6e1dc 100644 --- a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h +++ b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h @@ -353,19 +353,19 @@ void cryptonight_hash(const void* input, size_t len, void* output, cryptonight_c // to fit temporary vars for two contexts.
Function will read len*2 from input and write 64 bytes to output // We are still limited by L3 cache, so doubling will only work with CPUs where we have more than 2MB to core (Xeons) template<size_t MASK, size_t ITERATIONS, size_t MEM, bool SOFT_AES, bool PREFETCH> -void cryptonight_double_hash(const void* input, size_t len, void* output, cryptonight_ctx* __restrict ctx0, cryptonight_ctx* __restrict ctx1) +void cryptonight_double_hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx) { - keccak((const uint8_t *)input, len, ctx0->hash_state, 200); - keccak((const uint8_t *)input+len, len, ctx1->hash_state, 200); + keccak((const uint8_t *)input, len, ctx[0]->hash_state, 200); + keccak((const uint8_t *)input+len, len, ctx[1]->hash_state, 200); // Optim - 99% time boundary - cn_explode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx0->hash_state, (__m128i*)ctx0->long_state); - cn_explode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx1->hash_state, (__m128i*)ctx1->long_state); + cn_explode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx[0]->hash_state, (__m128i*)ctx[0]->long_state); + cn_explode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx[1]->hash_state, (__m128i*)ctx[1]->long_state); - uint8_t* l0 = ctx0->long_state; - uint64_t* h0 = (uint64_t*)ctx0->hash_state; - uint8_t* l1 = ctx1->long_state; - uint64_t* h1 = (uint64_t*)ctx1->hash_state; + uint8_t* l0 = ctx[0]->long_state; + uint64_t* h0 = (uint64_t*)ctx[0]->hash_state; + uint8_t* l1 = ctx[1]->long_state; + uint64_t* h1 = (uint64_t*)ctx[1]->hash_state; uint64_t axl0 = h0[0] ^ h0[4]; uint64_t axh0 = h0[1] ^ h0[5]; @@ -444,13 +444,305 @@ void cryptonight_double_hash(const void* input, size_t len, void* output, crypto } // Optim - 90% time boundary - cn_implode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx0->long_state, (__m128i*)ctx0->hash_state); - cn_implode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx1->long_state, (__m128i*)ctx1->hash_state); + cn_implode_scratchpad<MEM, 
SOFT_AES, PREFETCH>((__m128i*)ctx[0]->long_state, (__m128i*)ctx[0]->hash_state); + cn_implode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx[1]->long_state, (__m128i*)ctx[1]->hash_state); // Optim - 99% time boundary - keccakf((uint64_t*)ctx0->hash_state, 24); - extra_hashes[ctx0->hash_state[0] & 3](ctx0->hash_state, 200, (char*)output); - keccakf((uint64_t*)ctx1->hash_state, 24); - extra_hashes[ctx1->hash_state[0] & 3](ctx1->hash_state, 200, (char*)output + 32); + keccakf((uint64_t*)ctx[0]->hash_state, 24); + extra_hashes[ctx[0]->hash_state[0] & 3](ctx[0]->hash_state, 200, (char*)output); + keccakf((uint64_t*)ctx[1]->hash_state, 24); + extra_hashes[ctx[1]->hash_state[0] & 3](ctx[1]->hash_state, 200, (char*)output + 32); +} + +#define CN_STEP1(a, b, c, l, ptr, idx) \ + a = _mm_xor_si128(a, c); \ + idx = _mm_cvtsi128_si64(a); \ + ptr = (__m128i *)&l[idx & MASK]; \ + if(PREFETCH) \ + _mm_prefetch((const char*)ptr, _MM_HINT_T0); \ + c = _mm_load_si128(ptr) + +#define CN_STEP2(a, b, c, l, ptr, idx) \ + if(SOFT_AES) \ + c = soft_aesenc(c, a); \ + else \ + c = _mm_aesenc_si128(c, a); \ + b = _mm_xor_si128(b, c); \ + _mm_store_si128(ptr, b) + +#define CN_STEP3(a, b, c, l, ptr, idx) \ + idx = _mm_cvtsi128_si64(c); \ + ptr = (__m128i *)&l[idx & MASK]; \ + if(PREFETCH) \ + _mm_prefetch((const char*)ptr, _MM_HINT_T0); \ + b = _mm_load_si128(ptr) + +#define CN_STEP4(a, b, c, l, ptr, idx) \ + lo = _umul128(idx, _mm_cvtsi128_si64(b), &hi); \ + a = _mm_add_epi64(a, _mm_set_epi64x(lo, hi)); \ + _mm_store_si128(ptr, a) + +// This lovelier creation will do 3 cn hashes at a time. 
+template<size_t MASK, size_t ITERATIONS, size_t MEM, bool SOFT_AES, bool PREFETCH> +void cryptonight_triple_hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx) +{ + for (size_t i = 0; i < 3; i++) + { + keccak((const uint8_t *)input + len * i, len, ctx[i]->hash_state, 200); + cn_explode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx[i]->hash_state, (__m128i*)ctx[i]->long_state); + } + + uint8_t* l0 = ctx[0]->long_state; + uint64_t* h0 = (uint64_t*)ctx[0]->hash_state; + uint8_t* l1 = ctx[1]->long_state; + uint64_t* h1 = (uint64_t*)ctx[1]->hash_state; + uint8_t* l2 = ctx[2]->long_state; + uint64_t* h2 = (uint64_t*)ctx[2]->hash_state; + + __m128i ax0 = _mm_set_epi64x(h0[1] ^ h0[5], h0[0] ^ h0[4]); + __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); + __m128i ax1 = _mm_set_epi64x(h1[1] ^ h1[5], h1[0] ^ h1[4]); + __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); + __m128i ax2 = _mm_set_epi64x(h2[1] ^ h2[5], h2[0] ^ h2[4]); + __m128i bx2 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]); + __m128i cx0 = _mm_set_epi64x(0, 0); + __m128i cx1 = _mm_set_epi64x(0, 0); + __m128i cx2 = _mm_set_epi64x(0, 0); + + for (size_t i = 0; i < ITERATIONS/2; i++) + { + uint64_t idx0, idx1, idx2, hi, lo; + __m128i *ptr0, *ptr1, *ptr2; + + // EVEN ROUND + CN_STEP1(ax0, bx0, cx0, l0, ptr0, idx0); + CN_STEP1(ax1, bx1, cx1, l1, ptr1, idx1); + CN_STEP1(ax2, bx2, cx2, l2, ptr2, idx2); + + CN_STEP2(ax0, bx0, cx0, l0, ptr0, idx0); + CN_STEP2(ax1, bx1, cx1, l1, ptr1, idx1); + CN_STEP2(ax2, bx2, cx2, l2, ptr2, idx2); + + CN_STEP3(ax0, bx0, cx0, l0, ptr0, idx0); + CN_STEP3(ax1, bx1, cx1, l1, ptr1, idx1); + CN_STEP3(ax2, bx2, cx2, l2, ptr2, idx2); + + CN_STEP4(ax0, bx0, cx0, l0, ptr0, idx0); + CN_STEP4(ax1, bx1, cx1, l1, ptr1, idx1); + CN_STEP4(ax2, bx2, cx2, l2, ptr2, idx2); + + // ODD ROUND + CN_STEP1(ax0, cx0, bx0, l0, ptr0, idx0); + CN_STEP1(ax1, cx1, bx1, l1, ptr1, idx1); + CN_STEP1(ax2, cx2, bx2, l2, ptr2, idx2); + + CN_STEP2(ax0, cx0, bx0, l0, ptr0, 
idx0); + CN_STEP2(ax1, cx1, bx1, l1, ptr1, idx1); + CN_STEP2(ax2, cx2, bx2, l2, ptr2, idx2); + + CN_STEP3(ax0, cx0, bx0, l0, ptr0, idx0); + CN_STEP3(ax1, cx1, bx1, l1, ptr1, idx1); + CN_STEP3(ax2, cx2, bx2, l2, ptr2, idx2); + + CN_STEP4(ax0, cx0, bx0, l0, ptr0, idx0); + CN_STEP4(ax1, cx1, bx1, l1, ptr1, idx1); + CN_STEP4(ax2, cx2, bx2, l2, ptr2, idx2); + } + + for (size_t i = 0; i < 3; i++) + { + cn_implode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx[i]->long_state, (__m128i*)ctx[i]->hash_state); + keccakf((uint64_t*)ctx[i]->hash_state, 24); + extra_hashes[ctx[i]->hash_state[0] & 3](ctx[i]->hash_state, 200, (char*)output + 32 * i); + } +} + +// This even lovelier creation will do 4 cn hashes at a time. +template<size_t MASK, size_t ITERATIONS, size_t MEM, bool SOFT_AES, bool PREFETCH> +void cryptonight_quad_hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx) +{ + for (size_t i = 0; i < 4; i++) + { + keccak((const uint8_t *)input + len * i, len, ctx[i]->hash_state, 200); + cn_explode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx[i]->hash_state, (__m128i*)ctx[i]->long_state); + } + + uint8_t* l0 = ctx[0]->long_state; + uint64_t* h0 = (uint64_t*)ctx[0]->hash_state; + uint8_t* l1 = ctx[1]->long_state; + uint64_t* h1 = (uint64_t*)ctx[1]->hash_state; + uint8_t* l2 = ctx[2]->long_state; + uint64_t* h2 = (uint64_t*)ctx[2]->hash_state; + uint8_t* l3 = ctx[3]->long_state; + uint64_t* h3 = (uint64_t*)ctx[3]->hash_state; + + __m128i ax0 = _mm_set_epi64x(h0[1] ^ h0[5], h0[0] ^ h0[4]); + __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); + __m128i ax1 = _mm_set_epi64x(h1[1] ^ h1[5], h1[0] ^ h1[4]); + __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); + __m128i ax2 = _mm_set_epi64x(h2[1] ^ h2[5], h2[0] ^ h2[4]); + __m128i bx2 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]); + __m128i ax3 = _mm_set_epi64x(h3[1] ^ h3[5], h3[0] ^ h3[4]); + __m128i bx3 = _mm_set_epi64x(h3[3] ^ h3[7], h3[2] ^ h3[6]); + __m128i cx0 = _mm_set_epi64x(0, 0); 
+ __m128i cx1 = _mm_set_epi64x(0, 0); + __m128i cx2 = _mm_set_epi64x(0, 0); + __m128i cx3 = _mm_set_epi64x(0, 0); + + for (size_t i = 0; i < ITERATIONS/2; i++) + { + uint64_t idx0, idx1, idx2, idx3, hi, lo; + __m128i *ptr0, *ptr1, *ptr2, *ptr3; + + // EVEN ROUND + CN_STEP1(ax0, bx0, cx0, l0, ptr0, idx0); + CN_STEP1(ax1, bx1, cx1, l1, ptr1, idx1); + CN_STEP1(ax2, bx2, cx2, l2, ptr2, idx2); + CN_STEP1(ax3, bx3, cx3, l3, ptr3, idx3); + + CN_STEP2(ax0, bx0, cx0, l0, ptr0, idx0); + CN_STEP2(ax1, bx1, cx1, l1, ptr1, idx1); + CN_STEP2(ax2, bx2, cx2, l2, ptr2, idx2); + CN_STEP2(ax3, bx3, cx3, l3, ptr3, idx3); + + CN_STEP3(ax0, bx0, cx0, l0, ptr0, idx0); + CN_STEP3(ax1, bx1, cx1, l1, ptr1, idx1); + CN_STEP3(ax2, bx2, cx2, l2, ptr2, idx2); + CN_STEP3(ax3, bx3, cx3, l3, ptr3, idx3); + + CN_STEP4(ax0, bx0, cx0, l0, ptr0, idx0); + CN_STEP4(ax1, bx1, cx1, l1, ptr1, idx1); + CN_STEP4(ax2, bx2, cx2, l2, ptr2, idx2); + CN_STEP4(ax3, bx3, cx3, l3, ptr3, idx3); + + // ODD ROUND + CN_STEP1(ax0, cx0, bx0, l0, ptr0, idx0); + CN_STEP1(ax1, cx1, bx1, l1, ptr1, idx1); + CN_STEP1(ax2, cx2, bx2, l2, ptr2, idx2); + CN_STEP1(ax3, cx3, bx3, l3, ptr3, idx3); + + CN_STEP2(ax0, cx0, bx0, l0, ptr0, idx0); + CN_STEP2(ax1, cx1, bx1, l1, ptr1, idx1); + CN_STEP2(ax2, cx2, bx2, l2, ptr2, idx2); + CN_STEP2(ax3, cx3, bx3, l3, ptr3, idx3); + + CN_STEP3(ax0, cx0, bx0, l0, ptr0, idx0); + CN_STEP3(ax1, cx1, bx1, l1, ptr1, idx1); + CN_STEP3(ax2, cx2, bx2, l2, ptr2, idx2); + CN_STEP3(ax3, cx3, bx3, l3, ptr3, idx3); + + CN_STEP4(ax0, cx0, bx0, l0, ptr0, idx0); + CN_STEP4(ax1, cx1, bx1, l1, ptr1, idx1); + CN_STEP4(ax2, cx2, bx2, l2, ptr2, idx2); + CN_STEP4(ax3, cx3, bx3, l3, ptr3, idx3); + } + + for (size_t i = 0; i < 4; i++) + { + cn_implode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx[i]->long_state, (__m128i*)ctx[i]->hash_state); + keccakf((uint64_t*)ctx[i]->hash_state, 24); + extra_hashes[ctx[i]->hash_state[0] & 3](ctx[i]->hash_state, 200, (char*)output + 32 * i); + } +} + +// This most lovely creation 
will do 5 cn hashes at a time. +template<size_t MASK, size_t ITERATIONS, size_t MEM, bool SOFT_AES, bool PREFETCH> +void cryptonight_penta_hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx) +{ + for (size_t i = 0; i < 5; i++) + { + keccak((const uint8_t *)input + len * i, len, ctx[i]->hash_state, 200); + cn_explode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx[i]->hash_state, (__m128i*)ctx[i]->long_state); + } + + uint8_t* l0 = ctx[0]->long_state; + uint64_t* h0 = (uint64_t*)ctx[0]->hash_state; + uint8_t* l1 = ctx[1]->long_state; + uint64_t* h1 = (uint64_t*)ctx[1]->hash_state; + uint8_t* l2 = ctx[2]->long_state; + uint64_t* h2 = (uint64_t*)ctx[2]->hash_state; + uint8_t* l3 = ctx[3]->long_state; + uint64_t* h3 = (uint64_t*)ctx[3]->hash_state; + uint8_t* l4 = ctx[4]->long_state; + uint64_t* h4 = (uint64_t*)ctx[4]->hash_state; + + __m128i ax0 = _mm_set_epi64x(h0[1] ^ h0[5], h0[0] ^ h0[4]); + __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); + __m128i ax1 = _mm_set_epi64x(h1[1] ^ h1[5], h1[0] ^ h1[4]); + __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); + __m128i ax2 = _mm_set_epi64x(h2[1] ^ h2[5], h2[0] ^ h2[4]); + __m128i bx2 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]); + __m128i ax3 = _mm_set_epi64x(h3[1] ^ h3[5], h3[0] ^ h3[4]); + __m128i bx3 = _mm_set_epi64x(h3[3] ^ h3[7], h3[2] ^ h3[6]); + __m128i ax4 = _mm_set_epi64x(h4[1] ^ h4[5], h4[0] ^ h4[4]); + __m128i bx4 = _mm_set_epi64x(h4[3] ^ h4[7], h4[2] ^ h4[6]); + __m128i cx0 = _mm_set_epi64x(0, 0); + __m128i cx1 = _mm_set_epi64x(0, 0); + __m128i cx2 = _mm_set_epi64x(0, 0); + __m128i cx3 = _mm_set_epi64x(0, 0); + __m128i cx4 = _mm_set_epi64x(0, 0); + + for (size_t i = 0; i < ITERATIONS/2; i++) + { + uint64_t idx0, idx1, idx2, idx3, idx4, hi, lo; + __m128i *ptr0, *ptr1, *ptr2, *ptr3, *ptr4; + + // EVEN ROUND + CN_STEP1(ax0, bx0, cx0, l0, ptr0, idx0); + CN_STEP1(ax1, bx1, cx1, l1, ptr1, idx1); + CN_STEP1(ax2, bx2, cx2, l2, ptr2, idx2); + CN_STEP1(ax3, bx3, cx3, l3, 
ptr3, idx3); + CN_STEP1(ax4, bx4, cx4, l4, ptr4, idx4); + + CN_STEP2(ax0, bx0, cx0, l0, ptr0, idx0); + CN_STEP2(ax1, bx1, cx1, l1, ptr1, idx1); + CN_STEP2(ax2, bx2, cx2, l2, ptr2, idx2); + CN_STEP2(ax3, bx3, cx3, l3, ptr3, idx3); + CN_STEP2(ax4, bx4, cx4, l4, ptr4, idx4); + + CN_STEP3(ax0, bx0, cx0, l0, ptr0, idx0); + CN_STEP3(ax1, bx1, cx1, l1, ptr1, idx1); + CN_STEP3(ax2, bx2, cx2, l2, ptr2, idx2); + CN_STEP3(ax3, bx3, cx3, l3, ptr3, idx3); + CN_STEP3(ax4, bx4, cx4, l4, ptr4, idx4); + + CN_STEP4(ax0, bx0, cx0, l0, ptr0, idx0); + CN_STEP4(ax1, bx1, cx1, l1, ptr1, idx1); + CN_STEP4(ax2, bx2, cx2, l2, ptr2, idx2); + CN_STEP4(ax3, bx3, cx3, l3, ptr3, idx3); + CN_STEP4(ax4, bx4, cx4, l4, ptr4, idx4); + + // ODD ROUND + CN_STEP1(ax0, cx0, bx0, l0, ptr0, idx0); + CN_STEP1(ax1, cx1, bx1, l1, ptr1, idx1); + CN_STEP1(ax2, cx2, bx2, l2, ptr2, idx2); + CN_STEP1(ax3, cx3, bx3, l3, ptr3, idx3); + CN_STEP1(ax4, cx4, bx4, l4, ptr4, idx4); + + CN_STEP2(ax0, cx0, bx0, l0, ptr0, idx0); + CN_STEP2(ax1, cx1, bx1, l1, ptr1, idx1); + CN_STEP2(ax2, cx2, bx2, l2, ptr2, idx2); + CN_STEP2(ax3, cx3, bx3, l3, ptr3, idx3); + CN_STEP2(ax4, cx4, bx4, l4, ptr4, idx4); + + CN_STEP3(ax0, cx0, bx0, l0, ptr0, idx0); + CN_STEP3(ax1, cx1, bx1, l1, ptr1, idx1); + CN_STEP3(ax2, cx2, bx2, l2, ptr2, idx2); + CN_STEP3(ax3, cx3, bx3, l3, ptr3, idx3); + CN_STEP3(ax4, cx4, bx4, l4, ptr4, idx4); + + CN_STEP4(ax0, cx0, bx0, l0, ptr0, idx0); + CN_STEP4(ax1, cx1, bx1, l1, ptr1, idx1); + CN_STEP4(ax2, cx2, bx2, l2, ptr2, idx2); + CN_STEP4(ax3, cx3, bx3, l3, ptr3, idx3); + CN_STEP4(ax4, cx4, bx4, l4, ptr4, idx4); + } + + for (size_t i = 0; i < 5; i++) + { + cn_implode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx[i]->long_state, (__m128i*)ctx[i]->hash_state); + keccakf((uint64_t*)ctx[i]->hash_state, 24); + extra_hashes[ctx[i]->hash_state[0] & 3](ctx[i]->hash_state, 200, (char*)output + 32 * i); + } } diff --git a/xmrstak/backend/cpu/jconf.cpp b/xmrstak/backend/cpu/jconf.cpp index 2ded8c0..6e709bd 100644 --- 
a/xmrstak/backend/cpu/jconf.cpp +++ b/xmrstak/backend/cpu/jconf.cpp @@ -116,7 +116,10 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg) if(mode == nullptr || no_prefetch == nullptr || aff == nullptr) return false; - if(!mode->IsBool() || !no_prefetch->IsBool()) + if(!mode->IsBool() && !mode->IsNumber()) + return false; + + if(!no_prefetch->IsBool()) return false; if(!aff->IsNumber() && !aff->IsBool()) @@ -125,7 +128,11 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg) if(aff->IsNumber() && aff->GetInt64() < 0) return false; - cfg.bDoubleMode = mode->GetBool(); + if(mode->IsNumber()) + cfg.iMultiway = (int)mode->GetInt64(); + else + cfg.iMultiway = mode->GetBool() ? 2 : 1; + cfg.bNoPrefetch = no_prefetch->GetBool(); if(aff->IsNumber()) diff --git a/xmrstak/backend/cpu/jconf.hpp b/xmrstak/backend/cpu/jconf.hpp index f843ed4..e98ed16 100644 --- a/xmrstak/backend/cpu/jconf.hpp +++ b/xmrstak/backend/cpu/jconf.hpp @@ -22,7 +22,7 @@ public: bool parse_config(const char* sFilename = params::inst().configFileCPU.c_str()); struct thd_cfg { - bool bDoubleMode; + int iMultiway; bool bNoPrefetch; long long iCpuAff; }; diff --git a/xmrstak/backend/cpu/minethd.cpp b/xmrstak/backend/cpu/minethd.cpp index cbb01f9..1c0e491 100644 --- a/xmrstak/backend/cpu/minethd.cpp +++ b/xmrstak/backend/cpu/minethd.cpp @@ -92,7 +92,7 @@ bool minethd::thd_setaffinity(std::thread::native_handle_type h, uint64_t cpu_id #endif } -minethd::minethd(miner_work& pWork, size_t iNo, bool double_work, bool no_prefetch, int64_t affinity) +minethd::minethd(miner_work& pWork, size_t iNo, int iMultiway, bool no_prefetch, int64_t affinity) { this->backendType = iBackend::CPU; oWork = pWork; @@ -105,10 +105,25 @@ minethd::minethd(miner_work& pWork, size_t iNo, bool double_work, bool no_prefet std::unique_lock<std::mutex> lck(thd_aff_set); std::future<void> order_guard = order_fix.get_future(); - if(double_work) + switch (iMultiway) + { + case 5: + oWorkThd = std::thread(&minethd::penta_work_main, 
this); + break; + case 4: + oWorkThd = std::thread(&minethd::quad_work_main, this); + break; + case 3: + oWorkThd = std::thread(&minethd::triple_work_main, this); + break; + case 2: oWorkThd = std::thread(&minethd::double_work_main, this); - else + break; + case 1: + default: oWorkThd = std::thread(&minethd::work_main, this); + break; + } order_guard.wait(); @@ -154,6 +169,7 @@ cryptonight_ctx* minethd::minethd_alloc_ctx() return nullptr; //Should never happen } +static constexpr size_t MAX_N = 5; bool minethd::self_test() { alloc_msg msg = { 0 }; @@ -191,14 +207,15 @@ bool minethd::self_test() if(res == 0 && fatal) return false; - cryptonight_ctx *ctx0, *ctx1; - if((ctx0 = minethd_alloc_ctx()) == nullptr) - return false; - - if((ctx1 = minethd_alloc_ctx()) == nullptr) + cryptonight_ctx *ctx[MAX_N] = {0}; + for (int i = 0; i < MAX_N; i++) { - cryptonight_free_ctx(ctx0); - return false; + if ((ctx[i] = minethd_alloc_ctx()) == nullptr) + { + for (int j = 0; j < i; j++) + cryptonight_free_ctx(ctx[j]); + return false; + } } bool bResult = true; @@ -206,31 +223,52 @@ bool minethd::self_test() bool mineMonero = ::jconf::inst()->IsCurrencyMonero(); if(mineMonero) { - unsigned char out[64]; + unsigned char out[32 * MAX_N]; cn_hash_fun hashf; - cn_hash_fun_dbl hashdf; - + cn_hash_fun_multi hashf_multi; hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, mineMonero); - hashf("This is a test", 14, out, ctx0); + hashf("This is a test", 14, out, ctx[0]); bResult = memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0; hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, mineMonero); - hashf("This is a test", 14, out, ctx0); + hashf("This is a test", 14, out, ctx[0]); bResult &= memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0; - hashdf = 
func_dbl_selector(::jconf::inst()->HaveHardwareAes(), false, mineMonero); - hashdf("The quick brown fox jumps over the lazy dogThe quick brown fox jumps over the lazy log", 43, out, ctx0, ctx1); + hashf_multi = func_multi_selector(2, ::jconf::inst()->HaveHardwareAes(), false, mineMonero); + hashf_multi("The quick brown fox jumps over the lazy dogThe quick brown fox jumps over the lazy log", 43, out, ctx); bResult &= memcmp(out, "\x3e\xbb\x7f\x9f\x7d\x27\x3d\x7c\x31\x8d\x86\x94\x77\x55\x0c\xc8\x00\xcf\xb1\x1b\x0c\xad\xb7\xff\xbd\xf6\xf8\x9f\x3a\x47\x1c\x59" - "\xb4\x77\xd5\x02\xe4\xd8\x48\x7f\x42\xdf\xe3\x8e\xed\x73\x81\x7a\xda\x91\xb7\xe2\x63\xd2\x91\x71\xb6\x5c\x44\x3a\x01\x2a\x41\x22", 64) == 0; + "\xb4\x77\xd5\x02\xe4\xd8\x48\x7f\x42\xdf\xe3\x8e\xed\x73\x81\x7a\xda\x91\xb7\xe2\x63\xd2\x91\x71\xb6\x5c\x44\x3a\x01\x2a\x41\x22", 64) == 0; - hashdf = func_dbl_selector(::jconf::inst()->HaveHardwareAes(), true, mineMonero); - hashdf("The quick brown fox jumps over the lazy dogThe quick brown fox jumps over the lazy log", 43, out, ctx0, ctx1); + hashf_multi = func_multi_selector(2, ::jconf::inst()->HaveHardwareAes(), true, mineMonero); + hashf_multi("The quick brown fox jumps over the lazy dogThe quick brown fox jumps over the lazy log", 43, out, ctx); bResult &= memcmp(out, "\x3e\xbb\x7f\x9f\x7d\x27\x3d\x7c\x31\x8d\x86\x94\x77\x55\x0c\xc8\x00\xcf\xb1\x1b\x0c\xad\xb7\xff\xbd\xf6\xf8\x9f\x3a\x47\x1c\x59" - "\xb4\x77\xd5\x02\xe4\xd8\x48\x7f\x42\xdf\xe3\x8e\xed\x73\x81\x7a\xda\x91\xb7\xe2\x63\xd2\x91\x71\xb6\x5c\x44\x3a\x01\x2a\x41\x22", 64) == 0; + "\xb4\x77\xd5\x02\xe4\xd8\x48\x7f\x42\xdf\xe3\x8e\xed\x73\x81\x7a\xda\x91\xb7\xe2\x63\xd2\x91\x71\xb6\x5c\x44\x3a\x01\x2a\x41\x22", 64) == 0; + + hashf_multi = func_multi_selector(3, ::jconf::inst()->HaveHardwareAes(), false, mineMonero); + hashf_multi("This is a testThis is a testThis is a test", 14, out, ctx); + bResult &= memcmp(out, 
"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" + "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" + "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 96) == 0; + + hashf_multi = func_multi_selector(4, ::jconf::inst()->HaveHardwareAes(), false, mineMonero); + hashf_multi("This is a testThis is a testThis is a testThis is a test", 14, out, ctx); + bResult &= memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" + "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" + "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" + "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 128) == 0; + + hashf_multi = func_multi_selector(5, ::jconf::inst()->HaveHardwareAes(), false, mineMonero); + hashf_multi("This is a testThis is a testThis is a testThis is a testThis is a test", 14, out, ctx); + bResult &= memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" + "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" + "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" + "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" + 
"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 160) == 0; } - cryptonight_free_ctx(ctx0); - cryptonight_free_ctx(ctx1); + + for (int i = 0; i < MAX_N; i++) + cryptonight_free_ctx(ctx[i]); if(!bResult) printer::inst()->print_msg(L0, @@ -272,12 +310,12 @@ std::vector<iBackend*> minethd::thread_starter(uint32_t threadOffset, miner_work printer::inst()->print_msg(L1, "WARNING on MacOS thread affinity is only advisory."); #endif - printer::inst()->print_msg(L1, "Starting %s thread, affinity: %d.", cfg.bDoubleMode ? "double" : "single", (int)cfg.iCpuAff); + printer::inst()->print_msg(L1, "Starting %dx thread, affinity: %d.", cfg.iMultiway, (int)cfg.iCpuAff); } else - printer::inst()->print_msg(L1, "Starting %s thread, no affinity.", cfg.bDoubleMode ? "double" : "single"); + printer::inst()->print_msg(L1, "Starting %dx thread, no affinity.", cfg.iMultiway); - minethd* thd = new minethd(pWork, i + threadOffset, cfg.bDoubleMode, cfg.bNoPrefetch, cfg.iCpuAff); + minethd* thd = new minethd(pWork, i + threadOffset, cfg.iMultiway, cfg.bNoPrefetch, cfg.iCpuAff); pvThreads.push_back(thd); } @@ -326,7 +364,7 @@ minethd::cn_hash_fun minethd::func_selector(bool bHaveAes, bool bNoPrefetch, boo // define aeon settings #if defined(CONF_NO_AEON) || defined(CONF_NO_MONERO) - // ignore 3rd bit if only on currency is active + // ignore 3rd bit if only one currency is active digit.set(2, 0); #else digit.set(2, !mineMonero); @@ -416,22 +454,34 @@ void minethd::work_main() cryptonight_free_ctx(ctx); } -minethd::cn_hash_fun_dbl minethd::func_dbl_selector(bool bHaveAes, bool bNoPrefetch, bool mineMonero) +minethd::cn_hash_fun_multi minethd::func_multi_selector(size_t N, bool bHaveAes, bool bNoPrefetch, bool mineMonero) { // We have two independent flag bits in the functions // therefore we will build a binary digit and select the // function as a two digit binary - // Digit order SOFT_AES, NO_PREFETCH, 
MINER_ALGO + // Digit order SOFT_AES, NO_PREFETCH - static const cn_hash_fun_dbl func_table[] = { - /* there will be 8 function entries if `CONF_NO_MONERO` and `CONF_NO_AEON` - * is not defined. If one is defined there will be 4 entries. + static const cn_hash_fun_multi func_table[] = { + /* there will be 8*(MAX_N-1) function entries if `CONF_NO_MONERO` and `CONF_NO_AEON` + * is not defined. If one is defined there will be 4*(MAX_N-1) entries. */ #ifndef CONF_NO_MONERO cryptonight_double_hash<MONERO_MASK, MONERO_ITER, MONERO_MEMORY, false, false>, cryptonight_double_hash<MONERO_MASK, MONERO_ITER, MONERO_MEMORY, false, true>, cryptonight_double_hash<MONERO_MASK, MONERO_ITER, MONERO_MEMORY, true, false>, - cryptonight_double_hash<MONERO_MASK, MONERO_ITER, MONERO_MEMORY, true, true> + cryptonight_double_hash<MONERO_MASK, MONERO_ITER, MONERO_MEMORY, true, true>, + cryptonight_triple_hash<MONERO_MASK, MONERO_ITER, MONERO_MEMORY, false, false>, + cryptonight_triple_hash<MONERO_MASK, MONERO_ITER, MONERO_MEMORY, false, true>, + cryptonight_triple_hash<MONERO_MASK, MONERO_ITER, MONERO_MEMORY, true, false>, + cryptonight_triple_hash<MONERO_MASK, MONERO_ITER, MONERO_MEMORY, true, true>, + cryptonight_quad_hash<MONERO_MASK, MONERO_ITER, MONERO_MEMORY, false, false>, + cryptonight_quad_hash<MONERO_MASK, MONERO_ITER, MONERO_MEMORY, false, true>, + cryptonight_quad_hash<MONERO_MASK, MONERO_ITER, MONERO_MEMORY, true, false>, + cryptonight_quad_hash<MONERO_MASK, MONERO_ITER, MONERO_MEMORY, true, true>, + cryptonight_penta_hash<MONERO_MASK, MONERO_ITER, MONERO_MEMORY, false, false>, + cryptonight_penta_hash<MONERO_MASK, MONERO_ITER, MONERO_MEMORY, false, true>, + cryptonight_penta_hash<MONERO_MASK, MONERO_ITER, MONERO_MEMORY, true, false>, + cryptonight_penta_hash<MONERO_MASK, MONERO_ITER, MONERO_MEMORY, true, true> #endif #if (!defined(CONF_NO_AEON)) && (!defined(CONF_NO_MONERO)) // comma will be added only if Monero and Aeon is build @@ -441,33 +491,71 @@ minethd::cn_hash_fun_dbl 
minethd::func_dbl_selector(bool bHaveAes, bool bNoPrefe cryptonight_double_hash<AEON_MASK, AEON_ITER, AEON_MEMORY, false, false>, cryptonight_double_hash<AEON_MASK, AEON_ITER, AEON_MEMORY, false, true>, cryptonight_double_hash<AEON_MASK, AEON_ITER, AEON_MEMORY, true, false>, - cryptonight_double_hash<AEON_MASK, AEON_ITER, AEON_MEMORY, true, true> + cryptonight_double_hash<AEON_MASK, AEON_ITER, AEON_MEMORY, true, true>, + cryptonight_triple_hash<AEON_MASK, AEON_ITER, AEON_MEMORY, false, false>, + cryptonight_triple_hash<AEON_MASK, AEON_ITER, AEON_MEMORY, false, true>, + cryptonight_triple_hash<AEON_MASK, AEON_ITER, AEON_MEMORY, true, false>, + cryptonight_triple_hash<AEON_MASK, AEON_ITER, AEON_MEMORY, true, true>, + cryptonight_quad_hash<AEON_MASK, AEON_ITER, AEON_MEMORY, false, false>, + cryptonight_quad_hash<AEON_MASK, AEON_ITER, AEON_MEMORY, false, true>, + cryptonight_quad_hash<AEON_MASK, AEON_ITER, AEON_MEMORY, true, false>, + cryptonight_quad_hash<AEON_MASK, AEON_ITER, AEON_MEMORY, true, true>, + cryptonight_penta_hash<AEON_MASK, AEON_ITER, AEON_MEMORY, false, false>, + cryptonight_penta_hash<AEON_MASK, AEON_ITER, AEON_MEMORY, false, true>, + cryptonight_penta_hash<AEON_MASK, AEON_ITER, AEON_MEMORY, true, false>, + cryptonight_penta_hash<AEON_MASK, AEON_ITER, AEON_MEMORY, true, true> #endif }; - std::bitset<3> digit; + std::bitset<2> digit; digit.set(0, !bNoPrefetch); digit.set(1, !bHaveAes); // define aeon settings #if defined(CONF_NO_AEON) || defined(CONF_NO_MONERO) - // ignore 3rd bit if only on currency is active - digit.set(2, 0); + // ignore miner algo if only one currency is active + size_t miner_algo_base = 0; #else - digit.set(2, !mineMonero); + size_t miner_algo_base = mineMonero ? 0 : 4*(MAX_N-1); #endif - return func_table[digit.to_ulong()]; + N = (N<2) ? 2 : (N>MAX_N) ? 
MAX_N : N; + return func_table[miner_algo_base + 4*(N-2) + digit.to_ulong()]; } -uint32_t* minethd::prep_double_work(uint8_t bDoubleWorkBlob[sizeof(miner_work::bWorkBlob) * 2]) +void minethd::double_work_main() { - memcpy(bDoubleWorkBlob, oWork.bWorkBlob, oWork.iWorkSize); - memcpy(bDoubleWorkBlob + oWork.iWorkSize, oWork.bWorkBlob, oWork.iWorkSize); - return (uint32_t*)(bDoubleWorkBlob + oWork.iWorkSize + 39); + multiway_work_main<2>(func_multi_selector(2, ::jconf::inst()->HaveHardwareAes(), bNoPrefetch, ::jconf::inst()->IsCurrencyMonero())); } -void minethd::double_work_main() +void minethd::triple_work_main() +{ + multiway_work_main<3>(func_multi_selector(3, ::jconf::inst()->HaveHardwareAes(), bNoPrefetch, ::jconf::inst()->IsCurrencyMonero())); +} + +void minethd::quad_work_main() +{ + multiway_work_main<4>(func_multi_selector(4, ::jconf::inst()->HaveHardwareAes(), bNoPrefetch, ::jconf::inst()->IsCurrencyMonero())); +} + +void minethd::penta_work_main() +{ + multiway_work_main<5>(func_multi_selector(5, ::jconf::inst()->HaveHardwareAes(), bNoPrefetch, ::jconf::inst()->IsCurrencyMonero())); +} + +template<size_t N> +void minethd::prep_multiway_work(uint8_t *bWorkBlob, uint32_t **piNonce) +{ + for (size_t i = 0; i < N; i++) + { + memcpy(bWorkBlob + oWork.iWorkSize * i, oWork.bWorkBlob, oWork.iWorkSize); + if (i > 0) + piNonce[i] = (uint32_t*)(bWorkBlob + oWork.iWorkSize * i + 39); + } +} + +template<size_t N> +void minethd::multiway_work_main(cn_hash_fun_multi hash_fun_multi) { if(affinity >= 0) //-1 means no affinity bindMemoryToNUMANode(affinity); @@ -477,31 +565,26 @@ void minethd::double_work_main() lck.release(); std::this_thread::yield(); - cn_hash_fun_dbl hash_fun; - cryptonight_ctx* ctx0; - cryptonight_ctx* ctx1; + cryptonight_ctx *ctx[MAX_N]; uint64_t iCount = 0; - uint64_t *piHashVal0, *piHashVal1; - uint32_t *piNonce0, *piNonce1; - uint8_t bDoubleHashOut[64]; - uint8_t bDoubleWorkBlob[sizeof(miner_work::bWorkBlob) * 2]; + uint64_t *piHashVal[MAX_N]; + 
uint32_t *piNonce[MAX_N]; + uint8_t bHashOut[MAX_N * 32]; + uint8_t bWorkBlob[sizeof(miner_work::bWorkBlob) * MAX_N]; uint32_t iNonce; job_result res; - hash_fun = func_dbl_selector(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, ::jconf::inst()->IsCurrencyMonero()); - ctx0 = minethd_alloc_ctx(); - ctx1 = minethd_alloc_ctx(); - - piHashVal0 = (uint64_t*)(bDoubleHashOut + 24); - piHashVal1 = (uint64_t*)(bDoubleHashOut + 32 + 24); - piNonce0 = (uint32_t*)(bDoubleWorkBlob + 39); + for (size_t i = 0; i < N; i++) + { + ctx[i] = minethd_alloc_ctx(); + piHashVal[i] = (uint64_t*)(bHashOut + 32 * i + 24); + piNonce[i] = (i == 0) ? (uint32_t*)(bWorkBlob + 39) : nullptr; + } if(!oWork.bStall) - piNonce1 = prep_double_work(bDoubleWorkBlob); - else - piNonce1 = nullptr; + prep_multiway_work<N>(bWorkBlob, piNonce); - globalStates::inst().inst().iConsumeCnt++; + globalStates::inst().iConsumeCnt++; while (bQuit == 0) { @@ -515,55 +598,57 @@ void minethd::double_work_main() std::this_thread::sleep_for(std::chrono::milliseconds(100)); consume_work(); - piNonce1 = prep_double_work(bDoubleWorkBlob); + prep_multiway_work<N>(bWorkBlob, piNonce); continue; } - size_t nonce_ctr = 0; - constexpr size_t nonce_chunk = 4096; //Needs to be a power of 2 + constexpr uint32_t nonce_chunk = 4096; + int64_t nonce_ctr = 0; assert(sizeof(job_result::sJobID) == sizeof(pool_job::sJobID)); if(oWork.bNiceHash) - iNonce = *piNonce0; + iNonce = *piNonce[0]; while (globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo) { - if ((iCount & 0x7) == 0) //Store stats every 16 hashes + if ((iCount++ & 0x7) == 0) //Store stats every 8*N hashes { using namespace std::chrono; uint64_t iStamp = time_point_cast<milliseconds>(high_resolution_clock::now()).time_since_epoch().count(); - iHashCount.store(iCount, std::memory_order_relaxed); + iHashCount.store(iCount * N, std::memory_order_relaxed); iTimestamp.store(iStamp, std::memory_order_relaxed); } - iCount += 2; - - - if((nonce_ctr++ & 
(nonce_chunk/2 - 1)) == 0) + + nonce_ctr -= N; + if(nonce_ctr <= 0) { globalStates::inst().calc_start_nonce(iNonce, oWork.bNiceHash, nonce_chunk); + nonce_ctr = nonce_chunk; } - *piNonce0 = ++iNonce; - *piNonce1 = ++iNonce; + for (size_t i = 0; i < N; i++) + *piNonce[i] = ++iNonce; - hash_fun(bDoubleWorkBlob, oWork.iWorkSize, bDoubleHashOut, ctx0, ctx1); + hash_fun_multi(bWorkBlob, oWork.iWorkSize, bHashOut, ctx); - if (*piHashVal0 < oWork.iTarget) - executor::inst()->push_event(ex_event(job_result(oWork.sJobID, iNonce-1, bDoubleHashOut, iThreadNo), oWork.iPoolId)); - - if (*piHashVal1 < oWork.iTarget) - executor::inst()->push_event(ex_event(job_result(oWork.sJobID, iNonce, bDoubleHashOut + 32, iThreadNo), oWork.iPoolId)); + for (size_t i = 0; i < N; i++) + { + if (*piHashVal[i] < oWork.iTarget) + { + executor::inst()->push_event(ex_event(job_result(oWork.sJobID, iNonce - N + 1 + i, bHashOut + 32 * i, iThreadNo), oWork.iPoolId)); + } + } std::this_thread::yield(); } consume_work(); - piNonce1 = prep_double_work(bDoubleWorkBlob); + prep_multiway_work<N>(bWorkBlob, piNonce); } - cryptonight_free_ctx(ctx0); - cryptonight_free_ctx(ctx1); + for (int i = 0; i < N; i++) + cryptonight_free_ctx(ctx[i]); } } // namespace cpu diff --git a/xmrstak/backend/cpu/minethd.hpp b/xmrstak/backend/cpu/minethd.hpp index 5520d9e..0433d0d 100644 --- a/xmrstak/backend/cpu/minethd.hpp +++ b/xmrstak/backend/cpu/minethd.hpp @@ -29,16 +29,24 @@ public: static cryptonight_ctx* minethd_alloc_ctx(); private: + typedef void (*cn_hash_fun_multi)(const void*, size_t, void*, cryptonight_ctx**); + static cn_hash_fun_multi func_multi_selector(size_t N, bool bHaveAes, bool bNoPrefetch, bool mineMonero); - typedef void (*cn_hash_fun_dbl)(const void*, size_t, void*, cryptonight_ctx* __restrict, cryptonight_ctx* __restrict); - static cn_hash_fun_dbl func_dbl_selector(bool bHaveAes, bool bNoPrefetch, bool mineMonero); + minethd(miner_work& pWork, size_t iNo, int iMultiway, bool no_prefetch, int64_t 
affinity); - minethd(miner_work& pWork, size_t iNo, bool double_work, bool no_prefetch, int64_t affinity); + template<size_t N> + void multiway_work_main(cn_hash_fun_multi hash_fun_multi); + + template<size_t N> + void prep_multiway_work(uint8_t *bWorkBlob, uint32_t **piNonce); void work_main(); void double_work_main(); + void triple_work_main(); + void quad_work_main(); + void penta_work_main(); + void consume_work(); - uint32_t* prep_double_work(uint8_t bDoubleWorkBlob[sizeof(miner_work::bWorkBlob) * 2]); uint64_t iJobNo; |