author | Judemir Ribeiro <jribeiro@venetian.bioinfo.puc.cl> | 2018-04-01 15:58:14 -0300
---|---|---
committer | Judemir Ribeiro <jribeiro@venetian.bioinfo.puc.cl> | 2018-04-01 15:58:14 -0300
commit | 3290a3cd851c0af76aef1b8dcaca73241a5c5761 (patch) |
tree | 11b52d66ab0ce86fa98387450aa6884f282f7738 /xmrstak/backend/cpu |
parent | a036cd81592e3b3de804ba88bb8f94729ab60b7d (diff) |
download | xmr-stak-3290a3cd851c0af76aef1b8dcaca73241a5c5761.zip, xmr-stak-3290a3cd851c0af76aef1b8dcaca73241a5c5761.tar.gz |
Ported xmr-stak 2.3.0 rc to ppc64le
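The core of the port is replacing x86 SSE/AES-NI intrinsics with POWER equivalents: inline `mulld`/`mulhdu` for the 64x64->128-bit multiply, AltiVec `vec_perm`/`vec_xor` for the SSE shuffles, and `__builtin_crypto_vcipher` for `_mm_aesenc_si128`. As a standalone illustration, here is a minimal sketch of the multiply helper the new header defines (compiles only with GCC on ppc64; the `__int128` reference check is mine, not part of the commit):

```cpp
#include <cstdint>
#include <cstdio>

// 64x64 -> 128-bit multiply, as in cryptonight_altivec.h: mulld yields the
// low 64 bits of the product, mulhdu the high 64 bits.
static inline uint64_t umul128_ppc64(uint64_t a, uint64_t b, uint64_t* hi)
{
    uint64_t lo;
    asm("mulld %0, %1, %2"  : "=r"(lo)  : "r"(a), "r"(b));
    asm("mulhdu %0, %1, %2" : "=r"(*hi) : "r"(a), "r"(b));
    return lo;
}

int main()
{
    uint64_t hi;
    uint64_t lo = umul128_ppc64(0xDEADBEEFCAFEBABEull, 0x0123456789ABCDEFull, &hi);
    unsigned __int128 ref = (unsigned __int128)0xDEADBEEFCAFEBABEull * 0x0123456789ABCDEFull;
    printf("match: %d\n", lo == (uint64_t)ref && hi == (uint64_t)(ref >> 64));
    return 0;
}
```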
Diffstat (limited to 'xmrstak/backend/cpu')
-rw-r--r-- | xmrstak/backend/cpu/autoAdjust.hpp | 54
-rw-r--r-- | xmrstak/backend/cpu/autoAdjustHwloc.hpp | 4
-rw-r--r-- | xmrstak/backend/cpu/config.tpl | 8
-rw-r--r-- | xmrstak/backend/cpu/crypto/cryptonight_altivec.h | 1187
-rw-r--r-- | xmrstak/backend/cpu/crypto/cryptonight_common.cpp | 186
-rw-r--r-- | xmrstak/backend/cpu/crypto/soft_aes.hpp | 107
-rw-r--r-- | xmrstak/backend/cpu/jconf.cpp | 9
-rw-r--r-- | xmrstak/backend/cpu/minethd.cpp | 10
8 files changed, 1220 insertions, 345 deletions
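A note on the `be_mode` flag that replaces `no_prefetch` throughout the hunks below: POWER's `vcipher` instruction works on big-endian lane order, so the port keeps two kernel variants. The little-endian path byte-reverses each 16-byte block around every hardware AES round, while the `_be` path keeps the whole scratchpad walk in big-endian order and reverses only at loads and stores. A minimal sketch of that reversal helper (`v_rev` in `cryptonight_altivec.h`), assuming GCC on ppc64le with `-mcpu=power8`; the `main` test harness is mine:

```cpp
#include <altivec.h>
#undef vector
#undef pixel
#undef bool
#include <cstdio>

typedef __vector unsigned char v16u8;

// Reverse the 16 bytes of a vector register, as v_rev does before and
// after __builtin_crypto_vcipher on little-endian hosts.
static inline v16u8 v_rev(v16u8 x)
{
    return vec_perm(x, x, (v16u8){0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,
                                  0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0});
}

int main()
{
    v16u8 in = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
    v16u8 back = v_rev(v_rev(in));
    for (int i = 0; i < 16; i++)
        if (back[i] != in[i]) { puts("mismatch"); return 1; }
    puts("double reversal is identity"); // reversal is its own inverse
    return 0;
}
```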
diff --git a/xmrstak/backend/cpu/autoAdjust.hpp b/xmrstak/backend/cpu/autoAdjust.hpp index db805ec..6134aa7 100644 --- a/xmrstak/backend/cpu/autoAdjust.hpp +++ b/xmrstak/backend/cpu/autoAdjust.hpp @@ -65,7 +65,7 @@ public: if(L3KB_size < halfHashMemSizeKB || L3KB_size > (halfHashMemSizeKB * 2048)) printer::inst()->print_msg(L0, "Autoconf failed: L3 size sanity check failed - %u KB.", L3KB_size); - conf += std::string(" { \"low_power_mode\" : false, \"no_prefetch\" : true, \"affine_to_cpu\" : false },\n"); + conf += std::string(" { \"low_power_mode\" : false, \"be_mode\" : true, \"affine_to_cpu\" : false },\n"); printer::inst()->print_msg(L0, "Autoconf FAILED. Create config for a single thread. Please try to add new ones until the hashrate slows down."); } else @@ -89,7 +89,7 @@ public: conf += std::string(" { \"low_power_mode\" : "); conf += std::string(double_mode ? "true" : "false"); - conf += std::string(", \"no_prefetch\" : true, \"affine_to_cpu\" : "); + conf += std::string(", \"be_mode\" : true, \"affine_to_cpu\" : "); conf += std::to_string(aff_id); conf += std::string(" },\n"); @@ -120,59 +120,13 @@ public: private: bool detectL3Size() { - int32_t cpu_info[4]; - char cpustr[13] = {0}; - - ::jconf::cpuid(0, 0, cpu_info); - memcpy(cpustr, &cpu_info[1], 4); - memcpy(cpustr+4, &cpu_info[3], 4); - memcpy(cpustr+8, &cpu_info[2], 4); - - if(strcmp(cpustr, "GenuineIntel") == 0) - { - ::jconf::cpuid(4, 3, cpu_info); - - if(get_masked(cpu_info[0], 7, 5) != 3) - { - printer::inst()->print_msg(L0, "Autoconf failed: Couln't find L3 cache page."); - return false; - } - - L3KB_size = ((get_masked(cpu_info[1], 31, 22) + 1) * (get_masked(cpu_info[1], 21, 12) + 1) * - (get_masked(cpu_info[1], 11, 0) + 1) * (cpu_info[2] + 1)) / 1024; - - return true; - } - else if(strcmp(cpustr, "AuthenticAMD") == 0) - { - ::jconf::cpuid(0x80000006, 0, cpu_info); - - L3KB_size = get_masked(cpu_info[3], 31, 18) * 512; - - ::jconf::cpuid(1, 0, cpu_info); - if(get_masked(cpu_info[0], 11, 8) < 0x17) //0x17h is Zen - old_amd = true; - - return true; - } - else - { - printer::inst()->print_msg(L0, "Autoconf failed: Unknown CPU type: %s.", cpustr); - return false; - } - } + return false; + } void detectCPUConf() { -#ifdef _WIN32 - SYSTEM_INFO info; - GetSystemInfo(&info); - corecnt = info.dwNumberOfProcessors; - linux_layout = false; -#else corecnt = sysconf(_SC_NPROCESSORS_ONLN); linux_layout = true; -#endif // _WIN32 } int32_t L3KB_size = 0; diff --git a/xmrstak/backend/cpu/autoAdjustHwloc.hpp b/xmrstak/backend/cpu/autoAdjustHwloc.hpp index 568abb5..a9cc2d7 100644 --- a/xmrstak/backend/cpu/autoAdjustHwloc.hpp +++ b/xmrstak/backend/cpu/autoAdjustHwloc.hpp @@ -67,7 +67,7 @@ public: { conf += std::string(" { \"low_power_mode\" : "); conf += std::string((id & 0x8000000) != 0 ? "true" : "false"); - conf += std::string(", \"no_prefetch\" : true, \"affine_to_cpu\" : "); + conf += std::string(", \"be_mode\" : true, \"affine_to_cpu\" : "); conf += std::to_string(id & 0x7FFFFFF); conf += std::string(" },\n"); } @@ -75,7 +75,7 @@ public: catch(const std::runtime_error& err) { // \todo add fallback to default auto adjust - conf += std::string(" { \"low_power_mode\" : false, \"no_prefetch\" : true, \"affine_to_cpu\" : false },\n"); + conf += std::string(" { \"low_power_mode\" : false, \"be_mode\" : true, \"affine_to_cpu\" : false },\n"); printer::inst()->print_msg(L0, "Autoconf FAILED: %s. 
Create config for a single thread.", err.what()); } diff --git a/xmrstak/backend/cpu/config.tpl b/xmrstak/backend/cpu/config.tpl index cb4b950..91a67af 100644 --- a/xmrstak/backend/cpu/config.tpl +++ b/xmrstak/backend/cpu/config.tpl @@ -7,8 +7,8 @@ R"===( * the maximum performance. When set to a number N greater than 1, this mode will increase the * cache usage and single thread performance by N times. * - * no_prefetch - Some sytems can gain up to extra 5% here, but sometimes it will have no difference or make - * things slower. + * be_mode - Power8 likes BE, Power9 does not. + * * * affine_to_cpu - This can be either false (no affinity), or the CPU core number. Note that on hyperthreading * systems it is better to assign threads to physical cores. On Windows this usually means selecting @@ -21,8 +21,8 @@ R"===( * A filled out configuration should look like this: * "cpu_threads_conf" : * [ - * { "low_power_mode" : false, "no_prefetch" : true, "affine_to_cpu" : 0 }, - * { "low_power_mode" : false, "no_prefetch" : true, "affine_to_cpu" : 1 }, + * { "low_power_mode" : false, "be_mode" : true, "affine_to_cpu" : 0 }, + * { "low_power_mode" : false, "be_mode" : true, "affine_to_cpu" : 1 }, * ], * If you do not wish to mine with your CPU(s) then use: * "cpu_threads_conf" : diff --git a/xmrstak/backend/cpu/crypto/cryptonight_altivec.h b/xmrstak/backend/cpu/crypto/cryptonight_altivec.h new file mode 100644 index 0000000..3727a01 --- /dev/null +++ b/xmrstak/backend/cpu/crypto/cryptonight_altivec.h @@ -0,0 +1,1187 @@ +/* + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ * + */ +#pragma once + +#include "cryptonight.h" +#include "xmrstak/backend/cryptonight.hpp" +#include "soft_aes.hpp" +#include <memory.h> +#include <stdio.h> +#include <altivec.h> +#undef vector +#undef pixel +#undef bool +typedef __vector unsigned char __m128i; +typedef __vector unsigned long long __m128ll; + +static inline uint64_t _umul128(uint64_t a, uint64_t b, uint64_t* hi) +{ + uint64_t lo; + asm( + "mulld %0, %1, %2" : + "=r" (lo) : + "r" (a), + "r" (b)); + asm( + "mulhdu %0, %1, %2" : + "=r" (*hi) : + "r" (a), + "r" (b)); + return lo; +} + +extern "C" +{ + void keccak(const uint8_t *in, int inlen, uint8_t *md, int mdlen); + void keccakf(uint64_t st[25], int rounds); + extern void(*const extra_hashes[4])(const void *, size_t, char *); + +} + +static inline __m128i _mm_set_epi64x(uint64_t a, uint64_t b){ + return (__m128ll){b,a}; +} +// This will shift and xor tmp1 into itself as 4 32-bit vals such as +// sl_xor(a1 a2 a3 a4) = a1 (a2^a1) (a3^a2^a1) (a4^a3^a2^a1) +static inline __m128i sl_xor(__m128i tmp1) +{ + __m128i tmp4; + tmp4 = vec_slo(tmp1, (__m128i){0x20}); + tmp1 = vec_xor(tmp1, tmp4); + tmp4 = vec_slo(tmp4, (__m128i){0x20}); + tmp1 = vec_xor(tmp1, tmp4); + tmp4 = vec_slo(tmp4, (__m128i){0x20}); + tmp1 = vec_xor(tmp1, tmp4); + return tmp1; +} + +static inline __m128i sl_xor_be(__m128i tmp1) +{ + __m128i tmp4; + tmp4 = vec_sro(tmp1, (__m128i){0x20}); + tmp1 = vec_xor(tmp1, tmp4); + tmp4 = vec_sro(tmp4, (__m128i){0x20}); + tmp1 = vec_xor(tmp1, tmp4); + tmp4 = vec_sro(tmp4, (__m128i){0x20}); + tmp1 = vec_xor(tmp1, tmp4); + return tmp1; +} +static inline __m128i v_rev(const __m128i& tmp1) +{ + return(vec_perm(tmp1,tmp1,(__m128i){ 0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0 })); +} + + +static inline __m128i _mm_aesenc_si128(__m128i in, __m128i key) +{ + return v_rev(__builtin_crypto_vcipher(v_rev(in),v_rev(key))); +} + +static inline __m128i _mm_aesenc_si128_beIN(__m128i in, __m128i key) +{ + return v_rev(__builtin_crypto_vcipher(in,v_rev(key))); +} + +static inline __m128i _mm_aesenc_si128_beK(__m128i in, __m128i key) +{ + return v_rev(__builtin_crypto_vcipher(v_rev(in),key)); +} +static inline __m128i _mm_aesenc_si128_be(__m128i in, __m128i key) +{ + return __builtin_crypto_vcipher(in,key); +} + + +template<uint8_t rcon> +static inline void aes_genkey_sub(__m128i* xout0, __m128i* xout2) +{ + __m128i xout1 = soft_aeskeygenassist(*xout2, rcon); + xout1 = vec_perm(xout1,xout1,(__m128i){0xc,0xd,0xe,0xf, 0xc,0xd,0xe,0xf, 0xc,0xd,0xe,0xf, 0xc,0xd,0xe,0xf}); + *xout0 = sl_xor(*xout0); + *xout0 = vec_xor(*xout0, xout1); + xout1 = soft_aeskeygenassist(*xout0, 0x00); + xout1 = vec_perm(xout1,xout1,(__m128i){0x8,0x9,0xa,0xb, 0x8,0x9,0xa,0xb, 0x8,0x9,0xa,0xb, 0x8,0x9,0xa,0xb}); + *xout2 = sl_xor(*xout2); + *xout2 = vec_xor(*xout2, xout1); +} + +template<uint8_t rcon> +static inline void aes_genkey_sub_be(__m128i* xout0, __m128i* xout2) +{ + __m128i xout1 = soft_aeskeygenassist_be(*xout2, rcon); + xout1 = vec_perm(xout1,xout1,(__m128i){0x0,0x1,0x2,0x3, 0x0,0x1,0x2,0x3, 0x0,0x1,0x2,0x3, 0x0,0x1,0x2,0x3}); + *xout0 = sl_xor_be(*xout0); + *xout0 = vec_xor(*xout0, xout1); + xout1 = soft_aeskeygenassist_be(*xout0, 0x00); + xout1 = vec_perm(xout1,xout1,(__m128i){0x4,0x5,0x6,0x7, 0x4,0x5,0x6,0x7, 0x4,0x5,0x6,0x7, 0x4,0x5,0x6,0x7}); + *xout2 = sl_xor_be(*xout2); + *xout2 = vec_xor(*xout2, xout1); +} + + +template<bool SOFT_AES> +static inline void aes_genkey(const __m128i* memory, __m128i* k0, __m128i* k1, __m128i* k2, __m128i* k3, + __m128i* k4, __m128i* k5, __m128i* k6, 
__m128i* k7, __m128i* k8, __m128i* k9) +{ + __m128i xout0, xout2; + + xout0 = vec_ld(0,memory); + xout2 = vec_ld(16,memory); + *k0 = xout0; + *k1 = xout2; + + aes_genkey_sub<0x01>(&xout0, &xout2); + *k2 = xout0; + *k3 = xout2; + + aes_genkey_sub<0x02>(&xout0, &xout2); + *k4 = xout0; + *k5 = xout2; + + aes_genkey_sub<0x04>(&xout0, &xout2); + *k6 = xout0; + *k7 = xout2; + + aes_genkey_sub<0x08>(&xout0, &xout2); + *k8 = xout0; + *k9 = xout2; +} + +template<bool SOFT_AES> +static inline void aes_genkey_be(const __m128i* memory, __m128i* k0, __m128i* k1, __m128i* k2, __m128i* k3, + __m128i* k4, __m128i* k5, __m128i* k6, __m128i* k7, __m128i* k8, __m128i* k9) +{ + __m128i xout0, xout2; + + xout0 = v_rev(vec_ld(0,memory)); + xout2 = v_rev(vec_ld(16,memory)); + *k0 = xout0; + *k1 = xout2; + + aes_genkey_sub_be<0x01>(&xout0, &xout2); + *k2 = xout0; + *k3 = xout2; + + aes_genkey_sub_be<0x02>(&xout0, &xout2); + *k4 = xout0; + *k5 = xout2; + + aes_genkey_sub_be<0x04>(&xout0, &xout2); + *k6 = xout0; + *k7 = xout2; + + aes_genkey_sub_be<0x08>(&xout0, &xout2); + *k8 = xout0; + *k9 = xout2; +} +static inline void aes_round(__m128i key, __m128i* x0, __m128i* x1, __m128i* x2, __m128i* x3, __m128i* x4, __m128i* x5, __m128i* x6, __m128i* x7) +{ + *x0 = _mm_aesenc_si128(*x0, key); + *x1 = _mm_aesenc_si128(*x1, key); + *x2 = _mm_aesenc_si128(*x2, key); + *x3 = _mm_aesenc_si128(*x3, key); + *x4 = _mm_aesenc_si128(*x4, key); + *x5 = _mm_aesenc_si128(*x5, key); + *x6 = _mm_aesenc_si128(*x6, key); + *x7 = _mm_aesenc_si128(*x7, key); +} + +static inline void aes_round_be(__m128i key, __m128i* x0, __m128i* x1, __m128i* x2, __m128i* x3, __m128i* x4, __m128i* x5, __m128i* x6, __m128i* x7) +{ + *x0 = _mm_aesenc_si128_be(*x0, key); + *x1 = _mm_aesenc_si128_be(*x1, key); + *x2 = _mm_aesenc_si128_be(*x2, key); + *x3 = _mm_aesenc_si128_be(*x3, key); + *x4 = _mm_aesenc_si128_be(*x4, key); + *x5 = _mm_aesenc_si128_be(*x5, key); + *x6 = _mm_aesenc_si128_be(*x6, key); + *x7 = _mm_aesenc_si128_be(*x7, key); + +} + + +inline void mix_and_propagate(__m128i& x0, __m128i& x1, __m128i& x2, __m128i& x3, __m128i& x4, __m128i& x5, __m128i& x6, __m128i& x7) +{ + __m128i tmp0 = x0; + x0 = vec_xor(x0, x1); + x1 = vec_xor(x1, x2); + x2 = vec_xor(x2, x3); + x3 = vec_xor(x3, x4); + x4 = vec_xor(x4, x5); + x5 = vec_xor(x5, x6); + x6 = vec_xor(x6, x7); + x7 = vec_xor(x7, tmp0); +} + +template<size_t MEM, bool SOFT_AES, bool BE_MODE, xmrstak_algo ALGO> +void cn_explode_scratchpad(const __m128i* input, __m128i* output) +{ + // This is more than we have registers, compiler will assign 2 keys on the stack + __m128i xin0, xin1, xin2, xin3, xin4, xin5, xin6, xin7; + __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9; + aes_genkey<SOFT_AES>(input, &k0, &k1, &k2, &k3, &k4, &k5, &k6, &k7, &k8, &k9); + + xin0 = vec_ld(64,input); + xin1 = vec_ld(80,input); + xin2 = vec_ld(96,input); + xin3 = vec_ld(112,input); + xin4 = vec_ld(128,input); + xin5 = vec_ld(144,input); + xin6 = vec_ld(160,input); + xin7 = vec_ld(176,input); + + if(ALGO == cryptonight_heavy) + { + for(size_t i=0; i < 16; i++) + { + aes_round(k0, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + aes_round(k1, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + aes_round(k2, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + aes_round(k3, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + aes_round(k4, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + aes_round(k5, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + aes_round(k6, &xin0, &xin1, &xin2, 
&xin3, &xin4, &xin5, &xin6, &xin7); + aes_round(k7, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + aes_round(k8, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + aes_round(k9, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + mix_and_propagate(xin0, xin1, xin2, xin3, xin4, xin5, xin6, xin7); + } + } + + for (size_t i = 0; i < MEM / sizeof(__m128i); i += 8) + { + aes_round(k0, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + aes_round(k1, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + aes_round(k2, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + aes_round(k3, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + aes_round(k4, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + aes_round(k5, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + aes_round(k6, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + aes_round(k7, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + aes_round(k8, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + aes_round(k9, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + vec_st(xin0,i*16,output); + vec_st(xin1,(i+1)*16,output); + vec_st(xin2,(i+2)*16,output); + vec_st(xin3,(i+3)*16,output); + vec_st(xin4,(i+4)*16,output); + vec_st(xin5,(i+5)*16,output); + vec_st(xin6,(i+6)*16,output); + vec_st(xin7,(i+7)*16,output); + + } +} +template<size_t MEM, bool SOFT_AES, bool BE_MODE, xmrstak_algo ALGO> +void cn_explode_scratchpad_be(const __m128i* input, __m128i* output) +{ + // This is more than we have registers, compiler will assign 2 keys on the stack + __m128i xin0, xin1, xin2, xin3, xin4, xin5, xin6, xin7; + __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9; + aes_genkey_be<SOFT_AES>(input, &k0, &k1, &k2, &k3, &k4, &k5, &k6, &k7, &k8, &k9); + + xin0 = vec_ld(64,input); + xin1 = vec_ld(80,input); + xin2 = vec_ld(96,input); + xin3 = vec_ld(112,input); + xin4 = vec_ld(128,input); + xin5 = vec_ld(144,input); + xin6 = vec_ld(160,input); + xin7 = vec_ld(176,input); + + xin0 = v_rev(xin0); + xin1 = v_rev(xin1); + xin2 = v_rev(xin2); + xin3 = v_rev(xin3); + xin4 = v_rev(xin4); + xin5 = v_rev(xin5); + xin6 = v_rev(xin6); + xin7 = v_rev(xin7); + + if(ALGO == cryptonight_heavy) + { + for(size_t i=0; i < 16; i++) + { + aes_round_be(k0, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + aes_round_be(k1, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + aes_round_be(k2, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + aes_round_be(k3, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + aes_round_be(k4, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + aes_round_be(k5, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + aes_round_be(k6, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + aes_round_be(k7, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + aes_round_be(k8, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + aes_round_be(k9, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + mix_and_propagate(xin0, xin1, xin2, xin3, xin4, xin5, xin6, xin7); + } + } + + for (size_t i = 0; i < MEM / sizeof(__m128i); i += 8) + { + aes_round_be(k0, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + aes_round_be(k1, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + aes_round_be(k2, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + aes_round_be(k3, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + aes_round_be(k4, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, 
&xin7); + aes_round_be(k5, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + aes_round_be(k6, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + aes_round_be(k7, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + aes_round_be(k8, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + aes_round_be(k9, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + vec_st(v_rev(xin0),i*16,output); + vec_st(v_rev(xin1),(i+1)*16,output); + vec_st(v_rev(xin2),(i+2)*16,output); + vec_st(v_rev(xin3),(i+3)*16,output); + vec_st(v_rev(xin4),(i+4)*16,output); + vec_st(v_rev(xin5),(i+5)*16,output); + vec_st(v_rev(xin6),(i+6)*16,output); + vec_st(v_rev(xin7),(i+7)*16,output); + + } +} + +template<size_t MEM, bool SOFT_AES, bool BE_MODE, xmrstak_algo ALGO> +void cn_implode_scratchpad(const __m128i* input, __m128i* output) +{ + // This is more than we have registers, compiler will assign 2 keys on the stack + __m128i xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7; + __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9; + + aes_genkey<SOFT_AES>(output + 2, &k0, &k1, &k2, &k3, &k4, &k5, &k6, &k7, &k8, &k9); + + + xout0 = vec_ld(64,output); + xout1 = vec_ld(80,output); + xout2 = vec_ld(96,output); + xout3 = vec_ld(112,output); + xout4 = vec_ld(128,output); + xout5 = vec_ld(144,output); + xout6 = vec_ld(160,output); + xout7 = vec_ld(176,output); + + for (size_t i = 0; i < MEM / sizeof(__m128i); i += 8) + { + + xout0 = vec_xor(vec_ld(i*16,input), xout0); + xout1 = vec_xor(vec_ld((i+1)*16,input), xout1); + xout2 = vec_xor(vec_ld((i+2)*16,input), xout2); + xout3 = vec_xor(vec_ld((i+3)*16,input), xout3); + xout4 = vec_xor(vec_ld((i+4)*16,input), xout4); + xout5 = vec_xor(vec_ld((i+5)*16,input), xout5); + xout6 = vec_xor(vec_ld((i+6)*16,input), xout6); + xout7 = vec_xor(vec_ld((i+7)*16,input), xout7); + aes_round(k0, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k1, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k2, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k3, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k4, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k5, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k6, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k7, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k8, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k9, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + if(ALGO == cryptonight_heavy) + mix_and_propagate(xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7); + } + + if(ALGO == cryptonight_heavy) + { + for (size_t i = 0; i < MEM / sizeof(__m128i); i += 8) + { + + xout0 = vec_xor(vec_ld(i*16,input), xout0); + xout1 = vec_xor(vec_ld((i+1)*16,input), xout1); + xout2 = vec_xor(vec_ld((i+2)*16,input), xout2); + xout3 = vec_xor(vec_ld((i+3)*16,input), xout3); + xout4 = vec_xor(vec_ld((i+4)*16,input), xout4); + xout5 = vec_xor(vec_ld((i+5)*16,input), xout5); + xout6 = vec_xor(vec_ld((i+6)*16,input), xout6); + xout7 = vec_xor(vec_ld((i+7)*16,input), xout7); + + aes_round(k0, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k1, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k2, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k3, &xout0, &xout1, &xout2, &xout3, &xout4, 
&xout5, &xout6, &xout7); + aes_round(k4, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k5, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k6, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k7, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k8, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k9, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + + mix_and_propagate(xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7); + } + + for(size_t i=0; i < 16; i++) + { + aes_round(k0, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k1, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k2, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k3, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k4, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k5, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k6, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k7, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k8, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k9, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + + mix_and_propagate(xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7); + } + } + + vec_st(xout0,64,output); + vec_st(xout1,80,output); + vec_st(xout2,96,output); + vec_st(xout3,112,output); + vec_st(xout4,128,output); + vec_st(xout5,144,output); + vec_st(xout6,160,output); + vec_st(xout7,176,output); +} + +template<size_t MEM, bool SOFT_AES, bool BE_MODE, xmrstak_algo ALGO> +void cn_implode_scratchpad_be(const __m128i* input, __m128i* output) +{ + // This is more than we have registers, compiler will assign 2 keys on the stack + __m128i xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7; + __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9; + + aes_genkey_be<SOFT_AES>(output + 2, &k0, &k1, &k2, &k3, &k4, &k5, &k6, &k7, &k8, &k9); + + xout0 = vec_ld(64,output); + xout1 = vec_ld(80,output); + xout2 = vec_ld(96,output); + xout3 = vec_ld(112,output); + xout4 = vec_ld(128,output); + xout5 = vec_ld(144,output); + xout6 = vec_ld(160,output); + xout7 = vec_ld(176,output); + + xout0 = v_rev(xout0); + xout1 = v_rev(xout1); + xout2 = v_rev(xout2); + xout3 = v_rev(xout3); + xout4 = v_rev(xout4); + xout5 = v_rev(xout5); + xout6 = v_rev(xout6); + xout7 = v_rev(xout7); + + for (size_t i = 0; i < MEM / sizeof(__m128i); i += 8) + { + + xout0 = vec_xor(v_rev(vec_ld(i*16,input)), xout0); + xout1 = vec_xor(v_rev(vec_ld((i+1)*16,input)), xout1); + xout2 = vec_xor(v_rev(vec_ld((i+2)*16,input)), xout2); + xout3 = vec_xor(v_rev(vec_ld((i+3)*16,input)), xout3); + xout4 = vec_xor(v_rev(vec_ld((i+4)*16,input)), xout4); + xout5 = vec_xor(v_rev(vec_ld((i+5)*16,input)), xout5); + xout6 = vec_xor(v_rev(vec_ld((i+6)*16,input)), xout6); + xout7 = vec_xor(v_rev(vec_ld((i+7)*16,input)), xout7); + aes_round_be(k0, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round_be(k1, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round_be(k2, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round_be(k3, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round_be(k4, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, 
&xout7); + aes_round_be(k5, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round_be(k6, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round_be(k7, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round_be(k8, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round_be(k9, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + if(ALGO == cryptonight_heavy) + mix_and_propagate(xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7); + } + + if(ALGO == cryptonight_heavy) + { + for (size_t i = 0; i < MEM / sizeof(__m128i); i += 8) + { + + xout0 = vec_xor(v_rev(vec_ld(i*16,input)), xout0); + xout1 = vec_xor(v_rev(vec_ld((i+1)*16,input)), xout1); + xout2 = vec_xor(v_rev(vec_ld((i+2)*16,input)), xout2); + xout3 = vec_xor(v_rev(vec_ld((i+3)*16,input)), xout3); + xout4 = vec_xor(v_rev(vec_ld((i+4)*16,input)), xout4); + xout5 = vec_xor(v_rev(vec_ld((i+5)*16,input)), xout5); + xout6 = vec_xor(v_rev(vec_ld((i+6)*16,input)), xout6); + xout7 = vec_xor(v_rev(vec_ld((i+7)*16,input)), xout7); + + aes_round_be(k0, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round_be(k1, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round_be(k2, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round_be(k3, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round_be(k4, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round_be(k5, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round_be(k6, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round_be(k7, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round_be(k8, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round_be(k9, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + + mix_and_propagate(xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7); + } + + for(size_t i=0; i < 16; i++) + { + aes_round_be(k0, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round_be(k1, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round_be(k2, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round_be(k3, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round_be(k4, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round_be(k5, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round_be(k6, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round_be(k7, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round_be(k8, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round_be(k9, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + + mix_and_propagate(xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7); + } + } + + vec_st(v_rev(xout0),64,output); + vec_st(v_rev(xout1),80,output); + vec_st(v_rev(xout2),96,output); + vec_st(v_rev(xout3),112,output); + vec_st(v_rev(xout4),128,output); + vec_st(v_rev(xout5),144,output); + vec_st(v_rev(xout6),160,output); + vec_st(v_rev(xout7),176,output); +} +inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp) +{ + mem_out[0] = ((uint64_t*)&tmp)[0]; + tmp = vec_perm(tmp,tmp,(__m128i){0x8,0x9,0xa,0xb, 0xc,0xd,0xe,0xf, 0x8,0x9,0xa,0xb, 0xc,0xd,0xe,0xf}); + uint64_t vh = ((uint64_t*)&tmp)[0]; + uint8_t x = vh >> 24; + static 
const uint16_t table = 0x7531; + const uint8_t index = (((x >> 3) & 6) | (x & 1)) << 1; + vh ^= ((table >> index) & 0x3) << 28; + mem_out[1] = vh; +} + +template<xmrstak_algo ALGO, bool SOFT_AES, bool BE_MODE> +void cryptonight_hash(const void* input, size_t len, void* output, cryptonight_ctx* ctx0) +{ + constexpr size_t MASK = cn_select_mask<ALGO>(); + constexpr size_t ITERATIONS = cn_select_iter<ALGO>(); + constexpr size_t MEM = cn_select_memory<ALGO>(); + + if(ALGO == cryptonight_monero && len < 43) + { + memset(output, 0, 32); + return; + } + + keccak((const uint8_t *)input, len, ctx0->hash_state, 200); + + uint64_t monero_const; + if(ALGO == cryptonight_monero) + { + monero_const = *reinterpret_cast<const uint64_t*>(reinterpret_cast<const uint8_t*>(input) + 35); + monero_const ^= *(reinterpret_cast<const uint64_t*>(ctx0->hash_state) + 24); + } + + // Optim - 99% time boundary + if(BE_MODE) cn_explode_scratchpad_be<MEM, SOFT_AES, BE_MODE, ALGO>((__m128i*)ctx0->hash_state, (__m128i*)ctx0->long_state); + else cn_explode_scratchpad<MEM, SOFT_AES, BE_MODE, ALGO>((__m128i*)ctx0->hash_state, (__m128i*)ctx0->long_state); + uint8_t* l0 = ctx0->long_state; + uint64_t* h0 = (uint64_t*)ctx0->hash_state; + + uint64_t al0 = h0[0] ^ h0[4]; + uint64_t ah0 = h0[1] ^ h0[5]; + __m128i bx0 = (__m128ll){h0[2] ^ h0[6],h0[3] ^ h0[7]}; + + uint64_t idx0 = al0; + + // Optim - 90% time boundary + for(size_t i = 0; i < ITERATIONS; i++) + { + __m128i cx; + cx = vec_vsx_ld(0,(__m128i *)&l0[idx0 & MASK]); + cx = _mm_aesenc_si128(cx, (__m128ll){al0, ah0}); + + if(ALGO == cryptonight_monero) + cryptonight_monero_tweak((uint64_t*)&l0[idx0 & MASK], vec_xor(bx0, cx)); + else + vec_vsx_st(vec_xor(bx0, cx),0,(__m128i *)&l0[idx0 & MASK]); + + idx0 = ((uint64_t*)&cx)[0]; + + bx0 = cx; + + uint64_t hi, lo, cl, ch; + cl = ((uint64_t*)&l0[idx0 & MASK])[0]; + ch = ((uint64_t*)&l0[idx0 & MASK])[1]; + + lo = _umul128(idx0, cl, &hi); + + al0 += hi; + ((uint64_t*)&l0[idx0 & MASK])[0] = al0; + al0 ^= cl; + ah0 += lo; + + if(ALGO == cryptonight_monero) + ((uint64_t*)&l0[idx0 & MASK])[1] = ah0 ^ monero_const; + else + ((uint64_t*)&l0[idx0 & MASK])[1] = ah0; + ah0 ^= ch; + + idx0 = al0; + + if(ALGO == cryptonight_heavy) + { + int64_t n = ((int64_t*)&l0[idx0 & MASK])[0]; + int32_t d = ((int32_t*)&l0[idx0 & MASK])[2]; + int64_t q = n / (d | 0x5); + + ((int64_t*)&l0[idx0 & MASK])[0] = n ^ q; + idx0 = d ^ q; + } + } + + // Optim - 90% time boundary + if(BE_MODE) cn_implode_scratchpad_be<MEM, SOFT_AES, BE_MODE, ALGO>((__m128i*)ctx0->long_state, (__m128i*)ctx0->hash_state); + else cn_implode_scratchpad<MEM, SOFT_AES, BE_MODE, ALGO>((__m128i*)ctx0->long_state, (__m128i*)ctx0->hash_state); + + // Optim - 99% time boundary + + keccakf((uint64_t*)ctx0->hash_state, 24); + extra_hashes[ctx0->hash_state[0] & 3](ctx0->hash_state, 200, (char*)output); +} + +// This lovely creation will do 2 cn hashes at a time. We have plenty of space on silicon +// to fit temporary vars for two contexts. 
Function will read len*2 from input and write 64 bytes to output +// We are still limited by L3 cache, so doubling will only work with CPUs where we have more than 2MB to core (Xeons) +template<xmrstak_algo ALGO, bool SOFT_AES, bool BE_MODE> +void cryptonight_double_hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx) +{ + constexpr size_t MASK = cn_select_mask<ALGO>(); + constexpr size_t ITERATIONS = cn_select_iter<ALGO>(); + constexpr size_t MEM = cn_select_memory<ALGO>(); + + if(ALGO == cryptonight_monero && len < 43) + { + memset(output, 0, 64); + return; + } + + keccak((const uint8_t *)input, len, ctx[0]->hash_state, 200); + keccak((const uint8_t *)input+len, len, ctx[1]->hash_state, 200); + + uint64_t monero_const_0, monero_const_1; + if(ALGO == cryptonight_monero) + { + monero_const_0 = *reinterpret_cast<const uint64_t*>(reinterpret_cast<const uint8_t*>(input) + 35); + monero_const_0 ^= *(reinterpret_cast<const uint64_t*>(ctx[0]->hash_state) + 24); + monero_const_1 = *reinterpret_cast<const uint64_t*>(reinterpret_cast<const uint8_t*>(input) + len + 35); + monero_const_1 ^= *(reinterpret_cast<const uint64_t*>(ctx[1]->hash_state) + 24); + } + + // Optim - 99% time boundary + if(BE_MODE){ + cn_explode_scratchpad_be<MEM, SOFT_AES, BE_MODE, ALGO>((__m128i*)ctx[0]->hash_state, (__m128i*)ctx[0]->long_state); + cn_explode_scratchpad_be<MEM, SOFT_AES, BE_MODE, ALGO>((__m128i*)ctx[1]->hash_state, (__m128i*)ctx[1]->long_state);} + else{ + cn_explode_scratchpad<MEM, SOFT_AES, BE_MODE, ALGO>((__m128i*)ctx[0]->hash_state, (__m128i*)ctx[0]->long_state); + cn_explode_scratchpad<MEM, SOFT_AES, BE_MODE, ALGO>((__m128i*)ctx[1]->hash_state, (__m128i*)ctx[1]->long_state);} + + + uint8_t* l0 = ctx[0]->long_state; + uint64_t* h0 = (uint64_t*)ctx[0]->hash_state; + uint8_t* l1 = ctx[1]->long_state; + uint64_t* h1 = (uint64_t*)ctx[1]->hash_state; + + uint64_t axl0 = h0[0] ^ h0[4]; + uint64_t axh0 = h0[1] ^ h0[5]; + __m128i bx0 = (__m128ll){h0[2] ^ h0[6],h0[3] ^ h0[7]}; + uint64_t axl1 = h1[0] ^ h1[4]; + uint64_t axh1 = h1[1] ^ h1[5]; + __m128i bx1 = (__m128ll){h1[2] ^ h1[6],h1[3] ^ h1[7]}; + + uint64_t idx0 = h0[0] ^ h0[4]; + uint64_t idx1 = h1[0] ^ h1[4]; + + // Optim - 90% time boundary + for (size_t i = 0; i < ITERATIONS; i++) + { + __m128i cx; + cx = vec_vsx_ld(0,(__m128i *)&l0[idx0 & MASK]); + cx = _mm_aesenc_si128(cx, (__m128ll){axl0, axh0}); + + if(ALGO == cryptonight_monero) + cryptonight_monero_tweak((uint64_t*)&l0[idx0 & MASK], vec_xor(bx0, cx)); + else + vec_vsx_st(vec_xor(bx0, cx),0,(__m128i *)&l0[idx0 & MASK]); + + idx0 = ((uint64_t*)&cx)[0]; + bx0 = cx; + + cx = vec_vsx_ld(0,(__m128i *)&l1[idx1 & MASK]); + cx = _mm_aesenc_si128(cx, (__m128ll){axl1, axh1}); + + if(ALGO == cryptonight_monero) + cryptonight_monero_tweak((uint64_t*)&l1[idx1 & MASK], vec_xor(bx1, cx)); + else + vec_vsx_st(vec_xor(bx1, cx),0,(__m128i *)&l1[idx1 & MASK]); + + idx1 = ((uint64_t*)&cx)[0]; + bx1 = cx; + + + uint64_t hi, lo, cl, ch; + cl = ((uint64_t*)&l0[idx0 & MASK])[0]; + ch = ((uint64_t*)&l0[idx0 & MASK])[1]; + + lo = _umul128(idx0, cl, &hi); + + axl0 += hi; + axh0 += lo; + ((uint64_t*)&l0[idx0 & MASK])[0] = axl0; + + if(ALGO == cryptonight_monero) + ((uint64_t*)&l0[idx0 & MASK])[1] = axh0 ^ monero_const_0; + else + ((uint64_t*)&l0[idx0 & MASK])[1] = axh0; + + axh0 ^= ch; + axl0 ^= cl; + idx0 = axl0; + + if(ALGO == cryptonight_heavy) + { + int64_t n = ((int64_t*)&l0[idx0 & MASK])[0]; + int32_t d = ((int32_t*)&l0[idx0 & MASK])[2]; + int64_t q = n / (d | 0x5); + + ((int64_t*)&l0[idx0 & MASK])[0] = n 
^ q; + idx0 = d ^ q; + } + + + cl = ((uint64_t*)&l1[idx1 & MASK])[0]; + ch = ((uint64_t*)&l1[idx1 & MASK])[1]; + + lo = _umul128(idx1, cl, &hi); + + axl1 += hi; + axh1 += lo; + ((uint64_t*)&l1[idx1 & MASK])[0] = axl1; + + if(ALGO == cryptonight_monero) + ((uint64_t*)&l1[idx1 & MASK])[1] = axh1 ^ monero_const_1; + else + ((uint64_t*)&l1[idx1 & MASK])[1] = axh1; + + axh1 ^= ch; + axl1 ^= cl; + idx1 = axl1; + + if(ALGO == cryptonight_heavy) + { + int64_t n = ((int64_t*)&l1[idx1 & MASK])[0]; + int32_t d = ((int32_t*)&l1[idx1 & MASK])[2]; + int64_t q = n / (d | 0x5); + + ((int64_t*)&l1[idx1 & MASK])[0] = n ^ q; + idx1 = d ^ q; + } + + } + + // Optim - 90% time boundary + if(BE_MODE){ + cn_implode_scratchpad_be<MEM, SOFT_AES, BE_MODE, ALGO>((__m128i*)ctx[0]->long_state, (__m128i*)ctx[0]->hash_state); + cn_implode_scratchpad_be<MEM, SOFT_AES, BE_MODE, ALGO>((__m128i*)ctx[1]->long_state, (__m128i*)ctx[1]->hash_state);} + else{ + cn_implode_scratchpad<MEM, SOFT_AES, BE_MODE, ALGO>((__m128i*)ctx[0]->long_state, (__m128i*)ctx[0]->hash_state); + cn_implode_scratchpad<MEM, SOFT_AES, BE_MODE, ALGO>((__m128i*)ctx[1]->long_state, (__m128i*)ctx[1]->hash_state);} + + // Optim - 99% time boundary + + keccakf((uint64_t*)ctx[0]->hash_state, 24); + extra_hashes[ctx[0]->hash_state[0] & 3](ctx[0]->hash_state, 200, (char*)output); + keccakf((uint64_t*)ctx[1]->hash_state, 24); + extra_hashes[ctx[1]->hash_state[0] & 3](ctx[1]->hash_state, 200, (char*)output + 32); +} + +#define CN_STEP1(a, b, c, l, ptr, idx) \ + ptr = (__m128i *)&l[idx & MASK]; \ + c = vec_vsx_ld(0,ptr); + +#define CN_STEP2(a, b, c, l, ptr, idx) \ + c = _mm_aesenc_si128(c, a); \ + b = vec_xor(b, c); \ + if(ALGO == cryptonight_monero) \ + cryptonight_monero_tweak((uint64_t*)ptr, b); \ + else \ + vec_vsx_st(b,0,ptr);\ + +#define CN_STEP3(a, b, c, l, ptr, idx) \ + idx = ((uint64_t*)&c)[0]; \ + ptr = (__m128i*)&l[idx & MASK]; \ + b = vec_vsx_ld(0,ptr); + +#define CN_STEP4(a, b, c, l, mc, ptr, idx) \ + lo = _umul128(idx, ((uint64_t*)&b)[0], &hi); \ + a = (__m128ll)a + (__m128ll){hi, lo}; \ + if(ALGO == cryptonight_monero) \ + vec_vsx_st(vec_xor(a, mc),0,ptr); \ + else \ + vec_vsx_st(a,0,ptr);\ + a = vec_xor(a, b); \ + idx = ((uint64_t*)&a)[0]; \ + if(ALGO == cryptonight_heavy) \ + { \ + int64_t n = ((int64_t*)&l[idx & MASK])[0]; \ + int32_t d = ((int32_t*)&l[idx & MASK])[2]; \ + int64_t q = n / (d | 0x5); \ + ((int64_t*)&l[idx & MASK])[0] = n ^ q; \ + idx = d ^ q; \ + } + +#define CONST_INIT(ctx, n) \ + __m128i mc##n = _mm_set_epi64x(*reinterpret_cast<const uint64_t*>(reinterpret_cast<const uint8_t*>(input) + n * len + 35) ^ \ + *(reinterpret_cast<const uint64_t*>((ctx)->hash_state) + 24), 0); +// This lovelier creation will do 3 cn hashes at a time. 
+template<xmrstak_algo ALGO, bool SOFT_AES, bool BE_MODE> +void cryptonight_triple_hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx) +{ + constexpr size_t MASK = cn_select_mask<ALGO>(); + constexpr size_t ITERATIONS = cn_select_iter<ALGO>(); + constexpr size_t MEM = cn_select_memory<ALGO>(); + + if(ALGO == cryptonight_monero && len < 43) + { + memset(output, 0, 32 * 3); + return; + } + + for (size_t i = 0; i < 3; i++) + { + keccak((const uint8_t *)input + len * i, len, ctx[i]->hash_state, 200); + if(BE_MODE) cn_explode_scratchpad_be<MEM, SOFT_AES, BE_MODE, ALGO>((__m128i*)ctx[i]->hash_state, (__m128i*)ctx[i]->long_state); + else cn_explode_scratchpad<MEM, SOFT_AES, BE_MODE, ALGO>((__m128i*)ctx[i]->hash_state, (__m128i*)ctx[i]->long_state); + + } + + CONST_INIT(ctx[0], 0); + CONST_INIT(ctx[1], 1); + CONST_INIT(ctx[2], 2); + + uint8_t* l0 = ctx[0]->long_state; + uint64_t* h0 = (uint64_t*)ctx[0]->hash_state; + uint8_t* l1 = ctx[1]->long_state; + uint64_t* h1 = (uint64_t*)ctx[1]->hash_state; + uint8_t* l2 = ctx[2]->long_state; + uint64_t* h2 = (uint64_t*)ctx[2]->hash_state; + + __m128i ax0 = (__m128ll){h0[0] ^ h0[4], h0[1] ^ h0[5]}; + __m128i bx0 = (__m128ll){h0[2] ^ h0[6], h0[3] ^ h0[7]}; + __m128i ax1 = (__m128ll){h1[0] ^ h1[4], h1[1] ^ h1[5]}; + __m128i bx1 = (__m128ll){h1[2] ^ h1[6], h1[3] ^ h1[7]}; + __m128i ax2 = (__m128ll){h2[0] ^ h2[4], h2[1] ^ h2[5]}; + __m128i bx2 = (__m128ll){h2[2] ^ h2[6], h2[3] ^ h2[7]}; + __m128i cx0 = (__m128ll){0, 0}; + __m128i cx1 = (__m128ll){0, 0}; + __m128i cx2 = (__m128ll){0, 0}; + + uint64_t idx0, idx1, idx2; + idx0 = ((uint64_t*)&ax0)[0]; + idx1 = ((uint64_t*)&ax1)[0]; + idx2 = ((uint64_t*)&ax2)[0]; + + for (size_t i = 0; i < ITERATIONS/2; i++) + { + uint64_t hi, lo; + __m128i *ptr0, *ptr1, *ptr2; + + // EVEN ROUND + CN_STEP1(ax0, bx0, cx0, l0, ptr0, idx0); + CN_STEP1(ax1, bx1, cx1, l1, ptr1, idx1); + CN_STEP1(ax2, bx2, cx2, l2, ptr2, idx2); + + CN_STEP2(ax0, bx0, cx0, l0, ptr0, idx0); + CN_STEP2(ax1, bx1, cx1, l1, ptr1, idx1); + CN_STEP2(ax2, bx2, cx2, l2, ptr2, idx2); + + CN_STEP3(ax0, bx0, cx0, l0, ptr0, idx0); + CN_STEP3(ax1, bx1, cx1, l1, ptr1, idx1); + CN_STEP3(ax2, bx2, cx2, l2, ptr2, idx2); + + CN_STEP4(ax0, bx0, cx0, l0, mc0, ptr0, idx0); + CN_STEP4(ax1, bx1, cx1, l1, mc1, ptr1, idx1); + CN_STEP4(ax2, bx2, cx2, l2, mc2, ptr2, idx2); + + // ODD ROUND + CN_STEP1(ax0, cx0, bx0, l0, ptr0, idx0); + CN_STEP1(ax1, cx1, bx1, l1, ptr1, idx1); + CN_STEP1(ax2, cx2, bx2, l2, ptr2, idx2); + + CN_STEP2(ax0, cx0, bx0, l0, ptr0, idx0); + CN_STEP2(ax1, cx1, bx1, l1, ptr1, idx1); + CN_STEP2(ax2, cx2, bx2, l2, ptr2, idx2); + + CN_STEP3(ax0, cx0, bx0, l0, ptr0, idx0); + CN_STEP3(ax1, cx1, bx1, l1, ptr1, idx1); + CN_STEP3(ax2, cx2, bx2, l2, ptr2, idx2); + + CN_STEP4(ax0, cx0, bx0, l0, mc0, ptr0, idx0); + CN_STEP4(ax1, cx1, bx1, l1, mc1, ptr1, idx1); + CN_STEP4(ax2, cx2, bx2, l2, mc2, ptr2, idx2); + } + + for (size_t i = 0; i < 3; i++) + { + if(BE_MODE) cn_implode_scratchpad_be<MEM, SOFT_AES, BE_MODE, ALGO>((__m128i*)ctx[i]->long_state, (__m128i*)ctx[i]->hash_state); + else cn_implode_scratchpad<MEM, SOFT_AES, BE_MODE, ALGO>((__m128i*)ctx[i]->long_state, (__m128i*)ctx[i]->hash_state); + keccakf((uint64_t*)ctx[i]->hash_state, 24); + extra_hashes[ctx[i]->hash_state[0] & 3](ctx[i]->hash_state, 200, (char*)output + 32 * i); + } +} + +// This even lovelier creation will do 4 cn hashes at a time. 
+template<xmrstak_algo ALGO, bool SOFT_AES, bool BE_MODE> +void cryptonight_quad_hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx) +{ + constexpr size_t MASK = cn_select_mask<ALGO>(); + constexpr size_t ITERATIONS = cn_select_iter<ALGO>(); + constexpr size_t MEM = cn_select_memory<ALGO>(); + + if(ALGO == cryptonight_monero && len < 43) + { + memset(output, 0, 32 * 4); + return; + } + + for (size_t i = 0; i < 4; i++) + { + keccak((const uint8_t *)input + len * i, len, ctx[i]->hash_state, 200); + if(BE_MODE) cn_explode_scratchpad_be<MEM, SOFT_AES, BE_MODE, ALGO>((__m128i*)ctx[i]->hash_state, (__m128i*)ctx[i]->long_state); + else cn_explode_scratchpad<MEM, SOFT_AES, BE_MODE, ALGO>((__m128i*)ctx[i]->hash_state, (__m128i*)ctx[i]->long_state); + } + + CONST_INIT(ctx[0], 0); + CONST_INIT(ctx[1], 1); + CONST_INIT(ctx[2], 2); + CONST_INIT(ctx[3], 3); + + uint8_t* l0 = ctx[0]->long_state; + uint64_t* h0 = (uint64_t*)ctx[0]->hash_state; + uint8_t* l1 = ctx[1]->long_state; + uint64_t* h1 = (uint64_t*)ctx[1]->hash_state; + uint8_t* l2 = ctx[2]->long_state; + uint64_t* h2 = (uint64_t*)ctx[2]->hash_state; + uint8_t* l3 = ctx[3]->long_state; + uint64_t* h3 = (uint64_t*)ctx[3]->hash_state; + + __m128i ax0 = (__m128ll){h0[0] ^ h0[4], h0[1] ^ h0[5]}; + __m128i bx0 = (__m128ll){h0[2] ^ h0[6], h0[3] ^ h0[7]}; + __m128i ax1 = (__m128ll){h1[0] ^ h1[4], h1[1] ^ h1[5]}; + __m128i bx1 = (__m128ll){h1[2] ^ h1[6], h1[3] ^ h1[7]}; + __m128i ax2 = (__m128ll){h2[0] ^ h2[4], h2[1] ^ h2[5]}; + __m128i bx2 = (__m128ll){h2[2] ^ h2[6], h2[3] ^ h2[7]}; + __m128i ax3 = (__m128ll){h3[0] ^ h3[4], h3[1] ^ h3[5]}; + __m128i bx3 = (__m128ll){h3[2] ^ h3[6], h3[3] ^ h3[7]}; + __m128i cx0 = (__m128ll){0, 0}; + __m128i cx1 = (__m128ll){0, 0}; + __m128i cx2 = (__m128ll){0, 0}; + __m128i cx3 = (__m128ll){0, 0}; + + uint64_t idx0, idx1, idx2, idx3; + idx0 = ((uint64_t*)&ax0)[0]; + idx1 = ((uint64_t*)&ax1)[0]; + idx2 = ((uint64_t*)&ax2)[0]; + idx3 = ((uint64_t*)&ax3)[0]; + + for (size_t i = 0; i < ITERATIONS/2; i++) + { + uint64_t hi, lo; + __m128i *ptr0, *ptr1, *ptr2, *ptr3; + + // EVEN ROUND + CN_STEP1(ax0, bx0, cx0, l0, ptr0, idx0); + CN_STEP1(ax1, bx1, cx1, l1, ptr1, idx1); + CN_STEP1(ax2, bx2, cx2, l2, ptr2, idx2); + CN_STEP1(ax3, bx3, cx3, l3, ptr3, idx3); + + CN_STEP2(ax0, bx0, cx0, l0, ptr0, idx0); + CN_STEP2(ax1, bx1, cx1, l1, ptr1, idx1); + CN_STEP2(ax2, bx2, cx2, l2, ptr2, idx2); + CN_STEP2(ax3, bx3, cx3, l3, ptr3, idx3); + + CN_STEP3(ax0, bx0, cx0, l0, ptr0, idx0); + CN_STEP3(ax1, bx1, cx1, l1, ptr1, idx1); + CN_STEP3(ax2, bx2, cx2, l2, ptr2, idx2); + CN_STEP3(ax3, bx3, cx3, l3, ptr3, idx3); + + CN_STEP4(ax0, bx0, cx0, l0, mc0, ptr0, idx0); + CN_STEP4(ax1, bx1, cx1, l1, mc1, ptr1, idx1); + CN_STEP4(ax2, bx2, cx2, l2, mc2, ptr2, idx2); + CN_STEP4(ax3, bx3, cx3, l3, mc3, ptr3, idx3); + + // ODD ROUND + CN_STEP1(ax0, cx0, bx0, l0, ptr0, idx0); + CN_STEP1(ax1, cx1, bx1, l1, ptr1, idx1); + CN_STEP1(ax2, cx2, bx2, l2, ptr2, idx2); + CN_STEP1(ax3, cx3, bx3, l3, ptr3, idx3); + + CN_STEP2(ax0, cx0, bx0, l0, ptr0, idx0); + CN_STEP2(ax1, cx1, bx1, l1, ptr1, idx1); + CN_STEP2(ax2, cx2, bx2, l2, ptr2, idx2); + CN_STEP2(ax3, cx3, bx3, l3, ptr3, idx3); + + CN_STEP3(ax0, cx0, bx0, l0, ptr0, idx0); + CN_STEP3(ax1, cx1, bx1, l1, ptr1, idx1); + CN_STEP3(ax2, cx2, bx2, l2, ptr2, idx2); + CN_STEP3(ax3, cx3, bx3, l3, ptr3, idx3); + + CN_STEP4(ax0, cx0, bx0, l0, mc0, ptr0, idx0); + CN_STEP4(ax1, cx1, bx1, l1, mc1, ptr1, idx1); + CN_STEP4(ax2, cx2, bx2, l2, mc2, ptr2, idx2); + CN_STEP4(ax3, cx3, bx3, l3, mc3, ptr3, idx3); + } + 
+ for (size_t i = 0; i < 4; i++) + { + if(BE_MODE) cn_implode_scratchpad_be<MEM, SOFT_AES, BE_MODE, ALGO>((__m128i*)ctx[i]->long_state, (__m128i*)ctx[i]->hash_state); + else cn_implode_scratchpad<MEM, SOFT_AES, BE_MODE, ALGO>((__m128i*)ctx[i]->long_state, (__m128i*)ctx[i]->hash_state); + keccakf((uint64_t*)ctx[i]->hash_state, 24); + extra_hashes[ctx[i]->hash_state[0] & 3](ctx[i]->hash_state, 200, (char*)output + 32 * i); + } +} + +// This most lovely creation will do 5 cn hashes at a time. +template<xmrstak_algo ALGO, bool SOFT_AES, bool BE_MODE> +void cryptonight_penta_hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx) +{ + constexpr size_t MASK = cn_select_mask<ALGO>(); + constexpr size_t ITERATIONS = cn_select_iter<ALGO>(); + constexpr size_t MEM = cn_select_memory<ALGO>(); + + if(ALGO == cryptonight_monero && len < 43) + { + memset(output, 0, 32 * 5); + return; + } + + for (size_t i = 0; i < 5; i++) + { + keccak((const uint8_t *)input + len * i, len, ctx[i]->hash_state, 200); + if(BE_MODE) cn_explode_scratchpad_be<MEM, SOFT_AES, BE_MODE, ALGO>((__m128i*)ctx[i]->hash_state, (__m128i*)ctx[i]->long_state); + else cn_explode_scratchpad<MEM, SOFT_AES, BE_MODE, ALGO>((__m128i*)ctx[i]->hash_state, (__m128i*)ctx[i]->long_state); + } + + CONST_INIT(ctx[0], 0); + CONST_INIT(ctx[1], 1); + CONST_INIT(ctx[2], 2); + CONST_INIT(ctx[3], 3); + CONST_INIT(ctx[4], 4); + + uint8_t* l0 = ctx[0]->long_state; + uint64_t* h0 = (uint64_t*)ctx[0]->hash_state; + uint8_t* l1 = ctx[1]->long_state; + uint64_t* h1 = (uint64_t*)ctx[1]->hash_state; + uint8_t* l2 = ctx[2]->long_state; + uint64_t* h2 = (uint64_t*)ctx[2]->hash_state; + uint8_t* l3 = ctx[3]->long_state; + uint64_t* h3 = (uint64_t*)ctx[3]->hash_state; + uint8_t* l4 = ctx[4]->long_state; + uint64_t* h4 = (uint64_t*)ctx[4]->hash_state; + + __m128i ax0 = (__m128ll){h0[0] ^ h0[4], h0[1] ^ h0[5]}; + __m128i bx0 = (__m128ll){h0[2] ^ h0[6], h0[3] ^ h0[7]}; + __m128i ax1 = (__m128ll){h1[0] ^ h1[4], h1[1] ^ h1[5]}; + __m128i bx1 = (__m128ll){h1[2] ^ h1[6], h1[3] ^ h1[7]}; + __m128i ax2 = (__m128ll){h2[0] ^ h2[4], h2[1] ^ h2[5]}; + __m128i bx2 = (__m128ll){h2[2] ^ h2[6], h2[3] ^ h2[7]}; + __m128i ax3 = (__m128ll){h3[0] ^ h3[4], h3[1] ^ h3[5]}; + __m128i bx3 = (__m128ll){h3[2] ^ h3[6], h3[3] ^ h3[7]}; + __m128i ax4 = (__m128ll){h4[0] ^ h4[4], h4[1] ^ h4[5]}; + __m128i bx4 = (__m128ll){h4[2] ^ h4[6], h4[3] ^ h4[7]}; + __m128i cx0 = (__m128ll){0, 0}; + __m128i cx1 = (__m128ll){0, 0}; + __m128i cx2 = (__m128ll){0, 0}; + __m128i cx3 = (__m128ll){0, 0}; + __m128i cx4 = (__m128ll){0, 0}; + + uint64_t idx0, idx1, idx2, idx3, idx4; + idx0 = ((uint64_t*)&ax0)[0]; + idx1 = ((uint64_t*)&ax1)[0]; + idx2 = ((uint64_t*)&ax2)[0]; + idx3 = ((uint64_t*)&ax3)[0]; + idx4 = ((uint64_t*)&ax4)[0]; + + for (size_t i = 0; i < ITERATIONS/2; i++) + { + uint64_t hi, lo; + __m128i *ptr0, *ptr1, *ptr2, *ptr3, *ptr4; + + // EVEN ROUND + CN_STEP1(ax0, bx0, cx0, l0, ptr0, idx0); + CN_STEP1(ax1, bx1, cx1, l1, ptr1, idx1); + CN_STEP1(ax2, bx2, cx2, l2, ptr2, idx2); + CN_STEP1(ax3, bx3, cx3, l3, ptr3, idx3); + CN_STEP1(ax4, bx4, cx4, l4, ptr4, idx4); + + CN_STEP2(ax0, bx0, cx0, l0, ptr0, idx0); + CN_STEP2(ax1, bx1, cx1, l1, ptr1, idx1); + CN_STEP2(ax2, bx2, cx2, l2, ptr2, idx2); + CN_STEP2(ax3, bx3, cx3, l3, ptr3, idx3); + CN_STEP2(ax4, bx4, cx4, l4, ptr4, idx4); + + CN_STEP3(ax0, bx0, cx0, l0, ptr0, idx0); + CN_STEP3(ax1, bx1, cx1, l1, ptr1, idx1); + CN_STEP3(ax2, bx2, cx2, l2, ptr2, idx2); + CN_STEP3(ax3, bx3, cx3, l3, ptr3, idx3); + CN_STEP3(ax4, bx4, cx4, l4, ptr4, idx4); + + 
CN_STEP4(ax0, bx0, cx0, l0, mc0, ptr0, idx0); + CN_STEP4(ax1, bx1, cx1, l1, mc1, ptr1, idx1); + CN_STEP4(ax2, bx2, cx2, l2, mc2, ptr2, idx2); + CN_STEP4(ax3, bx3, cx3, l3, mc3, ptr3, idx3); + CN_STEP4(ax4, bx4, cx4, l4, mc4, ptr4, idx4); + + // ODD ROUND + CN_STEP1(ax0, cx0, bx0, l0, ptr0, idx0); + CN_STEP1(ax1, cx1, bx1, l1, ptr1, idx1); + CN_STEP1(ax2, cx2, bx2, l2, ptr2, idx2); + CN_STEP1(ax3, cx3, bx3, l3, ptr3, idx3); + CN_STEP1(ax4, cx4, bx4, l4, ptr4, idx4); + + CN_STEP2(ax0, cx0, bx0, l0, ptr0, idx0); + CN_STEP2(ax1, cx1, bx1, l1, ptr1, idx1); + CN_STEP2(ax2, cx2, bx2, l2, ptr2, idx2); + CN_STEP2(ax3, cx3, bx3, l3, ptr3, idx3); + CN_STEP2(ax4, cx4, bx4, l4, ptr4, idx4); + + CN_STEP3(ax0, cx0, bx0, l0, ptr0, idx0); + CN_STEP3(ax1, cx1, bx1, l1, ptr1, idx1); + CN_STEP3(ax2, cx2, bx2, l2, ptr2, idx2); + CN_STEP3(ax3, cx3, bx3, l3, ptr3, idx3); + CN_STEP3(ax4, cx4, bx4, l4, ptr4, idx4); + + CN_STEP4(ax0, cx0, bx0, l0, mc0, ptr0, idx0); + CN_STEP4(ax1, cx1, bx1, l1, mc1, ptr1, idx1); + CN_STEP4(ax2, cx2, bx2, l2, mc2, ptr2, idx2); + CN_STEP4(ax3, cx3, bx3, l3, mc3, ptr3, idx3); + CN_STEP4(ax4, cx4, bx4, l4, mc4, ptr4, idx4); + } + + for (size_t i = 0; i < 5; i++) + { + if(BE_MODE) cn_implode_scratchpad_be<MEM, SOFT_AES, BE_MODE, ALGO>((__m128i*)ctx[i]->long_state, (__m128i*)ctx[i]->hash_state); + else cn_implode_scratchpad<MEM, SOFT_AES, BE_MODE, ALGO>((__m128i*)ctx[i]->long_state, (__m128i*)ctx[i]->hash_state); + keccakf((uint64_t*)ctx[i]->hash_state, 24); + extra_hashes[ctx[i]->hash_state[0] & 3](ctx[i]->hash_state, 200, (char*)output + 32 * i); + } +} diff --git a/xmrstak/backend/cpu/crypto/cryptonight_common.cpp b/xmrstak/backend/cpu/crypto/cryptonight_common.cpp index 17fa24b..8c30e2c 100644 --- a/xmrstak/backend/cpu/crypto/cryptonight_common.cpp +++ b/xmrstak/backend/cpu/crypto/cryptonight_common.cpp @@ -30,30 +30,18 @@ extern "C" } #include "xmrstak/backend/cryptonight.hpp" #include "cryptonight.h" -#include "cryptonight_aesni.h" +#include "cryptonight_altivec.h" #include "xmrstak/misc/console.hpp" #include "xmrstak/jconf.hpp" #include <stdio.h> #include <stdlib.h> -#ifdef __GNUC__ -#include <mm_malloc.h> -#else #include <malloc.h> -#endif // __GNUC__ -#if defined(__APPLE__) -#include <mach/vm_statistics.h> -#endif -#ifdef _WIN32 -#include <windows.h> -#include <ntsecapi.h> -#else #include <sys/mman.h> #include <errno.h> #include <string.h> -#endif // _WIN32 void do_blake_hash(const void* input, size_t len, char* output) { blake256_hash((uint8_t*)output, (const uint8_t*)input, len); @@ -73,190 +61,33 @@ void do_skein_hash(const void* input, size_t len, char* output) { void (* const extra_hashes[4])(const void *, size_t, char *) = {do_blake_hash, do_groestl_hash, do_jh_hash, do_skein_hash}; -#ifdef _WIN32 -#include "xmrstak/misc/uac.hpp" - -BOOL bRebootDesirable = FALSE; //If VirtualAlloc fails, suggest a reboot - -BOOL AddPrivilege(TCHAR* pszPrivilege) -{ - HANDLE hToken; - TOKEN_PRIVILEGES tp; - BOOL status; - - if (!OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY, &hToken)) - return FALSE; - - if (!LookupPrivilegeValue(NULL, pszPrivilege, &tp.Privileges[0].Luid)) - return FALSE; - - tp.PrivilegeCount = 1; - tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED; - status = AdjustTokenPrivileges(hToken, FALSE, &tp, 0, (PTOKEN_PRIVILEGES)NULL, 0); - - if (!status || (GetLastError() != ERROR_SUCCESS)) - return FALSE; - - CloseHandle(hToken); - return TRUE; -} - -BOOL AddLargePageRights() -{ - HANDLE hToken; - PTOKEN_USER user = NULL; - - if 
(OpenProcessToken(GetCurrentProcess(), TOKEN_QUERY, &hToken) == TRUE) - { - TOKEN_ELEVATION Elevation; - DWORD cbSize = sizeof(TOKEN_ELEVATION); - BOOL bIsElevated = FALSE; - - if (GetTokenInformation(hToken, TokenElevation, &Elevation, sizeof(Elevation), &cbSize)) - bIsElevated = Elevation.TokenIsElevated; - - DWORD size = 0; - GetTokenInformation(hToken, TokenUser, NULL, 0, &size); - - if (size > 0 && bIsElevated) - { - user = (PTOKEN_USER)LocalAlloc(LPTR, size); - GetTokenInformation(hToken, TokenUser, user, size, &size); - } - - CloseHandle(hToken); - } - - if (!user) - return FALSE; - - LSA_HANDLE handle; - LSA_OBJECT_ATTRIBUTES attributes; - ZeroMemory(&attributes, sizeof(attributes)); - - BOOL result = FALSE; - if (LsaOpenPolicy(NULL, &attributes, POLICY_ALL_ACCESS, &handle) == 0) - { - LSA_UNICODE_STRING lockmem; - lockmem.Buffer = L"SeLockMemoryPrivilege"; - lockmem.Length = 42; - lockmem.MaximumLength = 44; - - PLSA_UNICODE_STRING rights = NULL; - ULONG cnt = 0; - BOOL bHasRights = FALSE; - if (LsaEnumerateAccountRights(handle, user->User.Sid, &rights, &cnt) == 0) - { - for (size_t i = 0; i < cnt; i++) - { - if (rights[i].Length == lockmem.Length && - memcmp(rights[i].Buffer, lockmem.Buffer, 42) == 0) - { - bHasRights = TRUE; - break; - } - } - - LsaFreeMemory(rights); - } - - if(!bHasRights) - result = LsaAddAccountRights(handle, user->User.Sid, &lockmem, 1) == 0; - - LsaClose(handle); - } - - LocalFree(user); - return result; -} -#endif size_t cryptonight_init(size_t use_fast_mem, size_t use_mlock, alloc_msg* msg) { -#ifdef _WIN32 - if(use_fast_mem == 0) - return 1; - - if(AddPrivilege(TEXT("SeLockMemoryPrivilege")) == 0) - { - printer::inst()->print_msg(L0, "Elevating because we need to set up fast memory privileges."); - RequestElevation(); - - if(AddLargePageRights()) - { - msg->warning = "Added SeLockMemoryPrivilege to the current account. You need to reboot for it to work"; - bRebootDesirable = TRUE; - } - else - msg->warning = "Obtaining SeLockMemoryPrivilege failed."; - - return 0; - } - - bRebootDesirable = TRUE; - return 1; -#else return 1; -#endif // _WIN32 } cryptonight_ctx* cryptonight_alloc_ctx(size_t use_fast_mem, size_t use_mlock, alloc_msg* msg) { size_t hashMemSize = cn_select_memory(::jconf::inst()->GetMiningAlgo()); - cryptonight_ctx* ptr = (cryptonight_ctx*)_mm_malloc(sizeof(cryptonight_ctx), 4096); + cryptonight_ctx* ptr = (cryptonight_ctx*)malloc(sizeof(cryptonight_ctx)); if(use_fast_mem == 0) { // use 2MiB aligned memory - ptr->long_state = (uint8_t*)_mm_malloc(hashMemSize, hashMemSize); + ptr->long_state = (uint8_t*)malloc(hashMemSize); ptr->ctx_info[0] = 0; ptr->ctx_info[1] = 0; return ptr; } -#ifdef _WIN32 - SIZE_T iLargePageMin = GetLargePageMinimum(); - - if(hashMemSize > iLargePageMin) - iLargePageMin *= 2; - - ptr->long_state = (uint8_t*)VirtualAlloc(NULL, iLargePageMin, - MEM_COMMIT | MEM_RESERVE | MEM_LARGE_PAGES, PAGE_READWRITE); - - if(ptr->long_state == NULL) - { - _mm_free(ptr); - if(bRebootDesirable) - msg->warning = "VirtualAlloc failed. 
Reboot might help."; - else - msg->warning = "VirtualAlloc failed."; - return NULL; - } - else - { - ptr->ctx_info[0] = 1; - return ptr; - } -#else - -#if defined(__APPLE__) - ptr->long_state = (uint8_t*)mmap(0, hashMemSize, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANON, VM_FLAGS_SUPERPAGE_SIZE_2MB, 0); -#elif defined(__FreeBSD__) - ptr->long_state = (uint8_t*)mmap(0, hashMemSize, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS | MAP_ALIGNED_SUPER | MAP_PREFAULT_READ, -1, 0); -#elif defined(__OpenBSD__) - ptr->long_state = (uint8_t*)mmap(0, hashMemSize, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANON, -1, 0); -#else ptr->long_state = (uint8_t*)mmap(0, hashMemSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_POPULATE, 0, 0); -#endif if (ptr->long_state == MAP_FAILED) { - _mm_free(ptr); + free(ptr); msg->warning = "mmap failed"; return NULL; } @@ -273,7 +104,6 @@ cryptonight_ctx* cryptonight_alloc_ctx(size_t use_fast_mem, size_t use_mlock, al ptr->ctx_info[1] = 1; return ptr; -#endif // _WIN32 } void cryptonight_free_ctx(cryptonight_ctx* ctx) @@ -282,16 +112,12 @@ void cryptonight_free_ctx(cryptonight_ctx* ctx) if(ctx->ctx_info[0] != 0) { -#ifdef _WIN32 - VirtualFree(ctx->long_state, 0, MEM_RELEASE); -#else if(ctx->ctx_info[1] != 0) munlock(ctx->long_state, hashMemSize); munmap(ctx->long_state, hashMemSize); -#endif // _WIN32 } else - _mm_free(ctx->long_state); + free(ctx->long_state); - _mm_free(ctx); + free(ctx); } diff --git a/xmrstak/backend/cpu/crypto/soft_aes.hpp b/xmrstak/backend/cpu/crypto/soft_aes.hpp index d3f4637..8cbe9d5 100644 --- a/xmrstak/backend/cpu/crypto/soft_aes.hpp +++ b/xmrstak/backend/cpu/crypto/soft_aes.hpp @@ -21,105 +21,24 @@ * */ -/* +/*,h * Parts of this file are originally copyright (c) 2014-2017, The Monero Project */ -#pragma once - -#ifdef __GNUC__ -#include <x86intrin.h> -#else -#include <intrin.h> -#endif // __GNUC__ - -#include <inttypes.h> - -#define saes_data(w) {\ - w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), w(0xc5),\ - w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), w(0xab), w(0x76),\ - w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), w(0x59), w(0x47), w(0xf0),\ - w(0xad), w(0xd4), w(0xa2), w(0xaf), w(0x9c), w(0xa4), w(0x72), w(0xc0),\ - w(0xb7), w(0xfd), w(0x93), w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc),\ - w(0x34), w(0xa5), w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15),\ - w(0x04), w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a),\ - w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), w(0x75),\ - w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), w(0x5a), w(0xa0),\ - w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), w(0xe3), w(0x2f), w(0x84),\ - w(0x53), w(0xd1), w(0x00), w(0xed), w(0x20), w(0xfc), w(0xb1), w(0x5b),\ - w(0x6a), w(0xcb), w(0xbe), w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf),\ - w(0xd0), w(0xef), w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85),\ - w(0x45), w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8),\ - w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), w(0xf5),\ - w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), w(0xf3), w(0xd2),\ - w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), w(0x97), w(0x44), w(0x17),\ - w(0xc4), w(0xa7), w(0x7e), w(0x3d), w(0x64), w(0x5d), w(0x19), w(0x73),\ - w(0x60), w(0x81), w(0x4f), w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88),\ - w(0x46), w(0xee), w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb),\ - w(0xe0), w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c),\ - 
w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), w(0x79),\ - w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), w(0x4e), w(0xa9),\ - w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), w(0x7a), w(0xae), w(0x08),\ - w(0xba), w(0x78), w(0x25), w(0x2e), w(0x1c), w(0xa6), w(0xb4), w(0xc6),\ - w(0xe8), w(0xdd), w(0x74), w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a),\ - w(0x70), w(0x3e), w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e),\ - w(0x61), w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e),\ - w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), w(0x94),\ - w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), w(0x28), w(0xdf),\ - w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), w(0xe6), w(0x42), w(0x68),\ - w(0x41), w(0x99), w(0x2d), w(0x0f), w(0xb0), w(0x54), w(0xbb), w(0x16) } - -#define SAES_WPOLY 0x011b +#include <altivec.h> +#undef vector +#undef pixel +#undef bool +typedef __vector unsigned char __m128i; +typedef __vector unsigned long long __m128ll; -#define saes_b2w(b0, b1, b2, b3) (((uint32_t)(b3) << 24) | \ - ((uint32_t)(b2) << 16) | ((uint32_t)(b1) << 8) | (b0)) - -#define saes_f2(x) ((x<<1) ^ (((x>>7) & 1) * SAES_WPOLY)) -#define saes_f3(x) (saes_f2(x) ^ x) -#define saes_h0(x) (x) - -#define saes_u0(p) saes_b2w(saes_f2(p), p, p, saes_f3(p)) -#define saes_u1(p) saes_b2w(saes_f3(p), saes_f2(p), p, p) -#define saes_u2(p) saes_b2w( p, saes_f3(p), saes_f2(p), p) -#define saes_u3(p) saes_b2w( p, p, saes_f3(p), saes_f2(p)) - -alignas(16) const uint32_t saes_table[4][256] = { saes_data(saes_u0), saes_data(saes_u1), saes_data(saes_u2), saes_data(saes_u3) }; -alignas(16) const uint8_t saes_sbox[256] = saes_data(saes_h0); - -static inline __m128i soft_aesenc(__m128i in, __m128i key) -{ - uint32_t x0, x1, x2, x3; - x0 = _mm_cvtsi128_si32(in); - x1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(in, 0x55)); - x2 = _mm_cvtsi128_si32(_mm_shuffle_epi32(in, 0xAA)); - x3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(in, 0xFF)); - - __m128i out = _mm_set_epi32( - (saes_table[0][x3 & 0xff] ^ saes_table[1][(x0 >> 8) & 0xff] ^ saes_table[2][(x1 >> 16) & 0xff] ^ saes_table[3][x2 >> 24]), - (saes_table[0][x2 & 0xff] ^ saes_table[1][(x3 >> 8) & 0xff] ^ saes_table[2][(x0 >> 16) & 0xff] ^ saes_table[3][x1 >> 24]), - (saes_table[0][x1 & 0xff] ^ saes_table[1][(x2 >> 8) & 0xff] ^ saes_table[2][(x3 >> 16) & 0xff] ^ saes_table[3][x0 >> 24]), - (saes_table[0][x0 & 0xff] ^ saes_table[1][(x1 >> 8) & 0xff] ^ saes_table[2][(x2 >> 16) & 0xff] ^ saes_table[3][x3 >> 24])); - - return _mm_xor_si128(out, key); -} - -static inline uint32_t sub_word(uint32_t key) -{ - return (saes_sbox[key >> 24 ] << 24) | - (saes_sbox[(key >> 16) & 0xff] << 16 ) | - (saes_sbox[(key >> 8) & 0xff] << 8 ) | - saes_sbox[key & 0xff]; -} - -#ifdef __clang__ -static inline uint32_t _rotr(uint32_t value, uint32_t amount) +static inline __m128i soft_aeskeygenassist(__m128i key, uint8_t rcon) { - return (value >> amount) | (value << ((32 - amount) & 31)); + key = __builtin_crypto_vsbox(vec_perm(key,key,(__m128i){0x4,0x5,0x6,0x7, 0x5,0x6,0x7,0x4, 0xc,0xd,0xe,0xf, 0xd,0xe,0xf,0xc})); + return vec_xor(key,(__m128i){0,0,0,0, rcon,0,0,0, 0,0,0,0, rcon,0,0,0}); } -#endif -static inline __m128i soft_aeskeygenassist(__m128i key, uint8_t rcon) +static inline __m128i soft_aeskeygenassist_be(__m128i key, uint8_t rcon) { - uint32_t X1 = sub_word(_mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0x55))); - uint32_t X3 = sub_word(_mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0xFF))); - return _mm_set_epi32(_rotr(X3, 8) ^ rcon, X3,_rotr(X1, 8) ^ rcon, X1); + 
key = __builtin_crypto_vsbox(vec_perm(key,key,(__m128i){0x3,0x0,0x1,0x2, 0x0,0x1,0x2,0x3, 0xb,0x8,0x9,0xa, 0x8,0x9,0xa,0xb})); + return vec_xor(key,(__m128i){0,0,0,rcon, 0,0,0,0, 0,0,0,rcon, 0,0,0,0}); } diff --git a/xmrstak/backend/cpu/jconf.cpp b/xmrstak/backend/cpu/jconf.cpp index 6e709bd..5ee6676 100644 --- a/xmrstak/backend/cpu/jconf.cpp +++ b/xmrstak/backend/cpu/jconf.cpp @@ -30,13 +30,6 @@ #include <stdlib.h> #include <string> -#ifdef _WIN32 -#define strcasecmp _stricmp -#include <intrin.h> -#else -#include <cpuid.h> -#endif - namespace xmrstak { @@ -110,7 +103,7 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg) const Value *mode, *no_prefetch, *aff; mode = GetObjectMember(oThdConf, "low_power_mode"); - no_prefetch = GetObjectMember(oThdConf, "no_prefetch"); + no_prefetch = GetObjectMember(oThdConf, "be_mode"); aff = GetObjectMember(oThdConf, "affine_to_cpu"); if(mode == nullptr || no_prefetch == nullptr || aff == nullptr) diff --git a/xmrstak/backend/cpu/minethd.cpp b/xmrstak/backend/cpu/minethd.cpp index e263aca..d5324f5 100644 --- a/xmrstak/backend/cpu/minethd.cpp +++ b/xmrstak/backend/cpu/minethd.cpp @@ -21,7 +21,7 @@ * */ -#include "crypto/cryptonight_aesni.h" +#include "crypto/cryptonight_altivec.h" #include "xmrstak/misc/console.hpp" #include "xmrstak/backend/iBackend.hpp" @@ -319,13 +319,9 @@ std::vector<iBackend*> minethd::thread_starter(uint32_t threadOffset, miner_work for (i = 0; i < n; i++) { jconf::inst()->GetThreadConfig(i, cfg); - + if(cfg.iCpuAff >= 0) { -#if defined(__APPLE__) - printer::inst()->print_msg(L1, "WARNING on macOS thread affinity is only advisory."); -#endif - printer::inst()->print_msg(L1, "Starting %dx thread, affinity: %d.", cfg.iMultiway, (int)cfg.iCpuAff); } else @@ -392,7 +388,7 @@ minethd::cn_hash_fun minethd::func_selector(bool bHaveAes, bool bNoPrefetch, xmr std::bitset<2> digit; digit.set(0, !bHaveAes); - digit.set(1, !bNoPrefetch); + digit.set(1, bNoPrefetch); return func_table[ algv << 2 | digit.to_ulong() ]; } |
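With `no_prefetch` repurposed as `be_mode`, the final `minethd.cpp` hunk drops the inversion when building the function-table index: bit 0 still selects the soft-AES fallback, and bit 1 now carries `be_mode` directly. A small sketch of that selection logic (names other than `digit` are illustrative, not the miner's real table):

```cpp
#include <bitset>
#include <cstdio>

// Mirror of minethd::func_selector's indexing after this commit: four
// kernel variants per algorithm, addressed by algv << 2 | digit.
static unsigned select_slot(unsigned algv, bool bHaveAes, bool bBeMode)
{
    std::bitset<2> digit;
    digit.set(0, !bHaveAes); // 1 -> software AES
    digit.set(1, bBeMode);   // 1 -> big-endian kernels (was !no_prefetch)
    return algv << 2 | digit.to_ulong();
}

int main()
{
    for (bool be : {false, true})
        for (bool aes : {true, false})
            printf("hw_aes=%d be_mode=%d -> slot %u\n",
                   aes, be, select_slot(0, aes, be));
    return 0;
}
```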