diff options
-rw-r--r-- | .appveyor.yml | 13 | ||||
-rw-r--r-- | .travis.yml | 42 | ||||
-rw-r--r-- | CMakeLists.txt | 278 | ||||
-rw-r--r-- | INSTALL.md | 201 | ||||
-rw-r--r-- | README.md | 170 | ||||
-rw-r--r-- | autoAdjust.hpp | 154 | ||||
-rw-r--r-- | cli-miner.cpp | 225 | ||||
-rw-r--r-- | doc/FAQ.md | 61 | ||||
-rw-r--r-- | doc/compile.md | 81 | ||||
-rw-r--r-- | doc/compile_FreeBSD.md (renamed from FREEBSDCOMPILE.md) | 6 | ||||
-rw-r--r-- | doc/compile_Linux.md (renamed from LINUXCOMPILE.md) | 6 | ||||
-rw-r--r-- | doc/compile_Windows.md (renamed from WINCOMPILE.md) | 8 | ||||
-rw-r--r-- | doc/tuning.md | 67 | ||||
-rw-r--r-- | doc/usage.md | 54 | ||||
-rw-r--r-- | minethd.h | 144 | ||||
-rw-r--r-- | version.h | 4 | ||||
-rw-r--r-- | xmr-stak-cpu.cbp | 167 | ||||
-rw-r--r-- | xmrstak/backend/amd/amd_gpu/gpu.cpp | 887 | ||||
-rw-r--r-- | xmrstak/backend/amd/amd_gpu/gpu.hpp | 51 | ||||
-rw-r--r-- | xmrstak/backend/amd/amd_gpu/opencl/blake256.cl | 93 | ||||
-rw-r--r-- | xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl | 860 | ||||
-rw-r--r-- | xmrstak/backend/amd/amd_gpu/opencl/groestl256.cl | 295 | ||||
-rw-r--r-- | xmrstak/backend/amd/amd_gpu/opencl/jh.cl | 274 | ||||
-rw-r--r-- | xmrstak/backend/amd/amd_gpu/opencl/wolf-aes.cl | 90 | ||||
-rw-r--r-- | xmrstak/backend/amd/amd_gpu/opencl/wolf-skein.cl | 114 | ||||
-rw-r--r-- | xmrstak/backend/amd/autoAdjust.hpp | 111 | ||||
-rw-r--r-- | xmrstak/backend/amd/config.tpl | 29 | ||||
-rw-r--r-- | xmrstak/backend/amd/jconf.cpp | 259 | ||||
-rw-r--r-- | xmrstak/backend/amd/jconf.hpp | 46 | ||||
-rw-r--r-- | xmrstak/backend/amd/minethd.cpp | 234 | ||||
-rw-r--r-- | xmrstak/backend/amd/minethd.hpp | 54 | ||||
-rw-r--r-- | xmrstak/backend/backendConnector.cpp | 103 | ||||
-rw-r--r-- | xmrstak/backend/backendConnector.hpp | 21 | ||||
-rw-r--r-- | xmrstak/backend/cpu/autoAdjust.hpp | 174 | ||||
-rw-r--r-- | xmrstak/backend/cpu/autoAdjustHwloc.hpp (renamed from autoAdjustHwloc.hpp) | 67 | ||||
-rw-r--r-- | xmrstak/backend/cpu/config.tpl | 32 | ||||
-rw-r--r-- | xmrstak/backend/cpu/crypto/c_blake256.c (renamed from crypto/c_blake256.c) | 0 | ||||
-rw-r--r-- | xmrstak/backend/cpu/crypto/c_blake256.h (renamed from crypto/c_blake256.h) | 0 | ||||
-rw-r--r-- | xmrstak/backend/cpu/crypto/c_groestl.c (renamed from crypto/c_groestl.c) | 0 | ||||
-rw-r--r-- | xmrstak/backend/cpu/crypto/c_groestl.h (renamed from crypto/c_groestl.h) | 0 | ||||
-rw-r--r-- | xmrstak/backend/cpu/crypto/c_jh.c (renamed from crypto/c_jh.c) | 0 | ||||
-rw-r--r-- | xmrstak/backend/cpu/crypto/c_jh.h (renamed from crypto/c_jh.h) | 0 | ||||
-rw-r--r-- | xmrstak/backend/cpu/crypto/c_keccak.c (renamed from crypto/c_keccak.c) | 0 | ||||
-rw-r--r-- | xmrstak/backend/cpu/crypto/c_keccak.h (renamed from crypto/c_keccak.h) | 0 | ||||
-rw-r--r-- | xmrstak/backend/cpu/crypto/c_skein.c (renamed from crypto/c_skein.c) | 0 | ||||
-rw-r--r-- | xmrstak/backend/cpu/crypto/c_skein.h (renamed from crypto/c_skein.h) | 0 | ||||
-rw-r--r-- | xmrstak/backend/cpu/crypto/cryptonight.h (renamed from crypto/cryptonight.h) | 0 | ||||
-rw-r--r-- | xmrstak/backend/cpu/crypto/cryptonight_aesni.h (renamed from crypto/cryptonight_aesni.h) | 0 | ||||
-rw-r--r-- | xmrstak/backend/cpu/crypto/cryptonight_common.cpp (renamed from crypto/cryptonight_common.cpp) | 0 | ||||
-rw-r--r-- | xmrstak/backend/cpu/crypto/groestl_tables.h (renamed from crypto/groestl_tables.h) | 0 | ||||
-rw-r--r-- | xmrstak/backend/cpu/crypto/hash.h (renamed from crypto/hash.h) | 0 | ||||
-rw-r--r-- | xmrstak/backend/cpu/crypto/int-util.h (renamed from crypto/int-util.h) | 0 | ||||
-rw-r--r-- | xmrstak/backend/cpu/crypto/skein_port.h (renamed from crypto/skein_port.h) | 0 | ||||
-rw-r--r-- | xmrstak/backend/cpu/crypto/soft_aes.c (renamed from crypto/soft_aes.c) | 0 | ||||
-rw-r--r-- | xmrstak/backend/cpu/hwlocMemory.hpp (renamed from hwlocMemory.hpp) | 2 | ||||
-rw-r--r-- | xmrstak/backend/cpu/jconf.cpp | 255 | ||||
-rw-r--r-- | xmrstak/backend/cpu/jconf.hpp | 46 | ||||
-rw-r--r-- | xmrstak/backend/cpu/minethd.cpp (renamed from minethd.cpp) | 226 | ||||
-rw-r--r-- | xmrstak/backend/cpu/minethd.hpp | 59 | ||||
-rw-r--r-- | xmrstak/backend/globalStates.cpp | 51 | ||||
-rw-r--r-- | xmrstak/backend/globalStates.hpp | 38 | ||||
-rw-r--r-- | xmrstak/backend/iBackend.hpp | 53 | ||||
-rw-r--r-- | xmrstak/backend/miner_work.hpp | 85 | ||||
-rw-r--r-- | xmrstak/backend/nvidia/autoAdjust.hpp | 113 | ||||
-rw-r--r-- | xmrstak/backend/nvidia/config.tpl | 28 | ||||
-rw-r--r-- | xmrstak/backend/nvidia/jconf.cpp | 270 | ||||
-rw-r--r-- | xmrstak/backend/nvidia/jconf.hpp | 51 | ||||
-rw-r--r-- | xmrstak/backend/nvidia/minethd.cpp | 272 | ||||
-rw-r--r-- | xmrstak/backend/nvidia/minethd.hpp | 54 | ||||
-rw-r--r-- | xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp | 48 | ||||
-rw-r--r-- | xmrstak/backend/nvidia/nvcc_code/cuda_aes.hpp | 305 | ||||
-rw-r--r-- | xmrstak/backend/nvidia/nvcc_code/cuda_blake.hpp | 193 | ||||
-rw-r--r-- | xmrstak/backend/nvidia/nvcc_code/cuda_core.cu | 343 | ||||
-rw-r--r-- | xmrstak/backend/nvidia/nvcc_code/cuda_device.hpp | 30 | ||||
-rw-r--r-- | xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu | 367 | ||||
-rw-r--r-- | xmrstak/backend/nvidia/nvcc_code/cuda_extra.hpp | 104 | ||||
-rw-r--r-- | xmrstak/backend/nvidia/nvcc_code/cuda_groestl.hpp | 357 | ||||
-rw-r--r-- | xmrstak/backend/nvidia/nvcc_code/cuda_jh.hpp | 301 | ||||
-rw-r--r-- | xmrstak/backend/nvidia/nvcc_code/cuda_keccak.hpp | 197 | ||||
-rw-r--r-- | xmrstak/backend/nvidia/nvcc_code/cuda_skein.hpp | 347 | ||||
-rw-r--r-- | xmrstak/backend/plugin.hpp | 99 | ||||
-rw-r--r-- | xmrstak/cli/cli-miner.cpp | 370 | ||||
-rw-r--r-- | xmrstak/config.tpl (renamed from config.txt) | 345 | ||||
-rw-r--r-- | xmrstak/donate-level.hpp (renamed from donate-level.h) | 0 | ||||
-rw-r--r-- | xmrstak/http/httpd.cpp (renamed from httpd.cpp) | 16 | ||||
-rw-r--r-- | xmrstak/http/httpd.hpp (renamed from httpd.h) | 2 | ||||
-rw-r--r-- | xmrstak/http/webdesign.cpp (renamed from webdesign.cpp) | 2 | ||||
-rw-r--r-- | xmrstak/http/webdesign.hpp (renamed from webdesign.h) | 0 | ||||
-rw-r--r-- | xmrstak/jconf.cpp (renamed from jconf.cpp) | 194 | ||||
-rw-r--r-- | xmrstak/jconf.hpp (renamed from jconf.h) | 27 | ||||
-rw-r--r-- | xmrstak/misc/configEditor.hpp | 57 | ||||
-rw-r--r-- | xmrstak/misc/console.cpp (renamed from console.cpp) | 21 | ||||
-rw-r--r-- | xmrstak/misc/console.hpp (renamed from console.h) | 13 | ||||
-rw-r--r-- | xmrstak/misc/environment.hpp | 46 | ||||
-rw-r--r-- | xmrstak/misc/executor.cpp (renamed from executor.cpp) | 62 | ||||
-rw-r--r-- | xmrstak/misc/executor.hpp (renamed from executor.h) | 30 | ||||
-rw-r--r-- | xmrstak/misc/jext.hpp (renamed from jext.h) | 3 | ||||
-rw-r--r-- | xmrstak/misc/telemetry.cpp | 109 | ||||
-rw-r--r-- | xmrstak/misc/telemetry.hpp | 24 | ||||
-rw-r--r-- | xmrstak/misc/thdq.hpp (renamed from thdq.hpp) | 0 | ||||
-rw-r--r-- | xmrstak/net/jpsock.cpp (renamed from jpsock.cpp) | 19 | ||||
-rw-r--r-- | xmrstak/net/jpsock.hpp (renamed from jpsock.h) | 4 | ||||
-rw-r--r-- | xmrstak/net/msgstruct.hpp (renamed from msgstruct.h) | 4 | ||||
-rw-r--r-- | xmrstak/net/socket.cpp (renamed from socket.cpp) | 10 | ||||
-rw-r--r-- | xmrstak/net/socket.hpp (renamed from socket.h) | 4 | ||||
-rw-r--r-- | xmrstak/net/socks.hpp (renamed from socks.h) | 2 | ||||
-rw-r--r-- | xmrstak/params.hpp | 50 | ||||
-rw-r--r-- | xmrstak/rapidjson/allocators.h (renamed from rapidjson/allocators.h) | 0 | ||||
-rw-r--r-- | xmrstak/rapidjson/document.h (renamed from rapidjson/document.h) | 0 | ||||
-rw-r--r-- | xmrstak/rapidjson/encodedstream.h (renamed from rapidjson/encodedstream.h) | 0 | ||||
-rw-r--r-- | xmrstak/rapidjson/encodings.h (renamed from rapidjson/encodings.h) | 0 | ||||
-rw-r--r-- | xmrstak/rapidjson/error/en.h (renamed from rapidjson/error/en.h) | 0 | ||||
-rw-r--r-- | xmrstak/rapidjson/error/error.h (renamed from rapidjson/error/error.h) | 0 | ||||
-rw-r--r-- | xmrstak/rapidjson/filereadstream.h (renamed from rapidjson/filereadstream.h) | 0 | ||||
-rw-r--r-- | xmrstak/rapidjson/filewritestream.h (renamed from rapidjson/filewritestream.h) | 0 | ||||
-rw-r--r-- | xmrstak/rapidjson/fwd.h (renamed from rapidjson/fwd.h) | 0 | ||||
-rw-r--r-- | xmrstak/rapidjson/internal/biginteger.h (renamed from rapidjson/internal/biginteger.h) | 0 | ||||
-rw-r--r-- | xmrstak/rapidjson/internal/diyfp.h (renamed from rapidjson/internal/diyfp.h) | 0 | ||||
-rw-r--r-- | xmrstak/rapidjson/internal/dtoa.h (renamed from rapidjson/internal/dtoa.h) | 0 | ||||
-rw-r--r-- | xmrstak/rapidjson/internal/ieee754.h (renamed from rapidjson/internal/ieee754.h) | 0 | ||||
-rw-r--r-- | xmrstak/rapidjson/internal/itoa.h (renamed from rapidjson/internal/itoa.h) | 0 | ||||
-rw-r--r-- | xmrstak/rapidjson/internal/meta.h (renamed from rapidjson/internal/meta.h) | 0 | ||||
-rw-r--r-- | xmrstak/rapidjson/internal/pow10.h (renamed from rapidjson/internal/pow10.h) | 0 | ||||
-rw-r--r-- | xmrstak/rapidjson/internal/regex.h (renamed from rapidjson/internal/regex.h) | 0 | ||||
-rw-r--r-- | xmrstak/rapidjson/internal/stack.h (renamed from rapidjson/internal/stack.h) | 0 | ||||
-rw-r--r-- | xmrstak/rapidjson/internal/strfunc.h (renamed from rapidjson/internal/strfunc.h) | 0 | ||||
-rw-r--r-- | xmrstak/rapidjson/internal/strtod.h (renamed from rapidjson/internal/strtod.h) | 0 | ||||
-rw-r--r-- | xmrstak/rapidjson/internal/swap.h (renamed from rapidjson/internal/swap.h) | 0 | ||||
-rw-r--r-- | xmrstak/rapidjson/istreamwrapper.h (renamed from rapidjson/istreamwrapper.h) | 0 | ||||
-rw-r--r-- | xmrstak/rapidjson/memorybuffer.h (renamed from rapidjson/memorybuffer.h) | 0 | ||||
-rw-r--r-- | xmrstak/rapidjson/memorystream.h (renamed from rapidjson/memorystream.h) | 0 | ||||
-rw-r--r-- | xmrstak/rapidjson/msinttypes/inttypes.h (renamed from rapidjson/msinttypes/inttypes.h) | 0 | ||||
-rw-r--r-- | xmrstak/rapidjson/msinttypes/stdint.h (renamed from rapidjson/msinttypes/stdint.h) | 0 | ||||
-rw-r--r-- | xmrstak/rapidjson/ostreamwrapper.h (renamed from rapidjson/ostreamwrapper.h) | 0 | ||||
-rw-r--r-- | xmrstak/rapidjson/pointer.h (renamed from rapidjson/pointer.h) | 0 | ||||
-rw-r--r-- | xmrstak/rapidjson/prettywriter.h (renamed from rapidjson/prettywriter.h) | 0 | ||||
-rw-r--r-- | xmrstak/rapidjson/rapidjson.h (renamed from rapidjson/rapidjson.h) | 0 | ||||
-rw-r--r-- | xmrstak/rapidjson/reader.h (renamed from rapidjson/reader.h) | 0 | ||||
-rw-r--r-- | xmrstak/rapidjson/schema.h (renamed from rapidjson/schema.h) | 0 | ||||
-rw-r--r-- | xmrstak/rapidjson/stream.h (renamed from rapidjson/stream.h) | 0 | ||||
-rw-r--r-- | xmrstak/rapidjson/stringbuffer.h (renamed from rapidjson/stringbuffer.h) | 0 | ||||
-rw-r--r-- | xmrstak/rapidjson/writer.h (renamed from rapidjson/writer.h) | 0 | ||||
-rw-r--r-- | xmrstak/version.hpp | 4 |
143 files changed, 9755 insertions, 1432 deletions
diff --git a/.appveyor.yml b/.appveyor.yml index 381e544..02ce827 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -5,7 +5,7 @@ image: Visual Studio 2017 platform: x64 # clone directory -clone_folder: c:\xmr-stak-cpu +clone_folder: c:\xmr-stak install: - mkdir c:\xmr-stak-dep @@ -14,15 +14,14 @@ install: build_script: - call "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\Common7\Tools\VsMSBuildCmd.bat" - - cd c:\xmr-stak-cpu + - cd c:\xmr-stak - mkdir build - cd build - set CMAKE_PREFIX_PATH=C:\xmr-stak-dep\hwloc;C:\xmr-stak-dep\libmicrohttpd;C:\xmr-stak-dep\openssl; - - cmake -G "Visual Studio 15 2017 Win64" -T v141,host=x64 .. - - msbuild xmr-stak-cpu.sln /p:Configuration=Release + - cmake -G "Visual Studio 15 2017 Win64" -T v140,host=x64 -DCUDA_ENABLE=OFF -DOpenCL_ENABLE=OFF .. + - cmake --build . --config Release --target install test_script: - - cd c:\xmr-stak-cpu\build\bin\Release + - cd c:\xmr-stak\build\bin\Release - dir - - copy ..\..\..\config.txt . -# - xmr-stak-cpu.exe +# - xmr-stak.exe diff --git a/.travis.yml b/.travis.yml index 37b7749..def7fee 100644 --- a/.travis.yml +++ b/.travis.yml @@ -6,6 +6,11 @@ sudo: false language: cpp +cache: + apt: true + directories: + - $HOME/.cache/cuda + addons: apt: packages: &default_packages @@ -14,6 +19,10 @@ addons: - libssl-dev - libhwloc-dev +env: + global: + - CUDA_ROOT: $HOME/.cache/cuda + matrix: include: - os: linux @@ -29,6 +38,7 @@ matrix: env: - CMAKE_CXX_COMPILER=g++-5 - CMAKE_C_COMPILER=gcc-5 + - XMRSTAK_CMAKE_FLAGS="-DCUDA_ARCH=30 -DOpenCL_ENABLE=OFF" - os: linux compiler: gcc @@ -43,6 +53,7 @@ matrix: env: - CMAKE_CXX_COMPILER=g++-6 - CMAKE_C_COMPILER=gcc-6 + - XMRSTAK_CMAKE_FLAGS="-DCUDA_ENABLE=OFF -DOpenCL_ENABLE=OFF" - os: linux compiler: gcc @@ -57,19 +68,40 @@ matrix: env: - CMAKE_CXX_COMPILER=g++-7 - CMAKE_C_COMPILER=gcc-7 + - XMRSTAK_CMAKE_FLAGS="-DCUDA_ENABLE=OFF -DOpenCL_ENABLE=OFF" - os: osx compiler: gcc + env: + - XMRSTAK_CMAKE_FLAGS="-DCUDA_ENABLE=OFF 
-DOpenCL_ENABLE=OFF" before_install: - if [ $TRAVIS_OS_NAME = osx ]; then brew tap homebrew/science; fi + - export PATH=$CUDA_ROOT/bin:$PATH + +install: + + # CUDA + - if [ $TRAVIS_OS_NAME != osx ]; then + NVCC_FOUND=$(which nvcc >/dev/null && { echo 0; } || { echo 1; }); + if [ $NVCC_FOUND -ne 0 ]; then + mkdir -p $CUDA_ROOT && + cd $CUDA_ROOT && + travis_retry wget https://developer.nvidia.com/compute/cuda/8.0/prod/local_installers/cuda_8.0.44_linux-run && + ls -la && + chmod u+x *-run && + ./cuda_8.0.44_linux-run --silent --toolkit --toolkitpath=$CUDA_ROOT && + rm -rf ./cuda_8.0.44_linux-run $CUDA_ROOT/{samples,jre,doc,share} && + cd -; + fi + fi; script: - if [ $TRAVIS_OS_NAME = osx ]; then brew install hwloc; - cmake -DMICROHTTPD_ENABLE=OFF -DOPENSSL_ROOT_DIR=/usr/local/opt/openssl .; - else - cmake -D CMAKE_C_COMPILER=${CMAKE_C_COMPILER} -D CMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} .; + cmake -DMICROHTTPD_ENABLE=OFF -DOPENSSL_ROOT_DIR=/usr/local/opt/openssl ${XMRSTAK_CMAKE_FLAGS} .; + else + cmake -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} ${XMRSTAK_CMAKE_FLAGS} .; fi; - - make VERBOSE=1 - - ./bin/xmr-stak-cpu -c ./config.txt + - make VERBOSE=1 install + - ./bin/xmr-stak --help diff --git a/CMakeLists.txt b/CMakeLists.txt index ac01604..3c7a585 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -11,6 +11,8 @@ if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT) set(CMAKE_INSTALL_PREFIX "${CMAKE_BINARY_DIR}" CACHE PATH "install prefix" FORCE) endif(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT) +# help to find cuda on systems with a software module system +list(APPEND CMAKE_PREFIX_PATH "$ENV{CUDA_ROOT}") # allow user to extent CMAKE_PREFIX_PATH via environment variable list(APPEND CMAKE_PREFIX_PATH "$ENV{CMAKE_PREFIX_PATH}") @@ -38,12 +40,159 @@ set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "${BUILD_TYPE}") option(CMAKE_LINK_STATIC "link as much as possible libraries static" OFF) 
################################################################################ -# Find PThreads +# Find CUDA ################################################################################ +#option(CUDA_USE_STATIC_CUDA_RUNTIME "Use the static version of the CUDA runtime library if available" OFF) +#set(CUDA_USE_STATIC_CUDA_RUNTIME OFF CACHE BOOL "Use the static version of the CUDA runtime library if available" FORCE) + +option(CUDA_ENABLE "Enable or disable CUDA support (NVIDIA backend)" ON) +if(CUDA_ENABLE) + find_package(CUDA 7.5 QUIET) + + if(CUDA_FOUND) + + option(XMR-STAK_LARGEGRID "Support large CUDA block count > 128" ON) + if(XMR-STAK_LARGEGRID) + add_definitions("-DXMR_STAK_LARGEGRID=${XMR-STAK_LARGEGRID}") + endif() + + set(DEVICE_COMPILER "nvcc") + set(CUDA_COMPILER "${DEVICE_COMPILER}" CACHE STRING "Select the device compiler") + + if(${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang") + list(APPEND DEVICE_COMPILER "clang") + endif() + + set_property(CACHE CUDA_COMPILER PROPERTY STRINGS "${DEVICE_COMPILER}") + + set(XMR-STAK_THREADS 0 CACHE STRING "Set maximum number of threads (for compile time optimization)") + if(NOT XMR-STAK_THREADS EQUAL 0) + message(STATUS "xmr-stak-nvidia: set max threads per block to ${XMR-STAK_THREADS}") + add_definitions("-DXMR_STAK_THREADS=${XMR-STAK_THREADS}") + endif() + + set(CUDA_ARCH "20;30;35;37;50;52;60;61;62" CACHE STRING "Set GPU architecture (semicolon separated list, e.g. '-DCUDA_ARCH=20;35;60')") + + # validate architectures (only numbers are allowed) + foreach(CUDA_ARCH_ELEM ${CUDA_ARCH}) + string(REGEX MATCH "^[0-9]+$" IS_NUMBER ${CUDA_ARCH}) + if(NOT IS_NUMBER) + message(FATAL_ERROR "Defined compute architecture '${CUDA_ARCH_ELEM}' in " + "'${CUDA_ARCH}' is not an integral number, use e.g. '30' (for compute architecture 3.0).") + endif() + unset(IS_NUMBER) + + if(${CUDA_ARCH_ELEM} LESS 20) + message(FATAL_ERROR "Unsupported CUDA architecture '${CUDA_ARCH_ELEM}' specified. 
" + "Use '20' (for compute architecture 2.0) or higher.") + endif() + endforeach() + + option(CUDA_SHOW_REGISTER "Show registers used for each kernel and compute architecture" OFF) + option(CUDA_KEEP_FILES "Keep all intermediate files that are generated during internal compilation steps" OFF) + + if("${CUDA_COMPILER}" STREQUAL "clang") + set(CLANG_BUILD_FLAGS "-O3 -x cuda --cuda-path=${CUDA_TOOLKIT_ROOT_DIR}") + # activation usage of FMA + set(CLANG_BUILD_FLAGS "${CLANG_BUILD_FLAGS} -ffp-contract=fast") + + if(CUDA_SHOW_REGISTER) + set(CLANG_BUILD_FLAGS "${CLANG_BUILD_FLAGS} -Xcuda-ptxas -v") + endif(CUDA_SHOW_REGISTER) + + if(CUDA_KEEP_FILES) + set(CLANG_BUILD_FLAGS "${CLANG_BUILD_FLAGS} -save-temps=${PROJECT_BINARY_DIR}") + endif(CUDA_KEEP_FILES) + + foreach(CUDA_ARCH_ELEM ${CUDA_ARCH}) + # set flags to create device code for the given architectures + set(CLANG_BUILD_FLAGS "${CLANG_BUILD_FLAGS} --cuda-gpu-arch=sm_${CUDA_ARCH_ELEM}") + endforeach() + + elseif("${CUDA_COMPILER}" STREQUAL "nvcc") + # add c++11 for cuda + if(NOT "${CMAKE_CXX_FLAGS}" MATCHES "-std=c\\+\\+11") + set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -std=c++11") + endif() + + # avoid that nvcc in CUDA < 8 tries to use libc `memcpy` within the kernel + if(CUDA_VERSION VERSION_LESS 8.0) + add_definitions(-D_FORCE_INLINES) + endif() + foreach(CUDA_ARCH_ELEM ${CUDA_ARCH}) + # set flags to create device code for the given architecture + set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} + "--generate-code arch=compute_${CUDA_ARCH_ELEM},code=sm_${CUDA_ARCH_ELEM} --generate-code arch=compute_${CUDA_ARCH_ELEM},code=compute_${CUDA_ARCH_ELEM}") + endforeach() + + # give each thread an independent default stream + set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --default-stream per-thread") + + option(CUDA_SHOW_CODELINES "Show kernel lines in cuda-gdb and cuda-memcheck" OFF) + + if(CUDA_SHOW_CODELINES) + set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}" --source-in-ptx -lineinfo) + set(CUDA_KEEP_FILES ON CACHE BOOL "activate keep files" 
FORCE) + endif(CUDA_SHOW_CODELINES) + + if(CUDA_SHOW_REGISTER) + set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}" -Xptxas=-v) + endif(CUDA_SHOW_REGISTER) + + if(CUDA_KEEP_FILES) + set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}" --keep --keep-dir "${PROJECT_BINARY_DIR}") + endif(CUDA_KEEP_FILES) + + else() + message(FATAL_ERROR "selected CUDA compiler '${CUDA_COMPILER}' is not supported") + endif() + else() + message(FATAL_ERROR "CUDA NOT found: use `-DCUDA_ENABLE=OFF` to build without NVIDIA GPU support") + endif() +else() + add_definitions("-DCONF_NO_CUDA") +endif() -find_package(Threads REQUIRED) -set(LIBS ${LIBS} ${CMAKE_THREAD_LIBS_INIT}) +# help to find AMD app SDK on systems with a software module system +list(APPEND CMAKE_PREFIX_PATH "$ENV{AMDAPPSDKROOT}") +# allow user to extent CMAKE_PREFIX_PATH via environment variable +list(APPEND CMAKE_PREFIX_PATH "$ENV{CMAKE_PREFIX_PATH}") +############################################################################### +# Find OpenCL +############################################################################### + +option(OpenCL_ENABLE "Enable or disable OpenCL spport (AMD GPU support)" ON) +if(OpenCL_ENABLE) + find_package(OpenCL QUIET) + if(OpenCL_FOUND) + include_directories(SYSTEM ${OpenCL_INCLUDE_DIRS}) + #set(LIBS ${LIBS} ${OpenCL_LIBRARY}) + link_directories(${OpenCL_LIBRARY}) + else() + message(FATAL_ERROR "OpenCL NOT found: use `-DOpenCL_ENABLE=OFF` to build without OpenCL support for AMD gpu's") + endif() +else() + add_definitions("-DCONF_NO_OPENCL") +endif() + +############################################################################### +# CPU backend +############################################################################### + +option(CPU_ENABLE "Enable or disable CPU support" ON) +if(NOT CPU_ENABLE) + add_definitions("-DCONF_NO_CPU") +endif() + +################################################################################ +# Find PThreads 
+################################################################################ + +if(NOT WIN32) + find_package(Threads REQUIRED) + set(LIBS ${LIBS} ${CMAKE_THREAD_LIBS_INIT}) +endif() ################################################################################ # Find microhttpd ################################################################################ @@ -111,7 +260,7 @@ if(HWLOC_ENABLE) /usr/local /usr ENV "PROGRAMFILES(X86)" - ENV "HWLOC_ROOT" + ENV "MICROHTTPD_ROOT" PATH_SUFFIXES include) @@ -146,10 +295,18 @@ endif() # Compile & Link ################################################################################ -# activate sse2 and aes-ni -if(NOT CMAKE_CXX_COMPILER_ID MATCHES "MSVC") +include_directories(BEFORE .) + +if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC") + # remove warnings that f_open() is not save and f_open_s should be used + add_definitions(-D_CRT_SECURE_NO_DEPRECATE) + # disable min define to allow usage of std::min + add_definitions(-DNOMINMAX) +else() + # activate sse2 and aes-ni set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse2 -maes") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -msse2 -maes") + endif() # activate static libgcc and libstdc++ linking @@ -160,8 +317,8 @@ if(CMAKE_LINK_STATIC) set(LIBS "-static-libgcc -static-libstdc++ ${LIBS}") endif() -file(GLOB SRCFILES_CPP "*.cpp" "crypto/*.cpp") -file(GLOB SRCFILES_C "crypto/*.c") +# compile C files +file(GLOB SRCFILES_C "xmrstak/backend/cpu/crypto/*.c") add_library(xmr-stak-c STATIC @@ -170,29 +327,108 @@ add_library(xmr-stak-c set_property(TARGET xmr-stak-c PROPERTY C_STANDARD 99) target_link_libraries(xmr-stak-c ${LIBS}) -add_executable(xmr-stak-cpu +# compile generic backend files +file(GLOB BACKEND_CPP + "xmrstak/*.cpp" + "xmrstak/backend/cpu/*.cpp" + "xmrstak/backend/*.cpp" + "xmrstak/backend/cpu/crypto/*.cpp" + "xmrstak/http/*.cpp" + "xmrstak/misc/*.cpp" + "xmrstak/net/*.cpp") + +add_library(xmr-stak-backend + STATIC + ${BACKEND_CPP} +) +target_link_libraries(xmr-stak-backend xmr-stak-c 
${CMAKE_DL_LIBS}) + +# compile CUDA backend +if(CUDA_FOUND) + file(GLOB CUDASRCFILES + "xmrstak/backend/nvidia/nvcc_code/*.cu" + "xmrstak/backend/nvidia/*.cpp") + + if("${CUDA_COMPILER}" STREQUAL "clang") + # build device code with clang + add_library( + xmrstak_cuda_backend + SHARED + ${CUDASRCFILES} + ) + set_target_properties(xmrstak_cuda_backend PROPERTIES COMPILE_FLAGS ${CLANG_BUILD_FLAGS}) + set_target_properties(xmrstak_cuda_backend PROPERTIES LINKER_LANGUAGE CXX) + set_source_files_properties(${CUDASRCFILES} PROPERTIES LANGUAGE CXX) + else() + # build device code with nvcc + cuda_add_library( + xmrstak_cuda_backend + SHARED + ${CUDASRCFILES} + ) + endif() + target_link_libraries(xmrstak_cuda_backend ${CUDA_LIBRARIES}) + if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC") + target_link_libraries(xmrstak_cuda_backend xmr-stak-backend) + endif() +endif() + +# compile AMD backend +if(OpenCL_FOUND) + file(GLOB OPENCLSRCFILES + "xmrstak/backend/amd/amd_gpu/*.cpp" + "xmrstak/backend/amd/*.cpp") + add_library(xmrstak_opencl_backend + SHARED + ${OPENCLSRCFILES} + ) + target_link_libraries(xmrstak_opencl_backend ${OpenCL_LIBRARY} ) + if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC") + target_link_libraries(xmrstak_opencl_backend xmr-stak-backend) + endif() +endif() + +# compile final binary +file(GLOB SRCFILES_CPP "xmrstak/cli/*.cpp") +set_source_files_properties(${SRCFILES_CPP} PROPERTIES LANGUAGE CXX) + +add_executable(xmr-stak ${SRCFILES_CPP} ) set(EXECUTABLE_OUTPUT_PATH "bin") -target_link_libraries(xmr-stak-cpu ${LIBS} xmr-stak-c) +set(LIBRARY_OUTPUT_PATH "bin") + +target_link_libraries(xmr-stak ${LIBS} xmr-stak-c xmr-stak-backend) ################################################################################ # Install ################################################################################ + # do not install the binary if the project and install are equal if( NOT "${CMAKE_INSTALL_PREFIX}" STREQUAL "${PROJECT_BINARY_DIR}" ) - install(TARGETS xmr-stak-cpu + install(TARGETS 
xmr-stak RUNTIME DESTINATION "${CMAKE_INSTALL_PREFIX}/bin") + if(CUDA_FOUND) + if(WIN32) + install(TARGETS xmrstak_cuda_backend + RUNTIME DESTINATION "${CMAKE_INSTALL_PREFIX}/bin") + else() + install(TARGETS xmrstak_cuda_backend + LIBRARY DESTINATION "${CMAKE_INSTALL_PREFIX}/bin") + endif() + endif() + if(OpenCL_FOUND) + if(WIN32) + install(TARGETS xmrstak_opencl_backend + RUNTIME DESTINATION "${CMAKE_INSTALL_PREFIX}/bin") + else() + install(TARGETS xmrstak_opencl_backend + LIBRARY DESTINATION "${CMAKE_INSTALL_PREFIX}/bin") + endif() + endif() +else() + # this rule is used if the install prefix is the build directory + install(CODE "MESSAGE(\"xmr-stak installed to folder 'bin'\")") endif() - - -# avoid overwrite of user defined settings -# install `config.txt`if file not exists in `${CMAKE_INSTALL_PREFIX}/bin` -install(CODE " \ - if(NOT EXISTS ${CMAKE_INSTALL_PREFIX}/bin/config.txt)\n \ - file(INSTALL ${CMAKE_CURRENT_SOURCE_DIR}/config.txt \ - DESTINATION ${CMAKE_INSTALL_PREFIX}/bin)\n \ - endif()" -) diff --git a/INSTALL.md b/INSTALL.md new file mode 100644 index 0000000..d127bb6 --- /dev/null +++ b/INSTALL.md @@ -0,0 +1,201 @@ +# XMR-Stak-CPU - Monero mining software + +XMR-Stak is a universal Stratum pool miner. 
This is the CPU-mining version; there is also an [AMD GPU version](https://github.com/fireice-uk/xmr-stak-amd) and an [NVIDA GPU version](https://github.com/fireice-uk/xmr-stak-nvidia) + +## HTML reports +<img src="https://gist.githubusercontent.com/fireice-uk/2da301131ac01695ff79539a27b81d68/raw/4c09cdeee86f94df2e9dd86b927e64aded6184f5/xmr-stak-cpu-hashrate.png" width="260"> <img src="https://gist.githubusercontent.com/fireice-uk/2da301131ac01695ff79539a27b81d68/raw/4c09cdeee86f94df2e9dd86b927e64aded6184f5/xmr-stak-cpu-results.png" width="260"> <img src="https://gist.githubusercontent.com/fireice-uk/2da301131ac01695ff79539a27b81d68/raw/4c09cdeee86f94df2e9dd86b927e64aded6184f5/xmr-stak-cpu-connection.png" width="260"> + +## HTML and JSON API report configuraton + +To configure the reports shown above you need to edit the httpd_port variable. Then enable wifi on your phone and navigate to [miner ip address]:[httpd_port] in your phone browser. If you want to use the data in scripts, you can get the JSON version of the data at url [miner ip address]:[httpd_port]/api.json + +## Usage on Windows +1) Edit the config.txt file to enter your pool login and password. +2) Double click the exe file. + +XMR-Stak should compile on any C++11 compliant compiler. 
+``` +-----BEGIN PGP SIGNED MESSAGE----- +Hash: SHA256 + +sha1sum +d34a0ba0dd7b3b1f900a7e02772e197e974b4a73 libeay32.dll +2ee9966a0fc163da58408d91be36b84fa287c10b ssleay32.dll +e4d8a974e58985214de163df0c1ed0f54250d7ee xmr-stak-cpu.exe +ae0153ff98df82022b2c392d6a17c5f3614f6a50 xmr-stak-cpu-notls.exe + +sha3sum +05003137a87313c81d6c348c9b96411c95d48dc22c35f36c39129747 libeay32.dll +133c065d9ef2c93396382e2ba5d8c3ca8c6a57c6beb0159cb9a4b6c5 ssleay32.dll +7bfc30b06524dc9139a3157e2661d2a6f5720738dde8e490f05cc8e2 xmr-stak-cpu.exe +005fb81fc3711a97b2ce65bad0ca97318d878dc793a8cba99c7d1f6f xmr-stak-cpu-notls.exe + +date +Wed 19 Jul 21:18:58 BST 2017 +-----BEGIN PGP SIGNATURE----- +Version: GnuPG v2 + +iQEcBAEBCAAGBQJZb77XAAoJEPsk95p+1Bw0GU4H/26sBwJzYSeWoLwo0LdmOPk3 +19n+svFYnz6NlxAjs+fvuTK992ilLMy2pa4PHKhot2oyZIgt2rRaFsvRADcHVraG +nsIh4Oq31T9epZI0WxIH5FJlDx30fdGkpMTu9xt6ta2JXsmkDiCoZxmETuljB7Rw +xvnKeHiuTccp73C6Nd7dkuiemsOw0FZA7XXS/Kmwqm7n8BtCztY70R6SVN7QFbCz +C49s0A9cT4UbAUPuu8KvxFozmJHA/wDBYHgkq95Y6n/q116+Sc9BpdF8j+qK4YzZ +uM+B10XY0g7Qv376UoJRYKokpVaBxF08nD+JXLdL+zfQvnEfKgrhTnjaTkWFfEY= +=jpgE +-----END PGP SIGNATURE----- +``` +## Compile guides + +- [Free BSD](FREEBSDCOMPILE.md) +- [Linux](LINUXCOMPILE.md) +- [Windows](WINCOMPILE.md) + + +#### CPU mining performance + +Performance is nearly identical to the closed source paid miners. Here are some numbers: + +* **I7-2600K** - 266 H/s +* **I7-6700** - 276 H/s (with a separate GPU miner) +* **Dual X5650** - 466 H/s (depends on NUMA) +* **Dual E5640** - 365 H/s (same as above) + +## Default dev donation +By default the miner will donate 2% of the hashpower (2 minute in 100 minutes) to my pool. If you want to change that, edit **xmrstak/donate-level.hpp** before you build the binaries. 
+ +If you want to donate directly to support further development, here is my wallet + +fireice-uk: +``` +4581HhZkQHgZrZjKeCfCJxZff9E3xCgHGF25zABZz7oR71TnbbgiS7sK9jveE6Dx6uMs2LwszDuvQJgRZQotdpHt1fTdDhk +``` + +psychocrypt: +``` +43NoJVEXo21hGZ6tDG6Z3g4qimiGdJPE6GRxAmiWwm26gwr62Lqo7zRiCJFSBmbkwTGNuuES9ES5TgaVHceuYc4Y75txCTU +``` + +## Common Issues + +**SeLockMemoryPrivilege failed** + +Please see [config.txt](config.txt) under section **LARGE PAGE SUPPORT** + +For Windows 7 pro, or Windows 8 and above see [this article](https://msdn.microsoft.com/en-gb/library/ms190730.aspx) (make sure to reboot afterwards!). + +For Windows 7 Home : + +1) Download and install [Windows Server 2003 Resource Kit Tools](https://www.microsoft.com/en-us/download/details.aspx?id=17657). Ignore incompatiablity warning during installation. + +2) In cmd or power shell: `ntrights -u %USERNAME% +r SeLockMemoryPrivilege` (where %USERNAME% is the user that will be running the program. This command needs to be run as admin) + +3) Reboot. + +Reference: http://rybkaforum.net/cgi-bin/rybkaforum/topic_show.pl?pid=259791#pid259791 + +*Warning: do not download ntrights.exe from any other site other then the offical Microsoft download page.* + +**VirtualAlloc failed** + +If you set up the user rights properly (see above), and your system has 4-8GB of RAM (50%+ use), there is a significant chance that there simply won't be a large enough chunk of contiguous memory because Windows is fairly bad at mitigating memory fragmentation. + +If that happens, disable all auto-staring applications and run the miner after a reboot. + +**msvcp140.dll and vcruntime140.dll not available errors** + +Download and install this [runtime package](https://go.microsoft.com/fwlink/?LinkId=746572) from Microsoft. *Warning: Do NOT use "missing dll" sites - dll's are exe files with another name, and it is a fairly safe bet that any dll on a shady site like that will be trojaned. 
Please download offical runtimes from Microsoft above.* + + +**Error: MEMORY ALLOC FAILED: mmap failed** + +From [config.txt](config.txt): + +On Linux you will need to configure large page support `sudo sysctl -w vm.nr_hugepages=128` and increase your +ulimit -l. To do this you need to add following lines to /etc/security/limits.conf: + + * soft memlock 262144 + * hard memlock 262144 + +Save file. You WILL need to log out and log back in for these settings to take affect on your user (no need to reboot, just relogin in your session). + +You can also do it Windows-style and simply run-as-root, but this is NOT recommended for security reasons. + +**Illegal instruction (core dumped)** + +This typically means you are trying to run it on a CPU that does not have [AES](https://en.wikipedia.org/wiki/AES_instruction_set). This only happens on older version of miner, new version gives better error message (but still wont' work since your CPU doesn't support the required instructions). + + +## Advanced Compile Options + +The build system is CMake, if you are not familiar with CMake you can learn more [here](https://cmake.org/runningcmake/). + +### Short Description + +There are two easy ways to set variables for `cmake` to configure *xmr-stak-cpu* +- use the ncurses GUI + - `ccmake .` + - edit your options + - end the GUI by pressing the key `c`(create) and than `g`(generate) +- set Options on the command line + - enable a option: `cmake . -DNAME_OF_THE_OPTION=ON` + - disable a option `cmake . -DNAME_OF_THE_OPTION=OFF` + - set a value `cmake . -DNAME_OF_THE_OPTION=value` + +After the configuration you need to call +`make install` for slow sequential build +or +`make -j install` for faster parallel build +and install. + +### xmr-stak-cpu Compile Options +- `CMAKE_INSTALL_PREFIX` install miner to the home folder + - `cmake . 
-DCMAKE_INSTALL_PREFIX=$HOME/xmr-stak-cpu` + - you can find the binary and the `config.txt` file after `make install` in `$HOME/xmr-stak-cpu/bin` +- `CMAKE_LINK_STATIC` link libgcc and libstdc++ libraries static (default OFF) + - disable with `cmake . -DCMAKE_LINK_STATIC=ON` +-`CMAKE_BUILD_TYPE` set the build type + - valid options: `Release` or `Debug` + - you should always keep `Release` for your productive miners +- `MICROHTTPD_ENABLE` allow to disable/enable the dependency *microhttpd* + - by default enabled + - there is no *http* interface available if option is disabled: `cmake . -DMICROHTTPD_ENABLE=OFF` +- `OpenSSL_ENABLE` allow to disable/enable the dependency *OpenSSL* + - by default enabled + - it is not possible to connect to a *https* secured pool if option is disabled: `cmake . -DOpenSSL_ENABLE=OFF` +- `HWLOC_ENABLE` allow to disable/enable the dependency *hwloc* + - by default enabled + - the config suggestion is not optimal if option is disabled: `cmake . -DHWLOC_ENABLE=OFF` + +## PGP Key +``` +-----BEGIN PGP PUBLIC KEY BLOCK----- +Version: GnuPG v2 + +mQENBFhYUmUBCAC6493W5y1MMs38ApRbI11jWUqNdFm686XLkZWGDfYImzL6pEYk +RdWkyt9ziCyA6NUeWFQYniv/z10RxYKq8ulVVJaKb9qPGMU0ESfdxlFNJkU/pf28 +sEVBagGvGw8uFxjQONnBJ7y7iNRWMN7qSRS636wN5ryTHNsmqI4ClXPHkXkDCDUX +QvhXZpG9RRM6jsE3jBGz/LJi3FyZLo/vB60OZBODJ2IA0wSR41RRiOq01OqDueva +9jPoAokNglJfn/CniQ+lqUEXj1vjAZ1D5Mn9fISzA/UPen5Z7Sipaa9aAtsDBOfP +K9iPKOsWa2uTafoyXgiwEVXCCeMMUjCGaoFBABEBAAG0ImZpcmVpY2VfdWsgPGZp +cmVpY2UueG1yQGdtYWlsLmNvbT6JATcEEwEIACEFAlhYUmUCGwMFCwkIBwIGFQgJ +CgsCBBYCAwECHgECF4AACgkQ+yT3mn7UHDTEcQf8CMhqaZ0IOBxeBnsq5HZr2X6z +E5bODp5cPs6ha1tjH3CWpk1AFeykNtXH7kPW9hcDt/e4UQtcHs+lu6YU59X7xLJQ +udOkpWdmooJMXRWS/zeeon4ivT9d69jNnwubh8EJOyw8xm/se6n48BcewfHekW/6 +mVrbhLbF1dnuUGXzRN1WxsUZx3uJd2UvrkJhAtHtX92/qIVhT0+3PXV0bmpHURlK +YKhhm8dPLV9jPX8QVRHQXCOHSMqy/KoWEe6CnT0Isbkq3JtS3K4VBVeTX9gkySRc +IFxrNJdXsI9BxKv4O8yajP8DohpoGLMDKZKSO0yq0BRMgMh0cw6Lk22uyulGALkB 
+DQRYWFJlAQgAqikfViOmIccCZKVMZfNHjnigKtQqNrbJpYZCOImql4FqbZu9F7TD +9HIXA43SPcwziWlyazSy8Pa9nCpc6PuPPO1wxAaNIc5nt+w/x2EGGTIFGjRoubmP +3i5jZzOFYsvR2W3PgVa3/ujeYYJYo1oeVeuGmmJRejs0rp1mbvBSKw1Cq6C4cI0x +GTY1yXFGLIgdfYNMmiLsTy1Qwq8YStbFKeUYAMMG3128SAIaT3Eet911f5Jx4tC8 +6kWUr6PX1rQ0LQJqyIsLq9U53XybUksRfJC9IEfgvgBxRBHSD8WfqEhHjhW1VsZG +dcYgr7A1PIneWsCEY+5VUnqTlt2HPaKweQARAQABiQEfBBgBCAAJBQJYWFJlAhsM +AAoJEPsk95p+1Bw0Pr8H/0vZ6U2zaih03jOHOvsrYxRfDXSmgudOp1VS45aHIREd +2nrJ+drleeFVyb14UQqO/6iX9GuDX2yBEHdCg2aljeP98AaMU//RiEtebE6CUWsL +HPVXHIkxwBCBe0YkJINHUQqLz/5f6qLsNUp1uTH2++zhdBWvg+gErTYbx8aFMFYH +0GoOtqE5rtlAh5MTvDZm+UcDwKJCxhrLaN3R3dDoyrDNRTgHQQuX5/opJBiUnVNK +d+vugnxzpMIJQP11yCZkz/KxV8zQ2QPMuZdAoh3znd/vGCJcp0rWphn4pqxA4vDp +c4hC0Yg9Dha1OoE5CJCqVL+ic4vAyB1urAwBlsd/wH8= +=B5I+ +-----END PGP PUBLIC KEY BLOCK----- +``` @@ -1,69 +1,34 @@ -# XMR-Stak-CPU - Monero mining software +# XMR-Stak - Monero All-in-One Mining Software -XMR-Stak is a universal Stratum pool miner. This is the CPU-mining version; there is also an [AMD GPU version](https://github.com/fireice-uk/xmr-stak-amd) and an [NVIDA GPU version](https://github.com/fireice-uk/xmr-stak-nvidia) +XMR-Stak is a universal Stratum pool miner. This miner supports CPUs, AMD and NVIDIA gpus. 
## HTML reports <img src="https://gist.githubusercontent.com/fireice-uk/2da301131ac01695ff79539a27b81d68/raw/4c09cdeee86f94df2e9dd86b927e64aded6184f5/xmr-stak-cpu-hashrate.png" width="260"> <img src="https://gist.githubusercontent.com/fireice-uk/2da301131ac01695ff79539a27b81d68/raw/4c09cdeee86f94df2e9dd86b927e64aded6184f5/xmr-stak-cpu-results.png" width="260"> <img src="https://gist.githubusercontent.com/fireice-uk/2da301131ac01695ff79539a27b81d68/raw/4c09cdeee86f94df2e9dd86b927e64aded6184f5/xmr-stak-cpu-connection.png" width="260"> -## HTML and JSON API report configuraton +## Content Overview +* [Features](#features) +* [Download](#download) +* [Usage](doc/usage.md) +* [HowTo Compile](doc/compile.md) +* [FAQ](doc/FAQ.md) +* [Developer Donation](#default-developer-donation) +* [PGP Key](#pgp-key) -To configure the reports shown above you need to edit the httpd_port variable. Then enable wifi on your phone and navigate to [miner ip address]:[httpd_port] in your phone browser. If you want to use the data in scripts, you can get the JSON version of the data at url [miner ip address]:[httpd_port]/api.json +## Features -## Usage on Windows -1) Edit the config.txt file to enter your pool login and password. -2) Double click the exe file. +- support all common backends (CPU/x86, AMD-GPU and NVIDIA-GPU) +- support all common OS (Linux, Windows and MacOS) +- easy to use + - guided start (no need to edit a config file for the first start) + - auto configuration for each backend +- open source software (GPLv3) +- TLS support +- HTML statistics +- JSON API for monitoring -XMR-Stak should compile on any C++11 compliant compiler. 
-``` ------BEGIN PGP SIGNED MESSAGE----- -Hash: SHA256 - -sha1sum -d34a0ba0dd7b3b1f900a7e02772e197e974b4a73 libeay32.dll -2ee9966a0fc163da58408d91be36b84fa287c10b ssleay32.dll -e4d8a974e58985214de163df0c1ed0f54250d7ee xmr-stak-cpu.exe -ae0153ff98df82022b2c392d6a17c5f3614f6a50 xmr-stak-cpu-notls.exe - -sha3sum -05003137a87313c81d6c348c9b96411c95d48dc22c35f36c39129747 libeay32.dll -133c065d9ef2c93396382e2ba5d8c3ca8c6a57c6beb0159cb9a4b6c5 ssleay32.dll -7bfc30b06524dc9139a3157e2661d2a6f5720738dde8e490f05cc8e2 xmr-stak-cpu.exe -005fb81fc3711a97b2ce65bad0ca97318d878dc793a8cba99c7d1f6f xmr-stak-cpu-notls.exe - -date -Wed 19 Jul 21:18:58 BST 2017 ------BEGIN PGP SIGNATURE----- -Version: GnuPG v2 - -iQEcBAEBCAAGBQJZb77XAAoJEPsk95p+1Bw0GU4H/26sBwJzYSeWoLwo0LdmOPk3 -19n+svFYnz6NlxAjs+fvuTK992ilLMy2pa4PHKhot2oyZIgt2rRaFsvRADcHVraG -nsIh4Oq31T9epZI0WxIH5FJlDx30fdGkpMTu9xt6ta2JXsmkDiCoZxmETuljB7Rw -xvnKeHiuTccp73C6Nd7dkuiemsOw0FZA7XXS/Kmwqm7n8BtCztY70R6SVN7QFbCz -C49s0A9cT4UbAUPuu8KvxFozmJHA/wDBYHgkq95Y6n/q116+Sc9BpdF8j+qK4YzZ -uM+B10XY0g7Qv376UoJRYKokpVaBxF08nD+JXLdL+zfQvnEfKgrhTnjaTkWFfEY= -=jpgE ------END PGP SIGNATURE----- -``` -## Compile guides - -- [Free BSD](FREEBSDCOMPILE.md) -- [Linux](LINUXCOMPILE.md) -- [Windows](WINCOMPILE.md) - - -#### CPU mining performance - -Performance is nearly identical to the closed source paid miners. Here are some numbers: +## Download -* **I7-2600K** - 266 H/s -* **I7-6700** - 276 H/s (with a separate GPU miner) -* **Dual X5650** - 466 H/s (depends on NUMA) -* **Dual E5640** - 365 H/s (same as above) - -## Default dev donation -By default the miner will donate 2% of the hashpower (2 minute in 100 minutes) to my pool. If you want to change that, edit **donate-level.h** before you build the binaries. - -If you want to donate directly to support further development, here is my wallet +You can find the latest releases and precompiled binaries on GitHub under [Releases](https://github.com/xmr-stak/xmr-stak/releases). 
fireice-uk: ``` @@ -75,97 +40,6 @@ psychocrypt: 43NoJVEXo21hGZ6tDG6Z3g4qimiGdJPE6GRxAmiWwm26gwr62Lqo7zRiCJFSBmbkwTGNuuES9ES5TgaVHceuYc4Y75txCTU ``` -## Common Issues - -**SeLockMemoryPrivilege failed** - -Please see [config.txt](config.txt) under section **LARGE PAGE SUPPORT** - -For Windows 7 pro, or Windows 8 and above see [this article](https://msdn.microsoft.com/en-gb/library/ms190730.aspx) (make sure to reboot afterwards!). - -For Windows 7 Home : - -1) Download and install [Windows Server 2003 Resource Kit Tools](https://www.microsoft.com/en-us/download/details.aspx?id=17657). Ignore incompatiablity warning during installation. - -2) In cmd or power shell: `ntrights -u %USERNAME% +r SeLockMemoryPrivilege` (where %USERNAME% is the user that will be running the program. This command needs to be run as admin) - -3) Reboot. - -Reference: http://rybkaforum.net/cgi-bin/rybkaforum/topic_show.pl?pid=259791#pid259791 - -*Warning: do not download ntrights.exe from any other site other then the offical Microsoft download page.* - -**VirtualAlloc failed** - -If you set up the user rights properly (see above), and your system has 4-8GB of RAM (50%+ use), there is a significant chance that there simply won't be a large enough chunk of contiguous memory because Windows is fairly bad at mitigating memory fragmentation. - -If that happens, disable all auto-staring applications and run the miner after a reboot. - -**msvcp140.dll and vcruntime140.dll not available errors** - -Download and install this [runtime package](https://go.microsoft.com/fwlink/?LinkId=746572) from Microsoft. *Warning: Do NOT use "missing dll" sites - dll's are exe files with another name, and it is a fairly safe bet that any dll on a shady site like that will be trojaned. 
Please download offical runtimes from Microsoft above.* - - -**Error: MEMORY ALLOC FAILED: mmap failed** - -From [config.txt](config.txt): - -On Linux you will need to configure large page support `sudo sysctl -w vm.nr_hugepages=128` and increase your -ulimit -l. To do this you need to add following lines to /etc/security/limits.conf: - - * soft memlock 262144 - * hard memlock 262144 - -Save file. You WILL need to log out and log back in for these settings to take affect on your user (no need to reboot, just relogin in your session). - -You can also do it Windows-style and simply run-as-root, but this is NOT recommended for security reasons. - -**Illegal instruction (core dumped)** - -This typically means you are trying to run it on a CPU that does not have [AES](https://en.wikipedia.org/wiki/AES_instruction_set). This only happens on older version of miner, new version gives better error message (but still wont' work since your CPU doesn't support the required instructions). - - -## Advanced Compile Options - -The build system is CMake, if you are not familiar with CMake you can learn more [here](https://cmake.org/runningcmake/). - -### Short Description - -There are two easy ways to set variables for `cmake` to configure *xmr-stak-cpu* -- use the ncurses GUI - - `ccmake .` - - edit your options - - end the GUI by pressing the key `c`(create) and than `g`(generate) -- set Options on the command line - - enable a option: `cmake . -DNAME_OF_THE_OPTION=ON` - - disable a option `cmake . -DNAME_OF_THE_OPTION=OFF` - - set a value `cmake . -DNAME_OF_THE_OPTION=value` - -After the configuration you need to call -`make install` for slow sequential build -or -`make -j install` for faster parallel build -and install. - -### xmr-stak-cpu Compile Options -- `CMAKE_INSTALL_PREFIX` install miner to the home folder - - `cmake . 
-DCMAKE_INSTALL_PREFIX=$HOME/xmr-stak-cpu` - - you can find the binary and the `config.txt` file after `make install` in `$HOME/xmr-stak-cpu/bin` -- `CMAKE_LINK_STATIC` link libgcc and libstdc++ libraries static (default OFF) - - disable with `cmake . -DCMAKE_LINK_STATIC=ON` --`CMAKE_BUILD_TYPE` set the build type - - valid options: `Release` or `Debug` - - you should always keep `Release` for your productive miners -- `MICROHTTPD_ENABLE` allow to disable/enable the dependency *microhttpd* - - by default enabled - - there is no *http* interface available if option is disabled: `cmake . -DMICROHTTPD_ENABLE=OFF` -- `OpenSSL_ENABLE` allow to disable/enable the dependency *OpenSSL* - - by default enabled - - it is not possible to connect to a *https* secured pool if option is disabled: `cmake . -DOpenSSL_ENABLE=OFF` -- `HWLOC_ENABLE` allow to disable/enable the dependency *hwloc* - - by default enabled - - the config suggestion is not optimal if option is disabled: `cmake . -DHWLOC_ENABLE=OFF` - ## PGP Key ``` -----BEGIN PGP PUBLIC KEY BLOCK----- diff --git a/autoAdjust.hpp b/autoAdjust.hpp deleted file mode 100644 index 93a88e8..0000000 --- a/autoAdjust.hpp +++ /dev/null @@ -1,154 +0,0 @@ -#pragma once -#include "jconf.h" -#include "console.h" - -#ifdef _WIN32 -#include <windows.h> -#else -#include <unistd.h> -#endif // _WIN32 - -// Mask bits between h and l and return the value -// This enables us to put in values exactly like in the manual -// For example EBX[31:22] is get_masked(cpu_info[1], 31, 22) -inline int32_t get_masked(int32_t val, int32_t h, int32_t l) -{ - val &= (0x7FFFFFFF >> (31-(h-l))) << l; - return val >> l; -} - -class autoAdjust -{ -public: - - autoAdjust() - { - } - - void printConfig() - { - printer::inst()->print_str("The configuration for 'cpu_threads_conf' in your config file is 'null'.\n"); - printer::inst()->print_str("The miner evaluates your system and prints a suggestion for the section `cpu_threads_conf` to the terminal.\n"); - 
printer::inst()->print_str("The values are not optimal, please try to tweak the values based on notes in config.txt.\n"); - printer::inst()->print_str("Please copy & paste the block within the asterisks to your config.\n\n"); - - if(!detectL3Size() || L3KB_size < 1024 || L3KB_size > 102400) - { - if(L3KB_size < 1024 || L3KB_size > 102400) - printer::inst()->print_msg(L0, "Autoconf failed: L3 size sanity check failed - %u KB.", L3KB_size); - - printer::inst()->print_msg(L0, "Autoconf failed: Printing config for a single thread. Please try to add new ones until the hashrate slows down."); - printer::inst()->print_str("\n**************** Copy&Paste BEGIN ****************\n\n"); - printer::inst()->print_str("\"cpu_threads_conf\" :\n[\n"); - printer::inst()->print_str(" { \"low_power_mode\" : false, \"no_prefetch\" : true, \"affine_to_cpu\" : false },\n"); - printer::inst()->print_str("],\n\n**************** Copy&Paste END ****************\n"); - return; - } - - printer::inst()->print_msg(L0, "Autoconf L3 size detected at %u KB.", L3KB_size); - - detectCPUConf(); - - printer::inst()->print_msg(L0, "Autoconf core count detected as %u on %s.", corecnt, - linux_layout ? "Linux" : "Windows"); - - printer::inst()->print_str("\n**************** Copy&Paste BEGIN ****************\n\n"); - printer::inst()->print_str("\"cpu_threads_conf\" :\n[\n"); - - uint32_t aff_id = 0; - char strbuf[256]; - for(uint32_t i=0; i < corecnt; i++) - { - bool double_mode; - - if(L3KB_size <= 0) - break; - - double_mode = L3KB_size / 2048 > (int32_t)(corecnt-i); - - snprintf(strbuf, sizeof(strbuf), " { \"low_power_mode\" : %s, \"no_prefetch\" : true, \"affine_to_cpu\" : %u },\n", - double_mode ? 
"true" : "false", aff_id); - printer::inst()->print_str(strbuf); - - if(!linux_layout || old_amd) - { - aff_id += 2; - - if(aff_id >= corecnt) - aff_id = 1; - } - else - aff_id++; - - if(double_mode) - L3KB_size -= 4096; - else - L3KB_size -= 2048; - } - - printer::inst()->print_str("],\n\n**************** Copy&Paste END ****************\n"); - } - -private: - bool detectL3Size() - { - int32_t cpu_info[4]; - char cpustr[13] = {0}; - - jconf::cpuid(0, 0, cpu_info); - memcpy(cpustr, &cpu_info[1], 4); - memcpy(cpustr+4, &cpu_info[3], 4); - memcpy(cpustr+8, &cpu_info[2], 4); - - if(strcmp(cpustr, "GenuineIntel") == 0) - { - jconf::cpuid(4, 3, cpu_info); - - if(get_masked(cpu_info[0], 7, 5) != 3) - { - printer::inst()->print_msg(L0, "Autoconf failed: Couln't find L3 cache page."); - return false; - } - - L3KB_size = ((get_masked(cpu_info[1], 31, 22) + 1) * (get_masked(cpu_info[1], 21, 12) + 1) * - (get_masked(cpu_info[1], 11, 0) + 1) * (cpu_info[2] + 1)) / 1024; - - return true; - } - else if(strcmp(cpustr, "AuthenticAMD") == 0) - { - jconf::cpuid(0x80000006, 0, cpu_info); - - L3KB_size = get_masked(cpu_info[3], 31, 18) * 512; - - jconf::cpuid(1, 0, cpu_info); - if(get_masked(cpu_info[0], 11, 8) < 0x17) //0x17h is Zen - old_amd = true; - - return true; - } - else - { - printer::inst()->print_msg(L0, "Autoconf failed: Unknown CPU type: %s.", cpustr); - return false; - } - } - - void detectCPUConf() - { -#ifdef _WIN32 - SYSTEM_INFO info; - GetSystemInfo(&info); - corecnt = info.dwNumberOfProcessors; - linux_layout = false; -#else - corecnt = sysconf(_SC_NPROCESSORS_ONLN); - linux_layout = true; -#endif // _WIN32 - } - - int32_t L3KB_size = 0; - uint32_t corecnt; - bool old_amd = false; - bool linux_layout; -}; diff --git a/cli-miner.cpp b/cli-miner.cpp deleted file mode 100644 index 45d2c16..0000000 --- a/cli-miner.cpp +++ /dev/null @@ -1,225 +0,0 @@ - /* - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General 
Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * - * Additional permission under GNU GPL version 3 section 7 - * - * If you modify this Program, or any covered work, by linking or combining - * it with OpenSSL (or a modified version of that library), containing parts - * covered by the terms of OpenSSL License and SSLeay License, the licensors - * of this Program grant you additional permission to convey the resulting work. - * - */ - -#include "executor.h" -#include "minethd.h" -#include "jconf.h" -#include "console.h" -#include "donate-level.h" -#ifndef CONF_NO_HWLOC -# include "autoAdjustHwloc.hpp" -#else -# include "autoAdjust.hpp" -#endif -#include "version.h" - -#ifndef CONF_NO_HTTPD -# include "httpd.h" -#endif - -#include <stdlib.h> -#include <stdio.h> -#include <string.h> - -#include <time.h> - -#ifndef CONF_NO_TLS -#include <openssl/ssl.h> -#include <openssl/err.h> -#endif - -//Do a press any key for the windows folk. 
*insert any key joke here* -#ifdef _WIN32 -void win_exit() -{ - printer::inst()->print_str("Press any key to exit."); - get_key(); - return; -} - -#define strcasecmp _stricmp - -#else -void win_exit() { return; } -#endif // _WIN32 - -void do_benchmark(); - -int main(int argc, char *argv[]) -{ -#ifndef CONF_NO_TLS - SSL_library_init(); - SSL_load_error_strings(); - ERR_load_BIO_strings(); - ERR_load_crypto_strings(); - SSL_load_error_strings(); - OpenSSL_add_all_digests(); -#endif - - srand(time(0)); - - const char* sFilename = "config.txt"; - bool benchmark_mode = false; - - if(argc >= 2) - { - if(strcmp(argv[1], "-h") == 0) - { - printer::inst()->print_msg(L0, "Usage %s [CONFIG FILE]", argv[0]); - win_exit(); - return 0; - } - - if(argc >= 3 && strcasecmp(argv[1], "-c") == 0) - { - sFilename = argv[2]; - } - else if(argc >= 3 && strcasecmp(argv[1], "benchmark_mode") == 0) - { - sFilename = argv[2]; - benchmark_mode = true; - } - else - sFilename = argv[1]; - } - - if(!jconf::inst()->parse_config(sFilename)) - { - win_exit(); - return 0; - } - - if(jconf::inst()->NeedsAutoconf()) - { - autoAdjust adjust; - adjust.printConfig(); - win_exit(); - return 0; - } - - if (!minethd::self_test()) - { - win_exit(); - return 0; - } - - if(benchmark_mode) - { - do_benchmark(); - win_exit(); - return 0; - } - -#ifndef CONF_NO_HTTPD - if(jconf::inst()->GetHttpdPort() != 0) - { - if (!httpd::inst()->start_daemon()) - { - win_exit(); - return 0; - } - } -#endif - - printer::inst()->print_str("-------------------------------------------------------------------\n"); - printer::inst()->print_str( XMR_STAK_NAME" " XMR_STAK_VERSION " mining software, CPU Version.\n"); - printer::inst()->print_str("Based on CPU mining code by wolf9466 (heavily optimized by fireice_uk).\n"); - printer::inst()->print_str("Brought to you by fireice_uk and psychocrypt under GPLv3.\n\n"); - char buffer[64]; - snprintf(buffer, sizeof(buffer), "Configurable dev donation level is set to %.1f %%\n\n", 
fDevDonationLevel * 100.0); - printer::inst()->print_str(buffer); - printer::inst()->print_str("You can use following keys to display reports:\n"); - printer::inst()->print_str("'h' - hashrate\n"); - printer::inst()->print_str("'r' - results\n"); - printer::inst()->print_str("'c' - connection\n"); - printer::inst()->print_str("-------------------------------------------------------------------\n"); - - if(strlen(jconf::inst()->GetOutputFile()) != 0) - printer::inst()->open_logfile(jconf::inst()->GetOutputFile()); - - executor::inst()->ex_start(jconf::inst()->DaemonMode()); - - using namespace std::chrono; - uint64_t lastTime = time_point_cast<milliseconds>(high_resolution_clock::now()).time_since_epoch().count(); - - int key; - while(true) - { - key = get_key(); - - switch(key) - { - case 'h': - executor::inst()->push_event(ex_event(EV_USR_HASHRATE)); - break; - case 'r': - executor::inst()->push_event(ex_event(EV_USR_RESULTS)); - break; - case 'c': - executor::inst()->push_event(ex_event(EV_USR_CONNSTAT)); - break; - default: - break; - } - - uint64_t currentTime = time_point_cast<milliseconds>(high_resolution_clock::now()).time_since_epoch().count(); - - /* Hard guard to make sure we never get called more than twice per second */ - if( currentTime - lastTime < 500) - std::this_thread::sleep_for(std::chrono::milliseconds(500 - (currentTime - lastTime))); - lastTime = currentTime; - } - - return 0; -} - -void do_benchmark() -{ - using namespace std::chrono; - std::vector<minethd*>* pvThreads; - - printer::inst()->print_msg(L0, "Running a 60 second benchmark..."); - - uint8_t work[76] = {0}; - minethd::miner_work oWork = minethd::miner_work("", work, sizeof(work), 0, 0, false, 0); - pvThreads = minethd::thread_starter(oWork); - - uint64_t iStartStamp = time_point_cast<milliseconds>(high_resolution_clock::now()).time_since_epoch().count(); - - std::this_thread::sleep_for(std::chrono::seconds(60)); - - oWork = minethd::miner_work(); - minethd::switch_work(oWork); - - 
double fTotalHps = 0.0; - for (uint32_t i = 0; i < pvThreads->size(); i++) - { - double fHps = pvThreads->at(i)->iHashCount; - fHps /= (pvThreads->at(i)->iTimestamp - iStartStamp) / 1000.0; - - printer::inst()->print_msg(L0, "Thread %u: %.1f H/S", i, fHps); - fTotalHps += fHps; - } - - printer::inst()->print_msg(L0, "Total: %.1f H/S", fTotalHps); -} diff --git a/doc/FAQ.md b/doc/FAQ.md new file mode 100644 index 0000000..995a421 --- /dev/null +++ b/doc/FAQ.md @@ -0,0 +1,61 @@ +# FAQ + +## Content Overview +* [SeLockMemoryPrivilege failed](#selockmemoryprivilege-failed) +* [VirtualAlloc failed](#virtualalloc-failed) +* [Error msvcp140.dll and vcruntime140.dll not available](#error-msvcp140dll-and-vcruntime140dll-not-available) +* [Error: MEMORY ALLOC FAILED: mmap failed](#error-memory-alloc-failed-mmap-failed) +* [Illegal instruction (core dumped)](#illegal-instruction) +* [Virus Protection Alert](#virus-protection-alert) + +## SeLockMemoryPrivilege failed + +Please see [config.txt](config.txt) under section **LARGE PAGE SUPPORT** + +For Windows 7 pro, or Windows 8 and above see [this article](https://msdn.microsoft.com/en-gb/library/ms190730.aspx) (make sure to reboot afterwards!). + +For Windows 7 Home : + +1) Download and install [Windows Server 2003 Resource Kit Tools](https://www.microsoft.com/en-us/download/details.aspx?id=17657). Ignore incompatiablity warning during installation. + +2) In cmd or power shell: `ntrights -u %USERNAME% +r SeLockMemoryPrivilege` (where %USERNAME% is the user that will be running the program. This command needs to be run as admin) + +3) Reboot. 
+ +Reference: http://rybkaforum.net/cgi-bin/rybkaforum/topic_show.pl?pid=259791#pid259791 + +*Warning: do not download ntrights.exe from any site other than the official Microsoft download page.* + +## VirtualAlloc failed + +If you set up the user rights properly (see above), and your system has 4-8GB of RAM (50%+ use), there is a significant chance that there simply won't be a large enough chunk of contiguous memory because Windows is fairly bad at mitigating memory fragmentation. + +If that happens, disable all auto-starting applications and run the miner after a reboot. + +## Error msvcp140.dll and vcruntime140.dll not available + +Download and install this [runtime package](https://go.microsoft.com/fwlink/?LinkId=746572) from Microsoft. *Warning: Do NOT use "missing dll" sites - dll's are exe files with another name, and it is a fairly safe bet that any dll on a shady site like that will be trojaned. Please download official runtimes from Microsoft above.* + + +## Error: MEMORY ALLOC FAILED: mmap failed + +On Linux you will need to configure large page support `sudo sysctl -w vm.nr_hugepages=128` and increase your +ulimit -l. To do this you need to add the following lines to /etc/security/limits.conf: + + * soft memlock 262144 + * hard memlock 262144 + +Save file. You WILL need to log out and log back in for these settings to take effect on your user (no need to reboot, just relogin in your session). + +You can also do it Windows-style and simply run-as-root, but this is NOT recommended for security reasons. + +## Illegal Instruction + +This typically means you are trying to run it on a CPU that does not have [AES](https://en.wikipedia.org/wiki/AES_instruction_set). This only happens on older versions of the miner; newer versions give a better error message (but still won't work since your CPU doesn't support the required instructions). + +## Virus Protection Alert + +Some virus protection software flags the miner binary as *malware*. 
+In this case the binary is moved to the quarantine area of the protection software. +This is a false alert and cannot be avoided by us. +Add the binary to the protection software's white list to solve this issue. diff --git a/doc/compile.md b/doc/compile.md new file mode 100644 index 0000000..04b1fe4 --- /dev/null +++ b/doc/compile.md @@ -0,0 +1,81 @@ +# Compile xmr-stak + +## Content Overview +* [Build System](#build-system) +* [Generic Build Options](#generic-build-options) +* [CPU Build Options](#cpu-build-options) +* [AMD Build Options](#amd-build-options) +* [NVIDIA Build Options](#nvidia-build-options) +* [Compile in Windows](compile_Windows.md) +* [Compile in Linux](compile_Linux.md) +* [Compile in FreeBSD](compile_FreeBSD.md) + +## Build System + +The build system is CMake; if you are not familiar with CMake you can learn more [here](https://cmake.org/runningcmake/). + +By default the miner will be built with all dependencies. Each optional dependency can be disabled (this will reduce the miner features). + +There are two easy ways to set variables for `cmake` to configure *xmr-stak* +- use the ncurses GUI + - `ccmake .` + - edit your options + - end the GUI by pressing the key `c`(configure) and then `g`(generate) +- set Options on the command line + - enable an option: `cmake . -DNAME_OF_THE_OPTION=ON` + - disable an option: `cmake . -DNAME_OF_THE_OPTION=OFF` + - set a value: `cmake . -DNAME_OF_THE_OPTION=value` + +After the configuration you need to compile the miner, follow the guide for your platform: +* [Compile in Windows](compile_Windows.md) +* [Compile in Linux](compile_Linux.md) +* [Compile in FreeBSD](compile_FreeBSD.md) + +## Generic Build Options +- `CMAKE_INSTALL_PREFIX` install miner to the home folder + - `cmake . 
-DCMAKE_INSTALL_PREFIX=$HOME/xmr-stak-cpu` + - you can find the binary and the `config.txt` file after `make install` in `$HOME/xmr-stak-cpu/bin` +- `CMAKE_LINK_STATIC` link libgcc and libstdc++ libraries statically (default OFF) + - enable with `cmake . -DCMAKE_LINK_STATIC=ON` +- `CMAKE_BUILD_TYPE` set the build type + - valid options: `Release` or `Debug` + - you should always keep `Release` for your production miners +- `MICROHTTPD_ENABLE` allow to disable/enable the dependency *microhttpd* + - there is no *http* interface available if option is disabled: `cmake . -DMICROHTTPD_ENABLE=OFF` +- `OpenSSL_ENABLE` allow to disable/enable the dependency *OpenSSL* + - it is not possible to connect to a *https* secured pool if option is disabled: `cmake . -DOpenSSL_ENABLE=OFF` + +## CPU Build Options + +- `CPU_ENABLE` allow to disable/enable the CPU backend of the miner +- `HWLOC_ENABLE` allow to disable/enable the dependency *hwloc* + - the config suggestion is not optimal if option is disabled: `cmake . -DHWLOC_ENABLE=OFF` + - disabling can reduce the miner performance + +## AMD Build Options + +- `OpenCL_ENABLE` allow to disable/enable the AMD backend of the miner + +## NVIDIA Build Options + +- `CUDA_ENABLE` allow to disable/enable the NVIDIA backend of the miner +- `CUDA_ARCH` build for a certain compute architecture + - this option needs a semicolon separated list + - `cmake . -DCUDA_ARCH=61` or `cmake . -DCUDA_ARCH=20;61` + - [list](https://developer.nvidia.com/cuda-gpus) with NVIDIA compute architectures + - by default the miner is created for all currently available compute architectures +- `CUDA_COMPILER` select the compiler for the device code + - valid options: `nvcc` or `clang` if clang 3.9+ is installed +``` + # compile host and device code with clang + export CC=/usr/bin/clang + export CXX=/usr/bin/clang++ + cmake . 
-DCUDA_COMPILER=clang +``` +- `XMR-STAK_LARGEGRID` use `32` or `64` bit integer for on device indices + - default is enabled + - on old GPUs it can increase the hash rate if disabled: `cmake . -DXMR-STAK_LARGEGRID=OFF` + - if disabled it is not allowed to use more than `1000` threads on the device +- `XMR-STAK_THREADS` give the compiler information which value for `threads` is used at runtime + - default is `0` (compile time optimization) + - if the miner is compiled and used at runtime with the some value it can increase the hash rate: `cmake . -DXMR-STAK_THREADS=32` diff --git a/FREEBSDCOMPILE.md b/doc/compile_FreeBSD.md index 230ce73..6dd1165 100644 --- a/FREEBSDCOMPILE.md +++ b/doc/compile_FreeBSD.md @@ -13,8 +13,6 @@ Type 'y' and hit enter to proceed with installing the packages. git clone https://github.com/fireice-uk/xmr-stak-cpu.git cd xmr-stak-cpu cmake . - make + make install -Now you have the binary located at "bin/xmr-stak-cpu". Either move this file to your desired location or run "make install" to install it to your path. - -You can edit the prebuilt [config.txt](config.txt) file found in the root of the repository or you can make your own. This file is required to run xmr-stak-cpu. +Now you have the binary located at "bin/xmr-stak" and the needed shared libraries. diff --git a/LINUXCOMPILE.md b/doc/compile_Linux.md index 4486081..4947fd2 100644 --- a/LINUXCOMPILE.md +++ b/doc/compile_Linux.md @@ -37,8 +37,7 @@ make install ``` -- g++ version 5.1 or higher is required for full C++11 support. CMake release compile scripts, as well as CodeBlocks build environment for debug builds is included. - +- g++ version 5.1 or higher is required for full C++11 support. If you want to compile the binary without installing libraries / compiler or just compile binary for some other distribution, please check the [build_xmr-stak_docker.sh script](scripts/build_xmr-stak_docker/build_xmr-stak_docker.sh). 
### To do a static build for a system without gcc 5.1+ @@ -47,6 +46,3 @@ If you want to compile the binary without installing libraries / compiler or jus make install ``` Note - cmake caches variables, so if you want to do a dynamic build later you need to specify '-DCMAKE_LINK_STATIC=OFF' - - - diff --git a/WINCOMPILE.md b/doc/compile_Windows.md index ec810af..dec1e75 100644 --- a/WINCOMPILE.md +++ b/doc/compile_Windows.md @@ -58,7 +58,7 @@ ## Compile -- download and unzip `xmr-stak-cpu` +- download and unzip `xmr-stak - open the command line terminal `cmd` - `cd` to your unzipped source code directory - execute the following commands (NOTE: path to VS2017 can be different) @@ -67,9 +67,7 @@ set CMAKE_PREFIX_PATH=C:\xmr-stak-dep\hwloc;C:\xmr-stak-dep\libmicrohttpd;C:\xmr-stak-dep\openssl mkdir build cd build - cmake -G "Visual Studio 15 2017 Win64" -T v141,host=x64 .. - msbuild xmr-stak-cpu.sln /p:Configuration=Release + cmake -G "Visual Studio 15 2017 Win64" -T v140,host=x64 .. + cmake --build . --config Release --target install cd bin\Release - copy ..\..\..\config.txt . ``` -- customize your `config.txt` file by adding the pool, username and password diff --git a/doc/tuning.md b/doc/tuning.md new file mode 100644 index 0000000..820fab5 --- /dev/null +++ b/doc/tuning.md @@ -0,0 +1,67 @@ +# Tuning Guide + +## Content Overview +* [NVIDIA Backend](#nvidia-backend) + * [Choose Value for `threads` and `blocks`](#choose-value-for-threads-and-blocks) + * [Add more GPUs](#add-more-gpus) +* [AMD Backend](#amd-backend) + * [Choose `intensity` and `worksize`](#choose-intensity-and-worksize) + * [Add more GPUs](#add-more-gpus) + +## NVIDIA Backend + +By default the NVIDIA backend can be tuned in the config file `nvidia.txt` + +### Choose Value for `threads` and `blocks` + +The optimal parameter for the `threads` and `blocks` option in `config.txt` depend on your GPU. 
+For all GPUs with a compute capability `>=2.0` and `<6.0` there is a restriction on the amount of RAM that can be used for the mining algorithm. +The maximum RAM that can be used must be less than 2GB (e.g. GTX TITAN) or 1GB (e.g. GTX 750-TI). +The amount of RAM used for mining can be changed with `"threads" : T, "blocks" : B`. + - `T` = threads used per block + - `B` = CUDA blocks started (should be a multiple of the multiprocessors `M` on the GPU) + +For the 2GB limit the following conditions must be fulfilled: `T * B * 2 <= 1900` and `B mod M == 0`. +The value `1900` is used because there is a little data overhead for administration. +The GTX Titan X has 24 multiprocessors `M`, this means a valid and good starting configuration is `"threads" : 16, "blocks" : 48`, +which fulfills all restrictions: `16 * 48 * 2 = 1536` and `48 mod 24 = 0`. + +The memory limit for NVIDIA Pascal GPUs is `16` GiB if the newest CUDA driver is used. + +### Add More GPUs + +To add a new GPU you need to add a new config set to `gpu_threads_conf`. +`index` is the number of the gpu; the index order may not follow the order from `nvidia-smi` or the order shown in Windows. + +``` +"gpu_threads_conf" : +[ + { "index" : 0, "threads" : 17, "blocks" : 60, "bfactor" : 0, "bsleep" : 0, "affine_to_cpu" : false}, + { "index" : 1, "threads" : 17, "blocks" : 60, "bfactor" : 0, "bsleep" : 0, "affine_to_cpu" : false}, +], +``` + +## AMD Backend + +By default the AMD backend can be tuned in the config file `amd.txt` + +### Choose `intensity` and `worksize` + +Intensity means the number of threads used to mine. +`worksize` is the number of threads working together to increase the miner performance. +In most cases a `worksize` of `16` or `8` is optimal. + +### Add More GPUs + +To add a new GPU you need to add a new config set to `gpu_threads_conf` and increase `gpu_thread_num` to the number of gpus (entries in `gpu_threads_conf`). +`index` is the number of the gpu. 
+ +``` +"gpu_thread_num" : 2, + +"gpu_threads_conf" : +[ + { "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false }, + { "index" : 1, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false }, +], +```
\ No newline at end of file diff --git a/doc/usage.md b/doc/usage.md new file mode 100644 index 0000000..42a29fd --- /dev/null +++ b/doc/usage.md @@ -0,0 +1,54 @@ +# HowTo Use xmr-stak + +## Content Overview +* [Configuration](#configuration) +* [Usage on Windows](#usage-on-windows) +* [Usage on Linux](#usage-on-linux) +* [Command Line Options](#command-line-options) +* [HTML and JSON API report configuration](#html-and-json-api-report-configuration) + +## Configuration + +Before you start the miner for the first time there are no config files available. +Config files will be created at the first start. +The number of files depends on the available backends. +`config.txt` contains the common miner settings. +`amd.txt`, `cpu.txt` and `nvidia.txt` contain miner backend specific settings and can be used for further tuning ([Tuning Guide](tuning.md)). + + +## Usage on Windows +1) Double click the `xmr-stak.exe` file +2) Fill in the pool url, username and password + +## Usage on Linux +1) Open a terminal within the folder with the binary +2) Start the miner with `./xmr-stak` + +## Command Line Options + +The miner allows some of the settings to be overridden via command line options. + +``` +Usage: xmr-stak [OPTION]... + + -c, --config FILE common miner configuration file + -h, --help show this help + --noCPU disable the CPU miner backend + --cpu FILE CPU backend miner config file + --noAMD disable the AMD miner backend + --amd FILE AMD backend miner config file + +The Following options temporary overwrites the config file settings: + -o, --url URL pool url and port, e.g. pool.usxmrpool.com:3333 + -u, --user USERNAME pool user name or wallet address + -p, --pass PASSWD pool password, in the most cases x or empty "" + +## HTML and JSON API report configuration + +To configure the reports shown on the [README](README.md) page you need to edit the httpd_port variable. Then enable wifi on your phone and navigate to [miner ip address]:[httpd_port] in your phone browser. 
If you want to use the data in scripts, you can get the JSON version of the data at url [miner ip address]:[httpd_port]/api.json + +## Default Developer Donation +By default the miner will donate 2% of the hashpower (2 minute in 100 minutes) to my pool. If you want to change that, edit **xmrstak/donate-level.hpp** before you build the binaries. + +If you want to donate directly to support further development, here is my wallet +``` diff --git a/minethd.h b/minethd.h deleted file mode 100644 index 293e8de..0000000 --- a/minethd.h +++ /dev/null @@ -1,144 +0,0 @@ -#pragma once -#include <thread> -#include <atomic> -#include <mutex> -#include "crypto/cryptonight.h" - -class telemetry -{ -public: - telemetry(size_t iThd); - void push_perf_value(size_t iThd, uint64_t iHashCount, uint64_t iTimestamp); - double calc_telemetry_data(size_t iLastMilisec, size_t iThread); - -private: - constexpr static size_t iBucketSize = 2 << 11; //Power of 2 to simplify calculations - constexpr static size_t iBucketMask = iBucketSize - 1; - uint32_t* iBucketTop; - uint64_t** ppHashCounts; - uint64_t** ppTimestamps; -}; - -class minethd -{ -public: - struct miner_work - { - char sJobID[64]; - uint8_t bWorkBlob[112]; - uint32_t iWorkSize; - uint32_t iResumeCnt; - uint64_t iTarget; - bool bNiceHash; - bool bStall; - size_t iPoolId; - - miner_work() : iWorkSize(0), bStall(true), iPoolId(0) { } - - miner_work(const char* sJobID, const uint8_t* bWork, uint32_t iWorkSize, uint32_t iResumeCnt, - uint64_t iTarget, bool bNiceHash, size_t iPoolId) : iWorkSize(iWorkSize), iResumeCnt(iResumeCnt), - iTarget(iTarget), bNiceHash(bNiceHash), bStall(false), iPoolId(iPoolId) - { - assert(iWorkSize <= sizeof(bWorkBlob)); - memcpy(this->sJobID, sJobID, sizeof(miner_work::sJobID)); - memcpy(this->bWorkBlob, bWork, iWorkSize); - } - - miner_work(miner_work const&) = delete; - - miner_work& operator=(miner_work const& from) - { - assert(this != &from); - - iWorkSize = from.iWorkSize; - iResumeCnt = 
from.iResumeCnt; - iTarget = from.iTarget; - bNiceHash = from.bNiceHash; - bStall = from.bStall; - iPoolId = from.iPoolId; - - assert(iWorkSize <= sizeof(bWorkBlob)); - memcpy(sJobID, from.sJobID, sizeof(sJobID)); - memcpy(bWorkBlob, from.bWorkBlob, iWorkSize); - - return *this; - } - - miner_work(miner_work&& from) : iWorkSize(from.iWorkSize), iTarget(from.iTarget), - bStall(from.bStall), iPoolId(from.iPoolId) - { - assert(iWorkSize <= sizeof(bWorkBlob)); - memcpy(sJobID, from.sJobID, sizeof(sJobID)); - memcpy(bWorkBlob, from.bWorkBlob, iWorkSize); - } - - miner_work& operator=(miner_work&& from) - { - assert(this != &from); - - iWorkSize = from.iWorkSize; - iResumeCnt = from.iResumeCnt; - iTarget = from.iTarget; - bNiceHash = from.bNiceHash; - bStall = from.bStall; - iPoolId = from.iPoolId; - - assert(iWorkSize <= sizeof(bWorkBlob)); - memcpy(sJobID, from.sJobID, sizeof(sJobID)); - memcpy(bWorkBlob, from.bWorkBlob, iWorkSize); - - return *this; - } - }; - - static void switch_work(miner_work& pWork); - static std::vector<minethd*>* thread_starter(miner_work& pWork); - static bool self_test(); - - std::atomic<uint64_t> iHashCount; - std::atomic<uint64_t> iTimestamp; - -private: - typedef void (*cn_hash_fun)(const void*, size_t, void*, cryptonight_ctx*); - typedef void (*cn_hash_fun_dbl)(const void*, size_t, void*, cryptonight_ctx* __restrict, cryptonight_ctx* __restrict); - - minethd(miner_work& pWork, size_t iNo, bool double_work, bool no_prefetch, int64_t affinity); - - // We use the top 10 bits of the nonce for thread and resume - // This allows us to resume up to 128 threads 4 times before - // we get nonce collisions - // Bottom 22 bits allow for an hour of work at 1000 H/s - inline uint32_t calc_start_nonce(uint32_t resume) - { return (resume * iThreadCount + iThreadNo) << 22; } - - // Limited version of the nonce calc above - inline uint32_t calc_nicehash_nonce(uint32_t start, uint32_t resume) - { return start | (resume * iThreadCount + iThreadNo) << 18; } 
- - static cn_hash_fun func_selector(bool bHaveAes, bool bNoPrefetch); - static cn_hash_fun_dbl func_dbl_selector(bool bHaveAes, bool bNoPrefetch); - - void work_main(); - void double_work_main(); - void consume_work(); - - static std::atomic<uint64_t> iGlobalJobNo; - static std::atomic<uint64_t> iConsumeCnt; - static uint64_t iThreadCount; - uint64_t iJobNo; - - static miner_work oGlobalWork; - miner_work oWork; - - void pin_thd_affinity(); - // Held by the creating context to prevent a race cond with oWorkThd = std::thread(...) - std::mutex work_thd_mtx; - - std::thread oWorkThd; - uint8_t iThreadNo; - int64_t affinity; - - bool bQuit; - bool bNoPrefetch; -}; - diff --git a/version.h b/version.h deleted file mode 100644 index 04a6ced..0000000 --- a/version.h +++ /dev/null @@ -1,4 +0,0 @@ -#pragma once - -#define XMR_STAK_NAME "xmr-stak-cpu" -#define XMR_STAK_VERSION "1.3.0-1.5.0-dev" diff --git a/xmr-stak-cpu.cbp b/xmr-stak-cpu.cbp deleted file mode 100644 index c2f3774..0000000 --- a/xmr-stak-cpu.cbp +++ /dev/null @@ -1,167 +0,0 @@ -<?xml version="1.0" encoding="UTF-8" standalone="yes" ?> -<CodeBlocks_project_file> - <FileVersion major="1" minor="6" /> - <Project> - <Option title="xmr-stak-cpu" /> - <Option pch_mode="2" /> - <Option compiler="gcc" /> - <Build> - <Target title="Debug"> - <Option output="bin/Debug/miner" prefix_auto="1" extension_auto="1" /> - <Option object_output="obj/Debug/" /> - <Option type="1" /> - <Option compiler="gcc" /> - <Option parameters="config-debug.txt" /> - <Compiler> - <Add option="-std=c++11" /> - <Add option="-m64" /> - <Add option="-g" /> - <Add directory="include" /> - </Compiler> - <Linker> - <Add option="-m64" /> - </Linker> - </Target> - <Target title="Release"> - <Option output="bin/Release/miner" prefix_auto="1" extension_auto="1" /> - <Option object_output="obj/Release/" /> - <Option type="1" /> - <Option compiler="gcc" /> - <Option parameters="config-debug.txt" /> - <Compiler> - <Add option="-O3" /> - <Add 
option="-std=c++11" /> - <Add option="-m64" /> - <Add option="-DNDEBUG" /> - <Add directory="include" /> - </Compiler> - <Linker> - <Add option="-s" /> - <Add option="-m64" /> - </Linker> - </Target> - <Target title="Release_test"> - <Option output="bin/Release_test/miner" prefix_auto="1" extension_auto="1" /> - <Option object_output="obj/Release_test/" /> - <Option type="1" /> - <Option compiler="gcc" /> - <Option parameters="config-debug.txt" /> - <Compiler> - <Add option="-O3" /> - <Add option="-std=c++11" /> - <Add option="-m64" /> - <Add directory="include" /> - </Compiler> - <Linker> - <Add option="-s" /> - <Add option="-m64" /> - </Linker> - </Target> - </Build> - <Compiler> - <Add option="-march=haswell" /> - <Add option="-Wall" /> - </Compiler> - <Linker> - <Add library="pthread" /> - <Add library="libmicrohttpd" /> - <Add library="crypto" /> - <Add library="ssl" /> - <Add library="libhwloc" /> - </Linker> - <Unit filename="autoAdjust.hpp" /> - <Unit filename="autoAdjustHwloc.hpp" /> - <Unit filename="cli-miner.cpp" /> - <Unit filename="console.cpp" /> - <Unit filename="console.h" /> - <Unit filename="crypto/c_blake256.c"> - <Option compilerVar="CC" /> - </Unit> - <Unit filename="crypto/c_blake256.h" /> - <Unit filename="crypto/c_groestl.c"> - <Option compilerVar="CC" /> - </Unit> - <Unit filename="crypto/c_groestl.h" /> - <Unit filename="crypto/c_jh.c"> - <Option compilerVar="CC" /> - </Unit> - <Unit filename="crypto/c_jh.h" /> - <Unit filename="crypto/c_keccak.c"> - <Option compilerVar="CC" /> - </Unit> - <Unit filename="crypto/c_keccak.h" /> - <Unit filename="crypto/c_skein.c"> - <Option compilerVar="CC" /> - </Unit> - <Unit filename="crypto/c_skein.h" /> - <Unit filename="crypto/cryptonight.h" /> - <Unit filename="crypto/cryptonight_aesni.h" /> - <Unit filename="crypto/cryptonight_common.cpp" /> - <Unit filename="crypto/groestl_tables.h" /> - <Unit filename="crypto/hash.h" /> - <Unit filename="crypto/int-util.h" /> - <Unit 
filename="crypto/skein_port.h" /> - <Unit filename="crypto/soft_aes.c"> - <Option compilerVar="CC" /> - </Unit> - <Unit filename="donate-level.h" /> - <Unit filename="executor.cpp" /> - <Unit filename="executor.h" /> - <Unit filename="httpd.cpp" /> - <Unit filename="httpd.h" /> - <Unit filename="hwlocMemory.hpp" /> - <Unit filename="jconf.cpp" /> - <Unit filename="jconf.h" /> - <Unit filename="jext.h" /> - <Unit filename="jpsock.cpp" /> - <Unit filename="jpsock.h" /> - <Unit filename="minethd.cpp" /> - <Unit filename="minethd.h" /> - <Unit filename="msgstruct.h" /> - <Unit filename="rapidjson/allocators.h" /> - <Unit filename="rapidjson/document.h" /> - <Unit filename="rapidjson/encodedstream.h" /> - <Unit filename="rapidjson/encodings.h" /> - <Unit filename="rapidjson/error/en.h" /> - <Unit filename="rapidjson/error/error.h" /> - <Unit filename="rapidjson/filereadstream.h" /> - <Unit filename="rapidjson/filewritestream.h" /> - <Unit filename="rapidjson/fwd.h" /> - <Unit filename="rapidjson/internal/biginteger.h" /> - <Unit filename="rapidjson/internal/diyfp.h" /> - <Unit filename="rapidjson/internal/dtoa.h" /> - <Unit filename="rapidjson/internal/ieee754.h" /> - <Unit filename="rapidjson/internal/itoa.h" /> - <Unit filename="rapidjson/internal/meta.h" /> - <Unit filename="rapidjson/internal/pow10.h" /> - <Unit filename="rapidjson/internal/regex.h" /> - <Unit filename="rapidjson/internal/stack.h" /> - <Unit filename="rapidjson/internal/strfunc.h" /> - <Unit filename="rapidjson/internal/strtod.h" /> - <Unit filename="rapidjson/internal/swap.h" /> - <Unit filename="rapidjson/istreamwrapper.h" /> - <Unit filename="rapidjson/memorybuffer.h" /> - <Unit filename="rapidjson/memorystream.h" /> - <Unit filename="rapidjson/msinttypes/inttypes.h" /> - <Unit filename="rapidjson/msinttypes/stdint.h" /> - <Unit filename="rapidjson/ostreamwrapper.h" /> - <Unit filename="rapidjson/pointer.h" /> - <Unit filename="rapidjson/prettywriter.h" /> - <Unit filename="rapidjson/rapidjson.h" 
/> - <Unit filename="rapidjson/reader.h" /> - <Unit filename="rapidjson/schema.h" /> - <Unit filename="rapidjson/stream.h" /> - <Unit filename="rapidjson/stringbuffer.h" /> - <Unit filename="rapidjson/writer.h" /> - <Unit filename="socket.cpp" /> - <Unit filename="socket.h" /> - <Unit filename="socks.h" /> - <Unit filename="thdq.hpp" /> - <Unit filename="webdesign.cpp" /> - <Unit filename="webdesign.h" /> - <Extensions> - <code_completion /> - <debugger /> - </Extensions> - </Project> -</CodeBlocks_project_file> diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp new file mode 100644 index 0000000..deb0fc7 --- /dev/null +++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp @@ -0,0 +1,887 @@ +/* + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include <stdio.h> +#include <string.h> +#include <math.h> +#include <iostream> +#include <vector> +#include <algorithm> +#include <regex> + +#ifdef _WIN32 +#include <windows.h> + +static inline void port_sleep(size_t sec) +{ + Sleep(sec * 1000); +} +#else +#include <unistd.h> + +static inline void port_sleep(size_t sec) +{ + sleep(sec); +} +#endif // _WIN32 + +#if 0 +static inline long long unsigned int int_port(size_t i) +{ + return i; +} +#endif + +#include "gpu.hpp" + +const char* err_to_str(cl_int ret) +{ + switch(ret) + { + case CL_SUCCESS: + return "CL_SUCCESS"; + case CL_DEVICE_NOT_FOUND: + return "CL_DEVICE_NOT_FOUND"; + case CL_DEVICE_NOT_AVAILABLE: + return "CL_DEVICE_NOT_AVAILABLE"; + case CL_COMPILER_NOT_AVAILABLE: + return "CL_COMPILER_NOT_AVAILABLE"; + case CL_MEM_OBJECT_ALLOCATION_FAILURE: + return "CL_MEM_OBJECT_ALLOCATION_FAILURE"; + case CL_OUT_OF_RESOURCES: + return "CL_OUT_OF_RESOURCES"; + case CL_OUT_OF_HOST_MEMORY: + return "CL_OUT_OF_HOST_MEMORY"; + case CL_PROFILING_INFO_NOT_AVAILABLE: + return "CL_PROFILING_INFO_NOT_AVAILABLE"; + case CL_MEM_COPY_OVERLAP: + return "CL_MEM_COPY_OVERLAP"; + case CL_IMAGE_FORMAT_MISMATCH: + return "CL_IMAGE_FORMAT_MISMATCH"; + case CL_IMAGE_FORMAT_NOT_SUPPORTED: + return "CL_IMAGE_FORMAT_NOT_SUPPORTED"; + case CL_BUILD_PROGRAM_FAILURE: + return "CL_BUILD_PROGRAM_FAILURE"; + case CL_MAP_FAILURE: + return "CL_MAP_FAILURE"; + case CL_MISALIGNED_SUB_BUFFER_OFFSET: + return "CL_MISALIGNED_SUB_BUFFER_OFFSET"; + case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST: + return "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST"; + case CL_COMPILE_PROGRAM_FAILURE: + return "CL_COMPILE_PROGRAM_FAILURE"; + case CL_LINKER_NOT_AVAILABLE: + return "CL_LINKER_NOT_AVAILABLE"; + case CL_LINK_PROGRAM_FAILURE: + return "CL_LINK_PROGRAM_FAILURE"; + case CL_DEVICE_PARTITION_FAILED: + return "CL_DEVICE_PARTITION_FAILED"; + case CL_KERNEL_ARG_INFO_NOT_AVAILABLE: + return "CL_KERNEL_ARG_INFO_NOT_AVAILABLE"; + case CL_INVALID_VALUE: + 
return "CL_INVALID_VALUE"; + case CL_INVALID_DEVICE_TYPE: + return "CL_INVALID_DEVICE_TYPE"; + case CL_INVALID_PLATFORM: + return "CL_INVALID_PLATFORM"; + case CL_INVALID_DEVICE: + return "CL_INVALID_DEVICE"; + case CL_INVALID_CONTEXT: + return "CL_INVALID_CONTEXT"; + case CL_INVALID_QUEUE_PROPERTIES: + return "CL_INVALID_QUEUE_PROPERTIES"; + case CL_INVALID_COMMAND_QUEUE: + return "CL_INVALID_COMMAND_QUEUE"; + case CL_INVALID_HOST_PTR: + return "CL_INVALID_HOST_PTR"; + case CL_INVALID_MEM_OBJECT: + return "CL_INVALID_MEM_OBJECT"; + case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR: + return "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR"; + case CL_INVALID_IMAGE_SIZE: + return "CL_INVALID_IMAGE_SIZE"; + case CL_INVALID_SAMPLER: + return "CL_INVALID_SAMPLER"; + case CL_INVALID_BINARY: + return "CL_INVALID_BINARY"; + case CL_INVALID_BUILD_OPTIONS: + return "CL_INVALID_BUILD_OPTIONS"; + case CL_INVALID_PROGRAM: + return "CL_INVALID_PROGRAM"; + case CL_INVALID_PROGRAM_EXECUTABLE: + return "CL_INVALID_PROGRAM_EXECUTABLE"; + case CL_INVALID_KERNEL_NAME: + return "CL_INVALID_KERNEL_NAME"; + case CL_INVALID_KERNEL_DEFINITION: + return "CL_INVALID_KERNEL_DEFINITION"; + case CL_INVALID_KERNEL: + return "CL_INVALID_KERNEL"; + case CL_INVALID_ARG_INDEX: + return "CL_INVALID_ARG_INDEX"; + case CL_INVALID_ARG_VALUE: + return "CL_INVALID_ARG_VALUE"; + case CL_INVALID_ARG_SIZE: + return "CL_INVALID_ARG_SIZE"; + case CL_INVALID_KERNEL_ARGS: + return "CL_INVALID_KERNEL_ARGS"; + case CL_INVALID_WORK_DIMENSION: + return "CL_INVALID_WORK_DIMENSION"; + case CL_INVALID_WORK_GROUP_SIZE: + return "CL_INVALID_WORK_GROUP_SIZE"; + case CL_INVALID_WORK_ITEM_SIZE: + return "CL_INVALID_WORK_ITEM_SIZE"; + case CL_INVALID_GLOBAL_OFFSET: + return "CL_INVALID_GLOBAL_OFFSET"; + case CL_INVALID_EVENT_WAIT_LIST: + return "CL_INVALID_EVENT_WAIT_LIST"; + case CL_INVALID_EVENT: + return "CL_INVALID_EVENT"; + case CL_INVALID_OPERATION: + return "CL_INVALID_OPERATION"; + case CL_INVALID_GL_OBJECT: + return 
"CL_INVALID_GL_OBJECT"; + case CL_INVALID_BUFFER_SIZE: + return "CL_INVALID_BUFFER_SIZE"; + case CL_INVALID_MIP_LEVEL: + return "CL_INVALID_MIP_LEVEL"; + case CL_INVALID_GLOBAL_WORK_SIZE: + return "CL_INVALID_GLOBAL_WORK_SIZE"; + case CL_INVALID_PROPERTY: + return "CL_INVALID_PROPERTY"; + case CL_INVALID_IMAGE_DESCRIPTOR: + return "CL_INVALID_IMAGE_DESCRIPTOR"; + case CL_INVALID_COMPILER_OPTIONS: + return "CL_INVALID_COMPILER_OPTIONS"; + case CL_INVALID_LINKER_OPTIONS: + return "CL_INVALID_LINKER_OPTIONS"; + case CL_INVALID_DEVICE_PARTITION_COUNT: + return "CL_INVALID_DEVICE_PARTITION_COUNT"; +#ifdef CL_VERSION_2_0 + case CL_INVALID_PIPE_SIZE: + return "CL_INVALID_PIPE_SIZE"; + case CL_INVALID_DEVICE_QUEUE: + return "CL_INVALID_DEVICE_QUEUE"; +#endif + default: + return "UNKNOWN_ERROR"; + } +} + +#if 0 +void printer::inst()->print_msg(L1,const char* fmt, ...); +void printer::inst()->print_str(const char* str); +#endif + +char* LoadTextFile(const char* filename) +{ + size_t flen; + char* out; + FILE* kernel = fopen(filename, "rb"); + + if(kernel == NULL) + return NULL; + + fseek(kernel, 0, SEEK_END); + flen = ftell(kernel); + fseek(kernel, 0, SEEK_SET); + + out = (char*)malloc(flen+1); + size_t r = fread(out, flen, 1, kernel); + fclose(kernel); + + if(r != 1) + { + free(out); + return NULL; + } + + out[flen] = '\0'; + return out; +} + +size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_code) +{ + size_t MaximumWorkSize; + cl_int ret; + + if((ret = clGetDeviceInfo(ctx->DeviceID, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &MaximumWorkSize, NULL)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when querying a device's max worksize using clGetDeviceInfo.", err_to_str(ret)); + return ERR_OCL_API; + } + + printer::inst()->print_msg(L1,"Device %lu work size %lu / %lu.", ctx->deviceIdx, ctx->workSize, MaximumWorkSize); +#ifdef CL_VERSION_2_0 + const cl_queue_properties CommandQueueProperties[] = { 0, 0, 0 }; + 
ctx->CommandQueues = clCreateCommandQueueWithProperties(opencl_ctx, ctx->DeviceID, CommandQueueProperties, &ret); +#else + const cl_command_queue_properties CommandQueueProperties = { 0 }; + ctx->CommandQueues = clCreateCommandQueue(opencl_ctx, ctx->DeviceID, CommandQueueProperties, &ret); +#endif + + if(ret != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clCreateCommandQueueWithProperties.", err_to_str(ret)); + return ERR_OCL_API; + } + + ctx->InputBuffer = clCreateBuffer(opencl_ctx, CL_MEM_READ_ONLY, 88, NULL, &ret); + if(ret != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clCreateBuffer to create input buffer.", err_to_str(ret)); + return ERR_OCL_API; + } + + size_t g_thd = ctx->rawIntensity; + ctx->ExtraBuffers[0] = clCreateBuffer(opencl_ctx, CL_MEM_READ_WRITE, (1 << 21) * g_thd, NULL, &ret); + if(ret != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clCreateBuffer to create hash scratchpads buffer.", err_to_str(ret)); + return ERR_OCL_API; + } + + ctx->ExtraBuffers[1] = clCreateBuffer(opencl_ctx, CL_MEM_READ_WRITE, 200 * g_thd, NULL, &ret); + if(ret != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clCreateBuffer to create hash states buffer.", err_to_str(ret)); + return ERR_OCL_API; + } + + // Blake-256 branches + ctx->ExtraBuffers[2] = clCreateBuffer(opencl_ctx, CL_MEM_READ_WRITE, sizeof(cl_uint) * (g_thd + 2), NULL, &ret); + if(ret != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clCreateBuffer to create Branch 0 buffer.", err_to_str(ret)); + return ERR_OCL_API; + } + + // Groestl-256 branches + ctx->ExtraBuffers[3] = clCreateBuffer(opencl_ctx, CL_MEM_READ_WRITE, sizeof(cl_uint) * (g_thd + 2), NULL, &ret); + if(ret != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clCreateBuffer to create Branch 1 buffer.", err_to_str(ret)); + return ERR_OCL_API; + } + + // JH-256 branches + ctx->ExtraBuffers[4] = 
clCreateBuffer(opencl_ctx, CL_MEM_READ_WRITE, sizeof(cl_uint) * (g_thd + 2), NULL, &ret); + if(ret != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clCreateBuffer to create Branch 2 buffer.", err_to_str(ret)); + return ERR_OCL_API; + } + + // Skein-512 branches + ctx->ExtraBuffers[5] = clCreateBuffer(opencl_ctx, CL_MEM_READ_WRITE, sizeof(cl_uint) * (g_thd + 2), NULL, &ret); + if(ret != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clCreateBuffer to create Branch 3 buffer.", err_to_str(ret)); + return ERR_OCL_API; + } + + // Assume we may find up to 0xFF nonces in one run - it's reasonable + ctx->OutputBuffer = clCreateBuffer(opencl_ctx, CL_MEM_READ_WRITE, sizeof(cl_uint) * 0x100, NULL, &ret); + if(ret != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clCreateBuffer to create output buffer.", err_to_str(ret)); + return ERR_OCL_API; + } + + ctx->Program = clCreateProgramWithSource(opencl_ctx, 1, (const char**)&source_code, NULL, &ret); + if(ret != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clCreateProgramWithSource on the contents of cryptonight.cl", err_to_str(ret)); + return ERR_OCL_API; + } + + char options[32]; + snprintf(options, sizeof(options), "-I. 
-DWORKSIZE=%llu", int_port(ctx->workSize)); + ret = clBuildProgram(ctx->Program, 1, &ctx->DeviceID, options, NULL, NULL); + if(ret != CL_SUCCESS) + { + size_t len; + printer::inst()->print_msg(L1,"Error %s when calling clBuildProgram.", err_to_str(ret)); + + if((ret = clGetProgramBuildInfo(ctx->Program, ctx->DeviceID, CL_PROGRAM_BUILD_LOG, 0, NULL, &len)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clGetProgramBuildInfo for length of build log output.", err_to_str(ret)); + return ERR_OCL_API; + } + + char* BuildLog = (char*)malloc(len + 1); + BuildLog[0] = '\0'; + + if((ret = clGetProgramBuildInfo(ctx->Program, ctx->DeviceID, CL_PROGRAM_BUILD_LOG, len, BuildLog, NULL)) != CL_SUCCESS) + { + free(BuildLog); + printer::inst()->print_msg(L1,"Error %s when calling clGetProgramBuildInfo for build log.", err_to_str(ret)); + return ERR_OCL_API; + } + + printer::inst()->print_str("Build log:\n"); + std::cerr<<BuildLog<<std::endl; + + free(BuildLog); + return ERR_OCL_API; + } + + cl_build_status status; + do + { + if((ret = clGetProgramBuildInfo(ctx->Program, ctx->DeviceID, CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &status, NULL)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clGetProgramBuildInfo for status of build.", err_to_str(ret)); + return ERR_OCL_API; + } + port_sleep(1); + } + while(status == CL_BUILD_IN_PROGRESS); + + const char *KernelNames[] = { "cn0", "cn1", "cn2", "Blake", "Groestl", "JH", "Skein" }; + for(int i = 0; i < 7; ++i) + { + ctx->Kernels[i] = clCreateKernel(ctx->Program, KernelNames[i], &ret); + if(ret != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clCreateKernel for kernel %s.", err_to_str(ret), KernelNames[i]); + return ERR_OCL_API; + } + } + + ctx->Nonce = 0; + return 0; +} + +const cl_platform_info attributeTypes[5] = { + CL_PLATFORM_NAME, + CL_PLATFORM_VENDOR, + CL_PLATFORM_VERSION, + CL_PLATFORM_PROFILE, + CL_PLATFORM_EXTENSIONS +}; + +const char* const 
attributeNames[] = { + "CL_PLATFORM_NAME", + "CL_PLATFORM_VENDOR", + "CL_PLATFORM_VERSION", + "CL_PLATFORM_PROFILE", + "CL_PLATFORM_EXTENSIONS" +}; + +#define NELEMS(x) (sizeof(x) / sizeof((x)[0])) + +void PrintDeviceInfo(cl_device_id device) +{ + char queryBuffer[1024]; + int queryInt; + cl_int clError; + clError = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(queryBuffer), &queryBuffer, NULL); + printf(" CL_DEVICE_NAME: %s\n", queryBuffer); + queryBuffer[0] = '\0'; + clError = clGetDeviceInfo(device, CL_DEVICE_VENDOR, sizeof(queryBuffer), &queryBuffer, NULL); + printf(" CL_DEVICE_VENDOR: %s\n", queryBuffer); + queryBuffer[0] = '\0'; + clError = clGetDeviceInfo(device, CL_DRIVER_VERSION, sizeof(queryBuffer), &queryBuffer, NULL); + printf(" CL_DRIVER_VERSION: %s\n", queryBuffer); + queryBuffer[0] = '\0'; + clError = clGetDeviceInfo(device, CL_DEVICE_VERSION, sizeof(queryBuffer), &queryBuffer, NULL); + printf(" CL_DEVICE_VERSION: %s\n", queryBuffer); + queryBuffer[0] = '\0'; + clError = clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(int), &queryInt, NULL); + printf(" CL_DEVICE_MAX_COMPUTE_UNITS: %d\n", queryInt); +} + +uint32_t getNumPlatforms() +{ + cl_uint num_platforms = 0; + cl_platform_id * platforms = NULL; + cl_int clStatus; + + // Get platform and device information + clStatus = clGetPlatformIDs(0, NULL, &num_platforms); + platforms = (cl_platform_id *) malloc(sizeof(cl_platform_id) * num_platforms); + clStatus = clGetPlatformIDs(num_platforms, platforms, NULL); + + return num_platforms; +} + +std::vector<GpuContext> getAMDDevices(int index) +{ + std::vector<GpuContext> ctxVec; + cl_platform_id * platforms = NULL; + cl_int clStatus; + cl_uint num_devices; + cl_device_id *device_list = NULL; + + uint32_t numPlatforms = getNumPlatforms(); + + + platforms = (cl_platform_id *) malloc(sizeof(cl_platform_id) * numPlatforms); + clStatus = clGetPlatformIDs(numPlatforms, platforms, NULL); + + clStatus = clGetDeviceIDs( platforms[index], 
CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices); + device_list = (cl_device_id *) malloc(sizeof(cl_device_id)*num_devices); + clStatus = clGetDeviceIDs( platforms[index], CL_DEVICE_TYPE_GPU, num_devices, device_list, NULL); + for (int k = 0; k < num_devices; k++) { + cl_int clError; + std::vector<char> devVendorVec(1024); + clError = clGetDeviceInfo(device_list[k], CL_DEVICE_VENDOR, devVendorVec.size(), devVendorVec.data(), NULL); + std::string devVendor(devVendorVec.data()); + if( devVendor.find("Advanced Micro Devices") != std::string::npos) + { + GpuContext ctx; + ctx.deviceIdx = k; + clError = clGetDeviceInfo(device_list[k], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(int), &(ctx.computeUnits), NULL); + size_t maxMem; + clError = clGetDeviceInfo(device_list[k], CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &(maxMem), NULL); + clError = clGetDeviceInfo(device_list[k], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(size_t), &(ctx.freeMem), NULL); + // if environment variable GPU_SINGLE_ALLOC_PERCENT is not set we can not allocate the full memory + ctx.freeMem = std::min(ctx.freeMem, maxMem); + std::vector<char> devNameVec(1024); + clError = clGetDeviceInfo(device_list[k], CL_DEVICE_NAME, devNameVec.size(), devNameVec.data(), NULL); + ctx.name = std::string(devNameVec.data()); + printer::inst()->print_msg(L0,"Found OpenCL GPU %s.",ctx.name.c_str()); + ctx.DeviceID = device_list[k]; + ctxVec.push_back(ctx); + } + } + + + free(device_list); + free(platforms); + + return ctxVec; +} + +int getAMDPlatformIdx() +{ + + uint32_t numPlatforms = getNumPlatforms(); + + if(numPlatforms == 0) + { + printer::inst()->print_msg(L0,"WARNING: No OpenCL platform found."); + return -1; + } + cl_platform_id * platforms = NULL; + cl_int clStatus; + + platforms = (cl_platform_id *) malloc(sizeof(cl_platform_id) * numPlatforms); + clStatus = clGetPlatformIDs(numPlatforms, platforms, NULL); + + int platformIndex = -1; + + for (int i = 0; i < numPlatforms; i++) { + size_t infoSize; + clGetPlatformInfo(platforms[i], 
CL_PLATFORM_VENDOR, 0, NULL, &infoSize); + std::vector<char> platformNameVec(infoSize); + + clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, infoSize, platformNameVec.data(), NULL); + std::string platformName(platformNameVec.data()); + if( platformName.find("Advanced Micro Devices") != std::string::npos) + { + platformIndex = i; + printer::inst()->print_msg(L0,"Found AMD platform index id = %i, name = %s",i , platformName.c_str()); + break; + } + } + + free(platforms); + return platformIndex; +} + +// RequestedDeviceIdxs is a list of OpenCL device indexes +// NumDevicesRequested is number of devices in RequestedDeviceIdxs list +// Returns 0 on success, -1 on stupid params, -2 on OpenCL API error +size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx) +{ + + cl_context opencl_ctx; + cl_int ret; + cl_uint entries; + + if((ret = clGetPlatformIDs(0, NULL, &entries)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clGetPlatformIDs for number of platforms.", err_to_str(ret)); + return ERR_OCL_API; + } + + + // The number of platforms naturally is the index of the last platform plus one. 
+ if(entries <= platform_idx) + { + printer::inst()->print_msg(L1,"Selected OpenCL platform index %d doesn't exist.", platform_idx); + return ERR_STUPID_PARAMS; + } + + + + cl_platform_id * platforms = NULL; + cl_int clStatus; + uint32_t numPlatforms = getNumPlatforms(); + + platforms = (cl_platform_id *) malloc(sizeof(cl_platform_id) * numPlatforms); + clStatus = clGetPlatformIDs(numPlatforms, platforms, NULL); + + size_t infoSize; + clGetPlatformInfo(platforms[platform_idx], CL_PLATFORM_VENDOR, 0, NULL, &infoSize); + std::vector<char> platformNameVec(infoSize); + clGetPlatformInfo(platforms[platform_idx], CL_PLATFORM_VENDOR, infoSize, platformNameVec.data(), NULL); + std::string platformName(platformNameVec.data()); + if( platformName.find("Advanced Micro Devices") == std::string::npos) + { + printer::inst()->print_msg(L1,"WARNING: using non AMD device: %s", platformName.c_str()); + } + + free(platforms); + + /*MSVC skimping on devel costs by shoehorning C99 to be a subset of C++? Noooo... 
can't be.*/ +#ifdef __GNUC__ + cl_platform_id PlatformIDList[entries]; +#else + cl_platform_id* PlatformIDList = (cl_platform_id*)_alloca(entries * sizeof(cl_platform_id)); +#endif + if((ret = clGetPlatformIDs(entries, PlatformIDList, NULL)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clGetPlatformIDs for platform ID information.", err_to_str(ret)); + return ERR_OCL_API; + } + + if((ret = clGetDeviceIDs(PlatformIDList[platform_idx], CL_DEVICE_TYPE_GPU, 0, NULL, &entries)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clGetDeviceIDs for number of devices.", err_to_str(ret)); + return ERR_OCL_API; + } + + // Same as the platform index sanity check, except we must check all requested device indexes + for(int i = 0; i < num_gpus; ++i) + { + if(entries <= ctx[i].deviceIdx) + { + printer::inst()->print_msg(L1,"Selected OpenCL device index %lu doesn't exist.\n", ctx[i].deviceIdx); + return ERR_STUPID_PARAMS; + } + } + +#ifdef __GNUC__ + cl_device_id DeviceIDList[entries]; +#else + cl_device_id* DeviceIDList = (cl_device_id*)_alloca(entries * sizeof(cl_device_id)); +#endif + if((ret = clGetDeviceIDs(PlatformIDList[platform_idx], CL_DEVICE_TYPE_GPU, entries, DeviceIDList, NULL)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clGetDeviceIDs for device ID information.", err_to_str(ret)); + return ERR_OCL_API; + } + + // Indexes sanity checked above +#ifdef __GNUC__ + cl_device_id TempDeviceList[num_gpus]; +#else + cl_device_id* TempDeviceList = (cl_device_id*)_alloca(entries * sizeof(cl_device_id)); +#endif + for(int i = 0; i < num_gpus; ++i) + { + ctx[i].DeviceID = DeviceIDList[ctx[i].deviceIdx]; + TempDeviceList[i] = DeviceIDList[ctx[i].deviceIdx]; + } + + opencl_ctx = clCreateContext(NULL, num_gpus, TempDeviceList, NULL, NULL, &ret); + if(ret != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clCreateContext.", err_to_str(ret)); + return ERR_OCL_API; + } + + //char* 
source_code = LoadTextFile(sSourcePath); + + const char *cryptonightCL = + #include "./opencl/cryptonight.cl" + ; + const char *blake256CL = + #include "./opencl/blake256.cl" + ; + const char *groestl256CL = + #include "./opencl/groestl256.cl" + ; + const char *jhCL = + #include "./opencl/jh.cl" + ; + const char *wolfAesCL = + #include "./opencl/wolf-aes.cl" + ; + const char *wolfSkeinCL = + #include "./opencl/wolf-skein.cl" + ; + + std::string source_code(cryptonightCL); + source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_WOLF_AES"), wolfAesCL); + source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_WOLF_SKEIN"), wolfSkeinCL); + source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_JH"), jhCL); + source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_BLAKE256"), blake256CL); + source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_GROESTL256"), groestl256CL); + + for(int i = 0; i < num_gpus; ++i) + { + if((ret = InitOpenCLGpu(opencl_ctx, &ctx[i], source_code.c_str())) != ERR_SUCCESS) + { + return ret; + } + } + + return ERR_SUCCESS; +} + +size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint32_t target) +{ + cl_int ret; + + if(input_len > 84) + return ERR_STUPID_PARAMS; + + input[input_len] = 0x01; + memset(input + input_len + 1, 0, 88 - input_len - 1); + + if((ret = clEnqueueWriteBuffer(ctx->CommandQueues, ctx->InputBuffer, CL_TRUE, 0, 88, input, 0, NULL, NULL)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clEnqueueWriteBuffer to fill input buffer.", err_to_str(ret)); + return ERR_OCL_API; + } + + if((ret = clSetKernelArg(ctx->Kernels[0], 0, sizeof(cl_mem), &ctx->InputBuffer)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 0.", err_to_str(ret)); + return ERR_OCL_API; + } + + // Scratchpads + if((ret = clSetKernelArg(ctx->Kernels[0], 1, sizeof(cl_mem), 
ctx->ExtraBuffers + 0)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 1.", err_to_str(ret)); + return ERR_OCL_API; + } + + // States + if((ret = clSetKernelArg(ctx->Kernels[0], 2, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 2.", err_to_str(ret)); + return ERR_OCL_API; + } + + // CN2 Kernel + + // Scratchpads + if((ret = clSetKernelArg(ctx->Kernels[1], 0, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 1, argument 0.", err_to_str(ret)); + return ERR_OCL_API; + } + + // States + if((ret = clSetKernelArg(ctx->Kernels[1], 1, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 1, argument 1.", err_to_str(ret)); + return ERR_OCL_API; + } + + // CN3 Kernel + // Scratchpads + if((ret = clSetKernelArg(ctx->Kernels[2], 0, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 0.", err_to_str(ret)); + return ERR_OCL_API; + } + + // States + if((ret = clSetKernelArg(ctx->Kernels[2], 1, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 1.", err_to_str(ret)); + return ERR_OCL_API; + } + + // Branch 0 + if((ret = clSetKernelArg(ctx->Kernels[2], 2, sizeof(cl_mem), ctx->ExtraBuffers + 2)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 2.", err_to_str(ret)); + return ERR_OCL_API; + } + + // Branch 1 + if((ret = clSetKernelArg(ctx->Kernels[2], 3, sizeof(cl_mem), ctx->ExtraBuffers + 3)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for 
kernel 2, argument 3.", err_to_str(ret)); + return ERR_OCL_API; + } + + // Branch 2 + if((ret = clSetKernelArg(ctx->Kernels[2], 4, sizeof(cl_mem), ctx->ExtraBuffers + 4)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 4.", err_to_str(ret)); + return ERR_OCL_API; + } + + // Branch 3 + if((ret = clSetKernelArg(ctx->Kernels[2], 5, sizeof(cl_mem), ctx->ExtraBuffers + 5)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 5.", err_to_str(ret)); + return ERR_OCL_API; + } + + for(int i = 0; i < 4; ++i) + { + // States + if((ret = clSetKernelArg(ctx->Kernels[i + 3], 0, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 0); + return ERR_OCL_API; + } + + // Nonce buffer + if((ret = clSetKernelArg(ctx->Kernels[i + 3], 1, sizeof(cl_mem), ctx->ExtraBuffers + (i + 2))) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 1); + return ERR_OCL_API; + } + + // Output + if((ret = clSetKernelArg(ctx->Kernels[i + 3], 2, sizeof(cl_mem), &ctx->OutputBuffer)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 2); + return ERR_OCL_API; + } + + // Target + if((ret = clSetKernelArg(ctx->Kernels[i + 3], 3, sizeof(cl_uint), &target)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 3); + return ERR_OCL_API; + } + } + + return ERR_SUCCESS; +} + +size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput) +{ + cl_int ret; + cl_uint zero = 0; + size_t BranchNonces[4]; + memset(BranchNonces,0,sizeof(size_t)*4); + + size_t g_thd = ctx->rawIntensity; + size_t w_size = 
ctx->workSize; + + for(int i = 2; i < 6; ++i) + { + if((ret = clEnqueueWriteBuffer(ctx->CommandQueues, ctx->ExtraBuffers[i], CL_FALSE, sizeof(cl_uint) * g_thd, sizeof(cl_uint), &zero, 0, NULL, NULL)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clEnqueueWriteBuffer to zero branch buffer counter %d.", err_to_str(ret), i - 2); + return ERR_OCL_API; + } + } + + if((ret = clEnqueueWriteBuffer(ctx->CommandQueues, ctx->OutputBuffer, CL_FALSE, sizeof(cl_uint) * 0xFF, sizeof(cl_uint), &zero, 0, NULL, NULL)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clEnqueueReadBuffer to fetch results.", err_to_str(ret)); + return ERR_OCL_API; + } + + clFinish(ctx->CommandQueues); + + size_t Nonce[2] = {ctx->Nonce, 1}, gthreads[2] = { g_thd, 8 }, lthreads[2] = { w_size, 8 }; + if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[0], 2, Nonce, gthreads, lthreads, 0, NULL, NULL)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 0); + return ERR_OCL_API; + } + + /*for(int i = 1; i < 3; ++i) + { + if((ret = clEnqueueNDRangeKernel(*ctx->CommandQueues, ctx->Kernels[i], 1, &ctx->Nonce, &g_thd, &w_size, 0, NULL, NULL)) != CL_SUCCESS) + { + Log(LOG_CRITICAL, "Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), i); + return(ERR_OCL_API); + } + }*/ + + if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[1], 1, &ctx->Nonce, &g_thd, &w_size, 0, NULL, NULL)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 1); + return ERR_OCL_API; + } + + if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[2], 2, Nonce, gthreads, lthreads, 0, NULL, NULL)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 2); + return ERR_OCL_API; + } + + if((ret = 
clEnqueueReadBuffer(ctx->CommandQueues, ctx->ExtraBuffers[2], CL_FALSE, sizeof(cl_uint) * g_thd, sizeof(cl_uint), BranchNonces, 0, NULL, NULL)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clEnqueueReadBuffer to fetch results.", err_to_str(ret)); + return ERR_OCL_API; + } + + if((ret = clEnqueueReadBuffer(ctx->CommandQueues, ctx->ExtraBuffers[3], CL_FALSE, sizeof(cl_uint) * g_thd, sizeof(cl_uint), BranchNonces + 1, 0, NULL, NULL)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clEnqueueReadBuffer to fetch results.", err_to_str(ret)); + return ERR_OCL_API; + } + + if((ret = clEnqueueReadBuffer(ctx->CommandQueues, ctx->ExtraBuffers[4], CL_FALSE, sizeof(cl_uint) * g_thd, sizeof(cl_uint), BranchNonces + 2, 0, NULL, NULL)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clEnqueueReadBuffer to fetch results.", err_to_str(ret)); + return ERR_OCL_API; + } + + if((ret = clEnqueueReadBuffer(ctx->CommandQueues, ctx->ExtraBuffers[5], CL_FALSE, sizeof(cl_uint) * g_thd, sizeof(cl_uint), BranchNonces + 3, 0, NULL, NULL)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clEnqueueReadBuffer to fetch results.", err_to_str(ret)); + return ERR_OCL_API; + } + + clFinish(ctx->CommandQueues); + + for(int i = 0; i < 4; ++i) + { + if(BranchNonces[i]) + { + // Threads + if((clSetKernelArg(ctx->Kernels[i + 3], 4, sizeof(cl_ulong), BranchNonces + i)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 4); + return(ERR_OCL_API); + } + + BranchNonces[i] = ((size_t)ceil( (double)BranchNonces[i] / (double)w_size) ) * w_size; + if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[i + 3], 1, &ctx->Nonce, BranchNonces + i, &w_size, 0, NULL, NULL)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), i + 3); + return 
ERR_OCL_API; + } + } + } + + if((ret = clEnqueueReadBuffer(ctx->CommandQueues, ctx->OutputBuffer, CL_TRUE, 0, sizeof(cl_uint) * 0x100, HashOutput, 0, NULL, NULL)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clEnqueueReadBuffer to fetch results.", err_to_str(ret)); + return ERR_OCL_API; + } + + clFinish(ctx->CommandQueues); + ctx->Nonce += g_thd; + + return ERR_SUCCESS; +} diff --git a/xmrstak/backend/amd/amd_gpu/gpu.hpp b/xmrstak/backend/amd/amd_gpu/gpu.hpp new file mode 100644 index 0000000..5ff7ea1 --- /dev/null +++ b/xmrstak/backend/amd/amd_gpu/gpu.hpp @@ -0,0 +1,51 @@ +#pragma once + +#include "xmrstak/misc/console.hpp" + +#if defined(__APPLE__) +#include <OpenCL/cl.h> +#else +#include <CL/cl.h> +#endif + +#include <stdint.h> +#include <vector> + +#define ERR_SUCCESS (0) +#define ERR_OCL_API (2) +#define ERR_STUPID_PARAMS (1) + + + +struct GpuContext +{ + /*Input vars*/ + size_t deviceIdx; + size_t rawIntensity; + size_t workSize; + + /*Output vars*/ + cl_device_id DeviceID; + cl_command_queue CommandQueues; + cl_mem InputBuffer; + cl_mem OutputBuffer; + cl_mem ExtraBuffers[6]; + cl_program Program; + cl_kernel Kernels[7]; + size_t freeMem; + int computeUnits; + std::string name; + + size_t Nonce; + +}; + +uint32_t getNumPlatforms(); +int getAMDPlatformIdx(); +std::vector<GpuContext> getAMDDevices(int index); + +size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx); +size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint32_t target); +size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput); + + diff --git a/xmrstak/backend/amd/amd_gpu/opencl/blake256.cl b/xmrstak/backend/amd/amd_gpu/opencl/blake256.cl new file mode 100644 index 0000000..3d5fe3e --- /dev/null +++ b/xmrstak/backend/amd/amd_gpu/opencl/blake256.cl @@ -0,0 +1,93 @@ +R"===( +/* +* blake256 kernel implementation. 
+* +* ==========================(LICENSE BEGIN)============================ +* Copyright (c) 2014 djm34 +* Copyright (c) 2014 tpruvot +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files (the +* "Software"), to deal in the Software without restriction, including +* without limitation the rights to use, copy, modify, merge, publish, +* distribute, sublicense, and/or sell copies of the Software, and to +* permit persons to whom the Software is furnished to do so, subject to +* the following conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+* +* ===========================(LICENSE END)============================= +* +* @author djm34 +*/ +__constant static const int sigma[16][16] = { + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, + { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, + { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, + { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, + { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, + { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, + { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } +}; + + +__constant static const sph_u32 c_IV256[8] = { + 0x6A09E667, 0xBB67AE85, + 0x3C6EF372, 0xA54FF53A, + 0x510E527F, 0x9B05688C, + 0x1F83D9AB, 0x5BE0CD19 +}; + +/* Second part (64-80) msg never change, store it */ +__constant static const sph_u32 c_Padding[16] = { + 0, 0, 0, 0, + 0x80000000, 0, 0, 0, + 0, 0, 0, 0, + 0, 1, 0, 640, +}; +__constant static const sph_u32 c_u256[16] = { + 0x243F6A88, 0x85A308D3, + 0x13198A2E, 0x03707344, + 0xA4093822, 0x299F31D0, + 0x082EFA98, 0xEC4E6C89, + 0x452821E6, 0x38D01377, + 0xBE5466CF, 0x34E90C6C, + 0xC0AC29B7, 0xC97C50DD, + 0x3F84D5B5, 0xB5470917 +}; + +#define GS(a,b,c,d,x) { \ + const sph_u32 idx1 = sigma[r][x]; \ + const sph_u32 idx2 = sigma[r][x+1]; \ + v[a] += (m[idx1] ^ c_u256[idx2]) + v[b]; \ + v[d] ^= v[a]; \ + v[d] = rotate(v[d], 16U); \ + v[c] += v[d]; \ + v[b] ^= v[c]; \ + v[b] = rotate(v[b], 20U); \ +\ + v[a] += (m[idx2] ^ 
c_u256[idx1]) + v[b]; \ + v[d] ^= v[a]; \ + v[d] = rotate(v[d], 24U); \ + v[c] += v[d]; \ + v[b] ^= v[c]; \ + v[b] = rotate(v[b], 25U); \ +} +)===" diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl new file mode 100644 index 0000000..dd0ebcb --- /dev/null +++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl @@ -0,0 +1,860 @@ +R"===( +/* + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifdef cl_amd_media_ops +#pragma OPENCL EXTENSION cl_amd_media_ops : enable +#else +/* taken from https://www.khronos.org/registry/OpenCL/extensions/amd/cl_amd_media_ops.txt + * Build-in Function + * uintn amd_bitalign (uintn src0, uintn src1, uintn src2) + * Description + * dst.s0 = (uint) (((((long)src0.s0) << 32) | (long)src1.s0) >> (src2.s0 & 31)) + * similar operation applied to other components of the vectors. + * + * The implemented function is modified because the last is in our case always a scalar. + * We can ignore the bitwise AND operation. 
+ */ +inline uint2 amd_bitalign( const uint2 src0, const uint2 src1, const uint src2) +{ + uint2 result; + result.s0 = (uint) (((((long)src0.s0) << 32) | (long)src1.s0) >> (src2)); + result.s1 = (uint) (((((long)src0.s1) << 32) | (long)src1.s1) >> (src2)); + return result; +} +#endif + +#ifdef cl_amd_media_ops2 +#pragma OPENCL EXTENSION cl_amd_media_ops2 : enable +#else +/* taken from: https://www.khronos.org/registry/OpenCL/extensions/amd/cl_amd_media_ops2.txt + * Built-in Function: + * uintn amd_bfe (uintn src0, uintn src1, uintn src2) + * Description + * NOTE: operator >> below represent logical right shift + * offset = src1.s0 & 31; + * width = src2.s0 & 31; + * if width = 0 + * dst.s0 = 0; + * else if (offset + width) < 32 + * dst.s0 = (src0.s0 << (32 - offset - width)) >> (32 - width); + * else + * dst.s0 = src0.s0 >> offset; + * similar operation applied to other components of the vectors + */ +inline int amd_bfe(const uint src0, const uint offset, const uint width) +{ + /* casts are removed because we can implement everything as uint + * int offset = src1; + * int width = src2; + * remove check for edge case, this function is always called with + * `width==8` + * @code + * if ( width == 0 ) + * return 0; + * @endcode + */ + if ( (offset + width) < 32u ) + return (src0 << (32u - offset - width)) >> (32u - width); + + return src0 >> offset; +} +#endif + +//#include "opencl/wolf-aes.cl" +XMRSTAK_INCLUDE_WOLF_AES +//#include "opencl/wolf-skein.cl" +XMRSTAK_INCLUDE_WOLF_SKEIN +//#include "opencl/jh.cl" +XMRSTAK_INCLUDE_JH +//#include "opencl/blake256.cl" +XMRSTAK_INCLUDE_BLAKE256 +//#include "opencl/groestl256.cl" +XMRSTAK_INCLUDE_GROESTL256 + +static const __constant ulong keccakf_rndc[24] = +{ + 0x0000000000000001, 0x0000000000008082, 0x800000000000808a, + 0x8000000080008000, 0x000000000000808b, 0x0000000080000001, + 0x8000000080008081, 0x8000000000008009, 0x000000000000008a, + 0x0000000000000088, 0x0000000080008009, 0x000000008000000a, + 0x000000008000808b, 
0x800000000000008b, 0x8000000000008089, + 0x8000000000008003, 0x8000000000008002, 0x8000000000000080, + 0x000000000000800a, 0x800000008000000a, 0x8000000080008081, + 0x8000000000008080, 0x0000000080000001, 0x8000000080008008 +}; +/* AES S-box: byte substitution table, read one byte at a time by the SubWord macro during key expansion. */ +static const __constant uchar sbox[256] = +{ + 0x63, 0x7C, 0x77, 0x7B, 0xF2, 0x6B, 0x6F, 0xC5, 0x30, 0x01, 0x67, 0x2B, 0xFE, 0xD7, 0xAB, 0x76, + 0xCA, 0x82, 0xC9, 0x7D, 0xFA, 0x59, 0x47, 0xF0, 0xAD, 0xD4, 0xA2, 0xAF, 0x9C, 0xA4, 0x72, 0xC0, + 0xB7, 0xFD, 0x93, 0x26, 0x36, 0x3F, 0xF7, 0xCC, 0x34, 0xA5, 0xE5, 0xF1, 0x71, 0xD8, 0x31, 0x15, + 0x04, 0xC7, 0x23, 0xC3, 0x18, 0x96, 0x05, 0x9A, 0x07, 0x12, 0x80, 0xE2, 0xEB, 0x27, 0xB2, 0x75, + 0x09, 0x83, 0x2C, 0x1A, 0x1B, 0x6E, 0x5A, 0xA0, 0x52, 0x3B, 0xD6, 0xB3, 0x29, 0xE3, 0x2F, 0x84, + 0x53, 0xD1, 0x00, 0xED, 0x20, 0xFC, 0xB1, 0x5B, 0x6A, 0xCB, 0xBE, 0x39, 0x4A, 0x4C, 0x58, 0xCF, + 0xD0, 0xEF, 0xAA, 0xFB, 0x43, 0x4D, 0x33, 0x85, 0x45, 0xF9, 0x02, 0x7F, 0x50, 0x3C, 0x9F, 0xA8, + 0x51, 0xA3, 0x40, 0x8F, 0x92, 0x9D, 0x38, 0xF5, 0xBC, 0xB6, 0xDA, 0x21, 0x10, 0xFF, 0xF3, 0xD2, + 0xCD, 0x0C, 0x13, 0xEC, 0x5F, 0x97, 0x44, 0x17, 0xC4, 0xA7, 0x7E, 0x3D, 0x64, 0x5D, 0x19, 0x73, + 0x60, 0x81, 0x4F, 0xDC, 0x22, 0x2A, 0x90, 0x88, 0x46, 0xEE, 0xB8, 0x14, 0xDE, 0x5E, 0x0B, 0xDB, + 0xE0, 0x32, 0x3A, 0x0A, 0x49, 0x06, 0x24, 0x5C, 0xC2, 0xD3, 0xAC, 0x62, 0x91, 0x95, 0xE4, 0x79, + 0xE7, 0xC8, 0x37, 0x6D, 0x8D, 0xD5, 0x4E, 0xA9, 0x6C, 0x56, 0xF4, 0xEA, 0x65, 0x7A, 0xAE, 0x08, + 0xBA, 0x78, 0x25, 0x2E, 0x1C, 0xA6, 0xB4, 0xC6, 0xE8, 0xDD, 0x74, 0x1F, 0x4B, 0xBD, 0x8B, 0x8A, + 0x70, 0x3E, 0xB5, 0x66, 0x48, 0x03, 0xF6, 0x0E, 0x61, 0x35, 0x57, 0xB9, 0x86, 0xC1, 0x1D, 0x9E, + 0xE1, 0xF8, 0x98, 0x11, 0x69, 0xD9, 0x8E, 0x94, 0x9B, 0x1E, 0x87, 0xE9, 0xCE, 0x55, 0x28, 0xDF, + 0x8C, 0xA1, 0x89, 0x0D, 0xBF, 0xE6, 0x42, 0x68, 0x41, 0x99, 0x2D, 0x0F, 0xB0, 0x54, 0xBB, 0x16 +}; + +/* Fully unrolled Keccak permutation: 24 rounds, in place on the 25-word state s. bc[] holds the five theta terms, each lane is XORed with its term while it is rotated into its new position, the bitselect lines then mix each row of five words, and keccakf_rndc[i] is XORed into s[0] last. */ +void keccakf1600(ulong *s) +{ + for(int i = 0; i < 24; ++i) + { + ulong bc[5], tmp1, tmp2; + bc[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20] ^ 
rotate(s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22], 1UL); + bc[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21] ^ rotate(s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23], 1UL); + bc[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22] ^ rotate(s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24], 1UL); + bc[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23] ^ rotate(s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20], 1UL); + bc[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24] ^ rotate(s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21], 1UL); + + tmp1 = s[1] ^ bc[0]; + + s[0] ^= bc[4]; + s[1] = rotate(s[6] ^ bc[0], 44UL); + s[6] = rotate(s[9] ^ bc[3], 20UL); + s[9] = rotate(s[22] ^ bc[1], 61UL); + s[22] = rotate(s[14] ^ bc[3], 39UL); + s[14] = rotate(s[20] ^ bc[4], 18UL); + s[20] = rotate(s[2] ^ bc[1], 62UL); + s[2] = rotate(s[12] ^ bc[1], 43UL); + s[12] = rotate(s[13] ^ bc[2], 25UL); + s[13] = rotate(s[19] ^ bc[3], 8UL); + s[19] = rotate(s[23] ^ bc[2], 56UL); + s[23] = rotate(s[15] ^ bc[4], 41UL); + s[15] = rotate(s[4] ^ bc[3], 27UL); + s[4] = rotate(s[24] ^ bc[3], 14UL); + s[24] = rotate(s[21] ^ bc[0], 2UL); + s[21] = rotate(s[8] ^ bc[2], 55UL); + s[8] = rotate(s[16] ^ bc[0], 35UL); + s[16] = rotate(s[5] ^ bc[4], 36UL); + s[5] = rotate(s[3] ^ bc[2], 28UL); + s[3] = rotate(s[18] ^ bc[2], 21UL); + s[18] = rotate(s[17] ^ bc[1], 15UL); + s[17] = rotate(s[11] ^ bc[0], 10UL); + s[11] = rotate(s[7] ^ bc[1], 6UL); + s[7] = rotate(s[10] ^ bc[4], 3UL); + s[10] = rotate(tmp1, 1UL); + + tmp1 = s[0]; tmp2 = s[1]; s[0] = bitselect(s[0] ^ s[2], s[0], s[1]); s[1] = bitselect(s[1] ^ s[3], s[1], s[2]); s[2] = bitselect(s[2] ^ s[4], s[2], s[3]); s[3] = bitselect(s[3] ^ tmp1, s[3], s[4]); s[4] = bitselect(s[4] ^ tmp2, s[4], tmp1); + tmp1 = s[5]; tmp2 = s[6]; s[5] = bitselect(s[5] ^ s[7], s[5], s[6]); s[6] = bitselect(s[6] ^ s[8], s[6], s[7]); s[7] = bitselect(s[7] ^ s[9], s[7], s[8]); s[8] = bitselect(s[8] ^ tmp1, s[8], s[9]); s[9] = bitselect(s[9] ^ tmp2, s[9], tmp1); + tmp1 = s[10]; tmp2 = s[11]; s[10] = bitselect(s[10] ^ s[12], s[10], s[11]); s[11] = bitselect(s[11] ^ s[13], s[11], 
s[12]); s[12] = bitselect(s[12] ^ s[14], s[12], s[13]); s[13] = bitselect(s[13] ^ tmp1, s[13], s[14]); s[14] = bitselect(s[14] ^ tmp2, s[14], tmp1); + tmp1 = s[15]; tmp2 = s[16]; s[15] = bitselect(s[15] ^ s[17], s[15], s[16]); s[16] = bitselect(s[16] ^ s[18], s[16], s[17]); s[17] = bitselect(s[17] ^ s[19], s[17], s[18]); s[18] = bitselect(s[18] ^ tmp1, s[18], s[19]); s[19] = bitselect(s[19] ^ tmp2, s[19], tmp1); + tmp1 = s[20]; tmp2 = s[21]; s[20] = bitselect(s[20] ^ s[22], s[20], s[21]); s[21] = bitselect(s[21] ^ s[23], s[21], s[22]); s[22] = bitselect(s[22] ^ s[24], s[22], s[23]); s[23] = bitselect(s[23] ^ tmp1, s[23], s[24]); s[24] = bitselect(s[24] ^ tmp2, s[24], tmp1); + s[0] ^= keccakf_rndc[i]; + } +} +/* Rotation amount used at step i of the rho/pi loop in keccakf1600_1 and keccakf1600_2; entry i pairs with keccakf_piln[i]. */ +static const __constant uint keccakf_rotc[24] = +{ + 1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 2, 14, + 27, 41, 56, 8, 25, 43, 62, 18, 39, 61, 20, 44 +}; +/* State index that step i of the rho/pi loop reads and then overwrites. */ +static const __constant uint keccakf_piln[24] = +{ + 10, 7, 11, 17, 18, 3, 5, 16, 8, 21, 24, 4, + 15, 23, 19, 13, 12, 2, 20, 14, 22, 9, 6, 1 +}; +/* Loop-based Keccak permutation: 24 rounds, in place on the 25-word state st. The round, theta and chi loops carry "#pragma unroll 1"; only the rho/pi loop has a plain "#pragma unroll". Called by CNKeccak. */ +void keccakf1600_1(ulong *st) +{ + int i, round; + ulong t, bc[5]; + + #pragma unroll 1 + for(round = 0; round < 24; ++round) + { + + // Theta + bc[0] = st[0] ^ st[5] ^ st[10] ^ st[15] ^ st[20]; + bc[1] = st[1] ^ st[6] ^ st[11] ^ st[16] ^ st[21]; + bc[2] = st[2] ^ st[7] ^ st[12] ^ st[17] ^ st[22]; + bc[3] = st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23]; + bc[4] = st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24]; + + #pragma unroll 1 + for (i = 0; i < 5; ++i) { + t = bc[(i + 4) % 5] ^ rotate(bc[(i + 1) % 5], 1UL); + st[i ] ^= t; + st[i + 5] ^= t; + st[i + 10] ^= t; + st[i + 15] ^= t; + st[i + 20] ^= t; + } + + // Rho Pi + t = st[1]; + #pragma unroll + for (i = 0; i < 24; ++i) { + bc[0] = st[keccakf_piln[i]]; + st[keccakf_piln[i]] = rotate(t, (ulong)keccakf_rotc[i]); + t = bc[0]; + } + + //ulong tmp1 = st[0]; ulong tmp2 = st[1]; st[0] = bitselect(st[0] ^ st[2], st[0], st[1]); st[1] = bitselect(st[1] ^ st[3], st[1], st[2]); st[2] = bitselect(st[2] ^ st[4], st[2], st[3]); st[3] = 
bitselect(st[3] ^ tmp1, st[3], st[4]); st[4] = bitselect(st[4] ^ tmp2, st[4], tmp1); + //tmp1 = st[5]; tmp2 = st[6]; st[5] = bitselect(st[5] ^ st[7], st[5], st[6]); st[6] = bitselect(st[6] ^ st[8], st[6], st[7]); st[7] = bitselect(st[7] ^ st[9], st[7], st[8]); st[8] = bitselect(st[8] ^ tmp1, st[8], st[9]); st[9] = bitselect(st[9] ^ tmp2, st[9], tmp1); + //tmp1 = st[10]; tmp2 = st[11]; st[10] = bitselect(st[10] ^ st[12], st[10], st[11]); st[11] = bitselect(st[11] ^ st[13], st[11], st[12]); st[12] = bitselect(st[12] ^ st[14], st[12], st[13]); st[13] = bitselect(st[13] ^ tmp1, st[13], st[14]); st[14] = bitselect(st[14] ^ tmp2, st[14], tmp1); + //tmp1 = st[15]; tmp2 = st[16]; st[15] = bitselect(st[15] ^ st[17], st[15], st[16]); st[16] = bitselect(st[16] ^ st[18], st[16], st[17]); st[17] = bitselect(st[17] ^ st[19], st[17], st[18]); st[18] = bitselect(st[18] ^ tmp1, st[18], st[19]); st[19] = bitselect(st[19] ^ tmp2, st[19], tmp1); + //tmp1 = st[20]; tmp2 = st[21]; st[20] = bitselect(st[20] ^ st[22], st[20], st[21]); st[21] = bitselect(st[21] ^ st[23], st[21], st[22]); st[22] = bitselect(st[22] ^ st[24], st[22], st[23]); st[23] = bitselect(st[23] ^ tmp1, st[23], st[24]); st[24] = bitselect(st[24] ^ tmp2, st[24], tmp1); + + #pragma unroll 1 + for(int i = 0; i < 25; i += 5) + { + ulong tmp[5]; + + #pragma unroll 1 + for(int x = 0; x < 5; ++x) + tmp[x] = bitselect(st[i + x] ^ st[i + ((x + 2) % 5)], st[i + x], st[i + ((x + 1) % 5)]); + + #pragma unroll 1 + for(int x = 0; x < 5; ++x) st[i + x] = tmp[x]; + } + + // Iota + st[0] ^= keccakf_rndc[round]; + } +} +)===" +R"===( +/* Keccak permutation variant called by cn0 and cn2: 24 rounds, in place on the 25-word state st. Theta is written out explicitly (bc[x] is the value XORed into the five words of column (x + 1) % 5), rho/pi is the table-driven loop, and chi uses two temporaries per row of five words. */ void keccakf1600_2(ulong *st) +{ + int i, round; + ulong t, bc[5]; + + #pragma unroll 1 + for(round = 0; round < 24; ++round) + { + + // Theta + //bc[0] = st[0] ^ st[5] ^ st[10] ^ st[15] ^ st[20]; + //bc[1] = st[1] ^ st[6] ^ st[11] ^ st[16] ^ st[21]; + //bc[2] = st[2] ^ st[7] ^ st[12] ^ st[17] ^ st[22]; + //bc[3] = st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23]; + //bc[4] = st[4] ^ st[9] ^ st[14] ^ st[19] ^ 
st[24]; + + /* + #pragma unroll + for (i = 0; i < 5; ++i) { + t = bc[(i + 4) % 5] ^ rotate(bc[(i + 1) % 5], 1UL); + st[i ] ^= t; + st[i + 5] ^= t; + st[i + 10] ^= t; + st[i + 15] ^= t; + st[i + 20] ^= t; + } + */ + + bc[0] = st[0] ^ st[5] ^ st[10] ^ st[15] ^ st[20] ^ rotate(st[2] ^ st[7] ^ st[12] ^ st[17] ^ st[22], 1UL); + bc[1] = st[1] ^ st[6] ^ st[11] ^ st[16] ^ st[21] ^ rotate(st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23], 1UL); + bc[2] = st[2] ^ st[7] ^ st[12] ^ st[17] ^ st[22] ^ rotate(st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24], 1UL); + bc[3] = st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23] ^ rotate(st[0] ^ st[5] ^ st[10] ^ st[15] ^ st[20], 1UL); + bc[4] = st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24] ^ rotate(st[1] ^ st[6] ^ st[11] ^ st[16] ^ st[21], 1UL); + + st[0] ^= bc[4]; + st[5] ^= bc[4]; + st[10] ^= bc[4]; + st[15] ^= bc[4]; + st[20] ^= bc[4]; + + st[1] ^= bc[0]; + st[6] ^= bc[0]; + st[11] ^= bc[0]; + st[16] ^= bc[0]; + st[21] ^= bc[0]; + + st[2] ^= bc[1]; + st[7] ^= bc[1]; + st[12] ^= bc[1]; + st[17] ^= bc[1]; + st[22] ^= bc[1]; + + st[3] ^= bc[2]; + st[8] ^= bc[2]; + st[13] ^= bc[2]; + st[18] ^= bc[2]; + st[23] ^= bc[2]; + + st[4] ^= bc[3]; + st[9] ^= bc[3]; + st[14] ^= bc[3]; + st[19] ^= bc[3]; + st[24] ^= bc[3]; + + // Rho Pi + t = st[1]; + #pragma unroll + for (i = 0; i < 24; ++i) { + bc[0] = st[keccakf_piln[i]]; + st[keccakf_piln[i]] = rotate(t, (ulong)keccakf_rotc[i]); + t = bc[0]; + } + + + + /*ulong tmp1 = st[1] ^ bc[0]; + + st[0] ^= bc[4]; + st[1] = rotate(st[6] ^ bc[0], 44UL); + st[6] = rotate(st[9] ^ bc[3], 20UL); + st[9] = rotate(st[22] ^ bc[1], 61UL); + st[22] = rotate(st[14] ^ bc[3], 39UL); + st[14] = rotate(st[20] ^ bc[4], 18UL); + st[20] = rotate(st[2] ^ bc[1], 62UL); + st[2] = rotate(st[12] ^ bc[1], 43UL); + st[12] = rotate(st[13] ^ bc[2], 25UL); + st[13] = rotate(st[19] ^ bc[3], 8UL); + st[19] = rotate(st[23] ^ bc[2], 56UL); + st[23] = rotate(st[15] ^ bc[4], 41UL); + st[15] = rotate(st[4] ^ bc[3], 27UL); + st[4] = rotate(st[24] ^ bc[3], 14UL); + st[24] 
= rotate(st[21] ^ bc[0], 2UL); + st[21] = rotate(st[8] ^ bc[2], 55UL); + st[8] = rotate(st[16] ^ bc[0], 35UL); + st[16] = rotate(st[5] ^ bc[4], 36UL); + st[5] = rotate(st[3] ^ bc[2], 28UL); + st[3] = rotate(st[18] ^ bc[2], 21UL); + st[18] = rotate(st[17] ^ bc[1], 15UL); + st[17] = rotate(st[11] ^ bc[0], 10UL); + st[11] = rotate(st[7] ^ bc[1], 6UL); + st[7] = rotate(st[10] ^ bc[4], 3UL); + st[10] = rotate(tmp1, 1UL); + */ + + + //ulong tmp1 = st[0]; ulong tmp2 = st[1]; st[0] = bitselect(st[0] ^ st[2], st[0], st[1]); st[1] = bitselect(st[1] ^ st[3], st[1], st[2]); st[2] = bitselect(st[2] ^ st[4], st[2], st[3]); st[3] = bitselect(st[3] ^ tmp1, st[3], st[4]); st[4] = bitselect(st[4] ^ tmp2, st[4], tmp1); + //tmp1 = st[5]; tmp2 = st[6]; st[5] = bitselect(st[5] ^ st[7], st[5], st[6]); st[6] = bitselect(st[6] ^ st[8], st[6], st[7]); st[7] = bitselect(st[7] ^ st[9], st[7], st[8]); st[8] = bitselect(st[8] ^ tmp1, st[8], st[9]); st[9] = bitselect(st[9] ^ tmp2, st[9], tmp1); + //tmp1 = st[10]; tmp2 = st[11]; st[10] = bitselect(st[10] ^ st[12], st[10], st[11]); st[11] = bitselect(st[11] ^ st[13], st[11], st[12]); st[12] = bitselect(st[12] ^ st[14], st[12], st[13]); st[13] = bitselect(st[13] ^ tmp1, st[13], st[14]); st[14] = bitselect(st[14] ^ tmp2, st[14], tmp1); + //tmp1 = st[15]; tmp2 = st[16]; st[15] = bitselect(st[15] ^ st[17], st[15], st[16]); st[16] = bitselect(st[16] ^ st[18], st[16], st[17]); st[17] = bitselect(st[17] ^ st[19], st[17], st[18]); st[18] = bitselect(st[18] ^ tmp1, st[18], st[19]); st[19] = bitselect(st[19] ^ tmp2, st[19], tmp1); + //tmp1 = st[20]; tmp2 = st[21]; st[20] = bitselect(st[20] ^ st[22], st[20], st[21]); st[21] = bitselect(st[21] ^ st[23], st[21], st[22]); st[22] = bitselect(st[22] ^ st[24], st[22], st[23]); st[23] = bitselect(st[23] ^ tmp1, st[23], st[24]); st[24] = bitselect(st[24] ^ tmp2, st[24], tmp1); + + #pragma unroll + for(int i = 0; i < 25; i += 5) + { + ulong tmp1 = st[i], tmp2 = st[i + 1]; + + st[i] = bitselect(st[i] ^ st[i + 2], 
st[i], st[i + 1]); + st[i + 1] = bitselect(st[i + 1] ^ st[i + 3], st[i + 1], st[i + 2]); + st[i + 2] = bitselect(st[i + 2] ^ st[i + 4], st[i + 2], st[i + 3]); + st[i + 3] = bitselect(st[i + 3] ^ tmp1, st[i + 3], st[i + 4]); + st[i + 4] = bitselect(st[i + 4] ^ tmp2, st[i + 4], tmp1); + } + + // Iota + st[0] ^= keccakf_rndc[round]; + } +} + +)===" +R"===( +/* Builds a Keccak state from input[0..8] plus the low 32 bits of input[9], sets the two padding bits (bit 32 of st[9] and the top bit of st[16]), runs keccakf1600_1 and copies all 25 state words to output. */ +void CNKeccak(ulong *output, ulong *input) +{ + ulong st[25]; + + // Copy 72 bytes + for(int i = 0; i < 9; ++i) st[i] = input[i]; + + // Last four and '1' bit for padding + //st[9] = as_ulong((uint2)(((uint *)input)[18], 0x00000001U)); + + st[9] = (input[9] & 0x00000000FFFFFFFFUL) | 0x0000000100000000UL; + + for(int i = 10; i < 25; ++i) st[i] = 0x00UL; + + // Last bit of padding + st[16] = 0x8000000000000000UL; + + keccakf1600_1(st); + + for(int i = 0; i < 25; ++i) output[i] = st[i]; +} +/* Round constants for AESExpandKey256, which starts reading at index 1. */ +static const __constant uchar rcon[8] = { 0x8d, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40 }; +/* BYTE(x, y): the 8-bit field of x that starts at bit 8 * y, extracted with amd_bfe. */ +#define BYTE(x, y) (amd_bfe((x), (y) << 3U, 8U)) +/* SubWord(inw): replaces each of the four bytes of inw with its sbox entry, keeping the byte positions. */ +#define SubWord(inw) ((sbox[BYTE(inw, 3)] << 24) | (sbox[BYTE(inw, 2)] << 16) | (sbox[BYTE(inw, 1)] << 8) | sbox[BYTE(inw, 0)]) +/* Expands in place the eight key words the caller stored in keybuf[0..7], writing keybuf[8..59]; keybuf must therefore hold at least 60 words. */ +void AESExpandKey256(uint *keybuf) +{ + //#pragma unroll 4 + for(uint c = 8, i = 1; c < 60; ++c) + { + // For 256-bit keys, an sbox permutation is done every other 4th uint generated, AND every 8th + uint t = ((!(c & 7)) || ((c & 7) == 4)) ? SubWord(keybuf[c - 1]) : keybuf[c - 1]; + + // If the uint we're generating has an index that is a multiple of 8, rotate and XOR with the round constant, + // then XOR this with previously generated uint. If it's 4 after a multiple of 8, only the sbox permutation + // is done, followed by the XOR. If neither are true, only the XOR with the previously generated uint is done. + keybuf[c] = keybuf[c - 8] ^ ((!(c & 7)) ? 
rotate(t, 24U) ^ as_uint((uchar4)(rcon[i++], 0U, 0U, 0U)) : t); + } +} +/* IDX(x): identity mapping applied to every Scratchpad index in cn0, cn1 and cn2. */ +#define IDX(x) (x) +/* Kernel cn0. Loads input[0..10] into the Keccak state, replaces the top byte of 32-bit word 9 and the low three bytes of word 10 with bits of get_global_id(0), runs keccakf1600_2 and stores the 25-word result in this work-item's slot of states. Then each of the 8 work-items along dimension 1 takes 16-byte vector (4 + lane) of that state as text and, 0x4000 times, applies 10 AES_Round steps keyed by the expansion of state words 0..3 and stores text to Scratchpad[(i << 3) + lane]. Each work-item's Scratchpad slice is 0x80000 >> 2 uint4 entries. */ +__attribute__((reqd_work_group_size(WORKSIZE, 8, 1))) +__kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ulong *states) +{ + ulong State[25]; + uint ExpandedKey1[256]; + __local uint AES0[256], AES1[256], AES2[256], AES3[256]; + uint4 text; + + states += (25 * (get_global_id(0) - get_global_offset(0))); + Scratchpad += ((get_global_id(0) - get_global_offset(0))) * (0x80000 >> 2); + + for(int i = get_local_id(0); i < 256; i += WORKSIZE) + { + const uint tmp = AES0_C[i]; + AES0[i] = tmp; + AES1[i] = rotate(tmp, 8U); + AES2[i] = rotate(tmp, 16U); + AES3[i] = rotate(tmp, 24U); + } + barrier(CLK_LOCAL_MEM_FENCE); + + ((ulong8 *)State)[0] = vload8(0, input); + State[8] = input[8]; + State[9] = input[9]; + State[10] = input[10]; + + ((uint *)State)[9] &= 0x00FFFFFFU; + ((uint *)State)[9] |= ((get_global_id(0)) & 0xFF) << 24; + ((uint *)State)[10] &= 0xFF000000U; + ((uint *)State)[10] |= ((get_global_id(0) >> 8)); + + for(int i = 11; i < 25; ++i) State[i] = 0x00UL; + + // Last bit of padding + State[16] = 0x8000000000000000UL; + + keccakf1600_2(State); + + mem_fence(CLK_GLOBAL_MEM_FENCE); + + #pragma unroll + for(int i = 0; i < 25; ++i) states[i] = State[i]; + + text = vload4(get_local_id(1) + 4, (__global uint *)(states)); + + #pragma unroll + for(int i = 0; i < 4; ++i) ((ulong *)ExpandedKey1)[i] = states[i]; + + AESExpandKey256(ExpandedKey1); + + mem_fence(CLK_LOCAL_MEM_FENCE); + + #pragma unroll 2 + for(int i = 0; i < 0x4000; ++i) + { + #pragma unroll + for(int j = 0; j < 10; ++j) + text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey1)[j]); + + Scratchpad[IDX((i << 3) + get_local_id(1))] = text; + } + + mem_fence(CLK_GLOBAL_MEM_FENCE); +} +/* Kernel cn1. Seeds a and b from states[0..7], then runs 0x80000 iterations over this work-item's Scratchpad slice: one AES_Round on the entry selected by a[0] (keyed by a), a write-back of that result XORed with b_x, and a 64x64-bit multiply-add with the entry selected by the AES result, which is then overwritten with a. Only Scratchpad is written; states is read-only here. */ +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void cn1(__global uint4 *Scratchpad, __global ulong *states) +{ + ulong a[2], b[2]; + __local uint AES0[256], AES1[256], 
AES2[256], AES3[256]; + + Scratchpad += ((get_global_id(0) - get_global_offset(0))) * (0x80000 >> 2); + states += (25 * (get_global_id(0) - get_global_offset(0))); + + for(int i = get_local_id(0); i < 256; i += WORKSIZE) + { + const uint tmp = AES0_C[i]; + AES0[i] = tmp; + AES1[i] = rotate(tmp, 8U); + AES2[i] = rotate(tmp, 16U); + AES3[i] = rotate(tmp, 24U); + } + barrier(CLK_LOCAL_MEM_FENCE); + + a[0] = states[0] ^ states[4]; + b[0] = states[2] ^ states[6]; + a[1] = states[1] ^ states[5]; + b[1] = states[3] ^ states[7]; + + uint4 b_x = ((uint4 *)b)[0]; + + mem_fence(CLK_LOCAL_MEM_FENCE); + + #pragma unroll 8 + for(int i = 0; i < 0x80000; ++i) + { + ulong c[2]; + + ((uint4 *)c)[0] = Scratchpad[IDX((a[0] & 0x1FFFF0) >> 4)]; + ((uint4 *)c)[0] = AES_Round(AES0, AES1, AES2, AES3, ((uint4 *)c)[0], ((uint4 *)a)[0]); + //b_x ^= ((uint4 *)c)[0]; + + Scratchpad[IDX((a[0] & 0x1FFFF0) >> 4)] = b_x ^ ((uint4 *)c)[0]; + + uint4 tmp; + tmp = Scratchpad[IDX((c[0] & 0x1FFFF0) >> 4)]; + + a[1] += c[0] * as_ulong2(tmp).s0; + a[0] += mul_hi(c[0], as_ulong2(tmp).s0); + + Scratchpad[IDX((c[0] & 0x1FFFF0) >> 4)] = ((uint4 *)a)[0]; + + ((uint4 *)a)[0] ^= tmp; + + b_x = ((uint4 *)c)[0]; + } + + mem_fence(CLK_GLOBAL_MEM_FENCE); +} +/* Kernel cn2. Each of the 8 work-items along dimension 1 starts from 16-byte vector (4 + lane) of states, and for each of 0x4000 Scratchpad entries of its lane XORs the entry into text and applies 10 AES_Round steps keyed by the expansion of state words 4..7; the result is stored back to the same vector of states. Lane 0 then runs keccakf1600_2 over the 25 state words and appends this work-item's index (global id minus global offset) to the one of Branch0..Branch3 selected by State[0] & 3; each list keeps its fill counter at index get_global_size(0). */ +__attribute__((reqd_work_group_size(WORKSIZE, 8, 1))) +__kernel void cn2(__global uint4 *Scratchpad, __global ulong *states, __global uint *Branch0, __global uint *Branch1, __global uint *Branch2, __global uint *Branch3) +{ + __local uint AES0[256], AES1[256], AES2[256], AES3[256]; + uint ExpandedKey2[256]; + ulong State[25]; + uint4 text; + + Scratchpad += ((get_global_id(0) - get_global_offset(0))) * (0x80000 >> 2); + states += (25 * (get_global_id(0) - get_global_offset(0))); + + for(int i = get_local_id(0); i < 256; i += WORKSIZE) + { + const uint tmp = AES0_C[i]; + AES0[i] = tmp; + AES1[i] = rotate(tmp, 8U); + AES2[i] = rotate(tmp, 16U); + AES3[i] = rotate(tmp, 24U); + } + barrier(CLK_LOCAL_MEM_FENCE); + + #if defined(__Tahiti__) || defined(__Pitcairn__) 
+ + for(int i = 0; i < 4; ++i) ((ulong *)ExpandedKey2)[i] = states[i + 4]; + text = vload4(get_local_id(1) + 4, (__global uint *)states); + + #else + + text = vload4(get_local_id(1) + 4, (__global uint *)states); + ((uint8 *)ExpandedKey2)[0] = vload8(1, (__global uint *)states); + + #endif + + AESExpandKey256(ExpandedKey2); + + barrier(CLK_LOCAL_MEM_FENCE); + + #pragma unroll 2 + for(int i = 0; i < 0x4000; ++i) + { + text ^= Scratchpad[IDX((i << 3) + get_local_id(1))]; + + #pragma unroll + for(int j = 0; j < 10; ++j) + text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]); + } + + vstore2(as_ulong2(text), get_local_id(1) + 4, states); + + barrier(CLK_GLOBAL_MEM_FENCE); + + if(!get_local_id(1)) + { + for(int i = 0; i < 25; ++i) State[i] = states[i]; + + keccakf1600_2(State); + + for(int i = 0; i < 25; ++i) states[i] = State[i]; + + switch(State[0] & 3) + { + case 0: + Branch0[atomic_inc(Branch0 + get_global_size(0))] = get_global_id(0) - get_global_offset(0); + break; + case 1: + Branch1[atomic_inc(Branch1 + get_global_size(0))] = get_global_id(0) - get_global_offset(0); + break; + case 2: + Branch2[atomic_inc(Branch2 + get_global_size(0))] = get_global_id(0) - get_global_offset(0); + break; + case 3: + Branch3[atomic_inc(Branch3 + get_global_size(0))] = get_global_id(0) - get_global_offset(0); + break; + } + } + + mem_fence(CLK_GLOBAL_MEM_FENCE); +} + +)===" +R"===( +/* VSWAP8(x): x with its eight bytes in reverse order, built from shifts and masks. */ +#define VSWAP8(x) (((x) >> 56) | (((x) >> 40) & 0x000000000000FF00UL) | (((x) >> 24) & 0x0000000000FF0000UL) \ + | (((x) >> 8) & 0x00000000FF000000UL) | (((x) << 8) & 0x000000FF00000000UL) \ + | (((x) << 24) & 0x0000FF0000000000UL) | (((x) << 40) & 0x00FF000000000000UL) | (((x) << 56) & 0xFF00000000000000UL)) +/* VSWAP4(x): x with its four bytes in reverse order, built from shifts and masks. */ +#define VSWAP4(x) ((((x) >> 24) & 0xFFU) | (((x) >> 8) & 0xFF00U) | (((x) << 8) & 0xFF0000U) | (((x) << 24) & 0xFF000000U)) +/* Kernel Skein. Work-items with idx >= Threads return at once. Otherwise the 25 state words selected by BranchBuf[idx] are hashed with Skein512Block (three 64-byte blocks, then a block holding states[24]), followed by one output-transform block; when 32-bit word 7 of the result is <= Target, BranchBuf[idx] + get_global_offset(0) is appended to output, whose fill counter lives at output[0xFF]. */ +__kernel void Skein(__global ulong *states, __global uint *BranchBuf, __global uint *output, uint Target, ulong Threads) +{ + const ulong idx = 
get_global_id(0) - get_global_offset(0); + + if(idx >= Threads) return; + + states += 25 * BranchBuf[idx]; + + // skein + ulong8 h = vload8(0, SKEIN512_256_IV); + + // Type field begins with final bit, first bit, then six bits of type; the last 96 + // bits are input processed (including in the block to be processed with that tweak) + // The output transform is only one run of UBI, since we need only 256 bits of output + // The tweak for the output transform is Type = Output with the Final bit set + // T[0] for the output is 8, and I don't know why - should be message size... + ulong t[3] = { 0x00UL, 0x7000000000000000UL, 0x00UL }; + ulong8 p, m; + + for(uint i = 0; i < 4; ++i) + { + if(i < 3) t[0] += 0x40UL; + else t[0] += 0x08UL; + + t[2] = t[0] ^ t[1]; + + m = (i < 3) ? vload8(i, states) : (ulong8)(states[24], 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL); + const ulong h8 = h.s0 ^ h.s1 ^ h.s2 ^ h.s3 ^ h.s4 ^ h.s5 ^ h.s6 ^ h.s7 ^ SKEIN_KS_PARITY; + p = Skein512Block(m, h, h8, t); + + h = m ^ p; + + if(i < 2) t[1] = 0x3000000000000000UL; + else t[1] = 0xB000000000000000UL; + } + + t[0] = 0x08UL; + t[1] = 0xFF00000000000000UL; + t[2] = t[0] ^ t[1]; + + p = (ulong8)(0); + const ulong h8 = h.s0 ^ h.s1 ^ h.s2 ^ h.s3 ^ h.s4 ^ h.s5 ^ h.s6 ^ h.s7 ^ SKEIN_KS_PARITY; + + p = Skein512Block(p, h, h8, t); + + //vstore8(p, 0, output); + + if(as_uint16(p).s7 <= Target) output[atomic_inc(output + 0xFF)] = BranchBuf[idx] + get_global_offset(0); + + mem_fence(CLK_GLOBAL_MEM_FENCE); +} +/* SWAP8(x): x with its eight bytes in reverse order, via a uchar8 swizzle. */ +#define SWAP8(x) as_ulong(as_uchar8(x).s76543210) +/* Kernel JH. Work-items with idx >= Threads return at once. Otherwise the 25 state words selected by BranchBuf[idx] are absorbed in five 64-byte blocks (three data blocks, one holding states[24] followed by 0x80, and one whose last word is 0x4006000000000000), each XORed into h0..h3, followed by E8 and then XORed into h4..h7. When as_uint2(h7l).s1 <= Target, BranchBuf[idx] + get_global_offset(0) is appended to output (fill counter at output[0xFF]). */ +__kernel void JH(__global ulong *states, __global uint *BranchBuf, __global uint *output, uint Target, ulong Threads) +{ + const uint idx = get_global_id(0) - get_global_offset(0); + + if(idx >= Threads) return; + + states += 25 * BranchBuf[idx]; + + sph_u64 h0h = 0xEBD3202C41A398EBUL, h0l = 0xC145B29C7BBECD92UL, h1h = 0xFAC7D4609151931CUL, h1l = 0x038A507ED6820026UL, h2h = 0x45B92677269E23A4UL, h2l = 0x77941AD4481AFBE0UL, h3h = 0x7A176B0226ABB5CDUL, h3l 
= 0xA82FFF0F4224F056UL; + sph_u64 h4h = 0x754D2E7F8996A371UL, h4l = 0x62E27DF70849141DUL, h5h = 0x948F2476F7957627UL, h5l = 0x6C29804757B6D587UL, h6h = 0x6C0D8EAC2D275E5CUL, h6l = 0x0F7A0557C6508451UL, h7h = 0xEA12247067D3E47BUL, h7l = 0x69D71CD313ABE389UL; + sph_u64 tmp; + + for(int i = 0; i < 5; ++i) + { + ulong input[8]; + + if(i < 3) + { + for(int x = 0; x < 8; ++x) input[x] = (states[(i << 3) + x]); + } + else if(i == 3) + { + input[0] = (states[24]); + input[1] = 0x80UL; + for(int x = 2; x < 8; ++x) input[x] = 0x00UL; + } + else + { + input[7] = 0x4006000000000000UL; + + for(int x = 0; x < 7; ++x) input[x] = 0x00UL; + } + + h0h ^= input[0]; + h0l ^= input[1]; + h1h ^= input[2]; + h1l ^= input[3]; + h2h ^= input[4]; + h2l ^= input[5]; + h3h ^= input[6]; + h3l ^= input[7]; + + E8; + + h4h ^= input[0]; + h4l ^= input[1]; + h5h ^= input[2]; + h5l ^= input[3]; + h6h ^= input[4]; + h6l ^= input[5]; + h7h ^= input[6]; + h7l ^= input[7]; + } + + //output[0] = h6h; + //output[1] = h6l; + //output[2] = h7h; + //output[3] = h7l; + + if(as_uint2(h7l).s1 <= Target) output[atomic_inc(output + 0xFF)] = BranchBuf[idx] + get_global_offset(0); +} +/* SWAP4(x): x with its four bytes in reverse order, via a uchar4 swizzle. */ +#define SWAP4(x) as_uint(as_uchar4(x).s3210) +/* Kernel Blake. Work-items with idx >= Threads return at once. Otherwise the 25 state words selected by BranchBuf[idx] are processed as four message blocks: three 64-byte blocks, then a block holding the two 32-bit halves of states[24], the 0x80000000 pad word and the length word 0x640. Message words are byte-swapped with SWAP4, each block runs 14 rounds of the GS macro, and h is byte-swapped again at the end. When h[7] <= Target, BranchBuf[idx] + get_global_offset(0) is appended to output (fill counter at output[0xFF]). */ +__kernel void Blake(__global ulong *states, __global uint *BranchBuf, __global uint *output, uint Target, ulong Threads) +{ + const uint idx = get_global_id(0) - get_global_offset(0); + + if(idx >= Threads) return; + + states += 25 * BranchBuf[idx]; + + unsigned int m[16]; + unsigned int v[16]; + uint h[8]; + + ((uint8 *)h)[0] = vload8(0U, c_IV256); + + for(uint i = 0, bitlen = 0; i < 4; ++i) + { + if(i < 3) + { + ((uint16 *)m)[0] = vload16(i, (__global uint *)states); + for(int i = 0; i < 16; ++i) m[i] = SWAP4(m[i]); + bitlen += 512; + } + else + { + m[0] = SWAP4(((__global uint *)states)[48]); + m[1] = SWAP4(((__global uint *)states)[49]); + m[2] = 0x80000000U; + + for(int i = 3; i < 13; ++i) m[i] = 0x00U; + + m[13] = 1U; + m[14] = 0U; + m[15] = 0x640; + bitlen += 64; + } + + 
((uint16 *)v)[0].lo = ((uint8 *)h)[0]; + ((uint16 *)v)[0].hi = vload8(0U, c_u256); + + //v[12] ^= (i < 3) ? (i + 1) << 9 : 1600U; + //v[13] ^= (i < 3) ? (i + 1) << 9 : 1600U; + + v[12] ^= bitlen; + v[13] ^= bitlen; + + for(int r = 0; r < 14; r++) + { + GS(0, 4, 0x8, 0xC, 0x0); + GS(1, 5, 0x9, 0xD, 0x2); + GS(2, 6, 0xA, 0xE, 0x4); + GS(3, 7, 0xB, 0xF, 0x6); + GS(0, 5, 0xA, 0xF, 0x8); + GS(1, 6, 0xB, 0xC, 0xA); + GS(2, 7, 0x8, 0xD, 0xC); + GS(3, 4, 0x9, 0xE, 0xE); + } + + ((uint8 *)h)[0] ^= ((uint8 *)v)[0] ^ ((uint8 *)v)[1]; + } + + for(int i = 0; i < 8; ++i) h[i] = SWAP4(h[i]); + + //for(int i = 0; i < 4; ++i) output[i] = ((ulong *)h)[i]; + if(h[7] <= Target) output[atomic_inc(output + 0xFF)] = BranchBuf[idx] + get_global_offset(0); +} +/* Kernel Groestl. Work-items with idx >= Threads return at once. Otherwise the 25 state words selected by BranchBuf[idx] are compressed in four 64-byte blocks (three data blocks, then states[24], 0x80 and a final word 0x0400000000000000): H = M ^ State, then PERM_SMALL_P(H), PERM_SMALL_Q(M) and State ^= H ^ M. A last PERM_SMALL_P(State) is XORed with the pre-permutation State. When as_uint2(State[7]).s1 <= Target, BranchBuf[idx] + get_global_offset(0) is appended to output (fill counter at output[0xFF]). */ +__kernel void Groestl(__global ulong *states, __global uint *BranchBuf, __global uint *output, uint Target, ulong Threads) +{ + const uint idx = get_global_id(0) - get_global_offset(0); + + if(idx >= Threads) return; + + states += 25 * BranchBuf[idx]; + + ulong State[8]; + + for(int i = 0; i < 7; ++i) State[i] = 0UL; + + State[7] = 0x0001000000000000UL; + + for(uint i = 0; i < 4; ++i) + { + ulong H[8], M[8]; + + if(i < 3) + { + ((ulong8 *)M)[0] = vload8(i, states); + } + else + { + M[0] = states[24]; + M[1] = 0x80UL; + + for(int x = 2; x < 7; ++x) M[x] = 0UL; + + M[7] = 0x0400000000000000UL; + } + + for(int x = 0; x < 8; ++x) H[x] = M[x] ^ State[x]; + + PERM_SMALL_P(H); + PERM_SMALL_Q(M); + + for(int x = 0; x < 8; ++x) State[x] ^= H[x] ^ M[x]; + } + + ulong tmp[8]; + + for(int i = 0; i < 8; ++i) tmp[i] = State[i]; + + PERM_SMALL_P(State); + + for(int i = 0; i < 8; ++i) State[i] ^= tmp[i]; + + //for(int i = 0; i < 4; ++i) output[i] = State[i + 4]; + if(as_uint2(State[7]).s1 <= Target) output[atomic_inc(output + 0xFF)] = BranchBuf[idx] + get_global_offset(0); +} + +)==="
\ No newline at end of file diff --git a/xmrstak/backend/amd/amd_gpu/opencl/groestl256.cl b/xmrstak/backend/amd/amd_gpu/opencl/groestl256.cl new file mode 100644 index 0000000..1a7c96f --- /dev/null +++ b/xmrstak/backend/amd/amd_gpu/opencl/groestl256.cl @@ -0,0 +1,295 @@ +R"===( +/* $Id: groestl.c 260 2011-07-21 01:02:38Z tp $ */ +/* + * Groestl256 + * + * ==========================(LICENSE BEGIN)============================ + * Copyright (c) 2014 djm34 + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + * ===========================(LICENSE END)============================= + * + * @author Thomas Pornin <thomas.pornin@cryptolog.com> + */ +/* SPH_C64(x) passes a constant through unchanged; SPH_ROTL64(x, y), also reachable as R64, rotates x by y bits with the OpenCL rotate() built-in. */ +#define SPH_C64(x) x +#define SPH_ROTL64(x, y) rotate((x), (ulong)(y)) + +/* C64e(x): x with its eight bytes in reverse order. */ +#define C64e(x) ((SPH_C64(x) >> 56) \ + | ((SPH_C64(x) >> 40) & SPH_C64(0x000000000000FF00)) \ + | ((SPH_C64(x) >> 24) & SPH_C64(0x0000000000FF0000)) \ + | ((SPH_C64(x) >> 8) & SPH_C64(0x00000000FF000000)) \ + | ((SPH_C64(x) << 8) & SPH_C64(0x000000FF00000000)) \ + | ((SPH_C64(x) << 24) & SPH_C64(0x0000FF0000000000)) \ + | ((SPH_C64(x) << 40) & SPH_C64(0x00FF000000000000)) \ + | ((SPH_C64(x) << 56) & SPH_C64(0xFF00000000000000))) +/* B64_n(x): byte n of x, n = 0 being the least significant. B64_7 applies no mask, so it yields a single byte only when x is an unsigned 64-bit value. */ +#define B64_0(x) ((x) & 0xFF) +#define B64_1(x) (((x) >> 8) & 0xFF) +#define B64_2(x) (((x) >> 16) & 0xFF) +#define B64_3(x) (((x) >> 24) & 0xFF) +#define B64_4(x) (((x) >> 32) & 0xFF) +#define B64_5(x) (((x) >> 40) & 0xFF) +#define B64_6(x) (((x) >> 48) & 0xFF) +#define B64_7(x) ((x) >> 56) +#define R64 SPH_ROTL64 +#define PC64(j, r) ((sph_u64)((j) + (r))) +#define QC64(j, r) (((sph_u64)(r) << 56) ^ (~((sph_u64)(j) << 56))) +/* Lookup table of 64-bit constants. NOTE(review): presumably the Groestl T0 table read by the PERM_SMALL_* macros, whose definitions are outside this section - confirm. */ +static const __constant ulong T0_G[] = +{ + 0xc6a597f4a5f432c6UL, 0xf884eb9784976ff8UL, 0xee99c7b099b05eeeUL, 0xf68df78c8d8c7af6UL, + 0xff0de5170d17e8ffUL, 0xd6bdb7dcbddc0ad6UL, 0xdeb1a7c8b1c816deUL, 0x915439fc54fc6d91UL, + 0x6050c0f050f09060UL, 0x0203040503050702UL, 0xcea987e0a9e02eceUL, 0x567dac877d87d156UL, + 0xe719d52b192bcce7UL, 0xb56271a662a613b5UL, 0x4de69a31e6317c4dUL, 0xec9ac3b59ab559ecUL, + 0x8f4505cf45cf408fUL, 0x1f9d3ebc9dbca31fUL, 0x894009c040c04989UL, 0xfa87ef92879268faUL, + 0xef15c53f153fd0efUL, 0xb2eb7f26eb2694b2UL, 0x8ec90740c940ce8eUL, 0xfb0bed1d0b1de6fbUL, + 0x41ec822fec2f6e41UL, 0xb3677da967a91ab3UL, 0x5ffdbe1cfd1c435fUL, 0x45ea8a25ea256045UL, + 0x23bf46dabfdaf923UL, 0x53f7a602f7025153UL, 0xe496d3a196a145e4UL, 0x9b5b2ded5bed769bUL, + 0x75c2ea5dc25d2875UL, 0xe11cd9241c24c5e1UL, 0x3dae7ae9aee9d43dUL, 0x4c6a98be6abef24cUL, + 0x6c5ad8ee5aee826cUL, 0x7e41fcc341c3bd7eUL, 
0xf502f1060206f3f5UL, 0x834f1dd14fd15283UL, + 0x685cd0e45ce48c68UL, 0x51f4a207f4075651UL, 0xd134b95c345c8dd1UL, 0xf908e9180818e1f9UL, + 0xe293dfae93ae4ce2UL, 0xab734d9573953eabUL, 0x6253c4f553f59762UL, 0x2a3f54413f416b2aUL, + 0x080c10140c141c08UL, 0x955231f652f66395UL, 0x46658caf65afe946UL, 0x9d5e21e25ee27f9dUL, + 0x3028607828784830UL, 0x37a16ef8a1f8cf37UL, 0x0a0f14110f111b0aUL, 0x2fb55ec4b5c4eb2fUL, + 0x0e091c1b091b150eUL, 0x2436485a365a7e24UL, 0x1b9b36b69bb6ad1bUL, 0xdf3da5473d4798dfUL, + 0xcd26816a266aa7cdUL, 0x4e699cbb69bbf54eUL, 0x7fcdfe4ccd4c337fUL, 0xea9fcfba9fba50eaUL, + 0x121b242d1b2d3f12UL, 0x1d9e3ab99eb9a41dUL, 0x5874b09c749cc458UL, 0x342e68722e724634UL, + 0x362d6c772d774136UL, 0xdcb2a3cdb2cd11dcUL, 0xb4ee7329ee299db4UL, 0x5bfbb616fb164d5bUL, + 0xa4f65301f601a5a4UL, 0x764decd74dd7a176UL, 0xb76175a361a314b7UL, 0x7dcefa49ce49347dUL, + 0x527ba48d7b8ddf52UL, 0xdd3ea1423e429fddUL, 0x5e71bc937193cd5eUL, 0x139726a297a2b113UL, + 0xa6f55704f504a2a6UL, 0xb96869b868b801b9UL, 0x0000000000000000UL, 0xc12c99742c74b5c1UL, + 0x406080a060a0e040UL, 0xe31fdd211f21c2e3UL, 0x79c8f243c8433a79UL, 0xb6ed772ced2c9ab6UL, + 0xd4beb3d9bed90dd4UL, 0x8d4601ca46ca478dUL, 0x67d9ce70d9701767UL, 0x724be4dd4bddaf72UL, + 0x94de3379de79ed94UL, 0x98d42b67d467ff98UL, 0xb0e87b23e82393b0UL, 0x854a11de4ade5b85UL, + 0xbb6b6dbd6bbd06bbUL, 0xc52a917e2a7ebbc5UL, 0x4fe59e34e5347b4fUL, 0xed16c13a163ad7edUL, + 0x86c51754c554d286UL, 0x9ad72f62d762f89aUL, 0x6655ccff55ff9966UL, 0x119422a794a7b611UL, + 0x8acf0f4acf4ac08aUL, 0xe910c9301030d9e9UL, 0x0406080a060a0e04UL, 0xfe81e798819866feUL, + 0xa0f05b0bf00baba0UL, 0x7844f0cc44ccb478UL, 0x25ba4ad5bad5f025UL, 0x4be3963ee33e754bUL, + 0xa2f35f0ef30eaca2UL, 0x5dfeba19fe19445dUL, 0x80c01b5bc05bdb80UL, 0x058a0a858a858005UL, + 0x3fad7eecadecd33fUL, 0x21bc42dfbcdffe21UL, 0x7048e0d848d8a870UL, 0xf104f90c040cfdf1UL, + 0x63dfc67adf7a1963UL, 0x77c1ee58c1582f77UL, 0xaf75459f759f30afUL, 0x426384a563a5e742UL, + 0x2030405030507020UL, 0xe51ad12e1a2ecbe5UL, 
0xfd0ee1120e12effdUL, 0xbf6d65b76db708bfUL, + 0x814c19d44cd45581UL, 0x1814303c143c2418UL, 0x26354c5f355f7926UL, 0xc32f9d712f71b2c3UL, + 0xbee16738e13886beUL, 0x35a26afda2fdc835UL, 0x88cc0b4fcc4fc788UL, 0x2e395c4b394b652eUL, + 0x93573df957f96a93UL, 0x55f2aa0df20d5855UL, 0xfc82e39d829d61fcUL, 0x7a47f4c947c9b37aUL, + 0xc8ac8befacef27c8UL, 0xbae76f32e73288baUL, 0x322b647d2b7d4f32UL, 0xe695d7a495a442e6UL, + 0xc0a09bfba0fb3bc0UL, 0x199832b398b3aa19UL, 0x9ed12768d168f69eUL, 0xa37f5d817f8122a3UL, + 0x446688aa66aaee44UL, 0x547ea8827e82d654UL, 0x3bab76e6abe6dd3bUL, 0x0b83169e839e950bUL, + 0x8cca0345ca45c98cUL, 0xc729957b297bbcc7UL, 0x6bd3d66ed36e056bUL, 0x283c50443c446c28UL, + 0xa779558b798b2ca7UL, 0xbce2633de23d81bcUL, 0x161d2c271d273116UL, 0xad76419a769a37adUL, + 0xdb3bad4d3b4d96dbUL, 0x6456c8fa56fa9e64UL, 0x744ee8d24ed2a674UL, 0x141e28221e223614UL, + 0x92db3f76db76e492UL, 0x0c0a181e0a1e120cUL, 0x486c90b46cb4fc48UL, 0xb8e46b37e4378fb8UL, + 0x9f5d25e75de7789fUL, 0xbd6e61b26eb20fbdUL, 0x43ef862aef2a6943UL, 0xc4a693f1a6f135c4UL, + 0x39a872e3a8e3da39UL, 0x31a462f7a4f7c631UL, 0xd337bd5937598ad3UL, 0xf28bff868b8674f2UL, + 0xd532b156325683d5UL, 0x8b430dc543c54e8bUL, 0x6e59dceb59eb856eUL, 0xdab7afc2b7c218daUL, + 0x018c028f8c8f8e01UL, 0xb16479ac64ac1db1UL, 0x9cd2236dd26df19cUL, 0x49e0923be03b7249UL, + 0xd8b4abc7b4c71fd8UL, 0xacfa4315fa15b9acUL, 0xf307fd090709faf3UL, 0xcf25856f256fa0cfUL, + 0xcaaf8feaafea20caUL, 0xf48ef3898e897df4UL, 0x47e98e20e9206747UL, 0x1018202818283810UL, + 0x6fd5de64d5640b6fUL, 0xf088fb83888373f0UL, 0x4a6f94b16fb1fb4aUL, 0x5c72b8967296ca5cUL, + 0x3824706c246c5438UL, 0x57f1ae08f1085f57UL, 0x73c7e652c7522173UL, 0x975135f351f36497UL, + 0xcb238d652365aecbUL, 0xa17c59847c8425a1UL, 0xe89ccbbf9cbf57e8UL, 0x3e217c6321635d3eUL, + 0x96dd377cdd7cea96UL, 0x61dcc27fdc7f1e61UL, 0x0d861a9186919c0dUL, 0x0f851e9485949b0fUL, + 0xe090dbab90ab4be0UL, 0x7c42f8c642c6ba7cUL, 0x71c4e257c4572671UL, 0xccaa83e5aae529ccUL, + 0x90d83b73d873e390UL, 0x06050c0f050f0906UL, 
0xf701f5030103f4f7UL, 0x1c12383612362a1cUL, + 0xc2a39ffea3fe3cc2UL, 0x6a5fd4e15fe18b6aUL, 0xaef94710f910beaeUL, 0x69d0d26bd06b0269UL, + 0x17912ea891a8bf17UL, 0x995829e858e87199UL, 0x3a2774692769533aUL, 0x27b94ed0b9d0f727UL, + 0xd938a948384891d9UL, 0xeb13cd351335deebUL, 0x2bb356ceb3cee52bUL, 0x2233445533557722UL, + 0xd2bbbfd6bbd604d2UL, 0xa9704990709039a9UL, 0x07890e8089808707UL, 0x33a766f2a7f2c133UL, + 0x2db65ac1b6c1ec2dUL, 0x3c22786622665a3cUL, 0x15922aad92adb815UL, 0xc92089602060a9c9UL, + 0x874915db49db5c87UL, 0xaaff4f1aff1ab0aaUL, 0x5078a0887888d850UL, 0xa57a518e7a8e2ba5UL, + 0x038f068a8f8a8903UL, 0x59f8b213f8134a59UL, 0x0980129b809b9209UL, 0x1a1734391739231aUL, + 0x65daca75da751065UL, 0xd731b553315384d7UL, 0x84c61351c651d584UL, 0xd0b8bbd3b8d303d0UL, + 0x82c31f5ec35edc82UL, 0x29b052cbb0cbe229UL, 0x5a77b4997799c35aUL, 0x1e113c3311332d1eUL, + 0x7bcbf646cb463d7bUL, 0xa8fc4b1ffc1fb7a8UL, 0x6dd6da61d6610c6dUL, 0x2c3a584e3a4e622cUL +}; + +)===" +R"===( + +static const __constant ulong T4_G[] = +{ + 0xA5F432C6C6A597F4UL, 0x84976FF8F884EB97UL, 0x99B05EEEEE99C7B0UL, 0x8D8C7AF6F68DF78CUL, + 0x0D17E8FFFF0DE517UL, 0xBDDC0AD6D6BDB7DCUL, 0xB1C816DEDEB1A7C8UL, 0x54FC6D91915439FCUL, + 0x50F090606050C0F0UL, 0x0305070202030405UL, 0xA9E02ECECEA987E0UL, 0x7D87D156567DAC87UL, + 0x192BCCE7E719D52BUL, 0x62A613B5B56271A6UL, 0xE6317C4D4DE69A31UL, 0x9AB559ECEC9AC3B5UL, + 0x45CF408F8F4505CFUL, 0x9DBCA31F1F9D3EBCUL, 0x40C04989894009C0UL, 0x879268FAFA87EF92UL, + 0x153FD0EFEF15C53FUL, 0xEB2694B2B2EB7F26UL, 0xC940CE8E8EC90740UL, 0x0B1DE6FBFB0BED1DUL, + 0xEC2F6E4141EC822FUL, 0x67A91AB3B3677DA9UL, 0xFD1C435F5FFDBE1CUL, 0xEA25604545EA8A25UL, + 0xBFDAF92323BF46DAUL, 0xF702515353F7A602UL, 0x96A145E4E496D3A1UL, 0x5BED769B9B5B2DEDUL, + 0xC25D287575C2EA5DUL, 0x1C24C5E1E11CD924UL, 0xAEE9D43D3DAE7AE9UL, 0x6ABEF24C4C6A98BEUL, + 0x5AEE826C6C5AD8EEUL, 0x41C3BD7E7E41FCC3UL, 0x0206F3F5F502F106UL, 0x4FD15283834F1DD1UL, + 0x5CE48C68685CD0E4UL, 0xF407565151F4A207UL, 0x345C8DD1D134B95CUL, 0x0818E1F9F908E918UL, 
+ 0x93AE4CE2E293DFAEUL, 0x73953EABAB734D95UL, 0x53F597626253C4F5UL, 0x3F416B2A2A3F5441UL, + 0x0C141C08080C1014UL, 0x52F66395955231F6UL, 0x65AFE94646658CAFUL, 0x5EE27F9D9D5E21E2UL, + 0x2878483030286078UL, 0xA1F8CF3737A16EF8UL, 0x0F111B0A0A0F1411UL, 0xB5C4EB2F2FB55EC4UL, + 0x091B150E0E091C1BUL, 0x365A7E242436485AUL, 0x9BB6AD1B1B9B36B6UL, 0x3D4798DFDF3DA547UL, + 0x266AA7CDCD26816AUL, 0x69BBF54E4E699CBBUL, 0xCD4C337F7FCDFE4CUL, 0x9FBA50EAEA9FCFBAUL, + 0x1B2D3F12121B242DUL, 0x9EB9A41D1D9E3AB9UL, 0x749CC4585874B09CUL, 0x2E724634342E6872UL, + 0x2D774136362D6C77UL, 0xB2CD11DCDCB2A3CDUL, 0xEE299DB4B4EE7329UL, 0xFB164D5B5BFBB616UL, + 0xF601A5A4A4F65301UL, 0x4DD7A176764DECD7UL, 0x61A314B7B76175A3UL, 0xCE49347D7DCEFA49UL, + 0x7B8DDF52527BA48DUL, 0x3E429FDDDD3EA142UL, 0x7193CD5E5E71BC93UL, 0x97A2B113139726A2UL, + 0xF504A2A6A6F55704UL, 0x68B801B9B96869B8UL, 0x0000000000000000UL, 0x2C74B5C1C12C9974UL, + 0x60A0E040406080A0UL, 0x1F21C2E3E31FDD21UL, 0xC8433A7979C8F243UL, 0xED2C9AB6B6ED772CUL, + 0xBED90DD4D4BEB3D9UL, 0x46CA478D8D4601CAUL, 0xD970176767D9CE70UL, 0x4BDDAF72724BE4DDUL, + 0xDE79ED9494DE3379UL, 0xD467FF9898D42B67UL, 0xE82393B0B0E87B23UL, 0x4ADE5B85854A11DEUL, + 0x6BBD06BBBB6B6DBDUL, 0x2A7EBBC5C52A917EUL, 0xE5347B4F4FE59E34UL, 0x163AD7EDED16C13AUL, + 0xC554D28686C51754UL, 0xD762F89A9AD72F62UL, 0x55FF99666655CCFFUL, 0x94A7B611119422A7UL, + 0xCF4AC08A8ACF0F4AUL, 0x1030D9E9E910C930UL, 0x060A0E040406080AUL, 0x819866FEFE81E798UL, + 0xF00BABA0A0F05B0BUL, 0x44CCB4787844F0CCUL, 0xBAD5F02525BA4AD5UL, 0xE33E754B4BE3963EUL, + 0xF30EACA2A2F35F0EUL, 0xFE19445D5DFEBA19UL, 0xC05BDB8080C01B5BUL, 0x8A858005058A0A85UL, + 0xADECD33F3FAD7EECUL, 0xBCDFFE2121BC42DFUL, 0x48D8A8707048E0D8UL, 0x040CFDF1F104F90CUL, + 0xDF7A196363DFC67AUL, 0xC1582F7777C1EE58UL, 0x759F30AFAF75459FUL, 0x63A5E742426384A5UL, + 0x3050702020304050UL, 0x1A2ECBE5E51AD12EUL, 0x0E12EFFDFD0EE112UL, 0x6DB708BFBF6D65B7UL, + 0x4CD45581814C19D4UL, 0x143C24181814303CUL, 0x355F792626354C5FUL, 0x2F71B2C3C32F9D71UL, + 
0xE13886BEBEE16738UL, 0xA2FDC83535A26AFDUL, 0xCC4FC78888CC0B4FUL, 0x394B652E2E395C4BUL, + 0x57F96A9393573DF9UL, 0xF20D585555F2AA0DUL, 0x829D61FCFC82E39DUL, 0x47C9B37A7A47F4C9UL, + 0xACEF27C8C8AC8BEFUL, 0xE73288BABAE76F32UL, 0x2B7D4F32322B647DUL, 0x95A442E6E695D7A4UL, + 0xA0FB3BC0C0A09BFBUL, 0x98B3AA19199832B3UL, 0xD168F69E9ED12768UL, 0x7F8122A3A37F5D81UL, + 0x66AAEE44446688AAUL, 0x7E82D654547EA882UL, 0xABE6DD3B3BAB76E6UL, 0x839E950B0B83169EUL, + 0xCA45C98C8CCA0345UL, 0x297BBCC7C729957BUL, 0xD36E056B6BD3D66EUL, 0x3C446C28283C5044UL, + 0x798B2CA7A779558BUL, 0xE23D81BCBCE2633DUL, 0x1D273116161D2C27UL, 0x769A37ADAD76419AUL, + 0x3B4D96DBDB3BAD4DUL, 0x56FA9E646456C8FAUL, 0x4ED2A674744EE8D2UL, 0x1E223614141E2822UL, + 0xDB76E49292DB3F76UL, 0x0A1E120C0C0A181EUL, 0x6CB4FC48486C90B4UL, 0xE4378FB8B8E46B37UL, + 0x5DE7789F9F5D25E7UL, 0x6EB20FBDBD6E61B2UL, 0xEF2A694343EF862AUL, 0xA6F135C4C4A693F1UL, + 0xA8E3DA3939A872E3UL, 0xA4F7C63131A462F7UL, 0x37598AD3D337BD59UL, 0x8B8674F2F28BFF86UL, + 0x325683D5D532B156UL, 0x43C54E8B8B430DC5UL, 0x59EB856E6E59DCEBUL, 0xB7C218DADAB7AFC2UL, + 0x8C8F8E01018C028FUL, 0x64AC1DB1B16479ACUL, 0xD26DF19C9CD2236DUL, 0xE03B724949E0923BUL, + 0xB4C71FD8D8B4ABC7UL, 0xFA15B9ACACFA4315UL, 0x0709FAF3F307FD09UL, 0x256FA0CFCF25856FUL, + 0xAFEA20CACAAF8FEAUL, 0x8E897DF4F48EF389UL, 0xE920674747E98E20UL, 0x1828381010182028UL, + 0xD5640B6F6FD5DE64UL, 0x888373F0F088FB83UL, 0x6FB1FB4A4A6F94B1UL, 0x7296CA5C5C72B896UL, + 0x246C54383824706CUL, 0xF1085F5757F1AE08UL, 0xC752217373C7E652UL, 0x51F36497975135F3UL, + 0x2365AECBCB238D65UL, 0x7C8425A1A17C5984UL, 0x9CBF57E8E89CCBBFUL, 0x21635D3E3E217C63UL, + 0xDD7CEA9696DD377CUL, 0xDC7F1E6161DCC27FUL, 0x86919C0D0D861A91UL, 0x85949B0F0F851E94UL, + 0x90AB4BE0E090DBABUL, 0x42C6BA7C7C42F8C6UL, 0xC457267171C4E257UL, 0xAAE529CCCCAA83E5UL, + 0xD873E39090D83B73UL, 0x050F090606050C0FUL, 0x0103F4F7F701F503UL, 0x12362A1C1C123836UL, + 0xA3FE3CC2C2A39FFEUL, 0x5FE18B6A6A5FD4E1UL, 0xF910BEAEAEF94710UL, 0xD06B026969D0D26BUL, + 
0x91A8BF1717912EA8UL, 0x58E87199995829E8UL, 0x2769533A3A277469UL, 0xB9D0F72727B94ED0UL, + 0x384891D9D938A948UL, 0x1335DEEBEB13CD35UL, 0xB3CEE52B2BB356CEUL, 0x3355772222334455UL, + 0xBBD604D2D2BBBFD6UL, 0x709039A9A9704990UL, 0x8980870707890E80UL, 0xA7F2C13333A766F2UL, + 0xB6C1EC2D2DB65AC1UL, 0x22665A3C3C227866UL, 0x92ADB81515922AADUL, 0x2060A9C9C9208960UL, + 0x49DB5C87874915DBUL, 0xFF1AB0AAAAFF4F1AUL, 0x7888D8505078A088UL, 0x7A8E2BA5A57A518EUL, + 0x8F8A8903038F068AUL, 0xF8134A5959F8B213UL, 0x809B92090980129BUL, 0x1739231A1A173439UL, + 0xDA75106565DACA75UL, 0x315384D7D731B553UL, 0xC651D58484C61351UL, 0xB8D303D0D0B8BBD3UL, + 0xC35EDC8282C31F5EUL, 0xB0CBE22929B052CBUL, 0x7799C35A5A77B499UL, 0x11332D1E1E113C33UL, + 0xCB463D7B7BCBF646UL, 0xFC1FB7A8A8FC4B1FUL, 0xD6610C6D6DD6DA61UL, 0x3A4E622C2C3A584EUL +}; + /* RSTT: sets t[d] to the XOR of eight table lookups. Bytes selected by B64_0..B64_3 index T0_G, bytes selected by B64_4..B64_7 index T4_G, and within each group the results are passed through R64 with amounts 0 (none), 8, 16 and 24. R64 and B64_n are defined outside this chunk. t is not a macro parameter: an array named t must be in scope where RSTT expands. */ +#define RSTT(d, a, b0, b1, b2, b3, b4, b5, b6, b7) do { \ + t[d] = T0_G[B64_0(a[b0])] \ + ^ R64(T0_G[B64_1(a[b1])], 8) \ + ^ R64(T0_G[B64_2(a[b2])], 16) \ + ^ R64(T0_G[B64_3(a[b3])], 24) \ + ^ T4_G[B64_4(a[b4])] \ + ^ R64(T4_G[B64_5(a[b5])], 8) \ + ^ R64(T4_G[B64_6(a[b6])], 16) \ + ^ R64(T4_G[B64_7(a[b7])], 24); \ + } while (0) + /* ROUND_SMALL_P: one round on the 8-word state a. XORs PC64(0x00..0x70, r) into a[0..7], computes t[0..7] with RSTT using word indices d, d+1, ..., d+7 (mod 8), then copies t back into a. Declares its own ulong t[8]. */ +#define ROUND_SMALL_P(a, r) do { \ + ulong t[8]; \ + a[0] ^= PC64(0x00, r); \ + a[1] ^= PC64(0x10, r); \ + a[2] ^= PC64(0x20, r); \ + a[3] ^= PC64(0x30, r); \ + a[4] ^= PC64(0x40, r); \ + a[5] ^= PC64(0x50, r); \ + a[6] ^= PC64(0x60, r); \ + a[7] ^= PC64(0x70, r); \ + RSTT(0, a, 0, 1, 2, 3, 4, 5, 6, 7); \ + RSTT(1, a, 1, 2, 3, 4, 5, 6, 7, 0); \ + RSTT(2, a, 2, 3, 4, 5, 6, 7, 0, 1); \ + RSTT(3, a, 3, 4, 5, 6, 7, 0, 1, 2); \ + RSTT(4, a, 4, 5, 6, 7, 0, 1, 2, 3); \ + RSTT(5, a, 5, 6, 7, 0, 1, 2, 3, 4); \ + RSTT(6, a, 6, 7, 0, 1, 2, 3, 4, 5); \ + RSTT(7, a, 7, 0, 1, 2, 3, 4, 5, 6); \ + a[0] = t[0]; \ + a[1] = t[1]; \ + a[2] = t[2]; \ + a[3] = t[3]; \ + a[4] = t[4]; \ + a[5] = t[5]; \ + a[6] = t[6]; \ + a[7] = t[7]; \ + } while (0) + /* ROUND_SMALL_Pf: same constant injection as ROUND_SMALL_P, but only word 7 is computed and stored; a[0..6] keep their post-injection values. NOTE(review): this macro reads and writes t[7] without declaring t, so the expansion site must provide a suitable t array. */ +#define ROUND_SMALL_Pf(a,r) do { \ + a[0] ^= PC64(0x00, r); \ + a[1] ^= PC64(0x10, r); \ + a[2] ^= PC64(0x20, 
r); \ + a[3] ^= PC64(0x30, r); \ + a[4] ^= PC64(0x40, r); \ + a[5] ^= PC64(0x50, r); \ + a[6] ^= PC64(0x60, r); \ + a[7] ^= PC64(0x70, r); \ + RSTT(7, a, 7, 0, 1, 2, 3, 4, 5, 6); \ + a[7] = t[7]; \ + } while (0) + /* ROUND_SMALL_Q: same structure as ROUND_SMALL_P with QC64 round constants and a different word-index pattern in the RSTT calls. Declares its own ulong t[8]. */ +#define ROUND_SMALL_Q(a, r) do { \ + ulong t[8]; \ + a[0] ^= QC64(0x00, r); \ + a[1] ^= QC64(0x10, r); \ + a[2] ^= QC64(0x20, r); \ + a[3] ^= QC64(0x30, r); \ + a[4] ^= QC64(0x40, r); \ + a[5] ^= QC64(0x50, r); \ + a[6] ^= QC64(0x60, r); \ + a[7] ^= QC64(0x70, r); \ + RSTT(0, a, 1, 3, 5, 7, 0, 2, 4, 6); \ + RSTT(1, a, 2, 4, 6, 0, 1, 3, 5, 7); \ + RSTT(2, a, 3, 5, 7, 1, 2, 4, 6, 0); \ + RSTT(3, a, 4, 6, 0, 2, 3, 5, 7, 1); \ + RSTT(4, a, 5, 7, 1, 3, 4, 6, 0, 2); \ + RSTT(5, a, 6, 0, 2, 4, 5, 7, 1, 3); \ + RSTT(6, a, 7, 1, 3, 5, 6, 0, 2, 4); \ + RSTT(7, a, 0, 2, 4, 6, 7, 1, 3, 5); \ + a[0] = t[0]; \ + a[1] = t[1]; \ + a[2] = t[2]; \ + a[3] = t[3]; \ + a[4] = t[4]; \ + a[5] = t[5]; \ + a[6] = t[6]; \ + a[7] = t[7]; \ + } while (0) + /* PERM_SMALL_P: applies ROUND_SMALL_P to a for r = 0..9. */ +#define PERM_SMALL_P(a) do { \ + for (int r = 0; r < 10; r ++) \ + ROUND_SMALL_P(a, r); \ + } while (0) + /* PERM_SMALL_Pf: applies ROUND_SMALL_P for r = 0..8 and then ROUND_SMALL_Pf for round 9, so only a[7] receives the last round's output. NOTE(review): the t declared inside ROUND_SMALL_P is scoped to that macro's own do-block, so ROUND_SMALL_Pf still needs a t from the surrounding code. */ +#define PERM_SMALL_Pf(a) do { \ + for (int r = 0; r < 9; r ++) { \ + ROUND_SMALL_P(a, r);} \ + ROUND_SMALL_Pf(a,9); \ + } while (0) + /* PERM_SMALL_Q: applies ROUND_SMALL_Q to a for r = 0..9. */ +#define PERM_SMALL_Q(a) do { \ + for (int r = 0; r < 10; r ++) \ + ROUND_SMALL_Q(a, r); \ + } while (0) + +)===" +
\ No newline at end of file diff --git a/xmrstak/backend/amd/amd_gpu/opencl/jh.cl b/xmrstak/backend/amd/amd_gpu/opencl/jh.cl new file mode 100644 index 0000000..fe70ea3 --- /dev/null +++ b/xmrstak/backend/amd/amd_gpu/opencl/jh.cl @@ -0,0 +1,274 @@ +R"===( +/* $Id: jh.c 255 2011-06-07 19:50:20Z tp $ */ +/* + * JH implementation. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + * ===========================(LICENSE END)============================= + * + * @author Thomas Pornin <thomas.pornin@cryptolog.com> + */ + +#define SPH_JH_64 1 +#define SPH_LITTLE_ENDIAN 1 + /* SPH_C32 and SPH_C64 expand to their argument unchanged (no suffix and no added parentheses); sph_u32 and sph_u64 are aliases for the OpenCL uint and ulong types. */ +#define SPH_C32(x) x +#define SPH_C64(x) x +typedef uint sph_u32; +typedef ulong sph_u64; + +/* + * The internal bitslice representation may use either big-endian or + * little-endian (true bitslice operations do not care about the bit + * ordering, and the bit-swapping linear operations in JH happen to + * be invariant through endianness-swapping). The constants must be + * defined according to the chosen endianness; we use some + * byte-swapping macros for that. + */ + +#if SPH_LITTLE_ENDIAN + /* Little-endian branch, selected here because SPH_LITTLE_ENDIAN is defined as 1 above: C32e and C64e reverse the byte order of a 32-bit and a 64-bit constant. The dec/enc aliases name sph_* helpers that are not defined in the visible part of this file. */ +#define C32e(x) ((SPH_C32(x) >> 24) \ + | ((SPH_C32(x) >> 8) & SPH_C32(0x0000FF00)) \ + | ((SPH_C32(x) << 8) & SPH_C32(0x00FF0000)) \ + | ((SPH_C32(x) << 24) & SPH_C32(0xFF000000))) +#define dec32e_aligned sph_dec32le_aligned +#define enc32e sph_enc32le + +#define C64e(x) ((SPH_C64(x) >> 56) \ + | ((SPH_C64(x) >> 40) & SPH_C64(0x000000000000FF00)) \ + | ((SPH_C64(x) >> 24) & SPH_C64(0x0000000000FF0000)) \ + | ((SPH_C64(x) >> 8) & SPH_C64(0x00000000FF000000)) \ + | ((SPH_C64(x) << 8) & SPH_C64(0x000000FF00000000)) \ + | ((SPH_C64(x) << 24) & SPH_C64(0x0000FF0000000000)) \ + | ((SPH_C64(x) << 40) & SPH_C64(0x00FF000000000000)) \ + | ((SPH_C64(x) << 56) & SPH_C64(0xFF00000000000000))) +#define dec64e_aligned sph_dec64le_aligned +#define enc64e sph_enc64le + +#else + +#define C32e(x) SPH_C32(x) +#define dec32e_aligned sph_dec32be_aligned +#define enc32e sph_enc32be +#define C64e(x) SPH_C64(x) +#define dec64e_aligned sph_dec64be_aligned +#define enc64e sph_enc64be + +#endif + /* Sb: in-place non-linear mixing of the four words x0..x3 with the constant c, using only AND, OR, XOR and NOT. NOTE(review): the macro assigns to tmp without declaring it, so a variable named tmp must exist where Sb expands. */ +#define Sb(x0, x1, x2, x3, c) do { \ + x3 = ~x3; \ + x0 ^= (c) & ~x2; \ + tmp = (c) ^ (x0 & x1); \ + x0 ^= x2 & x3; \ + x3 ^= ~x1 & x2; \ + x1 ^= x0 & x2; \ + x2 ^= x0 & ~x3; \ + x0 ^= x1 | x3; \ + x3 ^= x1 & x2; \ + x1 ^= tmp & x0; \ + x2 ^= tmp; \ + } while (0) + /* Lb: in-place XOR-only mixing of eight words: x4..x7 are first updated from x0..x3, then x0..x3 are updated from the new x4..x7. */ +#define Lb(x0, x1, x2, x3, x4, x5, x6, x7) do 
{ \ + x4 ^= x1; \ + x5 ^= x2; \ + x6 ^= x3 ^ x0; \ + x7 ^= x0; \ + x0 ^= x5; \ + x1 ^= x6; \ + x2 ^= x7 ^ x4; \ + x3 ^= x4; \ + } while (0) + +static const __constant ulong C[] = +{ + 0x67F815DFA2DED572UL, 0x571523B70A15847BUL, 0xF6875A4D90D6AB81UL, 0x402BD1C3C54F9F4EUL, + 0x9CFA455CE03A98EAUL, 0x9A99B26699D2C503UL, 0x8A53BBF2B4960266UL, 0x31A2DB881A1456B5UL, + 0xDB0E199A5C5AA303UL, 0x1044C1870AB23F40UL, 0x1D959E848019051CUL, 0xDCCDE75EADEB336FUL, + 0x416BBF029213BA10UL, 0xD027BBF7156578DCUL, 0x5078AA3739812C0AUL, 0xD3910041D2BF1A3FUL, + 0x907ECCF60D5A2D42UL, 0xCE97C0929C9F62DDUL, 0xAC442BC70BA75C18UL, 0x23FCC663D665DFD1UL, + 0x1AB8E09E036C6E97UL, 0xA8EC6C447E450521UL, 0xFA618E5DBB03F1EEUL, 0x97818394B29796FDUL, + 0x2F3003DB37858E4AUL, 0x956A9FFB2D8D672AUL, 0x6C69B8F88173FE8AUL, 0x14427FC04672C78AUL, + 0xC45EC7BD8F15F4C5UL, 0x80BB118FA76F4475UL, 0xBC88E4AEB775DE52UL, 0xF4A3A6981E00B882UL, + 0x1563A3A9338FF48EUL, 0x89F9B7D524565FAAUL, 0xFDE05A7C20EDF1B6UL, 0x362C42065AE9CA36UL, + 0x3D98FE4E433529CEUL, 0xA74B9A7374F93A53UL, 0x86814E6F591FF5D0UL, 0x9F5AD8AF81AD9D0EUL, + 0x6A6234EE670605A7UL, 0x2717B96EBE280B8BUL, 0x3F1080C626077447UL, 0x7B487EC66F7EA0E0UL, + 0xC0A4F84AA50A550DUL, 0x9EF18E979FE7E391UL, 0xD48D605081727686UL, 0x62B0E5F3415A9E7EUL, + 0x7A205440EC1F9FFCUL, 0x84C9F4CE001AE4E3UL, 0xD895FA9DF594D74FUL, 0xA554C324117E2E55UL, + 0x286EFEBD2872DF5BUL, 0xB2C4A50FE27FF578UL, 0x2ED349EEEF7C8905UL, 0x7F5928EB85937E44UL, + 0x4A3124B337695F70UL, 0x65E4D61DF128865EUL, 0xE720B95104771BC7UL, 0x8A87D423E843FE74UL, + 0xF2947692A3E8297DUL, 0xC1D9309B097ACBDDUL, 0xE01BDC5BFB301B1DUL, 0xBF829CF24F4924DAUL, + 0xFFBF70B431BAE7A4UL, 0x48BCF8DE0544320DUL, 0x39D3BB5332FCAE3BUL, 0xA08B29E0C1C39F45UL, + 0x0F09AEF7FD05C9E5UL, 0x34F1904212347094UL, 0x95ED44E301B771A2UL, 0x4A982F4F368E3BE9UL, + 0x15F66CA0631D4088UL, 0xFFAF52874B44C147UL, 0x30C60AE2F14ABB7EUL, 0xE68C6ECCC5B67046UL, + 0x00CA4FBD56A4D5A4UL, 0xAE183EC84B849DDAUL, 0xADD1643045CE5773UL, 0x67255C1468CEA6E8UL, + 
0x16E10ECBF28CDAA3UL, 0x9A99949A5806E933UL, 0x7B846FC220B2601FUL, 0x1885D1A07FACCED1UL, + 0xD319DD8DA15B5932UL, 0x46B4A5AAC01C9A50UL, 0xBA6B04E467633D9FUL, 0x7EEE560BAB19CAF6UL, + 0x742128A9EA79B11FUL, 0xEE51363B35F7BDE9UL, 0x76D350755AAC571DUL, 0x01707DA3FEC2463AUL, + 0x42D8A498AFC135F7UL, 0x79676B9E20ECED78UL, 0xA8DB3AEA15638341UL, 0x832C83324D3BC3FAUL, + 0xF347271C1F3B40A7UL, 0x9A762DB734F04059UL, 0xFD4F21D26C4E3EE7UL, 0xEF5957DC398DFDB8UL, + 0xDAEB492B490C9B8DUL, 0x0D70F36849D7A25BUL, 0x84558D7AD0AE3B7DUL, 0x658EF8E4F0E9A5F5UL, + 0x533B1036F4A2B8A0UL, 0x5AEC3E759E07A80CUL, 0x4F88E85692946891UL, 0x4CBCBAF8555CB05BUL, + 0x7B9487F3993BBBE3UL, 0x5D1C6B72D6F4DA75UL, 0x6DB334DC28ACAE64UL, 0x71DB28B850A5346CUL, + 0x2A518D10F2E261F8UL, 0xFC75DD593364DBE3UL, 0xA23FCE43F1BCAC1CUL, 0xB043E8023CD1BB67UL, + 0x75A12988CA5B0A33UL, 0x5C5316B44D19347FUL, 0x1E4D790EC3943B92UL, 0x3FAFEEB6D7757479UL, + 0x21391ABEF7D4A8EAUL, 0x5127234C097EF45CUL, 0xD23C32BA5324A326UL, 0xADD5A66D4A17A344UL, + 0x08C9F2AFA63E1DB5UL, 0x563C6B91983D5983UL, 0x4D608672A17CF84CUL, 0xF6C76E08CC3EE246UL, + 0x5E76BCB1B333982FUL, 0x2AE6C4EFA566D62BUL, 0x36D4C1BEE8B6F406UL, 0x6321EFBC1582EE74UL, + 0x69C953F40D4EC1FDUL, 0x26585806C45A7DA7UL, 0x16FAE0061614C17EUL, 0x3F9D63283DAF907EUL, + 0x0CD29B00E3F2C9D2UL, 0x300CD4B730CEAA5FUL, 0x9832E0F216512A74UL, 0x9AF8CEE3D830EB0DUL, + 0x9279F1B57B9EC54BUL, 0xD36886046EE651FFUL, 0x316796E6574D239BUL, 0x05750A17F3A6E6CCUL, + 0xCE6C3213D98176B1UL, 0x62A205F88452173CUL, 0x47154778B3CB2BF4UL, 0x486A9323825446FFUL, + 0x65655E4E0758DF38UL, 0x8E5086FC897CFCF2UL, 0x86CA0BD0442E7031UL, 0x4E477830A20940F0UL, + 0x8338F7D139EEA065UL, 0xBD3A2CE437E95EF7UL, 0x6FF8130126B29721UL, 0xE7DE9FEFD1ED44A3UL, + 0xD992257615DFA08BUL, 0xBE42DC12F6F7853CUL, 0x7EB027AB7CECA7D8UL, 0xDEA83EAADA7D8D53UL, + 0xD86902BD93CE25AAUL, 0xF908731AFD43F65AUL, 0xA5194A17DAEF5FC0UL, 0x6A21FD4C33664D97UL, + 0x701541DB3198B435UL, 0x9B54CDEDBB0F1EEAUL, 0x72409751A163D09AUL, 0xE26F4791BF9D75F6UL +}; + +#define 
Ceven_hi(r) (C[((r) << 2) + 0]) +#define Ceven_lo(r) (C[((r) << 2) + 1]) +#define Codd_hi(r) (C[((r) << 2) + 2]) +#define Codd_lo(r) (C[((r) << 2) + 3]) + /* The four accessors above read C with four entries per round r: indices 4r+0 and 4r+1 for the even words (hi, lo) and 4r+2 and 4r+3 for the odd words. S applies Sb to the h-suffixed and then the l-suffixed variables of x0..x3, using cb##hi(r) and cb##lo(r) as the constants (cb is Ceven_ or Codd_). */ +#define S(x0, x1, x2, x3, cb, r) do { \ + Sb(x0 ## h, x1 ## h, x2 ## h, x3 ## h, cb ## hi(r)); \ + Sb(x0 ## l, x1 ## l, x2 ## l, x3 ## l, cb ## lo(r)); \ + } while (0) + /* L applies Lb separately to the h-suffixed and to the l-suffixed variables of x0..x7. */ +#define L(x0, x1, x2, x3, x4, x5, x6, x7) do { \ + Lb(x0 ## h, x1 ## h, x2 ## h, x3 ## h, \ + x4 ## h, x5 ## h, x6 ## h, x7 ## h); \ + Lb(x0 ## l, x1 ## l, x2 ## l, x3 ## l, \ + x4 ## l, x5 ## l, x6 ## l, x7 ## l); \ + } while (0) + /* Wz swaps neighbouring n-bit groups inside x##h and x##l: the bits selected by mask c move up by n and the bits n positions above them move down. W0..W5 instantiate it for group sizes 1, 2, 4, 8, 16 and 32 bits; W6 swaps x##h with x##l. */ +#define Wz(x, c, n) do { \ + sph_u64 t = (x ## h & (c)) << (n); \ + x ## h = ((x ## h >> (n)) & (c)) | t; \ + t = (x ## l & (c)) << (n); \ + x ## l = ((x ## l >> (n)) & (c)) | t; \ + } while (0) + +#define W0(x) Wz(x, SPH_C64(0x5555555555555555), 1) +#define W1(x) Wz(x, SPH_C64(0x3333333333333333), 2) +#define W2(x) Wz(x, SPH_C64(0x0F0F0F0F0F0F0F0F), 4) +#define W3(x) Wz(x, SPH_C64(0x00FF00FF00FF00FF), 8) +#define W4(x) Wz(x, SPH_C64(0x0000FFFF0000FFFF), 16) +#define W5(x) Wz(x, SPH_C64(0x00000000FFFFFFFF), 32) +#define W6(x) do { \ + sph_u64 t = x ## h; \ + x ## h = x ## l; \ + x ## l = t; \ + } while (0) + /* SL(ro) expands to SLu(r + ro, ro); r is not a parameter and is taken from the expansion site (the loop variable of the small-footprint E8 below). */ +#define SL(ro) SLu(r + ro, ro) + /* SLu: one round with constant index r and word-swap variant ro. Applies S with the even constants to h0, h2, h4, h6 and with the odd constants to h1, h3, h5, h7, then L, then W<ro> to h1, h3, h5, h7. The variables h0h..h7h and h0l..h7l must exist where the macro expands. */ +#define SLu(r, ro) do { \ + S(h0, h2, h4, h6, Ceven_, r); \ + S(h1, h3, h5, h7, Codd_, r); \ + L(h0, h2, h4, h6, h1, h3, h5, h7); \ + W ## ro(h1); \ + W ## ro(h3); \ + W ## ro(h5); \ + W ## ro(h7); \ + } while (0) + /* NOTE(review): SPH_SMALL_FOOTPRINT_JH is not defined in the visible part of this file; if it stays undefined the preprocessor treats it as 0 and the #else branch is used. */ +#if SPH_SMALL_FOOTPRINT_JH + +/* + * The "small footprint" 64-bit version just uses a partially unrolled + * loop. + */ + +#define E8 do { \ + unsigned r; \ + for (r = 0; r < 42; r += 7) { \ + SL(0); \ + SL(1); \ + SL(2); \ + SL(3); \ + SL(4); \ + SL(5); \ + SL(6); \ + } \ + } while (0) + +#else + +/* + * On a "true 64-bit" architecture, we can unroll at will. 
+ */ + +#define E8 do { \ + SLu( 0, 0); \ + SLu( 1, 1); \ + SLu( 2, 2); \ + SLu( 3, 3); \ + SLu( 4, 4); \ + SLu( 5, 5); \ + SLu( 6, 6); \ + SLu( 7, 0); \ + SLu( 8, 1); \ + SLu( 9, 2); \ + SLu(10, 3); \ + SLu(11, 4); \ + SLu(12, 5); \ + SLu(13, 6); \ + SLu(14, 0); \ + SLu(15, 1); \ + SLu(16, 2); \ + SLu(17, 3); \ + SLu(18, 4); \ + SLu(19, 5); \ + SLu(20, 6); \ + SLu(21, 0); \ + SLu(22, 1); \ + SLu(23, 2); \ + SLu(24, 3); \ + SLu(25, 4); \ + SLu(26, 5); \ + SLu(27, 6); \ + SLu(28, 0); \ + SLu(29, 1); \ + SLu(30, 2); \ + SLu(31, 3); \ + SLu(32, 4); \ + SLu(33, 5); \ + SLu(34, 6); \ + SLu(35, 0); \ + SLu(36, 1); \ + SLu(37, 2); \ + SLu(38, 3); \ + SLu(39, 4); \ + SLu(40, 5); \ + SLu(41, 6); \ + } while (0) + +#endif + +)===" diff --git a/xmrstak/backend/amd/amd_gpu/opencl/wolf-aes.cl b/xmrstak/backend/amd/amd_gpu/opencl/wolf-aes.cl new file mode 100644 index 0000000..996944b --- /dev/null +++ b/xmrstak/backend/amd/amd_gpu/opencl/wolf-aes.cl @@ -0,0 +1,90 @@ +R"===( +#ifndef WOLF_AES_CL +#define WOLF_AES_CL + +// AES table - the other three are generated on the fly + +static const __constant uint AES0_C[256] = +{ + 0xA56363C6U, 0x847C7CF8U, 0x997777EEU, 0x8D7B7BF6U, + 0x0DF2F2FFU, 0xBD6B6BD6U, 0xB16F6FDEU, 0x54C5C591U, + 0x50303060U, 0x03010102U, 0xA96767CEU, 0x7D2B2B56U, + 0x19FEFEE7U, 0x62D7D7B5U, 0xE6ABAB4DU, 0x9A7676ECU, + 0x45CACA8FU, 0x9D82821FU, 0x40C9C989U, 0x877D7DFAU, + 0x15FAFAEFU, 0xEB5959B2U, 0xC947478EU, 0x0BF0F0FBU, + 0xECADAD41U, 0x67D4D4B3U, 0xFDA2A25FU, 0xEAAFAF45U, + 0xBF9C9C23U, 0xF7A4A453U, 0x967272E4U, 0x5BC0C09BU, + 0xC2B7B775U, 0x1CFDFDE1U, 0xAE93933DU, 0x6A26264CU, + 0x5A36366CU, 0x413F3F7EU, 0x02F7F7F5U, 0x4FCCCC83U, + 0x5C343468U, 0xF4A5A551U, 0x34E5E5D1U, 0x08F1F1F9U, + 0x937171E2U, 0x73D8D8ABU, 0x53313162U, 0x3F15152AU, + 0x0C040408U, 0x52C7C795U, 0x65232346U, 0x5EC3C39DU, + 0x28181830U, 0xA1969637U, 0x0F05050AU, 0xB59A9A2FU, + 0x0907070EU, 0x36121224U, 0x9B80801BU, 0x3DE2E2DFU, + 0x26EBEBCDU, 0x6927274EU, 0xCDB2B27FU, 0x9F7575EAU, + 
0x1B090912U, 0x9E83831DU, 0x742C2C58U, 0x2E1A1A34U, + 0x2D1B1B36U, 0xB26E6EDCU, 0xEE5A5AB4U, 0xFBA0A05BU, + 0xF65252A4U, 0x4D3B3B76U, 0x61D6D6B7U, 0xCEB3B37DU, + 0x7B292952U, 0x3EE3E3DDU, 0x712F2F5EU, 0x97848413U, + 0xF55353A6U, 0x68D1D1B9U, 0x00000000U, 0x2CEDEDC1U, + 0x60202040U, 0x1FFCFCE3U, 0xC8B1B179U, 0xED5B5BB6U, + 0xBE6A6AD4U, 0x46CBCB8DU, 0xD9BEBE67U, 0x4B393972U, + 0xDE4A4A94U, 0xD44C4C98U, 0xE85858B0U, 0x4ACFCF85U, + 0x6BD0D0BBU, 0x2AEFEFC5U, 0xE5AAAA4FU, 0x16FBFBEDU, + 0xC5434386U, 0xD74D4D9AU, 0x55333366U, 0x94858511U, + 0xCF45458AU, 0x10F9F9E9U, 0x06020204U, 0x817F7FFEU, + 0xF05050A0U, 0x443C3C78U, 0xBA9F9F25U, 0xE3A8A84BU, + 0xF35151A2U, 0xFEA3A35DU, 0xC0404080U, 0x8A8F8F05U, + 0xAD92923FU, 0xBC9D9D21U, 0x48383870U, 0x04F5F5F1U, + 0xDFBCBC63U, 0xC1B6B677U, 0x75DADAAFU, 0x63212142U, + 0x30101020U, 0x1AFFFFE5U, 0x0EF3F3FDU, 0x6DD2D2BFU, + 0x4CCDCD81U, 0x140C0C18U, 0x35131326U, 0x2FECECC3U, + 0xE15F5FBEU, 0xA2979735U, 0xCC444488U, 0x3917172EU, + 0x57C4C493U, 0xF2A7A755U, 0x827E7EFCU, 0x473D3D7AU, + 0xAC6464C8U, 0xE75D5DBAU, 0x2B191932U, 0x957373E6U, + 0xA06060C0U, 0x98818119U, 0xD14F4F9EU, 0x7FDCDCA3U, + 0x66222244U, 0x7E2A2A54U, 0xAB90903BU, 0x8388880BU, + 0xCA46468CU, 0x29EEEEC7U, 0xD3B8B86BU, 0x3C141428U, + 0x79DEDEA7U, 0xE25E5EBCU, 0x1D0B0B16U, 0x76DBDBADU, + 0x3BE0E0DBU, 0x56323264U, 0x4E3A3A74U, 0x1E0A0A14U, + 0xDB494992U, 0x0A06060CU, 0x6C242448U, 0xE45C5CB8U, + 0x5DC2C29FU, 0x6ED3D3BDU, 0xEFACAC43U, 0xA66262C4U, + 0xA8919139U, 0xA4959531U, 0x37E4E4D3U, 0x8B7979F2U, + 0x32E7E7D5U, 0x43C8C88BU, 0x5937376EU, 0xB76D6DDAU, + 0x8C8D8D01U, 0x64D5D5B1U, 0xD24E4E9CU, 0xE0A9A949U, + 0xB46C6CD8U, 0xFA5656ACU, 0x07F4F4F3U, 0x25EAEACFU, + 0xAF6565CAU, 0x8E7A7AF4U, 0xE9AEAE47U, 0x18080810U, + 0xD5BABA6FU, 0x887878F0U, 0x6F25254AU, 0x722E2E5CU, + 0x241C1C38U, 0xF1A6A657U, 0xC7B4B473U, 0x51C6C697U, + 0x23E8E8CBU, 0x7CDDDDA1U, 0x9C7474E8U, 0x211F1F3EU, + 0xDD4B4B96U, 0xDCBDBD61U, 0x868B8B0DU, 0x858A8A0FU, + 0x907070E0U, 0x423E3E7CU, 0xC4B5B571U, 0xAA6666CCU, + 
0xD8484890U, 0x05030306U, 0x01F6F6F7U, 0x120E0E1CU, + 0xA36161C2U, 0x5F35356AU, 0xF95757AEU, 0xD0B9B969U, + 0x91868617U, 0x58C1C199U, 0x271D1D3AU, 0xB99E9E27U, + 0x38E1E1D9U, 0x13F8F8EBU, 0xB398982BU, 0x33111122U, + 0xBB6969D2U, 0x70D9D9A9U, 0x898E8E07U, 0xA7949433U, + 0xB69B9B2DU, 0x221E1E3CU, 0x92878715U, 0x20E9E9C9U, + 0x49CECE87U, 0xFF5555AAU, 0x78282850U, 0x7ADFDFA5U, + 0x8F8C8C03U, 0xF8A1A159U, 0x80898909U, 0x170D0D1AU, + 0xDABFBF65U, 0x31E6E6D7U, 0xC6424284U, 0xB86868D0U, + 0xC3414182U, 0xB0999929U, 0x772D2D5AU, 0x110F0F1EU, + 0xCBB0B07BU, 0xFC5454A8U, 0xD6BBBB6DU, 0x3A16162CU +}; + /* BYTE(x, y): the 8-bit field of x that starts at bit 8*y, obtained through amd_bfe (an AMD builtin not defined in this file; presumably bit-field extract with offset and width - confirm). */ +#define BYTE(x, y) (amd_bfe((x), (y) << 3U, 8U)) + /* AES_Round: table-driven round on the four words of X. Output word i is AES0[byte 0 of word i] ^ AES1[byte 1 of word i+1] ^ AES2[byte 2 of word i+2] ^ AES3[byte 3 of word i+3], word indices taken mod 4; the result is then XORed with key and returned. All four tables are passed in by the caller as __local pointers. */ +uint4 AES_Round(const __local uint *AES0, const __local uint *AES1, const __local uint *AES2, const __local uint *AES3, const uint4 X, const uint4 key) +{ + uint4 Y; + Y.s0 = AES0[BYTE(X.s0, 0)] ^ AES1[BYTE(X.s1, 1)] ^ AES2[BYTE(X.s2, 2)] ^ AES3[BYTE(X.s3, 3)]; + Y.s1 = AES0[BYTE(X.s1, 0)] ^ AES1[BYTE(X.s2, 1)] ^ AES2[BYTE(X.s3, 2)] ^ AES3[BYTE(X.s0, 3)]; + Y.s2 = AES0[BYTE(X.s2, 0)] ^ AES1[BYTE(X.s3, 1)] ^ AES2[BYTE(X.s0, 2)] ^ AES3[BYTE(X.s1, 3)]; + Y.s3 = AES0[BYTE(X.s3, 0)] ^ AES1[BYTE(X.s0, 1)] ^ AES2[BYTE(X.s1, 2)] ^ AES3[BYTE(X.s2, 3)]; + Y ^= key; + return(Y); +} + +#endif + +)===" diff --git a/xmrstak/backend/amd/amd_gpu/opencl/wolf-skein.cl b/xmrstak/backend/amd/amd_gpu/opencl/wolf-skein.cl new file mode 100644 index 0000000..868757b --- /dev/null +++ b/xmrstak/backend/amd/amd_gpu/opencl/wolf-skein.cl @@ -0,0 +1,114 @@ +R"===( +#ifndef WOLF_SKEIN_CL +#define WOLF_SKEIN_CL + +// Vectorized Skein implementation macros and functions by Wolf + +#define SKEIN_KS_PARITY 0x1BD11BDAA9FC1A22 + +static const __constant ulong SKEIN256_IV[8] = +{ + 0xCCD044A12FDB3E13UL, 0xE83590301A79A9EBUL, + 0x55AEA0614F816E6FUL, 0x2A2767A4AE9B94DBUL, + 0xEC06025E74DD7683UL, 0xE7A436CDC4746251UL, + 0xC36FBAF9393AD185UL, 0x3EEDBA1833EDFC13UL +}; + +static const __constant ulong SKEIN512_256_IV[8] = +{ + 0xCCD044A12FDB3E13UL, 
0xE83590301A79A9EBUL, + 0x55AEA0614F816E6FUL, 0x2A2767A4AE9B94DBUL, + 0xEC06025E74DD7683UL, 0xE7A436CDC4746251UL, + 0xC36FBAF9393AD185UL, 0x3EEDBA1833EDFC13UL +}; + /* SKEIN_INJECT_KEY(p, s): adds the vector h to p, adds t[s % 3] to lane 5, t[(s + 1) % 3] to lane 6 and s to lane 7. h and t are not parameters; they are taken from the scope where the macro expands. */ +#define SKEIN_INJECT_KEY(p, s) do { \ + p += h; \ + p.s5 += t[s % 3]; \ + p.s6 += t[(s + 1) % 3]; \ + p.s7 += s; \ +} while(0) + /* SKEIN_ROT: combines the two 32-bit halves of x through amd_bitalign and returns the result as a ulong; presumably a 64-bit rotation by y bits (amd_bitalign is an AMD builtin not defined here - confirm direction). NOTE(review): for y == 0 and y == 32 the shift argument handed to amd_bitalign is 32; verify how the builtin treats that count. None of the counts passed by the round functions in this file is 0 or 32. */ +ulong SKEIN_ROT(const uint2 x, const uint y) +{ + if(y < 32) return(as_ulong(amd_bitalign(x, x.s10, 32 - y))); + else return(as_ulong(amd_bitalign(x.s10, x, 32 - (y - 32)))); +} + /* SkeinMix8: four mixes in parallel. Adds *pv1 into *pv0, passes each lane of *pv1 through SKEIN_ROT with its own count rc0..rc3, then XORs the updated *pv0 into *pv1. Both vectors are modified in place. */ +void SkeinMix8(ulong4 *pv0, ulong4 *pv1, const uint rc0, const uint rc1, const uint rc2, const uint rc3) +{ + *pv0 += *pv1; + (*pv1).s0 = SKEIN_ROT(as_uint2((*pv1).s0), rc0); + (*pv1).s1 = SKEIN_ROT(as_uint2((*pv1).s1), rc1); + (*pv1).s2 = SKEIN_ROT(as_uint2((*pv1).s2), rc2); + (*pv1).s3 = SKEIN_ROT(as_uint2((*pv1).s3), rc3); + *pv1 ^= *pv0; +} + /* SkeinEvenRound: injects key h, tweak t and counter s into p, splits p into its even and odd lanes, runs four SkeinMix8 steps with fixed counts (lane shuffles between the steps) and returns the recombined 8-lane state. */ +ulong8 SkeinEvenRound(ulong8 p, const ulong8 h, const ulong *t, const uint s) +{ + SKEIN_INJECT_KEY(p, s); + ulong4 pv0 = p.even, pv1 = p.odd; + + SkeinMix8(&pv0, &pv1, 46, 36, 19, 37); + pv0 = shuffle(pv0, (ulong4)(1, 2, 3, 0)); + pv1 = shuffle(pv1, (ulong4)(0, 3, 2, 1)); + + SkeinMix8(&pv0, &pv1, 33, 27, 14, 42); + pv0 = shuffle(pv0, (ulong4)(1, 2, 3, 0)); + pv1 = shuffle(pv1, (ulong4)(0, 3, 2, 1)); + + SkeinMix8(&pv0, &pv1, 17, 49, 36, 39); + pv0 = shuffle(pv0, (ulong4)(1, 2, 3, 0)); + pv1 = shuffle(pv1, (ulong4)(0, 3, 2, 1)); + + SkeinMix8(&pv0, &pv1, 44, 9, 54, 56); + return(shuffle2(pv0, pv1, (ulong8)(1, 4, 2, 7, 3, 6, 0, 5))); +} + /* SkeinOddRound: same sequence as SkeinEvenRound with a different set of fixed counts. */ +ulong8 SkeinOddRound(ulong8 p, const ulong8 h, const ulong *t, const uint s) +{ + SKEIN_INJECT_KEY(p, s); + ulong4 pv0 = p.even, pv1 = p.odd; + + SkeinMix8(&pv0, &pv1, 39, 30, 34, 24); + pv0 = shuffle(pv0, (ulong4)(1, 2, 3, 0)); + pv1 = shuffle(pv1, (ulong4)(0, 3, 2, 1)); + + SkeinMix8(&pv0, &pv1, 13, 50, 10, 17); + pv0 = shuffle(pv0, (ulong4)(1, 2, 3, 0)); + pv1 = shuffle(pv1, (ulong4)(0, 3, 2, 1)); + + SkeinMix8(&pv0, &pv1, 25, 29, 39, 43); + pv0 = shuffle(pv0, (ulong4)(1, 2, 3, 0)); + pv1 = shuffle(pv1, (ulong4)(0, 
3, 2, 1)); + + SkeinMix8(&pv0, &pv1, 8, 35, 56, 22); + return(shuffle2(pv0, pv1, (ulong8)(1, 4, 2, 7, 3, 6, 0, 5))); +} + +ulong8 Skein512Block(ulong8 p, ulong8 h, ulong h8, const ulong *t) +{ + #pragma unroll + for(int i = 0; i < 18; ++i) + { + p = SkeinEvenRound(p, h, t, i); + ++i; + ulong tmp = h.s0; + h = shuffle(h, (ulong8)(1, 2, 3, 4, 5, 6, 7, 0)); + h.s7 = h8; + h8 = tmp; + p = SkeinOddRound(p, h, t, i); + tmp = h.s0; + h = shuffle(h, (ulong8)(1, 2, 3, 4, 5, 6, 7, 0)); + h.s7 = h8; + h8 = tmp; + } + + SKEIN_INJECT_KEY(p, 18); + return(p); +} + +#endif + +)===" diff --git a/xmrstak/backend/amd/autoAdjust.hpp b/xmrstak/backend/amd/autoAdjust.hpp new file mode 100644 index 0000000..dac0cfb --- /dev/null +++ b/xmrstak/backend/amd/autoAdjust.hpp @@ -0,0 +1,111 @@ + +#pragma once + +#include "amd_gpu/gpu.hpp" +#include "autoAdjust.hpp" +#include "jconf.hpp" + +#include "xmrstak/misc/console.hpp" +#include "xmrstak/misc/configEditor.hpp" +#include "xmrstak/params.hpp" + +#include <vector> +#include <cstdio> +#include <sstream> +#include <string> +#include <iostream> +#include <algorithm> + +#if defined(__APPLE__) +#include <OpenCL/cl.h> +#else +#include <CL/cl.h> +#endif + + +namespace xmrstak +{ +namespace amd +{ + +class autoAdjust +{ +public: + + autoAdjust() + { + + } + + /** print the adjusted values if needed + * + * Routine exit the application and print the adjusted values if needed else + * nothing is happened. + */ + bool printConfig() + { + int platformIndex = getAMDPlatformIdx(); + + if(platformIndex == -1) + { + printer::inst()->print_msg(L0,"WARNING: No AMD OpenCL platform found. 
Possible driver issues or wrong vendor driver."); + return false; + } + + devVec = getAMDDevices(0); + + + int deviceCount = devVec.size(); + + if(deviceCount == 0) + return false; + + + generateThreadConfig(platformIndex); + return true; + } + +private: + + void generateThreadConfig(const int platformIndex) + { + // load the template of the backend config into a char variable + const char *tpl = + #include "./config.tpl" + ; + + configEditor configTpl{}; + configTpl.set( std::string(tpl) ); + + std::string conf; + int i = 0; + for(auto& ctx : devVec) + { + // keep 64MiB memory free (value is randomly chosen) + size_t availableMem = ctx.freeMem - (64u * 1024 * 1024); + // 224byte extra memory is used per thread for meta data + size_t perThread = (size_t(1u)<<21) + 224u; + size_t max_intensity = availableMem / perThread; + // 1000 is a magic selected limit \todo select max intensity depending of the gpu type + size_t intensity = std::min( size_t(1000u) , max_intensity ); + conf += std::string(" // gpu: ") + ctx.name + "\n"; + // set 8 threads per block (this is a good value for the most gpus) + conf += std::string(" { \"index\" : ") + std::to_string(ctx.deviceIdx) + ",\n" + + " \"intensity\" : " + std::to_string(intensity) + ", \"worksize\" : " + std::to_string(8) + ",\n" + + " \"affine_to_cpu\" : false, \n" + " },\n"; + ++i; + } + + configTpl.replace("PLATFORMINDEX",std::to_string(platformIndex)); + configTpl.replace("NUMGPUS",std::to_string(devVec.size())); + configTpl.replace("GPUCONFIG",conf); + configTpl.write(params::inst().configFileAMD); + printer::inst()->print_msg(L0, "AMD: GPU configuration stored in file '%s'", params::inst().configFileAMD.c_str()); + } + + std::vector<GpuContext> devVec; +}; + +} // namespace amd +} // namepsace xmrstak diff --git a/xmrstak/backend/amd/config.tpl b/xmrstak/backend/amd/config.tpl new file mode 100644 index 0000000..b8b6dc4 --- /dev/null +++ b/xmrstak/backend/amd/config.tpl @@ -0,0 +1,29 @@ +R"===( + +/* + * Number of 
GPUs that you have in your system. Each GPU will get its own CPU thread. + */ +"gpu_thread_num" : NUMGPUS, + +/* + * GPU configuration. You should play around with intensity and worksize as the fastest settings will vary. + * index - GPU index number usually starts from 0 + * intensity - Number of parallel GPU threads (nothing to do with CPU threads) + * worksize - Number of local GPU threads (nothing to do with CPU threads) + * affine_to_cpu - This will affine the thread to a CPU. This can make a GPU miner play along nicer with a CPU miner. + * "gpu_threads_conf" : + * [ + * { "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false }, + * ], + */ + +"gpu_threads_conf" : [ +GPUCONFIG +], + +/* + * Platform index. This will be 0 unless you have different OpenCL platform - eg. AMD and Intel. + */ +"platform_index" : PLATFORMINDEX, + +)===" diff --git a/xmrstak/backend/amd/jconf.cpp b/xmrstak/backend/amd/jconf.cpp new file mode 100644 index 0000000..c2bf1fa --- /dev/null +++ b/xmrstak/backend/amd/jconf.cpp @@ -0,0 +1,259 @@ +/* + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ * + * Additional permission under GNU GPL version 3 section 7 + * + * If you modify this Program, or any covered work, by linking or combining + * it with OpenSSL (or a modified version of that library), containing parts + * covered by the terms of OpenSSL License and SSLeay License, the licensors + * of this Program grant you additional permission to convey the resulting work. + * + */ + + +#include "jconf.hpp" +#include "xmrstak/misc/jext.hpp" +#include "xmrstak/misc/console.hpp" + +#ifdef _WIN32 +#define strcasecmp _stricmp +#include <intrin.h> +#else +#include <cpuid.h> +#endif + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + + +namespace xmrstak +{ +namespace amd +{ + +using namespace rapidjson; + +/* + * This enum needs to match index in oConfigValues, otherwise we will get a runtime error + */ +enum configEnum { iGpuThreadNum, aGpuThreadsConf, iPlatformIdx }; + /* configVal: one expected top-level config entry - its enum id, its JSON key and the rapidjson type the value must have. */ +struct configVal { + configEnum iName; + const char* sName; + Type iType; +}; + +//Same order as in configEnum, as per comment above +configVal oConfigValues[] = { + { iGpuThreadNum, "gpu_thread_num", kNumberType }, + { aGpuThreadsConf, "gpu_threads_conf", kArrayType }, + { iPlatformIdx, "platform_index", kNumberType } +}; + +constexpr size_t iConfigCnt = (sizeof(oConfigValues)/sizeof(oConfigValues[0])); + /* checkType: true when have equals want; kTrueType and kFalseType are treated as interchangeable, so either boolean literal satisfies a boolean requirement. */ +inline bool checkType(Type have, Type want) +{ + if(want == have) + return true; + else if(want == kTrueType && have == kFalseType) + return true; + else if(want == kFalseType && have == kTrueType) + return true; + else + return false; +} + /* opaque_private: holds the parsed JSON document and one Value pointer per entry of oConfigValues, indexed by configEnum. The constructor leaves configValues uninitialised. */ +struct jconf::opaque_private +{ + Document jsonDoc; + const Value* configValues[iConfigCnt]; //Compile time constant + + opaque_private() + { + } +}; + +jconf* jconf::oInst = nullptr; + /* Allocates the private state; no matching delete is visible in this chunk. */ +jconf::jconf() +{ + prv = new opaque_private(); +} + /* GetThreadConfig: copies entry id of "gpu_threads_conf" into cfg. Returns false when id is out of range, the entry is not an object, any of index / intensity / worksize / affine_to_cpu is missing, the first three are not unsigned integers, or affine_to_cpu is neither an unsigned integer nor a boolean. Dereferences configValues[aGpuThreadsConf] without a null check - presumably only valid after a successful parse_config; confirm. */ +bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg) +{ + if(id >= prv->configValues[aGpuThreadsConf]->Size()) + return false; + + const Value& oThdConf = 
prv->configValues[aGpuThreadsConf]->GetArray()[id]; + + if(!oThdConf.IsObject()) + return false; + + const Value *idx, *intensity, *w_size, *aff; + idx = GetObjectMember(oThdConf, "index"); + intensity = GetObjectMember(oThdConf, "intensity"); + w_size = GetObjectMember(oThdConf, "worksize"); + aff = GetObjectMember(oThdConf, "affine_to_cpu"); + + if(idx == nullptr || intensity == nullptr || w_size == nullptr || aff == nullptr) + return false; + + if(!idx->IsUint64() || !intensity->IsUint64() || !w_size->IsUint64()) + return false; + + if(!aff->IsUint64() && !aff->IsBool()) + return false; + + cfg.index = idx->GetUint64(); + cfg.intensity = intensity->GetUint64(); + cfg.w_size = w_size->GetUint64(); + /* A numeric affine_to_cpu is stored as given; a boolean (true or false alike) is stored as -1. */ + if(aff->IsNumber()) + cfg.cpu_aff = aff->GetInt64(); + else + cfg.cpu_aff = -1; + + return true; +} + /* GetPlatformIdx: value of "platform_index". Dereferences configValues[iPlatformIdx] without a null check. */ +size_t jconf::GetPlatformIdx() +{ + return prv->configValues[iPlatformIdx]->GetUint64(); +} + /* GetThreadCount: number of entries in "gpu_threads_conf". Dereferences configValues[aGpuThreadsConf] without a null check. */ +size_t jconf::GetThreadCount() +{ + return prv->configValues[aGpuThreadsConf]->Size(); +} + +bool jconf::parse_config(const char* sFilename) +{ + FILE * pFile; + char * buffer; + size_t flen; + + pFile = fopen(sFilename, "rb"); + if (pFile == NULL) + { + printer::inst()->print_msg(L0, "Failed to open config file %s.", sFilename); + return false; + } + + fseek(pFile,0,SEEK_END); + flen = ftell(pFile); + rewind(pFile); + + if(flen >= 64*1024) + { + fclose(pFile); + printer::inst()->print_msg(L0, "Oversized config file - %s.", sFilename); + return false; + } + + if(flen <= 16) + { + /* NOTE(review): unlike the oversized-file branch above, this early return does not call fclose(pFile) - looks like a leaked file handle. */ printer::inst()->print_msg(L0, "File is empty or too short - %s.", sFilename); + return false; + } + + buffer = (char*)malloc(flen + 3); + if(fread(buffer+1, flen, 1, pFile) != 1) + { + free(buffer); + fclose(pFile); + printer::inst()->print_msg(L0, "Read error while reading %s.", sFilename); + return false; + } + fclose(pFile); + + //Replace Unicode BOM with spaces - we always use UTF-8 + unsigned char* ubuffer = (unsigned char*)buffer; + if(ubuffer[1] == 0xEF && ubuffer[2] == 0xBB && ubuffer[3] 
== 0xBF) + { + buffer[1] = ' '; + buffer[2] = ' '; + buffer[3] = ' '; + } + + buffer[0] = '{'; + buffer[flen] = '}'; + buffer[flen + 1] = '\0'; + + prv->jsonDoc.Parse<kParseCommentsFlag|kParseTrailingCommasFlag>(buffer, flen+2); + free(buffer); + + if(prv->jsonDoc.HasParseError()) + { + printer::inst()->print_msg(L0, "JSON config parse error(offset %llu): %s", + int_port(prv->jsonDoc.GetErrorOffset()), GetParseError_En(prv->jsonDoc.GetParseError())); + return false; + } + + + if(!prv->jsonDoc.IsObject()) + { //This should never happen as we created the root ourselves + printer::inst()->print_msg(L0, "Invalid config file. No root?\n"); + return false; + } + + for(size_t i = 0; i < iConfigCnt; i++) + { + if(oConfigValues[i].iName != i) + { + printer::inst()->print_msg(L0, "Code error. oConfigValues are not in order."); + return false; + } + + prv->configValues[i] = GetObjectMember(prv->jsonDoc, oConfigValues[i].sName); + + if(prv->configValues[i] == nullptr) + { + printer::inst()->print_msg(L0, "Invalid config file. Missing value \"%s\".", oConfigValues[i].sName); + return false; + } + + if(!checkType(prv->configValues[i]->GetType(), oConfigValues[i].iType)) + { + printer::inst()->print_msg(L0, "Invalid config file. Value \"%s\" has unexpected type.", oConfigValues[i].sName); + return false; + } + } + + size_t n_thd = prv->configValues[aGpuThreadsConf]->Size(); + if(prv->configValues[iGpuThreadNum]->GetUint64() != n_thd) + { + printer::inst()->print_msg(L0, + "Invalid config file. 
Your GPU config array has %llu members, while you want to use %llu threads.", + int_port(n_thd), int_port(prv->configValues[iGpuThreadNum]->GetUint64())); + return false; + } + + thd_cfg c; + for(size_t i=0; i < n_thd; i++) + { + if(!GetThreadConfig(i, c)) + { + printer::inst()->print_msg(L0, "Thread %llu has invalid config.", int_port(i)); + return false; + } + } + return true; +} + +} // namespace amd +} // namespace xmrstak diff --git a/xmrstak/backend/amd/jconf.hpp b/xmrstak/backend/amd/jconf.hpp new file mode 100644 index 0000000..da024a4 --- /dev/null +++ b/xmrstak/backend/amd/jconf.hpp @@ -0,0 +1,46 @@ +#pragma once + +#include "xmrstak/params.hpp" + +#include <stdlib.h> +#include <string> + +namespace xmrstak +{ +namespace amd +{ + +class jconf +{ +public: + static jconf* inst() + { + if (oInst == nullptr) oInst = new jconf; + return oInst; + }; + + bool parse_config(const char* sFilename = params::inst().configFileAMD.c_str()); + + struct thd_cfg { + size_t index; + size_t intensity; + size_t w_size; + long long cpu_aff; + }; + + size_t GetThreadCount(); + bool GetThreadConfig(size_t id, thd_cfg &cfg); + + size_t GetPlatformIdx(); + +private: + jconf(); + static jconf* oInst; + + struct opaque_private; + opaque_private* prv; + +}; + +} // namespace amd +} // namespace xmrstak diff --git a/xmrstak/backend/amd/minethd.cpp b/xmrstak/backend/amd/minethd.cpp new file mode 100644 index 0000000..5f36428 --- /dev/null +++ b/xmrstak/backend/amd/minethd.cpp @@ -0,0 +1,234 @@ +/* + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * Additional permission under GNU GPL version 3 section 7 + * + * If you modify this Program, or any covered work, by linking or combining + * it with OpenSSL (or a modified version of that library), containing parts + * covered by the terms of OpenSSL License and SSLeay License, the licensors + * of this Program grant you additional permission to convey the resulting work. + * + */ + +#include "minethd.hpp" +#include "autoAdjust.hpp" +#include "amd_gpu/gpu.hpp" + +#include "xmrstak/backend/cpu/crypto/cryptonight_aesni.h" +#include "xmrstak/backend/cpu/crypto/cryptonight.h" +#include "xmrstak/misc/configEditor.hpp" +#include "xmrstak/misc/console.hpp" +#include "xmrstak/backend/cpu/minethd.hpp" +#include "xmrstak/jconf.hpp" +#include "xmrstak/misc/executor.hpp" +#include "xmrstak/misc/environment.hpp" +#include "xmrstak/params.hpp" + +#include <assert.h> +#include <cmath> +#include <chrono> +#include <thread> +#include <vector> + +namespace xmrstak +{ +namespace amd +{ + +minethd::minethd(miner_work& pWork, size_t iNo, GpuContext* ctx) +{ + oWork = pWork; + bQuit = 0; + iThreadNo = (uint8_t)iNo; + iJobNo = 0; + iHashCount = 0; + iTimestamp = 0; + pGpuCtx = ctx; + + oWorkThd = std::thread(&minethd::work_main, this); +} + +extern "C" { +#ifdef WIN32 +__declspec(dllexport) +#endif +std::vector<iBackend*>* xmrstak_start_backend(uint32_t threadOffset, miner_work& pWork, environment& env) +{ + environment::inst() = env; + return amd::minethd::thread_starter(threadOffset, pWork); +} +} // extern "C" + +bool minethd::init_gpus() +{ + size_t i, n = jconf::inst()->GetThreadCount(); + + printer::inst()->print_msg(L1, "Compiling code and initializing GPUs. 
This will take a while..."); + vGpuData.resize(n); + + jconf::thd_cfg cfg; + for(i = 0; i < n; i++) + { + jconf::inst()->GetThreadConfig(i, cfg); + vGpuData[i].deviceIdx = cfg.index; + vGpuData[i].rawIntensity = cfg.intensity; + vGpuData[i].workSize = cfg.w_size; + } + + return InitOpenCL(vGpuData.data(), n, jconf::inst()->GetPlatformIdx()) == ERR_SUCCESS; +} + +std::vector<GpuContext> minethd::vGpuData; + +std::vector<iBackend*>* minethd::thread_starter(uint32_t threadOffset, miner_work& pWork) +{ + std::vector<iBackend*>* pvThreads = new std::vector<iBackend*>(); + + if(!configEditor::file_exist(params::inst().configFileAMD)) + { + autoAdjust adjust; + if(!adjust.printConfig()) + return pvThreads; + } + + if(!jconf::inst()->parse_config()) + { + win_exit(); + } + + // \ todo get device count and exit if no opencl device + + if(!init_gpus()) + { + printer::inst()->print_msg(L1, "WARNING: AMD device not found"); + return pvThreads; + } + + size_t i, n = jconf::inst()->GetThreadCount(); + pvThreads->reserve(n); + + jconf::thd_cfg cfg; + for (i = 0; i < n; i++) + { + jconf::inst()->GetThreadConfig(i, cfg); + minethd* thd = new minethd(pWork, i + threadOffset, &vGpuData[i]); + + if(cfg.cpu_aff >= 0) + { +#if defined(__APPLE__) + printer::inst()->print_msg(L1, "WARNING on MacOS thread affinity is only advisory."); +#endif + cpu::minethd::thd_setaffinity(thd->oWorkThd.native_handle(), cfg.cpu_aff); + } + + pvThreads->push_back(thd); + if(cfg.cpu_aff >= 0) + printer::inst()->print_msg(L1, "Starting GPU thread, affinity: %d.", (int)cfg.cpu_aff); + else + printer::inst()->print_msg(L1, "Starting GPU thread, no affinity."); + } + + return pvThreads; +} + +void minethd::switch_work(miner_work& pWork) +{ + // iConsumeCnt is a basic lock-like polling mechanism just in case we happen to push work + // faster than threads can consume them. This should never happen in real life. + // Pool cant physically send jobs faster than every 250ms or so due to net latency. 
+ + while (globalStates::inst().iConsumeCnt.load(std::memory_order_seq_cst) < globalStates::inst().iThreadCount) + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + + globalStates::inst().oGlobalWork = pWork; + globalStates::inst().iConsumeCnt.store(0, std::memory_order_seq_cst); + globalStates::inst().iGlobalJobNo++; +} + +void minethd::consume_work() +{ + memcpy(&oWork, &globalStates::inst().oGlobalWork, sizeof(miner_work)); + iJobNo++; + globalStates::inst().iConsumeCnt++; + +} + +void minethd::work_main() +{ + uint64_t iCount = 0; + + cryptonight_ctx* cpu_ctx; + cpu_ctx = cpu::minethd::minethd_alloc_ctx(); + cn_hash_fun hash_fun = cpu::minethd::func_selector(::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/); + + globalStates::inst().iConsumeCnt++; + + while (bQuit == 0) + { + if (oWork.bStall) + { + /* We are stalled here because the executor didn't find a job for us yet, + either because of network latency, or a socket problem. Since we are + raison d'etre of this software it us sensible to just wait until we have something*/ + + while (globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo) + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + + consume_work(); + continue; + } + + assert(sizeof(job_result::sJobID) == sizeof(pool_job::sJobID)); + pGpuCtx->Nonce = calc_start_nonce(oWork.iResumeCnt); + uint32_t target = oWork.iTarget32; + XMRSetJob(pGpuCtx, oWork.bWorkBlob, oWork.iWorkSize, target); + + while(globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo) + { + cl_uint results[0x100]; + memset(results,0,sizeof(cl_uint)*(0x100)); + + XMRRunJob(pGpuCtx, results); + + for(size_t i = 0; i < results[0xFF]; i++) + { + uint8_t bWorkBlob[112]; + uint8_t bResult[32]; + + memcpy(bWorkBlob, oWork.bWorkBlob, oWork.iWorkSize); + memset(bResult, 0, sizeof(job_result::bResult)); + + *(uint32_t*)(bWorkBlob + 39) = results[i]; + + hash_fun(bWorkBlob, oWork.iWorkSize, bResult, cpu_ctx); + if ( 
(*((uint64_t*)(bResult + 24))) < oWork.iTarget) + executor::inst()->push_event(ex_event(job_result(oWork.sJobID, results[i], bResult), oWork.iPoolId)); + else + executor::inst()->log_result_error("AMD Invalid Result"); + } + + iCount += pGpuCtx->rawIntensity; + using namespace std::chrono; + uint64_t iStamp = time_point_cast<milliseconds>(high_resolution_clock::now()).time_since_epoch().count(); + iHashCount.store(iCount, std::memory_order_relaxed); + iTimestamp.store(iStamp, std::memory_order_relaxed); + std::this_thread::yield(); + } + + consume_work(); + } +} + +} // namespace amd +} // namespace xmrstak diff --git a/xmrstak/backend/amd/minethd.hpp b/xmrstak/backend/amd/minethd.hpp new file mode 100644 index 0000000..21c2dd9 --- /dev/null +++ b/xmrstak/backend/amd/minethd.hpp @@ -0,0 +1,54 @@ +#pragma once + +#include "amd_gpu/gpu.hpp" +#include "jconf.hpp" +#include "xmrstak/backend/cpu/crypto/cryptonight.h" +#include "xmrstak/backend/miner_work.hpp" +#include "xmrstak/backend/iBackend.hpp" +#include "xmrstak/misc/environment.hpp" + +#include <thread> +#include <atomic> + +namespace xmrstak +{ +namespace amd +{ + +class minethd : public iBackend +{ +public: + + static void switch_work(miner_work& pWork); + static std::vector<iBackend*>* thread_starter(uint32_t threadOffset, miner_work& pWork); + static bool init_gpus(); + +private: + typedef void (*cn_hash_fun)(const void*, size_t, void*, cryptonight_ctx*); + + minethd(miner_work& pWork, size_t iNo, GpuContext* ctx); + + void work_main(); + void double_work_main(); + void consume_work(); + + uint64_t iJobNo; + + static miner_work oGlobalWork; + miner_work oWork; + + std::thread oWorkThd; + + bool bQuit; + bool bNoPrefetch; + + //Mutable ptr to vector below, different for each thread + GpuContext* pGpuCtx; + + // WARNING - this vector (but not its contents) must be immutable + // once the threads are started + static std::vector<GpuContext> vGpuData; +}; + +} // namespace amd +} // namespace xmrstak diff --git 
a/xmrstak/backend/backendConnector.cpp b/xmrstak/backend/backendConnector.cpp new file mode 100644 index 0000000..6106267 --- /dev/null +++ b/xmrstak/backend/backendConnector.cpp @@ -0,0 +1,103 @@ +/* + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * Additional permission under GNU GPL version 3 section 7 + * + * If you modify this Program, or any covered work, by linking or combining + * it with OpenSSL (or a modified version of that library), containing parts + * covered by the terms of OpenSSL License and SSLeay License, the licensors + * of this Program grant you additional permission to convey the resulting work. 
+ * + */ + +#include "iBackend.hpp" +#include "backendConnector.hpp" +#include "miner_work.hpp" +#include "globalStates.hpp" +#include "plugin.hpp" +#include "xmrstak/misc/environment.hpp" +#include "xmrstak/misc/console.hpp" +#include "xmrstak/params.hpp" + +#include "cpu/minethd.hpp" +#ifndef CONF_NO_CUDA +# include "nvidia/minethd.hpp" +#endif +#ifndef CONF_NO_OPENCL +# include "amd/minethd.hpp" +#endif + +#include <cstdlib> +#include <assert.h> +#include <cmath> +#include <chrono> +#include <cstring> +#include <thread> +#include <bitset> + + +namespace xmrstak +{ + +bool BackendConnector::self_test() +{ + + return true; +} + +std::vector<iBackend*>* BackendConnector::thread_starter(miner_work& pWork) +{ + globalStates::inst().iGlobalJobNo = 0; + globalStates::inst().iConsumeCnt = 0; + + + std::vector<iBackend*>* pvThreads = new std::vector<iBackend*>; + +#ifndef CONF_NO_CUDA + if(params::inst().useNVIDIA) + { + plugin nvidiaplugin("NVIDIA", "xmrstak_cuda_backend"); + std::vector<iBackend*>* nvidiaThreads = nvidiaplugin.startBackend(static_cast<uint32_t>(pvThreads->size()), pWork, environment::inst()); + pvThreads->insert(std::end(*pvThreads), std::begin(*nvidiaThreads), std::end(*nvidiaThreads)); + if(nvidiaThreads->size() == 0) + printer::inst()->print_msg(L0, "WARNING: backend NVIDIA disabled."); + } +#endif + +#ifndef CONF_NO_OPENCL + if(params::inst().useAMD) + { + plugin amdplugin("AMD", "xmrstak_opencl_backend"); + std::vector<iBackend*>* amdThreads = amdplugin.startBackend(static_cast<uint32_t>(pvThreads->size()), pWork, environment::inst()); + pvThreads->insert(std::end(*pvThreads), std::begin(*amdThreads), std::end(*amdThreads)); + if(amdThreads->size() == 0) + printer::inst()->print_msg(L0, "WARNING: backend AMD disabled."); + } +#endif + +#ifndef CONF_NO_CPU + if(params::inst().useCPU) + { + auto cpuThreads = cpu::minethd::thread_starter(static_cast<uint32_t>(pvThreads->size()), pWork); + pvThreads->insert(std::end(*pvThreads), 
std::begin(cpuThreads), std::end(cpuThreads)); + if(cpuThreads.size() == 0) + printer::inst()->print_msg(L0, "WARNING: backend CPU disabled."); + } +#endif + + globalStates::inst().iThreadCount = pvThreads->size(); + return pvThreads; +} + +} // namespace xmrstak diff --git a/xmrstak/backend/backendConnector.hpp b/xmrstak/backend/backendConnector.hpp new file mode 100644 index 0000000..da3dc77 --- /dev/null +++ b/xmrstak/backend/backendConnector.hpp @@ -0,0 +1,21 @@ +#pragma once + +#include "iBackend.hpp" +#include "miner_work.hpp" + +#include <thread> +#include <vector> +#include <atomic> +#include <mutex> + + +namespace xmrstak +{ + + struct BackendConnector + { + static std::vector<iBackend*>* thread_starter(miner_work& pWork); + static bool self_test(); + }; + +} // namespace xmrstak diff --git a/xmrstak/backend/cpu/autoAdjust.hpp b/xmrstak/backend/cpu/autoAdjust.hpp new file mode 100644 index 0000000..294bc6f --- /dev/null +++ b/xmrstak/backend/cpu/autoAdjust.hpp @@ -0,0 +1,174 @@ +#pragma once + +#include "jconf.hpp" + +#include "xmrstak/misc/console.hpp" +#include "xmrstak/jconf.hpp" +#include "xmrstak/misc/configEditor.hpp" +#include "xmrstak/params.hpp" +#include <string> + +#ifdef _WIN32 +#include <windows.h> +#else +#include <unistd.h> +#endif // _WIN32 + + +namespace xmrstak +{ +namespace cpu +{ +// Extract bits h..l (both ends inclusive) of val and return them shifted down to bit 0 +// This enables us to put in values exactly like in the manual +// For example EBX[31:22] is get_masked(cpu_info[1], 31, 22) +inline int32_t get_masked(int32_t val, int32_t h, int32_t l) +{ + const uint32_t mask = (0xFFFFFFFFu >> (31-(h-l))) << l; /* h-l+1 one-bits at position l; unsigned so the shifts are well defined */ + return (int32_t)(((uint32_t)val & mask) >> l); +} + +class autoAdjust +{ +public: + + autoAdjust() + { + } + + bool printConfig() + { + + configEditor configTpl{}; + + // load the template of the backend config into a char variable + const char *tpl = + #include "./config.tpl" + ; + configTpl.set( std::string(tpl) ); + + std::string conf; + + if(!detectL3Size() || L3KB_size < 1024 || L3KB_size > 102400) + { + 
if(L3KB_size < 1024 || L3KB_size > 102400) + printer::inst()->print_msg(L0, "Autoconf failed: L3 size sanity check failed - %u KB.", L3KB_size); + + conf += std::string(" { \"low_power_mode\" : false, \"no_prefetch\" : true, \"affine_to_cpu\" : false },\n"); + printer::inst()->print_msg(L0, "Autoconf FAILED. Create config for a single thread. Please try to add new ones until the hashrate slows down."); + } + else + { + printer::inst()->print_msg(L0, "Autoconf L3 size detected at %u KB.", L3KB_size); + + detectCPUConf(); + + printer::inst()->print_msg(L0, "Autoconf core count detected as %u on %s.", corecnt, + linux_layout ? "Linux" : "Windows"); + + uint32_t aff_id = 0; + for(uint32_t i=0; i < corecnt; i++) + { + bool double_mode; + + if(L3KB_size <= 0) + break; + + double_mode = L3KB_size / 2048 > (int32_t)(corecnt-i); + + conf += std::string(" { \"low_power_mode\" : "); + conf += std::string(double_mode ? "true" : "false"); + conf += std::string(", \"no_prefetch\" : true, \"affine_to_cpu\" : "); + conf += std::to_string(aff_id); + conf += std::string(" },\n"); + + if(!linux_layout || old_amd) + { + aff_id += 2; + + if(aff_id >= corecnt) + aff_id = 1; + } + else + aff_id++; + + if(double_mode) + L3KB_size -= 4096; + else + L3KB_size -= 2048; + } + } + + configTpl.replace("CPUCONFIG",conf); + configTpl.write(params::inst().configFileCPU); + printer::inst()->print_msg(L0, "CPU configuration stored in file '%s'", params::inst().configFileCPU.c_str()); + + return true; + } + +private: + bool detectL3Size() + { + int32_t cpu_info[4]; + char cpustr[13] = {0}; + + ::jconf::cpuid(0, 0, cpu_info); + memcpy(cpustr, &cpu_info[1], 4); + memcpy(cpustr+4, &cpu_info[3], 4); + memcpy(cpustr+8, &cpu_info[2], 4); + + if(strcmp(cpustr, "GenuineIntel") == 0) + { + ::jconf::cpuid(4, 3, cpu_info); + + if(get_masked(cpu_info[0], 7, 5) != 3) + { + printer::inst()->print_msg(L0, "Autoconf failed: Couln't find L3 cache page."); + return false; + } + + L3KB_size = ((get_masked(cpu_info[1], 
31, 22) + 1) * (get_masked(cpu_info[1], 21, 12) + 1) * + (get_masked(cpu_info[1], 11, 0) + 1) * (cpu_info[2] + 1)) / 1024; + + return true; + } + else if(strcmp(cpustr, "AuthenticAMD") == 0) + { + ::jconf::cpuid(0x80000006, 0, cpu_info); + + L3KB_size = get_masked(cpu_info[3], 31, 18) * 512; + + ::jconf::cpuid(1, 0, cpu_info); + int32_t family = (cpu_info[0] >> 8) & 0xF; /* base family, EAX[11:8] */ + if(family == 0xF) + family += (cpu_info[0] >> 20) & 0xFF; /* extended family, EAX[27:20], only counts when the base family is 0xF */ + if(family < 0x17) //0x17h is Zen + old_amd = true; + + return true; + } + else + { + printer::inst()->print_msg(L0, "Autoconf failed: Unknown CPU type: %s.", cpustr); + return false; + } + } + + void detectCPUConf() + { +#ifdef _WIN32 + SYSTEM_INFO info; + GetSystemInfo(&info); + corecnt = info.dwNumberOfProcessors; + linux_layout = false; +#else + corecnt = sysconf(_SC_NPROCESSORS_ONLN); + linux_layout = true; +#endif // _WIN32 + } + + int32_t L3KB_size = 0; + uint32_t corecnt; + bool old_amd = false; + bool linux_layout; +}; + +} // namespace cpu +} // namespace xmrstak diff --git a/autoAdjustHwloc.hpp b/xmrstak/backend/cpu/autoAdjustHwloc.hpp index 92a668a..ad3b863 100644 --- a/autoAdjustHwloc.hpp +++ b/xmrstak/backend/cpu/autoAdjustHwloc.hpp @@ -1,8 +1,8 @@ #pragma once -#include "console.h" -#include <hwloc.h> -#include <stdio.h> +#include "xmrstak/misc/console.hpp" +#include "xmrstak/misc/configEditor.hpp" +#include "xmrstak/params.hpp" #ifdef _WIN32 #include <windows.h> @@ -10,6 +10,17 @@ #include <unistd.h> #endif // _WIN32 +#include <string> + +#include <hwloc.h> +#include <stdio.h> + + +namespace xmrstak +{ +namespace cpu +{ + class autoAdjust { public: @@ -18,17 +29,22 @@ public: { } - void printConfig() + bool printConfig() { - printer::inst()->print_str("The configuration for 'cpu_threads_conf' in your config file is 'null'.\n"); - printer::inst()->print_str("The miner evaluates your system and prints a suggestion for the section `cpu_threads_conf` to the terminal.\n"); - printer::inst()->print_str("The values are not optimal, please try to tweak the values based on notes in config.txt.\n"); - 
printer::inst()->print_str("Please copy & paste the block within the asterisks to your config.\n\n"); - + hwloc_topology_t topology; hwloc_topology_init(&topology); hwloc_topology_load(topology); + std::string conf; + configEditor configTpl{}; + + // load the template of the backend config into a char variable + const char *tpl = + #include "./config.tpl" + ; + configTpl.set( std::string(tpl) ); + try { std::vector<hwloc_obj_t> tlcs; @@ -43,32 +59,30 @@ public: for(hwloc_obj_t obj : tlcs) proccessTopLevelCache(obj); - - printer::inst()->print_str("\n**************** Copy&Paste BEGIN ****************\n\n"); - printer::inst()->print_str("\"cpu_threads_conf\" :\n[\n"); - + for(uint32_t id : results) { - char str[128]; - snprintf(str, sizeof(str), " { \"low_power_mode\" : %s, \"no_prefetch\" : true, \"affine_to_cpu\" : %u },\n", - (id & 0x8000000) != 0 ? "true" : "false", id & 0x7FFFFFF); - printer::inst()->print_str(str); + conf += std::string(" { \"low_power_mode\" : "); + conf += std::string((id & 0x8000000) != 0 ? "true" : "false"); + conf += std::string(", \"no_prefetch\" : true, \"affine_to_cpu\" : "); + conf += std::to_string(id & 0x7FFFFFF); + conf += std::string(" },\n"); } - - printer::inst()->print_str("],\n\n**************** Copy&Paste END ****************\n"); } catch(const std::runtime_error& err) { - printer::inst()->print_msg(L0, "Autoconf FAILED: %s", err.what()); - printer::inst()->print_str("\nPrinting config for a single thread. 
Please try to add new ones until the hashrate slows down.\n"); - printer::inst()->print_str("\n**************** FAILURE Copy&Paste BEGIN ****************\n\n"); - printer::inst()->print_str("\"cpu_threads_conf\" :\n[\n"); - printer::inst()->print_str(" { \"low_power_mode\" : false, \"no_prefetch\" : true, \"affine_to_cpu\" : false },\n"); - printer::inst()->print_str("],\n\n**************** FAILURE Copy&Paste END ****************\n"); + // \todo add fallback to default auto adjust + conf += std::string(" { \"low_power_mode\" : false, \"no_prefetch\" : true, \"affine_to_cpu\" : false },\n"); + printer::inst()->print_msg(L0, "Autoconf FAILED: %s. Create config for a single thread.", err.what()); } + configTpl.replace("CPUCONFIG",conf); + configTpl.write(params::inst().configFileCPU); + printer::inst()->print_msg(L0, "CPU configuration stored in file '%s'", params::inst().configFileCPU.c_str()); /* Destroy topology object. */ hwloc_topology_destroy(topology); + + return true; } private: @@ -193,3 +207,6 @@ private: } } }; + +} // namespace cpu +} // namepsace xmrstak diff --git a/xmrstak/backend/cpu/config.tpl b/xmrstak/backend/cpu/config.tpl new file mode 100644 index 0000000..990a31d --- /dev/null +++ b/xmrstak/backend/cpu/config.tpl @@ -0,0 +1,32 @@ +R"===( +/* + * Thread configuration for each thread. Make sure it matches the number above. + * low_power_mode - This mode will double the cache usage, and double the single thread performance. It will + * consume much less power (as less cores are working), but will max out at around 80-85% of + * the maximum performance. + * + * no_prefetch - Some sytems can gain up to extra 5% here, but sometimes it will have no difference or make + * things slower. + * + * affine_to_cpu - This can be either false (no affinity), or the CPU core number. Note that on hyperthreading + * systems it is better to assign threads to physical cores. On Windows this usually means selecting + * even or odd numbered cpu numbers. 
For Linux it will be usually the lower CPU numbers, so for a 4 + * physical core CPU you should select cpu numbers 0-3. + * + * On the first run the miner will look at your system and suggest a basic configuration that will work, + * you can try to tweak it from there to get the best performance. + * + * A filled out configuration should look like this: + * "cpu_threads_conf" : + * [ + * { "low_power_mode" : false, "no_prefetch" : true, "affine_to_cpu" : 0 }, + * { "low_power_mode" : false, "no_prefetch" : true, "affine_to_cpu" : 1 }, + * ], + */ + +"cpu_threads_conf" : +[ +CPUCONFIG +], + +)===" diff --git a/crypto/c_blake256.c b/xmrstak/backend/cpu/crypto/c_blake256.c index ff623dd..ff623dd 100644 --- a/crypto/c_blake256.c +++ b/xmrstak/backend/cpu/crypto/c_blake256.c diff --git a/crypto/c_blake256.h b/xmrstak/backend/cpu/crypto/c_blake256.h index b9c2aad..b9c2aad 100644 --- a/crypto/c_blake256.h +++ b/xmrstak/backend/cpu/crypto/c_blake256.h diff --git a/crypto/c_groestl.c b/xmrstak/backend/cpu/crypto/c_groestl.c index 1318d5a..1318d5a 100644 --- a/crypto/c_groestl.c +++ b/xmrstak/backend/cpu/crypto/c_groestl.c diff --git a/crypto/c_groestl.h b/xmrstak/backend/cpu/crypto/c_groestl.h index 2b51339..2b51339 100644 --- a/crypto/c_groestl.h +++ b/xmrstak/backend/cpu/crypto/c_groestl.h diff --git a/crypto/c_jh.c b/xmrstak/backend/cpu/crypto/c_jh.c index 9d685a0..9d685a0 100644 --- a/crypto/c_jh.c +++ b/xmrstak/backend/cpu/crypto/c_jh.c diff --git a/crypto/c_jh.h b/xmrstak/backend/cpu/crypto/c_jh.h index d10d40f..d10d40f 100644 --- a/crypto/c_jh.h +++ b/xmrstak/backend/cpu/crypto/c_jh.h diff --git a/crypto/c_keccak.c b/xmrstak/backend/cpu/crypto/c_keccak.c index eadb85b..eadb85b 100644 --- a/crypto/c_keccak.c +++ b/xmrstak/backend/cpu/crypto/c_keccak.c diff --git a/crypto/c_keccak.h b/xmrstak/backend/cpu/crypto/c_keccak.h index 4f7f857..4f7f857 100644 --- a/crypto/c_keccak.h +++ b/xmrstak/backend/cpu/crypto/c_keccak.h diff --git a/crypto/c_skein.c 
b/xmrstak/backend/cpu/crypto/c_skein.c index 2453713..2453713 100644 --- a/crypto/c_skein.c +++ b/xmrstak/backend/cpu/crypto/c_skein.c diff --git a/crypto/c_skein.h b/xmrstak/backend/cpu/crypto/c_skein.h index 6165a2a..6165a2a 100644 --- a/crypto/c_skein.h +++ b/xmrstak/backend/cpu/crypto/c_skein.h diff --git a/crypto/cryptonight.h b/xmrstak/backend/cpu/crypto/cryptonight.h index 978c798..978c798 100644 --- a/crypto/cryptonight.h +++ b/xmrstak/backend/cpu/crypto/cryptonight.h diff --git a/crypto/cryptonight_aesni.h b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h index 8bbb27c..8bbb27c 100644 --- a/crypto/cryptonight_aesni.h +++ b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h diff --git a/crypto/cryptonight_common.cpp b/xmrstak/backend/cpu/crypto/cryptonight_common.cpp index 9d03ed7..9d03ed7 100644 --- a/crypto/cryptonight_common.cpp +++ b/xmrstak/backend/cpu/crypto/cryptonight_common.cpp diff --git a/crypto/groestl_tables.h b/xmrstak/backend/cpu/crypto/groestl_tables.h index a23295c..a23295c 100644 --- a/crypto/groestl_tables.h +++ b/xmrstak/backend/cpu/crypto/groestl_tables.h diff --git a/crypto/hash.h b/xmrstak/backend/cpu/crypto/hash.h index c12d355..c12d355 100644 --- a/crypto/hash.h +++ b/xmrstak/backend/cpu/crypto/hash.h diff --git a/crypto/int-util.h b/xmrstak/backend/cpu/crypto/int-util.h index 8748976..8748976 100644 --- a/crypto/int-util.h +++ b/xmrstak/backend/cpu/crypto/int-util.h diff --git a/crypto/skein_port.h b/xmrstak/backend/cpu/crypto/skein_port.h index 9cbefcb..9cbefcb 100644 --- a/crypto/skein_port.h +++ b/xmrstak/backend/cpu/crypto/skein_port.h diff --git a/crypto/soft_aes.c b/xmrstak/backend/cpu/crypto/soft_aes.c index aba7c20..aba7c20 100644 --- a/crypto/soft_aes.c +++ b/xmrstak/backend/cpu/crypto/soft_aes.c diff --git a/hwlocMemory.hpp b/xmrstak/backend/cpu/hwlocMemory.hpp index f471951..719c1bb 100644 --- a/hwlocMemory.hpp +++ b/xmrstak/backend/cpu/hwlocMemory.hpp @@ -1,6 +1,6 @@ #pragma once -#include "console.h" +#include 
"xmrstak/misc/console.hpp" #ifndef CONF_NO_HWLOC diff --git a/xmrstak/backend/cpu/jconf.cpp b/xmrstak/backend/cpu/jconf.cpp new file mode 100644 index 0000000..2ded8c0 --- /dev/null +++ b/xmrstak/backend/cpu/jconf.cpp @@ -0,0 +1,255 @@ +/* + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * Additional permission under GNU GPL version 3 section 7 + * + * If you modify this Program, or any covered work, by linking or combining + * it with OpenSSL (or a modified version of that library), containing parts + * covered by the terms of OpenSSL License and SSLeay License, the licensors + * of this Program grant you additional permission to convey the resulting work. 
+ * + */ + +#include "jconf.hpp" +#include "xmrstak/misc/console.hpp" +#include "xmrstak/misc/jext.hpp" + +#include <iostream> +#include <stdio.h> +#include <stdlib.h> +#include <string> + +#ifdef _WIN32 +#define strcasecmp _stricmp +#include <intrin.h> +#else +#include <cpuid.h> +#endif + + +namespace xmrstak +{ +namespace cpu +{ + +using namespace rapidjson; + +/* + * This enum needs to match index in oConfigValues, otherwise we will get a runtime error + */ +enum configEnum { aCpuThreadsConf, sUseSlowMem }; + +struct configVal { + configEnum iName; + const char* sName; + Type iType; +}; + +// Same order as in configEnum, as per comment above +// kNullType means any type +configVal oConfigValues[] = { + { aCpuThreadsConf, "cpu_threads_conf", kNullType } +}; + +constexpr size_t iConfigCnt = (sizeof(oConfigValues)/sizeof(oConfigValues[0])); + +inline bool checkType(Type have, Type want) +{ + if(want == have) + return true; + else if(want == kNullType) + return true; + else if(want == kTrueType && have == kFalseType) + return true; + else if(want == kFalseType && have == kTrueType) + return true; + else + return false; +} + +struct jconf::opaque_private +{ + Document jsonDoc; + const Value* configValues[iConfigCnt]; //Compile time constant + + opaque_private() + { + } +}; + +jconf* jconf::oInst = nullptr; + +jconf::jconf() +{ + prv = new opaque_private(); +} + +bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg) +{ + if(!prv->configValues[aCpuThreadsConf]->IsArray()) + return false; + + if(id >= prv->configValues[aCpuThreadsConf]->Size()) + return false; + + const Value& oThdConf = prv->configValues[aCpuThreadsConf]->GetArray()[id]; + + if(!oThdConf.IsObject()) + return false; + + const Value *mode, *no_prefetch, *aff; + mode = GetObjectMember(oThdConf, "low_power_mode"); + no_prefetch = GetObjectMember(oThdConf, "no_prefetch"); + aff = GetObjectMember(oThdConf, "affine_to_cpu"); + + if(mode == nullptr || no_prefetch == nullptr || aff == nullptr) + return false; + 
+ if(!mode->IsBool() || !no_prefetch->IsBool()) + return false; + + if(!aff->IsNumber() && !aff->IsBool()) + return false; + + if(aff->IsNumber() && aff->GetInt64() < 0) + return false; + + cfg.bDoubleMode = mode->GetBool(); + cfg.bNoPrefetch = no_prefetch->GetBool(); + + if(aff->IsNumber()) + cfg.iCpuAff = aff->GetInt64(); + else + cfg.iCpuAff = -1; + + return true; +} + + +size_t jconf::GetThreadCount() +{ + if(prv->configValues[aCpuThreadsConf]->IsArray()) + return prv->configValues[aCpuThreadsConf]->Size(); + else + return 0; +} + +bool jconf::parse_config(const char* sFilename) +{ + FILE * pFile; + char * buffer; + size_t flen; + + pFile = fopen(sFilename, "rb"); + if (pFile == NULL) + { + printer::inst()->print_msg(L0, "Failed to open config file %s.", sFilename); + return false; + } + + fseek(pFile,0,SEEK_END); + flen = ftell(pFile); + rewind(pFile); + + if(flen >= 64*1024) + { + fclose(pFile); + printer::inst()->print_msg(L0, "Oversized config file - %s.", sFilename); + return false; + } + + if(flen <= 16) + { + fclose(pFile); + printer::inst()->print_msg(L0, "File is empty or too short - %s.", sFilename); + return false; + } + + buffer = (char*)malloc(flen + 3); + if(fread(buffer+1, flen, 1, pFile) != 1) + { + free(buffer); + fclose(pFile); + printer::inst()->print_msg(L0, "Read error while reading %s.", sFilename); + return false; + } + fclose(pFile); + + //Replace Unicode BOM with spaces - we always use UTF-8 + unsigned char* ubuffer = (unsigned char*)buffer; + if(ubuffer[1] == 0xEF && ubuffer[2] == 0xBB && ubuffer[3] == 0xBF) + { + buffer[1] = ' '; + buffer[2] = ' '; + buffer[3] = ' '; + } + + buffer[0] = '{'; + buffer[flen] = '}'; + buffer[flen + 1] = '\0'; + + prv->jsonDoc.Parse<kParseCommentsFlag|kParseTrailingCommasFlag>(buffer, flen+2); + free(buffer); + + if(prv->jsonDoc.HasParseError()) + { + printer::inst()->print_msg(L0, "JSON config parse error(offset %llu): %s", + int_port(prv->jsonDoc.GetErrorOffset()), 
GetParseError_En(prv->jsonDoc.GetParseError())); + return false; + } + + if(!prv->jsonDoc.IsObject()) + { //This should never happen as we created the root ourselves + printer::inst()->print_msg(L0, "Invalid config file. No root?\n"); + return false; + } + + for(size_t i = 0; i < iConfigCnt; i++) + { + if(oConfigValues[i].iName != i) + { + printer::inst()->print_msg(L0, "Code error. oConfigValues are not in order."); + return false; + } + + prv->configValues[i] = GetObjectMember(prv->jsonDoc, oConfigValues[i].sName); + + if(prv->configValues[i] == nullptr) + { + printer::inst()->print_msg(L0, "Invalid config file. Missing value \"%s\".", oConfigValues[i].sName); + return false; + } + + if(!checkType(prv->configValues[i]->GetType(), oConfigValues[i].iType)) + { + printer::inst()->print_msg(L0, "Invalid config file. Value \"%s\" has unexpected type.", oConfigValues[i].sName); + return false; + } + } + + thd_cfg c; + for(size_t i=0; i < GetThreadCount(); i++) + { + if(!GetThreadConfig(i, c)) + { + printer::inst()->print_msg(L0, "Thread %llu has invalid config.", int_port(i)); + return false; + } + } + + return true; +} + +} // namespace cpu +} // namepsace xmrstak diff --git a/xmrstak/backend/cpu/jconf.hpp b/xmrstak/backend/cpu/jconf.hpp new file mode 100644 index 0000000..3c7da49 --- /dev/null +++ b/xmrstak/backend/cpu/jconf.hpp @@ -0,0 +1,46 @@ +#pragma once + +#include "xmrstak/params.hpp" + +#include <stdlib.h> +#include <string> + +namespace xmrstak +{ +namespace cpu +{ + +class jconf +{ +public: + static jconf* inst() + { + if (oInst == nullptr) oInst = new jconf; + return oInst; + }; + + bool parse_config(const char* sFilename = params::inst().configFileCPU.c_str()); + + struct thd_cfg { + bool bDoubleMode; + bool bNoPrefetch; + long long iCpuAff; + }; + + size_t GetThreadCount(); + bool GetThreadConfig(size_t id, thd_cfg &cfg); + bool NeedsAutoconf(); + + + + +private: + jconf(); + static jconf* oInst; + + struct opaque_private; + opaque_private* prv; +}; + +} 
// namespace cpu +} // namepsace xmrstak diff --git a/minethd.cpp b/xmrstak/backend/cpu/minethd.cpp index fac9fb4..b02540a 100644 --- a/minethd.cpp +++ b/xmrstak/backend/cpu/minethd.cpp @@ -21,21 +21,51 @@ * */ +#include "crypto/cryptonight_aesni.h" + +#include "xmrstak/misc/console.hpp" +#include "xmrstak/backend/iBackend.hpp" +#include "xmrstak/backend//globalStates.hpp" +#include "xmrstak/misc/configEditor.hpp" +#include "xmrstak/params.hpp" +#include "jconf.hpp" + +#include "xmrstak/misc/executor.hpp" +#include "minethd.hpp" +#include "xmrstak/jconf.hpp" + +#include "hwlocMemory.hpp" +#include "xmrstak/backend/miner_work.hpp" + +#ifndef CONF_NO_HWLOC +# include "autoAdjustHwloc.hpp" +#else +# include "autoAdjust.hpp" +#endif + #include <assert.h> #include <cmath> #include <chrono> #include <cstring> #include <thread> #include <bitset> -#include "console.h" + #ifdef _WIN32 #include <windows.h> -void thd_setaffinity(std::thread::native_handle_type h, uint64_t cpu_id) +namespace xmrstak +{ +namespace cpu +{ +void minethd::thd_setaffinity(std::thread::native_handle_type h, uint64_t cpu_id) { SetThreadAffinityMask(h, 1ULL << cpu_id); } + +} // namespace cpu +} // namespace xmrstak + #else #include <pthread.h> @@ -47,8 +77,12 @@ void thd_setaffinity(std::thread::native_handle_type h, uint64_t cpu_id) #include <pthread_np.h> #endif +namespace xmrstak +{ +namespace cpu +{ -void thd_setaffinity(std::thread::native_handle_type h, uint64_t cpu_id) +void minethd::thd_setaffinity(std::thread::native_handle_type h, uint64_t cpu_id) { #if defined(__APPLE__) thread_port_t mach_thread; @@ -67,88 +101,17 @@ void thd_setaffinity(std::thread::native_handle_type h, uint64_t cpu_id) pthread_setaffinity_np(h, sizeof(cpu_set_t), &mn); #endif } -#endif // _WIN32 -#include "executor.h" -#include "minethd.h" -#include "jconf.h" -#include "crypto/cryptonight_aesni.h" -#include "hwlocMemory.hpp" +} // namespace cpu +} // namespace xmrstak -telemetry::telemetry(size_t iThd) -{ - 
ppHashCounts = new uint64_t*[iThd]; - ppTimestamps = new uint64_t*[iThd]; - iBucketTop = new uint32_t[iThd]; +#endif // _WIN32 - for (size_t i = 0; i < iThd; i++) - { - ppHashCounts[i] = new uint64_t[iBucketSize]; - ppTimestamps[i] = new uint64_t[iBucketSize]; - iBucketTop[i] = 0; - memset(ppHashCounts[0], 0, sizeof(uint64_t) * iBucketSize); - memset(ppTimestamps[0], 0, sizeof(uint64_t) * iBucketSize); - } -} -double telemetry::calc_telemetry_data(size_t iLastMilisec, size_t iThread) +namespace xmrstak { - using namespace std::chrono; - uint64_t iTimeNow = time_point_cast<milliseconds>(high_resolution_clock::now()).time_since_epoch().count(); - - uint64_t iEarliestHashCnt = 0; - uint64_t iEarliestStamp = 0; - uint64_t iLastestStamp = 0; - uint64_t iLastestHashCnt = 0; - bool bHaveFullSet = false; - - //Start at 1, buckettop points to next empty - for (size_t i = 1; i < iBucketSize; i++) - { - size_t idx = (iBucketTop[iThread] - i) & iBucketMask; //overflow expected here - - if (ppTimestamps[iThread][idx] == 0) - break; //That means we don't have the data yet - - if (iLastestStamp == 0) - { - iLastestStamp = ppTimestamps[iThread][idx]; - iLastestHashCnt = ppHashCounts[iThread][idx]; - } - - if (iTimeNow - ppTimestamps[iThread][idx] > iLastMilisec) - { - bHaveFullSet = true; - break; //We are out of the requested time period - } - - iEarliestStamp = ppTimestamps[iThread][idx]; - iEarliestHashCnt = ppHashCounts[iThread][idx]; - } - - if (!bHaveFullSet || iEarliestStamp == 0 || iLastestStamp == 0) - return nan(""); - - //Don't think that can happen, but just in case - if (iLastestStamp - iEarliestStamp == 0) - return nan(""); - - double fHashes, fTime; - fHashes = iLastestHashCnt - iEarliestHashCnt; - fTime = iLastestStamp - iEarliestStamp; - fTime /= 1000.0; - - return fHashes / fTime; -} - -void telemetry::push_perf_value(size_t iThd, uint64_t iHashCount, uint64_t iTimestamp) +namespace cpu { - size_t iTop = iBucketTop[iThd]; - ppHashCounts[iThd][iTop] = iHashCount; 
- ppTimestamps[iThd][iTop] = iTimestamp; - - iBucketTop[iThd] = (iTop + 1) & iBucketMask; -} minethd::minethd(miner_work& pWork, size_t iNo, bool double_work, bool no_prefetch, int64_t affinity) { @@ -156,8 +119,6 @@ minethd::minethd(miner_work& pWork, size_t iNo, bool double_work, bool no_prefet bQuit = 0; iThreadNo = (uint8_t)iNo; iJobNo = 0; - iHashCount = 0; - iTimestamp = 0; bNoPrefetch = no_prefetch; this->affinity = affinity; @@ -168,31 +129,26 @@ minethd::minethd(miner_work& pWork, size_t iNo, bool double_work, bool no_prefet oWorkThd = std::thread(&minethd::work_main, this); } -std::atomic<uint64_t> minethd::iGlobalJobNo; -std::atomic<uint64_t> minethd::iConsumeCnt; //Threads get jobs as they are initialized -minethd::miner_work minethd::oGlobalWork; -uint64_t minethd::iThreadCount = 0; - -cryptonight_ctx* minethd_alloc_ctx() +cryptonight_ctx* minethd::minethd_alloc_ctx() { cryptonight_ctx* ctx; alloc_msg msg = { 0 }; - switch (jconf::inst()->GetSlowMemSetting()) + switch (::jconf::inst()->GetSlowMemSetting()) { - case jconf::never_use: + case ::jconf::never_use: ctx = cryptonight_alloc_ctx(1, 1, &msg); if (ctx == NULL) printer::inst()->print_msg(L0, "MEMORY ALLOC FAILED: %s", msg.warning); return ctx; - case jconf::no_mlck: + case ::jconf::no_mlck: ctx = cryptonight_alloc_ctx(1, 0, &msg); if (ctx == NULL) printer::inst()->print_msg(L0, "MEMORY ALLOC FAILED: %s", msg.warning); return ctx; - case jconf::print_warning: + case ::jconf::print_warning: ctx = cryptonight_alloc_ctx(1, 1, &msg); if (msg.warning != NULL) printer::inst()->print_msg(L0, "MEMORY ALLOC FAILED: %s", msg.warning); @@ -200,10 +156,10 @@ cryptonight_ctx* minethd_alloc_ctx() ctx = cryptonight_alloc_ctx(0, 0, NULL); return ctx; - case jconf::always_use: + case ::jconf::always_use: return cryptonight_alloc_ctx(0, 0, NULL); - case jconf::unknown_value: + case ::jconf::unknown_value: return NULL; //Shut up compiler } @@ -216,27 +172,27 @@ bool minethd::self_test() size_t res; bool fatal = 
false; - switch (jconf::inst()->GetSlowMemSetting()) + switch (::jconf::inst()->GetSlowMemSetting()) { - case jconf::never_use: + case ::jconf::never_use: res = cryptonight_init(1, 1, &msg); fatal = true; break; - case jconf::no_mlck: + case ::jconf::no_mlck: res = cryptonight_init(1, 0, &msg); fatal = true; break; - case jconf::print_warning: + case ::jconf::print_warning: res = cryptonight_init(1, 1, &msg); break; - case jconf::always_use: + case ::jconf::always_use: res = cryptonight_init(0, 0, &msg); break; - case jconf::unknown_value: + case ::jconf::unknown_value: default: return false; //Shut up compiler } @@ -263,20 +219,20 @@ bool minethd::self_test() cn_hash_fun hashf; cn_hash_fun_dbl hashdf; - hashf = func_selector(jconf::inst()->HaveHardwareAes(), false); + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false); hashf("This is a test", 14, out, ctx0); bResult = memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0; - hashf = func_selector(jconf::inst()->HaveHardwareAes(), true); + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true); hashf("This is a test", 14, out, ctx0); bResult &= memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0; - hashdf = func_dbl_selector(jconf::inst()->HaveHardwareAes(), false); + hashdf = func_dbl_selector(::jconf::inst()->HaveHardwareAes(), false); hashdf("The quick brown fox jumps over the lazy dogThe quick brown fox jumps over the lazy log", 43, out, ctx0, ctx1); bResult &= memcmp(out, "\x3e\xbb\x7f\x9f\x7d\x27\x3d\x7c\x31\x8d\x86\x94\x77\x55\x0c\xc8\x00\xcf\xb1\x1b\x0c\xad\xb7\xff\xbd\xf6\xf8\x9f\x3a\x47\x1c\x59" "\xb4\x77\xd5\x02\xe4\xd8\x48\x7f\x42\xdf\xe3\x8e\xed\x73\x81\x7a\xda\x91\xb7\xe2\x63\xd2\x91\x71\xb6\x5c\x44\x3a\x01\x2a\x41\x22", 64) == 0; - hashdf = 
func_dbl_selector(jconf::inst()->HaveHardwareAes(), true); + hashdf = func_dbl_selector(::jconf::inst()->HaveHardwareAes(), true); hashdf("The quick brown fox jumps over the lazy dogThe quick brown fox jumps over the lazy log", 43, out, ctx0, ctx1); bResult &= memcmp(out, "\x3e\xbb\x7f\x9f\x7d\x27\x3d\x7c\x31\x8d\x86\x94\x77\x55\x0c\xc8\x00\xcf\xb1\x1b\x0c\xad\xb7\xff\xbd\xf6\xf8\x9f\x3a\x47\x1c\x59" "\xb4\x77\xd5\x02\xe4\xd8\x48\x7f\x42\xdf\xe3\x8e\xed\x73\x81\x7a\xda\x91\xb7\xe2\x63\xd2\x91\x71\xb6\x5c\x44\x3a\x01\x2a\x41\x22", 64) == 0; @@ -291,24 +247,36 @@ bool minethd::self_test() return bResult; } -std::vector<minethd*>* minethd::thread_starter(miner_work& pWork) +std::vector<iBackend*> minethd::thread_starter(uint32_t threadOffset, miner_work& pWork) { - iGlobalJobNo = 0; - iConsumeCnt = 0; - std::vector<minethd*>* pvThreads = new std::vector<minethd*>; + std::vector<iBackend*> pvThreads; + + if(!configEditor::file_exist(params::inst().configFileCPU)) + { + autoAdjust adjust; + if(!adjust.printConfig()) + return pvThreads; + } + + if(!jconf::inst()->parse_config()) + { + win_exit(); + } + //Launch the requested number of single and double threads, to distribute //load evenly we need to alternate single and double threads size_t i, n = jconf::inst()->GetThreadCount(); - pvThreads->reserve(n); + pvThreads.reserve(n); jconf::thd_cfg cfg; for (i = 0; i < n; i++) { jconf::inst()->GetThreadConfig(i, cfg); - minethd* thd = new minethd(pWork, i, cfg.bDoubleMode, cfg.bNoPrefetch, cfg.iCpuAff); - pvThreads->push_back(thd); + // \todo need thread offset + minethd* thd = new minethd(pWork, i + threadOffset, cfg.bDoubleMode, cfg.bNoPrefetch, cfg.iCpuAff); + pvThreads.push_back(thd); if(cfg.iCpuAff >= 0) printer::inst()->print_msg(L1, "Starting %s thread, affinity: %d.", cfg.bDoubleMode ? 
"double" : "single", (int)cfg.iCpuAff); @@ -316,29 +284,14 @@ std::vector<minethd*>* minethd::thread_starter(miner_work& pWork) printer::inst()->print_msg(L1, "Starting %s thread, no affinity.", cfg.bDoubleMode ? "double" : "single"); } - iThreadCount = n; return pvThreads; } -void minethd::switch_work(miner_work& pWork) -{ - // iConsumeCnt is a basic lock-like polling mechanism just in case we happen to push work - // faster than threads can consume them. This should never happen in real life. - // Pool cant physically send jobs faster than every 250ms or so due to net latency. - - while (iConsumeCnt.load(std::memory_order_seq_cst) < iThreadCount) - std::this_thread::sleep_for(std::chrono::milliseconds(100)); - - oGlobalWork = pWork; - iConsumeCnt.store(0, std::memory_order_seq_cst); - iGlobalJobNo++; -} - void minethd::consume_work() { - memcpy(&oWork, &oGlobalWork, sizeof(miner_work)); + memcpy(&oWork, &globalStates::inst().inst().oGlobalWork, sizeof(miner_work)); iJobNo++; - iConsumeCnt++; + globalStates::inst().inst().iConsumeCnt++; } minethd::cn_hash_fun minethd::func_selector(bool bHaveAes, bool bNoPrefetch) @@ -388,12 +341,12 @@ void minethd::work_main() uint32_t* piNonce; job_result result; - hash_fun = func_selector(jconf::inst()->HaveHardwareAes(), bNoPrefetch); + hash_fun = func_selector(::jconf::inst()->HaveHardwareAes(), bNoPrefetch); ctx = minethd_alloc_ctx(); piHashVal = (uint64_t*)(result.bResult + 24); piNonce = (uint32_t*)(oWork.bWorkBlob + 39); - iConsumeCnt++; + globalStates::inst().inst().iConsumeCnt++; while (bQuit == 0) { @@ -403,7 +356,7 @@ void minethd::work_main() either because of network latency, or a socket problem. 
Since we are the raison d'etre of this software it is sensible to just wait until we have something*/ - while (iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo) + while (globalStates::inst().inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo) std::this_thread::sleep_for(std::chrono::milliseconds(100)); consume_work(); @@ -418,7 +371,7 @@ void minethd::work_main() assert(sizeof(job_result::sJobID) == sizeof(pool_job::sJobID)); memcpy(result.sJobID, oWork.sJobID, sizeof(job_result::sJobID)); - while(iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo) + while(globalStates::inst().inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo) { if ((iCount & 0xF) == 0) //Store stats every 16 hashes { @@ -482,7 +435,7 @@ void minethd::double_work_main() uint32_t iNonce; job_result res; - hash_fun = func_dbl_selector(jconf::inst()->HaveHardwareAes(), bNoPrefetch); + hash_fun = func_dbl_selector(::jconf::inst()->HaveHardwareAes(), bNoPrefetch); ctx0 = minethd_alloc_ctx(); ctx1 = minethd_alloc_ctx(); @@ -491,7 +444,7 @@ void minethd::double_work_main() piNonce0 = (uint32_t*)(bDoubleWorkBlob + 39); piNonce1 = nullptr; - iConsumeCnt++; + globalStates::inst().inst().iConsumeCnt++; while (bQuit == 0) { @@ -501,7 +454,7 @@ void minethd::double_work_main() either because of network latency, or a socket problem. 
Since we are the raison d'etre of this software it is sensible to just wait until we have something*/ - while (iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo) + while (globalStates::inst().inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo) std::this_thread::sleep_for(std::chrono::milliseconds(100)); consume_work(); @@ -518,7 +471,7 @@ void minethd::double_work_main() assert(sizeof(job_result::sJobID) == sizeof(pool_job::sJobID)); - while (iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo) + while (globalStates::inst().inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo) { if ((iCount & 0x7) == 0) //Store stats every 16 hashes { @@ -553,3 +506,6 @@ void minethd::double_work_main() cryptonight_free_ctx(ctx0); cryptonight_free_ctx(ctx1); } + +} // namespace cpu +} // namespace xmrstak diff --git a/xmrstak/backend/cpu/minethd.hpp b/xmrstak/backend/cpu/minethd.hpp new file mode 100644 index 0000000..a091ee8 --- /dev/null +++ b/xmrstak/backend/cpu/minethd.hpp @@ -0,0 +1,59 @@ +#pragma once + +#include "crypto/cryptonight.h" +#include "xmrstak/backend/miner_work.hpp" +#include "xmrstak/backend/iBackend.hpp" + +#include <iostream> +#include <thread> +#include <vector> +#include <atomic> +#include <mutex> + +namespace xmrstak +{ +namespace cpu +{ + +class minethd : public iBackend +{ +public: + static std::vector<iBackend*> thread_starter(uint32_t threadOffset, miner_work& pWork); + static bool self_test(); + + typedef void (*cn_hash_fun)(const void*, size_t, void*, cryptonight_ctx*); + + static cn_hash_fun func_selector(bool bHaveAes, bool bNoPrefetch); + static void thd_setaffinity(std::thread::native_handle_type h, uint64_t cpu_id); + + static cryptonight_ctx* minethd_alloc_ctx(); + +private: + + typedef void (*cn_hash_fun_dbl)(const void*, size_t, void*, cryptonight_ctx* __restrict, cryptonight_ctx* __restrict); + static cn_hash_fun_dbl func_dbl_selector(bool bHaveAes, bool bNoPrefetch); + + minethd(miner_work& pWork, size_t iNo, bool 
double_work, bool no_prefetch, int64_t affinity); + + void work_main(); + void double_work_main(); + void consume_work(); + + uint64_t iJobNo; + + static miner_work oGlobalWork; + miner_work oWork; + + void pin_thd_affinity(); + // Held by the creating context to prevent a race cond with oWorkThd = std::thread(...) + std::mutex work_thd_mtx; + + std::thread oWorkThd; + int64_t affinity; + + bool bQuit; + bool bNoPrefetch; +}; + +} // namespace cpu +} // namepsace xmrstak diff --git a/xmrstak/backend/globalStates.cpp b/xmrstak/backend/globalStates.cpp new file mode 100644 index 0000000..9104040 --- /dev/null +++ b/xmrstak/backend/globalStates.cpp @@ -0,0 +1,51 @@ +/* + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * Additional permission under GNU GPL version 3 section 7 + * + * If you modify this Program, or any covered work, by linking or combining + * it with OpenSSL (or a modified version of that library), containing parts + * covered by the terms of OpenSSL License and SSLeay License, the licensors + * of this Program grant you additional permission to convey the resulting work. 
+ * + */ + +#include "miner_work.hpp" +#include "globalStates.hpp" + +#include <assert.h> +#include <cmath> +#include <chrono> +#include <cstring> + + +namespace xmrstak +{ + + +void globalStates::switch_work(miner_work& pWork) +{ + // iConsumeCnt is a basic lock-like polling mechanism just in case we happen to push work + // faster than threads can consume them. This should never happen in real life. + // Pool can't physically send jobs faster than every 250ms or so due to net latency. + + while (iConsumeCnt.load(std::memory_order_seq_cst) < iThreadCount) + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + + oGlobalWork = pWork; + iConsumeCnt.store(0, std::memory_order_seq_cst); + iGlobalJobNo++; +} + +} // namespace xmrstak diff --git a/xmrstak/backend/globalStates.hpp b/xmrstak/backend/globalStates.hpp new file mode 100644 index 0000000..73ccf74 --- /dev/null +++ b/xmrstak/backend/globalStates.hpp @@ -0,0 +1,38 @@ +#pragma once + +#include "miner_work.hpp" +#include "xmrstak/misc/environment.hpp" + +#include <atomic> + + +namespace xmrstak +{ + +struct globalStates +{ + + static inline globalStates& inst() + { + auto& env = environment::inst(); + if(env.pglobalStates == nullptr) + env.pglobalStates = new globalStates; + return *env.pglobalStates; + } + + void switch_work(miner_work& pWork); + + miner_work oGlobalWork; + std::atomic<uint64_t> iGlobalJobNo; + std::atomic<uint64_t> iConsumeCnt; + uint64_t iThreadCount; + + private: + + globalStates() : iThreadCount(0) + { + } + +}; + +} // namespace xmrstak diff --git a/xmrstak/backend/iBackend.hpp b/xmrstak/backend/iBackend.hpp new file mode 100644 index 0000000..0be8f0a --- /dev/null +++ b/xmrstak/backend/iBackend.hpp @@ -0,0 +1,53 @@ +#pragma once + +#include "xmrstak/backend/globalStates.hpp" + +#include <atomic> +#include <cstdint> +#include <climits> + + +namespace xmrstak +{ + // only allowed for unsigned value \todo add static assert + template<typename T> + T reverseBits(T value) + { + /* init 
with value (to get LSB) */ + T result = value; + /* extra shift needed at end */ + int s = sizeof(T) * CHAR_BIT - 1; + for (value >>= 1; value; value >>= 1) + { + result <<= 1; + result |= value & 1; + s--; + } + /* shift when values highest bits are zero */ + result <<= s; + return result; + } + + struct iBackend + { + inline uint32_t calc_start_nonce(uint32_t resume) + { + return reverseBits<uint32_t>(static_cast<uint32_t>(iThreadNo + globalStates::inst().iThreadCount * resume)); + } + + // Limited version of the nonce calc above + inline uint32_t calc_nicehash_nonce(uint32_t start, uint32_t resume) + { + return start | ( calc_start_nonce(resume) >> 8u ); + } + + std::atomic<uint64_t> iHashCount; + std::atomic<uint64_t> iTimestamp; + uint32_t iThreadNo; + + iBackend() : iHashCount(0), iTimestamp(0) + { + } + }; + +} // namepsace xmrstak diff --git a/xmrstak/backend/miner_work.hpp b/xmrstak/backend/miner_work.hpp new file mode 100644 index 0000000..e9f9e07 --- /dev/null +++ b/xmrstak/backend/miner_work.hpp @@ -0,0 +1,85 @@ +#pragma once + +#include <thread> +#include <atomic> +#include <mutex> +#include <cstdint> +#include <iostream> +#include <cassert> +#include <cstring> + +namespace xmrstak +{ + struct miner_work + { + char sJobID[64]; + uint8_t bWorkBlob[112]; + uint32_t iWorkSize; + uint32_t iResumeCnt; + uint64_t iTarget; + // \todo remove workaround needed for amd + uint32_t iTarget32; + bool bNiceHash; + bool bStall; + size_t iPoolId; + + miner_work() : iWorkSize(0), bNiceHash(false), bStall(true), iPoolId(0) { } + + miner_work(const char* sJobID, const uint8_t* bWork, uint32_t iWorkSize, uint32_t iResumeCnt, + uint64_t iTarget, size_t iPoolId) : iWorkSize(iWorkSize), iResumeCnt(iResumeCnt), + iTarget(iTarget), bNiceHash(false), bStall(false), iPoolId(iPoolId) + { + assert(iWorkSize <= sizeof(bWorkBlob)); + memcpy(this->sJobID, sJobID, sizeof(miner_work::sJobID)); + memcpy(this->bWorkBlob, bWork, iWorkSize); + } + + miner_work(miner_work const&) = delete; 
+ + miner_work& operator=(miner_work const& from) + { + assert(this != &from); + + iWorkSize = from.iWorkSize; + iResumeCnt = from.iResumeCnt; + iTarget = from.iTarget; + iTarget32 = from.iTarget32; + bNiceHash = from.bNiceHash; + bStall = from.bStall; + iPoolId = from.iPoolId; + + assert(iWorkSize <= sizeof(bWorkBlob)); + memcpy(sJobID, from.sJobID, sizeof(sJobID)); + memcpy(bWorkBlob, from.bWorkBlob, iWorkSize); + + return *this; + } + + miner_work(miner_work&& from) : iWorkSize(from.iWorkSize), iTarget(from.iTarget),iTarget32(from.iTarget32), + bStall(from.bStall), iPoolId(from.iPoolId) + { + assert(iWorkSize <= sizeof(bWorkBlob)); + memcpy(sJobID, from.sJobID, sizeof(sJobID)); + memcpy(bWorkBlob, from.bWorkBlob, iWorkSize); + } + + miner_work& operator=(miner_work&& from) + { + assert(this != &from); + + iWorkSize = from.iWorkSize; + iResumeCnt = from.iResumeCnt; + iTarget = from.iTarget; + iTarget32 = from.iTarget32; + bNiceHash = from.bNiceHash; + bStall = from.bStall; + iPoolId = from.iPoolId; + + assert(iWorkSize <= sizeof(bWorkBlob)); + memcpy(sJobID, from.sJobID, sizeof(sJobID)); + memcpy(bWorkBlob, from.bWorkBlob, iWorkSize); + + return *this; + } + }; +} // namepsace xmrstak diff --git a/xmrstak/backend/nvidia/autoAdjust.hpp b/xmrstak/backend/nvidia/autoAdjust.hpp new file mode 100644 index 0000000..87787fa --- /dev/null +++ b/xmrstak/backend/nvidia/autoAdjust.hpp @@ -0,0 +1,113 @@ + +#pragma once + +#include "autoAdjust.hpp" + +#include "nvcc_code/cryptonight.hpp" +#include "jconf.hpp" +#include "xmrstak/misc/console.hpp" +#include "xmrstak/misc/configEditor.hpp" +#include "xmrstak/params.hpp" + +#include <vector> +#include <cstdio> +#include <sstream> +#include <string> + + +namespace xmrstak +{ +namespace nvidia +{ + +class autoAdjust +{ +public: + + autoAdjust() + { + + } + + /** print the adjusted values if needed + * + * Routine exit the application and print the adjusted values if needed else + * nothing is happened. 
+ */ + bool printConfig() + { + int deviceCount = 0; + if(cuda_get_devicecount(&deviceCount) == 0) + return false; + // evaluate config parameter for if auto adjustment is needed + // evaluate config parameter for if auto adjustment is needed + for(int i = 0; i < deviceCount; i++) + { + + nvid_ctx ctx; + + ctx.device_id = i; + // -1 trigger auto adjustment + ctx.device_blocks = -1; + ctx.device_threads = -1; + + // set all evice option those marked as auto (-1) to a valid value +#ifndef _WIN32 + ctx.device_bfactor = 0; + ctx.device_bsleep = 0; +#else + // windows pass, try to avoid that windows kills the miner if the gpu is blocked for 2 seconds + ctx.device_bfactor = 6; + ctx.device_bsleep = 25; +#endif + if( cuda_get_deviceinfo(&ctx) != 1 ) + { + printer::inst()->print_msg(L0, "Setup failed for GPU %d. Exitting.\n", i); + std::exit(0); + } + nvidCtxVec.push_back(ctx); + + } + + generateThreadConfig(); + return true; + + } + +private: + + void generateThreadConfig() + { + // load the template of the backend config into a char variable + const char *tpl = + #include "./config.tpl" + ; + + configEditor configTpl{}; + configTpl.set( std::string(tpl) ); + + constexpr size_t byte2mib = 1024u * 1024u; + std::string conf; + int i = 0; + for(auto& ctx : nvidCtxVec) + { + conf += std::string(" // gpu: ") + ctx.name + " architecture: " + std::to_string(ctx.device_arch[0] * 10 + ctx.device_arch[1]) + "\n"; + conf += std::string(" // memory: ") + std::to_string(ctx.free_device_memory / byte2mib) + "/" + std::to_string(ctx.total_device_memory / byte2mib) + " MiB\n"; + conf += std::string(" { \"index\" : ") + std::to_string(ctx.device_id) + ",\n" + + " \"threads\" : " + std::to_string(ctx.device_threads) + ", \"blocks\" : " + std::to_string(ctx.device_blocks) + ",\n" + + " \"bfactor\" : " + std::to_string(ctx.device_bfactor) + ", \"bsleep\" : " + std::to_string(ctx.device_bsleep) + ",\n" + + " \"affine_to_cpu\" : false,\n" + + " },\n"; + ++i; + } + + 
configTpl.replace("GPUCONFIG",conf); + configTpl.write(params::inst().configFileNVIDIA); + printer::inst()->print_msg(L0, "NVIDIA: GPU configuration stored in file '%s'", params::inst().configFileNVIDIA.c_str()); + } + + std::vector<nvid_ctx> nvidCtxVec; +}; + +} // namespace nvidia +} // namepsace xmrstak diff --git a/xmrstak/backend/nvidia/config.tpl b/xmrstak/backend/nvidia/config.tpl new file mode 100644 index 0000000..99dc023 --- /dev/null +++ b/xmrstak/backend/nvidia/config.tpl @@ -0,0 +1,28 @@ +R"===( +/* + * GPU configuration. You should play around with threads and blocks as the fastest settings will vary. + * index - GPU index number usually starts from 0. + * threads - Number of GPU threads (nothing to do with CPU threads). + * blocks - Number of GPU blocks (nothing to do with CPU threads). + * bfactor - Enables running the Cryptonight kernel in smaller pieces. + * Increase if you want to reduce GPU lag. Recommended setting on GUI systems - 8 + * bsleep - Insert a delay of X microseconds between kernel launches. + * Increase if you want to reduce GPU lag. Recommended setting on GUI systems - 100 + * affine_to_cpu - This will affine the thread to a CPU. This can make a GPU miner play along nicer with a CPU miner. + * + * On the first run the miner will look at your system and suggest a basic configuration that will work, + * you can try to tweak it from there to get the best performance. 
+ * + * A filled out configuration should look like this: + * "gpu_threads_conf" : + * [ + * { "index" : 0, "threads" : 17, "blocks" : 60, "bfactor" : 0, "bsleep" : 0, "affine_to_cpu" : false}, + * ], + */ + +"gpu_threads_conf" : +[ +GPUCONFIG +], + +)===" diff --git a/xmrstak/backend/nvidia/jconf.cpp b/xmrstak/backend/nvidia/jconf.cpp new file mode 100644 index 0000000..4208145 --- /dev/null +++ b/xmrstak/backend/nvidia/jconf.cpp @@ -0,0 +1,270 @@ +/* + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * Additional permission under GNU GPL version 3 section 7 + * + * If you modify this Program, or any covered work, by linking or combining + * it with OpenSSL (or a modified version of that library), containing parts + * covered by the terms of OpenSSL License and SSLeay License, the licensors + * of this Program grant you additional permission to convey the resulting work. 
+ * + */ + +#include "jconf.hpp" +#include "xmrstak/misc/jext.hpp" +#include "xmrstak/misc/console.hpp" + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#ifdef _WIN32 +#define strcasecmp _stricmp +#include <intrin.h> +#else +#include <cpuid.h> +#endif + + +namespace xmrstak +{ +namespace nvidia +{ + +using namespace rapidjson; + +/* + * This enum needs to match index in oConfigValues, otherwise we will get a runtime error + */ +enum configEnum { aGpuThreadsConf }; + +struct configVal { + configEnum iName; + const char* sName; + Type iType; +}; + +// Same order as in configEnum, as per comment above +// kNullType means any type +configVal oConfigValues[] = { + { aGpuThreadsConf, "gpu_threads_conf", kNullType } +}; + +inline bool checkType(Type have, Type want) +{ + if(want == have) + return true; + else if(want == kNullType) + return true; + else if(want == kTrueType && have == kFalseType) + return true; + else if(want == kFalseType && have == kTrueType) + return true; + else + return false; +} + +constexpr size_t iConfigCnt = (sizeof(oConfigValues)/sizeof(oConfigValues[0])); + + + +struct jconf::opaque_private +{ + Document jsonDoc; + const Value* configValues[iConfigCnt]; //Compile time constant + + opaque_private() + { + } +}; + + +bool jconf::NeedsAutoconf() +{ + return !prv->configValues[aGpuThreadsConf]->IsArray(); +} + +jconf* jconf::oInst = nullptr; + +jconf::jconf() +{ + prv = new opaque_private(); +} + +size_t jconf::GetGPUThreadCount() +{ + if(prv->configValues[aGpuThreadsConf]->IsArray()) + return prv->configValues[aGpuThreadsConf]->Size(); + else + return 0; +} + +bool jconf::GetGPUThreadConfig(size_t id, thd_cfg &cfg) +{ + if(!prv->configValues[aGpuThreadsConf]->IsArray()) + return false; + + if(id >= prv->configValues[aGpuThreadsConf]->Size()) + return false; + + const Value& oThdConf = prv->configValues[aGpuThreadsConf]->GetArray()[id]; + + if(!oThdConf.IsObject()) + return false; + + const Value *gid, *blocks, *threads, *bfactor, 
*bsleep, *aff; + gid = GetObjectMember(oThdConf, "index"); + blocks = GetObjectMember(oThdConf, "blocks"); + threads = GetObjectMember(oThdConf, "threads"); + bfactor = GetObjectMember(oThdConf, "bfactor"); + bsleep = GetObjectMember(oThdConf, "bsleep"); + aff = GetObjectMember(oThdConf, "affine_to_cpu"); + + if(gid == nullptr || blocks == nullptr || threads == nullptr || + bfactor == nullptr || bsleep == nullptr || aff == nullptr) + { + return false; + } + + if(!gid->IsNumber() || gid->GetInt() < 0) + return false; + + if(!blocks->IsNumber() || blocks->GetInt() < 0) + return false; + + if(!threads->IsNumber() || threads->GetInt() < 0) + return false; + + if(!bfactor->IsNumber() || bfactor->GetInt() < 0) + return false; + + if(!bsleep->IsNumber() || bsleep->GetInt() < 0) + return false; + + if(!aff->IsUint64() && !aff->IsBool()) + return false; + + cfg.id = gid->GetInt(); + cfg.blocks = blocks->GetInt(); + cfg.threads = threads->GetInt(); + cfg.bfactor = bfactor->GetInt(); + cfg.bsleep = bsleep->GetInt(); + + if(aff->IsNumber()) + cfg.cpu_aff = aff->GetInt(); + else + cfg.cpu_aff = -1; + + return true; +} + +bool jconf::parse_config(const char* sFilename) +{ + FILE * pFile; + char * buffer; + size_t flen; + + pFile = fopen(sFilename, "rb"); + if (pFile == NULL) + { + printer::inst()->print_msg(L0, "Failed to open config file %s.", sFilename); + return false; + } + + fseek(pFile,0,SEEK_END); + flen = ftell(pFile); + rewind(pFile); + + if(flen >= 64*1024) + { + fclose(pFile); + printer::inst()->print_msg(L0, "Oversized config file - %s.", sFilename); + return false; + } + + if(flen <= 16) + { + fclose(pFile); + printer::inst()->print_msg(L0, "File is empty or too short - %s.", sFilename); + return false; + } + + buffer = (char*)malloc(flen + 3); + if(fread(buffer+1, flen, 1, pFile) != 1) + { + free(buffer); + fclose(pFile); + printer::inst()->print_msg(L0, "Read error while reading %s.", sFilename); + return false; + } + fclose(pFile); + + //Replace Unicode BOM with 
spaces - we always use UTF-8 + unsigned char* ubuffer = (unsigned char*)buffer; + if(ubuffer[1] == 0xEF && ubuffer[2] == 0xBB && ubuffer[3] == 0xBF) + { + buffer[1] = ' '; + buffer[2] = ' '; + buffer[3] = ' '; + } + + buffer[0] = '{'; + buffer[flen] = '}'; + buffer[flen + 1] = '\0'; + + prv->jsonDoc.Parse<kParseCommentsFlag|kParseTrailingCommasFlag>(buffer, flen+2); + free(buffer); + + if(prv->jsonDoc.HasParseError()) + { + printer::inst()->print_msg(L0, "JSON config parse error(offset %llu): %s", + int_port(prv->jsonDoc.GetErrorOffset()), GetParseError_En(prv->jsonDoc.GetParseError())); + return false; + } + + + if(!prv->jsonDoc.IsObject()) + { //This should never happen as we created the root ourselves + printer::inst()->print_msg(L0, "Invalid config file. No root?\n"); + return false; + } + + for(size_t i = 0; i < iConfigCnt; i++) + { + if(oConfigValues[i].iName != i) + { + printer::inst()->print_msg(L0, "Code error. oConfigValues are not in order. %s",oConfigValues[i].sName); + return false; + } + + prv->configValues[i] = GetObjectMember(prv->jsonDoc, oConfigValues[i].sName); + + if(prv->configValues[i] == nullptr) + { + printer::inst()->print_msg(L0, "Invalid config file. Missing value \"%s\".", oConfigValues[i].sName); + return false; + } + + if(!checkType(prv->configValues[i]->GetType(), oConfigValues[i].iType)) + { + printer::inst()->print_msg(L0, "Invalid config file. Value \"%s\" has unexpected type.", oConfigValues[i].sName); + return false; + } + } + + return true; +} + +} // namespace nvidia +} // namespace xmrstak
\ No newline at end of file diff --git a/xmrstak/backend/nvidia/jconf.hpp b/xmrstak/backend/nvidia/jconf.hpp new file mode 100644 index 0000000..b09a162 --- /dev/null +++ b/xmrstak/backend/nvidia/jconf.hpp @@ -0,0 +1,51 @@ +#pragma once +#include <stdlib.h> +#include <string> +#include "xmrstak/params.hpp" + +namespace xmrstak +{ +namespace nvidia +{ + +class jconf +{ +public: + static jconf* inst() + { + if (oInst == nullptr) oInst = new jconf; + return oInst; + }; + + bool parse_config(const char* sFilename = params::inst().configFileNVIDIA.c_str()); + + struct thd_cfg { + uint32_t id; + uint32_t blocks; + uint32_t threads; + uint32_t bfactor; + uint32_t bsleep; + bool bDoubleMode; + bool bNoPrefetch; + int32_t cpu_aff; + + long long iCpuAff; + }; + + size_t GetGPUThreadCount(); + + bool GetGPUThreadConfig(size_t id, thd_cfg &cfg); + + bool NeedsAutoconf(); + +private: + jconf(); + static jconf* oInst; + + struct opaque_private; + opaque_private* prv; + +}; + +} // namespace nvidia +} // namepsace xmrstak diff --git a/xmrstak/backend/nvidia/minethd.cpp b/xmrstak/backend/nvidia/minethd.cpp new file mode 100644 index 0000000..b4080fe --- /dev/null +++ b/xmrstak/backend/nvidia/minethd.cpp @@ -0,0 +1,272 @@ +/* + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ * + * Additional permission under GNU GPL version 3 section 7 + * + * If you modify this Program, or any covered work, by linking or combining + * it with OpenSSL (or a modified version of that library), containing parts + * covered by the terms of OpenSSL License and SSLeay License, the licensors + * of this Program grant you additional permission to convey the resulting work. + * + */ + +#include "minethd.hpp" +#include "autoAdjust.hpp" +#include "xmrstak/misc/console.hpp" +#include "xmrstak/backend/cpu/crypto/cryptonight_aesni.h" +#include "xmrstak/backend/cpu/crypto/cryptonight.h" +#include "xmrstak/backend/cpu/minethd.hpp" +#include "xmrstak/params.hpp" +#include "xmrstak/misc/executor.hpp" +#include "xmrstak/jconf.hpp" +#include "xmrstak/misc/environment.hpp" + +#include <assert.h> +#include <cmath> +#include <chrono> +#include <thread> +#include <bitset> +#include <vector> + +#ifndef USE_PRECOMPILED_HEADERS +#ifdef WIN32 +#include <direct.h> +#include <windows.h> +#else +#include <sys/types.h> +#include <dlfcn.h> +#endif +#include <iostream> +#endif + +namespace xmrstak +{ +namespace nvidia +{ + +#ifdef WIN32 + HINSTANCE lib_handle; +#else + void *lib_handle; +#endif + +minethd::minethd(miner_work& pWork, size_t iNo, const jconf::thd_cfg& cfg) +{ + oWork = pWork; + bQuit = 0; + iThreadNo = (uint8_t)iNo; + iJobNo = 0; + + ctx.device_id = (int)cfg.id; + ctx.device_blocks = (int)cfg.blocks; + ctx.device_threads = (int)cfg.threads; + ctx.device_bfactor = (int)cfg.bfactor; + ctx.device_bsleep = (int)cfg.bsleep; + + oWorkThd = std::thread(&minethd::work_main, this); +} + + +bool minethd::self_test() +{ + cryptonight_ctx* ctx0; + unsigned char out[32]; + bool bResult = true; + + ctx0 = new cryptonight_ctx; + if(::jconf::inst()->HaveHardwareAes()) + { + //cryptonight_hash_ctx("This is a test", 14, out, ctx0); + bResult = memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 
32) == 0; + } + else + { + //cryptonight_hash_ctx_soft("This is a test", 14, out, ctx0); + bResult = memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0; + } + delete ctx0; + + //if(!bResult) + // printer::inst()->print_msg(L0, + // "Cryptonight hash self-test failed. This might be caused by bad compiler optimizations."); + + return bResult; +} + + +extern "C" +{ +#ifdef WIN32 +__declspec(dllexport) +#endif +std::vector<iBackend*>* xmrstak_start_backend(uint32_t threadOffset, miner_work& pWork, environment& env) +{ + environment::inst() = env; + return nvidia::minethd::thread_starter(threadOffset, pWork); +} +} // extern "C" + +std::vector<iBackend*>* minethd::thread_starter(uint32_t threadOffset, miner_work& pWork) +{ + std::vector<iBackend*>* pvThreads = new std::vector<iBackend*>(); + + if(!configEditor::file_exist(params::inst().configFileNVIDIA)) + { + autoAdjust adjust; + if(!adjust.printConfig()) + return pvThreads; + } + + if(!jconf::inst()->parse_config()) + { + win_exit(); + } + + int deviceCount = 0; + if(cuda_get_devicecount(&deviceCount) != 1) + { + std::cout<<"WARNING: NVIDIA no device found"<<std::endl; + return pvThreads; + } + + size_t i, n = jconf::inst()->GetGPUThreadCount(); + pvThreads->reserve(n); + + jconf::thd_cfg cfg; + for (i = 0; i < n; i++) + { + jconf::inst()->GetGPUThreadConfig(i, cfg); + minethd* thd = new minethd(pWork, i + threadOffset, cfg); + + if(cfg.cpu_aff >= 0) + { +#if defined(__APPLE__) + printer::inst()->print_msg(L1, "WARNING on MacOS thread affinity is only advisory."); +#endif + cpu::minethd::thd_setaffinity(thd->oWorkThd.native_handle(), cfg.cpu_aff); + } + + pvThreads->push_back(thd); + + if(cfg.cpu_aff >= 0) + printer::inst()->print_msg(L1, "Starting GPU thread, affinity: %d.", (int)cfg.cpu_aff); + else + printer::inst()->print_msg(L1, "Starting GPU thread, no affinity."); + } + + return pvThreads; +} + +void 
minethd::switch_work(miner_work& pWork) +{ + // iConsumeCnt is a basic lock-like polling mechanism just in case we happen to push work + // faster than threads can consume them. This should never happen in real life. + // Pool cant physically send jobs faster than every 250ms or so due to net latency. + + while (globalStates::inst().iConsumeCnt.load(std::memory_order_seq_cst) < globalStates::inst().iThreadCount) + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + + globalStates::inst().oGlobalWork = pWork; + globalStates::inst().iConsumeCnt.store(0, std::memory_order_seq_cst); + globalStates::inst().iGlobalJobNo++; +} + +void minethd::consume_work() +{ + memcpy(&oWork, &globalStates::inst().oGlobalWork, sizeof(miner_work)); + iJobNo++; + globalStates::inst().iConsumeCnt++; +} + +void minethd::work_main() +{ + uint64_t iCount = 0; + uint32_t iNonce; + cryptonight_ctx* cpu_ctx; + cpu_ctx = cpu::minethd::minethd_alloc_ctx(); + cn_hash_fun hash_fun = cpu::minethd::func_selector(::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/); + + globalStates::inst().iConsumeCnt++; + + if(/*cuda_get_deviceinfo(&ctx) != 1 ||*/ cryptonight_extra_cpu_init(&ctx) != 1) + { + printer::inst()->print_msg(L0, "Setup failed for GPU %d. Exitting.\n", (int)iThreadNo); + std::exit(0); + } + + while (bQuit == 0) + { + if (oWork.bStall) + { + /* We are stalled here because the executor didn't find a job for us yet, + either because of network latency, or a socket problem. 
Since we are + raison d'etre of this software it us sensible to just wait until we have something*/ + + while (globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo) + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + + consume_work(); + continue; + } + + cryptonight_extra_cpu_set_data(&ctx, oWork.bWorkBlob, oWork.iWorkSize); + iNonce = calc_start_nonce(oWork.iResumeCnt); + + assert(sizeof(job_result::sJobID) == sizeof(pool_job::sJobID)); + + while(globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo) + { + + uint32_t foundNonce[10]; + uint32_t foundCount; + + cryptonight_extra_cpu_prepare(&ctx, iNonce); + cryptonight_core_cpu_hash(&ctx); + cryptonight_extra_cpu_final(&ctx, iNonce, oWork.iTarget, &foundCount, foundNonce); + + for(size_t i = 0; i < foundCount; i++) + { + + uint8_t bWorkBlob[112]; + uint8_t bResult[32]; + + memcpy(bWorkBlob, oWork.bWorkBlob, oWork.iWorkSize); + memset(bResult, 0, sizeof(job_result::bResult)); + + *(uint32_t*)(bWorkBlob + 39) = foundNonce[i]; + + hash_fun(bWorkBlob, oWork.iWorkSize, bResult, cpu_ctx); + if ( (*((uint64_t*)(bResult + 24))) < oWork.iTarget) + executor::inst()->push_event(ex_event(job_result(oWork.sJobID, foundNonce[i], bResult), oWork.iPoolId)); + else + executor::inst()->log_result_error("NVIDIA Invalid Result"); + } + + iCount += ctx.device_blocks * ctx.device_threads; + iNonce += ctx.device_blocks * ctx.device_threads; + + using namespace std::chrono; + uint64_t iStamp = time_point_cast<milliseconds>(high_resolution_clock::now()).time_since_epoch().count(); + iHashCount.store(iCount, std::memory_order_relaxed); + iTimestamp.store(iStamp, std::memory_order_relaxed); + std::this_thread::yield(); + } + + consume_work(); + } +} + +} // namespace xmrstak + +} //namespace nvidia diff --git a/xmrstak/backend/nvidia/minethd.hpp b/xmrstak/backend/nvidia/minethd.hpp new file mode 100644 index 0000000..657ee6a --- /dev/null +++ b/xmrstak/backend/nvidia/minethd.hpp @@ 
-0,0 +1,54 @@ +#pragma once + +#include "xmrstak/jconf.hpp" +#include "jconf.hpp" +#include "nvcc_code/cryptonight.hpp" + +#include "xmrstak/backend/cpu/crypto/cryptonight.h" +#include "xmrstak/backend/iBackend.hpp" +#include "xmrstak/misc/environment.hpp" + +#include <iostream> +#include <thread> +#include <atomic> +#include <vector> + + +namespace xmrstak +{ +namespace nvidia +{ + +class minethd : public iBackend +{ +public: + + static void switch_work(miner_work& pWork); + static std::vector<iBackend*>* thread_starter(uint32_t threadOffset, miner_work& pWork); + static bool self_test(); + +private: + typedef void (*cn_hash_fun)(const void*, size_t, void*, cryptonight_ctx*); + + minethd(miner_work& pWork, size_t iNo, const jconf::thd_cfg& cfg); + + void work_main(); + void consume_work(); + + static std::atomic<uint64_t> iGlobalJobNo; + static std::atomic<uint64_t> iConsumeCnt; + static uint64_t iThreadCount; + uint64_t iJobNo; + + static miner_work oGlobalWork; + miner_work oWork; + + std::thread oWorkThd; + + nvid_ctx ctx; + + bool bQuit; +}; + +} // namespace nvidia +} // namepsace xmrstak diff --git a/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp b/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp new file mode 100644 index 0000000..784c38d --- /dev/null +++ b/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp @@ -0,0 +1,48 @@ +#pragma once + +#include <stdint.h> +#include <string> + +typedef struct { + int device_id; + const char *device_name; + int device_arch[2]; + int device_mpcount; + int device_blocks; + int device_threads; + int device_bfactor; + int device_bsleep; + + uint32_t *d_input; + uint32_t inputlen; + uint32_t *d_result_count; + uint32_t *d_result_nonce; + uint32_t *d_long_state; + uint32_t *d_ctx_state; + uint32_t *d_ctx_a; + uint32_t *d_ctx_b; + uint32_t *d_ctx_key1; + uint32_t *d_ctx_key2; + uint32_t *d_ctx_text; + std::string name; + size_t free_device_memory; + size_t total_device_memory; +} nvid_ctx; + +extern "C" { + +/** get device 
count + * + * @param deviceCount[out] cuda device count + * @return error code: 0 == error is occurred, 1 == no error + */ +int cuda_get_devicecount( int* deviceCount); +int cuda_get_deviceinfo(nvid_ctx *ctx); +int cryptonight_extra_cpu_init(nvid_ctx *ctx); +void cryptonight_extra_cpu_set_data( nvid_ctx* ctx, const void *data, uint32_t len); +void cryptonight_extra_cpu_prepare(nvid_ctx* ctx, uint32_t startNonce); +void cryptonight_core_cpu_hash(nvid_ctx* ctx); +void cryptonight_extra_cpu_final(nvid_ctx* ctx, uint32_t startNonce, uint64_t target, uint32_t* rescount, uint32_t *resnonce); + +} + diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_aes.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_aes.hpp new file mode 100644 index 0000000..e478600 --- /dev/null +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_aes.hpp @@ -0,0 +1,305 @@ + +#pragma once + +#include <stdint.h> + +#define N_COLS 4 +#define WPOLY 0x011b + +static __constant__ uint32_t d_t_fn[1024] = +{ + 0xa56363c6U, 0x847c7cf8U, 0x997777eeU, 0x8d7b7bf6U, + 0x0df2f2ffU, 0xbd6b6bd6U, 0xb16f6fdeU, 0x54c5c591U, + 0x50303060U, 0x03010102U, 0xa96767ceU, 0x7d2b2b56U, + 0x19fefee7U, 0x62d7d7b5U, 0xe6abab4dU, 0x9a7676ecU, + 0x45caca8fU, 0x9d82821fU, 0x40c9c989U, 0x877d7dfaU, + 0x15fafaefU, 0xeb5959b2U, 0xc947478eU, 0x0bf0f0fbU, + 0xecadad41U, 0x67d4d4b3U, 0xfda2a25fU, 0xeaafaf45U, + 0xbf9c9c23U, 0xf7a4a453U, 0x967272e4U, 0x5bc0c09bU, + 0xc2b7b775U, 0x1cfdfde1U, 0xae93933dU, 0x6a26264cU, + 0x5a36366cU, 0x413f3f7eU, 0x02f7f7f5U, 0x4fcccc83U, + 0x5c343468U, 0xf4a5a551U, 0x34e5e5d1U, 0x08f1f1f9U, + 0x937171e2U, 0x73d8d8abU, 0x53313162U, 0x3f15152aU, + 0x0c040408U, 0x52c7c795U, 0x65232346U, 0x5ec3c39dU, + 0x28181830U, 0xa1969637U, 0x0f05050aU, 0xb59a9a2fU, + 0x0907070eU, 0x36121224U, 0x9b80801bU, 0x3de2e2dfU, + 0x26ebebcdU, 0x6927274eU, 0xcdb2b27fU, 0x9f7575eaU, + 0x1b090912U, 0x9e83831dU, 0x742c2c58U, 0x2e1a1a34U, + 0x2d1b1b36U, 0xb26e6edcU, 0xee5a5ab4U, 0xfba0a05bU, + 0xf65252a4U, 0x4d3b3b76U, 0x61d6d6b7U, 0xceb3b37dU, + 
0x7b292952U, 0x3ee3e3ddU, 0x712f2f5eU, 0x97848413U, + 0xf55353a6U, 0x68d1d1b9U, 0x00000000U, 0x2cededc1U, + 0x60202040U, 0x1ffcfce3U, 0xc8b1b179U, 0xed5b5bb6U, + 0xbe6a6ad4U, 0x46cbcb8dU, 0xd9bebe67U, 0x4b393972U, + 0xde4a4a94U, 0xd44c4c98U, 0xe85858b0U, 0x4acfcf85U, + 0x6bd0d0bbU, 0x2aefefc5U, 0xe5aaaa4fU, 0x16fbfbedU, + 0xc5434386U, 0xd74d4d9aU, 0x55333366U, 0x94858511U, + 0xcf45458aU, 0x10f9f9e9U, 0x06020204U, 0x817f7ffeU, + 0xf05050a0U, 0x443c3c78U, 0xba9f9f25U, 0xe3a8a84bU, + 0xf35151a2U, 0xfea3a35dU, 0xc0404080U, 0x8a8f8f05U, + 0xad92923fU, 0xbc9d9d21U, 0x48383870U, 0x04f5f5f1U, + 0xdfbcbc63U, 0xc1b6b677U, 0x75dadaafU, 0x63212142U, + 0x30101020U, 0x1affffe5U, 0x0ef3f3fdU, 0x6dd2d2bfU, + 0x4ccdcd81U, 0x140c0c18U, 0x35131326U, 0x2fececc3U, + 0xe15f5fbeU, 0xa2979735U, 0xcc444488U, 0x3917172eU, + 0x57c4c493U, 0xf2a7a755U, 0x827e7efcU, 0x473d3d7aU, + 0xac6464c8U, 0xe75d5dbaU, 0x2b191932U, 0x957373e6U, + 0xa06060c0U, 0x98818119U, 0xd14f4f9eU, 0x7fdcdca3U, + 0x66222244U, 0x7e2a2a54U, 0xab90903bU, 0x8388880bU, + 0xca46468cU, 0x29eeeec7U, 0xd3b8b86bU, 0x3c141428U, + 0x79dedea7U, 0xe25e5ebcU, 0x1d0b0b16U, 0x76dbdbadU, + 0x3be0e0dbU, 0x56323264U, 0x4e3a3a74U, 0x1e0a0a14U, + 0xdb494992U, 0x0a06060cU, 0x6c242448U, 0xe45c5cb8U, + 0x5dc2c29fU, 0x6ed3d3bdU, 0xefacac43U, 0xa66262c4U, + 0xa8919139U, 0xa4959531U, 0x37e4e4d3U, 0x8b7979f2U, + 0x32e7e7d5U, 0x43c8c88bU, 0x5937376eU, 0xb76d6ddaU, + 0x8c8d8d01U, 0x64d5d5b1U, 0xd24e4e9cU, 0xe0a9a949U, + 0xb46c6cd8U, 0xfa5656acU, 0x07f4f4f3U, 0x25eaeacfU, + 0xaf6565caU, 0x8e7a7af4U, 0xe9aeae47U, 0x18080810U, + 0xd5baba6fU, 0x887878f0U, 0x6f25254aU, 0x722e2e5cU, + 0x241c1c38U, 0xf1a6a657U, 0xc7b4b473U, 0x51c6c697U, + 0x23e8e8cbU, 0x7cdddda1U, 0x9c7474e8U, 0x211f1f3eU, + 0xdd4b4b96U, 0xdcbdbd61U, 0x868b8b0dU, 0x858a8a0fU, + 0x907070e0U, 0x423e3e7cU, 0xc4b5b571U, 0xaa6666ccU, + 0xd8484890U, 0x05030306U, 0x01f6f6f7U, 0x120e0e1cU, + 0xa36161c2U, 0x5f35356aU, 0xf95757aeU, 0xd0b9b969U, + 0x91868617U, 0x58c1c199U, 0x271d1d3aU, 0xb99e9e27U, + 
0x38e1e1d9U, 0x13f8f8ebU, 0xb398982bU, 0x33111122U, + 0xbb6969d2U, 0x70d9d9a9U, 0x898e8e07U, 0xa7949433U, + 0xb69b9b2dU, 0x221e1e3cU, 0x92878715U, 0x20e9e9c9U, + 0x49cece87U, 0xff5555aaU, 0x78282850U, 0x7adfdfa5U, + 0x8f8c8c03U, 0xf8a1a159U, 0x80898909U, 0x170d0d1aU, + 0xdabfbf65U, 0x31e6e6d7U, 0xc6424284U, 0xb86868d0U, + 0xc3414182U, 0xb0999929U, 0x772d2d5aU, 0x110f0f1eU, + 0xcbb0b07bU, 0xfc5454a8U, 0xd6bbbb6dU, 0x3a16162cU, + 0x6363c6a5U, 0x7c7cf884U, 0x7777ee99U, 0x7b7bf68dU, + 0xf2f2ff0dU, 0x6b6bd6bdU, 0x6f6fdeb1U, 0xc5c59154U, + 0x30306050U, 0x01010203U, 0x6767cea9U, 0x2b2b567dU, + 0xfefee719U, 0xd7d7b562U, 0xabab4de6U, 0x7676ec9aU, + 0xcaca8f45U, 0x82821f9dU, 0xc9c98940U, 0x7d7dfa87U, + 0xfafaef15U, 0x5959b2ebU, 0x47478ec9U, 0xf0f0fb0bU, + 0xadad41ecU, 0xd4d4b367U, 0xa2a25ffdU, 0xafaf45eaU, + 0x9c9c23bfU, 0xa4a453f7U, 0x7272e496U, 0xc0c09b5bU, + 0xb7b775c2U, 0xfdfde11cU, 0x93933daeU, 0x26264c6aU, + 0x36366c5aU, 0x3f3f7e41U, 0xf7f7f502U, 0xcccc834fU, + 0x3434685cU, 0xa5a551f4U, 0xe5e5d134U, 0xf1f1f908U, + 0x7171e293U, 0xd8d8ab73U, 0x31316253U, 0x15152a3fU, + 0x0404080cU, 0xc7c79552U, 0x23234665U, 0xc3c39d5eU, + 0x18183028U, 0x969637a1U, 0x05050a0fU, 0x9a9a2fb5U, + 0x07070e09U, 0x12122436U, 0x80801b9bU, 0xe2e2df3dU, + 0xebebcd26U, 0x27274e69U, 0xb2b27fcdU, 0x7575ea9fU, + 0x0909121bU, 0x83831d9eU, 0x2c2c5874U, 0x1a1a342eU, + 0x1b1b362dU, 0x6e6edcb2U, 0x5a5ab4eeU, 0xa0a05bfbU, + 0x5252a4f6U, 0x3b3b764dU, 0xd6d6b761U, 0xb3b37dceU, + 0x2929527bU, 0xe3e3dd3eU, 0x2f2f5e71U, 0x84841397U, + 0x5353a6f5U, 0xd1d1b968U, 0x00000000U, 0xededc12cU, + 0x20204060U, 0xfcfce31fU, 0xb1b179c8U, 0x5b5bb6edU, + 0x6a6ad4beU, 0xcbcb8d46U, 0xbebe67d9U, 0x3939724bU, + 0x4a4a94deU, 0x4c4c98d4U, 0x5858b0e8U, 0xcfcf854aU, + 0xd0d0bb6bU, 0xefefc52aU, 0xaaaa4fe5U, 0xfbfbed16U, + 0x434386c5U, 0x4d4d9ad7U, 0x33336655U, 0x85851194U, + 0x45458acfU, 0xf9f9e910U, 0x02020406U, 0x7f7ffe81U, + 0x5050a0f0U, 0x3c3c7844U, 0x9f9f25baU, 0xa8a84be3U, + 0x5151a2f3U, 0xa3a35dfeU, 0x404080c0U, 0x8f8f058aU, + 
0x92923fadU, 0x9d9d21bcU, 0x38387048U, 0xf5f5f104U, + 0xbcbc63dfU, 0xb6b677c1U, 0xdadaaf75U, 0x21214263U, + 0x10102030U, 0xffffe51aU, 0xf3f3fd0eU, 0xd2d2bf6dU, + 0xcdcd814cU, 0x0c0c1814U, 0x13132635U, 0xececc32fU, + 0x5f5fbee1U, 0x979735a2U, 0x444488ccU, 0x17172e39U, + 0xc4c49357U, 0xa7a755f2U, 0x7e7efc82U, 0x3d3d7a47U, + 0x6464c8acU, 0x5d5dbae7U, 0x1919322bU, 0x7373e695U, + 0x6060c0a0U, 0x81811998U, 0x4f4f9ed1U, 0xdcdca37fU, + 0x22224466U, 0x2a2a547eU, 0x90903babU, 0x88880b83U, + 0x46468ccaU, 0xeeeec729U, 0xb8b86bd3U, 0x1414283cU, + 0xdedea779U, 0x5e5ebce2U, 0x0b0b161dU, 0xdbdbad76U, + 0xe0e0db3bU, 0x32326456U, 0x3a3a744eU, 0x0a0a141eU, + 0x494992dbU, 0x06060c0aU, 0x2424486cU, 0x5c5cb8e4U, + 0xc2c29f5dU, 0xd3d3bd6eU, 0xacac43efU, 0x6262c4a6U, + 0x919139a8U, 0x959531a4U, 0xe4e4d337U, 0x7979f28bU, + 0xe7e7d532U, 0xc8c88b43U, 0x37376e59U, 0x6d6ddab7U, + 0x8d8d018cU, 0xd5d5b164U, 0x4e4e9cd2U, 0xa9a949e0U, + 0x6c6cd8b4U, 0x5656acfaU, 0xf4f4f307U, 0xeaeacf25U, + 0x6565caafU, 0x7a7af48eU, 0xaeae47e9U, 0x08081018U, + 0xbaba6fd5U, 0x7878f088U, 0x25254a6fU, 0x2e2e5c72U, + 0x1c1c3824U, 0xa6a657f1U, 0xb4b473c7U, 0xc6c69751U, + 0xe8e8cb23U, 0xdddda17cU, 0x7474e89cU, 0x1f1f3e21U, + 0x4b4b96ddU, 0xbdbd61dcU, 0x8b8b0d86U, 0x8a8a0f85U, + 0x7070e090U, 0x3e3e7c42U, 0xb5b571c4U, 0x6666ccaaU, + 0x484890d8U, 0x03030605U, 0xf6f6f701U, 0x0e0e1c12U, + 0x6161c2a3U, 0x35356a5fU, 0x5757aef9U, 0xb9b969d0U, + 0x86861791U, 0xc1c19958U, 0x1d1d3a27U, 0x9e9e27b9U, + 0xe1e1d938U, 0xf8f8eb13U, 0x98982bb3U, 0x11112233U, + 0x6969d2bbU, 0xd9d9a970U, 0x8e8e0789U, 0x949433a7U, + 0x9b9b2db6U, 0x1e1e3c22U, 0x87871592U, 0xe9e9c920U, + 0xcece8749U, 0x5555aaffU, 0x28285078U, 0xdfdfa57aU, + 0x8c8c038fU, 0xa1a159f8U, 0x89890980U, 0x0d0d1a17U, + 0xbfbf65daU, 0xe6e6d731U, 0x424284c6U, 0x6868d0b8U, + 0x414182c3U, 0x999929b0U, 0x2d2d5a77U, 0x0f0f1e11U, + 0xb0b07bcbU, 0x5454a8fcU, 0xbbbb6dd6U, 0x16162c3aU, + 0x63c6a563U, 0x7cf8847cU, 0x77ee9977U, 0x7bf68d7bU, + 0xf2ff0df2U, 0x6bd6bd6bU, 0x6fdeb16fU, 0xc59154c5U, + 
0x30605030U, 0x01020301U, 0x67cea967U, 0x2b567d2bU, + 0xfee719feU, 0xd7b562d7U, 0xab4de6abU, 0x76ec9a76U, + 0xca8f45caU, 0x821f9d82U, 0xc98940c9U, 0x7dfa877dU, + 0xfaef15faU, 0x59b2eb59U, 0x478ec947U, 0xf0fb0bf0U, + 0xad41ecadU, 0xd4b367d4U, 0xa25ffda2U, 0xaf45eaafU, + 0x9c23bf9cU, 0xa453f7a4U, 0x72e49672U, 0xc09b5bc0U, + 0xb775c2b7U, 0xfde11cfdU, 0x933dae93U, 0x264c6a26U, + 0x366c5a36U, 0x3f7e413fU, 0xf7f502f7U, 0xcc834fccU, + 0x34685c34U, 0xa551f4a5U, 0xe5d134e5U, 0xf1f908f1U, + 0x71e29371U, 0xd8ab73d8U, 0x31625331U, 0x152a3f15U, + 0x04080c04U, 0xc79552c7U, 0x23466523U, 0xc39d5ec3U, + 0x18302818U, 0x9637a196U, 0x050a0f05U, 0x9a2fb59aU, + 0x070e0907U, 0x12243612U, 0x801b9b80U, 0xe2df3de2U, + 0xebcd26ebU, 0x274e6927U, 0xb27fcdb2U, 0x75ea9f75U, + 0x09121b09U, 0x831d9e83U, 0x2c58742cU, 0x1a342e1aU, + 0x1b362d1bU, 0x6edcb26eU, 0x5ab4ee5aU, 0xa05bfba0U, + 0x52a4f652U, 0x3b764d3bU, 0xd6b761d6U, 0xb37dceb3U, + 0x29527b29U, 0xe3dd3ee3U, 0x2f5e712fU, 0x84139784U, + 0x53a6f553U, 0xd1b968d1U, 0x00000000U, 0xedc12cedU, + 0x20406020U, 0xfce31ffcU, 0xb179c8b1U, 0x5bb6ed5bU, + 0x6ad4be6aU, 0xcb8d46cbU, 0xbe67d9beU, 0x39724b39U, + 0x4a94de4aU, 0x4c98d44cU, 0x58b0e858U, 0xcf854acfU, + 0xd0bb6bd0U, 0xefc52aefU, 0xaa4fe5aaU, 0xfbed16fbU, + 0x4386c543U, 0x4d9ad74dU, 0x33665533U, 0x85119485U, + 0x458acf45U, 0xf9e910f9U, 0x02040602U, 0x7ffe817fU, + 0x50a0f050U, 0x3c78443cU, 0x9f25ba9fU, 0xa84be3a8U, + 0x51a2f351U, 0xa35dfea3U, 0x4080c040U, 0x8f058a8fU, + 0x923fad92U, 0x9d21bc9dU, 0x38704838U, 0xf5f104f5U, + 0xbc63dfbcU, 0xb677c1b6U, 0xdaaf75daU, 0x21426321U, + 0x10203010U, 0xffe51affU, 0xf3fd0ef3U, 0xd2bf6dd2U, + 0xcd814ccdU, 0x0c18140cU, 0x13263513U, 0xecc32fecU, + 0x5fbee15fU, 0x9735a297U, 0x4488cc44U, 0x172e3917U, + 0xc49357c4U, 0xa755f2a7U, 0x7efc827eU, 0x3d7a473dU, + 0x64c8ac64U, 0x5dbae75dU, 0x19322b19U, 0x73e69573U, + 0x60c0a060U, 0x81199881U, 0x4f9ed14fU, 0xdca37fdcU, + 0x22446622U, 0x2a547e2aU, 0x903bab90U, 0x880b8388U, + 0x468cca46U, 0xeec729eeU, 0xb86bd3b8U, 0x14283c14U, + 
0xdea779deU, 0x5ebce25eU, 0x0b161d0bU, 0xdbad76dbU, + 0xe0db3be0U, 0x32645632U, 0x3a744e3aU, 0x0a141e0aU, + 0x4992db49U, 0x060c0a06U, 0x24486c24U, 0x5cb8e45cU, + 0xc29f5dc2U, 0xd3bd6ed3U, 0xac43efacU, 0x62c4a662U, + 0x9139a891U, 0x9531a495U, 0xe4d337e4U, 0x79f28b79U, + 0xe7d532e7U, 0xc88b43c8U, 0x376e5937U, 0x6ddab76dU, + 0x8d018c8dU, 0xd5b164d5U, 0x4e9cd24eU, 0xa949e0a9U, + 0x6cd8b46cU, 0x56acfa56U, 0xf4f307f4U, 0xeacf25eaU, + 0x65caaf65U, 0x7af48e7aU, 0xae47e9aeU, 0x08101808U, + 0xba6fd5baU, 0x78f08878U, 0x254a6f25U, 0x2e5c722eU, + 0x1c38241cU, 0xa657f1a6U, 0xb473c7b4U, 0xc69751c6U, + 0xe8cb23e8U, 0xdda17cddU, 0x74e89c74U, 0x1f3e211fU, + 0x4b96dd4bU, 0xbd61dcbdU, 0x8b0d868bU, 0x8a0f858aU, + 0x70e09070U, 0x3e7c423eU, 0xb571c4b5U, 0x66ccaa66U, + 0x4890d848U, 0x03060503U, 0xf6f701f6U, 0x0e1c120eU, + 0x61c2a361U, 0x356a5f35U, 0x57aef957U, 0xb969d0b9U, + 0x86179186U, 0xc19958c1U, 0x1d3a271dU, 0x9e27b99eU, + 0xe1d938e1U, 0xf8eb13f8U, 0x982bb398U, 0x11223311U, + 0x69d2bb69U, 0xd9a970d9U, 0x8e07898eU, 0x9433a794U, + 0x9b2db69bU, 0x1e3c221eU, 0x87159287U, 0xe9c920e9U, + 0xce8749ceU, 0x55aaff55U, 0x28507828U, 0xdfa57adfU, + 0x8c038f8cU, 0xa159f8a1U, 0x89098089U, 0x0d1a170dU, + 0xbf65dabfU, 0xe6d731e6U, 0x4284c642U, 0x68d0b868U, + 0x4182c341U, 0x9929b099U, 0x2d5a772dU, 0x0f1e110fU, + 0xb07bcbb0U, 0x54a8fc54U, 0xbb6dd6bbU, 0x162c3a16U, + 0xc6a56363U, 0xf8847c7cU, 0xee997777U, 0xf68d7b7bU, + 0xff0df2f2U, 0xd6bd6b6bU, 0xdeb16f6fU, 0x9154c5c5U, + 0x60503030U, 0x02030101U, 0xcea96767U, 0x567d2b2bU, + 0xe719fefeU, 0xb562d7d7U, 0x4de6ababU, 0xec9a7676U, + 0x8f45cacaU, 0x1f9d8282U, 0x8940c9c9U, 0xfa877d7dU, + 0xef15fafaU, 0xb2eb5959U, 0x8ec94747U, 0xfb0bf0f0U, + 0x41ecadadU, 0xb367d4d4U, 0x5ffda2a2U, 0x45eaafafU, + 0x23bf9c9cU, 0x53f7a4a4U, 0xe4967272U, 0x9b5bc0c0U, + 0x75c2b7b7U, 0xe11cfdfdU, 0x3dae9393U, 0x4c6a2626U, + 0x6c5a3636U, 0x7e413f3fU, 0xf502f7f7U, 0x834fccccU, + 0x685c3434U, 0x51f4a5a5U, 0xd134e5e5U, 0xf908f1f1U, + 0xe2937171U, 0xab73d8d8U, 0x62533131U, 0x2a3f1515U, + 
0x080c0404U, 0x9552c7c7U, 0x46652323U, 0x9d5ec3c3U, + 0x30281818U, 0x37a19696U, 0x0a0f0505U, 0x2fb59a9aU, + 0x0e090707U, 0x24361212U, 0x1b9b8080U, 0xdf3de2e2U, + 0xcd26ebebU, 0x4e692727U, 0x7fcdb2b2U, 0xea9f7575U, + 0x121b0909U, 0x1d9e8383U, 0x58742c2cU, 0x342e1a1aU, + 0x362d1b1bU, 0xdcb26e6eU, 0xb4ee5a5aU, 0x5bfba0a0U, + 0xa4f65252U, 0x764d3b3bU, 0xb761d6d6U, 0x7dceb3b3U, + 0x527b2929U, 0xdd3ee3e3U, 0x5e712f2fU, 0x13978484U, + 0xa6f55353U, 0xb968d1d1U, 0x00000000U, 0xc12cededU, + 0x40602020U, 0xe31ffcfcU, 0x79c8b1b1U, 0xb6ed5b5bU, + 0xd4be6a6aU, 0x8d46cbcbU, 0x67d9bebeU, 0x724b3939U, + 0x94de4a4aU, 0x98d44c4cU, 0xb0e85858U, 0x854acfcfU, + 0xbb6bd0d0U, 0xc52aefefU, 0x4fe5aaaaU, 0xed16fbfbU, + 0x86c54343U, 0x9ad74d4dU, 0x66553333U, 0x11948585U, + 0x8acf4545U, 0xe910f9f9U, 0x04060202U, 0xfe817f7fU, + 0xa0f05050U, 0x78443c3cU, 0x25ba9f9fU, 0x4be3a8a8U, + 0xa2f35151U, 0x5dfea3a3U, 0x80c04040U, 0x058a8f8fU, + 0x3fad9292U, 0x21bc9d9dU, 0x70483838U, 0xf104f5f5U, + 0x63dfbcbcU, 0x77c1b6b6U, 0xaf75dadaU, 0x42632121U, + 0x20301010U, 0xe51affffU, 0xfd0ef3f3U, 0xbf6dd2d2U, + 0x814ccdcdU, 0x18140c0cU, 0x26351313U, 0xc32fececU, + 0xbee15f5fU, 0x35a29797U, 0x88cc4444U, 0x2e391717U, + 0x9357c4c4U, 0x55f2a7a7U, 0xfc827e7eU, 0x7a473d3dU, + 0xc8ac6464U, 0xbae75d5dU, 0x322b1919U, 0xe6957373U, + 0xc0a06060U, 0x19988181U, 0x9ed14f4fU, 0xa37fdcdcU, + 0x44662222U, 0x547e2a2aU, 0x3bab9090U, 0x0b838888U, + 0x8cca4646U, 0xc729eeeeU, 0x6bd3b8b8U, 0x283c1414U, + 0xa779dedeU, 0xbce25e5eU, 0x161d0b0bU, 0xad76dbdbU, + 0xdb3be0e0U, 0x64563232U, 0x744e3a3aU, 0x141e0a0aU, + 0x92db4949U, 0x0c0a0606U, 0x486c2424U, 0xb8e45c5cU, + 0x9f5dc2c2U, 0xbd6ed3d3U, 0x43efacacU, 0xc4a66262U, + 0x39a89191U, 0x31a49595U, 0xd337e4e4U, 0xf28b7979U, + 0xd532e7e7U, 0x8b43c8c8U, 0x6e593737U, 0xdab76d6dU, + 0x018c8d8dU, 0xb164d5d5U, 0x9cd24e4eU, 0x49e0a9a9U, + 0xd8b46c6cU, 0xacfa5656U, 0xf307f4f4U, 0xcf25eaeaU, + 0xcaaf6565U, 0xf48e7a7aU, 0x47e9aeaeU, 0x10180808U, + 0x6fd5babaU, 0xf0887878U, 0x4a6f2525U, 0x5c722e2eU, + 
0x38241c1cU, 0x57f1a6a6U, 0x73c7b4b4U, 0x9751c6c6U, + 0xcb23e8e8U, 0xa17cddddU, 0xe89c7474U, 0x3e211f1fU, + 0x96dd4b4bU, 0x61dcbdbdU, 0x0d868b8bU, 0x0f858a8aU, + 0xe0907070U, 0x7c423e3eU, 0x71c4b5b5U, 0xccaa6666U, + 0x90d84848U, 0x06050303U, 0xf701f6f6U, 0x1c120e0eU, + 0xc2a36161U, 0x6a5f3535U, 0xaef95757U, 0x69d0b9b9U, + 0x17918686U, 0x9958c1c1U, 0x3a271d1dU, 0x27b99e9eU, + 0xd938e1e1U, 0xeb13f8f8U, 0x2bb39898U, 0x22331111U, + 0xd2bb6969U, 0xa970d9d9U, 0x07898e8eU, 0x33a79494U, + 0x2db69b9bU, 0x3c221e1eU, 0x15928787U, 0xc920e9e9U, + 0x8749ceceU, 0xaaff5555U, 0x50782828U, 0xa57adfdfU, + 0x038f8c8cU, 0x59f8a1a1U, 0x09808989U, 0x1a170d0dU, + 0x65dabfbfU, 0xd731e6e6U, 0x84c64242U, 0xd0b86868U, + 0x82c34141U, 0x29b09999U, 0x5a772d2dU, 0x1e110f0fU, + 0x7bcbb0b0U, 0xa8fc5454U, 0x6dd6bbbbU, 0x2c3a1616U +}; + +#define t_fn0(x) (sharedMemory[ (x)]) +#define t_fn1(x) (sharedMemory[256 + (x)]) +#define t_fn2(x) (sharedMemory[512 + (x)]) +#define t_fn3(x) (sharedMemory[768 + (x)]) + + +#define round(dummy,y,x,k) \ + y[0] = (k)[0] ^ (t_fn0(x[0] & 0xff) ^ t_fn1((x[1] >> 8) & 0xff) ^ t_fn2((x[2] >> 16) & 0xff) ^ t_fn3((x[3] >> 24))); \ + y[1] = (k)[1] ^ (t_fn0(x[1] & 0xff) ^ t_fn1((x[2] >> 8) & 0xff) ^ t_fn2((x[3] >> 16) & 0xff) ^ t_fn3((x[0] >> 24))); \ + y[2] = (k)[2] ^ (t_fn0(x[2] & 0xff) ^ t_fn1((x[3] >> 8) & 0xff) ^ t_fn2((x[0] >> 16) & 0xff) ^ t_fn3((x[1] >> 24))); \ + y[3] = (k)[3] ^ (t_fn0(x[3] & 0xff) ^ t_fn1((x[0] >> 8) & 0xff) ^ t_fn2((x[1] >> 16) & 0xff) ^ t_fn3((x[2] >> 24) )); + +__device__ __forceinline__ static void cn_aes_single_round(uint32_t * __restrict__ sharedMemory, const uint32_t * __restrict__ in, uint32_t * __restrict__ out, const uint32_t * __restrict__ expandedKey) +{ + round(sharedMemory, out, in, expandedKey); +} + +__device__ __forceinline__ static void cn_aes_pseudo_round_mut(const uint32_t * __restrict__ sharedMemory, uint32_t * __restrict__ val, const uint32_t * __restrict__ expandedKey) +{ + uint32_t b1[4]; + round(sharedMemory, b1, val, 
expandedKey); + round(sharedMemory, val, b1, expandedKey + 1 * N_COLS); + round(sharedMemory, b1, val, expandedKey + 2 * N_COLS); + round(sharedMemory, val, b1, expandedKey + 3 * N_COLS); + round(sharedMemory, b1, val, expandedKey + 4 * N_COLS); + round(sharedMemory, val, b1, expandedKey + 5 * N_COLS); + round(sharedMemory, b1, val, expandedKey + 6 * N_COLS); + round(sharedMemory, val, b1, expandedKey + 7 * N_COLS); + round(sharedMemory, b1, val, expandedKey + 8 * N_COLS); + round(sharedMemory, val, b1, expandedKey + 9 * N_COLS); +} + +__device__ __forceinline__ static void cn_aes_gpu_init(uint32_t *sharedMemory) +{ + for(int i = threadIdx.x; i < 1024; i += blockDim.x) + sharedMemory[i] = d_t_fn[i]; +} diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_blake.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_blake.hpp new file mode 100644 index 0000000..07ae169 --- /dev/null +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_blake.hpp @@ -0,0 +1,193 @@ +#pragma once + +typedef struct { + uint32_t h[8], s[4], t[2]; + int buflen, nullt; + uint8_t buf[64]; +} blake_state; + +#define U8TO32(p) \ + (((uint32_t)((p)[0]) << 24) | ((uint32_t)((p)[1]) << 16) | \ + ((uint32_t)((p)[2]) << 8) | ((uint32_t)((p)[3]) )) + +#define U32TO8(p, v) \ + (p)[0] = (uint8_t)((v) >> 24); (p)[1] = (uint8_t)((v) >> 16); \ + (p)[2] = (uint8_t)((v) >> 8); (p)[3] = (uint8_t)((v) ); + +#define BLAKE_ROT(x,n) ROTR32(x, n) +#define BLAKE_G(a,b,c,d,e) \ + v[a] += (m[d_blake_sigma[i][e]] ^ d_blake_cst[d_blake_sigma[i][e+1]]) + v[b]; \ + v[d] = BLAKE_ROT(v[d] ^ v[a],16); \ + v[c] += v[d]; \ + v[b] = BLAKE_ROT(v[b] ^ v[c],12); \ + v[a] += (m[d_blake_sigma[i][e+1]] ^ d_blake_cst[d_blake_sigma[i][e]])+v[b]; \ + v[d] = BLAKE_ROT(v[d] ^ v[a], 8); \ + v[c] += v[d]; \ + v[b] = BLAKE_ROT(v[b] ^ v[c], 7); + +__constant__ uint8_t d_blake_sigma[14][16] = +{ + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, + {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3}, + {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 
4}, + {7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8}, + {9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13}, + {2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9}, + {12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11}, + {13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10}, + {6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5}, + {10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, + {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3}, + {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4}, + {7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8} +}; +__constant__ uint32_t d_blake_cst[16] += { + 0x243F6A88, 0x85A308D3, 0x13198A2E, 0x03707344, + 0xA4093822, 0x299F31D0, 0x082EFA98, 0xEC4E6C89, + 0x452821E6, 0x38D01377, 0xBE5466CF, 0x34E90C6C, + 0xC0AC29B7, 0xC97C50DD, 0x3F84D5B5, 0xB5470917 +}; + +__device__ void cn_blake_compress(blake_state * __restrict__ S, const uint8_t * __restrict__ block) +{ + uint32_t v[16], m[16], i; + + for (i = 0; i < 16; ++i) m[i] = U8TO32(block + i * 4); + for (i = 0; i < 8; ++i) v[i] = S->h[i]; + v[ 8] = S->s[0] ^ 0x243F6A88; + v[ 9] = S->s[1] ^ 0x85A308D3; + v[10] = S->s[2] ^ 0x13198A2E; + v[11] = S->s[3] ^ 0x03707344; + v[12] = 0xA4093822; + v[13] = 0x299F31D0; + v[14] = 0x082EFA98; + v[15] = 0xEC4E6C89; + + if (S->nullt == 0) + { + v[12] ^= S->t[0]; + v[13] ^= S->t[0]; + v[14] ^= S->t[1]; + v[15] ^= S->t[1]; + } + + for (i = 0; i < 14; ++i) + { + BLAKE_G(0, 4, 8, 12, 0); + BLAKE_G(1, 5, 9, 13, 2); + BLAKE_G(2, 6, 10, 14, 4); + BLAKE_G(3, 7, 11, 15, 6); + BLAKE_G(3, 4, 9, 14, 14); + BLAKE_G(2, 7, 8, 13, 12); + BLAKE_G(0, 5, 10, 15, 8); + BLAKE_G(1, 6, 11, 12, 10); + } + + for (i = 0; i < 16; ++i) S->h[i % 8] ^= v[i]; + for (i = 0; i < 8; ++i) S->h[i] ^= S->s[i % 4]; +} + +__device__ void cn_blake_update(blake_state * __restrict__ S, const uint8_t * __restrict__ data, uint64_t datalen) +{ + int left = S->buflen >> 3; + int fill = 64 - left; + + if 
(left && (((datalen >> 3) & 0x3F) >= (unsigned) fill)) + { + memcpy((void *) (S->buf + left), (void *) data, fill); + S->t[0] += 512; + if (S->t[0] == 0) S->t[1]++; + cn_blake_compress(S, S->buf); + data += fill; + datalen -= (fill << 3); + left = 0; + } + + while (datalen >= 512) + { + S->t[0] += 512; + if (S->t[0] == 0) S->t[1]++; + cn_blake_compress(S, data); + data += 64; + datalen -= 512; + } + + if (datalen > 0) + { + memcpy((void *) (S->buf + left), (void *) data, datalen >> 3); + S->buflen = (left << 3) + datalen; + } + else + { + S->buflen = 0; + } +} + +__device__ void cn_blake_final(blake_state * __restrict__ S, uint8_t * __restrict__ digest) +{ + const uint8_t padding[] = + { + 0x80,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 + }; + + uint8_t pa = 0x81, pb = 0x01; + uint8_t msglen[8]; + uint32_t lo = S->t[0] + S->buflen, hi = S->t[1]; + if (lo < (unsigned) S->buflen) hi++; + U32TO8(msglen + 0, hi); + U32TO8(msglen + 4, lo); + + if (S->buflen == 440) + { + S->t[0] -= 8; + cn_blake_update(S, &pa, 8); + } + else + { + if (S->buflen < 440) + { + if (S->buflen == 0) S->nullt = 1; + S->t[0] -= 440 - S->buflen; + cn_blake_update(S, padding, 440 - S->buflen); + } + else + { + S->t[0] -= 512 - S->buflen; + cn_blake_update(S, padding, 512 - S->buflen); + S->t[0] -= 440; + cn_blake_update(S, padding + 1, 440); + S->nullt = 1; + } + cn_blake_update(S, &pb, 8); + S->t[0] -= 8; + } + S->t[0] -= 64; + cn_blake_update(S, msglen, 64); + + U32TO8(digest + 0, S->h[0]); + U32TO8(digest + 4, S->h[1]); + U32TO8(digest + 8, S->h[2]); + U32TO8(digest + 12, S->h[3]); + U32TO8(digest + 16, S->h[4]); + U32TO8(digest + 20, S->h[5]); + U32TO8(digest + 24, S->h[6]); + U32TO8(digest + 28, S->h[7]); +} + +__device__ void cn_blake(const uint8_t * __restrict__ in, uint64_t inlen, uint8_t * __restrict__ out) +{ + blake_state bs; + blake_state *S = (blake_state *)&bs; + + S->h[0] = 0x6A09E667; S->h[1] = 
0xBB67AE85; S->h[2] = 0x3C6EF372; + S->h[3] = 0xA54FF53A; S->h[4] = 0x510E527F; S->h[5] = 0x9B05688C; + S->h[6] = 0x1F83D9AB; S->h[7] = 0x5BE0CD19; + S->t[0] = S->t[1] = S->buflen = S->nullt = 0; + S->s[0] = S->s[1] = S->s[2] = S->s[3] = 0; + + cn_blake_update(S, (uint8_t *)in, inlen * 8); + cn_blake_final(S, (uint8_t *)out); +} diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu new file mode 100644 index 0000000..0c086e8 --- /dev/null +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu @@ -0,0 +1,343 @@ +#include <stdio.h> +#include <stdint.h> +#include <string.h> +#include <cuda.h> +#include <cuda_runtime.h> + +#ifdef _WIN32 +#include <windows.h> +/** suspend the calling thread for about `waitTime` microseconds (Windows version) + * + * intervals above 100us use a waitable timer, shorter ones a polling loop + */ +extern "C" void compat_usleep(uint64_t waitTime) +{ + if (waitTime > 0) + { + if (waitTime > 100) + { + // use a waitable timer for larger intervals > 0.1ms + + HANDLE timer; + LARGE_INTEGER ft; + + ft.QuadPart = -(LONGLONG)(10*waitTime); // Convert to 100 nanosecond interval, negative value indicates relative time + + timer = CreateWaitableTimer(NULL, TRUE, NULL); + if (timer == NULL) + { + /* no timer available: fall back to the millisecond resolution sleep */ + Sleep((DWORD)(waitTime / 1000)); + return; + } + /* wait only if the timer was armed, an unarmed timer is never signaled */ + if (SetWaitableTimer(timer, &ft, 0, NULL, NULL, 0)) + WaitForSingleObject(timer, INFINITE); + CloseHandle(timer); + } + else + { + // use a polling loop for short intervals <= 0.1ms + + LARGE_INTEGER perfCnt, start, now; + __int64 elapsed; + + QueryPerformanceFrequency(&perfCnt); + QueryPerformanceCounter(&start); + do { + SwitchToThread(); + QueryPerformanceCounter((LARGE_INTEGER*) &now); + elapsed = (__int64)((now.QuadPart - start.QuadPart) / (float)perfCnt.QuadPart * 1000 * 1000); + } while ( elapsed < (__int64) waitTime ); + } + } +} +#else +#include <unistd.h> +/** suspend the calling thread for `waitTime` microseconds (forwards to `usleep`) */ +extern "C" void compat_usleep(uint64_t waitTime) +{ + usleep(waitTime); +} +#endif + +#include "cryptonight.hpp" +#include "cuda_extra.hpp" +#include "cuda_aes.hpp" +#include "cuda_device.hpp" + +/* sm_2X is limited to 2GB due to the small TLB + * therefore we never use 64bit indices + */ +#if defined(XMR_STAK_LARGEGRID) && (__CUDA_ARCH__ >= 300) +typedef uint64_t 
IndexType; +#else +typedef int IndexType; +#endif + +__device__ __forceinline__ uint64_t cuda_mul128( uint64_t multiplier, uint64_t multiplicand, uint64_t* product_hi ) +{ + *product_hi = __umul64hi( multiplier, multiplicand ); + return (multiplier * multiplicand ); +} + +template< typename T > +__device__ __forceinline__ T loadGlobal64( T * const addr ) +{ + T x; + asm volatile( "ld.global.cg.u64 %0, [%1];" : "=l"( x ) : "l"( addr ) ); + return x; +} + +template< typename T > +__device__ __forceinline__ T loadGlobal32( T * const addr ) +{ + T x; + asm volatile( "ld.global.cg.u32 %0, [%1];" : "=r"( x ) : "l"( addr ) ); + return x; +} + + +template< typename T > +__device__ __forceinline__ void storeGlobal32( T* addr, T const & val ) +{ + asm volatile( "st.global.cg.u32 [%0], %1;" : : "l"( addr ), "r"( val ) ); +} + +__global__ void cryptonight_core_gpu_phase1( int threads, int bfactor, int partidx, uint32_t * __restrict__ long_state, uint32_t * __restrict__ ctx_state, uint32_t * __restrict__ ctx_key1 ) +{ + __shared__ uint32_t sharedMemory[1024]; + + cn_aes_gpu_init( sharedMemory ); + __syncthreads( ); + + const int thread = ( blockDim.x * blockIdx.x + threadIdx.x ) >> 3; + const int sub = ( threadIdx.x & 7 ) << 2; + + const int batchsize = 0x80000 >> bfactor; + const int start = partidx * batchsize; + const int end = start + batchsize; + + if ( thread >= threads ) + return; + + uint32_t key[40], text[4]; + + MEMCPY8( key, ctx_key1 + thread * 40, 20 ); + + if( partidx == 0 ) + { + // first round + MEMCPY8( text, ctx_state + thread * 50 + sub + 16, 2 ); + } + else + { + // load previous text data + MEMCPY8( text, &long_state[( (uint64_t) thread << 19 ) + sub + start - 32], 2 ); + } + __syncthreads( ); + for ( int i = start; i < end; i += 32 ) + { + cn_aes_pseudo_round_mut( sharedMemory, text, key ); + MEMCPY8(&long_state[((uint64_t) thread << 19) + (sub + i)], text, 2); + } +} + +/** avoid warning `unused parameter` */ +template< typename T > +__forceinline__ 
__device__ void unusedVar( const T& ) +{ +} + +/** shuffle data between the threads of a group (four threads) + * + * - this method can be used with all compute architectures + * - for <sm_30 shared memory is needed + * + * @param ptr pointer to the shared memory slots of the group, it is indexed with `sub` and `src & 3` + * value can be NULL for compute architecture >=sm_30 + * @param sub thread number within the group, range [0;4) + * @param val value to share with other threads within the group + * @param src thread number within the group from where the data is read, + * the <sm_30 path uses only the two lowest bits (`src & 3`) + * @return value stored for the group thread selected by `src` + */ +__forceinline__ __device__ uint32_t shuffle(volatile uint32_t* ptr,const uint32_t sub,const int val,const uint32_t src) +{ +#if( __CUDA_ARCH__ < 300 ) + ptr[sub] = val; + return ptr[src&3]; +#else + unusedVar( ptr ); + unusedVar( sub ); + return __shfl( val, src, 4 ); +#endif +} + +#ifdef XMR_STAK_THREADS +__launch_bounds__( XMR_STAK_THREADS * 4 ) +#endif +__global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b ) +{ + __shared__ uint32_t sharedMemory[1024]; + + cn_aes_gpu_init( sharedMemory ); + + __syncthreads( ); + + const int thread = ( blockDim.x * blockIdx.x + threadIdx.x ) >> 2; + const int sub = threadIdx.x & 3; + const int sub2 = sub & 2; + +#if( __CUDA_ARCH__ < 300 ) + extern __shared__ uint32_t shuffleMem[]; + volatile uint32_t* sPtr = (volatile uint32_t*)(shuffleMem + (threadIdx.x& 0xFFFFFFFC)); +#else + volatile uint32_t* sPtr = NULL; +#endif + if ( thread >= threads ) + return; + + int i, k; + uint32_t j; + const int batchsize = ITER >> ( 2 + bfactor ); + const int start = partidx * batchsize; + const int end = start + batchsize; + uint32_t * long_state = &d_long_state[(IndexType) thread << 19]; + uint32_t * ctx_a = d_ctx_a + thread * 4; + uint32_t * ctx_b = d_ctx_b + thread * 4; + uint32_t a, d[2]; + uint32_t t1[2], t2[2], res; + + a = ctx_a[sub]; + d[1] = ctx_b[sub]; + #pragma unroll 2 + for ( i = start; i < end; ++i ) + { + 
#pragma unroll 2 + for ( int x = 0; x < 2; ++x ) + { + j = ( ( shuffle(sPtr,sub, a, 0) & 0x1FFFF0 ) >> 2 ) + sub; + + const uint32_t x_0 = loadGlobal32<uint32_t>( long_state + j ); + const uint32_t x_1 = shuffle(sPtr,sub, x_0, sub + 1); + const uint32_t x_2 = shuffle(sPtr,sub, x_0, sub + 2); + const uint32_t x_3 = shuffle(sPtr,sub, x_0, sub + 3); + d[x] = a ^ + t_fn0( x_0 & 0xff ) ^ + t_fn1( (x_1 >> 8) & 0xff ) ^ + t_fn2( (x_2 >> 16) & 0xff ) ^ + t_fn3( ( x_3 >> 24 ) ); + + + //XOR_BLOCKS_DST(c, b, &long_state[j]); + t1[0] = shuffle(sPtr,sub, d[x], 0); + //long_state[j] = d[0] ^ d[1]; + storeGlobal32( long_state + j, d[0] ^ d[1] ); + + //MUL_SUM_XOR_DST(c, a, &long_state[((uint32_t *)c)[0] & 0x1FFFF0]); + j = ( ( *t1 & 0x1FFFF0 ) >> 2 ) + sub; + + uint32_t yy[2]; + *( (uint64_t*) yy ) = loadGlobal64<uint64_t>( ( (uint64_t *) long_state )+( j >> 1 ) ); + uint32_t zz[2]; + zz[0] = shuffle(sPtr,sub, yy[0], 0); + zz[1] = shuffle(sPtr,sub, yy[1], 0); + + t1[1] = shuffle(sPtr,sub, d[x], 1); + #pragma unroll + for ( k = 0; k < 2; k++ ) + t2[k] = shuffle(sPtr,sub, a, k + sub2); + + *( (uint64_t *) t2 ) += sub2 ? ( *( (uint64_t *) t1 ) * *( (uint64_t*) zz ) ) : __umul64hi( *( (uint64_t *) t1 ), *( (uint64_t*) zz ) ); + + res = *( (uint64_t *) t2 ) >> ( sub & 1 ? 32 : 0 ); + + storeGlobal32( long_state + j, res ); + a = ( sub & 1 ? 
yy[1] : yy[0] ) ^ res; + } + } + + if ( bfactor > 0 ) + { + ctx_a[sub] = a; + ctx_b[sub] = d[1]; + } +} + +__global__ void cryptonight_core_gpu_phase3( int threads, int bfactor, int partidx, const uint32_t * __restrict__ long_state, uint32_t * __restrict__ d_ctx_state, uint32_t * __restrict__ d_ctx_key2 ) +{ + __shared__ uint32_t sharedMemory[1024]; + + cn_aes_gpu_init( sharedMemory ); + __syncthreads( ); + + int thread = ( blockDim.x * blockIdx.x + threadIdx.x ) >> 3; + int sub = ( threadIdx.x & 7 ) << 2; + + const int batchsize = 0x80000 >> bfactor; + const int start = partidx * batchsize; + const int end = start + batchsize; + + if ( thread >= threads ) + return; + + uint32_t key[40], text[4]; + MEMCPY8( key, d_ctx_key2 + thread * 40, 20 ); + MEMCPY8( text, d_ctx_state + thread * 50 + sub + 16, 2 ); + + __syncthreads( ); + for ( int i = start; i < end; i += 32 ) + { +#pragma unroll + for ( int j = 0; j < 4; ++j ) + text[j] ^= long_state[((IndexType) thread << 19) + (sub + i + j)]; + + cn_aes_pseudo_round_mut( sharedMemory, text, key ); + } + + MEMCPY8( d_ctx_state + thread * 50 + sub + 16, text, 2 ); +} + +extern "C" void cryptonight_core_cpu_hash(nvid_ctx* ctx) +{ + dim3 grid( ctx->device_blocks ); + dim3 block( ctx->device_threads ); + dim3 block4( ctx->device_threads << 2 ); + dim3 block8( ctx->device_threads << 3 ); + + int partcount = 1 << ctx->device_bfactor; + + /* bfactor for phase 1 and 3 + * + * phase 1 and 3 consume less time than phase 2, therefore we begin with the + * kernel splitting if the user defined a `bfactor >= 5` + */ + int bfactorOneThree = ctx->device_bfactor - 4; + if( bfactorOneThree < 0 ) + bfactorOneThree = 0; + + int partcountOneThree = 1 << bfactorOneThree; + + for ( int i = 0; i < partcountOneThree; i++ ) + { + CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_core_gpu_phase1<<< grid, block8 >>>( ctx->device_blocks*ctx->device_threads, + bfactorOneThree, i, + ctx->d_long_state, ctx->d_ctx_state, ctx->d_ctx_key1 )); + + if ( partcount > 
1 && ctx->device_bsleep > 0) compat_usleep( ctx->device_bsleep ); + } + if ( partcount > 1 && ctx->device_bsleep > 0) compat_usleep( ctx->device_bsleep ); + + for ( int i = 0; i < partcount; i++ ) + { + CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_core_gpu_phase2<<< + grid, + block4, + block4.x * sizeof(uint32_t) * static_cast< int >( ctx->device_arch[0] < 3 ) + >>>( + ctx->device_blocks*ctx->device_threads, + ctx->device_bfactor, + i, + ctx->d_long_state, + ctx->d_ctx_a, + ctx->d_ctx_b + )); + + if ( partcount > 1 && ctx->device_bsleep > 0) compat_usleep( ctx->device_bsleep ); + } + + for ( int i = 0; i < partcountOneThree; i++ ) + { + CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_core_gpu_phase3<<< grid, block8 >>>( ctx->device_blocks*ctx->device_threads, + bfactorOneThree, i, + ctx->d_long_state, + ctx->d_ctx_state, ctx->d_ctx_key2 )); + } +} diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_device.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_device.hpp new file mode 100644 index 0000000..078c165 --- /dev/null +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_device.hpp @@ -0,0 +1,30 @@ + +#pragma once + +#include <cuda_runtime.h> +#include <stdexcept> +#include <iostream> +#include <string> + +/** execute and check a CUDA api command + * + * on failure the error is reported on `std::cerr` and a `std::runtime_error` is thrown; + * the macro expands to a single statement and must be followed by a semicolon + * + * @param id gpu id (thread id) + * @param ... CUDA api command + */ +#define CUDA_CHECK(id, ...) do { \ + cudaError_t cudaCheckError_ = __VA_ARGS__; \ + if(cudaCheckError_!=cudaSuccess){ \ + std::cerr << "[CUDA] Error gpu " << (id) << ": <" << __FILE__ << ">:" << __LINE__ << std::endl; \ + throw std::runtime_error(std::string("[CUDA] Error: ") + std::string(cudaGetErrorString(cudaCheckError_))); \ + } \ +} \ +while(0) + +/** execute and check a CUDA kernel + * + * @param id gpu id (thread id) + * @param ... CUDA kernel call + */ +#define CUDA_CHECK_KERNEL(id, ...) 
\ + __VA_ARGS__; \ + CUDA_CHECK(id, cudaGetLastError()) diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu new file mode 100644 index 0000000..7734473 --- /dev/null +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu @@ -0,0 +1,367 @@ +#include <stdio.h> +#include <stdint.h> +#include <string.h> +#include <cuda.h> +#include <cuda_runtime.h> +#include <device_functions.hpp> +#include <algorithm> + +#ifdef __CUDACC__ +__constant__ +#else +const +#endif +uint64_t keccakf_rndc[24] ={ + 0x0000000000000001, 0x0000000000008082, 0x800000000000808a, + 0x8000000080008000, 0x000000000000808b, 0x0000000080000001, + 0x8000000080008081, 0x8000000000008009, 0x000000000000008a, + 0x0000000000000088, 0x0000000080008009, 0x000000008000000a, + 0x000000008000808b, 0x800000000000008b, 0x8000000000008089, + 0x8000000000008003, 0x8000000000008002, 0x8000000000000080, + 0x000000000000800a, 0x800000008000000a, 0x8000000080008081, + 0x8000000000008080, 0x0000000080000001, 0x8000000080008008 +}; + +typedef unsigned char BitSequence; +typedef unsigned long long DataLength; + +#include "cryptonight.hpp" +#include "cuda_extra.hpp" +#include "cuda_keccak.hpp" +#include "cuda_blake.hpp" +#include "cuda_groestl.hpp" +#include "cuda_jh.hpp" +#include "cuda_skein.hpp" +#include "cuda_device.hpp" + +__constant__ uint8_t d_sub_byte[16][16] ={ + {0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76 }, + {0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0 }, + {0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15 }, + {0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75 }, + {0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84 }, + {0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf }, + 
{0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8 }, + {0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2 }, + {0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73 }, + {0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb }, + {0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79 }, + {0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08 }, + {0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a }, + {0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e }, + {0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf }, + {0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16 } +}; + +__device__ __forceinline__ void cryptonight_aes_set_key( uint32_t * __restrict__ key, const uint32_t * __restrict__ data ) +{ + int i, j; + uint8_t temp[4]; + const uint32_t aes_gf[] = { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36 }; + + MEMSET4( key, 0, 40 ); + MEMCPY4( key, data, 8 ); + +#pragma unroll + for ( i = 8; i < 40; i++ ) + { + *(uint32_t *) temp = key[i - 1]; + if ( i % 8 == 0 ) + { + *(uint32_t *) temp = ROTR32( *(uint32_t *) temp, 8 ); + for ( j = 0; j < 4; j++ ) + temp[j] = d_sub_byte[( temp[j] >> 4 ) & 0x0f][temp[j] & 0x0f]; + *(uint32_t *) temp ^= aes_gf[i / 8 - 1]; + } + else + { + if ( i % 8 == 4 ) + { +#pragma unroll + for ( j = 0; j < 4; j++ ) + temp[j] = d_sub_byte[( temp[j] >> 4 ) & 0x0f][temp[j] & 0x0f]; + } + } + + key[i] = key[( i - 8 )] ^ *(uint32_t *) temp; + } +} + +__global__ void cryptonight_extra_gpu_prepare( int threads, uint32_t * __restrict__ d_input, uint32_t len, uint32_t startNonce, uint32_t * __restrict__ d_ctx_state, 
uint32_t * __restrict__ d_ctx_a, uint32_t * __restrict__ d_ctx_b, uint32_t * __restrict__ d_ctx_key1, uint32_t * __restrict__ d_ctx_key2 ) +{ + int thread = ( blockDim.x * blockIdx.x + threadIdx.x ); + + if ( thread >= threads ) + return; + + uint32_t ctx_state[50]; + uint32_t ctx_a[4]; + uint32_t ctx_b[4]; + uint32_t ctx_key1[40]; + uint32_t ctx_key2[40]; + uint32_t input[21]; + + memcpy( input, d_input, len ); + //*((uint32_t *)(((char *)input) + 39)) = startNonce + thread; + uint32_t nonce = startNonce + thread; + for ( int i = 0; i < sizeof (uint32_t ); ++i ) + ( ( (char *) input ) + 39 )[i] = ( (char*) ( &nonce ) )[i]; //take care of pointer alignment + + cn_keccak( (uint8_t *) input, len, (uint8_t *) ctx_state ); + cryptonight_aes_set_key( ctx_key1, ctx_state ); + cryptonight_aes_set_key( ctx_key2, ctx_state + 8 ); + XOR_BLOCKS_DST( ctx_state, ctx_state + 8, ctx_a ); + XOR_BLOCKS_DST( ctx_state + 4, ctx_state + 12, ctx_b ); + + memcpy( d_ctx_state + thread * 50, ctx_state, 50 * 4 ); + memcpy( d_ctx_a + thread * 4, ctx_a, 4 * 4 ); + memcpy( d_ctx_b + thread * 4, ctx_b, 4 * 4 ); + memcpy( d_ctx_key1 + thread * 40, ctx_key1, 40 * 4 ); + memcpy( d_ctx_key2 + thread * 40, ctx_key2, 40 * 4 ); +} + +__global__ void cryptonight_extra_gpu_final( int threads, uint64_t target, uint32_t* __restrict__ d_res_count, uint32_t * __restrict__ d_res_nonce, uint32_t * __restrict__ d_ctx_state ) +{ + const int thread = blockDim.x * blockIdx.x + threadIdx.x; + + if ( thread >= threads ) + return; + + int i; + uint32_t * __restrict__ ctx_state = d_ctx_state + thread * 50; + uint64_t hash[4]; + uint32_t state[50]; + +#pragma unroll + for ( i = 0; i < 50; i++ ) + state[i] = ctx_state[i]; + + cn_keccakf2( (uint64_t *) state ); + + switch ( ( (uint8_t *) state )[0] & 0x03 ) + { + case 0: + cn_blake( (const uint8_t *) state, 200, (uint8_t *) hash ); + break; + case 1: + cn_groestl( (const BitSequence *) state, 200, (BitSequence *) hash ); + break; + case 2: + cn_jh( (const BitSequence 
*) state, 200, (BitSequence *) hash ); + break; + case 3: + cn_skein( (const BitSequence *) state, 200, (BitSequence *) hash ); + break; + default: + break; + } + + // Note that comparison is equivalent to subtraction - we can't just compare 8 32-bit values + // and expect an accurate result for target > 32-bit without implementing carries + + if ( hash[3] < target ) + { + uint32_t idx = atomicInc( d_res_count, 0xFFFFFFFF ); + + if(idx < 10) + d_res_nonce[idx] = thread; + } +} + +/** copy the input data (`len` bytes) to the device and remember its length in `ctx->inputlen` + * + * a `std::runtime_error` is thrown if the data does not fit into the device input buffer + */ +extern "C" void cryptonight_extra_cpu_set_data( nvid_ctx* ctx, const void *data, uint32_t len ) +{ + /* `d_input` is allocated with `21 * sizeof(uint32_t)` bytes in `cryptonight_extra_cpu_init`, + * a longer input would write behind the end of the device buffer + */ + if( len > 21 * sizeof (uint32_t ) ) + throw std::runtime_error(std::string("[CUDA] Error: input data is larger than the device input buffer")); + + ctx->inputlen = len; + CUDA_CHECK(ctx->device_id, cudaMemcpy( ctx->d_input, data, len, cudaMemcpyHostToDevice )); +} + +extern "C" int cryptonight_extra_cpu_init(nvid_ctx* ctx) +{ + cudaError_t err; + err = cudaSetDevice(ctx->device_id); + if(err != cudaSuccess) + { + printf("GPU %d: %s\n", ctx->device_id, cudaGetErrorString(err)); + return 0; + } + + cudaDeviceReset(); + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + cudaDeviceSetCacheConfig(cudaFuncCachePreferL1); + + size_t wsize = ctx->device_blocks * ctx->device_threads; + CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_long_state, (size_t)MEMORY * wsize)); + CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_ctx_state, 50 * sizeof(uint32_t) * wsize)); + CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_ctx_key1, 40 * sizeof(uint32_t) * wsize)); + CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_ctx_key2, 40 * sizeof(uint32_t) * wsize)); + CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_ctx_text, 32 * sizeof(uint32_t) * wsize)); + CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_ctx_a, 4 * sizeof(uint32_t) * wsize)); + CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_ctx_b, 4 * sizeof(uint32_t) * wsize)); + CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_input, 21 * sizeof (uint32_t ) )); + CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_result_count, sizeof (uint32_t ) )); + CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_result_nonce, 10 * sizeof (uint32_t 
) )); + return 1; +} + +extern "C" void cryptonight_extra_cpu_prepare(nvid_ctx* ctx, uint32_t startNonce) +{ + int threadsperblock = 128; + uint32_t wsize = ctx->device_blocks * ctx->device_threads; + + dim3 grid( ( wsize + threadsperblock - 1 ) / threadsperblock ); + dim3 block( threadsperblock ); + + CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<<<grid, block >>>( wsize, ctx->d_input, ctx->inputlen, startNonce, + ctx->d_ctx_state, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 )); +} + +extern "C" void cryptonight_extra_cpu_final(nvid_ctx* ctx, uint32_t startNonce, uint64_t target, uint32_t* rescount, uint32_t *resnonce) +{ + int threadsperblock = 128; + uint32_t wsize = ctx->device_blocks * ctx->device_threads; + + dim3 grid( ( wsize + threadsperblock - 1 ) / threadsperblock ); + dim3 block( threadsperblock ); + + CUDA_CHECK(ctx->device_id, cudaMemset( ctx->d_result_nonce, 0xFF, 10 * sizeof (uint32_t ) )); + CUDA_CHECK(ctx->device_id, cudaMemset( ctx->d_result_count, 0, sizeof (uint32_t ) )); + + CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_final<<<grid, block >>>( wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state )); + + CUDA_CHECK(ctx->device_id, cudaMemcpy( rescount, ctx->d_result_count, sizeof (uint32_t ), cudaMemcpyDeviceToHost )); + CUDA_CHECK(ctx->device_id, cudaMemcpy( resnonce, ctx->d_result_nonce, 10 * sizeof (uint32_t ), cudaMemcpyDeviceToHost )); + + /* There is only a 32bit limit for the counter on the device side + * therefore this value can be greater than 10, in that case limit rescount + * to 10 entries. 
+ */ + if(*rescount > 10) + *rescount = 10; + for(int i=0; i < *rescount; i++) + resnonce[i] += startNonce; +} + +extern "C" int cuda_get_devicecount( int* deviceCount) +{ + cudaError_t err; + *deviceCount = 0; + err = cudaGetDeviceCount(deviceCount); + if(err != cudaSuccess) + { + if(err == cudaErrorNoDevice) + printf("ERROR: NVIDIA no CUDA device found!\n"); + else if(err == cudaErrorInsufficientDriver) + printf("WARNING: NVIDIA Insufficient driver!\n"); + else + printf("WARNING: NVIDIA Unable to query number of CUDA devices!\n"); + return 0; + } + + return 1; +} + +/** query the properties of the device `ctx->device_id` and fill in the auto (-1) options + * + * checks the driver version and the device id, stores name, multiprocessor count and + * compute architecture in `ctx` and derives `device_blocks` / `device_threads` if they are -1 + * + * @return 1 on success, 0 if the driver or the device can not be used + */ +extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx) +{ + cudaError_t err; + int version; + + err = cudaDriverGetVersion(&version); + if(err != cudaSuccess) + { + printf("Unable to query CUDA driver version! Is an nVidia driver installed?\n"); + return 0; + } + + if(version < CUDART_VERSION) + { + printf("Driver does not support CUDA %d.%d API! Update your nVidia driver!\n", CUDART_VERSION / 1000, (CUDART_VERSION % 1000) / 10); + return 0; + } + + int GPU_N; + if(cuda_get_devicecount(&GPU_N) == 0) + { + return 0; + } + + if(ctx->device_id < 0 || ctx->device_id >= GPU_N) + { + printf("Invalid device ID!\n"); + return 0; + } + + cudaDeviceProp props; + err = cudaGetDeviceProperties(&props, ctx->device_id); + if(err != cudaSuccess) + { + printf("\nGPU %d: %s\n%s line %d\n", ctx->device_id, cudaGetErrorString(err), __FILE__, __LINE__); + return 0; + } + + ctx->device_name = strdup(props.name); + ctx->device_mpcount = props.multiProcessorCount; + ctx->device_arch[0] = props.major; + ctx->device_arch[1] = props.minor; + + ctx->name = std::string(props.name); + + // set all device options that are marked as auto (-1) to a valid value + if(ctx->device_blocks == -1) + { + /* good values based on my experience + * - 3 * SMX count >=sm_30 + * - 2 * SMX count for <sm_30 + */ + ctx->device_blocks = props.multiProcessorCount * + ( props.major < 3 ? 
2 : 3 ); + } + if(ctx->device_threads == -1) + { + /* sm_20 devices can only run 512 threads per cuda block + * `cryptonight_core_gpu_phase1` and `cryptonight_core_gpu_phase3` start + * `8 * ctx->device_threads` threads per block + */ + ctx->device_threads = 64; + constexpr size_t byte2mib = 1024u * 1024u; + + // no limit by default (1TiB) + size_t maxMemUsage = byte2mib * byte2mib; + if(props.major < 6) + { + // limit memory usage for GPUs before pascal + maxMemUsage = size_t(2048u) * byte2mib; + } + if(props.major == 2) + { + // limit memory usage for sm 20 GPUs + maxMemUsage = size_t(1024u) * byte2mib; + } + + size_t freeMemory = 0; + size_t totalMemory = 0; + // cudaMemGetInfo reports the memory of the current device, therefore select the device first + CUDA_CHECK(ctx->device_id, cudaSetDevice(ctx->device_id)); + CUDA_CHECK(ctx->device_id, cudaMemGetInfo(&freeMemory, &totalMemory)); + + ctx->total_device_memory = totalMemory; + ctx->free_device_memory = freeMemory; + + // keep 64MiB memory free (value is randomly chosen) + // 200byte are meta data memory (result nonce, ...) + const size_t reservedMem = size_t(64u) * byte2mib + 200u; + // avoid an unsigned wrap around if less memory than the reserve is free + size_t availableMem = freeMemory > reservedMem ? freeMemory - reservedMem : 0u; + size_t limitedMemory = std::min(availableMem, maxMemUsage); + // up to 920bytes extra memory is used per thread for some kernel (lmem/local memory) + // NOTE(review): only 740 bytes are budgeted below, confirm which value is intended + // 680bytes are extra meta data memory per hash + size_t perThread = size_t(MEMORY) + 740u + 680u; + size_t max_intensity = limitedMemory / perThread; + ctx->device_threads = max_intensity / ctx->device_blocks; + // use only an even number of threads (the lowest bit is cleared) + ctx->device_threads = ctx->device_threads & 0xFFFFFFFE; + + if(props.major == 2 && ctx->device_threads > 64) + { + // Fermi gpus only support 512 threads per block (we need start 8 * configured threads) + ctx->device_threads = 64; + } + + } + + return 1; +} diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.hpp new file mode 100644 index 0000000..3ccdcd6 --- /dev/null +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.hpp @@ -0,0 +1,104 @@ +#pragma once + +#ifdef __INTELLISENSE__ +#define __CUDA_ARCH__ 520 +/* avoid red underlining */ 
+ +struct uint3 +{ + unsigned int x, y, z; +}; + +struct uint3 threadIdx; +struct uint3 blockIdx; +struct uint3 blockDim; +#define __funnelshift_r(a,b,c) 1 +#define __syncthreads() +#define asm(x) +#define __shfl(a,b,c) 1 +#endif + +#define MEMORY (1 << 21) // 2 MiB / 2097152 B +#define ITER (1 << 20) // 1048576 +#define AES_BLOCK_SIZE 16 +#define AES_KEY_SIZE 32 +#define INIT_SIZE_BLK 8 +#define INIT_SIZE_BYTE (INIT_SIZE_BLK * AES_BLOCK_SIZE) // 128 B + +#define C32(x) ((uint32_t)(x ## U)) +#define T32(x) ((x) & C32(0xFFFFFFFF)) + +#if __CUDA_ARCH__ >= 350 +__forceinline__ __device__ uint64_t cuda_ROTL64(const uint64_t value, const int offset) +{ + uint2 result; + if(offset >= 32) + { + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); + } + else + { + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); + } + return __double_as_longlong(__hiloint2double(result.y, result.x)); +} +#define ROTL64(x, n) (cuda_ROTL64(x, n)) +#else +#define ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n)))) +#endif + +#if __CUDA_ARCH__ < 350 +#define ROTL32(x, n) T32(((x) << (n)) | ((x) >> (32 - (n)))) +#define ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n)))) +#else +#define ROTL32(x, n) __funnelshift_l( (x), (x), (n) ) +#define ROTR32(x, n) __funnelshift_r( (x), (x), (n) ) +#endif + +#define MEMSET8(dst,what,cnt) { \ + int i_memset8; \ + uint64_t *out_memset8 = (uint64_t *)(dst); \ + for( i_memset8 = 0; i_memset8 < 
(cnt); i_memset8++ ) \ + out_memset8[i_memset8] = (what); } + +/* store `what` into `cnt` consecutive 32bit words starting at `dst` */ +#define MEMSET4(dst,what,cnt) { \ + int i_memset4; \ + uint32_t *out_memset4 = (uint32_t *)(dst); \ + for( i_memset4 = 0; i_memset4 < (cnt); i_memset4++ ) \ + out_memset4[i_memset4] = (what); } + +/* copy `cnt` 64bit words from `src` to `dst` */ +#define MEMCPY8(dst,src,cnt) { \ + int i_memcpy8; \ + uint64_t *in_memcpy8 = (uint64_t *)(src); \ + uint64_t *out_memcpy8 = (uint64_t *)(dst); \ + for( i_memcpy8 = 0; i_memcpy8 < (cnt); i_memcpy8++ ) \ + out_memcpy8[i_memcpy8] = in_memcpy8[i_memcpy8]; } + +/* copy `cnt` 32bit words from `src` to `dst` */ +#define MEMCPY4(dst,src,cnt) { \ + int i_memcpy4; \ + uint32_t *in_memcpy4 = (uint32_t *)(src); \ + uint32_t *out_memcpy4 = (uint32_t *)(dst); \ + for( i_memcpy4 = 0; i_memcpy4 < (cnt); i_memcpy4++ ) \ + out_memcpy4[i_memcpy4] = in_memcpy4[i_memcpy4]; } + +/* xor the 16 bytes at `b` into the 16 bytes at `a` + * (arguments are parenthesized so that expressions like `p + 4` are cast as a whole) + */ +#define XOR_BLOCKS(a,b) { \ + ((uint64_t *)(a))[0] ^= ((uint64_t *)(b))[0]; \ + ((uint64_t *)(a))[1] ^= ((uint64_t *)(b))[1]; } + +/* store the xor of the 16 byte blocks at `x` and `y` in the 16 bytes at `z` */ +#define XOR_BLOCKS_DST(x,y,z) { \ + ((uint64_t *)(z))[0] = ((uint64_t *)(x))[0] ^ ((uint64_t *)(y))[0]; \ + ((uint64_t *)(z))[1] = ((uint64_t *)(x))[1] ^ ((uint64_t *)(y))[1]; } + +/* multiply the first 64bit word of `a` with the first 64bit word of `dst` (128bit result), + * add the two 64bit words of `c` (hi += c[0], lo += c[1], no carry between the words), + * store the sum in `dst` and set `c` to the previous content of `dst` xor the sum + */ +#define MUL_SUM_XOR_DST(a,c,dst) { \ + const uint64_t dst0 = ((uint64_t *)(dst))[0]; \ + uint64_t hi, lo = cuda_mul128(((uint64_t *)(a))[0], dst0, &hi) + ((uint64_t *)(c))[1]; \ + hi += ((uint64_t *)(c))[0]; \ + ((uint64_t *)(c))[0] = dst0 ^ hi; \ + ((uint64_t *)(dst))[0] = hi; \ + ((uint64_t *)(c))[1] = atomicExch(((unsigned long long int *)(dst)) + 1, (unsigned long long int)lo) ^ lo; \ + } + +/* bits 4..20 of the first 64bit word at `x` */ +#define E2I(x) ((size_t)(((*((uint64_t*)(x)) >> 4) & 0x1ffff))) + diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_groestl.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_groestl.hpp new file mode 100644 index 0000000..a37934c --- /dev/null +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_groestl.hpp @@ -0,0 +1,357 @@ +#pragma once + +#define GROESTL_ROWS 8 +#define GROESTL_LENGTHFIELDLEN GROESTL_ROWS +#define GROESTL_COLS512 8 + +#define GROESTL_SIZE512 (GROESTL_ROWS*GROESTL_COLS512) + +#define GROESTL_ROUNDS512 10 +#define GROESTL_HASH_BIT_LEN 256 + 
+#define GROESTL_ROTL32(v, n) ROTL32(v, n) + + +#define li_32(h) 0x##h##u +#define GROESTL_EXT_BYTE(var,n) ((uint8_t)((uint32_t)(var) >> (8*n))) + +#define u32BIG(a) \ + ((GROESTL_ROTL32(a,8) & li_32(00FF00FF)) | (GROESTL_ROTL32(a,24) & li_32(FF00FF00))) + +typedef struct { + uint32_t chaining[GROESTL_SIZE512/sizeof(uint32_t)]; /* actual state */ + uint32_t block_counter1, + block_counter2; /* message block counter(s) */ + BitSequence buffer[GROESTL_SIZE512]; /* data buffer */ + int buf_ptr; /* data buffer pointer */ + int bits_in_last_byte; /* no. of message bits in last byte of data buffer */ +} groestlHashState; + + +__constant__ uint32_t d_groestl_T[512] = +{ + 0xa5f432c6, 0xc6a597f4, 0x84976ff8, 0xf884eb97, 0x99b05eee, 0xee99c7b0, 0x8d8c7af6, 0xf68df78c, 0xd17e8ff, 0xff0de517, 0xbddc0ad6, 0xd6bdb7dc, 0xb1c816de, 0xdeb1a7c8, 0x54fc6d91, 0x915439fc +, 0x50f09060, 0x6050c0f0, 0x3050702, 0x2030405, 0xa9e02ece, 0xcea987e0, 0x7d87d156, 0x567dac87, 0x192bcce7, 0xe719d52b, 0x62a613b5, 0xb56271a6, 0xe6317c4d, 0x4de69a31, 0x9ab559ec, 0xec9ac3b5 +, 0x45cf408f, 0x8f4505cf, 0x9dbca31f, 0x1f9d3ebc, 0x40c04989, 0x894009c0, 0x879268fa, 0xfa87ef92, 0x153fd0ef, 0xef15c53f, 0xeb2694b2, 0xb2eb7f26, 0xc940ce8e, 0x8ec90740, 0xb1de6fb, 0xfb0bed1d +, 0xec2f6e41, 0x41ec822f, 0x67a91ab3, 0xb3677da9, 0xfd1c435f, 0x5ffdbe1c, 0xea256045, 0x45ea8a25, 0xbfdaf923, 0x23bf46da, 0xf7025153, 0x53f7a602, 0x96a145e4, 0xe496d3a1, 0x5bed769b, 0x9b5b2ded +, 0xc25d2875, 0x75c2ea5d, 0x1c24c5e1, 0xe11cd924, 0xaee9d43d, 0x3dae7ae9, 0x6abef24c, 0x4c6a98be, 0x5aee826c, 0x6c5ad8ee, 0x41c3bd7e, 0x7e41fcc3, 0x206f3f5, 0xf502f106, 0x4fd15283, 0x834f1dd1 +, 0x5ce48c68, 0x685cd0e4, 0xf4075651, 0x51f4a207, 0x345c8dd1, 0xd134b95c, 0x818e1f9, 0xf908e918, 0x93ae4ce2, 0xe293dfae, 0x73953eab, 0xab734d95, 0x53f59762, 0x6253c4f5, 0x3f416b2a, 0x2a3f5441 +, 0xc141c08, 0x80c1014, 0x52f66395, 0x955231f6, 0x65afe946, 0x46658caf, 0x5ee27f9d, 0x9d5e21e2, 0x28784830, 0x30286078, 0xa1f8cf37, 0x37a16ef8, 0xf111b0a, 0xa0f1411, 
0xb5c4eb2f, 0x2fb55ec4 +, 0x91b150e, 0xe091c1b, 0x365a7e24, 0x2436485a, 0x9bb6ad1b, 0x1b9b36b6, 0x3d4798df, 0xdf3da547, 0x266aa7cd, 0xcd26816a, 0x69bbf54e, 0x4e699cbb, 0xcd4c337f, 0x7fcdfe4c, 0x9fba50ea, 0xea9fcfba +, 0x1b2d3f12, 0x121b242d, 0x9eb9a41d, 0x1d9e3ab9, 0x749cc458, 0x5874b09c, 0x2e724634, 0x342e6872, 0x2d774136, 0x362d6c77, 0xb2cd11dc, 0xdcb2a3cd, 0xee299db4, 0xb4ee7329, 0xfb164d5b, 0x5bfbb616 +, 0xf601a5a4, 0xa4f65301, 0x4dd7a176, 0x764decd7, 0x61a314b7, 0xb76175a3, 0xce49347d, 0x7dcefa49, 0x7b8ddf52, 0x527ba48d, 0x3e429fdd, 0xdd3ea142, 0x7193cd5e, 0x5e71bc93, 0x97a2b113, 0x139726a2 +, 0xf504a2a6, 0xa6f55704, 0x68b801b9, 0xb96869b8, 0x0, 0x0, 0x2c74b5c1, 0xc12c9974, 0x60a0e040, 0x406080a0, 0x1f21c2e3, 0xe31fdd21, 0xc8433a79, 0x79c8f243, 0xed2c9ab6, 0xb6ed772c +, 0xbed90dd4, 0xd4beb3d9, 0x46ca478d, 0x8d4601ca, 0xd9701767, 0x67d9ce70, 0x4bddaf72, 0x724be4dd, 0xde79ed94, 0x94de3379, 0xd467ff98, 0x98d42b67, 0xe82393b0, 0xb0e87b23, 0x4ade5b85, 0x854a11de +, 0x6bbd06bb, 0xbb6b6dbd, 0x2a7ebbc5, 0xc52a917e, 0xe5347b4f, 0x4fe59e34, 0x163ad7ed, 0xed16c13a, 0xc554d286, 0x86c51754, 0xd762f89a, 0x9ad72f62, 0x55ff9966, 0x6655ccff, 0x94a7b611, 0x119422a7 +, 0xcf4ac08a, 0x8acf0f4a, 0x1030d9e9, 0xe910c930, 0x60a0e04, 0x406080a, 0x819866fe, 0xfe81e798, 0xf00baba0, 0xa0f05b0b, 0x44ccb478, 0x7844f0cc, 0xbad5f025, 0x25ba4ad5, 0xe33e754b, 0x4be3963e +, 0xf30eaca2, 0xa2f35f0e, 0xfe19445d, 0x5dfeba19, 0xc05bdb80, 0x80c01b5b, 0x8a858005, 0x58a0a85, 0xadecd33f, 0x3fad7eec, 0xbcdffe21, 0x21bc42df, 0x48d8a870, 0x7048e0d8, 0x40cfdf1, 0xf104f90c +, 0xdf7a1963, 0x63dfc67a, 0xc1582f77, 0x77c1ee58, 0x759f30af, 0xaf75459f, 0x63a5e742, 0x426384a5, 0x30507020, 0x20304050, 0x1a2ecbe5, 0xe51ad12e, 0xe12effd, 0xfd0ee112, 0x6db708bf, 0xbf6d65b7 +, 0x4cd45581, 0x814c19d4, 0x143c2418, 0x1814303c, 0x355f7926, 0x26354c5f, 0x2f71b2c3, 0xc32f9d71, 0xe13886be, 0xbee16738, 0xa2fdc835, 0x35a26afd, 0xcc4fc788, 0x88cc0b4f, 0x394b652e, 0x2e395c4b +, 0x57f96a93, 0x93573df9, 0xf20d5855, 0x55f2aa0d, 
0x829d61fc, 0xfc82e39d, 0x47c9b37a, 0x7a47f4c9, 0xacef27c8, 0xc8ac8bef, 0xe73288ba, 0xbae76f32, 0x2b7d4f32, 0x322b647d, 0x95a442e6, 0xe695d7a4 +, 0xa0fb3bc0, 0xc0a09bfb, 0x98b3aa19, 0x199832b3, 0xd168f69e, 0x9ed12768, 0x7f8122a3, 0xa37f5d81, 0x66aaee44, 0x446688aa, 0x7e82d654, 0x547ea882, 0xabe6dd3b, 0x3bab76e6, 0x839e950b, 0xb83169e +, 0xca45c98c, 0x8cca0345, 0x297bbcc7, 0xc729957b, 0xd36e056b, 0x6bd3d66e, 0x3c446c28, 0x283c5044, 0x798b2ca7, 0xa779558b, 0xe23d81bc, 0xbce2633d, 0x1d273116, 0x161d2c27, 0x769a37ad, 0xad76419a +, 0x3b4d96db, 0xdb3bad4d, 0x56fa9e64, 0x6456c8fa, 0x4ed2a674, 0x744ee8d2, 0x1e223614, 0x141e2822, 0xdb76e492, 0x92db3f76, 0xa1e120c, 0xc0a181e, 0x6cb4fc48, 0x486c90b4, 0xe4378fb8, 0xb8e46b37 +, 0x5de7789f, 0x9f5d25e7, 0x6eb20fbd, 0xbd6e61b2, 0xef2a6943, 0x43ef862a, 0xa6f135c4, 0xc4a693f1, 0xa8e3da39, 0x39a872e3, 0xa4f7c631, 0x31a462f7, 0x37598ad3, 0xd337bd59, 0x8b8674f2, 0xf28bff86 +, 0x325683d5, 0xd532b156, 0x43c54e8b, 0x8b430dc5, 0x59eb856e, 0x6e59dceb, 0xb7c218da, 0xdab7afc2, 0x8c8f8e01, 0x18c028f, 0x64ac1db1, 0xb16479ac, 0xd26df19c, 0x9cd2236d, 0xe03b7249, 0x49e0923b +, 0xb4c71fd8, 0xd8b4abc7, 0xfa15b9ac, 0xacfa4315, 0x709faf3, 0xf307fd09, 0x256fa0cf, 0xcf25856f, 0xafea20ca, 0xcaaf8fea, 0x8e897df4, 0xf48ef389, 0xe9206747, 0x47e98e20, 0x18283810, 0x10182028 +, 0xd5640b6f, 0x6fd5de64, 0x888373f0, 0xf088fb83, 0x6fb1fb4a, 0x4a6f94b1, 0x7296ca5c, 0x5c72b896, 0x246c5438, 0x3824706c, 0xf1085f57, 0x57f1ae08, 0xc7522173, 0x73c7e652, 0x51f36497, 0x975135f3 +, 0x2365aecb, 0xcb238d65, 0x7c8425a1, 0xa17c5984, 0x9cbf57e8, 0xe89ccbbf, 0x21635d3e, 0x3e217c63, 0xdd7cea96, 0x96dd377c, 0xdc7f1e61, 0x61dcc27f, 0x86919c0d, 0xd861a91, 0x85949b0f, 0xf851e94 +, 0x90ab4be0, 0xe090dbab, 0x42c6ba7c, 0x7c42f8c6, 0xc4572671, 0x71c4e257, 0xaae529cc, 0xccaa83e5, 0xd873e390, 0x90d83b73, 0x50f0906, 0x6050c0f, 0x103f4f7, 0xf701f503, 0x12362a1c, 0x1c123836 +, 0xa3fe3cc2, 0xc2a39ffe, 0x5fe18b6a, 0x6a5fd4e1, 0xf910beae, 0xaef94710, 0xd06b0269, 0x69d0d26b, 0x91a8bf17, 
0x17912ea8, 0x58e87199, 0x995829e8, 0x2769533a, 0x3a277469, 0xb9d0f727, 0x27b94ed0 +, 0x384891d9, 0xd938a948, 0x1335deeb, 0xeb13cd35, 0xb3cee52b, 0x2bb356ce, 0x33557722, 0x22334455, 0xbbd604d2, 0xd2bbbfd6, 0x709039a9, 0xa9704990, 0x89808707, 0x7890e80, 0xa7f2c133, 0x33a766f2 +, 0xb6c1ec2d, 0x2db65ac1, 0x22665a3c, 0x3c227866, 0x92adb815, 0x15922aad, 0x2060a9c9, 0xc9208960, 0x49db5c87, 0x874915db, 0xff1ab0aa, 0xaaff4f1a, 0x7888d850, 0x5078a088, 0x7a8e2ba5, 0xa57a518e +, 0x8f8a8903, 0x38f068a, 0xf8134a59, 0x59f8b213, 0x809b9209, 0x980129b, 0x1739231a, 0x1a173439, 0xda751065, 0x65daca75, 0x315384d7, 0xd731b553, 0xc651d584, 0x84c61351, 0xb8d303d0, 0xd0b8bbd3 +, 0xc35edc82, 0x82c31f5e, 0xb0cbe229, 0x29b052cb, 0x7799c35a, 0x5a77b499, 0x11332d1e, 0x1e113c33, 0xcb463d7b, 0x7bcbf646, 0xfc1fb7a8, 0xa8fc4b1f, 0xd6610c6d, 0x6dd6da61, 0x3a4e622c, 0x2c3a584e +}; + +#define GROESTL_ROTATE_COLUMN_DOWN(v1, v2, amount_bytes, temp_var) \ + { temp_var = (v1<<(8*amount_bytes))|(v2>>(8*(4-amount_bytes))); \ + v2 = (v2<<(8*amount_bytes))|(v1>>(8*(4-amount_bytes))); \ + v1 = temp_var; } + +#define GROESTL_COLUMN(x,y,i,c0,c1,c2,c3,c4,c5,c6,c7,tv1,tv2,tu,tl,t) \ + tu = d_groestl_T[2*(uint32_t)x[4*c0+0]]; \ + tl = d_groestl_T[2*(uint32_t)x[4*c0+0]+1]; \ + tv1 = d_groestl_T[2*(uint32_t)x[4*c1+1]]; \ + tv2 = d_groestl_T[2*(uint32_t)x[4*c1+1]+1]; \ + GROESTL_ROTATE_COLUMN_DOWN(tv1,tv2,1,t) \ + tu ^= tv1; \ + tl ^= tv2; \ + tv1 = d_groestl_T[2*(uint32_t)x[4*c2+2]]; \ + tv2 = d_groestl_T[2*(uint32_t)x[4*c2+2]+1]; \ + GROESTL_ROTATE_COLUMN_DOWN(tv1,tv2,2,t) \ + tu ^= tv1; \ + tl ^= tv2; \ + tv1 = d_groestl_T[2*(uint32_t)x[4*c3+3]]; \ + tv2 = d_groestl_T[2*(uint32_t)x[4*c3+3]+1]; \ + GROESTL_ROTATE_COLUMN_DOWN(tv1,tv2,3,t) \ + tu ^= tv1; \ + tl ^= tv2; \ + tl ^= d_groestl_T[2*(uint32_t)x[4*c4+0]]; \ + tu ^= d_groestl_T[2*(uint32_t)x[4*c4+0]+1]; \ + tv1 = d_groestl_T[2*(uint32_t)x[4*c5+1]]; \ + tv2 = d_groestl_T[2*(uint32_t)x[4*c5+1]+1]; \ + GROESTL_ROTATE_COLUMN_DOWN(tv1,tv2,1,t) \ + tl ^= tv1; \ + 
tu ^= tv2; \ + tv1 = d_groestl_T[2*(uint32_t)x[4*c6+2]]; \ + tv2 = d_groestl_T[2*(uint32_t)x[4*c6+2]+1]; \ + GROESTL_ROTATE_COLUMN_DOWN(tv1,tv2,2,t) \ + tl ^= tv1; \ + tu ^= tv2; \ + tv1 = d_groestl_T[2*(uint32_t)x[4*c7+3]]; \ + tv2 = d_groestl_T[2*(uint32_t)x[4*c7+3]+1]; \ + GROESTL_ROTATE_COLUMN_DOWN(tv1,tv2,3,t) \ + tl ^= tv1; \ + tu ^= tv2; \ + y[i] = tu; \ + y[i+1] = tl; + +__device__ void cn_groestl_RND512P(uint8_t * __restrict__ x, uint32_t * __restrict__ y, uint32_t r) +{ + uint32_t temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp; + uint32_t* x32 = (uint32_t*)x; + x32[ 0] ^= 0x00000000^r; + x32[ 2] ^= 0x00000010^r; + x32[ 4] ^= 0x00000020^r; + x32[ 6] ^= 0x00000030^r; + x32[ 8] ^= 0x00000040^r; + x32[10] ^= 0x00000050^r; + x32[12] ^= 0x00000060^r; + x32[14] ^= 0x00000070^r; + GROESTL_COLUMN(x,y, 0, 0, 2, 4, 6, 9, 11, 13, 15, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + GROESTL_COLUMN(x,y, 2, 2, 4, 6, 8, 11, 13, 15, 1, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + GROESTL_COLUMN(x,y, 4, 4, 6, 8, 10, 13, 15, 1, 3, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + GROESTL_COLUMN(x,y, 6, 6, 8, 10, 12, 15, 1, 3, 5, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + GROESTL_COLUMN(x,y, 8, 8, 10, 12, 14, 1, 3, 5, 7, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + GROESTL_COLUMN(x,y,10, 10, 12, 14, 0, 3, 5, 7, 9, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + GROESTL_COLUMN(x,y,12, 12, 14, 0, 2, 5, 7, 9, 11, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + GROESTL_COLUMN(x,y,14, 14, 0, 2, 4, 7, 9, 11, 13, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); +} + +__device__ void cn_groestl_RND512Q(uint8_t * __restrict__ x, uint32_t * __restrict__ y, uint32_t r) +{ + uint32_t temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp; + uint32_t* x32 = (uint32_t*)x; + x32[ 0] = ~x32[ 0]; + x32[ 1] ^= 0xffffffff^r; + x32[ 2] = ~x32[ 2]; + x32[ 
3] ^= 0xefffffff^r; + x32[ 4] = ~x32[ 4]; + x32[ 5] ^= 0xdfffffff^r; + x32[ 6] = ~x32[ 6]; + x32[ 7] ^= 0xcfffffff^r; + x32[ 8] = ~x32[ 8]; + x32[ 9] ^= 0xbfffffff^r; + x32[10] = ~x32[10]; + x32[11] ^= 0xafffffff^r; + x32[12] = ~x32[12]; + x32[13] ^= 0x9fffffff^r; + x32[14] = ~x32[14]; + x32[15] ^= 0x8fffffff^r; + GROESTL_COLUMN(x,y, 0, 2, 6, 10, 14, 1, 5, 9, 13, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + GROESTL_COLUMN(x,y, 2, 4, 8, 12, 0, 3, 7, 11, 15, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + GROESTL_COLUMN(x,y, 4, 6, 10, 14, 2, 5, 9, 13, 1, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + GROESTL_COLUMN(x,y, 6, 8, 12, 0, 4, 7, 11, 15, 3, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + GROESTL_COLUMN(x,y, 8, 10, 14, 2, 6, 9, 13, 1, 5, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + GROESTL_COLUMN(x,y,10, 12, 0, 4, 8, 11, 15, 3, 7, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + GROESTL_COLUMN(x,y,12, 14, 2, 6, 10, 13, 1, 5, 9, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + GROESTL_COLUMN(x,y,14, 0, 4, 8, 12, 15, 3, 7, 11, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); +} + +__device__ void cn_groestl_F512(uint32_t * __restrict__ h, const uint32_t * __restrict__ m) +{ + int i; + uint32_t Ptmp[2*GROESTL_COLS512]; + uint32_t Qtmp[2*GROESTL_COLS512]; + uint32_t y[2*GROESTL_COLS512]; + uint32_t z[2*GROESTL_COLS512]; + + for (i = 0; i < 2*GROESTL_COLS512; i++) + { + z[i] = m[i]; + Ptmp[i] = h[i]^m[i]; + } + + cn_groestl_RND512Q((uint8_t*)z, y, 0x00000000); + cn_groestl_RND512Q((uint8_t*)y, z, 0x01000000); + cn_groestl_RND512Q((uint8_t*)z, y, 0x02000000); + cn_groestl_RND512Q((uint8_t*)y, z, 0x03000000); + cn_groestl_RND512Q((uint8_t*)z, y, 0x04000000); + cn_groestl_RND512Q((uint8_t*)y, z, 0x05000000); + cn_groestl_RND512Q((uint8_t*)z, y, 0x06000000); + cn_groestl_RND512Q((uint8_t*)y, z, 0x07000000); + cn_groestl_RND512Q((uint8_t*)z, y, 
0x08000000); + cn_groestl_RND512Q((uint8_t*)y, Qtmp, 0x09000000); + + cn_groestl_RND512P((uint8_t*)Ptmp, y, 0x00000000); + cn_groestl_RND512P((uint8_t*)y, z, 0x00000001); + cn_groestl_RND512P((uint8_t*)z, y, 0x00000002); + cn_groestl_RND512P((uint8_t*)y, z, 0x00000003); + cn_groestl_RND512P((uint8_t*)z, y, 0x00000004); + cn_groestl_RND512P((uint8_t*)y, z, 0x00000005); + cn_groestl_RND512P((uint8_t*)z, y, 0x00000006); + cn_groestl_RND512P((uint8_t*)y, z, 0x00000007); + cn_groestl_RND512P((uint8_t*)z, y, 0x00000008); + cn_groestl_RND512P((uint8_t*)y, Ptmp, 0x00000009); + + for (i = 0; i < 2*GROESTL_COLS512; i++) + h[i] ^= Ptmp[i]^Qtmp[i]; +} + +__device__ void cn_groestl_outputtransformation(groestlHashState *ctx) +{ + int j; + uint32_t temp[2*GROESTL_COLS512]; + uint32_t y[2*GROESTL_COLS512]; + uint32_t z[2*GROESTL_COLS512]; + + for (j = 0; j < 2*GROESTL_COLS512; j++) + temp[j] = ctx->chaining[j]; + + cn_groestl_RND512P((uint8_t*)temp, y, 0x00000000); + cn_groestl_RND512P((uint8_t*)y, z, 0x00000001); + cn_groestl_RND512P((uint8_t*)z, y, 0x00000002); + cn_groestl_RND512P((uint8_t*)y, z, 0x00000003); + cn_groestl_RND512P((uint8_t*)z, y, 0x00000004); + cn_groestl_RND512P((uint8_t*)y, z, 0x00000005); + cn_groestl_RND512P((uint8_t*)z, y, 0x00000006); + cn_groestl_RND512P((uint8_t*)y, z, 0x00000007); + cn_groestl_RND512P((uint8_t*)z, y, 0x00000008); + cn_groestl_RND512P((uint8_t*)y, temp, 0x00000009); + + for (j = 0; j < 2*GROESTL_COLS512; j++) + ctx->chaining[j] ^= temp[j]; +} + +__device__ void cn_groestl_transform(groestlHashState * __restrict__ ctx, + const uint8_t * __restrict__ input, int msglen) +{ + for (; msglen >= GROESTL_SIZE512; msglen -= GROESTL_SIZE512, input += GROESTL_SIZE512) + { + cn_groestl_F512(ctx->chaining,(uint32_t*)input); + ctx->block_counter1++; + + if (ctx->block_counter1 == 0) + ctx->block_counter2++; + } +} + +__device__ void cn_groestl_final(groestlHashState* __restrict__ ctx, + BitSequence* __restrict__ output) +{ + int i, j = 0, hashbytelen 
= GROESTL_HASH_BIT_LEN/8; + uint8_t *s = (BitSequence*)ctx->chaining; + + if (ctx->bits_in_last_byte) + { + ctx->buffer[(int)ctx->buf_ptr-1] &= ((1<<ctx->bits_in_last_byte)-1)<<(8-ctx->bits_in_last_byte); + ctx->buffer[(int)ctx->buf_ptr-1] ^= 0x1<<(7-ctx->bits_in_last_byte); + ctx->bits_in_last_byte = 0; + } + else + { + ctx->buffer[(int)ctx->buf_ptr++] = 0x80; + } + + if (ctx->buf_ptr > GROESTL_SIZE512-GROESTL_LENGTHFIELDLEN) + { + while (ctx->buf_ptr < GROESTL_SIZE512) + ctx->buffer[(int)ctx->buf_ptr++] = 0; + + cn_groestl_transform(ctx, ctx->buffer, GROESTL_SIZE512); + ctx->buf_ptr = 0; + } + + while (ctx->buf_ptr < GROESTL_SIZE512-GROESTL_LENGTHFIELDLEN) + ctx->buffer[(int)ctx->buf_ptr++] = 0; + + ctx->block_counter1++; + if (ctx->block_counter1 == 0) + ctx->block_counter2++; + ctx->buf_ptr = GROESTL_SIZE512; + + while (ctx->buf_ptr > GROESTL_SIZE512-(int)sizeof(uint32_t)) + { + ctx->buffer[(int)--ctx->buf_ptr] = (uint8_t)ctx->block_counter1; + ctx->block_counter1 >>= 8; + } + while (ctx->buf_ptr > GROESTL_SIZE512-GROESTL_LENGTHFIELDLEN) + { + ctx->buffer[(int)--ctx->buf_ptr] = (uint8_t)ctx->block_counter2; + ctx->block_counter2 >>= 8; + } + cn_groestl_transform(ctx, ctx->buffer, GROESTL_SIZE512); + cn_groestl_outputtransformation(ctx); + + for (i = GROESTL_SIZE512-hashbytelen; i < GROESTL_SIZE512; i++,j++) + output[j] = s[i]; + + for (i = 0; i < GROESTL_COLS512; i++) + ctx->chaining[i] = 0; + for (i = 0; i < GROESTL_SIZE512; i++) + ctx->buffer[i] = 0; +} + +__device__ void cn_groestl_update(groestlHashState* __restrict__ ctx, + const BitSequence* __restrict__ input, DataLength databitlen) +{ + int index = 0; + int msglen = (int)(databitlen/8); + int rem = (int)(databitlen%8); + + if (ctx->buf_ptr) + { + while (ctx->buf_ptr < GROESTL_SIZE512 && index < msglen) + ctx->buffer[(int)ctx->buf_ptr++] = input[index++]; + + if (ctx->buf_ptr < GROESTL_SIZE512) + { + if (rem) + { + ctx->bits_in_last_byte = rem; + ctx->buffer[(int)ctx->buf_ptr++] = input[index]; + } + 
return; + } + + ctx->buf_ptr = 0; + cn_groestl_transform(ctx, ctx->buffer, GROESTL_SIZE512); + } + + cn_groestl_transform(ctx, input+index, msglen-index); + index += ((msglen-index)/GROESTL_SIZE512)*GROESTL_SIZE512; + + while (index < msglen) + ctx->buffer[(int)ctx->buf_ptr++] = input[index++]; + + if (rem) + { + ctx->bits_in_last_byte = rem; + ctx->buffer[(int)ctx->buf_ptr++] = input[index]; + } +} + +__device__ void cn_groestl_init(groestlHashState* ctx) +{ + int i = 0; + + for(;i<(GROESTL_SIZE512/sizeof(uint32_t));i++) + ctx->chaining[i] = 0; + + ctx->chaining[2*GROESTL_COLS512-1] = u32BIG((uint32_t)GROESTL_HASH_BIT_LEN); + ctx->buf_ptr = 0; + ctx->block_counter1 = 0; + ctx->block_counter2 = 0; + ctx->bits_in_last_byte = 0; +} + +__device__ void cn_groestl(const BitSequence * __restrict__ data, DataLength len, BitSequence * __restrict__ hashval) +{ + DataLength databitlen = len << 3; + groestlHashState context; + + cn_groestl_init(&context); + cn_groestl_update(&context, data, databitlen); + cn_groestl_final(&context, hashval); +} diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_jh.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_jh.hpp new file mode 100644 index 0000000..679046e --- /dev/null +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_jh.hpp @@ -0,0 +1,301 @@ +#include <stdint.h> + +typedef struct { + int hashbitlen; + unsigned long long databitlen; + unsigned long long datasize_in_buffer; + uint64_t x[8][2]; + unsigned char buffer[64]; +} jhHashState; + +__constant__ unsigned char d_JH256_H0[512] = +{ + 0xeb, 0x98, 0xa3, 0x41, 0x2c, 0x20, 0xd3, 0xeb, 0x92, 0xcd, 0xbe, 0x7b, 0x9c, 0xb2, 0x45, 0xc1, + 0x1c, 0x93, 0x51, 0x91, 0x60, 0xd4, 0xc7, 0xfa, 0x26, 0x0, 0x82, 0xd6, 0x7e, 0x50, 0x8a, 0x3, + 0xa4, 0x23, 0x9e, 0x26, 0x77, 0x26, 0xb9, 0x45, 0xe0, 0xfb, 0x1a, 0x48, 0xd4, 0x1a, 0x94, 0x77, + 0xcd, 0xb5, 0xab, 0x26, 0x2, 0x6b, 0x17, 0x7a, 0x56, 0xf0, 0x24, 0x42, 0xf, 0xff, 0x2f, 0xa8, + 0x71, 0xa3, 0x96, 0x89, 0x7f, 0x2e, 0x4d, 0x75, 0x1d, 0x14, 0x49, 0x8, 0xf7, 
0x7d, 0xe2, 0x62, + 0x27, 0x76, 0x95, 0xf7, 0x76, 0x24, 0x8f, 0x94, 0x87, 0xd5, 0xb6, 0x57, 0x47, 0x80, 0x29, 0x6c, + 0x5c, 0x5e, 0x27, 0x2d, 0xac, 0x8e, 0xd, 0x6c, 0x51, 0x84, 0x50, 0xc6, 0x57, 0x5, 0x7a, 0xf, + 0x7b, 0xe4, 0xd3, 0x67, 0x70, 0x24, 0x12, 0xea, 0x89, 0xe3, 0xab, 0x13, 0xd3, 0x1c, 0xd7, 0x69 +}; + +__constant__ unsigned char d_E8_rc[42][32] = +{ + {0x72, 0xd5, 0xde, 0xa2, 0xdf, 0x15, 0xf8, 0x67, 0x7b, 0x84, 0x15, 0xa, 0xb7, 0x23, 0x15, 0x57, 0x81, 0xab, 0xd6, 0x90, 0x4d, 0x5a, 0x87, 0xf6, 0x4e, 0x9f, 0x4f, 0xc5, 0xc3, 0xd1, 0x2b, 0x40}, + {0xea, 0x98, 0x3a, 0xe0, 0x5c, 0x45, 0xfa, 0x9c, 0x3, 0xc5, 0xd2, 0x99, 0x66, 0xb2, 0x99, 0x9a, 0x66, 0x2, 0x96, 0xb4, 0xf2, 0xbb, 0x53, 0x8a, 0xb5, 0x56, 0x14, 0x1a, 0x88, 0xdb, 0xa2, 0x31}, + {0x3, 0xa3, 0x5a, 0x5c, 0x9a, 0x19, 0xe, 0xdb, 0x40, 0x3f, 0xb2, 0xa, 0x87, 0xc1, 0x44, 0x10, 0x1c, 0x5, 0x19, 0x80, 0x84, 0x9e, 0x95, 0x1d, 0x6f, 0x33, 0xeb, 0xad, 0x5e, 0xe7, 0xcd, 0xdc}, + {0x10, 0xba, 0x13, 0x92, 0x2, 0xbf, 0x6b, 0x41, 0xdc, 0x78, 0x65, 0x15, 0xf7, 0xbb, 0x27, 0xd0, 0xa, 0x2c, 0x81, 0x39, 0x37, 0xaa, 0x78, 0x50, 0x3f, 0x1a, 0xbf, 0xd2, 0x41, 0x0, 0x91, 0xd3}, + {0x42, 0x2d, 0x5a, 0xd, 0xf6, 0xcc, 0x7e, 0x90, 0xdd, 0x62, 0x9f, 0x9c, 0x92, 0xc0, 0x97, 0xce, 0x18, 0x5c, 0xa7, 0xb, 0xc7, 0x2b, 0x44, 0xac, 0xd1, 0xdf, 0x65, 0xd6, 0x63, 0xc6, 0xfc, 0x23}, + {0x97, 0x6e, 0x6c, 0x3, 0x9e, 0xe0, 0xb8, 0x1a, 0x21, 0x5, 0x45, 0x7e, 0x44, 0x6c, 0xec, 0xa8, 0xee, 0xf1, 0x3, 0xbb, 0x5d, 0x8e, 0x61, 0xfa, 0xfd, 0x96, 0x97, 0xb2, 0x94, 0x83, 0x81, 0x97}, + {0x4a, 0x8e, 0x85, 0x37, 0xdb, 0x3, 0x30, 0x2f, 0x2a, 0x67, 0x8d, 0x2d, 0xfb, 0x9f, 0x6a, 0x95, 0x8a, 0xfe, 0x73, 0x81, 0xf8, 0xb8, 0x69, 0x6c, 0x8a, 0xc7, 0x72, 0x46, 0xc0, 0x7f, 0x42, 0x14}, + {0xc5, 0xf4, 0x15, 0x8f, 0xbd, 0xc7, 0x5e, 0xc4, 0x75, 0x44, 0x6f, 0xa7, 0x8f, 0x11, 0xbb, 0x80, 0x52, 0xde, 0x75, 0xb7, 0xae, 0xe4, 0x88, 0xbc, 0x82, 0xb8, 0x0, 0x1e, 0x98, 0xa6, 0xa3, 0xf4}, + {0x8e, 0xf4, 0x8f, 0x33, 0xa9, 0xa3, 0x63, 0x15, 0xaa, 0x5f, 0x56, 0x24, 0xd5, 
0xb7, 0xf9, 0x89, 0xb6, 0xf1, 0xed, 0x20, 0x7c, 0x5a, 0xe0, 0xfd, 0x36, 0xca, 0xe9, 0x5a, 0x6, 0x42, 0x2c, 0x36}, + {0xce, 0x29, 0x35, 0x43, 0x4e, 0xfe, 0x98, 0x3d, 0x53, 0x3a, 0xf9, 0x74, 0x73, 0x9a, 0x4b, 0xa7, 0xd0, 0xf5, 0x1f, 0x59, 0x6f, 0x4e, 0x81, 0x86, 0xe, 0x9d, 0xad, 0x81, 0xaf, 0xd8, 0x5a, 0x9f}, + {0xa7, 0x5, 0x6, 0x67, 0xee, 0x34, 0x62, 0x6a, 0x8b, 0xb, 0x28, 0xbe, 0x6e, 0xb9, 0x17, 0x27, 0x47, 0x74, 0x7, 0x26, 0xc6, 0x80, 0x10, 0x3f, 0xe0, 0xa0, 0x7e, 0x6f, 0xc6, 0x7e, 0x48, 0x7b}, + {0xd, 0x55, 0xa, 0xa5, 0x4a, 0xf8, 0xa4, 0xc0, 0x91, 0xe3, 0xe7, 0x9f, 0x97, 0x8e, 0xf1, 0x9e, 0x86, 0x76, 0x72, 0x81, 0x50, 0x60, 0x8d, 0xd4, 0x7e, 0x9e, 0x5a, 0x41, 0xf3, 0xe5, 0xb0, 0x62}, + {0xfc, 0x9f, 0x1f, 0xec, 0x40, 0x54, 0x20, 0x7a, 0xe3, 0xe4, 0x1a, 0x0, 0xce, 0xf4, 0xc9, 0x84, 0x4f, 0xd7, 0x94, 0xf5, 0x9d, 0xfa, 0x95, 0xd8, 0x55, 0x2e, 0x7e, 0x11, 0x24, 0xc3, 0x54, 0xa5}, + {0x5b, 0xdf, 0x72, 0x28, 0xbd, 0xfe, 0x6e, 0x28, 0x78, 0xf5, 0x7f, 0xe2, 0xf, 0xa5, 0xc4, 0xb2, 0x5, 0x89, 0x7c, 0xef, 0xee, 0x49, 0xd3, 0x2e, 0x44, 0x7e, 0x93, 0x85, 0xeb, 0x28, 0x59, 0x7f}, + {0x70, 0x5f, 0x69, 0x37, 0xb3, 0x24, 0x31, 0x4a, 0x5e, 0x86, 0x28, 0xf1, 0x1d, 0xd6, 0xe4, 0x65, 0xc7, 0x1b, 0x77, 0x4, 0x51, 0xb9, 0x20, 0xe7, 0x74, 0xfe, 0x43, 0xe8, 0x23, 0xd4, 0x87, 0x8a}, + {0x7d, 0x29, 0xe8, 0xa3, 0x92, 0x76, 0x94, 0xf2, 0xdd, 0xcb, 0x7a, 0x9, 0x9b, 0x30, 0xd9, 0xc1, 0x1d, 0x1b, 0x30, 0xfb, 0x5b, 0xdc, 0x1b, 0xe0, 0xda, 0x24, 0x49, 0x4f, 0xf2, 0x9c, 0x82, 0xbf}, + {0xa4, 0xe7, 0xba, 0x31, 0xb4, 0x70, 0xbf, 0xff, 0xd, 0x32, 0x44, 0x5, 0xde, 0xf8, 0xbc, 0x48, 0x3b, 0xae, 0xfc, 0x32, 0x53, 0xbb, 0xd3, 0x39, 0x45, 0x9f, 0xc3, 0xc1, 0xe0, 0x29, 0x8b, 0xa0}, + {0xe5, 0xc9, 0x5, 0xfd, 0xf7, 0xae, 0x9, 0xf, 0x94, 0x70, 0x34, 0x12, 0x42, 0x90, 0xf1, 0x34, 0xa2, 0x71, 0xb7, 0x1, 0xe3, 0x44, 0xed, 0x95, 0xe9, 0x3b, 0x8e, 0x36, 0x4f, 0x2f, 0x98, 0x4a}, + {0x88, 0x40, 0x1d, 0x63, 0xa0, 0x6c, 0xf6, 0x15, 0x47, 0xc1, 0x44, 0x4b, 0x87, 0x52, 0xaf, 0xff, 0x7e, 0xbb, 0x4a, 0xf1, 0xe2, 0xa, 0xc6, 
0x30, 0x46, 0x70, 0xb6, 0xc5, 0xcc, 0x6e, 0x8c, 0xe6}, + {0xa4, 0xd5, 0xa4, 0x56, 0xbd, 0x4f, 0xca, 0x0, 0xda, 0x9d, 0x84, 0x4b, 0xc8, 0x3e, 0x18, 0xae, 0x73, 0x57, 0xce, 0x45, 0x30, 0x64, 0xd1, 0xad, 0xe8, 0xa6, 0xce, 0x68, 0x14, 0x5c, 0x25, 0x67}, + {0xa3, 0xda, 0x8c, 0xf2, 0xcb, 0xe, 0xe1, 0x16, 0x33, 0xe9, 0x6, 0x58, 0x9a, 0x94, 0x99, 0x9a, 0x1f, 0x60, 0xb2, 0x20, 0xc2, 0x6f, 0x84, 0x7b, 0xd1, 0xce, 0xac, 0x7f, 0xa0, 0xd1, 0x85, 0x18}, + {0x32, 0x59, 0x5b, 0xa1, 0x8d, 0xdd, 0x19, 0xd3, 0x50, 0x9a, 0x1c, 0xc0, 0xaa, 0xa5, 0xb4, 0x46, 0x9f, 0x3d, 0x63, 0x67, 0xe4, 0x4, 0x6b, 0xba, 0xf6, 0xca, 0x19, 0xab, 0xb, 0x56, 0xee, 0x7e}, + {0x1f, 0xb1, 0x79, 0xea, 0xa9, 0x28, 0x21, 0x74, 0xe9, 0xbd, 0xf7, 0x35, 0x3b, 0x36, 0x51, 0xee, 0x1d, 0x57, 0xac, 0x5a, 0x75, 0x50, 0xd3, 0x76, 0x3a, 0x46, 0xc2, 0xfe, 0xa3, 0x7d, 0x70, 0x1}, + {0xf7, 0x35, 0xc1, 0xaf, 0x98, 0xa4, 0xd8, 0x42, 0x78, 0xed, 0xec, 0x20, 0x9e, 0x6b, 0x67, 0x79, 0x41, 0x83, 0x63, 0x15, 0xea, 0x3a, 0xdb, 0xa8, 0xfa, 0xc3, 0x3b, 0x4d, 0x32, 0x83, 0x2c, 0x83}, + {0xa7, 0x40, 0x3b, 0x1f, 0x1c, 0x27, 0x47, 0xf3, 0x59, 0x40, 0xf0, 0x34, 0xb7, 0x2d, 0x76, 0x9a, 0xe7, 0x3e, 0x4e, 0x6c, 0xd2, 0x21, 0x4f, 0xfd, 0xb8, 0xfd, 0x8d, 0x39, 0xdc, 0x57, 0x59, 0xef}, + {0x8d, 0x9b, 0xc, 0x49, 0x2b, 0x49, 0xeb, 0xda, 0x5b, 0xa2, 0xd7, 0x49, 0x68, 0xf3, 0x70, 0xd, 0x7d, 0x3b, 0xae, 0xd0, 0x7a, 0x8d, 0x55, 0x84, 0xf5, 0xa5, 0xe9, 0xf0, 0xe4, 0xf8, 0x8e, 0x65}, + {0xa0, 0xb8, 0xa2, 0xf4, 0x36, 0x10, 0x3b, 0x53, 0xc, 0xa8, 0x7, 0x9e, 0x75, 0x3e, 0xec, 0x5a, 0x91, 0x68, 0x94, 0x92, 0x56, 0xe8, 0x88, 0x4f, 0x5b, 0xb0, 0x5c, 0x55, 0xf8, 0xba, 0xbc, 0x4c}, + {0xe3, 0xbb, 0x3b, 0x99, 0xf3, 0x87, 0x94, 0x7b, 0x75, 0xda, 0xf4, 0xd6, 0x72, 0x6b, 0x1c, 0x5d, 0x64, 0xae, 0xac, 0x28, 0xdc, 0x34, 0xb3, 0x6d, 0x6c, 0x34, 0xa5, 0x50, 0xb8, 0x28, 0xdb, 0x71}, + {0xf8, 0x61, 0xe2, 0xf2, 0x10, 0x8d, 0x51, 0x2a, 0xe3, 0xdb, 0x64, 0x33, 0x59, 0xdd, 0x75, 0xfc, 0x1c, 0xac, 0xbc, 0xf1, 0x43, 0xce, 0x3f, 0xa2, 0x67, 0xbb, 0xd1, 0x3c, 0x2, 0xe8, 0x43, 
0xb0}, + {0x33, 0xa, 0x5b, 0xca, 0x88, 0x29, 0xa1, 0x75, 0x7f, 0x34, 0x19, 0x4d, 0xb4, 0x16, 0x53, 0x5c, 0x92, 0x3b, 0x94, 0xc3, 0xe, 0x79, 0x4d, 0x1e, 0x79, 0x74, 0x75, 0xd7, 0xb6, 0xee, 0xaf, 0x3f}, + {0xea, 0xa8, 0xd4, 0xf7, 0xbe, 0x1a, 0x39, 0x21, 0x5c, 0xf4, 0x7e, 0x9, 0x4c, 0x23, 0x27, 0x51, 0x26, 0xa3, 0x24, 0x53, 0xba, 0x32, 0x3c, 0xd2, 0x44, 0xa3, 0x17, 0x4a, 0x6d, 0xa6, 0xd5, 0xad}, + {0xb5, 0x1d, 0x3e, 0xa6, 0xaf, 0xf2, 0xc9, 0x8, 0x83, 0x59, 0x3d, 0x98, 0x91, 0x6b, 0x3c, 0x56, 0x4c, 0xf8, 0x7c, 0xa1, 0x72, 0x86, 0x60, 0x4d, 0x46, 0xe2, 0x3e, 0xcc, 0x8, 0x6e, 0xc7, 0xf6}, + {0x2f, 0x98, 0x33, 0xb3, 0xb1, 0xbc, 0x76, 0x5e, 0x2b, 0xd6, 0x66, 0xa5, 0xef, 0xc4, 0xe6, 0x2a, 0x6, 0xf4, 0xb6, 0xe8, 0xbe, 0xc1, 0xd4, 0x36, 0x74, 0xee, 0x82, 0x15, 0xbc, 0xef, 0x21, 0x63}, + {0xfd, 0xc1, 0x4e, 0xd, 0xf4, 0x53, 0xc9, 0x69, 0xa7, 0x7d, 0x5a, 0xc4, 0x6, 0x58, 0x58, 0x26, 0x7e, 0xc1, 0x14, 0x16, 0x6, 0xe0, 0xfa, 0x16, 0x7e, 0x90, 0xaf, 0x3d, 0x28, 0x63, 0x9d, 0x3f}, + {0xd2, 0xc9, 0xf2, 0xe3, 0x0, 0x9b, 0xd2, 0xc, 0x5f, 0xaa, 0xce, 0x30, 0xb7, 0xd4, 0xc, 0x30, 0x74, 0x2a, 0x51, 0x16, 0xf2, 0xe0, 0x32, 0x98, 0xd, 0xeb, 0x30, 0xd8, 0xe3, 0xce, 0xf8, 0x9a}, + {0x4b, 0xc5, 0x9e, 0x7b, 0xb5, 0xf1, 0x79, 0x92, 0xff, 0x51, 0xe6, 0x6e, 0x4, 0x86, 0x68, 0xd3, 0x9b, 0x23, 0x4d, 0x57, 0xe6, 0x96, 0x67, 0x31, 0xcc, 0xe6, 0xa6, 0xf3, 0x17, 0xa, 0x75, 0x5}, + {0xb1, 0x76, 0x81, 0xd9, 0x13, 0x32, 0x6c, 0xce, 0x3c, 0x17, 0x52, 0x84, 0xf8, 0x5, 0xa2, 0x62, 0xf4, 0x2b, 0xcb, 0xb3, 0x78, 0x47, 0x15, 0x47, 0xff, 0x46, 0x54, 0x82, 0x23, 0x93, 0x6a, 0x48}, + {0x38, 0xdf, 0x58, 0x7, 0x4e, 0x5e, 0x65, 0x65, 0xf2, 0xfc, 0x7c, 0x89, 0xfc, 0x86, 0x50, 0x8e, 0x31, 0x70, 0x2e, 0x44, 0xd0, 0xb, 0xca, 0x86, 0xf0, 0x40, 0x9, 0xa2, 0x30, 0x78, 0x47, 0x4e}, + {0x65, 0xa0, 0xee, 0x39, 0xd1, 0xf7, 0x38, 0x83, 0xf7, 0x5e, 0xe9, 0x37, 0xe4, 0x2c, 0x3a, 0xbd, 0x21, 0x97, 0xb2, 0x26, 0x1, 0x13, 0xf8, 0x6f, 0xa3, 0x44, 0xed, 0xd1, 0xef, 0x9f, 0xde, 0xe7}, + {0x8b, 0xa0, 0xdf, 0x15, 0x76, 0x25, 0x92, 0xd9, 
0x3c, 0x85, 0xf7, 0xf6, 0x12, 0xdc, 0x42, 0xbe, 0xd8, 0xa7, 0xec, 0x7c, 0xab, 0x27, 0xb0, 0x7e, 0x53, 0x8d, 0x7d, 0xda, 0xaa, 0x3e, 0xa8, 0xde}, + {0xaa, 0x25, 0xce, 0x93, 0xbd, 0x2, 0x69, 0xd8, 0x5a, 0xf6, 0x43, 0xfd, 0x1a, 0x73, 0x8, 0xf9, 0xc0, 0x5f, 0xef, 0xda, 0x17, 0x4a, 0x19, 0xa5, 0x97, 0x4d, 0x66, 0x33, 0x4c, 0xfd, 0x21, 0x6a}, + {0x35, 0xb4, 0x98, 0x31, 0xdb, 0x41, 0x15, 0x70, 0xea, 0x1e, 0xf, 0xbb, 0xed, 0xcd, 0x54, 0x9b, 0x9a, 0xd0, 0x63, 0xa1, 0x51, 0x97, 0x40, 0x72, 0xf6, 0x75, 0x9d, 0xbf, 0x91, 0x47, 0x6f, 0xe2} +}; + +#define JH_SWAP1(x) (x) = ((((x) & 0x5555555555555555ULL) << 1) | (((x) & 0xaaaaaaaaaaaaaaaaULL) >> 1)); +#define JH_SWAP2(x) (x) = ((((x) & 0x3333333333333333ULL) << 2) | (((x) & 0xccccccccccccccccULL) >> 2)); +#define JH_SWAP4(x) (x) = ((((x) & 0x0f0f0f0f0f0f0f0fULL) << 4) | (((x) & 0xf0f0f0f0f0f0f0f0ULL) >> 4)); +#define JH_SWAP8(x) (x) = ((((x) & 0x00ff00ff00ff00ffULL) << 8) | (((x) & 0xff00ff00ff00ff00ULL) >> 8)); +#define JH_SWAP16(x) (x) = ((((x) & 0x0000ffff0000ffffULL) << 16) | (((x) & 0xffff0000ffff0000ULL) >> 16)); +#define JH_SWAP32(x) (x) = (((x) << 32) | ((x) >> 32)); + +#define JH_L(m0,m1,m2,m3,m4,m5,m6,m7) \ + (m4) ^= (m1); \ + (m5) ^= (m2); \ + (m6) ^= (m0) ^ (m3); \ + (m7) ^= (m0); \ + (m0) ^= (m5); \ + (m1) ^= (m6); \ + (m2) ^= (m4) ^ (m7); \ + (m3) ^= (m4); + +#define JH_SS(m0,m1,m2,m3,m4,m5,m6,m7,cc0,cc1) \ + m3 = ~(m3); \ + m7 = ~(m7); \ + m0 ^= ((~(m2)) & (cc0)); \ + m4 ^= ((~(m6)) & (cc1)); \ + temp0 = (cc0) ^ ((m0) & (m1));\ + temp1 = (cc1) ^ ((m4) & (m5));\ + m0 ^= ((m2) & (m3)); \ + m4 ^= ((m6) & (m7)); \ + m3 ^= ((~(m1)) & (m2)); \ + m7 ^= ((~(m5)) & (m6)); \ + m1 ^= ((m0) & (m2)); \ + m5 ^= ((m4) & (m6)); \ + m2 ^= ((m0) & (~(m3))); \ + m6 ^= ((m4) & (~(m7))); \ + m0 ^= ((m1) | (m3)); \ + m4 ^= ((m5) | (m7)); \ + m3 ^= ((m1) & (m2)); \ + m7 ^= ((m5) & (m6)); \ + m1 ^= (temp0 & (m0)); \ + m5 ^= (temp1 & (m4)); \ + m2 ^= temp0; \ + m6 ^= temp1; + +__device__ void cn_jh_E8(jhHashState *state) +{ + uint64_t 
i,roundnumber,temp0,temp1; + + for (roundnumber = 0; roundnumber < 42; roundnumber = roundnumber+7) + { + for (i = 0; i < 2; i++) + { + JH_SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64_t *)d_E8_rc[roundnumber+0])[i],((uint64_t *)d_E8_rc[roundnumber+0])[i+2] ); + JH_L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]); + JH_SWAP1(state->x[1][i]); JH_SWAP1(state->x[3][i]); JH_SWAP1(state->x[5][i]); JH_SWAP1(state->x[7][i]); + } + + for (i = 0; i < 2; i++) + { + JH_SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64_t *)d_E8_rc[roundnumber+1])[i],((uint64_t *)d_E8_rc[roundnumber+1])[i+2] ); + JH_L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]); + JH_SWAP2(state->x[1][i]); JH_SWAP2(state->x[3][i]); JH_SWAP2(state->x[5][i]); JH_SWAP2(state->x[7][i]); + } + + for (i = 0; i < 2; i++) + { + JH_SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64_t *)d_E8_rc[roundnumber+2])[i],((uint64_t *)d_E8_rc[roundnumber+2])[i+2] ); + JH_L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]); + JH_SWAP4(state->x[1][i]); JH_SWAP4(state->x[3][i]); JH_SWAP4(state->x[5][i]); JH_SWAP4(state->x[7][i]); + } + + for (i = 0; i < 2; i++) + { + JH_SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64_t *)d_E8_rc[roundnumber+3])[i],((uint64_t *)d_E8_rc[roundnumber+3])[i+2] ); + JH_L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]); + JH_SWAP8(state->x[1][i]); JH_SWAP8(state->x[3][i]); 
JH_SWAP8(state->x[5][i]); JH_SWAP8(state->x[7][i]); + } + + for (i = 0; i < 2; i++) + { + JH_SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64_t *)d_E8_rc[roundnumber+4])[i],((uint64_t *)d_E8_rc[roundnumber+4])[i+2] ); + JH_L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]); + JH_SWAP16(state->x[1][i]); JH_SWAP16(state->x[3][i]); JH_SWAP16(state->x[5][i]); JH_SWAP16(state->x[7][i]); + } + + for (i = 0; i < 2; i++) + { + JH_SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64_t *)d_E8_rc[roundnumber+5])[i],((uint64_t *)d_E8_rc[roundnumber+5])[i+2] ); + JH_L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]); + JH_SWAP32(state->x[1][i]); JH_SWAP32(state->x[3][i]); JH_SWAP32(state->x[5][i]); JH_SWAP32(state->x[7][i]); + } + + for (i = 0; i < 2; i++) + { + JH_SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64_t *)d_E8_rc[roundnumber+6])[i],((uint64_t *)d_E8_rc[roundnumber+6])[i+2] ); + JH_L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]); + } + + for (i = 1; i < 8; i = i+2) + { + temp0 = state->x[i][0]; + state->x[i][0] = state->x[i][1]; + state->x[i][1] = temp0; + } + } +} + +__device__ void cn_jh_F8(jhHashState *state) +{ + uint64_t i; + + for (i = 0; i < 8; i++) + state->x[i >> 1][i & 1] ^= ((uint64_t *)state->buffer)[i]; + + cn_jh_E8(state); + + for (i = 0; i < 8; i++) + state->x[(8+i) >> 1][(8+i) & 1] ^= ((uint64_t *)state->buffer)[i]; +} + +__device__ void cn_jh_update(jhHashState * __restrict__ state, const BitSequence * __restrict__ data, DataLength databitlen) +{ + DataLength index; + + state->databitlen += 
databitlen; + index = 0; + + if ( (state->datasize_in_buffer > 0 ) && (( state->datasize_in_buffer + databitlen) < 512) ) + { + if ( (databitlen & 7) == 0 ) + memcpy(state->buffer + (state->datasize_in_buffer >> 3), data, 64-(state->datasize_in_buffer >> 3)); + else + memcpy(state->buffer + (state->datasize_in_buffer >> 3), data, 64-(state->datasize_in_buffer >> 3)+1); + state->datasize_in_buffer += databitlen; + databitlen = 0; + } + + if ( (state->datasize_in_buffer > 0 ) && (( state->datasize_in_buffer + databitlen) >= 512) ) + { + memcpy( state->buffer + (state->datasize_in_buffer >> 3), data, 64-(state->datasize_in_buffer >> 3) ); + index = 64-(state->datasize_in_buffer >> 3); + databitlen = databitlen - (512 - state->datasize_in_buffer); + cn_jh_F8(state); + state->datasize_in_buffer = 0; + } + + for ( ; databitlen >= 512; index = index+64, databitlen = databitlen - 512) + { + memcpy(state->buffer, data+index, 64); + cn_jh_F8(state); + } + + if ( databitlen > 0) + { + if ((databitlen & 7) == 0) + memcpy(state->buffer, data+index, (databitlen & 0x1ff) >> 3); + else + memcpy(state->buffer, data+index, ((databitlen & 0x1ff) >> 3)+1); + state->datasize_in_buffer = databitlen; + } +} + +/*pad the message, process the padded block(s), truncate the hash value H to obtain the message digest*/ +__device__ void cn_jh_final(jhHashState * __restrict__ state, BitSequence * __restrict__ hashval) +{ + unsigned int i; + //uint32_t *bufptr = (uint32_t *)state->buffer; + + if ( (state->databitlen & 0x1ff) == 0 ) + { + /*pad the message when databitlen is multiple of 512 bits, then process the padded block*/ + memset(state->buffer, 0, 64); + //for( i = 0; i < 16; i++ ) *(bufptr+i) = 0x00000000; + state->buffer[0] = 0x80; + state->buffer[63] = state->databitlen & 0xff; + state->buffer[62] = (state->databitlen >> 8) & 0xff; + state->buffer[61] = (state->databitlen >> 16) & 0xff; + state->buffer[60] = (state->databitlen >> 24) & 0xff; + state->buffer[59] = (state->databitlen >> 
32) & 0xff; + state->buffer[58] = (state->databitlen >> 40) & 0xff; + state->buffer[57] = (state->databitlen >> 48) & 0xff; + state->buffer[56] = (state->databitlen >> 56) & 0xff; + cn_jh_F8(state); + } + else + { + /*set the rest of the bytes in the buffer to 0*/ + if ( (state->datasize_in_buffer & 7) == 0) + { + for (i = (state->databitlen & 0x1ff) >> 3; i < 64; i++) + state->buffer[i] = 0; + } + else + { + for (i = ((state->databitlen & 0x1ff) >> 3)+1; i < 64; i++) + state->buffer[i] = 0; + } + + /*pad and process the partial block when databitlen is not multiple of 512 bits, then hash the padded blocks*/ + state->buffer[((state->databitlen & 0x1ff) >> 3)] |= 1 << (7- (state->databitlen & 7)); + + cn_jh_F8(state); + memset(state->buffer, 0, 64); + //for( i = 0; i < 16; i++ ) *(bufptr+i) = 0x00000000; + state->buffer[63] = state->databitlen & 0xff; + state->buffer[62] = (state->databitlen >> 8) & 0xff; + state->buffer[61] = (state->databitlen >> 16) & 0xff; + state->buffer[60] = (state->databitlen >> 24) & 0xff; + state->buffer[59] = (state->databitlen >> 32) & 0xff; + state->buffer[58] = (state->databitlen >> 40) & 0xff; + state->buffer[57] = (state->databitlen >> 48) & 0xff; + state->buffer[56] = (state->databitlen >> 56) & 0xff; + cn_jh_F8(state); + } + + memcpy(hashval,(unsigned char*)state->x+64+32,32); +} + +__device__ void cn_jh_init(jhHashState *state, int hashbitlen) +{ + state->databitlen = 0; + state->datasize_in_buffer = 0; + state->hashbitlen = hashbitlen; + memcpy(state->x, d_JH256_H0, 128); +} + +__device__ void cn_jh(const BitSequence * __restrict__ data, DataLength len, BitSequence * __restrict__ hashval) +{ + int hashbitlen = 256; + DataLength databitlen = len << 3; + jhHashState state; + + cn_jh_init(&state, hashbitlen); + cn_jh_update(&state, data, databitlen); + cn_jh_final(&state, hashval); +} diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_keccak.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_keccak.hpp new file mode 100644 index 
0000000..99c6516 --- /dev/null +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_keccak.hpp @@ -0,0 +1,197 @@ +#if __CUDA_ARCH__ >= 350 + __forceinline__ __device__ uint64_t cuda_rotl64(const uint64_t value, const int offset) + { + uint2 result; + if(offset >= 32) + { + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); + } + else + { + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); + } + return __double_as_longlong(__hiloint2double(result.y, result.x)); + } + #define rotl64_1(x, y) (cuda_rotl64((x), (y))) +#else + #define rotl64_1(x, y) ((x) << (y) | ((x) >> (64 - (y)))) +#endif + +#define rotl64_2(x, y) rotl64_1(((x) >> 32) | ((x) << 32), (y)) +#define bitselect(a, b, c) ((a) ^ ((c) & ((b) ^ (a)))) + +__device__ __forceinline__ void cn_keccakf2(uint64_t *s) +{ + uint8_t i; + + for(i = 0; i < 24; ++i) + { + uint64_t bc[5], tmpxor[5], tmp1, tmp2; + + tmpxor[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20]; + tmpxor[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21]; + tmpxor[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22]; + tmpxor[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23]; + tmpxor[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24]; + + bc[0] = tmpxor[0] ^ rotl64_1(tmpxor[2], 1); + bc[1] = tmpxor[1] ^ rotl64_1(tmpxor[3], 1); + bc[2] = tmpxor[2] ^ rotl64_1(tmpxor[4], 1); + bc[3] = tmpxor[3] ^ rotl64_1(tmpxor[0], 1); + bc[4] = tmpxor[4] ^ rotl64_1(tmpxor[1], 1); + + tmp1 = s[1] ^ bc[0]; + + s[0] ^= bc[4]; + s[1] = rotl64_2(s[6] ^ 
bc[0], 12); + s[6] = rotl64_1(s[9] ^ bc[3], 20); + s[9] = rotl64_2(s[22] ^ bc[1], 29); + s[22] = rotl64_2(s[14] ^ bc[3], 7); + s[14] = rotl64_1(s[20] ^ bc[4], 18); + s[20] = rotl64_2(s[2] ^ bc[1], 30); + s[2] = rotl64_2(s[12] ^ bc[1], 11); + s[12] = rotl64_1(s[13] ^ bc[2], 25); + s[13] = rotl64_1(s[19] ^ bc[3], 8); + s[19] = rotl64_2(s[23] ^ bc[2], 24); + s[23] = rotl64_2(s[15] ^ bc[4], 9); + s[15] = rotl64_1(s[4] ^ bc[3], 27); + s[4] = rotl64_1(s[24] ^ bc[3], 14); + s[24] = rotl64_1(s[21] ^ bc[0], 2); + s[21] = rotl64_2(s[8] ^ bc[2], 23); + s[8] = rotl64_2(s[16] ^ bc[0], 13); + s[16] = rotl64_2(s[5] ^ bc[4], 4); + s[5] = rotl64_1(s[3] ^ bc[2], 28); + s[3] = rotl64_1(s[18] ^ bc[2], 21); + s[18] = rotl64_1(s[17] ^ bc[1], 15); + s[17] = rotl64_1(s[11] ^ bc[0], 10); + s[11] = rotl64_1(s[7] ^ bc[1], 6); + s[7] = rotl64_1(s[10] ^ bc[4], 3); + s[10] = rotl64_1(tmp1, 1); + + tmp1 = s[0]; tmp2 = s[1]; s[0] = bitselect(s[0] ^ s[2], s[0], s[1]); s[1] = bitselect(s[1] ^ s[3], s[1], s[2]); s[2] = bitselect(s[2] ^ s[4], s[2], s[3]); s[3] = bitselect(s[3] ^ tmp1, s[3], s[4]); s[4] = bitselect(s[4] ^ tmp2, s[4], tmp1); + tmp1 = s[5]; tmp2 = s[6]; s[5] = bitselect(s[5] ^ s[7], s[5], s[6]); s[6] = bitselect(s[6] ^ s[8], s[6], s[7]); s[7] = bitselect(s[7] ^ s[9], s[7], s[8]); s[8] = bitselect(s[8] ^ tmp1, s[8], s[9]); s[9] = bitselect(s[9] ^ tmp2, s[9], tmp1); + tmp1 = s[10]; tmp2 = s[11]; s[10] = bitselect(s[10] ^ s[12], s[10], s[11]); s[11] = bitselect(s[11] ^ s[13], s[11], s[12]); s[12] = bitselect(s[12] ^ s[14], s[12], s[13]); s[13] = bitselect(s[13] ^ tmp1, s[13], s[14]); s[14] = bitselect(s[14] ^ tmp2, s[14], tmp1); + tmp1 = s[15]; tmp2 = s[16]; s[15] = bitselect(s[15] ^ s[17], s[15], s[16]); s[16] = bitselect(s[16] ^ s[18], s[16], s[17]); s[17] = bitselect(s[17] ^ s[19], s[17], s[18]); s[18] = bitselect(s[18] ^ tmp1, s[18], s[19]); s[19] = bitselect(s[19] ^ tmp2, s[19], tmp1); + tmp1 = s[20]; tmp2 = s[21]; s[20] = bitselect(s[20] ^ s[22], s[20], s[21]); s[21] = 
bitselect(s[21] ^ s[23], s[21], s[22]); s[22] = bitselect(s[22] ^ s[24], s[22], s[23]); s[23] = bitselect(s[23] ^ tmp1, s[23], s[24]); s[24] = bitselect(s[24] ^ tmp2, s[24], tmp1); + s[0] ^= keccakf_rndc[i]; + } +} + +__device__ __forceinline__ void cn_keccakf(uint64_t *s) +{ + uint64_t bc[5], tmpxor[5], tmp1, tmp2; + + tmpxor[0] = s[0] ^ s[5]; + tmpxor[1] = s[1] ^ s[6] ^ 0x8000000000000000ULL; + tmpxor[2] = s[2] ^ s[7]; + tmpxor[3] = s[3] ^ s[8]; + tmpxor[4] = s[4] ^ s[9]; + + bc[0] = tmpxor[0] ^ rotl64_1(tmpxor[2], 1); + bc[1] = tmpxor[1] ^ rotl64_1(tmpxor[3], 1); + bc[2] = tmpxor[2] ^ rotl64_1(tmpxor[4], 1); + bc[3] = tmpxor[3] ^ rotl64_1(tmpxor[0], 1); + bc[4] = tmpxor[4] ^ rotl64_1(tmpxor[1], 1); + + tmp1 = s[1] ^ bc[0]; + + s[0] ^= bc[4]; + s[1] = rotl64_2(s[6] ^ bc[0], 12); + s[6] = rotl64_1(s[9] ^ bc[3], 20); + s[9] = rotl64_2(bc[1], 29); + s[22] = rotl64_2(bc[3], 7); + s[14] = rotl64_1(bc[4], 18); + s[20] = rotl64_2(s[2] ^ bc[1], 30); + s[2] = rotl64_2(bc[1], 11); + s[12] = rotl64_1(bc[2], 25); + s[13] = rotl64_1(bc[3], 8); + s[19] = rotl64_2(bc[2], 24); + s[23] = rotl64_2(bc[4], 9); + s[15] = rotl64_1(s[4] ^ bc[3], 27); + s[4] = rotl64_1(bc[3], 14); + s[24] = rotl64_1(bc[0], 2); + s[21] = rotl64_2(s[8] ^ bc[2], 23); + s[8] = rotl64_2(0x8000000000000000ULL ^ bc[0], 13); + s[16] = rotl64_2(s[5] ^ bc[4], 4); + s[5] = rotl64_1(s[3] ^ bc[2], 28); + s[3] = rotl64_1(bc[2], 21); + s[18] = rotl64_1(bc[1], 15); + s[17] = rotl64_1(bc[0], 10); + s[11] = rotl64_1(s[7] ^ bc[1], 6); + s[7] = rotl64_1(bc[4], 3); + s[10] = rotl64_1(tmp1, 1); + + tmp1 = s[0]; tmp2 = s[1]; s[0] = bitselect(s[0] ^ s[2], s[0], s[1]); s[1] = bitselect(s[1] ^ s[3], s[1], s[2]); s[2] = bitselect(s[2] ^ s[4], s[2], s[3]); s[3] = bitselect(s[3] ^ tmp1, s[3], s[4]); s[4] = bitselect(s[4] ^ tmp2, s[4], tmp1); + tmp1 = s[5]; tmp2 = s[6]; s[5] = bitselect(s[5] ^ s[7], s[5], s[6]); s[6] = bitselect(s[6] ^ s[8], s[6], s[7]); s[7] = bitselect(s[7] ^ s[9], s[7], s[8]); s[8] = bitselect(s[8] ^ tmp1, s[8], 
s[9]); s[9] = bitselect(s[9] ^ tmp2, s[9], tmp1); + tmp1 = s[10]; tmp2 = s[11]; s[10] = bitselect(s[10] ^ s[12], s[10], s[11]); s[11] = bitselect(s[11] ^ s[13], s[11], s[12]); s[12] = bitselect(s[12] ^ s[14], s[12], s[13]); s[13] = bitselect(s[13] ^ tmp1, s[13], s[14]); s[14] = bitselect(s[14] ^ tmp2, s[14], tmp1); + tmp1 = s[15]; tmp2 = s[16]; s[15] = bitselect(s[15] ^ s[17], s[15], s[16]); s[16] = bitselect(s[16] ^ s[18], s[16], s[17]); s[17] = bitselect(s[17] ^ s[19], s[17], s[18]); s[18] = bitselect(s[18] ^ tmp1, s[18], s[19]); s[19] = bitselect(s[19] ^ tmp2, s[19], tmp1); + tmp1 = s[20]; tmp2 = s[21]; s[20] = bitselect(s[20] ^ s[22], s[20], s[21]); s[21] = bitselect(s[21] ^ s[23], s[21], s[22]); s[22] = bitselect(s[22] ^ s[24], s[22], s[23]); s[23] = bitselect(s[23] ^ tmp1, s[23], s[24]); s[24] = bitselect(s[24] ^ tmp2, s[24], tmp1); + s[0] ^= 0x0000000000000001; + + for(int i = 1; i < 24; ++i) + { + tmpxor[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20]; + tmpxor[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21]; + tmpxor[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22]; + tmpxor[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23]; + tmpxor[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24]; + + bc[0] = tmpxor[0] ^ rotl64_1(tmpxor[2], 1); + bc[1] = tmpxor[1] ^ rotl64_1(tmpxor[3], 1); + bc[2] = tmpxor[2] ^ rotl64_1(tmpxor[4], 1); + bc[3] = tmpxor[3] ^ rotl64_1(tmpxor[0], 1); + bc[4] = tmpxor[4] ^ rotl64_1(tmpxor[1], 1); + + tmp1 = s[1] ^ bc[0]; + + s[0] ^= bc[4]; + s[1] = rotl64_2(s[6] ^ bc[0], 12); + s[6] = rotl64_1(s[9] ^ bc[3], 20); + s[9] = rotl64_2(s[22] ^ bc[1], 29); + s[22] = rotl64_2(s[14] ^ bc[3], 7); + s[14] = rotl64_1(s[20] ^ bc[4], 18); + s[20] = rotl64_2(s[2] ^ bc[1], 30); + s[2] = rotl64_2(s[12] ^ bc[1], 11); + s[12] = rotl64_1(s[13] ^ bc[2], 25); + s[13] = rotl64_1(s[19] ^ bc[3], 8); + s[19] = rotl64_2(s[23] ^ bc[2], 24); + s[23] = rotl64_2(s[15] ^ bc[4], 9); + s[15] = rotl64_1(s[4] ^ bc[3], 27); + s[4] = rotl64_1(s[24] ^ bc[3], 14); + s[24] = rotl64_1(s[21] ^ bc[0], 2); + s[21] = 
rotl64_2(s[8] ^ bc[2], 23); + s[8] = rotl64_2(s[16] ^ bc[0], 13); + s[16] = rotl64_2(s[5] ^ bc[4], 4); + s[5] = rotl64_1(s[3] ^ bc[2], 28); + s[3] = rotl64_1(s[18] ^ bc[2], 21); + s[18] = rotl64_1(s[17] ^ bc[1], 15); + s[17] = rotl64_1(s[11] ^ bc[0], 10); + s[11] = rotl64_1(s[7] ^ bc[1], 6); + s[7] = rotl64_1(s[10] ^ bc[4], 3); + s[10] = rotl64_1(tmp1, 1); + + tmp1 = s[0]; tmp2 = s[1]; s[0] = bitselect(s[0] ^ s[2], s[0], s[1]); s[1] = bitselect(s[1] ^ s[3], s[1], s[2]); s[2] = bitselect(s[2] ^ s[4], s[2], s[3]); s[3] = bitselect(s[3] ^ tmp1, s[3], s[4]); s[4] = bitselect(s[4] ^ tmp2, s[4], tmp1); + tmp1 = s[5]; tmp2 = s[6]; s[5] = bitselect(s[5] ^ s[7], s[5], s[6]); s[6] = bitselect(s[6] ^ s[8], s[6], s[7]); s[7] = bitselect(s[7] ^ s[9], s[7], s[8]); s[8] = bitselect(s[8] ^ tmp1, s[8], s[9]); s[9] = bitselect(s[9] ^ tmp2, s[9], tmp1); + tmp1 = s[10]; tmp2 = s[11]; s[10] = bitselect(s[10] ^ s[12], s[10], s[11]); s[11] = bitselect(s[11] ^ s[13], s[11], s[12]); s[12] = bitselect(s[12] ^ s[14], s[12], s[13]); s[13] = bitselect(s[13] ^ tmp1, s[13], s[14]); s[14] = bitselect(s[14] ^ tmp2, s[14], tmp1); + tmp1 = s[15]; tmp2 = s[16]; s[15] = bitselect(s[15] ^ s[17], s[15], s[16]); s[16] = bitselect(s[16] ^ s[18], s[16], s[17]); s[17] = bitselect(s[17] ^ s[19], s[17], s[18]); s[18] = bitselect(s[18] ^ tmp1, s[18], s[19]); s[19] = bitselect(s[19] ^ tmp2, s[19], tmp1); + tmp1 = s[20]; tmp2 = s[21]; s[20] = bitselect(s[20] ^ s[22], s[20], s[21]); s[21] = bitselect(s[21] ^ s[23], s[21], s[22]); s[22] = bitselect(s[22] ^ s[24], s[22], s[23]); s[23] = bitselect(s[23] ^ tmp1, s[23], s[24]); s[24] = bitselect(s[24] ^ tmp2, s[24], tmp1); + s[0] ^= keccakf_rndc[i]; + } +} + +__device__ __forceinline__ void cn_keccak(const uint8_t * __restrict__ in, uint32_t len, uint8_t * __restrict__ md) +{ + uint64_t st[25]; + + MEMSET8(st + 8, 0x00, 25 - 8); + memcpy(st, in, len); + ((uint8_t*)st)[len] = 0x01; + st[16] = 0x8000000000000000ULL; + + cn_keccakf(st); + + MEMCPY8(md, st, 25); + return; 
+} diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_skein.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_skein.hpp new file mode 100644 index 0000000..041a593 --- /dev/null +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_skein.hpp @@ -0,0 +1,347 @@ +#pragma once + +typedef unsigned int uint_t; /* native unsigned integer */ + +#define SKEIN_MODIFIER_WORDS ( 2) /* number of modifier (tweak) words */ + +#define SKEIN_256_STATE_WORDS ( 4) +#define SKEIN_512_STATE_WORDS ( 8) +#define SKEIN1024_STATE_WORDS (16) + +#define SKEIN_256_STATE_BYTES ( 8*SKEIN_256_STATE_WORDS) +#define SKEIN_512_STATE_BYTES ( 8*SKEIN_512_STATE_WORDS) +#define SKEIN1024_STATE_BYTES ( 8*SKEIN1024_STATE_WORDS) + +#define SKEIN_256_STATE_BITS (64*SKEIN_256_STATE_WORDS) +#define SKEIN_512_STATE_BITS (64*SKEIN_512_STATE_WORDS) +#define SKEIN1024_STATE_BITS (64*SKEIN1024_STATE_WORDS) + +#define SKEIN_256_BLOCK_BYTES ( 8*SKEIN_256_STATE_WORDS) +#define SKEIN_512_BLOCK_BYTES ( 8*SKEIN_512_STATE_WORDS) +#define SKEIN1024_BLOCK_BYTES ( 8*SKEIN1024_STATE_WORDS) + +#define SKEIN_MK_64(hi32,lo32) ((lo32) + (((uint64_t) (hi32)) << 32)) +#define SKEIN_KS_PARITY SKEIN_MK_64(0x1BD11BDA,0xA9FC1A22) + +#define SKEIN_T1_BIT(BIT) ((BIT) - 64) /* offset 64 because it's the second word */ + +#define SKEIN_T1_POS_FIRST SKEIN_T1_BIT(126) /* bits 126 : first block flag */ +#define SKEIN_T1_POS_BIT_PAD SKEIN_T1_BIT(119) /* bit 119 : partial final input byte */ +#define SKEIN_T1_POS_FINAL SKEIN_T1_BIT(127) /* bit 127 : final block flag */ +#define SKEIN_T1_POS_BLK_TYPE SKEIN_T1_BIT(120) /* bits 120..125: type field */ + +#define SKEIN_T1_FLAG_FIRST (((uint64_t) 1 ) << SKEIN_T1_POS_FIRST) +#define SKEIN_T1_FLAG_BIT_PAD (((uint64_t) 1 ) << SKEIN_T1_POS_BIT_PAD) +#define SKEIN_T1_FLAG_FINAL (((uint64_t) 1 ) << SKEIN_T1_POS_FINAL) + +#define SKEIN_BLK_TYPE_MSG (48) /* message processing */ +#define SKEIN_BLK_TYPE_OUT (63) /* output stage */ + +#define SKEIN_T1_BLK_TYPE(T) (((uint64_t) (SKEIN_BLK_TYPE_##T)) << SKEIN_T1_POS_BLK_TYPE) 
+ +#define SKEIN_T1_BLK_TYPE_MSG SKEIN_T1_BLK_TYPE(MSG) /* message processing */ +#define SKEIN_T1_BLK_TYPE_OUT SKEIN_T1_BLK_TYPE(OUT) /* output stage */ + +#define SKEIN_T1_BLK_TYPE_OUT_FINAL (SKEIN_T1_BLK_TYPE_OUT | SKEIN_T1_FLAG_FINAL) + +#define Skein_Set_Tweak(ctxPtr,TWK_NUM,tVal) {(ctxPtr)->h.T[TWK_NUM] = (tVal);} + +#define Skein_Set_T0(ctxPtr,T0) Skein_Set_Tweak(ctxPtr,0,T0) +#define Skein_Set_T1(ctxPtr,T1) Skein_Set_Tweak(ctxPtr,1,T1) + +#define Skein_Set_T0_T1(ctxPtr,T0,T1) { \ + Skein_Set_T0(ctxPtr,(T0)); \ + Skein_Set_T1(ctxPtr,(T1)); } + +#define Skein_Start_New_Type(ctxPtr,BLK_TYPE) \ +{ Skein_Set_T0_T1(ctxPtr,0,SKEIN_T1_FLAG_FIRST | SKEIN_T1_BLK_TYPE_##BLK_TYPE); (ctxPtr)->h.bCnt=0; } + +#define Skein_Set_Bit_Pad_Flag(hdr) { (hdr).T[1] |= SKEIN_T1_FLAG_BIT_PAD; } + +#define KW_TWK_BASE (0) +#define KW_KEY_BASE (3) +#define ks (kw + KW_KEY_BASE) +#define ts (kw + KW_TWK_BASE) + +#define R512(p0,p1,p2,p3,p4,p5,p6,p7,R512ROT,rNum) \ + X##p0 += X##p1; X##p1 = ROTL64(X##p1,R512ROT##_0); X##p1 ^= X##p0; \ + X##p2 += X##p3; X##p3 = ROTL64(X##p3,R512ROT##_1); X##p3 ^= X##p2; \ + X##p4 += X##p5; X##p5 = ROTL64(X##p5,R512ROT##_2); X##p5 ^= X##p4; \ + X##p6 += X##p7; X##p7 = ROTL64(X##p7,R512ROT##_3); X##p7 ^= X##p6; + +#define I512(R) \ + X0 += ks[((R)+1) % 9]; \ + X1 += ks[((R)+2) % 9]; \ + X2 += ks[((R)+3) % 9]; \ + X3 += ks[((R)+4) % 9]; \ + X4 += ks[((R)+5) % 9]; \ + X5 += ks[((R)+6) % 9] + ts[((R)+1) % 3]; \ + X6 += ks[((R)+7) % 9] + ts[((R)+2) % 3]; \ + X7 += ks[((R)+8) % 9] + (R)+1; + + +#define R512_8_rounds(R) \ + R512(0,1,2,3,4,5,6,7,R_512_0,8*(R)+ 1); \ + R512(2,1,4,7,6,5,0,3,R_512_1,8*(R)+ 2); \ + R512(4,1,6,3,0,5,2,7,R_512_2,8*(R)+ 3); \ + R512(6,1,0,7,2,5,4,3,R_512_3,8*(R)+ 4); \ + I512(2*(R)); \ + R512(0,1,2,3,4,5,6,7,R_512_4,8*(R)+ 5); \ + R512(2,1,4,7,6,5,0,3,R_512_5,8*(R)+ 6); \ + R512(4,1,6,3,0,5,2,7,R_512_6,8*(R)+ 7); \ + R512(6,1,0,7,2,5,4,3,R_512_7,8*(R)+ 8); \ + I512(2*(R)+1); + +typedef struct +{ + size_t hashBitLen; + size_t bCnt; + 
uint64_t T[SKEIN_MODIFIER_WORDS]; +} Skein_Ctxt_Hdr_t; + +typedef struct { + Skein_Ctxt_Hdr_t h; + uint64_t X[SKEIN_256_STATE_WORDS]; + uint8_t b[SKEIN_256_BLOCK_BYTES]; +} Skein_256_Ctxt_t; + +typedef struct { + Skein_Ctxt_Hdr_t h; + uint64_t X[SKEIN_512_STATE_WORDS]; + uint8_t b[SKEIN_512_BLOCK_BYTES]; +} Skein_512_Ctxt_t; + +typedef struct { + Skein_Ctxt_Hdr_t h; + uint64_t X[SKEIN1024_STATE_WORDS]; + uint8_t b[SKEIN1024_BLOCK_BYTES]; +} Skein1024_Ctxt_t; + +typedef struct { + uint_t statebits; + union { + Skein_Ctxt_Hdr_t h; + Skein_256_Ctxt_t ctx_256; + Skein_512_Ctxt_t ctx_512; + Skein1024_Ctxt_t ctx1024; + } u; +} skeinHashState; + +__device__ void cn_skein_init(skeinHashState *state, size_t hashBitLen) +{ + const uint64_t SKEIN_512_IV_256[] = + { + SKEIN_MK_64(0xCCD044A1,0x2FDB3E13), + SKEIN_MK_64(0xE8359030,0x1A79A9EB), + SKEIN_MK_64(0x55AEA061,0x4F816E6F), + SKEIN_MK_64(0x2A2767A4,0xAE9B94DB), + SKEIN_MK_64(0xEC06025E,0x74DD7683), + SKEIN_MK_64(0xE7A436CD,0xC4746251), + SKEIN_MK_64(0xC36FBAF9,0x393AD185), + SKEIN_MK_64(0x3EEDBA18,0x33EDFC13) + }; + + Skein_512_Ctxt_t *ctx = &state->u.ctx_512; + + ctx->h.hashBitLen = hashBitLen; + + memcpy(ctx->X, SKEIN_512_IV_256, sizeof(ctx->X)); + + Skein_Start_New_Type(ctx, MSG); +} + +__device__ void cn_skein512_processblock(Skein_512_Ctxt_t * __restrict__ ctx, const uint8_t * __restrict__ blkPtr, size_t blkCnt, size_t byteCntAdd) +{ + enum { + R_512_0_0=46, R_512_0_1=36, R_512_0_2=19, R_512_0_3=37, + R_512_1_0=33, R_512_1_1=27, R_512_1_2=14, R_512_1_3=42, + R_512_2_0=17, R_512_2_1=49, R_512_2_2=36, R_512_2_3=39, + R_512_3_0=44, R_512_3_1= 9, R_512_3_2=54, R_512_3_3=56, + R_512_4_0=39, R_512_4_1=30, R_512_4_2=34, R_512_4_3=24, + R_512_5_0=13, R_512_5_1=50, R_512_5_2=10, R_512_5_3=17, + R_512_6_0=25, R_512_6_1=29, R_512_6_2=39, R_512_6_3=43, + R_512_7_0= 8, R_512_7_1=35, R_512_7_2=56, R_512_7_3=22 + }; + + uint64_t X0,X1,X2,X3,X4,X5,X6,X7; + uint64_t w[SKEIN_512_STATE_WORDS]; + uint64_t kw[SKEIN_512_STATE_WORDS+4]; + + 
ts[0] = ctx->h.T[0]; + ts[1] = ctx->h.T[1]; + + do + { + + ts[0] += byteCntAdd; + + ks[0] = ctx->X[0]; + ks[1] = ctx->X[1]; + ks[2] = ctx->X[2]; + ks[3] = ctx->X[3]; + ks[4] = ctx->X[4]; + ks[5] = ctx->X[5]; + ks[6] = ctx->X[6]; + ks[7] = ctx->X[7]; + ks[8] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^ + ks[4] ^ ks[5] ^ ks[6] ^ ks[7] ^ SKEIN_KS_PARITY; + + ts[2] = ts[0] ^ ts[1]; + + memcpy(w, blkPtr, SKEIN_512_STATE_WORDS << 3); + + X0 = w[0] + ks[0]; + X1 = w[1] + ks[1]; + X2 = w[2] + ks[2]; + X3 = w[3] + ks[3]; + X4 = w[4] + ks[4]; + X5 = w[5] + ks[5] + ts[0]; + X6 = w[6] + ks[6] + ts[1]; + X7 = w[7] + ks[7]; + + blkPtr += SKEIN_512_BLOCK_BYTES; + + R512_8_rounds( 0); + R512_8_rounds( 1); + R512_8_rounds( 2); + R512_8_rounds( 3); + R512_8_rounds( 4); + R512_8_rounds( 5); + R512_8_rounds( 6); + R512_8_rounds( 7); + R512_8_rounds( 8); + + ctx->X[0] = X0 ^ w[0]; + ctx->X[1] = X1 ^ w[1]; + ctx->X[2] = X2 ^ w[2]; + ctx->X[3] = X3 ^ w[3]; + ctx->X[4] = X4 ^ w[4]; + ctx->X[5] = X5 ^ w[5]; + ctx->X[6] = X6 ^ w[6]; + ctx->X[7] = X7 ^ w[7]; + + ts[1] &= ~SKEIN_T1_FLAG_FIRST; + } + while (--blkCnt); + + ctx->h.T[0] = ts[0]; + ctx->h.T[1] = ts[1]; +} + +__device__ void cn_skein_final(skeinHashState * __restrict__ state, uint8_t * __restrict__ hashVal) +{ + size_t i,n,byteCnt; + uint64_t X[SKEIN_512_STATE_WORDS]; + Skein_512_Ctxt_t *ctx = (Skein_512_Ctxt_t *)&state->u.ctx_512; + //size_t tmp; + //uint8_t *p8; + //uint64_t *p64; + + ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL; + + if (ctx->h.bCnt < SKEIN_512_BLOCK_BYTES) + { + memset(&ctx->b[ctx->h.bCnt],0,SKEIN_512_BLOCK_BYTES - ctx->h.bCnt); + //p8 = &ctx->b[ctx->h.bCnt]; + //tmp = SKEIN_512_BLOCK_BYTES - ctx->h.bCnt; + //for( i = 0; i < tmp; i++ ) *(p8+i) = 0; + } + + cn_skein512_processblock(ctx,ctx->b,1,ctx->h.bCnt); + + byteCnt = (ctx->h.hashBitLen + 7) >> 3; + + //uint8_t b[SKEIN_512_BLOCK_BYTES] == 64 + memset(ctx->b,0,sizeof(ctx->b)); + //p64 = (uint64_t *)ctx->b; + //for( i = 0; i < 8; i++ ) *(p64+i) = 0; + + memcpy(X,ctx->X,sizeof(X)); 
+ + for (i=0;i*SKEIN_512_BLOCK_BYTES < byteCnt;i++) + { + ((uint64_t *)ctx->b)[0]= (uint64_t)i; + Skein_Start_New_Type(ctx,OUT_FINAL); + cn_skein512_processblock(ctx,ctx->b,1,sizeof(uint64_t)); + n = byteCnt - i*SKEIN_512_BLOCK_BYTES; + if (n >= SKEIN_512_BLOCK_BYTES) + n = SKEIN_512_BLOCK_BYTES; + memcpy(hashVal+i*SKEIN_512_BLOCK_BYTES,ctx->X,n); + memcpy(ctx->X,X,sizeof(X)); /* restore the counter mode key for next time */ + } +} + +__device__ void cn_skein512_update(Skein_512_Ctxt_t * __restrict__ ctx, const uint8_t * __restrict__ msg, size_t msgByteCnt) +{ + size_t n; + + if (msgByteCnt + ctx->h.bCnt > SKEIN_512_BLOCK_BYTES) + { + + if (ctx->h.bCnt) + { + + n = SKEIN_512_BLOCK_BYTES - ctx->h.bCnt; + + if (n) + { + memcpy(&ctx->b[ctx->h.bCnt],msg,n); + msgByteCnt -= n; + msg += n; + ctx->h.bCnt += n; + } + + cn_skein512_processblock(ctx,ctx->b,1,SKEIN_512_BLOCK_BYTES); + ctx->h.bCnt = 0; + } + + if (msgByteCnt > SKEIN_512_BLOCK_BYTES) + { + n = (msgByteCnt-1) / SKEIN_512_BLOCK_BYTES; + cn_skein512_processblock(ctx,msg,n,SKEIN_512_BLOCK_BYTES); + msgByteCnt -= n * SKEIN_512_BLOCK_BYTES; + msg += n * SKEIN_512_BLOCK_BYTES; + } + } + + if (msgByteCnt) + { + memcpy(&ctx->b[ctx->h.bCnt],msg,msgByteCnt); + ctx->h.bCnt += msgByteCnt; + } +} + +__device__ void cn_skein_update(skeinHashState * __restrict__ state, const BitSequence * __restrict__ data, DataLength databitlen) +{ + if ((databitlen & 7) == 0) + { + cn_skein512_update(&state->u.ctx_512,data,databitlen >> 3); + } + else + { + + size_t bCnt = (databitlen >> 3) + 1; + uint8_t b,mask; + + mask = (uint8_t) (1u << (7 - (databitlen & 7))); + b = (uint8_t) ((data[bCnt-1] & (0-mask)) | mask); + + cn_skein512_update(&state->u.ctx_512,data,bCnt-1); + cn_skein512_update(&state->u.ctx_512,&b , 1 ); + + Skein_Set_Bit_Pad_Flag(state->u.h); + } +} + +__device__ void cn_skein(const BitSequence * __restrict__ data, DataLength len, BitSequence * __restrict__ hashval) +{ + int hashbitlen = 256; + DataLength databitlen = len << 
3; + skeinHashState state; + + state.statebits = 64*SKEIN_512_STATE_WORDS; + + cn_skein_init(&state, hashbitlen); + cn_skein_update(&state, data, databitlen); + cn_skein_final(&state, hashval); +} diff --git a/xmrstak/backend/plugin.hpp b/xmrstak/backend/plugin.hpp new file mode 100644 index 0000000..ff7c9d8 --- /dev/null +++ b/xmrstak/backend/plugin.hpp @@ -0,0 +1,99 @@ +#pragma once + +#include "xmrstak/misc/environment.hpp" +#include "xmrstak/params.hpp" + +#include <thread> +#include <atomic> +#include <vector> +#include <string> +#include "iBackend.hpp" +#include <iostream> + +#ifndef USE_PRECOMPILED_HEADERS +# ifdef WIN32 +# include <direct.h> +# include <windows.h> +# else +# include <sys/types.h> +# include <dlfcn.h> +# endif +# include <iostream> +#endif + +namespace xmrstak +{ + +struct plugin +{ + + plugin(const std::string backendName, const std::string libName) : fn_starterBackend(nullptr), m_backendName(backendName) + { +#ifdef WIN32 + libBackend = LoadLibrary(TEXT((libName + ".dll").c_str())); + if(!libBackend) + { + std::cerr << "WARNING: "<< m_backendName <<" cannot load backend library: " << (libName + ".dll") << std::endl; + return; + } +#else + libBackend = dlopen((params::inst().executablePrefix + "/lib" + libName + ".so").c_str(), RTLD_LAZY); + if(!libBackend) + { + std::cerr << "WARNING: "<< m_backendName <<" cannot load backend library: " << dlerror() << std::endl; + return; + } +#endif + +#ifdef WIN32 + fn_starterBackend = (starterBackend_t) GetProcAddress(libBackend, "xmrstak_start_backend"); + if (!fn_starterBackend) + { + std::cerr << "WARNING: backend plugin " << libName << " contains no entry 'xmrstak_start_backend': " <<GetLastError()<< std::endl; + } +#else + // reset last error + dlerror(); + fn_starterBackend = (starterBackend_t) dlsym(libBackend, "xmrstak_start_backend"); + const char* dlsym_error = dlerror(); + if(dlsym_error) + { + std::cerr << "WARNING: backend plugin " << libName << " contains no entry 'xmrstak_start_backend': 
" << dlsym_error << std::endl; + } +#endif + } + + std::vector<iBackend*>* startBackend(uint32_t threadOffset, miner_work& pWork, environment& env) + { + if(fn_starterBackend == nullptr) + { + std::vector<iBackend*>* pvThreads = new std::vector<iBackend*>(); + std::cerr << "WARNING: " << m_backendName << " Backend disabled"<< std::endl; + return pvThreads; + } + + return fn_starterBackend(threadOffset, pWork, env); + } + + std::string m_backendName; + + typedef std::vector<iBackend*>* (*starterBackend_t)(uint32_t threadOffset, miner_work& pWork, environment& env); + + starterBackend_t fn_starterBackend; + +#ifdef WIN32 + HINSTANCE libBackend; +#else + void *libBackend; +#endif + +/* \todo add unload to destructor and change usage of plugin that libs keeped open until the miner endss +#ifdef WIN32 + FreeLibrary(libBackend); +#else + dlclose(libBackend); +#endif + * */ +}; + +} // namepsace xmrstak diff --git a/xmrstak/cli/cli-miner.cpp b/xmrstak/cli/cli-miner.cpp new file mode 100644 index 0000000..ff31d2c --- /dev/null +++ b/xmrstak/cli/cli-miner.cpp @@ -0,0 +1,370 @@ + /* + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ * + * Additional permission under GNU GPL version 3 section 7 + * + * If you modify this Program, or any covered work, by linking or combining + * it with OpenSSL (or a modified version of that library), containing parts + * covered by the terms of OpenSSL License and SSLeay License, the licensors + * of this Program grant you additional permission to convey the resulting work. + * + */ + +#include "xmrstak/misc/executor.hpp" +#include "xmrstak/backend/miner_work.hpp" +#include "xmrstak/backend/globalStates.hpp" +#include "xmrstak/backend/backendConnector.hpp" +#include "xmrstak/jconf.hpp" +#include "xmrstak/misc/console.hpp" +#include "xmrstak/donate-level.hpp" +#include "xmrstak/params.hpp" +#include "xmrstak/misc/configEditor.hpp" +#include "xmrstak/version.hpp" + +#ifndef CONF_NO_HTTPD +# include "xmrstak/http//httpd.hpp" +#endif + +#include <stdlib.h> +#include <stdio.h> +#include <string> +#include <iostream> +#include <time.h> +#include <iostream> + +#ifndef CONF_NO_TLS +#include <openssl/ssl.h> +#include <openssl/err.h> +#endif + + +#ifdef _WIN32 +# define strcasecmp _stricmp +#endif // _WIN32 + +void do_benchmark(); + +void help() +{ + using namespace std; + using namespace xmrstak; + + cout<<"Usage: "<<params::inst().binaryName<<" [OPTION]..."<<endl; + cout<<" "<<endl; + cout<<" -c, --config FILE common miner configuration file"<<endl; + cout<<" -h, --help show this help"<<endl; +#ifndef CONF_NO_CPU + cout<<" --noCPU disable the CPU miner backend"<<endl; + cout<<" --cpu FILE CPU backend miner config file"<<endl; +#endif +#ifndef CONF_NO_OPENCL + cout<<" --noAMD disable the AMD miner backend"<<endl; + cout<<" --amd FILE AMD backend miner config file"<<endl; +#endif +#ifndef CONF_NO_CUDA + cout<<" --noNVIDIA disable the NVIDIA miner backend"<<endl; + cout<<" --nvidia FILE NVIDIA backend miner config file"<<endl; +#endif + cout<<" "<<endl; + cout<<"The Following options temporary overwrites the config file settings:"<<endl; + cout<<" -o, --url URL pool url 
and port, e.g. pool.usxmrpool.com:3333"<<endl; + cout<<" -u, --user USERNAME pool user name or wallet address"<<endl; + cout<<" -p, --pass PASSWD pool password, in the most cases x or empty \"\""<<endl; + cout<<" \n"<<endl; + cout<<XMR_STAK_NAME<<" "<<XMR_STAK_VERSION<<endl; + cout<<"Brought to by fireice_uk and psychocrypt under GPLv3."<<endl; +} + +int main(int argc, char *argv[]) +{ +#ifndef CONF_NO_TLS + SSL_library_init(); + SSL_load_error_strings(); + ERR_load_BIO_strings(); + ERR_load_crypto_strings(); + SSL_load_error_strings(); + OpenSSL_add_all_digests(); +#endif + + srand(time(0)); + + using namespace xmrstak; + + std::string pathWithName(argv[0]); + auto pos = pathWithName.rfind("/"); + if(pos == std::string::npos) + { + // try windows "\" + pos = pathWithName.rfind("\\"); + } + params::inst().binaryName = std::string(pathWithName, pos + 1, std::string::npos); + if(params::inst().binaryName.compare(pathWithName) != 0) + params::inst().executablePrefix = std::string(pathWithName, 0, pos); + + bool userSetPasswd = false; + for(int i = 1; i < argc; ++i) + { + std::string opName(argv[i]); + if(opName.compare("-h") == 0 || opName.compare("--help") == 0) + { + help(); + // \todo give return code to win_exit to allow passing CI + //win_exit(); + return 0; + } + else if(opName.compare("--noCPU") == 0) + { + params::inst().useCPU = false; + } + else if(opName.compare("--noAMD") == 0) + { + params::inst().useAMD = false; + } + else if(opName.compare("--noNVIDIA") == 0) + { + params::inst().useNVIDIA = false; + } + else if(opName.compare("--cpu") == 0) + { + ++i; + if( i >=argc ) + { + printer::inst()->print_msg(L0, "No argument for parameter '--cpu' given"); + win_exit(); + return 1; + } + params::inst().configFileCPU = argv[i]; + } + else if(opName.compare("--amd") == 0) + { + ++i; + if( i >=argc ) + { + printer::inst()->print_msg(L0, "No argument for parameter '--amd' given"); + win_exit(); + return 1; + } + params::inst().configFileAMD = argv[i]; + } + else 
if(opName.compare("--nvidia") == 0) + { + ++i; + if( i >=argc ) + { + printer::inst()->print_msg(L0, "No argument for parameter '--nvidia' given"); + win_exit(); + return 1; + } + params::inst().configFileNVIDIA = argv[i]; + } + else if(opName.compare("-o") == 0 || opName.compare("--url") == 0) + { + ++i; + if( i >=argc ) + { + printer::inst()->print_msg(L0, "No argument for parameter '-o/--url' given"); + win_exit(); + return 1; + } + params::inst().poolURL = argv[i]; + } + else if(opName.compare("-u") == 0 || opName.compare("--user") == 0) + { + ++i; + if( i >=argc ) + { + printer::inst()->print_msg(L0, "No argument for parameter '-u/--user' given"); + win_exit(); + return 1; + } + params::inst().poolUsername = argv[i]; + } + else if(opName.compare("-p") == 0 || opName.compare("--pass") == 0) + { + ++i; + if( i >=argc ) + { + printer::inst()->print_msg(L0, "No argument for parameter '-p/--pass' given"); + win_exit(); + return 1; + } + userSetPasswd = true; + params::inst().poolPasswd = argv[i]; + } + else if(opName.compare("-c") == 0 || opName.compare("--config") == 0) + { + ++i; + if( i >=argc ) + { + printer::inst()->print_msg(L0, "No argument for parameter '-c/--config' given"); + win_exit(); + return 1; + } + params::inst().configFile = argv[i]; + } + else + { + printer::inst()->print_msg(L0, "Parameter unknown '%s'",argv[i]); + win_exit(); + return 1; + } + } + + // check if we need a guided start + if(!configEditor::file_exist(params::inst().configFile)) + { + // load the template of the backend config into a char variable + const char *tpl = + #include "../config.tpl" + ; + configEditor configTpl{}; + configTpl.set(std::string(tpl)); + auto& pool = params::inst().poolURL; + if(pool.empty()) + { + std::cout<<"Please enter:\n- pool address: e.g. 
pool.usxmrpool.com:3333"<<std::endl; + std::cin >> pool; + } + auto& userName = params::inst().poolUsername; + if(userName.empty()) + { + std::cout<<"- user name (wallet address or pool login):"<<std::endl; + std::cin >> userName; + } + auto& passwd = params::inst().poolPasswd; + if(passwd.empty() && (!userSetPasswd)) + { + // clear everything from stdin to allow an empty password + std::cin.clear(); std::cin.ignore(INT_MAX,'\n'); + std::cout<<"- password (mostly empty or x):"<<std::endl; + getline(std::cin, passwd); + } + configTpl.replace("POOLURL", pool); + configTpl.replace("POOLUSER", userName); + configTpl.replace("POOLPASSWD", passwd); + configTpl.write(params::inst().configFile); + std::cout<<"Configuration stored in file '"<<params::inst().configFile<<"'"<<std::endl; + } + + if(!jconf::inst()->parse_config(params::inst().configFile.c_str())) + { + win_exit(); + return 0; + } + + if (!BackendConnector::self_test()) + { + win_exit(); + return 0; + } + +#ifndef CONF_NO_HTTPD + if(jconf::inst()->GetHttpdPort() != 0) + { + if (!httpd::inst()->start_daemon()) + { + win_exit(); + return 0; + } + } +#endif + + printer::inst()->print_str("-------------------------------------------------------------------\n"); + printer::inst()->print_str( XMR_STAK_NAME" " XMR_STAK_VERSION " mining software.\n"); + printer::inst()->print_str("Based on CPU mining code by wolf9466 (heavily optimized by fireice_uk).\n"); +#ifndef CONF_NO_CUDA + printer::inst()->print_str("NVIDIA mining code was written by KlausT and psychocrypt.\n"); +#endif +#ifndef CONF_NO_OPENCL + printer::inst()->print_str("AMD mining code was written by wolf9466.\n"); +#endif + printer::inst()->print_str("Brought to you by fireice_uk and psychocrypt under GPLv3.\n\n"); + char buffer[64]; + snprintf(buffer, sizeof(buffer), "Configurable dev donation level is set to %.1f %%\n\n", fDevDonationLevel * 100.0); + printer::inst()->print_str(buffer); + printer::inst()->print_str("You can use following keys to display 
reports:\n"); + printer::inst()->print_str("'h' - hashrate\n"); + printer::inst()->print_str("'r' - results\n"); + printer::inst()->print_str("'c' - connection\n"); + printer::inst()->print_str("-------------------------------------------------------------------\n"); + + if(strlen(jconf::inst()->GetOutputFile()) != 0) + printer::inst()->open_logfile(jconf::inst()->GetOutputFile()); + + executor::inst()->ex_start(jconf::inst()->DaemonMode()); + + using namespace std::chrono; + uint64_t lastTime = time_point_cast<milliseconds>(high_resolution_clock::now()).time_since_epoch().count(); + + int key; + while(true) + { + key = get_key(); + + switch(key) + { + case 'h': + executor::inst()->push_event(ex_event(EV_USR_HASHRATE)); + break; + case 'r': + executor::inst()->push_event(ex_event(EV_USR_RESULTS)); + break; + case 'c': + executor::inst()->push_event(ex_event(EV_USR_CONNSTAT)); + break; + default: + break; + } + + uint64_t currentTime = time_point_cast<milliseconds>(high_resolution_clock::now()).time_since_epoch().count(); + + /* Hard guard to make sure we never get called more than twice per second */ + if( currentTime - lastTime < 500) + std::this_thread::sleep_for(std::chrono::milliseconds(500 - (currentTime - lastTime))); + lastTime = currentTime; + } + + return 0; +} + +void do_benchmark() +{ + using namespace std::chrono; + std::vector<xmrstak::iBackend*>* pvThreads; + + printer::inst()->print_msg(L0, "Running a 60 second benchmark..."); + + uint8_t work[76] = {0}; + xmrstak::miner_work oWork = xmrstak::miner_work("", work, sizeof(work), 0, 0, 0); + pvThreads = xmrstak::BackendConnector::thread_starter(oWork); + + uint64_t iStartStamp = time_point_cast<milliseconds>(high_resolution_clock::now()).time_since_epoch().count(); + + std::this_thread::sleep_for(std::chrono::seconds(60)); + + oWork = xmrstak::miner_work(); + xmrstak::globalStates::inst().switch_work(oWork); + + double fTotalHps = 0.0; + for (uint32_t i = 0; i < pvThreads->size(); i++) + { + double fHps 
= pvThreads->at(i)->iHashCount; + fHps /= (pvThreads->at(i)->iTimestamp - iStartStamp) / 1000.0; + + printer::inst()->print_msg(L0, "Thread %u: %.1f H/S", i, fHps); + fTotalHps += fHps; + } + + printer::inst()->print_msg(L0, "Total: %.1f H/S", fTotalHps); +} diff --git a/config.txt b/xmrstak/config.tpl index 36d10d4..60d85cd 100644 --- a/config.txt +++ b/xmrstak/config.tpl @@ -1,184 +1,161 @@ -/*
- * Thread configuration for each thread. Make sure it matches the number above.
- * low_power_mode - This mode will double the cache usage, and double the single thread performance. It will
- * consume much less power (as less cores are working), but will max out at around 80-85% of
- * the maximum performance.
- *
- * no_prefetch - Some systems can gain up to an extra 5% here, but sometimes it will make no difference or make
- * things slower.
- *
- * affine_to_cpu - This can be either false (no affinity), or the CPU core number. Note that on hyperthreading
- * systems it is better to assign threads to physical cores. On Windows this usually means selecting
- * even or odd numbered cpu numbers. For Linux it will be usually the lower CPU numbers, so for a 4
- * physical core CPU you should select cpu numbers 0-3.
- *
- * On the first run the miner will look at your system and suggest a basic configuration that will work,
- * you can try to tweak it from there to get the best performance.
- *
- * A filled out configuration should look like this:
- * "cpu_threads_conf" :
- * [
- * { "low_power_mode" : false, "no_prefetch" : true, "affine_to_cpu" : 0 },
- * { "low_power_mode" : false, "no_prefetch" : true, "affine_to_cpu" : 1 },
- * ],
- */
-"cpu_threads_conf" :
-null,
-
-/*
- * LARGE PAGE SUPPORT
- * Large pages need a properly set up OS. It can be difficult if you are not used to systems administration,
- * but the performance results are worth the trouble - you will get around 20% boost. Slow memory mode is
- * meant as a backup, you won't get stellar results there. If you are running into trouble, especially
- * on Windows, please read the common issues in the README.
- *
- * By default we will try to allocate large pages. This means you need to "Run As Administrator" on Windows.
- * You need to edit your system's group policies to enable locking large pages. Here are the steps from MSDN
- *
- * 1. On the Start menu, click Run. In the Open box, type gpedit.msc.
- * 2. On the Local Group Policy Editor console, expand Computer Configuration, and then expand Windows Settings.
- * 3. Expand Security Settings, and then expand Local Policies.
- * 4. Select the User Rights Assignment folder.
- * 5. The policies will be displayed in the details pane.
- * 6. In the pane, double-click Lock pages in memory.
- * 7. In the Local Security Setting – Lock pages in memory dialog box, click Add User or Group.
- * 8. In the Select Users, Service Accounts, or Groups dialog box, add an account that you will run the miner on
- * 9. Reboot for change to take effect.
- *
- * Windows also tends to fragment memory a lot. If you are running on a system with 4-8GB of RAM you might need
- * to switch off all the auto-start applications and reboot to have a large enough chunk of contiguous memory.
- *
- * On Linux you will need to configure large page support "sudo sysctl -w vm.nr_hugepages=128" and increase your
- * ulimit -l. To do this you need to add the following lines to /etc/security/limits.conf - "* soft memlock 262144"
- * and "* hard memlock 262144". You can also do it Windows-style and simply run-as-root, but this is NOT
- * recommended for security reasons.
- *
- * Memory locking means that the kernel can't swap out the page to disk - something that is unlikely to happen on a
- * command line system that isn't starved of memory. I haven't observed any difference on a CLI Linux system between
- * locked and unlocked memory. If that is your setup see option "no_mlck".
- */
-
-/*
- * use_slow_memory defines our behavior with regards to large pages. There are four possible options here:
- * always - Don't even try to use large pages. Always use slow memory.
- * warn - We will try to use large pages, but fall back to slow memory if that fails.
- * no_mlck - This option is only relevant on Linux, where we can use large pages without locking memory.
- * It will never use slow memory, but it won't attempt to mlock
- * never - If we fail to allocate large pages we will print an error and exit.
- */
-"use_slow_memory" : "warn",
-
-/*
- * NiceHash mode
- * nicehash_nonce - Limit the nonce to 3 bytes as required by nicehash. This cuts all the safety margins, and
- * if a block isn't found within 30 minutes then you might run into nonce collisions. Number
- * of threads in this mode is hard-limited to 32.
- */
-"nicehash_nonce" : false,
-
-/*
- * Manual hardware AES override
- *
- * Some VMs don't report AES capability correctly. You can set this value to true to enforce hardware AES or
- * to false to force disable AES or null to let the miner decide if AES is used.
- *
- * WARNING: setting this to true on a CPU that doesn't support hardware AES will crash the miner.
- */
-"aes_override" : null,
-
-/*
- * TLS Settings
- * If you need real security, make sure tls_secure_algo is enabled (otherwise MITM attack can downgrade encryption
- * to trivially breakable stuff like DES and MD5), and verify the server's fingerprint through a trusted channel.
- *
- * use_tls - This option will make us connect using Transport Layer Security.
- * tls_secure_algo - Use only secure algorithms. This will make us quit with an error if we can't negotiate a secure algo.
- * tls_fingerprint - Server's SHA256 fingerprint. If this string is non-empty then we will check the server's cert against it.
- */
-"use_tls" : false,
-"tls_secure_algo" : true,
-"tls_fingerprint" : "",
-
-/*
- * pool_address - Pool address should be in the form "pool.supportxmr.com:3333". Only stratum pools are supported.
- * wallet_address - Your wallet, or pool login.
- * pool_password - Can be empty in most cases or "x".
- *
- * We feature pools up to 1MH/s. For a more complete list see M5M400's pool list at www.moneropools.com
- */
-"pool_address" : "pool.usxmrpool.com:3333",
-"wallet_address" : "",
-"pool_password" : "",
-
-/*
- * Network timeouts.
- * Because of the way this client is written it doesn't need to constantly talk (keep-alive) to the server to make
- * sure it is there. We detect a buggy / overloaded server by the call timeout. The default values will be ok for
- * nearly all cases. If they aren't the pool has most likely overload issues. Low call timeout values are preferable -
- * long timeouts mean that we waste hashes on potentially stale jobs. Connection report will tell you how long the
- * server usually takes to process our calls.
- *
- * call_timeout - How long should we wait for a response from the server before we assume it is dead and drop the connection.
- * retry_time - How long should we wait before another connection attempt.
- * Both values are in seconds.
- * giveup_limit - Limit how many times we try to reconnect to the pool. Zero means no limit. Note that stak miners
- * don't mine while the connection is lost, so your computer's power usage goes down to idle.
- */
-"call_timeout" : 10,
-"retry_time" : 10,
-"giveup_limit" : 0,
-
-/*
- * Output control.
- * Since most people are used to miners printing all the time, that's what we do by default too. This is suboptimal
- * really, since you cannot see errors under pages and pages of text and performance stats. Given that we have internal
- * performance monitors, there is very little reason to spew out pages of text instead of concise reports.
- * Press 'h' (hashrate), 'r' (results) or 'c' (connection) to print reports.
- *
- * verbose_level - 0 - Don't print anything.
- * 1 - Print intro, connection event, disconnect event
- * 2 - All of level 1, and new job (block) event if the difficulty is different from the last job
- * 3 - All of level 1, and new job (block) event in all cases, result submission event.
- * 4 - All of level 3, and automatic hashrate report printing
- */
-"verbose_level" : 3,
-
-/*
- * Automatic hashrate report
- *
- * h_print_time - How often, in seconds, should we print a hashrate report if verbose_level is set to 4.
- * This option has no effect if verbose_level is not 4.
- */
-"h_print_time" : 60,
-
-/*
- * Daemon mode
- *
- * If you are running the process in the background and you don't need the keyboard reports, set this to true.
- * This should solve the hashrate problems on some emulated terminals.
- */
-"daemon_mode" : false,
-
-/*
- * Output file
- *
- * output_file - This option will log all output to a file.
- *
- */
-"output_file" : "",
-
-/*
- * Built-in web server
- * I like checking my hashrate on my phone. Don't you?
- * Keep in mind that you will need to set up port forwarding on your router if you want to access it from
- * outside of your home network. Ports lower than 1024 on Linux systems will require root.
- *
- * httpd_port - Port we should listen on. Default, 0, will switch off the server.
- */
-"httpd_port" : 0,
-
-/*
- * prefer_ipv4 - IPv6 preference. If the host is available on both IPv4 and IPv6 net, which one should we choose?
- * This setting will only be needed in 2020's. No need to worry about it now.
- */
-"prefer_ipv4" : true,
+R"===( +/* + * pool_address - Pool address should be in the form "pool.supportxmr.com:3333". Only stratum pools are supported. + * wallet_address - Your wallet, or pool login. + * pool_password - Can be empty in most cases or "x". + * + * We feature pools up to 1MH/s. For a more complete list see M5M400's pool list at www.moneropools.com + */ +"pool_address" : "POOLURL", +"wallet_address" : "POOLUSER", +"pool_password" : "POOLPASSWD", + +/* + * Network timeouts. + * Because of the way this client is written it doesn't need to constantly talk (keep-alive) to the server to make + * sure it is there. We detect a buggy / overloaded server by the call timeout. The default values will be ok for + * nearly all cases. If they aren't the pool has most likely overload issues. Low call timeout values are preferable - + * long timeouts mean that we waste hashes on potentially stale jobs. Connection report will tell you how long the + * server usually takes to process our calls. + * + * call_timeout - How long should we wait for a response from the server before we assume it is dead and drop the connection. + * retry_time - How long should we wait before another connection attempt. + * Both values are in seconds. + * giveup_limit - Limit how many times we try to reconnect to the pool. Zero means no limit. Note that stak miners + * don't mine while the connection is lost, so your computer's power usage goes down to idle. + */ +"call_timeout" : 10, +"retry_time" : 10, +"giveup_limit" : 0, + +/* + * Output control. + * Since most people are used to miners printing all the time, that's what we do by default too. This is suboptimal + * really, since you cannot see errors under pages and pages of text and performance stats. Given that we have internal + * performance monitors, there is very little reason to spew out pages of text instead of concise reports. + * Press 'h' (hashrate), 'r' (results) or 'c' (connection) to print reports. + * + * verbose_level - 0 - Don't print anything. 
+ * 1 - Print intro, connection event, disconnect event + * 2 - All of level 1, and new job (block) event if the difficulty is different from the last job + * 3 - All of level 1, and new job (block) event in all cases, result submission event. + * 4 - All of level 3, and automatic hashrate report printing + */ +"verbose_level" : 3, + +/* + * Automatic hashrate report + * + * h_print_time - How often, in seconds, should we print a hashrate report if verbose_level is set to 4. + * This option has no effect if verbose_level is not 4. + */ +"h_print_time" : 60, + +/* + * Manual hardware AES override + * + * Some VMs don't report AES capability correctly. You can set this value to true to enforce hardware AES or + * to false to force disable AES or null to let the miner decide if AES is used. + * + * WARNING: setting this to true on a CPU that doesn't support hardware AES will crash the miner. + */ +"aes_override" : null, + +/* + * LARGE PAGE SUPPORT + * Large pages need a properly set up OS. It can be difficult if you are not used to systems administration, + * but the performance results are worth the trouble - you will get around 20% boost. Slow memory mode is + * meant as a backup, you won't get stellar results there. If you are running into trouble, especially + * on Windows, please read the common issues in the README. + * + * By default we will try to allocate large pages. This means you need to "Run As Administrator" on Windows. + * You need to edit your system's group policies to enable locking large pages. Here are the steps from MSDN + * + * 1. On the Start menu, click Run. In the Open box, type gpedit.msc. + * 2. On the Local Group Policy Editor console, expand Computer Configuration, and then expand Windows Settings. + * 3. Expand Security Settings, and then expand Local Policies. + * 4. Select the User Rights Assignment folder. + * 5. The policies will be displayed in the details pane. + * 6. In the pane, double-click Lock pages in memory. + * 7. 
In the Local Security Setting – Lock pages in memory dialog box, click Add User or Group. + * 8. In the Select Users, Service Accounts, or Groups dialog box, add an account that you will run the miner on + * 9. Reboot for change to take effect. + * + * Windows also tends to fragment memory a lot. If you are running on a system with 4-8GB of RAM you might need + * to switch off all the auto-start applications and reboot to have a large enough chunk of contiguous memory. + * + * On Linux you will need to configure large page support "sudo sysctl -w vm.nr_hugepages=128" and increase your + * ulimit -l. To do do this you need to add following lines to /etc/security/limits.conf - "* soft memlock 262144" + * and "* hard memlock 262144". You can also do it Windows-style and simply run-as-root, but this is NOT + * recommended for security reasons. + * + * Memory locking means that the kernel can't swap out the page to disk - something that is unlikely to happen on a + * command line system that isn't starved of memory. I haven't observed any difference on a CLI Linux system between + * locked and unlocked memory. If that is your setup see option "no_mlck". + */ + +/* + * use_slow_memory defines our behavior with regards to large pages. There are three possible options here: + * always - Don't even try to use large pages. Always use slow memory. + * warn - We will try to use large pages, but fall back to slow memory if that fails. + * no_mlck - This option is only relevant on Linux, where we can use large pages without locking memory. + * It will never use slow memory, but it won't attempt to mlock + * never - If we fail to allocate large pages we will print an error and exit. + */ +"use_slow_memory" : "warn", + +/* + * NiceHash mode + * nicehash_nonce - Limit the nonce to 3 bytes as required by nicehash. This cuts all the safety margins, and + * if a block isn't found within 30 minutes then you might run into nonce collisions. 
Number + * of threads in this mode is hard-limited to 32. + */ +"nicehash_nonce" : false, + +/* + * TLS Settings + * If you need real security, make sure tls_secure_algo is enabled (otherwise MITM attack can downgrade encryption + * to trivially breakable stuff like DES and MD5), and verify the server's fingerprint through a trusted channel. + * + * use_tls - This option will make us connect using Transport Layer Security. + * tls_secure_algo - Use only secure algorithms. This will make us quit with an error if we can't negotiate a secure algo. + * tls_fingerprint - Server's SHA256 fingerprint. If this string is non-empty then we will check the server's cert against it. + */ +"use_tls" : false, +"tls_secure_algo" : true, +"tls_fingerprint" : "", + +/* + * Daemon mode + * + * If you are running the process in the background and you don't need the keyboard reports, set this to true. + * This should solve the hashrate problems on some emulated terminals. + */ +"daemon_mode" : false, + +/* + * Output file + * + * output_file - This option will log all output to a file. + * + */ +"output_file" : "", + +/* + * Built-in web server + * I like checking my hashrate on my phone. Don't you? + * Keep in mind that you will need to set up port forwarding on your router if you want to access it from + * outside of your home network. Ports lower than 1024 on Linux systems will require root. + * + * httpd_port - Port we should listen on. Default, 0, will switch off the server. + */ +"httpd_port" : 0, + +/* + * prefer_ipv4 - IPv6 preference. If the host is available on both IPv4 and IPv6 net, which one should be choose? + * This setting will only be needed in 2020's. No need to worry about it now. + */ +"prefer_ipv4" : true, + +)===" +
\ No newline at end of file diff --git a/donate-level.h b/xmrstak/donate-level.hpp index 71b7962..71b7962 100644 --- a/donate-level.h +++ b/xmrstak/donate-level.hpp diff --git a/httpd.cpp b/xmrstak/http/httpd.cpp index 53b73f1..a112bbb 100644 --- a/httpd.cpp +++ b/xmrstak/http/httpd.cpp @@ -23,19 +23,19 @@ #ifndef CONF_NO_HTTPD + +#include "httpd.hpp" +#include "webdesign.hpp" +#include "xmrstak/net/msgstruct.hpp" +#include "xmrstak/misc/console.hpp" +#include "xmrstak/misc/executor.hpp" +#include "xmrstak/jconf.hpp" + #include <stdlib.h> #include <stdio.h> #include <string.h> #include <string> -#include "msgstruct.h" -#include "httpd.h" -#include "console.h" -#include "executor.h" -#include "jconf.h" - -#include "webdesign.h" - #include <microhttpd.h> #ifdef _WIN32 #define strcasecmp _stricmp diff --git a/httpd.h b/xmrstak/http/httpd.hpp index bc8bcf6..3836968 100644 --- a/httpd.h +++ b/xmrstak/http/httpd.hpp @@ -1,5 +1,7 @@ #pragma once +#include <stdlib.h> + struct MHD_Daemon; struct MHD_Connection; diff --git a/webdesign.cpp b/xmrstak/http/webdesign.cpp index e07b015..4dfd3c2 100644 --- a/webdesign.cpp +++ b/xmrstak/http/webdesign.cpp @@ -115,7 +115,7 @@ extern const char sHtmlCommonHeader [] = extern const char sHtmlHashrateBodyHigh [] = "<div class=data>" "<table>" - "<tr><th>Thread ID</th><th>2.5s</th><th>60s</th><th>15m</th><th rowspan='%u'>H/s</td></tr>"; + "<tr><th>Thread ID</th><th>10s</th><th>60s</th><th>15m</th><th rowspan='%u'>H/s</td></tr>"; extern const char sHtmlHashrateTableRow [] = "<tr><th>%u</th><td>%s</td><td>%s</td><td>%s</td></tr>"; diff --git a/webdesign.h b/xmrstak/http/webdesign.hpp index 92639a0..92639a0 100644 --- a/webdesign.h +++ b/xmrstak/http/webdesign.hpp diff --git a/jconf.cpp b/xmrstak/jconf.cpp index 4ac4c13..c033e66 100644 --- a/jconf.cpp +++ b/xmrstak/jconf.cpp @@ -21,8 +21,12 @@ * */ -#include "jconf.h" -#include "console.h" +#include "jconf.hpp" +#include "params.hpp" + +#include "xmrstak/misc/console.hpp" +#include 
"xmrstak/misc/jext.hpp" +#include "xmrstak/misc/console.hpp" #include <stdio.h> #include <stdlib.h> @@ -35,20 +39,16 @@ #include <cpuid.h> #endif -#include "rapidjson/document.h" -#include "rapidjson/error/en.h" -#include "jext.h" -#include "console.h" using namespace rapidjson; /* * This enum needs to match index in oConfigValues, otherwise we will get a runtime error */ -enum configEnum { aCpuThreadsConf, sUseSlowMem, bNiceHashMode, bAesOverride, +enum configEnum { bTlsMode, bTlsSecureAlgo, sTlsFingerprint, sPoolAddr, sWalletAddr, sPoolPwd, iCallTimeout, iNetRetry, iGiveUpLimit, iVerboseLevel, iAutohashTime, - bDaemonMode, sOutputFile, iHttpdPort, bPreferIpv4 }; + bDaemonMode, sOutputFile, iHttpdPort, bPreferIpv4, bNiceHashMode, bAesOverride, sUseSlowMem }; struct configVal { configEnum iName; @@ -59,10 +59,6 @@ struct configVal { // Same order as in configEnum, as per comment above // kNullType means any type configVal oConfigValues[] = { - { aCpuThreadsConf, "cpu_threads_conf", kNullType }, - { sUseSlowMem, "use_slow_memory", kStringType }, - { bNiceHashMode, "nicehash_nonce", kTrueType }, - { bAesOverride, "aes_override", kNullType }, { bTlsMode, "use_tls", kTrueType }, { bTlsSecureAlgo, "tls_secure_algo", kTrueType }, { sTlsFingerprint, "tls_fingerprint", kStringType }, @@ -77,7 +73,10 @@ configVal oConfigValues[] = { { bDaemonMode, "daemon_mode", kTrueType }, { sOutputFile, "output_file", kStringType }, { iHttpdPort, "httpd_port", kNumberType }, - { bPreferIpv4, "prefer_ipv4", kTrueType } + { bPreferIpv4, "prefer_ipv4", kTrueType }, + { bNiceHashMode, "nicehash_nonce", kTrueType }, + { bAesOverride, "aes_override", kNullType }, + { sUseSlowMem, "use_slow_memory", kStringType } }; constexpr size_t iConfigCnt = (sizeof(oConfigValues)/sizeof(oConfigValues[0])); @@ -106,70 +105,11 @@ struct jconf::opaque_private } }; -jconf* jconf::oInst = nullptr; - jconf::jconf() { prv = new opaque_private(); } -bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg) -{ - 
if(!prv->configValues[aCpuThreadsConf]->IsArray()) - return false; - - if(id >= prv->configValues[aCpuThreadsConf]->Size()) - return false; - - const Value& oThdConf = prv->configValues[aCpuThreadsConf]->GetArray()[id]; - - if(!oThdConf.IsObject()) - return false; - - const Value *mode, *no_prefetch, *aff; - mode = GetObjectMember(oThdConf, "low_power_mode"); - no_prefetch = GetObjectMember(oThdConf, "no_prefetch"); - aff = GetObjectMember(oThdConf, "affine_to_cpu"); - - if(mode == nullptr || no_prefetch == nullptr || aff == nullptr) - return false; - - if(!mode->IsBool() || !no_prefetch->IsBool()) - return false; - - if(!aff->IsNumber() && !aff->IsBool()) - return false; - - if(aff->IsNumber() && aff->GetInt64() < 0) - return false; - - cfg.bDoubleMode = mode->GetBool(); - cfg.bNoPrefetch = no_prefetch->GetBool(); - - if(aff->IsNumber()) - cfg.iCpuAff = aff->GetInt64(); - else - cfg.iCpuAff = -1; - - return true; -} - -jconf::slow_mem_cfg jconf::GetSlowMemSetting() -{ - const char* opt = prv->configValues[sUseSlowMem]->GetString(); - - if(strcasecmp(opt, "always") == 0) - return always_use; - else if(strcasecmp(opt, "no_mlck") == 0) - return no_mlck; - else if(strcasecmp(opt, "warn") == 0) - return print_warning; - else if(strcasecmp(opt, "never") == 0) - return never_use; - else - return unknown_value; -} - bool jconf::GetTlsSetting() { return prv->configValues[bTlsMode]->GetBool(); @@ -187,17 +127,27 @@ const char* jconf::GetTlsFingerprint() const char* jconf::GetPoolAddress() { - return prv->configValues[sPoolAddr]->GetString(); + auto& poolURL = xmrstak::params::inst().poolURL; + if(poolURL.empty()) + poolURL = prv->configValues[sPoolAddr]->GetString(); + return poolURL.c_str(); } const char* jconf::GetPoolPwd() { - return prv->configValues[sPoolPwd]->GetString(); + auto& poolPasswd = xmrstak::params::inst().poolPasswd; + if(poolPasswd.empty()) + poolPasswd = prv->configValues[sPoolPwd]->GetString(); + return poolPasswd.c_str(); + } const char* 
jconf::GetWalletAddress() { - return prv->configValues[sWalletAddr]->GetString(); + auto& poolUsername = xmrstak::params::inst().poolUsername; + if(poolUsername.empty()) + poolUsername = prv->configValues[sWalletAddr]->GetString(); + return poolUsername.c_str(); } bool jconf::PreferIpv4() @@ -205,19 +155,6 @@ bool jconf::PreferIpv4() return prv->configValues[bPreferIpv4]->GetBool(); } -size_t jconf::GetThreadCount() -{ - if(prv->configValues[aCpuThreadsConf]->IsArray()) - return prv->configValues[aCpuThreadsConf]->Size(); - else - return 0; -} - -bool jconf::NeedsAutoconf() -{ - return !prv->configValues[aCpuThreadsConf]->IsArray(); -} - uint64_t jconf::GetCallTimeout() { return prv->configValues[iCallTimeout]->GetUint64(); @@ -248,11 +185,6 @@ uint16_t jconf::GetHttpdPort() return prv->configValues[iHttpdPort]->GetUint(); } -bool jconf::NiceHashMode() -{ - return prv->configValues[bNiceHashMode]->GetBool(); -} - bool jconf::DaemonMode() { return prv->configValues[bDaemonMode]->GetBool(); @@ -263,6 +195,12 @@ const char* jconf::GetOutputFile() return prv->configValues[sOutputFile]->GetString(); } +bool jconf::NiceHashMode() +{ + return prv->configValues[bNiceHashMode]->GetBool(); +} + + void jconf::cpuid(uint32_t eax, int32_t ecx, int32_t val[4]) { memset(val, 0, sizeof(int32_t)*4); @@ -289,6 +227,22 @@ bool jconf::check_cpu_features() return bHaveSse2; } +jconf::slow_mem_cfg jconf::GetSlowMemSetting() +{ + const char* opt = prv->configValues[sUseSlowMem]->GetString(); + + if(strcasecmp(opt, "always") == 0) + return always_use; + else if(strcasecmp(opt, "no_mlck") == 0) + return no_mlck; + else if(strcasecmp(opt, "warn") == 0) + return print_warning; + else if(strcasecmp(opt, "never") == 0) + return never_use; + else + return unknown_value; +} + bool jconf::parse_config(const char* sFilename) { FILE * pFile; @@ -389,29 +343,6 @@ bool jconf::parse_config(const char* sFilename) } } - thd_cfg c; - for(size_t i=0; i < GetThreadCount(); i++) - { - if(!GetThreadConfig(i, 
c)) - { - printer::inst()->print_msg(L0, "Thread %llu has invalid config.", int_port(i)); - return false; - } - } - - if(NiceHashMode() && GetThreadCount() >= 32) - { - printer::inst()->print_msg(L0, "You need to use less than 32 threads in NiceHash mode."); - return false; - } - - if(GetSlowMemSetting() == unknown_value) - { - printer::inst()->print_msg(L0, - "Invalid config file. use_slow_memory must be \"always\", \"no_mlck\", \"warn\" or \"never\""); - return false; - } - if(!prv->configValues[iCallTimeout]->IsUint64() || !prv->configValues[iNetRetry]->IsUint64() || !prv->configValues[iGiveUpLimit]->IsUint64()) @@ -444,18 +375,14 @@ bool jconf::parse_config(const char* sFilename) } #endif // CONF_NO_TLS -#ifdef _WIN32 - if(GetSlowMemSetting() == no_mlck) + /* \todo check in the cpu backend if we have more than 32 worker + * keep in mined that we have change the why how the nonce is calculated (reverse thread index) + if(NiceHashMode() && GetThreadCount() >= 32) { - printer::inst()->print_msg(L0, "On Windows large pages need mlock. Please use another option."); + printer::inst()->print_msg(L0, "You need to use less than 32 threads in NiceHash mode."); return false; } -#endif // _WIN32 - - printer::inst()->set_verbose_level(prv->configValues[iVerboseLevel]->GetUint64()); - - if(NeedsAutoconf()) - return true; + */ if(prv->configValues[bAesOverride]->IsBool()) bHaveAes = prv->configValues[bAesOverride]->GetBool(); @@ -463,5 +390,22 @@ bool jconf::parse_config(const char* sFilename) if(!bHaveAes) printer::inst()->print_msg(L0, "Your CPU doesn't support hardware AES. Don't expect high hashrates."); + printer::inst()->set_verbose_level(prv->configValues[iVerboseLevel]->GetUint64()); + + if(GetSlowMemSetting() == unknown_value) + { + printer::inst()->print_msg(L0, + "Invalid config file. 
use_slow_memory must be \"always\", \"no_mlck\", \"warn\" or \"never\""); + return false; + } + +#ifdef _WIN32 + if(GetSlowMemSetting() == no_mlck) + { + printer::inst()->print_msg(L0, "On Windows large pages need mlock. Please use another option."); + return false; + } +#endif // _WIN32 + return true; } diff --git a/jconf.h b/xmrstak/jconf.hpp index c42fbe0..688ffe1 100644 --- a/jconf.h +++ b/xmrstak/jconf.hpp @@ -1,17 +1,24 @@ #pragma once + +#include "xmrstak/misc/environment.hpp" +#include "params.hpp" + #include <stdlib.h> #include <string> + class jconf { public: static jconf* inst() { - if (oInst == nullptr) oInst = new jconf; - return oInst; + auto& env = xmrstak::environment::inst(); + if(env.pJconfConfig == nullptr) + env.pJconfConfig = new jconf; + return env.pJconfConfig; }; - bool parse_config(const char* sFilename); + bool parse_config(const char* sFilename = xmrstak::params::inst().configFile.c_str()); struct thd_cfg { bool bDoubleMode; @@ -27,12 +34,6 @@ public: unknown_value }; - size_t GetThreadCount(); - bool GetThreadConfig(size_t id, thd_cfg &cfg); - bool NeedsAutoconf(); - - slow_mem_cfg GetSlowMemSetting(); - bool GetTlsSetting(); bool TlsSecureAlgos(); const char* GetTlsFingerprint(); @@ -52,19 +53,21 @@ public: uint16_t GetHttpdPort(); - bool NiceHashMode(); - bool DaemonMode(); bool PreferIpv4(); + + bool NiceHashMode(); + inline bool HaveHardwareAes() { return bHaveAes; } static void cpuid(uint32_t eax, int32_t ecx, int32_t val[4]); + slow_mem_cfg GetSlowMemSetting(); + private: jconf(); - static jconf* oInst; bool check_cpu_features(); struct opaque_private; diff --git a/xmrstak/misc/configEditor.hpp b/xmrstak/misc/configEditor.hpp new file mode 100644 index 0000000..3960384 --- /dev/null +++ b/xmrstak/misc/configEditor.hpp @@ -0,0 +1,57 @@ +#pragma once + +#include <atomic> +#include <string> +#include <fstream> +#include <streambuf> +#include <regex> + + +namespace xmrstak +{ + +struct configEditor +{ + std::string m_fileContent; + + 
configEditor() + { + + } + + static bool file_exist( const std::string filename) + { + std::ifstream fstream(filename); + return fstream.good(); + } + + void set( const std::string && content) + { + m_fileContent = content; + } + + bool load(const std::string filename) + { + std::ifstream fstream(filename); + m_fileContent = std::string( + (std::istreambuf_iterator<char>(fstream)), + std::istreambuf_iterator<char>() + ); + return fstream.good(); + } + + void write(const std::string filename) + { + std::ofstream out(filename); + out << m_fileContent; + out.close(); + } + + void replace(const std::string search, const std::string substring) + { + m_fileContent = std::regex_replace(m_fileContent, std::regex(search), substring); + } + +}; + +} // namepsace xmrstak diff --git a/console.cpp b/xmrstak/misc/console.cpp index 6a2555b..ba34bb3 100644 --- a/console.cpp +++ b/xmrstak/misc/console.cpp @@ -21,11 +21,13 @@ * */ -#include "console.h" +#include "xmrstak/misc/console.hpp" + #include <time.h> #include <stdio.h> #include <string.h> #include <stdarg.h> +#include <cstdlib> #ifdef _WIN32 #include <windows.h> @@ -150,8 +152,6 @@ inline void comp_localtime(const time_t* ctime, tm* stime) #endif // __WIN32 } -printer* printer::oInst = nullptr; - printer::printer() { verbose_level = LINF; @@ -211,3 +211,18 @@ void printer::print_str(const char* str) fflush(logfile); } } + +//Do a press any key for the windows folk. 
*insert any key joke here* +#ifdef _WIN32 +void win_exit() +{ + printer::inst()->print_str("Press any key to exit."); + get_key(); + std::exit(1); +} + +#else +void win_exit() { + std::exit(1); +} +#endif // _WIN32 diff --git a/console.h b/xmrstak/misc/console.hpp index 47c3c94..4d5be78 100644 --- a/console.h +++ b/xmrstak/misc/console.hpp @@ -1,6 +1,10 @@ #pragma once + +#include "xmrstak/misc/environment.hpp" + #include <mutex> + enum out_colours { K_RED, K_GREEN, K_BLUE, K_YELLOW, K_CYAN, K_MAGENTA, K_WHITE, K_NONE }; // Warning - on Linux get_key will detect control keys, but not on Windows. @@ -24,8 +28,10 @@ class printer public: static inline printer* inst() { - if (oInst == nullptr) oInst = new printer; - return oInst; + auto& env = xmrstak::environment::inst(); + if(env.pPrinter == nullptr) + env.pPrinter = new printer; + return env.pPrinter; }; inline void set_verbose_level(size_t level) { verbose_level = (verbosity)level; } @@ -35,9 +41,10 @@ public: private: printer(); - static printer* oInst; std::mutex print_mutex; verbosity verbose_level; FILE* logfile; }; + +void win_exit(); diff --git a/xmrstak/misc/environment.hpp b/xmrstak/misc/environment.hpp new file mode 100644 index 0000000..6140d7d --- /dev/null +++ b/xmrstak/misc/environment.hpp @@ -0,0 +1,46 @@ +#pragma once + +class printer; +class jconf; +class executor; + +namespace xmrstak +{ + +class globalStates; +class params; + +struct environment +{ + + static environment& inst() + { + static environment env; + return env; + } + + environment& operator=(const environment& env) + { + this->pPrinter = env.pPrinter; + this->pglobalStates = env.pglobalStates; + this->pJconfConfig = env.pJconfConfig; + this->pExecutor = env.pExecutor; + this->pParams = env.pParams; + return *this; + } + + + environment() : pPrinter(nullptr), pglobalStates(nullptr) + { + } + + + printer* pPrinter; + globalStates* pglobalStates; + jconf* pJconfConfig; + executor* pExecutor; + params* pParams; + +}; + +} // namepsace 
xmrstak diff --git a/executor.cpp b/xmrstak/misc/executor.cpp index 948b156..ce5fbb2 100644 --- a/executor.cpp +++ b/xmrstak/misc/executor.cpp @@ -21,26 +21,30 @@ * */ +#include "executor.hpp" +#include "xmrstak/net/jpsock.hpp" + +#include "telemetry.hpp" +#include "xmrstak/backend/miner_work.hpp" +#include "xmrstak/backend/globalStates.hpp" +#include "xmrstak/backend/backendConnector.hpp" + +#include "xmrstak/jconf.hpp" +#include "xmrstak/misc/console.hpp" +#include "xmrstak/donate-level.hpp" +#include "xmrstak/http/webdesign.hpp" + #include <thread> #include <string> #include <cmath> #include <algorithm> #include <assert.h> #include <time.h> -#include "executor.h" -#include "jpsock.h" -#include "minethd.h" -#include "jconf.h" -#include "console.h" -#include "donate-level.h" -#include "webdesign.h" #ifdef _WIN32 #define strncasecmp _strnicmp #endif // _WIN32 -executor* executor::oInst = NULL; - executor::executor() { } @@ -116,8 +120,8 @@ void executor::sched_reconnect() printer::inst()->print_msg(L1, "Pool connection lost. 
Waiting %lld s before retry (attempt %llu).", rt, int_port(iReconnectAttempts)); - auto work = minethd::miner_work(); - minethd::switch_work(work); + auto work = xmrstak::miner_work(); + xmrstak::globalStates::inst().switch_work(work); push_timed_event(ex_event(EV_RECONNECT, usr_pool_id), rt); } @@ -229,12 +233,13 @@ void executor::on_pool_have_job(size_t pool_id, pool_job& oPoolJob) jpsock* pool = pick_pool_by_id(pool_id); - minethd::miner_work oWork(oPoolJob.sJobID, oPoolJob.bWorkBlob, + xmrstak::miner_work oWork(oPoolJob.sJobID, oPoolJob.bWorkBlob, oPoolJob.iWorkLen, oPoolJob.iResumeCnt, oPoolJob.iTarget, - pool_id != dev_pool_id && jconf::inst()->NiceHashMode(), pool_id); - minethd::switch_work(oWork); + oWork.iTarget32 = oPoolJob.iTarget32; + + xmrstak::globalStates::inst().switch_work(oWork); if(pool_id == dev_pool_id) return; @@ -350,11 +355,13 @@ void executor::on_switch_pool(size_t pool_id) return; } - minethd::miner_work oWork(oPoolJob.sJobID, oPoolJob.bWorkBlob, + xmrstak::miner_work oWork(oPoolJob.sJobID, oPoolJob.bWorkBlob, oPoolJob.iWorkLen, oPoolJob.iResumeCnt, oPoolJob.iTarget, - jconf::inst()->NiceHashMode(), pool_id); + pool_id); + + oWork.iTarget32 = oPoolJob.iTarget32; - minethd::switch_work(oWork); + xmrstak::globalStates::inst().switch_work(oWork); if(dev_pool->is_running()) push_timed_event(ex_event(EV_DEV_POOL_EXIT), 5); @@ -365,9 +372,18 @@ void executor::ex_main() { assert(1000 % iTickTime == 0); - minethd::miner_work oWork = minethd::miner_work(); - pvThreads = minethd::thread_starter(oWork); - telem = new telemetry(pvThreads->size()); + xmrstak::miner_work oWork = xmrstak::miner_work(); + + // \todo collect all backend threads + pvThreads = xmrstak::BackendConnector::thread_starter(oWork); + + if(pvThreads->size()==0) + { + printer::inst()->print_msg(L1, "ERROR: No miner backend enabled."); + win_exit(); + } + + telem = new xmrstak::telemetry(pvThreads->size()); current_pool_id = usr_pool_id; usr_pool = new jpsock(usr_pool_id, 
jconf::inst()->GetTlsSetting()); @@ -499,9 +515,9 @@ void executor::hashrate_report(std::string& out) size_t i; out.append("HASHRATE REPORT\n"); - out.append("| ID | 2.5s | 60s | 15m |"); + out.append("| ID | 10s | 60s | 15m |"); if(nthd != 1) - out.append(" ID | 2.5s | 60s | 15m |\n"); + out.append(" ID | 10s | 60s | 15m |\n"); else out.append(1, '\n'); @@ -509,7 +525,7 @@ void executor::hashrate_report(std::string& out) { double fHps[3]; - fHps[0] = telem->calc_telemetry_data(2500, i); + fHps[0] = telem->calc_telemetry_data(10000, i); fHps[1] = telem->calc_telemetry_data(60000, i); fHps[2] = telem->calc_telemetry_data(900000, i); diff --git a/executor.h b/xmrstak/misc/executor.hpp index 968db06..d705c27 100644 --- a/executor.h +++ b/xmrstak/misc/executor.hpp @@ -1,22 +1,37 @@ #pragma once + #include "thdq.hpp" -#include "msgstruct.h" +#include "telemetry.hpp" +#include "xmrstak/backend/iBackend.hpp" +#include "xmrstak/misc/environment.hpp" +#include "xmrstak/net/msgstruct.hpp" + #include <atomic> #include <array> #include <list> #include <future> + class jpsock; + +namespace xmrstak +{ +namespace cpu +{ class minethd; -class telemetry; + +} // namespace cpu +} // namepsace xmrstak class executor { public: static executor* inst() { - if (oInst == nullptr) oInst = new executor; - return oInst; + auto& env = xmrstak::environment::inst(); + if(env.pExecutor == nullptr) + env.pExecutor = new executor; + return env.pExecutor; }; void ex_start(bool daemon) { daemon ? 
ex_main() : std::thread(&executor::ex_main, this).detach(); } @@ -25,6 +40,7 @@ public: inline void push_event(ex_event&& ev) { oEventQ.push(std::move(ev)); } void push_timed_event(ex_event&& ev, size_t sec); + void log_result_error(std::string&& sError); constexpr static size_t invalid_pool_id = 0; constexpr static size_t dev_pool_id = 1; @@ -50,8 +66,8 @@ private: std::mutex timed_event_mutex; thdq<ex_event> oEventQ; - telemetry* telem; - std::vector<minethd*>* pvThreads; + xmrstak::telemetry* telem; + std::vector<xmrstak::iBackend*>* pvThreads; size_t current_pool_id; @@ -63,7 +79,6 @@ private: bool is_dev_time; executor(); - static executor* oInst; void ex_main(); @@ -157,7 +172,6 @@ private: double fHighestHps = 0.0; void log_socket_error(std::string&& sError); - void log_result_error(std::string&& sError); void log_result_ok(uint64_t iActualDiff); void sched_reconnect(); diff --git a/jext.h b/xmrstak/misc/jext.hpp index dce73a0..f4a333c 100644 --- a/jext.h +++ b/xmrstak/misc/jext.hpp @@ -1,5 +1,8 @@ #pragma once +#include "xmrstak/rapidjson/document.h" +#include "xmrstak/rapidjson/error/en.h" + using namespace rapidjson; /* This macro brings rapidjson more in line with other libs */ diff --git a/xmrstak/misc/telemetry.cpp b/xmrstak/misc/telemetry.cpp new file mode 100644 index 0000000..c5cce23 --- /dev/null +++ b/xmrstak/misc/telemetry.cpp @@ -0,0 +1,109 @@ +/* + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. 
If not, see <http://www.gnu.org/licenses/>. + * + * Additional permission under GNU GPL version 3 section 7 + * + * If you modify this Program, or any covered work, by linking or combining + * it with OpenSSL (or a modified version of that library), containing parts + * covered by the terms of OpenSSL License and SSLeay License, the licensors + * of this Program grant you additional permission to convey the resulting work. + * + */ + +#include "telemetry.hpp" + +#include <cmath> +#include <cstring> +#include <chrono> + + +namespace xmrstak +{ + +telemetry::telemetry(size_t iThd) +{ + ppHashCounts = new uint64_t*[iThd]; + ppTimestamps = new uint64_t*[iThd]; + iBucketTop = new uint32_t[iThd]; + + for (size_t i = 0; i < iThd; i++) + { + ppHashCounts[i] = new uint64_t[iBucketSize]; + ppTimestamps[i] = new uint64_t[iBucketSize]; + iBucketTop[i] = 0; + memset(ppHashCounts[i], 0, sizeof(uint64_t) * iBucketSize); + memset(ppTimestamps[i], 0, sizeof(uint64_t) * iBucketSize); + } +} + +double telemetry::calc_telemetry_data(size_t iLastMilisec, size_t iThread) +{ + using namespace std::chrono; + uint64_t iTimeNow = time_point_cast<milliseconds>(high_resolution_clock::now()).time_since_epoch().count(); + + uint64_t iEarliestHashCnt = 0; + uint64_t iEarliestStamp = 0; + uint64_t iLastestStamp = 0; + uint64_t iLastestHashCnt = 0; + bool bHaveFullSet = false; + + //Start at 1, buckettop points to next empty + for (size_t i = 1; i < iBucketSize; i++) + { + size_t idx = (iBucketTop[iThread] - i) & iBucketMask; //overflow expected here + + if (ppTimestamps[iThread][idx] == 0) + break; //That means we don't have the data yet + + if (iLastestStamp == 0) + { + iLastestStamp = ppTimestamps[iThread][idx]; + iLastestHashCnt = ppHashCounts[iThread][idx]; + } + + if (iTimeNow - ppTimestamps[iThread][idx] > iLastMilisec) + { + bHaveFullSet = true; + break; //We are out of the requested time period + } + + iEarliestStamp = ppTimestamps[iThread][idx]; + iEarliestHashCnt = 
ppHashCounts[iThread][idx]; + } + + if (!bHaveFullSet || iEarliestStamp == 0 || iLastestStamp == 0) + return nan(""); + + //Don't think that can happen, but just in case + if (iLastestStamp - iEarliestStamp == 0) + return nan(""); + + double fHashes, fTime; + fHashes = iLastestHashCnt - iEarliestHashCnt; + fTime = iLastestStamp - iEarliestStamp; + fTime /= 1000.0; + + return fHashes / fTime; +} + +void telemetry::push_perf_value(size_t iThd, uint64_t iHashCount, uint64_t iTimestamp) +{ + size_t iTop = iBucketTop[iThd]; + ppHashCounts[iThd][iTop] = iHashCount; + ppTimestamps[iThd][iTop] = iTimestamp; + + iBucketTop[iThd] = (iTop + 1) & iBucketMask; +} + +} // namepsace xmrstak diff --git a/xmrstak/misc/telemetry.hpp b/xmrstak/misc/telemetry.hpp new file mode 100644 index 0000000..b35bbbf --- /dev/null +++ b/xmrstak/misc/telemetry.hpp @@ -0,0 +1,24 @@ +#pragma once + +#include <cstdint> +#include <cstring> + +namespace xmrstak +{ + +class telemetry +{ +public: + telemetry(size_t iThd); + void push_perf_value(size_t iThd, uint64_t iHashCount, uint64_t iTimestamp); + double calc_telemetry_data(size_t iLastMilisec, size_t iThread); + +private: + constexpr static size_t iBucketSize = 2 << 11; //Power of 2 to simplify calculations + constexpr static size_t iBucketMask = iBucketSize - 1; + uint32_t* iBucketTop; + uint64_t** ppHashCounts; + uint64_t** ppTimestamps; +}; + +} // namepsace xmrstak diff --git a/thdq.hpp b/xmrstak/misc/thdq.hpp index 248c807..248c807 100644 --- a/thdq.hpp +++ b/xmrstak/misc/thdq.hpp diff --git a/jpsock.cpp b/xmrstak/net/jpsock.cpp index d179f2a..ae20e52 100644 --- a/jpsock.cpp +++ b/xmrstak/net/jpsock.cpp @@ -24,15 +24,15 @@ #include <stdarg.h> #include <assert.h> -#include "jpsock.h" -#include "executor.h" -#include "jconf.h" +#include "jpsock.hpp" +#include "socks.hpp" +#include "socket.hpp" + +#include "xmrstak/misc/executor.hpp" +#include "xmrstak/jconf.hpp" +#include "xmrstak/misc/jext.hpp" +#include "xmrstak/version.hpp" -#include 
"rapidjson/document.h" -#include "jext.h" -#include "socks.h" -#include "socket.h" -#include "version.h" #define AGENTID_STR XMR_STAK_NAME "/" XMR_STAK_VERSION @@ -394,7 +394,10 @@ bool jpsock::process_pool_job(const opq_json_val* params) if(!hex2bin(sTempStr, 8, (unsigned char*)&iTempInt) || iTempInt == 0) return set_socket_error("PARSE error: Invalid target"); + oPoolJob.iTarget = t32_to_t64(iTempInt); + oPoolJob.iTarget32 = iTempInt; + } else if(target_slen <= 16) { diff --git a/jpsock.h b/xmrstak/net/jpsock.hpp index 4baaade..c2194ad 100644 --- a/jpsock.h +++ b/xmrstak/net/jpsock.hpp @@ -1,11 +1,13 @@ #pragma once + +#include "msgstruct.hpp" + #include <mutex> #include <atomic> #include <condition_variable> #include <thread> #include <string> -#include "msgstruct.h" /* Our pool can have two kinds of errors: - Parsing or connection error diff --git a/msgstruct.h b/xmrstak/net/msgstruct.hpp index 6f4a6fb..05d15fe 100644 --- a/msgstruct.h +++ b/xmrstak/net/msgstruct.hpp @@ -1,4 +1,5 @@ #pragma once + #include <string> #include <string.h> #include <assert.h> @@ -11,6 +12,8 @@ struct pool_job char sJobID[64]; uint8_t bWorkBlob[112]; uint64_t iTarget; + // \todo remove workaround needed for amd + uint32_t iTarget32; uint32_t iWorkLen; uint32_t iResumeCnt; @@ -38,6 +41,7 @@ struct job_result } }; + enum ex_event_name { EV_INVALID_VAL, EV_SOCK_READY, EV_SOCK_ERROR, EV_POOL_HAVE_JOB, EV_MINER_HAVE_RESULT, EV_PERF_TICK, EV_RECONNECT, EV_SWITCH_POOL, EV_DEV_POOL_EXIT, EV_USR_HASHRATE, EV_USR_RESULTS, EV_USR_CONNSTAT, diff --git a/socket.cpp b/xmrstak/net/socket.cpp index 52f46b5..b93376e 100644 --- a/socket.cpp +++ b/xmrstak/net/socket.cpp @@ -21,11 +21,11 @@ * */ -#include "socket.h" -#include "jpsock.h" -#include "jconf.h" -#include "console.h" -#include "executor.h" +#include "socket.hpp" +#include "jpsock.hpp" +#include "xmrstak/jconf.hpp" +#include "xmrstak/misc/console.hpp" +#include "xmrstak/misc/executor.hpp" #ifndef CONF_NO_TLS #include <openssl/ssl.h> diff --git 
a/socket.h b/xmrstak/net/socket.hpp index 94bbf03..192a32c 100644 --- a/socket.h +++ b/xmrstak/net/socket.hpp @@ -1,5 +1,7 @@ #pragma once -#include "socks.h" + +#include "socks.hpp" + class jpsock; class base_socket diff --git a/socks.h b/xmrstak/net/socks.hpp index 82bfa2f..1d25d3a 100644 --- a/socks.h +++ b/xmrstak/net/socks.hpp @@ -1,4 +1,5 @@ #pragma once + #ifdef _WIN32 #ifndef _WIN32_WINNT #define _WIN32_WINNT 0x0601 /* Windows 7 */ @@ -7,6 +8,7 @@ #include <ws2tcpip.h> #include <windows.h> + inline void sock_init() { static bool bWSAInit = false; diff --git a/xmrstak/params.hpp b/xmrstak/params.hpp new file mode 100644 index 0000000..dddf82e --- /dev/null +++ b/xmrstak/params.hpp @@ -0,0 +1,50 @@ +#pragma once + +#include "xmrstak/misc/environment.hpp" + +#include <string> + +namespace xmrstak +{ + +struct params +{ + + static inline params& inst() + { + auto& env = environment::inst(); + if(env.pParams == nullptr) + env.pParams = new params; + return *env.pParams; + } + + std::string executablePrefix; + std::string binaryName; + bool useAMD; + bool useNVIDIA; + bool useCPU; + + std::string poolURL; + std::string poolPasswd; + std::string poolUsername; + + std::string configFile; + std::string configFileAMD; + std::string configFileNVIDIA; + std::string configFileCPU; + + params() : + binaryName("xmr-stak"), + executablePrefix("./"), + useAMD(true), + useNVIDIA(true), + useCPU(true), + configFile("config.txt"), + configFileAMD("amd.txt"), + configFileCPU("cpu.txt"), + configFileNVIDIA("nvidia.txt") + {} + +}; + +} // namepsace xmrstak diff --git a/rapidjson/allocators.h b/xmrstak/rapidjson/allocators.h index 98affe0..98affe0 100644 --- a/rapidjson/allocators.h +++ b/xmrstak/rapidjson/allocators.h diff --git a/rapidjson/document.h b/xmrstak/rapidjson/document.h index 895af88..895af88 100644 --- a/rapidjson/document.h +++ b/xmrstak/rapidjson/document.h diff --git a/rapidjson/encodedstream.h b/xmrstak/rapidjson/encodedstream.h index 1450683..1450683 100644 --- 
a/rapidjson/encodedstream.h +++ b/xmrstak/rapidjson/encodedstream.h diff --git a/rapidjson/encodings.h b/xmrstak/rapidjson/encodings.h index baa7c2b..baa7c2b 100644 --- a/rapidjson/encodings.h +++ b/xmrstak/rapidjson/encodings.h diff --git a/rapidjson/error/en.h b/xmrstak/rapidjson/error/en.h index 2db838b..2db838b 100644 --- a/rapidjson/error/en.h +++ b/xmrstak/rapidjson/error/en.h diff --git a/rapidjson/error/error.h b/xmrstak/rapidjson/error/error.h index 95cb31a..95cb31a 100644 --- a/rapidjson/error/error.h +++ b/xmrstak/rapidjson/error/error.h diff --git a/rapidjson/filereadstream.h b/xmrstak/rapidjson/filereadstream.h index b56ea13..b56ea13 100644 --- a/rapidjson/filereadstream.h +++ b/xmrstak/rapidjson/filereadstream.h diff --git a/rapidjson/filewritestream.h b/xmrstak/rapidjson/filewritestream.h index 6378dd6..6378dd6 100644 --- a/rapidjson/filewritestream.h +++ b/xmrstak/rapidjson/filewritestream.h diff --git a/rapidjson/fwd.h b/xmrstak/rapidjson/fwd.h index e8104e8..e8104e8 100644 --- a/rapidjson/fwd.h +++ b/xmrstak/rapidjson/fwd.h diff --git a/rapidjson/internal/biginteger.h b/xmrstak/rapidjson/internal/biginteger.h index 9d3e88c..9d3e88c 100644 --- a/rapidjson/internal/biginteger.h +++ b/xmrstak/rapidjson/internal/biginteger.h diff --git a/rapidjson/internal/diyfp.h b/xmrstak/rapidjson/internal/diyfp.h index c9fefdc..c9fefdc 100644 --- a/rapidjson/internal/diyfp.h +++ b/xmrstak/rapidjson/internal/diyfp.h diff --git a/rapidjson/internal/dtoa.h b/xmrstak/rapidjson/internal/dtoa.h index 8d6350e..8d6350e 100644 --- a/rapidjson/internal/dtoa.h +++ b/xmrstak/rapidjson/internal/dtoa.h diff --git a/rapidjson/internal/ieee754.h b/xmrstak/rapidjson/internal/ieee754.h index 82bb0b9..82bb0b9 100644 --- a/rapidjson/internal/ieee754.h +++ b/xmrstak/rapidjson/internal/ieee754.h diff --git a/rapidjson/internal/itoa.h b/xmrstak/rapidjson/internal/itoa.h index 01a4e7e..01a4e7e 100644 --- a/rapidjson/internal/itoa.h +++ b/xmrstak/rapidjson/internal/itoa.h diff --git 
a/rapidjson/internal/meta.h b/xmrstak/rapidjson/internal/meta.h index 5a9aaa4..5a9aaa4 100644 --- a/rapidjson/internal/meta.h +++ b/xmrstak/rapidjson/internal/meta.h diff --git a/rapidjson/internal/pow10.h b/xmrstak/rapidjson/internal/pow10.h index 02f475d..02f475d 100644 --- a/rapidjson/internal/pow10.h +++ b/xmrstak/rapidjson/internal/pow10.h diff --git a/rapidjson/internal/regex.h b/xmrstak/rapidjson/internal/regex.h index 8530cd7..8530cd7 100644 --- a/rapidjson/internal/regex.h +++ b/xmrstak/rapidjson/internal/regex.h diff --git a/rapidjson/internal/stack.h b/xmrstak/rapidjson/internal/stack.h index 022c9aa..022c9aa 100644 --- a/rapidjson/internal/stack.h +++ b/xmrstak/rapidjson/internal/stack.h diff --git a/rapidjson/internal/strfunc.h b/xmrstak/rapidjson/internal/strfunc.h index de41d8f..de41d8f 100644 --- a/rapidjson/internal/strfunc.h +++ b/xmrstak/rapidjson/internal/strfunc.h diff --git a/rapidjson/internal/strtod.h b/xmrstak/rapidjson/internal/strtod.h index 289c413..289c413 100644 --- a/rapidjson/internal/strtod.h +++ b/xmrstak/rapidjson/internal/strtod.h diff --git a/rapidjson/internal/swap.h b/xmrstak/rapidjson/internal/swap.h index 666e49f..666e49f 100644 --- a/rapidjson/internal/swap.h +++ b/xmrstak/rapidjson/internal/swap.h diff --git a/rapidjson/istreamwrapper.h b/xmrstak/rapidjson/istreamwrapper.h index f5fe289..f5fe289 100644 --- a/rapidjson/istreamwrapper.h +++ b/xmrstak/rapidjson/istreamwrapper.h diff --git a/rapidjson/memorybuffer.h b/xmrstak/rapidjson/memorybuffer.h index 39bee1d..39bee1d 100644 --- a/rapidjson/memorybuffer.h +++ b/xmrstak/rapidjson/memorybuffer.h diff --git a/rapidjson/memorystream.h b/xmrstak/rapidjson/memorystream.h index 1d71d8a..1d71d8a 100644 --- a/rapidjson/memorystream.h +++ b/xmrstak/rapidjson/memorystream.h diff --git a/rapidjson/msinttypes/inttypes.h b/xmrstak/rapidjson/msinttypes/inttypes.h index 1811128..1811128 100644 --- a/rapidjson/msinttypes/inttypes.h +++ b/xmrstak/rapidjson/msinttypes/inttypes.h diff --git 
a/rapidjson/msinttypes/stdint.h b/xmrstak/rapidjson/msinttypes/stdint.h index 3d4477b..3d4477b 100644 --- a/rapidjson/msinttypes/stdint.h +++ b/xmrstak/rapidjson/msinttypes/stdint.h diff --git a/rapidjson/ostreamwrapper.h b/xmrstak/rapidjson/ostreamwrapper.h index 6f4667c..6f4667c 100644 --- a/rapidjson/ostreamwrapper.h +++ b/xmrstak/rapidjson/ostreamwrapper.h diff --git a/rapidjson/pointer.h b/xmrstak/rapidjson/pointer.h index 0206ac1..0206ac1 100644 --- a/rapidjson/pointer.h +++ b/xmrstak/rapidjson/pointer.h diff --git a/rapidjson/prettywriter.h b/xmrstak/rapidjson/prettywriter.h index abd964f..abd964f 100644 --- a/rapidjson/prettywriter.h +++ b/xmrstak/rapidjson/prettywriter.h diff --git a/rapidjson/rapidjson.h b/xmrstak/rapidjson/rapidjson.h index 053b2ce..053b2ce 100644 --- a/rapidjson/rapidjson.h +++ b/xmrstak/rapidjson/rapidjson.h diff --git a/rapidjson/reader.h b/xmrstak/rapidjson/reader.h index 71916c0..71916c0 100644 --- a/rapidjson/reader.h +++ b/xmrstak/rapidjson/reader.h diff --git a/rapidjson/schema.h b/xmrstak/rapidjson/schema.h index e7af3cf..e7af3cf 100644 --- a/rapidjson/schema.h +++ b/xmrstak/rapidjson/schema.h diff --git a/rapidjson/stream.h b/xmrstak/rapidjson/stream.h index fef82c2..fef82c2 100644 --- a/rapidjson/stream.h +++ b/xmrstak/rapidjson/stream.h diff --git a/rapidjson/stringbuffer.h b/xmrstak/rapidjson/stringbuffer.h index 4e38b82..4e38b82 100644 --- a/rapidjson/stringbuffer.h +++ b/xmrstak/rapidjson/stringbuffer.h diff --git a/rapidjson/writer.h b/xmrstak/rapidjson/writer.h index 8f6e174..8f6e174 100644 --- a/rapidjson/writer.h +++ b/xmrstak/rapidjson/writer.h diff --git a/xmrstak/version.hpp b/xmrstak/version.hpp new file mode 100644 index 0000000..44214c8 --- /dev/null +++ b/xmrstak/version.hpp @@ -0,0 +1,4 @@ +#pragma once + +#define XMR_STAK_NAME "xmr-stak" +#define XMR_STAK_VERSION "2.0.0-predev" |