summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--.appveyor.yml4
-rw-r--r--.github/ISSUE_TEMPLATE.md30
-rw-r--r--.github/PULL_REQUEST_TEMPLATE.md1
-rw-r--r--CMakeLists.txt41
-rw-r--r--Dockerfile26
-rw-r--r--README.md2
-rw-r--r--doc/FAQ.md2
-rw-r--r--doc/Linux_deployment.md2
-rw-r--r--doc/compile.md7
-rw-r--r--doc/compile_MacOS.md31
-rw-r--r--doc/compile_Windows.md18
-rw-r--r--doc/tuning.md28
-rw-r--r--doc/usage.md37
-rwxr-xr-xscripts/build_xmr-stak_docker/build_xmr-stak_docker.sh135
-rw-r--r--xmrstak/backend/amd/amd_gpu/gpu.cpp126
-rw-r--r--xmrstak/backend/amd/amd_gpu/gpu.hpp2
-rw-r--r--xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl19
-rw-r--r--xmrstak/backend/amd/amd_gpu/opencl/wolf-aes.cl30
-rw-r--r--xmrstak/backend/amd/autoAdjust.hpp2
-rw-r--r--xmrstak/backend/amd/config.tpl5
-rw-r--r--xmrstak/backend/amd/jconf.cpp9
-rw-r--r--xmrstak/backend/amd/jconf.hpp1
-rw-r--r--xmrstak/backend/amd/minethd.cpp1
-rw-r--r--xmrstak/backend/cpu/config.tpl6
-rw-r--r--xmrstak/backend/cpu/crypto/cryptonight_aesni.h322
-rw-r--r--xmrstak/backend/cpu/jconf.cpp11
-rw-r--r--xmrstak/backend/cpu/jconf.hpp2
-rw-r--r--xmrstak/backend/cpu/minethd.cpp251
-rw-r--r--xmrstak/backend/cpu/minethd.hpp16
-rw-r--r--xmrstak/backend/nvidia/autoAdjust.hpp2
-rw-r--r--xmrstak/backend/nvidia/config.tpl10
-rw-r--r--xmrstak/backend/nvidia/jconf.cpp11
-rw-r--r--xmrstak/backend/nvidia/jconf.hpp1
-rw-r--r--xmrstak/backend/nvidia/minethd.cpp1
-rw-r--r--xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp3
-rw-r--r--xmrstak/backend/nvidia/nvcc_code/cuda_core.cu30
-rw-r--r--xmrstak/backend/nvidia/nvcc_code/cuda_device.hpp37
-rw-r--r--xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu32
-rw-r--r--xmrstak/cli/cli-miner.cpp65
-rw-r--r--xmrstak/jconf.cpp1
-rw-r--r--xmrstak/misc/console.cpp9
-rw-r--r--xmrstak/misc/executor.cpp45
-rw-r--r--xmrstak/net/jpsock.hpp4
43 files changed, 1092 insertions, 326 deletions
diff --git a/.appveyor.yml b/.appveyor.yml
index 7989cb6..b463d0d 100644
--- a/.appveyor.yml
+++ b/.appveyor.yml
@@ -22,11 +22,11 @@ build_script:
- mkdir build
- cd build
- set CMAKE_PREFIX_PATH=C:\xmr-stak-dep\hwloc;C:\xmr-stak-dep\libmicrohttpd;C:\xmr-stak-dep\openssl;
- - cmake -G "Visual Studio 14 2015 Win64" -T v140,host=x64 .. -DWIN_UAC=OFF -DCUDA_ARCH=30
+ - cmake -G "Visual Studio 14 2015 Win64" -T v140,host=x64 .. -DCUDA_ARCH=30
- cmake --build . --config Release --target install
test_script:
- cd c:\xmr-stak\build\bin\Release
- dir
- copy C:\xmr-stak-dep\openssl\bin\* .
- - xmr-stak.exe --help
+# - xmr-stak.exe --help
diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md
new file mode 100644
index 0000000..8451f32
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE.md
@@ -0,0 +1,30 @@
+Please provide as much as possible information to reproduce the issue.
+
+# Basic information
+ - Type of the CPU.
+ - Type of the GPU (if you try to mine with the GPU).
+
+# Compile issues
+ - Which OS do you use?
+ ```
+ add **all** commands you used and the **full** compile output here
+ ```
+ ```
+ run `cmake -LA .` in the build folder and add the output here
+ ```
+
+# Issue with the execution
+ - Did you compile the miner on your own?
+ ```
+ run `./xmr-stak --version-long` and add the output here
+ ```
+
+# AMD OpenCl issue
+
+ ```
+ run `clinfo` and add the output here
+ ```
+
+# Stability issue
+ - Is the CPU or GPU overclocked?
+ - Is the Main memory of the CPU or GPU undervolted?
diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
new file mode 100644
index 0000000..16e55c8
--- /dev/null
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -0,0 +1 @@
+Please make sure your PR is against **dev** branch. Merging PRs directly into master branch would interfere with our workflow. \ No newline at end of file
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 87c0e8a..9540d7a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,6 +1,6 @@
project(xmr-stak)
-cmake_minimum_required(VERSION 3.0.1)
+cmake_minimum_required(VERSION 3.1.0)
# enforce C++11
set(CMAKE_CXX_STANDARD_REQUIRED ON)
@@ -42,23 +42,23 @@ set_property(CACHE XMR-STAK_CURRENCY PROPERTY STRINGS "all;monero;aeon")
set(XMR-STAK_COMPILE "native" CACHE STRING "select CPU compute architecture")
set_property(CACHE XMR-STAK_COMPILE PROPERTY STRINGS "native;generic")
-if("${XMR-STAK_COMPILE}" STREQUAL "native")
+if(XMR-STAK_COMPILE STREQUAL "native")
if(NOT CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
set(CMAKE_CXX_FLAGS "-march=native -mtune=native ${CMAKE_CXX_FLAGS}")
set(CMAKE_C_FLAGS "-march=native -mtune=native ${CMAKE_C_FLAGS}")
endif()
-elseif("${XMR-STAK_COMPILE}" STREQUAL "generic")
+elseif(XMR-STAK_COMPILE STREQUAL "generic")
add_definitions("-DCONF_ENFORCE_OpenCL_1_2=1")
else()
message(FATAL_ERROR "XMR-STAK_COMPILE is set to an unknown value '${XMR-STAK_COMPILE}'")
endif()
-if("${XMR-STAK_CURRENCY}" STREQUAL "all")
+if(XMR-STAK_CURRENCY STREQUAL "all")
message(STATUS "Set miner currency to 'monero' and 'aeon'")
-elseif("${XMR-STAK_CURRENCY}" STREQUAL "aeon")
+elseif(XMR-STAK_CURRENCY STREQUAL "aeon")
message(STATUS "Set miner currency to 'aeon'")
add_definitions("-DCONF_NO_MONERO=1")
-elseif("${XMR-STAK_CURRENCY}" STREQUAL "monero")
+elseif(XMR-STAK_CURRENCY STREQUAL "monero")
message(STATUS "Set miner currency to 'monero'")
add_definitions("-DCONF_NO_AEON=1")
endif()
@@ -134,7 +134,7 @@ if(CUDA_ENABLE)
option(CUDA_SHOW_REGISTER "Show registers used for each kernel and compute architecture" OFF)
option(CUDA_KEEP_FILES "Keep all intermediate files that are generated during internal compilation steps" OFF)
- if("${CUDA_COMPILER}" STREQUAL "clang")
+ if(CUDA_COMPILER STREQUAL "clang")
set(CLANG_BUILD_FLAGS "-O3 -x cuda --cuda-path=${CUDA_TOOLKIT_ROOT_DIR}")
# activation usage of FMA
set(CLANG_BUILD_FLAGS "${CLANG_BUILD_FLAGS} -ffp-contract=fast")
@@ -152,9 +152,9 @@ if(CUDA_ENABLE)
set(CLANG_BUILD_FLAGS "${CLANG_BUILD_FLAGS} --cuda-gpu-arch=sm_${CUDA_ARCH_ELEM}")
endforeach()
- elseif("${CUDA_COMPILER}" STREQUAL "nvcc")
+ elseif(CUDA_COMPILER STREQUAL "nvcc")
# add c++11 for cuda
- if(NOT "${CMAKE_CXX_FLAGS}" MATCHES "-std=c\\+\\+11")
+ if(NOT CMAKE_CXX_FLAGS MATCHES "-std=c\\+\\+11")
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -std=c++11")
endif()
@@ -307,7 +307,7 @@ if(MICROHTTPD_ENABLE)
ENV "MICROHTTPD_ROOT"
PATH_SUFFIXES
lib)
- if("${MHTD}" STREQUAL "MHTD-NOTFOUND")
+ if(MHTD STREQUAL "MHTD-NOTFOUND")
message(FATAL_ERROR "microhttpd NOT found: use `-DMICROHTTPD_ENABLE=OFF` to build without http deamon support")
else()
set(LIBS ${LIBS} ${MHTD})
@@ -361,7 +361,7 @@ if(HWLOC_ENABLE)
PATH_SUFFIXES
lib)
- if("${HWLOC}" STREQUAL "MHTD-NOTFOUND" OR ${HWLOC_INCLUDE_DIR} STREQUAL "HWLOC_INCLUDE_DIR-NOTFOUND")
+ if(HWLOC STREQUAL "HWLOC-NOTFOUND" OR ${HWLOC_INCLUDE_DIR} STREQUAL "HWLOC_INCLUDE_DIR-NOTFOUND")
message(FATAL_ERROR "hwloc NOT found: use `-DHWLOC_ENABLE=OFF` to build without hwloc support")
else()
set(LIBS ${LIBS} ${HWLOC})
@@ -399,10 +399,10 @@ execute_process(
OUTPUT_STRIP_TRAILING_WHITESPACE
)
-if(NOT "${GIT_COMMIT_HASH}" STREQUAL "")
+if(NOT GIT_COMMIT_HASH STREQUAL "")
add_definitions("-DGIT_COMMIT_HASH=${GIT_COMMIT_HASH}")
endif()
-if(NOT "${GIT_BRANCH}" STREQUAL "")
+if(NOT GIT_BRANCH STREQUAL "")
add_definitions("-DGIT_BRANCH=${GIT_BRANCH}")
endif()
@@ -446,7 +446,10 @@ add_library(xmr-stak-c
${SRCFILES_C}
)
set_property(TARGET xmr-stak-c PROPERTY C_STANDARD 99)
-target_link_libraries(xmr-stak-c ${MHTD} ${LIBS})
+if(MICROHTTPD_ENABLE)
+ target_link_libraries(xmr-stak-c ${MHTD})
+endif()
+target_link_libraries(xmr-stak-c ${LIBS})
# compile generic backend files
file(GLOB BACKEND_CPP
@@ -470,7 +473,7 @@ if(CUDA_FOUND)
"xmrstak/backend/nvidia/nvcc_code/*.cu"
"xmrstak/backend/nvidia/*.cpp")
- if("${CUDA_COMPILER}" STREQUAL "clang")
+ if(CUDA_COMPILER STREQUAL "clang")
# build device code with clang
add_library(
xmrstak_cuda_backend
@@ -518,19 +521,13 @@ set(LIBRARY_OUTPUT_PATH "bin")
target_link_libraries(xmr-stak ${LIBS} xmr-stak-c xmr-stak-backend)
-option(WIN_UAC "Add UAC manifest on Windows" ON)
-
-if(WIN_UAC AND CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
- set_property(TARGET xmr-stak PROPERTY LINK_FLAGS "/level='requireAdministrator' /uiAccess='false'")
-endif()
-
################################################################################
# Install
################################################################################
# do not install the binary if the project and install are equal
-if( NOT "${CMAKE_INSTALL_PREFIX}" STREQUAL "${PROJECT_BINARY_DIR}" )
+if( NOT CMAKE_INSTALL_PREFIX STREQUAL PROJECT_BINARY_DIR )
install(TARGETS xmr-stak
RUNTIME DESTINATION "${CMAKE_INSTALL_PREFIX}/bin")
if(CUDA_FOUND)
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..3e996ef
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,26 @@
+# Latest version of ubuntu
+FROM nvidia/cuda:9.0-base
+
+# Default git repository
+ENV GIT_REPOSITORY https://github.com/fireice-uk/xmr-stak.git
+ENV XMRSTAK_CMAKE_FLAGS -DXMR-STAK_COMPILE=generic -DCUDA_ENABLE=ON -DOpenCL_ENABLE=OFF
+
+# Install packages
+RUN apt-get update \
+ && set -x \
+ && apt-get install -qq --no-install-recommends -y ca-certificates cmake cuda-core-9-0 git cuda-cudart-dev-9-0 libhwloc-dev libmicrohttpd-dev libssl-dev \
+ && git clone $GIT_REPOSITORY \
+ && cd /xmr-stak \
+ && cmake ${XMRSTAK_CMAKE_FLAGS} . \
+ && make \
+ && cd - \
+ && mv /xmr-stak/bin/* /usr/local/bin/ \
+ && rm -rf /xmr-stak \
+ && apt-get purge -y -qq cmake cuda-core-9-0 git cuda-cudart-dev-9-0 libhwloc-dev libmicrohttpd-dev libssl-dev \
+ && apt-get clean -qq
+
+VOLUME /mnt
+
+WORKDIR /mnt
+
+ENTRYPOINT ["/usr/local/bin/xmr-stak"]
diff --git a/README.md b/README.md
index 5609d9d..b348928 100644
--- a/README.md
+++ b/README.md
@@ -32,7 +32,7 @@ XMR-Stak is a universal Stratum pool miner. This miner supports CPUs, AMD and NV
## Download
-You can find the latest releases and precompiled binaries on GitHub under [Releases](https://github.com/xmr-stak/xmr-stak/releases).
+You can find the latest releases and precompiled binaries on GitHub under [Releases](https://github.com/fireice-uk/xmr-stak/releases).
If you are running on Linux (especially Linux VMs), checkout [Linux Portable Binary](doc/Linux_deployment.md).
## Default Developer Donation
diff --git a/doc/FAQ.md b/doc/FAQ.md
index ca23038..23507f2 100644
--- a/doc/FAQ.md
+++ b/doc/FAQ.md
@@ -29,7 +29,7 @@ Reference: http://rybkaforum.net/cgi-bin/rybkaforum/topic_show.pl?pid=259791#pid
## VirtualAlloc failed
-If you set up the user rights properly (see above), and your system has 4-8GB of RAM (50%+ use), there is a significant chance that there simply won't be a large enough chunk of contiguous memory because Windows is fairly bad at mitigating memory fragmentation.
+If you set up the user rights properly ([see above](https://github.com/fireice-uk/xmr-stak/blob/master/doc/FAQ.md#selockmemoryprivilege-failed)), and your system has 4-8GB of RAM (50%+ use), there is a significant chance that there simply won't be a large enough chunk of contiguous memory because Windows is fairly bad at mitigating memory fragmentation.
If that happens, disable all auto-staring applications and run the miner after a reboot.
diff --git a/doc/Linux_deployment.md b/doc/Linux_deployment.md
index caf762c..323a97f 100644
--- a/doc/Linux_deployment.md
+++ b/doc/Linux_deployment.md
@@ -1,5 +1,7 @@
# Deploying portable **XMR-Stak** on Linux systems
+**This is an experimental feature**; we reserve the right to remove the binary if we get too many issues.
+
XMR-Stak releases include a pre-built portable version. If you are simply using it to avoid having to compile the application, you can simply download **xmr-stak-portbin-linux.tar.gz** from our [latest releases](https://github.com/fireice-uk/xmr-stak/releases/latest). Open up command line, and use the following commands:
```
diff --git a/doc/compile.md b/doc/compile.md
index cf25135..e97affa 100644
--- a/doc/compile.md
+++ b/doc/compile.md
@@ -9,6 +9,7 @@
* [Compile on Windows](compile_Windows.md)
* [Compile on Linux](compile_Linux.md)
* [Compile on FreeBSD](compile_FreeBSD.md)
+* [Compile on MacOS](compile_MacOS.md)
## Build System
@@ -30,6 +31,7 @@ After the configuration you need to compile the miner, follow the guide for your
* [Compile in Windows](compile_Windows.md)
* [Compile in Linux](compile_Linux.md)
* [Compile in FreeBSD](compile_FreeBSD.md)
+* [Compile in MacOS](compile_MacOS.md)
## Generic Build Options
- `CMAKE_INSTALL_PREFIX` install miner to the home folder
@@ -50,11 +52,6 @@ After the configuration you need to compile the miner, follow the guide for your
- native means the miner binary can be used only on the system where it is compiled but will archive the highest hash rate
- use `cmake .. -DXMR-STAK_COMPILE=generic` to run the miner on all CPU's with sse2
-### only available for Windows
-- `WIN_UAC` will enable or disable the "Run As Administrator" prompt on Windows.
- - UAC confirmation is needed to use large pages on Windows 7.
- - On Windows 10 it is only needed once to set up the account to use them.
-
## CPU Build Options
- `CPU_ENABLE` allow to disable/enable the CPU backend of the miner
diff --git a/doc/compile_MacOS.md b/doc/compile_MacOS.md
new file mode 100644
index 0000000..1b0af91
--- /dev/null
+++ b/doc/compile_MacOS.md
@@ -0,0 +1,31 @@
+# Compile **xmr-stak** for MacOS
+
+## Dependencies
+
+Assuming you already have [Homebrew](https://brew.sh) installed, the installation of dependencies is pretty straightforward and will generate the `xmr-stak` binary in the `bin/` directory.
+
+### For NVIDIA GPUs
+
+```shell
+brew tap caskroom/drivers
+brew cask install nvidia-cuda
+brew install hwloc libmicrohttpd gcc openssl cmake
+cmake . -DOPENSSL_ROOT_DIR=/usr/local/opt/openssl -DOpenCL_ENABLE=OFF
+make install
+```
+
+[All available CMake options](compile.md#nvidia-build-options)
+
+### For AMD GPUs
+
+> 🖐 We need help with AMD GPU compilation instructions. Please submit a PR if you managed to install [AMD APP SDK](http://developer.amd.com/amd-accelerated-parallel-processing-app-sdk/) and to compile `xmr-stak` on MacOS.
+
+### For CPU-only mining
+
+```shell
+brew install hwloc libmicrohttpd gcc openssl cmake
+cmake . -DOPENSSL_ROOT_DIR=/usr/local/opt/openssl -DCUDA_ENABLE=OFF -DOpenCL_ENABLE=OFF
+make install
+```
+
+[All available CMake options](compile.md#cpu-build-options)
diff --git a/doc/compile_Windows.md b/doc/compile_Windows.md
index 532348c..c9a8ff7 100644
--- a/doc/compile_Windows.md
+++ b/doc/compile_Windows.md
@@ -12,7 +12,7 @@
- download VS2017 Community and install from [https://www.visualstudio.com/downloads/](https://www.visualstudio.com/downloads/)
- during the install chose the components
- `Desktop development with C++` (left side)
- - `Toolset for Visual Studio C++ 2015.3 v140...` (right side)
+ - `VC++ 2015.3 v140 toolset for desktop` (right side)
### CMake for Win64
@@ -34,8 +34,12 @@
- download and install the latest version from [http://developer.amd.com/amd-accelerated-parallel-processing-app-sdk/](http://developer.amd.com/amd-accelerated-parallel-processing-app-sdk/)
### Dependencies OpenSSL/Hwloc and Microhttpd
-- with CUDA 8: download the version 1 of the precompiled binary from [https://github.com/fireice-uk/xmr-stak-dep/releases/download/v1/xmr-stak-dep.zip](https://github.com/fireice-uk/xmr-stak-dep/releases/download/v1/xmr-stak-dep.zip)
-- with CUDA 9: download the version 2 of the precompiled binary from [https://github.com/fireice-uk/xmr-stak-dep/releases/download/v2/xmr-stak-dep.zip](https://github.com/fireice-uk/xmr-stak-dep/releases/download/v2/xmr-stak-dep.zip)
+- for CUDA 8*:
+ - download the version 1 of the precompiled binary from [https://github.com/fireice-uk/xmr-stak-dep/releases/download/v1/xmr-stak-dep.zip](https://github.com/fireice-uk/xmr-stak-dep/releases/download/v1/xmr-stak-dep.zip)
+ - version 1 of the pre-compiled dependencies is not compatible with Visual Studio Toolset v141
+- for CUDA 9 **and/or** AMD GPUs, CPU:
+ - download the version 2 of the precompiled binary from [https://github.com/fireice-uk/xmr-stak-dep/releases/download/v2/xmr-stak-dep.zip](https://github.com/fireice-uk/xmr-stak-dep/releases/download/v2/xmr-stak-dep.zip)
+ - version 2 of the pre-compiled dependencies is not compatible with Visual Studio Toolset v140
- unzip all to `C:\xmr-stak-dep`
### Validate the Dependency Folder
@@ -81,15 +85,19 @@
mkdir build
cd build
```
- - with CUDA 8
+ - for CUDA 8*
```
cmake -G "Visual Studio 15 2017 Win64" -T v140,host=x64 ..
```
- - with CUDA 9
+ - for CUDA 9 **and/or** AMD GPUs, CPU
```
cmake -G "Visual Studio 15 2017 Win64" -T v141,host=x64 ..
```
```
cmake --build . --config Release --target install
cd bin\Release
+ copy C:\xmr-stak-dep\openssl\bin\* .
```
+
+\* Miner is also compiled for AMD GPUs (if the AMD APP SDK is installed) and CPUs.
+CUDA 8 requires a downgrade to the old v140 tool chain.
diff --git a/doc/tuning.md b/doc/tuning.md
index 474553b..53e682b 100644
--- a/doc/tuning.md
+++ b/doc/tuning.md
@@ -1,6 +1,7 @@
# Tuning Guide
## Content Overview
+* [Windows](#windows)
* [NVIDIA Backend](#nvidia-backend)
* [Choose Value for `threads` and `blocks`](#choose-value-for-threads-and-blocks)
* [Add more GPUs](#add-more-gpus)
@@ -8,6 +9,14 @@
* [Choose `intensity` and `worksize`](#choose-intensity-and-worksize)
* [Add more GPUs](#add-more-gpus)
* [Increase Memory Pool](#increase-memory-pool)
+ * [Scratchpad Indexing](#scratchpad-indexing)
+* [CPU Backend](#cpu-backend)
+ * [Choose Value for `low_power_mode`](#choose-value-for-low_power_mode)
+
+## Windows
+"Run As Administrator" prompt (UAC) confirmation is needed to use large pages on Windows 7.
+On Windows 10 it is only needed once to set up the account to use them.
+Disable the dialog with the command line option `--noUAC`
## NVIDIA Backend
@@ -80,4 +89,21 @@ export GPU_MAX_ALLOC_PERCENT=99
export GPU_SINGLE_ALLOC_PERCENT=99
```
-*Note:* Windows user must use `set` instead of `export` to define an environment variable. \ No newline at end of file
+*Note:* Windows user must use `set` instead of `export` to define an environment variable.
+
+### Scratchpad Indexing
+
+The layout of the hash scratchpad memory can be changed for each GPU with the option `strided_index` in `amd.txt`.
+Try to change the value from the default `true` to `false`.
+
+## CPU Backend
+
+By default the CPU backend can be tuned in the config file `cpu.txt`
+
+### Choose Value for `low_power_mode`
+
+The optimal value for `low_power_mode` depends on the cache size of your CPU, and the number of threads.
+
+The `low_power_mode` can be set to a number between `1` to `5`. When set to a value `N` greater than `1`, this mode increases the single thread performance by `N` times, but also requires at least `2*N` MB of cache per thread. It can also be set to `false` or `true`. The value `false` is equivalent to `1`, and `true` is equivalent to `2`.
+
+This setting is particularly useful for CPUs with very large cache. For example the Intel Crystal Well Processors are equipped with 128MB L4 cache, enough to run 8 threads at an optimal `low_power_mode` value of `5`.
diff --git a/doc/usage.md b/doc/usage.md
index ca379ab..60cf69b 100644
--- a/doc/usage.md
+++ b/doc/usage.md
@@ -20,33 +20,34 @@ The number of files depends on the available backends.
1) Double click the `xmr-stak.exe` file
2) Fill in the pool url, username and password
-## Usage on Linux
+`set XMRSTAK_NOWAIT=1` disables the dialog `Press any key to exit.` for non-UAC execution.
+
+
+## Usage on Linux & MacOS
1) Open a terminal within the folder with the binary
2) Start the miner with `./xmr-stak`
## Command Line Options
The miner allow to overwrite some of the settings via command line options.
+Run `xmr-stak --help` to show all available command line options.
+
+## Docker image usage
+
+You can run the Docker image the following way:
+
+```
+docker run --rm -it -u $(id -u):$(id -g) --name fireice-uk/xmr-stak -v "$PWD":/mnt xmr-stak
+docker stop xmr-stak
+docker run --rm -it -u $(id -u):$(id -g) --name fireice-uk/xmr-stak -v "$PWD":/mnt xmr-stak --config config.txt
+```
+
+Debug the docker image by getting inside:
```
-Usage: xmr-stak [OPTION]...
-
- -h, --help show this help
- -v, --version show version number
- -V, --version-long show long version number
- -c, --config FILE common miner configuration file
- --currency NAME currency to mine: monero or aeon
- --noCPU disable the CPU miner backend
- --cpu FILE CPU backend miner config file
- --noAMD disable the AMD miner backend
- --amd FILE AMD backend miner config file
-
-The Following options temporary overwrites the config file settings:
- -o, --url URL pool url and port, e.g. pool.usxmrpool.com:3333
- -u, --user USERNAME pool user name or wallet address
- -p, --pass PASSWD pool password, in the most cases x or empty ""
+docker run --entrypoint=/bin/bash --rm -it -u $(id -u):$(id -g) --name fireice-uk/xmr-stak -v "$PWD":/mnt xmr-stak
```
## HTML and JSON API report configuraton
-To configure the reports shown on the [README](README.md) side you need to edit the httpd_port variable. Then enable wifi on your phone and navigate to [miner ip address]:[httpd_port] in your phone browser. If you want to use the data in scripts, you can get the JSON version of the data at url [miner ip address]:[httpd_port]/api.json
+To configure the reports shown on the [README](../README.md) side you need to edit the httpd_port variable. Then enable wifi on your phone and navigate to [miner ip address]:[httpd_port] in your phone browser. If you want to use the data in scripts, you can get the JSON version of the data at url [miner ip address]:[httpd_port]/api.json
diff --git a/scripts/build_xmr-stak_docker/build_xmr-stak_docker.sh b/scripts/build_xmr-stak_docker/build_xmr-stak_docker.sh
index e046cb0..bfee1b8 100755
--- a/scripts/build_xmr-stak_docker/build_xmr-stak_docker.sh
+++ b/scripts/build_xmr-stak_docker/build_xmr-stak_docker.sh
@@ -5,110 +5,131 @@ if [[ $EUID -ne 0 ]]; then
exit 1
fi
-if [ -d xmr-stak-cpu ]; then
- git -C xmr-stak-cpu clean -fd
+if [ -d xmr-stak ]; then
+ git -C xmr-stak clean -fd
else
- git clone https://github.com/fireice-uk/xmr-stak-cpu.git
+ git clone https://github.com/fireice-uk/xmr-stak.git
fi
+wget -c https://developer.nvidia.com/compute/cuda/9.0/Prod/local_installers/cuda_9.0.176_384.81_linux-run
+chmod a+x cuda_*_linux-run
+
########################
-# Fedora 26
+# Fedora 27
########################
-docker run --rm -it -v $PWD/xmr-stak-cpu:/xmr-stak-cpu fedora:26 /bin/bash -c "
-set -ex ;
-dnf install -y -q gcc gcc-c++ hwloc-devel libmicrohttpd-devel libstdc++-static make openssl-devel cmake ;
-cd /xmr-stak-cpu ;
-cmake -DCMAKE_LINK_STATIC=ON . ;
-make install ;
+# CUDA is not going to work on Fedora 27 because it only supports these distributions: http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html
+docker run --rm -it -v $PWD:/mnt fedora:27 /bin/bash -c "
+set -x ;
+dnf install -y -q cmake gcc-c++ hwloc-devel libmicrohttpd-devel libstdc++-static make openssl-devel;
+cd /mnt/xmr-stak ;
+cmake -DCUDA_ENABLE=OFF -DOpenCL_ENABLE=OFF . ;
+make ;
"
-mv xmr-stak-cpu/bin/xmr-stak-cpu xmr-stak-cpu_fedora_26
-git -C xmr-stak-cpu clean -fd
+
+test -d fedora_27 || mkdir fedora_27
+mv xmr-stak/bin/* fedora_27
+git -C xmr-stak clean -fd
########################
# Ubuntu (17.04)
########################
-docker run --rm -it -v $PWD/xmr-stak-cpu:/xmr-stak-cpu ubuntu:17.04 /bin/bash -c "
-set -ex ;
+docker run --rm -it -v $PWD:/mnt ubuntu:17.04 /bin/bash -c "
+set -x ;
apt update -qq ;
apt install -y -qq libmicrohttpd-dev libssl-dev cmake build-essential libhwloc-dev ;
-cd /xmr-stak-cpu ;
-cmake -DCMAKE_LINK_STATIC=ON . ;
-make install ;
+cd /mnt/xmr-stak ;
+/mnt/cuda_*_linux-run --silent --toolkit ;
+cmake -DCUDA_ENABLE=ON -DOpenCL_ENABLE=OFF . ;
+make ;
"
-mv xmr-stak-cpu/bin/xmr-stak-cpu xmr-stak-cpu_ubuntu_17.04
-git -C xmr-stak-cpu clean -fd
+
+test -d ubuntu_17.10 || mkdir ubuntu_17.10
+mv xmr-stak/bin/* ubuntu_17.10
+git -C xmr-stak clean -fd
########################
# Ubuntu 16.04
########################
-docker run --rm -it -v $PWD/xmr-stak-cpu:/xmr-stak-cpu ubuntu:16.04 /bin/bash -c "
-set -ex ;
+docker run --rm -it -v $PWD:/mnt ubuntu:16.04 /bin/bash -c "
+set -x ;
apt update -qq ;
-apt install -y -qq libmicrohttpd-dev libssl-dev cmake build-essential libhwloc-dev ;
-cd /xmr-stak-cpu ;
-cmake -DCMAKE_LINK_STATIC=ON . ;
-make install ;
+apt install -y -qq cmake g++ libmicrohttpd-dev libssl-dev libhwloc-dev ;
+cd /mnt/xmr-stak ;
+/mnt/cuda_*_linux-run --silent --toolkit ;
+cmake -DCUDA_ENABLE=ON -DOpenCL_ENABLE=OFF . ;
+make ;
"
-mv xmr-stak-cpu/bin/xmr-stak-cpu xmr-stak-cpu_ubuntu_16.04
-git -C xmr-stak-cpu clean -fd
+
+test -d ubuntu_16.04 || mkdir ubuntu_16.04
+mv xmr-stak/bin/* ubuntu_16.04
+git -C xmr-stak clean -fd
########################
# Ubuntu 14.04
########################
-docker run --rm -it -v $PWD/xmr-stak-cpu:/xmr-stak-cpu ubuntu:14.04 /bin/bash -c "
-set -ex ;
+docker run --rm -it -v $PWD:/mnt ubuntu:14.04 /bin/bash -c "
+set -x ;
apt update -qq ;
apt install -y -qq curl libmicrohttpd-dev libssl-dev libhwloc-dev software-properties-common ;
add-apt-repository -y ppa:ubuntu-toolchain-r/test ;
apt update -qq ;
-apt install -y -qq gcc-7 g++-7 make ;
-update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-7 1 --slave /usr/bin/g++ g++ /usr/bin/g++-7 ;
-curl -L https://cmake.org/files/v3.9/cmake-3.9.0.tar.gz | tar -xzf - -C /tmp/ ;
-( cd /tmp/cmake-3.9.0/ && ./configure && make && sudo make install && cd - ) > /dev/null
+apt install -y -qq gcc-6 g++-6 make ;
+update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-6 1 --slave /usr/bin/g++ g++ /usr/bin/g++-6 ;
+curl -L https://cmake.org/files/LatestRelease/cmake-3.10.0.tar.gz | tar -xzf - -C /tmp/ ;
+( cd /tmp/cmake-*/ && ./configure && make && sudo make install && cd - ) > /dev/null
update-alternatives --install /usr/bin/cmake cmake /usr/local/bin/cmake 1 --force ;
-cd /xmr-stak-cpu ;
-cmake -DCMAKE_LINK_STATIC=ON . ;
-make install ;
+cd /mnt/xmr-stak ;
+/mnt/cuda_*_linux-run --silent --toolkit ;
+cmake -DCUDA_ENABLE=ON -DOpenCL_ENABLE=OFF . ;
+make ;
"
-mv xmr-stak-cpu/bin/xmr-stak-cpu xmr-stak-cpu_ubuntu_14.04
-git -C xmr-stak-cpu clean -fd
+
+test -d ubuntu_14.04 || mkdir ubuntu_14.04
+mv xmr-stak/bin/* ubuntu_14.04
+git -C xmr-stak clean -fd
########################
# CentOS 7
########################
-docker run --rm -it -v $PWD/xmr-stak-cpu:/xmr-stak-cpu centos:7 /bin/bash -c "
-set -ex ;
+# CUDA is not going to work on CentOS/RHEL because it only supports gcc-4 in these distributions: http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html
+docker run --rm -it -v $PWD:/mnt centos:7 /bin/bash -c "
+set -x ;
yum install -y -q centos-release-scl epel-release ;
-yum install -y -q cmake3 devtoolset-4-gcc* hwloc-devel libmicrohttpd-devel openssl-devel make ;
-scl enable devtoolset-4 - << EOF
-cd /xmr-stak-cpu ;
-cmake3 -DCMAKE_LINK_STATIC=ON . ;
-make install ;
+yum install -y -q cmake3 devtoolset-7-gcc* hwloc-devel libmicrohttpd-devel make openssl-devel perl ;
+scl enable devtoolset-7 - << EOF
+cd /mnt/xmr-stak ;
+cmake3 -DCUDA_ENABLE=OFF -DOpenCL_ENABLE=OFF . ;
+make ;
EOF
"
-mv xmr-stak-cpu/bin/xmr-stak-cpu xmr-stak-cpu_centos_7
-git -C xmr-stak-cpu clean -fd
+
+test -d centos_7 || mkdir centos_7
+mv xmr-stak/bin/* centos_7
+git -C xmr-stak clean -fd
########################
# CentOS 6.x
########################
-docker run --rm -it -v $PWD/xmr-stak-cpu:/xmr-stak-cpu centos:6 /bin/bash -c "
-set -ex ;
+# CUDA is not going to work on CentOS/RHEL because it only supports gcc-4 in these distributions: http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html
+docker run --rm -it -v $PWD:/mnt centos:6 /bin/bash -c "
+set -x ;
yum install -y -q centos-release-scl epel-release ;
-yum install -y -q cmake3 devtoolset-4-gcc* hwloc-devel libmicrohttpd-devel openssl-devel make ;
-scl enable devtoolset-4 - << EOF
-cd /xmr-stak-cpu ;
-cmake3 -DCMAKE_LINK_STATIC=ON . ;
-make install ;
+yum install -y -q cmake3 devtoolset-7-gcc* hwloc-devel libmicrohttpd-devel openssl-devel make ;
+scl enable devtoolset-7 - << EOF
+cd /mnt/xmr-stak ;
+cmake3 -DCUDA_ENABLE=OFF -DOpenCL_ENABLE=OFF . ;
+make ;
EOF
"
-mv xmr-stak-cpu/bin/xmr-stak-cpu xmr-stak-cpu_centos_6
-git -C xmr-stak-cpu clean -fd
-rm -rf xmr-stak-cpu
+test -d centos_6 || mkdir centos_6
+mv xmr-stak/bin/* centos_6
+git -C xmr-stak clean -fd
+
+rm -rf xmr-stak
diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp
index 15b8457..d9bc962 100644
--- a/xmrstak/backend/amd/amd_gpu/gpu.cpp
+++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp
@@ -332,7 +332,8 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
char options[256];
snprintf(options, sizeof(options),
- "-DITERATIONS=%d -DMASK=%d -DWORKSIZE=%llu", hasIterations, threadMemMask, int_port(ctx->workSize));
+ "-DITERATIONS=%d -DMASK=%d -DWORKSIZE=%llu -DSTRIDED_INDEX=%d",
+ hasIterations, threadMemMask, int_port(ctx->workSize), ctx->stridedIndex ? 1 : 0);
ret = clBuildProgram(ctx->Program, 1, &ctx->DeviceID, options, NULL, NULL);
if(ret != CL_SUCCESS)
{
@@ -448,68 +449,85 @@ uint32_t getNumPlatforms()
std::vector<GpuContext> getAMDDevices(int index)
{
std::vector<GpuContext> ctxVec;
- cl_platform_id * platforms = NULL;
+ std::vector<cl_platform_id> platforms;
+ std::vector<cl_device_id> device_list;
+
cl_int clStatus;
cl_uint num_devices;
- cl_device_id *device_list = NULL;
-
uint32_t numPlatforms = getNumPlatforms();
- if(numPlatforms)
+ if(numPlatforms == 0)
+ return ctxVec;
+
+ platforms.resize(numPlatforms);
+ if((clStatus = clGetPlatformIDs(numPlatforms, platforms.data(), NULL)) != CL_SUCCESS)
+ {
+ printer::inst()->print_msg(L1,"WARNING: %s when calling clGetPlatformIDs for platform information.", err_to_str(clStatus));
+ return ctxVec;
+ }
+
+ if((clStatus = clGetDeviceIDs( platforms[index], CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices)) != CL_SUCCESS)
+ {
+ printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceIDs for of devices.", err_to_str(clStatus));
+ return ctxVec;
+ }
+
+ device_list.resize(num_devices);
+ if((clStatus = clGetDeviceIDs( platforms[index], CL_DEVICE_TYPE_GPU, num_devices, device_list.data(), NULL)) != CL_SUCCESS)
+ {
+ printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceIDs for device information.", err_to_str(clStatus));
+ return ctxVec;
+ }
+
+ for (size_t k = 0; k < num_devices; k++)
{
- platforms = (cl_platform_id *) malloc(sizeof(cl_platform_id) * numPlatforms);
- clStatus = clGetPlatformIDs(numPlatforms, platforms, NULL);
- if(clStatus == CL_SUCCESS)
+ std::vector<char> devVendorVec(1024);
+ if((clStatus = clGetDeviceInfo(device_list[k], CL_DEVICE_VENDOR, devVendorVec.size(), devVendorVec.data(), NULL)) != CL_SUCCESS)
{
- clStatus = clGetDeviceIDs( platforms[index], CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices);
- if(clStatus == CL_SUCCESS)
+ printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get the device vendor name for device %u.", err_to_str(clStatus), k);
+ continue;
+ }
+
+ std::string devVendor(devVendorVec.data());
+ if( devVendor.find("Advanced Micro Devices") != std::string::npos || devVendor.find("AMD") != std::string::npos)
+ {
+ GpuContext ctx;
+ std::vector<char> devNameVec(1024);
+ size_t maxMem;
+
+ if((clStatus = clGetDeviceInfo(device_list[k], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(int), &(ctx.computeUnits), NULL)) != CL_SUCCESS)
+ {
+ printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get CL_DEVICE_MAX_COMPUTE_UNITS for device %u.", err_to_str(clStatus), k);
+ continue;
+ }
+
+ if((clStatus = clGetDeviceInfo(device_list[k], CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &(maxMem), NULL)) != CL_SUCCESS)
{
- device_list = (cl_device_id *) malloc(sizeof(cl_device_id)*num_devices);
- clStatus = clGetDeviceIDs( platforms[index], CL_DEVICE_TYPE_GPU, num_devices, device_list, NULL);
- if(clStatus == CL_SUCCESS)
- {
- for (int k = 0; k < num_devices; k++)
- {
- cl_int clError;
- std::vector<char> devVendorVec(1024);
- clError = clGetDeviceInfo(device_list[k], CL_DEVICE_VENDOR, devVendorVec.size(), devVendorVec.data(), NULL);
- if(clStatus == CL_SUCCESS)
- {
- std::string devVendor(devVendorVec.data());
- if( devVendor.find("Advanced Micro Devices") != std::string::npos)
- {
- GpuContext ctx;
- ctx.deviceIdx = k;
- clError = clGetDeviceInfo(device_list[k], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(int), &(ctx.computeUnits), NULL);
- size_t maxMem;
- clError = clGetDeviceInfo(device_list[k], CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &(maxMem), NULL);
- clError = clGetDeviceInfo(device_list[k], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(size_t), &(ctx.freeMem), NULL);
- // if environment variable GPU_SINGLE_ALLOC_PERCENT is not set we can not allocate the full memory
- ctx.freeMem = std::min(ctx.freeMem, maxMem);
- std::vector<char> devNameVec(1024);
- clError = clGetDeviceInfo(device_list[k], CL_DEVICE_NAME, devNameVec.size(), devNameVec.data(), NULL);
- ctx.name = std::string(devNameVec.data());
- printer::inst()->print_msg(L0,"Found OpenCL GPU %s.",ctx.name.c_str());
- ctx.DeviceID = device_list[k];
- ctxVec.push_back(ctx);
- }
- }
- else
- printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get the device vendor name.", err_to_str(clStatus));
- }
- }
- else
- printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceIDs for device information.", err_to_str(clStatus));
- free(device_list);
+ printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get CL_DEVICE_MAX_MEM_ALLOC_SIZE for device %u.", err_to_str(clStatus), k);
+ continue;
}
- else
- printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceIDs for of devices.", err_to_str(clStatus));
+
+ if((clStatus = clGetDeviceInfo(device_list[k], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(size_t), &(ctx.freeMem), NULL)) != CL_SUCCESS)
+ {
+ printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get CL_DEVICE_GLOBAL_MEM_SIZE for device %u.", err_to_str(clStatus), k);
+ continue;
+ }
+
+ if((clStatus = clGetDeviceInfo(device_list[k], CL_DEVICE_NAME, devNameVec.size(), devNameVec.data(), NULL)) != CL_SUCCESS)
+ {
+ printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get CL_DEVICE_NAME for device %u.", err_to_str(clStatus), k);
+ continue;
+ }
+ printer::inst()->print_msg(L0,"Found OpenCL GPU %s.",ctx.name.c_str());
+
+ // if environment variable GPU_SINGLE_ALLOC_PERCENT is not set we can not allocate the full memory
+ ctx.deviceIdx = k;
+ ctx.freeMem = std::min(ctx.freeMem, maxMem);
+ ctx.name = std::string(devNameVec.data());
+ ctx.DeviceID = device_list[k];
+ ctxVec.push_back(ctx);
}
- else
- printer::inst()->print_msg(L1,"WARNING: %s when calling clGetPlatformIDs for platform information.", err_to_str(clStatus));
}
-
- free(platforms);
return ctxVec;
}
@@ -541,7 +559,7 @@ int getAMDPlatformIdx()
clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, infoSize, platformNameVec.data(), NULL);
std::string platformName(platformNameVec.data());
- if( platformName.find("Advanced Micro Devices") != std::string::npos)
+ if( platformName.find("Advanced Micro Devices") != std::string::npos || platformName.find("Apple") != std::string::npos)
{
platformIndex = i;
printer::inst()->print_msg(L0,"Found AMD platform index id = %i, name = %s",i , platformName.c_str());
diff --git a/xmrstak/backend/amd/amd_gpu/gpu.hpp b/xmrstak/backend/amd/amd_gpu/gpu.hpp
index 123de01..c17bac1 100644
--- a/xmrstak/backend/amd/amd_gpu/gpu.hpp
+++ b/xmrstak/backend/amd/amd_gpu/gpu.hpp
@@ -9,6 +9,7 @@
#endif
#include <stdint.h>
+#include <string>
#include <vector>
#define ERR_SUCCESS (0)
@@ -23,6 +24,7 @@ struct GpuContext
size_t deviceIdx;
size_t rawIntensity;
size_t workSize;
+ int stridedIndex;
/*Output vars*/
cl_device_id DeviceID;
diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
index 966199b..255fcbb 100644
--- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
+++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
@@ -411,7 +411,11 @@ void AESExpandKey256(uint *keybuf)
}
}
-#define IDX(x) (x)
+#if(STRIDED_INDEX==0)
+# define IDX(x) (x)
+#else
+# define IDX(x) ((x) * (Threads))
+#endif
__attribute__((reqd_work_group_size(WORKSIZE, 8, 1)))
__kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ulong *states, ulong Threads)
@@ -440,7 +444,12 @@ __kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ul
if(gIdx < Threads)
{
states += 25 * gIdx;
+
+#if(STRIDED_INDEX==0)
Scratchpad += gIdx * (ITERATIONS >> 2);
+#else
+ Scratchpad += gIdx;
+#endif
((ulong8 *)State)[0] = vload8(0, input);
State[8] = input[8];
@@ -519,7 +528,11 @@ __kernel void cn1(__global uint4 *Scratchpad, __global ulong *states, ulong Thre
if(gIdx < Threads)
{
states += 25 * gIdx;
+#if(STRIDED_INDEX==0)
Scratchpad += gIdx * (ITERATIONS >> 2);
+#else
+ Scratchpad += gIdx;
+#endif
a[0] = states[0] ^ states[4];
b[0] = states[2] ^ states[6];
@@ -588,7 +601,11 @@ __kernel void cn2(__global uint4 *Scratchpad, __global ulong *states, __global u
if(gIdx < Threads)
{
states += 25 * gIdx;
+#if(STRIDED_INDEX==0)
Scratchpad += gIdx * (ITERATIONS >> 2);
+#else
+ Scratchpad += gIdx;
+#endif
#if defined(__Tahiti__) || defined(__Pitcairn__)
diff --git a/xmrstak/backend/amd/amd_gpu/opencl/wolf-aes.cl b/xmrstak/backend/amd/amd_gpu/opencl/wolf-aes.cl
index 996944b..81e1644 100644
--- a/xmrstak/backend/amd/amd_gpu/opencl/wolf-aes.cl
+++ b/xmrstak/backend/amd/amd_gpu/opencl/wolf-aes.cl
@@ -74,15 +74,29 @@ static const __constant uint AES0_C[256] =
#define BYTE(x, y) (amd_bfe((x), (y) << 3U, 8U))
-uint4 AES_Round(const __local uint *AES0, const __local uint *AES1, const __local uint *AES2, const __local uint *AES3, const uint4 X, const uint4 key)
+uint4 AES_Round(const __local uint *AES0, const __local uint *AES1, const __local uint *AES2, const __local uint *AES3, const uint4 X, uint4 key)
{
- uint4 Y;
- Y.s0 = AES0[BYTE(X.s0, 0)] ^ AES1[BYTE(X.s1, 1)] ^ AES2[BYTE(X.s2, 2)] ^ AES3[BYTE(X.s3, 3)];
- Y.s1 = AES0[BYTE(X.s1, 0)] ^ AES1[BYTE(X.s2, 1)] ^ AES2[BYTE(X.s3, 2)] ^ AES3[BYTE(X.s0, 3)];
- Y.s2 = AES0[BYTE(X.s2, 0)] ^ AES1[BYTE(X.s3, 1)] ^ AES2[BYTE(X.s0, 2)] ^ AES3[BYTE(X.s1, 3)];
- Y.s3 = AES0[BYTE(X.s3, 0)] ^ AES1[BYTE(X.s0, 1)] ^ AES2[BYTE(X.s1, 2)] ^ AES3[BYTE(X.s2, 3)];
- Y ^= key;
- return(Y);
+ key.s0 ^= AES0[BYTE(X.s0, 0)];
+ key.s1 ^= AES0[BYTE(X.s1, 0)];
+ key.s2 ^= AES0[BYTE(X.s2, 0)];
+ key.s3 ^= AES0[BYTE(X.s3, 0)];
+
+ key.s0 ^= AES2[BYTE(X.s2, 2)];
+ key.s1 ^= AES2[BYTE(X.s3, 2)];
+ key.s2 ^= AES2[BYTE(X.s0, 2)];
+ key.s3 ^= AES2[BYTE(X.s1, 2)];
+
+ key.s0 ^= AES1[BYTE(X.s1, 1)];
+ key.s1 ^= AES1[BYTE(X.s2, 1)];
+ key.s2 ^= AES1[BYTE(X.s3, 1)];
+ key.s3 ^= AES1[BYTE(X.s0, 1)];
+
+ key.s0 ^= AES3[BYTE(X.s3, 3)];
+ key.s1 ^= AES3[BYTE(X.s0, 3)];
+ key.s2 ^= AES3[BYTE(X.s1, 3)];
+ key.s3 ^= AES3[BYTE(X.s2, 3)];
+
+ return key;
}
#endif
diff --git a/xmrstak/backend/amd/autoAdjust.hpp b/xmrstak/backend/amd/autoAdjust.hpp
index 0b91212..0bc5239 100644
--- a/xmrstak/backend/amd/autoAdjust.hpp
+++ b/xmrstak/backend/amd/autoAdjust.hpp
@@ -123,7 +123,7 @@ private:
// set 8 threads per block (this is a good value for the most gpus)
conf += std::string(" { \"index\" : ") + std::to_string(ctx.deviceIdx) + ",\n" +
" \"intensity\" : " + std::to_string(intensity) + ", \"worksize\" : " + std::to_string(8) + ",\n" +
- " \"affine_to_cpu\" : false, \n"
+ " \"affine_to_cpu\" : false, \"strided_index\" : true\n"
" },\n";
++i;
}
diff --git a/xmrstak/backend/amd/config.tpl b/xmrstak/backend/amd/config.tpl
index a93859c..af662f8 100644
--- a/xmrstak/backend/amd/config.tpl
+++ b/xmrstak/backend/amd/config.tpl
@@ -5,9 +5,12 @@ R"===(
* intensity - Number of parallel GPU threads (nothing to do with CPU threads)
* worksize - Number of local GPU threads (nothing to do with CPU threads)
* affine_to_cpu - This will affine the thread to a CPU. This can make a GPU miner play along nicer with a CPU miner.
+ * strided_index - switch the memory pattern used for the scratchpad memory
+ *                 true  = use 16 byte contiguous memory per thread; the next memory block has an offset of intensity blocks
+ *                 false = use a contiguous block of memory per thread
* "gpu_threads_conf" :
* [
- * { "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false },
+ * { "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false, "strided_index" : true },
* ],
*/
diff --git a/xmrstak/backend/amd/jconf.cpp b/xmrstak/backend/amd/jconf.cpp
index 0617aeb..07afb19 100644
--- a/xmrstak/backend/amd/jconf.cpp
+++ b/xmrstak/backend/amd/jconf.cpp
@@ -103,13 +103,14 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg)
if(!oThdConf.IsObject())
return false;
- const Value *idx, *intensity, *w_size, *aff;
+ const Value *idx, *intensity, *w_size, *aff, *stridedIndex;
idx = GetObjectMember(oThdConf, "index");
intensity = GetObjectMember(oThdConf, "intensity");
w_size = GetObjectMember(oThdConf, "worksize");
aff = GetObjectMember(oThdConf, "affine_to_cpu");
+ stridedIndex = GetObjectMember(oThdConf, "strided_index");
- if(idx == nullptr || intensity == nullptr || w_size == nullptr || aff == nullptr)
+ if(idx == nullptr || intensity == nullptr || w_size == nullptr || aff == nullptr || stridedIndex == nullptr)
return false;
if(!idx->IsUint64() || !intensity->IsUint64() || !w_size->IsUint64())
@@ -118,9 +119,13 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg)
if(!aff->IsUint64() && !aff->IsBool())
return false;
+ if(!stridedIndex->IsBool())
+ return false;
+
cfg.index = idx->GetUint64();
cfg.intensity = intensity->GetUint64();
cfg.w_size = w_size->GetUint64();
+ cfg.stridedIndex = stridedIndex->GetBool();
if(aff->IsNumber())
cfg.cpu_aff = aff->GetInt64();
diff --git a/xmrstak/backend/amd/jconf.hpp b/xmrstak/backend/amd/jconf.hpp
index da024a4..ee1882a 100644
--- a/xmrstak/backend/amd/jconf.hpp
+++ b/xmrstak/backend/amd/jconf.hpp
@@ -26,6 +26,7 @@ public:
size_t intensity;
size_t w_size;
long long cpu_aff;
+ bool stridedIndex;
};
size_t GetThreadCount();
diff --git a/xmrstak/backend/amd/minethd.cpp b/xmrstak/backend/amd/minethd.cpp
index c1399e0..103688f 100644
--- a/xmrstak/backend/amd/minethd.cpp
+++ b/xmrstak/backend/amd/minethd.cpp
@@ -96,6 +96,7 @@ bool minethd::init_gpus()
vGpuData[i].deviceIdx = cfg.index;
vGpuData[i].rawIntensity = cfg.intensity;
vGpuData[i].workSize = cfg.w_size;
+ vGpuData[i].stridedIndex = cfg.stridedIndex;
}
return InitOpenCL(vGpuData.data(), n, jconf::inst()->GetPlatformIdx()) == ERR_SUCCESS;
diff --git a/xmrstak/backend/cpu/config.tpl b/xmrstak/backend/cpu/config.tpl
index 990a31d..b21a22d 100644
--- a/xmrstak/backend/cpu/config.tpl
+++ b/xmrstak/backend/cpu/config.tpl
@@ -1,9 +1,11 @@
R"===(
/*
* Thread configuration for each thread. Make sure it matches the number above.
- * low_power_mode - This mode will double the cache usage, and double the single thread performance. It will
+ * low_power_mode - This can either be a boolean (true or false), or a number between 1 and 5. When set to true,
+ this mode will double the cache usage, and double the single thread performance. It will
* consume much less power (as less cores are working), but will max out at around 80-85% of
- * the maximum performance.
+ * the maximum performance. When set to a number N greater than 1, this mode will increase the
+ * cache usage and single thread performance by N times.
*
* no_prefetch - Some sytems can gain up to extra 5% here, but sometimes it will have no difference or make
* things slower.
diff --git a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
index 2a6a769..9b6e1dc 100644
--- a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
+++ b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
@@ -353,19 +353,19 @@ void cryptonight_hash(const void* input, size_t len, void* output, cryptonight_c
// to fit temporary vars for two contexts. Function will read len*2 from input and write 64 bytes to output
// We are still limited by L3 cache, so doubling will only work with CPUs where we have more than 2MB to core (Xeons)
template<size_t MASK, size_t ITERATIONS, size_t MEM, bool SOFT_AES, bool PREFETCH>
-void cryptonight_double_hash(const void* input, size_t len, void* output, cryptonight_ctx* __restrict ctx0, cryptonight_ctx* __restrict ctx1)
+void cryptonight_double_hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx)
{
- keccak((const uint8_t *)input, len, ctx0->hash_state, 200);
- keccak((const uint8_t *)input+len, len, ctx1->hash_state, 200);
+ keccak((const uint8_t *)input, len, ctx[0]->hash_state, 200);
+ keccak((const uint8_t *)input+len, len, ctx[1]->hash_state, 200);
// Optim - 99% time boundary
- cn_explode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx0->hash_state, (__m128i*)ctx0->long_state);
- cn_explode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx1->hash_state, (__m128i*)ctx1->long_state);
+ cn_explode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx[0]->hash_state, (__m128i*)ctx[0]->long_state);
+ cn_explode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx[1]->hash_state, (__m128i*)ctx[1]->long_state);
- uint8_t* l0 = ctx0->long_state;
- uint64_t* h0 = (uint64_t*)ctx0->hash_state;
- uint8_t* l1 = ctx1->long_state;
- uint64_t* h1 = (uint64_t*)ctx1->hash_state;
+ uint8_t* l0 = ctx[0]->long_state;
+ uint64_t* h0 = (uint64_t*)ctx[0]->hash_state;
+ uint8_t* l1 = ctx[1]->long_state;
+ uint64_t* h1 = (uint64_t*)ctx[1]->hash_state;
uint64_t axl0 = h0[0] ^ h0[4];
uint64_t axh0 = h0[1] ^ h0[5];
@@ -444,13 +444,305 @@ void cryptonight_double_hash(const void* input, size_t len, void* output, crypto
}
// Optim - 90% time boundary
- cn_implode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx0->long_state, (__m128i*)ctx0->hash_state);
- cn_implode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx1->long_state, (__m128i*)ctx1->hash_state);
+ cn_implode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx[0]->long_state, (__m128i*)ctx[0]->hash_state);
+ cn_implode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx[1]->long_state, (__m128i*)ctx[1]->hash_state);
// Optim - 99% time boundary
- keccakf((uint64_t*)ctx0->hash_state, 24);
- extra_hashes[ctx0->hash_state[0] & 3](ctx0->hash_state, 200, (char*)output);
- keccakf((uint64_t*)ctx1->hash_state, 24);
- extra_hashes[ctx1->hash_state[0] & 3](ctx1->hash_state, 200, (char*)output + 32);
+ keccakf((uint64_t*)ctx[0]->hash_state, 24);
+ extra_hashes[ctx[0]->hash_state[0] & 3](ctx[0]->hash_state, 200, (char*)output);
+ keccakf((uint64_t*)ctx[1]->hash_state, 24);
+ extra_hashes[ctx[1]->hash_state[0] & 3](ctx[1]->hash_state, 200, (char*)output + 32);
+}
+
+#define CN_STEP1(a, b, c, l, ptr, idx) \
+ a = _mm_xor_si128(a, c); \
+ idx = _mm_cvtsi128_si64(a); \
+ ptr = (__m128i *)&l[idx & MASK]; \
+ if(PREFETCH) \
+ _mm_prefetch((const char*)ptr, _MM_HINT_T0); \
+ c = _mm_load_si128(ptr)
+
+#define CN_STEP2(a, b, c, l, ptr, idx) \
+ if(SOFT_AES) \
+ c = soft_aesenc(c, a); \
+ else \
+ c = _mm_aesenc_si128(c, a); \
+ b = _mm_xor_si128(b, c); \
+ _mm_store_si128(ptr, b)
+
+#define CN_STEP3(a, b, c, l, ptr, idx) \
+ idx = _mm_cvtsi128_si64(c); \
+ ptr = (__m128i *)&l[idx & MASK]; \
+ if(PREFETCH) \
+ _mm_prefetch((const char*)ptr, _MM_HINT_T0); \
+ b = _mm_load_si128(ptr)
+
+#define CN_STEP4(a, b, c, l, ptr, idx) \
+ lo = _umul128(idx, _mm_cvtsi128_si64(b), &hi); \
+ a = _mm_add_epi64(a, _mm_set_epi64x(lo, hi)); \
+ _mm_store_si128(ptr, a)
+
+// This lovelier creation will do 3 cn hashes at a time.
+template<size_t MASK, size_t ITERATIONS, size_t MEM, bool SOFT_AES, bool PREFETCH>
+void cryptonight_triple_hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx)
+{
+ for (size_t i = 0; i < 3; i++)
+ {
+ keccak((const uint8_t *)input + len * i, len, ctx[i]->hash_state, 200);
+ cn_explode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx[i]->hash_state, (__m128i*)ctx[i]->long_state);
+ }
+
+ uint8_t* l0 = ctx[0]->long_state;
+ uint64_t* h0 = (uint64_t*)ctx[0]->hash_state;
+ uint8_t* l1 = ctx[1]->long_state;
+ uint64_t* h1 = (uint64_t*)ctx[1]->hash_state;
+ uint8_t* l2 = ctx[2]->long_state;
+ uint64_t* h2 = (uint64_t*)ctx[2]->hash_state;
+
+ __m128i ax0 = _mm_set_epi64x(h0[1] ^ h0[5], h0[0] ^ h0[4]);
+ __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
+ __m128i ax1 = _mm_set_epi64x(h1[1] ^ h1[5], h1[0] ^ h1[4]);
+ __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
+ __m128i ax2 = _mm_set_epi64x(h2[1] ^ h2[5], h2[0] ^ h2[4]);
+ __m128i bx2 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]);
+ __m128i cx0 = _mm_set_epi64x(0, 0);
+ __m128i cx1 = _mm_set_epi64x(0, 0);
+ __m128i cx2 = _mm_set_epi64x(0, 0);
+
+ for (size_t i = 0; i < ITERATIONS/2; i++)
+ {
+ uint64_t idx0, idx1, idx2, hi, lo;
+ __m128i *ptr0, *ptr1, *ptr2;
+
+ // EVEN ROUND
+ CN_STEP1(ax0, bx0, cx0, l0, ptr0, idx0);
+ CN_STEP1(ax1, bx1, cx1, l1, ptr1, idx1);
+ CN_STEP1(ax2, bx2, cx2, l2, ptr2, idx2);
+
+ CN_STEP2(ax0, bx0, cx0, l0, ptr0, idx0);
+ CN_STEP2(ax1, bx1, cx1, l1, ptr1, idx1);
+ CN_STEP2(ax2, bx2, cx2, l2, ptr2, idx2);
+
+ CN_STEP3(ax0, bx0, cx0, l0, ptr0, idx0);
+ CN_STEP3(ax1, bx1, cx1, l1, ptr1, idx1);
+ CN_STEP3(ax2, bx2, cx2, l2, ptr2, idx2);
+
+ CN_STEP4(ax0, bx0, cx0, l0, ptr0, idx0);
+ CN_STEP4(ax1, bx1, cx1, l1, ptr1, idx1);
+ CN_STEP4(ax2, bx2, cx2, l2, ptr2, idx2);
+
+ // ODD ROUND
+ CN_STEP1(ax0, cx0, bx0, l0, ptr0, idx0);
+ CN_STEP1(ax1, cx1, bx1, l1, ptr1, idx1);
+ CN_STEP1(ax2, cx2, bx2, l2, ptr2, idx2);
+
+ CN_STEP2(ax0, cx0, bx0, l0, ptr0, idx0);
+ CN_STEP2(ax1, cx1, bx1, l1, ptr1, idx1);
+ CN_STEP2(ax2, cx2, bx2, l2, ptr2, idx2);
+
+ CN_STEP3(ax0, cx0, bx0, l0, ptr0, idx0);
+ CN_STEP3(ax1, cx1, bx1, l1, ptr1, idx1);
+ CN_STEP3(ax2, cx2, bx2, l2, ptr2, idx2);
+
+ CN_STEP4(ax0, cx0, bx0, l0, ptr0, idx0);
+ CN_STEP4(ax1, cx1, bx1, l1, ptr1, idx1);
+ CN_STEP4(ax2, cx2, bx2, l2, ptr2, idx2);
+ }
+
+ for (size_t i = 0; i < 3; i++)
+ {
+ cn_implode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx[i]->long_state, (__m128i*)ctx[i]->hash_state);
+ keccakf((uint64_t*)ctx[i]->hash_state, 24);
+ extra_hashes[ctx[i]->hash_state[0] & 3](ctx[i]->hash_state, 200, (char*)output + 32 * i);
+ }
+}
+
+// This even lovelier creation will do 4 cn hashes at a time.
+template<size_t MASK, size_t ITERATIONS, size_t MEM, bool SOFT_AES, bool PREFETCH>
+void cryptonight_quad_hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx)
+{
+ for (size_t i = 0; i < 4; i++)
+ {
+ keccak((const uint8_t *)input + len * i, len, ctx[i]->hash_state, 200);
+ cn_explode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx[i]->hash_state, (__m128i*)ctx[i]->long_state);
+ }
+
+ uint8_t* l0 = ctx[0]->long_state;
+ uint64_t* h0 = (uint64_t*)ctx[0]->hash_state;
+ uint8_t* l1 = ctx[1]->long_state;
+ uint64_t* h1 = (uint64_t*)ctx[1]->hash_state;
+ uint8_t* l2 = ctx[2]->long_state;
+ uint64_t* h2 = (uint64_t*)ctx[2]->hash_state;
+ uint8_t* l3 = ctx[3]->long_state;
+ uint64_t* h3 = (uint64_t*)ctx[3]->hash_state;
+
+ __m128i ax0 = _mm_set_epi64x(h0[1] ^ h0[5], h0[0] ^ h0[4]);
+ __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
+ __m128i ax1 = _mm_set_epi64x(h1[1] ^ h1[5], h1[0] ^ h1[4]);
+ __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
+ __m128i ax2 = _mm_set_epi64x(h2[1] ^ h2[5], h2[0] ^ h2[4]);
+ __m128i bx2 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]);
+ __m128i ax3 = _mm_set_epi64x(h3[1] ^ h3[5], h3[0] ^ h3[4]);
+ __m128i bx3 = _mm_set_epi64x(h3[3] ^ h3[7], h3[2] ^ h3[6]);
+ __m128i cx0 = _mm_set_epi64x(0, 0);
+ __m128i cx1 = _mm_set_epi64x(0, 0);
+ __m128i cx2 = _mm_set_epi64x(0, 0);
+ __m128i cx3 = _mm_set_epi64x(0, 0);
+
+ for (size_t i = 0; i < ITERATIONS/2; i++)
+ {
+ uint64_t idx0, idx1, idx2, idx3, hi, lo;
+ __m128i *ptr0, *ptr1, *ptr2, *ptr3;
+
+ // EVEN ROUND
+ CN_STEP1(ax0, bx0, cx0, l0, ptr0, idx0);
+ CN_STEP1(ax1, bx1, cx1, l1, ptr1, idx1);
+ CN_STEP1(ax2, bx2, cx2, l2, ptr2, idx2);
+ CN_STEP1(ax3, bx3, cx3, l3, ptr3, idx3);
+
+ CN_STEP2(ax0, bx0, cx0, l0, ptr0, idx0);
+ CN_STEP2(ax1, bx1, cx1, l1, ptr1, idx1);
+ CN_STEP2(ax2, bx2, cx2, l2, ptr2, idx2);
+ CN_STEP2(ax3, bx3, cx3, l3, ptr3, idx3);
+
+ CN_STEP3(ax0, bx0, cx0, l0, ptr0, idx0);
+ CN_STEP3(ax1, bx1, cx1, l1, ptr1, idx1);
+ CN_STEP3(ax2, bx2, cx2, l2, ptr2, idx2);
+ CN_STEP3(ax3, bx3, cx3, l3, ptr3, idx3);
+
+ CN_STEP4(ax0, bx0, cx0, l0, ptr0, idx0);
+ CN_STEP4(ax1, bx1, cx1, l1, ptr1, idx1);
+ CN_STEP4(ax2, bx2, cx2, l2, ptr2, idx2);
+ CN_STEP4(ax3, bx3, cx3, l3, ptr3, idx3);
+
+ // ODD ROUND
+ CN_STEP1(ax0, cx0, bx0, l0, ptr0, idx0);
+ CN_STEP1(ax1, cx1, bx1, l1, ptr1, idx1);
+ CN_STEP1(ax2, cx2, bx2, l2, ptr2, idx2);
+ CN_STEP1(ax3, cx3, bx3, l3, ptr3, idx3);
+
+ CN_STEP2(ax0, cx0, bx0, l0, ptr0, idx0);
+ CN_STEP2(ax1, cx1, bx1, l1, ptr1, idx1);
+ CN_STEP2(ax2, cx2, bx2, l2, ptr2, idx2);
+ CN_STEP2(ax3, cx3, bx3, l3, ptr3, idx3);
+
+ CN_STEP3(ax0, cx0, bx0, l0, ptr0, idx0);
+ CN_STEP3(ax1, cx1, bx1, l1, ptr1, idx1);
+ CN_STEP3(ax2, cx2, bx2, l2, ptr2, idx2);
+ CN_STEP3(ax3, cx3, bx3, l3, ptr3, idx3);
+
+ CN_STEP4(ax0, cx0, bx0, l0, ptr0, idx0);
+ CN_STEP4(ax1, cx1, bx1, l1, ptr1, idx1);
+ CN_STEP4(ax2, cx2, bx2, l2, ptr2, idx2);
+ CN_STEP4(ax3, cx3, bx3, l3, ptr3, idx3);
+ }
+
+ for (size_t i = 0; i < 4; i++)
+ {
+ cn_implode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx[i]->long_state, (__m128i*)ctx[i]->hash_state);
+ keccakf((uint64_t*)ctx[i]->hash_state, 24);
+ extra_hashes[ctx[i]->hash_state[0] & 3](ctx[i]->hash_state, 200, (char*)output + 32 * i);
+ }
+}
+
+// This most lovely creation will do 5 cn hashes at a time.
+template<size_t MASK, size_t ITERATIONS, size_t MEM, bool SOFT_AES, bool PREFETCH>
+void cryptonight_penta_hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx)
+{
+ for (size_t i = 0; i < 5; i++)
+ {
+ keccak((const uint8_t *)input + len * i, len, ctx[i]->hash_state, 200);
+ cn_explode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx[i]->hash_state, (__m128i*)ctx[i]->long_state);
+ }
+
+ uint8_t* l0 = ctx[0]->long_state;
+ uint64_t* h0 = (uint64_t*)ctx[0]->hash_state;
+ uint8_t* l1 = ctx[1]->long_state;
+ uint64_t* h1 = (uint64_t*)ctx[1]->hash_state;
+ uint8_t* l2 = ctx[2]->long_state;
+ uint64_t* h2 = (uint64_t*)ctx[2]->hash_state;
+ uint8_t* l3 = ctx[3]->long_state;
+ uint64_t* h3 = (uint64_t*)ctx[3]->hash_state;
+ uint8_t* l4 = ctx[4]->long_state;
+ uint64_t* h4 = (uint64_t*)ctx[4]->hash_state;
+
+ __m128i ax0 = _mm_set_epi64x(h0[1] ^ h0[5], h0[0] ^ h0[4]);
+ __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
+ __m128i ax1 = _mm_set_epi64x(h1[1] ^ h1[5], h1[0] ^ h1[4]);
+ __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
+ __m128i ax2 = _mm_set_epi64x(h2[1] ^ h2[5], h2[0] ^ h2[4]);
+ __m128i bx2 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]);
+ __m128i ax3 = _mm_set_epi64x(h3[1] ^ h3[5], h3[0] ^ h3[4]);
+ __m128i bx3 = _mm_set_epi64x(h3[3] ^ h3[7], h3[2] ^ h3[6]);
+ __m128i ax4 = _mm_set_epi64x(h4[1] ^ h4[5], h4[0] ^ h4[4]);
+ __m128i bx4 = _mm_set_epi64x(h4[3] ^ h4[7], h4[2] ^ h4[6]);
+ __m128i cx0 = _mm_set_epi64x(0, 0);
+ __m128i cx1 = _mm_set_epi64x(0, 0);
+ __m128i cx2 = _mm_set_epi64x(0, 0);
+ __m128i cx3 = _mm_set_epi64x(0, 0);
+ __m128i cx4 = _mm_set_epi64x(0, 0);
+
+ for (size_t i = 0; i < ITERATIONS/2; i++)
+ {
+ uint64_t idx0, idx1, idx2, idx3, idx4, hi, lo;
+ __m128i *ptr0, *ptr1, *ptr2, *ptr3, *ptr4;
+
+ // EVEN ROUND
+ CN_STEP1(ax0, bx0, cx0, l0, ptr0, idx0);
+ CN_STEP1(ax1, bx1, cx1, l1, ptr1, idx1);
+ CN_STEP1(ax2, bx2, cx2, l2, ptr2, idx2);
+ CN_STEP1(ax3, bx3, cx3, l3, ptr3, idx3);
+ CN_STEP1(ax4, bx4, cx4, l4, ptr4, idx4);
+
+ CN_STEP2(ax0, bx0, cx0, l0, ptr0, idx0);
+ CN_STEP2(ax1, bx1, cx1, l1, ptr1, idx1);
+ CN_STEP2(ax2, bx2, cx2, l2, ptr2, idx2);
+ CN_STEP2(ax3, bx3, cx3, l3, ptr3, idx3);
+ CN_STEP2(ax4, bx4, cx4, l4, ptr4, idx4);
+
+ CN_STEP3(ax0, bx0, cx0, l0, ptr0, idx0);
+ CN_STEP3(ax1, bx1, cx1, l1, ptr1, idx1);
+ CN_STEP3(ax2, bx2, cx2, l2, ptr2, idx2);
+ CN_STEP3(ax3, bx3, cx3, l3, ptr3, idx3);
+ CN_STEP3(ax4, bx4, cx4, l4, ptr4, idx4);
+
+ CN_STEP4(ax0, bx0, cx0, l0, ptr0, idx0);
+ CN_STEP4(ax1, bx1, cx1, l1, ptr1, idx1);
+ CN_STEP4(ax2, bx2, cx2, l2, ptr2, idx2);
+ CN_STEP4(ax3, bx3, cx3, l3, ptr3, idx3);
+ CN_STEP4(ax4, bx4, cx4, l4, ptr4, idx4);
+
+ // ODD ROUND
+ CN_STEP1(ax0, cx0, bx0, l0, ptr0, idx0);
+ CN_STEP1(ax1, cx1, bx1, l1, ptr1, idx1);
+ CN_STEP1(ax2, cx2, bx2, l2, ptr2, idx2);
+ CN_STEP1(ax3, cx3, bx3, l3, ptr3, idx3);
+ CN_STEP1(ax4, cx4, bx4, l4, ptr4, idx4);
+
+ CN_STEP2(ax0, cx0, bx0, l0, ptr0, idx0);
+ CN_STEP2(ax1, cx1, bx1, l1, ptr1, idx1);
+ CN_STEP2(ax2, cx2, bx2, l2, ptr2, idx2);
+ CN_STEP2(ax3, cx3, bx3, l3, ptr3, idx3);
+ CN_STEP2(ax4, cx4, bx4, l4, ptr4, idx4);
+
+ CN_STEP3(ax0, cx0, bx0, l0, ptr0, idx0);
+ CN_STEP3(ax1, cx1, bx1, l1, ptr1, idx1);
+ CN_STEP3(ax2, cx2, bx2, l2, ptr2, idx2);
+ CN_STEP3(ax3, cx3, bx3, l3, ptr3, idx3);
+ CN_STEP3(ax4, cx4, bx4, l4, ptr4, idx4);
+
+ CN_STEP4(ax0, cx0, bx0, l0, ptr0, idx0);
+ CN_STEP4(ax1, cx1, bx1, l1, ptr1, idx1);
+ CN_STEP4(ax2, cx2, bx2, l2, ptr2, idx2);
+ CN_STEP4(ax3, cx3, bx3, l3, ptr3, idx3);
+ CN_STEP4(ax4, cx4, bx4, l4, ptr4, idx4);
+ }
+
+ for (size_t i = 0; i < 5; i++)
+ {
+ cn_implode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx[i]->long_state, (__m128i*)ctx[i]->hash_state);
+ keccakf((uint64_t*)ctx[i]->hash_state, 24);
+ extra_hashes[ctx[i]->hash_state[0] & 3](ctx[i]->hash_state, 200, (char*)output + 32 * i);
+ }
}
diff --git a/xmrstak/backend/cpu/jconf.cpp b/xmrstak/backend/cpu/jconf.cpp
index 2ded8c0..6e709bd 100644
--- a/xmrstak/backend/cpu/jconf.cpp
+++ b/xmrstak/backend/cpu/jconf.cpp
@@ -116,7 +116,10 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg)
if(mode == nullptr || no_prefetch == nullptr || aff == nullptr)
return false;
- if(!mode->IsBool() || !no_prefetch->IsBool())
+ if(!mode->IsBool() && !mode->IsNumber())
+ return false;
+
+ if(!no_prefetch->IsBool())
return false;
if(!aff->IsNumber() && !aff->IsBool())
@@ -125,7 +128,11 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg)
if(aff->IsNumber() && aff->GetInt64() < 0)
return false;
- cfg.bDoubleMode = mode->GetBool();
+ if(mode->IsNumber())
+ cfg.iMultiway = (int)mode->GetInt64();
+ else
+ cfg.iMultiway = mode->GetBool() ? 2 : 1;
+
cfg.bNoPrefetch = no_prefetch->GetBool();
if(aff->IsNumber())
diff --git a/xmrstak/backend/cpu/jconf.hpp b/xmrstak/backend/cpu/jconf.hpp
index f843ed4..e98ed16 100644
--- a/xmrstak/backend/cpu/jconf.hpp
+++ b/xmrstak/backend/cpu/jconf.hpp
@@ -22,7 +22,7 @@ public:
bool parse_config(const char* sFilename = params::inst().configFileCPU.c_str());
struct thd_cfg {
- bool bDoubleMode;
+ int iMultiway;
bool bNoPrefetch;
long long iCpuAff;
};
diff --git a/xmrstak/backend/cpu/minethd.cpp b/xmrstak/backend/cpu/minethd.cpp
index cbb01f9..1c0e491 100644
--- a/xmrstak/backend/cpu/minethd.cpp
+++ b/xmrstak/backend/cpu/minethd.cpp
@@ -92,7 +92,7 @@ bool minethd::thd_setaffinity(std::thread::native_handle_type h, uint64_t cpu_id
#endif
}
-minethd::minethd(miner_work& pWork, size_t iNo, bool double_work, bool no_prefetch, int64_t affinity)
+minethd::minethd(miner_work& pWork, size_t iNo, int iMultiway, bool no_prefetch, int64_t affinity)
{
this->backendType = iBackend::CPU;
oWork = pWork;
@@ -105,10 +105,25 @@ minethd::minethd(miner_work& pWork, size_t iNo, bool double_work, bool no_prefet
std::unique_lock<std::mutex> lck(thd_aff_set);
std::future<void> order_guard = order_fix.get_future();
- if(double_work)
+ switch (iMultiway)
+ {
+ case 5:
+ oWorkThd = std::thread(&minethd::penta_work_main, this);
+ break;
+ case 4:
+ oWorkThd = std::thread(&minethd::quad_work_main, this);
+ break;
+ case 3:
+ oWorkThd = std::thread(&minethd::triple_work_main, this);
+ break;
+ case 2:
oWorkThd = std::thread(&minethd::double_work_main, this);
- else
+ break;
+ case 1:
+ default:
oWorkThd = std::thread(&minethd::work_main, this);
+ break;
+ }
order_guard.wait();
@@ -154,6 +169,7 @@ cryptonight_ctx* minethd::minethd_alloc_ctx()
return nullptr; //Should never happen
}
+static constexpr size_t MAX_N = 5;
bool minethd::self_test()
{
alloc_msg msg = { 0 };
@@ -191,14 +207,15 @@ bool minethd::self_test()
if(res == 0 && fatal)
return false;
- cryptonight_ctx *ctx0, *ctx1;
- if((ctx0 = minethd_alloc_ctx()) == nullptr)
- return false;
-
- if((ctx1 = minethd_alloc_ctx()) == nullptr)
+ cryptonight_ctx *ctx[MAX_N] = {0};
+ for (int i = 0; i < MAX_N; i++)
{
- cryptonight_free_ctx(ctx0);
- return false;
+ if ((ctx[i] = minethd_alloc_ctx()) == nullptr)
+ {
+ for (int j = 0; j < i; j++)
+ cryptonight_free_ctx(ctx[j]);
+ return false;
+ }
}
bool bResult = true;
@@ -206,31 +223,52 @@ bool minethd::self_test()
bool mineMonero = ::jconf::inst()->IsCurrencyMonero();
if(mineMonero)
{
- unsigned char out[64];
+ unsigned char out[32 * MAX_N];
cn_hash_fun hashf;
- cn_hash_fun_dbl hashdf;
-
+ cn_hash_fun_multi hashf_multi;
hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, mineMonero);
- hashf("This is a test", 14, out, ctx0);
+ hashf("This is a test", 14, out, ctx[0]);
bResult = memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0;
hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, mineMonero);
- hashf("This is a test", 14, out, ctx0);
+ hashf("This is a test", 14, out, ctx[0]);
bResult &= memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0;
- hashdf = func_dbl_selector(::jconf::inst()->HaveHardwareAes(), false, mineMonero);
- hashdf("The quick brown fox jumps over the lazy dogThe quick brown fox jumps over the lazy log", 43, out, ctx0, ctx1);
+ hashf_multi = func_multi_selector(2, ::jconf::inst()->HaveHardwareAes(), false, mineMonero);
+ hashf_multi("The quick brown fox jumps over the lazy dogThe quick brown fox jumps over the lazy log", 43, out, ctx);
bResult &= memcmp(out, "\x3e\xbb\x7f\x9f\x7d\x27\x3d\x7c\x31\x8d\x86\x94\x77\x55\x0c\xc8\x00\xcf\xb1\x1b\x0c\xad\xb7\xff\xbd\xf6\xf8\x9f\x3a\x47\x1c\x59"
- "\xb4\x77\xd5\x02\xe4\xd8\x48\x7f\x42\xdf\xe3\x8e\xed\x73\x81\x7a\xda\x91\xb7\xe2\x63\xd2\x91\x71\xb6\x5c\x44\x3a\x01\x2a\x41\x22", 64) == 0;
+ "\xb4\x77\xd5\x02\xe4\xd8\x48\x7f\x42\xdf\xe3\x8e\xed\x73\x81\x7a\xda\x91\xb7\xe2\x63\xd2\x91\x71\xb6\x5c\x44\x3a\x01\x2a\x41\x22", 64) == 0;
- hashdf = func_dbl_selector(::jconf::inst()->HaveHardwareAes(), true, mineMonero);
- hashdf("The quick brown fox jumps over the lazy dogThe quick brown fox jumps over the lazy log", 43, out, ctx0, ctx1);
+ hashf_multi = func_multi_selector(2, ::jconf::inst()->HaveHardwareAes(), true, mineMonero);
+ hashf_multi("The quick brown fox jumps over the lazy dogThe quick brown fox jumps over the lazy log", 43, out, ctx);
bResult &= memcmp(out, "\x3e\xbb\x7f\x9f\x7d\x27\x3d\x7c\x31\x8d\x86\x94\x77\x55\x0c\xc8\x00\xcf\xb1\x1b\x0c\xad\xb7\xff\xbd\xf6\xf8\x9f\x3a\x47\x1c\x59"
- "\xb4\x77\xd5\x02\xe4\xd8\x48\x7f\x42\xdf\xe3\x8e\xed\x73\x81\x7a\xda\x91\xb7\xe2\x63\xd2\x91\x71\xb6\x5c\x44\x3a\x01\x2a\x41\x22", 64) == 0;
+ "\xb4\x77\xd5\x02\xe4\xd8\x48\x7f\x42\xdf\xe3\x8e\xed\x73\x81\x7a\xda\x91\xb7\xe2\x63\xd2\x91\x71\xb6\x5c\x44\x3a\x01\x2a\x41\x22", 64) == 0;
+
+ hashf_multi = func_multi_selector(3, ::jconf::inst()->HaveHardwareAes(), false, mineMonero);
+ hashf_multi("This is a testThis is a testThis is a test", 14, out, ctx);
+ bResult &= memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
+ "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
+ "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 96) == 0;
+
+ hashf_multi = func_multi_selector(4, ::jconf::inst()->HaveHardwareAes(), false, mineMonero);
+ hashf_multi("This is a testThis is a testThis is a testThis is a test", 14, out, ctx);
+ bResult &= memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
+ "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
+ "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
+ "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 128) == 0;
+
+ hashf_multi = func_multi_selector(5, ::jconf::inst()->HaveHardwareAes(), false, mineMonero);
+ hashf_multi("This is a testThis is a testThis is a testThis is a testThis is a test", 14, out, ctx);
+ bResult &= memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
+ "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
+ "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
+ "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
+ "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 160) == 0;
}
- cryptonight_free_ctx(ctx0);
- cryptonight_free_ctx(ctx1);
+
+ for (int i = 0; i < MAX_N; i++)
+ cryptonight_free_ctx(ctx[i]);
if(!bResult)
printer::inst()->print_msg(L0,
@@ -272,12 +310,12 @@ std::vector<iBackend*> minethd::thread_starter(uint32_t threadOffset, miner_work
printer::inst()->print_msg(L1, "WARNING on MacOS thread affinity is only advisory.");
#endif
- printer::inst()->print_msg(L1, "Starting %s thread, affinity: %d.", cfg.bDoubleMode ? "double" : "single", (int)cfg.iCpuAff);
+ printer::inst()->print_msg(L1, "Starting %dx thread, affinity: %d.", cfg.iMultiway, (int)cfg.iCpuAff);
}
else
- printer::inst()->print_msg(L1, "Starting %s thread, no affinity.", cfg.bDoubleMode ? "double" : "single");
+ printer::inst()->print_msg(L1, "Starting %dx thread, no affinity.", cfg.iMultiway);
- minethd* thd = new minethd(pWork, i + threadOffset, cfg.bDoubleMode, cfg.bNoPrefetch, cfg.iCpuAff);
+ minethd* thd = new minethd(pWork, i + threadOffset, cfg.iMultiway, cfg.bNoPrefetch, cfg.iCpuAff);
pvThreads.push_back(thd);
}
@@ -326,7 +364,7 @@ minethd::cn_hash_fun minethd::func_selector(bool bHaveAes, bool bNoPrefetch, boo
// define aeon settings
#if defined(CONF_NO_AEON) || defined(CONF_NO_MONERO)
- // ignore 3rd bit if only on currency is active
+ // ignore 3rd bit if only one currency is active
digit.set(2, 0);
#else
digit.set(2, !mineMonero);
@@ -416,22 +454,34 @@ void minethd::work_main()
cryptonight_free_ctx(ctx);
}
-minethd::cn_hash_fun_dbl minethd::func_dbl_selector(bool bHaveAes, bool bNoPrefetch, bool mineMonero)
+minethd::cn_hash_fun_multi minethd::func_multi_selector(size_t N, bool bHaveAes, bool bNoPrefetch, bool mineMonero)
{
// We have two independent flag bits in the functions
// therefore we will build a binary digit and select the
// function as a two digit binary
- // Digit order SOFT_AES, NO_PREFETCH, MINER_ALGO
+ // Digit order SOFT_AES, NO_PREFETCH
- static const cn_hash_fun_dbl func_table[] = {
- /* there will be 8 function entries if `CONF_NO_MONERO` and `CONF_NO_AEON`
- * is not defined. If one is defined there will be 4 entries.
+ static const cn_hash_fun_multi func_table[] = {
+ /* there will be 8*(MAX_N-1) function entries if `CONF_NO_MONERO` and `CONF_NO_AEON`
+ * is not defined. If one is defined there will be 4*(MAX_N-1) entries.
*/
#ifndef CONF_NO_MONERO
cryptonight_double_hash<MONERO_MASK, MONERO_ITER, MONERO_MEMORY, false, false>,
cryptonight_double_hash<MONERO_MASK, MONERO_ITER, MONERO_MEMORY, false, true>,
cryptonight_double_hash<MONERO_MASK, MONERO_ITER, MONERO_MEMORY, true, false>,
- cryptonight_double_hash<MONERO_MASK, MONERO_ITER, MONERO_MEMORY, true, true>
+ cryptonight_double_hash<MONERO_MASK, MONERO_ITER, MONERO_MEMORY, true, true>,
+ cryptonight_triple_hash<MONERO_MASK, MONERO_ITER, MONERO_MEMORY, false, false>,
+ cryptonight_triple_hash<MONERO_MASK, MONERO_ITER, MONERO_MEMORY, false, true>,
+ cryptonight_triple_hash<MONERO_MASK, MONERO_ITER, MONERO_MEMORY, true, false>,
+ cryptonight_triple_hash<MONERO_MASK, MONERO_ITER, MONERO_MEMORY, true, true>,
+ cryptonight_quad_hash<MONERO_MASK, MONERO_ITER, MONERO_MEMORY, false, false>,
+ cryptonight_quad_hash<MONERO_MASK, MONERO_ITER, MONERO_MEMORY, false, true>,
+ cryptonight_quad_hash<MONERO_MASK, MONERO_ITER, MONERO_MEMORY, true, false>,
+ cryptonight_quad_hash<MONERO_MASK, MONERO_ITER, MONERO_MEMORY, true, true>,
+ cryptonight_penta_hash<MONERO_MASK, MONERO_ITER, MONERO_MEMORY, false, false>,
+ cryptonight_penta_hash<MONERO_MASK, MONERO_ITER, MONERO_MEMORY, false, true>,
+ cryptonight_penta_hash<MONERO_MASK, MONERO_ITER, MONERO_MEMORY, true, false>,
+ cryptonight_penta_hash<MONERO_MASK, MONERO_ITER, MONERO_MEMORY, true, true>
#endif
#if (!defined(CONF_NO_AEON)) && (!defined(CONF_NO_MONERO))
// comma will be added only if Monero and Aeon is build
@@ -441,33 +491,71 @@ minethd::cn_hash_fun_dbl minethd::func_dbl_selector(bool bHaveAes, bool bNoPrefe
cryptonight_double_hash<AEON_MASK, AEON_ITER, AEON_MEMORY, false, false>,
cryptonight_double_hash<AEON_MASK, AEON_ITER, AEON_MEMORY, false, true>,
cryptonight_double_hash<AEON_MASK, AEON_ITER, AEON_MEMORY, true, false>,
- cryptonight_double_hash<AEON_MASK, AEON_ITER, AEON_MEMORY, true, true>
+ cryptonight_double_hash<AEON_MASK, AEON_ITER, AEON_MEMORY, true, true>,
+ cryptonight_triple_hash<AEON_MASK, AEON_ITER, AEON_MEMORY, false, false>,
+ cryptonight_triple_hash<AEON_MASK, AEON_ITER, AEON_MEMORY, false, true>,
+ cryptonight_triple_hash<AEON_MASK, AEON_ITER, AEON_MEMORY, true, false>,
+ cryptonight_triple_hash<AEON_MASK, AEON_ITER, AEON_MEMORY, true, true>,
+ cryptonight_quad_hash<AEON_MASK, AEON_ITER, AEON_MEMORY, false, false>,
+ cryptonight_quad_hash<AEON_MASK, AEON_ITER, AEON_MEMORY, false, true>,
+ cryptonight_quad_hash<AEON_MASK, AEON_ITER, AEON_MEMORY, true, false>,
+ cryptonight_quad_hash<AEON_MASK, AEON_ITER, AEON_MEMORY, true, true>,
+ cryptonight_penta_hash<AEON_MASK, AEON_ITER, AEON_MEMORY, false, false>,
+ cryptonight_penta_hash<AEON_MASK, AEON_ITER, AEON_MEMORY, false, true>,
+ cryptonight_penta_hash<AEON_MASK, AEON_ITER, AEON_MEMORY, true, false>,
+ cryptonight_penta_hash<AEON_MASK, AEON_ITER, AEON_MEMORY, true, true>
#endif
};
- std::bitset<3> digit;
+ std::bitset<2> digit;
digit.set(0, !bNoPrefetch);
digit.set(1, !bHaveAes);
// define aeon settings
#if defined(CONF_NO_AEON) || defined(CONF_NO_MONERO)
- // ignore 3rd bit if only on currency is active
- digit.set(2, 0);
+ // ignore miner algo if only one currency is active
+ size_t miner_algo_base = 0;
#else
- digit.set(2, !mineMonero);
+ size_t miner_algo_base = mineMonero ? 0 : 4*(MAX_N-1);
#endif
- return func_table[digit.to_ulong()];
+ N = (N<2) ? 2 : (N>MAX_N) ? MAX_N : N;
+ return func_table[miner_algo_base + 4*(N-2) + digit.to_ulong()];
}
-uint32_t* minethd::prep_double_work(uint8_t bDoubleWorkBlob[sizeof(miner_work::bWorkBlob) * 2])
+void minethd::double_work_main()
{
- memcpy(bDoubleWorkBlob, oWork.bWorkBlob, oWork.iWorkSize);
- memcpy(bDoubleWorkBlob + oWork.iWorkSize, oWork.bWorkBlob, oWork.iWorkSize);
- return (uint32_t*)(bDoubleWorkBlob + oWork.iWorkSize + 39);
+ multiway_work_main<2>(func_multi_selector(2, ::jconf::inst()->HaveHardwareAes(), bNoPrefetch, ::jconf::inst()->IsCurrencyMonero()));
}
-void minethd::double_work_main()
+void minethd::triple_work_main()
+{
+ multiway_work_main<3>(func_multi_selector(3, ::jconf::inst()->HaveHardwareAes(), bNoPrefetch, ::jconf::inst()->IsCurrencyMonero()));
+}
+
+void minethd::quad_work_main()
+{
+ multiway_work_main<4>(func_multi_selector(4, ::jconf::inst()->HaveHardwareAes(), bNoPrefetch, ::jconf::inst()->IsCurrencyMonero()));
+}
+
+void minethd::penta_work_main()
+{
+ multiway_work_main<5>(func_multi_selector(5, ::jconf::inst()->HaveHardwareAes(), bNoPrefetch, ::jconf::inst()->IsCurrencyMonero()));
+}
+
+template<size_t N>
+void minethd::prep_multiway_work(uint8_t *bWorkBlob, uint32_t **piNonce)
+{
+ for (size_t i = 0; i < N; i++)
+ {
+ memcpy(bWorkBlob + oWork.iWorkSize * i, oWork.bWorkBlob, oWork.iWorkSize);
+ if (i > 0)
+ piNonce[i] = (uint32_t*)(bWorkBlob + oWork.iWorkSize * i + 39);
+ }
+}
+
+template<size_t N>
+void minethd::multiway_work_main(cn_hash_fun_multi hash_fun_multi)
{
if(affinity >= 0) //-1 means no affinity
bindMemoryToNUMANode(affinity);
@@ -477,31 +565,26 @@ void minethd::double_work_main()
lck.release();
std::this_thread::yield();
- cn_hash_fun_dbl hash_fun;
- cryptonight_ctx* ctx0;
- cryptonight_ctx* ctx1;
+ cryptonight_ctx *ctx[MAX_N];
uint64_t iCount = 0;
- uint64_t *piHashVal0, *piHashVal1;
- uint32_t *piNonce0, *piNonce1;
- uint8_t bDoubleHashOut[64];
- uint8_t bDoubleWorkBlob[sizeof(miner_work::bWorkBlob) * 2];
+ uint64_t *piHashVal[MAX_N];
+ uint32_t *piNonce[MAX_N];
+ uint8_t bHashOut[MAX_N * 32];
+ uint8_t bWorkBlob[sizeof(miner_work::bWorkBlob) * MAX_N];
uint32_t iNonce;
job_result res;
- hash_fun = func_dbl_selector(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, ::jconf::inst()->IsCurrencyMonero());
- ctx0 = minethd_alloc_ctx();
- ctx1 = minethd_alloc_ctx();
-
- piHashVal0 = (uint64_t*)(bDoubleHashOut + 24);
- piHashVal1 = (uint64_t*)(bDoubleHashOut + 32 + 24);
- piNonce0 = (uint32_t*)(bDoubleWorkBlob + 39);
+ for (size_t i = 0; i < N; i++)
+ {
+ ctx[i] = minethd_alloc_ctx();
+ piHashVal[i] = (uint64_t*)(bHashOut + 32 * i + 24);
+ piNonce[i] = (i == 0) ? (uint32_t*)(bWorkBlob + 39) : nullptr;
+ }
if(!oWork.bStall)
- piNonce1 = prep_double_work(bDoubleWorkBlob);
- else
- piNonce1 = nullptr;
+ prep_multiway_work<N>(bWorkBlob, piNonce);
- globalStates::inst().inst().iConsumeCnt++;
+ globalStates::inst().iConsumeCnt++;
while (bQuit == 0)
{
@@ -515,55 +598,57 @@ void minethd::double_work_main()
std::this_thread::sleep_for(std::chrono::milliseconds(100));
consume_work();
- piNonce1 = prep_double_work(bDoubleWorkBlob);
+ prep_multiway_work<N>(bWorkBlob, piNonce);
continue;
}
- size_t nonce_ctr = 0;
- constexpr size_t nonce_chunk = 4096; //Needs to be a power of 2
+ constexpr uint32_t nonce_chunk = 4096;
+ int64_t nonce_ctr = 0;
assert(sizeof(job_result::sJobID) == sizeof(pool_job::sJobID));
if(oWork.bNiceHash)
- iNonce = *piNonce0;
+ iNonce = *piNonce[0];
while (globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo)
{
- if ((iCount & 0x7) == 0) //Store stats every 16 hashes
+ if ((iCount++ & 0x7) == 0) //Store stats every 8*N hashes
{
using namespace std::chrono;
uint64_t iStamp = time_point_cast<milliseconds>(high_resolution_clock::now()).time_since_epoch().count();
- iHashCount.store(iCount, std::memory_order_relaxed);
+ iHashCount.store(iCount * N, std::memory_order_relaxed);
iTimestamp.store(iStamp, std::memory_order_relaxed);
}
- iCount += 2;
-
-
- if((nonce_ctr++ & (nonce_chunk/2 - 1)) == 0)
+
+ nonce_ctr -= N;
+ if(nonce_ctr <= 0)
{
globalStates::inst().calc_start_nonce(iNonce, oWork.bNiceHash, nonce_chunk);
+ nonce_ctr = nonce_chunk;
}
- *piNonce0 = ++iNonce;
- *piNonce1 = ++iNonce;
+ for (size_t i = 0; i < N; i++)
+ *piNonce[i] = ++iNonce;
- hash_fun(bDoubleWorkBlob, oWork.iWorkSize, bDoubleHashOut, ctx0, ctx1);
+ hash_fun_multi(bWorkBlob, oWork.iWorkSize, bHashOut, ctx);
- if (*piHashVal0 < oWork.iTarget)
- executor::inst()->push_event(ex_event(job_result(oWork.sJobID, iNonce-1, bDoubleHashOut, iThreadNo), oWork.iPoolId));
-
- if (*piHashVal1 < oWork.iTarget)
- executor::inst()->push_event(ex_event(job_result(oWork.sJobID, iNonce, bDoubleHashOut + 32, iThreadNo), oWork.iPoolId));
+ for (size_t i = 0; i < N; i++)
+ {
+ if (*piHashVal[i] < oWork.iTarget)
+ {
+ executor::inst()->push_event(ex_event(job_result(oWork.sJobID, iNonce - N + 1 + i, bHashOut + 32 * i, iThreadNo), oWork.iPoolId));
+ }
+ }
std::this_thread::yield();
}
consume_work();
- piNonce1 = prep_double_work(bDoubleWorkBlob);
+ prep_multiway_work<N>(bWorkBlob, piNonce);
}
- cryptonight_free_ctx(ctx0);
- cryptonight_free_ctx(ctx1);
+ for (int i = 0; i < N; i++)
+ cryptonight_free_ctx(ctx[i]);
}
} // namespace cpu
diff --git a/xmrstak/backend/cpu/minethd.hpp b/xmrstak/backend/cpu/minethd.hpp
index 5520d9e..0433d0d 100644
--- a/xmrstak/backend/cpu/minethd.hpp
+++ b/xmrstak/backend/cpu/minethd.hpp
@@ -29,16 +29,24 @@ public:
static cryptonight_ctx* minethd_alloc_ctx();
private:
+ typedef void (*cn_hash_fun_multi)(const void*, size_t, void*, cryptonight_ctx**);
+ static cn_hash_fun_multi func_multi_selector(size_t N, bool bHaveAes, bool bNoPrefetch, bool mineMonero);
- typedef void (*cn_hash_fun_dbl)(const void*, size_t, void*, cryptonight_ctx* __restrict, cryptonight_ctx* __restrict);
- static cn_hash_fun_dbl func_dbl_selector(bool bHaveAes, bool bNoPrefetch, bool mineMonero);
+ minethd(miner_work& pWork, size_t iNo, int iMultiway, bool no_prefetch, int64_t affinity);
- minethd(miner_work& pWork, size_t iNo, bool double_work, bool no_prefetch, int64_t affinity);
+ template<size_t N>
+ void multiway_work_main(cn_hash_fun_multi hash_fun_multi);
+
+ template<size_t N>
+ void prep_multiway_work(uint8_t *bWorkBlob, uint32_t **piNonce);
void work_main();
void double_work_main();
+ void triple_work_main();
+ void quad_work_main();
+ void penta_work_main();
+
void consume_work();
- uint32_t* prep_double_work(uint8_t bDoubleWorkBlob[sizeof(miner_work::bWorkBlob) * 2]);
uint64_t iJobNo;
diff --git a/xmrstak/backend/nvidia/autoAdjust.hpp b/xmrstak/backend/nvidia/autoAdjust.hpp
index c6a7dca..be7d1ce 100644
--- a/xmrstak/backend/nvidia/autoAdjust.hpp
+++ b/xmrstak/backend/nvidia/autoAdjust.hpp
@@ -95,7 +95,7 @@ private:
conf += std::string(" { \"index\" : ") + std::to_string(ctx.device_id) + ",\n" +
" \"threads\" : " + std::to_string(ctx.device_threads) + ", \"blocks\" : " + std::to_string(ctx.device_blocks) + ",\n" +
" \"bfactor\" : " + std::to_string(ctx.device_bfactor) + ", \"bsleep\" : " + std::to_string(ctx.device_bsleep) + ",\n" +
- " \"affine_to_cpu\" : false,\n" +
+ " \"affine_to_cpu\" : false, \"sync_mode\" : 3,\n" +
" },\n";
}
}
diff --git a/xmrstak/backend/nvidia/config.tpl b/xmrstak/backend/nvidia/config.tpl
index 99dc023..5479172 100644
--- a/xmrstak/backend/nvidia/config.tpl
+++ b/xmrstak/backend/nvidia/config.tpl
@@ -9,6 +9,12 @@ R"===(
* bsleep - Insert a delay of X microseconds between kernel launches.
* Increase if you want to reduce GPU lag. Recommended setting on GUI systems - 100
* affine_to_cpu - This will affine the thread to a CPU. This can make a GPU miner play along nicer with a CPU miner.
+ * sync_mode - method used to synchronize the device
+ * documentation: http://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__DEVICE.html#group__CUDART__DEVICE_1g69e73c7dda3fc05306ae7c811a690fac
+ * 0 = cudaDeviceScheduleAuto
+ * 1 = cudaDeviceScheduleSpin - create a high load on one cpu thread per gpu
+ * 2 = cudaDeviceScheduleYield
+ * 3 = cudaDeviceScheduleBlockingSync (default)
*
* On the first run the miner will look at your system and suggest a basic configuration that will work,
* you can try to tweak it from there to get the best performance.
@@ -16,7 +22,9 @@ R"===(
* A filled out configuration should look like this:
* "gpu_threads_conf" :
* [
- * { "index" : 0, "threads" : 17, "blocks" : 60, "bfactor" : 0, "bsleep" : 0, "affine_to_cpu" : false},
+ * { "index" : 0, "threads" : 17, "blocks" : 60, "bfactor" : 0, "bsleep" : 0,
+ * "affine_to_cpu" : false, "sync_mode" : 3,
+ * },
* ],
*/
diff --git a/xmrstak/backend/nvidia/jconf.cpp b/xmrstak/backend/nvidia/jconf.cpp
index 4208145..46c5726 100644
--- a/xmrstak/backend/nvidia/jconf.cpp
+++ b/xmrstak/backend/nvidia/jconf.cpp
@@ -123,16 +123,17 @@ bool jconf::GetGPUThreadConfig(size_t id, thd_cfg &cfg)
if(!oThdConf.IsObject())
return false;
- const Value *gid, *blocks, *threads, *bfactor, *bsleep, *aff;
+ const Value *gid, *blocks, *threads, *bfactor, *bsleep, *aff, *syncMode;
gid = GetObjectMember(oThdConf, "index");
blocks = GetObjectMember(oThdConf, "blocks");
threads = GetObjectMember(oThdConf, "threads");
bfactor = GetObjectMember(oThdConf, "bfactor");
bsleep = GetObjectMember(oThdConf, "bsleep");
aff = GetObjectMember(oThdConf, "affine_to_cpu");
+ syncMode = GetObjectMember(oThdConf, "sync_mode");
if(gid == nullptr || blocks == nullptr || threads == nullptr ||
- bfactor == nullptr || bsleep == nullptr || aff == nullptr)
+ bfactor == nullptr || bsleep == nullptr || aff == nullptr || syncMode == nullptr)
{
return false;
}
@@ -155,11 +156,17 @@ bool jconf::GetGPUThreadConfig(size_t id, thd_cfg &cfg)
if(!aff->IsUint64() && !aff->IsBool())
return false;
+ if(!syncMode->IsNumber() || syncMode->GetInt() < 0 || syncMode->GetInt() > 3)
+ {
+ printer::inst()->print_msg(L0, "Error NVIDIA: sync_mode out of range or no number. ( range: 0 <= sync_mode < 4.)");
+ return false;
+ }
cfg.id = gid->GetInt();
cfg.blocks = blocks->GetInt();
cfg.threads = threads->GetInt();
cfg.bfactor = bfactor->GetInt();
cfg.bsleep = bsleep->GetInt();
+ cfg.syncMode = syncMode->GetInt();
if(aff->IsNumber())
cfg.cpu_aff = aff->GetInt();
diff --git a/xmrstak/backend/nvidia/jconf.hpp b/xmrstak/backend/nvidia/jconf.hpp
index b09a162..7f60f1d 100644
--- a/xmrstak/backend/nvidia/jconf.hpp
+++ b/xmrstak/backend/nvidia/jconf.hpp
@@ -28,6 +28,7 @@ public:
bool bDoubleMode;
bool bNoPrefetch;
int32_t cpu_aff;
+ int syncMode;
long long iCpuAff;
};
diff --git a/xmrstak/backend/nvidia/minethd.cpp b/xmrstak/backend/nvidia/minethd.cpp
index 9eab1c0..6e628fd 100644
--- a/xmrstak/backend/nvidia/minethd.cpp
+++ b/xmrstak/backend/nvidia/minethd.cpp
@@ -77,6 +77,7 @@ minethd::minethd(miner_work& pWork, size_t iNo, const jconf::thd_cfg& cfg)
ctx.device_threads = (int)cfg.threads;
ctx.device_bfactor = (int)cfg.bfactor;
ctx.device_bsleep = (int)cfg.bsleep;
+ ctx.syncMode = cfg.syncMode;
this->affinity = cfg.cpu_aff;
std::unique_lock<std::mutex> lck(thd_aff_set);
diff --git a/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp b/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp
index 1b63379..afbdbaf 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp
+++ b/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp
@@ -11,7 +11,8 @@ typedef struct {
int device_blocks;
int device_threads;
int device_bfactor;
- int device_bsleep;
+ int device_bsleep;
+ int syncMode;
uint32_t *d_input;
uint32_t inputlen;
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
index a92fa8c..0b175b5 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
@@ -167,10 +167,10 @@ __forceinline__ __device__ uint32_t shuffle(volatile uint32_t* ptr,const uint32_
#endif
}
+template<size_t ITERATIONS, uint32_t THREAD_SHIFT, uint32_t MASK>
#ifdef XMR_STAK_THREADS
__launch_bounds__( XMR_STAK_THREADS * 4 )
#endif
-template<size_t ITERATIONS, uint32_t THREAD_SHIFT, uint32_t MASK>
__global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b )
{
__shared__ uint32_t sharedMemory[1024];
@@ -327,18 +327,22 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx)
for ( int i = 0; i < partcount; i++ )
{
- CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_core_gpu_phase2<ITERATIONS,THREAD_SHIFT,MASK><<<
- grid,
- block4,
- block4.x * sizeof(uint32_t) * static_cast< int >( ctx->device_arch[0] < 3 )
- >>>(
- ctx->device_blocks*ctx->device_threads,
- ctx->device_bfactor,
- i,
- ctx->d_long_state,
- ctx->d_ctx_a,
- ctx->d_ctx_b
- ));
+ CUDA_CHECK_MSG_KERNEL(
+ ctx->device_id,
+ "\n**suggestion: Try to increase the value of the attribute 'bfactor' or \nreduce 'threads' in the NVIDIA config file.**",
+ cryptonight_core_gpu_phase2<ITERATIONS,THREAD_SHIFT,MASK><<<
+ grid,
+ block4,
+ block4.x * sizeof(uint32_t) * static_cast< int >( ctx->device_arch[0] < 3 )
+ >>>(
+ ctx->device_blocks*ctx->device_threads,
+ ctx->device_bfactor,
+ i,
+ ctx->d_long_state,
+ ctx->d_ctx_a,
+ ctx->d_ctx_b
+ )
+ );
if ( partcount > 1 && ctx->device_bsleep > 0) compat_usleep( ctx->device_bsleep );
}
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_device.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_device.hpp
index 078c165..563bb3b 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_device.hpp
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_device.hpp
@@ -9,22 +9,41 @@
/** execute and check a CUDA api command
*
* @param id gpu id (thread id)
+ * @param msg message string which should be added to the error message
* @param ... CUDA api command
*/
-#define CUDA_CHECK(id, ...) { \
- cudaError_t error = __VA_ARGS__; \
- if(error!=cudaSuccess){ \
- std::cerr << "[CUDA] Error gpu " << id << ": <" << __FILE__ << ">:" << __LINE__ << std::endl; \
- throw std::runtime_error(std::string("[CUDA] Error: ") + std::string(cudaGetErrorString(error))); \
- } \
-} \
+#define CUDA_CHECK_MSG(id, msg, ...) { \
+ cudaError_t error = __VA_ARGS__; \
+ if(error!=cudaSuccess){ \
+ std::cerr << "[CUDA] Error gpu " << id << ": <" << __FILE__ << ">:" << __LINE__; \
+ std::cerr << msg << std::endl; \
+ throw std::runtime_error(std::string("[CUDA] Error: ") + std::string(cudaGetErrorString(error))); \
+ } \
+} \
( (void) 0 )
+/** execute and check a CUDA api command
+ *
+ * @param id gpu id (thread id)
+ * @param ... CUDA api command
+ */
+#define CUDA_CHECK(id, ...) CUDA_CHECK_MSG(id, "", __VA_ARGS__)
+
/** execute and check a CUDA kernel
*
* @param id gpu id (thread id)
* @param ... CUDA kernel call
*/
-#define CUDA_CHECK_KERNEL(id, ...) \
- __VA_ARGS__; \
+#define CUDA_CHECK_KERNEL(id, ...) \
+ __VA_ARGS__; \
CUDA_CHECK(id, cudaGetLastError())
+
+/** execute and check a CUDA kernel
+ *
+ * @param id gpu id (thread id)
+ * @param msg message string which should be added to the error message
+ * @param ... CUDA kernel call
+ */
+#define CUDA_CHECK_MSG_KERNEL(id, msg, ...) \
+ __VA_ARGS__; \
+ CUDA_CHECK_MSG(id, msg, cudaGetLastError())
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
index 5501d8d..492201d 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
@@ -189,7 +189,22 @@ extern "C" int cryptonight_extra_cpu_init(nvid_ctx* ctx)
}
cudaDeviceReset();
- cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
+ switch(ctx->syncMode)
+ {
+ case 0:
+ cudaSetDeviceFlags(cudaDeviceScheduleAuto);
+ break;
+ case 1:
+ cudaSetDeviceFlags(cudaDeviceScheduleSpin);
+ break;
+ case 2:
+ cudaSetDeviceFlags(cudaDeviceScheduleYield);
+ break;
+ case 3:
+ cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
+ break;
+
+ };
cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
size_t hashMemSize;
@@ -203,7 +218,6 @@ extern "C" int cryptonight_extra_cpu_init(nvid_ctx* ctx)
}
size_t wsize = ctx->device_blocks * ctx->device_threads;
- CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_long_state, hashMemSize * wsize));
CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_ctx_state, 50 * sizeof(uint32_t) * wsize));
CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_ctx_key1, 40 * sizeof(uint32_t) * wsize));
CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_ctx_key2, 40 * sizeof(uint32_t) * wsize));
@@ -213,6 +227,10 @@ extern "C" int cryptonight_extra_cpu_init(nvid_ctx* ctx)
CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_input, 21 * sizeof (uint32_t ) ));
CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_result_count, sizeof (uint32_t ) ));
CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_result_nonce, 10 * sizeof (uint32_t ) ));
+ CUDA_CHECK_MSG(
+ ctx->device_id,
+ "\n**suggestion: Try to reduce the value of the attribute 'threads' in the NVIDIA config file.**",
+ cudaMalloc(&ctx->d_long_state, hashMemSize * wsize));
return 1;
}
@@ -239,7 +257,11 @@ extern "C" void cryptonight_extra_cpu_final(nvid_ctx* ctx, uint32_t startNonce,
CUDA_CHECK(ctx->device_id, cudaMemset( ctx->d_result_nonce, 0xFF, 10 * sizeof (uint32_t ) ));
CUDA_CHECK(ctx->device_id, cudaMemset( ctx->d_result_count, 0, sizeof (uint32_t ) ));
- CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_final<<<grid, block >>>( wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state ));
+ CUDA_CHECK_MSG_KERNEL(
+ ctx->device_id,
+ "\n**suggestion: Try to increase the value of the attribute 'bfactor' in the NVIDIA config file.**",
+ cryptonight_extra_gpu_final<<<grid, block >>>( wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state )
+ );
CUDA_CHECK(ctx->device_id, cudaMemcpy( rescount, ctx->d_result_count, sizeof (uint32_t ), cudaMemcpyDeviceToHost ));
CUDA_CHECK(ctx->device_id, cudaMemcpy( resnonce, ctx->d_result_nonce, 10 * sizeof (uint32_t ), cudaMemcpyDeviceToHost ));
@@ -380,6 +402,10 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
*/
ctx->device_blocks = props.multiProcessorCount *
( props.major < 3 ? 2 : 3 );
+
+ // increase bfactor for low end devices to avoid that the miner is killed by the OS
+ if(props.multiProcessorCount < 6)
+ ctx->device_bfactor += 2;
}
if(ctx->device_threads == -1)
{
diff --git a/xmrstak/cli/cli-miner.cpp b/xmrstak/cli/cli-miner.cpp
index e152a02..546d226 100644
--- a/xmrstak/cli/cli-miner.cpp
+++ b/xmrstak/cli/cli-miner.cpp
@@ -66,6 +66,9 @@ void help()
cout<<" -v, --version show version number"<<endl;
cout<<" -V, --version-long show long version number"<<endl;
cout<<" -c, --config FILE common miner configuration file"<<endl;
+#ifdef _WIN32
+ cout<<" --noUAC disable the UAC dialog"<<endl;
+#endif
#if (!defined(CONF_NO_AEON)) && (!defined(CONF_NO_MONERO))
cout<<" --currency NAME currency to mine: monero or aeon"<<endl;
#endif
@@ -82,11 +85,17 @@ void help()
cout<<" --nvidia FILE NVIDIA backend miner config file"<<endl;
#endif
cout<<" "<<endl;
- cout<<"The Following options temporary overwrites the config file settings:"<<endl;
+ cout<<"The Following options temporary overwrites the config entries of \nthe pool with the highest weight:"<<endl;
cout<<" -o, --url URL pool url and port, e.g. pool.usxmrpool.com:3333"<<endl;
cout<<" -u, --user USERNAME pool user name or wallet address"<<endl;
cout<<" -p, --pass PASSWD pool password, in the most cases x or empty \"\""<<endl;
cout<<" \n"<<endl;
+#ifdef _WIN32
+ cout<<"Environment variables:\n"<<endl;
+ cout<<" XMRSTAK_NOWAIT disable the dialog `Press any key to exit."<<std::endl;
+ cout<<" for non UAC execution"<<endl;
+ cout<<" \n"<<endl;
+#endif
cout<< "Version: " << get_version_str_short() << endl;
cout<<"Brought to by fireice_uk and psychocrypt under GPLv3."<<endl;
}
@@ -263,6 +272,41 @@ void do_guided_config(bool userSetPasswd)
std::cout<<"Configuration stored in file '"<<params::inst().configFile<<"'"<<std::endl;
}
+#ifdef _WIN32
+/** start the miner as administrator
+ *
+ * This function based on the stackoverflow post
+ * - source: https://stackoverflow.com/a/4893508
+ * - author: Cody Gray
+ * - date: Feb 4 '11
+ */
+void UACDialog(const std::string& binaryName, std::string& args)
+{
+ args += " --noUAC";
+ SHELLEXECUTEINFO shExInfo = {0};
+ shExInfo.cbSize = sizeof(shExInfo);
+ shExInfo.fMask = SEE_MASK_NOCLOSEPROCESS;
+ shExInfo.hwnd = 0;
+ shExInfo.lpVerb = "runas";
+ shExInfo.lpFile = binaryName.c_str();
+ // disable UAC dialog (else the miner will go into a infinite loop)
+ shExInfo.lpParameters = args.c_str();
+ shExInfo.lpDirectory = 0;
+ shExInfo.nShow = SW_SHOW;
+ shExInfo.hInstApp = 0;
+
+ if(ShellExecuteEx(&shExInfo))
+ {
+ printer::inst()->print_msg(L0,
+ "This window has been opened because xmr-stak needed to run as administrator. It can be safely closed now.");
+ WaitForSingleObject(shExInfo.hProcess, INFINITE);
+ CloseHandle(shExInfo.hProcess);
+ // do not start the miner twice
+ std::exit(0);
+ }
+}
+#endif
+
int main(int argc, char *argv[])
{
#ifndef CONF_NO_TLS
@@ -296,6 +340,7 @@ int main(int argc, char *argv[])
}
bool userSetPasswd = false;
+ bool uacDialog = true;
for(int i = 1; i < argc; ++i)
{
std::string opName(argv[i]);
@@ -418,6 +463,10 @@ int main(int argc, char *argv[])
}
params::inst().configFile = argv[i];
}
+ else if(opName.compare("--noUAC") == 0)
+ {
+ uacDialog = false;
+ }
else
{
printer::inst()->print_msg(L0, "Parameter unknown '%s'",argv[i]);
@@ -426,6 +475,20 @@ int main(int argc, char *argv[])
}
}
+#ifdef _WIN32
+ if(uacDialog)
+ {
+ std::string minerArgs;
+ for(int i = 1; i < argc; i++)
+ {
+ minerArgs += " ";
+ minerArgs += argv[i];
+ }
+
+ UACDialog(argv[0], minerArgs);
+ }
+#endif
+
// check if we need a guided start
if(!configEditor::file_exist(params::inst().configFile))
do_guided_config(userSetPasswd);
diff --git a/xmrstak/jconf.cpp b/xmrstak/jconf.cpp
index 462f564..34bde6c 100644
--- a/xmrstak/jconf.cpp
+++ b/xmrstak/jconf.cpp
@@ -158,6 +158,7 @@ bool jconf::GetPoolConfig(size_t id, pool_cfg& cfg)
}
else /* Special case - user selected same weights for everything */
cfg.weight = 0.0;
+ return true;
}
bool jconf::TlsSecureAlgos()
diff --git a/xmrstak/misc/console.cpp b/xmrstak/misc/console.cpp
index 8de5948..980760e 100644
--- a/xmrstak/misc/console.cpp
+++ b/xmrstak/misc/console.cpp
@@ -222,8 +222,13 @@ void printer::print_str(const char* str)
#ifdef _WIN32
void win_exit(size_t code)
{
- printer::inst()->print_str("Press any key to exit.");
- get_key();
+ size_t envSize = 0;
+ getenv_s(&envSize, nullptr, 0, "XMRSTAK_NOWAIT");
+ if(envSize == 0)
+ {
+ printer::inst()->print_str("Press any key to exit.");
+ get_key();
+ }
std::exit(code);
}
diff --git a/xmrstak/misc/executor.cpp b/xmrstak/misc/executor.cpp
index cccfca7..6f34d80 100644
--- a/xmrstak/misc/executor.cpp
+++ b/xmrstak/misc/executor.cpp
@@ -388,10 +388,16 @@ void executor::on_pool_have_job(size_t pool_id, pool_job& oPoolJob)
if(dat.pool_id != pool_id)
{
- if(dat.pool_id == invalid_pool_id)
- printer::inst()->print_msg(L2, "Pool logged in.");
+ jpsock* prev_pool;
+ if(dat.pool_id != invalid_pool_id && (prev_pool = pick_pool_by_id(dat.pool_id)) != nullptr)
+ {
+ if(prev_pool->is_dev_pool())
+ printer::inst()->print_msg(L2, "Switching back to user pool.");
+ else
+ printer::inst()->print_msg(L2, "Pool switched.");
+ }
else
- printer::inst()->print_msg(L2, "Pool switched.");
+ printer::inst()->print_msg(L2, "Pool logged in.");
}
else
printer::inst()->print_msg(L3, "New block detected.");
@@ -459,6 +465,7 @@ void executor::on_miner_result(size_t pool_id, job_result& oResult)
void disable_sigpipe()
{
struct sigaction sa;
+ memset(&sa, 0, sizeof(sa));
sa.sa_handler = SIG_IGN;
sa.sa_flags = 0;
if (sigaction(SIGPIPE, &sa, 0) == -1)
@@ -521,6 +528,30 @@ void executor::ex_main()
pools.emplace_front(0, "donate.xmr-stak.net:4444", "", "", 0.0, true, false, "", true);
}
+ /* find the pool with the highest weighting to allow overwriting of the
+ * pool settings via command line options.
+ */
+ std::vector<jpsock*> sorted_pools;
+ sorted_pools.reserve(pools.size());
+ for(jpsock& pool : pools)
+ sorted_pools.emplace_back(&pool);
+ std::sort(sorted_pools.begin(), sorted_pools.end(), [](jpsock* a, jpsock* b) { return b->get_pool_weight(true) < a->get_pool_weight(true); });
+
+ // overwrite pool address if cli option is used
+ auto& poolURL = xmrstak::params::inst().poolURL;
+ if(!poolURL.empty())
+ {
+ sorted_pools[0]->set_pool_addr(poolURL.c_str());
+ }
+ // overwrite user pool login name if cli option is used
+ auto& poolUsername = xmrstak::params::inst().poolUsername;
+ if(!poolUsername.empty())
+ sorted_pools[0]->set_user_login(poolUsername.c_str());
+ // overwrite user pool login password if cli option is used
+ auto& poolPasswd = xmrstak::params::inst().poolPasswd;
+ if(!poolPasswd.empty())
+ sorted_pools[0]->set_user_passwd(poolPasswd.c_str());
+
ex_event ev;
std::thread clock_thd(&executor::ex_clock_thd, this);
@@ -624,11 +655,11 @@ inline const char* hps_format(double h, char* buf, size_t l)
{
if(std::isnormal(h) || h == 0.0)
{
- snprintf(buf, l, " %03.1f", h);
+ snprintf(buf, l, " %6.1f", h);
return buf;
}
else
- return " (na)";
+ return " (na)";
}
bool executor::motd_filter_console(std::string& motd)
@@ -722,9 +753,9 @@ void executor::hashrate_report(std::string& out)
std::transform(name.begin(), name.end(), name.begin(), ::toupper);
out.append("HASHRATE REPORT - ").append(name).append("\n");
- out.append("| ID | 10s | 60s | 15m |");
+ out.append("| ID | 10s | 60s | 15m |");
if(nthd != 1)
- out.append(" ID | 10s | 60s | 15m |\n");
+ out.append(" ID | 10s | 60s | 15m |\n");
else
out.append(1, '\n');
diff --git a/xmrstak/net/jpsock.hpp b/xmrstak/net/jpsock.hpp
index 9d276b7..ba5d1c8 100644
--- a/xmrstak/net/jpsock.hpp
+++ b/xmrstak/net/jpsock.hpp
@@ -59,6 +59,10 @@ public:
inline const char* get_tls_fp() { return tls_fp.c_str(); }
inline bool is_nicehash() { return nicehash; }
+ inline void set_pool_addr(const char* sAddr) { net_addr = sAddr; }
+ inline void set_user_login(const char* sLogin) { usr_login = sLogin; }
+ inline void set_user_passwd(const char* sPassword) { usr_pass = sPassword; }
+
bool get_pool_motd(std::string& strin);
std::string&& get_call_error();
OpenPOWER on IntegriCloud