27 files changed, 363 insertions, 159 deletions
diff --git a/.appveyor.yml b/.appveyor.yml
index 7989cb6..efa65bf 100644
--- a/.appveyor.yml
+++ b/.appveyor.yml
@@ -29,4 +29,4 @@ test_script:
   - cd c:\xmr-stak\build\bin\Release
   - dir
   - copy C:\xmr-stak-dep\openssl\bin\* .
-  - xmr-stak.exe --help
+#  - xmr-stak.exe --help
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 87c0e8a..a444085 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,6 +1,6 @@
 project(xmr-stak)
 
-cmake_minimum_required(VERSION 3.0.1)
+cmake_minimum_required(VERSION 3.1.0)
 
 # enforce C++11
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
@@ -42,23 +42,23 @@ set_property(CACHE XMR-STAK_CURRENCY PROPERTY STRINGS "all;monero;aeon")
 
 set(XMR-STAK_COMPILE "native" CACHE STRING "select CPU compute architecture")
 set_property(CACHE XMR-STAK_COMPILE PROPERTY STRINGS "native;generic")
-if("${XMR-STAK_COMPILE}" STREQUAL "native")
+if(XMR-STAK_COMPILE STREQUAL "native")
     if(NOT CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
         set(CMAKE_CXX_FLAGS "-march=native -mtune=native ${CMAKE_CXX_FLAGS}")
         set(CMAKE_C_FLAGS "-march=native -mtune=native ${CMAKE_C_FLAGS}")
     endif()
-elseif("${XMR-STAK_COMPILE}" STREQUAL "generic")
+elseif(XMR-STAK_COMPILE STREQUAL "generic")
     add_definitions("-DCONF_ENFORCE_OpenCL_1_2=1")
 else()
     message(FATAL_ERROR "XMR-STAK_COMPILE is set to an unknown value '${XMR-STAK_COMPILE}'")
 endif()
 
-if("${XMR-STAK_CURRENCY}" STREQUAL "all")
+if(XMR-STAK_CURRENCY STREQUAL "all")
     message(STATUS "Set miner currency to 'monero' and 'aeon'")
-elseif("${XMR-STAK_CURRENCY}" STREQUAL "aeon")
+elseif(XMR-STAK_CURRENCY STREQUAL "aeon")
     message(STATUS "Set miner currency to 'aeon'")
     add_definitions("-DCONF_NO_MONERO=1")
-elseif("${XMR-STAK_CURRENCY}" STREQUAL "monero")
+elseif(XMR-STAK_CURRENCY STREQUAL "monero")
     message(STATUS "Set miner currency to 'monero'")
     add_definitions("-DCONF_NO_AEON=1")
 endif()
@@ -134,7 +134,7 @@ if(CUDA_ENABLE)
         option(CUDA_SHOW_REGISTER "Show registers used for each kernel and compute architecture" OFF)
         option(CUDA_KEEP_FILES "Keep all intermediate files that are generated during internal compilation steps" OFF)
 
-        if("${CUDA_COMPILER}" STREQUAL "clang")
+        if(CUDA_COMPILER STREQUAL "clang")
             set(CLANG_BUILD_FLAGS "-O3 -x cuda --cuda-path=${CUDA_TOOLKIT_ROOT_DIR}")
             # activation usage of FMA
             set(CLANG_BUILD_FLAGS "${CLANG_BUILD_FLAGS} -ffp-contract=fast")
@@ -152,9 +152,9 @@ if(CUDA_ENABLE)
                 set(CLANG_BUILD_FLAGS "${CLANG_BUILD_FLAGS} --cuda-gpu-arch=sm_${CUDA_ARCH_ELEM}")
             endforeach()
 
-        elseif("${CUDA_COMPILER}" STREQUAL "nvcc")
+        elseif(CUDA_COMPILER STREQUAL "nvcc")
             # add c++11 for cuda
-            if(NOT "${CMAKE_CXX_FLAGS}" MATCHES "-std=c\\+\\+11")
+            if(NOT CMAKE_CXX_FLAGS MATCHES "-std=c\\+\\+11")
                 set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -std=c++11")
             endif()
 
@@ -307,7 +307,7 @@ if(MICROHTTPD_ENABLE)
             ENV "MICROHTTPD_ROOT"
         PATH_SUFFIXES
             lib)
-    if("${MHTD}" STREQUAL "MHTD-NOTFOUND")
+    if(MHTD STREQUAL "MHTD-NOTFOUND")
         message(FATAL_ERROR "microhttpd NOT found: use `-DMICROHTTPD_ENABLE=OFF` to build without http deamon support")
     else()
         set(LIBS ${LIBS} ${MHTD})
@@ -361,7 +361,7 @@ if(HWLOC_ENABLE)
         PATH_SUFFIXES
             lib)
 
-    if("${HWLOC}" STREQUAL "MHTD-NOTFOUND" OR ${HWLOC_INCLUDE_DIR} STREQUAL "HWLOC_INCLUDE_DIR-NOTFOUND")
+    if(HWLOC STREQUAL "HWLOC-NOTFOUND" OR HWLOC_INCLUDE_DIR STREQUAL "HWLOC_INCLUDE_DIR-NOTFOUND")
         message(FATAL_ERROR "hwloc NOT found: use `-DHWLOC_ENABLE=OFF` to build without hwloc support")
     else()
         set(LIBS ${LIBS} ${HWLOC})
@@ -399,10 +399,10 @@ execute_process(
   OUTPUT_STRIP_TRAILING_WHITESPACE
 )
 
-if(NOT "${GIT_COMMIT_HASH}" STREQUAL "")
+if(NOT GIT_COMMIT_HASH STREQUAL "")
 	add_definitions("-DGIT_COMMIT_HASH=${GIT_COMMIT_HASH}")
 endif()
-if(NOT "${GIT_BRANCH}" STREQUAL "")
+if(NOT GIT_BRANCH STREQUAL "")
 	add_definitions("-DGIT_BRANCH=${GIT_BRANCH}")
 endif()
 
@@ -446,7 +446,10 @@ add_library(xmr-stak-c
     ${SRCFILES_C}
 )
 set_property(TARGET xmr-stak-c PROPERTY C_STANDARD 99)
-target_link_libraries(xmr-stak-c ${MHTD} ${LIBS})
+if(MICROHTTPD_ENABLE)
+    target_link_libraries(xmr-stak-c ${MHTD})
+endif()
+target_link_libraries(xmr-stak-c ${LIBS})
 
 # compile generic backend files
 file(GLOB BACKEND_CPP
@@ -470,7 +473,7 @@ if(CUDA_FOUND)
         "xmrstak/backend/nvidia/nvcc_code/*.cu"
         "xmrstak/backend/nvidia/*.cpp")
 
-    if("${CUDA_COMPILER}" STREQUAL "clang")
+    if(CUDA_COMPILER STREQUAL "clang")
         # build device code with clang
         add_library(
             xmrstak_cuda_backend
@@ -530,7 +533,7 @@ endif()
 
 
 # do not install the binary if the project and install are equal
-if( NOT "${CMAKE_INSTALL_PREFIX}" STREQUAL "${PROJECT_BINARY_DIR}" )
+if( NOT CMAKE_INSTALL_PREFIX STREQUAL PROJECT_BINARY_DIR )
     install(TARGETS xmr-stak
             RUNTIME DESTINATION "${CMAKE_INSTALL_PREFIX}/bin")
     if(CUDA_FOUND)
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..3e996ef
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,26 @@
+# NVIDIA CUDA 9.0 base image
+FROM nvidia/cuda:9.0-base
+
+# Default git repository
+ENV GIT_REPOSITORY https://github.com/fireice-uk/xmr-stak.git
+ENV XMRSTAK_CMAKE_FLAGS -DXMR-STAK_COMPILE=generic -DCUDA_ENABLE=ON -DOpenCL_ENABLE=OFF
+
+# Install packages, build xmr-stak and remove the build dependencies again
+RUN apt-get update \
+    && set -x \
+    && apt-get install -qq --no-install-recommends -y ca-certificates cmake cuda-core-9-0 git cuda-cudart-dev-9-0 libhwloc-dev libmicrohttpd-dev libssl-dev \
+    && git clone $GIT_REPOSITORY \
+    && cd /xmr-stak \
+    && cmake ${XMRSTAK_CMAKE_FLAGS} . \
+    && make \
+    && cd - \
+    && mv /xmr-stak/bin/* /usr/local/bin/ \
+    && rm -rf /xmr-stak \
+    && apt-get purge -y -qq cmake cuda-core-9-0 git cuda-cudart-dev-9-0 libhwloc-dev libmicrohttpd-dev libssl-dev \
+    && apt-get clean -qq
+
+VOLUME /mnt
+
+WORKDIR /mnt
+
+ENTRYPOINT ["/usr/local/bin/xmr-stak"]
diff --git a/README.md b/README.md
index 5609d9d..b348928 100644
--- a/README.md
+++ b/README.md
@@ -32,7 +32,7 @@ XMR-Stak is a universal Stratum pool miner. This miner supports CPUs, AMD and NV
 
 ## Download
 
-You can find the latest releases and precompiled binaries on GitHub under [Releases](https://github.com/xmr-stak/xmr-stak/releases).
+You can find the latest releases and precompiled binaries on GitHub under [Releases](https://github.com/fireice-uk/xmr-stak/releases).
 If you are running on Linux (especially Linux VMs), checkout [Linux Portable Binary](doc/Linux_deployment.md).
 
 ## Default Developer Donation
diff --git a/doc/FAQ.md b/doc/FAQ.md
index 215048d..73bc3aa 100644
--- a/doc/FAQ.md
+++ b/doc/FAQ.md
@@ -29,7 +29,7 @@ Reference: http://rybkaforum.net/cgi-bin/rybkaforum/topic_show.pl?pid=259791#pid
 
 ## VirtualAlloc failed
 
-If you set up the user rights properly (see above), and your system has 4-8GB of RAM (50%+ use), there is a significant chance that there simply won't be a large enough chunk of contiguous memory because Windows is fairly bad at mitigating memory fragmentation.
+If you set up the user rights properly ([see above](https://github.com/fireice-uk/xmr-stak/blob/master/doc/FAQ.md#selockmemoryprivilege-failed)), and your system has 4-8GB of RAM (50%+ use), there is a significant chance that there simply won't be a large enough chunk of contiguous memory because Windows is fairly bad at mitigating memory fragmentation.
 
 If that happens, disable all auto-staring applications and run the miner after a reboot.
 
diff --git a/doc/Linux_deployment.md b/doc/Linux_deployment.md
index caf762c..323a97f 100644
--- a/doc/Linux_deployment.md
+++ b/doc/Linux_deployment.md
@@ -1,5 +1,7 @@
 # Deploying portable **XMR-Stak** on Linux systems
 
+**This is an experimental feature.** We reserve the right to remove the binary if we get too many issues.
+
 XMR-Stak releases include a pre-built portable version. If you are simply using it to avoid having to compile the application, you can simply download **xmr-stak-portbin-linux.tar.gz** from our [latest releases](https://github.com/fireice-uk/xmr-stak/releases/latest). Open up command line, and use the following commands:
 
 ```
diff --git a/doc/compile.md b/doc/compile.md
index cf25135..4fa940f 100644
--- a/doc/compile.md
+++ b/doc/compile.md
@@ -9,6 +9,7 @@
 * [Compile on Windows](compile_Windows.md)
 * [Compile on Linux](compile_Linux.md)
 * [Compile on FreeBSD](compile_FreeBSD.md)
+* [Compile on MacOS](compile_MacOS.md)
 
 ## Build System
 
@@ -30,6 +31,7 @@ After the configuration you need to compile the miner, follow the guide for your
 * [Compile in Windows](compile_Windows.md)
 * [Compile in Linux](compile_Linux.md)
 * [Compile in FreeBSD](compile_FreeBSD.md)
+* [Compile in MacOS](compile_MacOS.md)
 
 ## Generic Build Options
 - `CMAKE_INSTALL_PREFIX` install miner to the home folder
diff --git a/doc/compile_MacOS.md b/doc/compile_MacOS.md
new file mode 100644
index 0000000..1b0af91
--- /dev/null
+++ b/doc/compile_MacOS.md
@@ -0,0 +1,31 @@
+# Compile **xmr-stak** for MacOS
+
+## Dependencies
+
+Assuming you already have [Homebrew](https://brew.sh) installed, the installation of dependencies is pretty straightforward and will generate the `xmr-stak` binary in the `bin/` directory.
+
+### For NVIDIA GPUs
+
+```shell
+brew tap caskroom/drivers
+brew cask install nvidia-cuda
+brew install hwloc libmicrohttpd gcc openssl cmake
+cmake . -DOPENSSL_ROOT_DIR=/usr/local/opt/openssl -DOpenCL_ENABLE=OFF
+make install
+```
+
+[All available CMake options](compile.md#nvidia-build-options)
+
+### For AMD GPUs
+
+> 🖐 We need help with AMD GPU compilation instructions. Please submit a PR if you managed to install [AMD APP SDK](http://developer.amd.com/amd-accelerated-parallel-processing-app-sdk/) and to compile `xmr-stak` on MacOS.
+
+### For CPU-only mining
+
+```shell
+brew install hwloc libmicrohttpd gcc openssl cmake
+cmake . -DOPENSSL_ROOT_DIR=/usr/local/opt/openssl -DCUDA_ENABLE=OFF -DOpenCL_ENABLE=OFF
+make install
+```
+
+[All available CMake options](compile.md#cpu-build-options)
diff --git a/doc/compile_Windows.md b/doc/compile_Windows.md
index 532348c..0ce4a0c 100644
--- a/doc/compile_Windows.md
+++ b/doc/compile_Windows.md
@@ -34,8 +34,12 @@
 - download and install the latest version from [http://developer.amd.com/amd-accelerated-parallel-processing-app-sdk/](http://developer.amd.com/amd-accelerated-parallel-processing-app-sdk/)
 
 ### Dependencies OpenSSL/Hwloc and Microhttpd
-- with CUDA 8: download the version 1 of the precompiled binary from [https://github.com/fireice-uk/xmr-stak-dep/releases/download/v1/xmr-stak-dep.zip](https://github.com/fireice-uk/xmr-stak-dep/releases/download/v1/xmr-stak-dep.zip)
-- with CUDA 9: download the version 2 of the precompiled binary from [https://github.com/fireice-uk/xmr-stak-dep/releases/download/v2/xmr-stak-dep.zip](https://github.com/fireice-uk/xmr-stak-dep/releases/download/v2/xmr-stak-dep.zip)
+- for CUDA 8*:
+  - download the version 1 of the precompiled binary from [https://github.com/fireice-uk/xmr-stak-dep/releases/download/v1/xmr-stak-dep.zip](https://github.com/fireice-uk/xmr-stak-dep/releases/download/v1/xmr-stak-dep.zip)
+  - version 1 of the pre-compiled dependencies is not compatible with Visual Studio Toolset v141
+- for CUDA 9 **and/or** AMD GPUs, CPU:
+  - download the version 2 of the precompiled binary from [https://github.com/fireice-uk/xmr-stak-dep/releases/download/v2/xmr-stak-dep.zip](https://github.com/fireice-uk/xmr-stak-dep/releases/download/v2/xmr-stak-dep.zip)
+  - version 2 of the pre-compiled dependencies is not compatible with Visual Studio Toolset v140
 - unzip all to `C:\xmr-stak-dep`
 
 ### Validate the Dependency Folder
@@ -81,15 +85,19 @@
   mkdir build
   cd build
   ```
-  - with CUDA 8
+  - for CUDA 8*
     ```
     cmake -G "Visual Studio 15 2017 Win64" -T v140,host=x64 ..
     ```
-  - with CUDA 9
+  - for CUDA 9 **and/or** AMD GPUs, CPU
     ```
     cmake -G "Visual Studio 15 2017 Win64" -T v141,host=x64 ..
     ```
   ```
   cmake --build . --config Release --target install
   cd bin\Release
+  copy C:\xmr-stak-dep\openssl\bin\* .
   ```
+
+\* Miner is also compiled for AMD GPUs (if the AMD APP SDK is installed) and CPUs.
+CUDA 8 requires a downgrade to the old v140 tool chain.
diff --git a/doc/tuning.md b/doc/tuning.md
index 474553b..8eeefcb 100644
--- a/doc/tuning.md
+++ b/doc/tuning.md
@@ -8,6 +8,7 @@
   * [Choose `intensity` and `worksize`](#choose-intensity-and-worksize)
   * [Add more GPUs](#add-more-gpus)
   * [Increase Memory Pool](#increase-memory-pool)
+  * [Scratchpad Indexing](#scratchpad-indexing)
 
 ## NVIDIA Backend
 
@@ -80,4 +81,9 @@ export GPU_MAX_ALLOC_PERCENT=99
 export GPU_SINGLE_ALLOC_PERCENT=99
 ```
 
-*Note:* Windows user must use `set` instead of `export` to define an environment variable.
-\ No newline at end of file
+*Note:* Windows user must use `set` instead of `export` to define an environment variable.
+
+### Scratchpad Indexing
+
+The layout of the hash scratchpad memory can be changed for each GPU with the option `strided_index` in `amd.txt`.
+Try to change the value from the default `true` to `false`.
diff --git a/doc/usage.md b/doc/usage.md
index 1540a71..b516333 100644
--- a/doc/usage.md
+++ b/doc/usage.md
@@ -20,7 +20,10 @@ The number of files depends on the available backends.
 1) Double click the `xmr-stak.exe` file
 2) Fill in the pool url, username and password
 
-## Usage on Linux
+Setting the environment variable with `set XMRSTAK_NOWAIT=1` disables the `Press any key to exit.` dialog for non-UAC execution.
+
+
+## Usage on Linux & MacOS
 1) Open a terminal within the folder with the binary
 2) Start the miner with `./xmr-stak`
 
@@ -40,6 +43,8 @@ Usage: xmr-stak [OPTION]...
   --cpu FILE            CPU backend miner config file
   --noAMD               disable the AMD miner backend
   --amd FILE            AMD backend miner config file
+  --noNVIDIA            disable the NVIDIA miner backend
+  --nvidia FILE         NVIDIA backend miner config file
 
 The Following options temporary overwrites the config file settings:
   -o, --url URL         pool url and port, e.g. pool.usxmrpool.com:3333
@@ -47,6 +52,22 @@ The Following options temporary overwrites the config file settings:
   -p, --pass PASSWD     pool password, in the most cases x or empty ""
 ```
 
+## Docker image usage
+
+You can run the Docker image the following way:
+
+```
+docker run --rm -it -u $(id -u):$(id -g) --name xmr-stak -v "$PWD":/mnt xmr-stak
+docker stop xmr-stak
+docker run --rm -it -u $(id -u):$(id -g) --name xmr-stak -v "$PWD":/mnt xmr-stak --config config.txt
+```
+
+Debug the docker image by getting inside:
+
+```
+docker run --entrypoint=/bin/bash --rm -it -u $(id -u):$(id -g) --name xmr-stak -v "$PWD":/mnt xmr-stak
+```
+
 ## HTML and JSON API report configuraton
 
 To configure the reports shown on the [README](../README.md) side you need to edit the httpd_port variable. Then enable wifi on your phone and navigate to [miner ip address]:[httpd_port] in your phone browser. If you want to use the data in scripts, you can get the JSON version of the data at url [miner ip address]:[httpd_port]/api.json
diff --git a/scripts/build_xmr-stak_docker/build_xmr-stak_docker.sh b/scripts/build_xmr-stak_docker/build_xmr-stak_docker.sh
index e046cb0..bfee1b8 100755
--- a/scripts/build_xmr-stak_docker/build_xmr-stak_docker.sh
+++ b/scripts/build_xmr-stak_docker/build_xmr-stak_docker.sh
@@ -5,110 +5,131 @@ if [[ $EUID -ne 0 ]]; then
    exit 1
 fi
 
-if [ -d xmr-stak-cpu ]; then
-  git -C xmr-stak-cpu clean -fd
+if [ -d xmr-stak ]; then
+  git -C xmr-stak clean -fd
 else
-  git clone https://github.com/fireice-uk/xmr-stak-cpu.git
+  git clone https://github.com/fireice-uk/xmr-stak.git
 fi
 
+wget -c https://developer.nvidia.com/compute/cuda/9.0/Prod/local_installers/cuda_9.0.176_384.81_linux-run
+chmod a+x cuda_*_linux-run
+
 
 ########################
-# Fedora 26
+# Fedora 27
 ########################
-docker run --rm -it -v $PWD/xmr-stak-cpu:/xmr-stak-cpu fedora:26 /bin/bash -c "
-set -ex ;
-dnf install -y -q gcc gcc-c++ hwloc-devel libmicrohttpd-devel libstdc++-static make openssl-devel cmake ;
-cd /xmr-stak-cpu ;
-cmake -DCMAKE_LINK_STATIC=ON . ;
-make install ;
+# CUDA is not going to work on Fedora 27 because it only supports these distributions: http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html
+docker run --rm -it -v $PWD:/mnt fedora:27 /bin/bash -c "
+set -x ;
+dnf install -y -q cmake gcc-c++ hwloc-devel libmicrohttpd-devel libstdc++-static make openssl-devel;
+cd /mnt/xmr-stak ;
+cmake -DCUDA_ENABLE=OFF -DOpenCL_ENABLE=OFF . ;
+make ;
 "
-mv xmr-stak-cpu/bin/xmr-stak-cpu xmr-stak-cpu_fedora_26
-git -C xmr-stak-cpu clean -fd
+
+test -d fedora_27 || mkdir fedora_27
+mv xmr-stak/bin/* fedora_27
+git -C xmr-stak clean -fd
 
 
 ########################
 # Ubuntu (17.04)
 ########################
-docker run --rm -it -v $PWD/xmr-stak-cpu:/xmr-stak-cpu ubuntu:17.04 /bin/bash -c "
-set -ex ;
+docker run --rm -it -v $PWD:/mnt ubuntu:17.04 /bin/bash -c "
+set -x ;
 apt update -qq ;
 apt install -y -qq libmicrohttpd-dev libssl-dev cmake build-essential libhwloc-dev ;
-cd /xmr-stak-cpu ;
-cmake -DCMAKE_LINK_STATIC=ON . ;
-make install ;
+cd /mnt/xmr-stak ;
+/mnt/cuda_*_linux-run --silent --toolkit ;
+cmake -DCUDA_ENABLE=ON -DOpenCL_ENABLE=OFF . ;
+make ;
 "
-mv xmr-stak-cpu/bin/xmr-stak-cpu xmr-stak-cpu_ubuntu_17.04
-git -C xmr-stak-cpu clean -fd
+
+test -d ubuntu_17.04 || mkdir ubuntu_17.04
+mv xmr-stak/bin/* ubuntu_17.04
+git -C xmr-stak clean -fd
 
 
 ########################
 # Ubuntu 16.04
 ########################
-docker run --rm -it -v $PWD/xmr-stak-cpu:/xmr-stak-cpu ubuntu:16.04 /bin/bash -c "
-set -ex ;
+docker run --rm -it -v $PWD:/mnt ubuntu:16.04 /bin/bash -c "
+set -x ;
 apt update -qq ;
-apt install -y -qq libmicrohttpd-dev libssl-dev cmake build-essential libhwloc-dev ;
-cd /xmr-stak-cpu ;
-cmake -DCMAKE_LINK_STATIC=ON . ;
-make install ;
+apt install -y -qq cmake g++ libmicrohttpd-dev libssl-dev libhwloc-dev ;
+cd /mnt/xmr-stak ;
+/mnt/cuda_*_linux-run --silent --toolkit ;
+cmake -DCUDA_ENABLE=ON -DOpenCL_ENABLE=OFF . ;
+make ;
 "
-mv xmr-stak-cpu/bin/xmr-stak-cpu xmr-stak-cpu_ubuntu_16.04
-git -C xmr-stak-cpu clean -fd
+
+test -d ubuntu_16.04 || mkdir ubuntu_16.04
+mv xmr-stak/bin/* ubuntu_16.04
+git -C xmr-stak clean -fd
 
 
 ########################
 # Ubuntu 14.04
 ########################
-docker run --rm -it -v $PWD/xmr-stak-cpu:/xmr-stak-cpu ubuntu:14.04 /bin/bash -c "
-set -ex ;
+docker run --rm -it -v $PWD:/mnt ubuntu:14.04 /bin/bash -c "
+set -x ;
 apt update -qq ;
 apt install -y -qq curl libmicrohttpd-dev libssl-dev libhwloc-dev software-properties-common ;
 add-apt-repository -y ppa:ubuntu-toolchain-r/test ;
 apt update -qq ;
-apt install -y -qq gcc-7 g++-7 make ;
-update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-7 1 --slave /usr/bin/g++ g++ /usr/bin/g++-7 ;
-curl -L https://cmake.org/files/v3.9/cmake-3.9.0.tar.gz | tar -xzf - -C /tmp/ ;
-( cd /tmp/cmake-3.9.0/ && ./configure && make && sudo make install && cd - ) > /dev/null
+apt install -y -qq gcc-6 g++-6 make ;
+update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-6 1 --slave /usr/bin/g++ g++ /usr/bin/g++-6 ;
+curl -L https://cmake.org/files/v3.10/cmake-3.10.0.tar.gz | tar -xzf - -C /tmp/ ;
+( cd /tmp/cmake-*/ && ./configure && make && sudo make install && cd - ) > /dev/null
 update-alternatives --install /usr/bin/cmake cmake /usr/local/bin/cmake 1 --force ;
-cd /xmr-stak-cpu ;
-cmake -DCMAKE_LINK_STATIC=ON . ;
-make install ;
+cd /mnt/xmr-stak ;
+/mnt/cuda_*_linux-run --silent --toolkit ;
+cmake -DCUDA_ENABLE=ON -DOpenCL_ENABLE=OFF . ;
+make ;
 "
-mv xmr-stak-cpu/bin/xmr-stak-cpu xmr-stak-cpu_ubuntu_14.04
-git -C xmr-stak-cpu clean -fd
+
+test -d ubuntu_14.04 || mkdir ubuntu_14.04
+mv xmr-stak/bin/* ubuntu_14.04
+git -C xmr-stak clean -fd
 
 
 ########################
 # CentOS 7
 ########################
-docker run --rm -it -v $PWD/xmr-stak-cpu:/xmr-stak-cpu centos:7 /bin/bash -c "
-set -ex ;
+# CUDA is not going to work on CentOS/RHEL because it only supports gcc-4 in these distributions: http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html
+docker run --rm -it -v $PWD:/mnt centos:7 /bin/bash -c "
+set -x ;
 yum install -y -q centos-release-scl epel-release ;
-yum install -y -q cmake3 devtoolset-4-gcc* hwloc-devel libmicrohttpd-devel openssl-devel make ;
-scl enable devtoolset-4 - << EOF
-cd /xmr-stak-cpu ;
-cmake3 -DCMAKE_LINK_STATIC=ON . ;
-make install ;
+yum install -y -q cmake3 devtoolset-7-gcc* hwloc-devel libmicrohttpd-devel make openssl-devel perl ;
+scl enable devtoolset-7 - << EOF
+cd /mnt/xmr-stak ;
+cmake3 -DCUDA_ENABLE=OFF -DOpenCL_ENABLE=OFF . ;
+make ;
 EOF
 "
-mv xmr-stak-cpu/bin/xmr-stak-cpu xmr-stak-cpu_centos_7
-git -C xmr-stak-cpu clean -fd
+
+test -d centos_7 || mkdir centos_7
+mv xmr-stak/bin/* centos_7
+git -C xmr-stak clean -fd
 
 
 ########################
 # CentOS 6.x
 ########################
-docker run --rm -it -v $PWD/xmr-stak-cpu:/xmr-stak-cpu centos:6 /bin/bash -c "
-set -ex ;
+# CUDA is not going to work on CentOS/RHEL because it only supports gcc-4 in these distributions: http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html
+docker run --rm -it -v $PWD:/mnt centos:6 /bin/bash -c "
+set -x ;
 yum install -y -q centos-release-scl epel-release ;
-yum install -y -q cmake3 devtoolset-4-gcc* hwloc-devel libmicrohttpd-devel openssl-devel make ;
-scl enable devtoolset-4 - << EOF
-cd /xmr-stak-cpu ;
-cmake3 -DCMAKE_LINK_STATIC=ON . ;
-make install ;
+yum install -y -q cmake3 devtoolset-7-gcc* hwloc-devel libmicrohttpd-devel openssl-devel make ;
+scl enable devtoolset-7 - << EOF
+cd /mnt/xmr-stak ;
+cmake3 -DCUDA_ENABLE=OFF -DOpenCL_ENABLE=OFF . ;
+make ;
 EOF
 "
-mv xmr-stak-cpu/bin/xmr-stak-cpu xmr-stak-cpu_centos_6
-git -C xmr-stak-cpu clean -fd
 
-rm -rf xmr-stak-cpu
+test -d centos_6 || mkdir centos_6
+mv xmr-stak/bin/* centos_6
+git -C xmr-stak clean -fd
+
+rm -rf xmr-stak
diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp
index 15b8457..d9bc962 100644
--- a/xmrstak/backend/amd/amd_gpu/gpu.cpp
+++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp
@@ -332,7 +332,8 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 
 	char options[256];
 	snprintf(options, sizeof(options), 
-		"-DITERATIONS=%d -DMASK=%d -DWORKSIZE=%llu", hasIterations, threadMemMask, int_port(ctx->workSize));
+		"-DITERATIONS=%d -DMASK=%d -DWORKSIZE=%llu -DSTRIDED_INDEX=%d", 
+		hasIterations, threadMemMask, int_port(ctx->workSize), ctx->stridedIndex ? 1 : 0);
 	ret = clBuildProgram(ctx->Program, 1, &ctx->DeviceID, options, NULL, NULL);
 	if(ret != CL_SUCCESS)
 	{
@@ -448,68 +449,85 @@ uint32_t getNumPlatforms()
 std::vector<GpuContext> getAMDDevices(int index)
 {
 	std::vector<GpuContext> ctxVec;
-	cl_platform_id * platforms = NULL;
+	std::vector<cl_platform_id> platforms;
+	std::vector<cl_device_id> device_list;
+
 	cl_int clStatus;
 	cl_uint num_devices;
-	cl_device_id *device_list = NULL;
-
 	uint32_t numPlatforms = getNumPlatforms();
 
-	if(numPlatforms)
+	if(numPlatforms == 0)
+		return ctxVec;
+
+	platforms.resize(numPlatforms);
+	if((clStatus = clGetPlatformIDs(numPlatforms, platforms.data(), NULL)) != CL_SUCCESS)
+	{
+		printer::inst()->print_msg(L1,"WARNING: %s when calling clGetPlatformIDs for platform information.", err_to_str(clStatus));
+		return ctxVec;
+	}
+
+	if((clStatus = clGetDeviceIDs( platforms[index], CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices)) != CL_SUCCESS)
+	{
+		printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceIDs for of devices.", err_to_str(clStatus));
+		return ctxVec;
+	}
+
+	device_list.resize(num_devices);
+	if((clStatus = clGetDeviceIDs( platforms[index], CL_DEVICE_TYPE_GPU, num_devices, device_list.data(), NULL)) != CL_SUCCESS)
+	{
+		printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceIDs for device information.", err_to_str(clStatus));
+		return ctxVec;
+	}
+
+	for (size_t k = 0; k < num_devices; k++)
 	{
-		platforms = (cl_platform_id *) malloc(sizeof(cl_platform_id) * numPlatforms);
-		clStatus = clGetPlatformIDs(numPlatforms, platforms, NULL);
-		if(clStatus == CL_SUCCESS)
+		std::vector<char> devVendorVec(1024);
+		if((clStatus = clGetDeviceInfo(device_list[k], CL_DEVICE_VENDOR, devVendorVec.size(), devVendorVec.data(), NULL)) != CL_SUCCESS)
 		{
-			clStatus = clGetDeviceIDs( platforms[index], CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices);
-			if(clStatus == CL_SUCCESS)
+			printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get the device vendor name for device %u.", err_to_str(clStatus), k);
+			continue;
+		}
+		
+		std::string devVendor(devVendorVec.data());
+		if( devVendor.find("Advanced Micro Devices") != std::string::npos || devVendor.find("AMD") != std::string::npos)
+		{
+			GpuContext ctx;
+			std::vector<char> devNameVec(1024);
+			size_t maxMem;
+
+			if((clStatus = clGetDeviceInfo(device_list[k], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(int), &(ctx.computeUnits), NULL)) != CL_SUCCESS)
+			{
+				printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get CL_DEVICE_MAX_COMPUTE_UNITS for device %u.", err_to_str(clStatus), k);
+				continue;
+			}
+
+			if((clStatus = clGetDeviceInfo(device_list[k], CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &(maxMem), NULL)) != CL_SUCCESS)
 			{
-				device_list = (cl_device_id *) malloc(sizeof(cl_device_id)*num_devices);
-				clStatus = clGetDeviceIDs( platforms[index], CL_DEVICE_TYPE_GPU, num_devices, device_list, NULL);
-				if(clStatus == CL_SUCCESS)
-				{
-					for (int k = 0; k < num_devices; k++)
-					{
-						cl_int clError;
-						std::vector<char> devVendorVec(1024);
-						clError = clGetDeviceInfo(device_list[k], CL_DEVICE_VENDOR, devVendorVec.size(), devVendorVec.data(), NULL);
-						if(clStatus == CL_SUCCESS)
-						{
-							std::string devVendor(devVendorVec.data());
-							if( devVendor.find("Advanced Micro Devices") != std::string::npos)
-							{
-								GpuContext ctx;
-								ctx.deviceIdx = k;
-								clError = clGetDeviceInfo(device_list[k], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(int), &(ctx.computeUnits), NULL);
-								size_t maxMem;
-								clError = clGetDeviceInfo(device_list[k], CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &(maxMem), NULL);
-								clError = clGetDeviceInfo(device_list[k], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(size_t), &(ctx.freeMem), NULL);
-								// if environment variable GPU_SINGLE_ALLOC_PERCENT is not set we can not allocate the full memory
-								ctx.freeMem = std::min(ctx.freeMem, maxMem);
-								std::vector<char> devNameVec(1024);
-								clError = clGetDeviceInfo(device_list[k], CL_DEVICE_NAME, devNameVec.size(), devNameVec.data(), NULL);
-								ctx.name = std::string(devNameVec.data());
-								printer::inst()->print_msg(L0,"Found OpenCL GPU %s.",ctx.name.c_str());
-								ctx.DeviceID = device_list[k];
-								ctxVec.push_back(ctx);
-							}
-						}
-						else
-							printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get the device vendor name.", err_to_str(clStatus));
-					}
-				}
-				else
-					printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceIDs for device information.", err_to_str(clStatus));
-				free(device_list);
+				printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get CL_DEVICE_MAX_MEM_ALLOC_SIZE for device %u.", err_to_str(clStatus), k);
+				continue;
 			}
-			else
-				printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceIDs for of devices.", err_to_str(clStatus));
+
+			if((clStatus = clGetDeviceInfo(device_list[k], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(size_t), &(ctx.freeMem), NULL)) != CL_SUCCESS)
+			{
+				printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get CL_DEVICE_GLOBAL_MEM_SIZE for device %u.", err_to_str(clStatus), k);
+				continue;
+			}
+
+			if((clStatus = clGetDeviceInfo(device_list[k], CL_DEVICE_NAME, devNameVec.size(), devNameVec.data(), NULL)) != CL_SUCCESS)
+			{
+				printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get CL_DEVICE_NAME for device %u.", err_to_str(clStatus), k);
+				continue;
+			}
+			// if environment variable GPU_SINGLE_ALLOC_PERCENT is not set we can not allocate the full memory
+			ctx.freeMem = std::min(ctx.freeMem, maxMem);
+			ctx.deviceIdx = k;
+			ctx.name = std::string(devNameVec.data());
+			ctx.DeviceID = device_list[k];
+			printer::inst()->print_msg(L0,"Found OpenCL GPU %s.",ctx.name.c_str());
+
+			ctxVec.push_back(ctx);
 		}
-		else
-			printer::inst()->print_msg(L1,"WARNING: %s when calling clGetPlatformIDs for platform information.", err_to_str(clStatus));
 	}
-	
-	free(platforms);
 
 	return ctxVec;
 }
@@ -541,7 +559,7 @@ int getAMDPlatformIdx()
 
 			clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, infoSize, platformNameVec.data(), NULL);
 			std::string platformName(platformNameVec.data());
-			if( platformName.find("Advanced Micro Devices") != std::string::npos)
+			if( platformName.find("Advanced Micro Devices") != std::string::npos || platformName.find("Apple") != std::string::npos)
 			{
 				platformIndex = i;
 				printer::inst()->print_msg(L0,"Found AMD platform index id = %i, name = %s",i , platformName.c_str());
diff --git a/xmrstak/backend/amd/amd_gpu/gpu.hpp b/xmrstak/backend/amd/amd_gpu/gpu.hpp
index 123de01..c17bac1 100644
--- a/xmrstak/backend/amd/amd_gpu/gpu.hpp
+++ b/xmrstak/backend/amd/amd_gpu/gpu.hpp
@@ -9,6 +9,7 @@
 #endif
 
 #include <stdint.h>
+#include <string>
 #include <vector>
 
 #define ERR_SUCCESS (0)
@@ -23,6 +24,7 @@ struct GpuContext
 	size_t deviceIdx;
 	size_t rawIntensity;
 	size_t workSize;
+	int stridedIndex;
 
 	/*Output vars*/
 	cl_device_id DeviceID;
diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
index 966199b..255fcbb 100644
--- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
+++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
@@ -411,7 +411,11 @@ void AESExpandKey256(uint *keybuf)
 	}
 }
 
-#define IDX(x)	(x)
+#if(STRIDED_INDEX==0)
+#   define IDX(x)	(x)
+#else
+#   define IDX(x)	((x) * (Threads))
+#endif
 
 __attribute__((reqd_work_group_size(WORKSIZE, 8, 1)))
 __kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ulong *states, ulong Threads)
@@ -440,7 +444,12 @@ __kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ul
 	if(gIdx < Threads)
 	{
 		states += 25 * gIdx;
+
+#if(STRIDED_INDEX==0)
 		Scratchpad += gIdx * (ITERATIONS >> 2);
+#else
+		Scratchpad += gIdx;
+#endif
 
 		((ulong8 *)State)[0] = vload8(0, input);
 		State[8] = input[8];
@@ -519,7 +528,11 @@ __kernel void cn1(__global uint4 *Scratchpad, __global ulong *states, ulong Thre
 	if(gIdx < Threads)
 	{
 		states += 25 * gIdx;
+#if(STRIDED_INDEX==0)
 		Scratchpad += gIdx * (ITERATIONS >> 2);
+#else
+		Scratchpad += gIdx;
+#endif
 
 		a[0] = states[0] ^ states[4];
 		b[0] = states[2] ^ states[6];
@@ -588,7 +601,11 @@ __kernel void cn2(__global uint4 *Scratchpad, __global ulong *states, __global u
 	if(gIdx < Threads)
 	{
 		states += 25 * gIdx;
+#if(STRIDED_INDEX==0)
 		Scratchpad += gIdx * (ITERATIONS >> 2);
+#else
+		Scratchpad += gIdx;
+#endif
 
 		#if defined(__Tahiti__) || defined(__Pitcairn__)
 
diff --git a/xmrstak/backend/amd/amd_gpu/opencl/wolf-aes.cl b/xmrstak/backend/amd/amd_gpu/opencl/wolf-aes.cl
index 996944b..81e1644 100644
--- a/xmrstak/backend/amd/amd_gpu/opencl/wolf-aes.cl
+++ b/xmrstak/backend/amd/amd_gpu/opencl/wolf-aes.cl
@@ -74,15 +74,29 @@ static const __constant uint AES0_C[256] =
 
 #define BYTE(x, y)	(amd_bfe((x), (y) << 3U, 8U))
 
-uint4 AES_Round(const __local uint *AES0, const __local uint *AES1, const __local uint *AES2, const __local uint *AES3, const uint4 X, const uint4 key)
+uint4 AES_Round(const __local uint *AES0, const __local uint *AES1, const __local uint *AES2, const __local uint *AES3, const uint4 X, uint4 key)
 {
-	uint4 Y;
-	Y.s0 = AES0[BYTE(X.s0, 0)] ^ AES1[BYTE(X.s1, 1)] ^ AES2[BYTE(X.s2, 2)] ^ AES3[BYTE(X.s3, 3)];
-    Y.s1 = AES0[BYTE(X.s1, 0)] ^ AES1[BYTE(X.s2, 1)] ^ AES2[BYTE(X.s3, 2)] ^ AES3[BYTE(X.s0, 3)];
-    Y.s2 = AES0[BYTE(X.s2, 0)] ^ AES1[BYTE(X.s3, 1)] ^ AES2[BYTE(X.s0, 2)] ^ AES3[BYTE(X.s1, 3)];
-    Y.s3 = AES0[BYTE(X.s3, 0)] ^ AES1[BYTE(X.s0, 1)] ^ AES2[BYTE(X.s1, 2)] ^ AES3[BYTE(X.s2, 3)];
-    Y ^= key;
-    return(Y);
+	key.s0 ^= AES0[BYTE(X.s0, 0)];
+    key.s1 ^= AES0[BYTE(X.s1, 0)];
+    key.s2 ^= AES0[BYTE(X.s2, 0)];
+    key.s3 ^= AES0[BYTE(X.s3, 0)];
+
+	key.s0 ^= AES2[BYTE(X.s2, 2)];
+    key.s1 ^= AES2[BYTE(X.s3, 2)];
+    key.s2 ^= AES2[BYTE(X.s0, 2)];
+    key.s3 ^= AES2[BYTE(X.s1, 2)];
+
+	key.s0 ^= AES1[BYTE(X.s1, 1)];
+    key.s1 ^= AES1[BYTE(X.s2, 1)];
+    key.s2 ^= AES1[BYTE(X.s3, 1)];
+    key.s3 ^= AES1[BYTE(X.s0, 1)];
+
+	key.s0 ^= AES3[BYTE(X.s3, 3)];
+    key.s1 ^= AES3[BYTE(X.s0, 3)];
+    key.s2 ^= AES3[BYTE(X.s1, 3)];
+    key.s3 ^= AES3[BYTE(X.s2, 3)];
+
+    return key;
 }
 
 #endif
diff --git a/xmrstak/backend/amd/autoAdjust.hpp b/xmrstak/backend/amd/autoAdjust.hpp
index 0b91212..0bc5239 100644
--- a/xmrstak/backend/amd/autoAdjust.hpp
+++ b/xmrstak/backend/amd/autoAdjust.hpp
@@ -123,7 +123,7 @@ private:
 			// set 8 threads per block (this is a good value for the most gpus)
 			conf += std::string("  { \"index\" : ") + std::to_string(ctx.deviceIdx) + ",\n" +
 				"    \"intensity\" : " + std::to_string(intensity) + ", \"worksize\" : " + std::to_string(8) + ",\n" +
-				"    \"affine_to_cpu\" : false, \n"
+				"    \"affine_to_cpu\" : false, \"strided_index\" : true\n"
 				"  },\n";
 			++i;
 		}
diff --git a/xmrstak/backend/amd/config.tpl b/xmrstak/backend/amd/config.tpl
index a93859c..af662f8 100644
--- a/xmrstak/backend/amd/config.tpl
+++ b/xmrstak/backend/amd/config.tpl
@@ -5,9 +5,12 @@ R"===(
  *  intensity    - Number of parallel GPU threads (nothing to do with CPU threads)
  *   worksize    - Number of local GPU threads (nothing to do with CPU threads)
  * affine_to_cpu - This will affine the thread to a CPU. This can make a GPU miner play along nicer with a CPU miner.
+ * strided_index - switch memory pattern used for the scratch pad memory
+ *                 true  = use 16 byte contiguous memory per thread, the next memory block has an offset of intensity blocks
+ *                 false = use a contiguous block of memory per thread
  * "gpu_threads_conf" :
  * [
- *	{ "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false },
+ *	{ "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false, "strided_index" : true },
  * ],
  */
 
diff --git a/xmrstak/backend/amd/jconf.cpp b/xmrstak/backend/amd/jconf.cpp
index 0617aeb..07afb19 100644
--- a/xmrstak/backend/amd/jconf.cpp
+++ b/xmrstak/backend/amd/jconf.cpp
@@ -103,13 +103,14 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg)
 	if(!oThdConf.IsObject())
 		return false;
 
-	const Value *idx, *intensity, *w_size, *aff;
+	const Value *idx, *intensity, *w_size, *aff, *stridedIndex;
 	idx = GetObjectMember(oThdConf, "index");
 	intensity = GetObjectMember(oThdConf, "intensity");
 	w_size = GetObjectMember(oThdConf, "worksize");
 	aff = GetObjectMember(oThdConf, "affine_to_cpu");
+	stridedIndex = GetObjectMember(oThdConf, "strided_index");
 
-	if(idx == nullptr || intensity == nullptr || w_size == nullptr || aff == nullptr)
+	if(idx == nullptr || intensity == nullptr || w_size == nullptr || aff == nullptr || stridedIndex == nullptr)
 		return false;
 
 	if(!idx->IsUint64() || !intensity->IsUint64() || !w_size->IsUint64())
@@ -118,9 +119,13 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg)
 	if(!aff->IsUint64() && !aff->IsBool())
 		return false;
 
+	if(!stridedIndex->IsBool())
+		return false;
+
 	cfg.index = idx->GetUint64();
 	cfg.intensity = intensity->GetUint64();
 	cfg.w_size = w_size->GetUint64();
+	cfg.stridedIndex = stridedIndex->GetBool();
 
 	if(aff->IsNumber())
 		cfg.cpu_aff = aff->GetInt64();
diff --git a/xmrstak/backend/amd/jconf.hpp b/xmrstak/backend/amd/jconf.hpp
index da024a4..ee1882a 100644
--- a/xmrstak/backend/amd/jconf.hpp
+++ b/xmrstak/backend/amd/jconf.hpp
@@ -26,6 +26,7 @@ public:
 		size_t intensity;
 		size_t w_size;
 		long long cpu_aff;
+		bool stridedIndex;
 	};
 
 	size_t GetThreadCount();
diff --git a/xmrstak/backend/amd/minethd.cpp b/xmrstak/backend/amd/minethd.cpp
index c1399e0..103688f 100644
--- a/xmrstak/backend/amd/minethd.cpp
+++ b/xmrstak/backend/amd/minethd.cpp
@@ -96,6 +96,7 @@ bool minethd::init_gpus()
 		vGpuData[i].deviceIdx = cfg.index;
 		vGpuData[i].rawIntensity = cfg.intensity;
 		vGpuData[i].workSize = cfg.w_size;
+		vGpuData[i].stridedIndex = cfg.stridedIndex;
 	}
 
 	return InitOpenCL(vGpuData.data(), n, jconf::inst()->GetPlatformIdx()) == ERR_SUCCESS;
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
index a92fa8c..dba6676 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
@@ -167,10 +167,10 @@ __forceinline__ __device__ uint32_t shuffle(volatile uint32_t* ptr,const uint32_
 #endif
 }
 
+template<size_t ITERATIONS, uint32_t THREAD_SHIFT, uint32_t MASK>
 #ifdef XMR_STAK_THREADS
 __launch_bounds__( XMR_STAK_THREADS * 4 )
 #endif
-template<size_t ITERATIONS, uint32_t THREAD_SHIFT, uint32_t MASK>
 __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b )
 {
 	__shared__ uint32_t sharedMemory[1024];
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
index 5501d8d..333ae73 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
@@ -380,6 +380,10 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
 		 */
 		ctx->device_blocks = props.multiProcessorCount *
 			( props.major < 3 ? 2 : 3 );
+
+		// increase bfactor for low end devices to avoid the miner being killed by the OS
+		if(props.multiProcessorCount < 6)
+			ctx->device_bfactor += 2;
 	}
 	if(ctx->device_threads == -1)
 	{
diff --git a/xmrstak/cli/cli-miner.cpp b/xmrstak/cli/cli-miner.cpp
index e152a02..e32733b 100644
--- a/xmrstak/cli/cli-miner.cpp
+++ b/xmrstak/cli/cli-miner.cpp
@@ -87,6 +87,12 @@ void help()
 	cout<<"  -u, --user USERNAME   pool user name or wallet address"<<endl;
 	cout<<"  -p, --pass PASSWD     pool password, in the most cases x or empty \"\""<<endl;
 	cout<<" \n"<<endl;
+#ifdef _WIN32
+	cout<<"Environment variables:\n"<<endl;
+	cout<<"  XMRSTAK_NOWAIT        disable the dialog `Press any key to exit."<<std::endl;
+	cout<<"                	       for non UAC execution"<<endl;
+	cout<<" \n"<<endl;
+#endif
 	cout<< "Version: " << get_version_str_short() << endl;
 	cout<<"Brought to by fireice_uk and psychocrypt under GPLv3."<<endl;
 }
diff --git a/xmrstak/jconf.cpp b/xmrstak/jconf.cpp
index 462f564..34bde6c 100644
--- a/xmrstak/jconf.cpp
+++ b/xmrstak/jconf.cpp
@@ -158,6 +158,7 @@ bool jconf::GetPoolConfig(size_t id, pool_cfg& cfg)
 	}
 	else /* Special case - user selected same weights for everything */
 		cfg.weight = 0.0;
+	return true;
 }
 
 bool jconf::TlsSecureAlgos()
diff --git a/xmrstak/misc/console.cpp b/xmrstak/misc/console.cpp
index 8de5948..980760e 100644
--- a/xmrstak/misc/console.cpp
+++ b/xmrstak/misc/console.cpp
@@ -222,8 +222,13 @@ void printer::print_str(const char* str)
 #ifdef _WIN32
 void win_exit(size_t code)
 {
-	printer::inst()->print_str("Press any key to exit.");
-	get_key();
+	size_t envSize = 0;
+	getenv_s(&envSize, nullptr, 0, "XMRSTAK_NOWAIT");
+	if(envSize == 0)
+	{
+		printer::inst()->print_str("Press any key to exit.");
+		get_key();
+	}
 	std::exit(code);
 }
 
diff --git a/xmrstak/misc/executor.cpp b/xmrstak/misc/executor.cpp
index cccfca7..c500b21 100644
--- a/xmrstak/misc/executor.cpp
+++ b/xmrstak/misc/executor.cpp
@@ -388,10 +388,16 @@ void executor::on_pool_have_job(size_t pool_id, pool_job& oPoolJob)
 
 	if(dat.pool_id != pool_id)
 	{
-		if(dat.pool_id == invalid_pool_id)
-			printer::inst()->print_msg(L2, "Pool logged in.");
+		jpsock* prev_pool;
+		if(dat.pool_id != invalid_pool_id && (prev_pool = pick_pool_by_id(dat.pool_id)) != nullptr)
+		{
+			if(prev_pool->is_dev_pool())
+				printer::inst()->print_msg(L2, "Switching back to user pool.");
+			else
+				printer::inst()->print_msg(L2, "Pool switched.");
+		}
 		else
-			printer::inst()->print_msg(L2, "Pool switched.");
+			printer::inst()->print_msg(L2, "Pool logged in.");
 	}
 	else
 		printer::inst()->print_msg(L3, "New block detected.");
@@ -459,6 +465,7 @@ void executor::on_miner_result(size_t pool_id, job_result& oResult)
 void disable_sigpipe()
 {
 	struct sigaction sa;
+	memset(&sa, 0, sizeof(sa));
 	sa.sa_handler = SIG_IGN;
 	sa.sa_flags = 0;
 	if (sigaction(SIGPIPE, &sa, 0) == -1)
@@ -624,11 +631,11 @@ inline const char* hps_format(double h, char* buf, size_t l)
 {
 	if(std::isnormal(h) || h == 0.0)
 	{
-		snprintf(buf, l, " %03.1f", h);
+		snprintf(buf, l, " %6.1f", h);
 		return buf;
 	}
 	else
-		return " (na)";
+		return "   (na)";
 }
 
 bool executor::motd_filter_console(std::string& motd)
@@ -722,9 +729,9 @@ void executor::hashrate_report(std::string& out)
 			std::transform(name.begin(), name.end(), name.begin(), ::toupper);
 			
 			out.append("HASHRATE REPORT - ").append(name).append("\n");
-			out.append("| ID | 10s |  60s |  15m |");
+			out.append("| ID |    10s |    60s |    15m |");
 			if(nthd != 1)
-				out.append(" ID | 10s |  60s |  15m |\n");
+				out.append(" ID |    10s |    60s |    15m |\n");
 			else
 				out.append(1, '\n');