-rw-r--r--  CMakeLists.txt                          |  5
-rw-r--r--  backend/amd/autoAdjust.hpp              | 12
-rw-r--r--  backend/nvidia/autoAdjust.hpp           |  6
-rw-r--r--  backend/nvidia/nvcc_code/cryptonight.h  |  4
-rw-r--r--  backend/nvidia/nvcc_code/cuda_extra.cu  | 52
5 files changed, 57 insertions, 22 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8871951..cf776e1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -108,6 +108,11 @@ if(CUDA_ENABLE)
         endforeach()
     elseif("${CUDA_COMPILER}" STREQUAL "nvcc")
 
+        # add c++11 for cuda
+        if(NOT "${CMAKE_CXX_FLAGS}" MATCHES "-std=c\\+\\+11")
+            set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
+        endif()
+
         # avoid that nvcc in CUDA < 8 tries to use libc `memcpy` within the kernel
         if(CUDA_VERSION VERSION_LESS 8.0)
             add_definitions(-D_FORCE_INLINES)
diff --git a/backend/amd/autoAdjust.hpp b/backend/amd/autoAdjust.hpp
index de67456..e6e4015 100644
--- a/backend/amd/autoAdjust.hpp
+++ b/backend/amd/autoAdjust.hpp
@@ -83,13 +83,15 @@ private:
         int i = 0;
         for(auto& ctx : devVec)
         {
-            // use 90% of available memory
-            size_t availableMem = (ctx.freeMem * 100u) / 110;
-            size_t units = ctx.computeUnits;
+            // keep 64MiB memory free (value is randomly chosen)
+            size_t availableMem = ctx.freeMem - (64u * 1024 * 1024);
+            // 224byte extra memory is used per thread for meta data
             size_t perThread = (size_t(1u)<<21) + 224u;
             size_t max_intensity = availableMem / perThread;
+            // 1000 is a magic selected limit \todo select max intensity depending of the gpu type
             size_t intensity = std::min( size_t(1000u) , max_intensity );
-            conf += std::string("  // gpu: ") + ctx.name + "\n";
+            conf += std::string("  // gpu: ") + ctx.name + "\n";
+            // set 8 threads per block (this is a good value for the most gpus)
             conf += std::string("  { \"index\" : ") + std::to_string(ctx.deviceIdx) + ",\n" +
                 "    \"intensity\" : " + std::to_string(intensity) + ", \"worksize\" : " + std::to_string(8) + ",\n" +
                 "    \"affine_to_cpu\" : false, \n"
@@ -101,7 +103,7 @@ private:
         configTpl.replace("NUMGPUS",std::to_string(devVec.size()));
         configTpl.replace("GPUCONFIG",conf);
         configTpl.write("amd.txt");
-        printer::inst()->print_msg(L0, "CPU configuration stored in file '%s'", "amd.txt");
+        printer::inst()->print_msg(L0, "AMD: GPU configuration stored in file '%s'", "amd.txt");
     }
 
     std::vector<GpuContext> devVec;
diff --git a/backend/nvidia/autoAdjust.hpp b/backend/nvidia/autoAdjust.hpp
index 0e6c40f..659bd08 100644
--- a/backend/nvidia/autoAdjust.hpp
+++ b/backend/nvidia/autoAdjust.hpp
@@ -85,11 +85,13 @@ private:
         ConfigEditor configTpl{};
         configTpl.set( std::string(tpl) );
 
+        constexpr size_t byte2mib = 1024u * 1024u;
         std::string conf;
         int i = 0;
         for(auto& ctx : nvidCtxVec)
         {
-
+            conf += std::string("  // gpu: ") + ctx.name + " architecture: " + std::to_string(ctx.device_arch[0] * 10 + ctx.device_arch[1]) + "\n";
+            conf += std::string("  // memory: ") + std::to_string(ctx.free_device_memory / byte2mib) + "/" + std::to_string(ctx.total_device_memory / byte2mib) + " MiB\n";
             conf += std::string("  { \"index\" : ") + std::to_string(ctx.device_id) + ",\n" +
                 "    \"threads\" : " + std::to_string(ctx.device_threads) + ", \"blocks\" : " + std::to_string(ctx.device_blocks) + ",\n" +
                 "    \"bfactor\" : " + std::to_string(ctx.device_bfactor) + ", \"bsleep\" : " + std::to_string(ctx.device_bsleep) + ",\n" +
@@ -100,7 +102,7 @@ private:
 
         configTpl.replace("GPUCONFIG",conf);
         configTpl.write("nvidia.txt");
-        printer::inst()->print_msg(L0, "CPU configuration stored in file '%s'", "nvidia.txt");
+        printer::inst()->print_msg(L0, "NVIDIA: GPU configuration stored in file '%s'", "nvidia.txt");
     }
 
     std::vector<nvid_ctx> nvidCtxVec;
diff --git a/backend/nvidia/nvcc_code/cryptonight.h b/backend/nvidia/nvcc_code/cryptonight.h
index bec9997..784c38d 100644
--- a/backend/nvidia/nvcc_code/cryptonight.h
+++ b/backend/nvidia/nvcc_code/cryptonight.h
@@ -1,6 +1,7 @@
 #pragma once
 
 #include <stdint.h>
+#include <string>
 
 typedef struct {
     int device_id;
@@ -23,6 +24,9 @@ typedef struct {
     uint32_t *d_ctx_key1;
     uint32_t *d_ctx_key2;
     uint32_t *d_ctx_text;
+    std::string name;
+    size_t free_device_memory;
+    size_t total_device_memory;
 } nvid_ctx;
 
 extern "C" {
diff --git a/backend/nvidia/nvcc_code/cuda_extra.cu b/backend/nvidia/nvcc_code/cuda_extra.cu
index 94a4adb..7052bc8 100644
--- a/backend/nvidia/nvcc_code/cuda_extra.cu
+++ b/backend/nvidia/nvcc_code/cuda_extra.cu
@@ -4,6 +4,7 @@
 #include <cuda.h>
 #include <cuda_runtime.h>
 #include <device_functions.hpp>
+#include <algorithm>
 
 #ifdef __CUDACC__
 __constant__
@@ -301,6 +302,8 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
     ctx->device_arch[0] = props.major;
     ctx->device_arch[1] = props.minor;
 
+    ctx->name = std::string(props.name);
+
     // set all evice option those marked as auto (-1) to a valid value
     if(ctx->device_blocks == -1)
     {
@@ -318,27 +321,46 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
         * `8 * ctx->device_threads` threads per block
         */
        ctx->device_threads = 64;
 
+        constexpr size_t byte2mib = 1024u * 1024u;
+
+        // no limit by default 1TiB
+        size_t maxMemUsage = byte2mib * byte2mib;
         if(props.major < 6)
         {
-            // try to stay under 950 threads ( 1900MiB memory per for hashes )
-            while(ctx->device_blocks * ctx->device_threads >= 950 && ctx->device_threads > 2)
-            {
-                ctx->device_threads /= 2;
-            }
+            // limit memory usage for GPUs before pascal
+            maxMemUsage = size_t(2048u) * byte2mib;
+        }
+        if(props.major == 2)
+        {
+            // limit memory usage for sm 20 GPUs
+            maxMemUsage = size_t(1024u) * byte2mib;
         }
 
-        // stay within 85% of the available RAM
-        while(ctx->device_threads > 2)
+        size_t freeMemory = 0;
+        size_t totalMemory = 0;
+        CUDA_CHECK(ctx->device_id, cudaMemGetInfo(&freeMemory, &totalMemory));
+
+        ctx->total_device_memory = totalMemory;
+        ctx->free_device_memory = freeMemory;
+
+        // keep 64MiB memory free (value is randomly chosen)
+        // 200byte are meta data memory (result nonce, ...)
+        size_t availableMem = freeMemory - (64u * 1024 * 1024) - 200u;
+        size_t limitedMemory = std::min(availableMem, maxMemUsage);
+        // up to 920bytes extra memory is used per thread for some kernel (lmem/local memory)
+        // 680bytes are extra meta data memory per hash
+        size_t perThread = size_t(MEMORY) + 740u + 680u;
+        size_t max_intensity = limitedMemory / perThread;
+        ctx->device_threads = max_intensity / ctx->device_blocks;
+        // use only odd number of threads
+        ctx->device_threads = ctx->device_threads & 0xFFFFFFFE;
+
+        if(props.major == 2 && ctx->device_threads > 64)
         {
-            size_t freeMemory = 0;
-            size_t totalMemory = 0;
-            CUDA_CHECK(ctx->device_id, cudaMemGetInfo(&freeMemory, &totalMemory));
-            freeMemory = (freeMemory * size_t(85)) / 100;
-            if( freeMemory > (size_t(ctx->device_blocks) * size_t(ctx->device_threads) * size_t(2u * 1024u * 1024u)) )
-                break;
-            else
-                ctx->device_threads /= 2;
+            // Fermi gpus only support 512 threads per block (we need start 4 * configured threads)
+            ctx->device_threads = 64;
         }
+
     }
 
     return 1;
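
For reference, the new AMD auto-adjust amounts to a few lines of integer arithmetic: reserve 64 MiB of the reported free memory, divide the remainder by the per-hash footprint (2 MiB scratchpad plus 224 byte of meta data), and cap the result at the magic limit of 1000. The standalone sketch below is not part of the commit; the 4 GiB free-memory figure is an assumed example value.

#include <algorithm>
#include <cstddef>
#include <cstdio>

int main()
{
    // assumed example: an OpenCL device reporting 4 GiB of free memory
    size_t freeMem = size_t(4096u) * 1024u * 1024u;

    // keep 64MiB memory free, as the patch does
    size_t availableMem = freeMem - (64u * 1024u * 1024u);
    // 2MiB scratchpad plus 224byte of per-thread meta data
    size_t perThread = (size_t(1u) << 21) + 224u;
    size_t maxIntensity = availableMem / perThread;
    // capped at the magic limit of 1000
    size_t intensity = std::min(size_t(1000u), maxIntensity);

    std::printf("intensity = %zu\n", intensity); // prints 1000 for this example
    return 0;
}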
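
The reworked cuda_extra.cu logic replaces the old halve-until-it-fits loops with a direct computation from the free device memory reported by cudaMemGetInfo. The sketch below reproduces that computation outside the miner; the compute capability, free memory and block count are assumed example values, and MEMORY stands in for the 2 MiB cryptonight scratchpad defined elsewhere in the tree.

#include <algorithm>
#include <cstddef>
#include <cstdio>

int main()
{
    // assumed example device: sm_52, 3.5 GiB free, device_blocks already fixed at 36
    int    smMajor      = 5;
    size_t freeMemory   = size_t(3584u) * 1024u * 1024u;
    size_t deviceBlocks = 36;

    constexpr size_t byte2mib = 1024u * 1024u;
    constexpr size_t MEMORY   = 2u * 1024u * 1024u; // cryptonight scratchpad size

    // no limit by default (1TiB); tightened for pre-Pascal and again for sm 20
    size_t maxMemUsage = byte2mib * byte2mib;
    if(smMajor < 6)
        maxMemUsage = size_t(2048u) * byte2mib;
    if(smMajor == 2)
        maxMemUsage = size_t(1024u) * byte2mib;

    // keep 64MiB free plus 200byte of meta data, then clamp to the per-arch limit
    size_t availableMem  = freeMemory - (64u * 1024u * 1024u) - 200u;
    size_t limitedMemory = std::min(availableMem, maxMemUsage);

    // scratchpad + local memory + per-hash meta data, as in the patch
    size_t perThread    = MEMORY + 740u + 680u;
    size_t maxIntensity = limitedMemory / perThread;

    size_t threads = maxIntensity / deviceBlocks;
    threads &= ~size_t(1); // clear the low bit: round down to an even thread count
    if(smMajor == 2 && threads > 64)
        threads = 64;      // mirror the patch's cap for Fermi cards

    std::printf("device_threads = %zu\n", threads); // prints 28 for this example
    return 0;
}

Note that the mask with 0xFFFFFFFE in the hunk clears the lowest bit, i.e. it rounds device_threads down to an even value, even though the comment above it speaks of an odd number of threads.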