summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--CMakeLists.txt5
-rw-r--r--backend/amd/autoAdjust.hpp12
-rw-r--r--backend/nvidia/autoAdjust.hpp6
-rw-r--r--backend/nvidia/nvcc_code/cryptonight.h4
-rw-r--r--backend/nvidia/nvcc_code/cuda_extra.cu52
5 files changed, 57 insertions, 22 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8871951..cf776e1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -108,6 +108,11 @@ if(CUDA_ENABLE)
endforeach()
elseif("${CUDA_COMPILER}" STREQUAL "nvcc")
+ # add c++11 for cuda
+ if(NOT "${CMAKE_CXX_FLAGS}" MATCHES "-std=c\\+\\+11")
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
+ endif()
+
# prevent nvcc in CUDA < 8 from trying to use libc `memcpy` within the kernel
if(CUDA_VERSION VERSION_LESS 8.0)
add_definitions(-D_FORCE_INLINES)
diff --git a/backend/amd/autoAdjust.hpp b/backend/amd/autoAdjust.hpp
index de67456..e6e4015 100644
--- a/backend/amd/autoAdjust.hpp
+++ b/backend/amd/autoAdjust.hpp
@@ -83,13 +83,15 @@ private:
int i = 0;
for(auto& ctx : devVec)
{
- // use 90% of available memory
- size_t availableMem = (ctx.freeMem * 100u) / 110;
- size_t units = ctx.computeUnits;
+ // keep 64MiB memory free (value is randomly chosen)
+ size_t availableMem = ctx.freeMem - (64u * 1024 * 1024);
+ // 224 bytes of extra memory are used per thread for meta data
size_t perThread = (size_t(1u)<<21) + 224u;
size_t max_intensity = availableMem / perThread;
+ // 1000 is a magically chosen limit \todo select max intensity depending on the gpu type
size_t intensity = std::min( size_t(1000u) , max_intensity );
- conf += std::string(" // gpu: ") + ctx.name + "\n";
+ conf += std::string(" // gpu: ") + ctx.name + "\n";
+ // set 8 threads per block (this is a good value for most GPUs)
conf += std::string(" { \"index\" : ") + std::to_string(ctx.deviceIdx) + ",\n" +
" \"intensity\" : " + std::to_string(intensity) + ", \"worksize\" : " + std::to_string(8) + ",\n" +
" \"affine_to_cpu\" : false, \n"
@@ -101,7 +103,7 @@ private:
configTpl.replace("NUMGPUS",std::to_string(devVec.size()));
configTpl.replace("GPUCONFIG",conf);
configTpl.write("amd.txt");
- printer::inst()->print_msg(L0, "CPU configuration stored in file '%s'", "amd.txt");
+ printer::inst()->print_msg(L0, "AMD: GPU configuration stored in file '%s'", "amd.txt");
}
std::vector<GpuContext> devVec;
diff --git a/backend/nvidia/autoAdjust.hpp b/backend/nvidia/autoAdjust.hpp
index 0e6c40f..659bd08 100644
--- a/backend/nvidia/autoAdjust.hpp
+++ b/backend/nvidia/autoAdjust.hpp
@@ -85,11 +85,13 @@ private:
ConfigEditor configTpl{};
configTpl.set( std::string(tpl) );
+ constexpr size_t byte2mib = 1024u * 1024u;
std::string conf;
int i = 0;
for(auto& ctx : nvidCtxVec)
{
-
+ conf += std::string(" // gpu: ") + ctx.name + " architecture: " + std::to_string(ctx.device_arch[0] * 10 + ctx.device_arch[1]) + "\n";
+ conf += std::string(" // memory: ") + std::to_string(ctx.free_device_memory / byte2mib) + "/" + std::to_string(ctx.total_device_memory / byte2mib) + " MiB\n";
conf += std::string(" { \"index\" : ") + std::to_string(ctx.device_id) + ",\n" +
" \"threads\" : " + std::to_string(ctx.device_threads) + ", \"blocks\" : " + std::to_string(ctx.device_blocks) + ",\n" +
" \"bfactor\" : " + std::to_string(ctx.device_bfactor) + ", \"bsleep\" : " + std::to_string(ctx.device_bsleep) + ",\n" +
@@ -100,7 +102,7 @@ private:
configTpl.replace("GPUCONFIG",conf);
configTpl.write("nvidia.txt");
- printer::inst()->print_msg(L0, "CPU configuration stored in file '%s'", "nvidia.txt");
+ printer::inst()->print_msg(L0, "NVIDIA: GPU configuration stored in file '%s'", "nvidia.txt");
}
std::vector<nvid_ctx> nvidCtxVec;
diff --git a/backend/nvidia/nvcc_code/cryptonight.h b/backend/nvidia/nvcc_code/cryptonight.h
index bec9997..784c38d 100644
--- a/backend/nvidia/nvcc_code/cryptonight.h
+++ b/backend/nvidia/nvcc_code/cryptonight.h
@@ -1,6 +1,7 @@
#pragma once
#include <stdint.h>
+#include <string>
typedef struct {
int device_id;
@@ -23,6 +24,9 @@ typedef struct {
uint32_t *d_ctx_key1;
uint32_t *d_ctx_key2;
uint32_t *d_ctx_text;
+ std::string name;
+ size_t free_device_memory;
+ size_t total_device_memory;
} nvid_ctx;
extern "C" {
diff --git a/backend/nvidia/nvcc_code/cuda_extra.cu b/backend/nvidia/nvcc_code/cuda_extra.cu
index 94a4adb..7052bc8 100644
--- a/backend/nvidia/nvcc_code/cuda_extra.cu
+++ b/backend/nvidia/nvcc_code/cuda_extra.cu
@@ -4,6 +4,7 @@
#include <cuda.h>
#include <cuda_runtime.h>
#include <device_functions.hpp>
+#include <algorithm>
#ifdef __CUDACC__
__constant__
@@ -301,6 +302,8 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
ctx->device_arch[0] = props.major;
ctx->device_arch[1] = props.minor;
+ ctx->name = std::string(props.name);
+
// set all device options marked as auto (-1) to a valid value
if(ctx->device_blocks == -1)
{
@@ -318,27 +321,46 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
* `8 * ctx->device_threads` threads per block
*/
ctx->device_threads = 64;
+ constexpr size_t byte2mib = 1024u * 1024u;
+
+ // no limit by default (1 TiB)
+ size_t maxMemUsage = byte2mib * byte2mib;
if(props.major < 6)
{
- // try to stay under 950 threads ( 1900MiB memory per for hashes )
- while(ctx->device_blocks * ctx->device_threads >= 950 && ctx->device_threads > 2)
- {
- ctx->device_threads /= 2;
- }
+ // limit memory usage for GPUs before pascal
+ maxMemUsage = size_t(2048u) * byte2mib;
+ }
+ if(props.major == 2)
+ {
+ // limit memory usage for sm 20 GPUs
+ maxMemUsage = size_t(1024u) * byte2mib;
}
- // stay within 85% of the available RAM
- while(ctx->device_threads > 2)
+ size_t freeMemory = 0;
+ size_t totalMemory = 0;
+ CUDA_CHECK(ctx->device_id, cudaMemGetInfo(&freeMemory, &totalMemory));
+
+ ctx->total_device_memory = totalMemory;
+ ctx->free_device_memory = freeMemory;
+
+ // keep 64 MiB of memory free (value is chosen arbitrarily)
+ // 200 bytes are meta data memory (result nonce, ...)
+ size_t availableMem = freeMemory - (64u * 1024 * 1024) - 200u;
+ size_t limitedMemory = std::min(availableMem, maxMemUsage);
+ // up to 920 bytes of extra memory may be used per thread by some kernels (lmem/local memory)
+ // NOTE(review): the code below reserves 740 bytes per thread — confirm against the 920-byte figure above
+ // 680 bytes of extra meta data memory are used per hash
+ size_t perThread = size_t(MEMORY) + 740u + 680u;
+ size_t max_intensity = limitedMemory / perThread;
+ ctx->device_threads = max_intensity / ctx->device_blocks;
+ // use only an even number of threads (the mask below clears bit 0)
+ ctx->device_threads = ctx->device_threads & 0xFFFFFFFE;
+
+ if(props.major == 2 && ctx->device_threads > 64)
{
- size_t freeMemory = 0;
- size_t totalMemory = 0;
- CUDA_CHECK(ctx->device_id, cudaMemGetInfo(&freeMemory, &totalMemory));
- freeMemory = (freeMemory * size_t(85)) / 100;
- if( freeMemory > (size_t(ctx->device_blocks) * size_t(ctx->device_threads) * size_t(2u * 1024u * 1024u)) )
- break;
- else
- ctx->device_threads /= 2;
+ // Fermi GPUs support only 512 threads per block (we need to start 4 * the configured threads)
+ ctx->device_threads = 64;
}
+
}
return 1;
OpenPOWER on IntegriCloud