-rw-r--r--  CMakeLists.txt                          |  5
-rw-r--r--  backend/amd/autoAdjust.hpp              | 12
-rw-r--r--  backend/nvidia/autoAdjust.hpp           |  6
-rw-r--r--  backend/nvidia/nvcc_code/cryptonight.h  |  4
-rw-r--r--  backend/nvidia/nvcc_code/cuda_extra.cu  | 52
5 files changed, 57 insertions, 22 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8871951..cf776e1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -108,6 +108,11 @@ if(CUDA_ENABLE)
         endforeach()
     elseif("${CUDA_COMPILER}" STREQUAL "nvcc")
 
+        # add c++11 for cuda
+        if(NOT "${CMAKE_CXX_FLAGS}" MATCHES "-std=c\\+\\+11")
+            set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
+        endif()
+
         # avoid that nvcc in CUDA < 8 tries to use libc `memcpy` within the kernel
         if(CUDA_VERSION VERSION_LESS 8.0)
             add_definitions(-D_FORCE_INLINES)
diff --git a/backend/amd/autoAdjust.hpp b/backend/amd/autoAdjust.hpp
index de67456..e6e4015 100644
--- a/backend/amd/autoAdjust.hpp
+++ b/backend/amd/autoAdjust.hpp
@@ -83,13 +83,15 @@ private:
         int i = 0;
         for(auto& ctx : devVec)
         {
-            // use 90% of available memory
-            size_t availableMem = (ctx.freeMem * 100u) / 110;
-            size_t units = ctx.computeUnits;
+            // keep 64MiB memory free (value is randomly chosen)
+            size_t availableMem = ctx.freeMem - (64u * 1024 * 1024);
+            // 224byte extra memory is used per thread for meta data
             size_t perThread = (size_t(1u)<<21) + 224u;
             size_t max_intensity = availableMem / perThread;
+            // 1000 is a magic selected limit \todo select max intensity depending of the gpu type
             size_t intensity = std::min( size_t(1000u) , max_intensity );
-            conf += std::string("  // gpu: ") + ctx.name + "\n";
+            conf += std::string("  // gpu: ") + ctx.name + "\n";
+            // set 8 threads per block (this is a good value for the most gpus)
             conf += std::string("  { \"index\" : ") + std::to_string(ctx.deviceIdx) + ",\n" +
                 "    \"intensity\" : " + std::to_string(intensity) + ", \"worksize\" : " + std::to_string(8) + ",\n" +
                 "    \"affine_to_cpu\" : false, \n"
@@ -101,7 +103,7 @@ private:
         configTpl.replace("NUMGPUS",std::to_string(devVec.size()));
         configTpl.replace("GPUCONFIG",conf);
         configTpl.write("amd.txt");
-        printer::inst()->print_msg(L0, "CPU configuration stored in file '%s'", "amd.txt");
+        printer::inst()->print_msg(L0, "AMD: GPU configuration stored in file '%s'", "amd.txt");
     }
 
     std::vector<GpuContext> devVec;
diff --git a/backend/nvidia/autoAdjust.hpp b/backend/nvidia/autoAdjust.hpp
index 0e6c40f..659bd08 100644
--- a/backend/nvidia/autoAdjust.hpp
+++ b/backend/nvidia/autoAdjust.hpp
@@ -85,11 +85,13 @@ private:
         ConfigEditor configTpl{};
         configTpl.set( std::string(tpl) );
 
+        constexpr size_t byte2mib = 1024u * 1024u;
         std::string conf;
         int i = 0;
         for(auto& ctx : nvidCtxVec)
         {
-
+            conf += std::string("  // gpu: ") + ctx.name + " architecture: " + std::to_string(ctx.device_arch[0] * 10 + ctx.device_arch[1]) + "\n";
+            conf += std::string("  // memory: ") + std::to_string(ctx.free_device_memory / byte2mib) + "/" + std::to_string(ctx.total_device_memory / byte2mib) + " MiB\n";
             conf += std::string("  { \"index\" : ") + std::to_string(ctx.device_id) + ",\n" +
                 "    \"threads\" : " + std::to_string(ctx.device_threads) + ", \"blocks\" : " + std::to_string(ctx.device_blocks) + ",\n" +
                 "    \"bfactor\" : " + std::to_string(ctx.device_bfactor) + ", \"bsleep\" : " + std::to_string(ctx.device_bsleep) + ",\n" +
@@ -100,7 +102,7 @@ private:
 
         configTpl.replace("GPUCONFIG",conf);
         configTpl.write("nvidia.txt");
-        printer::inst()->print_msg(L0, "CPU configuration stored in file '%s'", "nvidia.txt");
+        printer::inst()->print_msg(L0, "NVIDIA: GPU configuration stored in file '%s'", "nvidia.txt");
     }
 
     std::vector<nvid_ctx> nvidCtxVec;
diff --git a/backend/nvidia/nvcc_code/cryptonight.h b/backend/nvidia/nvcc_code/cryptonight.h
index bec9997..784c38d 100644
--- a/backend/nvidia/nvcc_code/cryptonight.h
+++ b/backend/nvidia/nvcc_code/cryptonight.h
@@ -1,6 +1,7 @@
 #pragma once
 
 #include <stdint.h>
+#include <string>
 
 typedef struct {
     int device_id;
@@ -23,6 +24,9 @@ typedef struct {
     uint32_t *d_ctx_key1;
     uint32_t *d_ctx_key2;
     uint32_t *d_ctx_text;
+    std::string name;
+    size_t free_device_memory;
+    size_t total_device_memory;
 } nvid_ctx;
 
 extern "C" {
diff --git a/backend/nvidia/nvcc_code/cuda_extra.cu b/backend/nvidia/nvcc_code/cuda_extra.cu
index 94a4adb..7052bc8 100644
--- a/backend/nvidia/nvcc_code/cuda_extra.cu
+++ b/backend/nvidia/nvcc_code/cuda_extra.cu
@@ -4,6 +4,7 @@
 #include <cuda.h>
 #include <cuda_runtime.h>
 #include <device_functions.hpp>
+#include <algorithm>
 
 #ifdef __CUDACC__
 __constant__
@@ -301,6 +302,8 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
     ctx->device_arch[0] = props.major;
     ctx->device_arch[1] = props.minor;
 
+    ctx->name = std::string(props.name);
+
     // set all evice option those marked as auto (-1) to a valid value
     if(ctx->device_blocks == -1)
     {
@@ -318,27 +321,46 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
         * `8 * ctx->device_threads` threads per block
         */
        ctx->device_threads = 64;
 
+        constexpr size_t byte2mib = 1024u * 1024u;
+
+        // no limit by default 1TiB
+        size_t maxMemUsage = byte2mib * byte2mib;
         if(props.major < 6)
         {
-            // try to stay under 950 threads ( 1900MiB memory per for hashes )
-            while(ctx->device_blocks * ctx->device_threads >= 950 && ctx->device_threads > 2)
-            {
-                ctx->device_threads /= 2;
-            }
+            // limit memory usage for GPUs before pascal
+            maxMemUsage = size_t(2048u) * byte2mib;
+        }
+        if(props.major == 2)
+        {
+            // limit memory usage for sm 20 GPUs
+            maxMemUsage = size_t(1024u) * byte2mib;
         }
 
-        // stay within 85% of the available RAM
-        while(ctx->device_threads > 2)
+        size_t freeMemory = 0;
+        size_t totalMemory = 0;
+        CUDA_CHECK(ctx->device_id, cudaMemGetInfo(&freeMemory, &totalMemory));
+
+        ctx->total_device_memory = totalMemory;
+        ctx->free_device_memory = freeMemory;
+
+        // keep 64MiB memory free (value is randomly chosen)
+        // 200byte are meta data memory (result nonce, ...)
+        size_t availableMem = freeMemory - (64u * 1024 * 1024) - 200u;
+        size_t limitedMemory = std::min(availableMem, maxMemUsage);
+        // up to 920bytes extra memory is used per thread for some kernel (lmem/local memory)
+        // 680bytes are extra meta data memory per hash
+        size_t perThread = size_t(MEMORY) + 740u + 680u;
+        size_t max_intensity = limitedMemory / perThread;
+        ctx->device_threads = max_intensity / ctx->device_blocks;
+        // use only odd number of threads
+        ctx->device_threads = ctx->device_threads & 0xFFFFFFFE;
+
+        if(props.major == 2 && ctx->device_threads > 64)
         {
-            size_t freeMemory = 0;
-            size_t totalMemory = 0;
-            CUDA_CHECK(ctx->device_id, cudaMemGetInfo(&freeMemory, &totalMemory));
-            freeMemory = (freeMemory * size_t(85)) / 100;
-            if( freeMemory > (size_t(ctx->device_blocks) * size_t(ctx->device_threads) * size_t(2u * 1024u * 1024u)) )
-                break;
-            else
-                ctx->device_threads /= 2;
+            // Fermi gpus only support 512 threads per block (we need start 4 * configured threads)
+            ctx->device_threads = 64;
         }
+
     }
 
     return 1;
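
For reference, the new AMD auto-adjust amounts to a few lines of integer arithmetic: reserve 64 MiB of the reported free memory, divide the remainder by the per-hash footprint (2 MiB scratchpad plus 224 byte of meta data), and cap the result at the magic limit of 1000. The standalone sketch below is not part of the commit; the 4 GiB free-memory figure is an assumed example value.

#include <algorithm>
#include <cstddef>
#include <cstdio>

int main()
{
    // assumed example: an OpenCL device reporting 4 GiB of free memory
    size_t freeMem = size_t(4096u) * 1024u * 1024u;

    // keep 64MiB memory free, as the patch does
    size_t availableMem = freeMem - (64u * 1024u * 1024u);
    // 2MiB scratchpad plus 224byte of per-thread meta data
    size_t perThread = (size_t(1u) << 21) + 224u;
    size_t maxIntensity = availableMem / perThread;
    // capped at the magic limit of 1000
    size_t intensity = std::min(size_t(1000u), maxIntensity);

    std::printf("intensity = %zu\n", intensity); // prints 1000 for this example
    return 0;
}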
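
The reworked cuda_extra.cu logic replaces the old halve-until-it-fits loops with a direct computation from the free device memory reported by cudaMemGetInfo. The sketch below reproduces that computation outside the miner; the compute capability, free memory and block count are assumed example values, and MEMORY stands in for the 2 MiB cryptonight scratchpad defined elsewhere in the tree.

#include <algorithm>
#include <cstddef>
#include <cstdio>

int main()
{
    // assumed example device: sm_52, 3.5 GiB free, device_blocks already fixed at 36
    int    smMajor      = 5;
    size_t freeMemory   = size_t(3584u) * 1024u * 1024u;
    size_t deviceBlocks = 36;

    constexpr size_t byte2mib = 1024u * 1024u;
    constexpr size_t MEMORY   = 2u * 1024u * 1024u; // cryptonight scratchpad size

    // no limit by default (1TiB); tightened for pre-Pascal and again for sm 20
    size_t maxMemUsage = byte2mib * byte2mib;
    if(smMajor < 6)
        maxMemUsage = size_t(2048u) * byte2mib;
    if(smMajor == 2)
        maxMemUsage = size_t(1024u) * byte2mib;

    // keep 64MiB free plus 200byte of meta data, then clamp to the per-arch limit
    size_t availableMem  = freeMemory - (64u * 1024u * 1024u) - 200u;
    size_t limitedMemory = std::min(availableMem, maxMemUsage);

    // scratchpad + local memory + per-hash meta data, as in the patch
    size_t perThread    = MEMORY + 740u + 680u;
    size_t maxIntensity = limitedMemory / perThread;

    size_t threads = maxIntensity / deviceBlocks;
    threads &= ~size_t(1); // clear the low bit: round down to an even thread count
    if(smMajor == 2 && threads > 64)
        threads = 64;      // mirror the patch's cap for Fermi cards

    std::printf("device_threads = %zu\n", threads); // prints 28 for this example
    return 0;
}

Note that the mask with 0xFFFFFFFE in the hunk clears the lowest bit, i.e. it rounds device_threads down to an even value, even though the comment above it speaks of an odd number of threads.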