diff options
Diffstat (limited to 'xmrstak/backend/nvidia')
-rw-r--r-- | xmrstak/backend/nvidia/config.tpl | 3 | ||||
-rw-r--r-- | xmrstak/backend/nvidia/minethd.cpp | 45 | ||||
-rw-r--r-- | xmrstak/backend/nvidia/minethd.hpp | 10 | ||||
-rw-r--r-- | xmrstak/backend/nvidia/nvcc_code/cuda_core.cu | 12 | ||||
-rw-r--r-- | xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu | 14 |
5 files changed, 65 insertions, 19 deletions
diff --git a/xmrstak/backend/nvidia/config.tpl b/xmrstak/backend/nvidia/config.tpl index 5479172..f489956 100644 --- a/xmrstak/backend/nvidia/config.tpl +++ b/xmrstak/backend/nvidia/config.tpl @@ -26,6 +26,9 @@ R"===( * "affine_to_cpu" : false, "sync_mode" : 3, * }, * ], + * If you do not wish to mine with your nVidia GPU(s) then use: + * "gpu_threads_conf" : + * null, */ "gpu_threads_conf" : diff --git a/xmrstak/backend/nvidia/minethd.cpp b/xmrstak/backend/nvidia/minethd.cpp index 5564596..867a998 100644 --- a/xmrstak/backend/nvidia/minethd.cpp +++ b/xmrstak/backend/nvidia/minethd.cpp @@ -80,14 +80,22 @@ minethd::minethd(miner_work& pWork, size_t iNo, const jconf::thd_cfg& cfg) ctx.syncMode = cfg.syncMode; this->affinity = cfg.cpu_aff; - std::unique_lock<std::mutex> lck(thd_aff_set); - std::future<void> order_guard = order_fix.get_future(); + std::future<void> numa_guard = numa_promise.get_future(); + thread_work_guard = thread_work_promise.get_future(); oWorkThd = std::thread(&minethd::work_main, this); - order_guard.wait(); + /* Wait until the gpu memory is initialized and numa cpu memory is pinned. + * The startup time is reduced if the memory is initialized in sequential order + * without concurrent threads (CUDA driver is less occupied). + */ + numa_guard.wait(); +} - if(affinity >= 0) //-1 means no affinity +void minethd::start_mining() +{ + thread_work_promise.set_value(); + if(this->affinity >= 0) //-1 means no affinity if(!cpu::minethd::thd_setaffinity(oWorkThd.native_handle(), affinity)) printer::inst()->print_msg(L1, "WARNING setting affinity failed."); } @@ -166,7 +174,7 @@ std::vector<iBackend*>* minethd::thread_starter(uint32_t threadOffset, miner_wor if(cfg.cpu_aff >= 0) { #if defined(__APPLE__) - printer::inst()->print_msg(L1, "WARNING on MacOS thread affinity is only advisory."); + printer::inst()->print_msg(L1, "WARNING on macOS thread affinity is only advisory."); #endif printer::inst()->print_msg(L1, "Starting NVIDIA GPU thread %d, affinity: %d.", i, (int)cfg.cpu_aff); @@ -179,6 +187,11 @@ std::vector<iBackend*>* minethd::thread_starter(uint32_t threadOffset, miner_wor } + for (i = 0; i < n; i++) + { + static_cast<minethd*>((*pvThreads)[i])->start_mining(); + } + return pvThreads; } @@ -208,10 +221,18 @@ void minethd::work_main() if(affinity >= 0) //-1 means no affinity bindMemoryToNUMANode(affinity); - order_fix.set_value(); - std::unique_lock<std::mutex> lck(thd_aff_set); - lck.release(); + if(cuda_get_deviceinfo(&ctx) != 0 || cryptonight_extra_cpu_init(&ctx) != 1) + { + printer::inst()->print_msg(L0, "Setup failed for GPU %d. Exitting.\n", (int)iThreadNo); + std::exit(0); + } + + // numa memory bind and gpu memory is initialized + numa_promise.set_value(); + std::this_thread::yield(); + // wait until all NVIDIA devices are initialized + thread_work_guard.wait(); uint64_t iCount = 0; cryptonight_ctx* cpu_ctx; @@ -221,12 +242,6 @@ void minethd::work_main() globalStates::inst().iConsumeCnt++; - if(cuda_get_deviceinfo(&ctx) != 0 || cryptonight_extra_cpu_init(&ctx) != 1) - { - printer::inst()->print_msg(L0, "Setup failed for GPU %d. Exitting.\n", (int)iThreadNo); - std::exit(0); - } - bool mineMonero = strcmp_i(::jconf::inst()->GetCurrency(), "monero"); while (bQuit == 0) @@ -287,7 +302,7 @@ void minethd::work_main() if ( (*((uint64_t*)(bResult + 24))) < oWork.iTarget) executor::inst()->push_event(ex_event(job_result(oWork.sJobID, foundNonce[i], bResult, iThreadNo), oWork.iPoolId)); else - executor::inst()->push_event(ex_event("NVIDIA Invalid Result", oWork.iPoolId)); + executor::inst()->push_event(ex_event("NVIDIA Invalid Result", ctx.device_id, oWork.iPoolId)); } iCount += h_per_round; diff --git a/xmrstak/backend/nvidia/minethd.hpp b/xmrstak/backend/nvidia/minethd.hpp index d13c868..fcd24fa 100644 --- a/xmrstak/backend/nvidia/minethd.hpp +++ b/xmrstak/backend/nvidia/minethd.hpp @@ -32,7 +32,8 @@ private: typedef void (*cn_hash_fun)(const void*, size_t, void*, cryptonight_ctx*); minethd(miner_work& pWork, size_t iNo, const jconf::thd_cfg& cfg); - + void start_mining(); + void work_main(); void consume_work(); @@ -44,8 +45,11 @@ private: static miner_work oGlobalWork; miner_work oWork; - std::promise<void> order_fix; - std::mutex thd_aff_set; + std::promise<void> numa_promise; + std::promise<void> thread_work_promise; + + // block thread until all NVIDIA GPUs are initialized + std::future<void> thread_work_guard; std::thread oWorkThd; int64_t affinity; diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu index 15a6f36..cc97274 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu @@ -74,24 +74,36 @@ __device__ __forceinline__ uint64_t cuda_mul128( uint64_t multiplier, uint64_t m template< typename T > __device__ __forceinline__ T loadGlobal64( T * const addr ) { +#if (__CUDA_ARCH__ < 700) T x; asm volatile( "ld.global.cg.u64 %0, [%1];" : "=l"( x ) : "l"( addr ) ); return x; +#else + return *addr; +#endif } template< typename T > __device__ __forceinline__ T loadGlobal32( T * const addr ) { +#if (__CUDA_ARCH__ < 700) T x; asm volatile( "ld.global.cg.u32 %0, [%1];" : "=r"( x ) : "l"( addr ) ); return x; +#else + return *addr; +#endif } template< typename T > __device__ __forceinline__ void storeGlobal32( T* addr, T const & val ) { +#if (__CUDA_ARCH__ < 700) asm volatile( "st.global.cg.u32 [%0], %1;" : : "l"( addr ), "r"( val ) ); +#else + *addr = val; +#endif } template<size_t ITERATIONS, uint32_t THREAD_SHIFT> diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu index d865e13..92259db 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu @@ -204,7 +204,13 @@ extern "C" int cryptonight_extra_cpu_init(nvid_ctx* ctx) break; }; - CUDA_CHECK(ctx->device_id, cudaDeviceSetCacheConfig(cudaFuncCachePreferL1)); + const int gpuArch = ctx->device_arch[0] * 10 + ctx->device_arch[1]; + + /* Disable L1 cache for GPUs before Volta. + * L1 speed is increased and latency reduced with Volta. + */ + if(gpuArch < 70) + CUDA_CHECK(ctx->device_id, cudaDeviceSetCacheConfig(cudaFuncCachePreferL1)); size_t hashMemSize; if(::jconf::inst()->IsCurrencyMonero()) @@ -441,6 +447,12 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx) maxMemUsage = size_t(1024u) * byteToMiB; } + if(props.multiProcessorCount <= 6) + { + // limit memory usage for low end devices to reduce the number of threads + maxMemUsage = size_t(1024u) * byteToMiB; + } + int* tmp; cudaError_t err; // a device must be selected to get the right memory usage later on |