/*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* Additional permission under GNU GPL version 3 section 7
*
* If you modify this Program, or any covered work, by linking or combining
* it with OpenSSL (or a modified version of that library), containing parts
* covered by the terms of OpenSSL License and SSLeay License, the licensors
* of this Program grant you additional permission to convey the resulting work.
*
*/
#include "minethd.hpp"
#include "autoAdjust.hpp"
#include "xmrstak/misc/console.hpp"
#include "xmrstak/backend/cpu/crypto/cryptonight_altivec.h"
#include "xmrstak/backend/cpu/crypto/cryptonight.h"
#include "xmrstak/backend/cpu/minethd.hpp"
#include "xmrstak/params.hpp"
#include "xmrstak/misc/executor.hpp"
#include "xmrstak/jconf.hpp"
#include "xmrstak/misc/environment.hpp"
#include "xmrstak/backend/cpu/hwlocMemory.hpp"
#include "xmrstak/backend/cryptonight.hpp"
#include "xmrstak/misc/utility.hpp"
#include <cassert>
#include <chrono>
#include <cstring>
#include <future>
#include <thread>
#include <vector>
#ifndef USE_PRECOMPILED_HEADERS
#ifdef WIN32
#include <direct.h>
#include <windows.h>
#else
#include <sys/types.h>
#include <dlfcn.h>
#endif
#include <iostream>
#endif
namespace xmrstak
{
namespace nvidia
{
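// Handle of the dynamically loaded CUDA backend library
// (LoadLibrary on Windows, dlopen elsewhere).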
#ifdef WIN32
HINSTANCE lib_handle;
#else
void *lib_handle;
#endif
minethd::minethd(miner_work& pWork, size_t iNo, const jconf::thd_cfg& cfg)
{
this->backendType = iBackend::NVIDIA;
oWork = pWork;
bQuit = 0;
iThreadNo = (uint8_t)iNo;
iJobNo = 0;
ctx.device_id = (int)cfg.id;
ctx.device_blocks = (int)cfg.blocks;
ctx.device_threads = (int)cfg.threads;
ctx.device_bfactor = (int)cfg.bfactor;
ctx.device_bsleep = (int)cfg.bsleep;
ctx.syncMode = cfg.syncMode;
this->affinity = cfg.cpu_aff;
std::future<void> numa_guard = numa_promise.get_future();
thread_work_guard = thread_work_promise.get_future();
oWorkThd = std::thread(&minethd::work_main, this);
/* Wait until the GPU memory is initialized and the NUMA CPU memory is pinned.
* Startup time is reduced if memory is initialized sequentially rather than
* by concurrent threads (the CUDA driver is less occupied).
*/
numa_guard.wait();
}
void minethd::start_mining()
{
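// Release the worker thread blocked in thread_work_guard.wait() inside work_main().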
thread_work_promise.set_value();
if(this->affinity >= 0) //-1 means no affinity
if(!cpu::minethd::thd_setaffinity(oWorkThd.native_handle(), affinity))
printer::inst()->print_msg(L1, "WARNING setting affinity failed.");
}
bool minethd::self_test()
{
cryptonight_ctx* ctx0;
unsigned char out[32];
bool bResult = true;
ctx0 = new cryptonight_ctx;
/* The reference hash calls below are commented out, so `out` is never
* written; the comparisons are disabled together with them, otherwise the
* memcmp would read uninitialized memory and the self-test would fail at
* random. The NVIDIA backend relies on the per-result CPU verification in
* work_main() instead. */
if(::jconf::inst()->HaveHardwareAes())
{
//cryptonight_hash_ctx("This is a test", 14, out, ctx0);
//bResult = memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0;
}
else
{
//cryptonight_hash_ctx_soft("This is a test", 14, out, ctx0);
//bResult = memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0;
}
delete ctx0;
//if(!bResult)
// printer::inst()->print_msg(L0,
// "Cryptonight hash self-test failed. This might be caused by bad compiler optimizations.");
return bResult;
}
extern "C"
{
#ifdef WIN32
__declspec(dllexport)
#endif
std::vector<iBackend*>* xmrstak_start_backend(uint32_t threadOffset, miner_work& pWork, environment& env)
{
environment::inst(&env);
return nvidia::minethd::thread_starter(threadOffset, pWork);
}
} // extern "C"
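// Parses (or auto-generates) the NVIDIA config, creates one minethd per
// configured GPU thread, then releases them all together via start_mining().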
std::vector<iBackend*>* minethd::thread_starter(uint32_t threadOffset, miner_work& pWork)
{
std::vector<iBackend*>* pvThreads = new std::vector<iBackend*>();
if(!configEditor::file_exist(params::inst().configFileNVIDIA))
{
autoAdjust adjust;
if(!adjust.printConfig())
return pvThreads;
}
if(!jconf::inst()->parse_config())
{
win_exit();
}
int deviceCount = 0;
if(cuda_get_devicecount(&deviceCount) != 1)
{
std::cout<<"WARNING: NVIDIA no device found"<<std::endl;
win_exit();
}
size_t i, n = jconf::inst()->GetGPUThreadCount();
pvThreads->reserve(n);
jconf::thd_cfg cfg;
for (i = 0; i < n; i++)
{
jconf::inst()->GetGPUThreadConfig(i, cfg);
if(cfg.cpu_aff >= 0)
{
#if defined(__APPLE__)
printer::inst()->print_msg(L1, "WARNING on macOS thread affinity is only advisory.");
#endif
printer::inst()->print_msg(L1, "Starting NVIDIA GPU thread %d, affinity: %d.", i, (int)cfg.cpu_aff);
}
else
printer::inst()->print_msg(L1, "Starting NVIDIA GPU thread %d, no affinity.", i);
minethd* thd = new minethd(pWork, i + threadOffset, cfg);
pvThreads->push_back(thd);
}
for (i = 0; i < n; i++)
{
static_cast<minethd*>((*pvThreads)[i])->start_mining();
}
return pvThreads;
}
void minethd::switch_work(miner_work& pWork)
{
// iConsumeCnt is a basic lock-like polling mechanism just in case we happen to push work
// faster than threads can consume it. This should never happen in real life.
// The pool can't physically send jobs more often than every 250ms or so due to network latency.
while (globalStates::inst().iConsumeCnt.load(std::memory_order_seq_cst) < globalStates::inst().iThreadCount)
std::this_thread::sleep_for(std::chrono::milliseconds(100));
globalStates::inst().oGlobalWork = pWork;
globalStates::inst().iConsumeCnt.store(0, std::memory_order_seq_cst);
globalStates::inst().iGlobalJobNo++;
}
void minethd::consume_work()
{
memcpy(&oWork, &globalStates::inst().oGlobalWork, sizeof(miner_work));
iJobNo++;
globalStates::inst().iConsumeCnt++;
}
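// Per-GPU worker loop: bind NUMA memory, initialize the device, then hash
// nonce ranges until a new job arrives or shutdown is requested.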
void minethd::work_main()
{
if(affinity >= 0) //-1 means no affinity
bindMemoryToNUMANode(affinity);
if(cuda_get_deviceinfo(&ctx) != 0 || cryptonight_extra_cpu_init(&ctx) != 1)
{
printer::inst()->print_msg(L0, "Setup failed for GPU %d. Exiting.", (int)iThreadNo);
std::exit(0);
}
// NUMA memory is bound and GPU memory is initialized
numa_promise.set_value();
std::this_thread::yield();
// wait until all NVIDIA devices are initialized
thread_work_guard.wait();
uint64_t iCount = 0;
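// CPU scratchpad used solely to re-verify GPU results before submission.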
cryptonight_ctx* cpu_ctx;
cpu_ctx = cpu::minethd::minethd_alloc_ctx();
// start with root algorithm and switch later if fork version is reached
auto miner_algo = ::jconf::inst()->GetMiningAlgoRoot();
cn_hash_fun hash_fun = cpu::minethd::func_selector(::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/, miner_algo);
uint32_t iNonce;
globalStates::inst().iConsumeCnt++;
uint8_t version = 0;
while (bQuit == 0)
{
if (oWork.bStall)
{
/* We are stalled here because the executor didn't find a job for us yet,
* either because of network latency or a socket problem. Since mining is the
* raison d'être of this software, it is sensible to just wait until we have
* something to work on.
*/
while (globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo)
std::this_thread::sleep_for(std::chrono::milliseconds(100));
consume_work();
continue;
}
uint8_t new_version = oWork.getVersion();
if(new_version != version)
{
if(new_version >= ::jconf::inst()->GetMiningForkVersion())
{
miner_algo = ::jconf::inst()->GetMiningAlgo();
hash_fun = cpu::minethd::func_selector(::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/, miner_algo);
}
version = new_version;
}
cryptonight_extra_cpu_set_data(&ctx, oWork.bWorkBlob, oWork.iWorkSize);
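// Hashes produced per GPU launch: one per configured (block, thread) pair.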
uint32_t h_per_round = ctx.device_blocks * ctx.device_threads;
size_t round_ctr = 0;
assert(sizeof(job_result::sJobID) == sizeof(pool_job::sJobID));
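// In NiceHash mode the pool assigns the high nonce byte, so continue from
// the nonce embedded in the work blob (offset 39) instead of starting at zero.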
if(oWork.bNiceHash)
iNonce = *(uint32_t*)(oWork.bWorkBlob + 39);
while(globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo)
{
//Reserve a new nonce range every 16 rounds
if((round_ctr++ & 0xF) == 0)
{
globalStates::inst().calc_start_nonce(iNonce, oWork.bNiceHash, h_per_round * 16);
}
uint32_t foundNonce[10];
uint32_t foundCount;
cryptonight_extra_cpu_prepare(&ctx, iNonce, miner_algo);
cryptonight_core_cpu_hash(&ctx, miner_algo, iNonce);
cryptonight_extra_cpu_final(&ctx, iNonce, oWork.iTarget, &foundCount, foundNonce, miner_algo);
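// Re-verify every candidate nonce on the CPU; only results that actually
// meet the target are submitted, anything else is reported as invalid.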
for(size_t i = 0; i < foundCount; i++)
{
uint8_t bWorkBlob[112];
uint8_t bResult[32];
memcpy(bWorkBlob, oWork.bWorkBlob, oWork.iWorkSize);
memset(bResult, 0, sizeof(job_result::bResult));
*(uint32_t*)(bWorkBlob + 39) = foundNonce[i];
hash_fun(bWorkBlob, oWork.iWorkSize, bResult, cpu_ctx);
if ( (*((uint64_t*)(bResult + 24))) < oWork.iTarget)
executor::inst()->push_event(ex_event(job_result(oWork.sJobID, foundNonce[i], bResult, iThreadNo), oWork.iPoolId));
else
executor::inst()->push_event(ex_event("NVIDIA Invalid Result", ctx.device_id, oWork.iPoolId));
}
iCount += h_per_round;
iNonce += h_per_round;
using namespace std::chrono;
uint64_t iStamp = get_timestamp_ms();
iHashCount.store(iCount, std::memory_order_relaxed);
iTimestamp.store(iStamp, std::memory_order_relaxed);
std::this_thread::yield();
}
consume_work();
}
}
} // namespace nvidia
} // namespace xmrstak