summaryrefslogtreecommitdiffstats
path: root/xmrstak/backend/amd
diff options
context:
space:
mode:
Diffstat (limited to 'xmrstak/backend/amd')
-rw-r--r--xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl104
-rw-r--r--xmrstak/backend/amd/autoAdjust.hpp11
-rw-r--r--xmrstak/backend/amd/config.tpl3
-rw-r--r--xmrstak/backend/amd/minethd.cpp4
4 files changed, 63 insertions, 59 deletions
diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
index 255fcbb..ec05712 100644
--- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
+++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
@@ -653,21 +653,11 @@ __kernel void cn2(__global uint4 *Scratchpad, __global ulong *states, __global u
for(int i = 0; i < 25; ++i) states[i] = State[i];
- switch(State[0] & 3)
- {
- case 0:
- Branch0[atomic_inc(Branch0 + Threads)] = get_global_id(0) - get_global_offset(0);
- break;
- case 1:
- Branch1[atomic_inc(Branch1 + Threads)] = get_global_id(0) - get_global_offset(0);
- break;
- case 2:
- Branch2[atomic_inc(Branch2 + Threads)] = get_global_id(0) - get_global_offset(0);
- break;
- case 3:
- Branch3[atomic_inc(Branch3 + Threads)] = get_global_id(0) - get_global_offset(0);
- break;
- }
+ ulong StateSwitch = State[0] & 3;
+ __global uint *destinationBranch1 = StateSwitch == 0 ? Branch0 : Branch1;
+ __global uint *destinationBranch2 = StateSwitch == 2 ? Branch2 : Branch3;
+ __global uint *destinationBranch = StateSwitch < 2 ? destinationBranch1 : destinationBranch2;
+ destinationBranch[atomic_inc(destinationBranch + Threads)] = gIdx;
}
}
mem_fence(CLK_GLOBAL_MEM_FENCE);
@@ -704,8 +694,7 @@ __kernel void Skein(__global ulong *states, __global uint *BranchBuf, __global u
for(uint i = 0; i < 4; ++i)
{
- if(i < 3) t[0] += 0x40UL;
- else t[0] += 0x08UL;
+ t[0] += i < 3 ? 0x40UL : 0x08UL;
t[2] = t[0] ^ t[1];
@@ -715,8 +704,7 @@ __kernel void Skein(__global ulong *states, __global uint *BranchBuf, __global u
h = m ^ p;
- if(i < 2) t[1] = 0x3000000000000000UL;
- else t[1] = 0xB000000000000000UL;
+ t[1] = i < 2 ? 0x3000000000000000UL : 0xB000000000000000UL;
}
t[0] = 0x08UL;
@@ -744,6 +732,27 @@ __kernel void Skein(__global ulong *states, __global uint *BranchBuf, __global u
#define SWAP8(x) as_ulong(as_uchar8(x).s76543210)
+#define JHXOR \
+ h0h ^= input[0]; \
+ h0l ^= input[1]; \
+ h1h ^= input[2]; \
+ h1l ^= input[3]; \
+ h2h ^= input[4]; \
+ h2l ^= input[5]; \
+ h3h ^= input[6]; \
+ h3l ^= input[7]; \
+\
+ E8; \
+\
+ h4h ^= input[0]; \
+ h4l ^= input[1]; \
+ h5h ^= input[2]; \
+ h5l ^= input[3]; \
+ h6h ^= input[4]; \
+ h6l ^= input[5]; \
+ h7h ^= input[6]; \
+ h7l ^= input[7]
+
__kernel void JH(__global ulong *states, __global uint *BranchBuf, __global uint *output, ulong Target, ulong Threads)
{
const uint idx = get_global_id(0) - get_global_offset(0);
@@ -757,46 +766,27 @@ __kernel void JH(__global ulong *states, __global uint *BranchBuf, __global uint
sph_u64 h4h = 0x754D2E7F8996A371UL, h4l = 0x62E27DF70849141DUL, h5h = 0x948F2476F7957627UL, h5l = 0x6C29804757B6D587UL, h6h = 0x6C0D8EAC2D275E5CUL, h6l = 0x0F7A0557C6508451UL, h7h = 0xEA12247067D3E47BUL, h7l = 0x69D71CD313ABE389UL;
sph_u64 tmp;
- for(int i = 0; i < 5; ++i)
+ for(int i = 0; i < 3; ++i)
{
ulong input[8];
- if(i < 3)
- {
- for(int x = 0; x < 8; ++x) input[x] = (states[(i << 3) + x]);
- }
- else if(i == 3)
- {
- input[0] = (states[24]);
- input[1] = 0x80UL;
- for(int x = 2; x < 8; ++x) input[x] = 0x00UL;
- }
- else
- {
- input[7] = 0x4006000000000000UL;
-
- for(int x = 0; x < 7; ++x) input[x] = 0x00UL;
- }
-
- h0h ^= input[0];
- h0l ^= input[1];
- h1h ^= input[2];
- h1l ^= input[3];
- h2h ^= input[4];
- h2l ^= input[5];
- h3h ^= input[6];
- h3l ^= input[7];
-
- E8;
-
- h4h ^= input[0];
- h4l ^= input[1];
- h5h ^= input[2];
- h5l ^= input[3];
- h6h ^= input[4];
- h6l ^= input[5];
- h7h ^= input[6];
- h7l ^= input[7];
+ const int shifted = i << 3;
+ for(int x = 0; x < 8; ++x) input[x] = (states[shifted + x]);
+ JHXOR;
+ }
+ {
+ ulong input[8];
+ input[0] = (states[24]);
+ input[1] = 0x80UL;
+ #pragma unroll 6
+ for(int x = 2; x < 8; ++x) input[x] = 0x00UL;
+ JHXOR;
+ }
+ {
+ ulong input[8];
+ for(int x = 0; x < 7; ++x) input[x] = 0x00UL;
+ input[7] = 0x4006000000000000UL;
+ JHXOR;
}
//output[0] = h6h;
@@ -832,6 +822,7 @@ __kernel void Blake(__global ulong *states, __global uint *BranchBuf, __global u
((uint8 *)h)[0] = vload8(0U, c_IV256);
+ #pragma unroll 4
for(uint i = 0, bitlen = 0; i < 4; ++i)
{
if(i < 3)
@@ -907,6 +898,7 @@ __kernel void Groestl(__global ulong *states, __global uint *BranchBuf, __global
State[7] = 0x0001000000000000UL;
+ #pragma unroll 4
for(uint i = 0; i < 4; ++i)
{
ulong H[8], M[8];
diff --git a/xmrstak/backend/amd/autoAdjust.hpp b/xmrstak/backend/amd/autoAdjust.hpp
index 0bc5239..511a712 100644
--- a/xmrstak/backend/amd/autoAdjust.hpp
+++ b/xmrstak/backend/amd/autoAdjust.hpp
@@ -101,7 +101,16 @@ private:
* sowing down the memory performance because of TLB cache misses
*/
size_t maxThreads = 1000u;
- if(ctx.name.compare("gfx901") == 0)
+ if(
+ ctx.name.compare("gfx901") == 0 ||
+ ctx.name.compare("gfx904") == 0 ||
+ // APU
+ ctx.name.compare("gfx902") == 0 ||
+ // UNKNOWN
+ ctx.name.compare("gfx900") == 0 ||
+ ctx.name.compare("gfx903") == 0 ||
+ ctx.name.compare("gfx905") == 0
+ )
{
/* Increase the number of threads for AMD VEGA gpus.
* Limit the number of threads based on the issue: https://github.com/fireice-uk/xmr-stak/issues/5#issuecomment-339425089
diff --git a/xmrstak/backend/amd/config.tpl b/xmrstak/backend/amd/config.tpl
index af662f8..25b75a1 100644
--- a/xmrstak/backend/amd/config.tpl
+++ b/xmrstak/backend/amd/config.tpl
@@ -12,6 +12,9 @@ R"===(
* [
* { "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false, "strided_index" : true },
* ],
+ * If you do not wish to mine with your AMD GPU(s) then use:
+ * "gpu_threads_conf" :
+ * null,
*/
"gpu_threads_conf" : [
diff --git a/xmrstak/backend/amd/minethd.cpp b/xmrstak/backend/amd/minethd.cpp
index e83527c..422c28c 100644
--- a/xmrstak/backend/amd/minethd.cpp
+++ b/xmrstak/backend/amd/minethd.cpp
@@ -139,7 +139,7 @@ std::vector<iBackend*>* minethd::thread_starter(uint32_t threadOffset, miner_wor
if(cfg.cpu_aff >= 0)
{
#if defined(__APPLE__)
- printer::inst()->print_msg(L1, "WARNING on MacOS thread affinity is only advisory.");
+ printer::inst()->print_msg(L1, "WARNING on macOS thread affinity is only advisory.");
#endif
printer::inst()->print_msg(L1, "Starting AMD GPU thread %d, affinity: %d.", i, (int)cfg.cpu_aff);
@@ -245,7 +245,7 @@ void minethd::work_main()
if ( (*((uint64_t*)(bResult + 24))) < oWork.iTarget)
executor::inst()->push_event(ex_event(job_result(oWork.sJobID, results[i], bResult, iThreadNo), oWork.iPoolId));
else
- executor::inst()->push_event(ex_event("AMD Invalid Result", oWork.iPoolId));
+ executor::inst()->push_event(ex_event("AMD Invalid Result", pGpuCtx->deviceIdx, oWork.iPoolId));
}
iCount += pGpuCtx->rawIntensity;
OpenPOWER on IntegriCloud