4 files changed, 17 insertions, 2 deletions
diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp
index ca81718..f9908cb 100644
--- a/xmrstak/backend/amd/amd_gpu/gpu.cpp
+++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp
@@ -896,7 +896,7 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput)
 			// round up to next multiple of w_size
 			BranchNonces[i] = ((BranchNonces[i] + w_size - 1u) / w_size) * w_size;
 			// number of global threads must be a multiple of the work group size (w_size)
-			assert(BranchNonces%w_size == 0);
+			assert(BranchNonces[i]%w_size == 0);
 			if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[i + 3], 1, &ctx->Nonce, BranchNonces + i, &w_size, 0, NULL, NULL)) != CL_SUCCESS)
 			{
 				printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), i + 3);
diff --git a/xmrstak/backend/cpu/hwlocMemory.hpp b/xmrstak/backend/cpu/hwlocMemory.hpp
index 719c1bb..69742be 100644
--- a/xmrstak/backend/cpu/hwlocMemory.hpp
+++ b/xmrstak/backend/cpu/hwlocMemory.hpp
@@ -21,6 +21,13 @@ void bindMemoryToNUMANode( size_t puId )
 	hwloc_topology_init(&topology);
 	hwloc_topology_load(topology);
 
+	if(!hwloc_topology_get_support(topology)->membind->set_thisthread_membind)
+	{
+		printer::inst()->print_msg(L0, "hwloc: set_thisthread_membind not supported");
+		hwloc_topology_destroy(topology);
+		return;
+	}
+
 	depth = hwloc_get_type_depth(topology, HWLOC_OBJ_PU);
 
 	for( size_t i = 0;
@@ -45,6 +52,8 @@ void bindMemoryToNUMANode( size_t puId )
 			}
 		}
 	}
+
+	hwloc_topology_destroy(topology);
 }
 #else
 
diff --git a/xmrstak/backend/cpu/minethd.cpp b/xmrstak/backend/cpu/minethd.cpp
index eb395a5..625fbe4 100644
--- a/xmrstak/backend/cpu/minethd.cpp
+++ b/xmrstak/backend/cpu/minethd.cpp
@@ -410,6 +410,8 @@ void minethd::double_work_main()
 	if(affinity >= 0) //-1 means no affinity
 		bindMemoryToNUMANode(affinity);
 
+	order_fix.set_value();
+
 	cn_hash_fun_dbl hash_fun;
 	cryptonight_ctx* ctx0;
 	cryptonight_ctx* ctx1;
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
index 0c086e8..e1b78ce 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
@@ -156,7 +156,11 @@ __forceinline__ __device__ uint32_t shuffle(volatile uint32_t* ptr,const uint32_
 #else
     unusedVar( ptr );
     unusedVar( sub );
-    return __shfl( val, src, 4 );
+#   if(__CUDACC_VER_MAJOR__ >= 9)
+    return __shfl_sync(0xFFFFFFFF, val, src, 4 );
+#	else
+	return __shfl( val, src, 4 );
+#	endif
 #endif
 }