| field | value | date |
|---|---|---|
| author | psychocrypt <psychocrypt@users.noreply.github.com> | 2017-09-29 20:32:31 +0200 |
| committer | psychocrypt <psychocrypt@users.noreply.github.com> | 2017-09-30 23:46:08 +0200 |
| commit | cc429b68fadc502b981fd0acd64a5ff6e2ae1d15 (patch) | |
| tree | 3fb23fc4db15dbdd08af4c7ea20134b9d82e58fd /xmrstak/backend | |
| parent | e5b0319d5a9f58762fa934ad700113908940cb31 (diff) | |
| download | xmr-stak-cc429b68fadc502b981fd0acd64a5ff6e2ae1d15.zip, xmr-stak-cc429b68fadc502b981fd0acd64a5ff6e2ae1d15.tar.gz | |
group files
- move source code to `src`
- categorize files and move them into group folders
- rename upper-case class files to lower case
- change C++ header extensions to `*.hpp`
Diffstat (limited to 'xmrstak/backend')
64 files changed, 13316 insertions, 0 deletions
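One detail worth noting before the diff itself: instead of loading `cryptonight.cl` from disk at runtime (the `LoadTextFile` helper is kept but its call is commented out), the new `gpu.cpp` `#include`s the `.cl` files, which are wrapped in `R"===( ... )==="` so they become C++ raw string literals, and then splices the helper kernels into the main source by replacing `XMRSTAK_INCLUDE_*` placeholders with `std::regex_replace`. A minimal, self-contained sketch of that pattern follows; the string contents are illustrative stand-ins, not the real kernel sources.

```cpp
// Sketch of the kernel-embedding pattern used in gpu.cpp (stand-in strings,
// not the real .cl contents). Each .cl file is wrapped in R"===( ... )==="
// so that #include turns it into a C++ raw string literal; placeholder
// tokens in the main kernel source are then expanded with std::regex_replace.
#include <iostream>
#include <regex>
#include <string>

int main()
{
    // Stand-ins for the #include'd raw-string kernel sources.
    const char* cryptonightCL = "/* cn kernels */\nXMRSTAK_INCLUDE_WOLF_AES\n/* ... */";
    const char* wolfAesCL     = "/* AES tables and AES_Round() */";

    std::string source_code(cryptonightCL);

    // Splice the helper source into the main kernel source at its placeholder.
    source_code = std::regex_replace(source_code,
                                     std::regex("XMRSTAK_INCLUDE_WOLF_AES"),
                                     wolfAesCL);

    // The combined string would then be passed to clCreateProgramWithSource().
    std::cout << source_code << std::endl;
    return 0;
}
```

In the diff below the same replacement is repeated for the wolf-skein, jh, blake256 and groestl256 sources before the combined string is handed to `clCreateProgramWithSource` in `InitOpenCLGpu`.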
diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp new file mode 100644 index 0000000..04e442a --- /dev/null +++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp @@ -0,0 +1,889 @@ +/* + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <stdio.h> +#include <string.h> +#include <math.h> +#include <iostream> +#include <vector> +#include <algorithm> +#include <regex> + +#ifdef _WIN32 +#include <windows.h> +const char* sSourcePath = "opencl\\cryptonight.cl"; + +static inline void port_sleep(size_t sec) +{ + Sleep(sec * 1000); +} +#else +#include <unistd.h> +const char* sSourcePath = "opencl/cryptonight.cl"; + +static inline void port_sleep(size_t sec) +{ + sleep(sec); +} +#endif // _WIN32 + +#if 0 +static inline long long unsigned int int_port(size_t i) +{ + return i; +} +#endif + +#include "gpu.h" + +const char* err_to_str(cl_int ret) +{ + switch(ret) + { + case CL_SUCCESS: + return "CL_SUCCESS"; + case CL_DEVICE_NOT_FOUND: + return "CL_DEVICE_NOT_FOUND"; + case CL_DEVICE_NOT_AVAILABLE: + return "CL_DEVICE_NOT_AVAILABLE"; + case CL_COMPILER_NOT_AVAILABLE: + return "CL_COMPILER_NOT_AVAILABLE"; + case CL_MEM_OBJECT_ALLOCATION_FAILURE: + return "CL_MEM_OBJECT_ALLOCATION_FAILURE"; + case CL_OUT_OF_RESOURCES: + return "CL_OUT_OF_RESOURCES"; + case CL_OUT_OF_HOST_MEMORY: + return "CL_OUT_OF_HOST_MEMORY"; + case CL_PROFILING_INFO_NOT_AVAILABLE: + return "CL_PROFILING_INFO_NOT_AVAILABLE"; + case CL_MEM_COPY_OVERLAP: + return "CL_MEM_COPY_OVERLAP"; + case CL_IMAGE_FORMAT_MISMATCH: + return "CL_IMAGE_FORMAT_MISMATCH"; + case CL_IMAGE_FORMAT_NOT_SUPPORTED: + return "CL_IMAGE_FORMAT_NOT_SUPPORTED"; + case CL_BUILD_PROGRAM_FAILURE: + return "CL_BUILD_PROGRAM_FAILURE"; + case CL_MAP_FAILURE: + return "CL_MAP_FAILURE"; + case CL_MISALIGNED_SUB_BUFFER_OFFSET: + return "CL_MISALIGNED_SUB_BUFFER_OFFSET"; + case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST: + return "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST"; + case CL_COMPILE_PROGRAM_FAILURE: + return "CL_COMPILE_PROGRAM_FAILURE"; + case CL_LINKER_NOT_AVAILABLE: + return "CL_LINKER_NOT_AVAILABLE"; + case CL_LINK_PROGRAM_FAILURE: + return "CL_LINK_PROGRAM_FAILURE"; + case CL_DEVICE_PARTITION_FAILED: + return "CL_DEVICE_PARTITION_FAILED"; + case CL_KERNEL_ARG_INFO_NOT_AVAILABLE: + return "CL_KERNEL_ARG_INFO_NOT_AVAILABLE"; + case CL_INVALID_VALUE: + return "CL_INVALID_VALUE"; + case CL_INVALID_DEVICE_TYPE: + return "CL_INVALID_DEVICE_TYPE"; + case CL_INVALID_PLATFORM: + return "CL_INVALID_PLATFORM"; + case CL_INVALID_DEVICE: + return "CL_INVALID_DEVICE"; + case CL_INVALID_CONTEXT: + return "CL_INVALID_CONTEXT"; + case CL_INVALID_QUEUE_PROPERTIES: + return "CL_INVALID_QUEUE_PROPERTIES"; + case CL_INVALID_COMMAND_QUEUE: + return "CL_INVALID_COMMAND_QUEUE"; + case CL_INVALID_HOST_PTR: + return "CL_INVALID_HOST_PTR"; + case CL_INVALID_MEM_OBJECT: + return "CL_INVALID_MEM_OBJECT"; + case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR: + return 
"CL_INVALID_IMAGE_FORMAT_DESCRIPTOR"; + case CL_INVALID_IMAGE_SIZE: + return "CL_INVALID_IMAGE_SIZE"; + case CL_INVALID_SAMPLER: + return "CL_INVALID_SAMPLER"; + case CL_INVALID_BINARY: + return "CL_INVALID_BINARY"; + case CL_INVALID_BUILD_OPTIONS: + return "CL_INVALID_BUILD_OPTIONS"; + case CL_INVALID_PROGRAM: + return "CL_INVALID_PROGRAM"; + case CL_INVALID_PROGRAM_EXECUTABLE: + return "CL_INVALID_PROGRAM_EXECUTABLE"; + case CL_INVALID_KERNEL_NAME: + return "CL_INVALID_KERNEL_NAME"; + case CL_INVALID_KERNEL_DEFINITION: + return "CL_INVALID_KERNEL_DEFINITION"; + case CL_INVALID_KERNEL: + return "CL_INVALID_KERNEL"; + case CL_INVALID_ARG_INDEX: + return "CL_INVALID_ARG_INDEX"; + case CL_INVALID_ARG_VALUE: + return "CL_INVALID_ARG_VALUE"; + case CL_INVALID_ARG_SIZE: + return "CL_INVALID_ARG_SIZE"; + case CL_INVALID_KERNEL_ARGS: + return "CL_INVALID_KERNEL_ARGS"; + case CL_INVALID_WORK_DIMENSION: + return "CL_INVALID_WORK_DIMENSION"; + case CL_INVALID_WORK_GROUP_SIZE: + return "CL_INVALID_WORK_GROUP_SIZE"; + case CL_INVALID_WORK_ITEM_SIZE: + return "CL_INVALID_WORK_ITEM_SIZE"; + case CL_INVALID_GLOBAL_OFFSET: + return "CL_INVALID_GLOBAL_OFFSET"; + case CL_INVALID_EVENT_WAIT_LIST: + return "CL_INVALID_EVENT_WAIT_LIST"; + case CL_INVALID_EVENT: + return "CL_INVALID_EVENT"; + case CL_INVALID_OPERATION: + return "CL_INVALID_OPERATION"; + case CL_INVALID_GL_OBJECT: + return "CL_INVALID_GL_OBJECT"; + case CL_INVALID_BUFFER_SIZE: + return "CL_INVALID_BUFFER_SIZE"; + case CL_INVALID_MIP_LEVEL: + return "CL_INVALID_MIP_LEVEL"; + case CL_INVALID_GLOBAL_WORK_SIZE: + return "CL_INVALID_GLOBAL_WORK_SIZE"; + case CL_INVALID_PROPERTY: + return "CL_INVALID_PROPERTY"; + case CL_INVALID_IMAGE_DESCRIPTOR: + return "CL_INVALID_IMAGE_DESCRIPTOR"; + case CL_INVALID_COMPILER_OPTIONS: + return "CL_INVALID_COMPILER_OPTIONS"; + case CL_INVALID_LINKER_OPTIONS: + return "CL_INVALID_LINKER_OPTIONS"; + case CL_INVALID_DEVICE_PARTITION_COUNT: + return "CL_INVALID_DEVICE_PARTITION_COUNT"; +#ifdef CL_VERSION_2_0 + case CL_INVALID_PIPE_SIZE: + return "CL_INVALID_PIPE_SIZE"; + case CL_INVALID_DEVICE_QUEUE: + return "CL_INVALID_DEVICE_QUEUE"; +#endif + default: + return "UNKNOWN_ERROR"; + } +} + +#if 0 +void printer::inst()->print_msg(L1,const char* fmt, ...); +void printer::inst()->print_str(const char* str); +#endif + +char* LoadTextFile(const char* filename) +{ + size_t flen; + char* out; + FILE* kernel = fopen(filename, "rb"); + + if(kernel == NULL) + return NULL; + + fseek(kernel, 0, SEEK_END); + flen = ftell(kernel); + fseek(kernel, 0, SEEK_SET); + + out = (char*)malloc(flen+1); + size_t r = fread(out, flen, 1, kernel); + fclose(kernel); + + if(r != 1) + { + free(out); + return NULL; + } + + out[flen] = '\0'; + return out; +} + +size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_code) +{ + size_t MaximumWorkSize; + cl_int ret; + + if((ret = clGetDeviceInfo(ctx->DeviceID, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &MaximumWorkSize, NULL)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when querying a device's max worksize using clGetDeviceInfo.", err_to_str(ret)); + return ERR_OCL_API; + } + + printer::inst()->print_msg(L1,"Device %lu work size %lu / %lu.", ctx->deviceIdx, ctx->workSize, MaximumWorkSize); +#ifdef CL_VERSION_2_0 + const cl_queue_properties CommandQueueProperties[] = { 0, 0, 0 }; + ctx->CommandQueues = clCreateCommandQueueWithProperties(opencl_ctx, ctx->DeviceID, CommandQueueProperties, &ret); +#else + const cl_command_queue_properties CommandQueueProperties = 
{ 0 }; + ctx->CommandQueues = clCreateCommandQueue(opencl_ctx, ctx->DeviceID, CommandQueueProperties, &ret); +#endif + + if(ret != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clCreateCommandQueueWithProperties.", err_to_str(ret)); + return ERR_OCL_API; + } + + ctx->InputBuffer = clCreateBuffer(opencl_ctx, CL_MEM_READ_ONLY, 88, NULL, &ret); + if(ret != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clCreateBuffer to create input buffer.", err_to_str(ret)); + return ERR_OCL_API; + } + + size_t g_thd = ctx->rawIntensity; + ctx->ExtraBuffers[0] = clCreateBuffer(opencl_ctx, CL_MEM_READ_WRITE, (1 << 21) * g_thd, NULL, &ret); + if(ret != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clCreateBuffer to create hash scratchpads buffer.", err_to_str(ret)); + return ERR_OCL_API; + } + + ctx->ExtraBuffers[1] = clCreateBuffer(opencl_ctx, CL_MEM_READ_WRITE, 200 * g_thd, NULL, &ret); + if(ret != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clCreateBuffer to create hash states buffer.", err_to_str(ret)); + return ERR_OCL_API; + } + + // Blake-256 branches + ctx->ExtraBuffers[2] = clCreateBuffer(opencl_ctx, CL_MEM_READ_WRITE, sizeof(cl_uint) * (g_thd + 2), NULL, &ret); + if(ret != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clCreateBuffer to create Branch 0 buffer.", err_to_str(ret)); + return ERR_OCL_API; + } + + // Groestl-256 branches + ctx->ExtraBuffers[3] = clCreateBuffer(opencl_ctx, CL_MEM_READ_WRITE, sizeof(cl_uint) * (g_thd + 2), NULL, &ret); + if(ret != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clCreateBuffer to create Branch 1 buffer.", err_to_str(ret)); + return ERR_OCL_API; + } + + // JH-256 branches + ctx->ExtraBuffers[4] = clCreateBuffer(opencl_ctx, CL_MEM_READ_WRITE, sizeof(cl_uint) * (g_thd + 2), NULL, &ret); + if(ret != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clCreateBuffer to create Branch 2 buffer.", err_to_str(ret)); + return ERR_OCL_API; + } + + // Skein-512 branches + ctx->ExtraBuffers[5] = clCreateBuffer(opencl_ctx, CL_MEM_READ_WRITE, sizeof(cl_uint) * (g_thd + 2), NULL, &ret); + if(ret != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clCreateBuffer to create Branch 3 buffer.", err_to_str(ret)); + return ERR_OCL_API; + } + + // Assume we may find up to 0xFF nonces in one run - it's reasonable + ctx->OutputBuffer = clCreateBuffer(opencl_ctx, CL_MEM_READ_WRITE, sizeof(cl_uint) * 0x100, NULL, &ret); + if(ret != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clCreateBuffer to create output buffer.", err_to_str(ret)); + return ERR_OCL_API; + } + + ctx->Program = clCreateProgramWithSource(opencl_ctx, 1, (const char**)&source_code, NULL, &ret); + if(ret != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clCreateProgramWithSource on the contents of cryptonight.cl", err_to_str(ret)); + return ERR_OCL_API; + } + + char options[32]; + snprintf(options, sizeof(options), "-I. 
-DWORKSIZE=%llu", int_port(ctx->workSize)); + ret = clBuildProgram(ctx->Program, 1, &ctx->DeviceID, options, NULL, NULL); + if(ret != CL_SUCCESS) + { + size_t len; + printer::inst()->print_msg(L1,"Error %s when calling clBuildProgram.", err_to_str(ret)); + + if((ret = clGetProgramBuildInfo(ctx->Program, ctx->DeviceID, CL_PROGRAM_BUILD_LOG, 0, NULL, &len)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clGetProgramBuildInfo for length of build log output.", err_to_str(ret)); + return ERR_OCL_API; + } + + char* BuildLog = (char*)malloc(len + 1); + BuildLog[0] = '\0'; + + if((ret = clGetProgramBuildInfo(ctx->Program, ctx->DeviceID, CL_PROGRAM_BUILD_LOG, len, BuildLog, NULL)) != CL_SUCCESS) + { + free(BuildLog); + printer::inst()->print_msg(L1,"Error %s when calling clGetProgramBuildInfo for build log.", err_to_str(ret)); + return ERR_OCL_API; + } + + printer::inst()->print_str("Build log:\n"); + std::cerr<<BuildLog<<std::endl; + + free(BuildLog); + return ERR_OCL_API; + } + + cl_build_status status; + do + { + if((ret = clGetProgramBuildInfo(ctx->Program, ctx->DeviceID, CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &status, NULL)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clGetProgramBuildInfo for status of build.", err_to_str(ret)); + return ERR_OCL_API; + } + port_sleep(1); + } + while(status == CL_BUILD_IN_PROGRESS); + + const char *KernelNames[] = { "cn0", "cn1", "cn2", "Blake", "Groestl", "JH", "Skein" }; + for(int i = 0; i < 7; ++i) + { + ctx->Kernels[i] = clCreateKernel(ctx->Program, KernelNames[i], &ret); + if(ret != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clCreateKernel for kernel %s.", err_to_str(ret), KernelNames[i]); + return ERR_OCL_API; + } + } + + ctx->Nonce = 0; + return 0; +} + +const cl_platform_info attributeTypes[5] = { + CL_PLATFORM_NAME, + CL_PLATFORM_VENDOR, + CL_PLATFORM_VERSION, + CL_PLATFORM_PROFILE, + CL_PLATFORM_EXTENSIONS +}; + +const char* const attributeNames[] = { + "CL_PLATFORM_NAME", + "CL_PLATFORM_VENDOR", + "CL_PLATFORM_VERSION", + "CL_PLATFORM_PROFILE", + "CL_PLATFORM_EXTENSIONS" +}; + +#define NELEMS(x) (sizeof(x) / sizeof((x)[0])) + +void PrintDeviceInfo(cl_device_id device) +{ + char queryBuffer[1024]; + int queryInt; + cl_int clError; + clError = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(queryBuffer), &queryBuffer, NULL); + printf(" CL_DEVICE_NAME: %s\n", queryBuffer); + queryBuffer[0] = '\0'; + clError = clGetDeviceInfo(device, CL_DEVICE_VENDOR, sizeof(queryBuffer), &queryBuffer, NULL); + printf(" CL_DEVICE_VENDOR: %s\n", queryBuffer); + queryBuffer[0] = '\0'; + clError = clGetDeviceInfo(device, CL_DRIVER_VERSION, sizeof(queryBuffer), &queryBuffer, NULL); + printf(" CL_DRIVER_VERSION: %s\n", queryBuffer); + queryBuffer[0] = '\0'; + clError = clGetDeviceInfo(device, CL_DEVICE_VERSION, sizeof(queryBuffer), &queryBuffer, NULL); + printf(" CL_DEVICE_VERSION: %s\n", queryBuffer); + queryBuffer[0] = '\0'; + clError = clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(int), &queryInt, NULL); + printf(" CL_DEVICE_MAX_COMPUTE_UNITS: %d\n", queryInt); +} + +uint32_t getNumPlatforms() +{ + cl_uint num_platforms = 0; + cl_platform_id * platforms = NULL; + cl_int clStatus; + + // Get platform and device information + clStatus = clGetPlatformIDs(0, NULL, &num_platforms); + platforms = (cl_platform_id *) malloc(sizeof(cl_platform_id) * num_platforms); + clStatus = clGetPlatformIDs(num_platforms, platforms, NULL); + + return num_platforms; +} + 
+std::vector<GpuContext> getAMDDevices(int index) +{ + std::vector<GpuContext> ctxVec; + cl_platform_id * platforms = NULL; + cl_int clStatus; + cl_uint num_devices; + cl_device_id *device_list = NULL; + + uint32_t numPlatforms = getNumPlatforms(); + + + platforms = (cl_platform_id *) malloc(sizeof(cl_platform_id) * numPlatforms); + clStatus = clGetPlatformIDs(numPlatforms, platforms, NULL); + + clStatus = clGetDeviceIDs( platforms[index], CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices); + device_list = (cl_device_id *) malloc(sizeof(cl_device_id)*num_devices); + clStatus = clGetDeviceIDs( platforms[index], CL_DEVICE_TYPE_GPU, num_devices, device_list, NULL); + for (int k = 0; k < num_devices; k++) { + cl_int clError; + std::vector<char> devVendorVec(1024); + clError = clGetDeviceInfo(device_list[k], CL_DEVICE_VENDOR, devVendorVec.size(), devVendorVec.data(), NULL); + std::string devVendor(devVendorVec.data()); + if( devVendor.find("Advanced Micro Devices") != std::string::npos) + { + GpuContext ctx; + ctx.deviceIdx = k; + clError = clGetDeviceInfo(device_list[k], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(int), &(ctx.computeUnits), NULL); + size_t maxMem; + clError = clGetDeviceInfo(device_list[k], CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &(maxMem), NULL); + clError = clGetDeviceInfo(device_list[k], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(size_t), &(ctx.freeMem), NULL); + // if environment variable GPU_SINGLE_ALLOC_PERCENT is not set we can not allocate the full memory + ctx.freeMem = std::min(ctx.freeMem, maxMem); + std::vector<char> devNameVec(1024); + clError = clGetDeviceInfo(device_list[k], CL_DEVICE_NAME, devNameVec.size(), devNameVec.data(), NULL); + ctx.name = std::string(devNameVec.data()); + printer::inst()->print_msg(L0,"Found OpenCL GPU %s.",ctx.name.c_str()); + ctx.DeviceID = device_list[k]; + ctxVec.push_back(ctx); + } + } + + + free(device_list); + free(platforms); + + return ctxVec; +} + +int getAMDPlatformIdx() +{ + + uint32_t numPlatforms = getNumPlatforms(); + + if(numPlatforms == 0) + { + printer::inst()->print_msg(L0,"WARNING: No OpenCL platform found."); + return -1; + } + cl_platform_id * platforms = NULL; + cl_int clStatus; + + platforms = (cl_platform_id *) malloc(sizeof(cl_platform_id) * numPlatforms); + clStatus = clGetPlatformIDs(numPlatforms, platforms, NULL); + + int platformIndex = -1; + + for (int i = 0; i < numPlatforms; i++) { + size_t infoSize; + clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, 0, NULL, &infoSize); + std::vector<char> platformNameVec(infoSize); + + clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, infoSize, platformNameVec.data(), NULL); + std::string platformName(platformNameVec.data()); + if( platformName.find("Advanced Micro Devices") != std::string::npos) + { + platformIndex = i; + printer::inst()->print_msg(L0,"Found AMD platform index id = %i, name = %s",i , platformName.c_str()); + break; + } + } + + free(platforms); + return platformIndex; +} + +// RequestedDeviceIdxs is a list of OpenCL device indexes +// NumDevicesRequested is number of devices in RequestedDeviceIdxs list +// Returns 0 on success, -1 on stupid params, -2 on OpenCL API error +size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx) +{ + + cl_context opencl_ctx; + cl_int ret; + cl_uint entries; + + if((ret = clGetPlatformIDs(0, NULL, &entries)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clGetPlatformIDs for number of platforms.", err_to_str(ret)); + return ERR_OCL_API; + } + + + // The number of platforms naturally is the 
index of the last platform plus one. + if(entries <= platform_idx) + { + printer::inst()->print_msg(L1,"Selected OpenCL platform index %d doesn't exist.", platform_idx); + return ERR_STUPID_PARAMS; + } + + + + cl_platform_id * platforms = NULL; + cl_int clStatus; + uint32_t numPlatforms = getNumPlatforms(); + + platforms = (cl_platform_id *) malloc(sizeof(cl_platform_id) * numPlatforms); + clStatus = clGetPlatformIDs(numPlatforms, platforms, NULL); + + size_t infoSize; + clGetPlatformInfo(platforms[platform_idx], CL_PLATFORM_VENDOR, 0, NULL, &infoSize); + std::vector<char> platformNameVec(infoSize); + clGetPlatformInfo(platforms[platform_idx], CL_PLATFORM_VENDOR, infoSize, platformNameVec.data(), NULL); + std::string platformName(platformNameVec.data()); + if( platformName.find("Advanced Micro Devices") == std::string::npos) + { + printer::inst()->print_msg(L1,"WARNING: using non AMD device: %s", platformName.c_str()); + } + + free(platforms); + + /*MSVC skimping on devel costs by shoehorning C99 to be a subset of C++? Noooo... can't be.*/ +#ifdef __GNUC__ + cl_platform_id PlatformIDList[entries]; +#else + cl_platform_id* PlatformIDList = (cl_platform_id*)_alloca(entries * sizeof(cl_platform_id)); +#endif + if((ret = clGetPlatformIDs(entries, PlatformIDList, NULL)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clGetPlatformIDs for platform ID information.", err_to_str(ret)); + return ERR_OCL_API; + } + + if((ret = clGetDeviceIDs(PlatformIDList[platform_idx], CL_DEVICE_TYPE_GPU, 0, NULL, &entries)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clGetDeviceIDs for number of devices.", err_to_str(ret)); + return ERR_OCL_API; + } + + // Same as the platform index sanity check, except we must check all requested device indexes + for(int i = 0; i < num_gpus; ++i) + { + if(entries <= ctx[i].deviceIdx) + { + printer::inst()->print_msg(L1,"Selected OpenCL device index %lu doesn't exist.\n", ctx[i].deviceIdx); + return ERR_STUPID_PARAMS; + } + } + +#ifdef __GNUC__ + cl_device_id DeviceIDList[entries]; +#else + cl_device_id* DeviceIDList = (cl_device_id*)_alloca(entries * sizeof(cl_device_id)); +#endif + if((ret = clGetDeviceIDs(PlatformIDList[platform_idx], CL_DEVICE_TYPE_GPU, entries, DeviceIDList, NULL)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clGetDeviceIDs for device ID information.", err_to_str(ret)); + return ERR_OCL_API; + } + + // Indexes sanity checked above +#ifdef __GNUC__ + cl_device_id TempDeviceList[num_gpus]; +#else + cl_device_id* TempDeviceList = (cl_device_id*)_alloca(entries * sizeof(cl_device_id)); +#endif + for(int i = 0; i < num_gpus; ++i) + { + ctx[i].DeviceID = DeviceIDList[ctx[i].deviceIdx]; + TempDeviceList[i] = DeviceIDList[ctx[i].deviceIdx]; + } + + opencl_ctx = clCreateContext(NULL, num_gpus, TempDeviceList, NULL, NULL, &ret); + if(ret != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clCreateContext.", err_to_str(ret)); + return ERR_OCL_API; + } + + //char* source_code = LoadTextFile(sSourcePath); + + const char *cryptonightCL = + #include "./opencl/cryptonight.cl" + ; + const char *blake256CL = + #include "./opencl/blake256.cl" + ; + const char *groestl256CL = + #include "./opencl/groestl256.cl" + ; + const char *jhCL = + #include "./opencl/jh.cl" + ; + const char *wolfAesCL = + #include "./opencl/wolf-aes.cl" + ; + const char *wolfSkeinCL = + #include "./opencl/wolf-skein.cl" + ; + + std::string source_code(cryptonightCL); + source_code = 
std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_WOLF_AES"), wolfAesCL); + source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_WOLF_SKEIN"), wolfSkeinCL); + source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_JH"), jhCL); + source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_BLAKE256"), blake256CL); + source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_GROESTL256"), groestl256CL); + + for(int i = 0; i < num_gpus; ++i) + { + if((ret = InitOpenCLGpu(opencl_ctx, &ctx[i], source_code.c_str())) != ERR_SUCCESS) + { + return ret; + } + } + + return ERR_SUCCESS; +} + +size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint32_t target) +{ + cl_int ret; + + if(input_len > 84) + return ERR_STUPID_PARAMS; + + input[input_len] = 0x01; + memset(input + input_len + 1, 0, 88 - input_len - 1); + + if((ret = clEnqueueWriteBuffer(ctx->CommandQueues, ctx->InputBuffer, CL_TRUE, 0, 88, input, 0, NULL, NULL)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clEnqueueWriteBuffer to fill input buffer.", err_to_str(ret)); + return ERR_OCL_API; + } + + if((ret = clSetKernelArg(ctx->Kernels[0], 0, sizeof(cl_mem), &ctx->InputBuffer)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 0.", err_to_str(ret)); + return ERR_OCL_API; + } + + // Scratchpads + if((ret = clSetKernelArg(ctx->Kernels[0], 1, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 1.", err_to_str(ret)); + return ERR_OCL_API; + } + + // States + if((ret = clSetKernelArg(ctx->Kernels[0], 2, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 2.", err_to_str(ret)); + return ERR_OCL_API; + } + + // CN2 Kernel + + // Scratchpads + if((ret = clSetKernelArg(ctx->Kernels[1], 0, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 1, argument 0.", err_to_str(ret)); + return ERR_OCL_API; + } + + // States + if((ret = clSetKernelArg(ctx->Kernels[1], 1, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 1, argument 1.", err_to_str(ret)); + return ERR_OCL_API; + } + + // CN3 Kernel + // Scratchpads + if((ret = clSetKernelArg(ctx->Kernels[2], 0, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 0.", err_to_str(ret)); + return ERR_OCL_API; + } + + // States + if((ret = clSetKernelArg(ctx->Kernels[2], 1, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 1.", err_to_str(ret)); + return ERR_OCL_API; + } + + // Branch 0 + if((ret = clSetKernelArg(ctx->Kernels[2], 2, sizeof(cl_mem), ctx->ExtraBuffers + 2)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 2.", err_to_str(ret)); + return ERR_OCL_API; + } + + // Branch 1 + if((ret = clSetKernelArg(ctx->Kernels[2], 3, sizeof(cl_mem), ctx->ExtraBuffers + 3)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 3.", err_to_str(ret)); + 
return ERR_OCL_API; + } + + // Branch 2 + if((ret = clSetKernelArg(ctx->Kernels[2], 4, sizeof(cl_mem), ctx->ExtraBuffers + 4)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 4.", err_to_str(ret)); + return ERR_OCL_API; + } + + // Branch 3 + if((ret = clSetKernelArg(ctx->Kernels[2], 5, sizeof(cl_mem), ctx->ExtraBuffers + 5)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 5.", err_to_str(ret)); + return ERR_OCL_API; + } + + for(int i = 0; i < 4; ++i) + { + // States + if((ret = clSetKernelArg(ctx->Kernels[i + 3], 0, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 0); + return ERR_OCL_API; + } + + // Nonce buffer + if((ret = clSetKernelArg(ctx->Kernels[i + 3], 1, sizeof(cl_mem), ctx->ExtraBuffers + (i + 2))) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 1); + return ERR_OCL_API; + } + + // Output + if((ret = clSetKernelArg(ctx->Kernels[i + 3], 2, sizeof(cl_mem), &ctx->OutputBuffer)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 2); + return ERR_OCL_API; + } + + // Target + if((ret = clSetKernelArg(ctx->Kernels[i + 3], 3, sizeof(cl_uint), &target)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 3); + return ERR_OCL_API; + } + } + + return ERR_SUCCESS; +} + +size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput) +{ + cl_int ret; + cl_uint zero = 0; + size_t BranchNonces[4]; + memset(BranchNonces,0,sizeof(size_t)*4); + + size_t g_thd = ctx->rawIntensity; + size_t w_size = ctx->workSize; + + for(int i = 2; i < 6; ++i) + { + if((ret = clEnqueueWriteBuffer(ctx->CommandQueues, ctx->ExtraBuffers[i], CL_FALSE, sizeof(cl_uint) * g_thd, sizeof(cl_uint), &zero, 0, NULL, NULL)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clEnqueueWriteBuffer to zero branch buffer counter %d.", err_to_str(ret), i - 2); + return ERR_OCL_API; + } + } + + if((ret = clEnqueueWriteBuffer(ctx->CommandQueues, ctx->OutputBuffer, CL_FALSE, sizeof(cl_uint) * 0xFF, sizeof(cl_uint), &zero, 0, NULL, NULL)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clEnqueueReadBuffer to fetch results.", err_to_str(ret)); + return ERR_OCL_API; + } + + clFinish(ctx->CommandQueues); + + size_t Nonce[2] = {ctx->Nonce, 1}, gthreads[2] = { g_thd, 8 }, lthreads[2] = { w_size, 8 }; + if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[0], 2, Nonce, gthreads, lthreads, 0, NULL, NULL)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 0); + return ERR_OCL_API; + } + + /*for(int i = 1; i < 3; ++i) + { + if((ret = clEnqueueNDRangeKernel(*ctx->CommandQueues, ctx->Kernels[i], 1, &ctx->Nonce, &g_thd, &w_size, 0, NULL, NULL)) != CL_SUCCESS) + { + Log(LOG_CRITICAL, "Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), i); + return(ERR_OCL_API); + } + }*/ + + if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[1], 1, &ctx->Nonce, &g_thd, &w_size, 0, NULL, NULL)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when 
calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 1); + return ERR_OCL_API; + } + + if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[2], 2, Nonce, gthreads, lthreads, 0, NULL, NULL)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 2); + return ERR_OCL_API; + } + + if((ret = clEnqueueReadBuffer(ctx->CommandQueues, ctx->ExtraBuffers[2], CL_FALSE, sizeof(cl_uint) * g_thd, sizeof(cl_uint), BranchNonces, 0, NULL, NULL)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clEnqueueReadBuffer to fetch results.", err_to_str(ret)); + return ERR_OCL_API; + } + + if((ret = clEnqueueReadBuffer(ctx->CommandQueues, ctx->ExtraBuffers[3], CL_FALSE, sizeof(cl_uint) * g_thd, sizeof(cl_uint), BranchNonces + 1, 0, NULL, NULL)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clEnqueueReadBuffer to fetch results.", err_to_str(ret)); + return ERR_OCL_API; + } + + if((ret = clEnqueueReadBuffer(ctx->CommandQueues, ctx->ExtraBuffers[4], CL_FALSE, sizeof(cl_uint) * g_thd, sizeof(cl_uint), BranchNonces + 2, 0, NULL, NULL)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clEnqueueReadBuffer to fetch results.", err_to_str(ret)); + return ERR_OCL_API; + } + + if((ret = clEnqueueReadBuffer(ctx->CommandQueues, ctx->ExtraBuffers[5], CL_FALSE, sizeof(cl_uint) * g_thd, sizeof(cl_uint), BranchNonces + 3, 0, NULL, NULL)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clEnqueueReadBuffer to fetch results.", err_to_str(ret)); + return ERR_OCL_API; + } + + clFinish(ctx->CommandQueues); + + for(int i = 0; i < 4; ++i) + { + if(BranchNonces[i]) + { + // Threads + if((clSetKernelArg(ctx->Kernels[i + 3], 4, sizeof(cl_ulong), BranchNonces + i)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 4); + return(ERR_OCL_API); + } + + BranchNonces[i] = ((size_t)ceil( (double)BranchNonces[i] / (double)w_size) ) * w_size; + if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[i + 3], 1, &ctx->Nonce, BranchNonces + i, &w_size, 0, NULL, NULL)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), i + 3); + return ERR_OCL_API; + } + } + } + + if((ret = clEnqueueReadBuffer(ctx->CommandQueues, ctx->OutputBuffer, CL_TRUE, 0, sizeof(cl_uint) * 0x100, HashOutput, 0, NULL, NULL)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clEnqueueReadBuffer to fetch results.", err_to_str(ret)); + return ERR_OCL_API; + } + + clFinish(ctx->CommandQueues); + ctx->Nonce += g_thd; + + return ERR_SUCCESS; +} diff --git a/xmrstak/backend/amd/amd_gpu/gpu.hpp b/xmrstak/backend/amd/amd_gpu/gpu.hpp new file mode 100644 index 0000000..8a71cfa --- /dev/null +++ b/xmrstak/backend/amd/amd_gpu/gpu.hpp @@ -0,0 +1,50 @@ +#pragma once + +#if defined(__APPLE__) +#include <OpenCL/cl.h> +#else +#include <CL/cl.h> +#endif + +#include <stdint.h> +#include <vector> +#include "../../../console.h" + +#define ERR_SUCCESS (0) +#define ERR_OCL_API (2) +#define ERR_STUPID_PARAMS (1) + + + +struct GpuContext +{ + /*Input vars*/ + size_t deviceIdx; + size_t rawIntensity; + size_t workSize; + + /*Output vars*/ + cl_device_id DeviceID; + cl_command_queue CommandQueues; + cl_mem InputBuffer; + cl_mem OutputBuffer; + cl_mem ExtraBuffers[6]; + cl_program Program; + cl_kernel 
Kernels[7]; + size_t freeMem; + int computeUnits; + std::string name; + + size_t Nonce; + +}; + +uint32_t getNumPlatforms(); +int getAMDPlatformIdx(); +std::vector<GpuContext> getAMDDevices(int index); + +size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx); +size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint32_t target); +size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput); + + diff --git a/xmrstak/backend/amd/amd_gpu/opencl/blake256.cl b/xmrstak/backend/amd/amd_gpu/opencl/blake256.cl new file mode 100644 index 0000000..3d5fe3e --- /dev/null +++ b/xmrstak/backend/amd/amd_gpu/opencl/blake256.cl @@ -0,0 +1,93 @@ +R"===( +/* +* blake256 kernel implementation. +* +* ==========================(LICENSE BEGIN)============================ +* Copyright (c) 2014 djm34 +* Copyright (c) 2014 tpruvot +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files (the +* "Software"), to deal in the Software without restriction, including +* without limitation the rights to use, copy, modify, merge, publish, +* distribute, sublicense, and/or sell copies of the Software, and to +* permit persons to whom the Software is furnished to do so, subject to +* the following conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+* +* ===========================(LICENSE END)============================= +* +* @author djm34 +*/ +__constant static const int sigma[16][16] = { + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, + { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, + { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, + { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, + { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, + { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, + { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } +}; + + +__constant static const sph_u32 c_IV256[8] = { + 0x6A09E667, 0xBB67AE85, + 0x3C6EF372, 0xA54FF53A, + 0x510E527F, 0x9B05688C, + 0x1F83D9AB, 0x5BE0CD19 +}; + +/* Second part (64-80) msg never change, store it */ +__constant static const sph_u32 c_Padding[16] = { + 0, 0, 0, 0, + 0x80000000, 0, 0, 0, + 0, 0, 0, 0, + 0, 1, 0, 640, +}; +__constant static const sph_u32 c_u256[16] = { + 0x243F6A88, 0x85A308D3, + 0x13198A2E, 0x03707344, + 0xA4093822, 0x299F31D0, + 0x082EFA98, 0xEC4E6C89, + 0x452821E6, 0x38D01377, + 0xBE5466CF, 0x34E90C6C, + 0xC0AC29B7, 0xC97C50DD, + 0x3F84D5B5, 0xB5470917 +}; + +#define GS(a,b,c,d,x) { \ + const sph_u32 idx1 = sigma[r][x]; \ + const sph_u32 idx2 = sigma[r][x+1]; \ + v[a] += (m[idx1] ^ c_u256[idx2]) + v[b]; \ + v[d] ^= v[a]; \ + v[d] = rotate(v[d], 16U); \ + v[c] += v[d]; \ + v[b] ^= v[c]; \ + v[b] = rotate(v[b], 20U); \ +\ + v[a] += (m[idx2] ^ c_u256[idx1]) + v[b]; \ + v[d] ^= v[a]; \ + v[d] = rotate(v[d], 24U); \ + v[c] += v[d]; \ + v[b] ^= v[c]; \ + v[b] = rotate(v[b], 25U); \ +} +)===" diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl new file mode 100644 index 0000000..1bb334a --- /dev/null +++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl @@ -0,0 +1,859 @@ +R"===( +/* + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifdef cl_amd_media_ops +#pragma OPENCL EXTENSION cl_amd_media_ops : enable +#else +/* taken from https://www.khronos.org/registry/OpenCL/extensions/amd/cl_amd_media_ops.txt + * Build-in Function + * uintn amd_bitalign (uintn src0, uintn src1, uintn src2) + * Description + * dst.s0 = (uint) (((((long)src0.s0) << 32) | (long)src1.s0) >> (src2.s0 & 31)) + * similar operation applied to other components of the vectors. 
+ * + * The implemented function is modified because the last is in our case always a scalar. + * We can ignore the bitwise AND operation. + */ +inline uint2 amd_bitalign( const uint2 src0, const uint2 src1, const uint src2) +{ + uint2 result; + result.s0 = (uint) (((((long)src0.s0) << 32) | (long)src1.s0) >> (src2)); + result.s1 = (uint) (((((long)src0.s1) << 32) | (long)src1.s1) >> (src2)); + return result; +} +#endif + +#ifdef cl_amd_media_ops2 +#pragma OPENCL EXTENSION cl_amd_media_ops2 : enable +#else +/* taken from: https://www.khronos.org/registry/OpenCL/extensions/amd/cl_amd_media_ops2.txt + * Built-in Function: + * uintn amd_bfe (uintn src0, uintn src1, uintn src2) + * Description + * NOTE: operator >> below represent logical right shift + * offset = src1.s0 & 31; + * width = src2.s0 & 31; + * if width = 0 + * dst.s0 = 0; + * else if (offset + width) < 32 + * dst.s0 = (src0.s0 << (32 - offset - width)) >> (32 - width); + * else + * dst.s0 = src0.s0 >> offset; + * similar operation applied to other components of the vectors + */ +inline int amd_bfe(const uint src0, const uint offset, const uint width) +{ + /* casts are removed because we can implement everything as uint + * int offset = src1; + * int width = src2; + * remove check for edge case, this function is always called with + * `width==8` + * @code + * if ( width == 0 ) + * return 0; + * @endcode + */ + if ( (offset + width) < 32u ) + return (src0 << (32u - offset - width)) >> (32u - width); + + return src0 >> offset; +} +#endif + +//#include "opencl/wolf-aes.cl" +XMRSTAK_INCLUDE_WOLF_AES +//#include "opencl/wolf-skein.cl" +XMRSTAK_INCLUDE_WOLF_SKEIN +//#include "opencl/jh.cl" +XMRSTAK_INCLUDE_JH +//#include "opencl/blake256.cl" +XMRSTAK_INCLUDE_BLAKE256 +//#include "opencl/groestl256.cl" +XMRSTAK_INCLUDE_GROESTL256 + +static const __constant ulong keccakf_rndc[24] = +{ + 0x0000000000000001, 0x0000000000008082, 0x800000000000808a, + 0x8000000080008000, 0x000000000000808b, 0x0000000080000001, + 0x8000000080008081, 0x8000000000008009, 0x000000000000008a, + 0x0000000000000088, 0x0000000080008009, 0x000000008000000a, + 0x000000008000808b, 0x800000000000008b, 0x8000000000008089, + 0x8000000000008003, 0x8000000000008002, 0x8000000000000080, + 0x000000000000800a, 0x800000008000000a, 0x8000000080008081, + 0x8000000000008080, 0x0000000080000001, 0x8000000080008008 +}; + +static const __constant uchar sbox[256] = +{ + 0x63, 0x7C, 0x77, 0x7B, 0xF2, 0x6B, 0x6F, 0xC5, 0x30, 0x01, 0x67, 0x2B, 0xFE, 0xD7, 0xAB, 0x76, + 0xCA, 0x82, 0xC9, 0x7D, 0xFA, 0x59, 0x47, 0xF0, 0xAD, 0xD4, 0xA2, 0xAF, 0x9C, 0xA4, 0x72, 0xC0, + 0xB7, 0xFD, 0x93, 0x26, 0x36, 0x3F, 0xF7, 0xCC, 0x34, 0xA5, 0xE5, 0xF1, 0x71, 0xD8, 0x31, 0x15, + 0x04, 0xC7, 0x23, 0xC3, 0x18, 0x96, 0x05, 0x9A, 0x07, 0x12, 0x80, 0xE2, 0xEB, 0x27, 0xB2, 0x75, + 0x09, 0x83, 0x2C, 0x1A, 0x1B, 0x6E, 0x5A, 0xA0, 0x52, 0x3B, 0xD6, 0xB3, 0x29, 0xE3, 0x2F, 0x84, + 0x53, 0xD1, 0x00, 0xED, 0x20, 0xFC, 0xB1, 0x5B, 0x6A, 0xCB, 0xBE, 0x39, 0x4A, 0x4C, 0x58, 0xCF, + 0xD0, 0xEF, 0xAA, 0xFB, 0x43, 0x4D, 0x33, 0x85, 0x45, 0xF9, 0x02, 0x7F, 0x50, 0x3C, 0x9F, 0xA8, + 0x51, 0xA3, 0x40, 0x8F, 0x92, 0x9D, 0x38, 0xF5, 0xBC, 0xB6, 0xDA, 0x21, 0x10, 0xFF, 0xF3, 0xD2, + 0xCD, 0x0C, 0x13, 0xEC, 0x5F, 0x97, 0x44, 0x17, 0xC4, 0xA7, 0x7E, 0x3D, 0x64, 0x5D, 0x19, 0x73, + 0x60, 0x81, 0x4F, 0xDC, 0x22, 0x2A, 0x90, 0x88, 0x46, 0xEE, 0xB8, 0x14, 0xDE, 0x5E, 0x0B, 0xDB, + 0xE0, 0x32, 0x3A, 0x0A, 0x49, 0x06, 0x24, 0x5C, 0xC2, 0xD3, 0xAC, 0x62, 0x91, 0x95, 0xE4, 0x79, + 0xE7, 0xC8, 0x37, 0x6D, 0x8D, 0xD5, 0x4E, 0xA9, 0x6C, 0x56, 
0xF4, 0xEA, 0x65, 0x7A, 0xAE, 0x08, + 0xBA, 0x78, 0x25, 0x2E, 0x1C, 0xA6, 0xB4, 0xC6, 0xE8, 0xDD, 0x74, 0x1F, 0x4B, 0xBD, 0x8B, 0x8A, + 0x70, 0x3E, 0xB5, 0x66, 0x48, 0x03, 0xF6, 0x0E, 0x61, 0x35, 0x57, 0xB9, 0x86, 0xC1, 0x1D, 0x9E, + 0xE1, 0xF8, 0x98, 0x11, 0x69, 0xD9, 0x8E, 0x94, 0x9B, 0x1E, 0x87, 0xE9, 0xCE, 0x55, 0x28, 0xDF, + 0x8C, 0xA1, 0x89, 0x0D, 0xBF, 0xE6, 0x42, 0x68, 0x41, 0x99, 0x2D, 0x0F, 0xB0, 0x54, 0xBB, 0x16 +}; + + +void keccakf1600(ulong *s) +{ + for(int i = 0; i < 24; ++i) + { + ulong bc[5], tmp1, tmp2; + bc[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20] ^ rotate(s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22], 1UL); + bc[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21] ^ rotate(s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23], 1UL); + bc[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22] ^ rotate(s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24], 1UL); + bc[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23] ^ rotate(s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20], 1UL); + bc[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24] ^ rotate(s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21], 1UL); + + tmp1 = s[1] ^ bc[0]; + + s[0] ^= bc[4]; + s[1] = rotate(s[6] ^ bc[0], 44UL); + s[6] = rotate(s[9] ^ bc[3], 20UL); + s[9] = rotate(s[22] ^ bc[1], 61UL); + s[22] = rotate(s[14] ^ bc[3], 39UL); + s[14] = rotate(s[20] ^ bc[4], 18UL); + s[20] = rotate(s[2] ^ bc[1], 62UL); + s[2] = rotate(s[12] ^ bc[1], 43UL); + s[12] = rotate(s[13] ^ bc[2], 25UL); + s[13] = rotate(s[19] ^ bc[3], 8UL); + s[19] = rotate(s[23] ^ bc[2], 56UL); + s[23] = rotate(s[15] ^ bc[4], 41UL); + s[15] = rotate(s[4] ^ bc[3], 27UL); + s[4] = rotate(s[24] ^ bc[3], 14UL); + s[24] = rotate(s[21] ^ bc[0], 2UL); + s[21] = rotate(s[8] ^ bc[2], 55UL); + s[8] = rotate(s[16] ^ bc[0], 35UL); + s[16] = rotate(s[5] ^ bc[4], 36UL); + s[5] = rotate(s[3] ^ bc[2], 28UL); + s[3] = rotate(s[18] ^ bc[2], 21UL); + s[18] = rotate(s[17] ^ bc[1], 15UL); + s[17] = rotate(s[11] ^ bc[0], 10UL); + s[11] = rotate(s[7] ^ bc[1], 6UL); + s[7] = rotate(s[10] ^ bc[4], 3UL); + s[10] = rotate(tmp1, 1UL); + + tmp1 = s[0]; tmp2 = s[1]; s[0] = bitselect(s[0] ^ s[2], s[0], s[1]); s[1] = bitselect(s[1] ^ s[3], s[1], s[2]); s[2] = bitselect(s[2] ^ s[4], s[2], s[3]); s[3] = bitselect(s[3] ^ tmp1, s[3], s[4]); s[4] = bitselect(s[4] ^ tmp2, s[4], tmp1); + tmp1 = s[5]; tmp2 = s[6]; s[5] = bitselect(s[5] ^ s[7], s[5], s[6]); s[6] = bitselect(s[6] ^ s[8], s[6], s[7]); s[7] = bitselect(s[7] ^ s[9], s[7], s[8]); s[8] = bitselect(s[8] ^ tmp1, s[8], s[9]); s[9] = bitselect(s[9] ^ tmp2, s[9], tmp1); + tmp1 = s[10]; tmp2 = s[11]; s[10] = bitselect(s[10] ^ s[12], s[10], s[11]); s[11] = bitselect(s[11] ^ s[13], s[11], s[12]); s[12] = bitselect(s[12] ^ s[14], s[12], s[13]); s[13] = bitselect(s[13] ^ tmp1, s[13], s[14]); s[14] = bitselect(s[14] ^ tmp2, s[14], tmp1); + tmp1 = s[15]; tmp2 = s[16]; s[15] = bitselect(s[15] ^ s[17], s[15], s[16]); s[16] = bitselect(s[16] ^ s[18], s[16], s[17]); s[17] = bitselect(s[17] ^ s[19], s[17], s[18]); s[18] = bitselect(s[18] ^ tmp1, s[18], s[19]); s[19] = bitselect(s[19] ^ tmp2, s[19], tmp1); + tmp1 = s[20]; tmp2 = s[21]; s[20] = bitselect(s[20] ^ s[22], s[20], s[21]); s[21] = bitselect(s[21] ^ s[23], s[21], s[22]); s[22] = bitselect(s[22] ^ s[24], s[22], s[23]); s[23] = bitselect(s[23] ^ tmp1, s[23], s[24]); s[24] = bitselect(s[24] ^ tmp2, s[24], tmp1); + s[0] ^= keccakf_rndc[i]; + } +} + +static const __constant uint keccakf_rotc[24] = +{ + 1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 2, 14, + 27, 41, 56, 8, 25, 43, 62, 18, 39, 61, 20, 44 +}; + +static const __constant uint keccakf_piln[24] = +{ + 10, 7, 11, 17, 18, 3, 5, 16, 8, 21, 24, 4, + 15, 23, 19, 
13, 12, 2, 20, 14, 22, 9, 6, 1 +}; + +void keccakf1600_1(ulong *st) +{ + int i, round; + ulong t, bc[5]; + + #pragma unroll 1 + for(round = 0; round < 24; ++round) + { + + // Theta + bc[0] = st[0] ^ st[5] ^ st[10] ^ st[15] ^ st[20]; + bc[1] = st[1] ^ st[6] ^ st[11] ^ st[16] ^ st[21]; + bc[2] = st[2] ^ st[7] ^ st[12] ^ st[17] ^ st[22]; + bc[3] = st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23]; + bc[4] = st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24]; + + #pragma unroll 1 + for (i = 0; i < 5; ++i) { + t = bc[(i + 4) % 5] ^ rotate(bc[(i + 1) % 5], 1UL); + st[i ] ^= t; + st[i + 5] ^= t; + st[i + 10] ^= t; + st[i + 15] ^= t; + st[i + 20] ^= t; + } + + // Rho Pi + t = st[1]; + #pragma unroll + for (i = 0; i < 24; ++i) { + bc[0] = st[keccakf_piln[i]]; + st[keccakf_piln[i]] = rotate(t, (ulong)keccakf_rotc[i]); + t = bc[0]; + } + + //ulong tmp1 = st[0]; ulong tmp2 = st[1]; st[0] = bitselect(st[0] ^ st[2], st[0], st[1]); st[1] = bitselect(st[1] ^ st[3], st[1], st[2]); st[2] = bitselect(st[2] ^ st[4], st[2], st[3]); st[3] = bitselect(st[3] ^ tmp1, st[3], st[4]); st[4] = bitselect(st[4] ^ tmp2, st[4], tmp1); + //tmp1 = st[5]; tmp2 = st[6]; st[5] = bitselect(st[5] ^ st[7], st[5], st[6]); st[6] = bitselect(st[6] ^ st[8], st[6], st[7]); st[7] = bitselect(st[7] ^ st[9], st[7], st[8]); st[8] = bitselect(st[8] ^ tmp1, st[8], st[9]); st[9] = bitselect(st[9] ^ tmp2, st[9], tmp1); + //tmp1 = st[10]; tmp2 = st[11]; st[10] = bitselect(st[10] ^ st[12], st[10], st[11]); st[11] = bitselect(st[11] ^ st[13], st[11], st[12]); st[12] = bitselect(st[12] ^ st[14], st[12], st[13]); st[13] = bitselect(st[13] ^ tmp1, st[13], st[14]); st[14] = bitselect(st[14] ^ tmp2, st[14], tmp1); + //tmp1 = st[15]; tmp2 = st[16]; st[15] = bitselect(st[15] ^ st[17], st[15], st[16]); st[16] = bitselect(st[16] ^ st[18], st[16], st[17]); st[17] = bitselect(st[17] ^ st[19], st[17], st[18]); st[18] = bitselect(st[18] ^ tmp1, st[18], st[19]); st[19] = bitselect(st[19] ^ tmp2, st[19], tmp1); + //tmp1 = st[20]; tmp2 = st[21]; st[20] = bitselect(st[20] ^ st[22], st[20], st[21]); st[21] = bitselect(st[21] ^ st[23], st[21], st[22]); st[22] = bitselect(st[22] ^ st[24], st[22], st[23]); st[23] = bitselect(st[23] ^ tmp1, st[23], st[24]); st[24] = bitselect(st[24] ^ tmp2, st[24], tmp1); + + #pragma unroll 1 + for(int i = 0; i < 25; i += 5) + { + ulong tmp[5]; + + #pragma unroll 1 + for(int x = 0; x < 5; ++x) + tmp[x] = bitselect(st[i + x] ^ st[i + ((x + 2) % 5)], st[i + x], st[i + ((x + 1) % 5)]); + + #pragma unroll 1 + for(int x = 0; x < 5; ++x) st[i + x] = tmp[x]; + } + + // Iota + st[0] ^= keccakf_rndc[round]; + } +} + +void keccakf1600_2(ulong *st) +{ + int i, round; + ulong t, bc[5]; + + #pragma unroll 1 + for(round = 0; round < 24; ++round) + { + + // Theta + //bc[0] = st[0] ^ st[5] ^ st[10] ^ st[15] ^ st[20]; + //bc[1] = st[1] ^ st[6] ^ st[11] ^ st[16] ^ st[21]; + //bc[2] = st[2] ^ st[7] ^ st[12] ^ st[17] ^ st[22]; + //bc[3] = st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23]; + //bc[4] = st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24]; + + /* + #pragma unroll + for (i = 0; i < 5; ++i) { + t = bc[(i + 4) % 5] ^ rotate(bc[(i + 1) % 5], 1UL); + st[i ] ^= t; + st[i + 5] ^= t; + st[i + 10] ^= t; + st[i + 15] ^= t; + st[i + 20] ^= t; + } + */ + + bc[0] = st[0] ^ st[5] ^ st[10] ^ st[15] ^ st[20] ^ rotate(st[2] ^ st[7] ^ st[12] ^ st[17] ^ st[22], 1UL); + bc[1] = st[1] ^ st[6] ^ st[11] ^ st[16] ^ st[21] ^ rotate(st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23], 1UL); + bc[2] = st[2] ^ st[7] ^ st[12] ^ st[17] ^ st[22] ^ rotate(st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24], 1UL); + bc[3] = st[3] ^ 
st[8] ^ st[13] ^ st[18] ^ st[23] ^ rotate(st[0] ^ st[5] ^ st[10] ^ st[15] ^ st[20], 1UL); + bc[4] = st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24] ^ rotate(st[1] ^ st[6] ^ st[11] ^ st[16] ^ st[21], 1UL); + + st[0] ^= bc[4]; + st[5] ^= bc[4]; + st[10] ^= bc[4]; + st[15] ^= bc[4]; + st[20] ^= bc[4]; + + st[1] ^= bc[0]; + st[6] ^= bc[0]; + st[11] ^= bc[0]; + st[16] ^= bc[0]; + st[21] ^= bc[0]; + + st[2] ^= bc[1]; + st[7] ^= bc[1]; + st[12] ^= bc[1]; + st[17] ^= bc[1]; + st[22] ^= bc[1]; + + st[3] ^= bc[2]; + st[8] ^= bc[2]; + st[13] ^= bc[2]; + st[18] ^= bc[2]; + st[23] ^= bc[2]; + + st[4] ^= bc[3]; + st[9] ^= bc[3]; + st[14] ^= bc[3]; + st[19] ^= bc[3]; + st[24] ^= bc[3]; + + // Rho Pi + t = st[1]; + #pragma unroll + for (i = 0; i < 24; ++i) { + bc[0] = st[keccakf_piln[i]]; + st[keccakf_piln[i]] = rotate(t, (ulong)keccakf_rotc[i]); + t = bc[0]; + } + + + + /*ulong tmp1 = st[1] ^ bc[0]; + + st[0] ^= bc[4]; + st[1] = rotate(st[6] ^ bc[0], 44UL); + st[6] = rotate(st[9] ^ bc[3], 20UL); + st[9] = rotate(st[22] ^ bc[1], 61UL); + st[22] = rotate(st[14] ^ bc[3], 39UL); + st[14] = rotate(st[20] ^ bc[4], 18UL); + st[20] = rotate(st[2] ^ bc[1], 62UL); + st[2] = rotate(st[12] ^ bc[1], 43UL); + st[12] = rotate(st[13] ^ bc[2], 25UL); + st[13] = rotate(st[19] ^ bc[3], 8UL); + st[19] = rotate(st[23] ^ bc[2], 56UL); + st[23] = rotate(st[15] ^ bc[4], 41UL); + st[15] = rotate(st[4] ^ bc[3], 27UL); + st[4] = rotate(st[24] ^ bc[3], 14UL); + st[24] = rotate(st[21] ^ bc[0], 2UL); + st[21] = rotate(st[8] ^ bc[2], 55UL); + st[8] = rotate(st[16] ^ bc[0], 35UL); + st[16] = rotate(st[5] ^ bc[4], 36UL); + st[5] = rotate(st[3] ^ bc[2], 28UL); + st[3] = rotate(st[18] ^ bc[2], 21UL); + st[18] = rotate(st[17] ^ bc[1], 15UL); + st[17] = rotate(st[11] ^ bc[0], 10UL); + st[11] = rotate(st[7] ^ bc[1], 6UL); + st[7] = rotate(st[10] ^ bc[4], 3UL); + st[10] = rotate(tmp1, 1UL); + */ + + + //ulong tmp1 = st[0]; ulong tmp2 = st[1]; st[0] = bitselect(st[0] ^ st[2], st[0], st[1]); st[1] = bitselect(st[1] ^ st[3], st[1], st[2]); st[2] = bitselect(st[2] ^ st[4], st[2], st[3]); st[3] = bitselect(st[3] ^ tmp1, st[3], st[4]); st[4] = bitselect(st[4] ^ tmp2, st[4], tmp1); + //tmp1 = st[5]; tmp2 = st[6]; st[5] = bitselect(st[5] ^ st[7], st[5], st[6]); st[6] = bitselect(st[6] ^ st[8], st[6], st[7]); st[7] = bitselect(st[7] ^ st[9], st[7], st[8]); st[8] = bitselect(st[8] ^ tmp1, st[8], st[9]); st[9] = bitselect(st[9] ^ tmp2, st[9], tmp1); + //tmp1 = st[10]; tmp2 = st[11]; st[10] = bitselect(st[10] ^ st[12], st[10], st[11]); st[11] = bitselect(st[11] ^ st[13], st[11], st[12]); st[12] = bitselect(st[12] ^ st[14], st[12], st[13]); st[13] = bitselect(st[13] ^ tmp1, st[13], st[14]); st[14] = bitselect(st[14] ^ tmp2, st[14], tmp1); + //tmp1 = st[15]; tmp2 = st[16]; st[15] = bitselect(st[15] ^ st[17], st[15], st[16]); st[16] = bitselect(st[16] ^ st[18], st[16], st[17]); st[17] = bitselect(st[17] ^ st[19], st[17], st[18]); st[18] = bitselect(st[18] ^ tmp1, st[18], st[19]); st[19] = bitselect(st[19] ^ tmp2, st[19], tmp1); + //tmp1 = st[20]; tmp2 = st[21]; st[20] = bitselect(st[20] ^ st[22], st[20], st[21]); st[21] = bitselect(st[21] ^ st[23], st[21], st[22]); st[22] = bitselect(st[22] ^ st[24], st[22], st[23]); st[23] = bitselect(st[23] ^ tmp1, st[23], st[24]); st[24] = bitselect(st[24] ^ tmp2, st[24], tmp1); + + #pragma unroll + for(int i = 0; i < 25; i += 5) + { + ulong tmp1 = st[i], tmp2 = st[i + 1]; + + st[i] = bitselect(st[i] ^ st[i + 2], st[i], st[i + 1]); + st[i + 1] = bitselect(st[i + 1] ^ st[i + 3], st[i + 1], st[i + 2]); + st[i + 2] = 
bitselect(st[i + 2] ^ st[i + 4], st[i + 2], st[i + 3]); + st[i + 3] = bitselect(st[i + 3] ^ tmp1, st[i + 3], st[i + 4]); + st[i + 4] = bitselect(st[i + 4] ^ tmp2, st[i + 4], tmp1); + } + + // Iota + st[0] ^= keccakf_rndc[round]; + } +} + +)===" +R"===( + +void CNKeccak(ulong *output, ulong *input) +{ + ulong st[25]; + + // Copy 72 bytes + for(int i = 0; i < 9; ++i) st[i] = input[i]; + + // Last four and '1' bit for padding + //st[9] = as_ulong((uint2)(((uint *)input)[18], 0x00000001U)); + + st[9] = (input[9] & 0x00000000FFFFFFFFUL) | 0x0000000100000000UL; + + for(int i = 10; i < 25; ++i) st[i] = 0x00UL; + + // Last bit of padding + st[16] = 0x8000000000000000UL; + + keccakf1600_1(st); + + for(int i = 0; i < 25; ++i) output[i] = st[i]; +} + +static const __constant uchar rcon[8] = { 0x8d, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40 }; + +#define BYTE(x, y) (amd_bfe((x), (y) << 3U, 8U)) + +#define SubWord(inw) ((sbox[BYTE(inw, 3)] << 24) | (sbox[BYTE(inw, 2)] << 16) | (sbox[BYTE(inw, 1)] << 8) | sbox[BYTE(inw, 0)]) + +void AESExpandKey256(uint *keybuf) +{ + //#pragma unroll 4 + for(uint c = 8, i = 1; c < 60; ++c) + { + // For 256-bit keys, an sbox permutation is done every other 4th uint generated, AND every 8th + uint t = ((!(c & 7)) || ((c & 7) == 4)) ? SubWord(keybuf[c - 1]) : keybuf[c - 1]; + + // If the uint we're generating has an index that is a multiple of 8, rotate and XOR with the round constant, + // then XOR this with previously generated uint. If it's 4 after a multiple of 8, only the sbox permutation + // is done, followed by the XOR. If neither are true, only the XOR with the previously generated uint is done. + keybuf[c] = keybuf[c - 8] ^ ((!(c & 7)) ? rotate(t, 24U) ^ as_uint((uchar4)(rcon[i++], 0U, 0U, 0U)) : t); + } +} + +#define IDX(x) (x) + +__attribute__((reqd_work_group_size(WORKSIZE, 8, 1))) +__kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ulong *states) +{ + ulong State[25]; + uint ExpandedKey1[256]; + __local uint AES0[256], AES1[256], AES2[256], AES3[256]; + uint4 text; + + states += (25 * (get_global_id(0) - get_global_offset(0))); + Scratchpad += ((get_global_id(0) - get_global_offset(0))) * (0x80000 >> 2); + + for(int i = get_local_id(0); i < 256; i += WORKSIZE) + { + const uint tmp = AES0_C[i]; + AES0[i] = tmp; + AES1[i] = rotate(tmp, 8U); + AES2[i] = rotate(tmp, 16U); + AES3[i] = rotate(tmp, 24U); + } + barrier(CLK_LOCAL_MEM_FENCE); + + ((ulong8 *)State)[0] = vload8(0, input); + State[8] = input[8]; + State[9] = input[9]; + State[10] = input[10]; + + ((uint *)State)[9] &= 0x00FFFFFFU; + ((uint *)State)[9] |= ((get_global_id(0)) & 0xFF) << 24; + ((uint *)State)[10] &= 0xFF000000U; + ((uint *)State)[10] |= ((get_global_id(0) >> 8)); + + for(int i = 11; i < 25; ++i) State[i] = 0x00UL; + + // Last bit of padding + State[16] = 0x8000000000000000UL; + + keccakf1600_2(State); + + mem_fence(CLK_GLOBAL_MEM_FENCE); + + #pragma unroll + for(int i = 0; i < 25; ++i) states[i] = State[i]; + + text = vload4(get_local_id(1) + 4, (__global uint *)(states)); + + #pragma unroll + for(int i = 0; i < 4; ++i) ((ulong *)ExpandedKey1)[i] = states[i]; + + AESExpandKey256(ExpandedKey1); + + mem_fence(CLK_LOCAL_MEM_FENCE); + + #pragma unroll 2 + for(int i = 0; i < 0x4000; ++i) + { + #pragma unroll + for(int j = 0; j < 10; ++j) + text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey1)[j]); + + Scratchpad[IDX((i << 3) + get_local_id(1))] = text; + } + + mem_fence(CLK_GLOBAL_MEM_FENCE); +} + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) 
+__kernel void cn1(__global uint4 *Scratchpad, __global ulong *states) +{ + ulong a[2], b[2]; + __local uint AES0[256], AES1[256], AES2[256], AES3[256]; + + Scratchpad += ((get_global_id(0) - get_global_offset(0))) * (0x80000 >> 2); + states += (25 * (get_global_id(0) - get_global_offset(0))); + + for(int i = get_local_id(0); i < 256; i += WORKSIZE) + { + const uint tmp = AES0_C[i]; + AES0[i] = tmp; + AES1[i] = rotate(tmp, 8U); + AES2[i] = rotate(tmp, 16U); + AES3[i] = rotate(tmp, 24U); + } + barrier(CLK_LOCAL_MEM_FENCE); + + a[0] = states[0] ^ states[4]; + b[0] = states[2] ^ states[6]; + a[1] = states[1] ^ states[5]; + b[1] = states[3] ^ states[7]; + + uint4 b_x = ((uint4 *)b)[0]; + + mem_fence(CLK_LOCAL_MEM_FENCE); + + #pragma unroll 8 + for(int i = 0; i < 0x80000; ++i) + { + ulong c[2]; + + ((uint4 *)c)[0] = Scratchpad[IDX((a[0] & 0x1FFFF0) >> 4)]; + ((uint4 *)c)[0] = AES_Round(AES0, AES1, AES2, AES3, ((uint4 *)c)[0], ((uint4 *)a)[0]); + //b_x ^= ((uint4 *)c)[0]; + + Scratchpad[IDX((a[0] & 0x1FFFF0) >> 4)] = b_x ^ ((uint4 *)c)[0]; + + uint4 tmp; + tmp = Scratchpad[IDX((c[0] & 0x1FFFF0) >> 4)]; + + a[1] += c[0] * as_ulong2(tmp).s0; + a[0] += mul_hi(c[0], as_ulong2(tmp).s0); + + Scratchpad[IDX((c[0] & 0x1FFFF0) >> 4)] = ((uint4 *)a)[0]; + + ((uint4 *)a)[0] ^= tmp; + + b_x = ((uint4 *)c)[0]; + } + + mem_fence(CLK_GLOBAL_MEM_FENCE); +} + +__attribute__((reqd_work_group_size(WORKSIZE, 8, 1))) +__kernel void cn2(__global uint4 *Scratchpad, __global ulong *states, __global uint *Branch0, __global uint *Branch1, __global uint *Branch2, __global uint *Branch3) +{ + __local uint AES0[256], AES1[256], AES2[256], AES3[256]; + uint ExpandedKey2[256]; + ulong State[25]; + uint4 text; + + Scratchpad += ((get_global_id(0) - get_global_offset(0))) * (0x80000 >> 2); + states += (25 * (get_global_id(0) - get_global_offset(0))); + + for(int i = get_local_id(0); i < 256; i += WORKSIZE) + { + const uint tmp = AES0_C[i]; + AES0[i] = tmp; + AES1[i] = rotate(tmp, 8U); + AES2[i] = rotate(tmp, 16U); + AES3[i] = rotate(tmp, 24U); + } + barrier(CLK_LOCAL_MEM_FENCE); + + #if defined(__Tahiti__) || defined(__Pitcairn__) + + for(int i = 0; i < 4; ++i) ((ulong *)ExpandedKey2)[i] = states[i + 4]; + text = vload4(get_local_id(1) + 4, (__global uint *)states); + + #else + + text = vload4(get_local_id(1) + 4, (__global uint *)states); + ((uint8 *)ExpandedKey2)[0] = vload8(1, (__global uint *)states); + + #endif + + AESExpandKey256(ExpandedKey2); + + barrier(CLK_LOCAL_MEM_FENCE); + + #pragma unroll 2 + for(int i = 0; i < 0x4000; ++i) + { + text ^= Scratchpad[IDX((i << 3) + get_local_id(1))]; + + #pragma unroll + for(int j = 0; j < 10; ++j) + text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]); + } + + vstore2(as_ulong2(text), get_local_id(1) + 4, states); + + barrier(CLK_GLOBAL_MEM_FENCE); + + if(!get_local_id(1)) + { + for(int i = 0; i < 25; ++i) State[i] = states[i]; + + keccakf1600_2(State); + + for(int i = 0; i < 25; ++i) states[i] = State[i]; + + switch(State[0] & 3) + { + case 0: + Branch0[atomic_inc(Branch0 + get_global_size(0))] = get_global_id(0) - get_global_offset(0); + break; + case 1: + Branch1[atomic_inc(Branch1 + get_global_size(0))] = get_global_id(0) - get_global_offset(0); + break; + case 2: + Branch2[atomic_inc(Branch2 + get_global_size(0))] = get_global_id(0) - get_global_offset(0); + break; + case 3: + Branch3[atomic_inc(Branch3 + get_global_size(0))] = get_global_id(0) - get_global_offset(0); + break; + } + } + + mem_fence(CLK_GLOBAL_MEM_FENCE); +} + +)===" +R"===( + +#define 
VSWAP8(x) (((x) >> 56) | (((x) >> 40) & 0x000000000000FF00UL) | (((x) >> 24) & 0x0000000000FF0000UL) \ + | (((x) >> 8) & 0x00000000FF000000UL) | (((x) << 8) & 0x000000FF00000000UL) \ + | (((x) << 24) & 0x0000FF0000000000UL) | (((x) << 40) & 0x00FF000000000000UL) | (((x) << 56) & 0xFF00000000000000UL)) + +#define VSWAP4(x) ((((x) >> 24) & 0xFFU) | (((x) >> 8) & 0xFF00U) | (((x) << 8) & 0xFF0000U) | (((x) << 24) & 0xFF000000U)) + +__kernel void Skein(__global ulong *states, __global uint *BranchBuf, __global uint *output, uint Target, ulong Threads) +{ + const ulong idx = get_global_id(0) - get_global_offset(0); + + if(idx >= Threads) return; + + states += 25 * BranchBuf[idx]; + + // skein + ulong8 h = vload8(0, SKEIN512_256_IV); + + // Type field begins with final bit, first bit, then six bits of type; the last 96 + // bits are input processed (including in the block to be processed with that tweak) + // The output transform is only one run of UBI, since we need only 256 bits of output + // The tweak for the output transform is Type = Output with the Final bit set + // T[0] for the output is 8, and I don't know why - should be message size... + ulong t[3] = { 0x00UL, 0x7000000000000000UL, 0x00UL }; + ulong8 p, m; + + for(uint i = 0; i < 4; ++i) + { + if(i < 3) t[0] += 0x40UL; + else t[0] += 0x08UL; + + t[2] = t[0] ^ t[1]; + + m = (i < 3) ? vload8(i, states) : (ulong8)(states[24], 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL); + const ulong h8 = h.s0 ^ h.s1 ^ h.s2 ^ h.s3 ^ h.s4 ^ h.s5 ^ h.s6 ^ h.s7 ^ SKEIN_KS_PARITY; + p = Skein512Block(m, h, h8, t); + + h = m ^ p; + + if(i < 2) t[1] = 0x3000000000000000UL; + else t[1] = 0xB000000000000000UL; + } + + t[0] = 0x08UL; + t[1] = 0xFF00000000000000UL; + t[2] = t[0] ^ t[1]; + + p = (ulong8)(0); + const ulong h8 = h.s0 ^ h.s1 ^ h.s2 ^ h.s3 ^ h.s4 ^ h.s5 ^ h.s6 ^ h.s7 ^ SKEIN_KS_PARITY; + + p = Skein512Block(p, h, h8, t); + + //vstore8(p, 0, output); + + if(as_uint16(p).s7 <= Target) output[atomic_inc(output + 0xFF)] = BranchBuf[idx] + get_global_offset(0); + + mem_fence(CLK_GLOBAL_MEM_FENCE); +} + +#define SWAP8(x) as_ulong(as_uchar8(x).s76543210) + +__kernel void JH(__global ulong *states, __global uint *BranchBuf, __global uint *output, uint Target, ulong Threads) +{ + const uint idx = get_global_id(0) - get_global_offset(0); + + if(idx >= Threads) return; + + states += 25 * BranchBuf[idx]; + + sph_u64 h0h = 0xEBD3202C41A398EBUL, h0l = 0xC145B29C7BBECD92UL, h1h = 0xFAC7D4609151931CUL, h1l = 0x038A507ED6820026UL, h2h = 0x45B92677269E23A4UL, h2l = 0x77941AD4481AFBE0UL, h3h = 0x7A176B0226ABB5CDUL, h3l = 0xA82FFF0F4224F056UL; + sph_u64 h4h = 0x754D2E7F8996A371UL, h4l = 0x62E27DF70849141DUL, h5h = 0x948F2476F7957627UL, h5l = 0x6C29804757B6D587UL, h6h = 0x6C0D8EAC2D275E5CUL, h6l = 0x0F7A0557C6508451UL, h7h = 0xEA12247067D3E47BUL, h7l = 0x69D71CD313ABE389UL; + sph_u64 tmp; + + for(int i = 0; i < 5; ++i) + { + ulong input[8]; + + if(i < 3) + { + for(int x = 0; x < 8; ++x) input[x] = (states[(i << 3) + x]); + } + else if(i == 3) + { + input[0] = (states[24]); + input[1] = 0x80UL; + for(int x = 2; x < 8; ++x) input[x] = 0x00UL; + } + else + { + input[7] = 0x4006000000000000UL; + + for(int x = 0; x < 7; ++x) input[x] = 0x00UL; + } + + h0h ^= input[0]; + h0l ^= input[1]; + h1h ^= input[2]; + h1l ^= input[3]; + h2h ^= input[4]; + h2l ^= input[5]; + h3h ^= input[6]; + h3l ^= input[7]; + + E8; + + h4h ^= input[0]; + h4l ^= input[1]; + h5h ^= input[2]; + h5l ^= input[3]; + h6h ^= input[4]; + h6l ^= input[5]; + h7h ^= input[6]; + h7l ^= input[7]; + } + + //output[0] = h6h; + 
//output[1] = h6l; + //output[2] = h7h; + //output[3] = h7l; + + if(as_uint2(h7l).s1 <= Target) output[atomic_inc(output + 0xFF)] = BranchBuf[idx] + get_global_offset(0); +} + +#define SWAP4(x) as_uint(as_uchar4(x).s3210) + +__kernel void Blake(__global ulong *states, __global uint *BranchBuf, __global uint *output, uint Target, ulong Threads) +{ + const uint idx = get_global_id(0) - get_global_offset(0); + + if(idx >= Threads) return; + + states += 25 * BranchBuf[idx]; + + unsigned int m[16]; + unsigned int v[16]; + uint h[8]; + + ((uint8 *)h)[0] = vload8(0U, c_IV256); + + for(uint i = 0, bitlen = 0; i < 4; ++i) + { + if(i < 3) + { + ((uint16 *)m)[0] = vload16(i, (__global uint *)states); + for(int i = 0; i < 16; ++i) m[i] = SWAP4(m[i]); + bitlen += 512; + } + else + { + m[0] = SWAP4(((__global uint *)states)[48]); + m[1] = SWAP4(((__global uint *)states)[49]); + m[2] = 0x80000000U; + + for(int i = 3; i < 13; ++i) m[i] = 0x00U; + + m[13] = 1U; + m[14] = 0U; + m[15] = 0x640; + bitlen += 64; + } + + ((uint16 *)v)[0].lo = ((uint8 *)h)[0]; + ((uint16 *)v)[0].hi = vload8(0U, c_u256); + + //v[12] ^= (i < 3) ? (i + 1) << 9 : 1600U; + //v[13] ^= (i < 3) ? (i + 1) << 9 : 1600U; + + v[12] ^= bitlen; + v[13] ^= bitlen; + + for(int r = 0; r < 14; r++) + { + GS(0, 4, 0x8, 0xC, 0x0); + GS(1, 5, 0x9, 0xD, 0x2); + GS(2, 6, 0xA, 0xE, 0x4); + GS(3, 7, 0xB, 0xF, 0x6); + GS(0, 5, 0xA, 0xF, 0x8); + GS(1, 6, 0xB, 0xC, 0xA); + GS(2, 7, 0x8, 0xD, 0xC); + GS(3, 4, 0x9, 0xE, 0xE); + } + + ((uint8 *)h)[0] ^= ((uint8 *)v)[0] ^ ((uint8 *)v)[1]; + } + + for(int i = 0; i < 8; ++i) h[i] = SWAP4(h[i]); + + //for(int i = 0; i < 4; ++i) output[i] = ((ulong *)h)[i]; + if(h[7] <= Target) output[atomic_inc(output + 0xFF)] = BranchBuf[idx] + get_global_offset(0); +} + +__kernel void Groestl(__global ulong *states, __global uint *BranchBuf, __global uint *output, uint Target, ulong Threads) +{ + const uint idx = get_global_id(0) - get_global_offset(0); + + if(idx >= Threads) return; + + states += 25 * BranchBuf[idx]; + + ulong State[8]; + + for(int i = 0; i < 7; ++i) State[i] = 0UL; + + State[7] = 0x0001000000000000UL; + + for(uint i = 0; i < 4; ++i) + { + ulong H[8], M[8]; + + if(i < 3) + { + ((ulong8 *)M)[0] = vload8(i, states); + } + else + { + M[0] = states[24]; + M[1] = 0x80UL; + + for(int x = 2; x < 7; ++x) M[x] = 0UL; + + M[7] = 0x0400000000000000UL; + } + + for(int x = 0; x < 8; ++x) H[x] = M[x] ^ State[x]; + + PERM_SMALL_P(H); + PERM_SMALL_Q(M); + + for(int x = 0; x < 8; ++x) State[x] ^= H[x] ^ M[x]; + } + + ulong tmp[8]; + + for(int i = 0; i < 8; ++i) tmp[i] = State[i]; + + PERM_SMALL_P(State); + + for(int i = 0; i < 8; ++i) State[i] ^= tmp[i]; + + //for(int i = 0; i < 4; ++i) output[i] = State[i + 4]; + if(as_uint2(State[7]).s1 <= Target) output[atomic_inc(output + 0xFF)] = BranchBuf[idx] + get_global_offset(0); +} + +)==="
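Note on the cn1/cn2 kernels above: the main loop addresses the 2 MiB scratchpad with (a[0] & 0x1FFFF0) >> 4, i.e. it keeps 17 index bits of the mixing value, drops the low four bits for 16-byte alignment, and converts the byte offset into a uint4 index, while cn2 routes each hash to one of four finalisation kernels through State[0] & 3 (0 = BLAKE-256, 1 = Groestl-256, 2 = JH-256, 3 = Skein-256 in the reference CryptoNight final-hash table). A minimal host-side C++ sketch of both selections, for illustration only; the helper names below are not part of the committed code:

// Mirrors the index math used by cn1/cn2 above; illustrative only.
#include <cstdint>
#include <cstdio>

// cn1: a 64-bit mixing value selects one 16-byte line of the 2 MiB scratchpad.
static uint32_t scratchpad_line(uint64_t a0)
{
	// 0x1FFFF0 keeps 17 index bits and forces 16-byte alignment; >> 4 turns the
	// byte offset into a uint4 index in the range 0 .. 0x1FFFF (131072 lines = 2 MiB).
	return static_cast<uint32_t>((a0 & 0x1FFFF0u) >> 4);
}

// cn2: the low two bits of the final Keccak state pick the extra hash.
static const char* final_hash(uint64_t state0)
{
	static const char* const names[4] = { "BLAKE-256", "Groestl-256", "JH-256", "Skein-256" };
	return names[state0 & 3];
}

int main()
{
	const uint64_t sample = 0x123456789ABCDEF0ull;
	std::printf("line %u, finalizer %s\n", scratchpad_line(sample), final_hash(sample));
	return 0;
}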
\ No newline at end of file diff --git a/xmrstak/backend/amd/amd_gpu/opencl/groestl256.cl b/xmrstak/backend/amd/amd_gpu/opencl/groestl256.cl new file mode 100644 index 0000000..1a7c96f --- /dev/null +++ b/xmrstak/backend/amd/amd_gpu/opencl/groestl256.cl @@ -0,0 +1,295 @@ +R"===( +/* $Id: groestl.c 260 2011-07-21 01:02:38Z tp $ */ +/* + * Groestl256 + * + * ==========================(LICENSE BEGIN)============================ + * Copyright (c) 2014 djm34 + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @author Thomas Pornin <thomas.pornin@cryptolog.com> + */ + +#define SPH_C64(x) x +#define SPH_ROTL64(x, y) rotate((x), (ulong)(y)) + + +#define C64e(x) ((SPH_C64(x) >> 56) \ + | ((SPH_C64(x) >> 40) & SPH_C64(0x000000000000FF00)) \ + | ((SPH_C64(x) >> 24) & SPH_C64(0x0000000000FF0000)) \ + | ((SPH_C64(x) >> 8) & SPH_C64(0x00000000FF000000)) \ + | ((SPH_C64(x) << 8) & SPH_C64(0x000000FF00000000)) \ + | ((SPH_C64(x) << 24) & SPH_C64(0x0000FF0000000000)) \ + | ((SPH_C64(x) << 40) & SPH_C64(0x00FF000000000000)) \ + | ((SPH_C64(x) << 56) & SPH_C64(0xFF00000000000000))) + +#define B64_0(x) ((x) & 0xFF) +#define B64_1(x) (((x) >> 8) & 0xFF) +#define B64_2(x) (((x) >> 16) & 0xFF) +#define B64_3(x) (((x) >> 24) & 0xFF) +#define B64_4(x) (((x) >> 32) & 0xFF) +#define B64_5(x) (((x) >> 40) & 0xFF) +#define B64_6(x) (((x) >> 48) & 0xFF) +#define B64_7(x) ((x) >> 56) +#define R64 SPH_ROTL64 +#define PC64(j, r) ((sph_u64)((j) + (r))) +#define QC64(j, r) (((sph_u64)(r) << 56) ^ (~((sph_u64)(j) << 56))) + +static const __constant ulong T0_G[] = +{ + 0xc6a597f4a5f432c6UL, 0xf884eb9784976ff8UL, 0xee99c7b099b05eeeUL, 0xf68df78c8d8c7af6UL, + 0xff0de5170d17e8ffUL, 0xd6bdb7dcbddc0ad6UL, 0xdeb1a7c8b1c816deUL, 0x915439fc54fc6d91UL, + 0x6050c0f050f09060UL, 0x0203040503050702UL, 0xcea987e0a9e02eceUL, 0x567dac877d87d156UL, + 0xe719d52b192bcce7UL, 0xb56271a662a613b5UL, 0x4de69a31e6317c4dUL, 0xec9ac3b59ab559ecUL, + 0x8f4505cf45cf408fUL, 0x1f9d3ebc9dbca31fUL, 0x894009c040c04989UL, 0xfa87ef92879268faUL, + 0xef15c53f153fd0efUL, 0xb2eb7f26eb2694b2UL, 0x8ec90740c940ce8eUL, 0xfb0bed1d0b1de6fbUL, + 0x41ec822fec2f6e41UL, 0xb3677da967a91ab3UL, 0x5ffdbe1cfd1c435fUL, 0x45ea8a25ea256045UL, + 0x23bf46dabfdaf923UL, 0x53f7a602f7025153UL, 0xe496d3a196a145e4UL, 0x9b5b2ded5bed769bUL, + 0x75c2ea5dc25d2875UL, 0xe11cd9241c24c5e1UL, 0x3dae7ae9aee9d43dUL, 0x4c6a98be6abef24cUL, + 
0x6c5ad8ee5aee826cUL, 0x7e41fcc341c3bd7eUL, 0xf502f1060206f3f5UL, 0x834f1dd14fd15283UL, + 0x685cd0e45ce48c68UL, 0x51f4a207f4075651UL, 0xd134b95c345c8dd1UL, 0xf908e9180818e1f9UL, + 0xe293dfae93ae4ce2UL, 0xab734d9573953eabUL, 0x6253c4f553f59762UL, 0x2a3f54413f416b2aUL, + 0x080c10140c141c08UL, 0x955231f652f66395UL, 0x46658caf65afe946UL, 0x9d5e21e25ee27f9dUL, + 0x3028607828784830UL, 0x37a16ef8a1f8cf37UL, 0x0a0f14110f111b0aUL, 0x2fb55ec4b5c4eb2fUL, + 0x0e091c1b091b150eUL, 0x2436485a365a7e24UL, 0x1b9b36b69bb6ad1bUL, 0xdf3da5473d4798dfUL, + 0xcd26816a266aa7cdUL, 0x4e699cbb69bbf54eUL, 0x7fcdfe4ccd4c337fUL, 0xea9fcfba9fba50eaUL, + 0x121b242d1b2d3f12UL, 0x1d9e3ab99eb9a41dUL, 0x5874b09c749cc458UL, 0x342e68722e724634UL, + 0x362d6c772d774136UL, 0xdcb2a3cdb2cd11dcUL, 0xb4ee7329ee299db4UL, 0x5bfbb616fb164d5bUL, + 0xa4f65301f601a5a4UL, 0x764decd74dd7a176UL, 0xb76175a361a314b7UL, 0x7dcefa49ce49347dUL, + 0x527ba48d7b8ddf52UL, 0xdd3ea1423e429fddUL, 0x5e71bc937193cd5eUL, 0x139726a297a2b113UL, + 0xa6f55704f504a2a6UL, 0xb96869b868b801b9UL, 0x0000000000000000UL, 0xc12c99742c74b5c1UL, + 0x406080a060a0e040UL, 0xe31fdd211f21c2e3UL, 0x79c8f243c8433a79UL, 0xb6ed772ced2c9ab6UL, + 0xd4beb3d9bed90dd4UL, 0x8d4601ca46ca478dUL, 0x67d9ce70d9701767UL, 0x724be4dd4bddaf72UL, + 0x94de3379de79ed94UL, 0x98d42b67d467ff98UL, 0xb0e87b23e82393b0UL, 0x854a11de4ade5b85UL, + 0xbb6b6dbd6bbd06bbUL, 0xc52a917e2a7ebbc5UL, 0x4fe59e34e5347b4fUL, 0xed16c13a163ad7edUL, + 0x86c51754c554d286UL, 0x9ad72f62d762f89aUL, 0x6655ccff55ff9966UL, 0x119422a794a7b611UL, + 0x8acf0f4acf4ac08aUL, 0xe910c9301030d9e9UL, 0x0406080a060a0e04UL, 0xfe81e798819866feUL, + 0xa0f05b0bf00baba0UL, 0x7844f0cc44ccb478UL, 0x25ba4ad5bad5f025UL, 0x4be3963ee33e754bUL, + 0xa2f35f0ef30eaca2UL, 0x5dfeba19fe19445dUL, 0x80c01b5bc05bdb80UL, 0x058a0a858a858005UL, + 0x3fad7eecadecd33fUL, 0x21bc42dfbcdffe21UL, 0x7048e0d848d8a870UL, 0xf104f90c040cfdf1UL, + 0x63dfc67adf7a1963UL, 0x77c1ee58c1582f77UL, 0xaf75459f759f30afUL, 0x426384a563a5e742UL, + 0x2030405030507020UL, 0xe51ad12e1a2ecbe5UL, 0xfd0ee1120e12effdUL, 0xbf6d65b76db708bfUL, + 0x814c19d44cd45581UL, 0x1814303c143c2418UL, 0x26354c5f355f7926UL, 0xc32f9d712f71b2c3UL, + 0xbee16738e13886beUL, 0x35a26afda2fdc835UL, 0x88cc0b4fcc4fc788UL, 0x2e395c4b394b652eUL, + 0x93573df957f96a93UL, 0x55f2aa0df20d5855UL, 0xfc82e39d829d61fcUL, 0x7a47f4c947c9b37aUL, + 0xc8ac8befacef27c8UL, 0xbae76f32e73288baUL, 0x322b647d2b7d4f32UL, 0xe695d7a495a442e6UL, + 0xc0a09bfba0fb3bc0UL, 0x199832b398b3aa19UL, 0x9ed12768d168f69eUL, 0xa37f5d817f8122a3UL, + 0x446688aa66aaee44UL, 0x547ea8827e82d654UL, 0x3bab76e6abe6dd3bUL, 0x0b83169e839e950bUL, + 0x8cca0345ca45c98cUL, 0xc729957b297bbcc7UL, 0x6bd3d66ed36e056bUL, 0x283c50443c446c28UL, + 0xa779558b798b2ca7UL, 0xbce2633de23d81bcUL, 0x161d2c271d273116UL, 0xad76419a769a37adUL, + 0xdb3bad4d3b4d96dbUL, 0x6456c8fa56fa9e64UL, 0x744ee8d24ed2a674UL, 0x141e28221e223614UL, + 0x92db3f76db76e492UL, 0x0c0a181e0a1e120cUL, 0x486c90b46cb4fc48UL, 0xb8e46b37e4378fb8UL, + 0x9f5d25e75de7789fUL, 0xbd6e61b26eb20fbdUL, 0x43ef862aef2a6943UL, 0xc4a693f1a6f135c4UL, + 0x39a872e3a8e3da39UL, 0x31a462f7a4f7c631UL, 0xd337bd5937598ad3UL, 0xf28bff868b8674f2UL, + 0xd532b156325683d5UL, 0x8b430dc543c54e8bUL, 0x6e59dceb59eb856eUL, 0xdab7afc2b7c218daUL, + 0x018c028f8c8f8e01UL, 0xb16479ac64ac1db1UL, 0x9cd2236dd26df19cUL, 0x49e0923be03b7249UL, + 0xd8b4abc7b4c71fd8UL, 0xacfa4315fa15b9acUL, 0xf307fd090709faf3UL, 0xcf25856f256fa0cfUL, + 0xcaaf8feaafea20caUL, 0xf48ef3898e897df4UL, 0x47e98e20e9206747UL, 0x1018202818283810UL, + 0x6fd5de64d5640b6fUL, 0xf088fb83888373f0UL, 
0x4a6f94b16fb1fb4aUL, 0x5c72b8967296ca5cUL, + 0x3824706c246c5438UL, 0x57f1ae08f1085f57UL, 0x73c7e652c7522173UL, 0x975135f351f36497UL, + 0xcb238d652365aecbUL, 0xa17c59847c8425a1UL, 0xe89ccbbf9cbf57e8UL, 0x3e217c6321635d3eUL, + 0x96dd377cdd7cea96UL, 0x61dcc27fdc7f1e61UL, 0x0d861a9186919c0dUL, 0x0f851e9485949b0fUL, + 0xe090dbab90ab4be0UL, 0x7c42f8c642c6ba7cUL, 0x71c4e257c4572671UL, 0xccaa83e5aae529ccUL, + 0x90d83b73d873e390UL, 0x06050c0f050f0906UL, 0xf701f5030103f4f7UL, 0x1c12383612362a1cUL, + 0xc2a39ffea3fe3cc2UL, 0x6a5fd4e15fe18b6aUL, 0xaef94710f910beaeUL, 0x69d0d26bd06b0269UL, + 0x17912ea891a8bf17UL, 0x995829e858e87199UL, 0x3a2774692769533aUL, 0x27b94ed0b9d0f727UL, + 0xd938a948384891d9UL, 0xeb13cd351335deebUL, 0x2bb356ceb3cee52bUL, 0x2233445533557722UL, + 0xd2bbbfd6bbd604d2UL, 0xa9704990709039a9UL, 0x07890e8089808707UL, 0x33a766f2a7f2c133UL, + 0x2db65ac1b6c1ec2dUL, 0x3c22786622665a3cUL, 0x15922aad92adb815UL, 0xc92089602060a9c9UL, + 0x874915db49db5c87UL, 0xaaff4f1aff1ab0aaUL, 0x5078a0887888d850UL, 0xa57a518e7a8e2ba5UL, + 0x038f068a8f8a8903UL, 0x59f8b213f8134a59UL, 0x0980129b809b9209UL, 0x1a1734391739231aUL, + 0x65daca75da751065UL, 0xd731b553315384d7UL, 0x84c61351c651d584UL, 0xd0b8bbd3b8d303d0UL, + 0x82c31f5ec35edc82UL, 0x29b052cbb0cbe229UL, 0x5a77b4997799c35aUL, 0x1e113c3311332d1eUL, + 0x7bcbf646cb463d7bUL, 0xa8fc4b1ffc1fb7a8UL, 0x6dd6da61d6610c6dUL, 0x2c3a584e3a4e622cUL +}; + +)===" +R"===( + +static const __constant ulong T4_G[] = +{ + 0xA5F432C6C6A597F4UL, 0x84976FF8F884EB97UL, 0x99B05EEEEE99C7B0UL, 0x8D8C7AF6F68DF78CUL, + 0x0D17E8FFFF0DE517UL, 0xBDDC0AD6D6BDB7DCUL, 0xB1C816DEDEB1A7C8UL, 0x54FC6D91915439FCUL, + 0x50F090606050C0F0UL, 0x0305070202030405UL, 0xA9E02ECECEA987E0UL, 0x7D87D156567DAC87UL, + 0x192BCCE7E719D52BUL, 0x62A613B5B56271A6UL, 0xE6317C4D4DE69A31UL, 0x9AB559ECEC9AC3B5UL, + 0x45CF408F8F4505CFUL, 0x9DBCA31F1F9D3EBCUL, 0x40C04989894009C0UL, 0x879268FAFA87EF92UL, + 0x153FD0EFEF15C53FUL, 0xEB2694B2B2EB7F26UL, 0xC940CE8E8EC90740UL, 0x0B1DE6FBFB0BED1DUL, + 0xEC2F6E4141EC822FUL, 0x67A91AB3B3677DA9UL, 0xFD1C435F5FFDBE1CUL, 0xEA25604545EA8A25UL, + 0xBFDAF92323BF46DAUL, 0xF702515353F7A602UL, 0x96A145E4E496D3A1UL, 0x5BED769B9B5B2DEDUL, + 0xC25D287575C2EA5DUL, 0x1C24C5E1E11CD924UL, 0xAEE9D43D3DAE7AE9UL, 0x6ABEF24C4C6A98BEUL, + 0x5AEE826C6C5AD8EEUL, 0x41C3BD7E7E41FCC3UL, 0x0206F3F5F502F106UL, 0x4FD15283834F1DD1UL, + 0x5CE48C68685CD0E4UL, 0xF407565151F4A207UL, 0x345C8DD1D134B95CUL, 0x0818E1F9F908E918UL, + 0x93AE4CE2E293DFAEUL, 0x73953EABAB734D95UL, 0x53F597626253C4F5UL, 0x3F416B2A2A3F5441UL, + 0x0C141C08080C1014UL, 0x52F66395955231F6UL, 0x65AFE94646658CAFUL, 0x5EE27F9D9D5E21E2UL, + 0x2878483030286078UL, 0xA1F8CF3737A16EF8UL, 0x0F111B0A0A0F1411UL, 0xB5C4EB2F2FB55EC4UL, + 0x091B150E0E091C1BUL, 0x365A7E242436485AUL, 0x9BB6AD1B1B9B36B6UL, 0x3D4798DFDF3DA547UL, + 0x266AA7CDCD26816AUL, 0x69BBF54E4E699CBBUL, 0xCD4C337F7FCDFE4CUL, 0x9FBA50EAEA9FCFBAUL, + 0x1B2D3F12121B242DUL, 0x9EB9A41D1D9E3AB9UL, 0x749CC4585874B09CUL, 0x2E724634342E6872UL, + 0x2D774136362D6C77UL, 0xB2CD11DCDCB2A3CDUL, 0xEE299DB4B4EE7329UL, 0xFB164D5B5BFBB616UL, + 0xF601A5A4A4F65301UL, 0x4DD7A176764DECD7UL, 0x61A314B7B76175A3UL, 0xCE49347D7DCEFA49UL, + 0x7B8DDF52527BA48DUL, 0x3E429FDDDD3EA142UL, 0x7193CD5E5E71BC93UL, 0x97A2B113139726A2UL, + 0xF504A2A6A6F55704UL, 0x68B801B9B96869B8UL, 0x0000000000000000UL, 0x2C74B5C1C12C9974UL, + 0x60A0E040406080A0UL, 0x1F21C2E3E31FDD21UL, 0xC8433A7979C8F243UL, 0xED2C9AB6B6ED772CUL, + 0xBED90DD4D4BEB3D9UL, 0x46CA478D8D4601CAUL, 0xD970176767D9CE70UL, 0x4BDDAF72724BE4DDUL, + 0xDE79ED9494DE3379UL, 
0xD467FF9898D42B67UL, 0xE82393B0B0E87B23UL, 0x4ADE5B85854A11DEUL, + 0x6BBD06BBBB6B6DBDUL, 0x2A7EBBC5C52A917EUL, 0xE5347B4F4FE59E34UL, 0x163AD7EDED16C13AUL, + 0xC554D28686C51754UL, 0xD762F89A9AD72F62UL, 0x55FF99666655CCFFUL, 0x94A7B611119422A7UL, + 0xCF4AC08A8ACF0F4AUL, 0x1030D9E9E910C930UL, 0x060A0E040406080AUL, 0x819866FEFE81E798UL, + 0xF00BABA0A0F05B0BUL, 0x44CCB4787844F0CCUL, 0xBAD5F02525BA4AD5UL, 0xE33E754B4BE3963EUL, + 0xF30EACA2A2F35F0EUL, 0xFE19445D5DFEBA19UL, 0xC05BDB8080C01B5BUL, 0x8A858005058A0A85UL, + 0xADECD33F3FAD7EECUL, 0xBCDFFE2121BC42DFUL, 0x48D8A8707048E0D8UL, 0x040CFDF1F104F90CUL, + 0xDF7A196363DFC67AUL, 0xC1582F7777C1EE58UL, 0x759F30AFAF75459FUL, 0x63A5E742426384A5UL, + 0x3050702020304050UL, 0x1A2ECBE5E51AD12EUL, 0x0E12EFFDFD0EE112UL, 0x6DB708BFBF6D65B7UL, + 0x4CD45581814C19D4UL, 0x143C24181814303CUL, 0x355F792626354C5FUL, 0x2F71B2C3C32F9D71UL, + 0xE13886BEBEE16738UL, 0xA2FDC83535A26AFDUL, 0xCC4FC78888CC0B4FUL, 0x394B652E2E395C4BUL, + 0x57F96A9393573DF9UL, 0xF20D585555F2AA0DUL, 0x829D61FCFC82E39DUL, 0x47C9B37A7A47F4C9UL, + 0xACEF27C8C8AC8BEFUL, 0xE73288BABAE76F32UL, 0x2B7D4F32322B647DUL, 0x95A442E6E695D7A4UL, + 0xA0FB3BC0C0A09BFBUL, 0x98B3AA19199832B3UL, 0xD168F69E9ED12768UL, 0x7F8122A3A37F5D81UL, + 0x66AAEE44446688AAUL, 0x7E82D654547EA882UL, 0xABE6DD3B3BAB76E6UL, 0x839E950B0B83169EUL, + 0xCA45C98C8CCA0345UL, 0x297BBCC7C729957BUL, 0xD36E056B6BD3D66EUL, 0x3C446C28283C5044UL, + 0x798B2CA7A779558BUL, 0xE23D81BCBCE2633DUL, 0x1D273116161D2C27UL, 0x769A37ADAD76419AUL, + 0x3B4D96DBDB3BAD4DUL, 0x56FA9E646456C8FAUL, 0x4ED2A674744EE8D2UL, 0x1E223614141E2822UL, + 0xDB76E49292DB3F76UL, 0x0A1E120C0C0A181EUL, 0x6CB4FC48486C90B4UL, 0xE4378FB8B8E46B37UL, + 0x5DE7789F9F5D25E7UL, 0x6EB20FBDBD6E61B2UL, 0xEF2A694343EF862AUL, 0xA6F135C4C4A693F1UL, + 0xA8E3DA3939A872E3UL, 0xA4F7C63131A462F7UL, 0x37598AD3D337BD59UL, 0x8B8674F2F28BFF86UL, + 0x325683D5D532B156UL, 0x43C54E8B8B430DC5UL, 0x59EB856E6E59DCEBUL, 0xB7C218DADAB7AFC2UL, + 0x8C8F8E01018C028FUL, 0x64AC1DB1B16479ACUL, 0xD26DF19C9CD2236DUL, 0xE03B724949E0923BUL, + 0xB4C71FD8D8B4ABC7UL, 0xFA15B9ACACFA4315UL, 0x0709FAF3F307FD09UL, 0x256FA0CFCF25856FUL, + 0xAFEA20CACAAF8FEAUL, 0x8E897DF4F48EF389UL, 0xE920674747E98E20UL, 0x1828381010182028UL, + 0xD5640B6F6FD5DE64UL, 0x888373F0F088FB83UL, 0x6FB1FB4A4A6F94B1UL, 0x7296CA5C5C72B896UL, + 0x246C54383824706CUL, 0xF1085F5757F1AE08UL, 0xC752217373C7E652UL, 0x51F36497975135F3UL, + 0x2365AECBCB238D65UL, 0x7C8425A1A17C5984UL, 0x9CBF57E8E89CCBBFUL, 0x21635D3E3E217C63UL, + 0xDD7CEA9696DD377CUL, 0xDC7F1E6161DCC27FUL, 0x86919C0D0D861A91UL, 0x85949B0F0F851E94UL, + 0x90AB4BE0E090DBABUL, 0x42C6BA7C7C42F8C6UL, 0xC457267171C4E257UL, 0xAAE529CCCCAA83E5UL, + 0xD873E39090D83B73UL, 0x050F090606050C0FUL, 0x0103F4F7F701F503UL, 0x12362A1C1C123836UL, + 0xA3FE3CC2C2A39FFEUL, 0x5FE18B6A6A5FD4E1UL, 0xF910BEAEAEF94710UL, 0xD06B026969D0D26BUL, + 0x91A8BF1717912EA8UL, 0x58E87199995829E8UL, 0x2769533A3A277469UL, 0xB9D0F72727B94ED0UL, + 0x384891D9D938A948UL, 0x1335DEEBEB13CD35UL, 0xB3CEE52B2BB356CEUL, 0x3355772222334455UL, + 0xBBD604D2D2BBBFD6UL, 0x709039A9A9704990UL, 0x8980870707890E80UL, 0xA7F2C13333A766F2UL, + 0xB6C1EC2D2DB65AC1UL, 0x22665A3C3C227866UL, 0x92ADB81515922AADUL, 0x2060A9C9C9208960UL, + 0x49DB5C87874915DBUL, 0xFF1AB0AAAAFF4F1AUL, 0x7888D8505078A088UL, 0x7A8E2BA5A57A518EUL, + 0x8F8A8903038F068AUL, 0xF8134A5959F8B213UL, 0x809B92090980129BUL, 0x1739231A1A173439UL, + 0xDA75106565DACA75UL, 0x315384D7D731B553UL, 0xC651D58484C61351UL, 0xB8D303D0D0B8BBD3UL, + 0xC35EDC8282C31F5EUL, 0xB0CBE22929B052CBUL, 0x7799C35A5A77B499UL, 
0x11332D1E1E113C33UL, + 0xCB463D7B7BCBF646UL, 0xFC1FB7A8A8FC4B1FUL, 0xD6610C6D6DD6DA61UL, 0x3A4E622C2C3A584EUL +}; + +#define RSTT(d, a, b0, b1, b2, b3, b4, b5, b6, b7) do { \ + t[d] = T0_G[B64_0(a[b0])] \ + ^ R64(T0_G[B64_1(a[b1])], 8) \ + ^ R64(T0_G[B64_2(a[b2])], 16) \ + ^ R64(T0_G[B64_3(a[b3])], 24) \ + ^ T4_G[B64_4(a[b4])] \ + ^ R64(T4_G[B64_5(a[b5])], 8) \ + ^ R64(T4_G[B64_6(a[b6])], 16) \ + ^ R64(T4_G[B64_7(a[b7])], 24); \ + } while (0) + +#define ROUND_SMALL_P(a, r) do { \ + ulong t[8]; \ + a[0] ^= PC64(0x00, r); \ + a[1] ^= PC64(0x10, r); \ + a[2] ^= PC64(0x20, r); \ + a[3] ^= PC64(0x30, r); \ + a[4] ^= PC64(0x40, r); \ + a[5] ^= PC64(0x50, r); \ + a[6] ^= PC64(0x60, r); \ + a[7] ^= PC64(0x70, r); \ + RSTT(0, a, 0, 1, 2, 3, 4, 5, 6, 7); \ + RSTT(1, a, 1, 2, 3, 4, 5, 6, 7, 0); \ + RSTT(2, a, 2, 3, 4, 5, 6, 7, 0, 1); \ + RSTT(3, a, 3, 4, 5, 6, 7, 0, 1, 2); \ + RSTT(4, a, 4, 5, 6, 7, 0, 1, 2, 3); \ + RSTT(5, a, 5, 6, 7, 0, 1, 2, 3, 4); \ + RSTT(6, a, 6, 7, 0, 1, 2, 3, 4, 5); \ + RSTT(7, a, 7, 0, 1, 2, 3, 4, 5, 6); \ + a[0] = t[0]; \ + a[1] = t[1]; \ + a[2] = t[2]; \ + a[3] = t[3]; \ + a[4] = t[4]; \ + a[5] = t[5]; \ + a[6] = t[6]; \ + a[7] = t[7]; \ + } while (0) + +#define ROUND_SMALL_Pf(a,r) do { \ + a[0] ^= PC64(0x00, r); \ + a[1] ^= PC64(0x10, r); \ + a[2] ^= PC64(0x20, r); \ + a[3] ^= PC64(0x30, r); \ + a[4] ^= PC64(0x40, r); \ + a[5] ^= PC64(0x50, r); \ + a[6] ^= PC64(0x60, r); \ + a[7] ^= PC64(0x70, r); \ + RSTT(7, a, 7, 0, 1, 2, 3, 4, 5, 6); \ + a[7] = t[7]; \ + } while (0) + +#define ROUND_SMALL_Q(a, r) do { \ + ulong t[8]; \ + a[0] ^= QC64(0x00, r); \ + a[1] ^= QC64(0x10, r); \ + a[2] ^= QC64(0x20, r); \ + a[3] ^= QC64(0x30, r); \ + a[4] ^= QC64(0x40, r); \ + a[5] ^= QC64(0x50, r); \ + a[6] ^= QC64(0x60, r); \ + a[7] ^= QC64(0x70, r); \ + RSTT(0, a, 1, 3, 5, 7, 0, 2, 4, 6); \ + RSTT(1, a, 2, 4, 6, 0, 1, 3, 5, 7); \ + RSTT(2, a, 3, 5, 7, 1, 2, 4, 6, 0); \ + RSTT(3, a, 4, 6, 0, 2, 3, 5, 7, 1); \ + RSTT(4, a, 5, 7, 1, 3, 4, 6, 0, 2); \ + RSTT(5, a, 6, 0, 2, 4, 5, 7, 1, 3); \ + RSTT(6, a, 7, 1, 3, 5, 6, 0, 2, 4); \ + RSTT(7, a, 0, 2, 4, 6, 7, 1, 3, 5); \ + a[0] = t[0]; \ + a[1] = t[1]; \ + a[2] = t[2]; \ + a[3] = t[3]; \ + a[4] = t[4]; \ + a[5] = t[5]; \ + a[6] = t[6]; \ + a[7] = t[7]; \ + } while (0) + +#define PERM_SMALL_P(a) do { \ + for (int r = 0; r < 10; r ++) \ + ROUND_SMALL_P(a, r); \ + } while (0) + +#define PERM_SMALL_Pf(a) do { \ + for (int r = 0; r < 9; r ++) { \ + ROUND_SMALL_P(a, r);} \ + ROUND_SMALL_Pf(a,9); \ + } while (0) + +#define PERM_SMALL_Q(a) do { \ + for (int r = 0; r < 10; r ++) \ + ROUND_SMALL_Q(a, r); \ + } while (0) + +)===" +
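The macros above only define the two Groestl permutations; how they combine is visible in the Groestl kernel of cryptonight.cl earlier in this diff: each block is compressed as h = P(h ^ m) ^ Q(m) ^ h, and the output transform is h ^ P(h), of which the top half is kept. A short C++ data-flow sketch under that reading; perm_P/perm_Q are caller-supplied stand-ins for PERM_SMALL_P/PERM_SMALL_Q and, like the other names here, are assumptions for illustration rather than part of the commit:

// Data flow of the Groestl-256 compression as used by the Groestl kernel above.
#include <array>
#include <cstdint>
#include <functional>

using State8 = std::array<uint64_t, 8>;      // 512-bit state, eight 64-bit columns
using Perm   = std::function<void(State8&)>; // in-place 512-bit permutation

// One compression step: h = P(h ^ m) ^ Q(m) ^ h
static void groestl_compress(State8& h, State8 m, const Perm& perm_P, const Perm& perm_Q)
{
	State8 hp = h;
	for(int i = 0; i < 8; ++i) hp[i] ^= m[i]; // h ^ m
	perm_P(hp);                               // P(h ^ m)
	perm_Q(m);                                // Q(m)
	for(int i = 0; i < 8; ++i) h[i] ^= hp[i] ^ m[i];
}

// Output transform: h ^= P(h); the kernel then compares the top 32 bits of h[7] against the target.
static void groestl_output(State8& h, const Perm& perm_P)
{
	State8 tmp = h;
	perm_P(h);
	for(int i = 0; i < 8; ++i) h[i] ^= tmp[i];
}

int main()
{
	Perm identity = [](State8&) {};           // placeholder; the real rounds are the macros above
	State8 h{}, m{};
	m[0] = 0x80;                              // toy message block
	groestl_compress(h, m, identity, identity);
	groestl_output(h, identity);
	return static_cast<int>(h[0] & 1);
}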
\ No newline at end of file diff --git a/xmrstak/backend/amd/amd_gpu/opencl/jh.cl b/xmrstak/backend/amd/amd_gpu/opencl/jh.cl new file mode 100644 index 0000000..fe70ea3 --- /dev/null +++ b/xmrstak/backend/amd/amd_gpu/opencl/jh.cl @@ -0,0 +1,274 @@ +R"===( +/* $Id: jh.c 255 2011-06-07 19:50:20Z tp $ */ +/* + * JH implementation. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @author Thomas Pornin <thomas.pornin@cryptolog.com> + */ + +#define SPH_JH_64 1 +#define SPH_LITTLE_ENDIAN 1 + +#define SPH_C32(x) x +#define SPH_C64(x) x +typedef uint sph_u32; +typedef ulong sph_u64; + +/* + * The internal bitslice representation may use either big-endian or + * little-endian (true bitslice operations do not care about the bit + * ordering, and the bit-swapping linear operations in JH happen to + * be invariant through endianness-swapping). The constants must be + * defined according to the chosen endianness; we use some + * byte-swapping macros for that. 
+ */ + +#if SPH_LITTLE_ENDIAN + +#define C32e(x) ((SPH_C32(x) >> 24) \ + | ((SPH_C32(x) >> 8) & SPH_C32(0x0000FF00)) \ + | ((SPH_C32(x) << 8) & SPH_C32(0x00FF0000)) \ + | ((SPH_C32(x) << 24) & SPH_C32(0xFF000000))) +#define dec32e_aligned sph_dec32le_aligned +#define enc32e sph_enc32le + +#define C64e(x) ((SPH_C64(x) >> 56) \ + | ((SPH_C64(x) >> 40) & SPH_C64(0x000000000000FF00)) \ + | ((SPH_C64(x) >> 24) & SPH_C64(0x0000000000FF0000)) \ + | ((SPH_C64(x) >> 8) & SPH_C64(0x00000000FF000000)) \ + | ((SPH_C64(x) << 8) & SPH_C64(0x000000FF00000000)) \ + | ((SPH_C64(x) << 24) & SPH_C64(0x0000FF0000000000)) \ + | ((SPH_C64(x) << 40) & SPH_C64(0x00FF000000000000)) \ + | ((SPH_C64(x) << 56) & SPH_C64(0xFF00000000000000))) +#define dec64e_aligned sph_dec64le_aligned +#define enc64e sph_enc64le + +#else + +#define C32e(x) SPH_C32(x) +#define dec32e_aligned sph_dec32be_aligned +#define enc32e sph_enc32be +#define C64e(x) SPH_C64(x) +#define dec64e_aligned sph_dec64be_aligned +#define enc64e sph_enc64be + +#endif + +#define Sb(x0, x1, x2, x3, c) do { \ + x3 = ~x3; \ + x0 ^= (c) & ~x2; \ + tmp = (c) ^ (x0 & x1); \ + x0 ^= x2 & x3; \ + x3 ^= ~x1 & x2; \ + x1 ^= x0 & x2; \ + x2 ^= x0 & ~x3; \ + x0 ^= x1 | x3; \ + x3 ^= x1 & x2; \ + x1 ^= tmp & x0; \ + x2 ^= tmp; \ + } while (0) + +#define Lb(x0, x1, x2, x3, x4, x5, x6, x7) do { \ + x4 ^= x1; \ + x5 ^= x2; \ + x6 ^= x3 ^ x0; \ + x7 ^= x0; \ + x0 ^= x5; \ + x1 ^= x6; \ + x2 ^= x7 ^ x4; \ + x3 ^= x4; \ + } while (0) + +static const __constant ulong C[] = +{ + 0x67F815DFA2DED572UL, 0x571523B70A15847BUL, 0xF6875A4D90D6AB81UL, 0x402BD1C3C54F9F4EUL, + 0x9CFA455CE03A98EAUL, 0x9A99B26699D2C503UL, 0x8A53BBF2B4960266UL, 0x31A2DB881A1456B5UL, + 0xDB0E199A5C5AA303UL, 0x1044C1870AB23F40UL, 0x1D959E848019051CUL, 0xDCCDE75EADEB336FUL, + 0x416BBF029213BA10UL, 0xD027BBF7156578DCUL, 0x5078AA3739812C0AUL, 0xD3910041D2BF1A3FUL, + 0x907ECCF60D5A2D42UL, 0xCE97C0929C9F62DDUL, 0xAC442BC70BA75C18UL, 0x23FCC663D665DFD1UL, + 0x1AB8E09E036C6E97UL, 0xA8EC6C447E450521UL, 0xFA618E5DBB03F1EEUL, 0x97818394B29796FDUL, + 0x2F3003DB37858E4AUL, 0x956A9FFB2D8D672AUL, 0x6C69B8F88173FE8AUL, 0x14427FC04672C78AUL, + 0xC45EC7BD8F15F4C5UL, 0x80BB118FA76F4475UL, 0xBC88E4AEB775DE52UL, 0xF4A3A6981E00B882UL, + 0x1563A3A9338FF48EUL, 0x89F9B7D524565FAAUL, 0xFDE05A7C20EDF1B6UL, 0x362C42065AE9CA36UL, + 0x3D98FE4E433529CEUL, 0xA74B9A7374F93A53UL, 0x86814E6F591FF5D0UL, 0x9F5AD8AF81AD9D0EUL, + 0x6A6234EE670605A7UL, 0x2717B96EBE280B8BUL, 0x3F1080C626077447UL, 0x7B487EC66F7EA0E0UL, + 0xC0A4F84AA50A550DUL, 0x9EF18E979FE7E391UL, 0xD48D605081727686UL, 0x62B0E5F3415A9E7EUL, + 0x7A205440EC1F9FFCUL, 0x84C9F4CE001AE4E3UL, 0xD895FA9DF594D74FUL, 0xA554C324117E2E55UL, + 0x286EFEBD2872DF5BUL, 0xB2C4A50FE27FF578UL, 0x2ED349EEEF7C8905UL, 0x7F5928EB85937E44UL, + 0x4A3124B337695F70UL, 0x65E4D61DF128865EUL, 0xE720B95104771BC7UL, 0x8A87D423E843FE74UL, + 0xF2947692A3E8297DUL, 0xC1D9309B097ACBDDUL, 0xE01BDC5BFB301B1DUL, 0xBF829CF24F4924DAUL, + 0xFFBF70B431BAE7A4UL, 0x48BCF8DE0544320DUL, 0x39D3BB5332FCAE3BUL, 0xA08B29E0C1C39F45UL, + 0x0F09AEF7FD05C9E5UL, 0x34F1904212347094UL, 0x95ED44E301B771A2UL, 0x4A982F4F368E3BE9UL, + 0x15F66CA0631D4088UL, 0xFFAF52874B44C147UL, 0x30C60AE2F14ABB7EUL, 0xE68C6ECCC5B67046UL, + 0x00CA4FBD56A4D5A4UL, 0xAE183EC84B849DDAUL, 0xADD1643045CE5773UL, 0x67255C1468CEA6E8UL, + 0x16E10ECBF28CDAA3UL, 0x9A99949A5806E933UL, 0x7B846FC220B2601FUL, 0x1885D1A07FACCED1UL, + 0xD319DD8DA15B5932UL, 0x46B4A5AAC01C9A50UL, 0xBA6B04E467633D9FUL, 0x7EEE560BAB19CAF6UL, + 0x742128A9EA79B11FUL, 0xEE51363B35F7BDE9UL, 
0x76D350755AAC571DUL, 0x01707DA3FEC2463AUL, + 0x42D8A498AFC135F7UL, 0x79676B9E20ECED78UL, 0xA8DB3AEA15638341UL, 0x832C83324D3BC3FAUL, + 0xF347271C1F3B40A7UL, 0x9A762DB734F04059UL, 0xFD4F21D26C4E3EE7UL, 0xEF5957DC398DFDB8UL, + 0xDAEB492B490C9B8DUL, 0x0D70F36849D7A25BUL, 0x84558D7AD0AE3B7DUL, 0x658EF8E4F0E9A5F5UL, + 0x533B1036F4A2B8A0UL, 0x5AEC3E759E07A80CUL, 0x4F88E85692946891UL, 0x4CBCBAF8555CB05BUL, + 0x7B9487F3993BBBE3UL, 0x5D1C6B72D6F4DA75UL, 0x6DB334DC28ACAE64UL, 0x71DB28B850A5346CUL, + 0x2A518D10F2E261F8UL, 0xFC75DD593364DBE3UL, 0xA23FCE43F1BCAC1CUL, 0xB043E8023CD1BB67UL, + 0x75A12988CA5B0A33UL, 0x5C5316B44D19347FUL, 0x1E4D790EC3943B92UL, 0x3FAFEEB6D7757479UL, + 0x21391ABEF7D4A8EAUL, 0x5127234C097EF45CUL, 0xD23C32BA5324A326UL, 0xADD5A66D4A17A344UL, + 0x08C9F2AFA63E1DB5UL, 0x563C6B91983D5983UL, 0x4D608672A17CF84CUL, 0xF6C76E08CC3EE246UL, + 0x5E76BCB1B333982FUL, 0x2AE6C4EFA566D62BUL, 0x36D4C1BEE8B6F406UL, 0x6321EFBC1582EE74UL, + 0x69C953F40D4EC1FDUL, 0x26585806C45A7DA7UL, 0x16FAE0061614C17EUL, 0x3F9D63283DAF907EUL, + 0x0CD29B00E3F2C9D2UL, 0x300CD4B730CEAA5FUL, 0x9832E0F216512A74UL, 0x9AF8CEE3D830EB0DUL, + 0x9279F1B57B9EC54BUL, 0xD36886046EE651FFUL, 0x316796E6574D239BUL, 0x05750A17F3A6E6CCUL, + 0xCE6C3213D98176B1UL, 0x62A205F88452173CUL, 0x47154778B3CB2BF4UL, 0x486A9323825446FFUL, + 0x65655E4E0758DF38UL, 0x8E5086FC897CFCF2UL, 0x86CA0BD0442E7031UL, 0x4E477830A20940F0UL, + 0x8338F7D139EEA065UL, 0xBD3A2CE437E95EF7UL, 0x6FF8130126B29721UL, 0xE7DE9FEFD1ED44A3UL, + 0xD992257615DFA08BUL, 0xBE42DC12F6F7853CUL, 0x7EB027AB7CECA7D8UL, 0xDEA83EAADA7D8D53UL, + 0xD86902BD93CE25AAUL, 0xF908731AFD43F65AUL, 0xA5194A17DAEF5FC0UL, 0x6A21FD4C33664D97UL, + 0x701541DB3198B435UL, 0x9B54CDEDBB0F1EEAUL, 0x72409751A163D09AUL, 0xE26F4791BF9D75F6UL +}; + +#define Ceven_hi(r) (C[((r) << 2) + 0]) +#define Ceven_lo(r) (C[((r) << 2) + 1]) +#define Codd_hi(r) (C[((r) << 2) + 2]) +#define Codd_lo(r) (C[((r) << 2) + 3]) + +#define S(x0, x1, x2, x3, cb, r) do { \ + Sb(x0 ## h, x1 ## h, x2 ## h, x3 ## h, cb ## hi(r)); \ + Sb(x0 ## l, x1 ## l, x2 ## l, x3 ## l, cb ## lo(r)); \ + } while (0) + +#define L(x0, x1, x2, x3, x4, x5, x6, x7) do { \ + Lb(x0 ## h, x1 ## h, x2 ## h, x3 ## h, \ + x4 ## h, x5 ## h, x6 ## h, x7 ## h); \ + Lb(x0 ## l, x1 ## l, x2 ## l, x3 ## l, \ + x4 ## l, x5 ## l, x6 ## l, x7 ## l); \ + } while (0) + +#define Wz(x, c, n) do { \ + sph_u64 t = (x ## h & (c)) << (n); \ + x ## h = ((x ## h >> (n)) & (c)) | t; \ + t = (x ## l & (c)) << (n); \ + x ## l = ((x ## l >> (n)) & (c)) | t; \ + } while (0) + +#define W0(x) Wz(x, SPH_C64(0x5555555555555555), 1) +#define W1(x) Wz(x, SPH_C64(0x3333333333333333), 2) +#define W2(x) Wz(x, SPH_C64(0x0F0F0F0F0F0F0F0F), 4) +#define W3(x) Wz(x, SPH_C64(0x00FF00FF00FF00FF), 8) +#define W4(x) Wz(x, SPH_C64(0x0000FFFF0000FFFF), 16) +#define W5(x) Wz(x, SPH_C64(0x00000000FFFFFFFF), 32) +#define W6(x) do { \ + sph_u64 t = x ## h; \ + x ## h = x ## l; \ + x ## l = t; \ + } while (0) + +#define SL(ro) SLu(r + ro, ro) + +#define SLu(r, ro) do { \ + S(h0, h2, h4, h6, Ceven_, r); \ + S(h1, h3, h5, h7, Codd_, r); \ + L(h0, h2, h4, h6, h1, h3, h5, h7); \ + W ## ro(h1); \ + W ## ro(h3); \ + W ## ro(h5); \ + W ## ro(h7); \ + } while (0) + +#if SPH_SMALL_FOOTPRINT_JH + +/* + * The "small footprint" 64-bit version just uses a partially unrolled + * loop. 
+ */ + +#define E8 do { \ + unsigned r; \ + for (r = 0; r < 42; r += 7) { \ + SL(0); \ + SL(1); \ + SL(2); \ + SL(3); \ + SL(4); \ + SL(5); \ + SL(6); \ + } \ + } while (0) + +#else + +/* + * On a "true 64-bit" architecture, we can unroll at will. + */ + +#define E8 do { \ + SLu( 0, 0); \ + SLu( 1, 1); \ + SLu( 2, 2); \ + SLu( 3, 3); \ + SLu( 4, 4); \ + SLu( 5, 5); \ + SLu( 6, 6); \ + SLu( 7, 0); \ + SLu( 8, 1); \ + SLu( 9, 2); \ + SLu(10, 3); \ + SLu(11, 4); \ + SLu(12, 5); \ + SLu(13, 6); \ + SLu(14, 0); \ + SLu(15, 1); \ + SLu(16, 2); \ + SLu(17, 3); \ + SLu(18, 4); \ + SLu(19, 5); \ + SLu(20, 6); \ + SLu(21, 0); \ + SLu(22, 1); \ + SLu(23, 2); \ + SLu(24, 3); \ + SLu(25, 4); \ + SLu(26, 5); \ + SLu(27, 6); \ + SLu(28, 0); \ + SLu(29, 1); \ + SLu(30, 2); \ + SLu(31, 3); \ + SLu(32, 4); \ + SLu(33, 5); \ + SLu(34, 6); \ + SLu(35, 0); \ + SLu(36, 1); \ + SLu(37, 2); \ + SLu(38, 3); \ + SLu(39, 4); \ + SLu(40, 5); \ + SLu(41, 6); \ + } while (0) + +#endif + +)===" diff --git a/xmrstak/backend/amd/amd_gpu/opencl/wolf-aes.cl b/xmrstak/backend/amd/amd_gpu/opencl/wolf-aes.cl new file mode 100644 index 0000000..996944b --- /dev/null +++ b/xmrstak/backend/amd/amd_gpu/opencl/wolf-aes.cl @@ -0,0 +1,90 @@ +R"===( +#ifndef WOLF_AES_CL +#define WOLF_AES_CL + +// AES table - the other three are generated on the fly + +static const __constant uint AES0_C[256] = +{ + 0xA56363C6U, 0x847C7CF8U, 0x997777EEU, 0x8D7B7BF6U, + 0x0DF2F2FFU, 0xBD6B6BD6U, 0xB16F6FDEU, 0x54C5C591U, + 0x50303060U, 0x03010102U, 0xA96767CEU, 0x7D2B2B56U, + 0x19FEFEE7U, 0x62D7D7B5U, 0xE6ABAB4DU, 0x9A7676ECU, + 0x45CACA8FU, 0x9D82821FU, 0x40C9C989U, 0x877D7DFAU, + 0x15FAFAEFU, 0xEB5959B2U, 0xC947478EU, 0x0BF0F0FBU, + 0xECADAD41U, 0x67D4D4B3U, 0xFDA2A25FU, 0xEAAFAF45U, + 0xBF9C9C23U, 0xF7A4A453U, 0x967272E4U, 0x5BC0C09BU, + 0xC2B7B775U, 0x1CFDFDE1U, 0xAE93933DU, 0x6A26264CU, + 0x5A36366CU, 0x413F3F7EU, 0x02F7F7F5U, 0x4FCCCC83U, + 0x5C343468U, 0xF4A5A551U, 0x34E5E5D1U, 0x08F1F1F9U, + 0x937171E2U, 0x73D8D8ABU, 0x53313162U, 0x3F15152AU, + 0x0C040408U, 0x52C7C795U, 0x65232346U, 0x5EC3C39DU, + 0x28181830U, 0xA1969637U, 0x0F05050AU, 0xB59A9A2FU, + 0x0907070EU, 0x36121224U, 0x9B80801BU, 0x3DE2E2DFU, + 0x26EBEBCDU, 0x6927274EU, 0xCDB2B27FU, 0x9F7575EAU, + 0x1B090912U, 0x9E83831DU, 0x742C2C58U, 0x2E1A1A34U, + 0x2D1B1B36U, 0xB26E6EDCU, 0xEE5A5AB4U, 0xFBA0A05BU, + 0xF65252A4U, 0x4D3B3B76U, 0x61D6D6B7U, 0xCEB3B37DU, + 0x7B292952U, 0x3EE3E3DDU, 0x712F2F5EU, 0x97848413U, + 0xF55353A6U, 0x68D1D1B9U, 0x00000000U, 0x2CEDEDC1U, + 0x60202040U, 0x1FFCFCE3U, 0xC8B1B179U, 0xED5B5BB6U, + 0xBE6A6AD4U, 0x46CBCB8DU, 0xD9BEBE67U, 0x4B393972U, + 0xDE4A4A94U, 0xD44C4C98U, 0xE85858B0U, 0x4ACFCF85U, + 0x6BD0D0BBU, 0x2AEFEFC5U, 0xE5AAAA4FU, 0x16FBFBEDU, + 0xC5434386U, 0xD74D4D9AU, 0x55333366U, 0x94858511U, + 0xCF45458AU, 0x10F9F9E9U, 0x06020204U, 0x817F7FFEU, + 0xF05050A0U, 0x443C3C78U, 0xBA9F9F25U, 0xE3A8A84BU, + 0xF35151A2U, 0xFEA3A35DU, 0xC0404080U, 0x8A8F8F05U, + 0xAD92923FU, 0xBC9D9D21U, 0x48383870U, 0x04F5F5F1U, + 0xDFBCBC63U, 0xC1B6B677U, 0x75DADAAFU, 0x63212142U, + 0x30101020U, 0x1AFFFFE5U, 0x0EF3F3FDU, 0x6DD2D2BFU, + 0x4CCDCD81U, 0x140C0C18U, 0x35131326U, 0x2FECECC3U, + 0xE15F5FBEU, 0xA2979735U, 0xCC444488U, 0x3917172EU, + 0x57C4C493U, 0xF2A7A755U, 0x827E7EFCU, 0x473D3D7AU, + 0xAC6464C8U, 0xE75D5DBAU, 0x2B191932U, 0x957373E6U, + 0xA06060C0U, 0x98818119U, 0xD14F4F9EU, 0x7FDCDCA3U, + 0x66222244U, 0x7E2A2A54U, 0xAB90903BU, 0x8388880BU, + 0xCA46468CU, 0x29EEEEC7U, 0xD3B8B86BU, 0x3C141428U, + 0x79DEDEA7U, 0xE25E5EBCU, 0x1D0B0B16U, 0x76DBDBADU, + 0x3BE0E0DBU, 
0x56323264U, 0x4E3A3A74U, 0x1E0A0A14U, + 0xDB494992U, 0x0A06060CU, 0x6C242448U, 0xE45C5CB8U, + 0x5DC2C29FU, 0x6ED3D3BDU, 0xEFACAC43U, 0xA66262C4U, + 0xA8919139U, 0xA4959531U, 0x37E4E4D3U, 0x8B7979F2U, + 0x32E7E7D5U, 0x43C8C88BU, 0x5937376EU, 0xB76D6DDAU, + 0x8C8D8D01U, 0x64D5D5B1U, 0xD24E4E9CU, 0xE0A9A949U, + 0xB46C6CD8U, 0xFA5656ACU, 0x07F4F4F3U, 0x25EAEACFU, + 0xAF6565CAU, 0x8E7A7AF4U, 0xE9AEAE47U, 0x18080810U, + 0xD5BABA6FU, 0x887878F0U, 0x6F25254AU, 0x722E2E5CU, + 0x241C1C38U, 0xF1A6A657U, 0xC7B4B473U, 0x51C6C697U, + 0x23E8E8CBU, 0x7CDDDDA1U, 0x9C7474E8U, 0x211F1F3EU, + 0xDD4B4B96U, 0xDCBDBD61U, 0x868B8B0DU, 0x858A8A0FU, + 0x907070E0U, 0x423E3E7CU, 0xC4B5B571U, 0xAA6666CCU, + 0xD8484890U, 0x05030306U, 0x01F6F6F7U, 0x120E0E1CU, + 0xA36161C2U, 0x5F35356AU, 0xF95757AEU, 0xD0B9B969U, + 0x91868617U, 0x58C1C199U, 0x271D1D3AU, 0xB99E9E27U, + 0x38E1E1D9U, 0x13F8F8EBU, 0xB398982BU, 0x33111122U, + 0xBB6969D2U, 0x70D9D9A9U, 0x898E8E07U, 0xA7949433U, + 0xB69B9B2DU, 0x221E1E3CU, 0x92878715U, 0x20E9E9C9U, + 0x49CECE87U, 0xFF5555AAU, 0x78282850U, 0x7ADFDFA5U, + 0x8F8C8C03U, 0xF8A1A159U, 0x80898909U, 0x170D0D1AU, + 0xDABFBF65U, 0x31E6E6D7U, 0xC6424284U, 0xB86868D0U, + 0xC3414182U, 0xB0999929U, 0x772D2D5AU, 0x110F0F1EU, + 0xCBB0B07BU, 0xFC5454A8U, 0xD6BBBB6DU, 0x3A16162CU +}; + +#define BYTE(x, y) (amd_bfe((x), (y) << 3U, 8U)) + +uint4 AES_Round(const __local uint *AES0, const __local uint *AES1, const __local uint *AES2, const __local uint *AES3, const uint4 X, const uint4 key) +{ + uint4 Y; + Y.s0 = AES0[BYTE(X.s0, 0)] ^ AES1[BYTE(X.s1, 1)] ^ AES2[BYTE(X.s2, 2)] ^ AES3[BYTE(X.s3, 3)]; + Y.s1 = AES0[BYTE(X.s1, 0)] ^ AES1[BYTE(X.s2, 1)] ^ AES2[BYTE(X.s3, 2)] ^ AES3[BYTE(X.s0, 3)]; + Y.s2 = AES0[BYTE(X.s2, 0)] ^ AES1[BYTE(X.s3, 1)] ^ AES2[BYTE(X.s0, 2)] ^ AES3[BYTE(X.s1, 3)]; + Y.s3 = AES0[BYTE(X.s3, 0)] ^ AES1[BYTE(X.s0, 1)] ^ AES2[BYTE(X.s1, 2)] ^ AES3[BYTE(X.s2, 3)]; + Y ^= key; + return(Y); +} + +#endif + +)===" diff --git a/xmrstak/backend/amd/amd_gpu/opencl/wolf-skein.cl b/xmrstak/backend/amd/amd_gpu/opencl/wolf-skein.cl new file mode 100644 index 0000000..868757b --- /dev/null +++ b/xmrstak/backend/amd/amd_gpu/opencl/wolf-skein.cl @@ -0,0 +1,114 @@ +R"===( +#ifndef WOLF_SKEIN_CL +#define WOLF_SKEIN_CL + +// Vectorized Skein implementation macros and functions by Wolf + +#define SKEIN_KS_PARITY 0x1BD11BDAA9FC1A22 + +static const __constant ulong SKEIN256_IV[8] = +{ + 0xCCD044A12FDB3E13UL, 0xE83590301A79A9EBUL, + 0x55AEA0614F816E6FUL, 0x2A2767A4AE9B94DBUL, + 0xEC06025E74DD7683UL, 0xE7A436CDC4746251UL, + 0xC36FBAF9393AD185UL, 0x3EEDBA1833EDFC13UL +}; + +static const __constant ulong SKEIN512_256_IV[8] = +{ + 0xCCD044A12FDB3E13UL, 0xE83590301A79A9EBUL, + 0x55AEA0614F816E6FUL, 0x2A2767A4AE9B94DBUL, + 0xEC06025E74DD7683UL, 0xE7A436CDC4746251UL, + 0xC36FBAF9393AD185UL, 0x3EEDBA1833EDFC13UL +}; + +#define SKEIN_INJECT_KEY(p, s) do { \ + p += h; \ + p.s5 += t[s % 3]; \ + p.s6 += t[(s + 1) % 3]; \ + p.s7 += s; \ +} while(0) + +ulong SKEIN_ROT(const uint2 x, const uint y) +{ + if(y < 32) return(as_ulong(amd_bitalign(x, x.s10, 32 - y))); + else return(as_ulong(amd_bitalign(x.s10, x, 32 - (y - 32)))); +} + +void SkeinMix8(ulong4 *pv0, ulong4 *pv1, const uint rc0, const uint rc1, const uint rc2, const uint rc3) +{ + *pv0 += *pv1; + (*pv1).s0 = SKEIN_ROT(as_uint2((*pv1).s0), rc0); + (*pv1).s1 = SKEIN_ROT(as_uint2((*pv1).s1), rc1); + (*pv1).s2 = SKEIN_ROT(as_uint2((*pv1).s2), rc2); + (*pv1).s3 = SKEIN_ROT(as_uint2((*pv1).s3), rc3); + *pv1 ^= *pv0; +} + +ulong8 SkeinEvenRound(ulong8 p, const ulong8 h, const ulong 
*t, const uint s) +{ + SKEIN_INJECT_KEY(p, s); + ulong4 pv0 = p.even, pv1 = p.odd; + + SkeinMix8(&pv0, &pv1, 46, 36, 19, 37); + pv0 = shuffle(pv0, (ulong4)(1, 2, 3, 0)); + pv1 = shuffle(pv1, (ulong4)(0, 3, 2, 1)); + + SkeinMix8(&pv0, &pv1, 33, 27, 14, 42); + pv0 = shuffle(pv0, (ulong4)(1, 2, 3, 0)); + pv1 = shuffle(pv1, (ulong4)(0, 3, 2, 1)); + + SkeinMix8(&pv0, &pv1, 17, 49, 36, 39); + pv0 = shuffle(pv0, (ulong4)(1, 2, 3, 0)); + pv1 = shuffle(pv1, (ulong4)(0, 3, 2, 1)); + + SkeinMix8(&pv0, &pv1, 44, 9, 54, 56); + return(shuffle2(pv0, pv1, (ulong8)(1, 4, 2, 7, 3, 6, 0, 5))); +} + +ulong8 SkeinOddRound(ulong8 p, const ulong8 h, const ulong *t, const uint s) +{ + SKEIN_INJECT_KEY(p, s); + ulong4 pv0 = p.even, pv1 = p.odd; + + SkeinMix8(&pv0, &pv1, 39, 30, 34, 24); + pv0 = shuffle(pv0, (ulong4)(1, 2, 3, 0)); + pv1 = shuffle(pv1, (ulong4)(0, 3, 2, 1)); + + SkeinMix8(&pv0, &pv1, 13, 50, 10, 17); + pv0 = shuffle(pv0, (ulong4)(1, 2, 3, 0)); + pv1 = shuffle(pv1, (ulong4)(0, 3, 2, 1)); + + SkeinMix8(&pv0, &pv1, 25, 29, 39, 43); + pv0 = shuffle(pv0, (ulong4)(1, 2, 3, 0)); + pv1 = shuffle(pv1, (ulong4)(0, 3, 2, 1)); + + SkeinMix8(&pv0, &pv1, 8, 35, 56, 22); + return(shuffle2(pv0, pv1, (ulong8)(1, 4, 2, 7, 3, 6, 0, 5))); +} + +ulong8 Skein512Block(ulong8 p, ulong8 h, ulong h8, const ulong *t) +{ + #pragma unroll + for(int i = 0; i < 18; ++i) + { + p = SkeinEvenRound(p, h, t, i); + ++i; + ulong tmp = h.s0; + h = shuffle(h, (ulong8)(1, 2, 3, 4, 5, 6, 7, 0)); + h.s7 = h8; + h8 = tmp; + p = SkeinOddRound(p, h, t, i); + tmp = h.s0; + h = shuffle(h, (ulong8)(1, 2, 3, 4, 5, 6, 7, 0)); + h.s7 = h8; + h8 = tmp; + } + + SKEIN_INJECT_KEY(p, 18); + return(p); +} + +#endif + +)===" diff --git a/xmrstak/backend/amd/autoAdjust.hpp b/xmrstak/backend/amd/autoAdjust.hpp new file mode 100644 index 0000000..442d8f1 --- /dev/null +++ b/xmrstak/backend/amd/autoAdjust.hpp @@ -0,0 +1,114 @@ + +#pragma once + +#include "autoAdjust.hpp" + + +#include "jconf.h" +#include "../../console.h" +#include "../../ConfigEditor.hpp" +#include "amd_gpu/gpu.h" +#include "../../Params.hpp" + +#include <vector> +#include <cstdio> +#include <sstream> +#include <string> +#include <iostream> +#include <algorithm> + +#if defined(__APPLE__) +#include <OpenCL/cl.h> +#else +#include <CL/cl.h> +#endif + + +namespace xmrstak +{ +namespace amd +{ + +class autoAdjust +{ +public: + + autoAdjust() + { + + } + + /** print the adjusted values if needed + * + * Routine exit the application and print the adjusted values if needed else + * nothing is happened. + */ + bool printConfig() + { + + int platformIndex = getAMDPlatformIdx(); + + if(platformIndex == -1) + { + printer::inst()->print_msg(L0,"WARNING: No AMD OpenCL platform found. 
Possible driver issues or wrong vendor driver."); + return false; + } + + devVec = getAMDDevices(0); + + + int deviceCount = devVec.size(); + + if(deviceCount == 0) + return false; + + + generateThreadConfig(platformIndex); + return true; + + } + +private: + + void generateThreadConfig(const int platformIndex) + { + // load the template of the backend config into a char variable + const char *tpl = + #include "./config.tpl" + ; + + ConfigEditor configTpl{}; + configTpl.set( std::string(tpl) ); + + std::string conf; + int i = 0; + for(auto& ctx : devVec) + { + // keep 64MiB memory free (value is randomly chosen) + size_t availableMem = ctx.freeMem - (64u * 1024 * 1024); + // 224byte extra memory is used per thread for meta data + size_t perThread = (size_t(1u)<<21) + 224u; + size_t max_intensity = availableMem / perThread; + // 1000 is a magic selected limit \todo select max intensity depending of the gpu type + size_t intensity = std::min( size_t(1000u) , max_intensity ); + conf += std::string(" // gpu: ") + ctx.name + "\n"; + // set 8 threads per block (this is a good value for the most gpus) + conf += std::string(" { \"index\" : ") + std::to_string(ctx.deviceIdx) + ",\n" + + " \"intensity\" : " + std::to_string(intensity) + ", \"worksize\" : " + std::to_string(8) + ",\n" + + " \"affine_to_cpu\" : false, \n" + " },\n"; + ++i; + } + + configTpl.replace("PLATFORMINDEX",std::to_string(platformIndex)); + configTpl.replace("NUMGPUS",std::to_string(devVec.size())); + configTpl.replace("GPUCONFIG",conf); + configTpl.write(Params::inst().configFileAMD); + printer::inst()->print_msg(L0, "AMD: GPU configuration stored in file '%s'", Params::inst().configFileAMD.c_str()); + } + + std::vector<GpuContext> devVec; +}; + +} // namespace amd +} // namepsace xmrstak diff --git a/xmrstak/backend/amd/config.tpl b/xmrstak/backend/amd/config.tpl new file mode 100644 index 0000000..b8b6dc4 --- /dev/null +++ b/xmrstak/backend/amd/config.tpl @@ -0,0 +1,29 @@ +R"===( + +/* + * Number of GPUs that you have in your system. Each GPU will get its own CPU thread. + */ +"gpu_thread_num" : NUMGPUS, + +/* + * GPU configuration. You should play around with intensity and worksize as the fastest settings will vary. + * index - GPU index number usually starts from 0 + * intensity - Number of parallel GPU threads (nothing to do with CPU threads) + * worksize - Number of local GPU threads (nothing to do with CPU threads) + * affine_to_cpu - This will affine the thread to a CPU. This can make a GPU miner play along nicer with a CPU miner. + * "gpu_threads_conf" : + * [ + * { "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false }, + * ], + */ + +"gpu_threads_conf" : [ +GPUCONFIG +], + +/* + * Platform index. This will be 0 unless you have different OpenCL platform - eg. AMD and Intel. + */ +"platform_index" : PLATFORMINDEX, + +)===" diff --git a/xmrstak/backend/amd/jconf.cpp b/xmrstak/backend/amd/jconf.cpp new file mode 100644 index 0000000..f8a551e --- /dev/null +++ b/xmrstak/backend/amd/jconf.cpp @@ -0,0 +1,261 @@ +/* + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * Additional permission under GNU GPL version 3 section 7 + * + * If you modify this Program, or any covered work, by linking or combining + * it with OpenSSL (or a modified version of that library), containing parts + * covered by the terms of OpenSSL License and SSLeay License, the licensors + * of this Program grant you additional permission to convey the resulting work. + * + */ + + +#include "jconf.h" + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#ifdef _WIN32 +#define strcasecmp _stricmp +#include <intrin.h> +#else +#include <cpuid.h> +#endif + +#include "../../rapidjson/document.h" +#include "../../rapidjson/error/en.h" +#include "../../jext.h" +#include "../../console.h" + +namespace xmrstak +{ +namespace amd +{ + +using namespace rapidjson; + +/* + * This enum needs to match index in oConfigValues, otherwise we will get a runtime error + */ +enum configEnum { iGpuThreadNum, aGpuThreadsConf, iPlatformIdx }; + +struct configVal { + configEnum iName; + const char* sName; + Type iType; +}; + +//Same order as in configEnum, as per comment above +configVal oConfigValues[] = { + { iGpuThreadNum, "gpu_thread_num", kNumberType }, + { aGpuThreadsConf, "gpu_threads_conf", kArrayType }, + { iPlatformIdx, "platform_index", kNumberType } +}; + +constexpr size_t iConfigCnt = (sizeof(oConfigValues)/sizeof(oConfigValues[0])); + +inline bool checkType(Type have, Type want) +{ + if(want == have) + return true; + else if(want == kTrueType && have == kFalseType) + return true; + else if(want == kFalseType && have == kTrueType) + return true; + else + return false; +} + +struct jconf::opaque_private +{ + Document jsonDoc; + const Value* configValues[iConfigCnt]; //Compile time constant + + opaque_private() + { + } +}; + +jconf* jconf::oInst = nullptr; + +jconf::jconf() +{ + prv = new opaque_private(); +} + +bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg) +{ + if(id >= prv->configValues[aGpuThreadsConf]->Size()) + return false; + + const Value& oThdConf = prv->configValues[aGpuThreadsConf]->GetArray()[id]; + + if(!oThdConf.IsObject()) + return false; + + const Value *idx, *intensity, *w_size, *aff; + idx = GetObjectMember(oThdConf, "index"); + intensity = GetObjectMember(oThdConf, "intensity"); + w_size = GetObjectMember(oThdConf, "worksize"); + aff = GetObjectMember(oThdConf, "affine_to_cpu"); + + if(idx == nullptr || intensity == nullptr || w_size == nullptr || aff == nullptr) + return false; + + if(!idx->IsUint64() || !intensity->IsUint64() || !w_size->IsUint64()) + return false; + + if(!aff->IsUint64() && !aff->IsBool()) + return false; + + cfg.index = idx->GetUint64(); + cfg.intensity = intensity->GetUint64(); + cfg.w_size = w_size->GetUint64(); + + if(aff->IsNumber()) + cfg.cpu_aff = aff->GetInt64(); + else + cfg.cpu_aff = -1; + + return true; +} + +size_t jconf::GetPlatformIdx() +{ + return prv->configValues[iPlatformIdx]->GetUint64(); +} + +size_t jconf::GetThreadCount() +{ + return prv->configValues[aGpuThreadsConf]->Size(); +} + +bool jconf::parse_config(const char* sFilename) +{ + FILE * pFile; + char * buffer; + size_t flen; + + pFile = fopen(sFilename, "rb"); + if (pFile == NULL) + { + printer::inst()->print_msg(L0, "Failed to open config file %s.", sFilename); + return false; + } + + fseek(pFile,0,SEEK_END); + flen = ftell(pFile); + rewind(pFile); + + if(flen >= 64*1024) + { + fclose(pFile); + 
printer::inst()->print_msg(L0, "Oversized config file - %s.", sFilename); + return false; + } + + if(flen <= 16) + { + printer::inst()->print_msg(L0, "File is empty or too short - %s.", sFilename); + return false; + } + + buffer = (char*)malloc(flen + 3); + if(fread(buffer+1, flen, 1, pFile) != 1) + { + free(buffer); + fclose(pFile); + printer::inst()->print_msg(L0, "Read error while reading %s.", sFilename); + return false; + } + fclose(pFile); + + //Replace Unicode BOM with spaces - we always use UTF-8 + unsigned char* ubuffer = (unsigned char*)buffer; + if(ubuffer[1] == 0xEF && ubuffer[2] == 0xBB && ubuffer[3] == 0xBF) + { + buffer[1] = ' '; + buffer[2] = ' '; + buffer[3] = ' '; + } + + buffer[0] = '{'; + buffer[flen] = '}'; + buffer[flen + 1] = '\0'; + + prv->jsonDoc.Parse<kParseCommentsFlag|kParseTrailingCommasFlag>(buffer, flen+2); + free(buffer); + + if(prv->jsonDoc.HasParseError()) + { + printer::inst()->print_msg(L0, "JSON config parse error(offset %llu): %s", + int_port(prv->jsonDoc.GetErrorOffset()), GetParseError_En(prv->jsonDoc.GetParseError())); + return false; + } + + + if(!prv->jsonDoc.IsObject()) + { //This should never happen as we created the root ourselves + printer::inst()->print_msg(L0, "Invalid config file. No root?\n"); + return false; + } + + for(size_t i = 0; i < iConfigCnt; i++) + { + if(oConfigValues[i].iName != i) + { + printer::inst()->print_msg(L0, "Code error. oConfigValues are not in order."); + return false; + } + + prv->configValues[i] = GetObjectMember(prv->jsonDoc, oConfigValues[i].sName); + + if(prv->configValues[i] == nullptr) + { + printer::inst()->print_msg(L0, "Invalid config file. Missing value \"%s\".", oConfigValues[i].sName); + return false; + } + + if(!checkType(prv->configValues[i]->GetType(), oConfigValues[i].iType)) + { + printer::inst()->print_msg(L0, "Invalid config file. Value \"%s\" has unexpected type.", oConfigValues[i].sName); + return false; + } + } + + size_t n_thd = prv->configValues[aGpuThreadsConf]->Size(); + if(prv->configValues[iGpuThreadNum]->GetUint64() != n_thd) + { + printer::inst()->print_msg(L0, + "Invalid config file. 
Your GPU config array has %llu members, while you want to use %llu threads.", + int_port(n_thd), int_port(prv->configValues[iGpuThreadNum]->GetUint64())); + return false; + } + + thd_cfg c; + for(size_t i=0; i < n_thd; i++) + { + if(!GetThreadConfig(i, c)) + { + printer::inst()->print_msg(L0, "Thread %llu has invalid config.", int_port(i)); + return false; + } + } + +} + +} // namespace amd +} // namespace xmrstak diff --git a/xmrstak/backend/amd/jconf.hpp b/xmrstak/backend/amd/jconf.hpp new file mode 100644 index 0000000..9fef331 --- /dev/null +++ b/xmrstak/backend/amd/jconf.hpp @@ -0,0 +1,44 @@ +#pragma once +#include <stdlib.h> +#include <string> +#include "../../Params.hpp" + +namespace xmrstak +{ +namespace amd +{ + +class jconf +{ +public: + static jconf* inst() + { + if (oInst == nullptr) oInst = new jconf; + return oInst; + }; + + bool parse_config(const char* sFilename = Params::inst().configFileAMD.c_str()); + + struct thd_cfg { + size_t index; + size_t intensity; + size_t w_size; + long long cpu_aff; + }; + + size_t GetThreadCount(); + bool GetThreadConfig(size_t id, thd_cfg &cfg); + + size_t GetPlatformIdx(); + +private: + jconf(); + static jconf* oInst; + + struct opaque_private; + opaque_private* prv; + +}; + +} // namespace amd +} // namespace xmrstak diff --git a/xmrstak/backend/amd/minethd.cpp b/xmrstak/backend/amd/minethd.cpp new file mode 100644 index 0000000..f2f5ff4 --- /dev/null +++ b/xmrstak/backend/amd/minethd.cpp @@ -0,0 +1,237 @@ +/* + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * Additional permission under GNU GPL version 3 section 7 + * + * If you modify this Program, or any covered work, by linking or combining + * it with OpenSSL (or a modified version of that library), containing parts + * covered by the terms of OpenSSL License and SSLeay License, the licensors + * of this Program grant you additional permission to convey the resulting work. 
+ * + */ + +#include <assert.h> +#include <cmath> +#include <chrono> +#include <thread> + +#include "../../ConfigEditor.hpp" +#include "autoAdjust.hpp" + +#include <vector> +#include "../../console.h" +#include "../../crypto/cryptonight_aesni.h" +#include "../cpu/minethd.h" +#include "../../jconf.h" + +#include "../../executor.h" +#include "minethd.h" +#include "../../jconf.h" +#include "../../crypto/cryptonight.h" +#include "../../Environment.hpp" +#include "../../Params.hpp" +#include "amd_gpu/gpu.h" + + +namespace xmrstak +{ +namespace amd +{ + +minethd::minethd(miner_work& pWork, size_t iNo, GpuContext* ctx) +{ + oWork = pWork; + bQuit = 0; + iThreadNo = (uint8_t)iNo; + iJobNo = 0; + iHashCount = 0; + iTimestamp = 0; + pGpuCtx = ctx; + + oWorkThd = std::thread(&minethd::work_main, this); +} + +extern "C" { +#ifdef WIN32 +__declspec(dllexport) +#endif +std::vector<IBackend*>* xmrstak_start_backend(uint32_t threadOffset, miner_work& pWork, Environment& env) +{ + Environment::inst() = env; + return amd::minethd::thread_starter(threadOffset, pWork); +} +} // extern "C" + +bool minethd::init_gpus() +{ + size_t i, n = jconf::inst()->GetThreadCount(); + + printer::inst()->print_msg(L1, "Compiling code and initializing GPUs. This will take a while..."); + vGpuData.resize(n); + + jconf::thd_cfg cfg; + for(i = 0; i < n; i++) + { + jconf::inst()->GetThreadConfig(i, cfg); + vGpuData[i].deviceIdx = cfg.index; + vGpuData[i].rawIntensity = cfg.intensity; + vGpuData[i].workSize = cfg.w_size; + } + + return InitOpenCL(vGpuData.data(), n, jconf::inst()->GetPlatformIdx()) == ERR_SUCCESS; +} + +std::vector<GpuContext> minethd::vGpuData; + +std::vector<IBackend*>* minethd::thread_starter(uint32_t threadOffset, miner_work& pWork) +{ + std::vector<IBackend*>* pvThreads = new std::vector<IBackend*>(); + + if(!ConfigEditor::file_exist(Params::inst().configFileAMD)) + { + autoAdjust adjust; + if(!adjust.printConfig()) + return pvThreads; + } + + if(!jconf::inst()->parse_config()) + { + win_exit(); + } + + // \ todo get device count and exit if no opencl device + + if(!init_gpus()) + { + printer::inst()->print_msg(L1, "WARNING: AMD device not found"); + return pvThreads; + } + + size_t i, n = jconf::inst()->GetThreadCount(); + pvThreads->reserve(n); + + jconf::thd_cfg cfg; + for (i = 0; i < n; i++) + { + jconf::inst()->GetThreadConfig(i, cfg); + minethd* thd = new minethd(pWork, i + threadOffset, &vGpuData[i]); + + if(cfg.cpu_aff >= 0) + { +#if defined(__APPLE__) + printer::inst()->print_msg(L1, "WARNING on MacOS thread affinity is only advisory."); +#endif + cpu::minethd::thd_setaffinity(thd->oWorkThd.native_handle(), cfg.cpu_aff); + } + + pvThreads->push_back(thd); + if(cfg.cpu_aff >= 0) + printer::inst()->print_msg(L1, "Starting GPU thread, affinity: %d.", (int)cfg.cpu_aff); + else + printer::inst()->print_msg(L1, "Starting GPU thread, no affinity."); + } + + return pvThreads; +} + +void minethd::switch_work(miner_work& pWork) +{ + // iConsumeCnt is a basic lock-like polling mechanism just in case we happen to push work + // faster than threads can consume them. This should never happen in real life. + // Pool cant physically send jobs faster than every 250ms or so due to net latency. 
+ + while (GlobalStates::inst().iConsumeCnt.load(std::memory_order_seq_cst) < GlobalStates::inst().iThreadCount) + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + + GlobalStates::inst().oGlobalWork = pWork; + GlobalStates::inst().iConsumeCnt.store(0, std::memory_order_seq_cst); + GlobalStates::inst().iGlobalJobNo++; +} + +void minethd::consume_work() +{ + memcpy(&oWork, &GlobalStates::inst().oGlobalWork, sizeof(miner_work)); + iJobNo++; + GlobalStates::inst().iConsumeCnt++; + +} + +void minethd::work_main() +{ + uint64_t iCount = 0; + + cryptonight_ctx* cpu_ctx; + cpu_ctx = cpu::minethd::minethd_alloc_ctx(); + cn_hash_fun hash_fun = cpu::minethd::func_selector(::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/); + + GlobalStates::inst().iConsumeCnt++; + + while (bQuit == 0) + { + if (oWork.bStall) + { + /* We are stalled here because the executor didn't find a job for us yet, + either because of network latency, or a socket problem. Since we are + raison d'etre of this software it us sensible to just wait until we have something*/ + + while (GlobalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo) + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + + consume_work(); + continue; + } + + assert(sizeof(job_result::sJobID) == sizeof(pool_job::sJobID)); + pGpuCtx->Nonce = calc_start_nonce(oWork.iResumeCnt); + uint32_t target = oWork.iTarget32; + XMRSetJob(pGpuCtx, oWork.bWorkBlob, oWork.iWorkSize, target); + + while(GlobalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo) + { + cl_uint results[0x100]; + memset(results,0,sizeof(cl_uint)*(0x100)); + + XMRRunJob(pGpuCtx, results); + + for(size_t i = 0; i < results[0xFF]; i++) + { + uint8_t bWorkBlob[112]; + uint8_t bResult[32]; + + memcpy(bWorkBlob, oWork.bWorkBlob, oWork.iWorkSize); + memset(bResult, 0, sizeof(job_result::bResult)); + + *(uint32_t*)(bWorkBlob + 39) = results[i]; + + hash_fun(bWorkBlob, oWork.iWorkSize, bResult, cpu_ctx); + if ( (*((uint64_t*)(bResult + 24))) < oWork.iTarget) + executor::inst()->push_event(ex_event(job_result(oWork.sJobID, results[i], bResult), oWork.iPoolId)); + else + executor::inst()->log_result_error("AMD Invalid Result"); + } + + iCount += pGpuCtx->rawIntensity; + using namespace std::chrono; + uint64_t iStamp = time_point_cast<milliseconds>(high_resolution_clock::now()).time_since_epoch().count(); + iHashCount.store(iCount, std::memory_order_relaxed); + iTimestamp.store(iStamp, std::memory_order_relaxed); + std::this_thread::yield(); + } + + consume_work(); + } +} + +} // namespace amd +} // namespace xmrstak diff --git a/xmrstak/backend/amd/minethd.hpp b/xmrstak/backend/amd/minethd.hpp new file mode 100644 index 0000000..4fb3b13 --- /dev/null +++ b/xmrstak/backend/amd/minethd.hpp @@ -0,0 +1,61 @@ +#pragma once +#include <thread> +#include <atomic> +#include "./jconf.h" +#include "../IBackend.hpp" +#include "../../Environment.hpp" + +#include "amd_gpu/gpu.h" + +namespace xmrstak +{ +namespace amd +{ + +class minethd : public IBackend +{ +public: + + static void switch_work(miner_work& pWork); + static std::vector<IBackend*>* thread_starter(uint32_t threadOffset, miner_work& pWork); + static bool init_gpus(); + +private: + typedef void (*cn_hash_fun)(const void*, size_t, void*, cryptonight_ctx*); + + minethd(miner_work& pWork, size_t iNo, GpuContext* ctx); + + // We use the top 8 bits of the nonce for thread and resume + // This allows us to resume up to 64 threads 4 times before + // we get nonce collisions + // Bottom 24 bits allow 
for an hour of work at 4000 H/s + inline uint32_t calc_start_nonce(uint32_t resume) + { + return reverseBits<uint32_t>(static_cast<uint32_t>(iThreadNo + GlobalStates::inst().iThreadCount * resume)); + } + + void work_main(); + void double_work_main(); + void consume_work(); + + uint64_t iJobNo; + + static miner_work oGlobalWork; + miner_work oWork; + + std::thread oWorkThd; + uint8_t iThreadNo; + + bool bQuit; + bool bNoPrefetch; + + //Mutable ptr to vector below, different for each thread + GpuContext* pGpuCtx; + + // WARNING - this vector (but not its contents) must be immutable + // once the threads are started + static std::vector<GpuContext> vGpuData; +}; + +} // namespace amd +} // namespace xmrstak diff --git a/xmrstak/backend/backendConnector.cpp b/xmrstak/backend/backendConnector.cpp new file mode 100644 index 0000000..1013f79 --- /dev/null +++ b/xmrstak/backend/backendConnector.cpp @@ -0,0 +1,104 @@ +/* + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * Additional permission under GNU GPL version 3 section 7 + * + * If you modify this Program, or any covered work, by linking or combining + * it with OpenSSL (or a modified version of that library), containing parts + * covered by the terms of OpenSSL License and SSLeay License, the licensors + * of this Program grant you additional permission to convey the resulting work. 
+ * + */ + +#include <assert.h> +#include <cmath> +#include <chrono> +#include <cstring> +#include <thread> +#include <bitset> + +#include "IBackend.hpp" +#include "BackendConnector.hpp" + +#include "cpu/minethd.h" +#ifndef CONF_NO_CUDA +# include "nvidia/minethd.h" +#endif +#ifndef CONF_NO_OPENCL +# include "amd/minethd.h" +#endif +#include "miner_work.h" +#include "GlobalStates.hpp" + #include <cstdlib> + +#include "Plugin.hpp" +#include "../Environment.hpp" +#include "../console.h" +#include "../Params.hpp" + +namespace xmrstak +{ + + +bool BackendConnector::self_test() +{ + + return true; +} + +std::vector<IBackend*>* BackendConnector::thread_starter(miner_work& pWork) +{ + GlobalStates::inst().iGlobalJobNo = 0; + GlobalStates::inst().iConsumeCnt = 0; + + + std::vector<IBackend*>* pvThreads = new std::vector<IBackend*>; + +#ifndef CONF_NO_CUDA + if(Params::inst().useNVIDIA) + { + Plugin nvidiaPlugin("NVIDIA", "xmrstak_cuda_backend"); + std::vector<IBackend*>* nvidiaThreads = nvidiaPlugin.startBackend(static_cast<uint32_t>(pvThreads->size()), pWork, Environment::inst()); + pvThreads->insert(std::end(*pvThreads), std::begin(*nvidiaThreads), std::end(*nvidiaThreads)); + if(nvidiaThreads->size() == 0) + printer::inst()->print_msg(L0, "WARNING: backend NVIDIA disabled."); + } +#endif + +#ifndef CONF_NO_OPENCL + if(Params::inst().useAMD) + { + Plugin amdPlugin("AMD", "xmrstak_opencl_backend"); + std::vector<IBackend*>* amdThreads = amdPlugin.startBackend(static_cast<uint32_t>(pvThreads->size()), pWork, Environment::inst()); + pvThreads->insert(std::end(*pvThreads), std::begin(*amdThreads), std::end(*amdThreads)); + if(amdThreads->size() == 0) + printer::inst()->print_msg(L0, "WARNING: backend AMD disabled."); + } +#endif + +#ifndef CONF_NO_CPU + if(Params::inst().useCPU) + { + auto cpuThreads = cpu::minethd::thread_starter(static_cast<uint32_t>(pvThreads->size()), pWork); + pvThreads->insert(std::end(*pvThreads), std::begin(cpuThreads), std::end(cpuThreads)); + if(cpuThreads.size() == 0) + printer::inst()->print_msg(L0, "WARNING: backend CPU disabled."); + } +#endif + + GlobalStates::inst().iThreadCount = pvThreads->size(); + return pvThreads; +} + +} // namepsace xmrstak diff --git a/xmrstak/backend/backendConnector.hpp b/xmrstak/backend/backendConnector.hpp new file mode 100644 index 0000000..024d9b1 --- /dev/null +++ b/xmrstak/backend/backendConnector.hpp @@ -0,0 +1,18 @@ +#pragma once +#include <thread> +#include <vector> +#include <atomic> +#include <mutex> +#include "IBackend.hpp" +#include "miner_work.h" + +namespace xmrstak +{ + + struct BackendConnector + { + static std::vector<IBackend*>* thread_starter(miner_work& pWork); + static bool self_test(); + }; + +} // namepsace xmrstak diff --git a/xmrstak/backend/cpu/autoAdjust.hpp b/xmrstak/backend/cpu/autoAdjust.hpp new file mode 100644 index 0000000..32c8576 --- /dev/null +++ b/xmrstak/backend/cpu/autoAdjust.hpp @@ -0,0 +1,172 @@ +#pragma once +#include "jconf.h" +#include "../../console.h" +#include "../../jconf.h" +#include "../../ConfigEditor.hpp" +#include "../../Params.hpp" +#include <string> + +#ifdef _WIN32 +#include <windows.h> +#else +#include <unistd.h> +#endif // _WIN32 + + +namespace xmrstak +{ +namespace cpu +{ +// Mask bits between h and l and return the value +// This enables us to put in values exactly like in the manual +// For example EBX[31:22] is get_masked(cpu_info[1], 31, 22) +inline int32_t get_masked(int32_t val, int32_t h, int32_t l) +{ + val &= (0x7FFFFFFF >> (31-(h-l))) << l; + return val >> l; +} + +class 
autoAdjust +{ +public: + + autoAdjust() + { + } + + bool printConfig() + { + + ConfigEditor configTpl{}; + + // load the template of the backend config into a char variable + const char *tpl = + #include "./config.tpl" + ; + configTpl.set( std::string(tpl) ); + + std::string conf; + + if(!detectL3Size() || L3KB_size < 1024 || L3KB_size > 102400) + { + if(L3KB_size < 1024 || L3KB_size > 102400) + printer::inst()->print_msg(L0, "Autoconf failed: L3 size sanity check failed - %u KB.", L3KB_size); + + conf += std::string(" { \"low_power_mode\" : false, \"no_prefetch\" : true, \"affine_to_cpu\" : false },\n"); + printer::inst()->print_msg(L0, "Autoconf FAILED. Create config for a single thread. Please try to add new ones until the hashrate slows down."); + } + else + { + printer::inst()->print_msg(L0, "Autoconf L3 size detected at %u KB.", L3KB_size); + + detectCPUConf(); + + printer::inst()->print_msg(L0, "Autoconf core count detected as %u on %s.", corecnt, + linux_layout ? "Linux" : "Windows"); + + uint32_t aff_id = 0; + for(uint32_t i=0; i < corecnt; i++) + { + bool double_mode; + + if(L3KB_size <= 0) + break; + + double_mode = L3KB_size / 2048 > (int32_t)(corecnt-i); + + conf += std::string(" { \"low_power_mode\" : "); + conf += std::string(double_mode ? "true" : "false"); + conf += std::string(", \"no_prefetch\" : true, \"affine_to_cpu\" : "); + conf += std::to_string(aff_id); + conf += std::string(" },\n"); + + if(!linux_layout || old_amd) + { + aff_id += 2; + + if(aff_id >= corecnt) + aff_id = 1; + } + else + aff_id++; + + if(double_mode) + L3KB_size -= 4096; + else + L3KB_size -= 2048; + } + } + + configTpl.replace("CPUCONFIG",conf); + configTpl.write(Params::inst().configFileCPU); + printer::inst()->print_msg(L0, "CPU configuration stored in file '%s'", Params::inst().configFileCPU.c_str()); + + return true; + } + +private: + bool detectL3Size() + { + int32_t cpu_info[4]; + char cpustr[13] = {0}; + + ::jconf::cpuid(0, 0, cpu_info); + memcpy(cpustr, &cpu_info[1], 4); + memcpy(cpustr+4, &cpu_info[3], 4); + memcpy(cpustr+8, &cpu_info[2], 4); + + if(strcmp(cpustr, "GenuineIntel") == 0) + { + ::jconf::cpuid(4, 3, cpu_info); + + if(get_masked(cpu_info[0], 7, 5) != 3) + { + printer::inst()->print_msg(L0, "Autoconf failed: Couln't find L3 cache page."); + return false; + } + + L3KB_size = ((get_masked(cpu_info[1], 31, 22) + 1) * (get_masked(cpu_info[1], 21, 12) + 1) * + (get_masked(cpu_info[1], 11, 0) + 1) * (cpu_info[2] + 1)) / 1024; + + return true; + } + else if(strcmp(cpustr, "AuthenticAMD") == 0) + { + ::jconf::cpuid(0x80000006, 0, cpu_info); + + L3KB_size = get_masked(cpu_info[3], 31, 18) * 512; + + ::jconf::cpuid(1, 0, cpu_info); + if(get_masked(cpu_info[0], 11, 8) < 0x17) //0x17h is Zen + old_amd = true; + + return true; + } + else + { + printer::inst()->print_msg(L0, "Autoconf failed: Unknown CPU type: %s.", cpustr); + return false; + } + } + + void detectCPUConf() + { +#ifdef _WIN32 + SYSTEM_INFO info; + GetSystemInfo(&info); + corecnt = info.dwNumberOfProcessors; + linux_layout = false; +#else + corecnt = sysconf(_SC_NPROCESSORS_ONLN); + linux_layout = true; +#endif // _WIN32 + } + + int32_t L3KB_size = 0; + uint32_t corecnt; + bool old_amd = false; + bool linux_layout; +}; + +} // namespace cpu +} // namepsace xmrstak diff --git a/xmrstak/backend/cpu/autoAdjustHwloc.hpp b/xmrstak/backend/cpu/autoAdjustHwloc.hpp new file mode 100644 index 0000000..e1916e0 --- /dev/null +++ b/xmrstak/backend/cpu/autoAdjustHwloc.hpp @@ -0,0 +1,210 @@ +#pragma once + +#include "../../console.h" 
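+
+/* Worked example for the CPUID based L3 sizing in autoAdjust.hpp above
+ * (illustrative register values, not taken from a real machine): if CPUID
+ * leaf 4, sub-leaf 3 reports EBX = 0x02C0003F and ECX = 0x00001FFF, then
+ *   ways       = get_masked(EBX, 31, 22) + 1 = 11 + 1 = 12
+ *   partitions = get_masked(EBX, 21, 12) + 1 =  0 + 1 =  1
+ *   line size  = get_masked(EBX, 11,  0) + 1 = 63 + 1 = 64 bytes
+ *   sets       = ECX + 1                     = 8192
+ * giving L3KB_size = 12 * 1 * 64 * 8192 / 1024 = 6144 KB, i.e. a 6 MB L3,
+ * enough for three 2 MB cryptonight scratchpads. */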
+#include <hwloc.h> +#include <stdio.h> +#include "../../Params.hpp" + +#ifdef _WIN32 +#include <windows.h> +#else +#include <unistd.h> +#endif // _WIN32 + +#include <string> +#include "../../ConfigEditor.hpp" + +namespace xmrstak +{ +namespace cpu +{ + +class autoAdjust +{ +public: + + autoAdjust() + { + } + + bool printConfig() + { + + hwloc_topology_t topology; + hwloc_topology_init(&topology); + hwloc_topology_load(topology); + + std::string conf; + ConfigEditor configTpl{}; + + // load the template of the backend config into a char variable + const char *tpl = + #include "./config.tpl" + ; + configTpl.set( std::string(tpl) ); + + try + { + std::vector<hwloc_obj_t> tlcs; + tlcs.reserve(16); + results.reserve(16); + + findChildrenCaches(hwloc_get_root_obj(topology), + [&tlcs](hwloc_obj_t found) { tlcs.emplace_back(found); } ); + + if(tlcs.size() == 0) + throw(std::runtime_error("The CPU doesn't seem to have a cache.")); + + for(hwloc_obj_t obj : tlcs) + proccessTopLevelCache(obj); + + for(uint32_t id : results) + { + conf += std::string(" { \"low_power_mode\" : "); + conf += std::string((id & 0x8000000) != 0 ? "true" : "false"); + conf += std::string(", \"no_prefetch\" : true, \"affine_to_cpu\" : "); + conf += std::to_string(id & 0x7FFFFFF); + conf += std::string(" },\n"); + } + } + catch(const std::runtime_error& err) + { + // \todo add fallback to default auto adjust + conf += std::string(" { \"low_power_mode\" : false, \"no_prefetch\" : true, \"affine_to_cpu\" : false },\n"); + printer::inst()->print_msg(L0, "Autoconf FAILED: %s. Create config for a single thread.", err.what()); + } + + configTpl.replace("CPUCONFIG",conf); + configTpl.write(Params::inst().configFileCPU); + printer::inst()->print_msg(L0, "CPU configuration stored in file '%s'", Params::inst().configFileCPU.c_str()); + /* Destroy topology object. */ + hwloc_topology_destroy(topology); + + return true; + } + +private: + static constexpr size_t hashSize = 2 * 1024 * 1024; + std::vector<uint32_t> results; + + template<typename func> + inline void findChildrenByType(hwloc_obj_t obj, hwloc_obj_type_t type, func lambda) + { + for(size_t i=0; i < obj->arity; i++) + { + if(obj->children[i]->type == type) + lambda(obj->children[i]); + else + findChildrenByType(obj->children[i], type, lambda); + } + } + + inline bool isCacheObject(hwloc_obj_t obj) + { +#if HWLOC_API_VERSION >= 0x20000 + return hwloc_obj_type_is_cache(obj->type); +#else + return obj->type == HWLOC_OBJ_CACHE; +#endif // HWLOC_API_VERSION + } + + template<typename func> + inline void findChildrenCaches(hwloc_obj_t obj, func lambda) + { + for(size_t i=0; i < obj->arity; i++) + { + if(isCacheObject(obj->children[i])) + lambda(obj->children[i]); + else + findChildrenCaches(obj->children[i], lambda); + } + } + + inline bool isCacheExclusive(hwloc_obj_t obj) + { + const char* value = hwloc_obj_get_info_by_name(obj, "Inclusive"); + return value == nullptr || value[0] != '1'; + } + + // Top level cache isn't shared with other cores on the same package + // This will usually be 1 x L3, but can be 2 x L2 per package + void proccessTopLevelCache(hwloc_obj_t obj) + { + if(obj->attr == nullptr) + throw(std::runtime_error("Cache object hasn't got attributes.")); + + size_t PUs = 0; + findChildrenByType(obj, HWLOC_OBJ_PU, [&PUs](hwloc_obj_t found) { PUs++; } ); + + //Strange case, but we will handle it silently, surely there must be one PU somewhere? 
+ if(PUs == 0) + return; + + if(obj->attr->cache.size == 0) + { + //We will always have one child if PUs > 0 + if(!isCacheObject(obj->children[0])) + throw(std::runtime_error("The CPU doesn't seem to have a cache.")); + + //Try our luck with lower level caches + for(size_t i=0; i < obj->arity; i++) + proccessTopLevelCache(obj->children[i]); + return; + } + + size_t cacheSize = obj->attr->cache.size; + if(isCacheExclusive(obj)) + { + for(size_t i=0; i < obj->arity; i++) + { + hwloc_obj_t l2obj = obj->children[i]; + //If L2 is exclusive and greater or equal to 2MB add room for one more hash + if(isCacheObject(l2obj) && l2obj->attr != nullptr && l2obj->attr->cache.size >= hashSize) + cacheSize += hashSize; + } + } + + std::vector<hwloc_obj_t> cores; + cores.reserve(16); + findChildrenByType(obj, HWLOC_OBJ_CORE, [&cores](hwloc_obj_t found) { cores.emplace_back(found); } ); + + size_t cacheHashes = (cacheSize + hashSize/2) / hashSize; + + //Firstly allocate PU 0 of every CORE, then PU 1 etc. + size_t pu_id = 0; + while(cacheHashes > 0 && PUs > 0) + { + bool allocated_pu = false; + for(hwloc_obj_t core : cores) + { + if(core->arity <= pu_id || core->children[pu_id]->type != HWLOC_OBJ_PU) + continue; + + size_t os_id = core->children[pu_id]->os_index; + + if(cacheHashes > PUs) + { + cacheHashes -= 2; + os_id |= 0x8000000; //double hash marker bit + } + else + cacheHashes--; + PUs--; + + allocated_pu = true; + results.emplace_back(os_id); + + if(cacheHashes == 0) + break; + } + + if(!allocated_pu) + throw(std::runtime_error("Failed to allocate a PU.")); + + pu_id++; + } + } +}; + +} // namespace cpu +} // namepsace xmrstak diff --git a/xmrstak/backend/cpu/config.tpl b/xmrstak/backend/cpu/config.tpl new file mode 100644 index 0000000..990a31d --- /dev/null +++ b/xmrstak/backend/cpu/config.tpl @@ -0,0 +1,32 @@ +R"===( +/* + * Thread configuration for each thread. Make sure it matches the number above. + * low_power_mode - This mode will double the cache usage, and double the single thread performance. It will + * consume much less power (as less cores are working), but will max out at around 80-85% of + * the maximum performance. + * + * no_prefetch - Some sytems can gain up to extra 5% here, but sometimes it will have no difference or make + * things slower. + * + * affine_to_cpu - This can be either false (no affinity), or the CPU core number. Note that on hyperthreading + * systems it is better to assign threads to physical cores. On Windows this usually means selecting + * even or odd numbered cpu numbers. For Linux it will be usually the lower CPU numbers, so for a 4 + * physical core CPU you should select cpu numbers 0-3. + * + * On the first run the miner will look at your system and suggest a basic configuration that will work, + * you can try to tweak it from there to get the best performance. 
+ * + * A filled out configuration should look like this: + * "cpu_threads_conf" : + * [ + * { "low_power_mode" : false, "no_prefetch" : true, "affine_to_cpu" : 0 }, + * { "low_power_mode" : false, "no_prefetch" : true, "affine_to_cpu" : 1 }, + * ], + */ + +"cpu_threads_conf" : +[ +CPUCONFIG +], + +)===" diff --git a/xmrstak/backend/cpu/crypto/c_blake256.c b/xmrstak/backend/cpu/crypto/c_blake256.c new file mode 100644 index 0000000..ff623dd --- /dev/null +++ b/xmrstak/backend/cpu/crypto/c_blake256.c @@ -0,0 +1,326 @@ +/* + * The blake256_* and blake224_* functions are largely copied from + * blake256_light.c and blake224_light.c from the BLAKE website: + * + * http://131002.net/blake/ + * + * The hmac_* functions implement HMAC-BLAKE-256 and HMAC-BLAKE-224. + * HMAC is specified by RFC 2104. + */ + +#include <string.h> +#include <stdio.h> +#include <stdint.h> +#include "c_blake256.h" + +#define U8TO32(p) \ + (((uint32_t)((p)[0]) << 24) | ((uint32_t)((p)[1]) << 16) | \ + ((uint32_t)((p)[2]) << 8) | ((uint32_t)((p)[3]) )) +#define U32TO8(p, v) \ + (p)[0] = (uint8_t)((v) >> 24); (p)[1] = (uint8_t)((v) >> 16); \ + (p)[2] = (uint8_t)((v) >> 8); (p)[3] = (uint8_t)((v) ); + +const uint8_t sigma[][16] = { + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15}, + {14,10, 4, 8, 9,15,13, 6, 1,12, 0, 2,11, 7, 5, 3}, + {11, 8,12, 0, 5, 2,15,13,10,14, 3, 6, 7, 1, 9, 4}, + { 7, 9, 3, 1,13,12,11,14, 2, 6, 5,10, 4, 0,15, 8}, + { 9, 0, 5, 7, 2, 4,10,15,14, 1,11,12, 6, 8, 3,13}, + { 2,12, 6,10, 0,11, 8, 3, 4,13, 7, 5,15,14, 1, 9}, + {12, 5, 1,15,14,13, 4,10, 0, 7, 6, 3, 9, 2, 8,11}, + {13,11, 7,14,12, 1, 3, 9, 5, 0,15, 4, 8, 6, 2,10}, + { 6,15,14, 9,11, 3, 0, 8,12, 2,13, 7, 1, 4,10, 5}, + {10, 2, 8, 4, 7, 6, 1, 5,15,11, 9,14, 3,12,13, 0}, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15}, + {14,10, 4, 8, 9,15,13, 6, 1,12, 0, 2,11, 7, 5, 3}, + {11, 8,12, 0, 5, 2,15,13,10,14, 3, 6, 7, 1, 9, 4}, + { 7, 9, 3, 1,13,12,11,14, 2, 6, 5,10, 4, 0,15, 8} +}; + +const uint32_t cst[16] = { + 0x243F6A88, 0x85A308D3, 0x13198A2E, 0x03707344, + 0xA4093822, 0x299F31D0, 0x082EFA98, 0xEC4E6C89, + 0x452821E6, 0x38D01377, 0xBE5466CF, 0x34E90C6C, + 0xC0AC29B7, 0xC97C50DD, 0x3F84D5B5, 0xB5470917 +}; + +static const uint8_t padding[] = { + 0x80,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +}; + + +void blake256_compress(state *S, const uint8_t *block) { + uint32_t v[16], m[16], i; + +#define ROT(x,n) (((x)<<(32-n))|((x)>>(n))) +#define G(a,b,c,d,e) \ + v[a] += (m[sigma[i][e]] ^ cst[sigma[i][e+1]]) + v[b]; \ + v[d] = ROT(v[d] ^ v[a],16); \ + v[c] += v[d]; \ + v[b] = ROT(v[b] ^ v[c],12); \ + v[a] += (m[sigma[i][e+1]] ^ cst[sigma[i][e]])+v[b]; \ + v[d] = ROT(v[d] ^ v[a], 8); \ + v[c] += v[d]; \ + v[b] = ROT(v[b] ^ v[c], 7); + + for (i = 0; i < 16; ++i) m[i] = U8TO32(block + i * 4); + for (i = 0; i < 8; ++i) v[i] = S->h[i]; + v[ 8] = S->s[0] ^ 0x243F6A88; + v[ 9] = S->s[1] ^ 0x85A308D3; + v[10] = S->s[2] ^ 0x13198A2E; + v[11] = S->s[3] ^ 0x03707344; + v[12] = 0xA4093822; + v[13] = 0x299F31D0; + v[14] = 0x082EFA98; + v[15] = 0xEC4E6C89; + + if (S->nullt == 0) { + v[12] ^= S->t[0]; + v[13] ^= S->t[0]; + v[14] ^= S->t[1]; + v[15] ^= S->t[1]; + } + + for (i = 0; i < 14; ++i) { + G(0, 4, 8, 12, 0); + G(1, 5, 9, 13, 2); + G(2, 6, 10, 14, 4); + G(3, 7, 11, 15, 6); + G(3, 4, 9, 14, 14); + G(2, 7, 8, 13, 12); + G(0, 5, 10, 15, 8); + G(1, 6, 11, 12, 10); + } + + for (i = 0; i < 16; ++i) S->h[i % 8] ^= v[i]; + for (i = 0; i < 8; ++i) S->h[i] ^= S->s[i % 4]; +} + 
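+
+/* Illustrative usage of the BLAKE-256 API declared in c_blake256.h; the example
+ * function below is hypothetical and not referenced anywhere else, so it is
+ * fenced off with #if 0. Note that the one-shot blake256_hash() takes its
+ * length in bytes, while blake256_update() takes it in bits. */
+#if 0
+static void blake256_usage_example(void)
+{
+	uint8_t digest[32];
+	state S;
+	const uint8_t msg[3] = { 'a', 'b', 'c' };
+
+	/* one-shot helper: inlen is the number of bytes */
+	blake256_hash(digest, msg, sizeof(msg));
+
+	/* streaming interface: datalen is the number of bits */
+	blake256_init(&S);
+	blake256_update(&S, msg, sizeof(msg) * 8);
+	blake256_final(&S, digest); /* same 32 byte digest as the one-shot call */
+}
+#endif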
+void blake256_init(state *S) { + S->h[0] = 0x6A09E667; + S->h[1] = 0xBB67AE85; + S->h[2] = 0x3C6EF372; + S->h[3] = 0xA54FF53A; + S->h[4] = 0x510E527F; + S->h[5] = 0x9B05688C; + S->h[6] = 0x1F83D9AB; + S->h[7] = 0x5BE0CD19; + S->t[0] = S->t[1] = S->buflen = S->nullt = 0; + S->s[0] = S->s[1] = S->s[2] = S->s[3] = 0; +} + +void blake224_init(state *S) { + S->h[0] = 0xC1059ED8; + S->h[1] = 0x367CD507; + S->h[2] = 0x3070DD17; + S->h[3] = 0xF70E5939; + S->h[4] = 0xFFC00B31; + S->h[5] = 0x68581511; + S->h[6] = 0x64F98FA7; + S->h[7] = 0xBEFA4FA4; + S->t[0] = S->t[1] = S->buflen = S->nullt = 0; + S->s[0] = S->s[1] = S->s[2] = S->s[3] = 0; +} + +// datalen = number of bits +void blake256_update(state *S, const uint8_t *data, uint64_t datalen) { + int left = S->buflen >> 3; + int fill = 64 - left; + + if (left && (((datalen >> 3) & 0x3F) >= (unsigned) fill)) { + memcpy((void *) (S->buf + left), (void *) data, fill); + S->t[0] += 512; + if (S->t[0] == 0) S->t[1]++; + blake256_compress(S, S->buf); + data += fill; + datalen -= (fill << 3); + left = 0; + } + + while (datalen >= 512) { + S->t[0] += 512; + if (S->t[0] == 0) S->t[1]++; + blake256_compress(S, data); + data += 64; + datalen -= 512; + } + + if (datalen > 0) { + memcpy((void *) (S->buf + left), (void *) data, datalen >> 3); + S->buflen = (left << 3) + datalen; + } else { + S->buflen = 0; + } +} + +// datalen = number of bits +void blake224_update(state *S, const uint8_t *data, uint64_t datalen) { + blake256_update(S, data, datalen); +} + +void blake256_final_h(state *S, uint8_t *digest, uint8_t pa, uint8_t pb) { + uint8_t msglen[8]; + uint32_t lo = S->t[0] + S->buflen, hi = S->t[1]; + if (lo < (unsigned) S->buflen) hi++; + U32TO8(msglen + 0, hi); + U32TO8(msglen + 4, lo); + + if (S->buflen == 440) { /* one padding byte */ + S->t[0] -= 8; + blake256_update(S, &pa, 8); + } else { + if (S->buflen < 440) { /* enough space to fill the block */ + if (S->buflen == 0) S->nullt = 1; + S->t[0] -= 440 - S->buflen; + blake256_update(S, padding, 440 - S->buflen); + } else { /* need 2 compressions */ + S->t[0] -= 512 - S->buflen; + blake256_update(S, padding, 512 - S->buflen); + S->t[0] -= 440; + blake256_update(S, padding + 1, 440); + S->nullt = 1; + } + blake256_update(S, &pb, 8); + S->t[0] -= 8; + } + S->t[0] -= 64; + blake256_update(S, msglen, 64); + + U32TO8(digest + 0, S->h[0]); + U32TO8(digest + 4, S->h[1]); + U32TO8(digest + 8, S->h[2]); + U32TO8(digest + 12, S->h[3]); + U32TO8(digest + 16, S->h[4]); + U32TO8(digest + 20, S->h[5]); + U32TO8(digest + 24, S->h[6]); + U32TO8(digest + 28, S->h[7]); +} + +void blake256_final(state *S, uint8_t *digest) { + blake256_final_h(S, digest, 0x81, 0x01); +} + +void blake224_final(state *S, uint8_t *digest) { + blake256_final_h(S, digest, 0x80, 0x00); +} + +// inlen = number of bytes +void blake256_hash(uint8_t *out, const uint8_t *in, uint64_t inlen) { + state S; + blake256_init(&S); + blake256_update(&S, in, inlen * 8); + blake256_final(&S, out); +} + +// inlen = number of bytes +void blake224_hash(uint8_t *out, const uint8_t *in, uint64_t inlen) { + state S; + blake224_init(&S); + blake224_update(&S, in, inlen * 8); + blake224_final(&S, out); +} + +// keylen = number of bytes +void hmac_blake256_init(hmac_state *S, const uint8_t *_key, uint64_t keylen) { + const uint8_t *key = _key; + uint8_t keyhash[32]; + uint8_t pad[64]; + uint64_t i; + + if (keylen > 64) { + blake256_hash(keyhash, key, keylen); + key = keyhash; + keylen = 32; + } + + blake256_init(&S->inner); + memset(pad, 0x36, 64); + for (i = 0; i < 
keylen; ++i) { + pad[i] ^= key[i]; + } + blake256_update(&S->inner, pad, 512); + + blake256_init(&S->outer); + memset(pad, 0x5c, 64); + for (i = 0; i < keylen; ++i) { + pad[i] ^= key[i]; + } + blake256_update(&S->outer, pad, 512); + + memset(keyhash, 0, 32); +} + +// keylen = number of bytes +void hmac_blake224_init(hmac_state *S, const uint8_t *_key, uint64_t keylen) { + const uint8_t *key = _key; + uint8_t keyhash[32]; + uint8_t pad[64]; + uint64_t i; + + if (keylen > 64) { + blake256_hash(keyhash, key, keylen); + key = keyhash; + keylen = 28; + } + + blake224_init(&S->inner); + memset(pad, 0x36, 64); + for (i = 0; i < keylen; ++i) { + pad[i] ^= key[i]; + } + blake224_update(&S->inner, pad, 512); + + blake224_init(&S->outer); + memset(pad, 0x5c, 64); + for (i = 0; i < keylen; ++i) { + pad[i] ^= key[i]; + } + blake224_update(&S->outer, pad, 512); + + memset(keyhash, 0, 32); +} + +// datalen = number of bits +void hmac_blake256_update(hmac_state *S, const uint8_t *data, uint64_t datalen) { + // update the inner state + blake256_update(&S->inner, data, datalen); +} + +// datalen = number of bits +void hmac_blake224_update(hmac_state *S, const uint8_t *data, uint64_t datalen) { + // update the inner state + blake224_update(&S->inner, data, datalen); +} + +void hmac_blake256_final(hmac_state *S, uint8_t *digest) { + uint8_t ihash[32]; + blake256_final(&S->inner, ihash); + blake256_update(&S->outer, ihash, 256); + blake256_final(&S->outer, digest); + memset(ihash, 0, 32); +} + +void hmac_blake224_final(hmac_state *S, uint8_t *digest) { + uint8_t ihash[32]; + blake224_final(&S->inner, ihash); + blake224_update(&S->outer, ihash, 224); + blake224_final(&S->outer, digest); + memset(ihash, 0, 32); +} + +// keylen = number of bytes; inlen = number of bytes +void hmac_blake256_hash(uint8_t *out, const uint8_t *key, uint64_t keylen, const uint8_t *in, uint64_t inlen) { + hmac_state S; + hmac_blake256_init(&S, key, keylen); + hmac_blake256_update(&S, in, inlen * 8); + hmac_blake256_final(&S, out); +} + +// keylen = number of bytes; inlen = number of bytes +void hmac_blake224_hash(uint8_t *out, const uint8_t *key, uint64_t keylen, const uint8_t *in, uint64_t inlen) { + hmac_state S; + hmac_blake224_init(&S, key, keylen); + hmac_blake224_update(&S, in, inlen * 8); + hmac_blake224_final(&S, out); +} diff --git a/xmrstak/backend/cpu/crypto/c_blake256.h b/xmrstak/backend/cpu/crypto/c_blake256.h new file mode 100644 index 0000000..b9c2aad --- /dev/null +++ b/xmrstak/backend/cpu/crypto/c_blake256.h @@ -0,0 +1,43 @@ +#ifndef _BLAKE256_H_ +#define _BLAKE256_H_ + +#include <stdint.h> + +typedef struct { + uint32_t h[8], s[4], t[2]; + int buflen, nullt; + uint8_t buf[64]; +} state; + +typedef struct { + state inner; + state outer; +} hmac_state; + +void blake256_init(state *); +void blake224_init(state *); + +void blake256_update(state *, const uint8_t *, uint64_t); +void blake224_update(state *, const uint8_t *, uint64_t); + +void blake256_final(state *, uint8_t *); +void blake224_final(state *, uint8_t *); + +void blake256_hash(uint8_t *, const uint8_t *, uint64_t); +void blake224_hash(uint8_t *, const uint8_t *, uint64_t); + +/* HMAC functions: */ + +void hmac_blake256_init(hmac_state *, const uint8_t *, uint64_t); +void hmac_blake224_init(hmac_state *, const uint8_t *, uint64_t); + +void hmac_blake256_update(hmac_state *, const uint8_t *, uint64_t); +void hmac_blake224_update(hmac_state *, const uint8_t *, uint64_t); + +void hmac_blake256_final(hmac_state *, uint8_t *); +void hmac_blake224_final(hmac_state *, 
uint8_t *); + +void hmac_blake256_hash(uint8_t *, const uint8_t *, uint64_t, const uint8_t *, uint64_t); +void hmac_blake224_hash(uint8_t *, const uint8_t *, uint64_t, const uint8_t *, uint64_t); + +#endif /* _BLAKE256_H_ */ diff --git a/xmrstak/backend/cpu/crypto/c_groestl.c b/xmrstak/backend/cpu/crypto/c_groestl.c new file mode 100644 index 0000000..1318d5a --- /dev/null +++ b/xmrstak/backend/cpu/crypto/c_groestl.c @@ -0,0 +1,360 @@ +/* hash.c April 2012 + * Groestl ANSI C code optimised for 32-bit machines + * Author: Thomas Krinninger + * + * This work is based on the implementation of + * Soeren S. Thomsen and Krystian Matusiewicz + * + * + */ + +#include "c_groestl.h" +#include "groestl_tables.h" + +#define P_TYPE 0 +#define Q_TYPE 1 + +const uint8_t shift_Values[2][8] = {{0,1,2,3,4,5,6,7},{1,3,5,7,0,2,4,6}}; + +const uint8_t indices_cyclic[15] = {0,1,2,3,4,5,6,7,0,1,2,3,4,5,6}; + + +#define ROTATE_COLUMN_DOWN(v1, v2, amount_bytes, temp_var) {temp_var = (v1<<(8*amount_bytes))|(v2>>(8*(4-amount_bytes))); \ + v2 = (v2<<(8*amount_bytes))|(v1>>(8*(4-amount_bytes))); \ + v1 = temp_var;} + + +#define COLUMN(x,y,i,c0,c1,c2,c3,c4,c5,c6,c7,tv1,tv2,tu,tl,t) \ + tu = T[2*(uint32_t)x[4*c0+0]]; \ + tl = T[2*(uint32_t)x[4*c0+0]+1]; \ + tv1 = T[2*(uint32_t)x[4*c1+1]]; \ + tv2 = T[2*(uint32_t)x[4*c1+1]+1]; \ + ROTATE_COLUMN_DOWN(tv1,tv2,1,t) \ + tu ^= tv1; \ + tl ^= tv2; \ + tv1 = T[2*(uint32_t)x[4*c2+2]]; \ + tv2 = T[2*(uint32_t)x[4*c2+2]+1]; \ + ROTATE_COLUMN_DOWN(tv1,tv2,2,t) \ + tu ^= tv1; \ + tl ^= tv2; \ + tv1 = T[2*(uint32_t)x[4*c3+3]]; \ + tv2 = T[2*(uint32_t)x[4*c3+3]+1]; \ + ROTATE_COLUMN_DOWN(tv1,tv2,3,t) \ + tu ^= tv1; \ + tl ^= tv2; \ + tl ^= T[2*(uint32_t)x[4*c4+0]]; \ + tu ^= T[2*(uint32_t)x[4*c4+0]+1]; \ + tv1 = T[2*(uint32_t)x[4*c5+1]]; \ + tv2 = T[2*(uint32_t)x[4*c5+1]+1]; \ + ROTATE_COLUMN_DOWN(tv1,tv2,1,t) \ + tl ^= tv1; \ + tu ^= tv2; \ + tv1 = T[2*(uint32_t)x[4*c6+2]]; \ + tv2 = T[2*(uint32_t)x[4*c6+2]+1]; \ + ROTATE_COLUMN_DOWN(tv1,tv2,2,t) \ + tl ^= tv1; \ + tu ^= tv2; \ + tv1 = T[2*(uint32_t)x[4*c7+3]]; \ + tv2 = T[2*(uint32_t)x[4*c7+3]+1]; \ + ROTATE_COLUMN_DOWN(tv1,tv2,3,t) \ + tl ^= tv1; \ + tu ^= tv2; \ + y[i] = tu; \ + y[i+1] = tl; + + +/* compute one round of P (short variants) */ +static void RND512P(uint8_t *x, uint32_t *y, uint32_t r) { + uint32_t temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp; + uint32_t* x32 = (uint32_t*)x; + x32[ 0] ^= 0x00000000^r; + x32[ 2] ^= 0x00000010^r; + x32[ 4] ^= 0x00000020^r; + x32[ 6] ^= 0x00000030^r; + x32[ 8] ^= 0x00000040^r; + x32[10] ^= 0x00000050^r; + x32[12] ^= 0x00000060^r; + x32[14] ^= 0x00000070^r; + COLUMN(x,y, 0, 0, 2, 4, 6, 9, 11, 13, 15, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + COLUMN(x,y, 2, 2, 4, 6, 8, 11, 13, 15, 1, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + COLUMN(x,y, 4, 4, 6, 8, 10, 13, 15, 1, 3, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + COLUMN(x,y, 6, 6, 8, 10, 12, 15, 1, 3, 5, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + COLUMN(x,y, 8, 8, 10, 12, 14, 1, 3, 5, 7, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + COLUMN(x,y,10, 10, 12, 14, 0, 3, 5, 7, 9, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + COLUMN(x,y,12, 12, 14, 0, 2, 5, 7, 9, 11, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + COLUMN(x,y,14, 14, 0, 2, 4, 7, 9, 11, 13, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); +} + +/* compute one round of Q (short variants) */ +static void RND512Q(uint8_t *x, uint32_t 
*y, uint32_t r) { + uint32_t temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp; + uint32_t* x32 = (uint32_t*)x; + x32[ 0] = ~x32[ 0]; + x32[ 1] ^= 0xffffffff^r; + x32[ 2] = ~x32[ 2]; + x32[ 3] ^= 0xefffffff^r; + x32[ 4] = ~x32[ 4]; + x32[ 5] ^= 0xdfffffff^r; + x32[ 6] = ~x32[ 6]; + x32[ 7] ^= 0xcfffffff^r; + x32[ 8] = ~x32[ 8]; + x32[ 9] ^= 0xbfffffff^r; + x32[10] = ~x32[10]; + x32[11] ^= 0xafffffff^r; + x32[12] = ~x32[12]; + x32[13] ^= 0x9fffffff^r; + x32[14] = ~x32[14]; + x32[15] ^= 0x8fffffff^r; + COLUMN(x,y, 0, 2, 6, 10, 14, 1, 5, 9, 13, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + COLUMN(x,y, 2, 4, 8, 12, 0, 3, 7, 11, 15, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + COLUMN(x,y, 4, 6, 10, 14, 2, 5, 9, 13, 1, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + COLUMN(x,y, 6, 8, 12, 0, 4, 7, 11, 15, 3, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + COLUMN(x,y, 8, 10, 14, 2, 6, 9, 13, 1, 5, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + COLUMN(x,y,10, 12, 0, 4, 8, 11, 15, 3, 7, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + COLUMN(x,y,12, 14, 2, 6, 10, 13, 1, 5, 9, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + COLUMN(x,y,14, 0, 4, 8, 12, 15, 3, 7, 11, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); +} + +/* compute compression function (short variants) */ +static void F512(uint32_t *h, const uint32_t *m) { + int i; + uint32_t Ptmp[2*COLS512]; + uint32_t Qtmp[2*COLS512]; + uint32_t y[2*COLS512]; + uint32_t z[2*COLS512]; + + for (i = 0; i < 2*COLS512; i++) { + z[i] = m[i]; + Ptmp[i] = h[i]^m[i]; + } + + /* compute Q(m) */ + RND512Q((uint8_t*)z, y, 0x00000000); + RND512Q((uint8_t*)y, z, 0x01000000); + RND512Q((uint8_t*)z, y, 0x02000000); + RND512Q((uint8_t*)y, z, 0x03000000); + RND512Q((uint8_t*)z, y, 0x04000000); + RND512Q((uint8_t*)y, z, 0x05000000); + RND512Q((uint8_t*)z, y, 0x06000000); + RND512Q((uint8_t*)y, z, 0x07000000); + RND512Q((uint8_t*)z, y, 0x08000000); + RND512Q((uint8_t*)y, Qtmp, 0x09000000); + + /* compute P(h+m) */ + RND512P((uint8_t*)Ptmp, y, 0x00000000); + RND512P((uint8_t*)y, z, 0x00000001); + RND512P((uint8_t*)z, y, 0x00000002); + RND512P((uint8_t*)y, z, 0x00000003); + RND512P((uint8_t*)z, y, 0x00000004); + RND512P((uint8_t*)y, z, 0x00000005); + RND512P((uint8_t*)z, y, 0x00000006); + RND512P((uint8_t*)y, z, 0x00000007); + RND512P((uint8_t*)z, y, 0x00000008); + RND512P((uint8_t*)y, Ptmp, 0x00000009); + + /* compute P(h+m) + Q(m) + h */ + for (i = 0; i < 2*COLS512; i++) { + h[i] ^= Ptmp[i]^Qtmp[i]; + } +} + + +/* digest up to msglen bytes of input (full blocks only) */ +static void Transform(groestlHashState *ctx, + const uint8_t *input, + int msglen) { + + /* digest message, one block at a time */ + for (; msglen >= SIZE512; + msglen -= SIZE512, input += SIZE512) { + F512(ctx->chaining,(uint32_t*)input); + + /* increment block counter */ + ctx->block_counter1++; + if (ctx->block_counter1 == 0) ctx->block_counter2++; + } +} + +/* given state h, do h <- P(h)+h */ +static void OutputTransformation(groestlHashState *ctx) { + int j; + uint32_t temp[2*COLS512]; + uint32_t y[2*COLS512]; + uint32_t z[2*COLS512]; + + + + for (j = 0; j < 2*COLS512; j++) { + temp[j] = ctx->chaining[j]; + } + RND512P((uint8_t*)temp, y, 0x00000000); + RND512P((uint8_t*)y, z, 0x00000001); + RND512P((uint8_t*)z, y, 0x00000002); + RND512P((uint8_t*)y, z, 0x00000003); + RND512P((uint8_t*)z, y, 0x00000004); + RND512P((uint8_t*)y, z, 0x00000005); + RND512P((uint8_t*)z, y, 
0x00000006); + RND512P((uint8_t*)y, z, 0x00000007); + RND512P((uint8_t*)z, y, 0x00000008); + RND512P((uint8_t*)y, temp, 0x00000009); + for (j = 0; j < 2*COLS512; j++) { + ctx->chaining[j] ^= temp[j]; + } +} + +/* initialise context */ +static void Init(groestlHashState* ctx) { + int i = 0; + /* allocate memory for state and data buffer */ + + for(;i<(SIZE512/sizeof(uint32_t));i++) + { + ctx->chaining[i] = 0; + } + + /* set initial value */ + ctx->chaining[2*COLS512-1] = u32BIG((uint32_t)HASH_BIT_LEN); + + /* set other variables */ + ctx->buf_ptr = 0; + ctx->block_counter1 = 0; + ctx->block_counter2 = 0; + ctx->bits_in_last_byte = 0; +} + +/* update state with databitlen bits of input */ +static void Update(groestlHashState* ctx, + const BitSequence* input, + DataLength databitlen) { + int index = 0; + int msglen = (int)(databitlen/8); + int rem = (int)(databitlen%8); + + /* if the buffer contains data that has not yet been digested, first + add data to buffer until full */ + if (ctx->buf_ptr) { + while (ctx->buf_ptr < SIZE512 && index < msglen) { + ctx->buffer[(int)ctx->buf_ptr++] = input[index++]; + } + if (ctx->buf_ptr < SIZE512) { + /* buffer still not full, return */ + if (rem) { + ctx->bits_in_last_byte = rem; + ctx->buffer[(int)ctx->buf_ptr++] = input[index]; + } + return; + } + + /* digest buffer */ + ctx->buf_ptr = 0; + Transform(ctx, ctx->buffer, SIZE512); + } + + /* digest bulk of message */ + Transform(ctx, input+index, msglen-index); + index += ((msglen-index)/SIZE512)*SIZE512; + + /* store remaining data in buffer */ + while (index < msglen) { + ctx->buffer[(int)ctx->buf_ptr++] = input[index++]; + } + + /* if non-integral number of bytes have been supplied, store + remaining bits in last byte, together with information about + number of bits */ + if (rem) { + ctx->bits_in_last_byte = rem; + ctx->buffer[(int)ctx->buf_ptr++] = input[index]; + } +} + +#define BILB ctx->bits_in_last_byte + +/* finalise: process remaining data (including padding), perform + output transformation, and write hash result to 'output' */ +static void Final(groestlHashState* ctx, + BitSequence* output) { + int i, j = 0, hashbytelen = HASH_BIT_LEN/8; + uint8_t *s = (BitSequence*)ctx->chaining; + + /* pad with '1'-bit and first few '0'-bits */ + if (BILB) { + ctx->buffer[(int)ctx->buf_ptr-1] &= ((1<<BILB)-1)<<(8-BILB); + ctx->buffer[(int)ctx->buf_ptr-1] ^= 0x1<<(7-BILB); + BILB = 0; + } + else ctx->buffer[(int)ctx->buf_ptr++] = 0x80; + + /* pad with '0'-bits */ + if (ctx->buf_ptr > SIZE512-LENGTHFIELDLEN) { + /* padding requires two blocks */ + while (ctx->buf_ptr < SIZE512) { + ctx->buffer[(int)ctx->buf_ptr++] = 0; + } + /* digest first padding block */ + Transform(ctx, ctx->buffer, SIZE512); + ctx->buf_ptr = 0; + } + while (ctx->buf_ptr < SIZE512-LENGTHFIELDLEN) { + ctx->buffer[(int)ctx->buf_ptr++] = 0; + } + + /* length padding */ + ctx->block_counter1++; + if (ctx->block_counter1 == 0) ctx->block_counter2++; + ctx->buf_ptr = SIZE512; + + while (ctx->buf_ptr > SIZE512-(int)sizeof(uint32_t)) { + ctx->buffer[(int)--ctx->buf_ptr] = (uint8_t)ctx->block_counter1; + ctx->block_counter1 >>= 8; + } + while (ctx->buf_ptr > SIZE512-LENGTHFIELDLEN) { + ctx->buffer[(int)--ctx->buf_ptr] = (uint8_t)ctx->block_counter2; + ctx->block_counter2 >>= 8; + } + /* digest final padding block */ + Transform(ctx, ctx->buffer, SIZE512); + /* perform output transformation */ + OutputTransformation(ctx); + + /* store hash result in output */ + for (i = SIZE512-hashbytelen; i < SIZE512; i++,j++) { + output[j] = s[i]; + } + + /* 
zeroise relevant variables and deallocate memory */ + for (i = 0; i < COLS512; i++) { + ctx->chaining[i] = 0; + } + for (i = 0; i < SIZE512; i++) { + ctx->buffer[i] = 0; + } +} + +/* hash bit sequence */ +void groestl(const BitSequence* data, + DataLength databitlen, + BitSequence* hashval) { + + groestlHashState context; + + /* initialise */ + Init(&context); + + + /* process message */ + Update(&context, data, databitlen); + + /* finalise */ + Final(&context, hashval); +} +/* +static int crypto_hash(unsigned char *out, + const unsigned char *in, + unsigned long long len) +{ + groestl(in, 8*len, out); + return 0; +} + +*/ diff --git a/xmrstak/backend/cpu/crypto/c_groestl.h b/xmrstak/backend/cpu/crypto/c_groestl.h new file mode 100644 index 0000000..2b51339 --- /dev/null +++ b/xmrstak/backend/cpu/crypto/c_groestl.h @@ -0,0 +1,60 @@ +#ifndef __hash_h +#define __hash_h +/* +#include "crypto_uint8.h" +#include "crypto_uint32.h" +#include "crypto_uint64.h" +#include "crypto_hash.h" + +typedef crypto_uint8 uint8_t; +typedef crypto_uint32 uint32_t; +typedef crypto_uint64 uint64_t; +*/ +#include <stdint.h> + +#include "hash.h" + +/* some sizes (number of bytes) */ +#define ROWS 8 +#define LENGTHFIELDLEN ROWS +#define COLS512 8 + +#define SIZE512 (ROWS*COLS512) + +#define ROUNDS512 10 +#define HASH_BIT_LEN 256 + +#define ROTL32(v, n) ((((v)<<(n))|((v)>>(32-(n))))&li_32(ffffffff)) + + +#define li_32(h) 0x##h##u +#define EXT_BYTE(var,n) ((uint8_t)((uint32_t)(var) >> (8*n))) +#define u32BIG(a) \ + ((ROTL32(a,8) & li_32(00FF00FF)) | \ + (ROTL32(a,24) & li_32(FF00FF00))) + + +/* NIST API begin */ +typedef struct { + uint32_t chaining[SIZE512/sizeof(uint32_t)]; /* actual state */ + uint32_t block_counter1, + block_counter2; /* message block counter(s) */ + BitSequence buffer[SIZE512]; /* data buffer */ + int buf_ptr; /* data buffer pointer */ + int bits_in_last_byte; /* no. 
of message bits in last byte of + data buffer */ +} groestlHashState; + +/*void Init(hashState*); +void Update(hashState*, const BitSequence*, DataLength); +void Final(hashState*, BitSequence*); */ +void groestl(const BitSequence*, DataLength, BitSequence*); +/* NIST API end */ + +/* +int crypto_hash(unsigned char *out, + const unsigned char *in, + unsigned long long len); +*/ + +#endif /* __hash_h */ diff --git a/xmrstak/backend/cpu/crypto/c_jh.c b/xmrstak/backend/cpu/crypto/c_jh.c new file mode 100644 index 0000000..9d685a0 --- /dev/null +++ b/xmrstak/backend/cpu/crypto/c_jh.c @@ -0,0 +1,367 @@ +/*This program gives the 64-bit optimized bitslice implementation of JH using ANSI C + + -------------------------------- + Performance + + Microprocessor: Intel CORE 2 processor (Core 2 Duo Mobile T6600 2.2GHz) + Operating System: 64-bit Ubuntu 10.04 (Linux kernel 2.6.32-22-generic) + Speed for long message: + 1) 45.8 cycles/byte compiler: Intel C++ Compiler 11.1 compilation option: icc -O2 + 2) 56.8 cycles/byte compiler: gcc 4.4.3 compilation option: gcc -O3 + + -------------------------------- + Last Modified: January 16, 2011 +*/ + +#include "c_jh.h" + +#include <stdint.h> +#include <string.h> + +/*typedef unsigned long long uint64;*/ +typedef uint64_t uint64; + +/*define data alignment for different C compilers*/ +#if defined(__GNUC__) + #define DATA_ALIGN16(x) x __attribute__ ((aligned(16))) +#else + #define DATA_ALIGN16(x) __declspec(align(16)) x +#endif + + +typedef struct { + int hashbitlen; /*the message digest size*/ + unsigned long long databitlen; /*the message size in bits*/ + unsigned long long datasize_in_buffer; /*the size of the message remained in buffer; assumed to be multiple of 8bits except for the last partial block at the end of the message*/ + DATA_ALIGN16(uint64 x[8][2]); /*the 1024-bit state, ( x[i][0] || x[i][1] ) is the ith row of the state in the pseudocode*/ + unsigned char buffer[64]; /*the 512-bit message block to be hashed;*/ +} hashState; + + +/*The initial hash value H(0)*/ +const unsigned char JH224_H0[128]={0x2d,0xfe,0xdd,0x62,0xf9,0x9a,0x98,0xac,0xae,0x7c,0xac,0xd6,0x19,0xd6,0x34,0xe7,0xa4,0x83,0x10,0x5,0xbc,0x30,0x12,0x16,0xb8,0x60,0x38,0xc6,0xc9,0x66,0x14,0x94,0x66,0xd9,0x89,0x9f,0x25,0x80,0x70,0x6f,0xce,0x9e,0xa3,0x1b,0x1d,0x9b,0x1a,0xdc,0x11,0xe8,0x32,0x5f,0x7b,0x36,0x6e,0x10,0xf9,0x94,0x85,0x7f,0x2,0xfa,0x6,0xc1,0x1b,0x4f,0x1b,0x5c,0xd8,0xc8,0x40,0xb3,0x97,0xf6,0xa1,0x7f,0x6e,0x73,0x80,0x99,0xdc,0xdf,0x93,0xa5,0xad,0xea,0xa3,0xd3,0xa4,0x31,0xe8,0xde,0xc9,0x53,0x9a,0x68,0x22,0xb4,0xa9,0x8a,0xec,0x86,0xa1,0xe4,0xd5,0x74,0xac,0x95,0x9c,0xe5,0x6c,0xf0,0x15,0x96,0xd,0xea,0xb5,0xab,0x2b,0xbf,0x96,0x11,0xdc,0xf0,0xdd,0x64,0xea,0x6e}; +const unsigned char JH256_H0[128]={0xeb,0x98,0xa3,0x41,0x2c,0x20,0xd3,0xeb,0x92,0xcd,0xbe,0x7b,0x9c,0xb2,0x45,0xc1,0x1c,0x93,0x51,0x91,0x60,0xd4,0xc7,0xfa,0x26,0x0,0x82,0xd6,0x7e,0x50,0x8a,0x3,0xa4,0x23,0x9e,0x26,0x77,0x26,0xb9,0x45,0xe0,0xfb,0x1a,0x48,0xd4,0x1a,0x94,0x77,0xcd,0xb5,0xab,0x26,0x2,0x6b,0x17,0x7a,0x56,0xf0,0x24,0x42,0xf,0xff,0x2f,0xa8,0x71,0xa3,0x96,0x89,0x7f,0x2e,0x4d,0x75,0x1d,0x14,0x49,0x8,0xf7,0x7d,0xe2,0x62,0x27,0x76,0x95,0xf7,0x76,0x24,0x8f,0x94,0x87,0xd5,0xb6,0x57,0x47,0x80,0x29,0x6c,0x5c,0x5e,0x27,0x2d,0xac,0x8e,0xd,0x6c,0x51,0x84,0x50,0xc6,0x57,0x5,0x7a,0xf,0x7b,0xe4,0xd3,0x67,0x70,0x24,0x12,0xea,0x89,0xe3,0xab,0x13,0xd3,0x1c,0xd7,0x69}; +const unsigned char 
JH384_H0[128]={0x48,0x1e,0x3b,0xc6,0xd8,0x13,0x39,0x8a,0x6d,0x3b,0x5e,0x89,0x4a,0xde,0x87,0x9b,0x63,0xfa,0xea,0x68,0xd4,0x80,0xad,0x2e,0x33,0x2c,0xcb,0x21,0x48,0xf,0x82,0x67,0x98,0xae,0xc8,0x4d,0x90,0x82,0xb9,0x28,0xd4,0x55,0xea,0x30,0x41,0x11,0x42,0x49,0x36,0xf5,0x55,0xb2,0x92,0x48,0x47,0xec,0xc7,0x25,0xa,0x93,0xba,0xf4,0x3c,0xe1,0x56,0x9b,0x7f,0x8a,0x27,0xdb,0x45,0x4c,0x9e,0xfc,0xbd,0x49,0x63,0x97,0xaf,0xe,0x58,0x9f,0xc2,0x7d,0x26,0xaa,0x80,0xcd,0x80,0xc0,0x8b,0x8c,0x9d,0xeb,0x2e,0xda,0x8a,0x79,0x81,0xe8,0xf8,0xd5,0x37,0x3a,0xf4,0x39,0x67,0xad,0xdd,0xd1,0x7a,0x71,0xa9,0xb4,0xd3,0xbd,0xa4,0x75,0xd3,0x94,0x97,0x6c,0x3f,0xba,0x98,0x42,0x73,0x7f}; +const unsigned char JH512_H0[128]={0x6f,0xd1,0x4b,0x96,0x3e,0x0,0xaa,0x17,0x63,0x6a,0x2e,0x5,0x7a,0x15,0xd5,0x43,0x8a,0x22,0x5e,0x8d,0xc,0x97,0xef,0xb,0xe9,0x34,0x12,0x59,0xf2,0xb3,0xc3,0x61,0x89,0x1d,0xa0,0xc1,0x53,0x6f,0x80,0x1e,0x2a,0xa9,0x5,0x6b,0xea,0x2b,0x6d,0x80,0x58,0x8e,0xcc,0xdb,0x20,0x75,0xba,0xa6,0xa9,0xf,0x3a,0x76,0xba,0xf8,0x3b,0xf7,0x1,0x69,0xe6,0x5,0x41,0xe3,0x4a,0x69,0x46,0xb5,0x8a,0x8e,0x2e,0x6f,0xe6,0x5a,0x10,0x47,0xa7,0xd0,0xc1,0x84,0x3c,0x24,0x3b,0x6e,0x71,0xb1,0x2d,0x5a,0xc1,0x99,0xcf,0x57,0xf6,0xec,0x9d,0xb1,0xf8,0x56,0xa7,0x6,0x88,0x7c,0x57,0x16,0xb1,0x56,0xe3,0xc2,0xfc,0xdf,0xe6,0x85,0x17,0xfb,0x54,0x5a,0x46,0x78,0xcc,0x8c,0xdd,0x4b}; + +/*42 round constants, each round constant is 32-byte (256-bit)*/ +const unsigned char E8_bitslice_roundconstant[42][32]={ +{0x72,0xd5,0xde,0xa2,0xdf,0x15,0xf8,0x67,0x7b,0x84,0x15,0xa,0xb7,0x23,0x15,0x57,0x81,0xab,0xd6,0x90,0x4d,0x5a,0x87,0xf6,0x4e,0x9f,0x4f,0xc5,0xc3,0xd1,0x2b,0x40}, +{0xea,0x98,0x3a,0xe0,0x5c,0x45,0xfa,0x9c,0x3,0xc5,0xd2,0x99,0x66,0xb2,0x99,0x9a,0x66,0x2,0x96,0xb4,0xf2,0xbb,0x53,0x8a,0xb5,0x56,0x14,0x1a,0x88,0xdb,0xa2,0x31}, +{0x3,0xa3,0x5a,0x5c,0x9a,0x19,0xe,0xdb,0x40,0x3f,0xb2,0xa,0x87,0xc1,0x44,0x10,0x1c,0x5,0x19,0x80,0x84,0x9e,0x95,0x1d,0x6f,0x33,0xeb,0xad,0x5e,0xe7,0xcd,0xdc}, +{0x10,0xba,0x13,0x92,0x2,0xbf,0x6b,0x41,0xdc,0x78,0x65,0x15,0xf7,0xbb,0x27,0xd0,0xa,0x2c,0x81,0x39,0x37,0xaa,0x78,0x50,0x3f,0x1a,0xbf,0xd2,0x41,0x0,0x91,0xd3}, +{0x42,0x2d,0x5a,0xd,0xf6,0xcc,0x7e,0x90,0xdd,0x62,0x9f,0x9c,0x92,0xc0,0x97,0xce,0x18,0x5c,0xa7,0xb,0xc7,0x2b,0x44,0xac,0xd1,0xdf,0x65,0xd6,0x63,0xc6,0xfc,0x23}, +{0x97,0x6e,0x6c,0x3,0x9e,0xe0,0xb8,0x1a,0x21,0x5,0x45,0x7e,0x44,0x6c,0xec,0xa8,0xee,0xf1,0x3,0xbb,0x5d,0x8e,0x61,0xfa,0xfd,0x96,0x97,0xb2,0x94,0x83,0x81,0x97}, +{0x4a,0x8e,0x85,0x37,0xdb,0x3,0x30,0x2f,0x2a,0x67,0x8d,0x2d,0xfb,0x9f,0x6a,0x95,0x8a,0xfe,0x73,0x81,0xf8,0xb8,0x69,0x6c,0x8a,0xc7,0x72,0x46,0xc0,0x7f,0x42,0x14}, +{0xc5,0xf4,0x15,0x8f,0xbd,0xc7,0x5e,0xc4,0x75,0x44,0x6f,0xa7,0x8f,0x11,0xbb,0x80,0x52,0xde,0x75,0xb7,0xae,0xe4,0x88,0xbc,0x82,0xb8,0x0,0x1e,0x98,0xa6,0xa3,0xf4}, +{0x8e,0xf4,0x8f,0x33,0xa9,0xa3,0x63,0x15,0xaa,0x5f,0x56,0x24,0xd5,0xb7,0xf9,0x89,0xb6,0xf1,0xed,0x20,0x7c,0x5a,0xe0,0xfd,0x36,0xca,0xe9,0x5a,0x6,0x42,0x2c,0x36}, +{0xce,0x29,0x35,0x43,0x4e,0xfe,0x98,0x3d,0x53,0x3a,0xf9,0x74,0x73,0x9a,0x4b,0xa7,0xd0,0xf5,0x1f,0x59,0x6f,0x4e,0x81,0x86,0xe,0x9d,0xad,0x81,0xaf,0xd8,0x5a,0x9f}, +{0xa7,0x5,0x6,0x67,0xee,0x34,0x62,0x6a,0x8b,0xb,0x28,0xbe,0x6e,0xb9,0x17,0x27,0x47,0x74,0x7,0x26,0xc6,0x80,0x10,0x3f,0xe0,0xa0,0x7e,0x6f,0xc6,0x7e,0x48,0x7b}, +{0xd,0x55,0xa,0xa5,0x4a,0xf8,0xa4,0xc0,0x91,0xe3,0xe7,0x9f,0x97,0x8e,0xf1,0x9e,0x86,0x76,0x72,0x81,0x50,0x60,0x8d,0xd4,0x7e,0x9e,0x5a,0x41,0xf3,0xe5,0xb0,0x62}, +{0xfc,0x9f,0x1f,0xec,0x40,0x54,0x20,0x7a,0xe3,0xe4,0x1a,0x0,0xce,0xf4,0xc9,0x84,0x4f,0xd7,0x94,0xf5,0x9d,0xfa,0x95,0xd8,0x55,0x2e,0x7e,0x11,0x24,0xc3,0x54,0xa5}, 
+{0x5b,0xdf,0x72,0x28,0xbd,0xfe,0x6e,0x28,0x78,0xf5,0x7f,0xe2,0xf,0xa5,0xc4,0xb2,0x5,0x89,0x7c,0xef,0xee,0x49,0xd3,0x2e,0x44,0x7e,0x93,0x85,0xeb,0x28,0x59,0x7f}, +{0x70,0x5f,0x69,0x37,0xb3,0x24,0x31,0x4a,0x5e,0x86,0x28,0xf1,0x1d,0xd6,0xe4,0x65,0xc7,0x1b,0x77,0x4,0x51,0xb9,0x20,0xe7,0x74,0xfe,0x43,0xe8,0x23,0xd4,0x87,0x8a}, +{0x7d,0x29,0xe8,0xa3,0x92,0x76,0x94,0xf2,0xdd,0xcb,0x7a,0x9,0x9b,0x30,0xd9,0xc1,0x1d,0x1b,0x30,0xfb,0x5b,0xdc,0x1b,0xe0,0xda,0x24,0x49,0x4f,0xf2,0x9c,0x82,0xbf}, +{0xa4,0xe7,0xba,0x31,0xb4,0x70,0xbf,0xff,0xd,0x32,0x44,0x5,0xde,0xf8,0xbc,0x48,0x3b,0xae,0xfc,0x32,0x53,0xbb,0xd3,0x39,0x45,0x9f,0xc3,0xc1,0xe0,0x29,0x8b,0xa0}, +{0xe5,0xc9,0x5,0xfd,0xf7,0xae,0x9,0xf,0x94,0x70,0x34,0x12,0x42,0x90,0xf1,0x34,0xa2,0x71,0xb7,0x1,0xe3,0x44,0xed,0x95,0xe9,0x3b,0x8e,0x36,0x4f,0x2f,0x98,0x4a}, +{0x88,0x40,0x1d,0x63,0xa0,0x6c,0xf6,0x15,0x47,0xc1,0x44,0x4b,0x87,0x52,0xaf,0xff,0x7e,0xbb,0x4a,0xf1,0xe2,0xa,0xc6,0x30,0x46,0x70,0xb6,0xc5,0xcc,0x6e,0x8c,0xe6}, +{0xa4,0xd5,0xa4,0x56,0xbd,0x4f,0xca,0x0,0xda,0x9d,0x84,0x4b,0xc8,0x3e,0x18,0xae,0x73,0x57,0xce,0x45,0x30,0x64,0xd1,0xad,0xe8,0xa6,0xce,0x68,0x14,0x5c,0x25,0x67}, +{0xa3,0xda,0x8c,0xf2,0xcb,0xe,0xe1,0x16,0x33,0xe9,0x6,0x58,0x9a,0x94,0x99,0x9a,0x1f,0x60,0xb2,0x20,0xc2,0x6f,0x84,0x7b,0xd1,0xce,0xac,0x7f,0xa0,0xd1,0x85,0x18}, +{0x32,0x59,0x5b,0xa1,0x8d,0xdd,0x19,0xd3,0x50,0x9a,0x1c,0xc0,0xaa,0xa5,0xb4,0x46,0x9f,0x3d,0x63,0x67,0xe4,0x4,0x6b,0xba,0xf6,0xca,0x19,0xab,0xb,0x56,0xee,0x7e}, +{0x1f,0xb1,0x79,0xea,0xa9,0x28,0x21,0x74,0xe9,0xbd,0xf7,0x35,0x3b,0x36,0x51,0xee,0x1d,0x57,0xac,0x5a,0x75,0x50,0xd3,0x76,0x3a,0x46,0xc2,0xfe,0xa3,0x7d,0x70,0x1}, +{0xf7,0x35,0xc1,0xaf,0x98,0xa4,0xd8,0x42,0x78,0xed,0xec,0x20,0x9e,0x6b,0x67,0x79,0x41,0x83,0x63,0x15,0xea,0x3a,0xdb,0xa8,0xfa,0xc3,0x3b,0x4d,0x32,0x83,0x2c,0x83}, +{0xa7,0x40,0x3b,0x1f,0x1c,0x27,0x47,0xf3,0x59,0x40,0xf0,0x34,0xb7,0x2d,0x76,0x9a,0xe7,0x3e,0x4e,0x6c,0xd2,0x21,0x4f,0xfd,0xb8,0xfd,0x8d,0x39,0xdc,0x57,0x59,0xef}, +{0x8d,0x9b,0xc,0x49,0x2b,0x49,0xeb,0xda,0x5b,0xa2,0xd7,0x49,0x68,0xf3,0x70,0xd,0x7d,0x3b,0xae,0xd0,0x7a,0x8d,0x55,0x84,0xf5,0xa5,0xe9,0xf0,0xe4,0xf8,0x8e,0x65}, +{0xa0,0xb8,0xa2,0xf4,0x36,0x10,0x3b,0x53,0xc,0xa8,0x7,0x9e,0x75,0x3e,0xec,0x5a,0x91,0x68,0x94,0x92,0x56,0xe8,0x88,0x4f,0x5b,0xb0,0x5c,0x55,0xf8,0xba,0xbc,0x4c}, +{0xe3,0xbb,0x3b,0x99,0xf3,0x87,0x94,0x7b,0x75,0xda,0xf4,0xd6,0x72,0x6b,0x1c,0x5d,0x64,0xae,0xac,0x28,0xdc,0x34,0xb3,0x6d,0x6c,0x34,0xa5,0x50,0xb8,0x28,0xdb,0x71}, +{0xf8,0x61,0xe2,0xf2,0x10,0x8d,0x51,0x2a,0xe3,0xdb,0x64,0x33,0x59,0xdd,0x75,0xfc,0x1c,0xac,0xbc,0xf1,0x43,0xce,0x3f,0xa2,0x67,0xbb,0xd1,0x3c,0x2,0xe8,0x43,0xb0}, +{0x33,0xa,0x5b,0xca,0x88,0x29,0xa1,0x75,0x7f,0x34,0x19,0x4d,0xb4,0x16,0x53,0x5c,0x92,0x3b,0x94,0xc3,0xe,0x79,0x4d,0x1e,0x79,0x74,0x75,0xd7,0xb6,0xee,0xaf,0x3f}, +{0xea,0xa8,0xd4,0xf7,0xbe,0x1a,0x39,0x21,0x5c,0xf4,0x7e,0x9,0x4c,0x23,0x27,0x51,0x26,0xa3,0x24,0x53,0xba,0x32,0x3c,0xd2,0x44,0xa3,0x17,0x4a,0x6d,0xa6,0xd5,0xad}, +{0xb5,0x1d,0x3e,0xa6,0xaf,0xf2,0xc9,0x8,0x83,0x59,0x3d,0x98,0x91,0x6b,0x3c,0x56,0x4c,0xf8,0x7c,0xa1,0x72,0x86,0x60,0x4d,0x46,0xe2,0x3e,0xcc,0x8,0x6e,0xc7,0xf6}, +{0x2f,0x98,0x33,0xb3,0xb1,0xbc,0x76,0x5e,0x2b,0xd6,0x66,0xa5,0xef,0xc4,0xe6,0x2a,0x6,0xf4,0xb6,0xe8,0xbe,0xc1,0xd4,0x36,0x74,0xee,0x82,0x15,0xbc,0xef,0x21,0x63}, +{0xfd,0xc1,0x4e,0xd,0xf4,0x53,0xc9,0x69,0xa7,0x7d,0x5a,0xc4,0x6,0x58,0x58,0x26,0x7e,0xc1,0x14,0x16,0x6,0xe0,0xfa,0x16,0x7e,0x90,0xaf,0x3d,0x28,0x63,0x9d,0x3f}, 
+{0xd2,0xc9,0xf2,0xe3,0x0,0x9b,0xd2,0xc,0x5f,0xaa,0xce,0x30,0xb7,0xd4,0xc,0x30,0x74,0x2a,0x51,0x16,0xf2,0xe0,0x32,0x98,0xd,0xeb,0x30,0xd8,0xe3,0xce,0xf8,0x9a}, +{0x4b,0xc5,0x9e,0x7b,0xb5,0xf1,0x79,0x92,0xff,0x51,0xe6,0x6e,0x4,0x86,0x68,0xd3,0x9b,0x23,0x4d,0x57,0xe6,0x96,0x67,0x31,0xcc,0xe6,0xa6,0xf3,0x17,0xa,0x75,0x5}, +{0xb1,0x76,0x81,0xd9,0x13,0x32,0x6c,0xce,0x3c,0x17,0x52,0x84,0xf8,0x5,0xa2,0x62,0xf4,0x2b,0xcb,0xb3,0x78,0x47,0x15,0x47,0xff,0x46,0x54,0x82,0x23,0x93,0x6a,0x48}, +{0x38,0xdf,0x58,0x7,0x4e,0x5e,0x65,0x65,0xf2,0xfc,0x7c,0x89,0xfc,0x86,0x50,0x8e,0x31,0x70,0x2e,0x44,0xd0,0xb,0xca,0x86,0xf0,0x40,0x9,0xa2,0x30,0x78,0x47,0x4e}, +{0x65,0xa0,0xee,0x39,0xd1,0xf7,0x38,0x83,0xf7,0x5e,0xe9,0x37,0xe4,0x2c,0x3a,0xbd,0x21,0x97,0xb2,0x26,0x1,0x13,0xf8,0x6f,0xa3,0x44,0xed,0xd1,0xef,0x9f,0xde,0xe7}, +{0x8b,0xa0,0xdf,0x15,0x76,0x25,0x92,0xd9,0x3c,0x85,0xf7,0xf6,0x12,0xdc,0x42,0xbe,0xd8,0xa7,0xec,0x7c,0xab,0x27,0xb0,0x7e,0x53,0x8d,0x7d,0xda,0xaa,0x3e,0xa8,0xde}, +{0xaa,0x25,0xce,0x93,0xbd,0x2,0x69,0xd8,0x5a,0xf6,0x43,0xfd,0x1a,0x73,0x8,0xf9,0xc0,0x5f,0xef,0xda,0x17,0x4a,0x19,0xa5,0x97,0x4d,0x66,0x33,0x4c,0xfd,0x21,0x6a}, +{0x35,0xb4,0x98,0x31,0xdb,0x41,0x15,0x70,0xea,0x1e,0xf,0xbb,0xed,0xcd,0x54,0x9b,0x9a,0xd0,0x63,0xa1,0x51,0x97,0x40,0x72,0xf6,0x75,0x9d,0xbf,0x91,0x47,0x6f,0xe2}}; + + +static void E8(hashState *state); /*The bijective function E8, in bitslice form*/ +static void F8(hashState *state); /*The compression function F8 */ + +/*The API functions*/ +static HashReturn Init(hashState *state, int hashbitlen); +static HashReturn Update(hashState *state, const BitSequence *data, DataLength databitlen); +static HashReturn Final(hashState *state, BitSequence *hashval); +HashReturn jh_hash(int hashbitlen, const BitSequence *data,DataLength databitlen, BitSequence *hashval); + +/*swapping bit 2i with bit 2i+1 of 64-bit x*/ +#define SWAP1(x) (x) = ((((x) & 0x5555555555555555ULL) << 1) | (((x) & 0xaaaaaaaaaaaaaaaaULL) >> 1)); +/*swapping bits 4i||4i+1 with bits 4i+2||4i+3 of 64-bit x*/ +#define SWAP2(x) (x) = ((((x) & 0x3333333333333333ULL) << 2) | (((x) & 0xccccccccccccccccULL) >> 2)); +/*swapping bits 8i||8i+1||8i+2||8i+3 with bits 8i+4||8i+5||8i+6||8i+7 of 64-bit x*/ +#define SWAP4(x) (x) = ((((x) & 0x0f0f0f0f0f0f0f0fULL) << 4) | (((x) & 0xf0f0f0f0f0f0f0f0ULL) >> 4)); +/*swapping bits 16i||16i+1||......||16i+7 with bits 16i+8||16i+9||......||16i+15 of 64-bit x*/ +#define SWAP8(x) (x) = ((((x) & 0x00ff00ff00ff00ffULL) << 8) | (((x) & 0xff00ff00ff00ff00ULL) >> 8)); +/*swapping bits 32i||32i+1||......||32i+15 with bits 32i+16||32i+17||......||32i+31 of 64-bit x*/ +#define SWAP16(x) (x) = ((((x) & 0x0000ffff0000ffffULL) << 16) | (((x) & 0xffff0000ffff0000ULL) >> 16)); +/*swapping bits 64i||64i+1||......||64i+31 with bits 64i+32||64i+33||......||64i+63 of 64-bit x*/ +#define SWAP32(x) (x) = (((x) << 32) | ((x) >> 32)); + +/*The MDS transform*/ +#define L(m0,m1,m2,m3,m4,m5,m6,m7) \ + (m4) ^= (m1); \ + (m5) ^= (m2); \ + (m6) ^= (m0) ^ (m3); \ + (m7) ^= (m0); \ + (m0) ^= (m5); \ + (m1) ^= (m6); \ + (m2) ^= (m4) ^ (m7); \ + (m3) ^= (m4); + +/*Two Sboxes are computed in parallel, each Sbox implements S0 and S1, selected by a constant bit*/ +/*The reason to compute two Sboxes in parallel is to try to fully utilize the parallel processing power*/ +#define SS(m0,m1,m2,m3,m4,m5,m6,m7,cc0,cc1) \ + m3 = ~(m3); \ + m7 = ~(m7); \ + m0 ^= ((~(m2)) & (cc0)); \ + m4 ^= ((~(m6)) & (cc1)); \ + temp0 = (cc0) ^ ((m0) & (m1));\ + temp1 = (cc1) ^ ((m4) & (m5));\ + m0 ^= ((m2) & (m3)); \ + m4 ^= ((m6) & (m7)); \ + m3 ^= 
((~(m1)) & (m2)); \ + m7 ^= ((~(m5)) & (m6)); \ + m1 ^= ((m0) & (m2)); \ + m5 ^= ((m4) & (m6)); \ + m2 ^= ((m0) & (~(m3))); \ + m6 ^= ((m4) & (~(m7))); \ + m0 ^= ((m1) | (m3)); \ + m4 ^= ((m5) | (m7)); \ + m3 ^= ((m1) & (m2)); \ + m7 ^= ((m5) & (m6)); \ + m1 ^= (temp0 & (m0)); \ + m5 ^= (temp1 & (m4)); \ + m2 ^= temp0; \ + m6 ^= temp1; + +/*The bijective function E8, in bitslice form*/ +static void E8(hashState *state) +{ + uint64 i,roundnumber,temp0,temp1; + + for (roundnumber = 0; roundnumber < 42; roundnumber = roundnumber+7) { + /*round 7*roundnumber+0: Sbox, MDS and Swapping layers*/ + for (i = 0; i < 2; i++) { + SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64*)E8_bitslice_roundconstant[roundnumber+0])[i],((uint64*)E8_bitslice_roundconstant[roundnumber+0])[i+2] ); + L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]); + SWAP1(state->x[1][i]); SWAP1(state->x[3][i]); SWAP1(state->x[5][i]); SWAP1(state->x[7][i]); + } + + /*round 7*roundnumber+1: Sbox, MDS and Swapping layers*/ + for (i = 0; i < 2; i++) { + SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64*)E8_bitslice_roundconstant[roundnumber+1])[i],((uint64*)E8_bitslice_roundconstant[roundnumber+1])[i+2] ); + L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]); + SWAP2(state->x[1][i]); SWAP2(state->x[3][i]); SWAP2(state->x[5][i]); SWAP2(state->x[7][i]); + } + + /*round 7*roundnumber+2: Sbox, MDS and Swapping layers*/ + for (i = 0; i < 2; i++) { + SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64*)E8_bitslice_roundconstant[roundnumber+2])[i],((uint64*)E8_bitslice_roundconstant[roundnumber+2])[i+2] ); + L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]); + SWAP4(state->x[1][i]); SWAP4(state->x[3][i]); SWAP4(state->x[5][i]); SWAP4(state->x[7][i]); + } + + /*round 7*roundnumber+3: Sbox, MDS and Swapping layers*/ + for (i = 0; i < 2; i++) { + SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64*)E8_bitslice_roundconstant[roundnumber+3])[i],((uint64*)E8_bitslice_roundconstant[roundnumber+3])[i+2] ); + L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]); + SWAP8(state->x[1][i]); SWAP8(state->x[3][i]); SWAP8(state->x[5][i]); SWAP8(state->x[7][i]); + } + + /*round 7*roundnumber+4: Sbox, MDS and Swapping layers*/ + for (i = 0; i < 2; i++) { + SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64*)E8_bitslice_roundconstant[roundnumber+4])[i],((uint64*)E8_bitslice_roundconstant[roundnumber+4])[i+2] ); + L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]); + SWAP16(state->x[1][i]); SWAP16(state->x[3][i]); SWAP16(state->x[5][i]); SWAP16(state->x[7][i]); + } + + /*round 7*roundnumber+5: Sbox, MDS and Swapping layers*/ + for (i = 0; i < 2; i++) { + 
SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64*)E8_bitslice_roundconstant[roundnumber+5])[i],((uint64*)E8_bitslice_roundconstant[roundnumber+5])[i+2] ); + L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]); + SWAP32(state->x[1][i]); SWAP32(state->x[3][i]); SWAP32(state->x[5][i]); SWAP32(state->x[7][i]); + } + + /*round 7*roundnumber+6: Sbox and MDS layers*/ + for (i = 0; i < 2; i++) { + SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64*)E8_bitslice_roundconstant[roundnumber+6])[i],((uint64*)E8_bitslice_roundconstant[roundnumber+6])[i+2] ); + L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]); + } + /*round 7*roundnumber+6: swapping layer*/ + for (i = 1; i < 8; i = i+2) { + temp0 = state->x[i][0]; state->x[i][0] = state->x[i][1]; state->x[i][1] = temp0; + } + } + +} + +/*The compression function F8 */ +static void F8(hashState *state) +{ + uint64 i; + + /*xor the 512-bit message with the first half of the 1024-bit hash state*/ + for (i = 0; i < 8; i++) state->x[i >> 1][i & 1] ^= ((uint64*)state->buffer)[i]; + + /*the bijective function E8 */ + E8(state); + + /*xor the 512-bit message with the second half of the 1024-bit hash state*/ + for (i = 0; i < 8; i++) state->x[(8+i) >> 1][(8+i) & 1] ^= ((uint64*)state->buffer)[i]; +} + +/*before hashing a message, initialize the hash state as H0 */ +static HashReturn Init(hashState *state, int hashbitlen) +{ + state->databitlen = 0; + state->datasize_in_buffer = 0; + + /*initialize the initial hash value of JH*/ + state->hashbitlen = hashbitlen; + + /*load the initial hash value into state*/ + switch (hashbitlen) + { + case 224: memcpy(state->x,JH224_H0,128); break; + case 256: memcpy(state->x,JH256_H0,128); break; + case 384: memcpy(state->x,JH384_H0,128); break; + case 512: memcpy(state->x,JH512_H0,128); break; + } + + return(SUCCESS); +} + + +/*hash each 512-bit message block, except the last partial block*/ +static HashReturn Update(hashState *state, const BitSequence *data, DataLength databitlen) +{ + DataLength index; /*the starting address of the data to be compressed*/ + + state->databitlen += databitlen; + index = 0; + + /*if there is remaining data in the buffer, fill it to a full message block first*/ + /*we assume that the size of the data in the buffer is a multiple of 8 bits if it is not at the end of a message*/ + + /*There is data in the buffer, but the incoming data is insufficient for a full block*/ + if ( (state->datasize_in_buffer > 0 ) && (( state->datasize_in_buffer + databitlen) < 512) ) { + if ( (databitlen & 7) == 0 ) { + memcpy(state->buffer + (state->datasize_in_buffer >> 3), data, 64-(state->datasize_in_buffer >> 3)) ; + } + else memcpy(state->buffer + (state->datasize_in_buffer >> 3), data, 64-(state->datasize_in_buffer >> 3)+1) ; + state->datasize_in_buffer += databitlen; + databitlen = 0; + } + + /*There is data in the buffer, and the incoming data is sufficient for a full block*/ + if ( (state->datasize_in_buffer > 0 ) && (( state->datasize_in_buffer + databitlen) >= 512) ) { + memcpy( state->buffer + (state->datasize_in_buffer >> 3), data, 64-(state->datasize_in_buffer >> 3) ) ; + index = 64-(state->datasize_in_buffer >> 3); + databitlen = databitlen - (512 - state->datasize_in_buffer); + F8(state); + 
state->datasize_in_buffer = 0; + } + + /*hash the remaining full message blocks*/ + for ( ; databitlen >= 512; index = index+64, databitlen = databitlen - 512) { + memcpy(state->buffer, data+index, 64); + F8(state); + } + + /*store the partial block into buffer, assume that -- if part of the last byte is not part of the message, then that part consists of 0 bits*/ + if ( databitlen > 0) { + if ((databitlen & 7) == 0) + memcpy(state->buffer, data+index, (databitlen & 0x1ff) >> 3); + else + memcpy(state->buffer, data+index, ((databitlen & 0x1ff) >> 3)+1); + state->datasize_in_buffer = databitlen; + } + + return(SUCCESS); +} + +/*pad the message, process the padded block(s), truncate the hash value H to obtain the message digest*/ +static HashReturn Final(hashState *state, BitSequence *hashval) +{ + unsigned int i; + + if ( (state->databitlen & 0x1ff) == 0 ) { + /*pad the message when databitlen is multiple of 512 bits, then process the padded block*/ + memset(state->buffer, 0, 64); + state->buffer[0] = 0x80; + state->buffer[63] = state->databitlen & 0xff; + state->buffer[62] = (state->databitlen >> 8) & 0xff; + state->buffer[61] = (state->databitlen >> 16) & 0xff; + state->buffer[60] = (state->databitlen >> 24) & 0xff; + state->buffer[59] = (state->databitlen >> 32) & 0xff; + state->buffer[58] = (state->databitlen >> 40) & 0xff; + state->buffer[57] = (state->databitlen >> 48) & 0xff; + state->buffer[56] = (state->databitlen >> 56) & 0xff; + F8(state); + } + else { + /*set the rest of the bytes in the buffer to 0*/ + if ( (state->datasize_in_buffer & 7) == 0) + for (i = (state->databitlen & 0x1ff) >> 3; i < 64; i++) state->buffer[i] = 0; + else + for (i = ((state->databitlen & 0x1ff) >> 3)+1; i < 64; i++) state->buffer[i] = 0; + + /*pad and process the partial block when databitlen is not multiple of 512 bits, then hash the padded blocks*/ + state->buffer[((state->databitlen & 0x1ff) >> 3)] |= 1 << (7- (state->databitlen & 7)); + + F8(state); + memset(state->buffer, 0, 64); + state->buffer[63] = state->databitlen & 0xff; + state->buffer[62] = (state->databitlen >> 8) & 0xff; + state->buffer[61] = (state->databitlen >> 16) & 0xff; + state->buffer[60] = (state->databitlen >> 24) & 0xff; + state->buffer[59] = (state->databitlen >> 32) & 0xff; + state->buffer[58] = (state->databitlen >> 40) & 0xff; + state->buffer[57] = (state->databitlen >> 48) & 0xff; + state->buffer[56] = (state->databitlen >> 56) & 0xff; + F8(state); + } + + /*truncating the final hash value to generate the message digest*/ + switch(state->hashbitlen) { + case 224: memcpy(hashval,(unsigned char*)state->x+64+36,28); break; + case 256: memcpy(hashval,(unsigned char*)state->x+64+32,32); break; + case 384: memcpy(hashval,(unsigned char*)state->x+64+16,48); break; + case 512: memcpy(hashval,(unsigned char*)state->x+64,64); break; + } + + return(SUCCESS); +} + +/* hash a message, + three inputs: message digest size in bits (hashbitlen); message (data); message length in bits (databitlen) + one output: message digest (hashval) +*/ +HashReturn jh_hash(int hashbitlen, const BitSequence *data,DataLength databitlen, BitSequence *hashval) +{ + hashState state; + + if ( hashbitlen == 224 || hashbitlen == 256 || hashbitlen == 384 || hashbitlen == 512 ) { + Init(&state, hashbitlen); + Update(&state, data, databitlen); + Final(&state, hashval); + return SUCCESS; + } + else + return(BAD_HASHLEN); +} diff --git a/xmrstak/backend/cpu/crypto/c_jh.h b/xmrstak/backend/cpu/crypto/c_jh.h new file mode 100644 index 0000000..d10d40f --- /dev/null +++ 
b/xmrstak/backend/cpu/crypto/c_jh.h @@ -0,0 +1,19 @@ +/*This program gives the 64-bit optimized bitslice implementation of JH using ANSI C + + -------------------------------- + Performance + + Microprocessor: Intel CORE 2 processor (Core 2 Duo Mobile T6600 2.2GHz) + Operating System: 64-bit Ubuntu 10.04 (Linux kernel 2.6.32-22-generic) + Speed for long message: + 1) 45.8 cycles/byte compiler: Intel C++ Compiler 11.1 compilation option: icc -O2 + 2) 56.8 cycles/byte compiler: gcc 4.4.3 compilation option: gcc -O3 + + -------------------------------- + Last Modified: January 16, 2011 +*/ +#pragma once + +#include "hash.h" + +HashReturn jh_hash(int hashbitlen, const BitSequence *data, DataLength databitlen, BitSequence *hashval); diff --git a/xmrstak/backend/cpu/crypto/c_keccak.c b/xmrstak/backend/cpu/crypto/c_keccak.c new file mode 100644 index 0000000..eadb85b --- /dev/null +++ b/xmrstak/backend/cpu/crypto/c_keccak.c @@ -0,0 +1,176 @@ +// keccak.c +// 19-Nov-11 Markku-Juhani O. Saarinen <mjos@iki.fi> +// A baseline Keccak (3rd round) implementation. + +#include <stdint.h> +#include <memory.h> + +#define HASH_DATA_AREA 136 +#define KECCAK_ROUNDS 24 + +#ifndef ROTL64 +#define ROTL64(x, y) (((x) << (y)) | ((x) >> (64 - (y)))) +#endif + +const uint64_t keccakf_rndc[24] = +{ + 0x0000000000000001, 0x0000000000008082, 0x800000000000808a, + 0x8000000080008000, 0x000000000000808b, 0x0000000080000001, + 0x8000000080008081, 0x8000000000008009, 0x000000000000008a, + 0x0000000000000088, 0x0000000080008009, 0x000000008000000a, + 0x000000008000808b, 0x800000000000008b, 0x8000000000008089, + 0x8000000000008003, 0x8000000000008002, 0x8000000000000080, + 0x000000000000800a, 0x800000008000000a, 0x8000000080008081, + 0x8000000000008080, 0x0000000080000001, 0x8000000080008008 +}; + +// update the state with given number of rounds + +void keccakf(uint64_t st[25], int rounds) +{ + int i, j, round; + uint64_t t, bc[5]; + + for (round = 0; round < rounds; ++round) { + + // Theta + bc[0] = st[0] ^ st[5] ^ st[10] ^ st[15] ^ st[20]; + bc[1] = st[1] ^ st[6] ^ st[11] ^ st[16] ^ st[21]; + bc[2] = st[2] ^ st[7] ^ st[12] ^ st[17] ^ st[22]; + bc[3] = st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23]; + bc[4] = st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24]; + + for (i = 0; i < 5; ++i) { + t = bc[(i + 4) % 5] ^ ROTL64(bc[(i + 1) % 5], 1); + st[i ] ^= t; + st[i + 5] ^= t; + st[i + 10] ^= t; + st[i + 15] ^= t; + st[i + 20] ^= t; + } + + // Rho Pi + t = st[1]; + st[ 1] = ROTL64(st[ 6], 44); + st[ 6] = ROTL64(st[ 9], 20); + st[ 9] = ROTL64(st[22], 61); + st[22] = ROTL64(st[14], 39); + st[14] = ROTL64(st[20], 18); + st[20] = ROTL64(st[ 2], 62); + st[ 2] = ROTL64(st[12], 43); + st[12] = ROTL64(st[13], 25); + st[13] = ROTL64(st[19], 8); + st[19] = ROTL64(st[23], 56); + st[23] = ROTL64(st[15], 41); + st[15] = ROTL64(st[ 4], 27); + st[ 4] = ROTL64(st[24], 14); + st[24] = ROTL64(st[21], 2); + st[21] = ROTL64(st[ 8], 55); + st[ 8] = ROTL64(st[16], 45); + st[16] = ROTL64(st[ 5], 36); + st[ 5] = ROTL64(st[ 3], 28); + st[ 3] = ROTL64(st[18], 21); + st[18] = ROTL64(st[17], 15); + st[17] = ROTL64(st[11], 10); + st[11] = ROTL64(st[ 7], 6); + st[ 7] = ROTL64(st[10], 3); + st[10] = ROTL64(t, 1); + + // Chi + // unrolled loop, where only last iteration is different + j = 0; + bc[0] = st[j ]; + bc[1] = st[j + 1]; + + st[j ] ^= (~st[j + 1]) & st[j + 2]; + st[j + 1] ^= (~st[j + 2]) & st[j + 3]; + st[j + 2] ^= (~st[j + 3]) & st[j + 4]; + st[j + 3] ^= (~st[j + 4]) & bc[0]; + st[j + 4] ^= (~bc[0]) & bc[1]; + + j = 5; + bc[0] = st[j ]; + bc[1] = st[j + 1]; + + st[j 
] ^= (~st[j + 1]) & st[j + 2]; + st[j + 1] ^= (~st[j + 2]) & st[j + 3]; + st[j + 2] ^= (~st[j + 3]) & st[j + 4]; + st[j + 3] ^= (~st[j + 4]) & bc[0]; + st[j + 4] ^= (~bc[0]) & bc[1]; + + j = 10; + bc[0] = st[j ]; + bc[1] = st[j + 1]; + + st[j ] ^= (~st[j + 1]) & st[j + 2]; + st[j + 1] ^= (~st[j + 2]) & st[j + 3]; + st[j + 2] ^= (~st[j + 3]) & st[j + 4]; + st[j + 3] ^= (~st[j + 4]) & bc[0]; + st[j + 4] ^= (~bc[0]) & bc[1]; + + j = 15; + bc[0] = st[j ]; + bc[1] = st[j + 1]; + + st[j ] ^= (~st[j + 1]) & st[j + 2]; + st[j + 1] ^= (~st[j + 2]) & st[j + 3]; + st[j + 2] ^= (~st[j + 3]) & st[j + 4]; + st[j + 3] ^= (~st[j + 4]) & bc[0]; + st[j + 4] ^= (~bc[0]) & bc[1]; + + j = 20; + bc[0] = st[j ]; + bc[1] = st[j + 1]; + bc[2] = st[j + 2]; + bc[3] = st[j + 3]; + bc[4] = st[j + 4]; + + st[j ] ^= (~bc[1]) & bc[2]; + st[j + 1] ^= (~bc[2]) & bc[3]; + st[j + 2] ^= (~bc[3]) & bc[4]; + st[j + 3] ^= (~bc[4]) & bc[0]; + st[j + 4] ^= (~bc[0]) & bc[1]; + + // Iota + st[0] ^= keccakf_rndc[round]; + } +} + +// compute a keccak hash (md) of given byte length from "in" +typedef uint64_t state_t[25]; + +void keccak(const uint8_t *in, int inlen, uint8_t *md, int mdlen) +{ + state_t st; + uint8_t temp[144]; + int i, rsiz, rsizw; + + rsiz = sizeof(state_t) == mdlen ? HASH_DATA_AREA : 200 - 2 * mdlen; + rsizw = rsiz / 8; + + memset(st, 0, sizeof(st)); + + for ( ; inlen >= rsiz; inlen -= rsiz, in += rsiz) { + for (i = 0; i < rsizw; i++) + st[i] ^= ((uint64_t *) in)[i]; + keccakf(st, KECCAK_ROUNDS); + } + + // last block and padding + memcpy(temp, in, inlen); + temp[inlen++] = 1; + memset(temp + inlen, 0, rsiz - inlen); + temp[rsiz - 1] |= 0x80; + + for (i = 0; i < rsizw; i++) + st[i] ^= ((uint64_t *) temp)[i]; + + keccakf(st, KECCAK_ROUNDS); + + memcpy(md, st, mdlen); +} + +void keccak1600(const uint8_t *in, int inlen, uint8_t *md) +{ + keccak(in, inlen, md, sizeof(state_t)); +}
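For orientation, a minimal usage sketch of the Keccak entry points above (not part of the commit): it assumes the snippet is compiled and linked together with c_keccak.c, and the input string and the 32-byte digest length are purely illustrative.

#include <stdint.h>
#include <stdio.h>
#include "c_keccak.h"   /* declares keccak(), keccakf(), keccak1600() */

int main(void)
{
    const uint8_t msg[] = "example input";   /* hypothetical test message */
    uint8_t digest[32];                      /* truncated 256-bit digest */
    uint8_t state[200];                      /* full 1600-bit sponge state */

    /* one-shot digest: for mdlen = 32 the rate is 200 - 2*32 = 136 bytes,
       with the original Keccak padding (0x01 ... 0x80) applied by keccak() */
    keccak(msg, (int)sizeof(msg) - 1, digest, (int)sizeof(digest));

    /* full-state variant: keccak1600() calls keccak() with mdlen = sizeof(state_t),
       so all 200 bytes of the final sponge state are copied to the caller */
    keccak1600(msg, (int)sizeof(msg) - 1, state);

    for (int i = 0; i < (int)sizeof(digest); i++)
        printf("%02x", digest[i]);
    printf("\n");
    return 0;
}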
\ No newline at end of file diff --git a/xmrstak/backend/cpu/crypto/c_keccak.h b/xmrstak/backend/cpu/crypto/c_keccak.h new file mode 100644 index 0000000..4f7f857 --- /dev/null +++ b/xmrstak/backend/cpu/crypto/c_keccak.h @@ -0,0 +1,26 @@ +// keccak.h +// 19-Nov-11 Markku-Juhani O. Saarinen <mjos@iki.fi> + +#ifndef KECCAK_H +#define KECCAK_H + +#include <stdint.h> +#include <string.h> + +#ifndef KECCAK_ROUNDS +#define KECCAK_ROUNDS 24 +#endif + +#ifndef ROTL64 +#define ROTL64(x, y) (((x) << (y)) | ((x) >> (64 - (y)))) +#endif + +// compute a keccak hash (md) of given byte length from "in" +void keccak(const uint8_t *in, int inlen, uint8_t *md, int mdlen); + +// update the state +void keccakf(uint64_t st[25], int norounds); + +void keccak1600(const uint8_t *in, int inlen, uint8_t *md); + +#endif diff --git a/xmrstak/backend/cpu/crypto/c_skein.c b/xmrstak/backend/cpu/crypto/c_skein.c new file mode 100644 index 0000000..2453713 --- /dev/null +++ b/xmrstak/backend/cpu/crypto/c_skein.c @@ -0,0 +1,2036 @@ +/*********************************************************************** +** +** Implementation of the Skein hash function. +** +** Source code author: Doug Whiting, 2008. +** +** This algorithm and source code is released to the public domain. +** +************************************************************************/ + +#define SKEIN_PORT_CODE /* instantiate any code in skein_port.h */ + +#include <stddef.h> /* get size_t definition */ +#include <string.h> /* get the memcpy/memset functions */ +#include "c_skein.h" /* get the Skein API definitions */ + +#define DISABLE_UNUSED 0 + +#ifndef SKEIN_256_NIST_MAX_HASHBITS +#define SKEIN_256_NIST_MAX_HASHBITS (0) +#endif + +#ifndef SKEIN_512_NIST_MAX_HASHBITS +#define SKEIN_512_NIST_MAX_HASHBITS (512) +#endif + +#define SKEIN_MODIFIER_WORDS ( 2) /* number of modifier (tweak) words */ + +#define SKEIN_256_STATE_WORDS ( 4) +#define SKEIN_512_STATE_WORDS ( 8) +#define SKEIN1024_STATE_WORDS (16) +#define SKEIN_MAX_STATE_WORDS (16) + +#define SKEIN_256_STATE_BYTES ( 8*SKEIN_256_STATE_WORDS) +#define SKEIN_512_STATE_BYTES ( 8*SKEIN_512_STATE_WORDS) +#define SKEIN1024_STATE_BYTES ( 8*SKEIN1024_STATE_WORDS) + +#define SKEIN_256_STATE_BITS (64*SKEIN_256_STATE_WORDS) +#define SKEIN_512_STATE_BITS (64*SKEIN_512_STATE_WORDS) +#define SKEIN1024_STATE_BITS (64*SKEIN1024_STATE_WORDS) + +#define SKEIN_256_BLOCK_BYTES ( 8*SKEIN_256_STATE_WORDS) +#define SKEIN_512_BLOCK_BYTES ( 8*SKEIN_512_STATE_WORDS) +#define SKEIN1024_BLOCK_BYTES ( 8*SKEIN1024_STATE_WORDS) + +#define SKEIN_RND_SPECIAL (1000u) +#define SKEIN_RND_KEY_INITIAL (SKEIN_RND_SPECIAL+0u) +#define SKEIN_RND_KEY_INJECT (SKEIN_RND_SPECIAL+1u) +#define SKEIN_RND_FEED_FWD (SKEIN_RND_SPECIAL+2u) + +typedef struct +{ + size_t hashBitLen; /* size of hash result, in bits */ + size_t bCnt; /* current byte count in buffer b[] */ + u64b_t T[SKEIN_MODIFIER_WORDS]; /* tweak words: T[0]=byte cnt, T[1]=flags */ +} Skein_Ctxt_Hdr_t; + +typedef struct /* 256-bit Skein hash context structure */ +{ + Skein_Ctxt_Hdr_t h; /* common header context variables */ + u64b_t X[SKEIN_256_STATE_WORDS]; /* chaining variables */ + u08b_t b[SKEIN_256_BLOCK_BYTES]; /* partial block buffer (8-byte aligned) */ +} Skein_256_Ctxt_t; + +typedef struct /* 512-bit Skein hash context structure */ +{ + Skein_Ctxt_Hdr_t h; /* common header context variables */ + u64b_t X[SKEIN_512_STATE_WORDS]; /* chaining variables */ + u08b_t b[SKEIN_512_BLOCK_BYTES]; /* partial block buffer (8-byte aligned) */ +} Skein_512_Ctxt_t; + +typedef struct /* 1024-bit 
Skein hash context structure */ +{ + Skein_Ctxt_Hdr_t h; /* common header context variables */ + u64b_t X[SKEIN1024_STATE_WORDS]; /* chaining variables */ + u08b_t b[SKEIN1024_BLOCK_BYTES]; /* partial block buffer (8-byte aligned) */ +} Skein1024_Ctxt_t; + +/* Skein APIs for (incremental) "straight hashing" */ +#if SKEIN_256_NIST_MAX_HASH_BITS +static int Skein_256_Init (Skein_256_Ctxt_t *ctx, size_t hashBitLen); +#endif +static int Skein_512_Init (Skein_512_Ctxt_t *ctx, size_t hashBitLen); +static int Skein1024_Init (Skein1024_Ctxt_t *ctx, size_t hashBitLen); + +static int Skein_256_Update(Skein_256_Ctxt_t *ctx, const u08b_t *msg, size_t msgByteCnt); +static int Skein_512_Update(Skein_512_Ctxt_t *ctx, const u08b_t *msg, size_t msgByteCnt); +static int Skein1024_Update(Skein1024_Ctxt_t *ctx, const u08b_t *msg, size_t msgByteCnt); + +static int Skein_256_Final (Skein_256_Ctxt_t *ctx, u08b_t * hashVal); +static int Skein_512_Final (Skein_512_Ctxt_t *ctx, u08b_t * hashVal); +static int Skein1024_Final (Skein1024_Ctxt_t *ctx, u08b_t * hashVal); + +/* +** Skein APIs for "extended" initialization: MAC keys, tree hashing. +** After an InitExt() call, just use Update/Final calls as with Init(). +** +** Notes: Same parameters as _Init() calls, plus treeInfo/key/keyBytes. +** When keyBytes == 0 and treeInfo == SKEIN_SEQUENTIAL, +** the results of InitExt() are identical to calling Init(). +** The function Init() may be called once to "precompute" the IV for +** a given hashBitLen value, then by saving a copy of the context +** the IV computation may be avoided in later calls. +** Similarly, the function InitExt() may be called once per MAC key +** to precompute the MAC IV, then a copy of the context saved and +** reused for each new MAC computation. +**/ +#if 0 +static int Skein_256_InitExt(Skein_256_Ctxt_t *ctx, size_t hashBitLen, u64b_t treeInfo, const u08b_t *key, size_t keyBytes); +static int Skein_512_InitExt(Skein_512_Ctxt_t *ctx, size_t hashBitLen, u64b_t treeInfo, const u08b_t *key, size_t keyBytes); +static int Skein1024_InitExt(Skein1024_Ctxt_t *ctx, size_t hashBitLen, u64b_t treeInfo, const u08b_t *key, size_t keyBytes); +#endif + +/* +** Skein APIs for MAC and tree hash: +** Final_Pad: pad, do final block, but no OUTPUT type +** Output: do just the output stage +*/ +#if 0 +static int Skein_256_Final_Pad(Skein_256_Ctxt_t *ctx, u08b_t * hashVal); +static int Skein_512_Final_Pad(Skein_512_Ctxt_t *ctx, u08b_t * hashVal); +static int Skein1024_Final_Pad(Skein1024_Ctxt_t *ctx, u08b_t * hashVal); +#endif + +#ifndef SKEIN_TREE_HASH +#define SKEIN_TREE_HASH (1) +#endif +#if 0 +#if SKEIN_TREE_HASH +static int Skein_256_Output (Skein_256_Ctxt_t *ctx, u08b_t * hashVal); +static int Skein_512_Output (Skein_512_Ctxt_t *ctx, u08b_t * hashVal); +static int Skein1024_Output (Skein1024_Ctxt_t *ctx, u08b_t * hashVal); +#endif +#endif + +/***************************************************************** +** "Internal" Skein definitions +** -- not needed for sequential hashing API, but will be +** helpful for other uses of Skein (e.g., tree hash mode). +** -- included here so that they can be shared between +** reference and optimized code. 
+******************************************************************/ + +/* tweak word T[1]: bit field starting positions */ +#define SKEIN_T1_BIT(BIT) ((BIT) - 64) /* offset 64 because it's the second word */ + +#define SKEIN_T1_POS_TREE_LVL SKEIN_T1_BIT(112) /* bits 112..118: level in hash tree */ +#define SKEIN_T1_POS_BIT_PAD SKEIN_T1_BIT(119) /* bit 119 : partial final input byte */ +#define SKEIN_T1_POS_BLK_TYPE SKEIN_T1_BIT(120) /* bits 120..125: type field */ +#define SKEIN_T1_POS_FIRST SKEIN_T1_BIT(126) /* bits 126 : first block flag */ +#define SKEIN_T1_POS_FINAL SKEIN_T1_BIT(127) /* bit 127 : final block flag */ + +/* tweak word T[1]: flag bit definition(s) */ +#define SKEIN_T1_FLAG_FIRST (((u64b_t) 1 ) << SKEIN_T1_POS_FIRST) +#define SKEIN_T1_FLAG_FINAL (((u64b_t) 1 ) << SKEIN_T1_POS_FINAL) +#define SKEIN_T1_FLAG_BIT_PAD (((u64b_t) 1 ) << SKEIN_T1_POS_BIT_PAD) + +/* tweak word T[1]: tree level bit field mask */ +#define SKEIN_T1_TREE_LVL_MASK (((u64b_t)0x7F) << SKEIN_T1_POS_TREE_LVL) +#define SKEIN_T1_TREE_LEVEL(n) (((u64b_t) (n)) << SKEIN_T1_POS_TREE_LVL) + +/* tweak word T[1]: block type field */ +#define SKEIN_BLK_TYPE_KEY ( 0) /* key, for MAC and KDF */ +#define SKEIN_BLK_TYPE_CFG ( 4) /* configuration block */ +#define SKEIN_BLK_TYPE_PERS ( 8) /* personalization string */ +#define SKEIN_BLK_TYPE_PK (12) /* public key (for digital signature hashing) */ +#define SKEIN_BLK_TYPE_KDF (16) /* key identifier for KDF */ +#define SKEIN_BLK_TYPE_NONCE (20) /* nonce for PRNG */ +#define SKEIN_BLK_TYPE_MSG (48) /* message processing */ +#define SKEIN_BLK_TYPE_OUT (63) /* output stage */ +#define SKEIN_BLK_TYPE_MASK (63) /* bit field mask */ + +#define SKEIN_T1_BLK_TYPE(T) (((u64b_t) (SKEIN_BLK_TYPE_##T)) << SKEIN_T1_POS_BLK_TYPE) +#define SKEIN_T1_BLK_TYPE_KEY SKEIN_T1_BLK_TYPE(KEY) /* key, for MAC and KDF */ +#define SKEIN_T1_BLK_TYPE_CFG SKEIN_T1_BLK_TYPE(CFG) /* configuration block */ +#define SKEIN_T1_BLK_TYPE_PERS SKEIN_T1_BLK_TYPE(PERS) /* personalization string */ +#define SKEIN_T1_BLK_TYPE_PK SKEIN_T1_BLK_TYPE(PK) /* public key (for digital signature hashing) */ +#define SKEIN_T1_BLK_TYPE_KDF SKEIN_T1_BLK_TYPE(KDF) /* key identifier for KDF */ +#define SKEIN_T1_BLK_TYPE_NONCE SKEIN_T1_BLK_TYPE(NONCE)/* nonce for PRNG */ +#define SKEIN_T1_BLK_TYPE_MSG SKEIN_T1_BLK_TYPE(MSG) /* message processing */ +#define SKEIN_T1_BLK_TYPE_OUT SKEIN_T1_BLK_TYPE(OUT) /* output stage */ +#define SKEIN_T1_BLK_TYPE_MASK SKEIN_T1_BLK_TYPE(MASK) /* field bit mask */ + +#define SKEIN_T1_BLK_TYPE_CFG_FINAL (SKEIN_T1_BLK_TYPE_CFG | SKEIN_T1_FLAG_FINAL) +#define SKEIN_T1_BLK_TYPE_OUT_FINAL (SKEIN_T1_BLK_TYPE_OUT | SKEIN_T1_FLAG_FINAL) + +#define SKEIN_VERSION (1) + +#ifndef SKEIN_ID_STRING_LE /* allow compile-time personalization */ +#define SKEIN_ID_STRING_LE (0x33414853) /* "SHA3" (little-endian)*/ +#endif + +#define SKEIN_MK_64(hi32,lo32) ((lo32) + (((u64b_t) (hi32)) << 32)) +#define SKEIN_SCHEMA_VER SKEIN_MK_64(SKEIN_VERSION,SKEIN_ID_STRING_LE) +#define SKEIN_KS_PARITY SKEIN_MK_64(0x1BD11BDA,0xA9FC1A22) + +#define SKEIN_CFG_STR_LEN (4*8) + +/* bit field definitions in config block treeInfo word */ +#define SKEIN_CFG_TREE_LEAF_SIZE_POS ( 0) +#define SKEIN_CFG_TREE_NODE_SIZE_POS ( 8) +#define SKEIN_CFG_TREE_MAX_LEVEL_POS (16) + +#define SKEIN_CFG_TREE_LEAF_SIZE_MSK (((u64b_t) 0xFF) << SKEIN_CFG_TREE_LEAF_SIZE_POS) +#define SKEIN_CFG_TREE_NODE_SIZE_MSK (((u64b_t) 0xFF) << SKEIN_CFG_TREE_NODE_SIZE_POS) +#define SKEIN_CFG_TREE_MAX_LEVEL_MSK (((u64b_t) 0xFF) << SKEIN_CFG_TREE_MAX_LEVEL_POS) + +#define 
SKEIN_CFG_TREE_INFO(leaf,node,maxLvl) \ + ( (((u64b_t)(leaf )) << SKEIN_CFG_TREE_LEAF_SIZE_POS) | \ + (((u64b_t)(node )) << SKEIN_CFG_TREE_NODE_SIZE_POS) | \ + (((u64b_t)(maxLvl)) << SKEIN_CFG_TREE_MAX_LEVEL_POS) ) + +#define SKEIN_CFG_TREE_INFO_SEQUENTIAL SKEIN_CFG_TREE_INFO(0,0,0) /* use as treeInfo in InitExt() call for sequential processing */ + +/* +** Skein macros for getting/setting tweak words, etc. +** These are useful for partial input bytes, hash tree init/update, etc. +**/ +#define Skein_Get_Tweak(ctxPtr,TWK_NUM) ((ctxPtr)->h.T[TWK_NUM]) +#define Skein_Set_Tweak(ctxPtr,TWK_NUM,tVal) {(ctxPtr)->h.T[TWK_NUM] = (tVal);} + +#define Skein_Get_T0(ctxPtr) Skein_Get_Tweak(ctxPtr,0) +#define Skein_Get_T1(ctxPtr) Skein_Get_Tweak(ctxPtr,1) +#define Skein_Set_T0(ctxPtr,T0) Skein_Set_Tweak(ctxPtr,0,T0) +#define Skein_Set_T1(ctxPtr,T1) Skein_Set_Tweak(ctxPtr,1,T1) + +/* set both tweak words at once */ +#define Skein_Set_T0_T1(ctxPtr,T0,T1) \ +{ \ + Skein_Set_T0(ctxPtr,(T0)); \ + Skein_Set_T1(ctxPtr,(T1)); \ +} + +#define Skein_Set_Type(ctxPtr,BLK_TYPE) \ + Skein_Set_T1(ctxPtr,SKEIN_T1_BLK_TYPE_##BLK_TYPE) + +/* set up for starting with a new type: h.T[0]=0; h.T[1] = NEW_TYPE; h.bCnt=0; */ +#define Skein_Start_New_Type(ctxPtr,BLK_TYPE) \ +{ Skein_Set_T0_T1(ctxPtr,0,SKEIN_T1_FLAG_FIRST | SKEIN_T1_BLK_TYPE_##BLK_TYPE); (ctxPtr)->h.bCnt=0; } + +#define Skein_Clear_First_Flag(hdr) { (hdr).T[1] &= ~SKEIN_T1_FLAG_FIRST; } +#define Skein_Set_Bit_Pad_Flag(hdr) { (hdr).T[1] |= SKEIN_T1_FLAG_BIT_PAD; } + +#define Skein_Set_Tree_Level(hdr,height) { (hdr).T[1] |= SKEIN_T1_TREE_LEVEL(height);} + +/***************************************************************** +** "Internal" Skein definitions for debugging and error checking +******************************************************************/ +#define Skein_Show_Block(bits,ctx,X,blkPtr,wPtr,ksEvenPtr,ksOddPtr) +#define Skein_Show_Round(bits,ctx,r,X) +#define Skein_Show_R_Ptr(bits,ctx,r,X_ptr) +#define Skein_Show_Final(bits,ctx,cnt,outPtr) +#define Skein_Show_Key(bits,ctx,key,keyBytes) + + +#ifndef SKEIN_ERR_CHECK /* run-time checks (e.g., bad params, uninitialized context)? 
*/ +#define Skein_Assert(x,retCode)/* default: ignore all Asserts, for performance */ +#define Skein_assert(x) +#elif defined(SKEIN_ASSERT) +#include <assert.h> +#define Skein_Assert(x,retCode) assert(x) +#define Skein_assert(x) assert(x) +#else +#include <assert.h> +#define Skein_Assert(x,retCode) { if (!(x)) return retCode; } /* caller error */ +#define Skein_assert(x) assert(x) /* internal error */ +#endif + +/***************************************************************** +** Skein block function constants (shared across Ref and Opt code) +******************************************************************/ +enum +{ + /* Skein_256 round rotation constants */ + R_256_0_0=14, R_256_0_1=16, + R_256_1_0=52, R_256_1_1=57, + R_256_2_0=23, R_256_2_1=40, + R_256_3_0= 5, R_256_3_1=37, + R_256_4_0=25, R_256_4_1=33, + R_256_5_0=46, R_256_5_1=12, + R_256_6_0=58, R_256_6_1=22, + R_256_7_0=32, R_256_7_1=32, + + /* Skein_512 round rotation constants */ + R_512_0_0=46, R_512_0_1=36, R_512_0_2=19, R_512_0_3=37, + R_512_1_0=33, R_512_1_1=27, R_512_1_2=14, R_512_1_3=42, + R_512_2_0=17, R_512_2_1=49, R_512_2_2=36, R_512_2_3=39, + R_512_3_0=44, R_512_3_1= 9, R_512_3_2=54, R_512_3_3=56, + R_512_4_0=39, R_512_4_1=30, R_512_4_2=34, R_512_4_3=24, + R_512_5_0=13, R_512_5_1=50, R_512_5_2=10, R_512_5_3=17, + R_512_6_0=25, R_512_6_1=29, R_512_6_2=39, R_512_6_3=43, + R_512_7_0= 8, R_512_7_1=35, R_512_7_2=56, R_512_7_3=22, + + /* Skein1024 round rotation constants */ + R1024_0_0=24, R1024_0_1=13, R1024_0_2= 8, R1024_0_3=47, R1024_0_4= 8, R1024_0_5=17, R1024_0_6=22, R1024_0_7=37, + R1024_1_0=38, R1024_1_1=19, R1024_1_2=10, R1024_1_3=55, R1024_1_4=49, R1024_1_5=18, R1024_1_6=23, R1024_1_7=52, + R1024_2_0=33, R1024_2_1= 4, R1024_2_2=51, R1024_2_3=13, R1024_2_4=34, R1024_2_5=41, R1024_2_6=59, R1024_2_7=17, + R1024_3_0= 5, R1024_3_1=20, R1024_3_2=48, R1024_3_3=41, R1024_3_4=47, R1024_3_5=28, R1024_3_6=16, R1024_3_7=25, + R1024_4_0=41, R1024_4_1= 9, R1024_4_2=37, R1024_4_3=31, R1024_4_4=12, R1024_4_5=47, R1024_4_6=44, R1024_4_7=30, + R1024_5_0=16, R1024_5_1=34, R1024_5_2=56, R1024_5_3=51, R1024_5_4= 4, R1024_5_5=53, R1024_5_6=42, R1024_5_7=41, + R1024_6_0=31, R1024_6_1=44, R1024_6_2=47, R1024_6_3=46, R1024_6_4=19, R1024_6_5=42, R1024_6_6=44, R1024_6_7=25, + R1024_7_0= 9, R1024_7_1=48, R1024_7_2=35, R1024_7_3=52, R1024_7_4=23, R1024_7_5=31, R1024_7_6=37, R1024_7_7=20 +}; + +#ifndef SKEIN_ROUNDS +#define SKEIN_256_ROUNDS_TOTAL (72) /* number of rounds for the different block sizes */ +#define SKEIN_512_ROUNDS_TOTAL (72) +#define SKEIN1024_ROUNDS_TOTAL (80) +#else /* allow command-line define in range 8*(5..14) */ +#define SKEIN_256_ROUNDS_TOTAL (8*((((SKEIN_ROUNDS/100) + 5) % 10) + 5)) +#define SKEIN_512_ROUNDS_TOTAL (8*((((SKEIN_ROUNDS/ 10) + 5) % 10) + 5)) +#define SKEIN1024_ROUNDS_TOTAL (8*((((SKEIN_ROUNDS ) + 5) % 10) + 5)) +#endif + + +/* +***************** Pre-computed Skein IVs ******************* +** +** NOTE: these values are not "magic" constants, but +** are generated using the Threefish block function. +** They are pre-computed here only for speed; i.e., to +** avoid the need for a Threefish call during Init(). +** +** The IV for any fixed hash length may be pre-computed. +** Only the most common values are included here. +** +************************************************************ +**/ + +#define MK_64 SKEIN_MK_64 + +/* blkSize = 256 bits. 
hashSize = 128 bits */ +const u64b_t SKEIN_256_IV_128[] = + { + MK_64(0xE1111906,0x964D7260), + MK_64(0x883DAAA7,0x7C8D811C), + MK_64(0x10080DF4,0x91960F7A), + MK_64(0xCCF7DDE5,0xB45BC1C2) + }; + +/* blkSize = 256 bits. hashSize = 160 bits */ +const u64b_t SKEIN_256_IV_160[] = + { + MK_64(0x14202314,0x72825E98), + MK_64(0x2AC4E9A2,0x5A77E590), + MK_64(0xD47A5856,0x8838D63E), + MK_64(0x2DD2E496,0x8586AB7D) + }; + +/* blkSize = 256 bits. hashSize = 224 bits */ +const u64b_t SKEIN_256_IV_224[] = + { + MK_64(0xC6098A8C,0x9AE5EA0B), + MK_64(0x876D5686,0x08C5191C), + MK_64(0x99CB88D7,0xD7F53884), + MK_64(0x384BDDB1,0xAEDDB5DE) + }; + +/* blkSize = 256 bits. hashSize = 256 bits */ +const u64b_t SKEIN_256_IV_256[] = + { + MK_64(0xFC9DA860,0xD048B449), + MK_64(0x2FCA6647,0x9FA7D833), + MK_64(0xB33BC389,0x6656840F), + MK_64(0x6A54E920,0xFDE8DA69) + }; + +/* blkSize = 512 bits. hashSize = 128 bits */ +const u64b_t SKEIN_512_IV_128[] = + { + MK_64(0xA8BC7BF3,0x6FBF9F52), + MK_64(0x1E9872CE,0xBD1AF0AA), + MK_64(0x309B1790,0xB32190D3), + MK_64(0xBCFBB854,0x3F94805C), + MK_64(0x0DA61BCD,0x6E31B11B), + MK_64(0x1A18EBEA,0xD46A32E3), + MK_64(0xA2CC5B18,0xCE84AA82), + MK_64(0x6982AB28,0x9D46982D) + }; + +/* blkSize = 512 bits. hashSize = 160 bits */ +const u64b_t SKEIN_512_IV_160[] = + { + MK_64(0x28B81A2A,0xE013BD91), + MK_64(0xC2F11668,0xB5BDF78F), + MK_64(0x1760D8F3,0xF6A56F12), + MK_64(0x4FB74758,0x8239904F), + MK_64(0x21EDE07F,0x7EAF5056), + MK_64(0xD908922E,0x63ED70B8), + MK_64(0xB8EC76FF,0xECCB52FA), + MK_64(0x01A47BB8,0xA3F27A6E) + }; + +/* blkSize = 512 bits. hashSize = 224 bits */ +const u64b_t SKEIN_512_IV_224[] = + { + MK_64(0xCCD06162,0x48677224), + MK_64(0xCBA65CF3,0xA92339EF), + MK_64(0x8CCD69D6,0x52FF4B64), + MK_64(0x398AED7B,0x3AB890B4), + MK_64(0x0F59D1B1,0x457D2BD0), + MK_64(0x6776FE65,0x75D4EB3D), + MK_64(0x99FBC70E,0x997413E9), + MK_64(0x9E2CFCCF,0xE1C41EF7) + }; + +/* blkSize = 512 bits. hashSize = 256 bits */ +const u64b_t SKEIN_512_IV_256[] = + { + MK_64(0xCCD044A1,0x2FDB3E13), + MK_64(0xE8359030,0x1A79A9EB), + MK_64(0x55AEA061,0x4F816E6F), + MK_64(0x2A2767A4,0xAE9B94DB), + MK_64(0xEC06025E,0x74DD7683), + MK_64(0xE7A436CD,0xC4746251), + MK_64(0xC36FBAF9,0x393AD185), + MK_64(0x3EEDBA18,0x33EDFC13) + }; + +/* blkSize = 512 bits. hashSize = 384 bits */ +const u64b_t SKEIN_512_IV_384[] = + { + MK_64(0xA3F6C6BF,0x3A75EF5F), + MK_64(0xB0FEF9CC,0xFD84FAA4), + MK_64(0x9D77DD66,0x3D770CFE), + MK_64(0xD798CBF3,0xB468FDDA), + MK_64(0x1BC4A666,0x8A0E4465), + MK_64(0x7ED7D434,0xE5807407), + MK_64(0x548FC1AC,0xD4EC44D6), + MK_64(0x266E1754,0x6AA18FF8) + }; + +/* blkSize = 512 bits. hashSize = 512 bits */ +const u64b_t SKEIN_512_IV_512[] = + { + MK_64(0x4903ADFF,0x749C51CE), + MK_64(0x0D95DE39,0x9746DF03), + MK_64(0x8FD19341,0x27C79BCE), + MK_64(0x9A255629,0xFF352CB1), + MK_64(0x5DB62599,0xDF6CA7B0), + MK_64(0xEABE394C,0xA9D5C3F4), + MK_64(0x991112C7,0x1A75B523), + MK_64(0xAE18A40B,0x660FCC33) + }; + +/* blkSize = 1024 bits. 
hashSize = 384 bits */ +const u64b_t SKEIN1024_IV_384[] = + { + MK_64(0x5102B6B8,0xC1894A35), + MK_64(0xFEEBC9E3,0xFE8AF11A), + MK_64(0x0C807F06,0xE32BED71), + MK_64(0x60C13A52,0xB41A91F6), + MK_64(0x9716D35D,0xD4917C38), + MK_64(0xE780DF12,0x6FD31D3A), + MK_64(0x797846B6,0xC898303A), + MK_64(0xB172C2A8,0xB3572A3B), + MK_64(0xC9BC8203,0xA6104A6C), + MK_64(0x65909338,0xD75624F4), + MK_64(0x94BCC568,0x4B3F81A0), + MK_64(0x3EBBF51E,0x10ECFD46), + MK_64(0x2DF50F0B,0xEEB08542), + MK_64(0x3B5A6530,0x0DBC6516), + MK_64(0x484B9CD2,0x167BBCE1), + MK_64(0x2D136947,0xD4CBAFEA) + }; + +/* blkSize = 1024 bits. hashSize = 512 bits */ +const u64b_t SKEIN1024_IV_512[] = + { + MK_64(0xCAEC0E5D,0x7C1B1B18), + MK_64(0xA01B0E04,0x5F03E802), + MK_64(0x33840451,0xED912885), + MK_64(0x374AFB04,0xEAEC2E1C), + MK_64(0xDF25A0E2,0x813581F7), + MK_64(0xE4004093,0x8B12F9D2), + MK_64(0xA662D539,0xC2ED39B6), + MK_64(0xFA8B85CF,0x45D8C75A), + MK_64(0x8316ED8E,0x29EDE796), + MK_64(0x053289C0,0x2E9F91B8), + MK_64(0xC3F8EF1D,0x6D518B73), + MK_64(0xBDCEC3C4,0xD5EF332E), + MK_64(0x549A7E52,0x22974487), + MK_64(0x67070872,0x5B749816), + MK_64(0xB9CD28FB,0xF0581BD1), + MK_64(0x0E2940B8,0x15804974) + }; + +/* blkSize = 1024 bits. hashSize = 1024 bits */ +const u64b_t SKEIN1024_IV_1024[] = + { + MK_64(0xD593DA07,0x41E72355), + MK_64(0x15B5E511,0xAC73E00C), + MK_64(0x5180E5AE,0xBAF2C4F0), + MK_64(0x03BD41D3,0xFCBCAFAF), + MK_64(0x1CAEC6FD,0x1983A898), + MK_64(0x6E510B8B,0xCDD0589F), + MK_64(0x77E2BDFD,0xC6394ADA), + MK_64(0xC11E1DB5,0x24DCB0A3), + MK_64(0xD6D14AF9,0xC6329AB5), + MK_64(0x6A9B0BFC,0x6EB67E0D), + MK_64(0x9243C60D,0xCCFF1332), + MK_64(0x1A1F1DDE,0x743F02D4), + MK_64(0x0996753C,0x10ED0BB8), + MK_64(0x6572DD22,0xF2B4969A), + MK_64(0x61FD3062,0xD00A579A), + MK_64(0x1DE0536E,0x8682E539) + }; + + +#ifndef SKEIN_USE_ASM +#define SKEIN_USE_ASM (0) /* default is all C code (no ASM) */ +#endif + +#ifndef SKEIN_LOOP +#define SKEIN_LOOP 001 /* default: unroll 256 and 512, but not 1024 */ +#endif + +#define BLK_BITS (WCNT*64) /* some useful definitions for code here */ +#define KW_TWK_BASE (0) +#define KW_KEY_BASE (3) +#define ks (kw + KW_KEY_BASE) +#define ts (kw + KW_TWK_BASE) + +#ifdef SKEIN_DEBUG +#define DebugSaveTweak(ctx) { ctx->h.T[0] = ts[0]; ctx->h.T[1] = ts[1]; } +#else +#define DebugSaveTweak(ctx) +#endif + +/***************************** Skein_256 ******************************/ +#if !(SKEIN_USE_ASM & 256) +static void Skein_256_Process_Block(Skein_256_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t byteCntAdd) + { /* do it in C */ + enum + { + WCNT = SKEIN_256_STATE_WORDS + }; +#undef RCNT +#define RCNT (SKEIN_256_ROUNDS_TOTAL/8) + +#ifdef SKEIN_LOOP /* configure how much to unroll the loop */ +#define SKEIN_UNROLL_256 (((SKEIN_LOOP)/100)%10) +#else +#define SKEIN_UNROLL_256 (0) +#endif + +#if SKEIN_UNROLL_256 +#if (RCNT % SKEIN_UNROLL_256) +#error "Invalid SKEIN_UNROLL_256" /* sanity check on unroll count */ +#endif + size_t r; + u64b_t kw[WCNT+4+RCNT*2]; /* key schedule words : chaining vars + tweak + "rotation"*/ +#else + u64b_t kw[WCNT+4]; /* key schedule words : chaining vars + tweak */ +#endif + u64b_t X0,X1,X2,X3; /* local copy of context vars, for speed */ + u64b_t w [WCNT]; /* local copy of input block */ +#ifdef SKEIN_DEBUG + const u64b_t *Xptr[4]; /* use for debugging (help compiler put Xn in registers) */ + Xptr[0] = &X0; Xptr[1] = &X1; Xptr[2] = &X2; Xptr[3] = &X3; +#endif + Skein_assert(blkCnt != 0); /* never call with blkCnt == 0! 
*/ + ts[0] = ctx->h.T[0]; + ts[1] = ctx->h.T[1]; + do { + /* this implementation only supports 2**64 input bytes (no carry out here) */ + ts[0] += byteCntAdd; /* update processed length */ + + /* precompute the key schedule for this block */ + ks[0] = ctx->X[0]; + ks[1] = ctx->X[1]; + ks[2] = ctx->X[2]; + ks[3] = ctx->X[3]; + ks[4] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^ SKEIN_KS_PARITY; + + ts[2] = ts[0] ^ ts[1]; + + Skein_Get64_LSB_First(w,blkPtr,WCNT); /* get input block in little-endian format */ + DebugSaveTweak(ctx); + Skein_Show_Block(BLK_BITS,&ctx->h,ctx->X,blkPtr,w,ks,ts); + + X0 = w[0] + ks[0]; /* do the first full key injection */ + X1 = w[1] + ks[1] + ts[0]; + X2 = w[2] + ks[2] + ts[1]; + X3 = w[3] + ks[3]; + + Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INITIAL,Xptr); /* show starting state values */ + + blkPtr += SKEIN_256_BLOCK_BYTES; + + /* run the rounds */ + +#define Round256(p0,p1,p2,p3,ROT,rNum) \ + X##p0 += X##p1; X##p1 = RotL_64(X##p1,ROT##_0); X##p1 ^= X##p0; \ + X##p2 += X##p3; X##p3 = RotL_64(X##p3,ROT##_1); X##p3 ^= X##p2; \ + +#if SKEIN_UNROLL_256 == 0 +#define R256(p0,p1,p2,p3,ROT,rNum) /* fully unrolled */ \ + Round256(p0,p1,p2,p3,ROT,rNum) \ + Skein_Show_R_Ptr(BLK_BITS,&ctx->h,rNum,Xptr); + +#define I256(R) \ + X0 += ks[((R)+1) % 5]; /* inject the key schedule value */ \ + X1 += ks[((R)+2) % 5] + ts[((R)+1) % 3]; \ + X2 += ks[((R)+3) % 5] + ts[((R)+2) % 3]; \ + X3 += ks[((R)+4) % 5] + (R)+1; \ + Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INJECT,Xptr); +#else /* looping version */ +#define R256(p0,p1,p2,p3,ROT,rNum) \ + Round256(p0,p1,p2,p3,ROT,rNum) \ + Skein_Show_R_Ptr(BLK_BITS,&ctx->h,4*(r-1)+rNum,Xptr); + +#define I256(R) \ + X0 += ks[r+(R)+0]; /* inject the key schedule value */ \ + X1 += ks[r+(R)+1] + ts[r+(R)+0]; \ + X2 += ks[r+(R)+2] + ts[r+(R)+1]; \ + X3 += ks[r+(R)+3] + r+(R) ; \ + ks[r + (R)+4 ] = ks[r+(R)-1]; /* rotate key schedule */\ + ts[r + (R)+2 ] = ts[r+(R)-1]; \ + Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INJECT,Xptr); + + for (r=1;r < 2*RCNT;r+=2*SKEIN_UNROLL_256) /* loop thru it */ +#endif + { +#define R256_8_rounds(R) \ + R256(0,1,2,3,R_256_0,8*(R) + 1); \ + R256(0,3,2,1,R_256_1,8*(R) + 2); \ + R256(0,1,2,3,R_256_2,8*(R) + 3); \ + R256(0,3,2,1,R_256_3,8*(R) + 4); \ + I256(2*(R)); \ + R256(0,1,2,3,R_256_4,8*(R) + 5); \ + R256(0,3,2,1,R_256_5,8*(R) + 6); \ + R256(0,1,2,3,R_256_6,8*(R) + 7); \ + R256(0,3,2,1,R_256_7,8*(R) + 8); \ + I256(2*(R)+1); + + R256_8_rounds( 0); + +#define R256_Unroll_R(NN) ((SKEIN_UNROLL_256 == 0 && SKEIN_256_ROUNDS_TOTAL/8 > (NN)) || (SKEIN_UNROLL_256 > (NN))) + + #if R256_Unroll_R( 1) + R256_8_rounds( 1); + #endif + #if R256_Unroll_R( 2) + R256_8_rounds( 2); + #endif + #if R256_Unroll_R( 3) + R256_8_rounds( 3); + #endif + #if R256_Unroll_R( 4) + R256_8_rounds( 4); + #endif + #if R256_Unroll_R( 5) + R256_8_rounds( 5); + #endif + #if R256_Unroll_R( 6) + R256_8_rounds( 6); + #endif + #if R256_Unroll_R( 7) + R256_8_rounds( 7); + #endif + #if R256_Unroll_R( 8) + R256_8_rounds( 8); + #endif + #if R256_Unroll_R( 9) + R256_8_rounds( 9); + #endif + #if R256_Unroll_R(10) + R256_8_rounds(10); + #endif + #if R256_Unroll_R(11) + R256_8_rounds(11); + #endif + #if R256_Unroll_R(12) + R256_8_rounds(12); + #endif + #if R256_Unroll_R(13) + R256_8_rounds(13); + #endif + #if R256_Unroll_R(14) + R256_8_rounds(14); + #endif + #if (SKEIN_UNROLL_256 > 14) +#error "need more unrolling in Skein_256_Process_Block" + #endif + } + /* do the final "feedforward" xor, update context chaining vars */ + ctx->X[0] = X0 ^ w[0]; + ctx->X[1] = 
X1 ^ w[1]; + ctx->X[2] = X2 ^ w[2]; + ctx->X[3] = X3 ^ w[3]; + + Skein_Show_Round(BLK_BITS,&ctx->h,SKEIN_RND_FEED_FWD,ctx->X); + + ts[1] &= ~SKEIN_T1_FLAG_FIRST; + } + while (--blkCnt); + ctx->h.T[0] = ts[0]; + ctx->h.T[1] = ts[1]; + } + +#if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF) +static size_t Skein_256_Process_Block_CodeSize(void) + { + return ((u08b_t *) Skein_256_Process_Block_CodeSize) - + ((u08b_t *) Skein_256_Process_Block); + } +static uint_t Skein_256_Unroll_Cnt(void) + { + return SKEIN_UNROLL_256; + } +#endif +#endif + +/***************************** Skein_512 ******************************/ +#if !(SKEIN_USE_ASM & 512) +static void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t byteCntAdd) + { /* do it in C */ + enum + { + WCNT = SKEIN_512_STATE_WORDS + }; +#undef RCNT +#define RCNT (SKEIN_512_ROUNDS_TOTAL/8) + +#ifdef SKEIN_LOOP /* configure how much to unroll the loop */ +#define SKEIN_UNROLL_512 (((SKEIN_LOOP)/10)%10) +#else +#define SKEIN_UNROLL_512 (0) +#endif + +#if SKEIN_UNROLL_512 +#if (RCNT % SKEIN_UNROLL_512) +#error "Invalid SKEIN_UNROLL_512" /* sanity check on unroll count */ +#endif + size_t r; + u64b_t kw[WCNT+4+RCNT*2]; /* key schedule words : chaining vars + tweak + "rotation"*/ +#else + u64b_t kw[WCNT+4]; /* key schedule words : chaining vars + tweak */ +#endif + u64b_t X0,X1,X2,X3,X4,X5,X6,X7; /* local copy of vars, for speed */ + u64b_t w [WCNT]; /* local copy of input block */ +#ifdef SKEIN_DEBUG + const u64b_t *Xptr[8]; /* use for debugging (help compiler put Xn in registers) */ + Xptr[0] = &X0; Xptr[1] = &X1; Xptr[2] = &X2; Xptr[3] = &X3; + Xptr[4] = &X4; Xptr[5] = &X5; Xptr[6] = &X6; Xptr[7] = &X7; +#endif + + Skein_assert(blkCnt != 0); /* never call with blkCnt == 0! 
*/ + ts[0] = ctx->h.T[0]; + ts[1] = ctx->h.T[1]; + do { + /* this implementation only supports 2**64 input bytes (no carry out here) */ + ts[0] += byteCntAdd; /* update processed length */ + + /* precompute the key schedule for this block */ + ks[0] = ctx->X[0]; + ks[1] = ctx->X[1]; + ks[2] = ctx->X[2]; + ks[3] = ctx->X[3]; + ks[4] = ctx->X[4]; + ks[5] = ctx->X[5]; + ks[6] = ctx->X[6]; + ks[7] = ctx->X[7]; + ks[8] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^ + ks[4] ^ ks[5] ^ ks[6] ^ ks[7] ^ SKEIN_KS_PARITY; + + ts[2] = ts[0] ^ ts[1]; + + Skein_Get64_LSB_First(w,blkPtr,WCNT); /* get input block in little-endian format */ + DebugSaveTweak(ctx); + Skein_Show_Block(BLK_BITS,&ctx->h,ctx->X,blkPtr,w,ks,ts); + + X0 = w[0] + ks[0]; /* do the first full key injection */ + X1 = w[1] + ks[1]; + X2 = w[2] + ks[2]; + X3 = w[3] + ks[3]; + X4 = w[4] + ks[4]; + X5 = w[5] + ks[5] + ts[0]; + X6 = w[6] + ks[6] + ts[1]; + X7 = w[7] + ks[7]; + + blkPtr += SKEIN_512_BLOCK_BYTES; + + Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INITIAL,Xptr); + /* run the rounds */ +#define Round512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum) \ + X##p0 += X##p1; X##p1 = RotL_64(X##p1,ROT##_0); X##p1 ^= X##p0; \ + X##p2 += X##p3; X##p3 = RotL_64(X##p3,ROT##_1); X##p3 ^= X##p2; \ + X##p4 += X##p5; X##p5 = RotL_64(X##p5,ROT##_2); X##p5 ^= X##p4; \ + X##p6 += X##p7; X##p7 = RotL_64(X##p7,ROT##_3); X##p7 ^= X##p6; \ + +#if SKEIN_UNROLL_512 == 0 +#define R512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum) /* unrolled */ \ + Round512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum) \ + Skein_Show_R_Ptr(BLK_BITS,&ctx->h,rNum,Xptr); + +#define I512(R) \ + X0 += ks[((R)+1) % 9]; /* inject the key schedule value */ \ + X1 += ks[((R)+2) % 9]; \ + X2 += ks[((R)+3) % 9]; \ + X3 += ks[((R)+4) % 9]; \ + X4 += ks[((R)+5) % 9]; \ + X5 += ks[((R)+6) % 9] + ts[((R)+1) % 3]; \ + X6 += ks[((R)+7) % 9] + ts[((R)+2) % 3]; \ + X7 += ks[((R)+8) % 9] + (R)+1; \ + Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INJECT,Xptr); +#else /* looping version */ +#define R512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum) \ + Round512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum) \ + Skein_Show_R_Ptr(BLK_BITS,&ctx->h,4*(r-1)+rNum,Xptr); + +#define I512(R) \ + X0 += ks[r+(R)+0]; /* inject the key schedule value */ \ + X1 += ks[r+(R)+1]; \ + X2 += ks[r+(R)+2]; \ + X3 += ks[r+(R)+3]; \ + X4 += ks[r+(R)+4]; \ + X5 += ks[r+(R)+5] + ts[r+(R)+0]; \ + X6 += ks[r+(R)+6] + ts[r+(R)+1]; \ + X7 += ks[r+(R)+7] + r+(R) ; \ + ks[r + (R)+8] = ks[r+(R)-1]; /* rotate key schedule */ \ + ts[r + (R)+2] = ts[r+(R)-1]; \ + Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INJECT,Xptr); + + for (r=1;r < 2*RCNT;r+=2*SKEIN_UNROLL_512) /* loop thru it */ +#endif /* end of looped code definitions */ + { +#define R512_8_rounds(R) /* do 8 full rounds */ \ + R512(0,1,2,3,4,5,6,7,R_512_0,8*(R)+ 1); \ + R512(2,1,4,7,6,5,0,3,R_512_1,8*(R)+ 2); \ + R512(4,1,6,3,0,5,2,7,R_512_2,8*(R)+ 3); \ + R512(6,1,0,7,2,5,4,3,R_512_3,8*(R)+ 4); \ + I512(2*(R)); \ + R512(0,1,2,3,4,5,6,7,R_512_4,8*(R)+ 5); \ + R512(2,1,4,7,6,5,0,3,R_512_5,8*(R)+ 6); \ + R512(4,1,6,3,0,5,2,7,R_512_6,8*(R)+ 7); \ + R512(6,1,0,7,2,5,4,3,R_512_7,8*(R)+ 8); \ + I512(2*(R)+1); /* and key injection */ + + R512_8_rounds( 0); + +#define R512_Unroll_R(NN) ((SKEIN_UNROLL_512 == 0 && SKEIN_512_ROUNDS_TOTAL/8 > (NN)) || (SKEIN_UNROLL_512 > (NN))) + + #if R512_Unroll_R( 1) + R512_8_rounds( 1); + #endif + #if R512_Unroll_R( 2) + R512_8_rounds( 2); + #endif + #if R512_Unroll_R( 3) + R512_8_rounds( 3); + #endif + #if R512_Unroll_R( 4) + R512_8_rounds( 4); + #endif + #if R512_Unroll_R( 5) + R512_8_rounds( 5); + #endif 
+ #if R512_Unroll_R( 6) + R512_8_rounds( 6); + #endif + #if R512_Unroll_R( 7) + R512_8_rounds( 7); + #endif + #if R512_Unroll_R( 8) + R512_8_rounds( 8); + #endif + #if R512_Unroll_R( 9) + R512_8_rounds( 9); + #endif + #if R512_Unroll_R(10) + R512_8_rounds(10); + #endif + #if R512_Unroll_R(11) + R512_8_rounds(11); + #endif + #if R512_Unroll_R(12) + R512_8_rounds(12); + #endif + #if R512_Unroll_R(13) + R512_8_rounds(13); + #endif + #if R512_Unroll_R(14) + R512_8_rounds(14); + #endif + #if (SKEIN_UNROLL_512 > 14) +#error "need more unrolling in Skein_512_Process_Block" + #endif + } + + /* do the final "feedforward" xor, update context chaining vars */ + ctx->X[0] = X0 ^ w[0]; + ctx->X[1] = X1 ^ w[1]; + ctx->X[2] = X2 ^ w[2]; + ctx->X[3] = X3 ^ w[3]; + ctx->X[4] = X4 ^ w[4]; + ctx->X[5] = X5 ^ w[5]; + ctx->X[6] = X6 ^ w[6]; + ctx->X[7] = X7 ^ w[7]; + Skein_Show_Round(BLK_BITS,&ctx->h,SKEIN_RND_FEED_FWD,ctx->X); + + ts[1] &= ~SKEIN_T1_FLAG_FIRST; + } + while (--blkCnt); + ctx->h.T[0] = ts[0]; + ctx->h.T[1] = ts[1]; + } + +#if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF) +static size_t Skein_512_Process_Block_CodeSize(void) + { + return ((u08b_t *) Skein_512_Process_Block_CodeSize) - + ((u08b_t *) Skein_512_Process_Block); + } +static uint_t Skein_512_Unroll_Cnt(void) + { + return SKEIN_UNROLL_512; + } +#endif +#endif + +/***************************** Skein1024 ******************************/ +#if !(SKEIN_USE_ASM & 1024) +static void Skein1024_Process_Block(Skein1024_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t byteCntAdd) + { /* do it in C, always looping (unrolled is bigger AND slower!) */ + enum + { + WCNT = SKEIN1024_STATE_WORDS + }; +#undef RCNT +#define RCNT (SKEIN1024_ROUNDS_TOTAL/8) + +#ifdef SKEIN_LOOP /* configure how much to unroll the loop */ +#define SKEIN_UNROLL_1024 ((SKEIN_LOOP)%10) +#else +#define SKEIN_UNROLL_1024 (0) +#endif + +#if (SKEIN_UNROLL_1024 != 0) +#if (RCNT % SKEIN_UNROLL_1024) +#error "Invalid SKEIN_UNROLL_1024" /* sanity check on unroll count */ +#endif + size_t r; + u64b_t kw[WCNT+4+RCNT*2]; /* key schedule words : chaining vars + tweak + "rotation"*/ +#else + u64b_t kw[WCNT+4]; /* key schedule words : chaining vars + tweak */ +#endif + + u64b_t X00,X01,X02,X03,X04,X05,X06,X07, /* local copy of vars, for speed */ + X08,X09,X10,X11,X12,X13,X14,X15; + u64b_t w [WCNT]; /* local copy of input block */ +#ifdef SKEIN_DEBUG + const u64b_t *Xptr[16]; /* use for debugging (help compiler put Xn in registers) */ + Xptr[ 0] = &X00; Xptr[ 1] = &X01; Xptr[ 2] = &X02; Xptr[ 3] = &X03; + Xptr[ 4] = &X04; Xptr[ 5] = &X05; Xptr[ 6] = &X06; Xptr[ 7] = &X07; + Xptr[ 8] = &X08; Xptr[ 9] = &X09; Xptr[10] = &X10; Xptr[11] = &X11; + Xptr[12] = &X12; Xptr[13] = &X13; Xptr[14] = &X14; Xptr[15] = &X15; +#endif + + Skein_assert(blkCnt != 0); /* never call with blkCnt == 0! 
*/ + ts[0] = ctx->h.T[0]; + ts[1] = ctx->h.T[1]; + do { + /* this implementation only supports 2**64 input bytes (no carry out here) */ + ts[0] += byteCntAdd; /* update processed length */ + + /* precompute the key schedule for this block */ + ks[ 0] = ctx->X[ 0]; + ks[ 1] = ctx->X[ 1]; + ks[ 2] = ctx->X[ 2]; + ks[ 3] = ctx->X[ 3]; + ks[ 4] = ctx->X[ 4]; + ks[ 5] = ctx->X[ 5]; + ks[ 6] = ctx->X[ 6]; + ks[ 7] = ctx->X[ 7]; + ks[ 8] = ctx->X[ 8]; + ks[ 9] = ctx->X[ 9]; + ks[10] = ctx->X[10]; + ks[11] = ctx->X[11]; + ks[12] = ctx->X[12]; + ks[13] = ctx->X[13]; + ks[14] = ctx->X[14]; + ks[15] = ctx->X[15]; + ks[16] = ks[ 0] ^ ks[ 1] ^ ks[ 2] ^ ks[ 3] ^ + ks[ 4] ^ ks[ 5] ^ ks[ 6] ^ ks[ 7] ^ + ks[ 8] ^ ks[ 9] ^ ks[10] ^ ks[11] ^ + ks[12] ^ ks[13] ^ ks[14] ^ ks[15] ^ SKEIN_KS_PARITY; + + ts[2] = ts[0] ^ ts[1]; + + Skein_Get64_LSB_First(w,blkPtr,WCNT); /* get input block in little-endian format */ + DebugSaveTweak(ctx); + Skein_Show_Block(BLK_BITS,&ctx->h,ctx->X,blkPtr,w,ks,ts); + + X00 = w[ 0] + ks[ 0]; /* do the first full key injection */ + X01 = w[ 1] + ks[ 1]; + X02 = w[ 2] + ks[ 2]; + X03 = w[ 3] + ks[ 3]; + X04 = w[ 4] + ks[ 4]; + X05 = w[ 5] + ks[ 5]; + X06 = w[ 6] + ks[ 6]; + X07 = w[ 7] + ks[ 7]; + X08 = w[ 8] + ks[ 8]; + X09 = w[ 9] + ks[ 9]; + X10 = w[10] + ks[10]; + X11 = w[11] + ks[11]; + X12 = w[12] + ks[12]; + X13 = w[13] + ks[13] + ts[0]; + X14 = w[14] + ks[14] + ts[1]; + X15 = w[15] + ks[15]; + + Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INITIAL,Xptr); + +#define Round1024(p0,p1,p2,p3,p4,p5,p6,p7,p8,p9,pA,pB,pC,pD,pE,pF,ROT,rNum) \ + X##p0 += X##p1; X##p1 = RotL_64(X##p1,ROT##_0); X##p1 ^= X##p0; \ + X##p2 += X##p3; X##p3 = RotL_64(X##p3,ROT##_1); X##p3 ^= X##p2; \ + X##p4 += X##p5; X##p5 = RotL_64(X##p5,ROT##_2); X##p5 ^= X##p4; \ + X##p6 += X##p7; X##p7 = RotL_64(X##p7,ROT##_3); X##p7 ^= X##p6; \ + X##p8 += X##p9; X##p9 = RotL_64(X##p9,ROT##_4); X##p9 ^= X##p8; \ + X##pA += X##pB; X##pB = RotL_64(X##pB,ROT##_5); X##pB ^= X##pA; \ + X##pC += X##pD; X##pD = RotL_64(X##pD,ROT##_6); X##pD ^= X##pC; \ + X##pE += X##pF; X##pF = RotL_64(X##pF,ROT##_7); X##pF ^= X##pE; \ + +#if SKEIN_UNROLL_1024 == 0 +#define R1024(p0,p1,p2,p3,p4,p5,p6,p7,p8,p9,pA,pB,pC,pD,pE,pF,ROT,rn) \ + Round1024(p0,p1,p2,p3,p4,p5,p6,p7,p8,p9,pA,pB,pC,pD,pE,pF,ROT,rn) \ + Skein_Show_R_Ptr(BLK_BITS,&ctx->h,rn,Xptr); + +#define I1024(R) \ + X00 += ks[((R)+ 1) % 17]; /* inject the key schedule value */ \ + X01 += ks[((R)+ 2) % 17]; \ + X02 += ks[((R)+ 3) % 17]; \ + X03 += ks[((R)+ 4) % 17]; \ + X04 += ks[((R)+ 5) % 17]; \ + X05 += ks[((R)+ 6) % 17]; \ + X06 += ks[((R)+ 7) % 17]; \ + X07 += ks[((R)+ 8) % 17]; \ + X08 += ks[((R)+ 9) % 17]; \ + X09 += ks[((R)+10) % 17]; \ + X10 += ks[((R)+11) % 17]; \ + X11 += ks[((R)+12) % 17]; \ + X12 += ks[((R)+13) % 17]; \ + X13 += ks[((R)+14) % 17] + ts[((R)+1) % 3]; \ + X14 += ks[((R)+15) % 17] + ts[((R)+2) % 3]; \ + X15 += ks[((R)+16) % 17] + (R)+1; \ + Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INJECT,Xptr); +#else /* looping version */ +#define R1024(p0,p1,p2,p3,p4,p5,p6,p7,p8,p9,pA,pB,pC,pD,pE,pF,ROT,rn) \ + Round1024(p0,p1,p2,p3,p4,p5,p6,p7,p8,p9,pA,pB,pC,pD,pE,pF,ROT,rn) \ + Skein_Show_R_Ptr(BLK_BITS,&ctx->h,4*(r-1)+rn,Xptr); + +#define I1024(R) \ + X00 += ks[r+(R)+ 0]; /* inject the key schedule value */ \ + X01 += ks[r+(R)+ 1]; \ + X02 += ks[r+(R)+ 2]; \ + X03 += ks[r+(R)+ 3]; \ + X04 += ks[r+(R)+ 4]; \ + X05 += ks[r+(R)+ 5]; \ + X06 += ks[r+(R)+ 6]; \ + X07 += ks[r+(R)+ 7]; \ + X08 += ks[r+(R)+ 8]; \ + X09 += ks[r+(R)+ 9]; \ + X10 += ks[r+(R)+10]; \ + X11 += 
ks[r+(R)+11]; \ + X12 += ks[r+(R)+12]; \ + X13 += ks[r+(R)+13] + ts[r+(R)+0]; \ + X14 += ks[r+(R)+14] + ts[r+(R)+1]; \ + X15 += ks[r+(R)+15] + r+(R) ; \ + ks[r + (R)+16] = ks[r+(R)-1]; /* rotate key schedule */ \ + ts[r + (R)+ 2] = ts[r+(R)-1]; \ + Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INJECT,Xptr); + + for (r=1;r <= 2*RCNT;r+=2*SKEIN_UNROLL_1024) /* loop thru it */ +#endif + { +#define R1024_8_rounds(R) /* do 8 full rounds */ \ + R1024(00,01,02,03,04,05,06,07,08,09,10,11,12,13,14,15,R1024_0,8*(R) + 1); \ + R1024(00,09,02,13,06,11,04,15,10,07,12,03,14,05,08,01,R1024_1,8*(R) + 2); \ + R1024(00,07,02,05,04,03,06,01,12,15,14,13,08,11,10,09,R1024_2,8*(R) + 3); \ + R1024(00,15,02,11,06,13,04,09,14,01,08,05,10,03,12,07,R1024_3,8*(R) + 4); \ + I1024(2*(R)); \ + R1024(00,01,02,03,04,05,06,07,08,09,10,11,12,13,14,15,R1024_4,8*(R) + 5); \ + R1024(00,09,02,13,06,11,04,15,10,07,12,03,14,05,08,01,R1024_5,8*(R) + 6); \ + R1024(00,07,02,05,04,03,06,01,12,15,14,13,08,11,10,09,R1024_6,8*(R) + 7); \ + R1024(00,15,02,11,06,13,04,09,14,01,08,05,10,03,12,07,R1024_7,8*(R) + 8); \ + I1024(2*(R)+1); + + R1024_8_rounds( 0); + +#define R1024_Unroll_R(NN) ((SKEIN_UNROLL_1024 == 0 && SKEIN1024_ROUNDS_TOTAL/8 > (NN)) || (SKEIN_UNROLL_1024 > (NN))) + + #if R1024_Unroll_R( 1) + R1024_8_rounds( 1); + #endif + #if R1024_Unroll_R( 2) + R1024_8_rounds( 2); + #endif + #if R1024_Unroll_R( 3) + R1024_8_rounds( 3); + #endif + #if R1024_Unroll_R( 4) + R1024_8_rounds( 4); + #endif + #if R1024_Unroll_R( 5) + R1024_8_rounds( 5); + #endif + #if R1024_Unroll_R( 6) + R1024_8_rounds( 6); + #endif + #if R1024_Unroll_R( 7) + R1024_8_rounds( 7); + #endif + #if R1024_Unroll_R( 8) + R1024_8_rounds( 8); + #endif + #if R1024_Unroll_R( 9) + R1024_8_rounds( 9); + #endif + #if R1024_Unroll_R(10) + R1024_8_rounds(10); + #endif + #if R1024_Unroll_R(11) + R1024_8_rounds(11); + #endif + #if R1024_Unroll_R(12) + R1024_8_rounds(12); + #endif + #if R1024_Unroll_R(13) + R1024_8_rounds(13); + #endif + #if R1024_Unroll_R(14) + R1024_8_rounds(14); + #endif + #if (SKEIN_UNROLL_1024 > 14) +#error "need more unrolling in Skein_1024_Process_Block" + #endif + } + /* do the final "feedforward" xor, update context chaining vars */ + + ctx->X[ 0] = X00 ^ w[ 0]; + ctx->X[ 1] = X01 ^ w[ 1]; + ctx->X[ 2] = X02 ^ w[ 2]; + ctx->X[ 3] = X03 ^ w[ 3]; + ctx->X[ 4] = X04 ^ w[ 4]; + ctx->X[ 5] = X05 ^ w[ 5]; + ctx->X[ 6] = X06 ^ w[ 6]; + ctx->X[ 7] = X07 ^ w[ 7]; + ctx->X[ 8] = X08 ^ w[ 8]; + ctx->X[ 9] = X09 ^ w[ 9]; + ctx->X[10] = X10 ^ w[10]; + ctx->X[11] = X11 ^ w[11]; + ctx->X[12] = X12 ^ w[12]; + ctx->X[13] = X13 ^ w[13]; + ctx->X[14] = X14 ^ w[14]; + ctx->X[15] = X15 ^ w[15]; + + Skein_Show_Round(BLK_BITS,&ctx->h,SKEIN_RND_FEED_FWD,ctx->X); + + ts[1] &= ~SKEIN_T1_FLAG_FIRST; + blkPtr += SKEIN1024_BLOCK_BYTES; + } + while (--blkCnt); + ctx->h.T[0] = ts[0]; + ctx->h.T[1] = ts[1]; + } + +#if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF) +static size_t Skein1024_Process_Block_CodeSize(void) + { + return ((u08b_t *) Skein1024_Process_Block_CodeSize) - + ((u08b_t *) Skein1024_Process_Block); + } +static uint_t Skein1024_Unroll_Cnt(void) + { + return SKEIN_UNROLL_1024; + } +#endif +#endif + + +#if 0 +/*****************************************************************/ +/* 256-bit Skein */ +/*****************************************************************/ + +/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ +/* init the context for a straight hashing operation */ +static int Skein_256_Init(Skein_256_Ctxt_t *ctx, size_t hashBitLen) + { + union 
+ { + u08b_t b[SKEIN_256_STATE_BYTES]; + u64b_t w[SKEIN_256_STATE_WORDS]; + } cfg; /* config block */ + + Skein_Assert(hashBitLen > 0,SKEIN_BAD_HASHLEN); + ctx->h.hashBitLen = hashBitLen; /* output hash bit count */ + + switch (hashBitLen) + { /* use pre-computed values, where available */ +#ifndef SKEIN_NO_PRECOMP + case 256: memcpy(ctx->X,SKEIN_256_IV_256,sizeof(ctx->X)); break; + case 224: memcpy(ctx->X,SKEIN_256_IV_224,sizeof(ctx->X)); break; + case 160: memcpy(ctx->X,SKEIN_256_IV_160,sizeof(ctx->X)); break; + case 128: memcpy(ctx->X,SKEIN_256_IV_128,sizeof(ctx->X)); break; +#endif + default: + /* here if there is no precomputed IV value available */ + /* build/process the config block, type == CONFIG (could be precomputed) */ + Skein_Start_New_Type(ctx,CFG_FINAL); /* set tweaks: T0=0; T1=CFG | FINAL */ + + cfg.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER); /* set the schema, version */ + cfg.w[1] = Skein_Swap64(hashBitLen); /* hash result length in bits */ + cfg.w[2] = Skein_Swap64(SKEIN_CFG_TREE_INFO_SEQUENTIAL); + memset(&cfg.w[3],0,sizeof(cfg) - 3*sizeof(cfg.w[0])); /* zero pad config block */ + + /* compute the initial chaining values from config block */ + memset(ctx->X,0,sizeof(ctx->X)); /* zero the chaining variables */ + Skein_256_Process_Block(ctx,cfg.b,1,SKEIN_CFG_STR_LEN); + break; + } + /* The chaining vars ctx->X are now initialized for the given hashBitLen. */ + /* Set up to process the data message portion of the hash (default) */ + Skein_Start_New_Type(ctx,MSG); /* T0=0, T1= MSG type */ + + return SKEIN_SUCCESS; + } + +/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ +/* init the context for a MAC and/or tree hash operation */ +/* [identical to Skein_256_Init() when keyBytes == 0 && treeInfo == SKEIN_CFG_TREE_INFO_SEQUENTIAL] */ +static int Skein_256_InitExt(Skein_256_Ctxt_t *ctx,size_t hashBitLen,u64b_t treeInfo, const u08b_t *key, size_t keyBytes) + { + union + { + u08b_t b[SKEIN_256_STATE_BYTES]; + u64b_t w[SKEIN_256_STATE_WORDS]; + } cfg; /* config block */ + + Skein_Assert(hashBitLen > 0,SKEIN_BAD_HASHLEN); + Skein_Assert(keyBytes == 0 || key != NULL,SKEIN_FAIL); + + /* compute the initial chaining values ctx->X[], based on key */ + if (keyBytes == 0) /* is there a key? 
*/ + { + memset(ctx->X,0,sizeof(ctx->X)); /* no key: use all zeroes as key for config block */ + } + else /* here to pre-process a key */ + { + Skein_assert(sizeof(cfg.b) >= sizeof(ctx->X)); + /* do a mini-Init right here */ + ctx->h.hashBitLen=8*sizeof(ctx->X); /* set output hash bit count = state size */ + Skein_Start_New_Type(ctx,KEY); /* set tweaks: T0 = 0; T1 = KEY type */ + memset(ctx->X,0,sizeof(ctx->X)); /* zero the initial chaining variables */ + Skein_256_Update(ctx,key,keyBytes); /* hash the key */ + Skein_256_Final_Pad(ctx,cfg.b); /* put result into cfg.b[] */ + memcpy(ctx->X,cfg.b,sizeof(cfg.b)); /* copy over into ctx->X[] */ +#if SKEIN_NEED_SWAP + { + uint_t i; + for (i=0;i<SKEIN_256_STATE_WORDS;i++) /* convert key bytes to context words */ + ctx->X[i] = Skein_Swap64(ctx->X[i]); + } +#endif + } + /* build/process the config block, type == CONFIG (could be precomputed for each key) */ + ctx->h.hashBitLen = hashBitLen; /* output hash bit count */ + Skein_Start_New_Type(ctx,CFG_FINAL); + + memset(&cfg.w,0,sizeof(cfg.w)); /* pre-pad cfg.w[] with zeroes */ + cfg.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER); + cfg.w[1] = Skein_Swap64(hashBitLen); /* hash result length in bits */ + cfg.w[2] = Skein_Swap64(treeInfo); /* tree hash config info (or SKEIN_CFG_TREE_INFO_SEQUENTIAL) */ + + Skein_Show_Key(256,&ctx->h,key,keyBytes); + + /* compute the initial chaining values from config block */ + Skein_256_Process_Block(ctx,cfg.b,1,SKEIN_CFG_STR_LEN); + + /* The chaining vars ctx->X are now initialized */ + /* Set up to process the data message portion of the hash (default) */ + ctx->h.bCnt = 0; /* buffer b[] starts out empty */ + Skein_Start_New_Type(ctx,MSG); + + return SKEIN_SUCCESS; + } +#endif + +/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ +/* process the input bytes */ +static int Skein_256_Update(Skein_256_Ctxt_t *ctx, const u08b_t *msg, size_t msgByteCnt) + { + size_t n; + + Skein_Assert(ctx->h.bCnt <= SKEIN_256_BLOCK_BYTES,SKEIN_FAIL); /* catch uninitialized context */ + + /* process full blocks, if any */ + if (msgByteCnt + ctx->h.bCnt > SKEIN_256_BLOCK_BYTES) + { + if (ctx->h.bCnt) /* finish up any buffered message data */ + { + n = SKEIN_256_BLOCK_BYTES - ctx->h.bCnt; /* # bytes free in buffer b[] */ + if (n) + { + Skein_assert(n < msgByteCnt); /* check on our logic here */ + memcpy(&ctx->b[ctx->h.bCnt],msg,n); + msgByteCnt -= n; + msg += n; + ctx->h.bCnt += n; + } + Skein_assert(ctx->h.bCnt == SKEIN_256_BLOCK_BYTES); + Skein_256_Process_Block(ctx,ctx->b,1,SKEIN_256_BLOCK_BYTES); + ctx->h.bCnt = 0; + } + /* now process any remaining full blocks, directly from input message data */ + if (msgByteCnt > SKEIN_256_BLOCK_BYTES) + { + n = (msgByteCnt-1) / SKEIN_256_BLOCK_BYTES; /* number of full blocks to process */ + Skein_256_Process_Block(ctx,msg,n,SKEIN_256_BLOCK_BYTES); + msgByteCnt -= n * SKEIN_256_BLOCK_BYTES; + msg += n * SKEIN_256_BLOCK_BYTES; + } + Skein_assert(ctx->h.bCnt == 0); + } + + /* copy any remaining source message data bytes into b[] */ + if (msgByteCnt) + { + Skein_assert(msgByteCnt + ctx->h.bCnt <= SKEIN_256_BLOCK_BYTES); + memcpy(&ctx->b[ctx->h.bCnt],msg,msgByteCnt); + ctx->h.bCnt += msgByteCnt; + } + + return SKEIN_SUCCESS; + } + +/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ +/* finalize the hash computation and output the result */ +static int Skein_256_Final(Skein_256_Ctxt_t *ctx, u08b_t *hashVal) + { + size_t i,n,byteCnt; + u64b_t X[SKEIN_256_STATE_WORDS]; + Skein_Assert(ctx->h.bCnt <= 
SKEIN_256_BLOCK_BYTES,SKEIN_FAIL); /* catch uninitialized context */ + + ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL; /* tag as the final block */ + if (ctx->h.bCnt < SKEIN_256_BLOCK_BYTES) /* zero pad b[] if necessary */ + memset(&ctx->b[ctx->h.bCnt],0,SKEIN_256_BLOCK_BYTES - ctx->h.bCnt); + + Skein_256_Process_Block(ctx,ctx->b,1,ctx->h.bCnt); /* process the final block */ + + /* now output the result */ + byteCnt = (ctx->h.hashBitLen + 7) >> 3; /* total number of output bytes */ + + /* run Threefish in "counter mode" to generate output */ + memset(ctx->b,0,sizeof(ctx->b)); /* zero out b[], so it can hold the counter */ + memcpy(X,ctx->X,sizeof(X)); /* keep a local copy of counter mode "key" */ + for (i=0;i < byteCnt;i += SKEIN_256_BLOCK_BYTES) + { + ((u64b_t *)ctx->b)[0]= Skein_Swap64((u64b_t) i); /* build the counter block */ + Skein_Start_New_Type(ctx,OUT_FINAL); + Skein_256_Process_Block(ctx,ctx->b,1,sizeof(u64b_t)); /* run "counter mode" */ + n = byteCnt - i; /* number of output bytes left to go */ + if (n >= SKEIN_256_BLOCK_BYTES) + n = SKEIN_256_BLOCK_BYTES; + Skein_Put64_LSB_First(hashVal+i,ctx->X,n); /* "output" the ctr mode bytes */ + Skein_Show_Final(256,&ctx->h,n,hashVal+i*SKEIN_256_BLOCK_BYTES); + memcpy(ctx->X,X,sizeof(X)); /* restore the counter mode key for next time */ + } + return SKEIN_SUCCESS; + } + +#if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF) +static size_t Skein_256_API_CodeSize(void) + { + return ((u08b_t *) Skein_256_API_CodeSize) - + ((u08b_t *) Skein_256_Init); + } +#endif + +/*****************************************************************/ +/* 512-bit Skein */ +/*****************************************************************/ + +/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ +/* init the context for a straight hashing operation */ +static int Skein_512_Init(Skein_512_Ctxt_t *ctx, size_t hashBitLen) + { + union + { + u08b_t b[SKEIN_512_STATE_BYTES]; + u64b_t w[SKEIN_512_STATE_WORDS]; + } cfg; /* config block */ + + Skein_Assert(hashBitLen > 0,SKEIN_BAD_HASHLEN); + ctx->h.hashBitLen = hashBitLen; /* output hash bit count */ + + switch (hashBitLen) + { /* use pre-computed values, where available */ +#ifndef SKEIN_NO_PRECOMP + case 512: memcpy(ctx->X,SKEIN_512_IV_512,sizeof(ctx->X)); break; + case 384: memcpy(ctx->X,SKEIN_512_IV_384,sizeof(ctx->X)); break; + case 256: memcpy(ctx->X,SKEIN_512_IV_256,sizeof(ctx->X)); break; + case 224: memcpy(ctx->X,SKEIN_512_IV_224,sizeof(ctx->X)); break; +#endif + default: + /* here if there is no precomputed IV value available */ + /* build/process the config block, type == CONFIG (could be precomputed) */ + Skein_Start_New_Type(ctx,CFG_FINAL); /* set tweaks: T0=0; T1=CFG | FINAL */ + + cfg.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER); /* set the schema, version */ + cfg.w[1] = Skein_Swap64(hashBitLen); /* hash result length in bits */ + cfg.w[2] = Skein_Swap64(SKEIN_CFG_TREE_INFO_SEQUENTIAL); + memset(&cfg.w[3],0,sizeof(cfg) - 3*sizeof(cfg.w[0])); /* zero pad config block */ + + /* compute the initial chaining values from config block */ + memset(ctx->X,0,sizeof(ctx->X)); /* zero the chaining variables */ + Skein_512_Process_Block(ctx,cfg.b,1,SKEIN_CFG_STR_LEN); + break; + } + + /* The chaining vars ctx->X are now initialized for the given hashBitLen. 
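/* [Illustrative sketch, not from the patch: the output stage above runs
 * Threefish in "counter mode" -- for output block i the 8-byte counter i is
 * processed under the OUT_FINAL tweak and up to one state-size worth of the
 * resulting chaining value is copied out. This stand-alone helper reproduces
 * only the byte-count arithmetic of that loop. ] */
#include <stddef.h>
#include <stdio.h>

static void skein_output_plan(size_t hashBitLen, size_t blockBytes)
{
    size_t byteCnt = (hashBitLen + 7) >> 3;               /* total output bytes */
    for (size_t i = 0; i * blockBytes < byteCnt; i++)
    {
        size_t n = byteCnt - i * blockBytes;              /* bytes still to emit */
        if (n > blockBytes)
            n = blockBytes;
        printf("counter block %zu -> emit %zu bytes\n", i, n);
    }
}

int main(void)
{
    skein_output_plan(256, 32);   /* Skein-256-256: one 32-byte counter block */
    skein_output_plan(512, 64);   /* Skein-512-512: one 64-byte counter block */
    return 0;
}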
*/ + /* Set up to process the data message portion of the hash (default) */ + Skein_Start_New_Type(ctx,MSG); /* T0=0, T1= MSG type */ + + return SKEIN_SUCCESS; + } + +#if 0 +/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ +/* init the context for a MAC and/or tree hash operation */ +/* [identical to Skein_512_Init() when keyBytes == 0 && treeInfo == SKEIN_CFG_TREE_INFO_SEQUENTIAL] */ +static int Skein_512_InitExt(Skein_512_Ctxt_t *ctx,size_t hashBitLen,u64b_t treeInfo, const u08b_t *key, size_t keyBytes) + { + union + { + u08b_t b[SKEIN_512_STATE_BYTES]; + u64b_t w[SKEIN_512_STATE_WORDS]; + } cfg; /* config block */ + + Skein_Assert(hashBitLen > 0,SKEIN_BAD_HASHLEN); + Skein_Assert(keyBytes == 0 || key != NULL,SKEIN_FAIL); + + /* compute the initial chaining values ctx->X[], based on key */ + if (keyBytes == 0) /* is there a key? */ + { + memset(ctx->X,0,sizeof(ctx->X)); /* no key: use all zeroes as key for config block */ + } + else /* here to pre-process a key */ + { + Skein_assert(sizeof(cfg.b) >= sizeof(ctx->X)); + /* do a mini-Init right here */ + ctx->h.hashBitLen=8*sizeof(ctx->X); /* set output hash bit count = state size */ + Skein_Start_New_Type(ctx,KEY); /* set tweaks: T0 = 0; T1 = KEY type */ + memset(ctx->X,0,sizeof(ctx->X)); /* zero the initial chaining variables */ + Skein_512_Update(ctx,key,keyBytes); /* hash the key */ + Skein_512_Final_Pad(ctx,cfg.b); /* put result into cfg.b[] */ + memcpy(ctx->X,cfg.b,sizeof(cfg.b)); /* copy over into ctx->X[] */ +#if SKEIN_NEED_SWAP + { + uint_t i; + for (i=0;i<SKEIN_512_STATE_WORDS;i++) /* convert key bytes to context words */ + ctx->X[i] = Skein_Swap64(ctx->X[i]); + } +#endif + } + /* build/process the config block, type == CONFIG (could be precomputed for each key) */ + ctx->h.hashBitLen = hashBitLen; /* output hash bit count */ + Skein_Start_New_Type(ctx,CFG_FINAL); + + memset(&cfg.w,0,sizeof(cfg.w)); /* pre-pad cfg.w[] with zeroes */ + cfg.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER); + cfg.w[1] = Skein_Swap64(hashBitLen); /* hash result length in bits */ + cfg.w[2] = Skein_Swap64(treeInfo); /* tree hash config info (or SKEIN_CFG_TREE_INFO_SEQUENTIAL) */ + + Skein_Show_Key(512,&ctx->h,key,keyBytes); + + /* compute the initial chaining values from config block */ + Skein_512_Process_Block(ctx,cfg.b,1,SKEIN_CFG_STR_LEN); + + /* The chaining vars ctx->X are now initialized */ + /* Set up to process the data message portion of the hash (default) */ + ctx->h.bCnt = 0; /* buffer b[] starts out empty */ + Skein_Start_New_Type(ctx,MSG); + + return SKEIN_SUCCESS; + } +#endif + +/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ +/* process the input bytes */ +static int Skein_512_Update(Skein_512_Ctxt_t *ctx, const u08b_t *msg, size_t msgByteCnt) + { + size_t n; + + Skein_Assert(ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES,SKEIN_FAIL); /* catch uninitialized context */ + + /* process full blocks, if any */ + if (msgByteCnt + ctx->h.bCnt > SKEIN_512_BLOCK_BYTES) + { + if (ctx->h.bCnt) /* finish up any buffered message data */ + { + n = SKEIN_512_BLOCK_BYTES - ctx->h.bCnt; /* # bytes free in buffer b[] */ + if (n) + { + Skein_assert(n < msgByteCnt); /* check on our logic here */ + memcpy(&ctx->b[ctx->h.bCnt],msg,n); + msgByteCnt -= n; + msg += n; + ctx->h.bCnt += n; + } + Skein_assert(ctx->h.bCnt == SKEIN_512_BLOCK_BYTES); + Skein_512_Process_Block(ctx,ctx->b,1,SKEIN_512_BLOCK_BYTES); + ctx->h.bCnt = 0; + } + /* now process any remaining full blocks, directly from input message data */ + if (msgByteCnt > 
SKEIN_512_BLOCK_BYTES) + { + n = (msgByteCnt-1) / SKEIN_512_BLOCK_BYTES; /* number of full blocks to process */ + Skein_512_Process_Block(ctx,msg,n,SKEIN_512_BLOCK_BYTES); + msgByteCnt -= n * SKEIN_512_BLOCK_BYTES; + msg += n * SKEIN_512_BLOCK_BYTES; + } + Skein_assert(ctx->h.bCnt == 0); + } + + /* copy any remaining source message data bytes into b[] */ + if (msgByteCnt) + { + Skein_assert(msgByteCnt + ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES); + memcpy(&ctx->b[ctx->h.bCnt],msg,msgByteCnt); + ctx->h.bCnt += msgByteCnt; + } + + return SKEIN_SUCCESS; + } + +/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ +/* finalize the hash computation and output the result */ +static int Skein_512_Final(Skein_512_Ctxt_t *ctx, u08b_t *hashVal) + { + size_t i,n,byteCnt; + u64b_t X[SKEIN_512_STATE_WORDS]; + Skein_Assert(ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES,SKEIN_FAIL); /* catch uninitialized context */ + + ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL; /* tag as the final block */ + if (ctx->h.bCnt < SKEIN_512_BLOCK_BYTES) /* zero pad b[] if necessary */ + memset(&ctx->b[ctx->h.bCnt],0,SKEIN_512_BLOCK_BYTES - ctx->h.bCnt); + + Skein_512_Process_Block(ctx,ctx->b,1,ctx->h.bCnt); /* process the final block */ + + /* now output the result */ + byteCnt = (ctx->h.hashBitLen + 7) >> 3; /* total number of output bytes */ + + /* run Threefish in "counter mode" to generate output */ + memset(ctx->b,0,sizeof(ctx->b)); /* zero out b[], so it can hold the counter */ + memcpy(X,ctx->X,sizeof(X)); /* keep a local copy of counter mode "key" */ + for (i=0;i*SKEIN_512_BLOCK_BYTES < byteCnt;i++) + { + ((u64b_t *)ctx->b)[0]= Skein_Swap64((u64b_t) i); /* build the counter block */ + Skein_Start_New_Type(ctx,OUT_FINAL); + Skein_512_Process_Block(ctx,ctx->b,1,sizeof(u64b_t)); /* run "counter mode" */ + n = byteCnt - i*SKEIN_512_BLOCK_BYTES; /* number of output bytes left to go */ + if (n >= SKEIN_512_BLOCK_BYTES) + n = SKEIN_512_BLOCK_BYTES; + Skein_Put64_LSB_First(hashVal+i*SKEIN_512_BLOCK_BYTES,ctx->X,n); /* "output" the ctr mode bytes */ + Skein_Show_Final(512,&ctx->h,n,hashVal+i*SKEIN_512_BLOCK_BYTES); + memcpy(ctx->X,X,sizeof(X)); /* restore the counter mode key for next time */ + } + return SKEIN_SUCCESS; + } + +#if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF) +static size_t Skein_512_API_CodeSize(void) + { + return ((u08b_t *) Skein_512_API_CodeSize) - + ((u08b_t *) Skein_512_Init); + } +#endif + +/*****************************************************************/ +/* 1024-bit Skein */ +/*****************************************************************/ +/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ +/* init the context for a straight hashing operation */ +static int Skein1024_Init(Skein1024_Ctxt_t *ctx, size_t hashBitLen) + { + union + { + u08b_t b[SKEIN1024_STATE_BYTES]; + u64b_t w[SKEIN1024_STATE_WORDS]; + } cfg; /* config block */ + + Skein_Assert(hashBitLen > 0,SKEIN_BAD_HASHLEN); + ctx->h.hashBitLen = hashBitLen; /* output hash bit count */ + + switch (hashBitLen) + { /* use pre-computed values, where available */ +#ifndef SKEIN_NO_PRECOMP + case 512: memcpy(ctx->X,SKEIN1024_IV_512 ,sizeof(ctx->X)); break; + case 384: memcpy(ctx->X,SKEIN1024_IV_384 ,sizeof(ctx->X)); break; + case 1024: memcpy(ctx->X,SKEIN1024_IV_1024,sizeof(ctx->X)); break; +#endif + default: + /* here if there is no precomputed IV value available */ + /* build/process the config block, type == CONFIG (could be precomputed) */ + Skein_Start_New_Type(ctx,CFG_FINAL); /* set tweaks: T0=0; T1=CFG | FINAL */ + + cfg.w[0] = 
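/* [Illustrative sketch, not from the patch: generic form of the buffering
 * pattern used by Skein_*_Update() above -- top up and flush a partially
 * filled buffer first, then hash full blocks straight from the caller's data,
 * but keep at least one byte back (note the (len-1)/BLOCK division) so the
 * FINAL-flagged block is always processed later in Final(). The compress()
 * callback stands in for Skein_512_Process_Block(). ] */
#include <stddef.h>
#include <string.h>

#define BLOCK 64                               /* SKEIN_512_BLOCK_BYTES */

typedef struct { unsigned char b[BLOCK]; size_t bCnt; } buf_t;

static void buffered_update(buf_t *s, const unsigned char *msg, size_t len,
                            void (*compress)(const unsigned char *blk, size_t nBlocks))
{
    if (len + s->bCnt > BLOCK)                 /* at least one full block available */
    {
        if (s->bCnt)                           /* finish the buffered partial block first */
        {
            size_t n = BLOCK - s->bCnt;
            memcpy(s->b + s->bCnt, msg, n);
            msg += n; len -= n;
            compress(s->b, 1);
            s->bCnt = 0;
        }
        if (len > BLOCK)                       /* full blocks, minus one held-back byte */
        {
            size_t nBlk = (len - 1) / BLOCK;
            compress(msg, nBlk);
            msg += nBlk * BLOCK; len -= nBlk * BLOCK;
        }
    }
    memcpy(s->b + s->bCnt, msg, len);          /* stash the remainder for the next call */
    s->bCnt += len;
}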
Skein_Swap64(SKEIN_SCHEMA_VER); /* set the schema, version */ + cfg.w[1] = Skein_Swap64(hashBitLen); /* hash result length in bits */ + cfg.w[2] = Skein_Swap64(SKEIN_CFG_TREE_INFO_SEQUENTIAL); + memset(&cfg.w[3],0,sizeof(cfg) - 3*sizeof(cfg.w[0])); /* zero pad config block */ + + /* compute the initial chaining values from config block */ + memset(ctx->X,0,sizeof(ctx->X)); /* zero the chaining variables */ + Skein1024_Process_Block(ctx,cfg.b,1,SKEIN_CFG_STR_LEN); + break; + } + + /* The chaining vars ctx->X are now initialized for the given hashBitLen. */ + /* Set up to process the data message portion of the hash (default) */ + Skein_Start_New_Type(ctx,MSG); /* T0=0, T1= MSG type */ + + return SKEIN_SUCCESS; + } + +#if 0 +/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ +/* init the context for a MAC and/or tree hash operation */ +/* [identical to Skein1024_Init() when keyBytes == 0 && treeInfo == SKEIN_CFG_TREE_INFO_SEQUENTIAL] */ +static int Skein1024_InitExt(Skein1024_Ctxt_t *ctx,size_t hashBitLen,u64b_t treeInfo, const u08b_t *key, size_t keyBytes) + { + union + { + u08b_t b[SKEIN1024_STATE_BYTES]; + u64b_t w[SKEIN1024_STATE_WORDS]; + } cfg; /* config block */ + + Skein_Assert(hashBitLen > 0,SKEIN_BAD_HASHLEN); + Skein_Assert(keyBytes == 0 || key != NULL,SKEIN_FAIL); + + /* compute the initial chaining values ctx->X[], based on key */ + if (keyBytes == 0) /* is there a key? */ + { + memset(ctx->X,0,sizeof(ctx->X)); /* no key: use all zeroes as key for config block */ + } + else /* here to pre-process a key */ + { + Skein_assert(sizeof(cfg.b) >= sizeof(ctx->X)); + /* do a mini-Init right here */ + ctx->h.hashBitLen=8*sizeof(ctx->X); /* set output hash bit count = state size */ + Skein_Start_New_Type(ctx,KEY); /* set tweaks: T0 = 0; T1 = KEY type */ + memset(ctx->X,0,sizeof(ctx->X)); /* zero the initial chaining variables */ + Skein1024_Update(ctx,key,keyBytes); /* hash the key */ + Skein1024_Final_Pad(ctx,cfg.b); /* put result into cfg.b[] */ + memcpy(ctx->X,cfg.b,sizeof(cfg.b)); /* copy over into ctx->X[] */ +#if SKEIN_NEED_SWAP + { + uint_t i; + for (i=0;i<SKEIN1024_STATE_WORDS;i++) /* convert key bytes to context words */ + ctx->X[i] = Skein_Swap64(ctx->X[i]); + } +#endif + } + /* build/process the config block, type == CONFIG (could be precomputed for each key) */ + ctx->h.hashBitLen = hashBitLen; /* output hash bit count */ + Skein_Start_New_Type(ctx,CFG_FINAL); + + memset(&cfg.w,0,sizeof(cfg.w)); /* pre-pad cfg.w[] with zeroes */ + cfg.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER); + cfg.w[1] = Skein_Swap64(hashBitLen); /* hash result length in bits */ + cfg.w[2] = Skein_Swap64(treeInfo); /* tree hash config info (or SKEIN_CFG_TREE_INFO_SEQUENTIAL) */ + + Skein_Show_Key(1024,&ctx->h,key,keyBytes); + + /* compute the initial chaining values from config block */ + Skein1024_Process_Block(ctx,cfg.b,1,SKEIN_CFG_STR_LEN); + + /* The chaining vars ctx->X are now initialized */ + /* Set up to process the data message portion of the hash (default) */ + ctx->h.bCnt = 0; /* buffer b[] starts out empty */ + Skein_Start_New_Type(ctx,MSG); + + return SKEIN_SUCCESS; + } +#endif + +/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ +/* process the input bytes */ +static int Skein1024_Update(Skein1024_Ctxt_t *ctx, const u08b_t *msg, size_t msgByteCnt) + { + size_t n; + + Skein_Assert(ctx->h.bCnt <= SKEIN1024_BLOCK_BYTES,SKEIN_FAIL); /* catch uninitialized context */ + + /* process full blocks, if any */ + if (msgByteCnt + ctx->h.bCnt > SKEIN1024_BLOCK_BYTES) + { + 
if (ctx->h.bCnt) /* finish up any buffered message data */ + { + n = SKEIN1024_BLOCK_BYTES - ctx->h.bCnt; /* # bytes free in buffer b[] */ + if (n) + { + Skein_assert(n < msgByteCnt); /* check on our logic here */ + memcpy(&ctx->b[ctx->h.bCnt],msg,n); + msgByteCnt -= n; + msg += n; + ctx->h.bCnt += n; + } + Skein_assert(ctx->h.bCnt == SKEIN1024_BLOCK_BYTES); + Skein1024_Process_Block(ctx,ctx->b,1,SKEIN1024_BLOCK_BYTES); + ctx->h.bCnt = 0; + } + /* now process any remaining full blocks, directly from input message data */ + if (msgByteCnt > SKEIN1024_BLOCK_BYTES) + { + n = (msgByteCnt-1) / SKEIN1024_BLOCK_BYTES; /* number of full blocks to process */ + Skein1024_Process_Block(ctx,msg,n,SKEIN1024_BLOCK_BYTES); + msgByteCnt -= n * SKEIN1024_BLOCK_BYTES; + msg += n * SKEIN1024_BLOCK_BYTES; + } + Skein_assert(ctx->h.bCnt == 0); + } + + /* copy any remaining source message data bytes into b[] */ + if (msgByteCnt) + { + Skein_assert(msgByteCnt + ctx->h.bCnt <= SKEIN1024_BLOCK_BYTES); + memcpy(&ctx->b[ctx->h.bCnt],msg,msgByteCnt); + ctx->h.bCnt += msgByteCnt; + } + + return SKEIN_SUCCESS; + } + +/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ +/* finalize the hash computation and output the result */ +static int Skein1024_Final(Skein1024_Ctxt_t *ctx, u08b_t *hashVal) + { + size_t i,n,byteCnt; + u64b_t X[SKEIN1024_STATE_WORDS]; + Skein_Assert(ctx->h.bCnt <= SKEIN1024_BLOCK_BYTES,SKEIN_FAIL); /* catch uninitialized context */ + + ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL; /* tag as the final block */ + if (ctx->h.bCnt < SKEIN1024_BLOCK_BYTES) /* zero pad b[] if necessary */ + memset(&ctx->b[ctx->h.bCnt],0,SKEIN1024_BLOCK_BYTES - ctx->h.bCnt); + + Skein1024_Process_Block(ctx,ctx->b,1,ctx->h.bCnt); /* process the final block */ + + /* now output the result */ + byteCnt = (ctx->h.hashBitLen + 7) >> 3; /* total number of output bytes */ + + /* run Threefish in "counter mode" to generate output */ + memset(ctx->b,0,sizeof(ctx->b)); /* zero out b[], so it can hold the counter */ + memcpy(X,ctx->X,sizeof(X)); /* keep a local copy of counter mode "key" */ + for (i=0;i*SKEIN1024_BLOCK_BYTES < byteCnt;i++) + { + ((u64b_t *)ctx->b)[0]= Skein_Swap64((u64b_t) i); /* build the counter block */ + Skein_Start_New_Type(ctx,OUT_FINAL); + Skein1024_Process_Block(ctx,ctx->b,1,sizeof(u64b_t)); /* run "counter mode" */ + n = byteCnt - i*SKEIN1024_BLOCK_BYTES; /* number of output bytes left to go */ + if (n >= SKEIN1024_BLOCK_BYTES) + n = SKEIN1024_BLOCK_BYTES; + Skein_Put64_LSB_First(hashVal+i*SKEIN1024_BLOCK_BYTES,ctx->X,n); /* "output" the ctr mode bytes */ + Skein_Show_Final(1024,&ctx->h,n,hashVal+i*SKEIN1024_BLOCK_BYTES); + memcpy(ctx->X,X,sizeof(X)); /* restore the counter mode key for next time */ + } + return SKEIN_SUCCESS; + } + +#if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF) +static size_t Skein1024_API_CodeSize(void) + { + return ((u08b_t *) Skein1024_API_CodeSize) - + ((u08b_t *) Skein1024_Init); + } +#endif + +/**************** Functions to support MAC/tree hashing ***************/ +/* (this code is identical for Optimized and Reference versions) */ + +#if 0 +/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ +/* finalize the hash computation and output the block, no OUTPUT stage */ +static int Skein_256_Final_Pad(Skein_256_Ctxt_t *ctx, u08b_t *hashVal) + { + Skein_Assert(ctx->h.bCnt <= SKEIN_256_BLOCK_BYTES,SKEIN_FAIL); /* catch uninitialized context */ + + ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL; /* tag as the final block */ + if (ctx->h.bCnt < SKEIN_256_BLOCK_BYTES) /* zero 
pad b[] if necessary */ + memset(&ctx->b[ctx->h.bCnt],0,SKEIN_256_BLOCK_BYTES - ctx->h.bCnt); + Skein_256_Process_Block(ctx,ctx->b,1,ctx->h.bCnt); /* process the final block */ + + Skein_Put64_LSB_First(hashVal,ctx->X,SKEIN_256_BLOCK_BYTES); /* "output" the state bytes */ + + return SKEIN_SUCCESS; + } + +/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ +/* finalize the hash computation and output the block, no OUTPUT stage */ +static int Skein_512_Final_Pad(Skein_512_Ctxt_t *ctx, u08b_t *hashVal) + { + Skein_Assert(ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES,SKEIN_FAIL); /* catch uninitialized context */ + + ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL; /* tag as the final block */ + if (ctx->h.bCnt < SKEIN_512_BLOCK_BYTES) /* zero pad b[] if necessary */ + memset(&ctx->b[ctx->h.bCnt],0,SKEIN_512_BLOCK_BYTES - ctx->h.bCnt); + Skein_512_Process_Block(ctx,ctx->b,1,ctx->h.bCnt); /* process the final block */ + + Skein_Put64_LSB_First(hashVal,ctx->X,SKEIN_512_BLOCK_BYTES); /* "output" the state bytes */ + + return SKEIN_SUCCESS; + } + +/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ +/* finalize the hash computation and output the block, no OUTPUT stage */ +static int Skein1024_Final_Pad(Skein1024_Ctxt_t *ctx, u08b_t *hashVal) + { + Skein_Assert(ctx->h.bCnt <= SKEIN1024_BLOCK_BYTES,SKEIN_FAIL); /* catch uninitialized context */ + + ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL; /* tag as the final block */ + if (ctx->h.bCnt < SKEIN1024_BLOCK_BYTES) /* zero pad b[] if necessary */ + memset(&ctx->b[ctx->h.bCnt],0,SKEIN1024_BLOCK_BYTES - ctx->h.bCnt); + Skein1024_Process_Block(ctx,ctx->b,1,ctx->h.bCnt); /* process the final block */ + + Skein_Put64_LSB_First(hashVal,ctx->X,SKEIN1024_BLOCK_BYTES); /* "output" the state bytes */ + + return SKEIN_SUCCESS; + } + + +#if SKEIN_TREE_HASH +/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ +/* just do the OUTPUT stage */ +static int Skein_256_Output(Skein_256_Ctxt_t *ctx, u08b_t *hashVal) + { + size_t i,n,byteCnt; + u64b_t X[SKEIN_256_STATE_WORDS]; + Skein_Assert(ctx->h.bCnt <= SKEIN_256_BLOCK_BYTES,SKEIN_FAIL); /* catch uninitialized context */ + + /* now output the result */ + byteCnt = (ctx->h.hashBitLen + 7) >> 3; /* total number of output bytes */ + + /* run Threefish in "counter mode" to generate output */ + memset(ctx->b,0,sizeof(ctx->b)); /* zero out b[], so it can hold the counter */ + memcpy(X,ctx->X,sizeof(X)); /* keep a local copy of counter mode "key" */ + for (i=0;i*SKEIN_256_BLOCK_BYTES < byteCnt;i++) + { + ((u64b_t *)ctx->b)[0]= Skein_Swap64((u64b_t) i); /* build the counter block */ + Skein_Start_New_Type(ctx,OUT_FINAL); + Skein_256_Process_Block(ctx,ctx->b,1,sizeof(u64b_t)); /* run "counter mode" */ + n = byteCnt - i*SKEIN_256_BLOCK_BYTES; /* number of output bytes left to go */ + if (n >= SKEIN_256_BLOCK_BYTES) + n = SKEIN_256_BLOCK_BYTES; + Skein_Put64_LSB_First(hashVal+i*SKEIN_256_BLOCK_BYTES,ctx->X,n); /* "output" the ctr mode bytes */ + Skein_Show_Final(256,&ctx->h,n,hashVal+i*SKEIN_256_BLOCK_BYTES); + memcpy(ctx->X,X,sizeof(X)); /* restore the counter mode key for next time */ + } + return SKEIN_SUCCESS; + } + +/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ +/* just do the OUTPUT stage */ +static int Skein_512_Output(Skein_512_Ctxt_t *ctx, u08b_t *hashVal) + { + size_t i,n,byteCnt; + u64b_t X[SKEIN_512_STATE_WORDS]; + Skein_Assert(ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES,SKEIN_FAIL); /* catch uninitialized context */ + + /* now output the result */ + byteCnt = (ctx->h.hashBitLen + 
7) >> 3; /* total number of output bytes */ + + /* run Threefish in "counter mode" to generate output */ + memset(ctx->b,0,sizeof(ctx->b)); /* zero out b[], so it can hold the counter */ + memcpy(X,ctx->X,sizeof(X)); /* keep a local copy of counter mode "key" */ + for (i=0;i*SKEIN_512_BLOCK_BYTES < byteCnt;i++) + { + ((u64b_t *)ctx->b)[0]= Skein_Swap64((u64b_t) i); /* build the counter block */ + Skein_Start_New_Type(ctx,OUT_FINAL); + Skein_512_Process_Block(ctx,ctx->b,1,sizeof(u64b_t)); /* run "counter mode" */ + n = byteCnt - i*SKEIN_512_BLOCK_BYTES; /* number of output bytes left to go */ + if (n >= SKEIN_512_BLOCK_BYTES) + n = SKEIN_512_BLOCK_BYTES; + Skein_Put64_LSB_First(hashVal+i*SKEIN_512_BLOCK_BYTES,ctx->X,n); /* "output" the ctr mode bytes */ + Skein_Show_Final(256,&ctx->h,n,hashVal+i*SKEIN_512_BLOCK_BYTES); + memcpy(ctx->X,X,sizeof(X)); /* restore the counter mode key for next time */ + } + return SKEIN_SUCCESS; + } + +/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ +/* just do the OUTPUT stage */ +static int Skein1024_Output(Skein1024_Ctxt_t *ctx, u08b_t *hashVal) + { + size_t i,n,byteCnt; + u64b_t X[SKEIN1024_STATE_WORDS]; + Skein_Assert(ctx->h.bCnt <= SKEIN1024_BLOCK_BYTES,SKEIN_FAIL); /* catch uninitialized context */ + + /* now output the result */ + byteCnt = (ctx->h.hashBitLen + 7) >> 3; /* total number of output bytes */ + + /* run Threefish in "counter mode" to generate output */ + memset(ctx->b,0,sizeof(ctx->b)); /* zero out b[], so it can hold the counter */ + memcpy(X,ctx->X,sizeof(X)); /* keep a local copy of counter mode "key" */ + for (i=0;i*SKEIN1024_BLOCK_BYTES < byteCnt;i++) + { + ((u64b_t *)ctx->b)[0]= Skein_Swap64((u64b_t) i); /* build the counter block */ + Skein_Start_New_Type(ctx,OUT_FINAL); + Skein1024_Process_Block(ctx,ctx->b,1,sizeof(u64b_t)); /* run "counter mode" */ + n = byteCnt - i*SKEIN1024_BLOCK_BYTES; /* number of output bytes left to go */ + if (n >= SKEIN1024_BLOCK_BYTES) + n = SKEIN1024_BLOCK_BYTES; + Skein_Put64_LSB_First(hashVal+i*SKEIN1024_BLOCK_BYTES,ctx->X,n); /* "output" the ctr mode bytes */ + Skein_Show_Final(256,&ctx->h,n,hashVal+i*SKEIN1024_BLOCK_BYTES); + memcpy(ctx->X,X,sizeof(X)); /* restore the counter mode key for next time */ + } + return SKEIN_SUCCESS; + } +#endif +#endif + +typedef struct +{ + uint_t statebits; /* 256, 512, or 1024 */ + union + { + Skein_Ctxt_Hdr_t h; /* common header "overlay" */ + Skein_256_Ctxt_t ctx_256; + Skein_512_Ctxt_t ctx_512; + Skein1024_Ctxt_t ctx1024; + } u; +} +hashState; + +/* "incremental" hashing API */ +static SkeinHashReturn Init (hashState *state, int hashbitlen); +static SkeinHashReturn Update(hashState *state, const SkeinBitSequence *data, SkeinDataLength databitlen); +static SkeinHashReturn Final (hashState *state, SkeinBitSequence *hashval); + +/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ +/* select the context size and init the context */ +static SkeinHashReturn Init(hashState *state, int hashbitlen) +{ +#if SKEIN_256_NIST_MAX_HASH_BITS + if (hashbitlen <= SKEIN_256_NIST_MAX_HASHBITS) + { + Skein_Assert(hashbitlen > 0,BAD_HASHLEN); + state->statebits = 64*SKEIN_256_STATE_WORDS; + return Skein_256_Init(&state->u.ctx_256,(size_t) hashbitlen); + } +#endif + if (hashbitlen <= SKEIN_512_NIST_MAX_HASHBITS) + { + state->statebits = 64*SKEIN_512_STATE_WORDS; + return Skein_512_Init(&state->u.ctx_512,(size_t) hashbitlen); + } + else + { + state->statebits = 64*SKEIN1024_STATE_WORDS; + return Skein1024_Init(&state->u.ctx1024,(size_t) hashbitlen); + } 
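/* [Illustrative sketch, not from the patch: the Init()/Update()/Final()
 * dispatch in the incremental API keys off ((statebits >> 8) & 3), which maps
 * a 256-bit state to case 1, 512-bit to case 2 and 1024-bit to case 0 -- the
 * reason "case 0" is the 1024-bit context. Quick stand-alone check of that
 * mapping. ] */
#include <stdio.h>

int main(void)
{
    const unsigned statebits[3] = { 256, 512, 1024 };
    for (int i = 0; i < 3; i++)
        printf("statebits=%4u -> case %u\n", statebits[i], (statebits[i] >> 8) & 3);
    /* prints: 256 -> 1, 512 -> 2, 1024 -> 0 */
    return 0;
}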
+} + +/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ +/* process data to be hashed */ +static SkeinHashReturn Update(hashState *state, const SkeinBitSequence *data, SkeinDataLength databitlen) +{ + /* only the final Update() call is allowed do partial bytes, else assert an error */ + Skein_Assert((state->u.h.T[1] & SKEIN_T1_FLAG_BIT_PAD) == 0 || databitlen == 0, SKEIN_FAIL); + + Skein_Assert(state->statebits % 256 == 0 && (state->statebits-256) < 1024,SKEIN_FAIL); + if ((databitlen & 7) == 0) /* partial bytes? */ + { + switch ((state->statebits >> 8) & 3) + { + case 2: return Skein_512_Update(&state->u.ctx_512,data,databitlen >> 3); + case 1: return Skein_256_Update(&state->u.ctx_256,data,databitlen >> 3); + case 0: return Skein1024_Update(&state->u.ctx1024,data,databitlen >> 3); + default: return SKEIN_FAIL; + } + } + else + { /* handle partial final byte */ + size_t bCnt = (databitlen >> 3) + 1; /* number of bytes to handle (nonzero here!) */ + u08b_t b,mask; + + mask = (u08b_t) (1u << (7 - (databitlen & 7))); /* partial byte bit mask */ + b = (u08b_t) ((data[bCnt-1] & (0-mask)) | mask); /* apply bit padding on final byte */ + + switch ((state->statebits >> 8) & 3) + { + case 2: Skein_512_Update(&state->u.ctx_512,data,bCnt-1); /* process all but the final byte */ + Skein_512_Update(&state->u.ctx_512,&b , 1 ); /* process the (masked) partial byte */ + break; + case 1: Skein_256_Update(&state->u.ctx_256,data,bCnt-1); /* process all but the final byte */ + Skein_256_Update(&state->u.ctx_256,&b , 1 ); /* process the (masked) partial byte */ + break; + case 0: Skein1024_Update(&state->u.ctx1024,data,bCnt-1); /* process all but the final byte */ + Skein1024_Update(&state->u.ctx1024,&b , 1 ); /* process the (masked) partial byte */ + break; + default: return SKEIN_FAIL; + } + Skein_Set_Bit_Pad_Flag(state->u.h); /* set tweak flag for the final call */ + + return SKEIN_SUCCESS; + } +} + +/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ +/* finalize hash computation and output the result (hashbitlen bits) */ +static SkeinHashReturn Final(hashState *state, SkeinBitSequence *hashval) +{ + Skein_Assert(state->statebits % 256 == 0 && (state->statebits-256) < 1024,FAIL); + switch ((state->statebits >> 8) & 3) + { + case 2: return Skein_512_Final(&state->u.ctx_512,hashval); + case 1: return Skein_256_Final(&state->u.ctx_256,hashval); + case 0: return Skein1024_Final(&state->u.ctx1024,hashval); + default: return SKEIN_FAIL; + } +} + +/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ +/* all-in-one hash function */ +SkeinHashReturn skein_hash(int hashbitlen, const SkeinBitSequence *data, /* all-in-one call */ + SkeinDataLength databitlen,SkeinBitSequence *hashval) +{ + hashState state; + SkeinHashReturn r = Init(&state,hashbitlen); + if (r == SKEIN_SUCCESS) + { /* these calls do not fail when called properly */ + r = Update(&state,data,databitlen); + Final(&state,hashval); + } + return r; +} diff --git a/xmrstak/backend/cpu/crypto/c_skein.h b/xmrstak/backend/cpu/crypto/c_skein.h new file mode 100644 index 0000000..6165a2a --- /dev/null +++ b/xmrstak/backend/cpu/crypto/c_skein.h @@ -0,0 +1,47 @@ +#ifndef _SKEIN_H_ +#define _SKEIN_H_ 1 +/************************************************************************** +** +** Interface declarations and internal definitions for Skein hashing. +** +** Source code author: Doug Whiting, 2008. +** +** This algorithm and source code is released to the public domain. 
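/* [Illustrative sketch, not from the patch: reproduces the bit padding of a
 * partial final byte in Update() above -- keep the valid high-order bits, set
 * the next bit to 1, clear the rest (Skein's 10* padding). Worked here for a
 * 12-bit message. ] */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    size_t  databitlen = 12;                               /* 1 full byte + 4 valid bits */
    uint8_t data[2]    = { 0xAB, 0xCD };                   /* only the top 4 bits of 0xCD count */

    size_t  bCnt = (databitlen >> 3) + 1;                  /* = 2 bytes handed to Update() */
    uint8_t mask = (uint8_t)(1u << (7 - (databitlen & 7)));        /* = 0x08 */
    uint8_t b    = (uint8_t)((data[bCnt - 1] & (0 - mask)) | mask);

    printf("bCnt=%zu mask=0x%02X padded=0x%02X\n", bCnt, mask, b); /* padded = 0xC8 */
    return 0;
}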
+** +*************************************************************************** +** +** The following compile-time switches may be defined to control some +** tradeoffs between speed, code size, error checking, and security. +** +** The "default" note explains what happens when the switch is not defined. +** +** SKEIN_DEBUG -- make callouts from inside Skein code +** to examine/display intermediate values. +** [default: no callouts (no overhead)] +** +** SKEIN_ERR_CHECK -- how error checking is handled inside Skein +** code. If not defined, most error checking +** is disabled (for performance). Otherwise, +** the switch value is interpreted as: +** 0: use assert() to flag errors +** 1: return SKEIN_FAIL to flag errors +** +***************************************************************************/ +#include "skein_port.h" /* get platform-specific definitions */ + +typedef enum +{ + SKEIN_SUCCESS = 0, /* return codes from Skein calls */ + SKEIN_FAIL = 1, + SKEIN_BAD_HASHLEN = 2 +} +SkeinHashReturn; + +typedef size_t SkeinDataLength; /* bit count type */ +typedef u08b_t SkeinBitSequence; /* bit stream type */ + +/* "all-in-one" call */ +SkeinHashReturn skein_hash(int hashbitlen, const SkeinBitSequence *data, + SkeinDataLength databitlen, SkeinBitSequence *hashval); + +#endif /* ifndef _SKEIN_H_ */ diff --git a/xmrstak/backend/cpu/crypto/cryptonight.h b/xmrstak/backend/cpu/crypto/cryptonight.h new file mode 100644 index 0000000..978c798 --- /dev/null +++ b/xmrstak/backend/cpu/crypto/cryptonight.h @@ -0,0 +1,31 @@ +#ifndef __CRYPTONIGHT_H_INCLUDED +#define __CRYPTONIGHT_H_INCLUDED + +#ifdef __cplusplus +extern "C" { +#endif + +#include <stddef.h> +#include <inttypes.h> + +#define MEMORY 2097152 + +typedef struct { + uint8_t hash_state[224]; // Need only 200, explicit align + uint8_t* long_state; + uint8_t ctx_info[24]; //Use some of the extra memory for flags +} cryptonight_ctx; + +typedef struct { + const char* warning; +} alloc_msg; + +size_t cryptonight_init(size_t use_fast_mem, size_t use_mlock, alloc_msg* msg); +cryptonight_ctx* cryptonight_alloc_ctx(size_t use_fast_mem, size_t use_mlock, alloc_msg* msg); +void cryptonight_free_ctx(cryptonight_ctx* ctx); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h new file mode 100644 index 0000000..8bbb27c --- /dev/null +++ b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h @@ -0,0 +1,457 @@ +/* + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
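/* [Illustrative sketch, not from the patch: minimal use of the all-in-one
 * skein_hash() declared in c_skein.h, in the same way do_skein_hash() in
 * cryptonight_common.cpp calls it -- bit counts rather than byte counts, and
 * a 256-bit (32-byte) digest. Link against the c_skein implementation in this
 * directory; no test vector is asserted here. ] */
#include <stdio.h>
#include <string.h>
#include "c_skein.h"

int main(void)
{
    const char *msg = "abc";
    SkeinBitSequence digest[32];

    skein_hash(8 * 32, (const SkeinBitSequence*)msg, 8 * strlen(msg), digest);

    for (int i = 0; i < 32; i++)
        printf("%02x", digest[i]);
    printf("\n");
    return 0;
}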
+ * + */ +#pragma once + +#include "cryptonight.h" +#include <memory.h> +#include <stdio.h> + +#ifdef __GNUC__ +#include <x86intrin.h> +static inline uint64_t _umul128(uint64_t a, uint64_t b, uint64_t* hi) +{ + unsigned __int128 r = (unsigned __int128)a * (unsigned __int128)b; + *hi = r >> 64; + return (uint64_t)r; +} +#define _mm256_set_m128i(v0, v1) _mm256_insertf128_si256(_mm256_castsi128_si256(v1), (v0), 1) +#else +#include <intrin.h> +#endif // __GNUC__ + +#if !defined(_LP64) && !defined(_WIN64) +#error You are trying to do a 32-bit build. This will all end in tears. I know it. +#endif + +extern "C" +{ + void keccak(const uint8_t *in, int inlen, uint8_t *md, int mdlen); + void keccakf(uint64_t st[25], int rounds); + extern void(*const extra_hashes[4])(const void *, size_t, char *); + + __m128i soft_aesenc(__m128i in, __m128i key); + __m128i soft_aeskeygenassist(__m128i key, uint8_t rcon); +} + +// This will shift and xor tmp1 into itself as 4 32-bit vals such as +// sl_xor(a1 a2 a3 a4) = a1 (a2^a1) (a3^a2^a1) (a4^a3^a2^a1) +static inline __m128i sl_xor(__m128i tmp1) +{ + __m128i tmp4; + tmp4 = _mm_slli_si128(tmp1, 0x04); + tmp1 = _mm_xor_si128(tmp1, tmp4); + tmp4 = _mm_slli_si128(tmp4, 0x04); + tmp1 = _mm_xor_si128(tmp1, tmp4); + tmp4 = _mm_slli_si128(tmp4, 0x04); + tmp1 = _mm_xor_si128(tmp1, tmp4); + return tmp1; +} + +template<uint8_t rcon> +static inline void aes_genkey_sub(__m128i* xout0, __m128i* xout2) +{ + __m128i xout1 = _mm_aeskeygenassist_si128(*xout2, rcon); + xout1 = _mm_shuffle_epi32(xout1, 0xFF); // see PSHUFD, set all elems to 4th elem + *xout0 = sl_xor(*xout0); + *xout0 = _mm_xor_si128(*xout0, xout1); + xout1 = _mm_aeskeygenassist_si128(*xout0, 0x00); + xout1 = _mm_shuffle_epi32(xout1, 0xAA); // see PSHUFD, set all elems to 3rd elem + *xout2 = sl_xor(*xout2); + *xout2 = _mm_xor_si128(*xout2, xout1); +} + +static inline void soft_aes_genkey_sub(__m128i* xout0, __m128i* xout2, uint8_t rcon) +{ + __m128i xout1 = soft_aeskeygenassist(*xout2, rcon); + xout1 = _mm_shuffle_epi32(xout1, 0xFF); // see PSHUFD, set all elems to 4th elem + *xout0 = sl_xor(*xout0); + *xout0 = _mm_xor_si128(*xout0, xout1); + xout1 = soft_aeskeygenassist(*xout0, 0x00); + xout1 = _mm_shuffle_epi32(xout1, 0xAA); // see PSHUFD, set all elems to 3rd elem + *xout2 = sl_xor(*xout2); + *xout2 = _mm_xor_si128(*xout2, xout1); +} + +template<bool SOFT_AES> +static inline void aes_genkey(const __m128i* memory, __m128i* k0, __m128i* k1, __m128i* k2, __m128i* k3, + __m128i* k4, __m128i* k5, __m128i* k6, __m128i* k7, __m128i* k8, __m128i* k9) +{ + __m128i xout0, xout2; + + xout0 = _mm_load_si128(memory); + xout2 = _mm_load_si128(memory+1); + *k0 = xout0; + *k1 = xout2; + + if(SOFT_AES) + soft_aes_genkey_sub(&xout0, &xout2, 0x01); + else + aes_genkey_sub<0x01>(&xout0, &xout2); + *k2 = xout0; + *k3 = xout2; + + if(SOFT_AES) + soft_aes_genkey_sub(&xout0, &xout2, 0x02); + else + aes_genkey_sub<0x02>(&xout0, &xout2); + *k4 = xout0; + *k5 = xout2; + + if(SOFT_AES) + soft_aes_genkey_sub(&xout0, &xout2, 0x04); + else + aes_genkey_sub<0x04>(&xout0, &xout2); + *k6 = xout0; + *k7 = xout2; + + if(SOFT_AES) + soft_aes_genkey_sub(&xout0, &xout2, 0x08); + else + aes_genkey_sub<0x08>(&xout0, &xout2); + *k8 = xout0; + *k9 = xout2; +} + +static inline void aes_round(__m128i key, __m128i* x0, __m128i* x1, __m128i* x2, __m128i* x3, __m128i* x4, __m128i* x5, __m128i* x6, __m128i* x7) +{ + *x0 = _mm_aesenc_si128(*x0, key); + *x1 = _mm_aesenc_si128(*x1, key); + *x2 = _mm_aesenc_si128(*x2, key); + *x3 = _mm_aesenc_si128(*x3, key); + *x4 
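/* [Illustrative sketch, not from the patch: quick stand-alone check of the
 * _umul128() emulation above -- on GCC/Clang the full 64x64 -> 128-bit
 * product is taken via the unsigned __int128 extension and split into low and
 * high halves; MSVC gets the intrinsic from <intrin.h> instead. The helper
 * name here is hypothetical. ] */
#include <cstdint>
#include <cstdio>

static inline uint64_t umul128_portable(uint64_t a, uint64_t b, uint64_t* hi)
{
    unsigned __int128 r = (unsigned __int128)a * (unsigned __int128)b;
    *hi = (uint64_t)(r >> 64);
    return (uint64_t)r;
}

int main()
{
    uint64_t hi = 0;
    uint64_t lo = umul128_portable(0xFFFFFFFFFFFFFFFFULL, 2, &hi);
    std::printf("hi=0x%016llx lo=0x%016llx\n",
                (unsigned long long)hi, (unsigned long long)lo);
    /* 2 * (2^64 - 1) = 2^65 - 2  ->  hi = 1, lo = 0xFFFFFFFFFFFFFFFE */
    return 0;
}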
= _mm_aesenc_si128(*x4, key); + *x5 = _mm_aesenc_si128(*x5, key); + *x6 = _mm_aesenc_si128(*x6, key); + *x7 = _mm_aesenc_si128(*x7, key); +} + +static inline void soft_aes_round(__m128i key, __m128i* x0, __m128i* x1, __m128i* x2, __m128i* x3, __m128i* x4, __m128i* x5, __m128i* x6, __m128i* x7) +{ + *x0 = soft_aesenc(*x0, key); + *x1 = soft_aesenc(*x1, key); + *x2 = soft_aesenc(*x2, key); + *x3 = soft_aesenc(*x3, key); + *x4 = soft_aesenc(*x4, key); + *x5 = soft_aesenc(*x5, key); + *x6 = soft_aesenc(*x6, key); + *x7 = soft_aesenc(*x7, key); +} + +template<size_t MEM, bool SOFT_AES, bool PREFETCH> +void cn_explode_scratchpad(const __m128i* input, __m128i* output) +{ + // This is more than we have registers, compiler will assign 2 keys on the stack + __m128i xin0, xin1, xin2, xin3, xin4, xin5, xin6, xin7; + __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9; + + aes_genkey<SOFT_AES>(input, &k0, &k1, &k2, &k3, &k4, &k5, &k6, &k7, &k8, &k9); + + xin0 = _mm_load_si128(input + 4); + xin1 = _mm_load_si128(input + 5); + xin2 = _mm_load_si128(input + 6); + xin3 = _mm_load_si128(input + 7); + xin4 = _mm_load_si128(input + 8); + xin5 = _mm_load_si128(input + 9); + xin6 = _mm_load_si128(input + 10); + xin7 = _mm_load_si128(input + 11); + + for (size_t i = 0; i < MEM / sizeof(__m128i); i += 8) + { + if(SOFT_AES) + { + soft_aes_round(k0, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + soft_aes_round(k1, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + soft_aes_round(k2, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + soft_aes_round(k3, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + soft_aes_round(k4, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + soft_aes_round(k5, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + soft_aes_round(k6, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + soft_aes_round(k7, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + soft_aes_round(k8, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + soft_aes_round(k9, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + } + else + { + aes_round(k0, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + aes_round(k1, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + aes_round(k2, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + aes_round(k3, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + aes_round(k4, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + aes_round(k5, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + aes_round(k6, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + aes_round(k7, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + aes_round(k8, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + aes_round(k9, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + } + + _mm_store_si128(output + i + 0, xin0); + _mm_store_si128(output + i + 1, xin1); + _mm_store_si128(output + i + 2, xin2); + _mm_store_si128(output + i + 3, xin3); + + if(PREFETCH) + _mm_prefetch((const char*)output + i + 0, _MM_HINT_T2); + + _mm_store_si128(output + i + 4, xin4); + _mm_store_si128(output + i + 5, xin5); + _mm_store_si128(output + i + 6, xin6); + _mm_store_si128(output + i + 7, xin7); + + if(PREFETCH) + _mm_prefetch((const char*)output + i + 4, _MM_HINT_T2); + } +} + +template<size_t MEM, bool SOFT_AES, bool PREFETCH> +void cn_implode_scratchpad(const __m128i* input, __m128i* output) +{ + // This is more than we have registers, compiler will assign 2 keys on the stack + __m128i 
xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7; + __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9; + + aes_genkey<SOFT_AES>(output + 2, &k0, &k1, &k2, &k3, &k4, &k5, &k6, &k7, &k8, &k9); + + xout0 = _mm_load_si128(output + 4); + xout1 = _mm_load_si128(output + 5); + xout2 = _mm_load_si128(output + 6); + xout3 = _mm_load_si128(output + 7); + xout4 = _mm_load_si128(output + 8); + xout5 = _mm_load_si128(output + 9); + xout6 = _mm_load_si128(output + 10); + xout7 = _mm_load_si128(output + 11); + + for (size_t i = 0; i < MEM / sizeof(__m128i); i += 8) + { + if(PREFETCH) + _mm_prefetch((const char*)input + i + 0, _MM_HINT_NTA); + + xout0 = _mm_xor_si128(_mm_load_si128(input + i + 0), xout0); + xout1 = _mm_xor_si128(_mm_load_si128(input + i + 1), xout1); + xout2 = _mm_xor_si128(_mm_load_si128(input + i + 2), xout2); + xout3 = _mm_xor_si128(_mm_load_si128(input + i + 3), xout3); + + if(PREFETCH) + _mm_prefetch((const char*)input + i + 4, _MM_HINT_NTA); + + xout4 = _mm_xor_si128(_mm_load_si128(input + i + 4), xout4); + xout5 = _mm_xor_si128(_mm_load_si128(input + i + 5), xout5); + xout6 = _mm_xor_si128(_mm_load_si128(input + i + 6), xout6); + xout7 = _mm_xor_si128(_mm_load_si128(input + i + 7), xout7); + + if(SOFT_AES) + { + soft_aes_round(k0, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + soft_aes_round(k1, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + soft_aes_round(k2, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + soft_aes_round(k3, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + soft_aes_round(k4, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + soft_aes_round(k5, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + soft_aes_round(k6, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + soft_aes_round(k7, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + soft_aes_round(k8, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + soft_aes_round(k9, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + } + else + { + aes_round(k0, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k1, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k2, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k3, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k4, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k5, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k6, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k7, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k8, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k9, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + } + } + + _mm_store_si128(output + 4, xout0); + _mm_store_si128(output + 5, xout1); + _mm_store_si128(output + 6, xout2); + _mm_store_si128(output + 7, xout3); + _mm_store_si128(output + 8, xout4); + _mm_store_si128(output + 9, xout5); + _mm_store_si128(output + 10, xout6); + _mm_store_si128(output + 11, xout7); +} + +template<size_t ITERATIONS, size_t MEM, bool SOFT_AES, bool PREFETCH> +void cryptonight_hash(const void* input, size_t len, void* output, cryptonight_ctx* ctx0) +{ + keccak((const uint8_t *)input, len, ctx0->hash_state, 200); + + // Optim - 99% time boundary + cn_explode_scratchpad<MEM, SOFT_AES, 
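/* [Illustrative sketch, not from the patch: the sizing arithmetic behind the
 * explode/implode loops above -- the 2 MiB scratchpad (MEMORY in
 * cryptonight.h) holds 131072 16-byte lanes, processed 8 xmm registers at a
 * time, i.e. 16384 loop iterations moving 128 bytes each. ] */
#include <cstddef>
#include <cstdio>

int main()
{
    const size_t MEM        = 2097152;           /* MEMORY: 2 MiB scratchpad */
    const size_t lane       = 16;                /* sizeof(__m128i) */
    const size_t lanes      = MEM / lane;        /* 131072 */
    const size_t iterations = lanes / 8;         /* 16384 passes of 8 lanes */

    std::printf("lanes=%zu iterations=%zu bytes/iter=%zu\n",
                lanes, iterations, lane * 8);
    return 0;
}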
PREFETCH>((__m128i*)ctx0->hash_state, (__m128i*)ctx0->long_state); + + uint8_t* l0 = ctx0->long_state; + uint64_t* h0 = (uint64_t*)ctx0->hash_state; + + uint64_t al0 = h0[0] ^ h0[4]; + uint64_t ah0 = h0[1] ^ h0[5]; + __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); + + uint64_t idx0 = h0[0] ^ h0[4]; + + // Optim - 90% time boundary + for(size_t i = 0; i < ITERATIONS; i++) + { + __m128i cx; + cx = _mm_load_si128((__m128i *)&l0[idx0 & 0x1FFFF0]); + + if(SOFT_AES) + cx = soft_aesenc(cx, _mm_set_epi64x(ah0, al0)); + else + cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah0, al0)); + + _mm_store_si128((__m128i *)&l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx)); + idx0 = _mm_cvtsi128_si64(cx); + bx0 = cx; + + if(PREFETCH) + _mm_prefetch((const char*)&l0[idx0 & 0x1FFFF0], _MM_HINT_T0); + + uint64_t hi, lo, cl, ch; + cl = ((uint64_t*)&l0[idx0 & 0x1FFFF0])[0]; + ch = ((uint64_t*)&l0[idx0 & 0x1FFFF0])[1]; + + lo = _umul128(idx0, cl, &hi); + + al0 += hi; + ah0 += lo; + ((uint64_t*)&l0[idx0 & 0x1FFFF0])[0] = al0; + ((uint64_t*)&l0[idx0 & 0x1FFFF0])[1] = ah0; + ah0 ^= ch; + al0 ^= cl; + idx0 = al0; + + if(PREFETCH) + _mm_prefetch((const char*)&l0[idx0 & 0x1FFFF0], _MM_HINT_T0); + } + + // Optim - 90% time boundary + cn_implode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx0->long_state, (__m128i*)ctx0->hash_state); + + // Optim - 99% time boundary + + keccakf((uint64_t*)ctx0->hash_state, 24); + extra_hashes[ctx0->hash_state[0] & 3](ctx0->hash_state, 200, (char*)output); +} + +// This lovely creation will do 2 cn hashes at a time. We have plenty of space on silicon +// to fit temporary vars for two contexts. Function will read len*2 from input and write 64 bytes to output +// We are still limited by L3 cache, so doubling will only work with CPUs where we have more than 2MB to core (Xeons) +template<size_t ITERATIONS, size_t MEM, bool SOFT_AES, bool PREFETCH> +void cryptonight_double_hash(const void* input, size_t len, void* output, cryptonight_ctx* __restrict ctx0, cryptonight_ctx* __restrict ctx1) +{ + keccak((const uint8_t *)input, len, ctx0->hash_state, 200); + keccak((const uint8_t *)input+len, len, ctx1->hash_state, 200); + + // Optim - 99% time boundary + cn_explode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx0->hash_state, (__m128i*)ctx0->long_state); + cn_explode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx1->hash_state, (__m128i*)ctx1->long_state); + + uint8_t* l0 = ctx0->long_state; + uint64_t* h0 = (uint64_t*)ctx0->hash_state; + uint8_t* l1 = ctx1->long_state; + uint64_t* h1 = (uint64_t*)ctx1->hash_state; + + uint64_t axl0 = h0[0] ^ h0[4]; + uint64_t axh0 = h0[1] ^ h0[5]; + __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); + uint64_t axl1 = h1[0] ^ h1[4]; + uint64_t axh1 = h1[1] ^ h1[5]; + __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); + + uint64_t idx0 = h0[0] ^ h0[4]; + uint64_t idx1 = h1[0] ^ h1[4]; + + // Optim - 90% time boundary + for (size_t i = 0; i < ITERATIONS; i++) + { + __m128i cx; + cx = _mm_load_si128((__m128i *)&l0[idx0 & 0x1FFFF0]); + + if(SOFT_AES) + cx = soft_aesenc(cx, _mm_set_epi64x(axh0, axl0)); + else + cx = _mm_aesenc_si128(cx, _mm_set_epi64x(axh0, axl0)); + + _mm_store_si128((__m128i *)&l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx)); + idx0 = _mm_cvtsi128_si64(cx); + bx0 = cx; + + if(PREFETCH) + _mm_prefetch((const char*)&l0[idx0 & 0x1FFFF0], _MM_HINT_T0); + + cx = _mm_load_si128((__m128i *)&l1[idx1 & 0x1FFFF0]); + + if(SOFT_AES) + cx = soft_aesenc(cx, _mm_set_epi64x(axh1, axl1)); + else + cx = _mm_aesenc_si128(cx, 
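/* [Illustrative sketch, not from the patch: the 0x1FFFF0 mask used in the
 * main loop above folds an arbitrary 64-bit value into a 16-byte-aligned
 * offset inside the 2 MiB scratchpad -- it is (MEMORY - 1) with the low four
 * bits cleared. ] */
#include <cstdint>
#include <cstdio>

int main()
{
    const uint64_t MEMORY = 2097152;                        /* 2 MiB, 0x200000 */
    const uint64_t mask   = (MEMORY - 1) & ~15ull;          /* = 0x1FFFF0 */

    uint64_t idx = 0xDEADBEEFCAFEF00DULL;
    uint64_t off = idx & mask;

    std::printf("mask=0x%llX offset=0x%llX aligned=%d in-range=%d\n",
                (unsigned long long)mask, (unsigned long long)off,
                (off % 16) == 0, off < MEMORY);
    return 0;
}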
_mm_set_epi64x(axh1, axl1)); + + _mm_store_si128((__m128i *)&l1[idx1 & 0x1FFFF0], _mm_xor_si128(bx1, cx)); + idx1 = _mm_cvtsi128_si64(cx); + bx1 = cx; + + if(PREFETCH) + _mm_prefetch((const char*)&l1[idx1 & 0x1FFFF0], _MM_HINT_T0); + + uint64_t hi, lo, cl, ch; + cl = ((uint64_t*)&l0[idx0 & 0x1FFFF0])[0]; + ch = ((uint64_t*)&l0[idx0 & 0x1FFFF0])[1]; + + lo = _umul128(idx0, cl, &hi); + + axl0 += hi; + axh0 += lo; + ((uint64_t*)&l0[idx0 & 0x1FFFF0])[0] = axl0; + ((uint64_t*)&l0[idx0 & 0x1FFFF0])[1] = axh0; + axh0 ^= ch; + axl0 ^= cl; + idx0 = axl0; + + if(PREFETCH) + _mm_prefetch((const char*)&l0[idx0 & 0x1FFFF0], _MM_HINT_T0); + + cl = ((uint64_t*)&l1[idx1 & 0x1FFFF0])[0]; + ch = ((uint64_t*)&l1[idx1 & 0x1FFFF0])[1]; + + lo = _umul128(idx1, cl, &hi); + + axl1 += hi; + axh1 += lo; + ((uint64_t*)&l1[idx1 & 0x1FFFF0])[0] = axl1; + ((uint64_t*)&l1[idx1 & 0x1FFFF0])[1] = axh1; + axh1 ^= ch; + axl1 ^= cl; + idx1 = axl1; + + if(PREFETCH) + _mm_prefetch((const char*)&l1[idx1 & 0x1FFFF0], _MM_HINT_T0); + } + + // Optim - 90% time boundary + cn_implode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx0->long_state, (__m128i*)ctx0->hash_state); + cn_implode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx1->long_state, (__m128i*)ctx1->hash_state); + + // Optim - 99% time boundary + + keccakf((uint64_t*)ctx0->hash_state, 24); + extra_hashes[ctx0->hash_state[0] & 3](ctx0->hash_state, 200, (char*)output); + keccakf((uint64_t*)ctx1->hash_state, 24); + extra_hashes[ctx1->hash_state[0] & 3](ctx1->hash_state, 200, (char*)output + 32); +} diff --git a/xmrstak/backend/cpu/crypto/cryptonight_common.cpp b/xmrstak/backend/cpu/crypto/cryptonight_common.cpp new file mode 100644 index 0000000..9d03ed7 --- /dev/null +++ b/xmrstak/backend/cpu/crypto/cryptonight_common.cpp @@ -0,0 +1,195 @@ +/* + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * Additional permission under GNU GPL version 3 section 7 + * + * If you modify this Program, or any covered work, by linking or combining + * it with OpenSSL (or a modified version of that library), containing parts + * covered by the terms of OpenSSL License and SSLeay License, the licensors + * of this Program grant you additional permission to convey the resulting work. 
+ * + */ + +extern "C" +{ +#include "c_groestl.h" +#include "c_blake256.h" +#include "c_jh.h" +#include "c_skein.h" +} +#include "cryptonight.h" +#include "cryptonight_aesni.h" +#include <stdio.h> +#include <stdlib.h> + +#ifdef __GNUC__ +#include <mm_malloc.h> +#else +#include <malloc.h> +#endif // __GNUC__ + +#if defined(__APPLE__) +#include <mach/vm_statistics.h> +#endif + +#ifdef _WIN32 +#include <windows.h> +#else +#include <sys/mman.h> +#include <errno.h> +#include <string.h> +#endif // _WIN32 + +void do_blake_hash(const void* input, size_t len, char* output) { + blake256_hash((uint8_t*)output, (const uint8_t*)input, len); +} + +void do_groestl_hash(const void* input, size_t len, char* output) { + groestl((const uint8_t*)input, len * 8, (uint8_t*)output); +} + +void do_jh_hash(const void* input, size_t len, char* output) { + jh_hash(32 * 8, (const uint8_t*)input, 8 * len, (uint8_t*)output); +} + +void do_skein_hash(const void* input, size_t len, char* output) { + skein_hash(8 * 32, (const uint8_t*)input, 8 * len, (uint8_t*)output); +} + +void (* const extra_hashes[4])(const void *, size_t, char *) = {do_blake_hash, do_groestl_hash, do_jh_hash, do_skein_hash}; + +#ifdef _WIN32 +BOOL AddPrivilege(TCHAR* pszPrivilege) +{ + HANDLE hToken; + TOKEN_PRIVILEGES tp; + BOOL status; + + if (!OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY, &hToken)) + return FALSE; + + if (!LookupPrivilegeValue(NULL, pszPrivilege, &tp.Privileges[0].Luid)) + return FALSE; + + tp.PrivilegeCount = 1; + tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED; + status = AdjustTokenPrivileges(hToken, FALSE, &tp, 0, (PTOKEN_PRIVILEGES)NULL, 0); + + if (!status || (GetLastError() != ERROR_SUCCESS)) + return FALSE; + + CloseHandle(hToken); + return TRUE; +} +#endif + +size_t cryptonight_init(size_t use_fast_mem, size_t use_mlock, alloc_msg* msg) +{ +#ifdef _WIN32 + if (AddPrivilege(TEXT("SeLockMemoryPrivilege")) == 0) + { + msg->warning = "Obtaning SeLockMemoryPrivilege failed."; + return 0; + } + return 1; +#else + return 1; +#endif // _WIN32 +} + +cryptonight_ctx* cryptonight_alloc_ctx(size_t use_fast_mem, size_t use_mlock, alloc_msg* msg) +{ + cryptonight_ctx* ptr = (cryptonight_ctx*)_mm_malloc(sizeof(cryptonight_ctx), 4096); + + if(use_fast_mem == 0) + { + // use 2MiB aligned memory + ptr->long_state = (uint8_t*)_mm_malloc(MEMORY, 2*1024*1024); + ptr->ctx_info[0] = 0; + ptr->ctx_info[1] = 0; + return ptr; + } + +#ifdef _WIN32 + SIZE_T iLargePageMin = GetLargePageMinimum(); + + if(MEMORY > iLargePageMin) + iLargePageMin *= 2; + + ptr->long_state = (uint8_t*)VirtualAlloc(NULL, iLargePageMin, + MEM_COMMIT | MEM_RESERVE | MEM_LARGE_PAGES, PAGE_READWRITE); + + if(ptr->long_state == NULL) + { + _mm_free(ptr); + msg->warning = "VirtualAlloc failed."; + return NULL; + } + else + { + ptr->ctx_info[0] = 1; + return ptr; + } +#else + +#if defined(__APPLE__) + ptr->long_state = (uint8_t*)mmap(0, MEMORY, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANON, VM_FLAGS_SUPERPAGE_SIZE_2MB, 0); +#elif defined(__FreeBSD__) + ptr->long_state = (uint8_t*)mmap(0, MEMORY, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_ALIGNED_SUPER | MAP_PREFAULT_READ, -1, 0); +#else + ptr->long_state = (uint8_t*)mmap(0, MEMORY, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_POPULATE, 0, 0); +#endif + + if (ptr->long_state == MAP_FAILED) + { + _mm_free(ptr); + msg->warning = "mmap failed"; + return NULL; + } + + ptr->ctx_info[0] = 1; + + if(madvise(ptr->long_state, MEMORY, 
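/* [Illustrative sketch, not from the patch: Linux-only outline of the
 * huge-page path taken above by cryptonight_alloc_ctx() -- try a 2 MiB
 * MAP_HUGETLB mapping first and fall back to a normal anonymous mapping if
 * the kernel has no huge pages configured. On Windows the equivalent is
 * VirtualAlloc with MEM_LARGE_PAGES plus the SeLockMemoryPrivilege. ] */
#include <sys/mman.h>
#include <cstddef>
#include <cstdio>

static void* alloc_scratchpad(size_t len)
{
    void* p = mmap(nullptr, len, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_POPULATE, -1, 0);
    if (p == MAP_FAILED)
    {
        std::fprintf(stderr, "huge pages unavailable, falling back to 4 KiB pages\n");
        p = mmap(nullptr, len, PROT_READ | PROT_WRITE,
                 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    }
    return p == MAP_FAILED ? nullptr : p;
}

int main()
{
    void* scratch = alloc_scratchpad(2097152);              /* MEMORY from cryptonight.h */
    std::printf("scratchpad at %p\n", scratch);
    if (scratch) munmap(scratch, 2097152);
    return 0;
}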
MADV_RANDOM|MADV_WILLNEED) != 0) + msg->warning = "madvise failed"; + + ptr->ctx_info[1] = 0; + if(use_mlock != 0 && mlock(ptr->long_state, MEMORY) != 0) + msg->warning = "mlock failed"; + else + ptr->ctx_info[1] = 1; + + return ptr; +#endif // _WIN32 +} + +void cryptonight_free_ctx(cryptonight_ctx* ctx) +{ + if(ctx->ctx_info[0] != 0) + { +#ifdef _WIN32 + VirtualFree(ctx->long_state, 0, MEM_RELEASE); +#else + if(ctx->ctx_info[1] != 0) + munlock(ctx->long_state, MEMORY); + munmap(ctx->long_state, MEMORY); +#endif // _WIN32 + } + else + _mm_free(ctx->long_state); + + _mm_free(ctx); +} diff --git a/xmrstak/backend/cpu/crypto/groestl_tables.h b/xmrstak/backend/cpu/crypto/groestl_tables.h new file mode 100644 index 0000000..a23295c --- /dev/null +++ b/xmrstak/backend/cpu/crypto/groestl_tables.h @@ -0,0 +1,38 @@ +#ifndef __tables_h +#define __tables_h + + +const uint32_t T[512] = {0xa5f432c6, 0xc6a597f4, 0x84976ff8, 0xf884eb97, 0x99b05eee, 0xee99c7b0, 0x8d8c7af6, 0xf68df78c, 0xd17e8ff, 0xff0de517, 0xbddc0ad6, 0xd6bdb7dc, 0xb1c816de, 0xdeb1a7c8, 0x54fc6d91, 0x915439fc +, 0x50f09060, 0x6050c0f0, 0x3050702, 0x2030405, 0xa9e02ece, 0xcea987e0, 0x7d87d156, 0x567dac87, 0x192bcce7, 0xe719d52b, 0x62a613b5, 0xb56271a6, 0xe6317c4d, 0x4de69a31, 0x9ab559ec, 0xec9ac3b5 +, 0x45cf408f, 0x8f4505cf, 0x9dbca31f, 0x1f9d3ebc, 0x40c04989, 0x894009c0, 0x879268fa, 0xfa87ef92, 0x153fd0ef, 0xef15c53f, 0xeb2694b2, 0xb2eb7f26, 0xc940ce8e, 0x8ec90740, 0xb1de6fb, 0xfb0bed1d +, 0xec2f6e41, 0x41ec822f, 0x67a91ab3, 0xb3677da9, 0xfd1c435f, 0x5ffdbe1c, 0xea256045, 0x45ea8a25, 0xbfdaf923, 0x23bf46da, 0xf7025153, 0x53f7a602, 0x96a145e4, 0xe496d3a1, 0x5bed769b, 0x9b5b2ded +, 0xc25d2875, 0x75c2ea5d, 0x1c24c5e1, 0xe11cd924, 0xaee9d43d, 0x3dae7ae9, 0x6abef24c, 0x4c6a98be, 0x5aee826c, 0x6c5ad8ee, 0x41c3bd7e, 0x7e41fcc3, 0x206f3f5, 0xf502f106, 0x4fd15283, 0x834f1dd1 +, 0x5ce48c68, 0x685cd0e4, 0xf4075651, 0x51f4a207, 0x345c8dd1, 0xd134b95c, 0x818e1f9, 0xf908e918, 0x93ae4ce2, 0xe293dfae, 0x73953eab, 0xab734d95, 0x53f59762, 0x6253c4f5, 0x3f416b2a, 0x2a3f5441 +, 0xc141c08, 0x80c1014, 0x52f66395, 0x955231f6, 0x65afe946, 0x46658caf, 0x5ee27f9d, 0x9d5e21e2, 0x28784830, 0x30286078, 0xa1f8cf37, 0x37a16ef8, 0xf111b0a, 0xa0f1411, 0xb5c4eb2f, 0x2fb55ec4 +, 0x91b150e, 0xe091c1b, 0x365a7e24, 0x2436485a, 0x9bb6ad1b, 0x1b9b36b6, 0x3d4798df, 0xdf3da547, 0x266aa7cd, 0xcd26816a, 0x69bbf54e, 0x4e699cbb, 0xcd4c337f, 0x7fcdfe4c, 0x9fba50ea, 0xea9fcfba +, 0x1b2d3f12, 0x121b242d, 0x9eb9a41d, 0x1d9e3ab9, 0x749cc458, 0x5874b09c, 0x2e724634, 0x342e6872, 0x2d774136, 0x362d6c77, 0xb2cd11dc, 0xdcb2a3cd, 0xee299db4, 0xb4ee7329, 0xfb164d5b, 0x5bfbb616 +, 0xf601a5a4, 0xa4f65301, 0x4dd7a176, 0x764decd7, 0x61a314b7, 0xb76175a3, 0xce49347d, 0x7dcefa49, 0x7b8ddf52, 0x527ba48d, 0x3e429fdd, 0xdd3ea142, 0x7193cd5e, 0x5e71bc93, 0x97a2b113, 0x139726a2 +, 0xf504a2a6, 0xa6f55704, 0x68b801b9, 0xb96869b8, 0x0, 0x0, 0x2c74b5c1, 0xc12c9974, 0x60a0e040, 0x406080a0, 0x1f21c2e3, 0xe31fdd21, 0xc8433a79, 0x79c8f243, 0xed2c9ab6, 0xb6ed772c +, 0xbed90dd4, 0xd4beb3d9, 0x46ca478d, 0x8d4601ca, 0xd9701767, 0x67d9ce70, 0x4bddaf72, 0x724be4dd, 0xde79ed94, 0x94de3379, 0xd467ff98, 0x98d42b67, 0xe82393b0, 0xb0e87b23, 0x4ade5b85, 0x854a11de +, 0x6bbd06bb, 0xbb6b6dbd, 0x2a7ebbc5, 0xc52a917e, 0xe5347b4f, 0x4fe59e34, 0x163ad7ed, 0xed16c13a, 0xc554d286, 0x86c51754, 0xd762f89a, 0x9ad72f62, 0x55ff9966, 0x6655ccff, 0x94a7b611, 0x119422a7 +, 0xcf4ac08a, 0x8acf0f4a, 0x1030d9e9, 0xe910c930, 0x60a0e04, 0x406080a, 0x819866fe, 0xfe81e798, 0xf00baba0, 0xa0f05b0b, 0x44ccb478, 0x7844f0cc, 0xbad5f025, 0x25ba4ad5, 
0xe33e754b, 0x4be3963e +, 0xf30eaca2, 0xa2f35f0e, 0xfe19445d, 0x5dfeba19, 0xc05bdb80, 0x80c01b5b, 0x8a858005, 0x58a0a85, 0xadecd33f, 0x3fad7eec, 0xbcdffe21, 0x21bc42df, 0x48d8a870, 0x7048e0d8, 0x40cfdf1, 0xf104f90c +, 0xdf7a1963, 0x63dfc67a, 0xc1582f77, 0x77c1ee58, 0x759f30af, 0xaf75459f, 0x63a5e742, 0x426384a5, 0x30507020, 0x20304050, 0x1a2ecbe5, 0xe51ad12e, 0xe12effd, 0xfd0ee112, 0x6db708bf, 0xbf6d65b7 +, 0x4cd45581, 0x814c19d4, 0x143c2418, 0x1814303c, 0x355f7926, 0x26354c5f, 0x2f71b2c3, 0xc32f9d71, 0xe13886be, 0xbee16738, 0xa2fdc835, 0x35a26afd, 0xcc4fc788, 0x88cc0b4f, 0x394b652e, 0x2e395c4b +, 0x57f96a93, 0x93573df9, 0xf20d5855, 0x55f2aa0d, 0x829d61fc, 0xfc82e39d, 0x47c9b37a, 0x7a47f4c9, 0xacef27c8, 0xc8ac8bef, 0xe73288ba, 0xbae76f32, 0x2b7d4f32, 0x322b647d, 0x95a442e6, 0xe695d7a4 +, 0xa0fb3bc0, 0xc0a09bfb, 0x98b3aa19, 0x199832b3, 0xd168f69e, 0x9ed12768, 0x7f8122a3, 0xa37f5d81, 0x66aaee44, 0x446688aa, 0x7e82d654, 0x547ea882, 0xabe6dd3b, 0x3bab76e6, 0x839e950b, 0xb83169e +, 0xca45c98c, 0x8cca0345, 0x297bbcc7, 0xc729957b, 0xd36e056b, 0x6bd3d66e, 0x3c446c28, 0x283c5044, 0x798b2ca7, 0xa779558b, 0xe23d81bc, 0xbce2633d, 0x1d273116, 0x161d2c27, 0x769a37ad, 0xad76419a +, 0x3b4d96db, 0xdb3bad4d, 0x56fa9e64, 0x6456c8fa, 0x4ed2a674, 0x744ee8d2, 0x1e223614, 0x141e2822, 0xdb76e492, 0x92db3f76, 0xa1e120c, 0xc0a181e, 0x6cb4fc48, 0x486c90b4, 0xe4378fb8, 0xb8e46b37 +, 0x5de7789f, 0x9f5d25e7, 0x6eb20fbd, 0xbd6e61b2, 0xef2a6943, 0x43ef862a, 0xa6f135c4, 0xc4a693f1, 0xa8e3da39, 0x39a872e3, 0xa4f7c631, 0x31a462f7, 0x37598ad3, 0xd337bd59, 0x8b8674f2, 0xf28bff86 +, 0x325683d5, 0xd532b156, 0x43c54e8b, 0x8b430dc5, 0x59eb856e, 0x6e59dceb, 0xb7c218da, 0xdab7afc2, 0x8c8f8e01, 0x18c028f, 0x64ac1db1, 0xb16479ac, 0xd26df19c, 0x9cd2236d, 0xe03b7249, 0x49e0923b +, 0xb4c71fd8, 0xd8b4abc7, 0xfa15b9ac, 0xacfa4315, 0x709faf3, 0xf307fd09, 0x256fa0cf, 0xcf25856f, 0xafea20ca, 0xcaaf8fea, 0x8e897df4, 0xf48ef389, 0xe9206747, 0x47e98e20, 0x18283810, 0x10182028 +, 0xd5640b6f, 0x6fd5de64, 0x888373f0, 0xf088fb83, 0x6fb1fb4a, 0x4a6f94b1, 0x7296ca5c, 0x5c72b896, 0x246c5438, 0x3824706c, 0xf1085f57, 0x57f1ae08, 0xc7522173, 0x73c7e652, 0x51f36497, 0x975135f3 +, 0x2365aecb, 0xcb238d65, 0x7c8425a1, 0xa17c5984, 0x9cbf57e8, 0xe89ccbbf, 0x21635d3e, 0x3e217c63, 0xdd7cea96, 0x96dd377c, 0xdc7f1e61, 0x61dcc27f, 0x86919c0d, 0xd861a91, 0x85949b0f, 0xf851e94 +, 0x90ab4be0, 0xe090dbab, 0x42c6ba7c, 0x7c42f8c6, 0xc4572671, 0x71c4e257, 0xaae529cc, 0xccaa83e5, 0xd873e390, 0x90d83b73, 0x50f0906, 0x6050c0f, 0x103f4f7, 0xf701f503, 0x12362a1c, 0x1c123836 +, 0xa3fe3cc2, 0xc2a39ffe, 0x5fe18b6a, 0x6a5fd4e1, 0xf910beae, 0xaef94710, 0xd06b0269, 0x69d0d26b, 0x91a8bf17, 0x17912ea8, 0x58e87199, 0x995829e8, 0x2769533a, 0x3a277469, 0xb9d0f727, 0x27b94ed0 +, 0x384891d9, 0xd938a948, 0x1335deeb, 0xeb13cd35, 0xb3cee52b, 0x2bb356ce, 0x33557722, 0x22334455, 0xbbd604d2, 0xd2bbbfd6, 0x709039a9, 0xa9704990, 0x89808707, 0x7890e80, 0xa7f2c133, 0x33a766f2 +, 0xb6c1ec2d, 0x2db65ac1, 0x22665a3c, 0x3c227866, 0x92adb815, 0x15922aad, 0x2060a9c9, 0xc9208960, 0x49db5c87, 0x874915db, 0xff1ab0aa, 0xaaff4f1a, 0x7888d850, 0x5078a088, 0x7a8e2ba5, 0xa57a518e +, 0x8f8a8903, 0x38f068a, 0xf8134a59, 0x59f8b213, 0x809b9209, 0x980129b, 0x1739231a, 0x1a173439, 0xda751065, 0x65daca75, 0x315384d7, 0xd731b553, 0xc651d584, 0x84c61351, 0xb8d303d0, 0xd0b8bbd3 +, 0xc35edc82, 0x82c31f5e, 0xb0cbe229, 0x29b052cb, 0x7799c35a, 0x5a77b499, 0x11332d1e, 0x1e113c33, 0xcb463d7b, 0x7bcbf646, 0xfc1fb7a8, 0xa8fc4b1f, 0xd6610c6d, 0x6dd6da61, 0x3a4e622c, 0x2c3a584e}; + +#endif /* __tables_h */ diff --git 
a/xmrstak/backend/cpu/crypto/hash.h b/xmrstak/backend/cpu/crypto/hash.h new file mode 100644 index 0000000..c12d355 --- /dev/null +++ b/xmrstak/backend/cpu/crypto/hash.h @@ -0,0 +1,5 @@ +#pragma once + +typedef unsigned char BitSequence; +typedef unsigned long long DataLength; +typedef enum {SUCCESS = 0, FAIL = 1, BAD_HASHLEN = 2} HashReturn; diff --git a/xmrstak/backend/cpu/crypto/int-util.h b/xmrstak/backend/cpu/crypto/int-util.h new file mode 100644 index 0000000..8748976 --- /dev/null +++ b/xmrstak/backend/cpu/crypto/int-util.h @@ -0,0 +1,153 @@ +// Copyright(c) 2012 - 2013 The Cryptonote developers +// Distributed under the MIT/X11 software license, see the accompanying +// file COPYING or http://www.opensource.org/licenses/mit-license.php. + +#pragma once + +#include <assert.h> +#include <stdbool.h> +#include <stdint.h> +#include <string.h> + +#if defined(_MSC_VER) +#include <stdlib.h> + +static inline uint32_t rol32(uint32_t x, int r) { + static_assert(sizeof(uint32_t) == sizeof(unsigned int), "this code assumes 32-bit integers"); + return _rotl(x, r); +} + +static inline uint64_t rol64(uint64_t x, int r) { + return _rotl64(x, r); +} + +#else + +static inline uint32_t rol32(uint32_t x, int r) { + return (x << (r & 31)) | (x >> (-r & 31)); +} + +static inline uint64_t rol64(uint64_t x, int r) { + return (x << (r & 63)) | (x >> (-r & 63)); +} + +#endif + +static inline uint64_t hi_dword(uint64_t val) { + return val >> 32; +} + +static inline uint64_t lo_dword(uint64_t val) { + return val & 0xFFFFFFFF; +} + +static inline uint64_t div_with_reminder(uint64_t dividend, uint32_t divisor, uint32_t* remainder) { + dividend |= ((uint64_t)*remainder) << 32; + *remainder = dividend % divisor; + return dividend / divisor; +} + +// Long division with 2^32 base +static inline uint32_t div128_32(uint64_t dividend_hi, uint64_t dividend_lo, uint32_t divisor, uint64_t* quotient_hi, uint64_t* quotient_lo) { + uint64_t dividend_dwords[4]; + uint32_t remainder = 0; + + dividend_dwords[3] = hi_dword(dividend_hi); + dividend_dwords[2] = lo_dword(dividend_hi); + dividend_dwords[1] = hi_dword(dividend_lo); + dividend_dwords[0] = lo_dword(dividend_lo); + + *quotient_hi = div_with_reminder(dividend_dwords[3], divisor, &remainder) << 32; + *quotient_hi |= div_with_reminder(dividend_dwords[2], divisor, &remainder); + *quotient_lo = div_with_reminder(dividend_dwords[1], divisor, &remainder) << 32; + *quotient_lo |= div_with_reminder(dividend_dwords[0], divisor, &remainder); + + return remainder; +} + +#define IDENT32(x) ((uint32_t) (x)) +#define IDENT64(x) ((uint64_t) (x)) + +#define SWAP32(x) ((((uint32_t) (x) & 0x000000ff) << 24) | \ + (((uint32_t) (x) & 0x0000ff00) << 8) | \ + (((uint32_t) (x) & 0x00ff0000) >> 8) | \ + (((uint32_t) (x) & 0xff000000) >> 24)) +#define SWAP64(x) ((((uint64_t) (x) & 0x00000000000000ff) << 56) | \ + (((uint64_t) (x) & 0x000000000000ff00) << 40) | \ + (((uint64_t) (x) & 0x0000000000ff0000) << 24) | \ + (((uint64_t) (x) & 0x00000000ff000000) << 8) | \ + (((uint64_t) (x) & 0x000000ff00000000) >> 8) | \ + (((uint64_t) (x) & 0x0000ff0000000000) >> 24) | \ + (((uint64_t) (x) & 0x00ff000000000000) >> 40) | \ + (((uint64_t) (x) & 0xff00000000000000) >> 56)) + +static inline uint32_t ident32(uint32_t x) { return x; } +static inline uint64_t ident64(uint64_t x) { return x; } + +static inline uint32_t swap32(uint32_t x) { + x = ((x & 0x00ff00ff) << 8) | ((x & 0xff00ff00) >> 8); + return (x << 16) | (x >> 16); +} +static inline uint64_t swap64(uint64_t x) { + x = ((x & 0x00ff00ff00ff00ff) << 
8) | ((x & 0xff00ff00ff00ff00) >> 8); + x = ((x & 0x0000ffff0000ffff) << 16) | ((x & 0xffff0000ffff0000) >> 16); + return (x << 32) | (x >> 32); +} + +#if defined(__GNUC__) +#define UNUSED __attribute__((unused)) +#else +#define UNUSED +#endif +static inline void mem_inplace_ident(void *mem UNUSED, size_t n UNUSED) { } +#undef UNUSED + +static inline void mem_inplace_swap32(void *mem, size_t n) { + size_t i; + for (i = 0; i < n; i++) { + ((uint32_t *)mem)[i] = swap32(((const uint32_t *)mem)[i]); + } +} +static inline void mem_inplace_swap64(void *mem, size_t n) { + size_t i; + for (i = 0; i < n; i++) { + ((uint64_t *)mem)[i] = swap64(((const uint64_t *)mem)[i]); + } +} + +static inline void memcpy_ident32(void *dst, const void *src, size_t n) { + memcpy(dst, src, 4 * n); +} +static inline void memcpy_ident64(void *dst, const void *src, size_t n) { + memcpy(dst, src, 8 * n); +} + +static inline void memcpy_swap32(void *dst, const void *src, size_t n) { + size_t i; + for (i = 0; i < n; i++) { + ((uint32_t *)dst)[i] = swap32(((const uint32_t *)src)[i]); + } +} +static inline void memcpy_swap64(void *dst, const void *src, size_t n) { + size_t i; + for (i = 0; i < n; i++) { + ((uint64_t *)dst)[i] = swap64(((const uint64_t *)src)[i]); + } +} + +#define SWAP32LE IDENT32 +#define SWAP32BE SWAP32 +#define swap32le ident32 +#define swap32be swap32 +#define mem_inplace_swap32le mem_inplace_ident +#define mem_inplace_swap32be mem_inplace_swap32 +#define memcpy_swap32le memcpy_ident32 +#define memcpy_swap32be memcpy_swap32 +#define SWAP64LE IDENT64 +#define SWAP64BE SWAP64 +#define swap64le ident64 +#define swap64be swap64 +#define mem_inplace_swap64le mem_inplace_ident +#define mem_inplace_swap64be mem_inplace_swap64 +#define memcpy_swap64le memcpy_ident64 +#define memcpy_swap64be memcpy_swap64
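/* Illustrative usage sketch (not part of the original header): the SWAP*LE /
 * SWAP*BE aliases above encode the assumption that the host is little-endian,
 * so converting to or from little-endian is the identity and converting to or
 * from big-endian is a byte swap. The snippet below is a compile-checked but
 * disabled sanity check of the helpers defined in this header; the function
 * name is only for illustration. */
#if 0
static inline void int_util_examples(void)
{
	uint64_t q_hi, q_lo;
	uint32_t rem;

	/* swap64 reverses the byte order of a 64-bit value. */
	assert(swap64(0x0102030405060708ULL) == 0x0807060504030201ULL);

	/* rol32 masks the rotate count, so small and zero rotations are well defined. */
	assert(rol32(0x80000001u, 1) == 0x00000003u);

	/* div128_32 divides the 128-bit value (hi:lo) by a 32-bit divisor and
	   returns the remainder: 10 / 3 = 3 remainder 1. */
	rem = div128_32(0, 10, 3, &q_hi, &q_lo);
	assert(rem == 1 && q_hi == 0 && q_lo == 3);
}
#endif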
\ No newline at end of file diff --git a/xmrstak/backend/cpu/crypto/skein_port.h b/xmrstak/backend/cpu/crypto/skein_port.h new file mode 100644 index 0000000..9cbefcb --- /dev/null +++ b/xmrstak/backend/cpu/crypto/skein_port.h @@ -0,0 +1,179 @@ +#ifndef _SKEIN_PORT_H_ +#define _SKEIN_PORT_H_ + +#include <limits.h> +#include <stdint.h> +#include <stddef.h> + +#ifndef RETURN_VALUES +# define RETURN_VALUES +# if defined( DLL_EXPORT ) +# if defined( _MSC_VER ) || defined ( __INTEL_COMPILER ) +# define VOID_RETURN __declspec( dllexport ) void __stdcall +# define INT_RETURN __declspec( dllexport ) int __stdcall +# elif defined( __GNUC__ ) +# define VOID_RETURN __declspec( __dllexport__ ) void +# define INT_RETURN __declspec( __dllexport__ ) int +# else +# error Use of the DLL is only available on the Microsoft, Intel and GCC compilers +# endif +# elif defined( DLL_IMPORT ) +# if defined( _MSC_VER ) || defined ( __INTEL_COMPILER ) +# define VOID_RETURN __declspec( dllimport ) void __stdcall +# define INT_RETURN __declspec( dllimport ) int __stdcall +# elif defined( __GNUC__ ) +# define VOID_RETURN __declspec( __dllimport__ ) void +# define INT_RETURN __declspec( __dllimport__ ) int +# else +# error Use of the DLL is only available on the Microsoft, Intel and GCC compilers +# endif +# elif defined( __WATCOMC__ ) +# define VOID_RETURN void __cdecl +# define INT_RETURN int __cdecl +# else +# define VOID_RETURN void +# define INT_RETURN int +# endif +#endif + +/* These defines are used to declare buffers in a way that allows + faster operations on longer variables to be used. In all these + defines 'size' must be a power of 2 and >= 8 + + dec_unit_type(size,x) declares a variable 'x' of length + 'size' bits + + dec_bufr_type(size,bsize,x) declares a buffer 'x' of length 'bsize' + bytes defined as an array of variables + each of 'size' bits (bsize must be a + multiple of size / 8) + + ptr_cast(x,size) casts a pointer to a pointer to a + varaiable of length 'size' bits +*/ + +#define ui_type(size) uint##size##_t +#define dec_unit_type(size,x) typedef ui_type(size) x +#define dec_bufr_type(size,bsize,x) typedef ui_type(size) x[bsize / (size >> 3)] +#define ptr_cast(x,size) ((ui_type(size)*)(x)) + +typedef unsigned int uint_t; /* native unsigned integer */ +typedef uint8_t u08b_t; /* 8-bit unsigned integer */ +typedef uint64_t u64b_t; /* 64-bit unsigned integer */ + +#ifndef RotL_64 +#define RotL_64(x,N) (((x) << (N)) | ((x) >> (64-(N)))) +#endif + +/* + * Skein is "natively" little-endian (unlike SHA-xxx), for optimal + * performance on x86 CPUs. The Skein code requires the following + * definitions for dealing with endianness: + * + * SKEIN_NEED_SWAP: 0 for little-endian, 1 for big-endian + * Skein_Put64_LSB_First + * Skein_Get64_LSB_First + * Skein_Swap64 + * + * If SKEIN_NEED_SWAP is defined at compile time, it is used here + * along with the portable versions of Put64/Get64/Swap64, which + * are slow in general. + * + * Otherwise, an "auto-detect" of endianness is attempted below. + * If the default handling doesn't work well, the user may insert + * platform-specific code instead (e.g., for big-endian CPUs). + * + */ +#ifndef SKEIN_NEED_SWAP /* compile-time "override" for endianness? */ + +#define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN + +/* special handler for IA64, which may be either endianness (?) 
*/ +/* here we assume little-endian, but this may need to be changed */ +#if defined(__ia64) || defined(__ia64__) || defined(_M_IA64) +# define PLATFORM_MUST_ALIGN (1) +#ifndef PLATFORM_BYTE_ORDER +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#endif +#endif + +#ifndef PLATFORM_MUST_ALIGN +# define PLATFORM_MUST_ALIGN (0) +#endif + + +#if PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN + /* here for big-endian CPUs */ +#define SKEIN_NEED_SWAP (1) +#elif PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN + /* here for x86 and x86-64 CPUs (and other detected little-endian CPUs) */ +#define SKEIN_NEED_SWAP (0) +#if PLATFORM_MUST_ALIGN == 0 /* ok to use "fast" versions? */ +#define Skein_Put64_LSB_First(dst08,src64,bCnt) memcpy(dst08,src64,bCnt) +#define Skein_Get64_LSB_First(dst64,src08,wCnt) memcpy(dst64,src08,8*(wCnt)) +#endif +#else +#error "Skein needs endianness setting!" +#endif + +#endif /* ifndef SKEIN_NEED_SWAP */ + +/* + ****************************************************************** + * Provide any definitions still needed. + ****************************************************************** + */ +#ifndef Skein_Swap64 /* swap for big-endian, nop for little-endian */ +#if SKEIN_NEED_SWAP +#define Skein_Swap64(w64) \ + ( (( ((u64b_t)(w64)) & 0xFF) << 56) | \ + (((((u64b_t)(w64)) >> 8) & 0xFF) << 48) | \ + (((((u64b_t)(w64)) >>16) & 0xFF) << 40) | \ + (((((u64b_t)(w64)) >>24) & 0xFF) << 32) | \ + (((((u64b_t)(w64)) >>32) & 0xFF) << 24) | \ + (((((u64b_t)(w64)) >>40) & 0xFF) << 16) | \ + (((((u64b_t)(w64)) >>48) & 0xFF) << 8) | \ + (((((u64b_t)(w64)) >>56) & 0xFF) ) ) +#else +#define Skein_Swap64(w64) (w64) +#endif +#endif /* ifndef Skein_Swap64 */ + + +#ifndef Skein_Put64_LSB_First +void Skein_Put64_LSB_First(u08b_t *dst,const u64b_t *src,size_t bCnt) +#ifdef SKEIN_PORT_CODE /* instantiate the function code here? */ + { /* this version is fully portable (big-endian or little-endian), but slow */ + size_t n; + + for (n=0;n<bCnt;n++) + dst[n] = (u08b_t) (src[n>>3] >> (8*(n&7))); + } +#else + ; /* output only the function prototype */ +#endif +#endif /* ifndef Skein_Put64_LSB_First */ + + +#ifndef Skein_Get64_LSB_First +void Skein_Get64_LSB_First(u64b_t *dst,const u08b_t *src,size_t wCnt) +#ifdef SKEIN_PORT_CODE /* instantiate the function code here? */ + { /* this version is fully portable (big-endian or little-endian), but slow */ + size_t n; + + for (n=0;n<8*wCnt;n+=8) + dst[n/8] = (((u64b_t) src[n ]) ) + + (((u64b_t) src[n+1]) << 8) + + (((u64b_t) src[n+2]) << 16) + + (((u64b_t) src[n+3]) << 24) + + (((u64b_t) src[n+4]) << 32) + + (((u64b_t) src[n+5]) << 40) + + (((u64b_t) src[n+6]) << 48) + + (((u64b_t) src[n+7]) << 56) ; + } +#else + ; /* output only the function prototype */ +#endif +#endif /* ifndef Skein_Get64_LSB_First */ + +#endif /* ifndef _SKEIN_PORT_H_ */ diff --git a/xmrstak/backend/cpu/crypto/soft_aes.c b/xmrstak/backend/cpu/crypto/soft_aes.c new file mode 100644 index 0000000..aba7c20 --- /dev/null +++ b/xmrstak/backend/cpu/crypto/soft_aes.c @@ -0,0 +1,212 @@ +/* + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * Additional permission under GNU GPL version 3 section 7 + * + * If you modify this Program, or any covered work, by linking or combining + * it with OpenSSL (or a modified version of that library), containing parts + * covered by the terms of OpenSSL License and SSLeay License, the licensors + * of this Program grant you additional permission to convey the resulting work. + * + */ + +/* + * The orginal author of this AES implementation is Karl Malbrain. + */ + +#ifdef __GNUC__ +#include <x86intrin.h> +#else +#include <intrin.h> +#endif // __GNUC__ + +#include <inttypes.h> + +#define TABLE_ALIGN 32 +#define WPOLY 0x011b +#define N_COLS 4 +#define AES_BLOCK_SIZE 16 +#define RC_LENGTH (5 * (AES_BLOCK_SIZE / 4 - 2)) + +#if defined(_MSC_VER) +#define ALIGN __declspec(align(TABLE_ALIGN)) +#elif defined(__GNUC__) +#define ALIGN __attribute__ ((aligned(16))) +#else +#define ALIGN +#endif + +#define rf1(r,c) (r) +#define word_in(x,c) (*((uint32_t*)(x)+(c))) +#define word_out(x,c,v) (*((uint32_t*)(x)+(c)) = (v)) + +#define s(x,c) x[c] +#define si(y,x,c) (s(y,c) = word_in(x, c)) +#define so(y,x,c) word_out(y, c, s(x,c)) +#define state_in(y,x) si(y,x,0); si(y,x,1); si(y,x,2); si(y,x,3) +#define state_out(y,x) so(y,x,0); so(y,x,1); so(y,x,2); so(y,x,3) +#define round(y,x,k) \ +y[0] = (k)[0] ^ (t_fn[0][x[0] & 0xff] ^ t_fn[1][(x[1] >> 8) & 0xff] ^ t_fn[2][(x[2] >> 16) & 0xff] ^ t_fn[3][x[3] >> 24]); \ +y[1] = (k)[1] ^ (t_fn[0][x[1] & 0xff] ^ t_fn[1][(x[2] >> 8) & 0xff] ^ t_fn[2][(x[3] >> 16) & 0xff] ^ t_fn[3][x[0] >> 24]); \ +y[2] = (k)[2] ^ (t_fn[0][x[2] & 0xff] ^ t_fn[1][(x[3] >> 8) & 0xff] ^ t_fn[2][(x[0] >> 16) & 0xff] ^ t_fn[3][x[1] >> 24]); \ +y[3] = (k)[3] ^ (t_fn[0][x[3] & 0xff] ^ t_fn[1][(x[0] >> 8) & 0xff] ^ t_fn[2][(x[1] >> 16) & 0xff] ^ t_fn[3][x[2] >> 24]); +#define to_byte(x) ((x) & 0xff) +#define bval(x,n) to_byte((x) >> (8 * (n))) + +#define fwd_var(x,r,c)\ + ( r == 0 ? ( c == 0 ? s(x,0) : c == 1 ? s(x,1) : c == 2 ? s(x,2) : s(x,3))\ + : r == 1 ? ( c == 0 ? s(x,1) : c == 1 ? s(x,2) : c == 2 ? s(x,3) : s(x,0))\ + : r == 2 ? ( c == 0 ? s(x,2) : c == 1 ? s(x,3) : c == 2 ? s(x,0) : s(x,1))\ + : ( c == 0 ? s(x,3) : c == 1 ? s(x,0) : c == 2 ? 
s(x,1) : s(x,2))) + +#define fwd_rnd(y,x,k,c) (s(y,c) = (k)[c] ^ four_tables(x,t_use(f,n),fwd_var,rf1,c)) + +#define sb_data(w) {\ + w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), w(0xc5),\ + w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), w(0xab), w(0x76),\ + w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), w(0x59), w(0x47), w(0xf0),\ + w(0xad), w(0xd4), w(0xa2), w(0xaf), w(0x9c), w(0xa4), w(0x72), w(0xc0),\ + w(0xb7), w(0xfd), w(0x93), w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc),\ + w(0x34), w(0xa5), w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15),\ + w(0x04), w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a),\ + w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), w(0x75),\ + w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), w(0x5a), w(0xa0),\ + w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), w(0xe3), w(0x2f), w(0x84),\ + w(0x53), w(0xd1), w(0x00), w(0xed), w(0x20), w(0xfc), w(0xb1), w(0x5b),\ + w(0x6a), w(0xcb), w(0xbe), w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf),\ + w(0xd0), w(0xef), w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85),\ + w(0x45), w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8),\ + w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), w(0xf5),\ + w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), w(0xf3), w(0xd2),\ + w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), w(0x97), w(0x44), w(0x17),\ + w(0xc4), w(0xa7), w(0x7e), w(0x3d), w(0x64), w(0x5d), w(0x19), w(0x73),\ + w(0x60), w(0x81), w(0x4f), w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88),\ + w(0x46), w(0xee), w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb),\ + w(0xe0), w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c),\ + w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), w(0x79),\ + w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), w(0x4e), w(0xa9),\ + w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), w(0x7a), w(0xae), w(0x08),\ + w(0xba), w(0x78), w(0x25), w(0x2e), w(0x1c), w(0xa6), w(0xb4), w(0xc6),\ + w(0xe8), w(0xdd), w(0x74), w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a),\ + w(0x70), w(0x3e), w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e),\ + w(0x61), w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e),\ + w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), w(0x94),\ + w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), w(0x28), w(0xdf),\ + w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), w(0xe6), w(0x42), w(0x68),\ + w(0x41), w(0x99), w(0x2d), w(0x0f), w(0xb0), w(0x54), w(0xbb), w(0x16) } + +#define rc_data(w) {\ + w(0x01), w(0x02), w(0x04), w(0x08), w(0x10),w(0x20), w(0x40), w(0x80),\ + w(0x1b), w(0x36) } + +#define bytes2word(b0, b1, b2, b3) (((uint32_t)(b3) << 24) | \ + ((uint32_t)(b2) << 16) | ((uint32_t)(b1) << 8) | (b0)) + +#define h0(x) (x) +#define w0(p) bytes2word(p, 0, 0, 0) +#define w1(p) bytes2word(0, p, 0, 0) +#define w2(p) bytes2word(0, 0, p, 0) +#define w3(p) bytes2word(0, 0, 0, p) + +#define u0(p) bytes2word(f2(p), p, p, f3(p)) +#define u1(p) bytes2word(f3(p), f2(p), p, p) +#define u2(p) bytes2word(p, f3(p), f2(p), p) +#define u3(p) bytes2word(p, p, f3(p), f2(p)) + +#define v0(p) bytes2word(fe(p), f9(p), fd(p), fb(p)) +#define v1(p) bytes2word(fb(p), fe(p), f9(p), fd(p)) +#define v2(p) bytes2word(fd(p), fb(p), fe(p), f9(p)) +#define v3(p) bytes2word(f9(p), fd(p), fb(p), fe(p)) + +#define f2(x) ((x<<1) ^ (((x>>7) & 1) * WPOLY)) +#define f4(x) ((x<<2) ^ (((x>>6) & 1) * WPOLY) ^ (((x>>6) & 2) * WPOLY)) +#define f8(x) ((x<<3) ^ (((x>>5) & 1) * WPOLY) ^ (((x>>5) & 2) * WPOLY) ^ 
(((x>>5) & 4) * WPOLY)) +#define f3(x) (f2(x) ^ x) +#define f9(x) (f8(x) ^ x) +#define fb(x) (f8(x) ^ f2(x) ^ x) +#define fd(x) (f8(x) ^ f4(x) ^ x) +#define fe(x) (f8(x) ^ f4(x) ^ f2(x)) + +#define t_dec(m,n) t_##m##n +#define t_set(m,n) t_##m##n +#define t_use(m,n) t_##m##n + +#define d_4(t,n,b,e,f,g,h) ALIGN const t n[4][256] = { b(e), b(f), b(g), b(h) } + +#define four_tables(x,tab,vf,rf,c) \ + (tab[0][bval(vf(x,0,c),rf(0,c))] \ + ^ tab[1][bval(vf(x,1,c),rf(1,c))] \ + ^ tab[2][bval(vf(x,2,c),rf(2,c))] \ + ^ tab[3][bval(vf(x,3,c),rf(3,c))]) + +d_4(uint32_t, t_dec(f,n), sb_data, u0, u1, u2, u3); + +__m128i soft_aesenc(__m128i in, __m128i key) +{ + uint32_t x0, x1, x2, x3; + x0 = _mm_cvtsi128_si32(in); + x1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(in, 0x55)); + x2 = _mm_cvtsi128_si32(_mm_shuffle_epi32(in, 0xAA)); + x3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(in, 0xFF)); + + __m128i out = _mm_set_epi32( + (t_fn[0][x3 & 0xff] ^ t_fn[1][(x0 >> 8) & 0xff] ^ t_fn[2][(x1 >> 16) & 0xff] ^ t_fn[3][x2 >> 24]), + (t_fn[0][x2 & 0xff] ^ t_fn[1][(x3 >> 8) & 0xff] ^ t_fn[2][(x0 >> 16) & 0xff] ^ t_fn[3][x1 >> 24]), + (t_fn[0][x1 & 0xff] ^ t_fn[1][(x2 >> 8) & 0xff] ^ t_fn[2][(x3 >> 16) & 0xff] ^ t_fn[3][x0 >> 24]), + (t_fn[0][x0 & 0xff] ^ t_fn[1][(x1 >> 8) & 0xff] ^ t_fn[2][(x2 >> 16) & 0xff] ^ t_fn[3][x3 >> 24])); + + return _mm_xor_si128(out, key); +} + +uint8_t Sbox[256] = { // forward s-box +0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76, +0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0, +0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15, +0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75, +0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84, +0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf, +0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8, +0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2, +0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73, +0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb, +0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79, +0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08, +0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a, +0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e, +0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf, +0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16}; + +static inline void sub_word(uint8_t* key) +{ + key[0] = Sbox[key[0]]; + key[1] = Sbox[key[1]]; + key[2] = Sbox[key[2]]; + key[3] = Sbox[key[3]]; +} + +#ifdef __clang__ +uint32_t _rotr(uint32_t value, uint32_t amount) +{ + return (value >> amount) | (value << ((32 - amount) & 31)); +} +#endif + +__m128i soft_aeskeygenassist(__m128i key, uint8_t rcon) +{ + uint32_t X1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0x55)); + uint32_t X3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0xFF)); + sub_word((uint8_t*)&X1); + sub_word((uint8_t*)&X3); + return _mm_set_epi32(_rotr(X3, 8) ^ rcon, X3,_rotr(X1, 8) ^ 
rcon, X1); +} diff --git a/xmrstak/backend/cpu/hwlocMemory.hpp b/xmrstak/backend/cpu/hwlocMemory.hpp new file mode 100644 index 0000000..f471951 --- /dev/null +++ b/xmrstak/backend/cpu/hwlocMemory.hpp @@ -0,0 +1,55 @@ +#pragma once + +#include "console.h" + +#ifndef CONF_NO_HWLOC + +#include <hwloc.h> + +/** pin memory to NUMA node + * + * Set the default memory policy for the current thread to bind memory to the + * NUMA node. + * + * @param puId core id + */ +void bindMemoryToNUMANode( size_t puId ) +{ + int depth; + hwloc_topology_t topology; + + hwloc_topology_init(&topology); + hwloc_topology_load(topology); + + depth = hwloc_get_type_depth(topology, HWLOC_OBJ_PU); + + for( size_t i = 0; + i < hwloc_get_nbobjs_by_depth(topology, depth); + i++ ) + { + hwloc_obj_t pu = hwloc_get_obj_by_depth(topology, depth, i); + if( pu->os_index == puId ) + { + if( 0 > hwloc_set_membind_nodeset( + topology, + pu->nodeset, + HWLOC_MEMBIND_BIND, + HWLOC_MEMBIND_THREAD)) + { + printer::inst()->print_msg(L0, "hwloc: can't bind memory"); + } + else + { + printer::inst()->print_msg(L0, "hwloc: memory pinned"); + break; + } + } + } +} +#else + +void bindMemoryToNUMANode( size_t ) +{ +} + +#endif diff --git a/xmrstak/backend/cpu/jconf.cpp b/xmrstak/backend/cpu/jconf.cpp new file mode 100644 index 0000000..021d607 --- /dev/null +++ b/xmrstak/backend/cpu/jconf.cpp @@ -0,0 +1,257 @@ +/* + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * Additional permission under GNU GPL version 3 section 7 + * + * If you modify this Program, or any covered work, by linking or combining + * it with OpenSSL (or a modified version of that library), containing parts + * covered by the terms of OpenSSL License and SSLeay License, the licensors + * of this Program grant you additional permission to convey the resulting work. 
+ * + */ + +#include "jconf.h" +#include "../../console.h" +#include <iostream> + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#ifdef _WIN32 +#define strcasecmp _stricmp +#include <intrin.h> +#else +#include <cpuid.h> +#endif + +#include "../../rapidjson/document.h" +#include "../../rapidjson/error/en.h" +#include "../../jext.h" + +namespace xmrstak +{ +namespace cpu +{ + +using namespace rapidjson; + +/* + * This enum needs to match index in oConfigValues, otherwise we will get a runtime error + */ +enum configEnum { aCpuThreadsConf, sUseSlowMem }; + +struct configVal { + configEnum iName; + const char* sName; + Type iType; +}; + +// Same order as in configEnum, as per comment above +// kNullType means any type +configVal oConfigValues[] = { + { aCpuThreadsConf, "cpu_threads_conf", kNullType } +}; + +constexpr size_t iConfigCnt = (sizeof(oConfigValues)/sizeof(oConfigValues[0])); + +inline bool checkType(Type have, Type want) +{ + if(want == have) + return true; + else if(want == kNullType) + return true; + else if(want == kTrueType && have == kFalseType) + return true; + else if(want == kFalseType && have == kTrueType) + return true; + else + return false; +} + +struct jconf::opaque_private +{ + Document jsonDoc; + const Value* configValues[iConfigCnt]; //Compile time constant + + opaque_private() + { + } +}; + +jconf* jconf::oInst = nullptr; + +jconf::jconf() +{ + prv = new opaque_private(); +} + +bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg) +{ + if(!prv->configValues[aCpuThreadsConf]->IsArray()) + return false; + + if(id >= prv->configValues[aCpuThreadsConf]->Size()) + return false; + + const Value& oThdConf = prv->configValues[aCpuThreadsConf]->GetArray()[id]; + + if(!oThdConf.IsObject()) + return false; + + const Value *mode, *no_prefetch, *aff; + mode = GetObjectMember(oThdConf, "low_power_mode"); + no_prefetch = GetObjectMember(oThdConf, "no_prefetch"); + aff = GetObjectMember(oThdConf, "affine_to_cpu"); + + if(mode == nullptr || no_prefetch == nullptr || aff == nullptr) + return false; + + if(!mode->IsBool() || !no_prefetch->IsBool()) + return false; + + if(!aff->IsNumber() && !aff->IsBool()) + return false; + + if(aff->IsNumber() && aff->GetInt64() < 0) + return false; + + cfg.bDoubleMode = mode->GetBool(); + cfg.bNoPrefetch = no_prefetch->GetBool(); + + if(aff->IsNumber()) + cfg.iCpuAff = aff->GetInt64(); + else + cfg.iCpuAff = -1; + + return true; +} + + +size_t jconf::GetThreadCount() +{ + if(prv->configValues[aCpuThreadsConf]->IsArray()) + return prv->configValues[aCpuThreadsConf]->Size(); + else + return 0; +} + +bool jconf::parse_config(const char* sFilename) +{ + FILE * pFile; + char * buffer; + size_t flen; + + pFile = fopen(sFilename, "rb"); + if (pFile == NULL) + { + printer::inst()->print_msg(L0, "Failed to open config file %s.", sFilename); + return false; + } + + fseek(pFile,0,SEEK_END); + flen = ftell(pFile); + rewind(pFile); + + if(flen >= 64*1024) + { + fclose(pFile); + printer::inst()->print_msg(L0, "Oversized config file - %s.", sFilename); + return false; + } + + if(flen <= 16) + { + fclose(pFile); + printer::inst()->print_msg(L0, "File is empty or too short - %s.", sFilename); + return false; + } + + buffer = (char*)malloc(flen + 3); + if(fread(buffer+1, flen, 1, pFile) != 1) + { + free(buffer); + fclose(pFile); + printer::inst()->print_msg(L0, "Read error while reading %s.", sFilename); + return false; + } + fclose(pFile); + + //Replace Unicode BOM with spaces - we always use UTF-8 + unsigned char* ubuffer = (unsigned char*)buffer; 
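	/* The config file is not a complete JSON document: it is a bare list of
	   "key" : value entries (comments and trailing commas are tolerated via
	   kParseCommentsFlag|kParseTrailingCommasFlag further down). The file was
	   therefore read into buffer+1 above so that buffer[0] can become '{' and
	   buffer[flen] can become '}', turning the whole thing into one JSON object.
	   A leading UTF-8 BOM (0xEF 0xBB 0xBF) would break that, so it is blanked
	   out with spaces first. */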
+ if(ubuffer[1] == 0xEF && ubuffer[2] == 0xBB && ubuffer[3] == 0xBF) + { + buffer[1] = ' '; + buffer[2] = ' '; + buffer[3] = ' '; + } + + buffer[0] = '{'; + buffer[flen] = '}'; + buffer[flen + 1] = '\0'; + + prv->jsonDoc.Parse<kParseCommentsFlag|kParseTrailingCommasFlag>(buffer, flen+2); + free(buffer); + + if(prv->jsonDoc.HasParseError()) + { + printer::inst()->print_msg(L0, "JSON config parse error(offset %llu): %s", + int_port(prv->jsonDoc.GetErrorOffset()), GetParseError_En(prv->jsonDoc.GetParseError())); + return false; + } + + if(!prv->jsonDoc.IsObject()) + { //This should never happen as we created the root ourselves + printer::inst()->print_msg(L0, "Invalid config file. No root?\n"); + return false; + } + + for(size_t i = 0; i < iConfigCnt; i++) + { + if(oConfigValues[i].iName != i) + { + printer::inst()->print_msg(L0, "Code error. oConfigValues are not in order."); + return false; + } + + prv->configValues[i] = GetObjectMember(prv->jsonDoc, oConfigValues[i].sName); + + if(prv->configValues[i] == nullptr) + { + printer::inst()->print_msg(L0, "Invalid config file. Missing value \"%s\".", oConfigValues[i].sName); + return false; + } + + if(!checkType(prv->configValues[i]->GetType(), oConfigValues[i].iType)) + { + printer::inst()->print_msg(L0, "Invalid config file. Value \"%s\" has unexpected type.", oConfigValues[i].sName); + return false; + } + } + + thd_cfg c; + for(size_t i=0; i < GetThreadCount(); i++) + { + if(!GetThreadConfig(i, c)) + { + printer::inst()->print_msg(L0, "Thread %llu has invalid config.", int_port(i)); + return false; + } + } + + return true; +} + +} // namespace cpu +} // namepsace xmrstak diff --git a/xmrstak/backend/cpu/jconf.hpp b/xmrstak/backend/cpu/jconf.hpp new file mode 100644 index 0000000..1f92765 --- /dev/null +++ b/xmrstak/backend/cpu/jconf.hpp @@ -0,0 +1,44 @@ +#pragma once +#include <stdlib.h> +#include <string> +#include "../../Params.hpp" + +namespace xmrstak +{ +namespace cpu +{ + +class jconf +{ +public: + static jconf* inst() + { + if (oInst == nullptr) oInst = new jconf; + return oInst; + }; + + bool parse_config(const char* sFilename = Params::inst().configFileCPU.c_str()); + + struct thd_cfg { + bool bDoubleMode; + bool bNoPrefetch; + long long iCpuAff; + }; + + size_t GetThreadCount(); + bool GetThreadConfig(size_t id, thd_cfg &cfg); + bool NeedsAutoconf(); + + + + +private: + jconf(); + static jconf* oInst; + + struct opaque_private; + opaque_private* prv; +}; + +} // namespace cpu +} // namepsace xmrstak diff --git a/xmrstak/backend/cpu/minethd.cpp b/xmrstak/backend/cpu/minethd.cpp new file mode 100644 index 0000000..3ffdf99 --- /dev/null +++ b/xmrstak/backend/cpu/minethd.cpp @@ -0,0 +1,508 @@ +/* + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ * + * Additional permission under GNU GPL version 3 section 7 + * + * If you modify this Program, or any covered work, by linking or combining + * it with OpenSSL (or a modified version of that library), containing parts + * covered by the terms of OpenSSL License and SSLeay License, the licensors + * of this Program grant you additional permission to convey the resulting work. + * + */ + +#include <assert.h> +#include <cmath> +#include <chrono> +#include <cstring> +#include <thread> +#include <bitset> +#include "../../console.h" +#include "../IBackend.hpp" +#include "../GlobalStates.hpp" +#include "../../ConfigEditor.hpp" +#include "../../Params.hpp" +#include "../../jconf.h" + +#include "../../executor.h" +#include "minethd.h" +#include "./jconf.h" +#include "../../crypto/cryptonight_aesni.h" +#include "../../hwlocMemory.hpp" +#include "../miner_work.h" + +#ifndef CONF_NO_HWLOC +# include "autoAdjustHwloc.hpp" +#else +# include "autoAdjust.hpp" +#endif + + +#ifdef _WIN32 +#include <windows.h> + +namespace xmrstak +{ +namespace cpu +{ +void minethd::thd_setaffinity(std::thread::native_handle_type h, uint64_t cpu_id) +{ + SetThreadAffinityMask(h, 1ULL << cpu_id); +} + +} // namespace cpu +} // namespace xmrstak + +#else +#include <pthread.h> + +#if defined(__APPLE__) +#include <mach/thread_policy.h> +#include <mach/thread_act.h> +#define SYSCTL_CORE_COUNT "machdep.cpu.core_count" +#elif defined(__FreeBSD__) +#include <pthread_np.h> +#endif + +namespace xmrstak +{ +namespace cpu +{ + +void minethd::thd_setaffinity(std::thread::native_handle_type h, uint64_t cpu_id) +{ +#if defined(__APPLE__) + thread_port_t mach_thread; + thread_affinity_policy_data_t policy = { static_cast<integer_t>(cpu_id) }; + mach_thread = pthread_mach_thread_np(h); + thread_policy_set(mach_thread, THREAD_AFFINITY_POLICY, (thread_policy_t)&policy, 1); +#elif defined(__FreeBSD__) + cpuset_t mn; + CPU_ZERO(&mn); + CPU_SET(cpu_id, &mn); + pthread_setaffinity_np(h, sizeof(cpuset_t), &mn); +#else + cpu_set_t mn; + CPU_ZERO(&mn); + CPU_SET(cpu_id, &mn); + pthread_setaffinity_np(h, sizeof(cpu_set_t), &mn); +#endif +} + +} // namespace cpu +} // namespace xmrstak + +#endif // _WIN32 + + +namespace xmrstak +{ +namespace cpu +{ + +minethd::minethd(miner_work& pWork, size_t iNo, bool double_work, bool no_prefetch, int64_t affinity) +{ + oWork = pWork; + bQuit = 0; + iThreadNo = (uint8_t)iNo; + iJobNo = 0; + bNoPrefetch = no_prefetch; + this->affinity = affinity; + + std::lock_guard<std::mutex> lock(work_thd_mtx); + if(double_work) + oWorkThd = std::thread(&minethd::double_work_main, this); + else + oWorkThd = std::thread(&minethd::work_main, this); +} + +cryptonight_ctx* minethd::minethd_alloc_ctx() +{ + cryptonight_ctx* ctx; + alloc_msg msg = { 0 }; + + switch (::jconf::inst()->GetSlowMemSetting()) + { + case ::jconf::never_use: + ctx = cryptonight_alloc_ctx(1, 1, &msg); + if (ctx == NULL) + printer::inst()->print_msg(L0, "MEMORY ALLOC FAILED: %s", msg.warning); + return ctx; + + case ::jconf::no_mlck: + ctx = cryptonight_alloc_ctx(1, 0, &msg); + if (ctx == NULL) + printer::inst()->print_msg(L0, "MEMORY ALLOC FAILED: %s", msg.warning); + return ctx; + + case ::jconf::print_warning: + ctx = cryptonight_alloc_ctx(1, 1, &msg); + if (msg.warning != NULL) + printer::inst()->print_msg(L0, "MEMORY ALLOC FAILED: %s", msg.warning); + if (ctx == NULL) + ctx = cryptonight_alloc_ctx(0, 0, NULL); + return ctx; + + case ::jconf::always_use: + return cryptonight_alloc_ctx(0, 0, NULL); + + case ::jconf::unknown_value: + return NULL; //Shut up 
compiler + } + + return nullptr; //Should never happen +} + +bool minethd::self_test() +{ + alloc_msg msg = { 0 }; + size_t res; + bool fatal = false; + + switch (::jconf::inst()->GetSlowMemSetting()) + { + case ::jconf::never_use: + res = cryptonight_init(1, 1, &msg); + fatal = true; + break; + + case ::jconf::no_mlck: + res = cryptonight_init(1, 0, &msg); + fatal = true; + break; + + case ::jconf::print_warning: + res = cryptonight_init(1, 1, &msg); + break; + + case ::jconf::always_use: + res = cryptonight_init(0, 0, &msg); + break; + + case ::jconf::unknown_value: + default: + return false; //Shut up compiler + } + + if(msg.warning != nullptr) + printer::inst()->print_msg(L0, "MEMORY INIT ERROR: %s", msg.warning); + + if(res == 0 && fatal) + return false; + + cryptonight_ctx *ctx0, *ctx1; + if((ctx0 = minethd_alloc_ctx()) == nullptr) + return false; + + if((ctx1 = minethd_alloc_ctx()) == nullptr) + { + cryptonight_free_ctx(ctx0); + return false; + } + + unsigned char out[64]; + bool bResult; + + cn_hash_fun hashf; + cn_hash_fun_dbl hashdf; + + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false); + hashf("This is a test", 14, out, ctx0); + bResult = memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0; + + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true); + hashf("This is a test", 14, out, ctx0); + bResult &= memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0; + + hashdf = func_dbl_selector(::jconf::inst()->HaveHardwareAes(), false); + hashdf("The quick brown fox jumps over the lazy dogThe quick brown fox jumps over the lazy log", 43, out, ctx0, ctx1); + bResult &= memcmp(out, "\x3e\xbb\x7f\x9f\x7d\x27\x3d\x7c\x31\x8d\x86\x94\x77\x55\x0c\xc8\x00\xcf\xb1\x1b\x0c\xad\xb7\xff\xbd\xf6\xf8\x9f\x3a\x47\x1c\x59" + "\xb4\x77\xd5\x02\xe4\xd8\x48\x7f\x42\xdf\xe3\x8e\xed\x73\x81\x7a\xda\x91\xb7\xe2\x63\xd2\x91\x71\xb6\x5c\x44\x3a\x01\x2a\x41\x22", 64) == 0; + + hashdf = func_dbl_selector(::jconf::inst()->HaveHardwareAes(), true); + hashdf("The quick brown fox jumps over the lazy dogThe quick brown fox jumps over the lazy log", 43, out, ctx0, ctx1); + bResult &= memcmp(out, "\x3e\xbb\x7f\x9f\x7d\x27\x3d\x7c\x31\x8d\x86\x94\x77\x55\x0c\xc8\x00\xcf\xb1\x1b\x0c\xad\xb7\xff\xbd\xf6\xf8\x9f\x3a\x47\x1c\x59" + "\xb4\x77\xd5\x02\xe4\xd8\x48\x7f\x42\xdf\xe3\x8e\xed\x73\x81\x7a\xda\x91\xb7\xe2\x63\xd2\x91\x71\xb6\x5c\x44\x3a\x01\x2a\x41\x22", 64) == 0; + + cryptonight_free_ctx(ctx0); + cryptonight_free_ctx(ctx1); + + if(!bResult) + printer::inst()->print_msg(L0, + "Cryptonight hash self-test failed. 
This might be caused by bad compiler optimizations."); + + return bResult; +} + +std::vector<IBackend*> minethd::thread_starter(uint32_t threadOffset, miner_work& pWork) +{ + std::vector<IBackend*> pvThreads; + + if(!ConfigEditor::file_exist(Params::inst().configFileCPU)) + { + autoAdjust adjust; + if(!adjust.printConfig()) + return pvThreads; + } + + if(!jconf::inst()->parse_config()) + { + win_exit(); + } + + + //Launch the requested number of single and double threads, to distribute + //load evenly we need to alternate single and double threads + size_t i, n = jconf::inst()->GetThreadCount(); + pvThreads.reserve(n); + + jconf::thd_cfg cfg; + for (i = 0; i < n; i++) + { + jconf::inst()->GetThreadConfig(i, cfg); + + // \todo need thread offset + minethd* thd = new minethd(pWork, i + threadOffset, cfg.bDoubleMode, cfg.bNoPrefetch, cfg.iCpuAff); + pvThreads.push_back(thd); + + if(cfg.iCpuAff >= 0) + printer::inst()->print_msg(L1, "Starting %s thread, affinity: %d.", cfg.bDoubleMode ? "double" : "single", (int)cfg.iCpuAff); + else + printer::inst()->print_msg(L1, "Starting %s thread, no affinity.", cfg.bDoubleMode ? "double" : "single"); + } + + return pvThreads; +} + +void minethd::consume_work() +{ + memcpy(&oWork, &GlobalStates::inst().inst().oGlobalWork, sizeof(miner_work)); + iJobNo++; + GlobalStates::inst().inst().iConsumeCnt++; +} + +minethd::cn_hash_fun minethd::func_selector(bool bHaveAes, bool bNoPrefetch) +{ + // We have two independent flag bits in the functions + // therefore we will build a binary digit and select the + // function as a two digit binary + // Digit order SOFT_AES, NO_PREFETCH + + static const cn_hash_fun func_table[4] = { + cryptonight_hash<0x80000, MEMORY, false, false>, + cryptonight_hash<0x80000, MEMORY, false, true>, + cryptonight_hash<0x80000, MEMORY, true, false>, + cryptonight_hash<0x80000, MEMORY, true, true> + }; + + std::bitset<2> digit; + digit.set(0, !bNoPrefetch); + digit.set(1, !bHaveAes); + + return func_table[digit.to_ulong()]; +} + +void minethd::pin_thd_affinity() +{ + //Lock is needed because we need to use oWorkThd + std::lock_guard<std::mutex> lock(work_thd_mtx); + + // pin memory to NUMA node + bindMemoryToNUMANode(affinity); + +#if defined(__APPLE__) + printer::inst()->print_msg(L1, "WARNING on MacOS thread affinity is only advisory."); +#endif + thd_setaffinity(oWorkThd.native_handle(), affinity); +} + +void minethd::work_main() +{ + if(affinity >= 0) //-1 means no affinity + pin_thd_affinity(); + + cn_hash_fun hash_fun; + cryptonight_ctx* ctx; + uint64_t iCount = 0; + uint64_t* piHashVal; + uint32_t* piNonce; + job_result result; + + hash_fun = func_selector(::jconf::inst()->HaveHardwareAes(), bNoPrefetch); + ctx = minethd_alloc_ctx(); + + piHashVal = (uint64_t*)(result.bResult + 24); + piNonce = (uint32_t*)(oWork.bWorkBlob + 39); + GlobalStates::inst().inst().iConsumeCnt++; + + while (bQuit == 0) + { + if (oWork.bStall) + { + /* We are stalled here because the executor didn't find a job for us yet, + either because of network latency, or a socket problem. 
Since we are + raison d'etre of this software it us sensible to just wait until we have something*/ + + while (GlobalStates::inst().inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo) + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + + consume_work(); + continue; + } + + if(oWork.bNiceHash) + result.iNonce = calc_nicehash_nonce(*piNonce, oWork.iResumeCnt); + else + result.iNonce = calc_start_nonce(oWork.iResumeCnt); + + assert(sizeof(job_result::sJobID) == sizeof(pool_job::sJobID)); + memcpy(result.sJobID, oWork.sJobID, sizeof(job_result::sJobID)); + + while(GlobalStates::inst().inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo) + { + if ((iCount & 0xF) == 0) //Store stats every 16 hashes + { + using namespace std::chrono; + uint64_t iStamp = time_point_cast<milliseconds>(high_resolution_clock::now()).time_since_epoch().count(); + iHashCount.store(iCount, std::memory_order_relaxed); + iTimestamp.store(iStamp, std::memory_order_relaxed); + } + iCount++; + + *piNonce = ++result.iNonce; + + hash_fun(oWork.bWorkBlob, oWork.iWorkSize, result.bResult, ctx); + + if (*piHashVal < oWork.iTarget) + executor::inst()->push_event(ex_event(result, oWork.iPoolId)); + + std::this_thread::yield(); + } + + consume_work(); + } + + cryptonight_free_ctx(ctx); +} + +minethd::cn_hash_fun_dbl minethd::func_dbl_selector(bool bHaveAes, bool bNoPrefetch) +{ + // We have two independent flag bits in the functions + // therefore we will build a binary digit and select the + // function as a two digit binary + // Digit order SOFT_AES, NO_PREFETCH + + static const cn_hash_fun_dbl func_table[4] = { + cryptonight_double_hash<0x80000, MEMORY, false, false>, + cryptonight_double_hash<0x80000, MEMORY, false, true>, + cryptonight_double_hash<0x80000, MEMORY, true, false>, + cryptonight_double_hash<0x80000, MEMORY, true, true> + }; + + std::bitset<2> digit; + digit.set(0, !bNoPrefetch); + digit.set(1, !bHaveAes); + + return func_table[digit.to_ulong()]; +} + +void minethd::double_work_main() +{ + if(affinity >= 0) //-1 means no affinity + pin_thd_affinity(); + + cn_hash_fun_dbl hash_fun; + cryptonight_ctx* ctx0; + cryptonight_ctx* ctx1; + uint64_t iCount = 0; + uint64_t *piHashVal0, *piHashVal1; + uint32_t *piNonce0, *piNonce1; + uint8_t bDoubleHashOut[64]; + uint8_t bDoubleWorkBlob[sizeof(miner_work::bWorkBlob) * 2]; + uint32_t iNonce; + job_result res; + + hash_fun = func_dbl_selector(::jconf::inst()->HaveHardwareAes(), bNoPrefetch); + ctx0 = minethd_alloc_ctx(); + ctx1 = minethd_alloc_ctx(); + + piHashVal0 = (uint64_t*)(bDoubleHashOut + 24); + piHashVal1 = (uint64_t*)(bDoubleHashOut + 32 + 24); + piNonce0 = (uint32_t*)(bDoubleWorkBlob + 39); + piNonce1 = nullptr; + + GlobalStates::inst().inst().iConsumeCnt++; + + while (bQuit == 0) + { + if (oWork.bStall) + { + /* We are stalled here because the executor didn't find a job for us yet, + either because of network latency, or a socket problem. 
Since we are + raison d'etre of this software it us sensible to just wait until we have something*/ + + while (GlobalStates::inst().inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo) + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + + consume_work(); + memcpy(bDoubleWorkBlob, oWork.bWorkBlob, oWork.iWorkSize); + memcpy(bDoubleWorkBlob + oWork.iWorkSize, oWork.bWorkBlob, oWork.iWorkSize); + piNonce1 = (uint32_t*)(bDoubleWorkBlob + oWork.iWorkSize + 39); + continue; + } + + if(oWork.bNiceHash) + iNonce = calc_nicehash_nonce(*piNonce0, oWork.iResumeCnt); + else + iNonce = calc_start_nonce(oWork.iResumeCnt); + + assert(sizeof(job_result::sJobID) == sizeof(pool_job::sJobID)); + + while (GlobalStates::inst().inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo) + { + if ((iCount & 0x7) == 0) //Store stats every 16 hashes + { + using namespace std::chrono; + uint64_t iStamp = time_point_cast<milliseconds>(high_resolution_clock::now()).time_since_epoch().count(); + iHashCount.store(iCount, std::memory_order_relaxed); + iTimestamp.store(iStamp, std::memory_order_relaxed); + } + + iCount += 2; + + *piNonce0 = ++iNonce; + *piNonce1 = ++iNonce; + + hash_fun(bDoubleWorkBlob, oWork.iWorkSize, bDoubleHashOut, ctx0, ctx1); + + if (*piHashVal0 < oWork.iTarget) + executor::inst()->push_event(ex_event(job_result(oWork.sJobID, iNonce-1, bDoubleHashOut), oWork.iPoolId)); + + if (*piHashVal1 < oWork.iTarget) + executor::inst()->push_event(ex_event(job_result(oWork.sJobID, iNonce, bDoubleHashOut + 32), oWork.iPoolId)); + + std::this_thread::yield(); + } + + consume_work(); + memcpy(bDoubleWorkBlob, oWork.bWorkBlob, oWork.iWorkSize); + memcpy(bDoubleWorkBlob + oWork.iWorkSize, oWork.bWorkBlob, oWork.iWorkSize); + piNonce1 = (uint32_t*)(bDoubleWorkBlob + oWork.iWorkSize + 39); + } + + cryptonight_free_ctx(ctx0); + cryptonight_free_ctx(ctx1); +} + +} // namespace cpu +} // namepsace xmrstak diff --git a/xmrstak/backend/cpu/minethd.hpp b/xmrstak/backend/cpu/minethd.hpp new file mode 100644 index 0000000..40383cf --- /dev/null +++ b/xmrstak/backend/cpu/minethd.hpp @@ -0,0 +1,74 @@ +#pragma once +#include <thread> +#include <vector> +#include <atomic> +#include <mutex> +#include "../../crypto/cryptonight.h" +#include "../miner_work.h" +#include "../IBackend.hpp" +#include "../GlobalStates.hpp" +#include <iostream> + +namespace xmrstak +{ +namespace cpu +{ + +class minethd : public IBackend +{ +public: + static std::vector<IBackend*> thread_starter(uint32_t threadOffset, miner_work& pWork); + static bool self_test(); + + typedef void (*cn_hash_fun)(const void*, size_t, void*, cryptonight_ctx*); + + static cn_hash_fun func_selector(bool bHaveAes, bool bNoPrefetch); + static void thd_setaffinity(std::thread::native_handle_type h, uint64_t cpu_id); + + static cryptonight_ctx* minethd_alloc_ctx(); + +private: + + typedef void (*cn_hash_fun_dbl)(const void*, size_t, void*, cryptonight_ctx* __restrict, cryptonight_ctx* __restrict); + static cn_hash_fun_dbl func_dbl_selector(bool bHaveAes, bool bNoPrefetch); + + minethd(miner_work& pWork, size_t iNo, bool double_work, bool no_prefetch, int64_t affinity); + + // We use the top 10 bits of the nonce for thread and resume + // This allows us to resume up to 128 threads 4 times before + // we get nonce collisions + // Bottom 22 bits allow for an hour of work at 1000 H/s + inline uint32_t calc_start_nonce(uint32_t resume) + { + return reverseBits<uint32_t>(static_cast<uint32_t>(iThreadNo + GlobalStates::inst().iThreadCount * resume)); + } + + // 
Limited version of the nonce calc above + inline uint32_t calc_nicehash_nonce(uint32_t start, uint32_t resume) + { + return start | ( ( reverseBits<uint32_t>(static_cast<uint32_t>(iThreadNo + GlobalStates::inst().iThreadCount * resume)) >> 4u ) ); + } + + void work_main(); + void double_work_main(); + void consume_work(); + + uint64_t iJobNo; + + static miner_work oGlobalWork; + miner_work oWork; + + void pin_thd_affinity(); + // Held by the creating context to prevent a race cond with oWorkThd = std::thread(...) + std::mutex work_thd_mtx; + + std::thread oWorkThd; + uint8_t iThreadNo; + int64_t affinity; + + bool bQuit; + bool bNoPrefetch; +}; + +} // namespace cpu +} // namepsace xmrstak diff --git a/xmrstak/backend/globalStates.cpp b/xmrstak/backend/globalStates.cpp new file mode 100644 index 0000000..5251da8 --- /dev/null +++ b/xmrstak/backend/globalStates.cpp @@ -0,0 +1,51 @@ +/* + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * Additional permission under GNU GPL version 3 section 7 + * + * If you modify this Program, or any covered work, by linking or combining + * it with OpenSSL (or a modified version of that library), containing parts + * covered by the terms of OpenSSL License and SSLeay License, the licensors + * of this Program grant you additional permission to convey the resulting work. + * + */ + +#include <assert.h> +#include <cmath> +#include <chrono> +#include <cstring> + + +#include "miner_work.h" +#include "GlobalStates.hpp" + +namespace xmrstak +{ + + +void GlobalStates::switch_work(miner_work& pWork) +{ + // iConsumeCnt is a basic lock-like polling mechanism just in case we happen to push work + // faster than threads can consume them. This should never happen in real life. + // Pool cant physically send jobs faster than every 250ms or so due to net latency. 
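	// The handshake with the worker threads: wait until every thread has
	// consumed the previous job (iConsumeCnt reaches iThreadCount), publish the
	// new job into oGlobalWork, reset the counter and bump iGlobalJobNo; the
	// workers poll iGlobalJobNo and call consume_work() once it changes.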
+ + while (iConsumeCnt.load(std::memory_order_seq_cst) < iThreadCount) + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + + oGlobalWork = pWork; + iConsumeCnt.store(0, std::memory_order_seq_cst); + iGlobalJobNo++; +} + +} // namepsace xmrstak diff --git a/xmrstak/backend/globalStates.hpp b/xmrstak/backend/globalStates.hpp new file mode 100644 index 0000000..a9818ba --- /dev/null +++ b/xmrstak/backend/globalStates.hpp @@ -0,0 +1,35 @@ +#pragma once +#include <atomic> +#include "miner_work.h" +#include "../Environment.hpp" + +namespace xmrstak +{ + +struct GlobalStates +{ + + static inline GlobalStates& inst() + { + auto& env = Environment::inst(); + if(env.pGlobalStates == nullptr) + env.pGlobalStates = new GlobalStates; + return *env.pGlobalStates; + } + + void switch_work(miner_work& pWork); + + miner_work oGlobalWork; + std::atomic<uint64_t> iGlobalJobNo; + std::atomic<uint64_t> iConsumeCnt; + uint64_t iThreadCount; + + private: + + GlobalStates() : iThreadCount(0) + { + } + +}; + +} // namepsace xmrstak diff --git a/xmrstak/backend/iBackend.hpp b/xmrstak/backend/iBackend.hpp new file mode 100644 index 0000000..5037028 --- /dev/null +++ b/xmrstak/backend/iBackend.hpp @@ -0,0 +1,20 @@ +#pragma once + +#include <atomic> +#include <cstdint> + + +namespace xmrstak +{ + + struct IBackend + { + std::atomic<uint64_t> iHashCount; + std::atomic<uint64_t> iTimestamp; + + IBackend() : iHashCount(0), iTimestamp(0) + { + } + }; + +} // namepsace xmrstak diff --git a/xmrstak/backend/miner_work.hpp b/xmrstak/backend/miner_work.hpp new file mode 100644 index 0000000..8349fda --- /dev/null +++ b/xmrstak/backend/miner_work.hpp @@ -0,0 +1,103 @@ +#pragma once +#include <thread> +#include <atomic> +#include <mutex> +#include <cstdint> +#include <climits> +#include <iostream> +#include <cassert> + +namespace xmrstak +{ + // only allowed for unsigned value \todo add static assert + template<typename T> + T reverseBits(T value) + { + /* init with value (to get LSB) */ + T result = value; + /* extra shift needed at end */ + int s = sizeof(T) * CHAR_BIT - 1; + for (value >>= 1; value; value >>= 1) + { + result <<= 1; + result |= value & 1; + s--; + } + /* shift when values highest bits are zero */ + result <<= s; + return result; + } + + struct miner_work + { + char sJobID[64]; + uint8_t bWorkBlob[112]; + uint32_t iWorkSize; + uint32_t iResumeCnt; + uint64_t iTarget; + // \todo remove workaround needed for amd + uint32_t iTarget32; + bool bNiceHash; + bool bStall; + size_t iPoolId; + + miner_work() : iWorkSize(0), bNiceHash(false), bStall(true), iPoolId(0) { } + + miner_work(const char* sJobID, const uint8_t* bWork, uint32_t iWorkSize, uint32_t iResumeCnt, + uint64_t iTarget, size_t iPoolId) : iWorkSize(iWorkSize), iResumeCnt(iResumeCnt), + iTarget(iTarget), bNiceHash(false), bStall(false), iPoolId(iPoolId) + { + assert(iWorkSize <= sizeof(bWorkBlob)); + memcpy(this->sJobID, sJobID, sizeof(miner_work::sJobID)); + memcpy(this->bWorkBlob, bWork, iWorkSize); + } + + miner_work(miner_work const&) = delete; + + miner_work& operator=(miner_work const& from) + { + assert(this != &from); + + iWorkSize = from.iWorkSize; + iResumeCnt = from.iResumeCnt; + iTarget = from.iTarget; + iTarget32 = from.iTarget32; + bNiceHash = from.bNiceHash; + bStall = from.bStall; + iPoolId = from.iPoolId; + + assert(iWorkSize <= sizeof(bWorkBlob)); + memcpy(sJobID, from.sJobID, sizeof(sJobID)); + memcpy(bWorkBlob, from.bWorkBlob, iWorkSize); + + return *this; + } + + miner_work(miner_work&& from) : iWorkSize(from.iWorkSize), 
iTarget(from.iTarget),iTarget32(from.iTarget32), + bStall(from.bStall), iPoolId(from.iPoolId) + { + assert(iWorkSize <= sizeof(bWorkBlob)); + memcpy(sJobID, from.sJobID, sizeof(sJobID)); + memcpy(bWorkBlob, from.bWorkBlob, iWorkSize); + } + + miner_work& operator=(miner_work&& from) + { + assert(this != &from); + + iWorkSize = from.iWorkSize; + iResumeCnt = from.iResumeCnt; + iTarget = from.iTarget; + iTarget32 = from.iTarget32; + bNiceHash = from.bNiceHash; + bStall = from.bStall; + iPoolId = from.iPoolId; + + assert(iWorkSize <= sizeof(bWorkBlob)); + memcpy(sJobID, from.sJobID, sizeof(sJobID)); + memcpy(bWorkBlob, from.bWorkBlob, iWorkSize); + + return *this; + } + }; +} // namepsace xmrstak diff --git a/xmrstak/backend/nvidia/autoAdjust.hpp b/xmrstak/backend/nvidia/autoAdjust.hpp new file mode 100644 index 0000000..84c6dfc --- /dev/null +++ b/xmrstak/backend/nvidia/autoAdjust.hpp @@ -0,0 +1,113 @@ + +#pragma once + +#include "autoAdjust.hpp" + +#include "nvcc_code/cryptonight.h" +#include "jconf.h" +#include "../../console.h" +#include "../../ConfigEditor.hpp" +#include "../../Params.hpp" + +#include <vector> +#include <cstdio> +#include <sstream> +#include <string> + + +namespace xmrstak +{ +namespace nvidia +{ + +class autoAdjust +{ +public: + + autoAdjust() + { + + } + + /** print the adjusted values if needed + * + * Routine exit the application and print the adjusted values if needed else + * nothing is happened. + */ + bool printConfig() + { + int deviceCount = 0; + if(cuda_get_devicecount(&deviceCount) == 0) + std::exit(0); + // evaluate config parameter for if auto adjustment is needed + // evaluate config parameter for if auto adjustment is needed + for(int i = 0; i < deviceCount; i++) + { + + nvid_ctx ctx; + + ctx.device_id = i; + // -1 trigger auto adjustment + ctx.device_blocks = -1; + ctx.device_threads = -1; + + // set all evice option those marked as auto (-1) to a valid value +#ifndef _WIN32 + ctx.device_bfactor = 0; + ctx.device_bsleep = 0; +#else + // windows pass, try to avoid that windows kills the miner if the gpu is blocked for 2 seconds + ctx.device_bfactor = 6; + ctx.device_bsleep = 25; +#endif + if( cuda_get_deviceinfo(&ctx) != 1 ) + { + printer::inst()->print_msg(L0, "Setup failed for GPU %d. 
Exitting.\n", i); + std::exit(0); + } + nvidCtxVec.push_back(ctx); + + } + + generateThreadConfig(); + return true; + + } + +private: + + void generateThreadConfig() + { + // load the template of the backend config into a char variable + const char *tpl = + #include "./config.tpl" + ; + + ConfigEditor configTpl{}; + configTpl.set( std::string(tpl) ); + + constexpr size_t byte2mib = 1024u * 1024u; + std::string conf; + int i = 0; + for(auto& ctx : nvidCtxVec) + { + conf += std::string(" // gpu: ") + ctx.name + " architecture: " + std::to_string(ctx.device_arch[0] * 10 + ctx.device_arch[1]) + "\n"; + conf += std::string(" // memory: ") + std::to_string(ctx.free_device_memory / byte2mib) + "/" + std::to_string(ctx.total_device_memory / byte2mib) + " MiB\n"; + conf += std::string(" { \"index\" : ") + std::to_string(ctx.device_id) + ",\n" + + " \"threads\" : " + std::to_string(ctx.device_threads) + ", \"blocks\" : " + std::to_string(ctx.device_blocks) + ",\n" + + " \"bfactor\" : " + std::to_string(ctx.device_bfactor) + ", \"bsleep\" : " + std::to_string(ctx.device_bsleep) + ",\n" + + " \"affine_to_cpu\" : false,\n" + + " },\n"; + ++i; + } + + configTpl.replace("GPUCONFIG",conf); + configTpl.write(Params::inst().configFileNVIDIA); + printer::inst()->print_msg(L0, "NVIDIA: GPU configuration stored in file '%s'", Params::inst().configFileNVIDIA.c_str()); + } + + std::vector<nvid_ctx> nvidCtxVec; +}; + +} // namespace nvidia +} // namepsace xmrstak diff --git a/xmrstak/backend/nvidia/config.tpl b/xmrstak/backend/nvidia/config.tpl new file mode 100644 index 0000000..99dc023 --- /dev/null +++ b/xmrstak/backend/nvidia/config.tpl @@ -0,0 +1,28 @@ +R"===( +/* + * GPU configuration. You should play around with threads and blocks as the fastest settings will vary. + * index - GPU index number usually starts from 0. + * threads - Number of GPU threads (nothing to do with CPU threads). + * blocks - Number of GPU blocks (nothing to do with CPU threads). + * bfactor - Enables running the Cryptonight kernel in smaller pieces. + * Increase if you want to reduce GPU lag. Recommended setting on GUI systems - 8 + * bsleep - Insert a delay of X microseconds between kernel launches. + * Increase if you want to reduce GPU lag. Recommended setting on GUI systems - 100 + * affine_to_cpu - This will affine the thread to a CPU. This can make a GPU miner play along nicer with a CPU miner. + * + * On the first run the miner will look at your system and suggest a basic configuration that will work, + * you can try to tweak it from there to get the best performance. + * + * A filled out configuration should look like this: + * "gpu_threads_conf" : + * [ + * { "index" : 0, "threads" : 17, "blocks" : 60, "bfactor" : 0, "bsleep" : 0, "affine_to_cpu" : false}, + * ], + */ + +"gpu_threads_conf" : +[ +GPUCONFIG +], + +)===" diff --git a/xmrstak/backend/nvidia/jconf.cpp b/xmrstak/backend/nvidia/jconf.cpp new file mode 100644 index 0000000..2184acd --- /dev/null +++ b/xmrstak/backend/nvidia/jconf.cpp @@ -0,0 +1,274 @@ +/* + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * Additional permission under GNU GPL version 3 section 7 + * + * If you modify this Program, or any covered work, by linking or combining + * it with OpenSSL (or a modified version of that library), containing parts + * covered by the terms of OpenSSL License and SSLeay License, the licensors + * of this Program grant you additional permission to convey the resulting work. + * + */ + +#include "jconf.h" + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#ifdef _WIN32 +#define strcasecmp _stricmp +#include <intrin.h> +#else +#include <cpuid.h> +#endif + +#include "../../rapidjson/document.h" +#include "../../rapidjson/error/en.h" +#include "../../jext.h" +#include "../../console.h" + +namespace xmrstak +{ +namespace nvidia +{ + +using namespace rapidjson; + +/* + * This enum needs to match index in oConfigValues, otherwise we will get a runtime error + */ +enum configEnum { aGpuThreadsConf }; + +struct configVal { + configEnum iName; + const char* sName; + Type iType; +}; + +// Same order as in configEnum, as per comment above +// kNullType means any type +configVal oConfigValues[] = { + { aGpuThreadsConf, "gpu_threads_conf", kNullType } +}; + +inline bool checkType(Type have, Type want) +{ + if(want == have) + return true; + else if(want == kNullType) + return true; + else if(want == kTrueType && have == kFalseType) + return true; + else if(want == kFalseType && have == kTrueType) + return true; + else + return false; +} + +constexpr size_t iConfigCnt = (sizeof(oConfigValues)/sizeof(oConfigValues[0])); + + + +struct jconf::opaque_private +{ + Document jsonDoc; + const Value* configValues[iConfigCnt]; //Compile time constant + + opaque_private() + { + } +}; + + +bool jconf::NeedsAutoconf() +{ + return !prv->configValues[aGpuThreadsConf]->IsArray(); +} + +jconf* jconf::oInst = nullptr; + +jconf::jconf() +{ + prv = new opaque_private(); +} + +size_t jconf::GetGPUThreadCount() +{ + if(prv->configValues[aGpuThreadsConf]->IsArray()) + return prv->configValues[aGpuThreadsConf]->Size(); + else + return 0; +} + +bool jconf::GetGPUThreadConfig(size_t id, thd_cfg &cfg) +{ + if(!prv->configValues[aGpuThreadsConf]->IsArray()) + return false; + + if(id >= prv->configValues[aGpuThreadsConf]->Size()) + return false; + + const Value& oThdConf = prv->configValues[aGpuThreadsConf]->GetArray()[id]; + + if(!oThdConf.IsObject()) + return false; + + const Value *gid, *blocks, *threads, *bfactor, *bsleep, *aff; + gid = GetObjectMember(oThdConf, "index"); + blocks = GetObjectMember(oThdConf, "blocks"); + threads = GetObjectMember(oThdConf, "threads"); + bfactor = GetObjectMember(oThdConf, "bfactor"); + bsleep = GetObjectMember(oThdConf, "bsleep"); + aff = GetObjectMember(oThdConf, "affine_to_cpu"); + + if(gid == nullptr || blocks == nullptr || threads == nullptr || + bfactor == nullptr || bsleep == nullptr || aff == nullptr) + { + return false; + } + + if(!gid->IsNumber() || gid->GetInt() < 0) + return false; + + if(!blocks->IsNumber() || blocks->GetInt() < 0) + return false; + + if(!threads->IsNumber() || threads->GetInt() < 0) + return false; + + if(!bfactor->IsNumber() || bfactor->GetInt() < 0) + return false; + + if(!bsleep->IsNumber() || bsleep->GetInt() < 0) + return false; + + if(!aff->IsUint64() && !aff->IsBool()) + return false; + + cfg.id = gid->GetInt(); + cfg.blocks = blocks->GetInt(); + cfg.threads = threads->GetInt(); + cfg.bfactor = 
bfactor->GetInt(); + cfg.bsleep = bsleep->GetInt(); + + if(aff->IsNumber()) + cfg.cpu_aff = aff->GetInt(); + else + cfg.cpu_aff = -1; + + return true; +} + +bool jconf::parse_config(const char* sFilename) +{ + FILE * pFile; + char * buffer; + size_t flen; + + pFile = fopen(sFilename, "rb"); + if (pFile == NULL) + { + printer::inst()->print_msg(L0, "Failed to open config file %s.", sFilename); + return false; + } + + fseek(pFile,0,SEEK_END); + flen = ftell(pFile); + rewind(pFile); + + if(flen >= 64*1024) + { + fclose(pFile); + printer::inst()->print_msg(L0, "Oversized config file - %s.", sFilename); + return false; + } + + if(flen <= 16) + { + fclose(pFile); + printer::inst()->print_msg(L0, "File is empty or too short - %s.", sFilename); + return false; + } + + buffer = (char*)malloc(flen + 3); + if(fread(buffer+1, flen, 1, pFile) != 1) + { + free(buffer); + fclose(pFile); + printer::inst()->print_msg(L0, "Read error while reading %s.", sFilename); + return false; + } + fclose(pFile); + + //Replace Unicode BOM with spaces - we always use UTF-8 + unsigned char* ubuffer = (unsigned char*)buffer; + if(ubuffer[1] == 0xEF && ubuffer[2] == 0xBB && ubuffer[3] == 0xBF) + { + buffer[1] = ' '; + buffer[2] = ' '; + buffer[3] = ' '; + } + + buffer[0] = '{'; + buffer[flen] = '}'; + buffer[flen + 1] = '\0'; + + prv->jsonDoc.Parse<kParseCommentsFlag|kParseTrailingCommasFlag>(buffer, flen+2); + free(buffer); + + if(prv->jsonDoc.HasParseError()) + { + printer::inst()->print_msg(L0, "JSON config parse error(offset %llu): %s", + int_port(prv->jsonDoc.GetErrorOffset()), GetParseError_En(prv->jsonDoc.GetParseError())); + return false; + } + + + if(!prv->jsonDoc.IsObject()) + { //This should never happen as we created the root ourselves + printer::inst()->print_msg(L0, "Invalid config file. No root?\n"); + return false; + } + + for(size_t i = 0; i < iConfigCnt; i++) + { + if(oConfigValues[i].iName != i) + { + printer::inst()->print_msg(L0, "Code error. oConfigValues are not in order. %s",oConfigValues[i].sName); + return false; + } + + prv->configValues[i] = GetObjectMember(prv->jsonDoc, oConfigValues[i].sName); + + if(prv->configValues[i] == nullptr) + { + printer::inst()->print_msg(L0, "Invalid config file. Missing value \"%s\".", oConfigValues[i].sName); + return false; + } + + if(!checkType(prv->configValues[i]->GetType(), oConfigValues[i].iType)) + { + printer::inst()->print_msg(L0, "Invalid config file. Value \"%s\" has unexpected type.", oConfigValues[i].sName); + return false; + } + } + if(NeedsAutoconf()) + return true; + + return true; +} + +} // namespace nvidia +} // namespace xmrstak
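As a point of reference for the parser above, here is a minimal, illustrative sketch of how a backend could consume the parsed values. The config snippet, the file name `nvidia.txt`, and the `dump_gpu_threads()` helper are assumptions made for this example only; they are not part of the commit.

// illustrative nvidia config accepted by jconf::parse_config()
// (roughly what autoAdjust writes into the template):
//
//   "gpu_threads_conf" :
//   [
//     { "index" : 0, "threads" : 32, "blocks" : 24,
//       "bfactor" : 6, "bsleep" : 25, "affine_to_cpu" : false },
//   ],

#include <cstdio>
#include "xmrstak/backend/nvidia/jconf.hpp"

void dump_gpu_threads()
{
	using xmrstak::nvidia::jconf;

	// parse_config() defaults to Params::inst().configFileNVIDIA;
	// an explicit path is passed here only for the example
	if(!jconf::inst()->parse_config("nvidia.txt"))
		return;

	for(size_t i = 0; i < jconf::inst()->GetGPUThreadCount(); i++)
	{
		jconf::thd_cfg cfg;
		if(jconf::inst()->GetGPUThreadConfig(i, cfg))
			std::printf("gpu %u: %u blocks x %u threads (bfactor %u, bsleep %u)\n",
				cfg.id, cfg.blocks, cfg.threads, cfg.bfactor, cfg.bsleep);
	}
}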
\ No newline at end of file diff --git a/xmrstak/backend/nvidia/jconf.hpp b/xmrstak/backend/nvidia/jconf.hpp new file mode 100644 index 0000000..8959088 --- /dev/null +++ b/xmrstak/backend/nvidia/jconf.hpp @@ -0,0 +1,51 @@ +#pragma once +#include <stdlib.h> +#include <string> +#include "../../Params.hpp" + +namespace xmrstak +{ +namespace nvidia +{ + +class jconf +{ +public: + static jconf* inst() + { + if (oInst == nullptr) oInst = new jconf; + return oInst; + }; + + bool parse_config(const char* sFilename = Params::inst().configFileNVIDIA.c_str()); + + struct thd_cfg { + uint32_t id; + uint32_t blocks; + uint32_t threads; + uint32_t bfactor; + uint32_t bsleep; + bool bDoubleMode; + bool bNoPrefetch; + int32_t cpu_aff; + + long long iCpuAff; + }; + + size_t GetGPUThreadCount(); + + bool GetGPUThreadConfig(size_t id, thd_cfg &cfg); + + bool NeedsAutoconf(); + +private: + jconf(); + static jconf* oInst; + + struct opaque_private; + opaque_private* prv; + +}; + +} // namespace nvidia +} // namepsace xmrstak diff --git a/xmrstak/backend/nvidia/minethd.cpp b/xmrstak/backend/nvidia/minethd.cpp new file mode 100644 index 0000000..cbee219 --- /dev/null +++ b/xmrstak/backend/nvidia/minethd.cpp @@ -0,0 +1,273 @@ +/* + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * Additional permission under GNU GPL version 3 section 7 + * + * If you modify this Program, or any covered work, by linking or combining + * it with OpenSSL (or a modified version of that library), containing parts + * covered by the terms of OpenSSL License and SSLeay License, the licensors + * of this Program grant you additional permission to convey the resulting work. 
+ * + */ + +#include <assert.h> +#include <cmath> +#include <chrono> +#include <thread> +#include <bitset> +#include <vector> +#include "../../console.h" +#include "../../crypto/cryptonight_aesni.h" +#include "../cpu/minethd.h" +#include "../../Params.hpp" + +#include "../../executor.h" +#include "minethd.h" +#include "../../jconf.h" +#include "../../crypto/cryptonight.h" +#include "../../Environment.hpp" +#include "autoAdjust.hpp" + + +#ifndef USE_PRECOMPILED_HEADERS +#ifdef WIN32 +#include <direct.h> +#include <windows.h> +#else +#include <sys/types.h> +#include <dlfcn.h> +#endif +#include <iostream> +#endif + +namespace xmrstak +{ +namespace nvidia +{ + +#ifdef WIN32 + HINSTANCE lib_handle; +#else + void *lib_handle; +#endif + +minethd::minethd(miner_work& pWork, size_t iNo, const jconf::thd_cfg& cfg) +{ + oWork = pWork; + bQuit = 0; + iThreadNo = (uint8_t)iNo; + iJobNo = 0; + + ctx.device_id = (int)cfg.id; + ctx.device_blocks = (int)cfg.blocks; + ctx.device_threads = (int)cfg.threads; + ctx.device_bfactor = (int)cfg.bfactor; + ctx.device_bsleep = (int)cfg.bsleep; + + oWorkThd = std::thread(&minethd::work_main, this); +} + + +bool minethd::self_test() +{ + cryptonight_ctx* ctx0; + unsigned char out[32]; + bool bResult = true; + + ctx0 = new cryptonight_ctx; + if(::jconf::inst()->HaveHardwareAes()) + { + //cryptonight_hash_ctx("This is a test", 14, out, ctx0); + bResult = memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0; + } + else + { + //cryptonight_hash_ctx_soft("This is a test", 14, out, ctx0); + bResult = memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0; + } + delete ctx0; + + //if(!bResult) + // printer::inst()->print_msg(L0, + // "Cryptonight hash self-test failed. 
This might be caused by bad compiler optimizations."); + + return bResult; +} + + +extern "C" +{ +#ifdef WIN32 +__declspec(dllexport) +#endif +std::vector<IBackend*>* xmrstak_start_backend(uint32_t threadOffset, miner_work& pWork, Environment& env) +{ + Environment::inst() = env; + return nvidia::minethd::thread_starter(threadOffset, pWork); +} +} // extern "C" + +std::vector<IBackend*>* minethd::thread_starter(uint32_t threadOffset, miner_work& pWork) +{ + std::vector<IBackend*>* pvThreads = new std::vector<IBackend*>(); + + if(!ConfigEditor::file_exist(Params::inst().configFileNVIDIA)) + { + autoAdjust adjust; + if(!adjust.printConfig()) + return pvThreads; + } + + if(!jconf::inst()->parse_config()) + { + win_exit(); + } + + int deviceCount = 0; + if(cuda_get_devicecount(&deviceCount) != 1) + { + std::cout<<"WARNING: NVIDIA no device found"<<std::endl; + return pvThreads; + } + + size_t i, n = jconf::inst()->GetGPUThreadCount(); + pvThreads->reserve(n); + + jconf::thd_cfg cfg; + for (i = 0; i < n; i++) + { + jconf::inst()->GetGPUThreadConfig(i, cfg); + minethd* thd = new minethd(pWork, i + threadOffset, cfg); + + if(cfg.cpu_aff >= 0) + { +#if defined(__APPLE__) + printer::inst()->print_msg(L1, "WARNING on MacOS thread affinity is only advisory."); +#endif + cpu::minethd::thd_setaffinity(thd->oWorkThd.native_handle(), cfg.cpu_aff); + } + + pvThreads->push_back(thd); + + if(cfg.cpu_aff >= 0) + printer::inst()->print_msg(L1, "Starting GPU thread, affinity: %d.", (int)cfg.cpu_aff); + else + printer::inst()->print_msg(L1, "Starting GPU thread, no affinity."); + } + + return pvThreads; +} + +void minethd::switch_work(miner_work& pWork) +{ + // iConsumeCnt is a basic lock-like polling mechanism just in case we happen to push work + // faster than threads can consume them. This should never happen in real life. + // Pool cant physically send jobs faster than every 250ms or so due to net latency. + + while (GlobalStates::inst().iConsumeCnt.load(std::memory_order_seq_cst) < GlobalStates::inst().iThreadCount) + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + + GlobalStates::inst().oGlobalWork = pWork; + GlobalStates::inst().iConsumeCnt.store(0, std::memory_order_seq_cst); + GlobalStates::inst().iGlobalJobNo++; +} + +void minethd::consume_work() +{ + memcpy(&oWork, &GlobalStates::inst().oGlobalWork, sizeof(miner_work)); + iJobNo++; + GlobalStates::inst().iConsumeCnt++; +} + +void minethd::work_main() +{ + uint64_t iCount = 0; + uint32_t iNonce; + cryptonight_ctx* cpu_ctx; + cpu_ctx = cpu::minethd::minethd_alloc_ctx(); + cn_hash_fun hash_fun = cpu::minethd::func_selector(::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/); + + GlobalStates::inst().iConsumeCnt++; + + if(/*cuda_get_deviceinfo(&ctx) != 1 ||*/ cryptonight_extra_cpu_init(&ctx) != 1) + { + printer::inst()->print_msg(L0, "Setup failed for GPU %d. Exitting.\n", (int)iThreadNo); + std::exit(0); + } + + while (bQuit == 0) + { + if (oWork.bStall) + { + /* We are stalled here because the executor didn't find a job for us yet, + either because of network latency, or a socket problem. 
Since we are + raison d'etre of this software it us sensible to just wait until we have something*/ + + while (GlobalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo) + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + + consume_work(); + continue; + } + + cryptonight_extra_cpu_set_data(&ctx, oWork.bWorkBlob, oWork.iWorkSize); + iNonce = calc_start_nonce(oWork.iResumeCnt); + + assert(sizeof(job_result::sJobID) == sizeof(pool_job::sJobID)); + + while(GlobalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo) + { + + uint32_t foundNonce[10]; + uint32_t foundCount; + + cryptonight_extra_cpu_prepare(&ctx, iNonce); + cryptonight_core_cpu_hash(&ctx); + cryptonight_extra_cpu_final(&ctx, iNonce, oWork.iTarget, &foundCount, foundNonce); + + for(size_t i = 0; i < foundCount; i++) + { + + uint8_t bWorkBlob[112]; + uint8_t bResult[32]; + + memcpy(bWorkBlob, oWork.bWorkBlob, oWork.iWorkSize); + memset(bResult, 0, sizeof(job_result::bResult)); + + *(uint32_t*)(bWorkBlob + 39) = foundNonce[i]; + + hash_fun(bWorkBlob, oWork.iWorkSize, bResult, cpu_ctx); + if ( (*((uint64_t*)(bResult + 24))) < oWork.iTarget) + executor::inst()->push_event(ex_event(job_result(oWork.sJobID, foundNonce[i], bResult), oWork.iPoolId)); + else + executor::inst()->log_result_error("NVIDIA Invalid Result"); + } + + iCount += ctx.device_blocks * ctx.device_threads; + iNonce += ctx.device_blocks * ctx.device_threads; + + using namespace std::chrono; + uint64_t iStamp = time_point_cast<milliseconds>(high_resolution_clock::now()).time_since_epoch().count(); + iHashCount.store(iCount, std::memory_order_relaxed); + iTimestamp.store(iStamp, std::memory_order_relaxed); + std::this_thread::yield(); + } + + consume_work(); + } +} + +} // namespace xmrstak + +} //namespace nvidia diff --git a/xmrstak/backend/nvidia/minethd.hpp b/xmrstak/backend/nvidia/minethd.hpp new file mode 100644 index 0000000..9f3993e --- /dev/null +++ b/xmrstak/backend/nvidia/minethd.hpp @@ -0,0 +1,66 @@ +#pragma once +#include <thread> +#include <atomic> +#include <vector> +#include "nvcc_code/cryptonight.h" +#include "../../crypto/cryptonight.h" +#include "../../jconf.h" +#include "./jconf.h" +#include "../IBackend.hpp" +#include "../../Environment.hpp" +#include <iostream> + +namespace xmrstak +{ +namespace nvidia +{ + +class minethd : public IBackend +{ +public: + + static void switch_work(miner_work& pWork); + static std::vector<IBackend*>* thread_starter(uint32_t threadOffset, miner_work& pWork); + static bool self_test(); + +private: + typedef void (*cn_hash_fun)(const void*, size_t, void*, cryptonight_ctx*); + + minethd(miner_work& pWork, size_t iNo, const jconf::thd_cfg& cfg); + + // We use the top 10 bits of the nonce for thread and resume + // This allows us to resume up to 128 threads 4 times before + // we get nonce collisions + // Bottom 22 bits allow for an hour of work at 1000 H/s + inline uint32_t calc_start_nonce(uint32_t resume) + { + return reverseBits<uint32_t>(iThreadNo + GlobalStates::inst().iThreadCount * resume); + } + + // Limited version of the nonce calc above + inline uint32_t calc_nicehash_nonce(uint32_t start, uint32_t resume) + { + return start | ( ( reverseBits(iThreadNo + GlobalStates::inst().iThreadCount * resume) >> 4u ) ); + } + + void work_main(); + void consume_work(); + + static std::atomic<uint64_t> iGlobalJobNo; + static std::atomic<uint64_t> iConsumeCnt; + static uint64_t iThreadCount; + uint64_t iJobNo; + + static miner_work oGlobalWork; + miner_work oWork; + + std::thread 
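+	/* Worked example of the nonce layout documented above calc_start_nonce()
+	 * (the numbers are illustrative, not taken from this commit): with
+	 * GlobalStates::inst().iThreadCount == 4, thread 2 and resume 1 give
+	 * reverseBits<uint32_t>(2 + 4 * 1) == reverseBits<uint32_t>(6) == 0x60000000,
+	 * i.e. the small index is mirrored into the top bits of the 32 bit nonce.
+	 * work_main() then counts upward from that start value, so different
+	 * (thread, resume) pairs scan disjoint nonce ranges until the low bits wrap.
+	 */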
oWorkThd; + uint8_t iThreadNo; + + nvid_ctx ctx; + + bool bQuit; +}; + +} // namespace nvidia +} // namepsace xmrstak diff --git a/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp b/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp new file mode 100644 index 0000000..784c38d --- /dev/null +++ b/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp @@ -0,0 +1,48 @@ +#pragma once + +#include <stdint.h> +#include <string> + +typedef struct { + int device_id; + const char *device_name; + int device_arch[2]; + int device_mpcount; + int device_blocks; + int device_threads; + int device_bfactor; + int device_bsleep; + + uint32_t *d_input; + uint32_t inputlen; + uint32_t *d_result_count; + uint32_t *d_result_nonce; + uint32_t *d_long_state; + uint32_t *d_ctx_state; + uint32_t *d_ctx_a; + uint32_t *d_ctx_b; + uint32_t *d_ctx_key1; + uint32_t *d_ctx_key2; + uint32_t *d_ctx_text; + std::string name; + size_t free_device_memory; + size_t total_device_memory; +} nvid_ctx; + +extern "C" { + +/** get device count + * + * @param deviceCount[out] cuda device count + * @return error code: 0 == error is occurred, 1 == no error + */ +int cuda_get_devicecount( int* deviceCount); +int cuda_get_deviceinfo(nvid_ctx *ctx); +int cryptonight_extra_cpu_init(nvid_ctx *ctx); +void cryptonight_extra_cpu_set_data( nvid_ctx* ctx, const void *data, uint32_t len); +void cryptonight_extra_cpu_prepare(nvid_ctx* ctx, uint32_t startNonce); +void cryptonight_core_cpu_hash(nvid_ctx* ctx); +void cryptonight_extra_cpu_final(nvid_ctx* ctx, uint32_t startNonce, uint64_t target, uint32_t* rescount, uint32_t *resnonce); + +} + diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_aes.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_aes.hpp new file mode 100644 index 0000000..e478600 --- /dev/null +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_aes.hpp @@ -0,0 +1,305 @@ + +#pragma once + +#include <stdint.h> + +#define N_COLS 4 +#define WPOLY 0x011b + +static __constant__ uint32_t d_t_fn[1024] = +{ + 0xa56363c6U, 0x847c7cf8U, 0x997777eeU, 0x8d7b7bf6U, + 0x0df2f2ffU, 0xbd6b6bd6U, 0xb16f6fdeU, 0x54c5c591U, + 0x50303060U, 0x03010102U, 0xa96767ceU, 0x7d2b2b56U, + 0x19fefee7U, 0x62d7d7b5U, 0xe6abab4dU, 0x9a7676ecU, + 0x45caca8fU, 0x9d82821fU, 0x40c9c989U, 0x877d7dfaU, + 0x15fafaefU, 0xeb5959b2U, 0xc947478eU, 0x0bf0f0fbU, + 0xecadad41U, 0x67d4d4b3U, 0xfda2a25fU, 0xeaafaf45U, + 0xbf9c9c23U, 0xf7a4a453U, 0x967272e4U, 0x5bc0c09bU, + 0xc2b7b775U, 0x1cfdfde1U, 0xae93933dU, 0x6a26264cU, + 0x5a36366cU, 0x413f3f7eU, 0x02f7f7f5U, 0x4fcccc83U, + 0x5c343468U, 0xf4a5a551U, 0x34e5e5d1U, 0x08f1f1f9U, + 0x937171e2U, 0x73d8d8abU, 0x53313162U, 0x3f15152aU, + 0x0c040408U, 0x52c7c795U, 0x65232346U, 0x5ec3c39dU, + 0x28181830U, 0xa1969637U, 0x0f05050aU, 0xb59a9a2fU, + 0x0907070eU, 0x36121224U, 0x9b80801bU, 0x3de2e2dfU, + 0x26ebebcdU, 0x6927274eU, 0xcdb2b27fU, 0x9f7575eaU, + 0x1b090912U, 0x9e83831dU, 0x742c2c58U, 0x2e1a1a34U, + 0x2d1b1b36U, 0xb26e6edcU, 0xee5a5ab4U, 0xfba0a05bU, + 0xf65252a4U, 0x4d3b3b76U, 0x61d6d6b7U, 0xceb3b37dU, + 0x7b292952U, 0x3ee3e3ddU, 0x712f2f5eU, 0x97848413U, + 0xf55353a6U, 0x68d1d1b9U, 0x00000000U, 0x2cededc1U, + 0x60202040U, 0x1ffcfce3U, 0xc8b1b179U, 0xed5b5bb6U, + 0xbe6a6ad4U, 0x46cbcb8dU, 0xd9bebe67U, 0x4b393972U, + 0xde4a4a94U, 0xd44c4c98U, 0xe85858b0U, 0x4acfcf85U, + 0x6bd0d0bbU, 0x2aefefc5U, 0xe5aaaa4fU, 0x16fbfbedU, + 0xc5434386U, 0xd74d4d9aU, 0x55333366U, 0x94858511U, + 0xcf45458aU, 0x10f9f9e9U, 0x06020204U, 0x817f7ffeU, + 0xf05050a0U, 0x443c3c78U, 0xba9f9f25U, 0xe3a8a84bU, + 0xf35151a2U, 0xfea3a35dU, 0xc0404080U, 0x8a8f8f05U, + 0xad92923fU, 
0xbc9d9d21U, 0x48383870U, 0x04f5f5f1U, + 0xdfbcbc63U, 0xc1b6b677U, 0x75dadaafU, 0x63212142U, + 0x30101020U, 0x1affffe5U, 0x0ef3f3fdU, 0x6dd2d2bfU, + 0x4ccdcd81U, 0x140c0c18U, 0x35131326U, 0x2fececc3U, + 0xe15f5fbeU, 0xa2979735U, 0xcc444488U, 0x3917172eU, + 0x57c4c493U, 0xf2a7a755U, 0x827e7efcU, 0x473d3d7aU, + 0xac6464c8U, 0xe75d5dbaU, 0x2b191932U, 0x957373e6U, + 0xa06060c0U, 0x98818119U, 0xd14f4f9eU, 0x7fdcdca3U, + 0x66222244U, 0x7e2a2a54U, 0xab90903bU, 0x8388880bU, + 0xca46468cU, 0x29eeeec7U, 0xd3b8b86bU, 0x3c141428U, + 0x79dedea7U, 0xe25e5ebcU, 0x1d0b0b16U, 0x76dbdbadU, + 0x3be0e0dbU, 0x56323264U, 0x4e3a3a74U, 0x1e0a0a14U, + 0xdb494992U, 0x0a06060cU, 0x6c242448U, 0xe45c5cb8U, + 0x5dc2c29fU, 0x6ed3d3bdU, 0xefacac43U, 0xa66262c4U, + 0xa8919139U, 0xa4959531U, 0x37e4e4d3U, 0x8b7979f2U, + 0x32e7e7d5U, 0x43c8c88bU, 0x5937376eU, 0xb76d6ddaU, + 0x8c8d8d01U, 0x64d5d5b1U, 0xd24e4e9cU, 0xe0a9a949U, + 0xb46c6cd8U, 0xfa5656acU, 0x07f4f4f3U, 0x25eaeacfU, + 0xaf6565caU, 0x8e7a7af4U, 0xe9aeae47U, 0x18080810U, + 0xd5baba6fU, 0x887878f0U, 0x6f25254aU, 0x722e2e5cU, + 0x241c1c38U, 0xf1a6a657U, 0xc7b4b473U, 0x51c6c697U, + 0x23e8e8cbU, 0x7cdddda1U, 0x9c7474e8U, 0x211f1f3eU, + 0xdd4b4b96U, 0xdcbdbd61U, 0x868b8b0dU, 0x858a8a0fU, + 0x907070e0U, 0x423e3e7cU, 0xc4b5b571U, 0xaa6666ccU, + 0xd8484890U, 0x05030306U, 0x01f6f6f7U, 0x120e0e1cU, + 0xa36161c2U, 0x5f35356aU, 0xf95757aeU, 0xd0b9b969U, + 0x91868617U, 0x58c1c199U, 0x271d1d3aU, 0xb99e9e27U, + 0x38e1e1d9U, 0x13f8f8ebU, 0xb398982bU, 0x33111122U, + 0xbb6969d2U, 0x70d9d9a9U, 0x898e8e07U, 0xa7949433U, + 0xb69b9b2dU, 0x221e1e3cU, 0x92878715U, 0x20e9e9c9U, + 0x49cece87U, 0xff5555aaU, 0x78282850U, 0x7adfdfa5U, + 0x8f8c8c03U, 0xf8a1a159U, 0x80898909U, 0x170d0d1aU, + 0xdabfbf65U, 0x31e6e6d7U, 0xc6424284U, 0xb86868d0U, + 0xc3414182U, 0xb0999929U, 0x772d2d5aU, 0x110f0f1eU, + 0xcbb0b07bU, 0xfc5454a8U, 0xd6bbbb6dU, 0x3a16162cU, + 0x6363c6a5U, 0x7c7cf884U, 0x7777ee99U, 0x7b7bf68dU, + 0xf2f2ff0dU, 0x6b6bd6bdU, 0x6f6fdeb1U, 0xc5c59154U, + 0x30306050U, 0x01010203U, 0x6767cea9U, 0x2b2b567dU, + 0xfefee719U, 0xd7d7b562U, 0xabab4de6U, 0x7676ec9aU, + 0xcaca8f45U, 0x82821f9dU, 0xc9c98940U, 0x7d7dfa87U, + 0xfafaef15U, 0x5959b2ebU, 0x47478ec9U, 0xf0f0fb0bU, + 0xadad41ecU, 0xd4d4b367U, 0xa2a25ffdU, 0xafaf45eaU, + 0x9c9c23bfU, 0xa4a453f7U, 0x7272e496U, 0xc0c09b5bU, + 0xb7b775c2U, 0xfdfde11cU, 0x93933daeU, 0x26264c6aU, + 0x36366c5aU, 0x3f3f7e41U, 0xf7f7f502U, 0xcccc834fU, + 0x3434685cU, 0xa5a551f4U, 0xe5e5d134U, 0xf1f1f908U, + 0x7171e293U, 0xd8d8ab73U, 0x31316253U, 0x15152a3fU, + 0x0404080cU, 0xc7c79552U, 0x23234665U, 0xc3c39d5eU, + 0x18183028U, 0x969637a1U, 0x05050a0fU, 0x9a9a2fb5U, + 0x07070e09U, 0x12122436U, 0x80801b9bU, 0xe2e2df3dU, + 0xebebcd26U, 0x27274e69U, 0xb2b27fcdU, 0x7575ea9fU, + 0x0909121bU, 0x83831d9eU, 0x2c2c5874U, 0x1a1a342eU, + 0x1b1b362dU, 0x6e6edcb2U, 0x5a5ab4eeU, 0xa0a05bfbU, + 0x5252a4f6U, 0x3b3b764dU, 0xd6d6b761U, 0xb3b37dceU, + 0x2929527bU, 0xe3e3dd3eU, 0x2f2f5e71U, 0x84841397U, + 0x5353a6f5U, 0xd1d1b968U, 0x00000000U, 0xededc12cU, + 0x20204060U, 0xfcfce31fU, 0xb1b179c8U, 0x5b5bb6edU, + 0x6a6ad4beU, 0xcbcb8d46U, 0xbebe67d9U, 0x3939724bU, + 0x4a4a94deU, 0x4c4c98d4U, 0x5858b0e8U, 0xcfcf854aU, + 0xd0d0bb6bU, 0xefefc52aU, 0xaaaa4fe5U, 0xfbfbed16U, + 0x434386c5U, 0x4d4d9ad7U, 0x33336655U, 0x85851194U, + 0x45458acfU, 0xf9f9e910U, 0x02020406U, 0x7f7ffe81U, + 0x5050a0f0U, 0x3c3c7844U, 0x9f9f25baU, 0xa8a84be3U, + 0x5151a2f3U, 0xa3a35dfeU, 0x404080c0U, 0x8f8f058aU, + 0x92923fadU, 0x9d9d21bcU, 0x38387048U, 0xf5f5f104U, + 0xbcbc63dfU, 0xb6b677c1U, 0xdadaaf75U, 0x21214263U, + 
0x10102030U, 0xffffe51aU, 0xf3f3fd0eU, 0xd2d2bf6dU, + 0xcdcd814cU, 0x0c0c1814U, 0x13132635U, 0xececc32fU, + 0x5f5fbee1U, 0x979735a2U, 0x444488ccU, 0x17172e39U, + 0xc4c49357U, 0xa7a755f2U, 0x7e7efc82U, 0x3d3d7a47U, + 0x6464c8acU, 0x5d5dbae7U, 0x1919322bU, 0x7373e695U, + 0x6060c0a0U, 0x81811998U, 0x4f4f9ed1U, 0xdcdca37fU, + 0x22224466U, 0x2a2a547eU, 0x90903babU, 0x88880b83U, + 0x46468ccaU, 0xeeeec729U, 0xb8b86bd3U, 0x1414283cU, + 0xdedea779U, 0x5e5ebce2U, 0x0b0b161dU, 0xdbdbad76U, + 0xe0e0db3bU, 0x32326456U, 0x3a3a744eU, 0x0a0a141eU, + 0x494992dbU, 0x06060c0aU, 0x2424486cU, 0x5c5cb8e4U, + 0xc2c29f5dU, 0xd3d3bd6eU, 0xacac43efU, 0x6262c4a6U, + 0x919139a8U, 0x959531a4U, 0xe4e4d337U, 0x7979f28bU, + 0xe7e7d532U, 0xc8c88b43U, 0x37376e59U, 0x6d6ddab7U, + 0x8d8d018cU, 0xd5d5b164U, 0x4e4e9cd2U, 0xa9a949e0U, + 0x6c6cd8b4U, 0x5656acfaU, 0xf4f4f307U, 0xeaeacf25U, + 0x6565caafU, 0x7a7af48eU, 0xaeae47e9U, 0x08081018U, + 0xbaba6fd5U, 0x7878f088U, 0x25254a6fU, 0x2e2e5c72U, + 0x1c1c3824U, 0xa6a657f1U, 0xb4b473c7U, 0xc6c69751U, + 0xe8e8cb23U, 0xdddda17cU, 0x7474e89cU, 0x1f1f3e21U, + 0x4b4b96ddU, 0xbdbd61dcU, 0x8b8b0d86U, 0x8a8a0f85U, + 0x7070e090U, 0x3e3e7c42U, 0xb5b571c4U, 0x6666ccaaU, + 0x484890d8U, 0x03030605U, 0xf6f6f701U, 0x0e0e1c12U, + 0x6161c2a3U, 0x35356a5fU, 0x5757aef9U, 0xb9b969d0U, + 0x86861791U, 0xc1c19958U, 0x1d1d3a27U, 0x9e9e27b9U, + 0xe1e1d938U, 0xf8f8eb13U, 0x98982bb3U, 0x11112233U, + 0x6969d2bbU, 0xd9d9a970U, 0x8e8e0789U, 0x949433a7U, + 0x9b9b2db6U, 0x1e1e3c22U, 0x87871592U, 0xe9e9c920U, + 0xcece8749U, 0x5555aaffU, 0x28285078U, 0xdfdfa57aU, + 0x8c8c038fU, 0xa1a159f8U, 0x89890980U, 0x0d0d1a17U, + 0xbfbf65daU, 0xe6e6d731U, 0x424284c6U, 0x6868d0b8U, + 0x414182c3U, 0x999929b0U, 0x2d2d5a77U, 0x0f0f1e11U, + 0xb0b07bcbU, 0x5454a8fcU, 0xbbbb6dd6U, 0x16162c3aU, + 0x63c6a563U, 0x7cf8847cU, 0x77ee9977U, 0x7bf68d7bU, + 0xf2ff0df2U, 0x6bd6bd6bU, 0x6fdeb16fU, 0xc59154c5U, + 0x30605030U, 0x01020301U, 0x67cea967U, 0x2b567d2bU, + 0xfee719feU, 0xd7b562d7U, 0xab4de6abU, 0x76ec9a76U, + 0xca8f45caU, 0x821f9d82U, 0xc98940c9U, 0x7dfa877dU, + 0xfaef15faU, 0x59b2eb59U, 0x478ec947U, 0xf0fb0bf0U, + 0xad41ecadU, 0xd4b367d4U, 0xa25ffda2U, 0xaf45eaafU, + 0x9c23bf9cU, 0xa453f7a4U, 0x72e49672U, 0xc09b5bc0U, + 0xb775c2b7U, 0xfde11cfdU, 0x933dae93U, 0x264c6a26U, + 0x366c5a36U, 0x3f7e413fU, 0xf7f502f7U, 0xcc834fccU, + 0x34685c34U, 0xa551f4a5U, 0xe5d134e5U, 0xf1f908f1U, + 0x71e29371U, 0xd8ab73d8U, 0x31625331U, 0x152a3f15U, + 0x04080c04U, 0xc79552c7U, 0x23466523U, 0xc39d5ec3U, + 0x18302818U, 0x9637a196U, 0x050a0f05U, 0x9a2fb59aU, + 0x070e0907U, 0x12243612U, 0x801b9b80U, 0xe2df3de2U, + 0xebcd26ebU, 0x274e6927U, 0xb27fcdb2U, 0x75ea9f75U, + 0x09121b09U, 0x831d9e83U, 0x2c58742cU, 0x1a342e1aU, + 0x1b362d1bU, 0x6edcb26eU, 0x5ab4ee5aU, 0xa05bfba0U, + 0x52a4f652U, 0x3b764d3bU, 0xd6b761d6U, 0xb37dceb3U, + 0x29527b29U, 0xe3dd3ee3U, 0x2f5e712fU, 0x84139784U, + 0x53a6f553U, 0xd1b968d1U, 0x00000000U, 0xedc12cedU, + 0x20406020U, 0xfce31ffcU, 0xb179c8b1U, 0x5bb6ed5bU, + 0x6ad4be6aU, 0xcb8d46cbU, 0xbe67d9beU, 0x39724b39U, + 0x4a94de4aU, 0x4c98d44cU, 0x58b0e858U, 0xcf854acfU, + 0xd0bb6bd0U, 0xefc52aefU, 0xaa4fe5aaU, 0xfbed16fbU, + 0x4386c543U, 0x4d9ad74dU, 0x33665533U, 0x85119485U, + 0x458acf45U, 0xf9e910f9U, 0x02040602U, 0x7ffe817fU, + 0x50a0f050U, 0x3c78443cU, 0x9f25ba9fU, 0xa84be3a8U, + 0x51a2f351U, 0xa35dfea3U, 0x4080c040U, 0x8f058a8fU, + 0x923fad92U, 0x9d21bc9dU, 0x38704838U, 0xf5f104f5U, + 0xbc63dfbcU, 0xb677c1b6U, 0xdaaf75daU, 0x21426321U, + 0x10203010U, 0xffe51affU, 0xf3fd0ef3U, 0xd2bf6dd2U, + 0xcd814ccdU, 0x0c18140cU, 0x13263513U, 
0xecc32fecU, + 0x5fbee15fU, 0x9735a297U, 0x4488cc44U, 0x172e3917U, + 0xc49357c4U, 0xa755f2a7U, 0x7efc827eU, 0x3d7a473dU, + 0x64c8ac64U, 0x5dbae75dU, 0x19322b19U, 0x73e69573U, + 0x60c0a060U, 0x81199881U, 0x4f9ed14fU, 0xdca37fdcU, + 0x22446622U, 0x2a547e2aU, 0x903bab90U, 0x880b8388U, + 0x468cca46U, 0xeec729eeU, 0xb86bd3b8U, 0x14283c14U, + 0xdea779deU, 0x5ebce25eU, 0x0b161d0bU, 0xdbad76dbU, + 0xe0db3be0U, 0x32645632U, 0x3a744e3aU, 0x0a141e0aU, + 0x4992db49U, 0x060c0a06U, 0x24486c24U, 0x5cb8e45cU, + 0xc29f5dc2U, 0xd3bd6ed3U, 0xac43efacU, 0x62c4a662U, + 0x9139a891U, 0x9531a495U, 0xe4d337e4U, 0x79f28b79U, + 0xe7d532e7U, 0xc88b43c8U, 0x376e5937U, 0x6ddab76dU, + 0x8d018c8dU, 0xd5b164d5U, 0x4e9cd24eU, 0xa949e0a9U, + 0x6cd8b46cU, 0x56acfa56U, 0xf4f307f4U, 0xeacf25eaU, + 0x65caaf65U, 0x7af48e7aU, 0xae47e9aeU, 0x08101808U, + 0xba6fd5baU, 0x78f08878U, 0x254a6f25U, 0x2e5c722eU, + 0x1c38241cU, 0xa657f1a6U, 0xb473c7b4U, 0xc69751c6U, + 0xe8cb23e8U, 0xdda17cddU, 0x74e89c74U, 0x1f3e211fU, + 0x4b96dd4bU, 0xbd61dcbdU, 0x8b0d868bU, 0x8a0f858aU, + 0x70e09070U, 0x3e7c423eU, 0xb571c4b5U, 0x66ccaa66U, + 0x4890d848U, 0x03060503U, 0xf6f701f6U, 0x0e1c120eU, + 0x61c2a361U, 0x356a5f35U, 0x57aef957U, 0xb969d0b9U, + 0x86179186U, 0xc19958c1U, 0x1d3a271dU, 0x9e27b99eU, + 0xe1d938e1U, 0xf8eb13f8U, 0x982bb398U, 0x11223311U, + 0x69d2bb69U, 0xd9a970d9U, 0x8e07898eU, 0x9433a794U, + 0x9b2db69bU, 0x1e3c221eU, 0x87159287U, 0xe9c920e9U, + 0xce8749ceU, 0x55aaff55U, 0x28507828U, 0xdfa57adfU, + 0x8c038f8cU, 0xa159f8a1U, 0x89098089U, 0x0d1a170dU, + 0xbf65dabfU, 0xe6d731e6U, 0x4284c642U, 0x68d0b868U, + 0x4182c341U, 0x9929b099U, 0x2d5a772dU, 0x0f1e110fU, + 0xb07bcbb0U, 0x54a8fc54U, 0xbb6dd6bbU, 0x162c3a16U, + 0xc6a56363U, 0xf8847c7cU, 0xee997777U, 0xf68d7b7bU, + 0xff0df2f2U, 0xd6bd6b6bU, 0xdeb16f6fU, 0x9154c5c5U, + 0x60503030U, 0x02030101U, 0xcea96767U, 0x567d2b2bU, + 0xe719fefeU, 0xb562d7d7U, 0x4de6ababU, 0xec9a7676U, + 0x8f45cacaU, 0x1f9d8282U, 0x8940c9c9U, 0xfa877d7dU, + 0xef15fafaU, 0xb2eb5959U, 0x8ec94747U, 0xfb0bf0f0U, + 0x41ecadadU, 0xb367d4d4U, 0x5ffda2a2U, 0x45eaafafU, + 0x23bf9c9cU, 0x53f7a4a4U, 0xe4967272U, 0x9b5bc0c0U, + 0x75c2b7b7U, 0xe11cfdfdU, 0x3dae9393U, 0x4c6a2626U, + 0x6c5a3636U, 0x7e413f3fU, 0xf502f7f7U, 0x834fccccU, + 0x685c3434U, 0x51f4a5a5U, 0xd134e5e5U, 0xf908f1f1U, + 0xe2937171U, 0xab73d8d8U, 0x62533131U, 0x2a3f1515U, + 0x080c0404U, 0x9552c7c7U, 0x46652323U, 0x9d5ec3c3U, + 0x30281818U, 0x37a19696U, 0x0a0f0505U, 0x2fb59a9aU, + 0x0e090707U, 0x24361212U, 0x1b9b8080U, 0xdf3de2e2U, + 0xcd26ebebU, 0x4e692727U, 0x7fcdb2b2U, 0xea9f7575U, + 0x121b0909U, 0x1d9e8383U, 0x58742c2cU, 0x342e1a1aU, + 0x362d1b1bU, 0xdcb26e6eU, 0xb4ee5a5aU, 0x5bfba0a0U, + 0xa4f65252U, 0x764d3b3bU, 0xb761d6d6U, 0x7dceb3b3U, + 0x527b2929U, 0xdd3ee3e3U, 0x5e712f2fU, 0x13978484U, + 0xa6f55353U, 0xb968d1d1U, 0x00000000U, 0xc12cededU, + 0x40602020U, 0xe31ffcfcU, 0x79c8b1b1U, 0xb6ed5b5bU, + 0xd4be6a6aU, 0x8d46cbcbU, 0x67d9bebeU, 0x724b3939U, + 0x94de4a4aU, 0x98d44c4cU, 0xb0e85858U, 0x854acfcfU, + 0xbb6bd0d0U, 0xc52aefefU, 0x4fe5aaaaU, 0xed16fbfbU, + 0x86c54343U, 0x9ad74d4dU, 0x66553333U, 0x11948585U, + 0x8acf4545U, 0xe910f9f9U, 0x04060202U, 0xfe817f7fU, + 0xa0f05050U, 0x78443c3cU, 0x25ba9f9fU, 0x4be3a8a8U, + 0xa2f35151U, 0x5dfea3a3U, 0x80c04040U, 0x058a8f8fU, + 0x3fad9292U, 0x21bc9d9dU, 0x70483838U, 0xf104f5f5U, + 0x63dfbcbcU, 0x77c1b6b6U, 0xaf75dadaU, 0x42632121U, + 0x20301010U, 0xe51affffU, 0xfd0ef3f3U, 0xbf6dd2d2U, + 0x814ccdcdU, 0x18140c0cU, 0x26351313U, 0xc32fececU, + 0xbee15f5fU, 0x35a29797U, 0x88cc4444U, 0x2e391717U, + 0x9357c4c4U, 0x55f2a7a7U, 
0xfc827e7eU, 0x7a473d3dU, + 0xc8ac6464U, 0xbae75d5dU, 0x322b1919U, 0xe6957373U, + 0xc0a06060U, 0x19988181U, 0x9ed14f4fU, 0xa37fdcdcU, + 0x44662222U, 0x547e2a2aU, 0x3bab9090U, 0x0b838888U, + 0x8cca4646U, 0xc729eeeeU, 0x6bd3b8b8U, 0x283c1414U, + 0xa779dedeU, 0xbce25e5eU, 0x161d0b0bU, 0xad76dbdbU, + 0xdb3be0e0U, 0x64563232U, 0x744e3a3aU, 0x141e0a0aU, + 0x92db4949U, 0x0c0a0606U, 0x486c2424U, 0xb8e45c5cU, + 0x9f5dc2c2U, 0xbd6ed3d3U, 0x43efacacU, 0xc4a66262U, + 0x39a89191U, 0x31a49595U, 0xd337e4e4U, 0xf28b7979U, + 0xd532e7e7U, 0x8b43c8c8U, 0x6e593737U, 0xdab76d6dU, + 0x018c8d8dU, 0xb164d5d5U, 0x9cd24e4eU, 0x49e0a9a9U, + 0xd8b46c6cU, 0xacfa5656U, 0xf307f4f4U, 0xcf25eaeaU, + 0xcaaf6565U, 0xf48e7a7aU, 0x47e9aeaeU, 0x10180808U, + 0x6fd5babaU, 0xf0887878U, 0x4a6f2525U, 0x5c722e2eU, + 0x38241c1cU, 0x57f1a6a6U, 0x73c7b4b4U, 0x9751c6c6U, + 0xcb23e8e8U, 0xa17cddddU, 0xe89c7474U, 0x3e211f1fU, + 0x96dd4b4bU, 0x61dcbdbdU, 0x0d868b8bU, 0x0f858a8aU, + 0xe0907070U, 0x7c423e3eU, 0x71c4b5b5U, 0xccaa6666U, + 0x90d84848U, 0x06050303U, 0xf701f6f6U, 0x1c120e0eU, + 0xc2a36161U, 0x6a5f3535U, 0xaef95757U, 0x69d0b9b9U, + 0x17918686U, 0x9958c1c1U, 0x3a271d1dU, 0x27b99e9eU, + 0xd938e1e1U, 0xeb13f8f8U, 0x2bb39898U, 0x22331111U, + 0xd2bb6969U, 0xa970d9d9U, 0x07898e8eU, 0x33a79494U, + 0x2db69b9bU, 0x3c221e1eU, 0x15928787U, 0xc920e9e9U, + 0x8749ceceU, 0xaaff5555U, 0x50782828U, 0xa57adfdfU, + 0x038f8c8cU, 0x59f8a1a1U, 0x09808989U, 0x1a170d0dU, + 0x65dabfbfU, 0xd731e6e6U, 0x84c64242U, 0xd0b86868U, + 0x82c34141U, 0x29b09999U, 0x5a772d2dU, 0x1e110f0fU, + 0x7bcbb0b0U, 0xa8fc5454U, 0x6dd6bbbbU, 0x2c3a1616U +}; + +#define t_fn0(x) (sharedMemory[ (x)]) +#define t_fn1(x) (sharedMemory[256 + (x)]) +#define t_fn2(x) (sharedMemory[512 + (x)]) +#define t_fn3(x) (sharedMemory[768 + (x)]) + + +#define round(dummy,y,x,k) \ + y[0] = (k)[0] ^ (t_fn0(x[0] & 0xff) ^ t_fn1((x[1] >> 8) & 0xff) ^ t_fn2((x[2] >> 16) & 0xff) ^ t_fn3((x[3] >> 24))); \ + y[1] = (k)[1] ^ (t_fn0(x[1] & 0xff) ^ t_fn1((x[2] >> 8) & 0xff) ^ t_fn2((x[3] >> 16) & 0xff) ^ t_fn3((x[0] >> 24))); \ + y[2] = (k)[2] ^ (t_fn0(x[2] & 0xff) ^ t_fn1((x[3] >> 8) & 0xff) ^ t_fn2((x[0] >> 16) & 0xff) ^ t_fn3((x[1] >> 24))); \ + y[3] = (k)[3] ^ (t_fn0(x[3] & 0xff) ^ t_fn1((x[0] >> 8) & 0xff) ^ t_fn2((x[1] >> 16) & 0xff) ^ t_fn3((x[2] >> 24) )); + +__device__ __forceinline__ static void cn_aes_single_round(uint32_t * __restrict__ sharedMemory, const uint32_t * __restrict__ in, uint32_t * __restrict__ out, const uint32_t * __restrict__ expandedKey) +{ + round(sharedMemory, out, in, expandedKey); +} + +__device__ __forceinline__ static void cn_aes_pseudo_round_mut(const uint32_t * __restrict__ sharedMemory, uint32_t * __restrict__ val, const uint32_t * __restrict__ expandedKey) +{ + uint32_t b1[4]; + round(sharedMemory, b1, val, expandedKey); + round(sharedMemory, val, b1, expandedKey + 1 * N_COLS); + round(sharedMemory, b1, val, expandedKey + 2 * N_COLS); + round(sharedMemory, val, b1, expandedKey + 3 * N_COLS); + round(sharedMemory, b1, val, expandedKey + 4 * N_COLS); + round(sharedMemory, val, b1, expandedKey + 5 * N_COLS); + round(sharedMemory, b1, val, expandedKey + 6 * N_COLS); + round(sharedMemory, val, b1, expandedKey + 7 * N_COLS); + round(sharedMemory, b1, val, expandedKey + 8 * N_COLS); + round(sharedMemory, val, b1, expandedKey + 9 * N_COLS); +} + +__device__ __forceinline__ static void cn_aes_gpu_init(uint32_t *sharedMemory) +{ + for(int i = threadIdx.x; i < 1024; i += blockDim.x) + sharedMemory[i] = d_t_fn[i]; +} diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_blake.hpp 
b/xmrstak/backend/nvidia/nvcc_code/cuda_blake.hpp new file mode 100644 index 0000000..07ae169 --- /dev/null +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_blake.hpp @@ -0,0 +1,193 @@ +#pragma once + +typedef struct { + uint32_t h[8], s[4], t[2]; + int buflen, nullt; + uint8_t buf[64]; +} blake_state; + +#define U8TO32(p) \ + (((uint32_t)((p)[0]) << 24) | ((uint32_t)((p)[1]) << 16) | \ + ((uint32_t)((p)[2]) << 8) | ((uint32_t)((p)[3]) )) + +#define U32TO8(p, v) \ + (p)[0] = (uint8_t)((v) >> 24); (p)[1] = (uint8_t)((v) >> 16); \ + (p)[2] = (uint8_t)((v) >> 8); (p)[3] = (uint8_t)((v) ); + +#define BLAKE_ROT(x,n) ROTR32(x, n) +#define BLAKE_G(a,b,c,d,e) \ + v[a] += (m[d_blake_sigma[i][e]] ^ d_blake_cst[d_blake_sigma[i][e+1]]) + v[b]; \ + v[d] = BLAKE_ROT(v[d] ^ v[a],16); \ + v[c] += v[d]; \ + v[b] = BLAKE_ROT(v[b] ^ v[c],12); \ + v[a] += (m[d_blake_sigma[i][e+1]] ^ d_blake_cst[d_blake_sigma[i][e]])+v[b]; \ + v[d] = BLAKE_ROT(v[d] ^ v[a], 8); \ + v[c] += v[d]; \ + v[b] = BLAKE_ROT(v[b] ^ v[c], 7); + +__constant__ uint8_t d_blake_sigma[14][16] = +{ + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, + {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3}, + {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4}, + {7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8}, + {9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13}, + {2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9}, + {12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11}, + {13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10}, + {6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5}, + {10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, + {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3}, + {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4}, + {7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8} +}; +__constant__ uint32_t d_blake_cst[16] += { + 0x243F6A88, 0x85A308D3, 0x13198A2E, 0x03707344, + 0xA4093822, 0x299F31D0, 0x082EFA98, 0xEC4E6C89, + 0x452821E6, 0x38D01377, 0xBE5466CF, 0x34E90C6C, + 0xC0AC29B7, 0xC97C50DD, 0x3F84D5B5, 0xB5470917 +}; + +__device__ void cn_blake_compress(blake_state * __restrict__ S, const uint8_t * __restrict__ block) +{ + uint32_t v[16], m[16], i; + + for (i = 0; i < 16; ++i) m[i] = U8TO32(block + i * 4); + for (i = 0; i < 8; ++i) v[i] = S->h[i]; + v[ 8] = S->s[0] ^ 0x243F6A88; + v[ 9] = S->s[1] ^ 0x85A308D3; + v[10] = S->s[2] ^ 0x13198A2E; + v[11] = S->s[3] ^ 0x03707344; + v[12] = 0xA4093822; + v[13] = 0x299F31D0; + v[14] = 0x082EFA98; + v[15] = 0xEC4E6C89; + + if (S->nullt == 0) + { + v[12] ^= S->t[0]; + v[13] ^= S->t[0]; + v[14] ^= S->t[1]; + v[15] ^= S->t[1]; + } + + for (i = 0; i < 14; ++i) + { + BLAKE_G(0, 4, 8, 12, 0); + BLAKE_G(1, 5, 9, 13, 2); + BLAKE_G(2, 6, 10, 14, 4); + BLAKE_G(3, 7, 11, 15, 6); + BLAKE_G(3, 4, 9, 14, 14); + BLAKE_G(2, 7, 8, 13, 12); + BLAKE_G(0, 5, 10, 15, 8); + BLAKE_G(1, 6, 11, 12, 10); + } + + for (i = 0; i < 16; ++i) S->h[i % 8] ^= v[i]; + for (i = 0; i < 8; ++i) S->h[i] ^= S->s[i % 4]; +} + +__device__ void cn_blake_update(blake_state * __restrict__ S, const uint8_t * __restrict__ data, uint64_t datalen) +{ + int left = S->buflen >> 3; + int fill = 64 - left; + + if (left && (((datalen >> 3) & 0x3F) >= (unsigned) fill)) + { + memcpy((void *) (S->buf + left), (void *) data, fill); + S->t[0] += 512; + if (S->t[0] == 0) S->t[1]++; + cn_blake_compress(S, S->buf); + data += fill; + datalen -= (fill << 3); + left = 0; + } + + while (datalen >= 512) + { + S->t[0] += 512; + if (S->t[0] == 0) 
S->t[1]++; + cn_blake_compress(S, data); + data += 64; + datalen -= 512; + } + + if (datalen > 0) + { + memcpy((void *) (S->buf + left), (void *) data, datalen >> 3); + S->buflen = (left << 3) + datalen; + } + else + { + S->buflen = 0; + } +} + +__device__ void cn_blake_final(blake_state * __restrict__ S, uint8_t * __restrict__ digest) +{ + const uint8_t padding[] = + { + 0x80,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 + }; + + uint8_t pa = 0x81, pb = 0x01; + uint8_t msglen[8]; + uint32_t lo = S->t[0] + S->buflen, hi = S->t[1]; + if (lo < (unsigned) S->buflen) hi++; + U32TO8(msglen + 0, hi); + U32TO8(msglen + 4, lo); + + if (S->buflen == 440) + { + S->t[0] -= 8; + cn_blake_update(S, &pa, 8); + } + else + { + if (S->buflen < 440) + { + if (S->buflen == 0) S->nullt = 1; + S->t[0] -= 440 - S->buflen; + cn_blake_update(S, padding, 440 - S->buflen); + } + else + { + S->t[0] -= 512 - S->buflen; + cn_blake_update(S, padding, 512 - S->buflen); + S->t[0] -= 440; + cn_blake_update(S, padding + 1, 440); + S->nullt = 1; + } + cn_blake_update(S, &pb, 8); + S->t[0] -= 8; + } + S->t[0] -= 64; + cn_blake_update(S, msglen, 64); + + U32TO8(digest + 0, S->h[0]); + U32TO8(digest + 4, S->h[1]); + U32TO8(digest + 8, S->h[2]); + U32TO8(digest + 12, S->h[3]); + U32TO8(digest + 16, S->h[4]); + U32TO8(digest + 20, S->h[5]); + U32TO8(digest + 24, S->h[6]); + U32TO8(digest + 28, S->h[7]); +} + +__device__ void cn_blake(const uint8_t * __restrict__ in, uint64_t inlen, uint8_t * __restrict__ out) +{ + blake_state bs; + blake_state *S = (blake_state *)&bs; + + S->h[0] = 0x6A09E667; S->h[1] = 0xBB67AE85; S->h[2] = 0x3C6EF372; + S->h[3] = 0xA54FF53A; S->h[4] = 0x510E527F; S->h[5] = 0x9B05688C; + S->h[6] = 0x1F83D9AB; S->h[7] = 0x5BE0CD19; + S->t[0] = S->t[1] = S->buflen = S->nullt = 0; + S->s[0] = S->s[1] = S->s[2] = S->s[3] = 0; + + cn_blake_update(S, (uint8_t *)in, inlen * 8); + cn_blake_final(S, (uint8_t *)out); +} diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu new file mode 100644 index 0000000..7590cf5 --- /dev/null +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu @@ -0,0 +1,343 @@ +#include <stdio.h> +#include <stdint.h> +#include <string.h> +#include <cuda.h> +#include <cuda_runtime.h> + +#ifdef _WIN32 +#include <windows.h> +extern "C" void compat_usleep(uint64_t waitTime) +{ + if (waitTime > 0) + { + if (waitTime > 100) + { + // use a waitable timer for larger intervals > 0.1ms + + HANDLE timer; + LARGE_INTEGER ft; + + ft.QuadPart = -(10*waitTime); // Convert to 100 nanosecond interval, negative value indicates relative time + + timer = CreateWaitableTimer(NULL, TRUE, NULL); + SetWaitableTimer(timer, &ft, 0, NULL, NULL, 0); + WaitForSingleObject(timer, INFINITE); + CloseHandle(timer); + } + else + { + // use a polling loop for short intervals <= 100ms + + LARGE_INTEGER perfCnt, start, now; + __int64 elapsed; + + QueryPerformanceFrequency(&perfCnt); + QueryPerformanceCounter(&start); + do { + SwitchToThread(); + QueryPerformanceCounter((LARGE_INTEGER*) &now); + elapsed = (__int64)((now.QuadPart - start.QuadPart) / (float)perfCnt.QuadPart * 1000 * 1000); + } while ( elapsed < waitTime ); + } + } +} +#else +#include <unistd.h> +extern "C" void compat_usleep(uint64_t waitTime) +{ + usleep(waitTime); +} +#endif + +#include "cryptonight.h" +#include "cuda_extra.h" +#include "cuda_aes.hpp" +#include "cuda_device.hpp" + +/* sm_2X is limited to 2GB due to the small TLB + * 
therefore we never use 64bit indices + */ +#if defined(XMR_STAK_LARGEGRID) && (__CUDA_ARCH__ >= 300) +typedef uint64_t IndexType; +#else +typedef int IndexType; +#endif + +__device__ __forceinline__ uint64_t cuda_mul128( uint64_t multiplier, uint64_t multiplicand, uint64_t* product_hi ) +{ + *product_hi = __umul64hi( multiplier, multiplicand ); + return (multiplier * multiplicand ); +} + +template< typename T > +__device__ __forceinline__ T loadGlobal64( T * const addr ) +{ + T x; + asm volatile( "ld.global.cg.u64 %0, [%1];" : "=l"( x ) : "l"( addr ) ); + return x; +} + +template< typename T > +__device__ __forceinline__ T loadGlobal32( T * const addr ) +{ + T x; + asm volatile( "ld.global.cg.u32 %0, [%1];" : "=r"( x ) : "l"( addr ) ); + return x; +} + + +template< typename T > +__device__ __forceinline__ void storeGlobal32( T* addr, T const & val ) +{ + asm volatile( "st.global.cg.u32 [%0], %1;" : : "l"( addr ), "r"( val ) ); +} + +__global__ void cryptonight_core_gpu_phase1( int threads, int bfactor, int partidx, uint32_t * __restrict__ long_state, uint32_t * __restrict__ ctx_state, uint32_t * __restrict__ ctx_key1 ) +{ + __shared__ uint32_t sharedMemory[1024]; + + cn_aes_gpu_init( sharedMemory ); + __syncthreads( ); + + const int thread = ( blockDim.x * blockIdx.x + threadIdx.x ) >> 3; + const int sub = ( threadIdx.x & 7 ) << 2; + + const int batchsize = 0x80000 >> bfactor; + const int start = partidx * batchsize; + const int end = start + batchsize; + + if ( thread >= threads ) + return; + + uint32_t key[40], text[4]; + + MEMCPY8( key, ctx_key1 + thread * 40, 20 ); + + if( partidx == 0 ) + { + // first round + MEMCPY8( text, ctx_state + thread * 50 + sub + 16, 2 ); + } + else + { + // load previous text data + MEMCPY8( text, &long_state[( (uint64_t) thread << 19 ) + sub + start - 32], 2 ); + } + __syncthreads( ); + for ( int i = start; i < end; i += 32 ) + { + cn_aes_pseudo_round_mut( sharedMemory, text, key ); + MEMCPY8(&long_state[((uint64_t) thread << 19) + (sub + i)], text, 2); + } +} + +/** avoid warning `unused parameter` */ +template< typename T > +__forceinline__ __device__ void unusedVar( const T& ) +{ +} + +/** shuffle data for + * + * - this method can be used with all compute architectures + * - for <sm_30 shared memory is needed + * + * @param ptr pointer to shared memory, size must be `threadIdx.x * sizeof(uint32_t)` + * value can be NULL for compute architecture >=sm_30 + * @param sub thread number within the group, range [0;4) + * @param value value to share with other threads within the group + * @param src thread number within the group from where the data is read, range [0;4) + */ +__forceinline__ __device__ uint32_t shuffle(volatile uint32_t* ptr,const uint32_t sub,const int val,const uint32_t src) +{ +#if( __CUDA_ARCH__ < 300 ) + ptr[sub] = val; + return ptr[src&3]; +#else + unusedVar( ptr ); + unusedVar( sub ); + return __shfl( val, src, 4 ); +#endif +} + +#ifdef XMR_STAK_THREADS +__launch_bounds__( XMR_STAK_THREADS * 4 ) +#endif +__global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b ) +{ + __shared__ uint32_t sharedMemory[1024]; + + cn_aes_gpu_init( sharedMemory ); + + __syncthreads( ); + + const int thread = ( blockDim.x * blockIdx.x + threadIdx.x ) >> 2; + const int sub = threadIdx.x & 3; + const int sub2 = sub & 2; + +#if( __CUDA_ARCH__ < 300 ) + extern __shared__ uint32_t shuffleMem[]; + volatile uint32_t* sPtr = (volatile uint32_t*)(shuffleMem + (threadIdx.x& 
0xFFFFFFFC)); +#else + volatile uint32_t* sPtr = NULL; +#endif + if ( thread >= threads ) + return; + + int i, k; + uint32_t j; + const int batchsize = ITER >> ( 2 + bfactor ); + const int start = partidx * batchsize; + const int end = start + batchsize; + uint32_t * long_state = &d_long_state[(IndexType) thread << 19]; + uint32_t * ctx_a = d_ctx_a + thread * 4; + uint32_t * ctx_b = d_ctx_b + thread * 4; + uint32_t a, d[2]; + uint32_t t1[2], t2[2], res; + + a = ctx_a[sub]; + d[1] = ctx_b[sub]; + #pragma unroll 2 + for ( i = start; i < end; ++i ) + { + #pragma unroll 2 + for ( int x = 0; x < 2; ++x ) + { + j = ( ( shuffle(sPtr,sub, a, 0) & 0x1FFFF0 ) >> 2 ) + sub; + + const uint32_t x_0 = loadGlobal32<uint32_t>( long_state + j ); + const uint32_t x_1 = shuffle(sPtr,sub, x_0, sub + 1); + const uint32_t x_2 = shuffle(sPtr,sub, x_0, sub + 2); + const uint32_t x_3 = shuffle(sPtr,sub, x_0, sub + 3); + d[x] = a ^ + t_fn0( x_0 & 0xff ) ^ + t_fn1( (x_1 >> 8) & 0xff ) ^ + t_fn2( (x_2 >> 16) & 0xff ) ^ + t_fn3( ( x_3 >> 24 ) ); + + + //XOR_BLOCKS_DST(c, b, &long_state[j]); + t1[0] = shuffle(sPtr,sub, d[x], 0); + //long_state[j] = d[0] ^ d[1]; + storeGlobal32( long_state + j, d[0] ^ d[1] ); + + //MUL_SUM_XOR_DST(c, a, &long_state[((uint32_t *)c)[0] & 0x1FFFF0]); + j = ( ( *t1 & 0x1FFFF0 ) >> 2 ) + sub; + + uint32_t yy[2]; + *( (uint64_t*) yy ) = loadGlobal64<uint64_t>( ( (uint64_t *) long_state )+( j >> 1 ) ); + uint32_t zz[2]; + zz[0] = shuffle(sPtr,sub, yy[0], 0); + zz[1] = shuffle(sPtr,sub, yy[1], 0); + + t1[1] = shuffle(sPtr,sub, d[x], 1); + #pragma unroll + for ( k = 0; k < 2; k++ ) + t2[k] = shuffle(sPtr,sub, a, k + sub2); + + *( (uint64_t *) t2 ) += sub2 ? ( *( (uint64_t *) t1 ) * *( (uint64_t*) zz ) ) : __umul64hi( *( (uint64_t *) t1 ), *( (uint64_t*) zz ) ); + + res = *( (uint64_t *) t2 ) >> ( sub & 1 ? 32 : 0 ); + + storeGlobal32( long_state + j, res ); + a = ( sub & 1 ? 
yy[1] : yy[0] ) ^ res; + } + } + + if ( bfactor > 0 ) + { + ctx_a[sub] = a; + ctx_b[sub] = d[1]; + } +} + +__global__ void cryptonight_core_gpu_phase3( int threads, int bfactor, int partidx, const uint32_t * __restrict__ long_state, uint32_t * __restrict__ d_ctx_state, uint32_t * __restrict__ d_ctx_key2 ) +{ + __shared__ uint32_t sharedMemory[1024]; + + cn_aes_gpu_init( sharedMemory ); + __syncthreads( ); + + int thread = ( blockDim.x * blockIdx.x + threadIdx.x ) >> 3; + int sub = ( threadIdx.x & 7 ) << 2; + + const int batchsize = 0x80000 >> bfactor; + const int start = partidx * batchsize; + const int end = start + batchsize; + + if ( thread >= threads ) + return; + + uint32_t key[40], text[4]; + MEMCPY8( key, d_ctx_key2 + thread * 40, 20 ); + MEMCPY8( text, d_ctx_state + thread * 50 + sub + 16, 2 ); + + __syncthreads( ); + for ( int i = start; i < end; i += 32 ) + { +#pragma unroll + for ( int j = 0; j < 4; ++j ) + text[j] ^= long_state[((IndexType) thread << 19) + (sub + i + j)]; + + cn_aes_pseudo_round_mut( sharedMemory, text, key ); + } + + MEMCPY8( d_ctx_state + thread * 50 + sub + 16, text, 2 ); +} + +extern "C" void cryptonight_core_cpu_hash(nvid_ctx* ctx) +{ + dim3 grid( ctx->device_blocks ); + dim3 block( ctx->device_threads ); + dim3 block4( ctx->device_threads << 2 ); + dim3 block8( ctx->device_threads << 3 ); + + int partcount = 1 << ctx->device_bfactor; + + /* bfactor for phase 1 and 3 + * + * phase 1 and 3 consume less time than phase 2, therefore we begin with the + * kernel splitting if the user defined a `bfactor >= 5` + */ + int bfactorOneThree = ctx->device_bfactor - 4; + if( bfactorOneThree < 0 ) + bfactorOneThree = 0; + + int partcountOneThree = 1 << bfactorOneThree; + + for ( int i = 0; i < partcountOneThree; i++ ) + { + CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_core_gpu_phase1<<< grid, block8 >>>( ctx->device_blocks*ctx->device_threads, + bfactorOneThree, i, + ctx->d_long_state, ctx->d_ctx_state, ctx->d_ctx_key1 )); + + if ( partcount > 1 && ctx->device_bsleep > 0) compat_usleep( ctx->device_bsleep ); + } + if ( partcount > 1 && ctx->device_bsleep > 0) compat_usleep( ctx->device_bsleep ); + + for ( int i = 0; i < partcount; i++ ) + { + CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_core_gpu_phase2<<< + grid, + block4, + block4.x * sizeof(uint32_t) * static_cast< int >( ctx->device_arch[0] < 3 ) + >>>( + ctx->device_blocks*ctx->device_threads, + ctx->device_bfactor, + i, + ctx->d_long_state, + ctx->d_ctx_a, + ctx->d_ctx_b + )); + + if ( partcount > 1 && ctx->device_bsleep > 0) compat_usleep( ctx->device_bsleep ); + } + + for ( int i = 0; i < partcountOneThree; i++ ) + { + CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_core_gpu_phase3<<< grid, block8 >>>( ctx->device_blocks*ctx->device_threads, + bfactorOneThree, i, + ctx->d_long_state, + ctx->d_ctx_state, ctx->d_ctx_key2 )); + } +} diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_device.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_device.hpp new file mode 100644 index 0000000..078c165 --- /dev/null +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_device.hpp @@ -0,0 +1,30 @@ + +#pragma once + +#include <cuda_runtime.h> +#include <stdexcept> +#include <iostream> +#include <string> + +/** execute and check a CUDA api command + * + * @param id gpu id (thread id) + * @param ... CUDA api command + */ +#define CUDA_CHECK(id, ...) 
{ \ + cudaError_t error = __VA_ARGS__; \ + if(error!=cudaSuccess){ \ + std::cerr << "[CUDA] Error gpu " << id << ": <" << __FILE__ << ">:" << __LINE__ << std::endl; \ + throw std::runtime_error(std::string("[CUDA] Error: ") + std::string(cudaGetErrorString(error))); \ + } \ +} \ +( (void) 0 ) + +/** execute and check a CUDA kernel + * + * @param id gpu id (thread id) + * @param ... CUDA kernel call + */ +#define CUDA_CHECK_KERNEL(id, ...) \ + __VA_ARGS__; \ + CUDA_CHECK(id, cudaGetLastError()) diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu new file mode 100644 index 0000000..7052bc8 --- /dev/null +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu @@ -0,0 +1,367 @@ +#include <stdio.h> +#include <stdint.h> +#include <string.h> +#include <cuda.h> +#include <cuda_runtime.h> +#include <device_functions.hpp> +#include <algorithm> + +#ifdef __CUDACC__ +__constant__ +#else +const +#endif +uint64_t keccakf_rndc[24] ={ + 0x0000000000000001, 0x0000000000008082, 0x800000000000808a, + 0x8000000080008000, 0x000000000000808b, 0x0000000080000001, + 0x8000000080008081, 0x8000000000008009, 0x000000000000008a, + 0x0000000000000088, 0x0000000080008009, 0x000000008000000a, + 0x000000008000808b, 0x800000000000008b, 0x8000000000008089, + 0x8000000000008003, 0x8000000000008002, 0x8000000000000080, + 0x000000000000800a, 0x800000008000000a, 0x8000000080008081, + 0x8000000000008080, 0x0000000080000001, 0x8000000080008008 +}; + +typedef unsigned char BitSequence; +typedef unsigned long long DataLength; + +#include "cryptonight.h" +#include "cuda_extra.h" +#include "cuda_keccak.hpp" +#include "cuda_blake.hpp" +#include "cuda_groestl.hpp" +#include "cuda_jh.hpp" +#include "cuda_skein.hpp" +#include "cuda_device.hpp" + +__constant__ uint8_t d_sub_byte[16][16] ={ + {0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76 }, + {0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0 }, + {0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15 }, + {0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75 }, + {0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84 }, + {0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf }, + {0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8 }, + {0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2 }, + {0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73 }, + {0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb }, + {0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79 }, + {0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08 }, + {0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a }, + {0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e }, + {0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf }, + {0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16 } +}; + +__device__ __forceinline__ void cryptonight_aes_set_key( uint32_t * __restrict__ key, const uint32_t * 
__restrict__ data ) +{ + int i, j; + uint8_t temp[4]; + const uint32_t aes_gf[] = { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36 }; + + MEMSET4( key, 0, 40 ); + MEMCPY4( key, data, 8 ); + +#pragma unroll + for ( i = 8; i < 40; i++ ) + { + *(uint32_t *) temp = key[i - 1]; + if ( i % 8 == 0 ) + { + *(uint32_t *) temp = ROTR32( *(uint32_t *) temp, 8 ); + for ( j = 0; j < 4; j++ ) + temp[j] = d_sub_byte[( temp[j] >> 4 ) & 0x0f][temp[j] & 0x0f]; + *(uint32_t *) temp ^= aes_gf[i / 8 - 1]; + } + else + { + if ( i % 8 == 4 ) + { +#pragma unroll + for ( j = 0; j < 4; j++ ) + temp[j] = d_sub_byte[( temp[j] >> 4 ) & 0x0f][temp[j] & 0x0f]; + } + } + + key[i] = key[( i - 8 )] ^ *(uint32_t *) temp; + } +} + +__global__ void cryptonight_extra_gpu_prepare( int threads, uint32_t * __restrict__ d_input, uint32_t len, uint32_t startNonce, uint32_t * __restrict__ d_ctx_state, uint32_t * __restrict__ d_ctx_a, uint32_t * __restrict__ d_ctx_b, uint32_t * __restrict__ d_ctx_key1, uint32_t * __restrict__ d_ctx_key2 ) +{ + int thread = ( blockDim.x * blockIdx.x + threadIdx.x ); + + if ( thread >= threads ) + return; + + uint32_t ctx_state[50]; + uint32_t ctx_a[4]; + uint32_t ctx_b[4]; + uint32_t ctx_key1[40]; + uint32_t ctx_key2[40]; + uint32_t input[21]; + + memcpy( input, d_input, len ); + //*((uint32_t *)(((char *)input) + 39)) = startNonce + thread; + uint32_t nonce = startNonce + thread; + for ( int i = 0; i < sizeof (uint32_t ); ++i ) + ( ( (char *) input ) + 39 )[i] = ( (char*) ( &nonce ) )[i]; //take care of pointer alignment + + cn_keccak( (uint8_t *) input, len, (uint8_t *) ctx_state ); + cryptonight_aes_set_key( ctx_key1, ctx_state ); + cryptonight_aes_set_key( ctx_key2, ctx_state + 8 ); + XOR_BLOCKS_DST( ctx_state, ctx_state + 8, ctx_a ); + XOR_BLOCKS_DST( ctx_state + 4, ctx_state + 12, ctx_b ); + + memcpy( d_ctx_state + thread * 50, ctx_state, 50 * 4 ); + memcpy( d_ctx_a + thread * 4, ctx_a, 4 * 4 ); + memcpy( d_ctx_b + thread * 4, ctx_b, 4 * 4 ); + memcpy( d_ctx_key1 + thread * 40, ctx_key1, 40 * 4 ); + memcpy( d_ctx_key2 + thread * 40, ctx_key2, 40 * 4 ); +} + +__global__ void cryptonight_extra_gpu_final( int threads, uint64_t target, uint32_t* __restrict__ d_res_count, uint32_t * __restrict__ d_res_nonce, uint32_t * __restrict__ d_ctx_state ) +{ + const int thread = blockDim.x * blockIdx.x + threadIdx.x; + + if ( thread >= threads ) + return; + + int i; + uint32_t * __restrict__ ctx_state = d_ctx_state + thread * 50; + uint64_t hash[4]; + uint32_t state[50]; + +#pragma unroll + for ( i = 0; i < 50; i++ ) + state[i] = ctx_state[i]; + + cn_keccakf2( (uint64_t *) state ); + + switch ( ( (uint8_t *) state )[0] & 0x03 ) + { + case 0: + cn_blake( (const uint8_t *) state, 200, (uint8_t *) hash ); + break; + case 1: + cn_groestl( (const BitSequence *) state, 200, (BitSequence *) hash ); + break; + case 2: + cn_jh( (const BitSequence *) state, 200, (BitSequence *) hash ); + break; + case 3: + cn_skein( (const BitSequence *) state, 200, (BitSequence *) hash ); + break; + default: + break; + } + + // Note that comparison is equivalent to subtraction - we can't just compare 8 32-bit values + // and expect an accurate result for target > 32-bit without implementing carries + + if ( hash[3] < target ) + { + uint32_t idx = atomicInc( d_res_count, 0xFFFFFFFF ); + + if(idx < 10) + d_res_nonce[idx] = thread; + } +} + +extern "C" void cryptonight_extra_cpu_set_data( nvid_ctx* ctx, const void *data, uint32_t len ) +{ + ctx->inputlen = len; + CUDA_CHECK(ctx->device_id, cudaMemcpy( ctx->d_input, data, len, 
cudaMemcpyHostToDevice )); +} + +extern "C" int cryptonight_extra_cpu_init(nvid_ctx* ctx) +{ + cudaError_t err; + err = cudaSetDevice(ctx->device_id); + if(err != cudaSuccess) + { + printf("GPU %d: %s", ctx->device_id, cudaGetErrorString(err)); + return 0; + } + + cudaDeviceReset(); + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + cudaDeviceSetCacheConfig(cudaFuncCachePreferL1); + + size_t wsize = ctx->device_blocks * ctx->device_threads; + CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_long_state, (size_t)MEMORY * wsize)); + CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_ctx_state, 50 * sizeof(uint32_t) * wsize)); + CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_ctx_key1, 40 * sizeof(uint32_t) * wsize)); + CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_ctx_key2, 40 * sizeof(uint32_t) * wsize)); + CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_ctx_text, 32 * sizeof(uint32_t) * wsize)); + CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_ctx_a, 4 * sizeof(uint32_t) * wsize)); + CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_ctx_b, 4 * sizeof(uint32_t) * wsize)); + CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_input, 21 * sizeof (uint32_t ) )); + CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_result_count, sizeof (uint32_t ) )); + CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_result_nonce, 10 * sizeof (uint32_t ) )); + return 1; +} + +extern "C" void cryptonight_extra_cpu_prepare(nvid_ctx* ctx, uint32_t startNonce) +{ + int threadsperblock = 128; + uint32_t wsize = ctx->device_blocks * ctx->device_threads; + + dim3 grid( ( wsize + threadsperblock - 1 ) / threadsperblock ); + dim3 block( threadsperblock ); + + CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<<<grid, block >>>( wsize, ctx->d_input, ctx->inputlen, startNonce, + ctx->d_ctx_state, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 )); +} + +extern "C" void cryptonight_extra_cpu_final(nvid_ctx* ctx, uint32_t startNonce, uint64_t target, uint32_t* rescount, uint32_t *resnonce) +{ + int threadsperblock = 128; + uint32_t wsize = ctx->device_blocks * ctx->device_threads; + + dim3 grid( ( wsize + threadsperblock - 1 ) / threadsperblock ); + dim3 block( threadsperblock ); + + CUDA_CHECK(ctx->device_id, cudaMemset( ctx->d_result_nonce, 0xFF, 10 * sizeof (uint32_t ) )); + CUDA_CHECK(ctx->device_id, cudaMemset( ctx->d_result_count, 0, sizeof (uint32_t ) )); + + CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_final<<<grid, block >>>( wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state )); + + CUDA_CHECK(ctx->device_id, cudaMemcpy( rescount, ctx->d_result_count, sizeof (uint32_t ), cudaMemcpyDeviceToHost )); + CUDA_CHECK(ctx->device_id, cudaMemcpy( resnonce, ctx->d_result_nonce, 10 * sizeof (uint32_t ), cudaMemcpyDeviceToHost )); + + /* There is only a 32bit limit for the counter on the device side + * therefore this value can be greater than 10, in that case limit rescount + * to 10 entries. 
+ */
+ if(*rescount > 10)
+ *rescount = 10;
+ for(int i=0; i < *rescount; i++)
+ resnonce[i] += startNonce;
+}
+
+extern "C" int cuda_get_devicecount( int* deviceCount)
+{
+ cudaError_t err;
+ *deviceCount = 0;
+ err = cudaGetDeviceCount(deviceCount);
+ if(err != cudaSuccess)
+ {
+ if(err == cudaErrorNoDevice)
+ printf("ERROR: NVIDIA no CUDA device found!\n");
+ else if(err == cudaErrorInsufficientDriver)
+ printf("WARNING: NVIDIA Insufficient driver!\n");
+ else
+ printf("WARNING: NVIDIA Unable to query number of CUDA devices!\n");
+ return 0;
+ }
+
+ return 1;
+}
+
+extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
+{
+ cudaError_t err;
+ int version;
+
+ err = cudaDriverGetVersion(&version);
+ if(err != cudaSuccess)
+ {
+ printf("Unable to query CUDA driver version! Is an nVidia driver installed?\n");
+ return 0;
+ }
+
+ if(version < CUDART_VERSION)
+ {
+ printf("Driver does not support CUDA %d.%d API! Update your nVidia driver!\n", CUDART_VERSION / 1000, (CUDART_VERSION % 1000) / 10);
+ return 0;
+ }
+
+ int GPU_N;
+ if(cuda_get_devicecount(&GPU_N) == 0)
+ {
+ return 0;
+ }
+
+ if(ctx->device_id >= GPU_N)
+ {
+ printf("Invalid device ID!\n");
+ return 0;
+ }
+
+ cudaDeviceProp props;
+ err = cudaGetDeviceProperties(&props, ctx->device_id);
+ if(err != cudaSuccess)
+ {
+ printf("\nGPU %d: %s\n%s line %d\n", ctx->device_id, cudaGetErrorString(err), __FILE__, __LINE__);
+ return 0;
+ }
+
+ ctx->device_name = strdup(props.name);
+ ctx->device_mpcount = props.multiProcessorCount;
+ ctx->device_arch[0] = props.major;
+ ctx->device_arch[1] = props.minor;
+
+ ctx->name = std::string(props.name);
+
+ // set all device options marked as auto (-1) to a valid value
+ if(ctx->device_blocks == -1)
+ {
+ /* good values based on my experience
+ * - 3 * SMX count for >=sm_30
+ * - 2 * SMX count for <sm_30
+ */
+ ctx->device_blocks = props.multiProcessorCount *
+ ( props.major < 3 ? 2 : 3 );
+ }
+ if(ctx->device_threads == -1)
+ {
+ /* sm_20 devices can only run 512 threads per cuda block
+ * `cryptonight_core_gpu_phase1` and `cryptonight_core_gpu_phase3` start
+ * `8 * ctx->device_threads` threads per block
+ */
+ ctx->device_threads = 64;
+ constexpr size_t byte2mib = 1024u * 1024u;
+
+ // no limit by default (1 TiB)
+ size_t maxMemUsage = byte2mib * byte2mib;
+ if(props.major < 6)
+ {
+ // limit memory usage for GPUs before Pascal
+ maxMemUsage = size_t(2048u) * byte2mib;
+ }
+ if(props.major == 2)
+ {
+ // limit memory usage for sm_20 GPUs
+ maxMemUsage = size_t(1024u) * byte2mib;
+ }
+
+ size_t freeMemory = 0;
+ size_t totalMemory = 0;
+ CUDA_CHECK(ctx->device_id, cudaMemGetInfo(&freeMemory, &totalMemory));
+
+ ctx->total_device_memory = totalMemory;
+ ctx->free_device_memory = freeMemory;
+
+ // keep 64MiB memory free (value is chosen arbitrarily)
+ // 200 bytes are metadata memory (result nonce, ...)
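+ // rough example: each hash needs the 2 MiB scratchpad (MEMORY) plus 740 + 680 = 1420 bytes of overhead,
+ // so the 2 GiB pre-Pascal cap gives about 2147483648 / 2098572 = ~1023 concurrent hashes,
+ // which is then divided by device_blocks and rounded down to an even thread count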
+ size_t availableMem = freeMemory - (64u * 1024 * 1024) - 200u;
+ size_t limitedMemory = std::min(availableMem, maxMemUsage);
+ // up to 920 bytes of extra memory are used per thread by some kernels (lmem/local memory)
+ // 680 bytes are extra metadata memory per hash
+ size_t perThread = size_t(MEMORY) + 740u + 680u;
+ size_t max_intensity = limitedMemory / perThread;
+ ctx->device_threads = max_intensity / ctx->device_blocks;
+ // use only an even number of threads
+ ctx->device_threads = ctx->device_threads & 0xFFFFFFFE;
+
+ if(props.major == 2 && ctx->device_threads > 64)
+ {
+ // Fermi GPUs only support 512 threads per block (we need to start 4 * configured threads)
+ ctx->device_threads = 64;
+ }
+
+ }
+
+ return 1;
+} diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.hpp new file mode 100644 index 0000000..3ccdcd6 --- /dev/null +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.hpp @@ -0,0 +1,104 @@
+#pragma once
+
+#ifdef __INTELLISENSE__
+#define __CUDA_ARCH__ 520
+/* avoid red underlining */
+
+struct uint3
+{
+ unsigned int x, y, z;
+};
+
+struct uint3 threadIdx;
+struct uint3 blockIdx;
+struct uint3 blockDim;
+#define __funnelshift_r(a,b,c) 1
+#define __syncthreads()
+#define asm(x)
+#define __shfl(a,b,c) 1
+#endif
+
+#define MEMORY (1 << 21) // 2 MiB / 2097152 B
+#define ITER (1 << 20) // 1048576
+#define AES_BLOCK_SIZE 16
+#define AES_KEY_SIZE 32
+#define INIT_SIZE_BLK 8
+#define INIT_SIZE_BYTE (INIT_SIZE_BLK * AES_BLOCK_SIZE) // 128 B
+
+#define C32(x) ((uint32_t)(x ## U))
+#define T32(x) ((x) & C32(0xFFFFFFFF))
+
+#if __CUDA_ARCH__ >= 350
+__forceinline__ __device__ uint64_t cuda_ROTL64(const uint64_t value, const int offset)
+{
+ uint2 result;
+ if(offset >= 32)
+ {
+ asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
+ asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
+ }
+ else
+ {
+ asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
+ asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
+ }
+ return __double_as_longlong(__hiloint2double(result.y, result.x));
+}
+#define ROTL64(x, n) (cuda_ROTL64(x, n))
+#else
+#define ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n))))
+#endif
+
+#if __CUDA_ARCH__ < 350
+#define ROTL32(x, n) T32(((x) << (n)) | ((x) >> (32 - (n))))
+#define ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
+#else
+#define ROTL32(x, n) __funnelshift_l( (x), (x), (n) )
+#define ROTR32(x, n) __funnelshift_r( (x), (x), (n) )
+#endif
+
+#define MEMSET8(dst,what,cnt) { \
+ int i_memset8; \
+ uint64_t *out_memset8 = (uint64_t *)(dst); \
+ for( i_memset8 = 0; i_memset8 < cnt; i_memset8++ ) \
+ out_memset8[i_memset8] = (what); }
+
+#define MEMSET4(dst,what,cnt) { \
+ int i_memset4; \
+ uint32_t *out_memset4 = (uint32_t *)(dst); \
+ for( i_memset4 = 0; i_memset4 < cnt; i_memset4++ ) \
+ out_memset4[i_memset4] = (what); }
+
+#define MEMCPY8(dst,src,cnt) { \
+ int i_memcpy8; \
+ uint64_t *in_memcpy8 = (uint64_t *)(src); \
+ uint64_t *out_memcpy8 = (uint64_t *)(dst); \
+ for( i_memcpy8 = 0; i_memcpy8 < cnt; i_memcpy8++ ) \
+ out_memcpy8[i_memcpy8] =
in_memcpy8[i_memcpy8]; } + +#define MEMCPY4(dst,src,cnt) { \ + int i_memcpy4; \ + uint32_t *in_memcpy4 = (uint32_t *)(src); \ + uint32_t *out_memcpy4 = (uint32_t *)(dst); \ + for( i_memcpy4 = 0; i_memcpy4 < cnt; i_memcpy4++ ) \ + out_memcpy4[i_memcpy4] = in_memcpy4[i_memcpy4]; } + +#define XOR_BLOCKS(a,b) { \ + ((uint64_t *)a)[0] ^= ((uint64_t *)b)[0]; \ + ((uint64_t *)a)[1] ^= ((uint64_t *)b)[1]; } + +#define XOR_BLOCKS_DST(x,y,z) { \ + ((uint64_t *)z)[0] = ((uint64_t *)(x))[0] ^ ((uint64_t *)(y))[0]; \ + ((uint64_t *)z)[1] = ((uint64_t *)(x))[1] ^ ((uint64_t *)(y))[1]; } + +#define MUL_SUM_XOR_DST(a,c,dst) { \ + const uint64_t dst0 = ((uint64_t *)dst)[0]; \ + uint64_t hi, lo = cuda_mul128(((uint64_t *)a)[0], dst0, &hi) + ((uint64_t *)c)[1]; \ + hi += ((uint64_t *)c)[0]; \ + ((uint64_t *)c)[0] = dst0 ^ hi; \ + ((uint64_t *)dst)[0] = hi; \ + ((uint64_t *)c)[1] = atomicExch(((unsigned long long int *)dst) + 1, (unsigned long long int)lo) ^ lo; \ + } + +#define E2I(x) ((size_t)(((*((uint64_t*)(x)) >> 4) & 0x1ffff))) + diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_groestl.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_groestl.hpp new file mode 100644 index 0000000..a37934c --- /dev/null +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_groestl.hpp @@ -0,0 +1,357 @@ +#pragma once + +#define GROESTL_ROWS 8 +#define GROESTL_LENGTHFIELDLEN GROESTL_ROWS +#define GROESTL_COLS512 8 + +#define GROESTL_SIZE512 (GROESTL_ROWS*GROESTL_COLS512) + +#define GROESTL_ROUNDS512 10 +#define GROESTL_HASH_BIT_LEN 256 + +#define GROESTL_ROTL32(v, n) ROTL32(v, n) + + +#define li_32(h) 0x##h##u +#define GROESTL_EXT_BYTE(var,n) ((uint8_t)((uint32_t)(var) >> (8*n))) + +#define u32BIG(a) \ + ((GROESTL_ROTL32(a,8) & li_32(00FF00FF)) | (GROESTL_ROTL32(a,24) & li_32(FF00FF00))) + +typedef struct { + uint32_t chaining[GROESTL_SIZE512/sizeof(uint32_t)]; /* actual state */ + uint32_t block_counter1, + block_counter2; /* message block counter(s) */ + BitSequence buffer[GROESTL_SIZE512]; /* data buffer */ + int buf_ptr; /* data buffer pointer */ + int bits_in_last_byte; /* no. 
of message bits in last byte of data buffer */ +} groestlHashState; + + +__constant__ uint32_t d_groestl_T[512] = +{ + 0xa5f432c6, 0xc6a597f4, 0x84976ff8, 0xf884eb97, 0x99b05eee, 0xee99c7b0, 0x8d8c7af6, 0xf68df78c, 0xd17e8ff, 0xff0de517, 0xbddc0ad6, 0xd6bdb7dc, 0xb1c816de, 0xdeb1a7c8, 0x54fc6d91, 0x915439fc +, 0x50f09060, 0x6050c0f0, 0x3050702, 0x2030405, 0xa9e02ece, 0xcea987e0, 0x7d87d156, 0x567dac87, 0x192bcce7, 0xe719d52b, 0x62a613b5, 0xb56271a6, 0xe6317c4d, 0x4de69a31, 0x9ab559ec, 0xec9ac3b5 +, 0x45cf408f, 0x8f4505cf, 0x9dbca31f, 0x1f9d3ebc, 0x40c04989, 0x894009c0, 0x879268fa, 0xfa87ef92, 0x153fd0ef, 0xef15c53f, 0xeb2694b2, 0xb2eb7f26, 0xc940ce8e, 0x8ec90740, 0xb1de6fb, 0xfb0bed1d +, 0xec2f6e41, 0x41ec822f, 0x67a91ab3, 0xb3677da9, 0xfd1c435f, 0x5ffdbe1c, 0xea256045, 0x45ea8a25, 0xbfdaf923, 0x23bf46da, 0xf7025153, 0x53f7a602, 0x96a145e4, 0xe496d3a1, 0x5bed769b, 0x9b5b2ded +, 0xc25d2875, 0x75c2ea5d, 0x1c24c5e1, 0xe11cd924, 0xaee9d43d, 0x3dae7ae9, 0x6abef24c, 0x4c6a98be, 0x5aee826c, 0x6c5ad8ee, 0x41c3bd7e, 0x7e41fcc3, 0x206f3f5, 0xf502f106, 0x4fd15283, 0x834f1dd1 +, 0x5ce48c68, 0x685cd0e4, 0xf4075651, 0x51f4a207, 0x345c8dd1, 0xd134b95c, 0x818e1f9, 0xf908e918, 0x93ae4ce2, 0xe293dfae, 0x73953eab, 0xab734d95, 0x53f59762, 0x6253c4f5, 0x3f416b2a, 0x2a3f5441 +, 0xc141c08, 0x80c1014, 0x52f66395, 0x955231f6, 0x65afe946, 0x46658caf, 0x5ee27f9d, 0x9d5e21e2, 0x28784830, 0x30286078, 0xa1f8cf37, 0x37a16ef8, 0xf111b0a, 0xa0f1411, 0xb5c4eb2f, 0x2fb55ec4 +, 0x91b150e, 0xe091c1b, 0x365a7e24, 0x2436485a, 0x9bb6ad1b, 0x1b9b36b6, 0x3d4798df, 0xdf3da547, 0x266aa7cd, 0xcd26816a, 0x69bbf54e, 0x4e699cbb, 0xcd4c337f, 0x7fcdfe4c, 0x9fba50ea, 0xea9fcfba +, 0x1b2d3f12, 0x121b242d, 0x9eb9a41d, 0x1d9e3ab9, 0x749cc458, 0x5874b09c, 0x2e724634, 0x342e6872, 0x2d774136, 0x362d6c77, 0xb2cd11dc, 0xdcb2a3cd, 0xee299db4, 0xb4ee7329, 0xfb164d5b, 0x5bfbb616 +, 0xf601a5a4, 0xa4f65301, 0x4dd7a176, 0x764decd7, 0x61a314b7, 0xb76175a3, 0xce49347d, 0x7dcefa49, 0x7b8ddf52, 0x527ba48d, 0x3e429fdd, 0xdd3ea142, 0x7193cd5e, 0x5e71bc93, 0x97a2b113, 0x139726a2 +, 0xf504a2a6, 0xa6f55704, 0x68b801b9, 0xb96869b8, 0x0, 0x0, 0x2c74b5c1, 0xc12c9974, 0x60a0e040, 0x406080a0, 0x1f21c2e3, 0xe31fdd21, 0xc8433a79, 0x79c8f243, 0xed2c9ab6, 0xb6ed772c +, 0xbed90dd4, 0xd4beb3d9, 0x46ca478d, 0x8d4601ca, 0xd9701767, 0x67d9ce70, 0x4bddaf72, 0x724be4dd, 0xde79ed94, 0x94de3379, 0xd467ff98, 0x98d42b67, 0xe82393b0, 0xb0e87b23, 0x4ade5b85, 0x854a11de +, 0x6bbd06bb, 0xbb6b6dbd, 0x2a7ebbc5, 0xc52a917e, 0xe5347b4f, 0x4fe59e34, 0x163ad7ed, 0xed16c13a, 0xc554d286, 0x86c51754, 0xd762f89a, 0x9ad72f62, 0x55ff9966, 0x6655ccff, 0x94a7b611, 0x119422a7 +, 0xcf4ac08a, 0x8acf0f4a, 0x1030d9e9, 0xe910c930, 0x60a0e04, 0x406080a, 0x819866fe, 0xfe81e798, 0xf00baba0, 0xa0f05b0b, 0x44ccb478, 0x7844f0cc, 0xbad5f025, 0x25ba4ad5, 0xe33e754b, 0x4be3963e +, 0xf30eaca2, 0xa2f35f0e, 0xfe19445d, 0x5dfeba19, 0xc05bdb80, 0x80c01b5b, 0x8a858005, 0x58a0a85, 0xadecd33f, 0x3fad7eec, 0xbcdffe21, 0x21bc42df, 0x48d8a870, 0x7048e0d8, 0x40cfdf1, 0xf104f90c +, 0xdf7a1963, 0x63dfc67a, 0xc1582f77, 0x77c1ee58, 0x759f30af, 0xaf75459f, 0x63a5e742, 0x426384a5, 0x30507020, 0x20304050, 0x1a2ecbe5, 0xe51ad12e, 0xe12effd, 0xfd0ee112, 0x6db708bf, 0xbf6d65b7 +, 0x4cd45581, 0x814c19d4, 0x143c2418, 0x1814303c, 0x355f7926, 0x26354c5f, 0x2f71b2c3, 0xc32f9d71, 0xe13886be, 0xbee16738, 0xa2fdc835, 0x35a26afd, 0xcc4fc788, 0x88cc0b4f, 0x394b652e, 0x2e395c4b +, 0x57f96a93, 0x93573df9, 0xf20d5855, 0x55f2aa0d, 0x829d61fc, 0xfc82e39d, 0x47c9b37a, 0x7a47f4c9, 0xacef27c8, 0xc8ac8bef, 0xe73288ba, 0xbae76f32, 0x2b7d4f32, 0x322b647d, 
0x95a442e6, 0xe695d7a4 +, 0xa0fb3bc0, 0xc0a09bfb, 0x98b3aa19, 0x199832b3, 0xd168f69e, 0x9ed12768, 0x7f8122a3, 0xa37f5d81, 0x66aaee44, 0x446688aa, 0x7e82d654, 0x547ea882, 0xabe6dd3b, 0x3bab76e6, 0x839e950b, 0xb83169e +, 0xca45c98c, 0x8cca0345, 0x297bbcc7, 0xc729957b, 0xd36e056b, 0x6bd3d66e, 0x3c446c28, 0x283c5044, 0x798b2ca7, 0xa779558b, 0xe23d81bc, 0xbce2633d, 0x1d273116, 0x161d2c27, 0x769a37ad, 0xad76419a +, 0x3b4d96db, 0xdb3bad4d, 0x56fa9e64, 0x6456c8fa, 0x4ed2a674, 0x744ee8d2, 0x1e223614, 0x141e2822, 0xdb76e492, 0x92db3f76, 0xa1e120c, 0xc0a181e, 0x6cb4fc48, 0x486c90b4, 0xe4378fb8, 0xb8e46b37 +, 0x5de7789f, 0x9f5d25e7, 0x6eb20fbd, 0xbd6e61b2, 0xef2a6943, 0x43ef862a, 0xa6f135c4, 0xc4a693f1, 0xa8e3da39, 0x39a872e3, 0xa4f7c631, 0x31a462f7, 0x37598ad3, 0xd337bd59, 0x8b8674f2, 0xf28bff86 +, 0x325683d5, 0xd532b156, 0x43c54e8b, 0x8b430dc5, 0x59eb856e, 0x6e59dceb, 0xb7c218da, 0xdab7afc2, 0x8c8f8e01, 0x18c028f, 0x64ac1db1, 0xb16479ac, 0xd26df19c, 0x9cd2236d, 0xe03b7249, 0x49e0923b +, 0xb4c71fd8, 0xd8b4abc7, 0xfa15b9ac, 0xacfa4315, 0x709faf3, 0xf307fd09, 0x256fa0cf, 0xcf25856f, 0xafea20ca, 0xcaaf8fea, 0x8e897df4, 0xf48ef389, 0xe9206747, 0x47e98e20, 0x18283810, 0x10182028 +, 0xd5640b6f, 0x6fd5de64, 0x888373f0, 0xf088fb83, 0x6fb1fb4a, 0x4a6f94b1, 0x7296ca5c, 0x5c72b896, 0x246c5438, 0x3824706c, 0xf1085f57, 0x57f1ae08, 0xc7522173, 0x73c7e652, 0x51f36497, 0x975135f3 +, 0x2365aecb, 0xcb238d65, 0x7c8425a1, 0xa17c5984, 0x9cbf57e8, 0xe89ccbbf, 0x21635d3e, 0x3e217c63, 0xdd7cea96, 0x96dd377c, 0xdc7f1e61, 0x61dcc27f, 0x86919c0d, 0xd861a91, 0x85949b0f, 0xf851e94 +, 0x90ab4be0, 0xe090dbab, 0x42c6ba7c, 0x7c42f8c6, 0xc4572671, 0x71c4e257, 0xaae529cc, 0xccaa83e5, 0xd873e390, 0x90d83b73, 0x50f0906, 0x6050c0f, 0x103f4f7, 0xf701f503, 0x12362a1c, 0x1c123836 +, 0xa3fe3cc2, 0xc2a39ffe, 0x5fe18b6a, 0x6a5fd4e1, 0xf910beae, 0xaef94710, 0xd06b0269, 0x69d0d26b, 0x91a8bf17, 0x17912ea8, 0x58e87199, 0x995829e8, 0x2769533a, 0x3a277469, 0xb9d0f727, 0x27b94ed0 +, 0x384891d9, 0xd938a948, 0x1335deeb, 0xeb13cd35, 0xb3cee52b, 0x2bb356ce, 0x33557722, 0x22334455, 0xbbd604d2, 0xd2bbbfd6, 0x709039a9, 0xa9704990, 0x89808707, 0x7890e80, 0xa7f2c133, 0x33a766f2 +, 0xb6c1ec2d, 0x2db65ac1, 0x22665a3c, 0x3c227866, 0x92adb815, 0x15922aad, 0x2060a9c9, 0xc9208960, 0x49db5c87, 0x874915db, 0xff1ab0aa, 0xaaff4f1a, 0x7888d850, 0x5078a088, 0x7a8e2ba5, 0xa57a518e +, 0x8f8a8903, 0x38f068a, 0xf8134a59, 0x59f8b213, 0x809b9209, 0x980129b, 0x1739231a, 0x1a173439, 0xda751065, 0x65daca75, 0x315384d7, 0xd731b553, 0xc651d584, 0x84c61351, 0xb8d303d0, 0xd0b8bbd3 +, 0xc35edc82, 0x82c31f5e, 0xb0cbe229, 0x29b052cb, 0x7799c35a, 0x5a77b499, 0x11332d1e, 0x1e113c33, 0xcb463d7b, 0x7bcbf646, 0xfc1fb7a8, 0xa8fc4b1f, 0xd6610c6d, 0x6dd6da61, 0x3a4e622c, 0x2c3a584e +}; + +#define GROESTL_ROTATE_COLUMN_DOWN(v1, v2, amount_bytes, temp_var) \ + { temp_var = (v1<<(8*amount_bytes))|(v2>>(8*(4-amount_bytes))); \ + v2 = (v2<<(8*amount_bytes))|(v1>>(8*(4-amount_bytes))); \ + v1 = temp_var; } + +#define GROESTL_COLUMN(x,y,i,c0,c1,c2,c3,c4,c5,c6,c7,tv1,tv2,tu,tl,t) \ + tu = d_groestl_T[2*(uint32_t)x[4*c0+0]]; \ + tl = d_groestl_T[2*(uint32_t)x[4*c0+0]+1]; \ + tv1 = d_groestl_T[2*(uint32_t)x[4*c1+1]]; \ + tv2 = d_groestl_T[2*(uint32_t)x[4*c1+1]+1]; \ + GROESTL_ROTATE_COLUMN_DOWN(tv1,tv2,1,t) \ + tu ^= tv1; \ + tl ^= tv2; \ + tv1 = d_groestl_T[2*(uint32_t)x[4*c2+2]]; \ + tv2 = d_groestl_T[2*(uint32_t)x[4*c2+2]+1]; \ + GROESTL_ROTATE_COLUMN_DOWN(tv1,tv2,2,t) \ + tu ^= tv1; \ + tl ^= tv2; \ + tv1 = d_groestl_T[2*(uint32_t)x[4*c3+3]]; \ + tv2 = d_groestl_T[2*(uint32_t)x[4*c3+3]+1]; \ + 
GROESTL_ROTATE_COLUMN_DOWN(tv1,tv2,3,t) \ + tu ^= tv1; \ + tl ^= tv2; \ + tl ^= d_groestl_T[2*(uint32_t)x[4*c4+0]]; \ + tu ^= d_groestl_T[2*(uint32_t)x[4*c4+0]+1]; \ + tv1 = d_groestl_T[2*(uint32_t)x[4*c5+1]]; \ + tv2 = d_groestl_T[2*(uint32_t)x[4*c5+1]+1]; \ + GROESTL_ROTATE_COLUMN_DOWN(tv1,tv2,1,t) \ + tl ^= tv1; \ + tu ^= tv2; \ + tv1 = d_groestl_T[2*(uint32_t)x[4*c6+2]]; \ + tv2 = d_groestl_T[2*(uint32_t)x[4*c6+2]+1]; \ + GROESTL_ROTATE_COLUMN_DOWN(tv1,tv2,2,t) \ + tl ^= tv1; \ + tu ^= tv2; \ + tv1 = d_groestl_T[2*(uint32_t)x[4*c7+3]]; \ + tv2 = d_groestl_T[2*(uint32_t)x[4*c7+3]+1]; \ + GROESTL_ROTATE_COLUMN_DOWN(tv1,tv2,3,t) \ + tl ^= tv1; \ + tu ^= tv2; \ + y[i] = tu; \ + y[i+1] = tl; + +__device__ void cn_groestl_RND512P(uint8_t * __restrict__ x, uint32_t * __restrict__ y, uint32_t r) +{ + uint32_t temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp; + uint32_t* x32 = (uint32_t*)x; + x32[ 0] ^= 0x00000000^r; + x32[ 2] ^= 0x00000010^r; + x32[ 4] ^= 0x00000020^r; + x32[ 6] ^= 0x00000030^r; + x32[ 8] ^= 0x00000040^r; + x32[10] ^= 0x00000050^r; + x32[12] ^= 0x00000060^r; + x32[14] ^= 0x00000070^r; + GROESTL_COLUMN(x,y, 0, 0, 2, 4, 6, 9, 11, 13, 15, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + GROESTL_COLUMN(x,y, 2, 2, 4, 6, 8, 11, 13, 15, 1, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + GROESTL_COLUMN(x,y, 4, 4, 6, 8, 10, 13, 15, 1, 3, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + GROESTL_COLUMN(x,y, 6, 6, 8, 10, 12, 15, 1, 3, 5, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + GROESTL_COLUMN(x,y, 8, 8, 10, 12, 14, 1, 3, 5, 7, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + GROESTL_COLUMN(x,y,10, 10, 12, 14, 0, 3, 5, 7, 9, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + GROESTL_COLUMN(x,y,12, 12, 14, 0, 2, 5, 7, 9, 11, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + GROESTL_COLUMN(x,y,14, 14, 0, 2, 4, 7, 9, 11, 13, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); +} + +__device__ void cn_groestl_RND512Q(uint8_t * __restrict__ x, uint32_t * __restrict__ y, uint32_t r) +{ + uint32_t temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp; + uint32_t* x32 = (uint32_t*)x; + x32[ 0] = ~x32[ 0]; + x32[ 1] ^= 0xffffffff^r; + x32[ 2] = ~x32[ 2]; + x32[ 3] ^= 0xefffffff^r; + x32[ 4] = ~x32[ 4]; + x32[ 5] ^= 0xdfffffff^r; + x32[ 6] = ~x32[ 6]; + x32[ 7] ^= 0xcfffffff^r; + x32[ 8] = ~x32[ 8]; + x32[ 9] ^= 0xbfffffff^r; + x32[10] = ~x32[10]; + x32[11] ^= 0xafffffff^r; + x32[12] = ~x32[12]; + x32[13] ^= 0x9fffffff^r; + x32[14] = ~x32[14]; + x32[15] ^= 0x8fffffff^r; + GROESTL_COLUMN(x,y, 0, 2, 6, 10, 14, 1, 5, 9, 13, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + GROESTL_COLUMN(x,y, 2, 4, 8, 12, 0, 3, 7, 11, 15, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + GROESTL_COLUMN(x,y, 4, 6, 10, 14, 2, 5, 9, 13, 1, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + GROESTL_COLUMN(x,y, 6, 8, 12, 0, 4, 7, 11, 15, 3, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + GROESTL_COLUMN(x,y, 8, 10, 14, 2, 6, 9, 13, 1, 5, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + GROESTL_COLUMN(x,y,10, 12, 0, 4, 8, 11, 15, 3, 7, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + GROESTL_COLUMN(x,y,12, 14, 2, 6, 10, 13, 1, 5, 9, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + GROESTL_COLUMN(x,y,14, 0, 4, 8, 12, 15, 3, 7, 11, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); +} + +__device__ 
void cn_groestl_F512(uint32_t * __restrict__ h, const uint32_t * __restrict__ m) +{ + int i; + uint32_t Ptmp[2*GROESTL_COLS512]; + uint32_t Qtmp[2*GROESTL_COLS512]; + uint32_t y[2*GROESTL_COLS512]; + uint32_t z[2*GROESTL_COLS512]; + + for (i = 0; i < 2*GROESTL_COLS512; i++) + { + z[i] = m[i]; + Ptmp[i] = h[i]^m[i]; + } + + cn_groestl_RND512Q((uint8_t*)z, y, 0x00000000); + cn_groestl_RND512Q((uint8_t*)y, z, 0x01000000); + cn_groestl_RND512Q((uint8_t*)z, y, 0x02000000); + cn_groestl_RND512Q((uint8_t*)y, z, 0x03000000); + cn_groestl_RND512Q((uint8_t*)z, y, 0x04000000); + cn_groestl_RND512Q((uint8_t*)y, z, 0x05000000); + cn_groestl_RND512Q((uint8_t*)z, y, 0x06000000); + cn_groestl_RND512Q((uint8_t*)y, z, 0x07000000); + cn_groestl_RND512Q((uint8_t*)z, y, 0x08000000); + cn_groestl_RND512Q((uint8_t*)y, Qtmp, 0x09000000); + + cn_groestl_RND512P((uint8_t*)Ptmp, y, 0x00000000); + cn_groestl_RND512P((uint8_t*)y, z, 0x00000001); + cn_groestl_RND512P((uint8_t*)z, y, 0x00000002); + cn_groestl_RND512P((uint8_t*)y, z, 0x00000003); + cn_groestl_RND512P((uint8_t*)z, y, 0x00000004); + cn_groestl_RND512P((uint8_t*)y, z, 0x00000005); + cn_groestl_RND512P((uint8_t*)z, y, 0x00000006); + cn_groestl_RND512P((uint8_t*)y, z, 0x00000007); + cn_groestl_RND512P((uint8_t*)z, y, 0x00000008); + cn_groestl_RND512P((uint8_t*)y, Ptmp, 0x00000009); + + for (i = 0; i < 2*GROESTL_COLS512; i++) + h[i] ^= Ptmp[i]^Qtmp[i]; +} + +__device__ void cn_groestl_outputtransformation(groestlHashState *ctx) +{ + int j; + uint32_t temp[2*GROESTL_COLS512]; + uint32_t y[2*GROESTL_COLS512]; + uint32_t z[2*GROESTL_COLS512]; + + for (j = 0; j < 2*GROESTL_COLS512; j++) + temp[j] = ctx->chaining[j]; + + cn_groestl_RND512P((uint8_t*)temp, y, 0x00000000); + cn_groestl_RND512P((uint8_t*)y, z, 0x00000001); + cn_groestl_RND512P((uint8_t*)z, y, 0x00000002); + cn_groestl_RND512P((uint8_t*)y, z, 0x00000003); + cn_groestl_RND512P((uint8_t*)z, y, 0x00000004); + cn_groestl_RND512P((uint8_t*)y, z, 0x00000005); + cn_groestl_RND512P((uint8_t*)z, y, 0x00000006); + cn_groestl_RND512P((uint8_t*)y, z, 0x00000007); + cn_groestl_RND512P((uint8_t*)z, y, 0x00000008); + cn_groestl_RND512P((uint8_t*)y, temp, 0x00000009); + + for (j = 0; j < 2*GROESTL_COLS512; j++) + ctx->chaining[j] ^= temp[j]; +} + +__device__ void cn_groestl_transform(groestlHashState * __restrict__ ctx, + const uint8_t * __restrict__ input, int msglen) +{ + for (; msglen >= GROESTL_SIZE512; msglen -= GROESTL_SIZE512, input += GROESTL_SIZE512) + { + cn_groestl_F512(ctx->chaining,(uint32_t*)input); + ctx->block_counter1++; + + if (ctx->block_counter1 == 0) + ctx->block_counter2++; + } +} + +__device__ void cn_groestl_final(groestlHashState* __restrict__ ctx, + BitSequence* __restrict__ output) +{ + int i, j = 0, hashbytelen = GROESTL_HASH_BIT_LEN/8; + uint8_t *s = (BitSequence*)ctx->chaining; + + if (ctx->bits_in_last_byte) + { + ctx->buffer[(int)ctx->buf_ptr-1] &= ((1<<ctx->bits_in_last_byte)-1)<<(8-ctx->bits_in_last_byte); + ctx->buffer[(int)ctx->buf_ptr-1] ^= 0x1<<(7-ctx->bits_in_last_byte); + ctx->bits_in_last_byte = 0; + } + else + { + ctx->buffer[(int)ctx->buf_ptr++] = 0x80; + } + + if (ctx->buf_ptr > GROESTL_SIZE512-GROESTL_LENGTHFIELDLEN) + { + while (ctx->buf_ptr < GROESTL_SIZE512) + ctx->buffer[(int)ctx->buf_ptr++] = 0; + + cn_groestl_transform(ctx, ctx->buffer, GROESTL_SIZE512); + ctx->buf_ptr = 0; + } + + while (ctx->buf_ptr < GROESTL_SIZE512-GROESTL_LENGTHFIELDLEN) + ctx->buffer[(int)ctx->buf_ptr++] = 0; + + ctx->block_counter1++; + if (ctx->block_counter1 == 0) + ctx->block_counter2++; + 
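+ /* store the 64-bit block counter big-endian in the last GROESTL_LENGTHFIELDLEN bytes of the final block */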
ctx->buf_ptr = GROESTL_SIZE512; + + while (ctx->buf_ptr > GROESTL_SIZE512-(int)sizeof(uint32_t)) + { + ctx->buffer[(int)--ctx->buf_ptr] = (uint8_t)ctx->block_counter1; + ctx->block_counter1 >>= 8; + } + while (ctx->buf_ptr > GROESTL_SIZE512-GROESTL_LENGTHFIELDLEN) + { + ctx->buffer[(int)--ctx->buf_ptr] = (uint8_t)ctx->block_counter2; + ctx->block_counter2 >>= 8; + } + cn_groestl_transform(ctx, ctx->buffer, GROESTL_SIZE512); + cn_groestl_outputtransformation(ctx); + + for (i = GROESTL_SIZE512-hashbytelen; i < GROESTL_SIZE512; i++,j++) + output[j] = s[i]; + + for (i = 0; i < GROESTL_COLS512; i++) + ctx->chaining[i] = 0; + for (i = 0; i < GROESTL_SIZE512; i++) + ctx->buffer[i] = 0; +} + +__device__ void cn_groestl_update(groestlHashState* __restrict__ ctx, + const BitSequence* __restrict__ input, DataLength databitlen) +{ + int index = 0; + int msglen = (int)(databitlen/8); + int rem = (int)(databitlen%8); + + if (ctx->buf_ptr) + { + while (ctx->buf_ptr < GROESTL_SIZE512 && index < msglen) + ctx->buffer[(int)ctx->buf_ptr++] = input[index++]; + + if (ctx->buf_ptr < GROESTL_SIZE512) + { + if (rem) + { + ctx->bits_in_last_byte = rem; + ctx->buffer[(int)ctx->buf_ptr++] = input[index]; + } + return; + } + + ctx->buf_ptr = 0; + cn_groestl_transform(ctx, ctx->buffer, GROESTL_SIZE512); + } + + cn_groestl_transform(ctx, input+index, msglen-index); + index += ((msglen-index)/GROESTL_SIZE512)*GROESTL_SIZE512; + + while (index < msglen) + ctx->buffer[(int)ctx->buf_ptr++] = input[index++]; + + if (rem) + { + ctx->bits_in_last_byte = rem; + ctx->buffer[(int)ctx->buf_ptr++] = input[index]; + } +} + +__device__ void cn_groestl_init(groestlHashState* ctx) +{ + int i = 0; + + for(;i<(GROESTL_SIZE512/sizeof(uint32_t));i++) + ctx->chaining[i] = 0; + + ctx->chaining[2*GROESTL_COLS512-1] = u32BIG((uint32_t)GROESTL_HASH_BIT_LEN); + ctx->buf_ptr = 0; + ctx->block_counter1 = 0; + ctx->block_counter2 = 0; + ctx->bits_in_last_byte = 0; +} + +__device__ void cn_groestl(const BitSequence * __restrict__ data, DataLength len, BitSequence * __restrict__ hashval) +{ + DataLength databitlen = len << 3; + groestlHashState context; + + cn_groestl_init(&context); + cn_groestl_update(&context, data, databitlen); + cn_groestl_final(&context, hashval); +} diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_jh.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_jh.hpp new file mode 100644 index 0000000..679046e --- /dev/null +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_jh.hpp @@ -0,0 +1,301 @@ +#include <stdint.h> + +typedef struct { + int hashbitlen; + unsigned long long databitlen; + unsigned long long datasize_in_buffer; + uint64_t x[8][2]; + unsigned char buffer[64]; +} jhHashState; + +__constant__ unsigned char d_JH256_H0[512] = +{ + 0xeb, 0x98, 0xa3, 0x41, 0x2c, 0x20, 0xd3, 0xeb, 0x92, 0xcd, 0xbe, 0x7b, 0x9c, 0xb2, 0x45, 0xc1, + 0x1c, 0x93, 0x51, 0x91, 0x60, 0xd4, 0xc7, 0xfa, 0x26, 0x0, 0x82, 0xd6, 0x7e, 0x50, 0x8a, 0x3, + 0xa4, 0x23, 0x9e, 0x26, 0x77, 0x26, 0xb9, 0x45, 0xe0, 0xfb, 0x1a, 0x48, 0xd4, 0x1a, 0x94, 0x77, + 0xcd, 0xb5, 0xab, 0x26, 0x2, 0x6b, 0x17, 0x7a, 0x56, 0xf0, 0x24, 0x42, 0xf, 0xff, 0x2f, 0xa8, + 0x71, 0xa3, 0x96, 0x89, 0x7f, 0x2e, 0x4d, 0x75, 0x1d, 0x14, 0x49, 0x8, 0xf7, 0x7d, 0xe2, 0x62, + 0x27, 0x76, 0x95, 0xf7, 0x76, 0x24, 0x8f, 0x94, 0x87, 0xd5, 0xb6, 0x57, 0x47, 0x80, 0x29, 0x6c, + 0x5c, 0x5e, 0x27, 0x2d, 0xac, 0x8e, 0xd, 0x6c, 0x51, 0x84, 0x50, 0xc6, 0x57, 0x5, 0x7a, 0xf, + 0x7b, 0xe4, 0xd3, 0x67, 0x70, 0x24, 0x12, 0xea, 0x89, 0xe3, 0xab, 0x13, 0xd3, 0x1c, 0xd7, 0x69 +}; + +__constant__ unsigned char 
d_E8_rc[42][32] = +{ + {0x72, 0xd5, 0xde, 0xa2, 0xdf, 0x15, 0xf8, 0x67, 0x7b, 0x84, 0x15, 0xa, 0xb7, 0x23, 0x15, 0x57, 0x81, 0xab, 0xd6, 0x90, 0x4d, 0x5a, 0x87, 0xf6, 0x4e, 0x9f, 0x4f, 0xc5, 0xc3, 0xd1, 0x2b, 0x40}, + {0xea, 0x98, 0x3a, 0xe0, 0x5c, 0x45, 0xfa, 0x9c, 0x3, 0xc5, 0xd2, 0x99, 0x66, 0xb2, 0x99, 0x9a, 0x66, 0x2, 0x96, 0xb4, 0xf2, 0xbb, 0x53, 0x8a, 0xb5, 0x56, 0x14, 0x1a, 0x88, 0xdb, 0xa2, 0x31}, + {0x3, 0xa3, 0x5a, 0x5c, 0x9a, 0x19, 0xe, 0xdb, 0x40, 0x3f, 0xb2, 0xa, 0x87, 0xc1, 0x44, 0x10, 0x1c, 0x5, 0x19, 0x80, 0x84, 0x9e, 0x95, 0x1d, 0x6f, 0x33, 0xeb, 0xad, 0x5e, 0xe7, 0xcd, 0xdc}, + {0x10, 0xba, 0x13, 0x92, 0x2, 0xbf, 0x6b, 0x41, 0xdc, 0x78, 0x65, 0x15, 0xf7, 0xbb, 0x27, 0xd0, 0xa, 0x2c, 0x81, 0x39, 0x37, 0xaa, 0x78, 0x50, 0x3f, 0x1a, 0xbf, 0xd2, 0x41, 0x0, 0x91, 0xd3}, + {0x42, 0x2d, 0x5a, 0xd, 0xf6, 0xcc, 0x7e, 0x90, 0xdd, 0x62, 0x9f, 0x9c, 0x92, 0xc0, 0x97, 0xce, 0x18, 0x5c, 0xa7, 0xb, 0xc7, 0x2b, 0x44, 0xac, 0xd1, 0xdf, 0x65, 0xd6, 0x63, 0xc6, 0xfc, 0x23}, + {0x97, 0x6e, 0x6c, 0x3, 0x9e, 0xe0, 0xb8, 0x1a, 0x21, 0x5, 0x45, 0x7e, 0x44, 0x6c, 0xec, 0xa8, 0xee, 0xf1, 0x3, 0xbb, 0x5d, 0x8e, 0x61, 0xfa, 0xfd, 0x96, 0x97, 0xb2, 0x94, 0x83, 0x81, 0x97}, + {0x4a, 0x8e, 0x85, 0x37, 0xdb, 0x3, 0x30, 0x2f, 0x2a, 0x67, 0x8d, 0x2d, 0xfb, 0x9f, 0x6a, 0x95, 0x8a, 0xfe, 0x73, 0x81, 0xf8, 0xb8, 0x69, 0x6c, 0x8a, 0xc7, 0x72, 0x46, 0xc0, 0x7f, 0x42, 0x14}, + {0xc5, 0xf4, 0x15, 0x8f, 0xbd, 0xc7, 0x5e, 0xc4, 0x75, 0x44, 0x6f, 0xa7, 0x8f, 0x11, 0xbb, 0x80, 0x52, 0xde, 0x75, 0xb7, 0xae, 0xe4, 0x88, 0xbc, 0x82, 0xb8, 0x0, 0x1e, 0x98, 0xa6, 0xa3, 0xf4}, + {0x8e, 0xf4, 0x8f, 0x33, 0xa9, 0xa3, 0x63, 0x15, 0xaa, 0x5f, 0x56, 0x24, 0xd5, 0xb7, 0xf9, 0x89, 0xb6, 0xf1, 0xed, 0x20, 0x7c, 0x5a, 0xe0, 0xfd, 0x36, 0xca, 0xe9, 0x5a, 0x6, 0x42, 0x2c, 0x36}, + {0xce, 0x29, 0x35, 0x43, 0x4e, 0xfe, 0x98, 0x3d, 0x53, 0x3a, 0xf9, 0x74, 0x73, 0x9a, 0x4b, 0xa7, 0xd0, 0xf5, 0x1f, 0x59, 0x6f, 0x4e, 0x81, 0x86, 0xe, 0x9d, 0xad, 0x81, 0xaf, 0xd8, 0x5a, 0x9f}, + {0xa7, 0x5, 0x6, 0x67, 0xee, 0x34, 0x62, 0x6a, 0x8b, 0xb, 0x28, 0xbe, 0x6e, 0xb9, 0x17, 0x27, 0x47, 0x74, 0x7, 0x26, 0xc6, 0x80, 0x10, 0x3f, 0xe0, 0xa0, 0x7e, 0x6f, 0xc6, 0x7e, 0x48, 0x7b}, + {0xd, 0x55, 0xa, 0xa5, 0x4a, 0xf8, 0xa4, 0xc0, 0x91, 0xe3, 0xe7, 0x9f, 0x97, 0x8e, 0xf1, 0x9e, 0x86, 0x76, 0x72, 0x81, 0x50, 0x60, 0x8d, 0xd4, 0x7e, 0x9e, 0x5a, 0x41, 0xf3, 0xe5, 0xb0, 0x62}, + {0xfc, 0x9f, 0x1f, 0xec, 0x40, 0x54, 0x20, 0x7a, 0xe3, 0xe4, 0x1a, 0x0, 0xce, 0xf4, 0xc9, 0x84, 0x4f, 0xd7, 0x94, 0xf5, 0x9d, 0xfa, 0x95, 0xd8, 0x55, 0x2e, 0x7e, 0x11, 0x24, 0xc3, 0x54, 0xa5}, + {0x5b, 0xdf, 0x72, 0x28, 0xbd, 0xfe, 0x6e, 0x28, 0x78, 0xf5, 0x7f, 0xe2, 0xf, 0xa5, 0xc4, 0xb2, 0x5, 0x89, 0x7c, 0xef, 0xee, 0x49, 0xd3, 0x2e, 0x44, 0x7e, 0x93, 0x85, 0xeb, 0x28, 0x59, 0x7f}, + {0x70, 0x5f, 0x69, 0x37, 0xb3, 0x24, 0x31, 0x4a, 0x5e, 0x86, 0x28, 0xf1, 0x1d, 0xd6, 0xe4, 0x65, 0xc7, 0x1b, 0x77, 0x4, 0x51, 0xb9, 0x20, 0xe7, 0x74, 0xfe, 0x43, 0xe8, 0x23, 0xd4, 0x87, 0x8a}, + {0x7d, 0x29, 0xe8, 0xa3, 0x92, 0x76, 0x94, 0xf2, 0xdd, 0xcb, 0x7a, 0x9, 0x9b, 0x30, 0xd9, 0xc1, 0x1d, 0x1b, 0x30, 0xfb, 0x5b, 0xdc, 0x1b, 0xe0, 0xda, 0x24, 0x49, 0x4f, 0xf2, 0x9c, 0x82, 0xbf}, + {0xa4, 0xe7, 0xba, 0x31, 0xb4, 0x70, 0xbf, 0xff, 0xd, 0x32, 0x44, 0x5, 0xde, 0xf8, 0xbc, 0x48, 0x3b, 0xae, 0xfc, 0x32, 0x53, 0xbb, 0xd3, 0x39, 0x45, 0x9f, 0xc3, 0xc1, 0xe0, 0x29, 0x8b, 0xa0}, + {0xe5, 0xc9, 0x5, 0xfd, 0xf7, 0xae, 0x9, 0xf, 0x94, 0x70, 0x34, 0x12, 0x42, 0x90, 0xf1, 0x34, 0xa2, 0x71, 0xb7, 0x1, 0xe3, 0x44, 0xed, 0x95, 0xe9, 0x3b, 0x8e, 0x36, 0x4f, 0x2f, 0x98, 0x4a}, + {0x88, 0x40, 0x1d, 0x63, 0xa0, 0x6c, 
0xf6, 0x15, 0x47, 0xc1, 0x44, 0x4b, 0x87, 0x52, 0xaf, 0xff, 0x7e, 0xbb, 0x4a, 0xf1, 0xe2, 0xa, 0xc6, 0x30, 0x46, 0x70, 0xb6, 0xc5, 0xcc, 0x6e, 0x8c, 0xe6}, + {0xa4, 0xd5, 0xa4, 0x56, 0xbd, 0x4f, 0xca, 0x0, 0xda, 0x9d, 0x84, 0x4b, 0xc8, 0x3e, 0x18, 0xae, 0x73, 0x57, 0xce, 0x45, 0x30, 0x64, 0xd1, 0xad, 0xe8, 0xa6, 0xce, 0x68, 0x14, 0x5c, 0x25, 0x67}, + {0xa3, 0xda, 0x8c, 0xf2, 0xcb, 0xe, 0xe1, 0x16, 0x33, 0xe9, 0x6, 0x58, 0x9a, 0x94, 0x99, 0x9a, 0x1f, 0x60, 0xb2, 0x20, 0xc2, 0x6f, 0x84, 0x7b, 0xd1, 0xce, 0xac, 0x7f, 0xa0, 0xd1, 0x85, 0x18}, + {0x32, 0x59, 0x5b, 0xa1, 0x8d, 0xdd, 0x19, 0xd3, 0x50, 0x9a, 0x1c, 0xc0, 0xaa, 0xa5, 0xb4, 0x46, 0x9f, 0x3d, 0x63, 0x67, 0xe4, 0x4, 0x6b, 0xba, 0xf6, 0xca, 0x19, 0xab, 0xb, 0x56, 0xee, 0x7e}, + {0x1f, 0xb1, 0x79, 0xea, 0xa9, 0x28, 0x21, 0x74, 0xe9, 0xbd, 0xf7, 0x35, 0x3b, 0x36, 0x51, 0xee, 0x1d, 0x57, 0xac, 0x5a, 0x75, 0x50, 0xd3, 0x76, 0x3a, 0x46, 0xc2, 0xfe, 0xa3, 0x7d, 0x70, 0x1}, + {0xf7, 0x35, 0xc1, 0xaf, 0x98, 0xa4, 0xd8, 0x42, 0x78, 0xed, 0xec, 0x20, 0x9e, 0x6b, 0x67, 0x79, 0x41, 0x83, 0x63, 0x15, 0xea, 0x3a, 0xdb, 0xa8, 0xfa, 0xc3, 0x3b, 0x4d, 0x32, 0x83, 0x2c, 0x83}, + {0xa7, 0x40, 0x3b, 0x1f, 0x1c, 0x27, 0x47, 0xf3, 0x59, 0x40, 0xf0, 0x34, 0xb7, 0x2d, 0x76, 0x9a, 0xe7, 0x3e, 0x4e, 0x6c, 0xd2, 0x21, 0x4f, 0xfd, 0xb8, 0xfd, 0x8d, 0x39, 0xdc, 0x57, 0x59, 0xef}, + {0x8d, 0x9b, 0xc, 0x49, 0x2b, 0x49, 0xeb, 0xda, 0x5b, 0xa2, 0xd7, 0x49, 0x68, 0xf3, 0x70, 0xd, 0x7d, 0x3b, 0xae, 0xd0, 0x7a, 0x8d, 0x55, 0x84, 0xf5, 0xa5, 0xe9, 0xf0, 0xe4, 0xf8, 0x8e, 0x65}, + {0xa0, 0xb8, 0xa2, 0xf4, 0x36, 0x10, 0x3b, 0x53, 0xc, 0xa8, 0x7, 0x9e, 0x75, 0x3e, 0xec, 0x5a, 0x91, 0x68, 0x94, 0x92, 0x56, 0xe8, 0x88, 0x4f, 0x5b, 0xb0, 0x5c, 0x55, 0xf8, 0xba, 0xbc, 0x4c}, + {0xe3, 0xbb, 0x3b, 0x99, 0xf3, 0x87, 0x94, 0x7b, 0x75, 0xda, 0xf4, 0xd6, 0x72, 0x6b, 0x1c, 0x5d, 0x64, 0xae, 0xac, 0x28, 0xdc, 0x34, 0xb3, 0x6d, 0x6c, 0x34, 0xa5, 0x50, 0xb8, 0x28, 0xdb, 0x71}, + {0xf8, 0x61, 0xe2, 0xf2, 0x10, 0x8d, 0x51, 0x2a, 0xe3, 0xdb, 0x64, 0x33, 0x59, 0xdd, 0x75, 0xfc, 0x1c, 0xac, 0xbc, 0xf1, 0x43, 0xce, 0x3f, 0xa2, 0x67, 0xbb, 0xd1, 0x3c, 0x2, 0xe8, 0x43, 0xb0}, + {0x33, 0xa, 0x5b, 0xca, 0x88, 0x29, 0xa1, 0x75, 0x7f, 0x34, 0x19, 0x4d, 0xb4, 0x16, 0x53, 0x5c, 0x92, 0x3b, 0x94, 0xc3, 0xe, 0x79, 0x4d, 0x1e, 0x79, 0x74, 0x75, 0xd7, 0xb6, 0xee, 0xaf, 0x3f}, + {0xea, 0xa8, 0xd4, 0xf7, 0xbe, 0x1a, 0x39, 0x21, 0x5c, 0xf4, 0x7e, 0x9, 0x4c, 0x23, 0x27, 0x51, 0x26, 0xa3, 0x24, 0x53, 0xba, 0x32, 0x3c, 0xd2, 0x44, 0xa3, 0x17, 0x4a, 0x6d, 0xa6, 0xd5, 0xad}, + {0xb5, 0x1d, 0x3e, 0xa6, 0xaf, 0xf2, 0xc9, 0x8, 0x83, 0x59, 0x3d, 0x98, 0x91, 0x6b, 0x3c, 0x56, 0x4c, 0xf8, 0x7c, 0xa1, 0x72, 0x86, 0x60, 0x4d, 0x46, 0xe2, 0x3e, 0xcc, 0x8, 0x6e, 0xc7, 0xf6}, + {0x2f, 0x98, 0x33, 0xb3, 0xb1, 0xbc, 0x76, 0x5e, 0x2b, 0xd6, 0x66, 0xa5, 0xef, 0xc4, 0xe6, 0x2a, 0x6, 0xf4, 0xb6, 0xe8, 0xbe, 0xc1, 0xd4, 0x36, 0x74, 0xee, 0x82, 0x15, 0xbc, 0xef, 0x21, 0x63}, + {0xfd, 0xc1, 0x4e, 0xd, 0xf4, 0x53, 0xc9, 0x69, 0xa7, 0x7d, 0x5a, 0xc4, 0x6, 0x58, 0x58, 0x26, 0x7e, 0xc1, 0x14, 0x16, 0x6, 0xe0, 0xfa, 0x16, 0x7e, 0x90, 0xaf, 0x3d, 0x28, 0x63, 0x9d, 0x3f}, + {0xd2, 0xc9, 0xf2, 0xe3, 0x0, 0x9b, 0xd2, 0xc, 0x5f, 0xaa, 0xce, 0x30, 0xb7, 0xd4, 0xc, 0x30, 0x74, 0x2a, 0x51, 0x16, 0xf2, 0xe0, 0x32, 0x98, 0xd, 0xeb, 0x30, 0xd8, 0xe3, 0xce, 0xf8, 0x9a}, + {0x4b, 0xc5, 0x9e, 0x7b, 0xb5, 0xf1, 0x79, 0x92, 0xff, 0x51, 0xe6, 0x6e, 0x4, 0x86, 0x68, 0xd3, 0x9b, 0x23, 0x4d, 0x57, 0xe6, 0x96, 0x67, 0x31, 0xcc, 0xe6, 0xa6, 0xf3, 0x17, 0xa, 0x75, 0x5}, + {0xb1, 0x76, 0x81, 0xd9, 0x13, 0x32, 0x6c, 0xce, 0x3c, 0x17, 0x52, 0x84, 0xf8, 0x5, 0xa2, 
0x62, 0xf4, 0x2b, 0xcb, 0xb3, 0x78, 0x47, 0x15, 0x47, 0xff, 0x46, 0x54, 0x82, 0x23, 0x93, 0x6a, 0x48}, + {0x38, 0xdf, 0x58, 0x7, 0x4e, 0x5e, 0x65, 0x65, 0xf2, 0xfc, 0x7c, 0x89, 0xfc, 0x86, 0x50, 0x8e, 0x31, 0x70, 0x2e, 0x44, 0xd0, 0xb, 0xca, 0x86, 0xf0, 0x40, 0x9, 0xa2, 0x30, 0x78, 0x47, 0x4e}, + {0x65, 0xa0, 0xee, 0x39, 0xd1, 0xf7, 0x38, 0x83, 0xf7, 0x5e, 0xe9, 0x37, 0xe4, 0x2c, 0x3a, 0xbd, 0x21, 0x97, 0xb2, 0x26, 0x1, 0x13, 0xf8, 0x6f, 0xa3, 0x44, 0xed, 0xd1, 0xef, 0x9f, 0xde, 0xe7}, + {0x8b, 0xa0, 0xdf, 0x15, 0x76, 0x25, 0x92, 0xd9, 0x3c, 0x85, 0xf7, 0xf6, 0x12, 0xdc, 0x42, 0xbe, 0xd8, 0xa7, 0xec, 0x7c, 0xab, 0x27, 0xb0, 0x7e, 0x53, 0x8d, 0x7d, 0xda, 0xaa, 0x3e, 0xa8, 0xde}, + {0xaa, 0x25, 0xce, 0x93, 0xbd, 0x2, 0x69, 0xd8, 0x5a, 0xf6, 0x43, 0xfd, 0x1a, 0x73, 0x8, 0xf9, 0xc0, 0x5f, 0xef, 0xda, 0x17, 0x4a, 0x19, 0xa5, 0x97, 0x4d, 0x66, 0x33, 0x4c, 0xfd, 0x21, 0x6a}, + {0x35, 0xb4, 0x98, 0x31, 0xdb, 0x41, 0x15, 0x70, 0xea, 0x1e, 0xf, 0xbb, 0xed, 0xcd, 0x54, 0x9b, 0x9a, 0xd0, 0x63, 0xa1, 0x51, 0x97, 0x40, 0x72, 0xf6, 0x75, 0x9d, 0xbf, 0x91, 0x47, 0x6f, 0xe2} +}; + +#define JH_SWAP1(x) (x) = ((((x) & 0x5555555555555555ULL) << 1) | (((x) & 0xaaaaaaaaaaaaaaaaULL) >> 1)); +#define JH_SWAP2(x) (x) = ((((x) & 0x3333333333333333ULL) << 2) | (((x) & 0xccccccccccccccccULL) >> 2)); +#define JH_SWAP4(x) (x) = ((((x) & 0x0f0f0f0f0f0f0f0fULL) << 4) | (((x) & 0xf0f0f0f0f0f0f0f0ULL) >> 4)); +#define JH_SWAP8(x) (x) = ((((x) & 0x00ff00ff00ff00ffULL) << 8) | (((x) & 0xff00ff00ff00ff00ULL) >> 8)); +#define JH_SWAP16(x) (x) = ((((x) & 0x0000ffff0000ffffULL) << 16) | (((x) & 0xffff0000ffff0000ULL) >> 16)); +#define JH_SWAP32(x) (x) = (((x) << 32) | ((x) >> 32)); + +#define JH_L(m0,m1,m2,m3,m4,m5,m6,m7) \ + (m4) ^= (m1); \ + (m5) ^= (m2); \ + (m6) ^= (m0) ^ (m3); \ + (m7) ^= (m0); \ + (m0) ^= (m5); \ + (m1) ^= (m6); \ + (m2) ^= (m4) ^ (m7); \ + (m3) ^= (m4); + +#define JH_SS(m0,m1,m2,m3,m4,m5,m6,m7,cc0,cc1) \ + m3 = ~(m3); \ + m7 = ~(m7); \ + m0 ^= ((~(m2)) & (cc0)); \ + m4 ^= ((~(m6)) & (cc1)); \ + temp0 = (cc0) ^ ((m0) & (m1));\ + temp1 = (cc1) ^ ((m4) & (m5));\ + m0 ^= ((m2) & (m3)); \ + m4 ^= ((m6) & (m7)); \ + m3 ^= ((~(m1)) & (m2)); \ + m7 ^= ((~(m5)) & (m6)); \ + m1 ^= ((m0) & (m2)); \ + m5 ^= ((m4) & (m6)); \ + m2 ^= ((m0) & (~(m3))); \ + m6 ^= ((m4) & (~(m7))); \ + m0 ^= ((m1) | (m3)); \ + m4 ^= ((m5) | (m7)); \ + m3 ^= ((m1) & (m2)); \ + m7 ^= ((m5) & (m6)); \ + m1 ^= (temp0 & (m0)); \ + m5 ^= (temp1 & (m4)); \ + m2 ^= temp0; \ + m6 ^= temp1; + +__device__ void cn_jh_E8(jhHashState *state) +{ + uint64_t i,roundnumber,temp0,temp1; + + for (roundnumber = 0; roundnumber < 42; roundnumber = roundnumber+7) + { + for (i = 0; i < 2; i++) + { + JH_SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64_t *)d_E8_rc[roundnumber+0])[i],((uint64_t *)d_E8_rc[roundnumber+0])[i+2] ); + JH_L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]); + JH_SWAP1(state->x[1][i]); JH_SWAP1(state->x[3][i]); JH_SWAP1(state->x[5][i]); JH_SWAP1(state->x[7][i]); + } + + for (i = 0; i < 2; i++) + { + JH_SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64_t *)d_E8_rc[roundnumber+1])[i],((uint64_t *)d_E8_rc[roundnumber+1])[i+2] ); + JH_L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]); + JH_SWAP2(state->x[1][i]); JH_SWAP2(state->x[3][i]); 
JH_SWAP2(state->x[5][i]); JH_SWAP2(state->x[7][i]); + } + + for (i = 0; i < 2; i++) + { + JH_SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64_t *)d_E8_rc[roundnumber+2])[i],((uint64_t *)d_E8_rc[roundnumber+2])[i+2] ); + JH_L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]); + JH_SWAP4(state->x[1][i]); JH_SWAP4(state->x[3][i]); JH_SWAP4(state->x[5][i]); JH_SWAP4(state->x[7][i]); + } + + for (i = 0; i < 2; i++) + { + JH_SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64_t *)d_E8_rc[roundnumber+3])[i],((uint64_t *)d_E8_rc[roundnumber+3])[i+2] ); + JH_L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]); + JH_SWAP8(state->x[1][i]); JH_SWAP8(state->x[3][i]); JH_SWAP8(state->x[5][i]); JH_SWAP8(state->x[7][i]); + } + + for (i = 0; i < 2; i++) + { + JH_SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64_t *)d_E8_rc[roundnumber+4])[i],((uint64_t *)d_E8_rc[roundnumber+4])[i+2] ); + JH_L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]); + JH_SWAP16(state->x[1][i]); JH_SWAP16(state->x[3][i]); JH_SWAP16(state->x[5][i]); JH_SWAP16(state->x[7][i]); + } + + for (i = 0; i < 2; i++) + { + JH_SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64_t *)d_E8_rc[roundnumber+5])[i],((uint64_t *)d_E8_rc[roundnumber+5])[i+2] ); + JH_L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]); + JH_SWAP32(state->x[1][i]); JH_SWAP32(state->x[3][i]); JH_SWAP32(state->x[5][i]); JH_SWAP32(state->x[7][i]); + } + + for (i = 0; i < 2; i++) + { + JH_SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64_t *)d_E8_rc[roundnumber+6])[i],((uint64_t *)d_E8_rc[roundnumber+6])[i+2] ); + JH_L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]); + } + + for (i = 1; i < 8; i = i+2) + { + temp0 = state->x[i][0]; + state->x[i][0] = state->x[i][1]; + state->x[i][1] = temp0; + } + } +} + +__device__ void cn_jh_F8(jhHashState *state) +{ + uint64_t i; + + for (i = 0; i < 8; i++) + state->x[i >> 1][i & 1] ^= ((uint64_t *)state->buffer)[i]; + + cn_jh_E8(state); + + for (i = 0; i < 8; i++) + state->x[(8+i) >> 1][(8+i) & 1] ^= ((uint64_t *)state->buffer)[i]; +} + +__device__ void cn_jh_update(jhHashState * __restrict__ state, const BitSequence * __restrict__ data, DataLength databitlen) +{ + DataLength index; + + state->databitlen += databitlen; + index = 0; + + if ( (state->datasize_in_buffer > 0 ) && (( state->datasize_in_buffer + databitlen) < 512) ) + { + if ( (databitlen & 7) == 0 ) + memcpy(state->buffer + (state->datasize_in_buffer >> 3), data, 64-(state->datasize_in_buffer >> 3)); + else + memcpy(state->buffer + (state->datasize_in_buffer >> 3), data, 64-(state->datasize_in_buffer >> 3)+1); + state->datasize_in_buffer += databitlen; + databitlen = 0; + } + + if ( (state->datasize_in_buffer > 0 ) && (( state->datasize_in_buffer + databitlen) >= 512) ) + { + memcpy( state->buffer + (state->datasize_in_buffer 
>> 3), data, 64-(state->datasize_in_buffer >> 3) ); + index = 64-(state->datasize_in_buffer >> 3); + databitlen = databitlen - (512 - state->datasize_in_buffer); + cn_jh_F8(state); + state->datasize_in_buffer = 0; + } + + for ( ; databitlen >= 512; index = index+64, databitlen = databitlen - 512) + { + memcpy(state->buffer, data+index, 64); + cn_jh_F8(state); + } + + if ( databitlen > 0) + { + if ((databitlen & 7) == 0) + memcpy(state->buffer, data+index, (databitlen & 0x1ff) >> 3); + else + memcpy(state->buffer, data+index, ((databitlen & 0x1ff) >> 3)+1); + state->datasize_in_buffer = databitlen; + } +} + +/*pad the message, process the padded block(s), truncate the hash value H to obtain the message digest*/ +__device__ void cn_jh_final(jhHashState * __restrict__ state, BitSequence * __restrict__ hashval) +{ + unsigned int i; + //uint32_t *bufptr = (uint32_t *)state->buffer; + + if ( (state->databitlen & 0x1ff) == 0 ) + { + /*pad the message when databitlen is multiple of 512 bits, then process the padded block*/ + memset(state->buffer, 0, 64); + //for( i = 0; i < 16; i++ ) *(bufptr+i) = 0x00000000; + state->buffer[0] = 0x80; + state->buffer[63] = state->databitlen & 0xff; + state->buffer[62] = (state->databitlen >> 8) & 0xff; + state->buffer[61] = (state->databitlen >> 16) & 0xff; + state->buffer[60] = (state->databitlen >> 24) & 0xff; + state->buffer[59] = (state->databitlen >> 32) & 0xff; + state->buffer[58] = (state->databitlen >> 40) & 0xff; + state->buffer[57] = (state->databitlen >> 48) & 0xff; + state->buffer[56] = (state->databitlen >> 56) & 0xff; + cn_jh_F8(state); + } + else + { + /*set the rest of the bytes in the buffer to 0*/ + if ( (state->datasize_in_buffer & 7) == 0) + { + for (i = (state->databitlen & 0x1ff) >> 3; i < 64; i++) + state->buffer[i] = 0; + } + else + { + for (i = ((state->databitlen & 0x1ff) >> 3)+1; i < 64; i++) + state->buffer[i] = 0; + } + + /*pad and process the partial block when databitlen is not multiple of 512 bits, then hash the padded blocks*/ + state->buffer[((state->databitlen & 0x1ff) >> 3)] |= 1 << (7- (state->databitlen & 7)); + + cn_jh_F8(state); + memset(state->buffer, 0, 64); + //for( i = 0; i < 16; i++ ) *(bufptr+i) = 0x00000000; + state->buffer[63] = state->databitlen & 0xff; + state->buffer[62] = (state->databitlen >> 8) & 0xff; + state->buffer[61] = (state->databitlen >> 16) & 0xff; + state->buffer[60] = (state->databitlen >> 24) & 0xff; + state->buffer[59] = (state->databitlen >> 32) & 0xff; + state->buffer[58] = (state->databitlen >> 40) & 0xff; + state->buffer[57] = (state->databitlen >> 48) & 0xff; + state->buffer[56] = (state->databitlen >> 56) & 0xff; + cn_jh_F8(state); + } + + memcpy(hashval,(unsigned char*)state->x+64+32,32); +} + +__device__ void cn_jh_init(jhHashState *state, int hashbitlen) +{ + state->databitlen = 0; + state->datasize_in_buffer = 0; + state->hashbitlen = hashbitlen; + memcpy(state->x, d_JH256_H0, 128); +} + +__device__ void cn_jh(const BitSequence * __restrict__ data, DataLength len, BitSequence * __restrict__ hashval) +{ + int hashbitlen = 256; + DataLength databitlen = len << 3; + jhHashState state; + + cn_jh_init(&state, hashbitlen); + cn_jh_update(&state, data, databitlen); + cn_jh_final(&state, hashval); +} diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_keccak.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_keccak.hpp new file mode 100644 index 0000000..99c6516 --- /dev/null +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_keccak.hpp @@ -0,0 +1,197 @@ +#if __CUDA_ARCH__ >= 350 + __forceinline__ __device__ 
uint64_t cuda_rotl64(const uint64_t value, const int offset) + { + uint2 result; + if(offset >= 32) + { + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); + } + else + { + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); + } + return __double_as_longlong(__hiloint2double(result.y, result.x)); + } + #define rotl64_1(x, y) (cuda_rotl64((x), (y))) +#else + #define rotl64_1(x, y) ((x) << (y) | ((x) >> (64 - (y)))) +#endif + +#define rotl64_2(x, y) rotl64_1(((x) >> 32) | ((x) << 32), (y)) +#define bitselect(a, b, c) ((a) ^ ((c) & ((b) ^ (a)))) + +__device__ __forceinline__ void cn_keccakf2(uint64_t *s) +{ + uint8_t i; + + for(i = 0; i < 24; ++i) + { + uint64_t bc[5], tmpxor[5], tmp1, tmp2; + + tmpxor[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20]; + tmpxor[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21]; + tmpxor[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22]; + tmpxor[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23]; + tmpxor[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24]; + + bc[0] = tmpxor[0] ^ rotl64_1(tmpxor[2], 1); + bc[1] = tmpxor[1] ^ rotl64_1(tmpxor[3], 1); + bc[2] = tmpxor[2] ^ rotl64_1(tmpxor[4], 1); + bc[3] = tmpxor[3] ^ rotl64_1(tmpxor[0], 1); + bc[4] = tmpxor[4] ^ rotl64_1(tmpxor[1], 1); + + tmp1 = s[1] ^ bc[0]; + + s[0] ^= bc[4]; + s[1] = rotl64_2(s[6] ^ bc[0], 12); + s[6] = rotl64_1(s[9] ^ bc[3], 20); + s[9] = rotl64_2(s[22] ^ bc[1], 29); + s[22] = rotl64_2(s[14] ^ bc[3], 7); + s[14] = rotl64_1(s[20] ^ bc[4], 18); + s[20] = rotl64_2(s[2] ^ bc[1], 30); + s[2] = rotl64_2(s[12] ^ bc[1], 11); + s[12] = rotl64_1(s[13] ^ bc[2], 25); + s[13] = rotl64_1(s[19] ^ bc[3], 8); + s[19] = rotl64_2(s[23] ^ bc[2], 24); + s[23] = rotl64_2(s[15] ^ bc[4], 9); + s[15] = rotl64_1(s[4] ^ bc[3], 27); + s[4] = rotl64_1(s[24] ^ bc[3], 14); + s[24] = rotl64_1(s[21] ^ bc[0], 2); + s[21] = rotl64_2(s[8] ^ bc[2], 23); + s[8] = rotl64_2(s[16] ^ bc[0], 13); + s[16] = rotl64_2(s[5] ^ bc[4], 4); + s[5] = rotl64_1(s[3] ^ bc[2], 28); + s[3] = rotl64_1(s[18] ^ bc[2], 21); + s[18] = rotl64_1(s[17] ^ bc[1], 15); + s[17] = rotl64_1(s[11] ^ bc[0], 10); + s[11] = rotl64_1(s[7] ^ bc[1], 6); + s[7] = rotl64_1(s[10] ^ bc[4], 3); + s[10] = rotl64_1(tmp1, 1); + + tmp1 = s[0]; tmp2 = s[1]; s[0] = bitselect(s[0] ^ s[2], s[0], s[1]); s[1] = bitselect(s[1] ^ s[3], s[1], s[2]); s[2] = bitselect(s[2] ^ s[4], s[2], s[3]); s[3] = bitselect(s[3] ^ tmp1, s[3], s[4]); s[4] = bitselect(s[4] ^ tmp2, s[4], tmp1); + tmp1 = s[5]; tmp2 = s[6]; s[5] = bitselect(s[5] ^ s[7], s[5], s[6]); s[6] = bitselect(s[6] ^ s[8], s[6], s[7]); s[7] = bitselect(s[7] ^ s[9], s[7], s[8]); s[8] = bitselect(s[8] ^ tmp1, s[8], s[9]); s[9] = bitselect(s[9] ^ tmp2, s[9], tmp1); + tmp1 = s[10]; tmp2 = s[11]; s[10] = bitselect(s[10] ^ s[12], s[10], s[11]); s[11] = bitselect(s[11] ^ s[13], s[11], s[12]); s[12] = bitselect(s[12] ^ s[14], s[12], s[13]); s[13] = bitselect(s[13] ^ tmp1, s[13], s[14]); s[14] = bitselect(s[14] ^ tmp2, s[14], tmp1); + tmp1 = s[15]; tmp2 = s[16]; s[15] = bitselect(s[15] ^ s[17], s[15], s[16]); s[16] 
= bitselect(s[16] ^ s[18], s[16], s[17]); s[17] = bitselect(s[17] ^ s[19], s[17], s[18]); s[18] = bitselect(s[18] ^ tmp1, s[18], s[19]); s[19] = bitselect(s[19] ^ tmp2, s[19], tmp1); + tmp1 = s[20]; tmp2 = s[21]; s[20] = bitselect(s[20] ^ s[22], s[20], s[21]); s[21] = bitselect(s[21] ^ s[23], s[21], s[22]); s[22] = bitselect(s[22] ^ s[24], s[22], s[23]); s[23] = bitselect(s[23] ^ tmp1, s[23], s[24]); s[24] = bitselect(s[24] ^ tmp2, s[24], tmp1); + s[0] ^= keccakf_rndc[i]; + } +} + +__device__ __forceinline__ void cn_keccakf(uint64_t *s) +{ + uint64_t bc[5], tmpxor[5], tmp1, tmp2; + + tmpxor[0] = s[0] ^ s[5]; + tmpxor[1] = s[1] ^ s[6] ^ 0x8000000000000000ULL; + tmpxor[2] = s[2] ^ s[7]; + tmpxor[3] = s[3] ^ s[8]; + tmpxor[4] = s[4] ^ s[9]; + + bc[0] = tmpxor[0] ^ rotl64_1(tmpxor[2], 1); + bc[1] = tmpxor[1] ^ rotl64_1(tmpxor[3], 1); + bc[2] = tmpxor[2] ^ rotl64_1(tmpxor[4], 1); + bc[3] = tmpxor[3] ^ rotl64_1(tmpxor[0], 1); + bc[4] = tmpxor[4] ^ rotl64_1(tmpxor[1], 1); + + tmp1 = s[1] ^ bc[0]; + + s[0] ^= bc[4]; + s[1] = rotl64_2(s[6] ^ bc[0], 12); + s[6] = rotl64_1(s[9] ^ bc[3], 20); + s[9] = rotl64_2(bc[1], 29); + s[22] = rotl64_2(bc[3], 7); + s[14] = rotl64_1(bc[4], 18); + s[20] = rotl64_2(s[2] ^ bc[1], 30); + s[2] = rotl64_2(bc[1], 11); + s[12] = rotl64_1(bc[2], 25); + s[13] = rotl64_1(bc[3], 8); + s[19] = rotl64_2(bc[2], 24); + s[23] = rotl64_2(bc[4], 9); + s[15] = rotl64_1(s[4] ^ bc[3], 27); + s[4] = rotl64_1(bc[3], 14); + s[24] = rotl64_1(bc[0], 2); + s[21] = rotl64_2(s[8] ^ bc[2], 23); + s[8] = rotl64_2(0x8000000000000000ULL ^ bc[0], 13); + s[16] = rotl64_2(s[5] ^ bc[4], 4); + s[5] = rotl64_1(s[3] ^ bc[2], 28); + s[3] = rotl64_1(bc[2], 21); + s[18] = rotl64_1(bc[1], 15); + s[17] = rotl64_1(bc[0], 10); + s[11] = rotl64_1(s[7] ^ bc[1], 6); + s[7] = rotl64_1(bc[4], 3); + s[10] = rotl64_1(tmp1, 1); + + tmp1 = s[0]; tmp2 = s[1]; s[0] = bitselect(s[0] ^ s[2], s[0], s[1]); s[1] = bitselect(s[1] ^ s[3], s[1], s[2]); s[2] = bitselect(s[2] ^ s[4], s[2], s[3]); s[3] = bitselect(s[3] ^ tmp1, s[3], s[4]); s[4] = bitselect(s[4] ^ tmp2, s[4], tmp1); + tmp1 = s[5]; tmp2 = s[6]; s[5] = bitselect(s[5] ^ s[7], s[5], s[6]); s[6] = bitselect(s[6] ^ s[8], s[6], s[7]); s[7] = bitselect(s[7] ^ s[9], s[7], s[8]); s[8] = bitselect(s[8] ^ tmp1, s[8], s[9]); s[9] = bitselect(s[9] ^ tmp2, s[9], tmp1); + tmp1 = s[10]; tmp2 = s[11]; s[10] = bitselect(s[10] ^ s[12], s[10], s[11]); s[11] = bitselect(s[11] ^ s[13], s[11], s[12]); s[12] = bitselect(s[12] ^ s[14], s[12], s[13]); s[13] = bitselect(s[13] ^ tmp1, s[13], s[14]); s[14] = bitselect(s[14] ^ tmp2, s[14], tmp1); + tmp1 = s[15]; tmp2 = s[16]; s[15] = bitselect(s[15] ^ s[17], s[15], s[16]); s[16] = bitselect(s[16] ^ s[18], s[16], s[17]); s[17] = bitselect(s[17] ^ s[19], s[17], s[18]); s[18] = bitselect(s[18] ^ tmp1, s[18], s[19]); s[19] = bitselect(s[19] ^ tmp2, s[19], tmp1); + tmp1 = s[20]; tmp2 = s[21]; s[20] = bitselect(s[20] ^ s[22], s[20], s[21]); s[21] = bitselect(s[21] ^ s[23], s[21], s[22]); s[22] = bitselect(s[22] ^ s[24], s[22], s[23]); s[23] = bitselect(s[23] ^ tmp1, s[23], s[24]); s[24] = bitselect(s[24] ^ tmp2, s[24], tmp1); + s[0] ^= 0x0000000000000001; + + for(int i = 1; i < 24; ++i) + { + tmpxor[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20]; + tmpxor[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21]; + tmpxor[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22]; + tmpxor[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23]; + tmpxor[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24]; + + bc[0] = tmpxor[0] ^ rotl64_1(tmpxor[2], 1); + bc[1] = tmpxor[1] ^ rotl64_1(tmpxor[3], 1); + bc[2] = tmpxor[2] 
^ rotl64_1(tmpxor[4], 1); + bc[3] = tmpxor[3] ^ rotl64_1(tmpxor[0], 1); + bc[4] = tmpxor[4] ^ rotl64_1(tmpxor[1], 1); + + tmp1 = s[1] ^ bc[0]; + + s[0] ^= bc[4]; + s[1] = rotl64_2(s[6] ^ bc[0], 12); + s[6] = rotl64_1(s[9] ^ bc[3], 20); + s[9] = rotl64_2(s[22] ^ bc[1], 29); + s[22] = rotl64_2(s[14] ^ bc[3], 7); + s[14] = rotl64_1(s[20] ^ bc[4], 18); + s[20] = rotl64_2(s[2] ^ bc[1], 30); + s[2] = rotl64_2(s[12] ^ bc[1], 11); + s[12] = rotl64_1(s[13] ^ bc[2], 25); + s[13] = rotl64_1(s[19] ^ bc[3], 8); + s[19] = rotl64_2(s[23] ^ bc[2], 24); + s[23] = rotl64_2(s[15] ^ bc[4], 9); + s[15] = rotl64_1(s[4] ^ bc[3], 27); + s[4] = rotl64_1(s[24] ^ bc[3], 14); + s[24] = rotl64_1(s[21] ^ bc[0], 2); + s[21] = rotl64_2(s[8] ^ bc[2], 23); + s[8] = rotl64_2(s[16] ^ bc[0], 13); + s[16] = rotl64_2(s[5] ^ bc[4], 4); + s[5] = rotl64_1(s[3] ^ bc[2], 28); + s[3] = rotl64_1(s[18] ^ bc[2], 21); + s[18] = rotl64_1(s[17] ^ bc[1], 15); + s[17] = rotl64_1(s[11] ^ bc[0], 10); + s[11] = rotl64_1(s[7] ^ bc[1], 6); + s[7] = rotl64_1(s[10] ^ bc[4], 3); + s[10] = rotl64_1(tmp1, 1); + + tmp1 = s[0]; tmp2 = s[1]; s[0] = bitselect(s[0] ^ s[2], s[0], s[1]); s[1] = bitselect(s[1] ^ s[3], s[1], s[2]); s[2] = bitselect(s[2] ^ s[4], s[2], s[3]); s[3] = bitselect(s[3] ^ tmp1, s[3], s[4]); s[4] = bitselect(s[4] ^ tmp2, s[4], tmp1); + tmp1 = s[5]; tmp2 = s[6]; s[5] = bitselect(s[5] ^ s[7], s[5], s[6]); s[6] = bitselect(s[6] ^ s[8], s[6], s[7]); s[7] = bitselect(s[7] ^ s[9], s[7], s[8]); s[8] = bitselect(s[8] ^ tmp1, s[8], s[9]); s[9] = bitselect(s[9] ^ tmp2, s[9], tmp1); + tmp1 = s[10]; tmp2 = s[11]; s[10] = bitselect(s[10] ^ s[12], s[10], s[11]); s[11] = bitselect(s[11] ^ s[13], s[11], s[12]); s[12] = bitselect(s[12] ^ s[14], s[12], s[13]); s[13] = bitselect(s[13] ^ tmp1, s[13], s[14]); s[14] = bitselect(s[14] ^ tmp2, s[14], tmp1); + tmp1 = s[15]; tmp2 = s[16]; s[15] = bitselect(s[15] ^ s[17], s[15], s[16]); s[16] = bitselect(s[16] ^ s[18], s[16], s[17]); s[17] = bitselect(s[17] ^ s[19], s[17], s[18]); s[18] = bitselect(s[18] ^ tmp1, s[18], s[19]); s[19] = bitselect(s[19] ^ tmp2, s[19], tmp1); + tmp1 = s[20]; tmp2 = s[21]; s[20] = bitselect(s[20] ^ s[22], s[20], s[21]); s[21] = bitselect(s[21] ^ s[23], s[21], s[22]); s[22] = bitselect(s[22] ^ s[24], s[22], s[23]); s[23] = bitselect(s[23] ^ tmp1, s[23], s[24]); s[24] = bitselect(s[24] ^ tmp2, s[24], tmp1); + s[0] ^= keccakf_rndc[i]; + } +} + +__device__ __forceinline__ void cn_keccak(const uint8_t * __restrict__ in, uint32_t len, uint8_t * __restrict__ md) +{ + uint64_t st[25]; + + MEMSET8(st + 8, 0x00, 25 - 8); + memcpy(st, in, len); + ((uint8_t*)st)[len] = 0x01; + st[16] = 0x8000000000000000ULL; + + cn_keccakf(st); + + MEMCPY8(md, st, 25); + return; +} diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_skein.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_skein.hpp new file mode 100644 index 0000000..041a593 --- /dev/null +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_skein.hpp @@ -0,0 +1,347 @@ +#pragma once + +typedef unsigned int uint_t; /* native unsigned integer */ + +#define SKEIN_MODIFIER_WORDS ( 2) /* number of modifier (tweak) words */ + +#define SKEIN_256_STATE_WORDS ( 4) +#define SKEIN_512_STATE_WORDS ( 8) +#define SKEIN1024_STATE_WORDS (16) + +#define SKEIN_256_STATE_BYTES ( 8*SKEIN_256_STATE_WORDS) +#define SKEIN_512_STATE_BYTES ( 8*SKEIN_512_STATE_WORDS) +#define SKEIN1024_STATE_BYTES ( 8*SKEIN1024_STATE_WORDS) + +#define SKEIN_256_STATE_BITS (64*SKEIN_256_STATE_WORDS) +#define SKEIN_512_STATE_BITS (64*SKEIN_512_STATE_WORDS) +#define SKEIN1024_STATE_BITS 
(64*SKEIN1024_STATE_WORDS) + +#define SKEIN_256_BLOCK_BYTES ( 8*SKEIN_256_STATE_WORDS) +#define SKEIN_512_BLOCK_BYTES ( 8*SKEIN_512_STATE_WORDS) +#define SKEIN1024_BLOCK_BYTES ( 8*SKEIN1024_STATE_WORDS) + +#define SKEIN_MK_64(hi32,lo32) ((lo32) + (((uint64_t) (hi32)) << 32)) +#define SKEIN_KS_PARITY SKEIN_MK_64(0x1BD11BDA,0xA9FC1A22) + +#define SKEIN_T1_BIT(BIT) ((BIT) - 64) /* offset 64 because it's the second word */ + +#define SKEIN_T1_POS_FIRST SKEIN_T1_BIT(126) /* bits 126 : first block flag */ +#define SKEIN_T1_POS_BIT_PAD SKEIN_T1_BIT(119) /* bit 119 : partial final input byte */ +#define SKEIN_T1_POS_FINAL SKEIN_T1_BIT(127) /* bit 127 : final block flag */ +#define SKEIN_T1_POS_BLK_TYPE SKEIN_T1_BIT(120) /* bits 120..125: type field */ + +#define SKEIN_T1_FLAG_FIRST (((uint64_t) 1 ) << SKEIN_T1_POS_FIRST) +#define SKEIN_T1_FLAG_BIT_PAD (((uint64_t) 1 ) << SKEIN_T1_POS_BIT_PAD) +#define SKEIN_T1_FLAG_FINAL (((uint64_t) 1 ) << SKEIN_T1_POS_FINAL) + +#define SKEIN_BLK_TYPE_MSG (48) /* message processing */ +#define SKEIN_BLK_TYPE_OUT (63) /* output stage */ + +#define SKEIN_T1_BLK_TYPE(T) (((uint64_t) (SKEIN_BLK_TYPE_##T)) << SKEIN_T1_POS_BLK_TYPE) + +#define SKEIN_T1_BLK_TYPE_MSG SKEIN_T1_BLK_TYPE(MSG) /* message processing */ +#define SKEIN_T1_BLK_TYPE_OUT SKEIN_T1_BLK_TYPE(OUT) /* output stage */ + +#define SKEIN_T1_BLK_TYPE_OUT_FINAL (SKEIN_T1_BLK_TYPE_OUT | SKEIN_T1_FLAG_FINAL) + +#define Skein_Set_Tweak(ctxPtr,TWK_NUM,tVal) {(ctxPtr)->h.T[TWK_NUM] = (tVal);} + +#define Skein_Set_T0(ctxPtr,T0) Skein_Set_Tweak(ctxPtr,0,T0) +#define Skein_Set_T1(ctxPtr,T1) Skein_Set_Tweak(ctxPtr,1,T1) + +#define Skein_Set_T0_T1(ctxPtr,T0,T1) { \ + Skein_Set_T0(ctxPtr,(T0)); \ + Skein_Set_T1(ctxPtr,(T1)); } + +#define Skein_Start_New_Type(ctxPtr,BLK_TYPE) \ +{ Skein_Set_T0_T1(ctxPtr,0,SKEIN_T1_FLAG_FIRST | SKEIN_T1_BLK_TYPE_##BLK_TYPE); (ctxPtr)->h.bCnt=0; } + +#define Skein_Set_Bit_Pad_Flag(hdr) { (hdr).T[1] |= SKEIN_T1_FLAG_BIT_PAD; } + +#define KW_TWK_BASE (0) +#define KW_KEY_BASE (3) +#define ks (kw + KW_KEY_BASE) +#define ts (kw + KW_TWK_BASE) + +#define R512(p0,p1,p2,p3,p4,p5,p6,p7,R512ROT,rNum) \ + X##p0 += X##p1; X##p1 = ROTL64(X##p1,R512ROT##_0); X##p1 ^= X##p0; \ + X##p2 += X##p3; X##p3 = ROTL64(X##p3,R512ROT##_1); X##p3 ^= X##p2; \ + X##p4 += X##p5; X##p5 = ROTL64(X##p5,R512ROT##_2); X##p5 ^= X##p4; \ + X##p6 += X##p7; X##p7 = ROTL64(X##p7,R512ROT##_3); X##p7 ^= X##p6; + +#define I512(R) \ + X0 += ks[((R)+1) % 9]; \ + X1 += ks[((R)+2) % 9]; \ + X2 += ks[((R)+3) % 9]; \ + X3 += ks[((R)+4) % 9]; \ + X4 += ks[((R)+5) % 9]; \ + X5 += ks[((R)+6) % 9] + ts[((R)+1) % 3]; \ + X6 += ks[((R)+7) % 9] + ts[((R)+2) % 3]; \ + X7 += ks[((R)+8) % 9] + (R)+1; + + +#define R512_8_rounds(R) \ + R512(0,1,2,3,4,5,6,7,R_512_0,8*(R)+ 1); \ + R512(2,1,4,7,6,5,0,3,R_512_1,8*(R)+ 2); \ + R512(4,1,6,3,0,5,2,7,R_512_2,8*(R)+ 3); \ + R512(6,1,0,7,2,5,4,3,R_512_3,8*(R)+ 4); \ + I512(2*(R)); \ + R512(0,1,2,3,4,5,6,7,R_512_4,8*(R)+ 5); \ + R512(2,1,4,7,6,5,0,3,R_512_5,8*(R)+ 6); \ + R512(4,1,6,3,0,5,2,7,R_512_6,8*(R)+ 7); \ + R512(6,1,0,7,2,5,4,3,R_512_7,8*(R)+ 8); \ + I512(2*(R)+1); + +typedef struct +{ + size_t hashBitLen; + size_t bCnt; + uint64_t T[SKEIN_MODIFIER_WORDS]; +} Skein_Ctxt_Hdr_t; + +typedef struct { + Skein_Ctxt_Hdr_t h; + uint64_t X[SKEIN_256_STATE_WORDS]; + uint8_t b[SKEIN_256_BLOCK_BYTES]; +} Skein_256_Ctxt_t; + +typedef struct { + Skein_Ctxt_Hdr_t h; + uint64_t X[SKEIN_512_STATE_WORDS]; + uint8_t b[SKEIN_512_BLOCK_BYTES]; +} Skein_512_Ctxt_t; + +typedef struct { + Skein_Ctxt_Hdr_t h; + uint64_t 
X[SKEIN1024_STATE_WORDS]; + uint8_t b[SKEIN1024_BLOCK_BYTES]; +} Skein1024_Ctxt_t; + +typedef struct { + uint_t statebits; + union { + Skein_Ctxt_Hdr_t h; + Skein_256_Ctxt_t ctx_256; + Skein_512_Ctxt_t ctx_512; + Skein1024_Ctxt_t ctx1024; + } u; +} skeinHashState; + +__device__ void cn_skein_init(skeinHashState *state, size_t hashBitLen) +{ + const uint64_t SKEIN_512_IV_256[] = + { + SKEIN_MK_64(0xCCD044A1,0x2FDB3E13), + SKEIN_MK_64(0xE8359030,0x1A79A9EB), + SKEIN_MK_64(0x55AEA061,0x4F816E6F), + SKEIN_MK_64(0x2A2767A4,0xAE9B94DB), + SKEIN_MK_64(0xEC06025E,0x74DD7683), + SKEIN_MK_64(0xE7A436CD,0xC4746251), + SKEIN_MK_64(0xC36FBAF9,0x393AD185), + SKEIN_MK_64(0x3EEDBA18,0x33EDFC13) + }; + + Skein_512_Ctxt_t *ctx = &state->u.ctx_512; + + ctx->h.hashBitLen = hashBitLen; + + memcpy(ctx->X, SKEIN_512_IV_256, sizeof(ctx->X)); + + Skein_Start_New_Type(ctx, MSG); +} + +__device__ void cn_skein512_processblock(Skein_512_Ctxt_t * __restrict__ ctx, const uint8_t * __restrict__ blkPtr, size_t blkCnt, size_t byteCntAdd) +{ + enum { + R_512_0_0=46, R_512_0_1=36, R_512_0_2=19, R_512_0_3=37, + R_512_1_0=33, R_512_1_1=27, R_512_1_2=14, R_512_1_3=42, + R_512_2_0=17, R_512_2_1=49, R_512_2_2=36, R_512_2_3=39, + R_512_3_0=44, R_512_3_1= 9, R_512_3_2=54, R_512_3_3=56, + R_512_4_0=39, R_512_4_1=30, R_512_4_2=34, R_512_4_3=24, + R_512_5_0=13, R_512_5_1=50, R_512_5_2=10, R_512_5_3=17, + R_512_6_0=25, R_512_6_1=29, R_512_6_2=39, R_512_6_3=43, + R_512_7_0= 8, R_512_7_1=35, R_512_7_2=56, R_512_7_3=22 + }; + + uint64_t X0,X1,X2,X3,X4,X5,X6,X7; + uint64_t w[SKEIN_512_STATE_WORDS]; + uint64_t kw[SKEIN_512_STATE_WORDS+4]; + + ts[0] = ctx->h.T[0]; + ts[1] = ctx->h.T[1]; + + do + { + + ts[0] += byteCntAdd; + + ks[0] = ctx->X[0]; + ks[1] = ctx->X[1]; + ks[2] = ctx->X[2]; + ks[3] = ctx->X[3]; + ks[4] = ctx->X[4]; + ks[5] = ctx->X[5]; + ks[6] = ctx->X[6]; + ks[7] = ctx->X[7]; + ks[8] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^ + ks[4] ^ ks[5] ^ ks[6] ^ ks[7] ^ SKEIN_KS_PARITY; + + ts[2] = ts[0] ^ ts[1]; + + memcpy(w, blkPtr, SKEIN_512_STATE_WORDS << 3); + + X0 = w[0] + ks[0]; + X1 = w[1] + ks[1]; + X2 = w[2] + ks[2]; + X3 = w[3] + ks[3]; + X4 = w[4] + ks[4]; + X5 = w[5] + ks[5] + ts[0]; + X6 = w[6] + ks[6] + ts[1]; + X7 = w[7] + ks[7]; + + blkPtr += SKEIN_512_BLOCK_BYTES; + + R512_8_rounds( 0); + R512_8_rounds( 1); + R512_8_rounds( 2); + R512_8_rounds( 3); + R512_8_rounds( 4); + R512_8_rounds( 5); + R512_8_rounds( 6); + R512_8_rounds( 7); + R512_8_rounds( 8); + + ctx->X[0] = X0 ^ w[0]; + ctx->X[1] = X1 ^ w[1]; + ctx->X[2] = X2 ^ w[2]; + ctx->X[3] = X3 ^ w[3]; + ctx->X[4] = X4 ^ w[4]; + ctx->X[5] = X5 ^ w[5]; + ctx->X[6] = X6 ^ w[6]; + ctx->X[7] = X7 ^ w[7]; + + ts[1] &= ~SKEIN_T1_FLAG_FIRST; + } + while (--blkCnt); + + ctx->h.T[0] = ts[0]; + ctx->h.T[1] = ts[1]; +} + +__device__ void cn_skein_final(skeinHashState * __restrict__ state, uint8_t * __restrict__ hashVal) +{ + size_t i,n,byteCnt; + uint64_t X[SKEIN_512_STATE_WORDS]; + Skein_512_Ctxt_t *ctx = (Skein_512_Ctxt_t *)&state->u.ctx_512; + //size_t tmp; + //uint8_t *p8; + //uint64_t *p64; + + ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL; + + if (ctx->h.bCnt < SKEIN_512_BLOCK_BYTES) + { + memset(&ctx->b[ctx->h.bCnt],0,SKEIN_512_BLOCK_BYTES - ctx->h.bCnt); + //p8 = &ctx->b[ctx->h.bCnt]; + //tmp = SKEIN_512_BLOCK_BYTES - ctx->h.bCnt; + //for( i = 0; i < tmp; i++ ) *(p8+i) = 0; + } + + cn_skein512_processblock(ctx,ctx->b,1,ctx->h.bCnt); + + byteCnt = (ctx->h.hashBitLen + 7) >> 3; + + //uint8_t b[SKEIN_512_BLOCK_BYTES] == 64 + memset(ctx->b,0,sizeof(ctx->b)); + //p64 = (uint64_t *)ctx->b; + //for( i 
= 0; i < 8; i++ ) *(p64+i) = 0; + + memcpy(X,ctx->X,sizeof(X)); + + for (i=0;i*SKEIN_512_BLOCK_BYTES < byteCnt;i++) + { + ((uint64_t *)ctx->b)[0]= (uint64_t)i; + Skein_Start_New_Type(ctx,OUT_FINAL); + cn_skein512_processblock(ctx,ctx->b,1,sizeof(uint64_t)); + n = byteCnt - i*SKEIN_512_BLOCK_BYTES; + if (n >= SKEIN_512_BLOCK_BYTES) + n = SKEIN_512_BLOCK_BYTES; + memcpy(hashVal+i*SKEIN_512_BLOCK_BYTES,ctx->X,n); + memcpy(ctx->X,X,sizeof(X)); /* restore the counter mode key for next time */ + } +} + +__device__ void cn_skein512_update(Skein_512_Ctxt_t * __restrict__ ctx, const uint8_t * __restrict__ msg, size_t msgByteCnt) +{ + size_t n; + + if (msgByteCnt + ctx->h.bCnt > SKEIN_512_BLOCK_BYTES) + { + + if (ctx->h.bCnt) + { + + n = SKEIN_512_BLOCK_BYTES - ctx->h.bCnt; + + if (n) + { + memcpy(&ctx->b[ctx->h.bCnt],msg,n); + msgByteCnt -= n; + msg += n; + ctx->h.bCnt += n; + } + + cn_skein512_processblock(ctx,ctx->b,1,SKEIN_512_BLOCK_BYTES); + ctx->h.bCnt = 0; + } + + if (msgByteCnt > SKEIN_512_BLOCK_BYTES) + { + n = (msgByteCnt-1) / SKEIN_512_BLOCK_BYTES; + cn_skein512_processblock(ctx,msg,n,SKEIN_512_BLOCK_BYTES); + msgByteCnt -= n * SKEIN_512_BLOCK_BYTES; + msg += n * SKEIN_512_BLOCK_BYTES; + } + } + + if (msgByteCnt) + { + memcpy(&ctx->b[ctx->h.bCnt],msg,msgByteCnt); + ctx->h.bCnt += msgByteCnt; + } +} + +__device__ void cn_skein_update(skeinHashState * __restrict__ state, const BitSequence * __restrict__ data, DataLength databitlen) +{ + if ((databitlen & 7) == 0) + { + cn_skein512_update(&state->u.ctx_512,data,databitlen >> 3); + } + else + { + + size_t bCnt = (databitlen >> 3) + 1; + uint8_t b,mask; + + mask = (uint8_t) (1u << (7 - (databitlen & 7))); + b = (uint8_t) ((data[bCnt-1] & (0-mask)) | mask); + + cn_skein512_update(&state->u.ctx_512,data,bCnt-1); + cn_skein512_update(&state->u.ctx_512,&b , 1 ); + + Skein_Set_Bit_Pad_Flag(state->u.h); + } +} + +__device__ void cn_skein(const BitSequence * __restrict__ data, DataLength len, BitSequence * __restrict__ hashval) +{ + int hashbitlen = 256; + DataLength databitlen = len << 3; + skeinHashState state; + + state.statebits = 64*SKEIN_512_STATE_WORDS; + + cn_skein_init(&state, hashbitlen); + cn_skein_update(&state, data, databitlen); + cn_skein_final(&state, hashval); +} diff --git a/xmrstak/backend/plugin.hpp b/xmrstak/backend/plugin.hpp new file mode 100644 index 0000000..9ba9716 --- /dev/null +++ b/xmrstak/backend/plugin.hpp @@ -0,0 +1,96 @@ +#pragma once +#include <thread> +#include <atomic> +#include <vector> +#include <string> +#include "IBackend.hpp" +#include <iostream> +#include "../Environment.hpp" + +#ifndef USE_PRECOMPILED_HEADERS +#ifdef WIN32 +#include <direct.h> +#include <windows.h> +#else +#include <sys/types.h> +#include <dlfcn.h> +#endif +#include <iostream> +#endif + +namespace xmrstak +{ + +struct Plugin +{ + + Plugin(const std::string backendName, const std::string libName) : fn_starterBackend(nullptr), m_backendName(backendName) + { +#ifdef WIN32 + libBackend = LoadLibrary(TEXT((libName + ".dll").c_str())); + if(!libBackend) + { + std::cerr << "WARNING: "<< m_backendName <<" cannot load backend library: " << (libName + ".dll") << std::endl; + return; + } +#else + libBackend = dlopen((std::string("./lib") + libName + ".so").c_str(), RTLD_LAZY); + if(!libBackend) + { + std::cerr << "WARNING: "<< m_backendName <<" cannot load backend library: " << dlerror() << std::endl; + return; + } +#endif + +#ifdef WIN32 + fn_starterBackend = (starterBackend_t) GetProcAddress(libBackend, "xmrstak_start_backend"); + if 
(!fn_starterBackend) + { + std::cerr << "WARNING: backend plugin " << libName << " contains no entry 'xmrstak_start_backend': " <<GetLastError()<< std::endl; + } +#else + // reset last error + dlerror(); + fn_starterBackend = (starterBackend_t) dlsym(libBackend, "xmrstak_start_backend"); + const char* dlsym_error = dlerror(); + if(dlsym_error) + { + std::cerr << "WARNING: backend plugin " << libName << " contains no entry 'xmrstak_start_backend': " << dlsym_error << std::endl; + } +#endif + } + + std::vector<IBackend*>* startBackend(uint32_t threadOffset, miner_work& pWork, Environment& env) + { + if(fn_starterBackend == nullptr) + { + std::vector<IBackend*>* pvThreads = new std::vector<IBackend*>(); + std::cerr << "WARNING: " << m_backendName << " Backend disabled"<< std::endl; + return pvThreads; + } + + return fn_starterBackend(threadOffset, pWork, env); + } + + std::string m_backendName; + + typedef std::vector<IBackend*>* (*starterBackend_t)(uint32_t threadOffset, miner_work& pWork, Environment& env); + + starterBackend_t fn_starterBackend; + +#ifdef WIN32 + HINSTANCE libBackend; +#else + void *libBackend; +#endif + +/* \todo add unload to the destructor and change the usage of Plugin so that libs are kept open until the miner ends +#ifdef WIN32 + FreeLibrary(libBackend); +#else + dlclose(libBackend); +#endif + * */ +}; + +} // namespace xmrstak
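The unrolled per-lane assignments in cn_keccakf above are hard to audit by eye, so the following is a minimal reference sketch of a single Keccak-f[1600] round in plain CUDA C++. It follows the textbook theta/rho/pi/chi/iota ordering rather than the kernel's index-renamed form; a local rotl64_ref helper stands in for the kernel's rotl64_1/rotl64_2 pair, and the chi loop spells out what each bitselect(a ^ c, a, b) call evaluates to. It does not reproduce the kernel's special-cased round 0, which additionally exploits the mostly-zero initial state set up by cn_keccak.

#include <stdint.h> // sketch assumes compilation with nvcc in the same translation unit

static __device__ __forceinline__ uint64_t rotl64_ref(uint64_t x, unsigned n)
{
	return (x << n) | (x >> (64u - n)); // n is 1..63 for every call below
}

static __device__ void keccakf_round_ref(uint64_t s[25], uint64_t rc)
{
	// standard rotation offsets and pi lane permutation
	const int rotc[24] = { 1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 2, 14,
	                       27, 41, 56, 8, 25, 43, 62, 18, 39, 61, 20, 44 };
	const int piln[24] = { 10, 7, 11, 17, 18, 3, 5, 16, 8, 21, 24, 4,
	                       15, 23, 19, 13, 12, 2, 20, 14, 22, 9, 6, 1 };
	uint64_t bc[5], t;

	// theta: xor every lane with the parity of the two neighbouring columns
	for(int i = 0; i < 5; ++i)
		bc[i] = s[i] ^ s[i + 5] ^ s[i + 10] ^ s[i + 15] ^ s[i + 20];
	for(int i = 0; i < 5; ++i)
	{
		t = bc[(i + 4) % 5] ^ rotl64_ref(bc[(i + 1) % 5], 1);
		for(int j = 0; j < 25; j += 5)
			s[j + i] ^= t;
	}

	// rho + pi: rotate each lane and move it to its permuted position
	t = s[1];
	for(int i = 0; i < 24; ++i)
	{
		const int j = piln[i];
		bc[0] = s[j];
		s[j] = rotl64_ref(t, rotc[i]);
		t = bc[0];
	}

	// chi: s[x] ^= ~s[x+1] & s[x+2] row-wise (the kernel's bitselect form)
	for(int j = 0; j < 25; j += 5)
	{
		for(int i = 0; i < 5; ++i)
			bc[i] = s[j + i];
		for(int i = 0; i < 5; ++i)
			s[j + i] = bc[i] ^ (~bc[(i + 1) % 5] & bc[(i + 2) % 5]);
	}

	// iota: inject the round constant (keccakf_rndc[round]) into lane (0,0)
	s[0] ^= rc;
}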
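For context on how cn_skein is consumed: in CryptoNight the final 256-bit hash is produced by running one of four finalizers over the 200-byte Keccak state, selected by the two lowest bits of the first state byte. The dispatch itself is not part of this hunk, so the snippet below is only a hedged sketch of that pattern, assuming the conventional blake/groestl/jh/skein ordering and the BitSequence/DataLength typedefs that the signatures above already rely on (defined in a sibling header of this commit).

__device__ void cn_final_hash_sketch(const uint64_t state[25], uint8_t hash[32])
{
	const BitSequence* in = (const BitSequence*)state;

	// the low two bits of the first state byte pick the finalizer
	switch(((const uint8_t*)state)[0] & 0x03)
	{
	case 0: /* cn_blake(in, 200, (BitSequence*)hash);   -- see cuda_blake.hpp   */ break;
	case 1: /* cn_groestl(in, 200, (BitSequence*)hash); -- see cuda_groestl.hpp */ break;
	case 2: /* cn_jh(in, 200, (BitSequence*)hash);      -- see cuda_jh.hpp      */ break;
	case 3:
		cn_skein(in, 200, (BitSequence*)hash); // 256-bit Skein-512 as defined above
		break;
	}
}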
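plugin.hpp resolves a single C-named entry point, so a backend shared library has to export an unmangled symbol called xmrstak_start_backend whose signature matches starterBackend_t. Below is a minimal sketch of both sides, assuming the usual extern "C" export; the real backends and their call site live elsewhere in this commit, and the library name used here is illustrative.

#include "xmrstak/backend/plugin.hpp" // path per the new layout in this commit; adjust to the build's include dirs

// plugin side: exported under an unmangled name so GetProcAddress/dlsym can find it
extern "C"
#ifdef WIN32
__declspec(dllexport)
#endif
std::vector<xmrstak::IBackend*>* xmrstak_start_backend(uint32_t threadOffset, xmrstak::miner_work& pWork, xmrstak::Environment& env)
{
	// a real backend spawns its worker threads here and returns them;
	// this stub only demonstrates the required name and signature
	return new std::vector<xmrstak::IBackend*>();
}

// consuming side (sketch), given a prepared miner_work and the global Environment
std::vector<xmrstak::IBackend*>* start_cuda_backend_sketch(xmrstak::miner_work& work, xmrstak::Environment& env)
{
	// per the constructor above this loads ./libxmrstak_cuda_backend.so on Linux
	// or xmrstak_cuda_backend.dll on Windows (library name illustrative)
	xmrstak::Plugin cudaPlugin("NVIDIA", "xmrstak_cuda_backend");
	return cudaPlugin.startBackend(0, work, env);
}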