diff options
author | Jukka Ojanen <jukka.ojanen@linkotec.net> | 2014-10-31 17:55:21 +0200 |
---|---|---|
committer | Jukka Ojanen <jukka.ojanen@linkotec.net> | 2014-10-31 17:55:21 +0200 |
commit | 196fb0c0c1541cf1ec1b5e9ff8ac0e8109fde29c (patch) | |
tree | a35307258c8cd76cf4c41af630938943c2c47e09 | |
parent | 7b999686ec4c732d28efd344065606fccba84ae4 (diff) | |
download | ffts-196fb0c0c1541cf1ec1b5e9ff8ac0e8109fde29c.zip ffts-196fb0c0c1541cf1ec1b5e9ff8ac0e8109fde29c.tar.gz |
Add CMake as an alternative build system
Add support for Windows x64 (requires YASM)
-rw-r--r-- | CMakeLists.txt | 158 | ||||
-rw-r--r-- | src/codegen.c | 1501 | ||||
-rw-r--r-- | src/codegen.h | 27 | ||||
-rw-r--r-- | src/codegen_sse.h | 240 | ||||
-rw-r--r-- | src/ffts.c | 940 | ||||
-rw-r--r-- | src/ffts.h | 273 | ||||
-rw-r--r-- | src/macros-sse.h | 44 | ||||
-rw-r--r-- | src/macros.h | 167 | ||||
-rw-r--r-- | src/sse.s | 13 | ||||
-rw-r--r-- | src/sse_win64.s | 840 |
10 files changed, 2878 insertions, 1325 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..365ec32 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,158 @@ +cmake_minimum_required(VERSION 2.8) + +project(ffts C ASM) + +set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake) +set_property(GLOBAL PROPERTY USE_FOLDERS ON) + +# default build type is Debug which means no optimization +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE "Release") +endif(NOT CMAKE_BUILD_TYPE) + +# common options +option(ENABLE_SSE + "Enables the use of SSE instructions." ON +) + +option(ENABLE_NEON + "Enables the use of NEON instructions." OFF +) + +option(ENABLE_VFP + "Enables the use of VFP instructions." OFF +) + +option(DISABLE_DYNAMIC_CODE + "Disables the use of dynamic machine code generation." OFF +) + +option(ENABLE_SHARED + "Enable building a shared library." OFF +) + +#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -pedantic -pipe -Wall") +add_definitions(-DFFTS_CMAKE_GENERATED) + +include(CheckIncludeFile) +include(CheckLibraryExists) + +if(MSVC) + add_definitions(-D_USE_MATH_DEFINES) +else() + # some systems need libm for some of the math functions to work + check_library_exists(m pow "" HAVE_LIBM) + if(HAVE_LIBM) + list(APPEND CMAKE_REQUIRED_LIBRARIES m) + list(APPEND FFTS_EXTRA_LIBRARIES m) + endif(HAVE_LIBM) +endif(MSVC) + +include_directories(src) +include_directories(${CMAKE_CURRENT_BINARY_DIR}) + +set(FFTS_SOURCES + src/ffts_attributes.h + src/ffts.c + src/ffts.h + src/ffts_nd.c + src/ffts_nd.h + src/ffts_real.h + src/ffts_real.c + src/ffts_real_nd.c + src/ffts_real_nd.h + src/ffts_small.c + src/macros.h + src/patterns.c + src/patterns.h + src/types.h +) + +if(ENABLE_SSE) + list(APPEND FFTS_SOURCES + src/macros-sse.h + ) + + if(MSVC) + set(CMAKE_ASM-ATT_COMPILER yasm) + enable_language(ASM-ATT) + + add_custom_command( + OUTPUT sse_win64.obj + COMMAND ${CMAKE_ASM-ATT_COMPILER} -f win64 -m amd64 + -o ${CMAKE_CURRENT_BINARY_DIR}/sse_win64.obj -p gas + ${CMAKE_CURRENT_SOURCE_DIR}/src/sse_win64.s + 
DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/src/sse_win64.s + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + COMMENT "Generating sse_win64.obj" + ) + + list(APPEND FFTS_SOURCES + ${CMAKE_CURRENT_BINARY_DIR}/sse_win64.obj + src/sse_win64.s + ) + else() + list(APPEND FFTS_SOURCES + src/sse.s + ) + endif(MSVC) + + add_definitions(-D_USE_MATH_DEFINES) + add_definitions(-D__x86_64__) + add_definitions(-DHAVE_SSE -msse2) +endif() + +if(ENABLE_NEON) + if(DISABLE_DYNAMIC_CODE) + list(APPEND FFTS_SOURCES + source/neon_static_f.s + source/neon_static_i.s + ) + else() + list(APPEND FFTS_SOURCES + source/neon.s + source/arch/neon.c + ) + endif() + + add_definitions(-DHAVE_NEON) +endif() + +if(ENABLE_VFP) + list(APPEND FFTS_SOURCES + source/vfp.s + source/arch/vfp.c + ) + + add_definitions(-DHAVE_VFP) +endif() + +if(ENABLE_SINGLE) + add_definitions(-DHAVE_SINGLE) +endif() + +if(DISABLE_DYNAMIC_CODE) + list(APPEND FFTS_SOURCES + src/ffts_static.c + ) + + add_definitions(-DDYNAMIC_DISABLED) +else() + list(APPEND FFTS_SOURCES + src/codegen.c + src/codegen.h + ) +endif() + +add_library(ffts_static + ${FFTS_SOURCES} +) + +add_executable(ffts_test + tests/test.c +) + +target_link_libraries(ffts_test + ffts_static + ${FFTS_EXTRA_LIBRARIES} +)
\ No newline at end of file diff --git a/src/codegen.c b/src/codegen.c index 79aaca6..0cc3d24 100644 --- a/src/codegen.c +++ b/src/codegen.c @@ -1,10 +1,10 @@ /* - + This file is part of FFTS -- The Fastest Fourier Transform in the South - + Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com> - Copyright (c) 2012, The University of Waikato - + Copyright (c) 2012, The University of Waikato + All rights reserved. Redistribution and use in source and binary forms, with or without @@ -35,698 +35,951 @@ #include "macros.h" #include "ffts.h" -#ifdef __APPLE__ - #include <libkern/OSCacheControl.h> -#endif - -#include <sys/types.h> -#include <sys/mman.h> - #ifdef HAVE_NEON - #include "codegen_arm.h" - #include "neon.h" +#include "codegen_arm.h" +#include "neon.h" #elif HAVE_VFP - #include "codegen_arm.h" - #include "vfp.h" +#include "codegen_arm.h" +#include "vfp.h" #else - #include "codegen_sse.h" - #include "macros-sse.h" +#include "codegen_sse.h" +#include "macros-sse.h" #endif +#include <assert.h> +#include <errno.h> +#include <stddef.h> +/* #include <stdio.h> */ +#include <stdlib.h> +#include <string.h> + #ifdef __ANDROID__ - #include <unistd.h> +#include <unistd.h> #endif -int tree_count(int N, int leafN, int offset) { - - if(N <= leafN) return 0; - int count = 0; - count += tree_count(N/4, leafN, offset); - count += tree_count(N/8, leafN, offset + N/4); - count += tree_count(N/8, leafN, offset + N/4 + N/8); - count += tree_count(N/4, leafN, offset + N/2); - count += tree_count(N/4, leafN, offset + 3*N/4); - - return 1 + count; -} - -void elaborate_tree(size_t **p, int N, int leafN, int offset) { - - if(N <= leafN) return; - elaborate_tree(p, N/4, leafN, offset); - elaborate_tree(p, N/8, leafN, offset + N/4); - elaborate_tree(p, N/8, leafN, offset + N/4 + N/8); - elaborate_tree(p, N/4, leafN, offset + N/2); - elaborate_tree(p, N/4, leafN, offset + 3*N/4); - - (*p)[0] = N; - (*p)[1] = offset*2; +#ifdef __arm__ +typedef uint32_t insns_t; +#else +typedef uint8_t 
insns_t; +#endif - (*p)+=2; -} +#define P(x) (*(*p)++ = x) +static int ffts_tree_count(int N, int leaf_N, int offset) +{ + int count; + if (N <= leaf_N) { + return 0; + } + count = ffts_tree_count(N/4, leaf_N, offset); + count += ffts_tree_count(N/8, leaf_N, offset + N/4); + count += ffts_tree_count(N/8, leaf_N, offset + N/4 + N/8); + count += ffts_tree_count(N/4, leaf_N, offset + N/2); + count += ffts_tree_count(N/4, leaf_N, offset + 3*N/4); -uint32_t LUT_offset(size_t N, size_t leafN) { - int i; - size_t p_lut_size = 0; - size_t lut_size = 0; - int hardcoded = 0; - size_t n_luts = __builtin_ctzl(N/leafN); - int n = leafN*2; - //if(N <= 32) { n_luts = __builtin_ctzl(N/4); hardcoded = 1; } - - for(i=0;i<n_luts-1;i++) { - p_lut_size = lut_size; - if(!i || hardcoded) { - #ifdef __arm__ - if(N <= 32) lut_size += n/4 * 2 * sizeof(cdata_t); - else lut_size += n/4 * sizeof(cdata_t); - #else - lut_size += n/4 * 2 * sizeof(cdata_t); - #endif - // n *= 2; - } else { - #ifdef __arm__ - lut_size += n/8 * 3 * sizeof(cdata_t); - #else - lut_size += n/8 * 3 * 2 * sizeof(cdata_t); - #endif - } - n *= 2; - } - return lut_size; + return 1 + count; } -#ifdef __arm__ - typedef uint32_t insns_t; -#else - typedef uint8_t insns_t; -#endif +static void ffts_elaborate_tree(size_t **p, int N, int leaf_N, int offset) +{ + if (N <= leaf_N) { + return; + } -#define P(x) (*(*p)++ = x) + ffts_elaborate_tree(p, N/4, leaf_N, offset); + ffts_elaborate_tree(p, N/8, leaf_N, offset + N/4); + ffts_elaborate_tree(p, N/8, leaf_N, offset + N/4 + N/8); + ffts_elaborate_tree(p, N/4, leaf_N, offset + N/2); + ffts_elaborate_tree(p, N/4, leaf_N, offset + 3*N/4); + + (*p)[0] = N; + (*p)[1] = 2 * offset; -void insert_nops(uint8_t **p, uint32_t count) { - switch(count) { - case 0: break; - case 2: P(0x66); - case 1: P(0x90); break; - case 3: P(0x0F); P(0x1F); P(0x00); break; - case 4: P(0x0F); P(0x1F); P(0x40); P(0x00); break; - case 5: P(0x0F); P(0x1F); P(0x44); P(0x00); P(0x00); break; - case 6: P(0x66); 
P(0x0F); P(0x1F); P(0x44); P(0x00); P(0x00); break; - case 7: P(0x0F); P(0x1F); P(0x80); P(0x00); P(0x00); P(0x00); P(0x00); break; - case 8: P(0x0F); P(0x1F); P(0x84); P(0x00); P(0x00); P(0x00); P(0x00); P(0x00); break; - case 9: P(0x66); P(0x0F); P(0x1F); P(0x84); P(0x00); P(0x00); P(0x00); P(0x00); P(0x00); break; - default: - P(0x66); P(0x0F); P(0x1F); P(0x84); P(0x00); P(0x00); P(0x00); P(0x00); P(0x00); - insert_nops(p, count-9); - break; - } + (*p) += 2; } +static void ffts_insert_nops(uint8_t **p, uint32_t count) +{ + if (count >= 9) { + P(0x66); + P(0x0F); + P(0x1F); + P(0x84); + P(0x00); + P(0x00); + P(0x00); + P(0x00); + P(0x00); + + if (count > 9) { + ffts_insert_nops(p, count - 9); + } + } else { + switch(count) { + case 0: + break; + case 2: + P(0x66); + /* fall through */ + case 1: + P(0x90); + break; + case 3: + P(0x0F); + P(0x1F); + P(0x00); + break; + case 4: + P(0x0F); + P(0x1F); + P(0x40); + P(0x00); + break; + case 5: + P(0x0F); + P(0x1F); + P(0x44); + P(0x00); + P(0x00); + break; + case 6: + P(0x66); + P(0x0F); + P(0x1F); + P(0x44); + P(0x00); + P(0x00); + break; + case 7: + P(0x0F); + P(0x1F); + P(0x80); + P(0x00); + P(0x00); + P(0x00); + P(0x00); + break; + case 8: + default: + P(0x0F); + P(0x1F); + P(0x84); + P(0x00); + P(0x00); + P(0x00); + P(0x00); + P(0x00); + break; + } + } +} -void align_mem16(uint8_t **p, uint32_t offset) { +static void ffts_align_mem16(uint8_t **p, uint32_t offset) +{ #ifdef __x86_64__ - int r = (16 - (offset & 0xf)) - ((uint32_t)(*p) & 0xf); - r = (16 + r) & 0xf; - insert_nops(p, r); + int r = (16 - (offset & 0xf)) - ((uintptr_t)(*p) & 0xf); + r = (16 + r) & 0xf; + ffts_insert_nops(p, r); #endif } -void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN, int sign) { - int count = tree_count(N, leafN, 0) + 1; - size_t *ps = malloc(count * 2 * sizeof(size_t)); - size_t *pps = ps; +transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leaf_N, int sign) +{ + uint32_t offsets[8] = {0, N, 
N/2, 3*N/2, N/4, 5*N/4, 7*N/4, 3*N/4}; + uint32_t offsets_o[8] = {0, N, N/2, 3*N/2, 7*N/4, 3*N/4, N/4, 5*N/4}; + + int32_t pAddr = 0; + int32_t pN = 0; + int32_t pLUT = 0; + + insns_t *fp; + insns_t *start; + insns_t *x_4_addr; + insns_t *x_8_addr; + uint32_t loop_count; + + int count; + int i; + ptrdiff_t len; + + size_t *ps; + size_t *pps; + + count = ffts_tree_count(N, leaf_N, 0) + 1; + + ps = pps = malloc(2 * count * sizeof(*ps)); + if (!ps) { + return NULL; + } + + ffts_elaborate_tree(&pps, N, leaf_N, 0); + + pps[0] = 0; + pps[1] = 0; + + pps = ps; #ifdef __x86_64__ - if(sign < 0) p->constants = sse_constants; - else p->constants = sse_constants_inv; + if (sign < 0) { + p->constants = sse_constants; + } else { + p->constants = sse_constants_inv; + } #endif - elaborate_tree(&pps, N, leafN, 0); - pps[0] = 0; - pps[1] = 0; + fp = (insns_t*) p->transform_base; - pps = ps; +#ifdef __arm__ +#ifdef HAVE_NEON + memcpy(fp, neon_x8, neon_x8_t - neon_x8); + /* + * Changes adds to subtracts and vice versa to allow the computation + * of both the IFFT and FFT + */ + if(sign < 0) { + fp[31] ^= 0x00200000; + fp[32] ^= 0x00200000; + fp[33] ^= 0x00200000; + fp[34] ^= 0x00200000; + fp[65] ^= 0x00200000; + fp[66] ^= 0x00200000; + fp[70] ^= 0x00200000; + fp[74] ^= 0x00200000; + fp[97] ^= 0x00200000; + fp[98] ^= 0x00200000; + fp[102] ^= 0x00200000; + fp[104] ^= 0x00200000; + } + fp += (neon_x8_t - neon_x8) / 4; +#else + memcpy(fp, vfp_x8, vfp_end - vfp_x8); + if(sign > 0) { + fp[65] ^= 0x00000040; + fp[66] ^= 0x00000040; + fp[68] ^= 0x00000040; + fp[70] ^= 0x00000040; + fp[103] ^= 0x00000040; + fp[104] ^= 0x00000040; + fp[105] ^= 0x00000040; + fp[108] ^= 0x00000040; + fp[113] ^= 0x00000040; + fp[114] ^= 0x00000040; + fp[117] ^= 0x00000040; + fp[118] ^= 0x00000040; + } + fp += (vfp_end - vfp_x8) / 4; +#endif +#else + /* align call destination */ + ffts_align_mem16(&fp, 0); + x_8_addr = fp; -#ifdef __arm__ - if(N < 8192) p->transform_size = 8192; - else p->transform_size = N; + /* 
align loop/jump destination */ +#ifdef _M_AMD64 + ffts_align_mem16(&fp, 6); #else - if(N < 2048) p->transform_size = 16384; - else p->transform_size = 16384 + 2*N/8 * __builtin_ctzl(N); + ffts_align_mem16(&fp, 5); +#endif + + /* copy function */ + assert((char*) x8_soft_end > (char*) x8_soft); + len = (char*) x8_soft_end - (char*) x8_soft; + memcpy(fp, x8_soft, (size_t) len); + fp += len; #endif + //uint32_t *x_8_t_addr = fp; + //memcpy(fp, neon_x8_t, neon_end - neon_x8_t); + //fp += (neon_end - neon_x8_t) / 4; -#ifdef __APPLE__ - p->transform_base = mmap(NULL, p->transform_size, PROT_WRITE | PROT_READ, MAP_ANON | MAP_SHARED, -1, 0); +#ifdef __arm__ +#ifdef HAVE_NEON + memcpy(fp, neon_x4, neon_x8 - neon_x4); + if(sign < 0) { + fp[26] ^= 0x00200000; + fp[28] ^= 0x00200000; + fp[31] ^= 0x00200000; + fp[32] ^= 0x00200000; + } + fp += (neon_x8 - neon_x4) / 4; #else -#define MAP_ANONYMOUS 0x20 - p->transform_base = mmap(NULL, p->transform_size, PROT_WRITE | PROT_READ, MAP_ANONYMOUS | MAP_SHARED, -1, 0); + memcpy(fp, vfp_x4, vfp_x8 - vfp_x4); + if(sign > 0) { + fp[36] ^= 0x00000040; + fp[38] ^= 0x00000040; + fp[43] ^= 0x00000040; + fp[44] ^= 0x00000040; + } + fp += (vfp_x8 - vfp_x4) / 4; +#endif +#else + /* align call destination */ + ffts_align_mem16(&fp, 0); + x_4_addr = fp; + + /* copy function */ + assert((char*) x8_soft > (char*) x4); + len = (char*) x8_soft - (char*) x4; + memcpy(fp, x4, (size_t) len); + fp += len; #endif -/* - if(p->transform_base == MAP_FAILED) { - fprintf(stderr, "MAP FAILED\n"); - exit(1); - }*/ - insns_t *func = p->transform_base;//valloc(8192); - insns_t *fp = func; - -//fprintf(stderr, "Allocating %d bytes \n", p->transform_size); -//fprintf(stderr, "Base address = %016p\n", func); - - if(!func) { - fprintf(stderr, "NOMEM\n"); - exit(1); - } - - insns_t *x_8_addr = fp; #ifdef __arm__ + start = fp; + + *fp = PUSH_LR(); + fp++; + *fp = 0xed2d8b10; + fp++; + + ADDI(&fp, 3, 1, 0); + ADDI(&fp, 7, 1, N); + ADDI(&fp, 5, 1, 2*N); + ADDI(&fp, 10, 7, 
2*N); + ADDI(&fp, 4, 5, 2*N); + ADDI(&fp, 8, 10, 2*N); + ADDI(&fp, 6, 4, 2*N); + ADDI(&fp, 9, 8, 2*N); + + *fp = LDRI(12, 0, ((uint32_t)&p->offsets) - ((uint32_t)p)); + fp++; // load offsets into r12 + // *fp++ = LDRI(1, 0, 4); // load ws into r1 + ADDI(&fp, 1, 0, 0); + + ADDI(&fp, 0, 2, 0), // mov out into r0 + *fp = LDRI(2, 1, ((uint32_t)&p->ee_ws) - ((uint32_t)p)); + fp++; + #ifdef HAVE_NEON - memcpy(fp, neon_x8, neon_x8_t - neon_x8); - /* - * Changes adds to subtracts and vice versa to allow the computation - * of both the IFFT and FFT - */ - if(sign < 0) { - fp[31] ^= 0x00200000; fp[32] ^= 0x00200000; fp[33] ^= 0x00200000; fp[34] ^= 0x00200000; - fp[65] ^= 0x00200000; fp[66] ^= 0x00200000; fp[70] ^= 0x00200000; fp[74] ^= 0x00200000; - fp[97] ^= 0x00200000; fp[98] ^= 0x00200000; fp[102] ^= 0x00200000; fp[104] ^= 0x00200000; - } - fp += (neon_x8_t - neon_x8) / 4; + MOVI(&fp, 11, p->i0); +#else + MOVI(&fp, 11, p->i0); +#endif +#else + /* align call destination */ + ffts_align_mem16(&fp, 0); + start = fp; + + /* save nonvolatile registers */ +#ifdef _M_AMD64 + /* use the shadow space to save first 3 registers */ + + /* mov [rsp + 8], rbx */ + *fp++ = 0x48; + *fp++ = 0x89; + *fp++ = 0x5C; + *fp++ = 0x24; + *fp++ = 0x08; + + /* mov [rsp + 16], rsi */ + *fp++ = 0x48; + *fp++ = 0x89; + *fp++ = 0x74; + *fp++ = 0x24; + *fp++ = 0x10; + + /* mov [rsp + 24], rdi */ + *fp++ = 0x48; + *fp++ = 0x89; + *fp++ = 0x7C; + *fp++ = 0x24; + *fp++ = 0x18; #else - memcpy(fp, vfp_x8, vfp_end - vfp_x8); - if(sign > 0) { - fp[65] ^= 0x00000040; - fp[66] ^= 0x00000040; - fp[68] ^= 0x00000040; - fp[70] ^= 0x00000040; - fp[103] ^= 0x00000040; - fp[104] ^= 0x00000040; - fp[105] ^= 0x00000040; - fp[108] ^= 0x00000040; - fp[113] ^= 0x00000040; - fp[114] ^= 0x00000040; - fp[117] ^= 0x00000040; - fp[118] ^= 0x00000040; - } - fp += (vfp_end - vfp_x8) / 4; + PUSH(&fp, RBP); + PUSH(&fp, RBX); + PUSH(&fp, R10); + PUSH(&fp, R11); + PUSH(&fp, R12); + PUSH(&fp, R13); + PUSH(&fp, R14); + PUSH(&fp, R15); 
#endif + + /* assign loop counter register */ + loop_count = p->i0 * 4; +#ifdef _M_AMD64 + MOVI(&fp, EBX, loop_count); #else - align_mem16(&fp, 0); - x_8_addr = fp; - align_mem16(&fp, 5); - memcpy(fp, x8_soft, x8_hard - x8_soft); - fp += (x8_hard - x8_soft); -//fprintf(stderr, "X8 start address = %016p\n", x_8_addr); + MOVI(&fp, ECX, loop_count); +#endif #endif -//uint32_t *x_8_t_addr = fp; -//memcpy(fp, neon_x8_t, neon_end - neon_x8_t); -//fp += (neon_end - neon_x8_t) / 4; - insns_t *x_4_addr = fp; + #ifdef __arm__ - #ifdef HAVE_NEON - memcpy(fp, neon_x4, neon_x8 - neon_x4); - if(sign < 0) { - fp[26] ^= 0x00200000; fp[28] ^= 0x00200000; fp[31] ^= 0x00200000; fp[32] ^= 0x00200000; - } - fp += (neon_x8 - neon_x4) / 4; - #else - memcpy(fp, vfp_x4, vfp_x8 - vfp_x4); - if(sign > 0) { - fp[36] ^= 0x00000040; - fp[38] ^= 0x00000040; - fp[43] ^= 0x00000040; - fp[44] ^= 0x00000040; - } - fp += (vfp_x8 - vfp_x4) / 4; - #endif +#ifdef HAVE_NEON + memcpy(fp, neon_ee, neon_oo - neon_ee); + if (sign < 0) { + fp[33] ^= 0x00200000; + fp[37] ^= 0x00200000; + fp[38] ^= 0x00200000; + fp[39] ^= 0x00200000; + fp[40] ^= 0x00200000; + fp[41] ^= 0x00200000; + fp[44] ^= 0x00200000; + fp[45] ^= 0x00200000; + fp[46] ^= 0x00200000; + fp[47] ^= 0x00200000; + fp[48] ^= 0x00200000; + fp[57] ^= 0x00200000; + } + + fp += (neon_oo - neon_ee) / 4; +#else + memcpy(fp, vfp_e, vfp_o - vfp_e); + + if (sign > 0) { + fp[64] ^= 0x00000040; + fp[65] ^= 0x00000040; + fp[68] ^= 0x00000040; + fp[75] ^= 0x00000040; + fp[76] ^= 0x00000040; + fp[79] ^= 0x00000040; + fp[80] ^= 0x00000040; + fp[83] ^= 0x00000040; + fp[84] ^= 0x00000040; + fp[87] ^= 0x00000040; + fp[91] ^= 0x00000040; + fp[93] ^= 0x00000040; + } + fp += (vfp_o - vfp_e) / 4; +#endif #else - align_mem16(&fp, 0); - x_4_addr = fp; - memcpy(fp, x4, x8_soft - x4); - fp += (x8_soft - x4); + //fprintf(stderr, "Body start address = %016p\n", start); + /* copy function */ + assert((char*) leaf_ee > (char*) leaf_ee_init); + len = (char*) leaf_ee - (char*) 
leaf_ee_init; + memcpy(fp, leaf_ee_init, (size_t) len); + fp += len; + + /* align loop/jump destination */ +#ifdef _M_AMD64 + ffts_align_mem16(&fp, 8); +#else + ffts_align_mem16(&fp, 9); #endif - insns_t *start = fp; - -#ifdef __arm__ - *fp = PUSH_LR(); fp++; - *fp = 0xed2d8b10; fp++; - - ADDI(&fp, 3, 1, 0); - ADDI(&fp, 7, 1, N); - ADDI(&fp, 5, 1, 2*N); - ADDI(&fp, 10, 7, 2*N); - ADDI(&fp, 4, 5, 2*N); - ADDI(&fp, 8, 10, 2*N); - ADDI(&fp, 6, 4, 2*N); - ADDI(&fp, 9, 8, 2*N); - - *fp = LDRI(12, 0, ((uint32_t)&p->offsets) - ((uint32_t)p)); fp++; // load offsets into r12 -// *fp++ = LDRI(1, 0, 4); // load ws into r1 - ADDI(&fp, 1, 0, 0); - - ADDI(&fp, 0, 2, 0), // mov out into r0 + + /* copy function */ + assert((char*) leaf_oo > (char*) leaf_ee); + len = (char*) leaf_oo - (char*) leaf_ee; + memcpy(fp, leaf_ee, (size_t) len); + + /* patch offsets */ + for (i = 0; i < 8; i++) { + IMM32_NI(fp + sse_leaf_ee_offsets[i], 4 * offsets[i]); + } + + fp += len; + + if (ffts_ctzl(N) & 1) { + if (p->i1) { + loop_count += 4 * p->i1; + + /* align loop/jump destination */ +#ifdef _M_AMD64 + MOVI(&fp, EBX, loop_count); + ffts_align_mem16(&fp, 3); +#else + MOVI(&fp, ECX, loop_count); + ffts_align_mem16(&fp, 4); #endif + /* copy function */ + assert((char*) leaf_eo > (char*) leaf_oo); + len = (char*) leaf_eo - (char*) leaf_oo; + memcpy(fp, leaf_oo, len); -#ifdef __arm__ - *fp = LDRI(2, 1, ((uint32_t)&p->ee_ws) - ((uint32_t)p)); fp++; - #ifdef HAVE_NEON - MOVI(&fp, 11, p->i0); - #else - MOVI(&fp, 11, p->i0); - #endif + /* patch offsets */ + for (i = 0; i < 8; i++) { + IMM32_NI(fp + sse_leaf_oo_offsets[i], 4 * offsets_o[i]); + } + + fp += len; + } + + loop_count += 4; + + /* copy function */ + assert((char*) leaf_end > (char*) leaf_oe); + len = (char*) leaf_end - (char*) leaf_oe; + memcpy(fp, leaf_oe, len); + + /* patch offsets */ + for (i = 0; i < 8; i++) { + IMM32_NI(fp + sse_leaf_oe_offsets[i], 4 * offsets_o[i]); + } + fp += len; + } else { + loop_count += 4; + + /* copy function */ + 
assert((char*) leaf_oe > (char*) leaf_eo); + len = (char*) leaf_oe - (char*) leaf_eo; + memcpy(fp, leaf_eo, len); + + /* patch offsets */ + for (i = 0; i < 8; i++) { + IMM32_NI(fp + sse_leaf_eo_offsets[i], 4 * offsets[i]); + } + + fp += len; + + if (p->i1) { + loop_count += 4 * p->i1; + + /* align loop/jump destination */ +#ifdef _M_AMD64 + MOVI(&fp, EBX, loop_count); + ffts_align_mem16(&fp, 3); #else - align_mem16(&fp, 0); - start = fp; - - *fp++ = 0x4c; - *fp++ = 0x8b; - *fp++ = 0x07; - uint32_t lp_cnt = p->i0 * 4; - MOVI(&fp, RCX, lp_cnt); - - //LEA(&fp, R8, RDI, ((uint32_t)&p->offsets) - ((uint32_t)p)); + MOVI(&fp, ECX, loop_count); + ffts_align_mem16(&fp, 4); #endif - //fp++; -#ifdef __arm__ -#ifdef HAVE_NEON - memcpy(fp, neon_ee, neon_oo - neon_ee); - if(sign < 0) { - fp[33] ^= 0x00200000; fp[37] ^= 0x00200000; fp[38] ^= 0x00200000; fp[39] ^= 0x00200000; - fp[40] ^= 0x00200000; fp[41] ^= 0x00200000; fp[44] ^= 0x00200000; fp[45] ^= 0x00200000; - fp[46] ^= 0x00200000; fp[47] ^= 0x00200000; fp[48] ^= 0x00200000; fp[57] ^= 0x00200000; - } - fp += (neon_oo - neon_ee) / 4; + + /* copy function */ + assert((char*) leaf_eo > (char*) leaf_oo); + len = (char*) leaf_eo - (char*) leaf_oo; + memcpy(fp, leaf_oo, len); + + for (i = 0; i < 8; i++) { + IMM32_NI(fp + sse_leaf_oo_offsets[i], 4 * offsets_o[i]); + } + + fp += len; + } + } + + if (p->i1) { + uint32_t offsets_oe[8] = {7*N/4, 3*N/4, N/4, 5*N/4, 0, N, 3*N/2, N/2}; + + loop_count += 4 * p->i1; + + /* align loop/jump destination */ +#ifdef _M_AMD64 + MOVI(&fp, EBX, loop_count); + ffts_align_mem16(&fp, 8); #else - memcpy(fp, vfp_e, vfp_o - vfp_e); - if(sign > 0) { - fp[64] ^= 0x00000040; fp[65] ^= 0x00000040; fp[68] ^= 0x00000040; fp[75] ^= 0x00000040; - fp[76] ^= 0x00000040; fp[79] ^= 0x00000040; fp[80] ^= 0x00000040; fp[83] ^= 0x00000040; - fp[84] ^= 0x00000040; fp[87] ^= 0x00000040; fp[91] ^= 0x00000040; fp[93] ^= 0x00000040; - } - fp += (vfp_o - vfp_e) / 4; + MOVI(&fp, ECX, loop_count); + ffts_align_mem16(&fp, 9); 
#endif + + assert((char*) leaf_oo > (char*) leaf_ee); + len = (char*) leaf_oo - (char*) leaf_ee; + memcpy(fp, leaf_ee, len); + + for (i = 0; i < 8; i++) { + IMM32_NI(fp + sse_leaf_ee_offsets[i], 4 * offsets_oe[i]); + } + + fp += len; + } + + //fprintf(stderr, "Body start address = %016p\n", fp); + //LEA(&fp, R8, RDI, ((uint32_t)&p->ws) - ((uint32_t)p)); + memcpy(fp, x_init, (char*) x4 - (char*) x_init); + //IMM32_NI(fp + 3, ((int64_t)READ_IMM32(fp + 3)) + ((void *)x_init - (void *)fp )); + fp += ((char*) x4 - (char*) x_init); + + count = 2; + while (pps[0]) { + size_t ws_is; + + if (!pN) { +#ifdef _M_AMD64 + MOVI(&fp, EBX, pps[0]); #else -//fprintf(stderr, "Body start address = %016p\n", start); - - PUSH(&fp, RBP); - PUSH(&fp, RBX); - PUSH(&fp, R10); - PUSH(&fp, R11); - PUSH(&fp, R12); - PUSH(&fp, R13); - PUSH(&fp, R14); - PUSH(&fp, R15); - - int i; - memcpy(fp, leaf_ee_init, leaf_ee - leaf_ee_init); - -//fprintf(stderr, "Leaf ee init address = %016p\n", leaf_ee_init); -//fprintf(stderr, "Constants address = %016p\n", sse_constants); -//fprintf(stderr, "Constants address = %016p\n", p->constants); - -//int32_t val = READ_IMM32(fp + 3); -//fprintf(stderr, "diff = 0x%x\n", ((uint32_t)&p->constants) - ((uint32_t)p)); - -//int64_t v2 = val + (int64_t)((void *)leaf_ee_init - (void *)fp ); -//fprintf(stderr, "IMM = 0x%llx\n", v2); - -//IMM32_NI(fp + 3, ((int64_t) READ_IMM32(fp + 3)) + ((void *)leaf_ee_init - (void *)fp )); - fp += (leaf_ee - leaf_ee_init); - -//fprintf(stderr, "Leaf start address = %016p\n", fp); - align_mem16(&fp, 9); - memcpy(fp, leaf_ee, leaf_oo - leaf_ee); - - - uint32_t offsets[8] = {0, N, N/2, 3*N/2, N/4, 5*N/4, 7*N/4, 3*N/4}; - uint32_t offsets_o[8] = {0, N, N/2, 3*N/2, 7*N/4, 3*N/4, N/4, 5*N/4}; - uint32_t offsets_oe[8] = {7*N/4, 3*N/4, N/4, 5*N/4, 0, N, 3*N/2, N/2}; - - for(i=0;i<8;i++) IMM32_NI(fp + sse_leaf_ee_offsets[i], offsets[i]*4); - - fp += (leaf_oo - leaf_ee); - - if(__builtin_ctzl(N) & 1){ - - if(p->i1) { - lp_cnt += p->i1 * 4; - 
MOVI(&fp, RCX, lp_cnt); - align_mem16(&fp, 4); - memcpy(fp, leaf_oo, leaf_eo - leaf_oo); - for(i=0;i<8;i++) IMM32_NI(fp + sse_leaf_oo_offsets[i], offsets_o[i]*4); - fp += (leaf_eo - leaf_oo); - } - - - memcpy(fp, leaf_oe, leaf_end - leaf_oe); - lp_cnt += 4; - for(i=0;i<8;i++) IMM32_NI(fp + sse_leaf_oe_offsets[i], offsets_o[i]*4); - fp += (leaf_end - leaf_oe); - - }else{ - - - memcpy(fp, leaf_eo, leaf_oe - leaf_eo); - lp_cnt += 4; - for(i=0;i<8;i++) IMM32_NI(fp + sse_leaf_eo_offsets[i], offsets[i]*4); - fp += (leaf_oe - leaf_eo); - - if(p->i1) { - lp_cnt += p->i1 * 4; - MOVI(&fp, RCX, lp_cnt); - align_mem16(&fp, 4); - memcpy(fp, leaf_oo, leaf_eo - leaf_oo); - for(i=0;i<8;i++) IMM32_NI(fp + sse_leaf_oo_offsets[i], offsets_o[i]*4); - fp += (leaf_eo - leaf_oo); - } - - } - if(p->i1) { - lp_cnt += p->i1 * 4; - MOVI(&fp, RCX, lp_cnt); - align_mem16(&fp, 9); - memcpy(fp, leaf_ee, leaf_oo - leaf_ee); - for(i=0;i<8;i++) IMM32_NI(fp + sse_leaf_ee_offsets[i], offsets_oe[i]*4); - fp += (leaf_oo - leaf_ee); - - } - -//fprintf(stderr, "Body start address = %016p\n", fp); - //LEA(&fp, R8, RDI, ((uint32_t)&p->ws) - ((uint32_t)p)); - memcpy(fp, x_init, x4 - x_init); -//IMM32_NI(fp + 3, ((int64_t)READ_IMM32(fp + 3)) + ((void *)x_init - (void *)fp )); - fp += (x4 - x_init); - - int32_t pAddr = 0; - int32_t pN = 0; - int32_t pLUT = 0; - count = 2; - while(pps[0]) { - - if(!pN) { - MOVI(&fp, RCX, pps[0] / 4); - }else{ - if((pps[1]*4)-pAddr) ADDI(&fp, RDX, (pps[1] * 4)- pAddr); - if(pps[0] > leafN && pps[0] - pN) { - - int diff = __builtin_ctzl(pps[0]) - __builtin_ctzl(pN); - *fp++ = 0xc1; - - if(diff > 0) { - *fp++ = 0xe1; - *fp++ = (diff & 0xff); - }else{ - *fp++ = 0xe9; - *fp++ = ((-diff) & 0xff); - } - } - } - - if(p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8 - pLUT) - ADDI(&fp, R8, p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8 - pLUT); - - - if(pps[0] == 2*leafN) { - CALL(&fp, x_4_addr); - // }else if(!pps[2]){ - // //uint32_t *x_8_t_addr = fp; - // memcpy(fp, neon_x8_t, neon_ee - 
neon_x8_t); - // fp += (neon_ee - neon_x8_t) / 4; - // //*fp++ = BL(fp+2, x_8_t_addr); - }else{ - CALL(&fp, x_8_addr); - } - - pAddr = pps[1] * 4; - if(pps[0] > leafN) - pN = pps[0]; - pLUT = p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8;//LUT_offset(pps[0], leafN); -// fprintf(stderr, "LUT offset for %d is %d\n", pN, pLUT); - count += 4; - pps += 2; - } + MOVI(&fp, ECX, pps[0] / 4); #endif + } else { + int offset = (4 * pps[1]) - pAddr; + if (offset) { +#ifdef _M_AMD64 + ADDI(&fp, R8, offset); +#else + ADDI(&fp, RDX, offset); +#endif + } + + if (pps[0] > leaf_N && pps[0] - pN) { + int factor = ffts_ctzl(pps[0]) - ffts_ctzl(pN); + +#ifdef _M_AMD64 + SHIFT(&fp, EBX, factor); +#else + SHIFT(&fp, ECX, factor); +#endif + } + } + + ws_is = 8 * p->ws_is[ffts_ctzl(pps[0] / leaf_N) - 1]; + if (ws_is != pLUT) { + int offset = (int) (ws_is - pLUT); + +#ifdef _M_AMD64 + ADDI(&fp, RDI, offset); +#else + ADDI(&fp, R8, offset); +#endif + } + + if (pps[0] == 2 * leaf_N) { + CALL(&fp, x_4_addr); + } else { + CALL(&fp, x_8_addr); + } + + pAddr = 4 * pps[1]; + if (pps[0] > leaf_N) { + pN = pps[0]; + } + + pLUT = ws_is;//LUT_offset(pps[0], leafN); + //fprintf(stderr, "LUT offset for %d is %d\n", pN, pLUT); + count += 4; + pps += 2; + } +#endif + #ifdef __arm__ #ifdef HAVE_NEON - if(__builtin_ctzl(N) & 1){ - ADDI(&fp, 2, 7, 0); - ADDI(&fp, 7, 9, 0); - ADDI(&fp, 9, 2, 0); - - ADDI(&fp, 2, 8, 0); - ADDI(&fp, 8, 10, 0); - ADDI(&fp, 10, 2, 0); - - if(p->i1) { - MOVI(&fp, 11, p->i1); - memcpy(fp, neon_oo, neon_eo - neon_oo); - if(sign < 0) { - fp[12] ^= 0x00200000; fp[13] ^= 0x00200000; fp[14] ^= 0x00200000; fp[15] ^= 0x00200000; - fp[27] ^= 0x00200000; fp[29] ^= 0x00200000; fp[30] ^= 0x00200000; fp[31] ^= 0x00200000; - fp[46] ^= 0x00200000; fp[47] ^= 0x00200000; fp[48] ^= 0x00200000; fp[57] ^= 0x00200000; - } - fp += (neon_eo - neon_oo) / 4; - } - - *fp = LDRI(11, 1, ((uint32_t)&p->oe_ws) - ((uint32_t)p)); fp++; - - memcpy(fp, neon_oe, neon_end - neon_oe); - if(sign < 0) { - fp[19] ^= 
0x00200000; fp[20] ^= 0x00200000; fp[22] ^= 0x00200000; fp[23] ^= 0x00200000; - fp[37] ^= 0x00200000; fp[38] ^= 0x00200000; fp[40] ^= 0x00200000; fp[41] ^= 0x00200000; - fp[64] ^= 0x00200000; fp[65] ^= 0x00200000; fp[66] ^= 0x00200000; fp[67] ^= 0x00200000; - } - fp += (neon_end - neon_oe) / 4; - - }else{ - - *fp = LDRI(11, 1, ((uint32_t)&p->eo_ws) - ((uint32_t)p)); fp++; - - memcpy(fp, neon_eo, neon_oe - neon_eo); - if(sign < 0) { - fp[10] ^= 0x00200000; fp[11] ^= 0x00200000; fp[13] ^= 0x00200000; fp[14] ^= 0x00200000; - fp[31] ^= 0x00200000; fp[33] ^= 0x00200000; fp[34] ^= 0x00200000; fp[35] ^= 0x00200000; - fp[59] ^= 0x00200000; fp[60] ^= 0x00200000; fp[61] ^= 0x00200000; fp[62] ^= 0x00200000; - } - fp += (neon_oe - neon_eo) / 4; - - ADDI(&fp, 2, 7, 0); - ADDI(&fp, 7, 9, 0); - ADDI(&fp, 9, 2, 0); - - ADDI(&fp, 2, 8, 0); - ADDI(&fp, 8, 10, 0); - ADDI(&fp, 10, 2, 0); - - if(p->i1) { - MOVI(&fp, 11, p->i1); - memcpy(fp, neon_oo, neon_eo - neon_oo); - if(sign < 0) { - fp[12] ^= 0x00200000; fp[13] ^= 0x00200000; fp[14] ^= 0x00200000; fp[15] ^= 0x00200000; - fp[27] ^= 0x00200000; fp[29] ^= 0x00200000; fp[30] ^= 0x00200000; fp[31] ^= 0x00200000; - fp[46] ^= 0x00200000; fp[47] ^= 0x00200000; fp[48] ^= 0x00200000; fp[57] ^= 0x00200000; - } - fp += (neon_eo - neon_oo) / 4; - } - - } - - - if(p->i1) { - ADDI(&fp, 2, 3, 0); - ADDI(&fp, 3, 7, 0); - ADDI(&fp, 7, 2, 0); - - ADDI(&fp, 2, 4, 0); - ADDI(&fp, 4, 8, 0); - ADDI(&fp, 8, 2, 0); - - ADDI(&fp, 2, 5, 0); - ADDI(&fp, 5, 9, 0); - ADDI(&fp, 9, 2, 0); - - ADDI(&fp, 2, 6, 0); - ADDI(&fp, 6, 10, 0); - ADDI(&fp, 10, 2, 0); - - ADDI(&fp, 2, 9, 0); - ADDI(&fp, 9, 10, 0); - ADDI(&fp, 10, 2, 0); - - *fp = LDRI(2, 1, ((uint32_t)&p->ee_ws) - ((uint32_t)p)); fp++; - MOVI(&fp, 11, p->i1); - memcpy(fp, neon_ee, neon_oo - neon_ee); - if(sign < 0) { - fp[33] ^= 0x00200000; fp[37] ^= 0x00200000; fp[38] ^= 0x00200000; fp[39] ^= 0x00200000; - fp[40] ^= 0x00200000; fp[41] ^= 0x00200000; fp[44] ^= 0x00200000; fp[45] ^= 0x00200000; - fp[46] ^= 
0x00200000; fp[47] ^= 0x00200000; fp[48] ^= 0x00200000; fp[57] ^= 0x00200000; - } - fp += (neon_oo - neon_ee) / 4; - - } + if(__builtin_ctzl(N) & 1) { + ADDI(&fp, 2, 7, 0); + ADDI(&fp, 7, 9, 0); + ADDI(&fp, 9, 2, 0); + + ADDI(&fp, 2, 8, 0); + ADDI(&fp, 8, 10, 0); + ADDI(&fp, 10, 2, 0); + + if(p->i1) { + MOVI(&fp, 11, p->i1); + memcpy(fp, neon_oo, neon_eo - neon_oo); + if(sign < 0) { + fp[12] ^= 0x00200000; + fp[13] ^= 0x00200000; + fp[14] ^= 0x00200000; + fp[15] ^= 0x00200000; + fp[27] ^= 0x00200000; + fp[29] ^= 0x00200000; + fp[30] ^= 0x00200000; + fp[31] ^= 0x00200000; + fp[46] ^= 0x00200000; + fp[47] ^= 0x00200000; + fp[48] ^= 0x00200000; + fp[57] ^= 0x00200000; + } + fp += (neon_eo - neon_oo) / 4; + } + + *fp = LDRI(11, 1, ((uint32_t)&p->oe_ws) - ((uint32_t)p)); + fp++; + + memcpy(fp, neon_oe, neon_end - neon_oe); + if(sign < 0) { + fp[19] ^= 0x00200000; + fp[20] ^= 0x00200000; + fp[22] ^= 0x00200000; + fp[23] ^= 0x00200000; + fp[37] ^= 0x00200000; + fp[38] ^= 0x00200000; + fp[40] ^= 0x00200000; + fp[41] ^= 0x00200000; + fp[64] ^= 0x00200000; + fp[65] ^= 0x00200000; + fp[66] ^= 0x00200000; + fp[67] ^= 0x00200000; + } + fp += (neon_end - neon_oe) / 4; + + } else { + + *fp = LDRI(11, 1, ((uint32_t)&p->eo_ws) - ((uint32_t)p)); + fp++; + + memcpy(fp, neon_eo, neon_oe - neon_eo); + if(sign < 0) { + fp[10] ^= 0x00200000; + fp[11] ^= 0x00200000; + fp[13] ^= 0x00200000; + fp[14] ^= 0x00200000; + fp[31] ^= 0x00200000; + fp[33] ^= 0x00200000; + fp[34] ^= 0x00200000; + fp[35] ^= 0x00200000; + fp[59] ^= 0x00200000; + fp[60] ^= 0x00200000; + fp[61] ^= 0x00200000; + fp[62] ^= 0x00200000; + } + fp += (neon_oe - neon_eo) / 4; + + ADDI(&fp, 2, 7, 0); + ADDI(&fp, 7, 9, 0); + ADDI(&fp, 9, 2, 0); + + ADDI(&fp, 2, 8, 0); + ADDI(&fp, 8, 10, 0); + ADDI(&fp, 10, 2, 0); + + if(p->i1) { + MOVI(&fp, 11, p->i1); + memcpy(fp, neon_oo, neon_eo - neon_oo); + if(sign < 0) { + fp[12] ^= 0x00200000; + fp[13] ^= 0x00200000; + fp[14] ^= 0x00200000; + fp[15] ^= 0x00200000; + fp[27] ^= 0x00200000; 
+ fp[29] ^= 0x00200000; + fp[30] ^= 0x00200000; + fp[31] ^= 0x00200000; + fp[46] ^= 0x00200000; + fp[47] ^= 0x00200000; + fp[48] ^= 0x00200000; + fp[57] ^= 0x00200000; + } + fp += (neon_eo - neon_oo) / 4; + } + + } + + + if(p->i1) { + ADDI(&fp, 2, 3, 0); + ADDI(&fp, 3, 7, 0); + ADDI(&fp, 7, 2, 0); + + ADDI(&fp, 2, 4, 0); + ADDI(&fp, 4, 8, 0); + ADDI(&fp, 8, 2, 0); + + ADDI(&fp, 2, 5, 0); + ADDI(&fp, 5, 9, 0); + ADDI(&fp, 9, 2, 0); + + ADDI(&fp, 2, 6, 0); + ADDI(&fp, 6, 10, 0); + ADDI(&fp, 10, 2, 0); + + ADDI(&fp, 2, 9, 0); + ADDI(&fp, 9, 10, 0); + ADDI(&fp, 10, 2, 0); + + *fp = LDRI(2, 1, ((uint32_t)&p->ee_ws) - ((uint32_t)p)); + fp++; + MOVI(&fp, 11, p->i1); + memcpy(fp, neon_ee, neon_oo - neon_ee); + if(sign < 0) { + fp[33] ^= 0x00200000; + fp[37] ^= 0x00200000; + fp[38] ^= 0x00200000; + fp[39] ^= 0x00200000; + fp[40] ^= 0x00200000; + fp[41] ^= 0x00200000; + fp[44] ^= 0x00200000; + fp[45] ^= 0x00200000; + fp[46] ^= 0x00200000; + fp[47] ^= 0x00200000; + fp[48] ^= 0x00200000; + fp[57] ^= 0x00200000; + } + fp += (neon_oo - neon_ee) / 4; + + } #else - ADDI(&fp, 2, 7, 0); - ADDI(&fp, 7, 9, 0); - ADDI(&fp, 9, 2, 0); - - ADDI(&fp, 2, 8, 0); - ADDI(&fp, 8, 10, 0); - ADDI(&fp, 10, 2, 0); - - MOVI(&fp, 11, (p->i1>0) ? p->i1 : 1); - memcpy(fp, vfp_o, vfp_x4 - vfp_o); - if(sign > 0) { - fp[22] ^= 0x00000040; fp[24] ^= 0x00000040; fp[25] ^= 0x00000040; fp[26] ^= 0x00000040; - fp[62] ^= 0x00000040; fp[64] ^= 0x00000040; fp[65] ^= 0x00000040; fp[66] ^= 0x00000040; - } - fp += (vfp_x4 - vfp_o) / 4; - - ADDI(&fp, 2, 3, 0); - ADDI(&fp, 3, 7, 0); - ADDI(&fp, 7, 2, 0); - - ADDI(&fp, 2, 4, 0); - ADDI(&fp, 4, 8, 0); - ADDI(&fp, 8, 2, 0); - - ADDI(&fp, 2, 5, 0); - ADDI(&fp, 5, 9, 0); - ADDI(&fp, 9, 2, 0); - - ADDI(&fp, 2, 6, 0); - ADDI(&fp, 6, 10, 0); - ADDI(&fp, 10, 2, 0); - - ADDI(&fp, 2, 9, 0); - ADDI(&fp, 9, 10, 0); - ADDI(&fp, 10, 2, 0); - - *fp = LDRI(2, 1, ((uint32_t)&p->ee_ws) - ((uint32_t)p)); fp++; - MOVI(&fp, 11, (p->i2>0) ? 
p->i2 : 1); - memcpy(fp, vfp_e, vfp_o - vfp_e); - if(sign > 0) { - fp[64] ^= 0x00000040; fp[65] ^= 0x00000040; fp[68] ^= 0x00000040; fp[75] ^= 0x00000040; - fp[76] ^= 0x00000040; fp[79] ^= 0x00000040; fp[80] ^= 0x00000040; fp[83] ^= 0x00000040; - fp[84] ^= 0x00000040; fp[87] ^= 0x00000040; fp[91] ^= 0x00000040; fp[93] ^= 0x00000040; - } - fp += (vfp_o - vfp_e) / 4; + ADDI(&fp, 2, 7, 0); + ADDI(&fp, 7, 9, 0); + ADDI(&fp, 9, 2, 0); + + ADDI(&fp, 2, 8, 0); + ADDI(&fp, 8, 10, 0); + ADDI(&fp, 10, 2, 0); + + MOVI(&fp, 11, (p->i1>0) ? p->i1 : 1); + memcpy(fp, vfp_o, vfp_x4 - vfp_o); + if(sign > 0) { + fp[22] ^= 0x00000040; + fp[24] ^= 0x00000040; + fp[25] ^= 0x00000040; + fp[26] ^= 0x00000040; + fp[62] ^= 0x00000040; + fp[64] ^= 0x00000040; + fp[65] ^= 0x00000040; + fp[66] ^= 0x00000040; + } + fp += (vfp_x4 - vfp_o) / 4; + + ADDI(&fp, 2, 3, 0); + ADDI(&fp, 3, 7, 0); + ADDI(&fp, 7, 2, 0); + + ADDI(&fp, 2, 4, 0); + ADDI(&fp, 4, 8, 0); + ADDI(&fp, 8, 2, 0); + + ADDI(&fp, 2, 5, 0); + ADDI(&fp, 5, 9, 0); + ADDI(&fp, 9, 2, 0); + + ADDI(&fp, 2, 6, 0); + ADDI(&fp, 6, 10, 0); + ADDI(&fp, 10, 2, 0); + + ADDI(&fp, 2, 9, 0); + ADDI(&fp, 9, 10, 0); + ADDI(&fp, 10, 2, 0); + + *fp = LDRI(2, 1, ((uint32_t)&p->ee_ws) - ((uint32_t)p)); + fp++; + MOVI(&fp, 11, (p->i2>0) ? 
p->i2 : 1); + memcpy(fp, vfp_e, vfp_o - vfp_e); + if(sign > 0) { + fp[64] ^= 0x00000040; + fp[65] ^= 0x00000040; + fp[68] ^= 0x00000040; + fp[75] ^= 0x00000040; + fp[76] ^= 0x00000040; + fp[79] ^= 0x00000040; + fp[80] ^= 0x00000040; + fp[83] ^= 0x00000040; + fp[84] ^= 0x00000040; + fp[87] ^= 0x00000040; + fp[91] ^= 0x00000040; + fp[93] ^= 0x00000040; + } + fp += (vfp_o - vfp_e) / 4; #endif - *fp = LDRI(2, 1, ((uint32_t)&p->ws) - ((uint32_t)p)); fp++; // load offsets into r12 - //ADDI(&fp, 2, 1, 0); - MOVI(&fp, 1, 0); - - // args: r0 - out - // r1 - N - // r2 - ws -// ADDI(&fp, 3, 1, 0); // put N into r3 for counter - - int32_t pAddr = 0; - int32_t pN = 0; - int32_t pLUT = 0; - count = 2; - while(pps[0]) { - -// fprintf(stderr, "size %zu at %zu - diff %zu\n", pps[0], pps[1]*4, (pps[1]*4) - pAddr); - if(!pN) { - MOVI(&fp, 1, pps[0]); - }else{ - if((pps[1]*4)-pAddr) ADDI(&fp, 0, 0, (pps[1] * 4)- pAddr); - if(pps[0] - pN) ADDI(&fp, 1, 1, pps[0] - pN); - } - - if(p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8 - pLUT) - ADDI(&fp, 2, 2, p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8 - pLUT); - - - if(pps[0] == 2*leafN) { - *fp = BL(fp+2, x_4_addr); fp++; - }else if(!pps[2]){ - //uint32_t *x_8_t_addr = fp; + *fp = LDRI(2, 1, ((uint32_t)&p->ws) - ((uint32_t)p)); + fp++; // load offsets into r12 + //ADDI(&fp, 2, 1, 0); + MOVI(&fp, 1, 0); + + // args: r0 - out + // r1 - N + // r2 - ws + // ADDI(&fp, 3, 1, 0); // put N into r3 for counter + + int32_t pAddr = 0; + int32_t pN = 0; + int32_t pLUT = 0; + count = 2; + while(pps[0]) { + + // fprintf(stderr, "size %zu at %zu - diff %zu\n", pps[0], pps[1]*4, (pps[1]*4) - pAddr); + if(!pN) { + MOVI(&fp, 1, pps[0]); + } else { + if((pps[1]*4)-pAddr) ADDI(&fp, 0, 0, (pps[1] * 4)- pAddr); + if(pps[0] - pN) ADDI(&fp, 1, 1, pps[0] - pN); + } + + if(p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8 - pLUT) + ADDI(&fp, 2, 2, p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8 - pLUT); + + + if(pps[0] == 2*leafN) { + *fp = BL(fp+2, x_4_addr); + fp++; + } else 
if(!pps[2]) { + //uint32_t *x_8_t_addr = fp; #ifdef HAVE_NEON - memcpy(fp, neon_x8_t, neon_ee - neon_x8_t); - if(sign < 0) { - fp[31] ^= 0x00200000; fp[32] ^= 0x00200000; fp[33] ^= 0x00200000; fp[34] ^= 0x00200000; - fp[65] ^= 0x00200000; fp[66] ^= 0x00200000; fp[70] ^= 0x00200000; fp[74] ^= 0x00200000; - fp[97] ^= 0x00200000; fp[98] ^= 0x00200000; fp[102] ^= 0x00200000; fp[104] ^= 0x00200000; - } - fp += (neon_ee - neon_x8_t) / 4; - //*fp++ = BL(fp+2, x_8_t_addr); + memcpy(fp, neon_x8_t, neon_ee - neon_x8_t); + if(sign < 0) { + fp[31] ^= 0x00200000; + fp[32] ^= 0x00200000; + fp[33] ^= 0x00200000; + fp[34] ^= 0x00200000; + fp[65] ^= 0x00200000; + fp[66] ^= 0x00200000; + fp[70] ^= 0x00200000; + fp[74] ^= 0x00200000; + fp[97] ^= 0x00200000; + fp[98] ^= 0x00200000; + fp[102] ^= 0x00200000; + fp[104] ^= 0x00200000; + } + fp += (neon_ee - neon_x8_t) / 4; + //*fp++ = BL(fp+2, x_8_t_addr); #else - *fp = BL(fp+2, x_8_addr); fp++; + *fp = BL(fp+2, x_8_addr); + fp++; #endif - }else{ - *fp = BL(fp+2, x_8_addr); fp++; - } - - pAddr = pps[1] * 4; - pN = pps[0]; - pLUT = p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8;//LUT_offset(pps[0], leafN); -// fprintf(stderr, "LUT offset for %d is %d\n", pN, pLUT); - count += 4; - pps += 2; - } - - *fp++ = 0xecbd8b10; - *fp++ = POP_LR(); count++; + } else { + *fp = BL(fp+2, x_8_addr); + fp++; + } + + pAddr = pps[1] * 4; + pN = pps[0]; + pLUT = p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8;//LUT_offset(pps[0], leafN); + // fprintf(stderr, "LUT offset for %d is %d\n", pN, pLUT); + count += 4; + pps += 2; + } + + *fp++ = 0xecbd8b10; + *fp++ = POP_LR(); + count++; #else - POP(&fp, R15); - POP(&fp, R14); - POP(&fp, R13); - POP(&fp, R12); - POP(&fp, R11); - POP(&fp, R10); - POP(&fp, RBX); - POP(&fp, RBP); - RET(&fp); - -//uint8_t *pp = func; -//int counter = 0; -//do{ -// printf("%02x ", *pp); -// if(counter++ % 16 == 15) printf("\n"); -//} while(++pp < fp); + /* restore nonvolatile registers */ +#ifdef _M_AMD64 + /* mov rbx, [rsp + 8] */ + *fp++ = 
0x48; + *fp++ = 0x8B; + *fp++ = 0x5C; + *fp++ = 0x24; + *fp++ = 0x08; + + /* mov rsi, [rsp + 16] */ + *fp++ = 0x48; + *fp++ = 0x8B; + *fp++ = 0x74; + *fp++ = 0x24; + *fp++ = 0x10; + + /* mov rdi, [rsp + 24] */ + *fp++ = 0x48; + *fp++ = 0x8B; + *fp++ = 0x7C; + *fp++ = 0x24; + *fp++ = 0x18; +#else + POP(&fp, R15); + POP(&fp, R14); + POP(&fp, R13); + POP(&fp, R12); + POP(&fp, R11); + POP(&fp, R10); + POP(&fp, RBX); + POP(&fp, RBP); +#endif -//printf("\n"); + RET(&fp); + //uint8_t *pp = func; + //int counter = 0; + //do{ + // printf("%02x ", *pp); + // if(counter++ % 16 == 15) printf("\n"); + //} while(++pp < fp); + //printf("\n"); #endif + // *fp++ = B(14); count++; -// *fp++ = B(14); count++; - -//for(int i=0;i<(neon_x8 - neon_x4)/4;i++) -// fprintf(stderr, "%08x\n", x_4_addr[i]); -//fprintf(stderr, "\n"); -//for(int i=0;i<count;i++) - - free(ps); - - if (mprotect(func, p->transform_size, PROT_READ | PROT_EXEC)) { - perror("Couldn't mprotect"); - exit(1); - } -#ifdef __APPLE__ - sys_icache_invalidate(func, p->transform_size); -#elif __ANDROID__ - cacheflush((long)(func), (long)(func) + p->transform_size, 0); -#elif __linux__ -#ifdef __GNUC__ - __clear_cache((long)(func), (long)(func) + p->transform_size); -#endif -#endif + //for(int i=0;i<(neon_x8 - neon_x4)/4;i++) + // fprintf(stderr, "%08x\n", x_4_addr[i]); + //fprintf(stderr, "\n"); + //for(int i=0;i<count;i++) -//fprintf(stderr, "size of transform %zu = %d\n", N, (fp-func)*4); + fprintf(stderr, "size of transform %u = %d\n", N, (fp - x_8_addr) * sizeof(*fp)); - p->transform = (void *) (start); -} -// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3: + free(ps); + + return (transform_func_t) start; +}
\ No newline at end of file diff --git a/src/codegen.h b/src/codegen.h index c07144f..e3c2381 100644 --- a/src/codegen.h +++ b/src/codegen.h @@ -1,10 +1,10 @@ /* - + This file is part of FFTS -- The Fastest Fourier Transform in the South - + Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com> - Copyright (c) 2012, The University of Waikato - + Copyright (c) 2012, The University of Waikato + All rights reserved. Redistribution and use in source and binary forms, with or without @@ -31,20 +31,15 @@ */ -#ifndef __CODEGEN_H__ -#define __CODEGEN_H__ +#ifndef FFTS_CODEGEN_H +#define FFTS_CODEGEN_H -#include <stddef.h> -#include <stdio.h> -#include <stdlib.h> -#include <errno.h> -#include <sys/mman.h> -#include <string.h> -#include <limits.h> /* for PAGESIZE */ +#if defined (_MSC_VER) && (_MSC_VER >= 1020) +#pragma once +#endif #include "ffts.h" -void ffts_generate_func_code(ffts_plan_t *, size_t N, size_t leafN, int sign); +transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leaf_N, int sign); -#endif -// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3: +#endif /* FFTS_CODEGEN_H */ diff --git a/src/codegen_sse.h b/src/codegen_sse.h index 6a38671..a63d21d 100644 --- a/src/codegen_sse.h +++ b/src/codegen_sse.h @@ -1,10 +1,10 @@ /* - + This file is part of FFTS -- The Fastest Fourier Transform in the South - + Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com> - Copyright (c) 2012, The University of Waikato - + Copyright (c) 2012, The University of Waikato + All rights reserved. 
Redistribution and use in source and binary forms, with or without @@ -32,8 +32,8 @@ */ -#ifndef __CODEGEN_SSE_H__ -#define __CODEGEN_SSE_H__ +#ifndef FFTS_CODEGEN_SSE_H +#define FFTS_CODEGEN_SSE_H void neon_x4(float *, size_t, float *); void neon_x8(float *, size_t, float *); @@ -47,7 +47,7 @@ void leaf_end(); void x_init(); void x4(); void x8_soft(); -void x8_hard(); +void x8_soft_end(); void sse_constants(); void sse_constants_inv(); @@ -74,123 +74,157 @@ extern const uint32_t sse_leaf_oe_offsets[8]; #define RSI 6 #define RDI 7 #define RBP 5 -#define R8 8 -#define R9 9 -#define R10 10 -#define R11 11 -#define R12 12 -#define R13 13 -#define R14 14 -#define R15 15 - -void IMM8(uint8_t **p, int32_t imm) { - *(*p)++ = (imm & 0xff); +#define R8 8 +#define R9 9 +#define R10 10 +#define R11 11 +#define R12 12 +#define R13 13 +#define R14 14 +#define R15 15 + +void IMM8(uint8_t **p, int32_t imm) +{ + *(*p)++ = (imm & 0xff); } -void IMM16(uint8_t **p, int32_t imm) { - int i; - for(i=0;i<2;i++) { - *(*p)++ = (imm & (0xff << (i*8))) >> (i*8); - } +void IMM16(uint8_t **p, int32_t imm) +{ + int i; + + for (i = 0; i < 2; i++) { + *(*p)++ = (imm & (0xff << (8 * i))) >> (8 * i); + } } -void IMM32(uint8_t **p, int32_t imm) { - int i; - for(i=0;i<4;i++) { - *(*p)++ = (imm & (0xff << (i*8))) >> (i*8); - } + +void IMM32(uint8_t **p, int32_t imm) +{ + int i; + + for (i = 0; i < 4; i++) { + *(*p)++ = (imm & (0xff << (8 * i))) >> (8 * i); + } } -void IMM32_NI(uint8_t *p, int32_t imm) { - int i; - for(i=0;i<4;i++) { - *(p+i) = (imm & (0xff << (i*8))) >> (i*8); - } + +void IMM32_NI(uint8_t *p, int32_t imm) +{ + int i; + + for (i = 0; i < 4; i++) { + *(p+i) = (imm & (0xff << (8 * i))) >> (8 * i); + } } -int32_t READ_IMM32(uint8_t *p) { - int32_t rval = 0; - int i; - for(i=0;i<4;i++) { - rval |= *(p+i) << (i*8); - } - return rval; +int32_t READ_IMM32(uint8_t *p) +{ + int32_t rval = 0; + int i; + + for (i = 0; i < 4; i++) { + rval |= *(p+i) << (8 * i); + } + + return rval; } -void 
MOVI(uint8_t **p, uint8_t dst, uint32_t imm) { -// if(imm < 65536) *(*p)++ = 0x66; - if(dst >= 8) *(*p)++ = 0x41; - - //if(imm < 65536 && imm >= 256) *(*p)++ = 0x66; - - //if(imm >= 256) - *(*p)++ = 0xb8 | (dst & 0x7); -// else *(*p)++ = 0xb0 | (dst & 0x7); - - // if(imm < 256) IMM8(p, imm); -// else -//if(imm < 65536) IMM16(p, imm); -//else - IMM32(p, imm); - -//if(dst < 8) { -// *(*p)++ = 0xb8 + dst; -//}else{ -// *(*p)++ = 0x49; -// *(*p)++ = 0xc7; -// *(*p)++ = 0xc0 | (dst - 8); -//} -//IMM32(p, imm); +void MOVI(uint8_t **p, uint8_t dst, uint32_t imm) +{ + if (dst >= 8) { + *(*p)++ = 0x41; + } + + *(*p)++ = 0xb8 | (dst & 0x7); + IMM32(p, imm); } -void ADDRMODE(uint8_t **p, uint8_t reg, uint8_t rm, int32_t disp) { - if(disp == 0) { - *(*p)++ = (rm & 7) | ((reg & 7) << 3); - }else if(disp <= 127 || disp >= -128) { - *(*p)++ = 0x40 | (rm & 7) | ((reg & 7) << 3); - IMM8(p, disp); - }else{ - *(*p)++ = 0x80 | (rm & 7) | ((reg & 7) << 3); - IMM32(p, disp); - } +void ADDRMODE(uint8_t **p, uint8_t reg, uint8_t rm, int32_t disp) +{ + if (disp == 0) { + *(*p)++ = (rm & 7) | ((reg & 7) << 3); + } else if (disp <= 127 || disp >= -128) { + *(*p)++ = 0x40 | (rm & 7) | ((reg & 7) << 3); + IMM8(p, disp); + } else { + *(*p)++ = 0x80 | (rm & 7) | ((reg & 7) << 3); + IMM32(p, disp); + } } -void LEA(uint8_t **p, uint8_t dst, uint8_t base, int32_t disp) { +void LEA(uint8_t **p, uint8_t dst, uint8_t base, int32_t disp) +{ + *(*p)++ = 0x48 | ((base & 0x8) >> 3) | ((dst & 0x8) >> 1); + *(*p)++ = 0x8d; + ADDRMODE(p, dst, base, disp); +} - *(*p)++ = 0x48 | ((base & 0x8) >> 3) | ((dst & 0x8) >> 1); - *(*p)++ = 0x8d; - ADDRMODE(p, dst, base, disp); +void RET(uint8_t **p) +{ + *(*p)++ = 0xc3; } -void RET(uint8_t **p) { - *(*p)++ = 0xc3; +void ADDI(uint8_t **p, uint8_t dst, int32_t imm) +{ + if (dst >= 8) { + *(*p)++ = 0x49; + } else { + *(*p)++ = 0x48; + } + + if (imm > 127 || imm <= -128) { + *(*p)++ = 0x81; + } else { + *(*p)++ = 0x83; + } + + *(*p)++ = 0xc0 | (dst & 0x7); + + if (imm > 
127 || imm <= -128) { + IMM32(p, imm); + } else { + IMM8(p, imm); + } } -void ADDI(uint8_t **p, uint8_t dst, int32_t imm) { - - if(dst >= 8) *(*p)++ = 0x49; - else *(*p)++ = 0x48; - - if(imm > 127 || imm <= -128) *(*p)++ = 0x81; - else *(*p)++ = 0x83; - - *(*p)++ = 0xc0 | (dst & 0x7); - - if(imm > 127 || imm <= -128) IMM32(p, imm); - else IMM8(p, imm); +void CALL(uint8_t **p, uint8_t *func) +{ + *(*p)++ = 0xe8; + IMM32(p, func - *p - 4); } -void CALL(uint8_t **p, uint8_t *func) { - *(*p)++ = 0xe8; - IMM32(p, ((void *)func) - (void *)(*p) - 4); +void PUSH(uint8_t **p, uint8_t reg) +{ + if (reg >= 8) { + *(*p)++ = 0x41; + } + + *(*p)++ = 0x50 | (reg & 7); } -void PUSH(uint8_t **p, uint8_t reg) { - if(reg >= 8) *(*p)++ = 0x41; - *(*p)++ = 0x50 | (reg & 7); +void POP(uint8_t **p, uint8_t reg) +{ + if (reg >= 8) { + *(*p)++ = 0x41; + } + + *(*p)++ = 0x58 | (reg & 7); } -void POP(uint8_t **p, uint8_t reg) { - if(reg >= 8) *(*p)++ = 0x41; - *(*p)++ = 0x58 | (reg & 7); + +void SHIFT(uint8_t **p, uint8_t reg, int shift) +{ + if (reg >= 8) { + *(*p)++ = 0x49; + } + + + *(*p)++ = 0xc1; + if (shift > 0) { + *(*p)++ = 0xe0 | (reg & 7); + *(*p)++ = (shift & 0xff); + } else { + *(*p)++ = 0xe8 | (reg & 7); + *(*p)++ = ((-shift) & 0xff); + } } -#endif -// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3: + +#endif /* FFTS_CODEGEN_SSE_H */
\ No newline at end of file @@ -1,418 +1,618 @@ /* - - This file is part of FFTS -- The Fastest Fourier Transform in the South - - Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com> - Copyright (c) 2012, The University of Waikato - - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - * Neither the name of the organization nor the - names of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY - DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +This file is part of FFTS -- The Fastest Fourier Transform in the South + +Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com> +Copyright (c) 2012, The University of Waikato + +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: +* Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +* Neither the name of the organization nor the +names of its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ + #include "ffts.h" #include "macros.h" -//#include "mini_macros.h" #include "patterns.h" #include "ffts_small.h" #ifdef DYNAMIC_DISABLED - #include "ffts_static.h" +#include "ffts_static.h" #else - #include "codegen.h" +#include "codegen.h" #endif -#include <errno.h> - #include <sys/mman.h> - #include <string.h> - #include <limits.h> /* for PAGESIZE */ - #if __APPLE__ - #include <libkern/OSCacheControl.h> +#include <libkern/OSCacheControl.h> +#endif + +#if _WIN32 +#include <windows.h> #else +#include <sys/mman.h> +#endif + +#ifdef __arm__ +static const FFTS_ALIGN(64) float w_data[16] = { + 0.70710678118654757273731092936941f, + 0.70710678118654746171500846685376f, + -0.70710678118654757273731092936941f, + -0.70710678118654746171500846685376f, + 1.0f, + 0.70710678118654757273731092936941f, + -0.0f, + -0.70710678118654746171500846685376f, + 0.70710678118654757273731092936941f, + 0.70710678118654746171500846685376f, + 0.70710678118654757273731092936941f, + 0.70710678118654746171500846685376f, + 1.0f, + 0.70710678118654757273731092936941f, + 0.0f, + 0.70710678118654746171500846685376f +}; #endif -void ffts_execute(ffts_plan_t *p, const void * in, void * out) { +static FFTS_INLINE int ffts_allow_execute(void *start, size_t len) +{ + int result; -//TODO: Define NEEDS_ALIGNED properly instead -#if defined(HAVE_SSE) || defined(HAVE_NEON) - if(((int)in % 16) != 0) { - LOG("ffts_execute: input buffer needs to be aligned to a 128bit boundary\n"); - } +#ifdef _WIN32 + DWORD old_protect; + result = !VirtualProtect(start, len, PAGE_EXECUTE_READ, &old_protect); +#else + result = mprotect(start, len, PROT_READ | PROT_EXEC); +#endif + + return result; +} + +static FFTS_INLINE int ffts_deny_execute(void *start, size_t len) +{ + int result; + +#ifdef _WIN32 + DWORD old_protect; + result = (int) VirtualProtect(start, len, PAGE_READWRITE, &old_protect); +#else + result = mprotect(start, len, PROT_READ | PROT_WRITE); +#endif + + return result; +} + +static FFTS_INLINE int 
ffts_flush_instruction_cache(void *start, size_t length) +{ +#ifdef __APPLE__ + sys_icache_invalidate(start, length); +#elif __ANDROID__ + cacheflush((long) start, (long) start + length, 0); +#elif __linux__ +#if GCC_VERSION_AT_LEAST(4,3) + __builtin___clear_cache(start, (char*) start + length); +#elif __GNUC__ + __clear_cache((long) start, (long) start + length); +#endif +#elif _WIN32 + return !FlushInstructionCache(GetCurrentProcess(), start, length); +#endif + return 0; +} - if(((int)out % 16) != 0) { - LOG("ffts_execute: output buffer needs to be aligned to a 128bit boundary\n"); - } +static FFTS_INLINE void *ffts_vmem_alloc(size_t length) +{ +#if __APPLE__ + return mmap(NULL, length, PROT_READ | PROT_WRITE, MAP_ANON | MAP_SHARED, -1, 0); +#elif _WIN32 + return VirtualAlloc(NULL, length, MEM_COMMIT | MEM_RESERVE, PAGE_READWRITE); +#else +#ifndef MAP_ANONYMOUS +#define MAP_ANONYMOUS 0x20 #endif - p->transform(p, (const float *)in, (float *)out); + return mmap(NULL, length, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_SHARED, -1, 0); +#endif } -void ffts_free(ffts_plan_t *p) { - p->destroy(p); +static FFTS_INLINE void ffts_vmem_free(void *addr, size_t length) +{ +#ifdef _WIN32 + (void) length; + VirtualFree(addr, 0, MEM_RELEASE); +#else + munmap(addr, length); +#endif } -void ffts_free_1d(ffts_plan_t *p) { - - size_t i; +void ffts_execute(ffts_plan_t *p, const void *in, void *out) +{ + /* TODO: Define NEEDS_ALIGNED properly instead */ +#if defined(HAVE_SSE) || defined(HAVE_NEON) + if (((int) in % 16) != 0) { + LOG("ffts_execute: input buffer needs to be aligned to a 128bit boundary\n"); + } + + if (((int) out % 16) != 0) { + LOG("ffts_execute: output buffer needs to be aligned to a 128bit boundary\n"); + } +#endif + + p->transform(p, (const float*) in, (float*) out); +} - if(p->ws) { - FFTS_FREE(p->ws); - } - if(p->is) free(p->is); - if(p->ws_is) free(p->ws_is); - if(p->offsets) free(p->offsets); - //free(p->transforms); - if(p->transforms) free(p->transforms); 
+void ffts_free(ffts_plan_t *p) +{ + if (p) { + p->destroy(p); + } +} +void ffts_free_1d(ffts_plan_t *p) +{ #if !defined(DYNAMIC_DISABLED) - if(p->transform_base) { - if (mprotect(p->transform_base, p->transform_size, PROT_READ | PROT_WRITE)) { - perror("Couldn't mprotect"); - exit(errno); - } - munmap(p->transform_base, p->transform_size); - //free(p->transform_base); - } + if (p->transform_base) { + ffts_deny_execute(p->transform_base, p->transform_size); + ffts_vmem_free(p->transform_base, p->transform_size); + } #endif - free(p); + + if (p->ws_is) { + free(p->ws_is); + } + + if (p->ws) { + FFTS_FREE(p->ws); + } + + if (p->transforms) { + free(p->transforms); + } + + if (p->is) { + free(p->is); + } + + if (p->offsets) { + free(p->offsets); + } + + free(p); } -ffts_plan_t *ffts_init_1d(size_t N, int sign) { - if(N == 0 || (N & (N - 1)) != 0){ - LOG("FFT size must be a power of two\n"); - return NULL; - } +static int ffts_generate_luts(ffts_plan_t *p, size_t N, size_t leaf_N, int sign) +{ + V MULI_SIGN; + int hardcoded; + size_t lut_size; + size_t n_luts; + float *tmp; + cdata_t *w; + size_t i; + int n; + +#ifdef __arm__ + /* #ifdef HAVE_NEON */ + V MULI_SIGN; + + if(sign < 0) { + MULI_SIGN = VLIT4(-0.0f, 0.0f, -0.0f, 0.0f); + } else { + MULI_SIGN = VLIT4(0.0f, -0.0f, 0.0f, -0.0f); + } + + /* #endif */ +#else + if (sign < 0) { + MULI_SIGN = VLIT4(-0.0f, 0.0f, -0.0f, 0.0f); + } else { + MULI_SIGN = VLIT4(0.0f, -0.0f, 0.0f, -0.0f); + } +#endif + + /* LUTS */ + n_luts = ffts_ctzl(N / leaf_N); + if (N < 32) { + n_luts = ffts_ctzl(N / 4); + hardcoded = 1; + } else { + hardcoded = 0; + } + + if (n_luts >= 32) { + n_luts = 0; + } - ffts_plan_t *p = malloc(sizeof(ffts_plan_t)); - size_t leafN = 8; - size_t i; + /* fprintf(stderr, "n_luts = %zu\n", n_luts); */ + n = leaf_N * 2; + if (hardcoded) { + n = 8; + } + + lut_size = 0; + + for (i = 0; i < n_luts; i++) { + if (!i || hardcoded) { +#ifdef __arm__ + if (N <= 32) { + lut_size += n/4 * 2 * sizeof(cdata_t); + } else { + 
lut_size += n/4 * sizeof(cdata_t); + } +#else + lut_size += n/4 * 2 * sizeof(cdata_t); +#endif + n *= 2; + } else { #ifdef __arm__ -//#ifdef HAVE_NEON - V MULI_SIGN; - - if(sign < 0) MULI_SIGN = VLIT4(-0.0f, 0.0f, -0.0f, 0.0f); - else MULI_SIGN = VLIT4(0.0f, -0.0f, 0.0f, -0.0f); -//#endif + lut_size += n/8 * 3 * sizeof(cdata_t); #else - V MULI_SIGN; - - if(sign < 0) MULI_SIGN = VLIT4(-0.0f, 0.0f, -0.0f, 0.0f); - else MULI_SIGN = VLIT4(0.0f, -0.0f, 0.0f, -0.0f); + lut_size += n/8 * 3 * 2 * sizeof(cdata_t); +#endif + } + n *= 2; + } + + /* lut_size *= 16; */ + + /* fprintf(stderr, "lut size = %zu\n", lut_size); */ + + if (n_luts) { + p->ws = FFTS_MALLOC(lut_size, 32); + if (!p->ws) { + goto cleanup; + } + + p->ws_is = malloc(n_luts * sizeof(*p->ws_is)); + if (!p->ws_is) { + goto cleanup; + } + } + + w = p->ws; + + n = leaf_N * 2; + if (hardcoded) { + n = 8; + } + +#ifdef HAVE_NEON + V neg = (sign < 0) ? VLIT4(0.0f, 0.0f, 0.0f, 0.0f) : VLIT4(-0.0f, -0.0f, -0.0f, -0.0f); #endif - p->transform = NULL; - p->transform_base = NULL; - p->transforms = NULL; - p->is = NULL; - p->ws_is = NULL; - p->ws = NULL; - p->offsets = NULL; - p->destroy = ffts_free_1d; - - if(N >= 32) { - ffts_init_offsets(p, N, leafN); + for (i = 0; i < n_luts; i++) { + p->ws_is[i] = w - (cdata_t *)p->ws; + //fprintf(stderr, "LUT[%zu] = %d @ %08x - %zu\n", i, n, w, p->ws_is[i]); + + if(!i || hardcoded) { + cdata_t *w0 = FFTS_MALLOC(n/4 * sizeof(cdata_t), 32); + float *fw0 = (float*) w0; + float *fw = (float *)w; + + size_t j; + for (j = 0; j < n/4; j++) { + w0[j][0] = W_re(n,j); + w0[j][1] = W_im(n,j); + } + #ifdef __arm__ + if (N < 32) { + // w = FFTS_MALLOC(n/4 * 2 * sizeof(cdata_t), 32); + float *fw = (float *)w; + V temp0, temp1, temp2; + for (j=0; j<n/4; j+=2) { + // #ifdef HAVE_NEON + temp0 = VLD(fw0 + j*2); + V re, im; + re = VDUPRE(temp0); + im = VDUPIM(temp0); +#ifdef HAVE_NEON + im = VXOR(im, MULI_SIGN); + //im = IMULI(sign>0, im); +#else + im = MULI(sign>0, im); +#endif + VST(fw + j*4 , re); 
+ VST(fw + j*4+4, im); + // #endif + } + w += n/4 * 2; + } else { + //w = FFTS_MALLOC(n/4 * sizeof(cdata_t), 32); + float *fw = (float *)w; #ifdef HAVE_NEON - ffts_init_is(p, N, leafN, 1); + VS temp0, temp1, temp2; + for (j=0; j<n/4; j+=4) { + temp0 = VLD2(fw0 + j*2); + temp0.val[1] = VXOR(temp0.val[1], neg); + STORESPR(fw + j*2, temp0); + } #else - ffts_init_is(p, N, leafN, 1); + for (j=0; j<n/4; j+=1) { + fw[j*2] = fw0[j*2]; + fw[j*2+1] = (sign < 0) ? fw0[j*2+1] : -fw0[j*2+1]; + } #endif + w += n/4; + } #else - ffts_init_is(p, N, leafN, 1); + //w = FFTS_MALLOC(n/4 * 2 * sizeof(cdata_t), 32); + for (j = 0; j < n/4; j += 2) { + V re, im, temp0; + temp0 = VLD(fw0 + j*2); + re = VDUPRE(temp0); + im = VDUPIM(temp0); + im = VXOR(im, MULI_SIGN); + VST(fw + j*4 + 0, re); + VST(fw + j*4 + 4, im); + } + + w += n/4 * 2; #endif - - p->i0 = N/leafN/3+1; - p->i1 = N/leafN/3; - if((N/leafN) % 3 > 1) p->i1++; - p->i2 = N/leafN/3; - - #ifdef __arm__ - #ifdef HAVE_NEON - p->i0/=2; - p->i1/=2; - #endif - #else - p->i0/=2; - p->i1/=2; - #endif - - }else{ - p->transforms = malloc(2 * sizeof(transform_index_t)); - p->transforms[0] = 0; - p->transforms[1] = 1; - if(N == 2) p->transform = &firstpass_2; - else if(N == 4 && sign == -1) p->transform = &firstpass_4_f; - else if(N == 4 && sign == 1) p->transform = &firstpass_4_b; - else if(N == 8 && sign == -1) p->transform = &firstpass_8_f; - else if(N == 8 && sign == 1) p->transform = &firstpass_8_b; - else if(N == 16 && sign == -1) p->transform = &firstpass_16_f; - else if(N == 16 && sign == 1) p->transform = &firstpass_16_b; - - p->is = NULL; - p->offsets = NULL; - } - - int hardcoded = 0; - - /* LUTS */ - size_t n_luts = __builtin_ctzl(N/leafN); - if(N < 32) { n_luts = __builtin_ctzl(N/4); hardcoded = 1; } - - if(n_luts >= 32) n_luts = 0; - -// fprintf(stderr, "n_luts = %zu\n", n_luts); - - cdata_t *w; - - int n = leafN*2; - if(hardcoded) n = 8; - - size_t lut_size = 0; - - for(i=0;i<n_luts;i++) { - if(!i || hardcoded) { - #ifdef 
__arm__ - if(N <= 32) lut_size += n/4 * 2 * sizeof(cdata_t); - else lut_size += n/4 * sizeof(cdata_t); - #else - lut_size += n/4 * 2 * sizeof(cdata_t); - #endif - n *= 2; - } else { - #ifdef __arm__ - lut_size += n/8 * 3 * sizeof(cdata_t); - #else - lut_size += n/8 * 3 * 2 * sizeof(cdata_t); - #endif - } - n *= 2; - } - -// lut_size *= 16; - - // fprintf(stderr, "lut size = %zu\n", lut_size); - if(n_luts) { - p->ws = FFTS_MALLOC(lut_size,32); - p->ws_is = malloc(n_luts * sizeof(size_t)); - }else{ - p->ws = NULL; - p->ws_is = NULL; - } - w = p->ws; - - n = leafN*2; - if(hardcoded) n = 8; - - #ifdef HAVE_NEON - V neg = (sign < 0) ? VLIT4(0.0f, 0.0f, 0.0f, 0.0f) : VLIT4(-0.0f, -0.0f, -0.0f, -0.0f); - #endif - - for(i=0;i<n_luts;i++) { - p->ws_is[i] = w - (cdata_t *)p->ws; - //fprintf(stderr, "LUT[%zu] = %d @ %08x - %zu\n", i, n, w, p->ws_is[i]); - - if(!i || hardcoded) { - cdata_t *w0 = FFTS_MALLOC(n/4 * sizeof(cdata_t), 32); - - size_t j; - for(j=0;j<n/4;j++) { - w0[j][0] = W_re(n,j); - w0[j][1] = W_im(n,j); - } - - - float *fw0 = (float *)w0; - #ifdef __arm__ - if(N < 32) { - //w = FFTS_MALLOC(n/4 * 2 * sizeof(cdata_t), 32); - float *fw = (float *)w; - V temp0, temp1, temp2; - for(j=0;j<n/4;j+=2) { - // #ifdef HAVE_NEON - temp0 = VLD(fw0 + j*2); - V re, im; - re = VDUPRE(temp0); - im = VDUPIM(temp0); - #ifdef HAVE_NEON - im = VXOR(im, MULI_SIGN); - //im = IMULI(sign>0, im); - #else - im = MULI(sign>0, im); - #endif - VST(fw + j*4 , re); - VST(fw + j*4+4, im); - // #endif - } - w += n/4 * 2; - }else{ - //w = FFTS_MALLOC(n/4 * sizeof(cdata_t), 32); - float *fw = (float *)w; - #ifdef HAVE_NEON - VS temp0, temp1, temp2; - for(j=0;j<n/4;j+=4) { - temp0 = VLD2(fw0 + j*2); - temp0.val[1] = VXOR(temp0.val[1], neg); - STORESPR(fw + j*2, temp0); - } - #else - for(j=0;j<n/4;j+=1) { - fw[j*2] = fw0[j*2]; - fw[j*2+1] = (sign < 0) ? 
fw0[j*2+1] : -fw0[j*2+1]; - } - #endif - w += n/4; - } - #else - //w = FFTS_MALLOC(n/4 * 2 * sizeof(cdata_t), 32); - float *fw = (float *)w; - V temp0, temp1, temp2; - for(j=0;j<n/4;j+=2) { - temp0 = VLD(fw0 + j*2); - V re, im; - re = VDUPRE(temp0); - im = VDUPIM(temp0); - im = VXOR(im, MULI_SIGN); - VST(fw + j*4 , re); - VST(fw + j*4+4, im); - } - w += n/4 * 2; - #endif - - FFTS_FREE(w0); - }else{ - - cdata_t *w0 = FFTS_MALLOC(n/8 * sizeof(cdata_t), 32); - cdata_t *w1 = FFTS_MALLOC(n/8 * sizeof(cdata_t), 32); - cdata_t *w2 = FFTS_MALLOC(n/8 * sizeof(cdata_t), 32); - - size_t j; - for(j=0;j<n/8;j++) { - w0[j][0] = W_re(n,j*2); - w0[j][1] = W_im(n,j*2); - w1[j][0] = W_re(n,j); - w1[j][1] = W_im(n,j); - w2[j][0] = W_re(n,j + (n/8)); - w2[j][1] = W_im(n,j + (n/8)); - - } - - float *fw0 = (float *)w0; - float *fw1 = (float *)w1; - float *fw2 = (float *)w2; - #ifdef __arm__ - //w = FFTS_MALLOC(n/8 * 3 * sizeof(cdata_t), 32); - float *fw = (float *)w; - #ifdef HAVE_NEON - VS temp0, temp1, temp2; - for(j=0;j<n/8;j+=4) { - temp0 = VLD2(fw0 + j*2); - temp0.val[1] = VXOR(temp0.val[1], neg); - STORESPR(fw + j*2*3, temp0); - temp1 = VLD2(fw1 + j*2); - temp1.val[1] = VXOR(temp1.val[1], neg); - STORESPR(fw + j*2*3 + 8, temp1); - temp2 = VLD2(fw2 + j*2); - temp2.val[1] = VXOR(temp2.val[1], neg); - STORESPR(fw + j*2*3 + 16, temp2); - } - #else - for(j=0;j<n/8;j+=1) { - fw[j*6] = fw0[j*2]; - fw[j*6+1] = (sign < 0) ? fw0[j*2+1] : -fw0[j*2+1]; - fw[j*6+2] = fw1[j*2+0]; - fw[j*6+3] = (sign < 0) ? fw1[j*2+1] : -fw1[j*2+1]; - fw[j*6+4] = fw2[j*2+0]; - fw[j*6+5] = (sign < 0) ? 
fw2[j*2+1] : -fw2[j*2+1]; - } - #endif - w += n/8 * 3; - #else - //w = FFTS_MALLOC(n/8 * 3 * 2 * sizeof(cdata_t), 32); - float *fw = (float *)w; - V temp0, temp1, temp2, re, im; - for(j=0;j<n/8;j+=2) { - temp0 = VLD(fw0 + j*2); - re = VDUPRE(temp0); - im = VDUPIM(temp0); - im = VXOR(im, MULI_SIGN); - VST(fw + j*2*6 , re); - VST(fw + j*2*6+4, im); - - temp1 = VLD(fw1 + j*2); - re = VDUPRE(temp1); - im = VDUPIM(temp1); - im = VXOR(im, MULI_SIGN); - VST(fw + j*2*6+8 , re); - VST(fw + j*2*6+12, im); - - temp2 = VLD(fw2 + j*2); - re = VDUPRE(temp2); - im = VDUPIM(temp2); - im = VXOR(im, MULI_SIGN); - VST(fw + j*2*6+16, re); - VST(fw + j*2*6+20, im); - } - w += n/8 * 3 * 2; - #endif - - FFTS_FREE(w0); - FFTS_FREE(w1); - FFTS_FREE(w2); - } - ///p->ws[i] = w; - - n *= 2; - } - - float *tmp = (float *)p->ws; - - if(sign < 0) { - p->oe_ws = (void *)(&w_data[4]); - p->ee_ws = (void *)(w_data); - p->eo_ws = (void *)(&w_data[4]); - }else{ - p->oe_ws = (void *)(w_data + 12); - p->ee_ws = (void *)(w_data + 8); - p->eo_ws = (void *)(w_data + 12); - } - - p->N = N; - p->lastlut = w; - p->n_luts = n_luts; -#ifdef DYNAMIC_DISABLED - if(sign < 0) { - if(N >= 32) p->transform = ffts_static_transform_f; - }else{ - if(N >= 32) p->transform = ffts_static_transform_i; - } + FFTS_FREE(w0); + } else { + cdata_t *w0 = FFTS_MALLOC(n/8 * sizeof(cdata_t), 32); + cdata_t *w1 = FFTS_MALLOC(n/8 * sizeof(cdata_t), 32); + cdata_t *w2 = FFTS_MALLOC(n/8 * sizeof(cdata_t), 32); + + float *fw0 = (float*) w0; + float *fw1 = (float*) w1; + float *fw2 = (float*) w2; + + float *fw = (float *)w; + V temp0, temp1, temp2, re, im; + + size_t j; + for (j = 0; j < n/8; j++) { + w0[j][0] = W_re((float) n, (float) 2*j); + w0[j][1] = W_im((float) n, (float) 2*j); + w1[j][0] = W_re((float) n, (float) j); + w1[j][1] = W_im((float) n, (float) j); + w2[j][0] = W_re((float) n, (float) (j + (n/8))); + w2[j][1] = W_im((float) n, (float) (j + (n/8))); + } + +#ifdef __arm__ + //w = FFTS_MALLOC(n/8 * 3 * sizeof(cdata_t), 32); 
+ float *fw = (float *)w; +#ifdef HAVE_NEON + VS temp0, temp1, temp2; + for (j = 0; j < n/8; j += 4) { + temp0 = VLD2(fw0 + j*2); + temp0.val[1] = VXOR(temp0.val[1], neg); + STORESPR(fw + j*2*3, temp0); + temp1 = VLD2(fw1 + j*2); + temp1.val[1] = VXOR(temp1.val[1], neg); + STORESPR(fw + j*2*3 + 8, temp1); + temp2 = VLD2(fw2 + j*2); + temp2.val[1] = VXOR(temp2.val[1], neg); + STORESPR(fw + j*2*3 + 16, temp2); + } #else - if(N>=32) ffts_generate_func_code(p, N, leafN, sign); + for (j = 0; j < n/8; j += 1) { + fw[j*6] = fw0[j*2]; + fw[j*6+1] = (sign < 0) ? fw0[j*2+1] : -fw0[j*2+1]; + fw[j*6+2] = fw1[j*2+0]; + fw[j*6+3] = (sign < 0) ? fw1[j*2+1] : -fw1[j*2+1]; + fw[j*6+4] = fw2[j*2+0]; + fw[j*6+5] = (sign < 0) ? fw2[j*2+1] : -fw2[j*2+1]; + } #endif + w += n/8 * 3; +#else + //w = FFTS_MALLOC(n/8 * 3 * 2 * sizeof(cdata_t), 32); + for (j = 0; j < n/8; j += 2) { + temp0 = VLD(fw0 + j*2); + re = VDUPRE(temp0); + im = VDUPIM(temp0); + im = VXOR(im, MULI_SIGN); + VST(fw + j*2*6 , re); + VST(fw + j*2*6+4, im); + + temp1 = VLD(fw1 + j*2); + re = VDUPRE(temp1); + im = VDUPIM(temp1); + im = VXOR(im, MULI_SIGN); + VST(fw + j*2*6+8 , re); + VST(fw + j*2*6+12, im); + + temp2 = VLD(fw2 + j*2); + re = VDUPRE(temp2); + im = VDUPIM(temp2); + im = VXOR(im, MULI_SIGN); + VST(fw + j*2*6+16, re); + VST(fw + j*2*6+20, im); + } + + w += n/8 * 3 * 2; +#endif + + FFTS_FREE(w0); + FFTS_FREE(w1); + FFTS_FREE(w2); + } + ///p->ws[i] = w; + + n *= 2; + } + + tmp = (float *)p->ws; + +#ifdef __arm__ + if (sign < 0) { + p->oe_ws = (void*)(&w_data[4]); + p->ee_ws = (void*)(w_data); + p->eo_ws = (void*)(&w_data[4]); + } else { + p->oe_ws = (void*)(w_data + 12); + p->ee_ws = (void*)(w_data + 8); + p->eo_ws = (void*)(w_data + 12); + } +#endif + + p->lastlut = w; + p->n_luts = n_luts; + return 0; - return p; +cleanup: + return -1; } -// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3: +ffts_plan_t *ffts_init_1d(size_t N, int sign) +{ + const size_t leaf_N = 8; + ffts_plan_t *p; + + if (N < 2 || (N & 
(N - 1)) != 0) { + LOG("FFT size must be a power of two\n"); + return NULL; + } + + p = calloc(1, sizeof(*p)); + if (!p) { + return NULL; + } + + p->destroy = ffts_free_1d; + p->N = N; + + if (ffts_generate_luts(p, N, leaf_N, sign)) { + goto cleanup; + } + + if (N >= 32) { + p->offsets = ffts_init_offsets(N, leaf_N); + if (!p->offsets) { + goto cleanup; + } + + p->is = ffts_init_is(N, leaf_N, 1); + if (!p->is) { + goto cleanup; + } + + p->i0 = N/leaf_N/3 + 1; + p->i1 = p->i2 = N/leaf_N/3; + if ((N/leaf_N) % 3 > 1) { + p->i1++; + } + +#ifdef __arm__ +#ifdef HAVE_NEON + p->i0 /= 2; + p->i1 /= 2; +#endif +#else + p->i0 /= 2; + p->i1 /= 2; +#endif + +#ifdef DYNAMIC_DISABLED + if (sign < 0) { + p->transform = ffts_static_transform_f; + } else { + p->transform = ffts_static_transform_i; + } +#else + /* determinate transform size */ +#ifdef __arm__ + if (N < 8192) { + p->transform_size = 8192; + } else { + p->transform_size = N; + } +#else + if (N < 2048) { + p->transform_size = 16384; + } else { + p->transform_size = 16384 + 2*N/8 * ffts_ctzl(N); + } +#endif + + /* allocate code/function buffer */ + p->transform_base = ffts_vmem_alloc(p->transform_size); + if (!p->transform_base) { + goto cleanup; + } + + /* generate code */ + p->transform = ffts_generate_func_code(p, N, leaf_N, sign); + if (!p->transform) { + goto cleanup; + } + + /* enable execution with read access for the block */ + if (ffts_allow_execute(p->transform_base, p->transform_size)) { + goto cleanup; + } + + /* flush from the instruction cache */ + if (ffts_flush_instruction_cache(p->transform_base, p->transform_size)) { + goto cleanup; + } +#endif + } else { + p->transforms = malloc(2 * sizeof(*p->transforms)); + if (!p->transforms) { + goto cleanup; + } + + p->transforms[0] = 0; + p->transforms[1] = 1; + + switch (N) { + case 2: + p->transform = &ffts_firstpass_2; + break; + case 4: + if (sign == -1) { + p->transform = &ffts_firstpass_4_f; + } else if (sign == 1) { + p->transform = &ffts_firstpass_4_b; + 
} + break; + case 8: + if (sign == -1) { + p->transform = &ffts_firstpass_8_f; + } else if (sign == 1) { + p->transform = &ffts_firstpass_8_b; + } + break; + case 16: + default: + if (sign == -1) { + p->transform = &ffts_firstpass_16_f; + } else { + p->transform = &ffts_firstpass_16_b; + } + break; + } + } + + return p; + +cleanup: + ffts_free_1d(p); + return NULL; +}
\ No newline at end of file @@ -30,21 +30,21 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ + #ifndef __CP_SSE_H__ #define __CP_SSE_H__ -#include "config.h" +//#include "config.h" +//#include "codegen.h" +#include "ffts_attributes.h" +#include "types.h" -#include <stdio.h> -#include <stdlib.h> +#include <malloc.h> #include <math.h> #include <stddef.h> #include <stdint.h> -//#include <stdalign.h> - -//#include "codegen.h" -#include "types.h" - +#include <stdlib.h> +#include <stdio.h> #ifdef __ANDROID__ #include <android/log.h> @@ -53,29 +53,14 @@ #define LOG(s) fprintf(stderr, s) #endif -#define PI 3.1415926535897932384626433832795028841971693993751058209 - -static const __attribute__ ((aligned(64))) float w_data[16] = { - 0.70710678118654757273731092936941, 0.70710678118654746171500846685376, - -0.70710678118654757273731092936941, -0.70710678118654746171500846685376, - 1.0f, 0.70710678118654757273731092936941f, - -0.0f, -0.70710678118654746171500846685376, - 0.70710678118654757273731092936941, 0.70710678118654746171500846685376, - 0.70710678118654757273731092936941, 0.70710678118654746171500846685376, - 1.0f, 0.70710678118654757273731092936941f, - 0.0f, 0.70710678118654746171500846685376 -}; - -__INLINE float W_re(float N, float k) { return cos(-2.0f * PI * k / N); } -__INLINE float W_im(float N, float k) { return sin(-2.0f * PI * k / N); } - -typedef size_t transform_index_t; - -//typedef void (*transform_func_t)(float *data, size_t N, float *LUT); -typedef void (*transform_func_t)(float *data, size_t N, float *LUT); +#ifndef M_PI +#define M_PI 3.1415926535897932384626433832795028841971693993751058209 +#endif typedef struct _ffts_plan_t ffts_plan_t; +typedef void (*transform_func_t)(ffts_plan_t *p, const void *in, void *out); + /** * Contains all the Information need to perform FFT * @@ -86,101 +71,153 @@ typedef struct _ffts_plan_t ffts_plan_t; */ struct _ffts_plan_t { - /** - * - */ - ptrdiff_t *offsets; + /** + * + */ + ptrdiff_t *offsets; 
#ifdef DYNAMIC_DISABLED - /** - * Twiddle factors - */ - void *ws; - /** - * ee - 2 size x size8 - * oo - 2 x size4 in parallel - * oe - - */ - void *oe_ws, *eo_ws, *ee_ws; + /** + * Twiddle factors + */ + void *ws; + + /** + * ee - 2 size x size8 + * oo - 2 x size4 in parallel + * oe - + */ + void *oe_ws, *eo_ws, *ee_ws; #else - void __attribute__((aligned(32))) *ws; - void __attribute__((aligned(32))) *oe_ws, *eo_ws, *ee_ws; + void FFTS_ALIGN(32) *ws; + void FFTS_ALIGN(32) *oe_ws, *eo_ws, *ee_ws; #endif - /** - * Pointer into an array of precomputed indexes for the input data array - */ - ptrdiff_t *is; - - /** - * Twiddle Factor Indexes - */ - size_t *ws_is; - - /** - * Size of the loops for the base cases - */ - size_t i0, i1, n_luts; - - /** - * Size fo the Transform - */ - size_t N; - void *lastlut; - /** - * Used in multidimensional Code ?? - */ - transform_index_t *transforms; - //transform_func_t transform; - - /** - * Pointer to the dynamically generated function - * that will execute the FFT - */ - void (*transform)(ffts_plan_t * , const void * , void * ); - - /** - * Pointer to the base memory address of - * of the transform function - */ - void *transform_base; - - /** - * Size of the memory block contain the - * generated code - */ - size_t transform_size; - - /** - * Points to the cosnant variables used by - * the Assembly Code - */ - void *constants; - - // multi-dimensional stuff: - struct _ffts_plan_t **plans; - int rank; - size_t *Ns, *Ms; - void *buf; - - void *transpose_buf; - - /** - * Pointer to the destroy function - * to clean up the plan after use - * (differs for real and multi dimension transforms - */ - void (*destroy)(ffts_plan_t *); - - /** - * Coefficiants for the real valued transforms - */ - float *A, *B; - - size_t i2; + + /** + * Pointer into an array of precomputed indexes for the input data array + */ + ptrdiff_t *is; + + /** + * Twiddle Factor Indexes + */ + size_t *ws_is; + + /** + * Size of the loops for the base cases + */ 
+ size_t i0, i1, n_luts; + + /** + * Size fo the Transform + */ + size_t N; + void *lastlut; + + /** + * Used in multidimensional Code ?? + */ + size_t *transforms; + + /** + * Pointer to the dynamically generated function + * that will execute the FFT + */ + transform_func_t transform; + + /** + * Pointer to the base memory address of + * of the transform function + */ + void *transform_base; + + /** + * Size of the memory block contain the + * generated code + */ + size_t transform_size; + + /** + * Points to the cosnant variables used by + * the Assembly Code + */ + void *constants; + + // multi-dimensional stuff: + struct _ffts_plan_t **plans; + int rank; + size_t *Ns, *Ms; + void *buf; + + void *transpose_buf; + + /** + * Pointer to the destroy function + * to clean up the plan after use + * (differs for real and multi dimension transforms + */ + void (*destroy)(ffts_plan_t *); + + /** + * Coefficiants for the real valued transforms + */ + float *A, *B; + + size_t i2; }; +static FFTS_INLINE void *ffts_aligned_malloc(size_t size) +{ +#if defined(_MSC_VER) + return _aligned_malloc(size, 32); +#else + return valloc(size); +#endif +} -void ffts_free(ffts_plan_t *); +static FFTS_INLINE void ffts_aligned_free(void *p) +{ +#if defined(_MSC_VER) + _aligned_free(p); +#else + free(p); +#endif +} + +#if GCC_VERSION_AT_LEAST(3,3) +#define ffts_ctzl __builtin_ctzl +#elif defined(_MSC_VER) +#include <intrin.h> +#ifdef _M_AMD64 +#pragma intrinsic(_BitScanForward64) +static __inline unsigned long ffts_ctzl(size_t N) +{ + unsigned long count; + _BitScanForward64((unsigned long*) &count, N); + return count; +} +#else +#pragma intrinsic(_BitScanForward) +static __inline unsigned long ffts_ctzl(size_t N) +{ + unsigned long count; + _BitScanForward((unsigned long*) &count, N); + return count; +} +#endif /* _WIN64 */ +#endif /* _MSC_VER */ + +static FFTS_ALWAYS_INLINE float W_re(float N, float k) +{ + return cos(-2.0 * M_PI * k / N); +} + +static FFTS_ALWAYS_INLINE float W_im(float 
N, float k) +{ + return sin(-2.0 * M_PI * k / N); +} + +void ffts_free(ffts_plan_t *); +void ffts_execute(ffts_plan_t *, const void *, void *); ffts_plan_t *ffts_init_1d(size_t N, int sign); -void ffts_execute(ffts_plan_t *, const void *, void *); + #endif -// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3: diff --git a/src/macros-sse.h b/src/macros-sse.h index d845734..85cd02d 100644 --- a/src/macros-sse.h +++ b/src/macros-sse.h @@ -1,10 +1,10 @@ /* - + This file is part of FFTS -- The Fastest Fourier Transform in the South - + Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com> - Copyright (c) 2012, The University of Waikato - + Copyright (c) 2012, The University of Waikato + All rights reserved. Redistribution and use in source and binary forms, with or without @@ -31,8 +31,8 @@ */ -#ifndef __SSE_FLOAT_H__ -#define __SSE_FLOAT_H__ +#ifndef FFTS_MACROS_SSE_H +#define FFTS_MACROS_SSE_H #include <xmmintrin.h> @@ -63,23 +63,27 @@ typedef __m128 V; #define FFTS_MALLOC(d,a) (_mm_malloc(d,a)) #define FFTS_FREE(d) (_mm_free(d)) -__INLINE V IMULI(int inv, V a) { - if(inv) return VSWAPPAIRS(VXOR(a, VLIT4(0.0f, -0.0f, 0.0f, -0.0f))); - else return VSWAPPAIRS(VXOR(a, VLIT4(-0.0f, 0.0f, -0.0f, 0.0f))); +static FFTS_ALWAYS_INLINE V IMULI(int inv, V a) +{ + if (inv) { + return VSWAPPAIRS(VXOR(a, VLIT4(0.0f, -0.0f, 0.0f, -0.0f))); + } else { + return VSWAPPAIRS(VXOR(a, VLIT4(-0.0f, 0.0f, -0.0f, 0.0f))); + } } - -__INLINE V IMUL(V d, V re, V im) { - re = VMUL(re, d); - im = VMUL(im, VSWAPPAIRS(d)); - return VSUB(re, im); +static FFTS_ALWAYS_INLINE V IMUL(V d, V re, V im) +{ + re = VMUL(re, d); + im = VMUL(im, VSWAPPAIRS(d)); + return VSUB(re, im); } -__INLINE V IMULJ(V d, V re, V im) { - re = VMUL(re, d); - im = VMUL(im, VSWAPPAIRS(d)); - return VADD(re, im); +static FFTS_ALWAYS_INLINE V IMULJ(V d, V re, V im) +{ + re = VMUL(re, d); + im = VMUL(im, VSWAPPAIRS(d)); + return VADD(re, im); } -#endif -// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3: +#endif /* 
FFTS_MACROS_SSE_H */ diff --git a/src/macros.h b/src/macros.h index 08029a3..12c52c6 100644 --- a/src/macros.h +++ b/src/macros.h @@ -1,38 +1,42 @@ /* - - This file is part of FFTS -- The Fastest Fourier Transform in the South - - Copyright (c) 2013, Michael J. Cree <mcree@orcon.net.nz> - Copyright (c) 2012, 2013, Anthony M. Blake <amb@anthonix.com> - - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - * Neither the name of the organization nor the - names of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY - DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +This file is part of FFTS -- The Fastest Fourier Transform in the South + +Copyright (c) 2013, Michael J. Cree <mcree@orcon.net.nz> +Copyright (c) 2012, 2013, Anthony M. 
Blake <amb@anthonix.com> + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: +* Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +* Neither the name of the organization nor the +names of its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#ifndef __MACROS_H__ -#define __MACROS_H__ +#ifndef FFTS_MACROS_H +#define FFTS_MACROS_H + +#if defined (_MSC_VER) && (_MSC_VER >= 1020) +#pragma once +#endif #ifdef HAVE_NEON #include "macros-neon.h" @@ -44,119 +48,138 @@ #include "macros-altivec.h" #endif #endif - #endif - #ifdef HAVE_VFP #include "macros-alpha.h" #endif + #ifdef HAVE_SSE - #include "macros-sse.h" +#include "macros-sse.h" #endif -static inline void TX2(V *a, V *b) +static FFTS_INLINE void TX2(V *a, V *b) { V TX2_t0 = VUNPACKLO(*a, *b); V TX2_t1 = VUNPACKHI(*a, *b); - *a = TX2_t0; *b = TX2_t1; + *a = TX2_t0; + *b = TX2_t1; } -static inline void K_N(int inv, V re, V im, V *r0, V *r1, V *r2, V *r3) +static FFTS_INLINE void K_N(int inv, V re, V im, V *r0, V *r1, V *r2, V *r3) { V uk, uk2, zk_p, zk_n, zk, zk_d; - uk = *r0; uk2 = *r1; + + uk = *r0; + uk2 = *r1; + zk_p = IMUL(*r2, re, im); zk_n = IMULJ(*r3, re, im); - zk = VADD(zk_p, zk_n); + zk = VADD(zk_p, zk_n); zk_d = IMULI(inv, VSUB(zk_p, zk_n)); - + *r2 = VSUB(uk, zk); *r0 = VADD(uk, zk); *r3 = VADD(uk2, zk_d); *r1 = VSUB(uk2, zk_d); } - -static inline void S_4(V r0, V r1, V r2, V r3, - data_t * restrict o0, data_t * restrict o1, - data_t * restrict o2, data_t * restrict o3) -{ - VST(o0, r0); VST(o1, r1); VST(o2, r2); VST(o3, r3); -} - - -static inline void L_2_4(int inv, - const data_t * restrict i0, const data_t * restrict i1, - const data_t * restrict i2, const data_t * restrict i3, - V *r0, V *r1, V *r2, V *r3) +static FFTS_INLINE void L_2_4(int inv, const data_t* FFTS_RESTRICT i0, const data_t* FFTS_RESTRICT i1, + const data_t* FFTS_RESTRICT i2, const data_t* FFTS_RESTRICT i3, + V *r0, V *r1, V *r2, V *r3) { V t0, t1, t2, t3, t4, t5, t6, t7; - t0 = VLD(i0); t1 = VLD(i1); t2 = VLD(i2); t3 = VLD(i3); + t0 = VLD(i0); + t1 = VLD(i1); + t2 = VLD(i2); + t3 = VLD(i3); + t4 = VADD(t0, t1); t5 = VSUB(t0, t1); t6 = VADD(t2, t3); t7 = VSUB(t2, t3); + *r0 = VUNPACKLO(t4, t5); *r1 = VUNPACKLO(t6, t7); + t5 = IMULI(inv, t5); + t0 = VADD(t6, t4); t2 = 
VSUB(t6, t4); t1 = VSUB(t7, t5); t3 = VADD(t7, t5); + *r3 = VUNPACKHI(t0, t1); *r2 = VUNPACKHI(t2, t3); } - -static inline void L_4_4(int inv, - const data_t * restrict i0, const data_t * restrict i1, - const data_t * restrict i2, const data_t * restrict i3, - V *r0, V *r1, V *r2, V *r3) +static FFTS_INLINE void L_4_4(int inv, const data_t* FFTS_RESTRICT i0, const data_t* FFTS_RESTRICT i1, + const data_t* FFTS_RESTRICT i2, const data_t* FFTS_RESTRICT i3, + V *r0, V *r1, V *r2, V *r3) { V t0, t1, t2, t3, t4, t5, t6, t7; - - t0 = VLD(i0); t1 = VLD(i1); t2 = VLD(i2); t3 = VLD(i3); + + t0 = VLD(i0); + t1 = VLD(i1); + t2 = VLD(i2); + t3 = VLD(i3); + t4 = VADD(t0, t1); t5 = VSUB(t0, t1); t6 = VADD(t2, t3); + t7 = IMULI(inv, VSUB(t2, t3)); + t0 = VADD(t4, t6); t2 = VSUB(t4, t6); t1 = VSUB(t5, t7); t3 = VADD(t5, t7); + TX2(&t0, &t1); TX2(&t2, &t3); - *r0 = t0; *r2 = t1; *r1 = t2; *r3 = t3; -} - + *r0 = t0; + *r2 = t1; + *r1 = t2; + *r3 = t3; +} -static inline void L_4_2(int inv, - const data_t * restrict i0, const data_t * restrict i1, - const data_t * restrict i2, const data_t * restrict i3, - V *r0, V *r1, V *r2, V *r3) +static FFTS_INLINE void L_4_2(int inv, const data_t * FFTS_RESTRICT i0, const data_t * FFTS_RESTRICT i1, + const data_t * FFTS_RESTRICT i2, const data_t * FFTS_RESTRICT i3, + V *r0, V *r1, V *r2, V *r3) { V t0, t1, t2, t3, t4, t5, t6, t7; - t0 = VLD(i0); t1 = VLD(i1); t6 = VLD(i2); t7 = VLD(i3); + t0 = VLD(i0); + t1 = VLD(i1); + t6 = VLD(i2); + t7 = VLD(i3); + t2 = VBLEND(t6, t7); t3 = VBLEND(t7, t6); + t4 = VADD(t0, t1); t5 = VSUB(t0, t1); t6 = VADD(t2, t3); t7 = VSUB(t2, t3); + *r2 = VUNPACKHI(t4, t5); - *r3 = VUNPACKHI(t6, t7); + *r3 = VUNPACKHI(t6, t7); + t7 = IMULI(inv, t7); + t0 = VADD(t4, t6); t2 = VSUB(t4, t6); t1 = VSUB(t5, t7); t3 = VADD(t5, t7); + *r0 = VUNPACKLO(t0, t1); *r1 = VUNPACKLO(t2, t3); } -#endif -// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3: + +#define S_4(r0, r1, r2, r3, o0, o1, o2, o3) \ + VST(o0, r0); VST(o1, r1); 
VST(o2, r2); VST(o3, r3); + +#endif /* FFTS_MACROS_H */
\ No newline at end of file @@ -54,6 +54,7 @@ _leaf_ee_init: leaf_ee_init: #endif #lea L_sse_constants(%rip), %r9 + movq (%rdi), %r8 movq 0xe0(%rdi), %r9 xorl %eax, %eax @@ -559,9 +560,9 @@ x8_soft: leaq (%r14,%rcx,4), %r15 X8_soft_loop: movaps (%rsi), %xmm9 - movaps (%r10,%rax,4), %xmm6 + movaps (%r10,%rax,4), %xmm6 movaps %xmm9, %xmm11 - movaps (%r11,%rax,4), %xmm7 + movaps (%r11,%rax,4), %xmm7 movaps 16(%rsi), %xmm8 mulps %xmm6, %xmm11 mulps %xmm7, %xmm9 @@ -647,6 +648,14 @@ X8_soft_loop: ret #ifdef __APPLE__ + .globl _x8_soft_end +_x8_soft_end: +#else + .globl x8_soft_end +x8_soft_end: +#endif + +#ifdef __APPLE__ .globl _x8_hard _x8_hard: #else diff --git a/src/sse_win64.s b/src/sse_win64.s new file mode 100644 index 0000000..6b75391 --- /dev/null +++ b/src/sse_win64.s @@ -0,0 +1,840 @@ +/* + + This file is part of FFTS -- The Fastest Fourier Transform in the South + + Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com> + Copyright (c) 2012, The University of Waikato + + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the organization nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL ANTHONY M. 
BLAKE BE LIABLE FOR ANY + DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + + .code64 + + .globl _neon_x4 + .align 4 +_neon_x4: + + .globl _neon_x8 + .align 4 +_neon_x8: + + .globl _neon_x8_t + .align 4 +_neon_x8_t: + +#ifdef __APPLE__ + .globl _leaf_ee_init +_leaf_ee_init: +#else + .globl leaf_ee_init +leaf_ee_init: +#endif + +# rcx is a pointer to the ffts_plan +# eax is loop counter (init to 0) +# rbx is loop max count +# rdx is 'in' base pointer +# r8 is 'out' base pointer +# rdi is offsets pointer +# rsi is constants pointer +# scratch: rax r10 r11 + + xorl %eax, %eax + movq (%rcx), %rdi + movq 0xe0(%rcx), %rsi + +# _leaf_ee + 8 needs 16 byte alignment +#ifdef __APPLE__ + .globl _leaf_ee +_leaf_ee: +#else + .globl leaf_ee +leaf_ee: +#endif + movaps 32(%rsi), %xmm0 #83.5 + movaps (%rsi), %xmm8 #83.5 +LEAF_EE_1: +LEAF_EE_const_0: + movaps 0xFECA(%rdx,%rax,4), %xmm7 #83.5 +LEAF_EE_const_2: + movaps 0xFECA(%rdx,%rax,4), %xmm12 #83.5 + movaps %xmm7, %xmm6 #83.5 +LEAF_EE_const_3: + movaps 0xFECA(%rdx,%rax,4), %xmm10 #83.5 + movaps %xmm12, %xmm11 #83.5 + subps %xmm10, %xmm12 #83.5 + addps %xmm10, %xmm11 #83.5 + xorps %xmm8, %xmm12 #83.5 +LEAF_EE_const_1: + movaps 0xFECA(%rdx,%rax,4), %xmm9 #83.5 +LEAF_EE_const_4: + movaps 0xFECA(%rdx,%rax,4), %xmm10 #83.5 + addps %xmm9, %xmm6 #83.5 + subps %xmm9, %xmm7 #83.5 +LEAF_EE_const_5: + movaps 0xFECA(%rdx,%rax,4), %xmm13 #83.5 + movaps %xmm10, %xmm9 #83.5 +LEAF_EE_const_6: + movaps 0xFECA(%rdx,%rax,4), %xmm3 #83.5 + movaps %xmm6, %xmm5 #83.5 +LEAF_EE_const_7: + movaps 0xFECA(%rdx,%rax,4), %xmm14 #83.5 + movaps %xmm3, %xmm15 
#83.5 + shufps $177, %xmm12, %xmm12 #83.5 + movaps %xmm7, %xmm4 #83.5 + movslq (%rdi, %rax, 4), %r10 #83.44 + subps %xmm13, %xmm10 #83.5 + subps %xmm14, %xmm3 #83.5 + addps %xmm11, %xmm5 #83.5 + subps %xmm11, %xmm6 #83.5 + subps %xmm12, %xmm4 #83.5 + addps %xmm12, %xmm7 #83.5 + addps %xmm13, %xmm9 #83.5 + addps %xmm14, %xmm15 #83.5 + movaps 16(%rsi), %xmm12 #83.5 + movaps %xmm9, %xmm1 #83.5 + movaps 16(%rsi), %xmm11 #83.5 + movaps %xmm5, %xmm2 #83.5 + mulps %xmm10, %xmm12 #83.5 + subps %xmm15, %xmm9 #83.5 + addps %xmm15, %xmm1 #83.5 + mulps %xmm3, %xmm11 #83.5 + addps %xmm1, %xmm2 #83.5 + subps %xmm1, %xmm5 #83.5 + shufps $177, %xmm10, %xmm10 #83.5 + xorps %xmm8, %xmm9 #83.5 + shufps $177, %xmm3, %xmm3 #83.5 + movaps %xmm6, %xmm1 #83.5 + mulps %xmm0, %xmm10 #83.5 + movaps %xmm4, %xmm13 #83.5 + mulps %xmm0, %xmm3 #83.5 + subps %xmm10, %xmm12 #83.5 + addps %xmm3, %xmm11 #83.5 + movaps %xmm12, %xmm3 #83.5 + movaps %xmm7, %xmm14 #83.5 + shufps $177, %xmm9, %xmm9 #83.5 + subps %xmm11, %xmm12 #83.5 + addps %xmm11, %xmm3 #83.5 + subps %xmm9, %xmm1 #83.5 + addps %xmm9, %xmm6 #83.5 + addps %xmm3, %xmm4 #83.5 + subps %xmm3, %xmm13 #83.5 + xorps %xmm8, %xmm12 #83.5 + movaps %xmm2, %xmm3 #83.5 + shufps $177, %xmm12, %xmm12 #83.5 + movaps %xmm6, %xmm9 #83.5 + movslq 8(%rdi, %rax, 4), %r11 #83.59 + movlhps %xmm4, %xmm3 #83.5 + addq $4, %rax + shufps $238, %xmm4, %xmm2 #83.5 + movaps %xmm1, %xmm4 #83.5 + subps %xmm12, %xmm7 #83.5 + addps %xmm12, %xmm14 #83.5 + movlhps %xmm7, %xmm4 #83.5 + shufps $238, %xmm7, %xmm1 #83.5 + movaps %xmm5, %xmm7 #83.5 + movlhps %xmm13, %xmm7 #83.5 + movlhps %xmm14, %xmm9 #83.5 + shufps $238, %xmm13, %xmm5 #83.5 + shufps $238, %xmm14, %xmm6 #83.5 + movaps %xmm3, (%r8,%r10,4) #83.5 + movaps %xmm4, 16(%r8,%r10,4) #83.5 + movaps %xmm7, 32(%r8,%r10,4) #83.5 + movaps %xmm9, 48(%r8,%r10,4) #83.5 + movaps %xmm2, (%r8,%r11,4) #83.5 + movaps %xmm1, 16(%r8,%r11,4) #83.5 + movaps %xmm5, 32(%r8,%r11,4) #83.5 + movaps %xmm6, 48(%r8,%r11,4) #83.5 + cmpq %rbx, %rax 
+ jne LEAF_EE_1 + +# _leaf_oo + 3 needs to be 16 byte aligned +#ifdef __APPLE__ + .globl _leaf_oo +_leaf_oo: +#else + .globl leaf_oo +leaf_oo: +#endif + movaps (%rsi), %xmm5 #92.7 +LEAF_OO_1: +LEAF_OO_const_0: + movaps 0xFECA(%rdx,%rax,4), %xmm4 #93.5 + movaps %xmm4, %xmm6 #93.5 +LEAF_OO_const_1: + movaps 0xFECA(%rdx,%rax,4), %xmm7 #93.5 +LEAF_OO_const_2: + movaps 0xFECA(%rdx,%rax,4), %xmm10 #93.5 + addps %xmm7, %xmm6 #93.5 + subps %xmm7, %xmm4 #93.5 +LEAF_OO_const_3: + movaps 0xFECA(%rdx,%rax,4), %xmm8 #93.5 + movaps %xmm10, %xmm9 #93.5 +LEAF_OO_const_4: + movaps 0xFECA(%rdx,%rax,4), %xmm1 #93.5 + movaps %xmm6, %xmm3 #93.5 +LEAF_OO_const_5: + movaps 0xFECA(%rdx,%rax,4), %xmm11 #93.5 + movaps %xmm1, %xmm2 #93.5 +LEAF_OO_const_6: + movaps 0xFECA(%rdx,%rax,4), %xmm14 #93.5 + movaps %xmm4, %xmm15 #93.5 +LEAF_OO_const_7: + movaps 0xFECA(%rdx,%rax,4), %xmm12 #93.5 + movaps %xmm14, %xmm13 #93.5 + movslq (%rdi, %rax, 4), %r10 #83.44 + subps %xmm8, %xmm10 #93.5 + addps %xmm8, %xmm9 #93.5 + addps %xmm11, %xmm2 #93.5 + subps %xmm12, %xmm14 #93.5 + subps %xmm11, %xmm1 #93.5 + addps %xmm12, %xmm13 #93.5 + addps %xmm9, %xmm3 #93.5 + subps %xmm9, %xmm6 #93.5 + xorps %xmm5, %xmm10 #93.5 + xorps %xmm5, %xmm14 #93.5 + shufps $177, %xmm10, %xmm10 #93.5 + movaps %xmm2, %xmm9 #93.5 + shufps $177, %xmm14, %xmm14 #93.5 + movaps %xmm6, %xmm7 #93.5 + movslq 8(%rdi, %rax, 4), %r11 #83.59 + addq $4, %rax #92.18 + addps %xmm10, %xmm4 #93.5 + addps %xmm13, %xmm9 #93.5 + subps %xmm13, %xmm2 #93.5 + subps %xmm10, %xmm15 #93.5 + movaps %xmm1, %xmm13 #93.5 + movaps %xmm2, %xmm8 #93.5 + movlhps %xmm4, %xmm7 #93.5 + subps %xmm14, %xmm13 #93.5 + addps %xmm14, %xmm1 #93.5 + shufps $238, %xmm4, %xmm6 #93.5 + movaps %xmm3, %xmm14 #93.5 + movaps %xmm9, %xmm4 #93.5 + movlhps %xmm15, %xmm14 #93.5 + movlhps %xmm13, %xmm4 #93.5 + movlhps %xmm1, %xmm8 #93.5 + shufps $238, %xmm15, %xmm3 #93.5 + shufps $238, %xmm13, %xmm9 #93.5 + shufps $238, %xmm1, %xmm2 #93.5 + movaps %xmm14, (%r8,%r10,4) #93.5 + movaps 
%xmm7, 16(%r8,%r10,4) #93.5
    movaps %xmm4, 32(%r8,%r10,4) #93.5
    movaps %xmm8, 48(%r8,%r10,4) #93.5
    movaps %xmm3, (%r8,%r11,4) #93.5
    movaps %xmm6, 16(%r8,%r11,4) #93.5
    movaps %xmm9, 32(%r8,%r11,4) #93.5
    movaps %xmm2, 48(%r8,%r11,4) #93.5
    cmpq %rbx, %rax
    jne LEAF_OO_1 # Prob 95% #92.14

 # leaf_eo: odd-size leaf kernel. The 0xFECA displacements are placeholders;
 # NOTE(review): presumably patched at runtime by the code generator using the
 # sse_leaf_eo_offsets table below — confirm against codegen_sse.h. Do not
 # change any instruction here: the table encodes exact byte offsets of the
 # disp32 fields within these movaps instructions.
#ifdef __APPLE__
    .globl _leaf_eo
_leaf_eo:
#else
    .globl leaf_eo
leaf_eo:
#endif
LEAF_EO_const_0:
    movaps 0xFECA(%rdx,%rax,4), %xmm9 #88.5
LEAF_EO_const_2:
    movaps 0xFECA(%rdx,%rax,4), %xmm7 #88.5
    movaps %xmm9, %xmm11 #88.5
LEAF_EO_const_3:
    movaps 0xFECA(%rdx,%rax,4), %xmm5 #88.5
    movaps %xmm7, %xmm6 #88.5
LEAF_EO_const_1:
    movaps 0xFECA(%rdx,%rax,4), %xmm4 #88.5
    subps %xmm5, %xmm7 #88.5
    addps %xmm4, %xmm11 #88.5
    subps %xmm4, %xmm9 #88.5
    addps %xmm5, %xmm6 #88.5
    movaps (%rsi), %xmm3 #88.5
    movaps %xmm11, %xmm10 #88.5
    xorps %xmm3, %xmm7 #88.5
    movaps %xmm9, %xmm8 #88.5
    shufps $177, %xmm7, %xmm7 #88.5
    addps %xmm6, %xmm10 #88.5
    subps %xmm6, %xmm11 #88.5
    subps %xmm7, %xmm8 #88.5
    addps %xmm7, %xmm9 #88.5
    movslq 8(%rdi, %rax, 4), %r11 #83.59
    movaps %xmm10, %xmm2 #88.5
    movslq (%rdi, %rax, 4), %r10 #83.44
    movaps %xmm11, %xmm1 #88.5
    shufps $238, %xmm8, %xmm10 #88.5
    shufps $238, %xmm9, %xmm11 #88.5
    movaps %xmm10, (%r8,%r11,4) #88.5
    movaps %xmm11, 16(%r8,%r11,4) #88.5
LEAF_EO_const_4:
    movaps 0xFECA(%rdx,%rax,4), %xmm15 #88.5
LEAF_EO_const_5:
    movaps 0xFECA(%rdx,%rax,4), %xmm12 #88.5
    movaps %xmm15, %xmm14 #88.5
LEAF_EO_const_6:
    movaps 0xFECA(%rdx,%rax,4), %xmm4 #88.5
    addps %xmm12, %xmm14 #88.5
    subps %xmm12, %xmm15 #88.5
LEAF_EO_const_7:
    movaps 0xFECA(%rdx,%rax,4), %xmm13 #88.5
    movaps %xmm4, %xmm5 #88.5
    movaps %xmm14, %xmm7 #88.5
    addps %xmm13, %xmm5 #88.5
    subps %xmm13, %xmm4 #88.5
    movlhps %xmm8, %xmm2 #88.5
    movaps %xmm5, %xmm8 #88.5
    movlhps %xmm15, %xmm7 #88.5
    xorps %xmm3, %xmm15 #88.5
    movaps %xmm5, %xmm6 #88.5
    subps %xmm14, %xmm5 #88.5
    addps %xmm14, %xmm6 #88.5
    movlhps %xmm9, %xmm1 #88.5
    movaps %xmm4, %xmm14 #88.5
    movlhps %xmm4, %xmm8 #88.5
    movaps %xmm1, %xmm12 #88.5
    shufps $177, %xmm15, %xmm15 #88.5
    movaps 0x30(%rsi), %xmm11 #88.5
    addq $4, %rax #90.5
    subps %xmm15, %xmm14 #88.5
    mulps %xmm7, %xmm11 #88.5
    addps %xmm15, %xmm4 #88.5
    movaps 0x30(%rsi), %xmm9 #88.5
    movaps 0x40(%rsi), %xmm15 #88.5
    shufps $177, %xmm7, %xmm7 #88.5
    mulps %xmm8, %xmm9 #88.5
    mulps %xmm15, %xmm7 #88.5
    shufps $177, %xmm8, %xmm8 #88.5
    subps %xmm7, %xmm11 #88.5
    mulps %xmm15, %xmm8 #88.5
    movaps %xmm11, %xmm10 #88.5
    addps %xmm8, %xmm9 #88.5
    shufps $238, %xmm14, %xmm6 #88.5
    subps %xmm9, %xmm11 #88.5
    addps %xmm9, %xmm10 #88.5
    xorps %xmm3, %xmm11 #88.5
    movaps %xmm2, %xmm3 #88.5
    shufps $177, %xmm11, %xmm11 #88.5
    subps %xmm10, %xmm3 #88.5
    addps %xmm10, %xmm2 #88.5
    addps %xmm11, %xmm12 #88.5
    subps %xmm11, %xmm1 #88.5
    shufps $238, %xmm4, %xmm5 #88.5
    movaps %xmm5, 48(%r8,%r11,4) #88.5
    movaps %xmm6, 32(%r8,%r11,4) #88.5
    movaps %xmm2, (%r8,%r10,4) #88.5
    movaps %xmm1, 16(%r8,%r10,4) #88.5
    movaps %xmm3, 32(%r8,%r10,4) #88.5
    movaps %xmm12, 48(%r8,%r10,4) #88.5

 # leaf_oe: companion leaf kernel; same placeholder-patching scheme as leaf_eo
 # (byte offsets published in sse_leaf_oe_offsets below).
#ifdef __APPLE__
    .globl _leaf_oe
_leaf_oe:
#else
    .globl leaf_oe
leaf_oe:
#endif
    movaps (%rsi), %xmm0 #59.5
LEAF_OE_const_2:
    movaps 0xFECA(%rdx,%rax,4), %xmm6 #70.5
LEAF_OE_const_3:
    movaps 0xFECA(%rdx,%rax,4), %xmm8 #70.5
    movaps %xmm6, %xmm10 #70.5
    shufps $228, %xmm8, %xmm10 #70.5
    movaps %xmm10, %xmm9 #70.5
    shufps $228, %xmm6, %xmm8 #70.5
LEAF_OE_const_0:
    movaps 0xFECA(%rdx,%rax,4), %xmm12 #70.5
LEAF_OE_const_1:
    movaps 0xFECA(%rdx,%rax,4), %xmm7 #70.5
    movaps %xmm12, %xmm14 #70.5
    movslq (%rdi, %rax, 4), %r10 #83.44
    addps %xmm8, %xmm9 #70.5
    subps %xmm8, %xmm10 #70.5
    addps %xmm7, %xmm14 #70.5
    subps %xmm7, %xmm12 #70.5
    movaps %xmm9, %xmm4 #70.5
    movaps %xmm14, %xmm13 #70.5
    shufps $238, %xmm10, %xmm4 #70.5
    xorps %xmm0, %xmm10 #70.5
    shufps $177, %xmm10, %xmm10 #70.5
    movaps %xmm12, %xmm11 #70.5
    movaps %xmm14, %xmm5 #70.5
    addps %xmm9, %xmm13 #70.5
    subps %xmm10, %xmm11 #70.5
    subps %xmm9, %xmm14 #70.5
    shufps $238, %xmm12, %xmm5 #70.5
    addps %xmm10, %xmm12 #70.5
    movslq 8(%rdi, %rax, 4), %r11 #83.59
    movlhps %xmm11, %xmm13 #70.5
    movaps %xmm13, (%r8,%r10,4) #70.5
    movaps 0x30(%rsi), %xmm13 #70.5
    movlhps %xmm12, %xmm14 #70.5
    movaps 0x40(%rsi), %xmm12 #70.5
    mulps %xmm5, %xmm13 #70.5
    shufps $177, %xmm5, %xmm5 #70.5
    mulps %xmm12, %xmm5 #70.5
    movaps %xmm14, 16(%r8,%r10,4) #70.5
    subps %xmm5, %xmm13 #70.5
    movaps 0x30(%rsi), %xmm5 #70.5
    mulps %xmm4, %xmm5 #70.5
    shufps $177, %xmm4, %xmm4 #70.5
    mulps %xmm12, %xmm4 #70.5
LEAF_OE_const_4:
    movaps 0xFECA(%rdx,%rax,4), %xmm9 #70.5
    addps %xmm4, %xmm5 #70.5
LEAF_OE_const_6:
    movaps 0xFECA(%rdx,%rax,4), %xmm7 #70.5
    movaps %xmm9, %xmm3 #70.5
LEAF_OE_const_7:
    movaps 0xFECA(%rdx,%rax,4), %xmm2 #70.5
    movaps %xmm7, %xmm6 #70.5
LEAF_OE_const_5:
    movaps 0xFECA(%rdx,%rax,4), %xmm15 #70.5
    movaps %xmm13, %xmm4 #70.5
    subps %xmm2, %xmm7 #70.5
    addps %xmm15, %xmm3 #70.5
    subps %xmm15, %xmm9 #70.5
    addps %xmm2, %xmm6 #70.5
    subps %xmm5, %xmm13 #70.5
    addps %xmm5, %xmm4 #70.5
    xorps %xmm0, %xmm7 #70.5
    addq $4, %rax #72.5
    movaps %xmm3, %xmm2 #70.5
    shufps $177, %xmm7, %xmm7 #70.5
    movaps %xmm9, %xmm8 #70.5
    xorps %xmm0, %xmm13 #70.5
    addps %xmm6, %xmm2 #70.5
    subps %xmm7, %xmm8 #70.5
    subps %xmm6, %xmm3 #70.5
    addps %xmm7, %xmm9 #70.5
    movaps %xmm2, %xmm10 #70.5
    movaps %xmm3, %xmm11 #70.5
    shufps $238, %xmm8, %xmm2 #70.5
    shufps $238, %xmm9, %xmm3 #70.5
    movaps %xmm2, %xmm14 #70.5
    shufps $177, %xmm13, %xmm13 #70.5
    subps %xmm4, %xmm14 #70.5
    addps %xmm4, %xmm2 #70.5
    movaps %xmm3, %xmm4 #70.5
    subps %xmm13, %xmm3 #70.5
    addps %xmm13, %xmm4 #70.5
    movlhps %xmm8, %xmm10 #70.5
    movlhps %xmm9, %xmm11 #70.5
    movaps %xmm10, 32(%r8,%r10,4) #70.5
    movaps %xmm11, 48(%r8,%r10,4) #70.5
    movaps %xmm2, (%r8,%r11,4) #70.5
    movaps %xmm3, 16(%r8,%r11,4) #70.5
    movaps %xmm14, 32(%r8,%r11,4) #70.5
    movaps %xmm4, 48(%r8,%r11,4) #70.5

 # leaf_end: marker label only (no code) — used to measure the extent of the
 # leaf kernels when copying them into generated code.
#ifdef __APPLE__
    .globl _leaf_end
_leaf_end:
#else
    .globl leaf_end
leaf_end:
#endif

 # x_init: loads the sign-mask constant block (%rsi) and the twiddle-table
 # pointer from 0x20(%rcx), then falls through into x4.
#ifdef __APPLE__
    .globl _x_init
_x_init:
#else
    .globl x_init
x_init:
#endif
    movaps (%rsi), %xmm3 #34.3
    movq 0x20(%rcx), %rdi
 # x4: in-place radix-4 butterfly over a 128-byte block at (%r8), using two
 # twiddle pairs at (%rdi)..48(%rdi).
#ifdef __APPLE__
    .globl _x4
_x4:
#else
    .globl x4
x4:
#endif
    movaps 64(%r8), %xmm0 #34.3
    movaps 96(%r8), %xmm1 #34.3
    movaps (%r8), %xmm7 #34.3
    movaps (%rdi), %xmm4 #const
    movaps %xmm7, %xmm9 #34.3
    movaps %xmm4, %xmm6 #34.3
    movaps 16(%rdi), %xmm2 #const
    mulps %xmm0, %xmm6 #34.3
    mulps %xmm1, %xmm4 #34.3
    shufps $177, %xmm0, %xmm0 #34.3
    shufps $177, %xmm1, %xmm1 #34.3
    mulps %xmm2, %xmm0 #34.3
    mulps %xmm1, %xmm2 #34.3
    subps %xmm0, %xmm6 #34.3
    addps %xmm2, %xmm4 #34.3
    movaps %xmm6, %xmm5 #34.3
    subps %xmm4, %xmm6 #34.3
    addps %xmm4, %xmm5 #34.3
    movaps 32(%r8), %xmm8 #34.3
    xorps %xmm3, %xmm6 #34.3
    shufps $177, %xmm6, %xmm6 #34.3
    movaps %xmm8, %xmm10 #34.3
    movaps 112(%r8), %xmm12 #34.3
    subps %xmm5, %xmm9 #34.3
    addps %xmm5, %xmm7 #34.3
    addps %xmm6, %xmm10 #34.3
    subps %xmm6, %xmm8 #34.3
    movaps %xmm7, (%r8) #34.3
    movaps %xmm8, 32(%r8) #34.3
    movaps %xmm9, 64(%r8) #34.3
    movaps %xmm10, 96(%r8) #34.3
    movaps 32(%rdi), %xmm14 #const #34.3
    movaps 80(%r8), %xmm11 #34.3
    movaps %xmm14, %xmm0 #34.3
    movaps 48(%rdi), %xmm13 #const #34.3
    mulps %xmm11, %xmm0 #34.3
    mulps %xmm12, %xmm14 #34.3
    shufps $177, %xmm11, %xmm11 #34.3
    shufps $177, %xmm12, %xmm12 #34.3
    mulps %xmm13, %xmm11 #34.3
    mulps %xmm12, %xmm13 #34.3
    subps %xmm11, %xmm0 #34.3
    addps %xmm13, %xmm14 #34.3
    movaps %xmm0, %xmm15 #34.3
    subps %xmm14, %xmm0 #34.3
    addps %xmm14, %xmm15 #34.3
    xorps %xmm3, %xmm0 #34.3
    movaps 16(%r8), %xmm1 #34.3
    movaps 48(%r8), %xmm2 #34.3
    movaps %xmm1, %xmm4 #34.3
    shufps $177, %xmm0, %xmm0 #34.3
    movaps %xmm2, %xmm5 #34.3
    addps %xmm15, %xmm1 #34.3
    subps %xmm0, %xmm2 #34.3
    subps %xmm15, %xmm4 #34.3
    addps %xmm0, %xmm5 #34.3
    movaps %xmm1, 16(%r8) #34.3
    movaps %xmm2, 48(%r8) #34.3
    movaps %xmm4, 80(%r8) #34.3
    movaps %xmm5, 112(%r8) #34.3
    ret

 # x8_soft: radix-8 pass over 8 strided output rows, one 16-byte column per
 # iteration. Input: %rdi = twiddles, %r8 = output base, %rbx = output stride.
# _x8_soft + 6 needs to be 16 byte aligned
#ifdef __APPLE__
    .globl _x8_soft
_x8_soft:
#else
    .globl x8_soft
x8_soft:
#endif
    # rax, rcx, rdx, r8, r10, r11 (r9 not used)
    # rbx, rdi, rsi

    # input
    movq %rdi, %rax

    # output
    movq %r8, %rcx

    # loop stop (output + output_stride)
    leaq (%r8, %rbx), %rdx

    # 3 * output_stride
    leaq (%rbx, %rbx, 2), %rsi

    # 5 * output_stride
    leaq (%rbx, %rbx, 4), %r10

    # 7 * output_stride
    leaq (%rsi, %rbx, 4), %r11

X8_soft_loop:
    # input + 0 * input_stride
    movaps (%rax), %xmm9

    # output + 2 * output_stride
    movaps (%rcx, %rbx, 2), %xmm6

    movaps %xmm9, %xmm11

    # output + 3 * output_stride
    movaps (%rcx, %rsi), %xmm7

    # input + 1 * input_stride
    movaps 16(%rax), %xmm8

    mulps %xmm6, %xmm11
    mulps %xmm7, %xmm9
    shufps $177, %xmm6, %xmm6
    mulps %xmm8, %xmm6
    shufps $177, %xmm7, %xmm7
    subps %xmm6, %xmm11
    mulps %xmm7, %xmm8
    movaps %xmm11, %xmm10
    addps %xmm8, %xmm9

    # input + 2 * input_stride
    movaps 32(%rax), %xmm15

    addps %xmm9, %xmm10
    subps %xmm9, %xmm11

    # output + 0 * output_stride
    movaps (%rcx), %xmm5

    movaps %xmm15, %xmm6

    # output + 4 * output_stride
    movaps (%rcx, %rbx, 4), %xmm12

    movaps %xmm5, %xmm2

    # output + 6 * output_stride
    movaps (%rcx, %rsi, 2), %xmm13

    xorps %xmm3, %xmm11 #const

    # input + 3 * input_stride
    movaps 48(%rax), %xmm14

    subps %xmm10, %xmm2
    mulps %xmm12, %xmm6
    addps %xmm10, %xmm5
    mulps %xmm13, %xmm15

    # input + 4 * input_stride
    movaps 64(%rax), %xmm10

    movaps %xmm5, %xmm0
    shufps $177, %xmm12, %xmm12
    shufps $177, %xmm13, %xmm13
    mulps %xmm14, %xmm12
    mulps %xmm13, %xmm14
    subps %xmm12, %xmm6
    addps %xmm14, %xmm15

    # output + 5 * output_stride
    movaps (%rcx, %r10), %xmm7

    movaps %xmm10, %xmm13

    # output + 7 * output_stride
    movaps (%rcx, %r11), %xmm8

    movaps %xmm6, %xmm12

    # input + 5 * input_stride
    movaps 80(%rax), %xmm9

    # input + 6 * input_stride
    addq $96, %rax

    mulps %xmm7, %xmm13
    subps %xmm15, %xmm6
    addps %xmm15, %xmm12
    mulps %xmm8, %xmm10
    subps %xmm12, %xmm0
    addps %xmm12, %xmm5
    shufps $177, %xmm7, %xmm7
    xorps %xmm3, %xmm6 #const
    shufps $177, %xmm8, %xmm8
    movaps %xmm2, %xmm12
    mulps %xmm9, %xmm7
    mulps %xmm8, %xmm9
    subps %xmm7, %xmm13
    addps %xmm9, %xmm10

    # output + 1 * output_stride
    movaps (%rcx, %rbx), %xmm4

    shufps $177, %xmm11, %xmm11
    movaps %xmm4, %xmm1
    shufps $177, %xmm6, %xmm6
    addps %xmm11, %xmm1
    subps %xmm11, %xmm4
    addps %xmm6, %xmm12
    subps %xmm6, %xmm2
    movaps %xmm13, %xmm11
    movaps %xmm4, %xmm14
    movaps %xmm1, %xmm6
    subps %xmm10, %xmm13
    addps %xmm10, %xmm11
    xorps %xmm3, %xmm13 #const
    addps %xmm11, %xmm4
    subps %xmm11, %xmm14
    shufps $177, %xmm13, %xmm13

    # output + 0 * output_stride
    movaps %xmm5, (%rcx)

    # output + 1 * output_stride
    movaps %xmm4, (%rcx, %rbx)

    # output + 2 * output_stride
    movaps %xmm2, (%rcx, %rbx, 2)

    subps %xmm13, %xmm1
    addps %xmm13, %xmm6

    # output + 3 * output_stride
    movaps %xmm1, (%rcx, %rsi)

    # output + 4 * output_stride
    movaps %xmm0, (%rcx, %rbx, 4)

    # output + 5 * output_stride
    movaps %xmm14, (%rcx, %r10)

    # output + 6 * output_stride
    movaps %xmm12, (%rcx, %rsi, 2)

    # output + 7 * output_stride
    movaps %xmm6, (%rcx, %r11)

    # output + 8 * output_stride
    addq $16, %rcx

    cmpq %rdx, %rcx
    jne X8_soft_loop
    ret

 # x8_soft_end: marker label delimiting x8_soft for the code generator.
#ifdef __APPLE__
    .globl _x8_soft_end
_x8_soft_end:
#else
    .globl x8_soft_end
x8_soft_end:
 # FIX(review): this #endif was missing. Without it the following
 # "#ifdef __APPLE__" nests inside this block's #else branch, so on Apple the
 # whole offsets/constants tail below is skipped and the conditional is left
 # unterminated at end of file.
#endif

 # Patch-offset tables: each entry is the byte offset, within the kernel, of
 # the 0xFECA placeholder displacement of the correspondingly-labelled movaps.
 # NOTE(review): the +0x4 / +0x5 appear to account for 4- vs 5-byte instruction
 # prefixes before the disp32 field — confirm against codegen_sse.h.
#ifdef __APPLE__
    .globl _sse_leaf_ee_offsets
    .globl _sse_leaf_oo_offsets
    .globl _sse_leaf_eo_offsets
    .globl _sse_leaf_oe_offsets
    .align 4
_sse_leaf_ee_offsets:
    .long LEAF_EE_const_0-_leaf_ee+0x4
    .long LEAF_EE_const_1-_leaf_ee+0x5
    .long LEAF_EE_const_2-_leaf_ee+0x5
    .long LEAF_EE_const_3-_leaf_ee+0x5
    .long LEAF_EE_const_4-_leaf_ee+0x5
    .long LEAF_EE_const_5-_leaf_ee+0x5
    .long LEAF_EE_const_6-_leaf_ee+0x4
    .long LEAF_EE_const_7-_leaf_ee+0x5
_sse_leaf_oo_offsets:
    .long LEAF_OO_const_0-_leaf_oo+0x4
    .long LEAF_OO_const_1-_leaf_oo+0x4
    .long LEAF_OO_const_2-_leaf_oo+0x5
    .long LEAF_OO_const_3-_leaf_oo+0x5
    .long LEAF_OO_const_4-_leaf_oo+0x4
    .long LEAF_OO_const_5-_leaf_oo+0x5
    .long LEAF_OO_const_6-_leaf_oo+0x5
    .long LEAF_OO_const_7-_leaf_oo+0x5
_sse_leaf_eo_offsets:
    .long LEAF_EO_const_0-_leaf_eo+0x5
    .long LEAF_EO_const_1-_leaf_eo+0x4
    .long LEAF_EO_const_2-_leaf_eo+0x4
    .long LEAF_EO_const_3-_leaf_eo+0x4
    .long LEAF_EO_const_4-_leaf_eo+0x5
    .long LEAF_EO_const_5-_leaf_eo+0x5
    .long LEAF_EO_const_6-_leaf_eo+0x4
    .long LEAF_EO_const_7-_leaf_eo+0x5
_sse_leaf_oe_offsets:
    .long LEAF_OE_const_0-_leaf_oe+0x5
    .long LEAF_OE_const_1-_leaf_oe+0x4
    .long LEAF_OE_const_2-_leaf_oe+0x4
    .long LEAF_OE_const_3-_leaf_oe+0x5
    .long LEAF_OE_const_4-_leaf_oe+0x5
    .long LEAF_OE_const_5-_leaf_oe+0x5
    .long LEAF_OE_const_6-_leaf_oe+0x4
    .long LEAF_OE_const_7-_leaf_oe+0x4
#else
    .globl sse_leaf_ee_offsets
    .globl sse_leaf_oo_offsets
    .globl sse_leaf_eo_offsets
    .globl sse_leaf_oe_offsets
    .align 4
sse_leaf_ee_offsets:
    .long LEAF_EE_const_0-leaf_ee+0x4
    .long LEAF_EE_const_1-leaf_ee+0x5
    .long LEAF_EE_const_2-leaf_ee+0x5
    .long LEAF_EE_const_3-leaf_ee+0x5
    .long LEAF_EE_const_4-leaf_ee+0x5
    .long LEAF_EE_const_5-leaf_ee+0x5
    .long LEAF_EE_const_6-leaf_ee+0x4
    .long LEAF_EE_const_7-leaf_ee+0x5
sse_leaf_oo_offsets:
    .long LEAF_OO_const_0-leaf_oo+0x4
    .long LEAF_OO_const_1-leaf_oo+0x4
    .long LEAF_OO_const_2-leaf_oo+0x5
    .long LEAF_OO_const_3-leaf_oo+0x5
    .long LEAF_OO_const_4-leaf_oo+0x4
    .long LEAF_OO_const_5-leaf_oo+0x5
    .long LEAF_OO_const_6-leaf_oo+0x5
    .long LEAF_OO_const_7-leaf_oo+0x5
sse_leaf_eo_offsets:
    .long LEAF_EO_const_0-leaf_eo+0x5
    .long LEAF_EO_const_1-leaf_eo+0x4
    .long LEAF_EO_const_2-leaf_eo+0x4
    .long LEAF_EO_const_3-leaf_eo+0x4
    .long LEAF_EO_const_4-leaf_eo+0x5
    .long LEAF_EO_const_5-leaf_eo+0x5
    .long LEAF_EO_const_6-leaf_eo+0x4
    .long LEAF_EO_const_7-leaf_eo+0x5
sse_leaf_oe_offsets:
    .long LEAF_OE_const_0-leaf_oe+0x5
    .long LEAF_OE_const_1-leaf_oe+0x4
    .long LEAF_OE_const_2-leaf_oe+0x4
    .long LEAF_OE_const_3-leaf_oe+0x5
    .long LEAF_OE_const_4-leaf_oe+0x5
    .long LEAF_OE_const_5-leaf_oe+0x5
    .long LEAF_OE_const_6-leaf_oe+0x4
    .long LEAF_OE_const_7-leaf_oe+0x4
#endif

 # Constant blocks, 16-byte aligned. Row 1: sign mask flipping alternate
 # lanes (0x80000000 = IEEE-754 sign bit). Rows 2-5 hold 1/sqrt(2)
 # (0x3f3504f3 = 0.70710678f, negated as 0xbf3504f3) and 1.0f (0x3f800000).
 # sse_constants_inv is the same table with the sign pattern mirrored for the
 # inverse transform.
#ifdef __APPLE__
    .data
#else
    .section .data
#endif
    .p2align 4
#ifdef __APPLE__
    .globl _sse_constants
_sse_constants:
#else
    .globl sse_constants
sse_constants:
#endif
    .long 0x00000000,0x80000000,0x00000000,0x80000000
    .long 0x3f3504f3,0x3f3504f3,0x3f3504f3,0x3f3504f3
    .long 0xbf3504f3,0x3f3504f3,0xbf3504f3,0x3f3504f3
    .long 0x3f800000,0x3f800000,0x3f3504f3,0x3f3504f3
    .long 0x00000000,0x00000000,0xbf3504f3,0x3f3504f3
#ifdef __APPLE__
    .globl _sse_constants_inv
_sse_constants_inv:
#else
    .globl sse_constants_inv
sse_constants_inv:
#endif
    .long 0x80000000,0x00000000,0x80000000,0x00000000
    .long 0x3f3504f3,0x3f3504f3,0x3f3504f3,0x3f3504f3
    .long 0x3f3504f3,0xbf3504f3,0x3f3504f3,0xbf3504f3
    .long 0x3f800000,0x3f800000,0x3f3504f3,0x3f3504f3
    .long 0x00000000,0x00000000,0x3f3504f3,0xbf3504f3