-rw-r--r-- | config.h.in | 10
-rwxr-xr-x | configure | 102
-rw-r--r-- | configure.ac | 13
-rw-r--r-- | include/ffts.h | 4
-rw-r--r-- | src/Makefile.am | 1
-rw-r--r-- | src/Makefile.in | 1
-rw-r--r-- | src/cp_sse.c | 446
-rw-r--r-- | src/cp_sse.h | 11
-rw-r--r-- | src/macros.h | 528
-rw-r--r-- | src/neon_float.h | 1037
-rw-r--r-- | src/patterns.c | 54
-rw-r--r-- | src/patterns.h | 6
-rw-r--r-- | src/sse_float.h | 33
13 files changed, 1790 insertions, 456 deletions
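
For context, the public API after this patch (see the include/ffts.h hunk below): the plan now records N, ffts_execute() no longer takes an N argument, and ffts_free() is new. A minimal usage sketch, not part of the patch; it assumes that valloc() provides sufficiently aligned buffers (the NEON path's FFTS_MALLOC is defined as valloc) and that sign = -1 selects the forward transform:

#include <complex.h>
#include <stdlib.h>
#include "ffts.h"

int main(void) {
    size_t N = 64;
    ffts_plan_t *p = ffts_init(N, -1);  /* sign = -1: assumed forward */
    /* page-aligned allocations, covering the 16/32-byte alignment requirement */
    complex float *in  = valloc(N * sizeof(complex float));
    complex float *out = valloc(N * sizeof(complex float));
    for (size_t i = 0; i < N; i++) in[i] = (float)i;
    ffts_execute(p, in, out);  /* N now comes from the plan, not the call */
    ffts_free(p);              /* new in this commit */
    free(in); free(out);
    return 0;
}
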
diff --git a/config.h.in b/config.h.in index 69071a2..7922cd6 100644 --- a/config.h.in +++ b/config.h.in @@ -12,13 +12,12 @@ /* Define to 1 if you have the `m' library (-lm). */ #undef HAVE_LIBM -/* Define to 1 if your system has a GNU libc compatible `malloc' function, and - to 0 otherwise. */ -#undef HAVE_MALLOC - /* Define to 1 if you have the <memory.h> header file. */ #undef HAVE_MEMORY_H +/* Define to FFT with ARM NEON. */ +#undef HAVE_NEON + /* Define to 1 if you have the `pow' function. */ #undef HAVE_POW @@ -97,9 +96,6 @@ such a type exists and the standard includes do not define it. */ #undef int32_t -/* Define to rpl_malloc if the replacement function should be used. */ -#undef malloc - /* Define to the equivalent of the C99 'restrict' keyword, or to nothing if this is not supported. Do not define if restrict is supported directly. */ @@ -628,6 +628,8 @@ LIBOBJS EGREP GREP CPP +HAVE_NEON_FALSE +HAVE_NEON_TRUE am__fastdepCC_FALSE am__fastdepCC_TRUE CCDEPMODE @@ -717,6 +719,7 @@ ac_user_opts=' enable_option_checking enable_dependency_tracking enable_single +enable_neon ' ac_precious_vars='build_alias host_alias @@ -1349,6 +1352,7 @@ Optional Features: --disable-dependency-tracking speeds up one-time build --enable-single compile single-precision library + --enable-neon enable NEON extensions Some influential environment variables: CXX C++ compiler command @@ -4270,6 +4274,7 @@ fi #SFFT_CFLAGS="$CFLAGS" #SFFT_CC="$CC" +SIMD=sse # Check whether --enable-single was given. if test "${enable_single+set}" = set; then : @@ -4289,6 +4294,31 @@ $as_echo "#define FFTS_PREC_SINGLE 0" >>confdefs.h fi +# Check whether --enable-neon was given. +if test "${enable_neon+set}" = set; then : + enableval=$enable_neon; have_neon=$enableval +else + have_neon=no +fi + +if test "$have_neon" = "yes"; then + if test "$SIMD" != "sse"; then + as_fn_error $? "conflicting SIMD extensions specified" "$LINENO" 5 + fi + +$as_echo "#define HAVE_NEON 1" >>confdefs.h + +fi + if test "$have_neon" = "yes"; then + HAVE_NEON_TRUE= + HAVE_NEON_FALSE='#' +else + HAVE_NEON_TRUE='#' + HAVE_NEON_FALSE= +fi + + + #if test "$ord_sr" = "no"; then # AC_DEFINE(SFFT_ORD_SR,0,[Define to enable ordinary split radix.]) #fi @@ -4968,73 +4998,7 @@ _ACEOF # Checks for library functions. -for ac_header in stdlib.h -do : - ac_fn_c_check_header_mongrel "$LINENO" "stdlib.h" "ac_cv_header_stdlib_h" "$ac_includes_default" -if test "x$ac_cv_header_stdlib_h" = xyes; then : - cat >>confdefs.h <<_ACEOF -#define HAVE_STDLIB_H 1 -_ACEOF - -fi - -done - -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for GNU libc compatible malloc" >&5 -$as_echo_n "checking for GNU libc compatible malloc... " >&6; } -if ${ac_cv_func_malloc_0_nonnull+:} false; then : - $as_echo_n "(cached) " >&6 -else - if test "$cross_compiling" = yes; then : - ac_cv_func_malloc_0_nonnull=no -else - cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. */ -#if defined STDC_HEADERS || defined HAVE_STDLIB_H -# include <stdlib.h> -#else -char *malloc (); -#endif - -int -main () -{ -return ! 
malloc (0); - ; - return 0; -} -_ACEOF -if ac_fn_c_try_run "$LINENO"; then : - ac_cv_func_malloc_0_nonnull=yes -else - ac_cv_func_malloc_0_nonnull=no -fi -rm -f core *.core core.conftest.* gmon.out bb.out conftest$ac_exeext \ - conftest.$ac_objext conftest.beam conftest.$ac_ext -fi - -fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_func_malloc_0_nonnull" >&5 -$as_echo "$ac_cv_func_malloc_0_nonnull" >&6; } -if test $ac_cv_func_malloc_0_nonnull = yes; then : - -$as_echo "#define HAVE_MALLOC 1" >>confdefs.h - -else - $as_echo "#define HAVE_MALLOC 0" >>confdefs.h - - case " $LIBOBJS " in - *" malloc.$ac_objext "* ) ;; - *) LIBOBJS="$LIBOBJS malloc.$ac_objext" - ;; -esac - - -$as_echo "#define malloc rpl_malloc" >>confdefs.h - -fi - - +#AC_FUNC_MALLOC for ac_func in gettimeofday pow do : as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh` @@ -5188,6 +5152,10 @@ if test -z "${am__fastdepCC_TRUE}" && test -z "${am__fastdepCC_FALSE}"; then as_fn_error $? "conditional \"am__fastdepCC\" was never defined. Usually this means the macro was only invoked conditionally." "$LINENO" 5 fi +if test -z "${HAVE_NEON_TRUE}" && test -z "${HAVE_NEON_FALSE}"; then + as_fn_error $? "conditional \"HAVE_NEON\" was never defined. +Usually this means the macro was only invoked conditionally." "$LINENO" 5 +fi : "${CONFIG_STATUS=./config.status}" ac_write_fail=0 diff --git a/configure.ac b/configure.ac index 06cb913..25d6c71 100644 --- a/configure.ac +++ b/configure.ac @@ -20,6 +20,7 @@ AC_PROG_CC #SFFT_CFLAGS="$CFLAGS" #SFFT_CC="$CC" +SIMD=sse AC_ARG_ENABLE(single, [AC_HELP_STRING([--enable-single],[compile single-precision library])], sfft_single=$enableval, sfft_single=no) if test "$sfft_single" = "yes"; then @@ -29,6 +30,16 @@ if test "$sfft_single" = "no"; then AC_DEFINE(FFTS_PREC_SINGLE,0,[Define to FFT in single precision.]) fi +AC_ARG_ENABLE(neon, [AC_HELP_STRING([--enable-neon],[enable NEON extensions])], have_neon=$enableval, have_neon=no) +if test "$have_neon" = "yes"; then + if test "$SIMD" != "sse"; then + AC_MSG_ERROR([conflicting SIMD extensions specified]) + fi + AC_DEFINE(HAVE_NEON,1,[Define to FFT with ARM NEON.]) +fi +AM_CONDITIONAL(HAVE_NEON, test "$have_neon" = "yes") + + #if test "$ord_sr" = "no"; then # AC_DEFINE(SFFT_ORD_SR,0,[Define to enable ordinary split radix.]) #fi @@ -49,7 +60,7 @@ AC_TYPE_SIZE_T AC_TYPE_UINT64_T # Checks for library functions. 
-AC_FUNC_MALLOC +#AC_FUNC_MALLOC AC_CHECK_FUNCS([gettimeofday pow]) diff --git a/include/ffts.h b/include/ffts.h index e266491..9bd0dbe 100644 --- a/include/ffts.h +++ b/include/ffts.h @@ -45,7 +45,7 @@ struct _ffts_plan_t { ptrdiff_t *is; ptrdiff_t *offsets; void __attribute__ ((aligned(32))) **ws; - void (*firstpass)(const float * restrict, float * restrict, size_t, struct _ffts_plan_t * restrict); + void (*firstpass)(const float * restrict, float * restrict, struct _ffts_plan_t * restrict); size_t i0, i1, i2; uint64_t n_bits, leaftime; @@ -57,6 +57,6 @@ typedef struct _ffts_plan_t ffts_plan_t; void ffts_execute(ffts_plan_t * restrict, const void * restrict, const void * restrict); ffts_plan_t *ffts_init(size_t N, int sign); - +void ffts_free(ffts_plan_t *); #endif diff --git a/src/Makefile.am b/src/Makefile.am index a2ea644..f005968 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -8,6 +8,7 @@ all: $(OBJLIBS) %.o: %.c $(HDRS) $(CC) $(CFLAGS) -c -o $@ $< -I../include + $(CC) $(CFLAGS) -S $< -I../include $(OBJLIBS): $(OBJS) $(AR) rcs libffts.a $(OBJS) diff --git a/src/Makefile.in b/src/Makefile.in index fbc26bb..340688f 100644 --- a/src/Makefile.in +++ b/src/Makefile.in @@ -346,6 +346,7 @@ all: $(OBJLIBS) %.o: %.c $(HDRS) $(CC) $(CFLAGS) -c -o $@ $< -I../include + $(CC) $(CFLAGS) -S $< -I../include $(OBJLIBS): $(OBJS) $(AR) rcs libffts.a $(OBJS) diff --git a/src/cp_sse.c b/src/cp_sse.c index 8b09031..2ac13c4 100644 --- a/src/cp_sse.c +++ b/src/cp_sse.c @@ -3,213 +3,174 @@ #include "patterns.h" __INLINE void -firstpass_type_1(const float * restrict in, float * restrict out, size_t N, ffts_plan_t * restrict p) { - size_t i, i0 = p->i0, i1 = p->i1; +firstpass_type_1(const float * restrict in, float * restrict out, ffts_plan_t * restrict p) { + size_t i, ii0 = p->i0, ii1 = p->i1; size_t *offsets = (size_t *)p->offsets; size_t *is = (size_t *)p->is; - for(i=i0;i>0;--i) LEAF_EE(&is, in, &offsets, out); - for(i=i1;i>0;--i) LEAF_OO(&is, in, &offsets, out); +#ifdef __ARM_NEON__ + const data_t *i0=in+is[0],*i1=in+is[1],*i2=in+is[2],*i3=in+is[3],*i4=in+is[4],*i5=in+is[5],*i6=in+is[6],*i7=in+is[7]; + for(i=ii0;i>0;--i) { + neon_shl8_ee(out+offsets[0],out+offsets[1],&i0,&i1,&i2,&i3,&i4,&i5,&i6,&i7); + offsets += 2; + } + for(i=ii1;i>0;--i) { + neon_shl8_oo(out+offsets[0],out+offsets[1],&i0,&i1,&i2,&i3,&i6,&i7,&i4,&i5); + offsets += 2; + } + neon_shl8_oe(out+offsets[0],out+offsets[1],&i0,&i1,&i2,&i3,&i6,&i7,&i4,&i5); + offsets += 2; + for(i=ii1;i>0;--i) { + neon_shl8_ee(out+offsets[0],out+offsets[1],&i6,&i7,&i4,&i5,&i0,&i1,&i3,&i2); + offsets += 2; + } + +#else + for(i=ii0;i>0;--i) LEAF_EE(&is, in, &offsets, out); + for(i=ii1;i>0;--i) LEAF_OO(&is, in, &offsets, out); LEAF_OE(&is, in, &offsets, out); - for(i=i1;i>0;--i) LEAF_EE(&is, in, &offsets, out); + for(i=ii1;i>0;--i) LEAF_EE(&is, in, &offsets, out); +#endif } __INLINE void -firstpass_type_2(const float * restrict in, float * restrict out, size_t N, ffts_plan_t * restrict p) { - size_t i, i0 = p->i0, i1 = p->i1; +firstpass_type_2(const float * restrict in, float * restrict out, ffts_plan_t * restrict p) { + size_t i, ii0 = p->i0, ii1 = p->i1; size_t *offsets = (size_t *)p->offsets; size_t *is = (size_t *)p->is; - for(i=i0;i>0;--i) LEAF_EE(&is, in, &offsets, out); +#ifdef __ARM_NEON__ + const data_t *i0=in+is[0],*i1=in+is[1],*i2=in+is[2],*i3=in+is[3],*i4=in+is[4],*i5=in+is[5],*i6=in+is[6],*i7=in+is[7]; + + for(i=ii0;i>0;--i) { + neon_shl8_ee(out+offsets[0],out+offsets[1],&i0,&i1,&i2,&i3,&i4,&i5,&i6,&i7); + offsets+=2; + } + 
neon_shl8_eo(out+offsets[0],out+offsets[1],&i0,&i1,&i2,&i3,&i4,&i5,&i6,&i7); + offsets += 2; + for(i=ii1;i>0;--i) { + neon_shl8_oo(out+offsets[0],out+offsets[1],&i0,&i1,&i2,&i3,&i6,&i7,&i4,&i5); + offsets += 2; + } + for(i=ii1;i>0;--i) { + neon_shl8_ee(out+offsets[0],out+offsets[1],&i6,&i7,&i4,&i5,&i0,&i1,&i3,&i2); + offsets += 2; + } + +#else + for(i=ii0;i>0;--i) LEAF_EE(&is, in, &offsets, out); LEAF_EO(&is, in, &offsets, out); - for(i=i1;i>0;--i) LEAF_OO(&is, in, &offsets, out); - for(i=i1;i>0;--i) LEAF_EE(&is, in, &offsets, out); + for(i=ii1;i>0;--i) LEAF_OO(&is, in, &offsets, out); + for(i=ii1;i>0;--i) LEAF_EE(&is, in, &offsets, out); +#endif } __INLINE void -firstpass_64(const float * restrict in, float * restrict out, size_t N, ffts_plan_t * restrict p) { +firstpass_64(const float * restrict in, float * restrict out, ffts_plan_t * restrict p) { size_t *offsets = (size_t *)p->offsets; size_t *is = (size_t *)p->is; LEAF_EE(&is, in, &offsets, out); LEAF_OE(&is, in, &offsets, out); } -void -firstpass_32(const data_t * restrict in, data_t * restrict out, size_t N, ffts_plan_t * restrict p) { - __m128 r0_1,r2_3,r4_5,r6_7,r8_9,r10_11,r12_13,r14_15,r16_17,r18_19,r20_21,r22_23,r24_25,r26_27,r28_29,r30_31; - float *LUT8 = p->ws[0]; - float *LUT16 = p->ws[1]; - float *LUT32 = p->ws[2]; - - L_4_4(in+0,in+32,in+16,in+48,&r0_1,&r2_3,&r16_17,&r18_19); - L_2_2(in+8,in+40,in+56,in+24,&r4_5,&r6_7,&r20_21,&r22_23); - K_N(_mm_load_ps(LUT8),_mm_load_ps(LUT8+4),&r0_1,&r2_3,&r4_5,&r6_7); - L_4_2(in+4,in+36,in+20,in+52,&r8_9,&r10_11,&r28_29,&r30_31); - L_4_4(in+60,in+28,in+12,in+44,&r12_13,&r14_15,&r24_25,&r26_27); - K_N(_mm_load_ps(LUT16),_mm_load_ps(LUT16+4),&r0_1,&r4_5,&r8_9,&r12_13); - K_N(_mm_load_ps(LUT16+8),_mm_load_ps(LUT16+12),&r2_3,&r6_7,&r10_11,&r14_15); - K_N(_mm_load_ps(LUT8),_mm_load_ps(LUT8+4),&r16_17,&r18_19,&r20_21,&r22_23); - K_N(_mm_load_ps(LUT8),_mm_load_ps(LUT8+4),&r24_25,&r26_27,&r28_29,&r30_31); - K_N(_mm_load_ps(LUT32),_mm_load_ps(LUT32+4),&r0_1,&r8_9,&r16_17,&r24_25); - S_4(r0_1,r8_9,r16_17,r24_25,out+0,out+16,out+32,out+48); - K_N(_mm_load_ps(LUT32+8),_mm_load_ps(LUT32+12),&r2_3,&r10_11,&r18_19,&r26_27); - S_4(r2_3,r10_11,r18_19,r26_27,out+4,out+20,out+36,out+52); - K_N(_mm_load_ps(LUT32+16),_mm_load_ps(LUT32+20),&r4_5,&r12_13,&r20_21,&r28_29); - S_4(r4_5,r12_13,r20_21,r28_29,out+8,out+24,out+40,out+56); - K_N(_mm_load_ps(LUT32+24),_mm_load_ps(LUT32+28),&r6_7,&r14_15,&r22_23,&r30_31); - S_4(r6_7,r14_15,r22_23,r30_31,out+12,out+28,out+44,out+60); - -} - -void -firstpass_16(const data_t * restrict in, data_t * restrict out, size_t N, ffts_plan_t * restrict p) { - __m128 r0_1,r2_3,r4_5,r6_7,r8_9,r10_11,r12_13,r14_15; - float *LUT8 = p->ws[0]; - float *LUT16 = p->ws[1]; - - L_4_4(in+0,in+16,in+8,in+24,&r0_1,&r2_3,&r8_9,&r10_11); - L_2_4(in+4,in+20,in+28,in+12,&r4_5,&r6_7,&r14_15,&r12_13); - K_N(_mm_load_ps(LUT8),_mm_load_ps(LUT8+4),&r0_1,&r2_3,&r4_5,&r6_7); - K_N(_mm_load_ps(LUT16),_mm_load_ps(LUT16+4),&r0_1,&r4_5,&r8_9,&r12_13); - S_4(r0_1,r4_5,r8_9,r12_13,out+0,out+8,out+16,out+24); - K_N(_mm_load_ps(LUT16+8),_mm_load_ps(LUT16+12),&r2_3,&r6_7,&r10_11,&r14_15); - S_4(r2_3,r6_7,r10_11,r14_15,out+4,out+12,out+20,out+28); -} - -void -firstpass_8(const data_t * restrict in, data_t * restrict out, size_t N, ffts_plan_t * restrict p) { - __m128 r0_1,r2_3,r4_5,r6_7; - float *LUT8 = p->ws[0]; - L_4_2(in+0,in+8,in+4,in+12,&r0_1,&r2_3,&r4_5,&r6_7); - K_N(_mm_load_ps(LUT8),_mm_load_ps(LUT8+4),&r0_1,&r2_3,&r4_5,&r6_7); - S_4(r0_1,r2_3,r4_5,r6_7,out+0,out+4,out+8,out+12); -} -void 
-firstpass_4(const data_t * restrict in, data_t * restrict out, size_t N, ffts_plan_t * restrict p) { - __m128 r0,r1,r2,r3; - L_4(in+0,in+4,in+2,in+6,&r0,&r1,&r2,&r3); - S_4(r0,r1,r2,r3,out+0,out+2,out+4,out+6); -} -void -firstpass_2(const data_t * restrict in, data_t * restrict out, size_t N, ffts_plan_t * restrict p) { - __m128 r0,r1; - L_S2(in+0,in+2,&r0,&r1); - S_2(r0,r1,out+0,out+2); -} - -void X_8(data_t * restrict data0, size_t N, const data_t * restrict LUT) { - data_t *data2 = data0 + 2*N/4; - data_t *data4 = data0 + 4*N/4; - data_t *data6 = data0 + 6*N/4; - data_t *data1 = data0 + 1*N/4; - data_t *data3 = data0 + 3*N/4; - data_t *data5 = data0 + 5*N/4; - data_t *data7 = data0 + 7*N/4; - size_t k, n4 = N/4; - - for(k=N/8/2;k>0;--k) { - __m128 r0, r1, r2, r3, r4, r5, r6, r7; - r0 = _mm_load_ps(data0); - r1 = _mm_load_ps(data1); - r2 = _mm_load_ps(data2); - r3 = _mm_load_ps(data3); - K_N(_mm_load_ps(LUT), _mm_load_ps(LUT+4), &r0, &r1, &r2, &r3); - r4 = _mm_load_ps(data4); - r6 = _mm_load_ps(data6); - K_N(_mm_load_ps(LUT+8), _mm_load_ps(LUT+12), &r0, &r2, &r4, &r6); - r5 = _mm_load_ps(data5); - r7 = _mm_load_ps(data7); - K_N(_mm_load_ps(LUT+16), _mm_load_ps(LUT+20), &r1, &r3, &r5, &r7); - LUT += 24; - _mm_store_ps(data0, r0); data0 += 4; - _mm_store_ps(data1, r1); data1 += 4; - _mm_store_ps(data2, r2); data2 += 4; - _mm_store_ps(data3, r3); data3 += 4; - _mm_store_ps(data4, r4); data4 += 4; - _mm_store_ps(data5, r5); data5 += 4; - _mm_store_ps(data6, r6); data6 += 4; - _mm_store_ps(data7, r7); data7 += 4; - } -} -void ffts_execute(ffts_plan_t *p, const void * restrict in, void * restrict out, size_t N) { +void ffts_execute(ffts_plan_t *p, const void * restrict in, void * restrict out) { transform_index_t *ps = p->transforms; + int leafN = 8; + p->firstpass((const float *)in, (float *)out, p); + size_t ps0_next = ps[0]; + while(ps0_next) { + size_t ps0 = ps0_next; + size_t ps1 = ps[1]; + ps0_next = ps[2]; + ps += 2; - p->firstpass((const float *)in, (float *)out, N, p); - while(ps[0]) { - - if(ps[0] == 32) { + if(ps0 == 2*leafN) { float *LUT = (float *)p->ws[0]; - float *data = (float *)(out) + ps[1]; - size_t n = 32; - size_t i; - for(i=0;i<n/4/2;i++) { - __m128 uk = _mm_load_ps(data); - __m128 uk2 = _mm_load_ps(data + 2*n/4); - __m128 zk_p = _mm_load_ps(data + 4*n/4); - __m128 zk_n = _mm_load_ps(data + 6*n/4); - - K_N(_mm_load_ps(LUT), _mm_load_ps(LUT+4), &uk, &uk2, &zk_p, &zk_n); - - _mm_store_ps(data, uk); - _mm_store_ps(data + 2*n/4, uk2); - _mm_store_ps(data + 4*n/4, zk_p); - _mm_store_ps(data + 6*n/4, zk_n); - - LUT += 8; - data += 4; - } - + float *data = (float *)(out) + ps1; + #ifdef __ARM_NEON__ + X_4_SPLIT(data, 16, LUT); + #else + X_4(data, 16, LUT); + #endif }else{ - int index = __builtin_ctzl(ps[0])-5; - float *LUT = (float *)p->ws[__builtin_ctzl(ps[0])-5]; - X_8(((float *)out) + ps[1], ps[0], LUT); + int index = __builtin_ctzl(ps0)-4; + float *LUT = (float *)p->ws[__builtin_ctzl(ps0)-4]; + #ifdef __ARM_NEON__ + X_8_SPLIT(((float *)out) + ps1, ps0, LUT); + #else + X_8(((float *)out) + ps1, ps0, LUT); + #endif } - ps += 2; + } + #ifdef __ARM_NEON__ + if(p->N>32) + X_8_SPLIT_T((float *)out, p->N, p->lastlut); + #endif } +void ffts_free(ffts_plan_t *p) { + + size_t i; + + if(p->ws) { + for(i=0;i<p->n_luts;i++) { + FFTS_FREE(p->ws[i]); + } + free(p->ws); + } + if(p->is) free(p->is); + if(p->offsets) free(p->offsets); + free(p->transforms); + + free(p); +} ffts_plan_t *ffts_init(size_t N, int sign) { ffts_plan_t *p = malloc(sizeof(ffts_plan_t)); - size_t leafN = 16; + size_t 
leafN = 8; size_t i; - if(sign < 0) MULI_SIGN = _mm_set_ps(-0.0f, 0.0f, -0.0f, 0.0f); - else MULI_SIGN = _mm_set_ps(0.0f, -0.0f, 0.0f, -0.0f); + if(sign < 0) MULI_SIGN = VLIT4(-0.0f, 0.0f, -0.0f, 0.0f); + else MULI_SIGN = VLIT4(0.0f, -0.0f, 0.0f, -0.0f); + + if(sign < 0) SCALAR_MULI_SIGN = -0.0f*I; + else SCALAR_MULI_SIGN = -0.0f; if(N > 32) { - init_offsets(p, N, leafN); - init_is(p, N, leafN, 2); - init_tree(p, N, leafN); + ffts_init_offsets(p, N, leafN); + ffts_init_is(p, N, leafN, 2); + ffts_init_tree(p, N, leafN); - if(N == 64) p->firstpass = &firstpass_64; - else if(__builtin_ctzl(N) & 1) p->firstpass = &firstpass_type_2; - else p->firstpass = &firstpass_type_1; - - LEAFLUT[0] = _mm_set_ps(0.70710678118654757273731092936941,0.70710678118654757273731092936941,0.70710678118654757273731092936941,0.70710678118654757273731092936941); - LEAFLUT[1] = _mm_set_ps(0.70710678118654746171500846685376,-0.70710678118654746171500846685376,0.70710678118654746171500846685376,-0.70710678118654746171500846685376); - LEAFLUT[2] = _mm_set_ps(0.92387953251128673848313610506011,0.92387953251128673848313610506011,0.92387953251128673848313610506011,0.92387953251128673848313610506011); - LEAFLUT[3] = _mm_set_ps(0.38268343236508978177923268049199,-0.38268343236508978177923268049199,0.38268343236508978177923268049199,-0.38268343236508978177923268049199); - LEAFLUT[4] = _mm_set_ps(0.38268343236508983729038391174981,0.38268343236508983729038391174981,0.38268343236508983729038391174981,0.38268343236508983729038391174981); - LEAFLUT[5] = _mm_set_ps(0.92387953251128673848313610506011,-0.92387953251128673848313610506011,0.92387953251128673848313610506011,-0.92387953251128673848313610506011); + // if(N == 64) p->firstpass = &firstpass_64; + if(__builtin_ctzl(N) & 1) p->firstpass = &firstpass_type_1; + else p->firstpass = &firstpass_type_2; + + LEAFLUT[0] = VLIT4(0.70710678118654757273731092936941,0.70710678118654757273731092936941,0.70710678118654757273731092936941,0.70710678118654757273731092936941); + LEAFLUT[1] = VLIT4(0.70710678118654746171500846685376,-0.70710678118654746171500846685376,0.70710678118654746171500846685376,-0.70710678118654746171500846685376); + LEAFLUT[2] = VLIT4(0.92387953251128673848313610506011,0.92387953251128673848313610506011,0.92387953251128673848313610506011,0.92387953251128673848313610506011); + LEAFLUT[3] = VLIT4(0.38268343236508978177923268049199,-0.38268343236508978177923268049199,0.38268343236508978177923268049199,-0.38268343236508978177923268049199); + LEAFLUT[4] = VLIT4(0.38268343236508983729038391174981,0.38268343236508983729038391174981,0.38268343236508983729038391174981,0.38268343236508983729038391174981); + LEAFLUT[5] = VLIT4(0.92387953251128673848313610506011,-0.92387953251128673848313610506011,0.92387953251128673848313610506011,-0.92387953251128673848313610506011); - LEAFLUT[6] = _mm_set_ps(0.70710678118654757273731092936941,0.70710678118654757273731092936941,1,1); - LEAFLUT[7] = _mm_set_ps(0.70710678118654746171500846685376,-0.70710678118654746171500846685376,0,-0); - LEAFLUT[8] = _mm_set_ps(0.92387953251128673848313610506011,0.92387953251128673848313610506011,1,1); - LEAFLUT[9] = _mm_set_ps(0.38268343236508978177923268049199,-0.38268343236508978177923268049199,0,-0); - LEAFLUT[10] = _mm_set_ps(0.38268343236508983729038391174981,0.38268343236508983729038391174981,0.70710678118654757273731092936941,0.70710678118654757273731092936941); - LEAFLUT[11] = 
_mm_set_ps(0.92387953251128673848313610506011,-0.92387953251128673848313610506011,0.70710678118654746171500846685376,-0.70710678118654746171500846685376); + LEAFLUT[6] = VLIT4(0.70710678118654757273731092936941,0.70710678118654757273731092936941,1,1); + LEAFLUT[7] = VLIT4(0.70710678118654746171500846685376,-0.70710678118654746171500846685376,0,-0); + LEAFLUT[8] = VLIT4(0.92387953251128673848313610506011,0.92387953251128673848313610506011,1,1); + LEAFLUT[9] = VLIT4(0.38268343236508978177923268049199,-0.38268343236508978177923268049199,0,-0); + LEAFLUT[10] = VLIT4(0.38268343236508983729038391174981,0.38268343236508983729038391174981,0.70710678118654757273731092936941,0.70710678118654757273731092936941); + LEAFLUT[11] = VLIT4(0.92387953251128673848313610506011,-0.92387953251128673848313610506011,0.70710678118654746171500846685376,-0.70710678118654746171500846685376); if(sign > 0) { - LEAFLUT[1] = _mm_xor_ps(LEAFLUT[1], _mm_set_ps(-0.0f,-0.0f,-0.0f,-0.0f)); - LEAFLUT[3] = _mm_xor_ps(LEAFLUT[3], _mm_set_ps(-0.0f,-0.0f,-0.0f,-0.0f)); - LEAFLUT[5] = _mm_xor_ps(LEAFLUT[5], _mm_set_ps(-0.0f,-0.0f,-0.0f,-0.0f)); - LEAFLUT[7] = _mm_xor_ps(LEAFLUT[7], _mm_set_ps(-0.0f,-0.0f,-0.0f,-0.0f)); - LEAFLUT[9] = _mm_xor_ps(LEAFLUT[9], _mm_set_ps(-0.0f,-0.0f,-0.0f,-0.0f)); - LEAFLUT[11] = _mm_xor_ps(LEAFLUT[11], _mm_set_ps(-0.0f,-0.0f,-0.0f,-0.0f)); + V neg = VLIT4(-0.0f, -0.0f, -0.0f, -0.0f); + LEAFLUT[1] = VXOR(LEAFLUT[1], neg); + LEAFLUT[3] = VXOR(LEAFLUT[3], neg); + LEAFLUT[5] = VXOR(LEAFLUT[5], neg); + LEAFLUT[7] = VXOR(LEAFLUT[7], neg); + LEAFLUT[9] = VXOR(LEAFLUT[9], neg); + LEAFLUT[11] = VXOR(LEAFLUT[11], neg); } p->i0 = N/leafN/3+1; @@ -223,11 +184,14 @@ ffts_plan_t *ffts_init(size_t N, int sign) { p->transforms[0] = 0; p->transforms[1] = 1; if(N == 2) p->firstpass = &firstpass_2; - else if(N == 4) p->firstpass = &firstpass_4; + else if(N == 4 && sign == -1) p->firstpass = &firstpass_4_f; + else if(N == 4 && sign == 1) p->firstpass = &firstpass_4_b; else if(N == 8) p->firstpass = &firstpass_8; else if(N == 16) p->firstpass = &firstpass_16; else if(N == 32) p->firstpass = &firstpass_32; + p->is = NULL; + p->offsets = NULL; } int hardcoded = 0; @@ -236,9 +200,14 @@ ffts_plan_t *ffts_init(size_t N, int sign) { size_t n_luts = __builtin_ctzl(N/leafN); if(N <= 32) { n_luts = __builtin_ctzl(N/4); hardcoded = 1; } + if(n_luts >= 32) n_luts = 0; - //printf("n_luts = %zu\n", n_luts); - p->ws = malloc(n_luts * sizeof(data_t *)); +// fprintf(stderr, "n_luts = %zu\n", n_luts); + if(n_luts) + p->ws = malloc(n_luts * sizeof(data_t *)); + else + p->ws = NULL; + cdata_t *w; int n = leafN*2; @@ -246,45 +215,67 @@ ffts_plan_t *ffts_init(size_t N, int sign) { for(i=0;i<n_luts;i++) { - //printf("LUT[%zu] = %d\n", i, n); +// fprintf(stderr, "LUT[%zu] = %d\n", i, n); if(!i || hardcoded) { - - w = _mm_malloc(n/4 * 2 * sizeof(cdata_t), 32); - - cdata_t *w0 = _mm_malloc(n/4 * sizeof(cdata_t), 32); + cdata_t *w0 = FFTS_MALLOC(n/4 * sizeof(cdata_t), 32); size_t j; for(j=0;j<n/4;j++) { w0[j] = W(n,j); } - __m128 temp0, temp1, temp2; - float *fw = (float *)w; float *fw0 = (float *)w0; + #ifdef __ARM_NEON__ + if(N <= 32) { + w = FFTS_MALLOC(n/4 * 2 * sizeof(cdata_t), 32); + float *fw = (float *)w; + V temp0, temp1, temp2; + for(j=0;j<n/4;j+=2) { + temp0 = VLD(fw0 + j*2); + V re, im; + re = VDUPRE(temp0); + im = VDUPIM(temp0); + im = VXOR(im, MULI_SIGN); + VST(fw + j*4 , re); + VST(fw + j*4+4, im); + } + }else{ + w = FFTS_MALLOC(n/4 * sizeof(cdata_t), 32); + float *fw = (float *)w; + VS temp0, temp1, temp2; + for(j=0;j<n/4;j+=4) { + 
temp0 = VLD2(fw0 + j*2); + STORESPR(fw + j*2, temp0); + } + } + #else + w = FFTS_MALLOC(n/4 * 2 * sizeof(cdata_t), 32); + float *fw = (float *)w; + V temp0, temp1, temp2; for(j=0;j<n/4;j+=2) { - temp0 = _mm_load_ps(fw0 + j*2); - __m128 re, im; - re = _mm_shuffle_ps(temp0, temp0, _MM_SHUFFLE(2, 2, 0, 0)); - im = _mm_shuffle_ps(temp0, temp0, _MM_SHUFFLE(3, 3, 1, 1)); - im = _mm_xor_ps(im, MULI_SIGN); - _mm_store_ps(fw + j*4 , re); - _mm_store_ps(fw + j*4+4, im); + temp0 = VLD(fw0 + j*2); + V re, im; + re = VDUPRE(temp0); + im = VDUPIM(temp0); + im = VXOR(im, MULI_SIGN); + VST(fw + j*4 , re); + VST(fw + j*4+4, im); } + #endif // for(j=0;j<n/2;j++) { // printf("%f %f\n", creal(w[j]), cimag(w[j])); // } - _mm_free(w0); + FFTS_FREE(w0); }else{ - w = _mm_malloc(n/8 * 3 * 2 * sizeof(cdata_t), 32); - cdata_t *w0 = _mm_malloc(n/8 * sizeof(cdata_t), 32); - cdata_t *w1 = _mm_malloc(n/8 * sizeof(cdata_t), 32); - cdata_t *w2 = _mm_malloc(n/8 * sizeof(cdata_t), 32); + cdata_t *w0 = FFTS_MALLOC(n/8 * sizeof(cdata_t), 32); + cdata_t *w1 = FFTS_MALLOC(n/8 * sizeof(cdata_t), 32); + cdata_t *w2 = FFTS_MALLOC(n/8 * sizeof(cdata_t), 32); size_t j; for(j=0;j<n/8;j++) { @@ -294,46 +285,69 @@ ffts_plan_t *ffts_init(size_t N, int sign) { } - __m128 temp0, temp1, temp2, re, im; - - float *fw = (float *)w; float *fw0 = (float *)w0; float *fw1 = (float *)w1; float *fw2 = (float *)w2; + #ifdef __ARM_NEON__ + w = FFTS_MALLOC(n/8 * 3 * sizeof(cdata_t), 32); + float *fw = (float *)w; + VS temp0, temp1, temp2; + for(j=0;j<n/8;j+=4) { + temp0 = VLD2(fw0 + j*2); + STORESPR(fw + j*2*3, temp0); + //VST(fw + j*2*3, temp0.val[0]); + //VST(fw + j*2*3 + 4, temp0.val[1]); + temp1 = VLD2(fw1 + j*2); + STORESPR(fw + j*2*3 + 8, temp1); + //VST(fw + j*2*3 + 8, temp1.val[0]); + //VST(fw + j*2*3 + 12, temp1.val[1]); + temp2 = VLD2(fw2 + j*2); + STORESPR(fw + j*2*3 + 16, temp2); + //VST(fw + j*2*3 + 16, temp2.val[0]); + //VST(fw + j*2*3 + 20, temp2.val[1]); + #else + w = FFTS_MALLOC(n/8 * 3 * 2 * sizeof(cdata_t), 32); + float *fw = (float *)w; + V temp0, temp1, temp2, re, im; for(j=0;j<n/8;j+=2) { - temp0 = _mm_load_ps(fw0 + j*2); - re = _mm_shuffle_ps(temp0, temp0, _MM_SHUFFLE(2, 2, 0, 0)); - im = _mm_shuffle_ps(temp0, temp0, _MM_SHUFFLE(3, 3, 1, 1)); - im = _mm_xor_ps(im, MULI_SIGN); - _mm_store_ps(fw + j*2*6 , re); - _mm_store_ps(fw + j*2*6+4, im); - - temp1 = _mm_load_ps(fw1 + j*2); - re = _mm_shuffle_ps(temp1, temp1, _MM_SHUFFLE(2, 2, 0, 0)); - im = _mm_shuffle_ps(temp1, temp1, _MM_SHUFFLE(3, 3, 1, 1)); - im = _mm_xor_ps(im, MULI_SIGN); - _mm_store_ps(fw + j*2*6+8 , re); - _mm_store_ps(fw + j*2*6+12, im); - - temp2 = _mm_load_ps(fw2 + j*2); - re = _mm_shuffle_ps(temp2, temp2, _MM_SHUFFLE(2, 2, 0, 0)); - im = _mm_shuffle_ps(temp2, temp2, _MM_SHUFFLE(3, 3, 1, 1)); - im = _mm_xor_ps(im, MULI_SIGN); - _mm_store_ps(fw + j*2*6+16, re); - _mm_store_ps(fw + j*2*6+20, im); + temp0 = VLD(fw0 + j*2); + re = VDUPRE(temp0); + im = VDUPIM(temp0); + im = VXOR(im, MULI_SIGN); + VST(fw + j*2*6 , re); + VST(fw + j*2*6+4, im); + + temp1 = VLD(fw1 + j*2); + re = VDUPRE(temp1); + im = VDUPIM(temp1); + im = VXOR(im, MULI_SIGN); + VST(fw + j*2*6+8 , re); + VST(fw + j*2*6+12, im); + + temp2 = VLD(fw2 + j*2); + re = VDUPRE(temp2); + im = VDUPIM(temp2); + im = VXOR(im, MULI_SIGN); + VST(fw + j*2*6+16, re); + VST(fw + j*2*6+20, im); + #endif } - _mm_free(w0); - _mm_free(w1); - _mm_free(w2); + FFTS_FREE(w0); + FFTS_FREE(w1); + FFTS_FREE(w2); } p->ws[i] = w; n *= 2; } + p->N = N; + p->lastlut = w; + p->n_luts = n_luts; + +// fprintf(stderr, "sizeof(size_t) == 
%lu\n", sizeof(size_t)); - return p; } /* @@ -342,8 +356,8 @@ int main(int argc, char *argv[]) { int count = atoi(argv[2]); ffts_plan_t *p = ffts_init(n); - cdata_t *in = _mm_malloc(n * sizeof(cdata_t), 32); - cdata_t *out = _mm_malloc(n * sizeof(cdata_t), 32); + cdata_t *in = FFTS_MALLOC(n * sizeof(cdata_t), 32); + cdata_t *out = FFTS_MALLOC(n * sizeof(cdata_t), 32); size_t i; for(i=0;i<n;i++) in[i] = i; diff --git a/src/cp_sse.h b/src/cp_sse.h index 2c6825f..f94055d 100644 --- a/src/cp_sse.h +++ b/src/cp_sse.h @@ -6,11 +6,11 @@ #include <math.h> #include <complex.h> #include <stddef.h> -#include <xmmintrin.h> #include <stdint.h> +#include <stdalign.h> typedef complex float cdata_t; -typedef float data_t; +typedef alignas(16) float data_t; #define W(N,k) (cexp(-2.0f * M_PI * I * (float)(k) / (float)(N))) @@ -20,9 +20,10 @@ struct _ffts_plan_t { ptrdiff_t *is; ptrdiff_t *offsets; void __attribute__ ((aligned(32))) **ws; - void (*firstpass)(const float * restrict, float * restrict, size_t, struct _ffts_plan_t * restrict); - size_t i0, i1; - + void (*firstpass)(const float * restrict, float * restrict, struct _ffts_plan_t * restrict); + size_t i0, i1, n_luts; + size_t N; + void *lastlut; transform_index_t *transforms; }; diff --git a/src/macros.h b/src/macros.h index b2f44e6..039ee40 100644 --- a/src/macros.h +++ b/src/macros.h @@ -1,125 +1,121 @@ #ifndef __MACROS_H__ #define __MACROS_H__ +#include "../config.h" + +#ifdef HAVE_NEON + #include "neon_float.h" +#else + #include "sse_float.h" +#endif + + #include "cp_sse.h" #define __INLINE static inline __attribute__((always_inline)) -#define VLIT4 _mm_set_ps - -__m128 MULI_SIGN; +cdata_t SCALAR_MULI_SIGN; +V MULI_SIGN; +V LEAFLUT[12]; -__INLINE __m128 IMULI(__m128 a) { - __m128 temp = _mm_xor_ps(a, MULI_SIGN);//_mm_set_ps(-0.0f, 0.0f, -0.0f, 0.0f)); - return _mm_shuffle_ps(temp, temp, _MM_SHUFFLE(2,3,0,1)); +__INLINE V IMULI(V a) { + return VSWAPPAIRS(VXOR(a, MULI_SIGN)); } __INLINE void -S_4(__m128 r0, __m128 r1, __m128 r2, __m128 r3, data_t * restrict o0, data_t * restrict o1, data_t * restrict o2, data_t * restrict o3) { - __m128 t0, t1, t2, t3; - _mm_store_ps(o0, r0); - _mm_store_ps(o1, r1); - _mm_store_ps(o2, r2); - _mm_store_ps(o3, r3); +S_4(V r0, V r1, V r2, V r3, data_t * restrict o0, data_t * restrict o1, data_t * restrict o2, data_t * restrict o3) { + V t0, t1, t2, t3; + VST(o0, r0); VST(o1, r1); VST(o2, r2); VST(o3, r3); } -__INLINE void S_2(__m128 r0, __m128 r1, data_t * restrict o0, data_t * restrict o1) { - _mm_store_ps(o0, r0); - _mm_store_ps(o1, r1); +__INLINE void S_2(V r0, V r1, data_t * restrict o0, data_t * restrict o1) { + VST(o0, r0); VST(o1, r1); } -__INLINE void L_S2(const data_t * restrict i0, const data_t * restrict i1, __m128 * restrict r0, __m128 * restrict r1) { - __m128 t0, t1; - t0 = _mm_load_ps(i0); - t1 = _mm_load_ps(i1); - *r0 = _mm_add_ps(t0, t1); - *r1 = _mm_sub_ps(t0, t1); +__INLINE void L_S2(const data_t * restrict i0, const data_t * restrict i1, V * restrict r0, V * restrict r1) { + V t0, t1; + t0 = VLD(i0); t1 = VLD(i1); + *r0 = VADD(t0, t1); + *r1 = VSUB(t0, t1); } __INLINE void L_2(const data_t * restrict i0, const data_t * restrict i1, const data_t * restrict i2, const data_t * restrict i3, - __m128 *r0, __m128 *r1, __m128 *r2, __m128 *r3) { - __m128 t0, t1, t2, t3; - t0 = _mm_load_ps(i0); - t1 = _mm_load_ps(i1); - t2 = _mm_load_ps(i2); - t3 = _mm_load_ps(i3); - *r0 = _mm_add_ps (t0, t1); - *r1 = _mm_sub_ps (t0, t1); - *r2 = _mm_add_ps (t2, t3); - *r3 = _mm_sub_ps (t2, t3); + V *r0, V *r1, V *r2, V 
*r3) { + V t0, t1, t2, t3; + t0 = VLD(i0); + t1 = VLD(i1); + t2 = VLD(i2); + t3 = VLD(i3); + *r0 = VADD (t0, t1); + *r1 = VSUB (t0, t1); + *r2 = VADD (t2, t3); + *r3 = VSUB (t2, t3); } __INLINE void L_4(const data_t * restrict i0, const data_t * restrict i1, const data_t * restrict i2, const data_t * restrict i3, - __m128 *r0, __m128 *r1, __m128 *r2, __m128 *r3) { - __m128 t0, t1, t2, t3, t4, t5, t6, t7; - t0 = _mm_load_ps(i0); - t1 = _mm_load_ps(i1); - t2 = _mm_load_ps(i2); - t3 = _mm_load_ps(i3); - t4 = _mm_add_ps (t0, t1); - t5 = _mm_sub_ps (t0, t1); - t6 = _mm_add_ps (t2, t3); - t7 = IMULI(_mm_sub_ps (t2, t3)); - *r0 = _mm_add_ps (t4, t6); - *r2 = _mm_sub_ps (t4, t6); - *r1 = _mm_sub_ps (t5, t7); - *r3 = _mm_add_ps (t5, t7); + V *r0, V *r1, V *r2, V *r3) { + V t0, t1, t2, t3, t4, t5, t6, t7; + t0 = VLD(i0); t1 = VLD(i1); t2 = VLD(i2); t3 = VLD(i3); + t4 = VADD (t0, t1); + t5 = VSUB (t0, t1); + t6 = VADD (t2, t3); + t7 = IMULI(VSUB (t2, t3)); + *r0 = VADD (t4, t6); + *r2 = VSUB (t4, t6); + *r1 = VSUB (t5, t7); + *r3 = VADD (t5, t7); } __INLINE void -K_0(__m128 *r0, __m128 *r1, __m128 *r2, __m128 *r3) { - __m128 uk, uk2, zk, zk_d; - uk = *r0; - uk2 = *r1; - zk = _mm_add_ps(*r2, *r3); - zk_d = IMULI(_mm_sub_ps(*r2, *r3)); - *r0 = _mm_add_ps(uk, zk); - *r2 = _mm_sub_ps(uk, zk); - *r1 = _mm_sub_ps(uk2, zk_d); - *r3 = _mm_add_ps(uk2, zk_d); +K_0(V *r0, V *r1, V *r2, V *r3) { + V uk, uk2, zk, zk_d; + uk = *r0; uk2 = *r1; + zk = VADD(*r2, *r3); + zk_d = IMULI(VSUB(*r2, *r3)); + *r0 = VADD(uk, zk); + *r2 = VSUB(uk, zk); + *r1 = VSUB(uk2, zk_d); + *r3 = VADD(uk2, zk_d); } -__INLINE __m128 IMUL(__m128 d, __m128 re, __m128 im) { - re = _mm_mul_ps(re, d); - im = _mm_mul_ps(im, _mm_shuffle_ps(d, d, _MM_SHUFFLE(2,3,0,1))); - return _mm_sub_ps(re, im); +__INLINE V IMUL(V d, V re, V im) { + re = VMUL(re, d); + im = VMUL(im, VSWAPPAIRS(d)); + return VSUB(re, im); } -__INLINE __m128 IMULJ(__m128 d, __m128 re, __m128 im) { - re = _mm_mul_ps(re, d); - im = _mm_mul_ps(im, _mm_shuffle_ps(d, d, _MM_SHUFFLE(2,3,0,1))); - return _mm_add_ps(re, im); +__INLINE V IMULJ(V d, V re, V im) { + re = VMUL(re, d); + im = VMUL(im, VSWAPPAIRS(d)); + return VADD(re, im); } __INLINE void -K_N(__m128 re, __m128 im, __m128 *r0, __m128 *r1, __m128 *r2, __m128 *r3) { - __m128 uk, uk2, zk_p, zk_n, zk, zk_d; - - uk = *r0; - uk2 = *r1; +K_N(V re, V im, V *r0, V *r1, V *r2, V *r3) { + V uk, uk2, zk_p, zk_n, zk, zk_d; + uk = *r0; uk2 = *r1; zk_p = IMUL(*r2, re, im); zk_n = IMULJ(*r3, re, im); + + zk = VADD(zk_p, zk_n); + zk_d = IMULI(VSUB(zk_p, zk_n)); - zk = _mm_add_ps(zk_p, zk_n); - zk_d = IMULI(_mm_sub_ps(zk_p, zk_n)); - - *r2 = _mm_sub_ps(uk, zk); - *r0 = _mm_add_ps(uk, zk); - *r3 = _mm_add_ps(uk2, zk_d); - *r1 = _mm_sub_ps(uk2, zk_d); + *r2 = VSUB(uk, zk); + *r0 = VADD(uk, zk); + *r3 = VADD(uk2, zk_d); + *r1 = VSUB(uk2, zk_d); } -__INLINE void TX2(__m128 *a, __m128 *b) { - __m128 TX2_t0 = _mm_shuffle_ps(*a, *b, _MM_SHUFFLE(1,0,1,0)); - __m128 TX2_t1 = _mm_shuffle_ps(*a, *b, _MM_SHUFFLE(3,2,3,2)); +__INLINE void TX2(V *a, V *b) { + V TX2_t0 = VUNPACKLO(*a, *b); + V TX2_t1 = VUNPACKHI(*a, *b); *a = TX2_t0; *b = TX2_t1; } -__m128 __attribute__((aligned(32))) LEAFLUT[12]; __INLINE void LEAF_EE(size_t ** restrict is, const data_t * restrict in, size_t ** restrict out_offsets, data_t * restrict out) { - __m128 r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11,r12,r13,r14,r15; + V r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11,r12,r13,r14,r15; data_t *out0 = out + (*out_offsets)[0]; data_t *out1 = out + (*out_offsets)[1]; @@ -147,7 +143,7 @@ LEAF_EE(size_t 
** restrict is, const data_t * restrict in, size_t ** restrict ou __INLINE void LEAF_OO(size_t ** restrict is, const data_t * restrict in, size_t ** restrict out_offsets, data_t * restrict out) { - __m128 r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11,r12,r13,r14,r15; + V r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11,r12,r13,r14,r15; data_t *out0 = out + (*out_offsets)[0]; data_t *out1 = out + (*out_offsets)[1]; @@ -170,32 +166,123 @@ LEAF_OO(size_t ** restrict is, const data_t * restrict in, size_t ** restrict ou *is += 16; } +#ifdef __ARM_NEON__ +__INLINE void +S_4_1(V r0, V r1, V r2, V r3, data_t * restrict o0, data_t * restrict o1, data_t * restrict o2, data_t * restrict o3) { + register V p0 __asm__ ("q0") = r0; register V p1 __asm__ ("q1") = r1; register V p2 __asm__ ("q2") = r2; register V p3 __asm__ ("q3") = r3; + __asm__ __volatile__ ("vst4.32 {%q1,%q2}, [%0, :128]!\n\t" + "vst4.32 {%q3,%q4}, [%0, :128]!\n\t" + : + : "r" (o0), "w" (p0), "w" (p1), "w" (p2), "w" (p3) + : "memory"); +} +__INLINE void +S_4_2(V r0, V r1, V r2, V r3, data_t * restrict o0, data_t * restrict o1, data_t * restrict o2, data_t * restrict o3) { + register V p0 __asm__ ("q4") = r0; register V p1 __asm__ ("q5") = r1; register V p2 __asm__ ("q6") = r2; register V p3 __asm__ ("q7") = r3; + __asm__ __volatile__ ("vst4.32 {%q1,%q2}, [%0, :128]!\n\t" + "vst4.32 {%q3,%q4}, [%0, :128]!\n\t" + : + : "r" (o0), "w" (p0), "w" (p1), "w" (p2), "w" (p3) + : "memory"); +} +__INLINE void +LEAF_EE8(size_t ** restrict is, const data_t * restrict in, size_t ** restrict out_offsets, data_t * restrict out) { + V r0,r1,r2,r3,r4,r5,r6,r7; + data_t *out0 = out + (*out_offsets)[0]; + data_t *out1 = out + (*out_offsets)[1]; + *out_offsets += 2; + + L_4(in+(*is)[0],in+(*is)[1],in+(*is)[2],in+(*is)[3],&r0,&r1,&r2,&r3); + L_2(in+(*is)[4],in+(*is)[5],in+(*is)[6],in+(*is)[7],&r4,&r5,&r6,&r7); + K_0(&r0,&r2,&r4,&r6); + K_N(LEAFLUT[0], LEAFLUT[1],&r1,&r3,&r5,&r7); + + register V p0 __asm__ ("q0") = r0; + register V p1 __asm__ ("q1") = r2; + register V p2 __asm__ ("q2") = r4; + register V p3 __asm__ ("q3") = r6; + register V p4 __asm__ ("q4") = r1; + register V p5 __asm__ ("q5") = r3; + register V p6 __asm__ ("q6") = r5; + register V p7 __asm__ ("q7") = r7; + + __asm__ __volatile__ ("vswp %f1,%e6\n\t" + "vswp %f2,%e7\n\t" + "vswp %f3,%e8\n\t" + "vswp %f4,%e9\n\t" + "vst4.32 {%q1,%q2}, [%0, :128]!\n\t" + "vst4.32 {%q3,%q4}, [%0, :128]!\n\t" + "vst4.32 {%q6,%q7}, [%5, :128]!\n\t" + "vst4.32 {%q8,%q9}, [%5, :128]!\n\t" + : + : "r" (out0), "w" (p0), "w" (p1), "w" (p2), "w" (p3), + "r" (out1), "w" (p4), "w" (p5), "w" (p6), "w" (p7) + : "memory"); +//TX2(&r0,&r1); TX2(&r2,&r3); TX2(&r4,&r5); TX2(&r6,&r7); +//S_4_1(r0,r2,r4,r6,out0+0,out0+4,out0+8,out0+12); +//S_4_2(r1,r3,r5,r7,out1+0,out1+4,out1+8,out1+12); + *is += 8; +} +__INLINE void +LEAF_OO8(size_t ** restrict is, const data_t * restrict in, size_t ** restrict out_offsets, data_t * restrict out) { + V r0,r1,r2,r3,r4,r5,r6,r7; + data_t *out0 = out + (*out_offsets)[0]; + data_t *out1 = out + (*out_offsets)[1]; + *out_offsets += 2; + L_4(in+(*is)[0],in+(*is)[1],in+(*is)[2],in+(*is)[3],&r0,&r1,&r2,&r3); + L_4(in+(*is)[4],in+(*is)[5],in+(*is)[6],in+(*is)[7],&r4,&r5,&r6,&r7); + register V p0 __asm__ ("q0") = r0; + register V p1 __asm__ ("q1") = r2; + register V p2 __asm__ ("q2") = r4; + register V p3 __asm__ ("q3") = r6; + register V p4 __asm__ ("q4") = r1; + register V p5 __asm__ ("q5") = r3; + register V p6 __asm__ ("q6") = r5; + register V p7 __asm__ ("q7") = r7; + __asm__ __volatile__ ("vswp %f1,%e6\n\t" + "vswp 
%f2,%e7\n\t" + "vswp %f3,%e8\n\t" + "vswp %f4,%e9\n\t" + "vst4.32 {%q1,%q2}, [%0, :128]!\n\t" + "vst4.32 {%q3,%q4}, [%0, :128]!\n\t" + "vst4.32 {%q6,%q7}, [%5, :128]!\n\t" + "vst4.32 {%q8,%q9}, [%5, :128]!\n\t" + : + : "r" (out0), "w" (p0), "w" (p1), "w" (p2), "w" (p3), + "r" (out1), "w" (p4), "w" (p5), "w" (p6), "w" (p7) + : "memory"); +//TX2(&r0,&r1); TX2(&r2,&r3); TX2(&r4,&r5); TX2(&r6,&r7); +//S_4_1(r0,r2,r4,r6,out0+0,out0+4,out0+8,out0+12); +//S_4_2(r1,r3,r5,r7,out1+0,out1+4,out1+8,out1+12); + *is += 8; +} +#endif __INLINE void L_4_4(const data_t * restrict i0, const data_t * restrict i1, const data_t * restrict i2, const data_t * restrict i3, - __m128 *r0, __m128 *r1, __m128 *r2, __m128 *r3) { - __m128 t0, t1, t2, t3, t4, t5, t6, t7; - t0 = _mm_load_ps(i0); t1 = _mm_load_ps(i1); t2 = _mm_load_ps(i2); t3 = _mm_load_ps(i3); - t4 = _mm_add_ps(t0, t1); - t5 = _mm_sub_ps(t0, t1); - t6 = _mm_add_ps(t2, t3); - t7 = IMULI(_mm_sub_ps(t2, t3)); - t0 = _mm_add_ps(t4, t6); - t2 = _mm_sub_ps(t4, t6); - t1 = _mm_sub_ps(t5, t7); - t3 = _mm_add_ps(t5, t7); + V *r0, V *r1, V *r2, V *r3) { + V t0, t1, t2, t3, t4, t5, t6, t7; + t0 = VLD(i0); t1 = VLD(i1); t2 = VLD(i2); t3 = VLD(i3); + t4 = VADD(t0, t1); + t5 = VSUB(t0, t1); + t6 = VADD(t2, t3); + t7 = IMULI(VSUB(t2, t3)); + t0 = VADD(t4, t6); + t2 = VSUB(t4, t6); + t1 = VSUB(t5, t7); + t3 = VADD(t5, t7); TX2(&t0,&t1); TX2(&t2,&t3); *r0 = t0; *r2 = t1; *r1 = t2; *r3 = t3; } __INLINE void L_2_2(const data_t * restrict i0, const data_t * restrict i1, const data_t * restrict i2, const data_t * restrict i3, - __m128 *r0, __m128 *r1, __m128 *r2, __m128 *r3) { - __m128 t0, t1, t2, t3, t4, t5, t6, t7; - t0 = _mm_load_ps(i0); t1 = _mm_load_ps(i1); t2 = _mm_load_ps(i2); t3 = _mm_load_ps(i3); t4 = _mm_add_ps(t0, t1); - t5 = _mm_sub_ps(t0, t1); - t6 = _mm_add_ps(t2, t3); - t7 = _mm_sub_ps(t2, t3); + V *r0, V *r1, V *r2, V *r3) { + V t0, t1, t2, t3, t4, t5, t6, t7; + t0 = VLD(i0); t1 = VLD(i1); t2 = VLD(i2); t3 = VLD(i3); t4 = VADD(t0, t1); + t5 = VSUB(t0, t1); + t6 = VADD(t2, t3); + t7 = VSUB(t2, t3); TX2(&t4,&t5); TX2(&t6,&t7); *r0 = t4; *r2 = t5; *r1 = t6; *r3 = t7; @@ -203,52 +290,49 @@ L_2_2(const data_t * restrict i0, const data_t * restrict i1, const data_t * res __INLINE void L_2_4(const data_t * restrict i0, const data_t * restrict i1, const data_t * restrict i2, const data_t * restrict i3, - __m128 *r0, __m128 *r1, __m128 *r2, __m128 *r3) { - __m128 t0, t1, t2, t3, t4, t5, t6, t7; - t0 = _mm_load_ps(i0); t1 = _mm_load_ps(i1); t2 = _mm_load_ps(i2); t3 = _mm_load_ps(i3); - t4 = _mm_add_ps(t0, t1); - t5 = _mm_sub_ps(t0, t1); - t6 = _mm_add_ps(t2, t3); - t7 = _mm_sub_ps(t2, t3); - *r0 = _mm_shuffle_ps(t4, t5, _MM_SHUFFLE(1,0,1,0)); - *r1 = _mm_shuffle_ps(t6, t7, _MM_SHUFFLE(1,0,1,0)); + V *r0, V *r1, V *r2, V *r3) { + V t0, t1, t2, t3, t4, t5, t6, t7; + t0 = VLD(i0); t1 = VLD(i1); t2 = VLD(i2); t3 = VLD(i3); + t4 = VADD(t0, t1); + t5 = VSUB(t0, t1); + t6 = VADD(t2, t3); + t7 = VSUB(t2, t3); + *r0 = VUNPACKLO(t4, t5); + *r1 = VUNPACKLO(t6, t7); t5 = IMULI(t5); - t0 = _mm_add_ps(t6, t4); - t2 = _mm_sub_ps(t6, t4); - t1 = _mm_sub_ps(t7, t5); - t3 = _mm_add_ps(t7, t5); - *r3 = _mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,2,3,2)); - *r2 = _mm_shuffle_ps(t2, t3, _MM_SHUFFLE(3,2,3,2)); + t0 = VADD(t6, t4); + t2 = VSUB(t6, t4); + t1 = VSUB(t7, t5); + t3 = VADD(t7, t5); + *r3 = VUNPACKHI(t0, t1); + *r2 = VUNPACKHI(t2, t3); } __INLINE void L_4_2(const data_t * restrict i0, const data_t * restrict i1, const data_t * restrict i2, const data_t * restrict i3, - __m128 *r0, __m128 
*r1, __m128 *r2, __m128 *r3) { - __m128 t0, t1, t2, t3, t4, t5, t6, t7; - t0 = _mm_load_ps(i0); - t1 = _mm_load_ps(i1); - t6 = _mm_load_ps(i2); - t7 = _mm_load_ps(i3); - t2 = _mm_shuffle_ps(t6, t7, _MM_SHUFFLE(3,2,1,0)); - t3 = _mm_shuffle_ps(t7, t6, _MM_SHUFFLE(3,2,1,0)); - t4 = _mm_add_ps(t0, t1); - t5 = _mm_sub_ps(t0, t1); - t6 = _mm_add_ps(t2, t3); - t7 = _mm_sub_ps(t2, t3); - *r2 = _mm_shuffle_ps(t4, t5, _MM_SHUFFLE(3,2,3,2)); - *r3 = _mm_shuffle_ps(t6, t7, _MM_SHUFFLE(3,2,3,2)); + V *r0, V *r1, V *r2, V *r3) { + V t0, t1, t2, t3, t4, t5, t6, t7; + t0 = VLD(i0); t1 = VLD(i1); t6 = VLD(i2); t7 = VLD(i3); + t2 = VBLEND(t6, t7); + t3 = VBLEND(t7, t6); + t4 = VADD(t0, t1); + t5 = VSUB(t0, t1); + t6 = VADD(t2, t3); + t7 = VSUB(t2, t3); + *r2 = VUNPACKHI(t4, t5); + *r3 = VUNPACKHI(t6, t7); t7 = IMULI(t7); - t0 = _mm_add_ps(t4, t6); - t2 = _mm_sub_ps(t4, t6); - t1 = _mm_sub_ps(t5, t7); - t3 = _mm_add_ps(t5, t7); - *r0 = _mm_shuffle_ps(t0, t1, _MM_SHUFFLE(1,0,1,0)); - *r1 = _mm_shuffle_ps(t2, t3, _MM_SHUFFLE(1,0,1,0)); + t0 = VADD(t4, t6); + t2 = VSUB(t4, t6); + t1 = VSUB(t5, t7); + t3 = VADD(t5, t7); + *r0 = VUNPACKLO(t0, t1); + *r1 = VUNPACKLO(t2, t3); } __INLINE void LEAF_OE(size_t ** restrict is, const data_t * restrict in, size_t ** restrict out_offsets, data_t * restrict out) { - __m128 r0_1,r2_3,r4_5,r6_7,r8_9,r10_11,r12_13,r14_15,r16_17,r18_19,r20_21,r22_23,r24_25,r26_27,r28_29,r30_31; + V r0_1,r2_3,r4_5,r6_7,r8_9,r10_11,r12_13,r14_15,r16_17,r18_19,r20_21,r22_23,r24_25,r26_27,r28_29,r30_31; data_t *out0 = out + (*out_offsets)[0]; data_t *out1 = out + (*out_offsets)[1]; @@ -273,7 +357,7 @@ LEAF_OE(size_t ** restrict is, const data_t * restrict in, size_t ** restrict ou __INLINE void LEAF_EO(size_t ** restrict is, const data_t * restrict in, size_t ** restrict out_offsets, data_t * restrict out) { - __m128 r0_1,r2_3,r4_5,r6_7,r8_9,r10_11,r12_13,r14_15,r16_17,r18_19,r20_21,r22_23,r24_25,r26_27,r28_29,r30_31; + V r0_1,r2_3,r4_5,r6_7,r8_9,r10_11,r12_13,r14_15,r16_17,r18_19,r20_21,r22_23,r24_25,r26_27,r28_29,r30_31; data_t *out0 = out + (*out_offsets)[0]; data_t *out1 = out + (*out_offsets)[1]; @@ -295,6 +379,180 @@ LEAF_EO(size_t ** restrict is, const data_t * restrict in, size_t ** restrict ou *is += 16; } +#ifdef __ARM_NEON__ +__INLINE void +LEAF_OE8(size_t ** restrict is, const data_t * restrict in, size_t ** restrict out_offsets, data_t * restrict out) { + V r0_1,r2_3,r4_5,r6_7,r8_9,r10_11,r12_13,r14_15; + data_t *out0 = out + (*out_offsets)[0]; + data_t *out1 = out + (*out_offsets)[1]; + *out_offsets += 2; + + L_4_2(in+(*is)[0],in+(*is)[1],in+(*is)[2],in+(*is)[3],&r0_1,&r2_3,&r12_13,&r14_15); + L_4_4(in+(*is)[4],in+(*is)[5],in+(*is)[6],in+(*is)[7],&r4_5,&r6_7,&r8_9,&r10_11); + S_4_1(r0_1,r2_3,r4_5,r6_7,out0+0,out0+4,out0+8,out0+12); + K_N(LEAFLUT[6],LEAFLUT[7],&r8_9,&r10_11,&r12_13,&r14_15); + S_4_2(r8_9,r10_11,r12_13,r14_15,out1+0,out1+4,out1+8,out1+12); + *is += 8; +} +__INLINE void +LEAF_EO8(size_t ** restrict is, const data_t * restrict in, size_t ** restrict out_offsets, data_t * restrict out) { + V r0_1,r2_3,r4_5,r6_7,r8_9,r10_11,r12_13,r14_15; + data_t *out0 = out + (*out_offsets)[0]; + data_t *out1 = out + (*out_offsets)[1]; + *out_offsets += 2; + + L_4_4(in+(*is)[0],in+(*is)[1],in+(*is)[2],in+(*is)[3],&r0_1,&r2_3,&r8_9,&r10_11); + L_2_4(in+(*is)[4],in+(*is)[5],in+(*is)[6],in+(*is)[7],&r4_5,&r6_7,&r14_15,&r12_13); + S_4_1(r8_9,r10_11,r12_13,r14_15,out1+0,out1+4,out1+8,out1+12); + K_N(LEAFLUT[6],LEAFLUT[7],&r0_1,&r2_3,&r4_5,&r6_7); + 
S_4_2(r0_1,r2_3,r4_5,r6_7,out0+0,out0+4,out0+8,out0+12); + + *is += 8; +} +#endif +__INLINE void +firstpass_32(const data_t * restrict in, data_t * restrict out, ffts_plan_t * restrict p) { + V r0_1,r2_3,r4_5,r6_7,r8_9,r10_11,r12_13,r14_15,r16_17,r18_19,r20_21,r22_23,r24_25,r26_27,r28_29,r30_31; + float *LUT8 = p->ws[0]; + float *LUT16 = p->ws[1]; + float *LUT32 = p->ws[2]; + + L_4_4(in+0,in+32,in+16,in+48,&r0_1,&r2_3,&r16_17,&r18_19); + L_2_2(in+8,in+40,in+56,in+24,&r4_5,&r6_7,&r20_21,&r22_23); + K_N(VLD(LUT8),VLD(LUT8+4),&r0_1,&r2_3,&r4_5,&r6_7); + L_4_2(in+4,in+36,in+20,in+52,&r8_9,&r10_11,&r28_29,&r30_31); + L_4_4(in+60,in+28,in+12,in+44,&r12_13,&r14_15,&r24_25,&r26_27); + K_N(VLD(LUT16),VLD(LUT16+4),&r0_1,&r4_5,&r8_9,&r12_13); + K_N(VLD(LUT16+8),VLD(LUT16+12),&r2_3,&r6_7,&r10_11,&r14_15); + K_N(VLD(LUT8),VLD(LUT8+4),&r16_17,&r18_19,&r20_21,&r22_23); + K_N(VLD(LUT8),VLD(LUT8+4),&r24_25,&r26_27,&r28_29,&r30_31); + K_N(VLD(LUT32),VLD(LUT32+4),&r0_1,&r8_9,&r16_17,&r24_25); + S_4(r0_1,r8_9,r16_17,r24_25,out+0,out+16,out+32,out+48); + K_N(VLD(LUT32+8),VLD(LUT32+12),&r2_3,&r10_11,&r18_19,&r26_27); + S_4(r2_3,r10_11,r18_19,r26_27,out+4,out+20,out+36,out+52); + K_N(VLD(LUT32+16),VLD(LUT32+20),&r4_5,&r12_13,&r20_21,&r28_29); + S_4(r4_5,r12_13,r20_21,r28_29,out+8,out+24,out+40,out+56); + K_N(VLD(LUT32+24),VLD(LUT32+28),&r6_7,&r14_15,&r22_23,&r30_31); + S_4(r6_7,r14_15,r22_23,r30_31,out+12,out+28,out+44,out+60); + +} + +__INLINE void +firstpass_16(const data_t * restrict in, data_t * restrict out, ffts_plan_t * restrict p) { + V r0_1,r2_3,r4_5,r6_7,r8_9,r10_11,r12_13,r14_15; + float *LUT8 = p->ws[0]; + float *LUT16 = p->ws[1]; + + L_4_4(in+0,in+16,in+8,in+24,&r0_1,&r2_3,&r8_9,&r10_11); + L_2_4(in+4,in+20,in+28,in+12,&r4_5,&r6_7,&r14_15,&r12_13); + K_N(VLD(LUT8),VLD(LUT8+4),&r0_1,&r2_3,&r4_5,&r6_7); + K_N(VLD(LUT16),VLD(LUT16+4),&r0_1,&r4_5,&r8_9,&r12_13); + S_4(r0_1,r4_5,r8_9,r12_13,out+0,out+8,out+16,out+24); + K_N(VLD(LUT16+8),VLD(LUT16+12),&r2_3,&r6_7,&r10_11,&r14_15); + S_4(r2_3,r6_7,r10_11,r14_15,out+4,out+12,out+20,out+28); +} +__INLINE void +firstpass_8(const data_t * restrict in, data_t * restrict out, ffts_plan_t * restrict p) { + V r0_1,r2_3,r4_5,r6_7; + float *LUT8 = p->ws[0]; + L_4_2(in+0,in+8,in+4,in+12,&r0_1,&r2_3,&r4_5,&r6_7); + K_N(VLD(LUT8),VLD(LUT8+4),&r0_1,&r2_3,&r4_5,&r6_7); + S_4(r0_1,r2_3,r4_5,r6_7,out+0,out+4,out+8,out+12); +} +__INLINE void +firstpass_4_f(const data_t * restrict in, data_t * restrict out, ffts_plan_t * restrict p) { + cdata_t *i = (cdata_t *)in, *o = (cdata_t *)out; + cdata_t t0, t1, t2, t3, t4, t5, t6, t7; + t0 = i[0]; t1 = i[2]; t2 = i[1]; t3 = i[3]; + t4 = t0 + t1; + t5 = t0 - t1; + t6 = t2 + t3; + t7 = (t2 - t3); + t7 = (creal(t7))*I - (cimag(t7)); + o[0] = t4 + t6; + o[2] = t4 - t6; + o[1] = t5 - t7; + o[3] = t5 + t7; +} +__INLINE void +firstpass_4_b(const data_t * restrict in, data_t * restrict out, ffts_plan_t * restrict p) { + cdata_t *i = (cdata_t *)in, *o = (cdata_t *)out; + cdata_t t0, t1, t2, t3, t4, t5, t6, t7; + t0 = i[0]; t1 = i[2]; t2 = i[1]; t3 = i[3]; + t4 = t0 + t1; + t5 = t0 - t1; + t6 = t2 + t3; + t7 = (t2 - t3); + t7 = -(creal(t7))*I + (cimag(t7)); + o[0] = t4 + t6; + o[2] = t4 - t6; + o[1] = t5 - t7; + o[3] = t5 + t7; +} +__INLINE void +firstpass_2(const data_t * restrict in, data_t * restrict out, ffts_plan_t * restrict p) { + cdata_t t0, t1, r0,r1; + t0 = ((cdata_t *)in)[0]; t1 = ((cdata_t *)in)[1]; + r0 = t0 + t1; r1 = t0 - t1; + ((cdata_t *)out)[0] = r0; + ((cdata_t *)out)[1] = r1; +} + +__INLINE void X_8(data_t * restrict data0, 
size_t N, const data_t * restrict LUT) { + data_t *data2 = data0 + 2*N/4; + data_t *data4 = data0 + 4*N/4; + data_t *data6 = data0 + 6*N/4; + data_t *data1 = data0 + 1*N/4; + data_t *data3 = data0 + 3*N/4; + data_t *data5 = data0 + 5*N/4; + data_t *data7 = data0 + 7*N/4; + size_t k, n4 = N/4; + + for(k=N/8/2;k>0;--k) { + V r0, r1, r2, r3, r4, r5, r6, r7; + r0 = VLD(data0); + r1 = VLD(data1); + r2 = VLD(data2); + r3 = VLD(data3); + K_N(VLD(LUT), VLD(LUT+4), &r0, &r1, &r2, &r3); + r4 = VLD(data4); + r6 = VLD(data6); + K_N(VLD(LUT+8), VLD(LUT+12), &r0, &r2, &r4, &r6); + r5 = VLD(data5); + r7 = VLD(data7); + K_N(VLD(LUT+16), VLD(LUT+20), &r1, &r3, &r5, &r7); + LUT += 24; + VST(data0, r0); data0 += 4; + VST(data1, r1); data1 += 4; + VST(data2, r2); data2 += 4; + VST(data3, r3); data3 += 4; + VST(data4, r4); data4 += 4; + VST(data5, r5); data5 += 4; + VST(data6, r6); data6 += 4; + VST(data7, r7); data7 += 4; + } +} + +__INLINE void X_4(data_t * restrict data, size_t N, const data_t * restrict LUT) { + + size_t i; + for(i=0;i<N/4/2;i++) { + V uk = VLD(data); + V uk2 = VLD(data + 2*N/4); + V zk_p = VLD(data + 4*N/4); + V zk_n = VLD(data + 6*N/4); + + K_N(VLD(LUT), VLD(LUT+4), &uk, &uk2, &zk_p, &zk_n); + + VST(data, uk); + VST(data + 2*N/4, uk2); + VST(data + 4*N/4, zk_p); + VST(data + 6*N/4, zk_n); + + LUT += 8; + data += 4; + } +} #endif diff --git a/src/neon_float.h b/src/neon_float.h new file mode 100644 index 0000000..220219f --- /dev/null +++ b/src/neon_float.h @@ -0,0 +1,1037 @@ +#ifndef __NEON_FLOAT_H__ +#define __NEON_FLOAT_H__ + +#include <arm_neon.h> + +//#define VL 4 +#define __INLINE static inline __attribute__((always_inline)) + +typedef float32x4_t V; + +typedef float32x4x2_t VS; + +#define ADD vaddq_f32 +#define SUB vsubq_f32 +#define MUL vmulq_f32 +#define VADD vaddq_f32 +#define VSUB vsubq_f32 +#define VMUL vmulq_f32 +#define VXOR(x,y) (vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(x), vreinterpretq_u32_f32(y)))) +#define VST vst1q_f32 +#define VLD vld1q_f32 +#define VST2 vst2q_f32 +#define VLD2 vld2q_f32 + +#define VSWAPPAIRS(x) (vrev64q_f32(x)) + +#define VUNPACKHI(a,b) (vcombine_f32(vget_high_f32(a), vget_high_f32(b))) +#define VUNPACKLO(a,b) (vcombine_f32(vget_low_f32(a), vget_low_f32(b))) + +#define VBLEND(x,y) (vcombine_f32(vget_low_f32(x), vget_high_f32(y))) + +static inline V VLIT4(data_t f3, data_t f2, data_t f1, data_t f0) { + data_t __attribute__ ((aligned(16))) d[4] = {f0, f1, f2, f3}; + return VLD(d); +} + +#define VDUPRE(r) vcombine_f32(vdup_lane_f32(vget_low_f32(r),0), vdup_lane_f32(vget_high_f32(r),0)) +#define VDUPIM(r) vcombine_f32(vdup_lane_f32(vget_low_f32(r),1), vdup_lane_f32(vget_high_f32(r),1)) + +#define FFTS_MALLOC(d,a) (valloc(d)) +#define FFTS_FREE(d) (free(d)) +__INLINE void FMA(V *Rd, V Rn, V Rm) { + __asm__ ("vmla.f32 %q0,%q1,%q2\n\t" + : "+w" (*Rd) + : "w" (Rn), "w" (Rm) + //: "0" + ); + +} +__INLINE void FMS(V *Rd, V Rn, V Rm) { + __asm__ ("vmls.f32 %q0,%q1,%q2\n\t" + : "+w" (*Rd) + : "w" (Rn), "w" (Rm) + // : "0" + ); +} + +__INLINE VS VSMUL(VS *d, VS *w) { + VS t; + t.val[0] = vmulq_f32(d->val[0], w->val[0]); + t.val[1] = vmulq_f32(d->val[0], w->val[1]); +// t.val[0] = vmlsq_f32(t.val[0], d->val[1], w->val[1]); +// t.val[1] = vmlaq_f32(t.val[1], d->val[1], w->val[0]); + FMS(&t.val[0], d->val[1], w->val[1]); + FMA(&t.val[1], d->val[1], w->val[0]); + return t; +} +__INLINE VS VSMULJ(VS *d, VS *w) { + VS t; + t.val[0] = vmulq_f32(d->val[0], w->val[0]); + t.val[1] = vmulq_f32(d->val[1], w->val[0]); +// t.val[0] = vmlaq_f32(t.val[0], d->val[1], 
w->val[1]); +// t.val[1] = vmlsq_f32(t.val[1], d->val[0], w->val[1]); + FMA(&t.val[0], d->val[1], w->val[1]); + FMS(&t.val[1], d->val[0], w->val[1]); + return t; +} +__INLINE VS VSADD(VS *a, VS *b) { + VS r; + r.val[0] = vaddq_f32(a->val[0], b->val[0]); + r.val[1] = vaddq_f32(a->val[1], b->val[1]); + return r; +} +__INLINE VS VSSUB(VS *a, VS *b) { + VS r; + r.val[0] = vsubq_f32(a->val[0], b->val[0]); + r.val[1] = vsubq_f32(a->val[1], b->val[1]); + return r; +} +__INLINE VS VSSUB_MULI(VS *a, VS *b) { + VS r; + r.val[0] = vaddq_f32(a->val[0], b->val[1]); + r.val[1] = vsubq_f32(a->val[1], b->val[0]); + return r; +} +__INLINE VS VSADD_MULI(VS *a, VS *b) { + VS r; + r.val[0] = vsubq_f32(a->val[0], b->val[1]); + r.val[1] = vaddq_f32(a->val[1], b->val[0]); + return r; +} + +__INLINE void VSK_N(VS w, VS *r0, VS *r1, VS *r2, VS *r3) { + VS uk, uk2, zk_p, zk_n, zk, zk_d; + uk = *r0; uk2 = *r1; + zk_p = VSMUL(r2, &w); + zk_n = VSMULJ(r3, &w); + + zk = VSADD(&zk_p, &zk_n); + zk_d = VSSUB(&zk_p, &zk_n); + + *r2 = VSSUB(&uk, &zk); + *r0 = VSADD(&uk, &zk); + *r3 = VSADD_MULI(&uk2, &zk_d); + *r1 = VSSUB_MULI(&uk2, &zk_d); +} + + +__INLINE float32x2x2_t HVS_ADD(float32x2x2_t a, float32x2x2_t b) { + float32x2x2_t rval; + rval.val[0] = vadd_f32(a.val[0], b.val[0]); + rval.val[1] = vadd_f32(a.val[1], b.val[1]); + return rval; +} +__INLINE float32x2x2_t HVS_SUB(float32x2x2_t a, float32x2x2_t b) { + float32x2x2_t rval; + rval.val[0] = vsub_f32(a.val[0], b.val[0]); + rval.val[1] = vsub_f32(a.val[1], b.val[1]); + return rval; +} +__INLINE float32x2x2_t HVS_SUB_MULI(float32x2x2_t a, float32x2x2_t b) { + float32x2x2_t rval; + rval.val[0] = vadd_f32(a.val[0], b.val[1]); + rval.val[1] = vsub_f32(a.val[1], b.val[0]); + return rval; +} +__INLINE float32x2x2_t HVS_ADD_MULI(float32x2x2_t a, float32x2x2_t b) { + float32x2x2_t rval; + rval.val[0] = vsub_f32(a.val[0], b.val[1]); + rval.val[1] = vadd_f32(a.val[1], b.val[0]); + return rval; +} +__INLINE float32x2x2_t HVS_MUL(float32x2x2_t d, float32x2x2_t w) { + float32x2x2_t t; + t.val[0] = vmul_f32(d.val[0], w.val[0]); + t.val[1] = vmul_f32(d.val[0], w.val[1]); + t.val[0] = vmls_f32(t.val[0], d.val[1], w.val[1]); + t.val[1] = vmla_f32(t.val[1], d.val[1], w.val[0]); + return t; +} +__INLINE float32x2x2_t HVS_MULJ(float32x2x2_t d, float32x2x2_t w) { + float32x2x2_t t; + t.val[0] = vmul_f32(d.val[0], w.val[0]); + t.val[1] = vmul_f32(d.val[1], w.val[0]); + t.val[0] = vmla_f32(t.val[0], d.val[1], w.val[1]); + t.val[1] = vmls_f32(t.val[1], d.val[0], w.val[1]); + return t; +} +__INLINE void HVS_K_N(float32x2x2_t w, float32x2x2_t *r0, float32x2x2_t *r1, float32x2x2_t *r2, float32x2x2_t *r3) { + float32x2x2_t uk, uk2, zk_p, zk_n, zk, zk_d; + uk = *r0; uk2 = *r1; + zk_p = HVS_MUL(*r2, w); + zk_n = HVS_MULJ(*r3, w); + zk = HVS_ADD(zk_p, zk_n); + zk_d = HVS_SUB(zk_p, zk_n); + + *r2 = HVS_SUB(uk, zk); + *r0 = HVS_ADD(uk, zk); + *r3 = HVS_ADD_MULI(uk2, zk_d); + *r1 = HVS_SUB_MULI(uk2, zk_d); +} + +typedef union { + float32x4_t f32x4; + float32x2x2_t f32x2x2; +} float_mixed_t; + +__INLINE void VSWP(float32x2x2_t *a, float32x2x2_t *b) { +//float32x2_t tmp = a->val[1]; +//a->val[1] = b->val[0]; +//b->val[0] = tmp; + __asm__ ("vswp %0,%1\n\t" + : "+w" (a->val[1]), "+w" (b->val[0]) + : + ); +} + +static const __attribute__ ((aligned(16))) float ee_w_data[4] = {0.70710678118654757273731092936941,0.70710678118654746171500846685376, + -0.70710678118654757273731092936941,-0.70710678118654746171500846685376}; +__INLINE void LEAF_EE8_SPLIT(size_t ** restrict is, const data_t * restrict in, size_t 
+__INLINE void LEAF_EE8_SPLIT(size_t ** restrict is, const data_t * restrict in, size_t ** restrict out_offsets, data_t * restrict out) {
+	data_t *out0 = out + (*out_offsets)[0];
+	data_t *out1 = out + (*out_offsets)[1];
+	*out_offsets += 2;
+
+	float32x2x2_t r0, r1, r2, r3, r4, r5, r6, r7;
+	float32x2x2_t t0, t1, t2, t3, t4, t5, t6, t7;
+
+	t0 = vld2_f32(in + (*is)[0]); t1 = vld2_f32(in + (*is)[1]); t2 = vld2_f32(in + (*is)[2]); t3 = vld2_f32(in + (*is)[3]);
+
+	t4 = HVS_ADD (t0, t1);
+	t5 = HVS_SUB (t0, t1);
+	t6 = HVS_ADD (t2, t3);
+	t7 = HVS_SUB (t2, t3);
+	r0 = HVS_ADD (t4, t6);
+	r2 = HVS_SUB (t4, t6);
+	r1 = HVS_SUB_MULI(t5, t7);
+	r3 = HVS_ADD_MULI(t5, t7);
+
+	t0 = vld2_f32(in + (*is)[4]); t1 = vld2_f32(in + (*is)[5]); t2 = vld2_f32(in + (*is)[6]); t3 = vld2_f32(in + (*is)[7]);
+	r4 = HVS_ADD (t0, t1);
+	r5 = HVS_SUB (t0, t1);
+	r6 = HVS_ADD (t2, t3);
+	r7 = HVS_SUB (t2, t3);
+	t0 = r0; t1 = r2;
+	t2 = HVS_ADD(r4, r6);
+	t3 = HVS_SUB(r4, r6);
+	r0 = HVS_ADD(t0, t2);
+	r4 = HVS_SUB(t0, t2);
+	r2 = HVS_SUB_MULI(t1, t3);
+	r6 = HVS_ADD_MULI(t1, t3);
+
+	float32x4_t w = vld1q_f32(ee_w_data);
+	float32x2x2_t ww;
+	ww.val[0] = vget_low_f32(w);
+	ww.val[1] = vget_high_f32(w);
+
+	HVS_K_N(ww,&r1,&r3,&r5,&r7);
+
+//vst2_f32(out0, r0);
+//vst2_f32(out0+4, r2);
+//vst2_f32(out0+8, r4);
+//vst2_f32(out0+12, r6);
+
+//vst2_f32(out1, r1);
+//vst2_f32(out1+4, r3);
+//vst2_f32(out1+8, r5);
+//vst2_f32(out1+12, r7);
+
+	float32x2x2_t tt0, tt1, tt2, tt3, tt4, tt5, tt6, tt7;
+
+	tt0 = vtrn_f32(r0.val[0], r0.val[1]);
+	tt1 = vtrn_f32(r1.val[0], r1.val[1]);
+	tt2 = vtrn_f32(r2.val[0], r2.val[1]);
+	tt3 = vtrn_f32(r3.val[0], r3.val[1]);
+	tt4 = vtrn_f32(r4.val[0], r4.val[1]);
+	tt5 = vtrn_f32(r5.val[0], r5.val[1]);
+	tt6 = vtrn_f32(r6.val[0], r6.val[1]);
+	tt7 = vtrn_f32(r7.val[0], r7.val[1]);
+
+//VSWP(&tt0.f32x2x2, &tt1.f32x2x2);
+//VSWP(&tt2.f32x2x2, &tt3.f32x2x2);
+//VSWP(&tt4.f32x2x2, &tt5.f32x2x2);
+//VSWP(&tt6.f32x2x2, &tt7.f32x2x2);
+
+	float32x4_t z0, z1, z2, z3, z4, z5, z6, z7;
+
+	z0 = vcombine_f32(tt0.val[0], tt1.val[0]);
+	z1 = vcombine_f32(tt0.val[1], tt1.val[1]);
+	z2 = vcombine_f32(tt2.val[0], tt3.val[0]);
+	z3 = vcombine_f32(tt2.val[1], tt3.val[1]);
+	z4 = vcombine_f32(tt4.val[0], tt5.val[0]);
+	z5 = vcombine_f32(tt4.val[1], tt5.val[1]);
+	z6 = vcombine_f32(tt6.val[0], tt7.val[0]);
+	z7 = vcombine_f32(tt6.val[1], tt7.val[1]);
+
+
+	vst1q_f32(out0, z0);
+	vst1q_f32(out0+4, z2);
+	vst1q_f32(out0+8, z4);
+	vst1q_f32(out0+12, z6);
+
+	vst1q_f32(out1, z1);
+	vst1q_f32(out1+4, z3);
+	vst1q_f32(out1+8, z5);
+	vst1q_f32(out1+12, z7);
+/*
+	vst1_f32(out0, tt0.val[0]);
+	vst1_f32(out0+2, tt1.val[0]);
+	vst1_f32(out0+4, tt2.val[0]);
+	vst1_f32(out0+6, tt3.val[0]);
+	vst1_f32(out0+8, tt4.val[0]);
+	vst1_f32(out0+10, tt5.val[0]);
+	vst1_f32(out0+12, tt6.val[0]);
+	vst1_f32(out0+14, tt7.val[0]);
+
+	vst1_f32(out1, tt0.val[1]);
+	vst1_f32(out1+2, tt1.val[1]);
+	vst1_f32(out1+4, tt2.val[1]);
+	vst1_f32(out1+6, tt3.val[1]);
+	vst1_f32(out1+8, tt4.val[1]);
+	vst1_f32(out1+10, tt5.val[1]);
+	vst1_f32(out1+12, tt6.val[1]);
+	vst1_f32(out1+14, tt7.val[1]);
+	*/
+/*
+	float32x4_t rr0 = vcombine_f32(r0.val[0], r0.val[1]);
+	float32x4_t rr1 = vcombine_f32(r1.val[0], r1.val[1]);
+	float32x4_t rr2 = vcombine_f32(r2.val[0], r2.val[1]);
+	float32x4_t rr3 = vcombine_f32(r3.val[0], r3.val[1]);
+
+	float32x4x2_t tmp0, tmp1, tmp2, tmp3;
+	tmp0 = vtrnq_f32(rr0, rr2);
+	tmp1 = vtrnq_f32(rr1, rr3);
+
+
+	float32x2x2_t v0, v1, v2, v3;
+	v0.val[0] = vget_low_f32(tmp0.val[0]);
+	v0.val[1] = vget_high_f32(tmp0.val[0]);
+	v1.val[0] = vget_low_f32(tmp0.val[1]);
+	v1.val[1] = vget_high_f32(tmp0.val[1]);
+	v2.val[0] = vget_low_f32(tmp1.val[0]);
+	v2.val[1] = vget_high_f32(tmp1.val[0]);
+	v3.val[0] = vget_low_f32(tmp1.val[1]);
+	v3.val[1] = vget_high_f32(tmp1.val[1]);
+
+	tmp2.val[0] = tmp0.val[0];
+	tmp2.val[1] = tmp1.val[0];
+	tmp3.val[0] = tmp0.val[1];
+	tmp3.val[1] = tmp1.val[1];
+
+//vst2q_f32(out0 , tmp2);
+//vst2q_f32(out1 , tmp3);
+	vst2_f32(out0, v0);
+	vst2_f32(out0+4, v1);
+	vst2_f32(out1, v2);
+	vst2_f32(out1+4, v3);
+
+	float32x4_t rr4 = vcombine_f32(r4.val[0], r4.val[1]);
+	float32x4_t rr5 = vcombine_f32(r5.val[0], r5.val[1]);
+	float32x4_t rr6 = vcombine_f32(r6.val[0], r6.val[1]);
+	float32x4_t rr7 = vcombine_f32(r7.val[0], r7.val[1]);
+
+	tmp0 = vtrnq_f32(rr4, rr6);
+	tmp1 = vtrnq_f32(rr5, rr7);
+
+	tmp2.val[0] = tmp0.val[0];
+	tmp2.val[1] = tmp1.val[0];
+	tmp3.val[0] = tmp0.val[1];
+	tmp3.val[1] = tmp1.val[1];
+	v0.val[0] = vget_low_f32(tmp0.val[0]);
+	v0.val[1] = vget_high_f32(tmp0.val[0]);
+	v1.val[0] = vget_low_f32(tmp0.val[1]);
+	v1.val[1] = vget_high_f32(tmp0.val[1]);
+	v2.val[0] = vget_low_f32(tmp1.val[0]);
+	v2.val[1] = vget_high_f32(tmp1.val[0]);
+	v3.val[0] = vget_low_f32(tmp1.val[1]);
+	v3.val[1] = vget_high_f32(tmp1.val[1]);
+	vst2_f32(out0+8, v0);
+	vst2_f32(out0+12, v1);
+	vst2_f32(out1+8, v2);
+	vst2_f32(out1+12, v3);
+
+//vst2q_f32(out0 + 8, tmp2);
+//vst2q_f32(out1 + 8, tmp3);
+//vst1q_f32(out0+8, tmp0.val[0]);
+//vst1q_f32(out0+12,tmp0.val[1]);
+//vst1q_f32(out1+8, tmp1.val[0]);
+//vst1q_f32(out1+12,tmp1.val[1]);
+	*/
+	*is += 8;
+}
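LEAF_EE8_SPLIT gathers eight complex inputs through the precomputed index list is, applies a size-8 butterfly, and scatters the results to two output blocks in split (separate real/imaginary) format. Its HVS_* call pattern is two radix-4 stages; stripped of the SIMD packing, one stage is (a sketch; the radix4 name is illustrative):

#include <complex.h>

/* One radix-4 DIT stage, as expressed by the HVS_ADD/HVS_SUB/*_MULI calls above. */
static void radix4(float complex *r0, float complex *r1,
                   float complex *r2, float complex *r3) {
	float complex t4 = *r0 + *r1, t5 = *r0 - *r1;
	float complex t6 = *r2 + *r3, t7 = *r2 - *r3;
	*r0 = t4 + t6;
	*r2 = t4 - t6;
	*r1 = t5 - I*t7;  /* HVS_SUB_MULI */
	*r3 = t5 + I*t7;  /* HVS_ADD_MULI */
}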
+
+__INLINE void STORESPR(data_t * addr, VS p) {
+	__asm__ __volatile__ ("vst1.32 {%q1,%q2}, [%0, :128]\n\t"
+	                      :
+	                      : "r" (addr), "w" (p.val[0]), "w" (p.val[1])
+	                      : "memory");
+}
+__INLINE void STORESPRI(data_t * restrict * addr, V p0, V p1) {
+	__asm__ __volatile__ ("vst1.32 {%q1,%q2}, [%0, :128]!\n\t"
+	                      : "+r" (*addr)
+	                      : "w" (p0), "w" (p1)
+	                      : "memory");
+}
+__INLINE void STORESPRI0(data_t * restrict *addr, VS r) {
+	register V p0 __asm__ ("q0") = r.val[0];
+	register V p1 __asm__ ("q1") = r.val[1];
+	__asm__ __volatile__ ("vst1.32 {%q1,%q2}, [%0, :128]!\n\t"
+	                      : "+r" (*addr)
+	                      : "w" (p0), "w" (p1)
+	                      : "memory");
+	//STORESPRI(addr, p0, p1);
+}
+__INLINE void STORESPRI1(data_t **addr, VS r) {
+	register V p0 __asm__ ("q2") = r.val[0];
+	register V p1 __asm__ ("q3") = r.val[1];
+	__asm__ __volatile__ ("vst1.32 {%q1,%q2}, [%0, :128]!\n\t"
+	                      : "+r" (*addr)
+	                      : "w" (p0), "w" (p1)
+	                      : "memory");
+	//STORESPRI(addr, p0, p1);
+}
+__INLINE void STORESPRI2(data_t **addr, VS r) {
+	register V p0 __asm__ ("q4") = r.val[0];
+	register V p1 __asm__ ("q5") = r.val[1];
+	__asm__ __volatile__ ("vst1.32 {%q1,%q2}, [%0, :128]!\n\t"
+	                      : "+r" (*addr)
+	                      : "w" (p0), "w" (p1)
+	                      : "memory");
+	//STORESPRI(addr, p0, p1);
+}
+__INLINE void STORESPRI3(data_t **addr, VS r) {
+	register V p0 __asm__ ("q6") = r.val[0];
+	register V p1 __asm__ ("q7") = r.val[1];
+	__asm__ __volatile__ ("vst1.32 {%q1,%q2}, [%0, :128]!\n\t"
+	                      : "+r" (*addr)
+	                      : "w" (p0), "w" (p1)
+	                      : "memory");
+	//STORESPRI(addr, p0, p1);
+}
+__INLINE void STORESPRIT0(data_t * restrict *addr, VS r) {
+	register V p0 __asm__ ("q0") = r.val[0];
+	register V p1 __asm__ ("q1") = r.val[1];
+	__asm__ __volatile__ ("vst2.32 {%q1,%q2}, [%0, :128]!\n\t"
+	                      : "+r" (*addr)
+	                      : "w" (p0), "w" (p1)
+	                      : "memory");
+	//STORESPRI(addr, p0, p1);
+}
+__INLINE void STORESPRIT1(data_t **addr, VS r) {
+	register V p0 __asm__ ("q2") = r.val[0];
+	register V p1 __asm__ ("q3") = r.val[1];
+	__asm__ __volatile__ ("vst2.32 {%q1,%q2}, [%0, :128]!\n\t"
+	                      : "+r" (*addr)
+	                      : "w" (p0), "w" (p1)
+	                      : "memory");
+	//STORESPRI(addr, p0, p1);
+}
+__INLINE void STORESPRIT2(data_t **addr, VS r) {
+	register V p0 __asm__ ("q4") = r.val[0];
+	register V p1 __asm__ ("q5") = r.val[1];
+	__asm__ __volatile__ ("vst2.32 {%q1,%q2}, [%0, :128]!\n\t"
+	                      : "+r" (*addr)
+	                      : "w" (p0), "w" (p1)
+	                      : "memory");
+	//STORESPRI(addr, p0, p1);
+}
+__INLINE void STORESPRIT3(data_t **addr, VS r) {
+	register V p0 __asm__ ("q6") = r.val[0];
+	register V p1 __asm__ ("q7") = r.val[1];
+	__asm__ __volatile__ ("vst2.32 {%q1,%q2}, [%0, :128]!\n\t"
+	                      : "+r" (*addr)
+	                      : "w" (p0), "w" (p1)
+	                      : "memory");
+	//STORESPRI(addr, p0, p1);
+}
+__INLINE void STORESPR0(data_t *addr, VS r) {
+	register V p0 __asm__ ("q0") = r.val[0];
+	register V p1 __asm__ ("q1") = r.val[1];
+	__asm__ __volatile__ ("vst1.32 {%q1,%q2}, [%0, :128]\n\t"
+	                      :
+	                      : "r" (addr), "w" (p0), "w" (p1)
+	                      : "memory");
+}
+__INLINE void STORESPR1(data_t *addr, VS r) {
+	register V p0 __asm__ ("q2") = r.val[0];
+	register V p1 __asm__ ("q3") = r.val[1];
+	__asm__ __volatile__ ("vst1.32 {%q1,%q2}, [%0, :128]\n\t"
+	                      :
+	                      : "r" (addr), "w" (p0), "w" (p1)
+	                      : "memory");
+}
+__INLINE void STORESPR2(data_t *addr, VS r) {
+	register V p0 __asm__ ("q4") = r.val[0];
+	register V p1 __asm__ ("q5") = r.val[1];
+	__asm__ __volatile__ ("vst1.32 {%q1,%q2}, [%0, :128]\n\t"
+	                      :
+	                      : "r" (addr), "w" (p0), "w" (p1)
+	                      : "memory");
+}
+__INLINE void STORESPR3(data_t *addr, VS r) {
+	register V p0 __asm__ ("q6") = r.val[0];
+	register V p1 __asm__ ("q7") = r.val[1];
+	__asm__ __volatile__ ("vst1.32 {%q1,%q2}, [%0, :128]\n\t"
+	                      :
+	                      : "r" (addr), "w" (p0), "w" (p1)
+	                      : "memory");
+}
+__INLINE VS LOADSPR0(data_t *addr) {
+	VS r;
+	register V p0 __asm__ ("q8") ;
+	register V p1 __asm__ ("q9") ;
+	__asm__ __volatile__ ("vld1.32 {%q0,%q1}, [%2, :128]\n\t"
+	                      : "=&w" (p0), "=&w" (p1)
+	                      : "r" (addr)
+	                      );
+	r.val[0] = p0; r.val[1] = p1;
+	return r;
+}
+__INLINE VS LOADSPR1(data_t *addr) {
+	VS r;
+	register V p0 __asm__ ("q10") ;
+	register V p1 __asm__ ("q11") ;
+	__asm__ __volatile__ ("vld1.32 {%q0,%q1}, [%2, :128]\n\t"
+	                      : "=&w" (p0), "=&w" (p1)
+	                      : "r" (addr)
+	                      );
+	r.val[0] = p0; r.val[1] = p1;
+	return r;
+}
+__INLINE VS LOADSPR2(data_t *addr) {
+	VS r;
+	register V p0 __asm__ ("q12") ;
+	register V p1 __asm__ ("q13") ;
+	__asm__ __volatile__ ("vld1.32 {%q0,%q1}, [%2, :128]\n\t"
+	                      : "=&w" (p0), "=&w" (p1)
+	                      : "r" (addr)
+	                      );
+	r.val[0] = p0; r.val[1] = p1;
+	return r;
+}
+__INLINE VS LOADSPR3(data_t *addr) {
+	VS r;
+	register V p0 __asm__ ("q14") ;
+	register V p1 __asm__ ("q15") ;
+	__asm__ __volatile__ ("vld1.32 {%q0,%q1}, [%2, :128]\n\t"
+	                      : "=&w" (p0), "=&w" (p1)
+	                      : "r" (addr)
+	                      );
+	r.val[0] = p0; r.val[1] = p1;
+	return r;
+}
+__INLINE VS LOADSPRI(data_t * restrict * addr) {
+	VS r;
+	register V p0 __asm__ ("q2") ;
+	register V p1 __asm__ ("q3") ;
+	__asm__ __volatile__ ("vld1.32 {%q0,%q1}, [%2, :128]!\n\t"
+	                      : "=&w" (p0), "=&w" (p1), "+r" (*addr)
+	                      :
+	                      );
+	r.val[0] = p0; r.val[1] = p1;
+	return r;
+}
+
+__INLINE void X_4_SPLIT(data_t * restrict data, size_t N, data_t * restrict LUT) {
+
+//size_t i;
+//for(i=0;i<N/4/2/2;i++) {
+	VS uk   = LOADSPR0(data);
+	VS uk2  = LOADSPR1(data + 2*N/4);
+	VS zk_p = LOADSPR2(data + 4*N/4);
+	VS zk_n = LOADSPR3(data + 6*N/4);
+
+	VSK_N(LOADSPRI(&LUT), &uk, &uk2, &zk_p, &zk_n);
+
+	STORESPR0(data, uk);
+	STORESPR1(data + 2*N/4, uk2);
+	STORESPR2(data + 4*N/4, zk_p);
+	STORESPR3(data + 6*N/4, zk_n);
+
+//	LUT += 8;
+//	data += 8;
+//	}
+}
+
+__INLINE void X_8_SPLIT(data_t * restrict data0, size_t N, data_t * restrict LUT) {
+	data_t *data2 = data0 + 2*N/4;
+	data_t *data4 = data0 + 4*N/4;
+	data_t *data6 = data0 + 6*N/4;
+	data_t *data1 = data0 + 1*N/4;
+	data_t *data3 = data0 + 3*N/4;
+	data_t *data5 = data0 + 5*N/4;
+	data_t *data7 = data0 + 7*N/4;
+	size_t k, n4 = N/4;
+
+	for(k=N/8/2/2;k>0;--k) {
+		VS r0, r1, r2, r3, r4, r5, r6, r7,w;
+		r0 = LOADSPR0(data0);
+		r2 = LOADSPR1(data2);
+		r1 = LOADSPR2(data1);
+		r3 = LOADSPR3(data3);
+		VSK_N(LOADSPRI(&LUT), &r0, &r1, &r2, &r3);
+		STORESPR2(data1, r1);
+		STORESPR3(data3, r3);
+		r4 = LOADSPR2(data4);
+		r6 = LOADSPR3(data6);
+		VSK_N(LOADSPRI(&LUT), &r0, &r2, &r4, &r6);
+		STORESPRI0(&data0, r0); //data0 += 8;
+		STORESPRI1(&data2, r2); //data2 += 8;
+		STORESPRI2(&data4, r4); //data4 += 8;
+		STORESPRI3(&data6, r6); //data6 += 8;
+		r1 = LOADSPR0(data1);
+		r3 = LOADSPR1(data3);
+		r5 = LOADSPR2(data5);
+		r7 = LOADSPR3(data7);
+		VSK_N(LOADSPRI(&LUT), &r1, &r3, &r5, &r7);
+	//	LUT += 24;
+		STORESPRI0(&data1, r1); //data1 += 8;
+		STORESPRI1(&data3, r3); //data3 += 8;
+		STORESPRI2(&data5, r5); //data5 += 8;
+		STORESPRI3(&data7, r7); //data7 += 8;
+	}
+}
+
+__INLINE void X_8_SPLIT_T(data_t * restrict data0, size_t N, data_t * restrict LUT) {
+	data_t *data2 = data0 + 2*N/4;
+	data_t *data4 = data0 + 4*N/4;
+	data_t *data6 = data0 + 6*N/4;
+	data_t *data1 = data0 + 1*N/4;
+	data_t *data3 = data0 + 3*N/4;
+	data_t *data5 = data0 + 5*N/4;
+	data_t *data7 = data0 + 7*N/4;
+	size_t k, n4 = N/4;
+
+	for(k=N/8/2/2;k>0;--k) {
+		VS r0, r1, r2, r3, r4, r5, r6, r7,w;
+		r0 = LOADSPR0(data0);
+		r2 = LOADSPR1(data2);
+		r1 = LOADSPR2(data1);
+		r3 = LOADSPR3(data3);
+		VSK_N(LOADSPRI(&LUT), &r0, &r1, &r2, &r3);
+		STORESPR2(data1, r1);
+		STORESPR3(data3, r3);
+		r4 = LOADSPR2(data4);
+		r6 = LOADSPR3(data6);
+		VSK_N(LOADSPRI(&LUT), &r0, &r2, &r4, &r6);
+		STORESPRIT0(&data0, r0); //data0 += 8;
+		STORESPRIT1(&data2, r2); //data2 += 8;
+		STORESPRIT2(&data4, r4); //data4 += 8;
+		STORESPRIT3(&data6, r6); //data6 += 8;
+		r1 = LOADSPR0(data1);
+		r3 = LOADSPR1(data3);
+		r5 = LOADSPR2(data5);
+		r7 = LOADSPR3(data7);
+		VSK_N(LOADSPRI(&LUT), &r1, &r3, &r5, &r7);
+		STORESPRIT0(&data1, r1); //data1 += 8;
+		STORESPRIT1(&data3, r3); //data3 += 8;
+		STORESPRIT2(&data5, r5); //data5 += 8;
+		STORESPRIT3(&data7, r7); //data7 += 8;
+	}
+}
+__INLINE V LOAD2I(const data_t **addr) {
+	float32x4_t o;
+	__asm__ ("vld2.32 {%q0}, [%1, :128]!\n\t"
+	         : "=w" (o), "+r" (*addr)
+	         :
+	         );
+
+	return o;
+}
+__INLINE V LOADI(const data_t **addr) {
+	float32x2_t out0, out1;
+	float32x4_t o;
+
+	__asm__ ("vld1.32 {%q0}, [%1, :128]!\n\t"
+	         : "=w" (o), "+r" (*addr)
+	         :
+	         );
+	return o;
+}
+__INLINE V HSP_MUL(V *d, const V *w) {
+	V t;
+	t = vcombine_f32(vmul_f32(vget_low_f32(*d), vget_low_f32(*w)),
+	                 vmul_f32(vget_low_f32(*d), vget_high_f32(*w)));
+	t = vcombine_f32(vmls_f32(vget_low_f32(t), vget_high_f32(*d), vget_high_f32(*w)),
+	                 vmla_f32(vget_high_f32(t), vget_high_f32(*d), vget_low_f32(*w)));
+	return t;
+}
+__INLINE V HSP_MULJ(V *d, const V *w) {
+	V t;
+	t = vcombine_f32(vmul_f32(vget_low_f32(*d), vget_low_f32(*w)),
+	                 vmul_f32(vget_high_f32(*d), vget_low_f32(*w)));
+	t = vcombine_f32(vmla_f32(vget_low_f32(t), vget_high_f32(*d), vget_high_f32(*w)),
+	                 vmls_f32(vget_high_f32(t), vget_low_f32(*d), vget_high_f32(*w)));
+	return t;
+}
+__INLINE V HSP_SUB_MULI(V *a, V *b) {
+	return vcombine_f32(vadd_f32(vget_low_f32(*a), vget_high_f32(*b)), vsub_f32(vget_high_f32(*a), vget_low_f32(*b)));
+}
+__INLINE V HSP_ADD_MULI(V *a, V *b) {
+	return vcombine_f32(vsub_f32(vget_low_f32(*a), vget_high_f32(*b)), vadd_f32(vget_high_f32(*a), vget_low_f32(*b)));
+}
+
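VSK_N and HVS_K_N above, and K_N_HSP below, are the same split-radix twiddle butterfly specialized for three register layouts; X_4_SPLIT and X_8_SPLIT just stream it over quarter-N strided blocks. Stripped of the SIMD packing, the step is (a scalar sketch; the k_n name is illustrative):

#include <complex.h>

/* Split-radix twiddle butterfly shared by VSK_N / HVS_K_N / K_N_HSP. */
static void k_n(float complex w, float complex *r0, float complex *r1,
                float complex *r2, float complex *r3) {
	float complex zk_p = *r2 * w;         /* MUL:  twiddled term            */
	float complex zk_n = *r3 * conjf(w);  /* MULJ: conjugate-twiddled term  */
	float complex zk   = zk_p + zk_n;
	float complex zk_d = zk_p - zk_n;
	float complex uk = *r0, uk2 = *r1;
	*r0 = uk + zk;
	*r2 = uk - zk;
	*r1 = uk2 - I*zk_d;  /* SUB_MULI */
	*r3 = uk2 + I*zk_d;  /* ADD_MULI */
}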
+__INLINE void K_N_HSP(const V *w, V *r0, V *r1, V *r2, V *r3) {
+	V uk, uk2, zk_p, zk_n, zk, zk_d;
+
+	uk = *r0;
+	uk2 = *r1;
+	zk_p = HSP_MUL(r2, w);
+	zk_n = HSP_MULJ(r3, w);
+	zk   = ADD(zk_p, zk_n);
+	zk_d = SUB(zk_p, zk_n);
+
+	*r2 = SUB(uk, zk);
+	*r0 = ADD(uk, zk);
+	*r3 = HSP_ADD_MULI(&uk2, &zk_d);
+	*r1 = HSP_SUB_MULI(&uk2, &zk_d);
+}
+
+__INLINE void neon_shl8_ee(data_t *restrict out0, data_t *restrict out1,const data_t **restrict i0,const data_t **restrict i1,const data_t **restrict i2,const data_t **restrict i3,const data_t **restrict i4,const data_t **restrict i5,const data_t **restrict i6,const data_t **restrict i7) {
+
+	V r0, r1, r2, r3, r4, r5, r6, r7;
+	V t0, t1, t2, t3, t4, t5, t6, t7;
+
+
+	t0 = LOAD2I(i0);
+	t1 = LOAD2I(i1);
+	t2 = LOAD2I(i2);
+	t3 = LOAD2I(i3);
+	t4 = ADD (t0, t1);
+	t5 = SUB (t0, t1);
+	t6 = ADD (t2, t3);
+	t7 = SUB (t2, t3);
+	r0 = ADD (t4, t6);
+	r2 = SUB (t4, t6);
+	r1 = HSP_SUB_MULI(&t5, &t7);
+	r3 = HSP_ADD_MULI(&t5, &t7);
+
+	t0 = LOAD2I(i4);
+	t1 = LOAD2I(i5);
+	t2 = LOAD2I(i6);
+	t3 = LOAD2I(i7);
+	r4 = ADD (t0, t1);
+	r5 = SUB (t0, t1);
+	r6 = ADD (t2, t3);
+	r7 = SUB (t2, t3);
+
+	t0 = r0; t1 = r2;
+	t2 = ADD(r4, r6);
+	t3 = SUB(r4, r6);
+	r0 = ADD(t0, t2);
+	r4 = SUB(t0, t2);
+	r2 = HSP_SUB_MULI(&t1, &t3);
+	r6 = HSP_ADD_MULI(&t1, &t3);
+
+	V w = vld1q_f32(ee_w_data);
+
+	K_N_HSP(&w,&r1,&r3,&r5,&r7);
+	V uk, uk2, zk, zk_d;
+
+	float32x4x2_t tmp1 = vtrnq_f32(r0, r2);
+	r0 = tmp1.val[0];
+	r2 = tmp1.val[1];
+	float32x4x2_t tmp4 = vtrnq_f32(r1, r3);
+	r1 = tmp4.val[0];
+	r3 = tmp4.val[1];
+	register V tt0 __asm__ ("q0") = r0;
+	register V tt1 __asm__ ("q1") = r1;
+	register V tt2 __asm__ ("q2") = r2;
+	register V tt3 __asm__ ("q3") = r3;
+	__asm__ __volatile__ ("vst2.32 {q0,q1}, [%0, :128]!\n\t" : "+&r" (out0): "w"(tt0), "w"(tt1) : "memory");
+	__asm__ __volatile__ ("vst2.32 {q2,q3}, [%0, :128]!\n\t" : "+&r" (out1): "w"(tt2), "w"(tt3) : "memory");
+
+	float32x4x2_t tmp2 = vtrnq_f32(r4, r6);
+	r4 = tmp2.val[0];
+	r6 = tmp2.val[1];
+	float32x4x2_t tmp3 = vtrnq_f32(r5, r7);
+	r5 = tmp3.val[0];
+	r7 = tmp3.val[1];
+	register V tt4 __asm__ ("q4") = r4;
+	register V tt5 __asm__ ("q5") = r5;
+	register V tt6 __asm__ ("q6") = r6;
+	register V tt7 __asm__ ("q7") = r7;
+
+	__asm__ __volatile__ ("vst2.32 {q4,q5}, [%0, :128]!\n\t" : "+&r" (out0): "w"(tt4), "w"(tt5) : "memory");
+	__asm__ __volatile__ ("vst2.32 {q6,q7}, [%0, :128]!\n\t" : "+&r" (out1): "w"(tt6), "w"(tt7) : "memory");
+
+}
+
+__INLINE void neon_shl8_oo(data_t *restrict out0, data_t *restrict out1,const data_t **restrict i0,const data_t **restrict i1,const data_t **restrict i2,const data_t **restrict i3,const data_t **restrict i4,const data_t **restrict i5,const data_t **restrict i6,const data_t **restrict i7) {
+
+	V r0, r1, r2, r3, r4, r5, r6, r7;
+	V t0, t1, t2, t3, t4, t5, t6, t7;
+
+	t0 = LOAD2I(i0);
+	t1 = LOAD2I(i1);
+	t2 = LOAD2I(i2);
+	t3 = LOAD2I(i3);
+	t4 = ADD (t0, t1);
+	t5 = SUB (t0, t1);
+	t6 = ADD (t2, t3);
+	t7 = SUB (t2, t3);
+	r0 = ADD (t4, t6);
+	r2 = SUB (t4, t6);
+	r1 = HSP_SUB_MULI(&t5, &t7);
+	r3 = HSP_ADD_MULI(&t5, &t7);
+
+	float32x4x2_t tmp1 = vtrnq_f32(r0, r2);
+	r0 = tmp1.val[0];
+	r2 = tmp1.val[1];
+	float32x4x2_t tmp4 = vtrnq_f32(r1, r3);
+	r1 = tmp4.val[0];
+	r3 = tmp4.val[1];
+	register V tt0 __asm__ ("q0") = r0;
+	register V tt1 __asm__ ("q1") = r1;
+	register V tt2 __asm__ ("q2") = r2;
+	register V tt3 __asm__ ("q3") = r3;
+	__asm__ __volatile__ ("vst2.32 {q0,q1}, [%0, :128]!\n\t" : "+&r" (out0): "w"(tt0), "w"(tt1) : "memory");
+	__asm__ __volatile__ ("vst2.32 {q2,q3}, [%0, :128]!\n\t" : "+&r" (out1): "w"(tt2), "w"(tt3) : "memory");
+
+
+
+	t0 = LOAD2I(i4);
+	t1 = LOAD2I(i5);
+	t2 = LOAD2I(i6);
+	t3 = LOAD2I(i7);
+	t4 = ADD (t0, t1);
+	t5 = SUB (t0, t1);
+	t6 = ADD (t2, t3);
+	t7 = SUB (t2, t3);
+	r4 = ADD (t4, t6);
+	r6 = SUB (t4, t6);
+	r5 = HSP_SUB_MULI(&t5, &t7);
+	r7 = HSP_ADD_MULI(&t5, &t7);
+
+	float32x4x2_t tmp2 = vtrnq_f32(r4, r6);
+	r4 = tmp2.val[0];
+	r6 = tmp2.val[1];
+	float32x4x2_t tmp3 = vtrnq_f32(r5, r7);
+	r5 = tmp3.val[0];
+	r7 = tmp3.val[1];
+
+
+	register V tt4 __asm__ ("q4") = r4;
+	register V tt5 __asm__ ("q5") = r5;
+	register V tt6 __asm__ ("q6") = r6;
+	register V tt7 __asm__ ("q7") = r7;
+
+	__asm__ __volatile__ ("vst2.32 {q4,q5}, [%0, :128]!\n\t" : "+&r" (out0): "w"(tt4), "w"(tt5) : "memory");
+	__asm__ __volatile__ ("vst2.32 {q6,q7}, [%0, :128]!\n\t" : "+&r" (out1): "w"(tt6), "w"(tt7) : "memory");
+
+
+
+}
+
+static const __attribute__ ((aligned(16))) data_t eo_w_data[4] = {1.0f,0.70710678118654757273731092936941f, 0.0f,-0.70710678118654746171500846685376};
+
+
+__INLINE void neon_shl8_eo(data_t *restrict out0, data_t *restrict out1,const data_t **restrict i0,const data_t **restrict i1,const data_t **restrict i2,const data_t **restrict i3,const data_t **restrict i4,const data_t **restrict i5,const data_t **restrict i6,const data_t **restrict i7) {
+	/*
+	register V r0_1 __asm__ ("q0");
+	register V r2_3 __asm__ ("q1");
+	register V r4_5 __asm__ ("q2");
+	register V r6_7 __asm__ ("q3");
+	*/
+	const V w = vld1q_f32(eo_w_data);
+
+	V r0_1, r2_3, r4_5, r6_7;
+
+	register V r8_9   __asm__ ("q4");
+	register V r10_11 __asm__ ("q5");
+	register V r12_13 __asm__ ("q6");
+	register V r14_15 __asm__ ("q7");
+
+	{
+	V t0, t1, t2, t3, t4, t5, t6, t7;
+	t0 = LOAD2I(i0);
+	t1 = LOAD2I(i1);
+	t2 = LOAD2I(i2);
+	t3 = LOAD2I(i3);
+	t4 = ADD(t0, t1);
+	t5 = SUB(t0, t1);
+	t6 = ADD(t2, t3);
+	t7 = SUB(t2, t3);
+
+	t0 = ADD(t4, t6);
+	t2 = SUB(t4, t6);
+	t1 = HSP_SUB_MULI(&t5, &t7);
+	t3 = HSP_ADD_MULI(&t5, &t7);
+
+	float32x4x2_t tmp1 = vtrnq_f32(t0, t1);
+	t0 = tmp1.val[0];
+	t1 = tmp1.val[1];
+	float32x4x2_t tmp2 = vtrnq_f32(t2, t3);
+	t2 = tmp2.val[0];
+	t3 = tmp2.val[1];
+
+	r0_1 = t0;
+	r2_3 = t2;
+	r8_9 = t1;
+	r10_11 = t3;
+	__asm__ __volatile__ ("vswp d9,d10\n\t"
+	                      "vst1.32 {d8,d9,d10,d11}, [%0, :128]!\n\t"
+//	                      "vst1.32 {d8,d9}, [%0, :128]!\n\t"
+//	                      "vst1.32 {d10,d11}, [%0, :128]!\n\t"
+	                      : "+&r" (out1)
+	                      : "w" (r8_9), "w" (r10_11)
+	                      : "memory");
+
+	}
+	{
+	V t0, t1, t2, t3, t4, t5, t6, t7;
+	t0 = LOAD2I(i4);
+	t1 = LOAD2I(i5);
+	t2 = LOAD2I(i6);
+	t3 = LOAD2I(i7);
+	//t2 = HALFBLEND(t6, t7);
+	//t3 = HALFBLEND(t7, t6);
+	t4 = ADD(t0, t1);
+	t5 = SUB(t0, t1);
+	t6 = ADD(t2, t3);
+	t7 = SUB(t2, t3);
+	float32x4x2_t tmp1 = vtrnq_f32(t4, t5);
+	r4_5 = tmp1.val[0];
+	float32x4x2_t tmp2 = vtrnq_f32(t6, t7);
+	r6_7 = tmp2.val[0];
+	//t5 = MULI(t5);
+	t0 = ADD(t6, t4);
+	t2 = SUB(t6, t4);
+	t1 = HSP_SUB_MULI(&t7, &t5);
+	t3 = HSP_ADD_MULI(&t7, &t5);
+
+	float32x4x2_t tmp3 = vtrnq_f32(t0, t1);
+	r12_13 = tmp3.val[1];
+	float32x4x2_t tmp4 = vtrnq_f32(t2, t3);
+	r14_15 = tmp4.val[1];
+
+
+	__asm__ __volatile__ ("vswp d13, d14\n\t"
+	                      "vst1.32 {d12,d13,d14,d15}, [%0, :128]!\n\t"
+//	                      "vst1.32 {d12,d13}, [%0, :128]!\n\t"
+//	                      "vst1.32 {d14,d15}, [%0, :128]!\n\t"
+	                      : "+&r" (out1)
+	                      : "w" (r12_13), "w" (r14_15)
+	                      : "memory");
+
+
+	}
+
+	K_N_HSP(&w,&r0_1,&r2_3,&r4_5,&r6_7);
+
+	register V t0 __asm__ ("q0") = r0_1;
+	register V t1 __asm__ ("q1") = r2_3;
+	register V t2 __asm__ ("q2") = r4_5;
+	register V t3 __asm__ ("q3") = r6_7;
+
("vswp d1, d2\n\t" + "vswp d5, d6\n\t" + "vstmia %0!, {q0-q3}\n\t" +// "vst1.32 {d0,d1}, [%0, :128]!\n\t" +// "vst1.32 {d2,d3}, [%0, :128]!\n\t" +// "vst1.32 {d4,d5}, [%0, :128]!\n\t" +// "vst1.32 {d6,d7}, [%0, :128]\n\t" + : "+&r" (out0) + : "w" (t0), "w" (t1), "w" (t2), "w" (t3) + : "memory"); + +} +static const __attribute__ ((aligned(16))) data_t oe_w_data[4] = {1.0f,0.70710678118654757273731092936941f, 0.0f,-0.70710678118654746171500846685376}; + +__INLINE void neon_shl8_oe(data_t *restrict out0, data_t *restrict out1,const data_t **restrict i0,const data_t **restrict i1,const data_t **restrict i2,const data_t **restrict i3,const data_t **restrict i4,const data_t **restrict i5,const data_t **restrict i6,const data_t **restrict i7) { + register V r0_1 __asm__ ("q0"); + register V r2_3 __asm__ ("q1"); + register V r4_5 __asm__ ("q2"); + register V r6_7 __asm__ ("q3"); + + V r8_9, r10_11, r12_13, r14_15; + const V w = vld1q_f32(oe_w_data); + + { + V t0, t1, t2, t3, t4, t5, t6, t7; + t0 = LOAD2I(i0); + t1 = LOAD2I(i1); + t6 = LOADI(i2); + t7 = LOADI(i3); + + float32x2x2_t tmp0 = vtrn_f32(vget_low_f32(t6), vget_high_f32(t7)); + float32x2x2_t tmp1 = vtrn_f32(vget_low_f32(t7), vget_high_f32(t6)); + t2 = vcombine_f32(tmp0.val[0], tmp0.val[1]); + t3 = vcombine_f32(tmp1.val[0], tmp1.val[1]); + + t4 = ADD(t0, t1); + t5 = SUB(t0, t1); + t6 = ADD(t2, t3); + t7 = SUB(t2, t3); + float32x4x2_t tmp2 = vtrnq_f32(t4, t5); + r12_13 = tmp2.val[1]; + float32x4x2_t tmp3 = vtrnq_f32(t6, t7); + r14_15 = tmp3.val[1]; + + t0 = ADD(t4, t6); + t2 = SUB(t4, t6); + t1 = HSP_SUB_MULI(&t5, &t7); + t3 = HSP_ADD_MULI(&t5, &t7); + float32x4x2_t tmp4 = vtrnq_f32(t0, t1); + r0_1 = tmp4.val[0]; + float32x4x2_t tmp5 = vtrnq_f32(t2, t3); + r2_3 = tmp5.val[0]; + __asm__ __volatile__ ("vswp d1, d2\n\t" + "vst1.32 {q0, q1}, [%0, :128]!\n\t" +// "vst1.32 {q1}, [%0, :128]!\n\t" + : "+&r" (out0) + : "w" (r0_1), "w" (r2_3) + : "memory"); + } + { + V t0, t1, t2, t3, t4, t5, t6, t7; + t0 = LOAD2I(i4); + t1 = LOAD2I(i5); + t2 = LOAD2I(i6); + t3 = LOAD2I(i7); + t4 = ADD(t0, t1); + t5 = SUB(t0, t1); + t6 = ADD(t2, t3); + t7 = SUB(t2, t3); + t0 = ADD(t4, t6); + t2 = SUB(t4, t6); + t1 = HSP_SUB_MULI(&t5, &t7); + t3 = HSP_ADD_MULI(&t5, &t7); + + float32x4x2_t tmp0 = vtrnq_f32(t0, t1); + r4_5 = tmp0.val[0]; + r8_9 = tmp0.val[1]; + float32x4x2_t tmp1 = vtrnq_f32(t2, t3); + r6_7 = tmp1.val[0]; + r10_11 = tmp1.val[1]; + + + __asm__ __volatile__ ("vswp d5, d6\n\t" + "vst1.32 {q2, q3}, [%0, :128]!\n\t" +// "vst1.32 {q3}, [%0, :128]!\n\t" + : "+&r" (out0) + : "w" (r4_5), "w" (r6_7) + : "memory"); + + } + + K_N_HSP(&w,&r8_9,&r10_11,&r12_13,&r14_15); + register V t0 __asm__ ("q4") = r8_9; + register V t1 __asm__ ("q5") = r10_11; + register V t2 __asm__ ("q6") = r12_13; + register V t3 __asm__ ("q7") = r14_15; + + __asm__ __volatile__ ("vswp d9, d10\n\t" + "vswp d13, d14\n\t" + "vstmia %0!, {q4-q7}\n\t" +// "vst1.32 {q4}, [%0, :128]!\n\t" +// "vst1.32 {q5}, [%0, :128]!\n\t" +// "vst1.32 {q6}, [%0, :128]!\n\t" +// "vst1.32 {q7}, [%0, :128]\n\t" + : "+&r" (out1) + : "w" (t0), "w" (t1), "w" (t2), "w" (t3) + : "memory"); + + +} +#endif diff --git a/src/patterns.c b/src/patterns.c index 12d9a4c..226a3b6 100644 --- a/src/patterns.c +++ b/src/patterns.c @@ -8,12 +8,12 @@ void permute_addr(int N, int offset, int stride, int *d) { } } -void hardcodedleaf_is_rec(ptrdiff_t **is, int bigN, int N, int poffset, int offset, int stride, int even, int VL) { +void ffts_hardcodedleaf_is_rec(ptrdiff_t **is, int bigN, int N, int poffset, int offset, int stride, int 
diff --git a/src/patterns.c b/src/patterns.c
index 12d9a4c..226a3b6 100644
--- a/src/patterns.c
+++ b/src/patterns.c
@@ -8,12 +8,12 @@ void permute_addr(int N, int offset, int stride, int *d) {
 	}
 }
 
-void hardcodedleaf_is_rec(ptrdiff_t **is, int bigN, int N, int poffset, int offset, int stride, int even, int VL) {
+void ffts_hardcodedleaf_is_rec(ptrdiff_t **is, int bigN, int N, int poffset, int offset, int stride, int even, int VL) {
 	if(N > 4) {
-		hardcodedleaf_is_rec(is, bigN, N/2, poffset, offset, stride + 1, even, VL);
-		if(N/4 >= 4) hardcodedleaf_is_rec(is, bigN, N/4, poffset+(1<<stride),offset+(N/2), stride + 2, 0, VL);
-		if(N/4 >= 4) hardcodedleaf_is_rec(is, bigN, N/4, poffset-(1<<stride),offset+(3*N/4), stride + 2, 0, VL);
+		ffts_hardcodedleaf_is_rec(is, bigN, N/2, poffset, offset, stride + 1, even, VL);
+		if(N/4 >= 4) ffts_hardcodedleaf_is_rec(is, bigN, N/4, poffset+(1<<stride),offset+(N/2), stride + 2, 0, VL);
+		if(N/4 >= 4) ffts_hardcodedleaf_is_rec(is, bigN, N/4, poffset-(1<<stride),offset+(3*N/4), stride + 2, 0, VL);
 		else {
 			int temp = poffset+(1<<stride);
 			if(temp < 0) temp += bigN;
@@ -43,7 +43,7 @@ hardcodedleaf_is_rec(ptrdiff_t **is, int bigN, int N, int poffset, int offs
 	}
 }
 
-void init_is(ffts_plan_t *p, int N, int leafN, int VL) {
+void ffts_init_is(ffts_plan_t *p, int N, int leafN, int VL) {
 	int i, i0 = N/leafN/3+1, i1=N/leafN/3, i2 = N/leafN/3;
 	int stride = log(N/leafN)/log(2);
@@ -53,12 +53,12 @@ void init_is(ffts_plan_t *p, int N, int leafN, int VL) {
 
 	if((N/leafN) % 3 > 1) i1++;
 
-	for(i=0;i<i0;i++) hardcodedleaf_is_rec(&is, N, leafN, i, 0, stride, 1, VL);
+	for(i=0;i<i0;i++) ffts_hardcodedleaf_is_rec(&is, N, leafN, i, 0, stride, 1, VL);
 	for(i=i0;i<i0+i1;i++) {
-		hardcodedleaf_is_rec(&is, N, leafN/2, i, 0, stride+1, 1, VL);
-		hardcodedleaf_is_rec(&is, N, leafN/2, i-(1<<stride), 0, stride+1, 1, VL);
+		ffts_hardcodedleaf_is_rec(&is, N, leafN/2, i, 0, stride+1, 1, VL);
+		ffts_hardcodedleaf_is_rec(&is, N, leafN/2, i-(1<<stride), 0, stride+1, 1, VL);
 	}
-	for(i=0-i2;i<0;i++) hardcodedleaf_is_rec(&is, N, leafN, i, 0, stride, 1, VL);
+	for(i=0-i2;i<0;i++) ffts_hardcodedleaf_is_rec(&is, N, leafN, i, 0, stride, 1, VL);
 
 //for(i=0;i<N/VL;i++) {
@@ -69,15 +69,15 @@ void init_is(ffts_plan_t *p, int N, int leafN, int VL) {
 	p->i0 = i0; p->i1 = i1;
 }
 
-void elaborate_offsets(ptrdiff_t *offsets, int leafN, int N, int ioffset, int ooffset, int stride, int even) {
+void ffts_elaborate_offsets(ptrdiff_t *offsets, int leafN, int N, int ioffset, int ooffset, int stride, int even) {
 	if((even && N == leafN) || (!even && N <= leafN)) {
 		offsets[2*(ooffset/leafN)] = ioffset*2;
 		offsets[2*(ooffset/leafN)+1] = ooffset;
 	}else if(N > 4) {
-		elaborate_offsets(offsets, leafN, N/2, ioffset, ooffset, stride+1, even);
-		elaborate_offsets(offsets, leafN, N/4, ioffset+(1<<stride), ooffset+N/2, stride+2, 0);
+		ffts_elaborate_offsets(offsets, leafN, N/2, ioffset, ooffset, stride+1, even);
+		ffts_elaborate_offsets(offsets, leafN, N/4, ioffset+(1<<stride), ooffset+N/2, stride+2, 0);
 		if(N/4 >= leafN)
-			elaborate_offsets(offsets, leafN, N/4, ioffset-(1<<stride), ooffset+3*N/4, stride+2, 0);
+			ffts_elaborate_offsets(offsets, leafN, N/4, ioffset-(1<<stride), ooffset+3*N/4, stride+2, 0);
 	}
 }
 
@@ -86,11 +86,22 @@ int compare_offsets(const void *a, const void *b) {
 	return ((ptrdiff_t *)a)[0] - ((ptrdiff_t *)b)[0];
 }
 
-void init_offsets(ffts_plan_t *p, int N, int leafN) {
+uint32_t reverse_bits(uint32_t a, int n) {
+	uint32_t x = 0;
+
+	int i;
+	for(i=0;i<n;i++) {
+		if(a & (1 << i)) x |= 1 << (n-i-1);
+	}
+	return x;
+}
+
+
+void ffts_init_offsets(ffts_plan_t *p, int N, int leafN) {
 	ptrdiff_t *offsets = malloc(2 * N/leafN * sizeof(ptrdiff_t));
 
-	elaborate_offsets(offsets, leafN, N, 0, 0, 1, 1);
+	ffts_elaborate_offsets(offsets, leafN, N, 0, 0, 1, 1);
 
 	size_t i;
 	for(i=0;i<2*N/leafN;i+=2) {
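The new reverse_bits helper mirrors the low n bits of a; the hunk below enables a debug print that shows each leaf offset beside its bit-reversed index over log2(2N) bits (for power-of-two N, __builtin_ctzl(2*N) equals log2(2N)). A standalone check, with the helper duplicated here purely for illustration:

#include <stdio.h>
#include <stdint.h>

static uint32_t reverse_bits(uint32_t a, int n) {
	uint32_t x = 0;
	int i;
	for(i=0;i<n;i++) {
		if(a & (1 << i)) x |= 1 << (n-i-1);
	}
	return x;
}

int main(void) {
	printf("%u\n", reverse_bits(0x1, 4)); /* 8: 0001 -> 1000            */
	printf("%u\n", reverse_bits(0x6, 4)); /* 6: 0110 is a palindrome    */
	printf("%u\n", reverse_bits(0x3, 5)); /* 24: 00011 -> 11000         */
	return 0;
}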
@@ -103,9 +114,9 @@ void init_offsets(ffts_plan_t *p, int N, int leafN) {
 	for(i=0;i<N/leafN;i++) {
 		p->offsets[i] = offsets[i*2+1]*2;
 	}
-//for(i=0;i<N/leafN;i++) {
-//	printf("%td\n", p->offsets[i]);
-//}
+	for(i=0;i<N/leafN;i++) {
+		printf("%4td %4u\n", p->offsets[i], reverse_bits(p->offsets[i], __builtin_ctzl(2*N)));
+	}
 
 	free(offsets);
 
@@ -140,7 +151,7 @@ void elaborate_tree(transform_index_t **p, int N, int leafN, int offset) {
 	(*p)+=2;
 }
 
-void init_tree(ffts_plan_t *p, int N, int leafN) {
+void ffts_init_tree(ffts_plan_t *p, int N, int leafN) {
 	int count = tree_count(N, leafN, 0) + 1;
 	transform_index_t *ps = p->transforms = malloc(count * 2 * sizeof(transform_index_t));
@@ -148,11 +159,14 @@ void init_tree(ffts_plan_t *p, int N, int leafN) {
 	//printf("count = %d\n", count);
 	elaborate_tree(&ps, N, leafN, 0);
+	#ifdef __ARM_NEON__
+	ps -= 2;
+	#endif
 	ps[0] = 0;
 	ps[1] = 0;
 
 	//int i;
 	//for(i=0;i<count;i++) {
-//	printf("%d %d - %d\n", p->transforms[i*2], p->transforms[i*2+1],
+//	fprintf(stderr, "%lu %lu - %d\n", p->transforms[i*2], p->transforms[i*2+1],
 //		__builtin_ctzl(p->transforms[i*2]) - 5);
 //}
diff --git a/src/patterns.h b/src/patterns.h
index 298ca3e..fd93042 100644
--- a/src/patterns.h
+++ b/src/patterns.h
@@ -3,8 +3,8 @@
 
 #include "cp_sse.h"
 
-void init_is(ffts_plan_t *p, int N, int leafN, int VL);
-void init_offsets(ffts_plan_t *p, int N, int leafN);
-void init_tree(ffts_plan_t *p, int N, int leafN);
+void ffts_init_is(ffts_plan_t *p, int N, int leafN, int VL);
+void ffts_init_offsets(ffts_plan_t *p, int N, int leafN);
+void ffts_init_tree(ffts_plan_t *p, int N, int leafN);
 
 #endif
diff --git a/src/sse_float.h b/src/sse_float.h
new file mode 100644
index 0000000..9974b2b
--- /dev/null
+++ b/src/sse_float.h
@@ -0,0 +1,33 @@
+#ifndef __SSE_FLOAT_H__
+#define __SSE_FLOAT_H__
+
+#include <xmmintrin.h>
+
+//#define VL 4
+
+typedef __m128 V;
+
+#define VADD _mm_add_ps
+#define VSUB _mm_sub_ps
+#define VMUL _mm_mul_ps
+//#define VLIT4 _mm_set_ps
+#define VXOR _mm_xor_ps
+#define VST _mm_store_ps
+#define VLD _mm_load_ps
+
+#define VSWAPPAIRS(x) (_mm_shuffle_ps(x,x,_MM_SHUFFLE(2,3,0,1)))
+
+#define VUNPACKHI(x,y) (_mm_shuffle_ps(x,y,_MM_SHUFFLE(3,2,3,2)))
+#define VUNPACKLO(x,y) (_mm_shuffle_ps(x,y,_MM_SHUFFLE(1,0,1,0)))
+
+#define VBLEND(x,y) (_mm_shuffle_ps(x,y,_MM_SHUFFLE(3,2,1,0)))
+
+#define VLIT4 _mm_set_ps
+
+#define VDUPRE(r) (_mm_shuffle_ps(r,r,_MM_SHUFFLE(2,2,0,0)))
+#define VDUPIM(r) (_mm_shuffle_ps(r,r,_MM_SHUFFLE(3,3,1,1)))
+
+#define FFTS_MALLOC(d,a) (_mm_malloc(d,a))
+#define FFTS_FREE(d) (_mm_free(d))
+
+#endif
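The new sse_float.h mirrors the NEON helpers for the SSE build (configure defaults SIMD to sse). VDUPRE, VDUPIM and VSWAPPAIRS are the pieces of a complex multiply on interleaved {re,im,re,im} vectors; one way they compose (a sketch only — the IMUL name is illustrative, not part of this patch):

#include "sse_float.h"

/* Complex multiply of two interleaved {re,im,re,im} vectors d and w. */
static inline V IMUL(V d, V w) {
	V re = VMUL(VDUPRE(w), d);              /* (d.re*w.re, d.im*w.re, ...) */
	V im = VMUL(VDUPIM(w), VSWAPPAIRS(d));  /* (d.im*w.im, d.re*w.im, ...) */
	/* flip the sign of the real-lane products of im, then add:
	   result = (d.re*w.re - d.im*w.im, d.im*w.re + d.re*w.im) */
	return VADD(re, VXOR(im, VLIT4(0.0f, -0.0f, 0.0f, -0.0f)));
}

Note that VXOR against -0.0f toggles only the sign bit, which is why a literal mask rather than a subtraction suffices here.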