8 files changed, 1015 insertions, 779 deletions
diff --git a/src/ffts_nd.c b/src/ffts_nd.c
index 15fc4d1..f982403 100644
--- a/src/ffts_nd.c
+++ b/src/ffts_nd.c
@@ -1,33 +1,33 @@
 /*
- 
- This file is part of FFTS -- The Fastest Fourier Transform in the South
-  
- Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
- Copyright (c) 2012, The University of Waikato 
- 
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
- 	* Redistributions of source code must retain the above copyright
- 		notice, this list of conditions and the following disclaimer.
- 	* Redistributions in binary form must reproduce the above copyright
- 		notice, this list of conditions and the following disclaimer in the
- 		documentation and/or other materials provided with the distribution.
- 	* Neither the name of the organization nor the
-	  names of its contributors may be used to endorse or promote products
- 		derived from this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
- DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+This file is part of FFTS -- The Fastest Fourier Transform in the South
+
+Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
+Copyright (c) 2012, The University of Waikato
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+* Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+* Neither the name of the organization nor the
+names of its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 */
 
@@ -35,249 +35,330 @@
 
 #ifdef HAVE_NEON
 #include "neon.h"
+#include <arm_neon.h>
+#endif
+
+#ifdef HAVE_SSE
+#include <xmmintrin.h>
 #endif
 
-void ffts_free_nd(ffts_plan_t *p) {
-
-	int i;
-	for(i=0;i<p->rank;i++) {
-		
-		ffts_plan_t *x = p->plans[i];
-		int k;
-		for(k=0;k<i;k++) {
-			if(p->Ms[i] == p->Ms[k]) x = NULL;
-		}
-		
-		if(x)	ffts_free(x);
-	}
-
-	free(p->Ns);
-	free(p->Ms);
-	free(p->plans);
-	free(p->buf);
-	free(p->transpose_buf);
-	free(p);
-}
 #define TSIZE 8
-#include <string.h>
-void ffts_transpose(uint64_t *in, uint64_t *out, int w, int h, uint64_t *buf) {
-
-#ifdef HAVE_NEON 
-	size_t i,j,k;
-	int linebytes = w*8;
-
-	for(j=0;j<h;j+=8) {
-		for(i=0;i<w;i+=8) {
-			neon_transpose_to_buf(in + j*w + i, buf, w);
-
-			uint64_t *p = out + i*h + j;
-			uint64_t *pbuf = buf;
-			uint64_t *ptemp;
-
-			__asm__ __volatile__(
-							 "mov %[ptemp], %[p]\n\t"
-							 "add %[p], %[p], %[w], lsl #3\n\t"
-							 "vld1.32 {q8,q9}, [%[pbuf], :128]!\n\t"
-							 "vld1.32 {q10,q11}, [%[pbuf], :128]!\n\t"
-							 "vld1.32 {q12,q13}, [%[pbuf], :128]!\n\t"
-							 "vld1.32 {q14,q15}, [%[pbuf], :128]!\n\t"
-							 "vst1.32 {q8,q9}, [%[ptemp], :128]!\n\t"
-							 "vst1.32 {q10,q11}, [%[ptemp], :128]!\n\t"
-							 "mov %[ptemp], %[p]\n\t" 
-							 "add %[p], %[p], %[w], lsl #3\n\t"
-							 "vst1.32 {q12,q13}, [%[ptemp], :128]!\n\t"
-							 "vst1.32 {q14,q15}, [%[ptemp], :128]!\n\t"
-							 "mov %[ptemp], %[p]\n\t"
-							 "add %[p], %[p], %[w], lsl #3\n\t"
-							 "vld1.32 {q8,q9}, [%[pbuf], :128]!\n\t"
-							 "vld1.32 {q10,q11}, [%[pbuf], :128]!\n\t"
-							 "vld1.32 {q12,q13}, [%[pbuf], :128]!\n\t"
-							 "vld1.32 {q14,q15}, [%[pbuf], :128]!\n\t"
-							 "vst1.32 {q8,q9}, [%[ptemp], :128]!\n\t"
-							 "vst1.32 {q10,q11}, [%[ptemp], :128]!\n\t"
-							 "mov %[ptemp], %[p]\n\t" 
-							 "add %[p], %[p], %[w], lsl #3\n\t"
-							 "vst1.32 {q12,q13}, [%[ptemp], :128]!\n\t"
-							 "vst1.32 {q14,q15}, [%[ptemp], :128]!\n\t"
-							 "mov %[ptemp], %[p]\n\t"
-							 "add %[p], %[p], %[w], lsl #3\n\t"
-							 "vld1.32 {q8,q9}, [%[pbuf], :128]!\n\t"
-							 "vld1.32 {q10,q11}, [%[pbuf], :128]!\n\t"
-							 "vld1.32 {q12,q13}, [%[pbuf], :128]!\n\t"
-							 "vld1.32 {q14,q15}, [%[pbuf], :128]!\n\t"
-							 "vst1.32 {q8,q9}, [%[ptemp], :128]!\n\t"
-							 "vst1.32 {q10,q11}, [%[ptemp], :128]!\n\t"
-							 "mov %[ptemp], %[p]\n\t" 
-							 "add %[p], %[p], %[w], lsl #3\n\t"
-							 "vst1.32 {q12,q13}, [%[ptemp], :128]!\n\t"
-							 "vst1.32 {q14,q15}, [%[ptemp], :128]!\n\t"
-							 "mov %[ptemp], %[p]\n\t"
-							 "add %[p], %[p], %[w], lsl #3\n\t"
-							 "vld1.32 {q8,q9}, [%[pbuf], :128]!\n\t"
-							 "vld1.32 {q10,q11}, [%[pbuf], :128]!\n\t"
-							 "vld1.32 {q12,q13}, [%[pbuf], :128]!\n\t"
-							 "vld1.32 {q14,q15}, [%[pbuf], :128]!\n\t"
-							 "vst1.32 {q8,q9}, [%[ptemp], :128]!\n\t"
-							 "vst1.32 {q10,q11}, [%[ptemp], :128]!\n\t"
-							 "mov %[ptemp], %[p]\n\t" 
-							 "vst1.32 {q12,q13}, [%[ptemp], :128]!\n\t"
-							 "vst1.32 {q14,q15}, [%[ptemp], :128]!\n\t"
-							 
-						: [p] "+r" (p), [pbuf] "+r" (pbuf), [ptemp] "+r" (ptemp)
-						: [w] "r" (w)
-						: "memory", "q8", "q9", "q10", "q11"
-						);
-//			out[i*h + j] = in[j*w + i];
-		}
-	}
+
+static void ffts_free_nd(ffts_plan_t *p)
+{
+    if (p->plans) {
+        int i;
+
+        for (i = 0; i < p->rank; i++) {
+            ffts_plan_t *plan = p->plans[i];
+
+            if (plan) {
+                int k;
+
+                for (k = 0; k < i; k++) {
+                    if (p->Ms[i] == p->Ms[k]) {
+                        plan = NULL;
+                        break;
+                    }
+                }
+
+                if (plan) {
+                    ffts_free(plan);
+                }
+            }
+        }
+
+        free(p->plans);
+    }
+
+    if (p->Ns) {
+        free(p->Ns);
+    }
+
+    if (p->Ms) {
+        free(p->Ms);
+    }
+
+    if (p->buf) {
+        ffts_aligned_free(p->buf);
+    }
+
+    if (p->transpose_buf) {
+        ffts_aligned_free(p->transpose_buf);
+    }
+
+    free(p);
+}
+
+static void ffts_transpose(uint64_t *in, uint64_t *out, int w, int h, uint64_t *buf)
+{
+#ifdef HAVE_NEON
+    size_t i, j, k;
+    int linebytes = 8 * w;
+
+    for (j = 0; j < h; j += 8) {
+        for (i = 0; i < w; i += 8) {
+            neon_transpose_to_buf(in + j*w + i, buf, w);
+
+            uint64_t *p = out + i*h + j;
+            uint64_t *pbuf = buf;
+            uint64_t *ptemp;
+
+            __asm__ __volatile__(
+                "mov %[ptemp], %[p]\n\t"
+                "add %[p], %[p], %[w], lsl #3\n\t"
+                "vld1.32 {q8,q9}, [%[pbuf], :128]!\n\t"
+                "vld1.32 {q10,q11}, [%[pbuf], :128]!\n\t"
+                "vld1.32 {q12,q13}, [%[pbuf], :128]!\n\t"
+                "vld1.32 {q14,q15}, [%[pbuf], :128]!\n\t"
+                "vst1.32 {q8,q9}, [%[ptemp], :128]!\n\t"
+                "vst1.32 {q10,q11}, [%[ptemp], :128]!\n\t"
+                "mov %[ptemp], %[p]\n\t"
+                "add %[p], %[p], %[w], lsl #3\n\t"
+                "vst1.32 {q12,q13}, [%[ptemp], :128]!\n\t"
+                "vst1.32 {q14,q15}, [%[ptemp], :128]!\n\t"
+                "mov %[ptemp], %[p]\n\t"
+                "add %[p], %[p], %[w], lsl #3\n\t"
+                "vld1.32 {q8,q9}, [%[pbuf], :128]!\n\t"
+                "vld1.32 {q10,q11}, [%[pbuf], :128]!\n\t"
+                "vld1.32 {q12,q13}, [%[pbuf], :128]!\n\t"
+                "vld1.32 {q14,q15}, [%[pbuf], :128]!\n\t"
+                "vst1.32 {q8,q9}, [%[ptemp], :128]!\n\t"
+                "vst1.32 {q10,q11}, [%[ptemp], :128]!\n\t"
+                "mov %[ptemp], %[p]\n\t"
+                "add %[p], %[p], %[w], lsl #3\n\t"
+                "vst1.32 {q12,q13}, [%[ptemp], :128]!\n\t"
+                "vst1.32 {q14,q15}, [%[ptemp], :128]!\n\t"
+                "mov %[ptemp], %[p]\n\t"
+                "add %[p], %[p], %[w], lsl #3\n\t"
+                "vld1.32 {q8,q9}, [%[pbuf], :128]!\n\t"
+                "vld1.32 {q10,q11}, [%[pbuf], :128]!\n\t"
+                "vld1.32 {q12,q13}, [%[pbuf], :128]!\n\t"
+                "vld1.32 {q14,q15}, [%[pbuf], :128]!\n\t"
+                "vst1.32 {q8,q9}, [%[ptemp], :128]!\n\t"
+                "vst1.32 {q10,q11}, [%[ptemp], :128]!\n\t"
+                "mov %[ptemp], %[p]\n\t"
+                "add %[p], %[p], %[w], lsl #3\n\t"
+                "vst1.32 {q12,q13}, [%[ptemp], :128]!\n\t"
+                "vst1.32 {q14,q15}, [%[ptemp], :128]!\n\t"
+                "mov %[ptemp], %[p]\n\t"
+                "add %[p], %[p], %[w], lsl #3\n\t"
+                "vld1.32 {q8,q9}, [%[pbuf], :128]!\n\t"
+                "vld1.32 {q10,q11}, [%[pbuf], :128]!\n\t"
+                "vld1.32 {q12,q13}, [%[pbuf], :128]!\n\t"
+                "vld1.32 {q14,q15}, [%[pbuf], :128]!\n\t"
+                "vst1.32 {q8,q9}, [%[ptemp], :128]!\n\t"
+                "vst1.32 {q10,q11}, [%[ptemp], :128]!\n\t"
+                "mov %[ptemp], %[p]\n\t"
+                "vst1.32 {q12,q13}, [%[ptemp], :128]!\n\t"
+                "vst1.32 {q14,q15}, [%[ptemp], :128]!\n\t"
+
+                : [p] "+r" (p), [pbuf] "+r" (pbuf), [ptemp] "+r" (ptemp)
+                : [w] "r" (w)
+                : "memory", "q8", "q9", "q10", "q11"
+            );
+
+            /* out[i*h + j] = in[j*w + i]; */
+        }
+    }
 #else
 #ifdef HAVE_SSE
-	uint64_t tmp[TSIZE*TSIZE] __attribute__((aligned(64)));
-	int tx, ty;
-	int x, y;
-	int tw = w / TSIZE;
-	int th = h / TSIZE;
-	for (ty=0;ty<th;ty++) {
-		for (tx=0;tx<tw;tx++) {
-			uint64_t *ip0 = in + w*TSIZE*ty + tx * TSIZE;
-			uint64_t *op0 = tmp;//out + h*TSIZE*tx + ty*TSIZE;
-
-			// Copy/transpose to tmp
-			for (y=0;y<TSIZE;y+=2) {
-				//for (x=0;x<TSIZE;x+=2) {
-					//op[x*TSIZE] = ip[x];
-					__m128d q0 = _mm_load_pd((double *)(ip0 + 0*w));
-					__m128d q1 = _mm_load_pd((double *)(ip0 + 1*w));
-					__m128d q2 = _mm_load_pd((double *)(ip0 + 2*w));
-					__m128d q3 = _mm_load_pd((double *)(ip0 + 3*w));
-					__m128d q4 = _mm_load_pd((double *)(ip0 + 4*w));
-					__m128d q5 = _mm_load_pd((double *)(ip0 + 5*w));
-					__m128d q6 = _mm_load_pd((double *)(ip0 + 6*w));
-					__m128d q7 = _mm_load_pd((double *)(ip0 + 7*w));
-					ip0 += 2;	
-					
-					__m128d t0 = _mm_shuffle_pd(q0, q1, _MM_SHUFFLE2(0, 0));
-					__m128d t1 = _mm_shuffle_pd(q0, q1, _MM_SHUFFLE2(1, 1));
-					__m128d t2 = _mm_shuffle_pd(q2, q3, _MM_SHUFFLE2(0, 0));
-					__m128d t3 = _mm_shuffle_pd(q2, q3, _MM_SHUFFLE2(1, 1));
-					__m128d t4 = _mm_shuffle_pd(q4, q5, _MM_SHUFFLE2(0, 0));
-					__m128d t5 = _mm_shuffle_pd(q4, q5, _MM_SHUFFLE2(1, 1));
-					__m128d t6 = _mm_shuffle_pd(q6, q7, _MM_SHUFFLE2(0, 0));
-					__m128d t7 = _mm_shuffle_pd(q6, q7, _MM_SHUFFLE2(1, 1));
-					//_mm_store_pd((double *)(op0 + y*h + x), t0);
-					//_mm_store_pd((double *)(op0 + y*h + x + h), t1);
-					_mm_store_pd((double *)(op0 + 0), t0);
-					_mm_store_pd((double *)(op0 + 0 + TSIZE), t1);
-					_mm_store_pd((double *)(op0 + 2 ), t2);
-					_mm_store_pd((double *)(op0 + 2 + TSIZE), t3);
-					_mm_store_pd((double *)(op0 + 4 ), t4);
-					_mm_store_pd((double *)(op0 + 4 + TSIZE), t5);
-					_mm_store_pd((double *)(op0 + 6 ), t6);
-					_mm_store_pd((double *)(op0 + 6 + TSIZE), t7);
-				//}
-				op0 += 2*TSIZE;
-			}
-			
-			op0 = out + h*tx*TSIZE + ty*TSIZE;
-			ip0 = tmp;
-			for (y=0;y<TSIZE;y+=1) {
-		//		memcpy(op0, ip0, TSIZE * sizeof(*ip0));
-				
-				__m128d q0 = _mm_load_pd((double *)(ip0 + 0));
-				__m128d q1 = _mm_load_pd((double *)(ip0 + 2));
-				__m128d q2 = _mm_load_pd((double *)(ip0 + 4));
-				__m128d q3 = _mm_load_pd((double *)(ip0 + 6));
-				_mm_store_pd((double *)(op0 + 0), q0);
-				_mm_store_pd((double *)(op0 + 2), q1);
-				_mm_store_pd((double *)(op0 + 4), q2);
-				_mm_store_pd((double *)(op0 + 6), q3);
-				
-				op0 += h;
-				ip0 += TSIZE;
-			}
-
-		}
-	}
-/*
-	size_t i,j;
-	for(i=0;i<w;i+=2) {
-		for(j=0;j<h;j+=2) {
-//		out[i*h + j] = in[j*w + i];
-  		__m128d q0 = _mm_load_pd((double *)(in + j*w + i));
-  		__m128d q1 = _mm_load_pd((double *)(in + j*w + i + w));
-  		__m128d t0 = _mm_shuffle_pd(q0, q1, _MM_SHUFFLE2(0, 0));
-  		__m128d t1 = _mm_shuffle_pd(q0, q1, _MM_SHUFFLE2(1, 1));
-  		_mm_store_pd((double *)(out + i*h + j), t0);
-  		_mm_store_pd((double *)(out + i*h + j + h), t1);
-		}
-	}
-*/
+    uint64_t FFTS_ALIGN(64) tmp[TSIZE*TSIZE];
+    int tx, ty;
+    /* int x; */
+    int y;
+    int tw = w / TSIZE;
+    int th = h / TSIZE;
+
+    for (ty = 0; ty < th; ty++) {
+        for (tx = 0; tx < tw; tx++) {
+            uint64_t *ip0 = in + w*TSIZE*ty + tx * TSIZE;
+            uint64_t *op0 = tmp; /* out + h*TSIZE*tx + ty*TSIZE; */
+
+            /* copy/transpose to tmp */
+            for (y = 0; y < TSIZE; y += 2) {
+                /* for (x=0;x<TSIZE;x+=2) {
+                   op[x*TSIZE] = ip[x];
+                */
+                __m128d q0 = _mm_load_pd((double*)(ip0 + 0*w));
+                __m128d q1 = _mm_load_pd((double*)(ip0 + 1*w));
+                __m128d q2 = _mm_load_pd((double*)(ip0 + 2*w));
+                __m128d q3 = _mm_load_pd((double*)(ip0 + 3*w));
+                __m128d q4 = _mm_load_pd((double*)(ip0 + 4*w));
+                __m128d q5 = _mm_load_pd((double*)(ip0 + 5*w));
+                __m128d q6 = _mm_load_pd((double*)(ip0 + 6*w));
+                __m128d q7 = _mm_load_pd((double*)(ip0 + 7*w));
+
+                __m128d t0 = _mm_shuffle_pd(q0, q1, _MM_SHUFFLE2(0, 0));
+                __m128d t1 = _mm_shuffle_pd(q0, q1, _MM_SHUFFLE2(1, 1));
+                __m128d t2 = _mm_shuffle_pd(q2, q3, _MM_SHUFFLE2(0, 0));
+                __m128d t3 = _mm_shuffle_pd(q2, q3, _MM_SHUFFLE2(1, 1));
+                __m128d t4 = _mm_shuffle_pd(q4, q5, _MM_SHUFFLE2(0, 0));
+                __m128d t5 = _mm_shuffle_pd(q4, q5, _MM_SHUFFLE2(1, 1));
+                __m128d t6 = _mm_shuffle_pd(q6, q7, _MM_SHUFFLE2(0, 0));
+                __m128d t7 = _mm_shuffle_pd(q6, q7, _MM_SHUFFLE2(1, 1));
+
+                ip0 += 2;
+                /* _mm_store_pd((double *)(op0 + y*h + x), t0);
+                   _mm_store_pd((double *)(op0 + y*h + x + h), t1);
+                   */
+
+                _mm_store_pd((double*)(op0 + 0        ), t0);
+                _mm_store_pd((double*)(op0 + 0 + TSIZE), t1);
+                _mm_store_pd((double*)(op0 + 2        ), t2);
+                _mm_store_pd((double*)(op0 + 2 + TSIZE), t3);
+                _mm_store_pd((double*)(op0 + 4        ), t4);
+                _mm_store_pd((double*)(op0 + 4 + TSIZE), t5);
+                _mm_store_pd((double*)(op0 + 6        ), t6);
+                _mm_store_pd((double*)(op0 + 6 + TSIZE), t7);
+                /* } */
+
+                op0 += 2*TSIZE;
+            }
+
+            op0 = out + h*tx*TSIZE + ty*TSIZE;
+            ip0 = tmp;
+            for (y = 0; y < TSIZE; y += 1) {
+                /* memcpy(op0, ip0, TSIZE * sizeof(*ip0)); */
+
+                __m128d q0 = _mm_load_pd((double*)(ip0 + 0));
+                __m128d q1 = _mm_load_pd((double*)(ip0 + 2));
+                __m128d q2 = _mm_load_pd((double*)(ip0 + 4));
+                __m128d q3 = _mm_load_pd((double*)(ip0 + 6));
+
+                _mm_store_pd((double*)(op0 + 0), q0);
+                _mm_store_pd((double*)(op0 + 2), q1);
+                _mm_store_pd((double*)(op0 + 4), q2);
+                _mm_store_pd((double*)(op0 + 6), q3);
+
+                op0 += h;
+                ip0 += TSIZE;
+            }
+        }
+    }
+    /*
+    size_t i,j;
+    for(i=0;i<w;i+=2) {
+    for(j=0;j<h;j+=2) {
+    //		out[i*h + j] = in[j*w + i];
+    __m128d q0 = _mm_load_pd((double *)(in + j*w + i));
+    __m128d q1 = _mm_load_pd((double *)(in + j*w + i + w));
+    __m128d t0 = _mm_shuffle_pd(q0, q1, _MM_SHUFFLE2(0, 0));
+    __m128d t1 = _mm_shuffle_pd(q0, q1, _MM_SHUFFLE2(1, 1));
+    _mm_store_pd((double *)(out + i*h + j), t0);
+    _mm_store_pd((double *)(out + i*h + j + h), t1);
+    }
+    }
+    */
 #endif
 #endif
-
 }
 
-void ffts_execute_nd(ffts_plan_t *p, const void *  in, void *  out) {
+static void ffts_execute_nd(ffts_plan_t *p, const void *in, void *out)
+{
+    uint64_t *din = (uint64_t*) in;
+    uint64_t *buf = p->buf;
+    uint64_t *dout = (uint64_t*) out;
+
+    ffts_plan_t *plan;
+    size_t i, j;
 
-	uint64_t *din = (uint64_t *)in;
-	uint64_t *buf = p->buf;
-	uint64_t *dout = (uint64_t *)out;
+    plan = p->plans[0];
+    for (i = 0; i < p->Ns[0]; i++) {
+        plan->transform(plan, din + (i * p->Ms[0]), buf + (i * p->Ms[0]));
+    }
 
-	size_t i,j;
-	for(i=0;i<p->Ns[0];i++) {
-		p->plans[0]->transform(p->plans[0], din + (i * p->Ms[0]), buf + (i * p->Ms[0]));	
-	}
-	ffts_transpose(buf, dout, p->Ms[0], p->Ns[0], p->transpose_buf);	
+    ffts_transpose(buf, dout, p->Ms[0], p->Ns[0], p->transpose_buf);
 
-	for(i=1;i<p->rank;i++) {
-		for(j=0;j<p->Ns[i];j++) { 
-			p->plans[i]->transform(p->plans[i], dout + (j * p->Ms[i]), buf + (j * p->Ms[i]));	
-		}
-		ffts_transpose(buf, dout, p->Ms[i], p->Ns[i], p->transpose_buf);	
-	}
+    for (i = 1; i < p->rank; i++) {
+        plan = p->plans[i];
+
+        for (j = 0; j < p->Ns[i]; j++) {
+            plan->transform(plan, dout + (j * p->Ms[i]), buf + (j * p->Ms[i]));
+        }
+
+        ffts_transpose(buf, dout, p->Ms[i], p->Ns[i], p->transpose_buf);
+    }
 }
 
-ffts_plan_t *ffts_init_nd(int rank, size_t *Ns, int sign) {
-	size_t vol = 1;
+ffts_plan_t *ffts_init_nd(int rank, size_t *Ns, int sign)
+{
+    ffts_plan_t *p;
+    size_t vol;
+    int i;
+
+    p = calloc(1, sizeof(*p));
+    if (!p) {
+        return NULL;
+    }
+
+    p->transform = &ffts_execute_nd;
+    p->destroy   = &ffts_free_nd;
+    p->rank      = rank;
+
+    p->Ms = malloc(rank * sizeof(*p->Ms));
+    if (!p->Ms) {
+        goto cleanup;
+    }
+
+    p->Ns = malloc(rank * sizeof(*p->Ns));
+    if (!p->Ns) {
+        goto cleanup;
+    }
+
+    vol = p->Ns[0] = Ns[0];
+    for (i = 1; i < rank; i++) {
+        p->Ns[i] = Ns[i];
+        vol *= Ns[i];
+    }
+
+    p->buf = ffts_aligned_malloc(2 * vol * sizeof(float));
+    if (!p->buf) {
+        goto cleanup;
+    }
+
+    p->transpose_buf = ffts_aligned_malloc(2 * 8 * 8 * sizeof(float));
+    if (!p->transpose_buf) {
+        goto cleanup;
+    }
 
-	ffts_plan_t *p = malloc(sizeof(ffts_plan_t));
+    p->plans = calloc(rank, sizeof(*p->plans));
+    if (!p->plans) {
+        goto cleanup;
+    }
 
-	p->transform = &ffts_execute_nd;
-	p->destroy = &ffts_free_nd;
+    for (i = 0; i < rank; i++) {
+        int k;
 
-	p->rank = rank;
-	p->Ns = malloc(sizeof(size_t) * rank);
-	p->Ms = malloc(sizeof(size_t) * rank);
-	p->plans = malloc(sizeof(ffts_plan_t **) * rank);
-	int i;
-	for(i=0;i<rank;i++) {
-		p->Ns[i] = Ns[i];
-		vol *= Ns[i];	
-	}
-	p->buf = valloc(sizeof(float) * 2 * vol);
+        p->Ms[i] = vol / p->Ns[i];
 
-	for(i=0;i<rank;i++) {
-		p->Ms[i] = vol / p->Ns[i];
+        for (k = 0; k < i; k++) {
+            if (p->Ms[k] == p->Ms[i]) {
+                p->plans[i] = p->plans[k];
+                break;
+            }
+        }
 
-		p->plans[i] = NULL;
-		int k;
-		for(k=0;k<i;k++) {
-			if(p->Ms[k] == p->Ms[i]) 
-				p->plans[i] = p->plans[k];
-		}
+        if (!p->plans[i]) {
+            p->plans[i] = ffts_init_1d(p->Ms[i], sign);
+            if (!p->plans) {
+                goto cleanup;
+            }
+        }
+    }
 
-		if(!p->plans[i]) p->plans[i] = ffts_init_1d(p->Ms[i], sign); 
-	}
+    return p;
 
-	p->transpose_buf = valloc(sizeof(float) * 2 * 8 * 8);
-	return p;
+cleanup:
+    ffts_free_nd(p);
+    return NULL;
 }
 
+ffts_plan_t *ffts_init_2d(size_t N1, size_t N2, int sign)
+{
+    size_t Ns[2];
 
-ffts_plan_t *ffts_init_2d(size_t N1, size_t N2, int sign) {
-	size_t Ns[2];
-	Ns[0] = N1;
-	Ns[1] = N2;
-	return ffts_init_nd(2, Ns, sign);
+    Ns[0] = N1;
+    Ns[1] = N2;
+    return ffts_init_nd(2, Ns, sign);
 }
-// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:
diff --git a/src/ffts_nd.h b/src/ffts_nd.h
index a9af3e2..a960cad 100644
--- a/src/ffts_nd.h
+++ b/src/ffts_nd.h
@@ -1,10 +1,10 @@
 /*
- 
+
  This file is part of FFTS -- The Fastest Fourier Transform in the South
-  
+
  Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
- Copyright (c) 2012, The University of Waikato 
- 
+ Copyright (c) 2012, The University of Waikato
+
  All rights reserved.
 
  Redistribution and use in source and binary forms, with or without
@@ -31,29 +31,13 @@
 
 */
 
-#ifndef __FFTS_ND_H__
-#define __FFTS_ND_H__
-
-#include <stdint.h>
-#include <stddef.h>
-#include <stdio.h>
+#ifndef FFTS_ND_H
+#define FFTS_ND_H
 
 #include "ffts.h"
+#include <stddef.h>
 
-#ifdef HAVE_NEON 
-	#include <arm_neon.h>
-#endif
-#ifdef HAVE_SSE
-	#include <xmmintrin.h>
-#endif
-
-void ffts_free_nd(ffts_plan_t *p);
-void ffts_transpose(uint64_t *in, uint64_t *out, int w, int h, uint64_t *buf);
-
-void ffts_execute_nd(ffts_plan_t *p, const void *  in, void *  out); 
-ffts_plan_t *ffts_init_nd(int rank, size_t *Ns, int sign); 
-ffts_plan_t *ffts_init_2d(size_t N1, size_t N2, int sign); 
-
-#endif
+ffts_plan_t *ffts_init_nd(int rank, size_t *Ns, int sign);
+ffts_plan_t *ffts_init_2d(size_t N1, size_t N2, int sign);
 
-// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:
+#endif /* FFTS_ND_H */
diff --git a/src/ffts_real.c b/src/ffts_real.c
index 97ff942..77c57a0 100644
--- a/src/ffts_real.c
+++ b/src/ffts_real.c
@@ -1,227 +1,281 @@
 /*
 
- This file is part of FFTS -- The Fastest Fourier Transform in the South
-
- Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
- Copyright (c) 2012, The University of Waikato
-
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
- 	* Redistributions of source code must retain the above copyright
- 		notice, this list of conditions and the following disclaimer.
- 	* Redistributions in binary form must reproduce the above copyright
- 		notice, this list of conditions and the following disclaimer in the
- 		documentation and/or other materials provided with the distribution.
- 	* Neither the name of the organization nor the
-	  names of its contributors may be used to endorse or promote products
- 		derived from this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
- DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+This file is part of FFTS -- The Fastest Fourier Transform in the South
+
+Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
+Copyright (c) 2012, The University of Waikato
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+* Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+* Neither the name of the organization nor the
+names of its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 */
 
 #include "ffts_real.h"
 
-void ffts_free_1d_real(ffts_plan_t *p) {
-	ffts_free(p->plans[0]);
-	free(p->A);
-	free(p->B);
-	free(p->plans);
-	free(p->buf);
-	free(p);
-}
+#ifdef HAVE_NEON
+#include <arm_neon.h>
+#endif
+
+#ifdef HAVE_SSE
+#include <xmmintrin.h>
+#endif
+
+static void ffts_free_1d_real(ffts_plan_t *p)
+{
+    if (p->B) {
+        ffts_aligned_free(p->B);
+    }
 
-void ffts_execute_1d_real(ffts_plan_t *p, const void *vin, void *vout) {
-	float *out = (float *)vout;
-	float *buf = (float *)p->buf;
-	float *A = p->A;
-	float *B = p->B;
+    if (p->A) {
+        ffts_aligned_free(p->A);
+    }
 
-	p->plans[0]->transform(p->plans[0], vin, buf);
+    if (p->buf) {
+        ffts_aligned_free(p->buf);
+    }
 
-	size_t N = p->N;
-	buf[N] = buf[0];
-	buf[N+1] = buf[1];
+    if (p->plans) {
+        ffts_free(p->plans[0]);
+        free(p->plans);
+    }
 
-	float *p_buf0 = buf;
-	float *p_buf1 = buf + N - 2;
-	float *p_out = out;
+    free(p);
+}
+
+static void ffts_execute_1d_real(ffts_plan_t *p, const void *vin, void *vout)
+{
+    float *out = (float*) vout;
+    float *buf = (float*) p->buf;
+    float *A = p->A;
+    float *B = p->B;
+    size_t N = p->N;
 
-	size_t i;
 #ifdef __ARM_NEON__
-	for(i=0;i<N/2;i+=2) {
-	__asm__ __volatile__ (
-		"vld1.32 {q8},  [%[pa]]!\n\t"
-		"vld1.32 {q9},  [%[pb]]!\n\t"
-		"vld1.32 {q10}, [%[buf0]]!\n\t"
-		"vld1.32 {q11}, [%[buf1]]\n\t"
-		"sub %[buf1], %[buf1], #16\n\t"
-
-		"vdup.32 d26, d16[1]\n\t"
-		"vdup.32 d27, d17[1]\n\t"
-		"vdup.32 d24, d16[0]\n\t"
-		"vdup.32 d25, d17[0]\n\t"
-
-		"vdup.32 d30, d23[1]\n\t"
-		"vdup.32 d31, d22[1]\n\t"
-		"vdup.32 d28, d23[0]\n\t"
-		"vdup.32 d29, d22[0]\n\t"
-
-		"vmul.f32 q13, q13, q10\n\t"
-		"vmul.f32 q15, q15, q9\n\t"
-		"vmul.f32 q12, q12, q10\n\t"
-		"vmul.f32 q14, q14, q9\n\t"
-		"vrev64.f32 q13, q13\n\t"
-		"vrev64.f32 q15, q15\n\t"
-
-		"vtrn.32 d26, d27\n\t"
-		"vtrn.32 d30, d31\n\t"
-		"vneg.f32 d26, d26\n\t"
-		"vneg.f32 d31, d31\n\t"
-		"vtrn.32 d26, d27\n\t"
-		"vtrn.32 d30, d31\n\t"
-
-		"vadd.f32 q12, q12, q14\n\t"
-		"vadd.f32 q13, q13, q15\n\t"
-		"vadd.f32 q12, q12, q13\n\t"
-		"vst1.32 {q12}, [%[pout]]!\n\t"
-		: [pa] "+r" (A), [pb] "+r" (B), [buf0] "+r" (p_buf0), [buf1] "+r" (p_buf1),
-		  [pout] "+r" (p_out)
-		:
-		: "memory", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-	);
-#else
-	for(i=0;i<N/2;i++) {
-		out[2*i]   = buf[2*i]*A[2*i] - buf[2*i+1]*A[2*i+1] + buf[N-2*i]*B[2*i] + buf[N-2*i+1]*B[2*i+1];
-		out[2*i+1] = buf[2*i+1]*A[2*i] + buf[2*i]*A[2*i+1] + buf[N-2*i]*B[2*i+1] - buf[N-2*i+1]*B[2*i];
+    float *p_buf0 = buf;
+    float *p_buf1 = buf + N - 2;
+    float *p_out = out;
+#endif
 
-//	out[2*N-2*i] = out[2*i];
-//	out[2*N-2*i+1] = -out[2*i+1];
+    size_t i;
 
-#endif	
-	}
+    p->plans[0]->transform(p->plans[0], vin, buf);
 
-	out[N] = buf[0] - buf[1];
-	out[N+1] = 0.0f;
+    buf[N + 0] = buf[0];
+    buf[N + 1] = buf[1];
 
+#ifdef __ARM_NEON__
+    for (i = 0; i < N/2; i += 2) {
+        __asm__ __volatile__ (
+            "vld1.32 {q8},  [%[pa]]!\n\t"
+            "vld1.32 {q9},  [%[pb]]!\n\t"
+            "vld1.32 {q10}, [%[buf0]]!\n\t"
+            "vld1.32 {q11}, [%[buf1]]\n\t"
+            "sub %[buf1], %[buf1], #16\n\t"
+
+            "vdup.32 d26, d16[1]\n\t"
+            "vdup.32 d27, d17[1]\n\t"
+            "vdup.32 d24, d16[0]\n\t"
+            "vdup.32 d25, d17[0]\n\t"
+
+            "vdup.32 d30, d23[1]\n\t"
+            "vdup.32 d31, d22[1]\n\t"
+            "vdup.32 d28, d23[0]\n\t"
+            "vdup.32 d29, d22[0]\n\t"
+
+            "vmul.f32 q13, q13, q10\n\t"
+            "vmul.f32 q15, q15, q9\n\t"
+            "vmul.f32 q12, q12, q10\n\t"
+            "vmul.f32 q14, q14, q9\n\t"
+            "vrev64.f32 q13, q13\n\t"
+            "vrev64.f32 q15, q15\n\t"
+
+            "vtrn.32 d26, d27\n\t"
+            "vtrn.32 d30, d31\n\t"
+            "vneg.f32 d26, d26\n\t"
+            "vneg.f32 d31, d31\n\t"
+            "vtrn.32 d26, d27\n\t"
+            "vtrn.32 d30, d31\n\t"
+
+            "vadd.f32 q12, q12, q14\n\t"
+            "vadd.f32 q13, q13, q15\n\t"
+            "vadd.f32 q12, q12, q13\n\t"
+            "vst1.32 {q12}, [%[pout]]!\n\t"
+            : [pa] "+r" (A), [pb] "+r" (B), [buf0] "+r" (p_buf0), [buf1] "+r" (p_buf1),
+            [pout] "+r" (p_out)
+            :
+            : "memory", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+        );
+    }
+#else
+    for (i = 0; i < N/2; i++) {
+        out[2*i + 0] = buf[2*i + 0] * A[2*i] - buf[2*i + 1] * A[2*i + 1] + buf[N - 2*i] * B[2*i + 0] + buf[N - 2*i + 1] * B[2*i + 1];
+        out[2*i + 1] = buf[2*i + 1] * A[2*i] + buf[2*i + 0] * A[2*i + 1] + buf[N - 2*i] * B[2*i + 1] - buf[N - 2*i + 1] * B[2*i + 0];
+
+        /* out[2*N-2*i+0] =  out[2*i+0];
+           out[2*N-2*i+1] = -out[2*i+1];
+         */
+    }
+#endif
+
+    out[N + 0] = buf[0] - buf[1];
+    out[N + 1] = 0.0f;
 }
 
-void ffts_execute_1d_real_inv(ffts_plan_t *p, const void *vin, void *vout) {
-	float *out = (float *)vout;
-	float *in = (float *)vin;
-	float *buf = (float *)p->buf;
-	float *A = p->A;
-	float *B = p->B;
-	size_t N = p->N;
+static void ffts_execute_1d_real_inv(ffts_plan_t *p, const void *vin, void *vout)
+{
+    float *out = (float*) vout;
+    float *in = (float*) vin;
+    float *buf = (float*) p->buf;
+    float *A = p->A;
+    float *B = p->B;
+    size_t N = p->N;
 
-	float *p_buf0 = in;
-	float *p_buf1 = in + N - 2;
+#ifdef __ARM_NEON__
+    float *p_buf0 = in;
+    float *p_buf1 = in + N - 2;
+    float *p_out = buf;
+#endif
 
-	float *p_out = buf;
+    size_t i;
 
-	size_t i;
 #ifdef __ARM_NEON__
-	for(i=0;i<N/2;i+=2) {
-	__asm__ __volatile__ (
-		"vld1.32 {q8},  [%[pa]]!\n\t"
-		"vld1.32 {q9},  [%[pb]]!\n\t"
-		"vld1.32 {q10}, [%[buf0]]!\n\t"
-		"vld1.32 {q11}, [%[buf1]]\n\t"
-		"sub %[buf1], %[buf1], #16\n\t"
-
-		"vdup.32 d26, d16[1]\n\t"
-		"vdup.32 d27, d17[1]\n\t"
-		"vdup.32 d24, d16[0]\n\t"
-		"vdup.32 d25, d17[0]\n\t"
-
-		"vdup.32 d30, d23[1]\n\t"
-		"vdup.32 d31, d22[1]\n\t"
-		"vdup.32 d28, d23[0]\n\t"
-		"vdup.32 d29, d22[0]\n\t"
-
-		"vmul.f32 q13, q13, q10\n\t"
-		"vmul.f32 q15, q15, q9\n\t"
-		"vmul.f32 q12, q12, q10\n\t"
-		"vmul.f32 q14, q14, q9\n\t"
-		"vrev64.f32 q13, q13\n\t"
-		"vrev64.f32 q15, q15\n\t"
-
-		"vtrn.32 d26, d27\n\t"
-		"vtrn.32 d28, d29\n\t"
-		"vneg.f32 d27, d27\n\t"
-		"vneg.f32 d29, d29\n\t"
-		"vtrn.32 d26, d27\n\t"
-		"vtrn.32 d28, d29\n\t"
-
-		"vadd.f32 q12, q12, q14\n\t"
-		"vsub.f32 q13, q13, q15\n\t"
-		"vadd.f32 q12, q12, q13\n\t"
-		"vst1.32 {q12}, [%[pout]]!\n\t"
-		: [pa] "+r" (A), [pb] "+r" (B), [buf0] "+r" (p_buf0), [buf1] "+r" (p_buf1),
-		  [pout] "+r" (p_out)
-		:
-		: "memory", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-    );
+    for (i = 0; i < N/2; i += 2) {
+        __asm__ __volatile__ (
+            "vld1.32 {q8},  [%[pa]]!\n\t"
+            "vld1.32 {q9},  [%[pb]]!\n\t"
+            "vld1.32 {q10}, [%[buf0]]!\n\t"
+            "vld1.32 {q11}, [%[buf1]]\n\t"
+            "sub %[buf1], %[buf1], #16\n\t"
+
+            "vdup.32 d26, d16[1]\n\t"
+            "vdup.32 d27, d17[1]\n\t"
+            "vdup.32 d24, d16[0]\n\t"
+            "vdup.32 d25, d17[0]\n\t"
+
+            "vdup.32 d30, d23[1]\n\t"
+            "vdup.32 d31, d22[1]\n\t"
+            "vdup.32 d28, d23[0]\n\t"
+            "vdup.32 d29, d22[0]\n\t"
+
+            "vmul.f32 q13, q13, q10\n\t"
+            "vmul.f32 q15, q15, q9\n\t"
+            "vmul.f32 q12, q12, q10\n\t"
+            "vmul.f32 q14, q14, q9\n\t"
+            "vrev64.f32 q13, q13\n\t"
+            "vrev64.f32 q15, q15\n\t"
+
+            "vtrn.32 d26, d27\n\t"
+            "vtrn.32 d28, d29\n\t"
+            "vneg.f32 d27, d27\n\t"
+            "vneg.f32 d29, d29\n\t"
+            "vtrn.32 d26, d27\n\t"
+            "vtrn.32 d28, d29\n\t"
+
+            "vadd.f32 q12, q12, q14\n\t"
+            "vsub.f32 q13, q13, q15\n\t"
+            "vadd.f32 q12, q12, q13\n\t"
+            "vst1.32 {q12}, [%[pout]]!\n\t"
+            : [pa] "+r" (A), [pb] "+r" (B), [buf0] "+r" (p_buf0), [buf1] "+r" (p_buf1),
+            [pout] "+r" (p_out)
+            :
+            : "memory", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+        );
+    }
 #else
-	for(i=0;i<N/2;i++) {
-		buf[2*i]   = in[2*i]*A[2*i] + in[2*i+1]*A[2*i+1] + in[N-2*i]*B[2*i] - in[N-2*i+1]*B[2*i+1];
-		buf[2*i+1] = in[2*i+1]*A[2*i] - in[2*i]*A[2*i+1] - in[N-2*i]*B[2*i+1] - in[N-2*i+1]*B[2*i];
+    for (i = 0; i < N/2; i++) {
+        buf[2*i + 0] = in[2*i + 0] * A[2*i] + in[2*i + 1] * A[2*i + 1] + in[N - 2*i] * B[2*i + 0] - in[N - 2*i + 1] * B[2*i + 1];
+        buf[2*i + 1] = in[2*i + 1] * A[2*i] - in[2*i + 0] * A[2*i + 1] - in[N - 2*i] * B[2*i + 1] - in[N - 2*i + 1] * B[2*i + 0];
+    }
 #endif
-	}
 
-	p->plans[0]->transform(p->plans[0], buf, out);
-
-}
-
-ffts_plan_t *ffts_init_1d_real(size_t N, int sign) {
-	ffts_plan_t *p = malloc(sizeof(ffts_plan_t));
-
-	if(sign < 0) p->transform = &ffts_execute_1d_real;
-	else         p->transform = &ffts_execute_1d_real_inv;
-
-	p->destroy = &ffts_free_1d_real;
-	p->N = N;
-	p->rank = 1;
-	p->plans = malloc(sizeof(ffts_plan_t **) * 1);
-
-	p->plans[0] = ffts_init_1d(N/2, sign);
-
-	p->buf = valloc(sizeof(float) * 2 * ((N/2) + 1));
-
-	p->A = valloc(sizeof(float) * N);
-	p->B = valloc(sizeof(float) * N);
-
-  if(sign < 0) {
-		int i;
-		for (i = 0; i < N/2; i++) {
-			p->A[2 * i]     = 0.5 * (1.0 - sin (2.0f * PI / (double) (N) * (double) i));
-			p->A[2 * i + 1] = 0.5 * (-1.0 * cos (2.0f * PI / (double) (N) * (double) i));
-			p->B[2 * i]     = 0.5 * (1.0 + sin (2.0f * PI / (double) (N) * (double) i));
-			p->B[2 * i + 1] = 0.5 * (1.0 * cos (2.0f * PI / (double) (N) * (double) i));
-		}
-	}else{
-		int i;
-		for (i = 0; i < N/2; i++) {
-			p->A[2 * i]     = 1.0 * (1.0 - sin (2.0f * PI / (double) (N) * (double) i));
-			p->A[2 * i + 1] = 1.0 * (-1.0 * cos (2.0f * PI / (double) (N) * (double) i));
-			p->B[2 * i]     = 1.0 * (1.0 + sin (2.0f * PI / (double) (N) * (double) i));
-			p->B[2 * i + 1] = 1.0 * (1.0 * cos (2.0f * PI / (double) (N) * (double) i));
-		}
-  }
-
-	return p;
+    p->plans[0]->transform(p->plans[0], buf, out);
 }
 
-
-// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:
+ffts_plan_t *ffts_init_1d_real(size_t N, int sign)
+{
+    ffts_plan_t *p;
+    size_t i;
+
+    p = (ffts_plan_t*) calloc(1, sizeof(*p));
+    if (!p) {
+        return NULL;
+    }
+
+    if (sign < 0) {
+        p->transform = &ffts_execute_1d_real;
+    } else {
+        p->transform = &ffts_execute_1d_real_inv;
+    }
+
+    p->destroy = &ffts_free_1d_real;
+    p->N       = N;
+    p->rank    = 1;
+
+    p->plans = (ffts_plan_t**) malloc(1 * sizeof(*p->plans));
+    if (!p->plans) {
+        goto cleanup;
+    }
+
+    p->plans[0] = ffts_init_1d(N/2, sign);
+    if (!p->plans[0]) {
+        goto cleanup;
+    }
+
+    p->buf = ffts_aligned_malloc(2 * ((N/2) + 1) * sizeof(float));
+    if (!p->buf) {
+        goto cleanup;
+    }
+
+    p->A = (float*) ffts_aligned_malloc(N * sizeof(float));
+    if (!p->A) {
+        goto cleanup;
+    }
+
+    p->B = (float*) ffts_aligned_malloc(N * sizeof(float));
+    if (!p->B) {
+        goto cleanup;
+    }
+
+    if (sign < 0) {
+        for (i = 0; i < N/2; i++) {
+            p->A[2 * i + 0] = 0.5 * ( 1.0 - sin(2.0 * M_PI / (double) N * (double) i));
+            p->A[2 * i + 1] = 0.5 * (-1.0 * cos(2.0 * M_PI / (double) N * (double) i));
+            p->B[2 * i + 0] = 0.5 * ( 1.0 + sin(2.0 * M_PI / (double) N * (double) i));
+            p->B[2 * i + 1] = 0.5 * ( 1.0 * cos(2.0 * M_PI / (double) N * (double) i));
+        }
+    } else {
+        for (i = 0; i < N/2; i++) {
+            p->A[2 * i + 0] = 1.0 * ( 1.0 - sin(2.0 * M_PI / (double) N * (double) i));
+            p->A[2 * i + 1] = 1.0 * (-1.0 * cos(2.0 * M_PI / (double) N * (double) i));
+            p->B[2 * i + 0] = 1.0 * ( 1.0 + sin(2.0 * M_PI / (double) N * (double) i));
+            p->B[2 * i + 1] = 1.0 * ( 1.0 * cos(2.0 * M_PI / (double) N * (double) i));
+        }
+    }
+
+    return p;
+
+cleanup:
+    ffts_free_1d_real(p);
+    return NULL;
+}
+\ No newline at end of file
diff --git a/src/ffts_real.h b/src/ffts_real.h
index d3f5316..81ca80f 100644
--- a/src/ffts_real.h
+++ b/src/ffts_real.h
@@ -1,10 +1,10 @@
 /*
- 
+
  This file is part of FFTS -- The Fastest Fourier Transform in the South
-  
+
  Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
- Copyright (c) 2012, The University of Waikato 
- 
+ Copyright (c) 2012, The University of Waikato
+
  All rights reserved.
 
  Redistribution and use in source and binary forms, with or without
@@ -31,24 +31,16 @@
 
 */
 
-#ifndef __FFTS_REAL_H__
-#define __FFTS_REAL_H__
+#ifndef FFTS_REAL_H
+#define FFTS_REAL_H
 
-#include <stdint.h>
-#include <stddef.h>
-#include <stdio.h>
+#if defined (_MSC_VER) && (_MSC_VER >= 1020)
+#pragma once
+#endif
 
 #include "ffts.h"
-
-#ifdef HAVE_NEON 
-	#include <arm_neon.h>
-#endif
-#ifdef HAVE_SSE
-	#include <xmmintrin.h>
-#endif
+#include <stddef.h>
 
 ffts_plan_t *ffts_init_1d_real(size_t N, int sign);
 
-#endif
-
-// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:
+#endif /* FFTS_REAL_H */
diff --git a/src/ffts_real_nd.c b/src/ffts_real_nd.c
index 151b72a..05bcc9c 100644
--- a/src/ffts_real_nd.c
+++ b/src/ffts_real_nd.c
@@ -32,199 +32,305 @@
 */
 
 #include "ffts_real_nd.h"
+#include "ffts_real.h"
 
 #ifdef __ARM_NEON__
 #include "neon.h"
 #endif
 
-void ffts_free_nd_real(ffts_plan_t *p) {
+#ifdef HAVE_NEON
+#include <arm_neon.h>
+#endif
+
+#ifdef HAVE_SSE
+#include <xmmintrin.h>
+#endif
+
+#include <stdio.h>
+
+static void ffts_free_nd_real(ffts_plan_t *p)
+{
+    if (p->plans) {
+        int i;
+
+        for (i = 0; i < p->rank; i++) {
+            ffts_plan_t *plan = p->plans[i];
+
+            if (plan) {
+                int j;
+
+                for (j = i + 1; j < p->rank; j++) {
+                    if (plan == p->plans[j]) {
+                        p->plans[j] = NULL;
+                    }
+                }
 
-	int i;
-	for(i=0;i<p->rank;i++) {
+                ffts_free(plan);
+            }
+        }
 
-		ffts_plan_t *x = p->plans[i];
+        free(p->plans);
+    }
 
-		int k;
-		for(k=i+1;k<p->rank;k++) {
-			if(x == p->plans[k]) p->plans[k] = NULL;
-		}
+    if (p->transpose_buf) {
+        ffts_aligned_free(p->transpose_buf);
+    }
 
-		if(x)	ffts_free(x);
-	}
+    if (p->buf) {
+        ffts_aligned_free(p->buf);
+    }
 
-	free(p->Ns);
-	free(p->Ms);
-	free(p->plans);
-	free(p->buf);
-	free(p->transpose_buf);
-	free(p);
+    if (p->Ns) {
+        free(p->Ns);
+    }
+
+    if (p->Ms) {
+        free(p->Ms);
+    }
+
+    free(p);
 }
 
-void ffts_scalar_transpose(uint64_t *src, uint64_t *dst, int w, int h, uint64_t *buf) {
-	int const bw = 1;
-	int const bh = 8;
-	int i = 0, j = 0;
-	for (; i <= h-bh; i += bh) {
-		for (j = 0; j <= w-bw; j += bw) {
-			uint64_t const * ib = &src[w*i + j];
-			uint64_t * ob = &dst[h*j + i];
-
-			uint64_t s_0_0 = ib[0*w+0];
-			uint64_t s_1_0 = ib[1*w+0];
-			uint64_t s_2_0 = ib[2*w+0];
-			uint64_t s_3_0 = ib[3*w+0];
-			uint64_t s_4_0 = ib[4*w+0];
-			uint64_t s_5_0 = ib[5*w+0];
-			uint64_t s_6_0 = ib[6*w+0];
-			uint64_t s_7_0 = ib[7*w+0];
-
-			ob[0*h+0] = s_0_0;
-			ob[0*h+1] = s_1_0;
-			ob[0*h+2] = s_2_0;
-			ob[0*h+3] = s_3_0;
-			ob[0*h+4] = s_4_0;
-			ob[0*h+5] = s_5_0;
-			ob[0*h+6] = s_6_0;
-			ob[0*h+7] = s_7_0;
-		}
-	}
-	if (i < h) {
-		for (int i1 = 0; i1 < w; i1++) {
-			for (int j = i; j < h; j++) {
-				dst[i1*h + j] = src[j*w + i1];
-			}
-		}
-	}
-	if (j < w) {
-		for (int i = j; i < w; i++) {
-			for (int j1 = 0; j1 < h; j1++) {
-				dst[i*h + j1] = src[j1*w + i];
-			}
-		}
-	}
+static void ffts_scalar_transpose(uint64_t *src, uint64_t *dst, int w, int h, uint64_t *buf)
+{
+    const int bw = 1;
+    const int bh = 8;
+    int i = 0, j = 0;
+
+    for (; i <= h - bh; i += bh) {
+        for (j = 0; j <= w - bw; j += bw) {
+            uint64_t const *ib = &src[w*i + j];
+            uint64_t *ob = &dst[h*j + i];
+
+            uint64_t s_0_0 = ib[0*w + 0];
+            uint64_t s_1_0 = ib[1*w + 0];
+            uint64_t s_2_0 = ib[2*w + 0];
+            uint64_t s_3_0 = ib[3*w + 0];
+            uint64_t s_4_0 = ib[4*w + 0];
+            uint64_t s_5_0 = ib[5*w + 0];
+            uint64_t s_6_0 = ib[6*w + 0];
+            uint64_t s_7_0 = ib[7*w + 0];
+
+            ob[0*h + 0] = s_0_0;
+            ob[0*h + 1] = s_1_0;
+            ob[0*h + 2] = s_2_0;
+            ob[0*h + 3] = s_3_0;
+            ob[0*h + 4] = s_4_0;
+            ob[0*h + 5] = s_5_0;
+            ob[0*h + 6] = s_6_0;
+            ob[0*h + 7] = s_7_0;
+        }
+    }
+
+    if (i < h) {
+        int i1;
+
+        for (i1 = 0; i1 < w; i1++) {
+            for (j = i; j < h; j++) {
+                dst[i1*h + j] = src[j*w + i1];
+            }
+        }
+    }
+
+    if (j < w) {
+        int j1;
+
+        for (i = j; i < w; i++) {
+            for (j1 = 0; j1 < h; j1++) {
+                dst[i*h + j1] = src[j1*w + i];
+            }
+        }
+    }
 }
 
-void ffts_execute_nd_real(ffts_plan_t *p, const void *  in, void *  out) {
+static void ffts_execute_nd_real(ffts_plan_t *p, const void *in, void *out)
+{
+    const size_t Ms0 = p->Ms[0];
+    const size_t Ns0 = p->Ns[0];
+
+    uint32_t *din = (uint32_t*) in;
+    uint64_t *buf = p->buf;
+    uint64_t *dout = (uint64_t*) out;
+    uint64_t *transpose_buf = (uint64_t*) p->transpose_buf;
+
+    ffts_plan_t *plan;
+    size_t i, j;
 
-	uint32_t *din = (uint32_t *)in;
-	uint64_t *buf = p->buf;
-	uint64_t *dout = (uint64_t *)out;
+    plan = p->plans[0];
+    for (i = 0; i < Ns0; i++) {
+        plan->transform(plan, din + (i * Ms0), buf + (i * (Ms0 / 2 + 1)));
+    }
 
-	size_t i,j;
-	for(i=0;i<p->Ns[0];i++) {
-		p->plans[0]->transform(p->plans[0], din + (i * p->Ms[0]), buf + (i * (p->Ms[0] / 2 + 1)));
-	}
-	ffts_scalar_transpose(buf, dout, p->Ms[0] / 2 + 1, p->Ns[0], p->transpose_buf);
+    ffts_scalar_transpose(buf, dout, Ms0 / 2 + 1, Ns0, transpose_buf);
 
-	for(i=1;i<p->rank;i++) {
-		for(j=0;j<p->Ns[i];j++) {
-			p->plans[i]->transform(p->plans[i], dout + (j * p->Ms[i]), buf + (j * p->Ms[i]));
-		}
-		ffts_scalar_transpose(buf, dout, p->Ms[i], p->Ns[i], p->transpose_buf);
-	}
+    for (i = 1; i < p->rank; i++) {
+        const size_t Ms = p->Ms[i];
+        const size_t Ns = p->Ns[i];
+
+        plan = p->plans[i];
+
+        for (j = 0; j < Ns; j++) {
+            plan->transform(plan, dout + (j * Ms), buf + (j * Ms));
+        }
+
+        ffts_scalar_transpose(buf, dout, Ms, Ns, transpose_buf);
+    }
 }
 
-void ffts_execute_nd_real_inv(ffts_plan_t *p, const void *  in, void *  out) {
+static void ffts_execute_nd_real_inv(ffts_plan_t *p, const void *in, void *out)
+{
+    const size_t Ms0 = p->Ms[0];
+    const size_t Ms1 = p->Ms[1];
+    const size_t Ns0 = p->Ns[0];
+    const size_t Ns1 = p->Ns[1];
+
+    uint64_t *din = (uint64_t*) in;
+    uint64_t *buf = p->buf;
+    uint64_t *buf2;
+    uint64_t *transpose_buf = (uint64_t*) p->transpose_buf;
+    float    *doutr = (float*) out;
 
-	uint64_t *din = (uint64_t *)in;
-	uint64_t *buf = p->buf;
-	uint64_t *buf2;
-	uint64_t *dout = (uint64_t *)out;
-	size_t vol = 1;
+    ffts_plan_t *plan;
+    size_t vol;
 
-	float *bufr = (float *)(p->buf);
-	float *doutr = (float *)out;
+    size_t i, j;
 
-	size_t i,j;
+    vol = p->Ns[0];
+    for (i = 1; i < p->rank; i++) {
+        vol *= p->Ns[i];
+    }
 
-	for(i=0;i<p->rank;i++) {
-		vol *= p->Ns[i];
-	}
+    buf2 = buf + vol;
 
-	buf2 = buf + vol;
+    ffts_scalar_transpose(din, buf, Ms0, Ns0, transpose_buf);
 
-	ffts_scalar_transpose(din, buf, p->Ms[0], p->Ns[0], p->transpose_buf);
+    plan = p->plans[0];
+    for (i = 0; i < Ms0; i++) {
+        plan->transform(plan, buf + (i * Ns0), buf2 + (i * Ns0));
+    }
 
-	for(i=0;i<p->Ms[0];i++) {
-		p->plans[0]->transform(p->plans[0], buf + (i * p->Ns[0]), buf2 + (i * p->Ns[0]));
-	}
+    ffts_scalar_transpose(buf2, buf, Ns0, Ms0, transpose_buf);
 
-	ffts_scalar_transpose(buf2, buf, p->Ns[0], p->Ms[0], p->transpose_buf);
-	for(j=0;j<p->Ms[1];j++) {
-  	p->plans[1]->transform(p->plans[1], buf + (j * (p->Ms[0])), &doutr[j * p->Ns[1]]);
-  }
+    plan = p->plans[1];
+    for (j = 0; j < Ms1; j++) {
+        plan->transform(plan, buf + (j * Ms0), &doutr[j * Ns1]);
+    }
 }
 
-ffts_plan_t *ffts_init_nd_real(int rank, size_t *Ns, int sign) {
-	size_t vol = 1;
-	size_t bufsize;
-
-	ffts_plan_t *p = malloc(sizeof(ffts_plan_t));
-
-	if(sign < 0) p->transform = &ffts_execute_nd_real;
-	else         p->transform = &ffts_execute_nd_real_inv;
-
-	p->destroy = &ffts_free_nd_real;
-
-	p->rank = rank;
-	p->Ns = malloc(sizeof(size_t) * rank);
-	p->Ms = malloc(sizeof(size_t) * rank);
-	p->plans = malloc(sizeof(ffts_plan_t **) * rank);
-	int i;
-	for(i=0;i<rank;i++) {
-		p->Ns[i] = Ns[i];
-		vol *= Ns[i];
-	}
-
-	//There is probably a prettier way of doing this, but it works..
-	if(sign < 0) {
-		bufsize = 2 * vol;
-	}
-	else {
-		bufsize = 2 * (Ns[0] * ((vol / Ns[0]) / 2 + 1) + vol);
-	}
-
-	p->buf = valloc(sizeof(float) * bufsize);
-
-	for(i=0;i<rank;i++) {
-		p->Ms[i] = vol / p->Ns[i];
-
-		p->plans[i] = NULL;
-		int k;
-
-		if(sign < 0) {
-			for(k=1;k<i;k++) {
-				if(p->Ms[k] == p->Ms[i]) p->plans[i] = p->plans[k];
-			}
-			if(!i)                p->plans[i] = ffts_init_1d_real(p->Ms[i], sign);
-			else if(!p->plans[i]) p->plans[i] = ffts_init_1d(p->Ms[i], sign);
-		}else{
-  		for(k=0;k<i;k++) {
-  			if(p->Ns[k] == p->Ns[i]) p->plans[i] = p->plans[k];
-  		}
-			if(i==rank-1)         p->plans[i] = ffts_init_1d_real(p->Ns[i], sign);
-			else if(!p->plans[i]) p->plans[i] = ffts_init_1d(p->Ns[i], sign);
-		}
-	}
-	if(sign < 0) {
-		for(i=1;i<rank;i++) {
-			p->Ns[i] = p->Ns[i] / 2 + 1;
-		}
-	}else{
-		for(i=0;i<rank-1;i++) {
-			p->Ms[i] = p->Ms[i] / 2 + 1;
-		}
-	}
-
-	p->transpose_buf = valloc(sizeof(float) * 2 * 8 * 8);
-	return p;
+ffts_plan_t *ffts_init_nd_real(int rank, size_t *Ns, int sign)
+{
+    int i;
+    size_t vol = 1;
+    size_t bufsize;
+    ffts_plan_t *p;
+
+    p = (ffts_plan_t*) calloc(1, sizeof(*p));
+    if (!p) {
+        return NULL;
+    }
+
+    if (sign < 0) {
+        p->transform = &ffts_execute_nd_real;
+    } else {
+        p->transform = &ffts_execute_nd_real_inv;
+    }
+
+    p->destroy = &ffts_free_nd_real;
+    p->rank    = rank;
+
+    p->Ms = (size_t*) malloc(rank * sizeof(*p->Ms));
+    if (!p->Ms) {
+        goto cleanup;
+    }
+
+    p->Ns = (size_t*) malloc(rank * sizeof(*p->Ns));
+    if (!p->Ns) {
+        goto cleanup;
+    }
+
+    for (i = 0; i < rank; i++) {
+        p->Ns[i] = Ns[i];
+        vol *= Ns[i];
+    }
+
+    /* there is probably a prettier way of doing this, but it works.. */
+    if (sign < 0) {
+        bufsize = 2 * vol;
+    } else {
+        bufsize = 2 * (Ns[0] * ((vol / Ns[0]) / 2 + 1) + vol);
+    }
+
+    p->buf = ffts_aligned_malloc(bufsize * sizeof(float));
+    if (!p->buf) {
+        goto cleanup;
+    }
+
+    p->transpose_buf = ffts_aligned_malloc(2 * 8 * 8 * sizeof(float));
+    if (!p->transpose_buf) {
+        goto cleanup;
+    }
+
+    p->plans = (ffts_plan_t**) calloc(rank, sizeof(*p->plans));
+    if (!p->plans) {
+        goto cleanup;
+    }
+
+    for (i = 0; i < rank; i++) {
+        int k;
+
+        p->Ms[i] = vol / p->Ns[i];
+
+        if (sign < 0) {
+            if (!i) {
+                p->plans[i] = ffts_init_1d_real(p->Ms[i], sign);
+            } else {
+                for (k = 1; k < i; k++) {
+                    if (p->Ms[k] == p->Ms[i]) {
+                        p->plans[i] = p->plans[k];
+                        break;
+                    }
+                }
+
+                if (!p->plans[i]) {
+                    p->plans[i] = ffts_init_1d(p->Ms[i], sign);
+                    p->Ns[i] = p->Ns[i] / 2 + 1;
+                }
+            }
+        } else {
+            if (i == rank - 1) {
+                p->plans[i] = ffts_init_1d_real(p->Ns[i], sign);
+            } else {
+                for (k = 0; k < i; k++) {
+                    if (p->Ns[k] == p->Ns[i]) {
+                        p->plans[i] = p->plans[k];
+                        break;
+                    }
+                }
+
+                if (!p->plans[i]) {
+                    p->plans[i] = ffts_init_1d(p->Ns[i], sign);
+                    p->Ms[i] = p->Ms[i] / 2 + 1;
+                }
+            }
+        }
+
+        if (!p->plans[i]) {
+            goto cleanup;
+        }
+    }
+
+    return p;
+
+cleanup:
+    ffts_free_nd_real(p);
+    return NULL;
 }
 
+ffts_plan_t *ffts_init_2d_real(size_t N1, size_t N2, int sign)
+{
+    size_t Ns[2];
 
-ffts_plan_t *ffts_init_2d_real(size_t N1, size_t N2, int sign) {
-	size_t Ns[2];
-	Ns[0] = N1;
-	Ns[1] = N2;
-	return ffts_init_nd_real(2, Ns, sign);
+    Ns[0] = N1;
+    Ns[1] = N2;
+    return ffts_init_nd_real(2, Ns, sign);
 }
-// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:
diff --git a/src/ffts_real_nd.h b/src/ffts_real_nd.h
index bc8ed75..d23a002 100644
--- a/src/ffts_real_nd.h
+++ b/src/ffts_real_nd.h
@@ -1,10 +1,10 @@
 /*
- 
+
  This file is part of FFTS -- The Fastest Fourier Transform in the South
-  
+
  Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
- Copyright (c) 2012, The University of Waikato 
- 
+ Copyright (c) 2012, The University of Waikato
+
  All rights reserved.
 
  Redistribution and use in source and binary forms, with or without
@@ -31,24 +31,17 @@
 
 */
 
-#ifndef __FFTS_REAL_ND_H__
-#define __FFTS_REAL_ND_H__
+#ifndef FFTS_REAL_ND_H
+#define FFTS_REAL_ND_H
 
-#include <stdint.h>
-#include <stddef.h>
-#include <stdio.h>
+#if defined (_MSC_VER) && (_MSC_VER >= 1020)
+#pragma once
+#endif
 
-#include "ffts_nd.h"
-#include "ffts_real.h"
 #include "ffts.h"
+#include <stddef.h>
 
-#ifdef HAVE_NEON 
-	#include <arm_neon.h>
-#endif
-#ifdef HAVE_SSE
-	#include <xmmintrin.h>
-#endif
-
-#endif
+ffts_plan_t *ffts_init_nd_real(int rank, size_t *Ns, int sign);
+ffts_plan_t *ffts_init_2d_real(size_t N1, size_t N2, int sign);
 
-// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:
+#endif /* FFTS_REAL_ND_H */
+\ No newline at end of file
diff --git a/src/ffts_small.c b/src/ffts_small.c
index e53493c..6f700c6 100644
--- a/src/ffts_small.c
+++ b/src/ffts_small.c
@@ -1,10 +1,10 @@
 /*
- 
+
  This file is part of FFTS -- The Fastest Fourier Transform in the South
-  
- Copyright (c) 2013, Michael J. Cree <mcree@orcon.net.nz> 
+
+ Copyright (c) 2013, Michael J. Cree <mcree@orcon.net.nz>
  Copyright (c) 2012, 2013, Anthony M. Blake <amb@anthonix.com>
- 
+
  All rights reserved.
 
  Redistribution and use in source and binary forms, with or without
@@ -31,127 +31,153 @@
 
 */
 
-#include "ffts.h"
+#include "ffts_small.h"
 #include "macros.h"
 
 #include <stdlib.h>
 
-#define DEBUG(x)
-
-#include "ffts_small.h"
-
- void firstpass_16_f(ffts_plan_t *  p, const void *  in, void *  out)
+void ffts_firstpass_16_f(ffts_plan_t *p, const void *in, void *out)
 {
-    const data_t *din = (const data_t *)in;
-    data_t *dout = (data_t *)out;
-    V r0_1,r2_3,r4_5,r6_7,r8_9,r10_11,r12_13,r14_15;
-    float *LUT8 = p->ws;
-
-    L_4_4(0, din+0,din+16,din+8,din+24,&r0_1,&r2_3,&r8_9,&r10_11);
-    L_2_4(0, din+4,din+20,din+28,din+12,&r4_5,&r6_7,&r14_15,&r12_13);
-    K_N(0, VLD(LUT8),VLD(LUT8+4),&r0_1,&r2_3,&r4_5,&r6_7);
-    K_N(0, VLD(LUT8+8),VLD(LUT8+12),&r0_1,&r4_5,&r8_9,&r12_13);
-    S_4(r0_1,r4_5,r8_9,r12_13,dout+0,dout+8,dout+16,dout+24);
-    K_N(0, VLD(LUT8+16),VLD(LUT8+20),&r2_3,&r6_7,&r10_11,&r14_15);
-    S_4(r2_3,r6_7,r10_11,r14_15,dout+4,dout+12,dout+20,dout+28);
+    const data_t *din = (const data_t*) in;
+    data_t *dout = (data_t*) out;
+    float *LUT8 = (float*) p->ws;
+    V r0_1, r2_3, r4_5, r6_7, r8_9, r10_11, r12_13, r14_15;
+
+    L_4_4(0, din+0, din+16, din+8, din+24, &r0_1, &r2_3, &r8_9, &r10_11);
+    L_2_4(0, din+4, din+20, din+28, din+12, &r4_5, &r6_7, &r14_15, &r12_13);
+    K_N(0, VLD(LUT8), VLD(LUT8+4), &r0_1, &r2_3, &r4_5, &r6_7);
+    K_N(0, VLD(LUT8+8), VLD(LUT8+12), &r0_1, &r4_5, &r8_9, &r12_13);
+    S_4(r0_1, r4_5, r8_9, r12_13, dout+0, dout+8, dout+16, dout+24);
+    K_N(0, VLD(LUT8+16), VLD(LUT8+20), &r2_3, &r6_7, &r10_11, &r14_15);
+    S_4(r2_3, r6_7, r10_11, r14_15, dout+4, dout+12, dout+20, dout+28);
 }
 
- void firstpass_16_b(ffts_plan_t *  p, const void *  in, void *  out)
+void ffts_firstpass_16_b(ffts_plan_t *p, const void *in, void *out)
 {
-    const data_t *din = (const data_t *)in;
-    data_t *dout = (data_t *)out;
-    V r0_1,r2_3,r4_5,r6_7,r8_9,r10_11,r12_13,r14_15;
-    float *LUT8 = p->ws;
-
-    L_4_4(1, din+0,din+16,din+8,din+24,&r0_1,&r2_3,&r8_9,&r10_11);
-    L_2_4(1, din+4,din+20,din+28,din+12,&r4_5,&r6_7,&r14_15,&r12_13);
-    K_N(1, VLD(LUT8),VLD(LUT8+4),&r0_1,&r2_3,&r4_5,&r6_7);
-    K_N(1, VLD(LUT8+8),VLD(LUT8+12),&r0_1,&r4_5,&r8_9,&r12_13);
-    S_4(r0_1,r4_5,r8_9,r12_13,dout+0,dout+8,dout+16,dout+24);
-    K_N(1, VLD(LUT8+16),VLD(LUT8+20),&r2_3,&r6_7,&r10_11,&r14_15);
-    S_4(r2_3,r6_7,r10_11,r14_15,dout+4,dout+12,dout+20,dout+28);
-}
+    const data_t *din = (const data_t*) in;
+    data_t *dout = (data_t*) out;
+    float *LUT8 = (float*) p->ws;
+    V r0_1, r2_3, r4_5, r6_7, r8_9, r10_11, r12_13, r14_15;
 
+    L_4_4(1, din+0, din+16, din+8, din+24, &r0_1, &r2_3, &r8_9, &r10_11);
+    L_2_4(1, din+4, din+20, din+28, din+12, &r4_5, &r6_7, &r14_15, &r12_13);
+    K_N(1, VLD(LUT8), VLD(LUT8+4), &r0_1, &r2_3, &r4_5, &r6_7);
+    K_N(1, VLD(LUT8+8), VLD(LUT8+12),&r0_1, &r4_5, &r8_9, &r12_13);
+    S_4(r0_1, r4_5, r8_9, r12_13, dout+0, dout+8, dout+16, dout+24);
+    K_N(1, VLD(LUT8+16), VLD(LUT8+20), &r2_3, &r6_7, &r10_11, &r14_15);
+    S_4(r2_3, r6_7, r10_11, r14_15, dout+4, dout+12, dout+20, dout+28);
+}
 
- void firstpass_8_f(ffts_plan_t *p, const void *in, void *out)
+void ffts_firstpass_8_f(ffts_plan_t *p, const void *in, void *out)
 {
-    const data_t *din = (const data_t *)in;
-    data_t *dout = (data_t *)out;
+    const data_t *din = (const data_t*) in;
+    data_t *dout = (data_t*) out;
     V r0_1, r2_3, r4_5, r6_7;
-    float *LUT8 = p->ws + p->ws_is[0];
+    float *LUT8 = (float*) p->ws + p->ws_is[0];
 
     L_4_2(0, din, din+8, din+4, din+12, &r0_1, &r2_3, &r4_5, &r6_7);
     K_N(0, VLD(LUT8), VLD(LUT8+4), &r0_1, &r2_3, &r4_5, &r6_7);
-    S_4(r0_1,r2_3,r4_5,r6_7,dout+0,dout+4,dout+8,dout+12);
+    S_4(r0_1, r2_3, r4_5, r6_7, dout+0, dout+4, dout+8, dout+12);
 }
 
- void firstpass_8_b(ffts_plan_t *p, const void *in, void *out)
+void ffts_firstpass_8_b(ffts_plan_t *p, const void *in, void *out)
 {
-    const data_t *din = (const data_t *)in;
-    data_t *dout = (data_t *)out;
+    const data_t *din = (const data_t*) in;
+    data_t *dout = (data_t*) out;
     V r0_1, r2_3, r4_5, r6_7;
-    float *LUT8 = p->ws + p->ws_is[0];
+    float *LUT8 = (float*) p->ws + p->ws_is[0];
 
     L_4_2(1, din, din+8, din+4, din+12, &r0_1, &r2_3, &r4_5, &r6_7);
     K_N(1, VLD(LUT8), VLD(LUT8+4), &r0_1, &r2_3, &r4_5, &r6_7);
-    S_4(r0_1,r2_3,r4_5,r6_7,dout+0,dout+4,dout+8,dout+12);
+    S_4(r0_1, r2_3, r4_5, r6_7, dout+0, dout+4, dout+8, dout+12);
 }
 
-
- void firstpass_4_f(ffts_plan_t *p, const void *in, void *out)
+void ffts_firstpass_4_f(ffts_plan_t *p, const void *in, void *out)
 {
-    const data_t *din = (const data_t *)in;
-    data_t *dout = (data_t *)out;
+    const data_t *din = (const data_t*) in;
+    data_t *dout = (data_t*) out;
     cdata_t t0, t1, t2, t3, t4, t5, t6, t7;
-    t0[0] = din[0]; t0[1] = din[1];
-    t1[0] = din[4]; t1[1] = din[5];
-    t2[0] = din[2]; t2[1] = din[3];
-    t3[0] = din[6]; t3[1] = din[7];
-        
-    t4[0] = t0[0] + t1[0]; t4[1] = t0[1] + t1[1];
-    t5[0] = t0[0] - t1[0]; t5[1] = t0[1] - t1[1];
-    t6[0] = t2[0] + t3[0]; t6[1] = t2[1] + t3[1];
-    t7[0] = t2[0] - t3[0]; t7[1] = t2[1] - t3[1];
-
-    dout[0] = t4[0] + t6[0]; dout[1] = t4[1] + t6[1];
-    dout[4] = t4[0] - t6[0]; dout[5] = t4[1] - t6[1];
-    dout[2] = t5[0] + t7[1]; dout[3] = t5[1] - t7[0];
-    dout[6] = t5[0] - t7[1]; dout[7] = t5[1] + t7[0];
+
+    t0[0] = din[0];
+    t0[1] = din[1];
+    t1[0] = din[4];
+    t1[1] = din[5];
+    t2[0] = din[2];
+    t2[1] = din[3];
+    t3[0] = din[6];
+    t3[1] = din[7];
+
+    t4[0] = t0[0] + t1[0];
+    t4[1] = t0[1] + t1[1];
+    t5[0] = t0[0] - t1[0];
+    t5[1] = t0[1] - t1[1];
+    t6[0] = t2[0] + t3[0];
+    t6[1] = t2[1] + t3[1];
+    t7[0] = t2[0] - t3[0];
+    t7[1] = t2[1] - t3[1];
+
+    dout[0] = t4[0] + t6[0];
+    dout[1] = t4[1] + t6[1];
+    dout[4] = t4[0] - t6[0];
+    dout[5] = t4[1] - t6[1];
+    dout[2] = t5[0] + t7[1];
+    dout[3] = t5[1] - t7[0];
+    dout[6] = t5[0] - t7[1];
+    dout[7] = t5[1] + t7[0];
 }
 
- void firstpass_4_b(ffts_plan_t *p, const void *in, void *out)
+void ffts_firstpass_4_b(ffts_plan_t *p, const void *in, void *out)
 {
-    const data_t *din = (const data_t *)in;
-    data_t *dout = (data_t *)out;
+    const data_t *din = (const data_t*) in;
+    data_t *dout = (data_t*) out;
     cdata_t t0, t1, t2, t3, t4, t5, t6, t7;
-    t0[0] = din[0]; t0[1] = din[1];
-    t1[0] = din[4]; t1[1] = din[5];
-    t2[0] = din[2]; t2[1] = din[3];
-    t3[0] = din[6]; t3[1] = din[7];
-        
-    t4[0] = t0[0] + t1[0]; t4[1] = t0[1] + t1[1];
-    t5[0] = t0[0] - t1[0]; t5[1] = t0[1] - t1[1];
-    t6[0] = t2[0] + t3[0]; t6[1] = t2[1] + t3[1];
-    t7[0] = t2[0] - t3[0]; t7[1] = t2[1] - t3[1];
-
-    dout[0] = t4[0] + t6[0]; dout[1] = t4[1] + t6[1];
-    dout[4] = t4[0] - t6[0]; dout[5] = t4[1] - t6[1];
-    dout[2] = t5[0] - t7[1]; dout[3] = t5[1] + t7[0];
-    dout[6] = t5[0] + t7[1]; dout[7] = t5[1] - t7[0];
+
+    t0[0] = din[0];
+    t0[1] = din[1];
+    t1[0] = din[4];
+    t1[1] = din[5];
+    t2[0] = din[2];
+    t2[1] = din[3];
+    t3[0] = din[6];
+    t3[1] = din[7];
+
+    t4[0] = t0[0] + t1[0];
+    t4[1] = t0[1] + t1[1];
+    t5[0] = t0[0] - t1[0];
+    t5[1] = t0[1] - t1[1];
+    t6[0] = t2[0] + t3[0];
+    t6[1] = t2[1] + t3[1];
+    t7[0] = t2[0] - t3[0];
+    t7[1] = t2[1] - t3[1];
+
+    dout[0] = t4[0] + t6[0];
+    dout[1] = t4[1] + t6[1];
+    dout[4] = t4[0] - t6[0];
+    dout[5] = t4[1] - t6[1];
+    dout[2] = t5[0] - t7[1];
+    dout[3] = t5[1] + t7[0];
+    dout[6] = t5[0] + t7[1];
+    dout[7] = t5[1] - t7[0];
 }
 
- void firstpass_2(ffts_plan_t *p, const void *in, void *out)
+void ffts_firstpass_2(ffts_plan_t *p, const void *in, void *out)
 {
-    const data_t *din = (const data_t *)in;
-    data_t *dout = (data_t *)out;
-    cdata_t t0, t1, r0,r1;
-    t0[0] = din[0]; t0[1] = din[1];
-    t1[0] = din[2]; t1[1] = din[3];
+    const data_t *din = (const data_t*) in;
+    data_t *dout = (data_t*) out;
+    cdata_t t0, t1, r0, r1;
+
+    t0[0] = din[0];
+    t0[1] = din[1];
+    t1[0] = din[2];
+    t1[1] = din[3];
+
     r0[0] = t0[0] + t1[0];
     r0[1] = t0[1] + t1[1];
     r1[0] = t0[0] - t1[0];
     r1[1] = t0[1] - t1[1];
-    dout[0] = r0[0]; dout[1] = r0[1];
-    dout[2] = r1[0]; dout[3] = r1[1];
-}
-// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:
+
+    dout[0] = r0[0];
+    dout[1] = r0[1];
+    dout[2] = r1[0];
+    dout[3] = r1[1];
+}
+\ No newline at end of file
diff --git a/src/ffts_small.h b/src/ffts_small.h
index 683537a..5ae48cc 100644
--- a/src/ffts_small.h
+++ b/src/ffts_small.h
@@ -1,14 +1,14 @@
-#ifndef __FFTS_SMALL_H__
-#define __FFTS_SMALL_H__
+#ifndef FFTS_SMALL_H
+#define FFTS_SMALL_H
 
+#include "ffts.h"
 
-void firstpass_16_f(ffts_plan_t *  p, const void *  in, void *  out);
-void firstpass_16_b(ffts_plan_t *  p, const void *  in, void *  out);
-void firstpass_8_f(ffts_plan_t *  p, const void *  in, void *  out);
-void firstpass_8_b(ffts_plan_t *  p, const void *  in, void *  out);
-void firstpass_4_f(ffts_plan_t *  p, const void *  in, void *  out);
-void firstpass_4_b(ffts_plan_t *  p, const void *  in, void *  out);
-void firstpass_2(ffts_plan_t *  p, const void *  in, void *  out);
+void ffts_firstpass_16_f(ffts_plan_t *p, const void *in, void *out);
+void ffts_firstpass_16_b(ffts_plan_t *p, const void *in, void *out);
+void ffts_firstpass_8_f(ffts_plan_t *p, const void *in, void *out);
+void ffts_firstpass_8_b(ffts_plan_t *p, const void *in, void *out);
+void ffts_firstpass_4_f(ffts_plan_t *p, const void *in, void *out);
+void ffts_firstpass_4_b(ffts_plan_t *p, const void *in, void *out);
+void ffts_firstpass_2(ffts_plan_t *p, const void *in, void *out);
 
-#endif
-// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:
+#endif /* FFTS_SMALL_H */