1 files changed, 175 insertions, 133 deletions
diff --git a/src/macros.h b/src/macros.h
index 08029a3..b755775 100644
--- a/src/macros.h
+++ b/src/macros.h
@@ -1,162 +1,204 @@
 /*
- 
- This file is part of FFTS -- The Fastest Fourier Transform in the South
-  
- Copyright (c) 2013, Michael J. Cree <mcree@orcon.net.nz> 
- Copyright (c) 2012, 2013, Anthony M. Blake <amb@anthonix.com>
- 
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
- 	* Redistributions of source code must retain the above copyright
- 		notice, this list of conditions and the following disclaimer.
- 	* Redistributions in binary form must reproduce the above copyright
- 		notice, this list of conditions and the following disclaimer in the
- 		documentation and/or other materials provided with the distribution.
- 	* Neither the name of the organization nor the
-	  names of its contributors may be used to endorse or promote products
- 		derived from this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
- DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+This file is part of FFTS -- The Fastest Fourier Transform in the South
+
+Copyright (c) 2013, Michael J. Cree <mcree@orcon.net.nz>
+Copyright (c) 2012, 2013, Anthony M. Blake <amb@anthonix.com>
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+* Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+* Neither the name of the organization nor the
+names of its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 */
 
-#ifndef __MACROS_H__
-#define __MACROS_H__
+#ifndef FFTS_MACROS_H
+#define FFTS_MACROS_H
+
+#if defined (_MSC_VER) && (_MSC_VER >= 1020)
+#pragma once
+#endif
 
 #ifdef HAVE_NEON
 #include "macros-neon.h"
+#elif HAVE_SSE
+#include "macros-sse.h"
+#elif __powerpc__
+#include "macros-altivec.h"
 #else
-#ifdef __alpha__
 #include "macros-alpha.h"
-#else
-#ifdef __powerpc__
-#include "macros-altivec.h"
-#endif
 #endif
 
-#endif
+static FFTS_INLINE void
+V4SF_TX2(V4SF *a, V4SF *b)
+{
+    V4SF t0 = V4SF_UNPACK_LO(*a, *b);
+    V4SF t1 = V4SF_UNPACK_HI(*a, *b);
+    *a = t0;
+    *b = t1;
+}
 
+static FFTS_INLINE void
+V4SF_K_N(int inv,
+         V4SF re,
+         V4SF im,
+         V4SF *r0,
+         V4SF *r1,
+         V4SF *r2,
+         V4SF *r3)
+{
+    V4SF uk, uk2, zk_p, zk_n, zk, zk_d;
 
-#ifdef HAVE_VFP
-#include "macros-alpha.h"
-#endif
-#ifdef HAVE_SSE
-	#include "macros-sse.h"
-#endif
+    uk  = *r0;
+    uk2 = *r1;
 
-static inline void TX2(V *a, V *b)
-{
-    V TX2_t0 = VUNPACKLO(*a, *b);
-    V TX2_t1 = VUNPACKHI(*a, *b);
-    *a = TX2_t0; *b = TX2_t1; 
+    zk_p = V4SF_IMUL(*r2, re, im);
+    zk_n = V4SF_IMULJ(*r3, re, im);
+
+    zk   = V4SF_ADD(zk_p, zk_n);
+    zk_d = V4SF_IMULI(inv, V4SF_SUB(zk_p, zk_n));
+
+    *r2 = V4SF_SUB(uk, zk);
+    *r0 = V4SF_ADD(uk, zk);
+    *r3 = V4SF_ADD(uk2, zk_d);
+    *r1 = V4SF_SUB(uk2, zk_d);
 }
 
-static inline void K_N(int inv, V re, V im, V *r0, V *r1, V *r2, V *r3)
+static FFTS_INLINE void
+V4SF_L_2_4(int inv,
+           const float *FFTS_RESTRICT i0,
+           const float *FFTS_RESTRICT i1,
+           const float *FFTS_RESTRICT i2,
+           const float *FFTS_RESTRICT i3,
+           V4SF *r0,
+           V4SF *r1,
+           V4SF *r2,
+           V4SF *r3)
 {
-    V uk, uk2, zk_p, zk_n, zk, zk_d;
-    uk   = *r0; uk2  = *r1;
-    zk_p = IMUL(*r2, re, im);
-    zk_n = IMULJ(*r3, re, im);
-
-    zk   = VADD(zk_p, zk_n);
-    zk_d = IMULI(inv, VSUB(zk_p, zk_n));
-  
-    *r2 = VSUB(uk, zk);
-    *r0 = VADD(uk, zk);
-    *r3 = VADD(uk2, zk_d);
-    *r1 = VSUB(uk2, zk_d);
-}
+    V4SF t0, t1, t2, t3, t4, t5, t6, t7;
 
+    t0 = V4SF_LD(i0);
+    t1 = V4SF_LD(i1);
+    t2 = V4SF_LD(i2);
+    t3 = V4SF_LD(i3);
 
-static inline void S_4(V r0, V r1, V r2, V r3, 
-		       data_t * restrict o0, data_t * restrict o1,
-		       data_t * restrict o2, data_t * restrict o3)
-{
-    VST(o0, r0); VST(o1, r1); VST(o2, r2); VST(o3, r3);
-}
+    t4 = V4SF_ADD(t0, t1);
+    t5 = V4SF_SUB(t0, t1);
+    t6 = V4SF_ADD(t2, t3);
+    t7 = V4SF_SUB(t2, t3);
 
+    *r0 = V4SF_UNPACK_LO(t4, t5);
+    *r1 = V4SF_UNPACK_LO(t6, t7);
 
-static inline void L_2_4(int inv, 
-			 const data_t * restrict i0, const data_t * restrict i1,
-			 const data_t * restrict i2, const data_t * restrict i3,
-			 V *r0, V *r1, V *r2, V *r3)
-{
-    V t0, t1, t2, t3, t4, t5, t6, t7;
-
-    t0 = VLD(i0);    t1 = VLD(i1);    t2 = VLD(i2);    t3 = VLD(i3);    
-    t4 = VADD(t0, t1);
-    t5 = VSUB(t0, t1);
-    t6 = VADD(t2, t3);
-    t7 = VSUB(t2, t3);
-    *r0 = VUNPACKLO(t4, t5);
-    *r1 = VUNPACKLO(t6, t7);
-    t5 = IMULI(inv, t5);
-    t0 = VADD(t6, t4);
-    t2 = VSUB(t6, t4);
-    t1 = VSUB(t7, t5);
-    t3 = VADD(t7, t5);
-    *r3 = VUNPACKHI(t0, t1);
-    *r2 = VUNPACKHI(t2, t3);
-}
+    t5 = V4SF_IMULI(inv, t5);
 
+    t0 = V4SF_ADD(t6, t4);
+    t2 = V4SF_SUB(t6, t4);
+    t1 = V4SF_SUB(t7, t5);
+    t3 = V4SF_ADD(t7, t5);
 
-static inline void L_4_4(int inv,  
-			 const data_t * restrict i0, const data_t * restrict i1,
-			 const data_t * restrict i2, const data_t * restrict i3,
-			 V *r0, V *r1, V *r2, V *r3)
-{
-    V t0, t1, t2, t3, t4, t5, t6, t7;
- 
-    t0 = VLD(i0);    t1 = VLD(i1);    t2 = VLD(i2);    t3 = VLD(i3);   
-    t4 = VADD(t0, t1);
-    t5 = VSUB(t0, t1);
-    t6 = VADD(t2, t3);
-    t7 = IMULI(inv, VSUB(t2, t3));
-    t0 = VADD(t4, t6);
-    t2 = VSUB(t4, t6);
-    t1 = VSUB(t5, t7);
-    t3 = VADD(t5, t7);
-    TX2(&t0, &t1);
-    TX2(&t2, &t3);
-    *r0 = t0; *r2 = t1; *r1 = t2; *r3 = t3; 
+    *r3 = V4SF_UNPACK_HI(t0, t1);
+    *r2 = V4SF_UNPACK_HI(t2, t3);
 }
 
+static FFTS_INLINE void
+V4SF_L_4_4(int inv,
+           const float *FFTS_RESTRICT i0,
+           const float *FFTS_RESTRICT i1,
+           const float *FFTS_RESTRICT i2,
+           const float *FFTS_RESTRICT i3,
+           V4SF *r0,
+           V4SF *r1,
+           V4SF *r2,
+           V4SF *r3)
+{
+    V4SF t0, t1, t2, t3, t4, t5, t6, t7;
+
+    t0 = V4SF_LD(i0);
+    t1 = V4SF_LD(i1);
+    t2 = V4SF_LD(i2);
+    t3 = V4SF_LD(i3);
 
+    t4 = V4SF_ADD(t0, t1);
+    t5 = V4SF_SUB(t0, t1);
+    t6 = V4SF_ADD(t2, t3);
+
+    t7 = V4SF_IMULI(inv, V4SF_SUB(t2, t3));
+
+    t0 = V4SF_ADD(t4, t6);
+    t2 = V4SF_SUB(t4, t6);
+    t1 = V4SF_SUB(t5, t7);
+    t3 = V4SF_ADD(t5, t7);
+
+    V4SF_TX2(&t0, &t1);
+    V4SF_TX2(&t2, &t3);
+
+    *r0 = t0;
+    *r2 = t1;
+    *r1 = t2;
+    *r3 = t3;
+}
 
-static inline void L_4_2(int inv,  
-			 const data_t * restrict i0, const data_t * restrict i1,
-			 const data_t * restrict i2, const data_t * restrict i3,
-			 V *r0, V *r1, V *r2, V *r3)
+static FFTS_INLINE void
+V4SF_L_4_2(int inv,
+           const float *FFTS_RESTRICT i0,
+           const float *FFTS_RESTRICT i1,
+           const float *FFTS_RESTRICT i2,
+           const float *FFTS_RESTRICT i3,
+           V4SF *r0,
+           V4SF *r1,
+           V4SF *r2,
+           V4SF *r3)
 {
-    V t0, t1, t2, t3, t4, t5, t6, t7;
-
-    t0 = VLD(i0);    t1 = VLD(i1);    t6 = VLD(i2);    t7 = VLD(i3);
-    t2 = VBLEND(t6, t7);
-    t3 = VBLEND(t7, t6);
-    t4 = VADD(t0, t1);
-    t5 = VSUB(t0, t1);
-    t6 = VADD(t2, t3);
-    t7 = VSUB(t2, t3);
-    *r2 = VUNPACKHI(t4, t5);
-    *r3 = VUNPACKHI(t6, t7); 
-    t7 = IMULI(inv, t7);
-    t0 = VADD(t4, t6);
-    t2 = VSUB(t4, t6);
-    t1 = VSUB(t5, t7);
-    t3 = VADD(t5, t7);
-    *r0 = VUNPACKLO(t0, t1);
-    *r1 = VUNPACKLO(t2, t3);
+    V4SF t0, t1, t2, t3, t4, t5, t6, t7;
+
+    t0 = V4SF_LD(i0);
+    t1 = V4SF_LD(i1);
+    t6 = V4SF_LD(i2);
+    t7 = V4SF_LD(i3);
+
+    t2 = V4SF_BLEND(t6, t7);
+    t3 = V4SF_BLEND(t7, t6);
+
+    t4 = V4SF_ADD(t0, t1);
+    t5 = V4SF_SUB(t0, t1);
+    t6 = V4SF_ADD(t2, t3);
+    t7 = V4SF_SUB(t2, t3);
+
+    *r2 = V4SF_UNPACK_HI(t4, t5);
+    *r3 = V4SF_UNPACK_HI(t6, t7);
+
+    t7 = V4SF_IMULI(inv, t7);
+
+    t0 = V4SF_ADD(t4, t6);
+    t2 = V4SF_SUB(t4, t6);
+    t1 = V4SF_SUB(t5, t7);
+    t3 = V4SF_ADD(t5, t7);
+
+    *r0 = V4SF_UNPACK_LO(t0, t1);
+    *r1 = V4SF_UNPACK_LO(t2, t3);
 }
-#endif
-// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:
+
+#define V4SF_S_4(r0, r1, r2, r3, o0, o1, o2, o3) \
+    V4SF_ST(o0, r0); V4SF_ST(o1, r1); V4SF_ST(o2, r2); V4SF_ST(o3, r3);
+
+#endif /* FFTS_MACROS_H */
+\ No newline at end of file