summaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
authorAnthony Blake <anthonix@me.com>2013-04-22 16:45:17 +1200
committerAnthony Blake <anthonix@me.com>2013-04-22 16:45:17 +1200
commit7b3907cff81fb82380787e63e4304fb8af807c0c (patch)
tree11ae035cd02aab22e1a4069737fb6f422a7531eb /src
parenta45464980b8de7faef21eb46479f7e09fd056441 (diff)
downloadffts-7b3907cff81fb82380787e63e4304fb8af807c0c.zip
ffts-7b3907cff81fb82380787e63e4304fb8af807c0c.tar.gz
Fixed up the smaller VFP transforms. Inverse VFP and real/nd VFP still not working yet.
Diffstat (limited to 'src')
-rw-r--r--src/Makefile.am2
-rw-r--r--src/Makefile.in16
-rw-r--r--src/ffts.c20
-rw-r--r--src/ffts_nd.c6
-rw-r--r--src/ffts_nd.h5
-rw-r--r--src/ffts_real.h5
-rw-r--r--src/ffts_real_nd.h6
-rw-r--r--src/macros.h398
-rw-r--r--src/sse_float.h18
9 files changed, 168 insertions, 308 deletions
diff --git a/src/Makefile.am b/src/Makefile.am
index d439a2c..a07becc 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -2,7 +2,7 @@
lib_LTLIBRARIES = libffts.la
-libffts_la_SOURCES = ffts.c ffts_nd.c ffts_real.c ffts_real_nd.c patterns.c
+libffts_la_SOURCES = ffts.c ffts_small.c ffts_nd.c ffts_real.c ffts_real_nd.c patterns.c
if DYNAMIC_DISABLED
libffts_la_SOURCES += ffts_static.c
diff --git a/src/Makefile.in b/src/Makefile.in
index 49590ff..97bb79f 100644
--- a/src/Makefile.in
+++ b/src/Makefile.in
@@ -99,9 +99,9 @@ am__installdirs = "$(DESTDIR)$(libdir)" \
"$(DESTDIR)$(libffts_includedir)"
LTLIBRARIES = $(lib_LTLIBRARIES)
libffts_la_LIBADD =
-am__libffts_la_SOURCES_DIST = ffts.c ffts_nd.c ffts_real.c \
- ffts_real_nd.c patterns.c ffts_static.c codegen.c vfp.s \
- neon_static_f.s neon_static_i.s neon.s sse.s
+am__libffts_la_SOURCES_DIST = ffts.c ffts_small.c ffts_nd.c \
+ ffts_real.c ffts_real_nd.c patterns.c ffts_static.c codegen.c \
+ vfp.s neon_static_f.s neon_static_i.s neon.s sse.s
@DYNAMIC_DISABLED_TRUE@am__objects_1 = ffts_static.lo
@DYNAMIC_DISABLED_FALSE@am__objects_2 = codegen.lo
@HAVE_VFP_TRUE@am__objects_3 = vfp.lo
@@ -110,7 +110,7 @@ am__libffts_la_SOURCES_DIST = ffts.c ffts_nd.c ffts_real.c \
@DYNAMIC_DISABLED_FALSE@@HAVE_NEON_TRUE@@HAVE_VFP_FALSE@am__objects_5 = neon.lo
@HAVE_NEON_FALSE@@HAVE_SSE_TRUE@@HAVE_VFP_FALSE@am__objects_6 = \
@HAVE_NEON_FALSE@@HAVE_SSE_TRUE@@HAVE_VFP_FALSE@ sse.lo
-am_libffts_la_OBJECTS = ffts.lo ffts_nd.lo ffts_real.lo \
+am_libffts_la_OBJECTS = ffts.lo ffts_small.lo ffts_nd.lo ffts_real.lo \
ffts_real_nd.lo patterns.lo $(am__objects_1) $(am__objects_2) \
$(am__objects_3) $(am__objects_4) $(am__objects_5) \
$(am__objects_6)
@@ -264,9 +264,10 @@ top_build_prefix = @top_build_prefix@
top_builddir = @top_builddir@
top_srcdir = @top_srcdir@
lib_LTLIBRARIES = libffts.la
-libffts_la_SOURCES = ffts.c ffts_nd.c ffts_real.c ffts_real_nd.c \
- patterns.c $(am__append_1) $(am__append_2) $(am__append_3) \
- $(am__append_4) $(am__append_5) $(am__append_6)
+libffts_la_SOURCES = ffts.c ffts_small.c ffts_nd.c ffts_real.c \
+ ffts_real_nd.c patterns.c $(am__append_1) $(am__append_2) \
+ $(am__append_3) $(am__append_4) $(am__append_5) \
+ $(am__append_6)
libffts_includedir = $(includedir)/ffts
libffts_include_HEADERS = ../include/ffts.h
all: all-am
@@ -351,6 +352,7 @@ distclean-compile:
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ffts_nd.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ffts_real.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ffts_real_nd.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ffts_small.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ffts_static.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/patterns.Plo@am__quote@
diff --git a/src/ffts.c b/src/ffts.c
index fe4b590..e83bf6a 100644
--- a/src/ffts.c
+++ b/src/ffts.c
@@ -34,6 +34,7 @@
#include "macros.h"
//#include "mini_macros.h"
#include "patterns.h"
+#include "ffts_small.h"
#ifdef DYNAMIC_DISABLED
#include "ffts_static.h"
@@ -89,12 +90,12 @@ ffts_plan_t *ffts_init_1d(size_t N, int sign) {
size_t i;
#ifdef __arm__
-#ifdef HAVE_NEON
+//#ifdef HAVE_NEON
V MULI_SIGN;
if(sign < 0) MULI_SIGN = VLIT4(-0.0f, 0.0f, -0.0f, 0.0f);
else MULI_SIGN = VLIT4(0.0f, -0.0f, 0.0f, -0.0f);
-#endif
+//#endif
#else
V MULI_SIGN;
@@ -230,22 +231,27 @@ ffts_plan_t *ffts_init_1d(size_t N, int sign) {
float *fw = (float *)w;
V temp0, temp1, temp2;
for(j=0;j<n/4;j+=2) {
- #ifdef HAVE_NEON
+ // #ifdef HAVE_NEON
temp0 = VLD(fw0 + j*2);
V re, im;
re = VDUPRE(temp0);
im = VDUPIM(temp0);
- im = VXOR(im, MULI_SIGN);
+ #ifdef HAVE_NEON
+ im = VXOR(im, MULI_SIGN);
+ //im = IMULI(sign>0, im);
+ #else
+ im = MULI(sign>0, im);
+ #endif
VST(fw + j*4 , re);
VST(fw + j*4+4, im);
- #endif
+ // #endif
}
w += n/4 * 2;
}else{
//w = FFTS_MALLOC(n/4 * sizeof(cdata_t), 32);
float *fw = (float *)w;
- VS temp0, temp1, temp2;
#ifdef HAVE_NEON
+ VS temp0, temp1, temp2;
for(j=0;j<n/4;j+=4) {
temp0 = VLD2(fw0 + j*2);
temp0.val[1] = VXOR(temp0.val[1], neg);
@@ -299,8 +305,8 @@ ffts_plan_t *ffts_init_1d(size_t N, int sign) {
#ifdef __arm__
//w = FFTS_MALLOC(n/8 * 3 * sizeof(cdata_t), 32);
float *fw = (float *)w;
- VS temp0, temp1, temp2;
#ifdef HAVE_NEON
+ VS temp0, temp1, temp2;
for(j=0;j<n/8;j+=4) {
temp0 = VLD2(fw0 + j*2);
temp0.val[1] = VXOR(temp0.val[1], neg);
diff --git a/src/ffts_nd.c b/src/ffts_nd.c
index a23ad7f..e65fe7f 100644
--- a/src/ffts_nd.c
+++ b/src/ffts_nd.c
@@ -33,7 +33,7 @@
#include "ffts_nd.h"
-#ifdef __ARM_NEON__
+#ifdef HAVE_NEON
#include "neon.h"
#endif
@@ -62,7 +62,7 @@ void ffts_free_nd(ffts_plan_t *p) {
#include <string.h>
void ffts_transpose(uint64_t *in, uint64_t *out, int w, int h, uint64_t *buf) {
-#ifdef __ARM_NEON__
+#ifdef HAVE_NEON
size_t i,j,k;
int linebytes = w*8;
@@ -131,6 +131,7 @@ void ffts_transpose(uint64_t *in, uint64_t *out, int w, int h, uint64_t *buf) {
}
}
#else
+#ifdef HAVE_SSE
uint64_t tmp[TSIZE*TSIZE] __attribute__((aligned(64)));
int tx, ty;
int x, y;
@@ -212,6 +213,7 @@ void ffts_transpose(uint64_t *in, uint64_t *out, int w, int h, uint64_t *buf) {
}
*/
#endif
+#endif
}
diff --git a/src/ffts_nd.h b/src/ffts_nd.h
index 7761f52..8f0c855 100644
--- a/src/ffts_nd.h
+++ b/src/ffts_nd.h
@@ -40,9 +40,10 @@
#include "ffts.h"
-#ifdef __ARM_NEON__
+#ifdef HAVE_NEON
#include <arm_neon.h>
-#else
+#endif
+#ifdef HAVE_SSE
#include <xmmintrin.h>
#endif
diff --git a/src/ffts_real.h b/src/ffts_real.h
index e904b95..bf8834d 100644
--- a/src/ffts_real.h
+++ b/src/ffts_real.h
@@ -40,9 +40,10 @@
#include "ffts.h"
-#ifdef __ARM_NEON__
+#ifdef HAVE_NEON
#include <arm_neon.h>
-#else
+#endif
+#ifdef HAVE_SSE
#include <xmmintrin.h>
#endif
diff --git a/src/ffts_real_nd.h b/src/ffts_real_nd.h
index 04def32..d777d42 100644
--- a/src/ffts_real_nd.h
+++ b/src/ffts_real_nd.h
@@ -42,12 +42,12 @@
#include "ffts_real.h"
#include "ffts.h"
-#ifdef __ARM_NEON__
+#ifdef HAVE_NEON
#include <arm_neon.h>
-#else
+#endif
+#ifdef HAVE_SSE
#include <xmmintrin.h>
#endif
-
#endif
diff --git a/src/macros.h b/src/macros.h
index effc87d..10bd1e3 100644
--- a/src/macros.h
+++ b/src/macros.h
@@ -2,8 +2,8 @@
This file is part of FFTS -- The Fastest Fourier Transform in the South
- Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
- Copyright (c) 2012, The University of Waikato
+ Copyright (c) 2013, Michael J. Cree <mcree@orcon.net.nz>
+ Copyright (c) 2012, 2013, Anthony M. Blake <amb@anthonix.com>
All rights reserved.
@@ -31,301 +31,131 @@
*/
-
#ifndef __MACROS_H__
#define __MACROS_H__
-#include "../config.h"
-#include "types.h"
-
-#ifdef __ARM_NEON__
- //#include "neon_float.h"
- #include "neon.h"
-#include <arm_neon.h>
-
-typedef float32x4_t V;
-
-typedef float32x4x2_t VS;
-
-//#include <complex.h>
-//#include <stdalign.h>
-
-
-#define ADD vaddq_f32
-#define SUB vsubq_f32
-#define MUL vmulq_f32
-#define VADD vaddq_f32
-#define VSUB vsubq_f32
-#define VMUL vmulq_f32
-#define VXOR(x,y) (vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(x), vreinterpretq_u32_f32(y))))
-#define VST vst1q_f32
-#define VLD vld1q_f32
-#define VST2 vst2q_f32
-#define VLD2 vld2q_f32
-
-#define VSWAPPAIRS(x) (vrev64q_f32(x))
-
-#define VUNPACKHI(a,b) (vcombine_f32(vget_high_f32(a), vget_high_f32(b)))
-#define VUNPACKLO(a,b) (vcombine_f32(vget_low_f32(a), vget_low_f32(b)))
-
-#define VBLEND(x,y) (vcombine_f32(vget_low_f32(x), vget_high_f32(y)))
-
-__INLINE V VLIT4(data_t f3, data_t f2, data_t f1, data_t f0) {
- data_t __attribute__ ((aligned(16))) d[4] = {f0, f1, f2, f3};
- return VLD(d);
-}
-
-#define VDUPRE(r) vcombine_f32(vdup_lane_f32(vget_low_f32(r),0), vdup_lane_f32(vget_high_f32(r),0))
-#define VDUPIM(r) vcombine_f32(vdup_lane_f32(vget_low_f32(r),1), vdup_lane_f32(vget_high_f32(r),1))
-
-#define FFTS_MALLOC(d,a) (valloc(d))
-#define FFTS_FREE(d) (free(d))
+#ifdef HAVE_NEON
+#include "macros-neon.h"
+#else
+#ifdef __alpha__
+#include "macros-alpha.h"
+#else
+#ifdef __powerpc__
+#include "macros-altivec.h"
+#endif
+#endif
-__INLINE void STORESPR(data_t * addr, VS p) {
+#endif
- vst1q_f32(addr, p.val[0]);
- vst1q_f32(addr + 4, p.val[1]);
-
-//__asm__ __volatile__ ("vst1.32 {%q1,%q2}, [%0, :128]\n\t"
-// :
-// : "r" (addr), "w" (p.val[0]), "w" (p.val[1])
-// : "memory");
-}
-#else
+#ifdef HAVE_VFP
+#include "macros-alpha.h"
+#endif
+#ifdef HAVE_SSE
#include "sse_float.h"
#endif
-#include "ffts.h"
-
-
-//cdata_t SCALAR_MULI_SIGN;
-//V MULI_SIGN;
-//V LEAFLUT[12];
-
-__INLINE V IMULI(int inv, V a) {
- if(inv) return VSWAPPAIRS(VXOR(a, VLIT4(0.0f, -0.0f, 0.0f, -0.0f)));
- else return VSWAPPAIRS(VXOR(a, VLIT4(-0.0f, 0.0f, -0.0f, 0.0f)));
-}
-
-__INLINE void
-S_4(V r0, V r1, V r2, V r3, data_t * restrict o0, data_t * restrict o1, data_t * restrict o2, data_t * restrict o3) {
- V t0, t1, t2, t3;
- VST(o0, r0); VST(o1, r1); VST(o2, r2); VST(o3, r3);
-}
-
-__INLINE V IMUL(V d, V re, V im) {
- re = VMUL(re, d);
- im = VMUL(im, VSWAPPAIRS(d));
- return VSUB(re, im);
-}
-
-__INLINE V IMULJ(V d, V re, V im) {
- re = VMUL(re, d);
- im = VMUL(im, VSWAPPAIRS(d));
- return VADD(re, im);
-}
-
-__INLINE void
-K_N(int inv, V re, V im, V *r0, V *r1, V *r2, V *r3) {
- V uk, uk2, zk_p, zk_n, zk, zk_d;
- uk = *r0; uk2 = *r1;
- zk_p = IMUL(*r2, re, im);
- zk_n = IMULJ(*r3, re, im);
-
- zk = VADD(zk_p, zk_n);
- zk_d = IMULI(inv, VSUB(zk_p, zk_n));
-
- *r2 = VSUB(uk, zk);
- *r0 = VADD(uk, zk);
- *r3 = VADD(uk2, zk_d);
- *r1 = VSUB(uk2, zk_d);
-}
-
-__INLINE void TX2(V *a, V *b) {
+static inline void TX2(V *a, V *b)
+{
V TX2_t0 = VUNPACKLO(*a, *b);
- V TX2_t1 = VUNPACKHI(*a, *b);
+ V TX2_t1 = VUNPACKHI(*a, *b);
*a = TX2_t0; *b = TX2_t1;
}
-__INLINE void
-L_4_4(int inv, const data_t * restrict i0, const data_t * restrict i1, const data_t * restrict i2, const data_t * restrict i3,
- V *r0, V *r1, V *r2, V *r3) {
- V t0, t1, t2, t3, t4, t5, t6, t7;
- t0 = VLD(i0); t1 = VLD(i1); t2 = VLD(i2); t3 = VLD(i3);
- t4 = VADD(t0, t1);
- t5 = VSUB(t0, t1);
- t6 = VADD(t2, t3);
- t7 = IMULI(inv, VSUB(t2, t3));
- t0 = VADD(t4, t6);
- t2 = VSUB(t4, t6);
- t1 = VSUB(t5, t7);
- t3 = VADD(t5, t7);
- TX2(&t0,&t1);
- TX2(&t2,&t3);
- *r0 = t0; *r2 = t1; *r1 = t2; *r3 = t3; }
-
-__INLINE void
-L_2_2(const data_t * restrict i0, const data_t * restrict i1, const data_t * restrict i2, const data_t * restrict i3,
- V *r0, V *r1, V *r2, V *r3) {
- V t0, t1, t2, t3, t4, t5, t6, t7;
- t0 = VLD(i0); t1 = VLD(i1); t2 = VLD(i2); t3 = VLD(i3); t4 = VADD(t0, t1);
- t5 = VSUB(t0, t1);
- t6 = VADD(t2, t3);
- t7 = VSUB(t2, t3);
- TX2(&t4,&t5);
- TX2(&t6,&t7);
- *r0 = t4; *r2 = t5; *r1 = t6; *r3 = t7;
-}
-
-__INLINE void
-L_2_4(int inv, const data_t * restrict i0, const data_t * restrict i1, const data_t * restrict i2, const data_t * restrict i3,
- V *r0, V *r1, V *r2, V *r3) {
- V t0, t1, t2, t3, t4, t5, t6, t7;
- t0 = VLD(i0); t1 = VLD(i1); t2 = VLD(i2); t3 = VLD(i3);
- t4 = VADD(t0, t1);
- t5 = VSUB(t0, t1);
- t6 = VADD(t2, t3);
- t7 = VSUB(t2, t3);
- *r0 = VUNPACKLO(t4, t5);
- *r1 = VUNPACKLO(t6, t7);
- t5 = IMULI(inv, t5);
- t0 = VADD(t6, t4);
- t2 = VSUB(t6, t4);
- t1 = VSUB(t7, t5);
- t3 = VADD(t7, t5);
- *r3 = VUNPACKHI(t0, t1);
- *r2 = VUNPACKHI(t2, t3);
-}
-
-__INLINE void
-L_4_2(int inv, const data_t * restrict i0, const data_t * restrict i1, const data_t * restrict i2, const data_t * restrict i3,
- V *r0, V *r1, V *r2, V *r3) {
- V t0, t1, t2, t3, t4, t5, t6, t7;
- t0 = VLD(i0); t1 = VLD(i1); t6 = VLD(i2); t7 = VLD(i3);
- t2 = VBLEND(t6, t7);
- t3 = VBLEND(t7, t6);
- t4 = VADD(t0, t1);
- t5 = VSUB(t0, t1);
- t6 = VADD(t2, t3);
- t7 = VSUB(t2, t3);
- *r2 = VUNPACKHI(t4, t5);
- *r3 = VUNPACKHI(t6, t7);
- t7 = IMULI(inv, t7);
- t0 = VADD(t4, t6);
- t2 = VSUB(t4, t6);
- t1 = VSUB(t5, t7);
- t3 = VADD(t5, t7);
- *r0 = VUNPACKLO(t0, t1);
- *r1 = VUNPACKLO(t2, t3);
-}
-
-__INLINE void
-firstpass_16_f(ffts_plan_t * p, const void * in, void * out) {
- const data_t *din = (const data_t *)in;
- data_t *dout = (data_t *)out;
- V r0_1,r2_3,r4_5,r6_7,r8_9,r10_11,r12_13,r14_15;
- float *LUT8 = p->ws ;
-
- L_4_4(0, din+0,din+16,din+8,din+24,&r0_1,&r2_3,&r8_9,&r10_11);
- L_2_4(0, din+4,din+20,din+28,din+12,&r4_5,&r6_7,&r14_15,&r12_13);
- K_N(0, VLD(LUT8),VLD(LUT8+4),&r0_1,&r2_3,&r4_5,&r6_7);
- K_N(0, VLD(LUT8+8),VLD(LUT8+12),&r0_1,&r4_5,&r8_9,&r12_13);
- S_4(r0_1,r4_5,r8_9,r12_13,dout+0,dout+8,dout+16,dout+24);
- K_N(0, VLD(LUT8+16),VLD(LUT8+20),&r2_3,&r6_7,&r10_11,&r14_15);
- S_4(r2_3,r6_7,r10_11,r14_15,dout+4,dout+12,dout+20,dout+28);
-}
+static inline void K_N(int inv, V re, V im, V *r0, V *r1, V *r2, V *r3)
+{
+ V uk, uk2, zk_p, zk_n, zk, zk_d;
+ uk = *r0; uk2 = *r1;
+ zk_p = IMUL(*r2, re, im);
+ zk_n = IMULJ(*r3, re, im);
-__INLINE void
-firstpass_16_b(ffts_plan_t * p, const void * in, void * out) {
- const data_t *din = (const data_t *)in;
- data_t *dout = (data_t *)out;
- V r0_1,r2_3,r4_5,r6_7,r8_9,r10_11,r12_13,r14_15;
- float *LUT8 = p->ws ;
-
- L_4_4(1, din+0,din+16,din+8,din+24,&r0_1,&r2_3,&r8_9,&r10_11);
- L_2_4(1, din+4,din+20,din+28,din+12,&r4_5,&r6_7,&r14_15,&r12_13);
- K_N(1, VLD(LUT8),VLD(LUT8+4),&r0_1,&r2_3,&r4_5,&r6_7);
- K_N(1, VLD(LUT8+8),VLD(LUT8+12),&r0_1,&r4_5,&r8_9,&r12_13);
- S_4(r0_1,r4_5,r8_9,r12_13,dout+0,dout+8,dout+16,dout+24);
- K_N(1, VLD(LUT8+16),VLD(LUT8+20),&r2_3,&r6_7,&r10_11,&r14_15);
- S_4(r2_3,r6_7,r10_11,r14_15,dout+4,dout+12,dout+20,dout+28);
-}
-
-__INLINE void
-firstpass_8_f(ffts_plan_t * p, const void * in, void * out) {
- const data_t *din = (const data_t *)in;
- data_t *dout = (data_t *)out;
- V r0_1,r2_3,r4_5,r6_7;
- float *LUT8 = p->ws + p->ws_is[0];
- L_4_2(0, din+0,din+8,din+4,din+12,&r0_1,&r2_3,&r4_5,&r6_7);
- K_N(0, VLD(LUT8),VLD(LUT8+4),&r0_1,&r2_3,&r4_5,&r6_7);
- S_4(r0_1,r2_3,r4_5,r6_7,dout+0,dout+4,dout+8,dout+12);
-}
-
-__INLINE void
-firstpass_8_b(ffts_plan_t * p, const void * in, void * out) {
- const data_t *din = (const data_t *)in;
- data_t *dout = (data_t *)out;
- V r0_1,r2_3,r4_5,r6_7;
- float *LUT8 = p->ws + p->ws_is[0];
- L_4_2(1, din+0,din+8,din+4,din+12,&r0_1,&r2_3,&r4_5,&r6_7);
- K_N(1, VLD(LUT8),VLD(LUT8+4),&r0_1,&r2_3,&r4_5,&r6_7);
- S_4(r0_1,r2_3,r4_5,r6_7,dout+0,dout+4,dout+8,dout+12);
-}
-
-__INLINE void
-firstpass_4_f(ffts_plan_t * p, const void * in, void * out) {
- const data_t *din = (const data_t *)in;
- data_t *dout = (data_t *)out;
- cdata_t t0, t1, t2, t3, t4, t5, t6, t7;
- t0[0] = din[0]; t0[1] = din[1];
- t1[0] = din[4]; t1[1] = din[5];
- t2[0] = din[2]; t2[1] = din[3];
- t3[0] = din[6]; t3[1] = din[7];
-
- t4[0] = t0[0] + t1[0]; t4[1] = t0[1] + t1[1];
- t5[0] = t0[0] - t1[0]; t5[1] = t0[1] - t1[1];
- t6[0] = t2[0] + t3[0]; t6[1] = t2[1] + t3[1];
- t7[0] = t2[0] - t3[0]; t7[1] = t2[1] - t3[1];
-
- dout[0] = t4[0] + t6[0]; dout[1] = t4[1] + t6[1];
- dout[4] = t4[0] - t6[0]; dout[5] = t4[1] - t6[1];
- dout[2] = t5[0] + t7[1]; dout[3] = t5[1] - t7[0];
- dout[6] = t5[0] - t7[1]; dout[7] = t5[1] + t7[0];
-}
-__INLINE void
-firstpass_4_b(ffts_plan_t * p, const void * in, void * out) {
- const data_t *din = (const data_t *)in;
- data_t *dout = (data_t *)out;
- cdata_t t0, t1, t2, t3, t4, t5, t6, t7;
- t0[0] = din[0]; t0[1] = din[1];
- t1[0] = din[4]; t1[1] = din[5];
- t2[0] = din[2]; t2[1] = din[3];
- t3[0] = din[6]; t3[1] = din[7];
-
- t4[0] = t0[0] + t1[0]; t4[1] = t0[1] + t1[1];
- t5[0] = t0[0] - t1[0]; t5[1] = t0[1] - t1[1];
- t6[0] = t2[0] + t3[0]; t6[1] = t2[1] + t3[1];
- t7[0] = t2[0] - t3[0]; t7[1] = t2[1] - t3[1];
-
- dout[0] = t4[0] + t6[0]; dout[1] = t4[1] + t6[1];
- dout[4] = t4[0] - t6[0]; dout[5] = t4[1] - t6[1];
- dout[2] = t5[0] - t7[1]; dout[3] = t5[1] + t7[0];
- dout[6] = t5[0] + t7[1]; dout[7] = t5[1] - t7[0];
-}
-__INLINE void
-firstpass_2(ffts_plan_t * p, const void * in, void * out) {
- const data_t *din = (const data_t *)in;
- data_t *dout = (data_t *)out;
- cdata_t t0, t1, r0,r1;
- t0[0] = din[0]; t0[1] = din[1];
- t1[0] = din[2]; t1[1] = din[3];
- r0[0] = t0[0] + t1[0];
- r0[1] = t0[1] + t1[1];
- r1[0] = t0[0] - t1[0];
- r1[1] = t0[1] - t1[1];
- dout[0] = r0[0]; dout[1] = r0[1];
- dout[2] = r1[0]; dout[3] = r1[1];
+ zk = VADD(zk_p, zk_n);
+ zk_d = IMULI(inv, VSUB(zk_p, zk_n));
+
+ *r2 = VSUB(uk, zk);
+ *r0 = VADD(uk, zk);
+ *r3 = VADD(uk2, zk_d);
+ *r1 = VSUB(uk2, zk_d);
+}
+
+
+static inline void S_4(V r0, V r1, V r2, V r3,
+ data_t * restrict o0, data_t * restrict o1,
+ data_t * restrict o2, data_t * restrict o3)
+{
+ VST(o0, r0); VST(o1, r1); VST(o2, r2); VST(o3, r3);
+}
+
+
+static inline void L_2_4(int inv,
+ const data_t * restrict i0, const data_t * restrict i1,
+ const data_t * restrict i2, const data_t * restrict i3,
+ V *r0, V *r1, V *r2, V *r3)
+{
+ V t0, t1, t2, t3, t4, t5, t6, t7;
+
+ t0 = VLD(i0); t1 = VLD(i1); t2 = VLD(i2); t3 = VLD(i3);
+ t4 = VADD(t0, t1);
+ t5 = VSUB(t0, t1);
+ t6 = VADD(t2, t3);
+ t7 = VSUB(t2, t3);
+ *r0 = VUNPACKLO(t4, t5);
+ *r1 = VUNPACKLO(t6, t7);
+ t5 = IMULI(inv, t5);
+ t0 = VADD(t6, t4);
+ t2 = VSUB(t6, t4);
+ t1 = VSUB(t7, t5);
+ t3 = VADD(t7, t5);
+ *r3 = VUNPACKHI(t0, t1);
+ *r2 = VUNPACKHI(t2, t3);
+}
+
+
+static inline void L_4_4(int inv,
+ const data_t * restrict i0, const data_t * restrict i1,
+ const data_t * restrict i2, const data_t * restrict i3,
+ V *r0, V *r1, V *r2, V *r3)
+{
+ V t0, t1, t2, t3, t4, t5, t6, t7;
+
+ t0 = VLD(i0); t1 = VLD(i1); t2 = VLD(i2); t3 = VLD(i3);
+ t4 = VADD(t0, t1);
+ t5 = VSUB(t0, t1);
+ t6 = VADD(t2, t3);
+ t7 = IMULI(inv, VSUB(t2, t3));
+ t0 = VADD(t4, t6);
+ t2 = VSUB(t4, t6);
+ t1 = VSUB(t5, t7);
+ t3 = VADD(t5, t7);
+ TX2(&t0, &t1);
+ TX2(&t2, &t3);
+ *r0 = t0; *r2 = t1; *r1 = t2; *r3 = t3;
+}
+
+
+
+static inline void L_4_2(int inv,
+ const data_t * restrict i0, const data_t * restrict i1,
+ const data_t * restrict i2, const data_t * restrict i3,
+ V *r0, V *r1, V *r2, V *r3)
+{
+ V t0, t1, t2, t3, t4, t5, t6, t7;
+
+ t0 = VLD(i0); t1 = VLD(i1); t6 = VLD(i2); t7 = VLD(i3);
+ t2 = VBLEND(t6, t7);
+ t3 = VBLEND(t7, t6);
+ t4 = VADD(t0, t1);
+ t5 = VSUB(t0, t1);
+ t6 = VADD(t2, t3);
+ t7 = VSUB(t2, t3);
+ *r2 = VUNPACKHI(t4, t5);
+ *r3 = VUNPACKHI(t6, t7);
+ t7 = IMULI(inv, t7);
+ t0 = VADD(t4, t6);
+ t2 = VSUB(t4, t6);
+ t1 = VSUB(t5, t7);
+ t3 = VADD(t5, t7);
+ *r0 = VUNPACKLO(t0, t1);
+ *r1 = VUNPACKLO(t2, t3);
}
#endif
diff --git a/src/sse_float.h b/src/sse_float.h
index 5b4eb92..229477c 100644
--- a/src/sse_float.h
+++ b/src/sse_float.h
@@ -63,4 +63,22 @@ typedef __m128 V;
#define FFTS_MALLOC(d,a) (_mm_malloc(d,a))
#define FFTS_FREE(d) (_mm_free(d))
+__INLINE V IMULI(int inv, V a) {
+ if(inv) return VSWAPPAIRS(VXOR(a, VLIT4(0.0f, -0.0f, 0.0f, -0.0f)));
+ else return VSWAPPAIRS(VXOR(a, VLIT4(-0.0f, 0.0f, -0.0f, 0.0f)));
+}
+
+
+__INLINE V IMUL(V d, V re, V im) {
+ re = VMUL(re, d);
+ im = VMUL(im, VSWAPPAIRS(d));
+ return VSUB(re, im);
+}
+
+__INLINE V IMULJ(V d, V re, V im) {
+ re = VMUL(re, d);
+ im = VMUL(im, VSWAPPAIRS(d));
+ return VADD(re, im);
+}
+
#endif
OpenPOWER on IntegriCloud