diff options
author | Anthony Blake <anthonix@me.com> | 2013-04-22 16:45:17 +1200 |
---|---|---|
committer | Anthony Blake <anthonix@me.com> | 2013-04-22 16:45:17 +1200 |
commit | 7b3907cff81fb82380787e63e4304fb8af807c0c (patch) | |
tree | 11ae035cd02aab22e1a4069737fb6f422a7531eb /src | |
parent | a45464980b8de7faef21eb46479f7e09fd056441 (diff) | |
download | ffts-7b3907cff81fb82380787e63e4304fb8af807c0c.zip ffts-7b3907cff81fb82380787e63e4304fb8af807c0c.tar.gz |
Fixed up the smaller VFP transforms. Inverse VFP and real/nd VFP still not working yet.
Diffstat (limited to 'src')
-rw-r--r-- | src/Makefile.am | 2 | ||||
-rw-r--r-- | src/Makefile.in | 16 | ||||
-rw-r--r-- | src/ffts.c | 20 | ||||
-rw-r--r-- | src/ffts_nd.c | 6 | ||||
-rw-r--r-- | src/ffts_nd.h | 5 | ||||
-rw-r--r-- | src/ffts_real.h | 5 | ||||
-rw-r--r-- | src/ffts_real_nd.h | 6 | ||||
-rw-r--r-- | src/macros.h | 398 | ||||
-rw-r--r-- | src/sse_float.h | 18 |
9 files changed, 168 insertions, 308 deletions
diff --git a/src/Makefile.am b/src/Makefile.am index d439a2c..a07becc 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -2,7 +2,7 @@ lib_LTLIBRARIES = libffts.la -libffts_la_SOURCES = ffts.c ffts_nd.c ffts_real.c ffts_real_nd.c patterns.c +libffts_la_SOURCES = ffts.c ffts_small.c ffts_nd.c ffts_real.c ffts_real_nd.c patterns.c if DYNAMIC_DISABLED libffts_la_SOURCES += ffts_static.c diff --git a/src/Makefile.in b/src/Makefile.in index 49590ff..97bb79f 100644 --- a/src/Makefile.in +++ b/src/Makefile.in @@ -99,9 +99,9 @@ am__installdirs = "$(DESTDIR)$(libdir)" \ "$(DESTDIR)$(libffts_includedir)" LTLIBRARIES = $(lib_LTLIBRARIES) libffts_la_LIBADD = -am__libffts_la_SOURCES_DIST = ffts.c ffts_nd.c ffts_real.c \ - ffts_real_nd.c patterns.c ffts_static.c codegen.c vfp.s \ - neon_static_f.s neon_static_i.s neon.s sse.s +am__libffts_la_SOURCES_DIST = ffts.c ffts_small.c ffts_nd.c \ + ffts_real.c ffts_real_nd.c patterns.c ffts_static.c codegen.c \ + vfp.s neon_static_f.s neon_static_i.s neon.s sse.s @DYNAMIC_DISABLED_TRUE@am__objects_1 = ffts_static.lo @DYNAMIC_DISABLED_FALSE@am__objects_2 = codegen.lo @HAVE_VFP_TRUE@am__objects_3 = vfp.lo @@ -110,7 +110,7 @@ am__libffts_la_SOURCES_DIST = ffts.c ffts_nd.c ffts_real.c \ @DYNAMIC_DISABLED_FALSE@@HAVE_NEON_TRUE@@HAVE_VFP_FALSE@am__objects_5 = neon.lo @HAVE_NEON_FALSE@@HAVE_SSE_TRUE@@HAVE_VFP_FALSE@am__objects_6 = \ @HAVE_NEON_FALSE@@HAVE_SSE_TRUE@@HAVE_VFP_FALSE@ sse.lo -am_libffts_la_OBJECTS = ffts.lo ffts_nd.lo ffts_real.lo \ +am_libffts_la_OBJECTS = ffts.lo ffts_small.lo ffts_nd.lo ffts_real.lo \ ffts_real_nd.lo patterns.lo $(am__objects_1) $(am__objects_2) \ $(am__objects_3) $(am__objects_4) $(am__objects_5) \ $(am__objects_6) @@ -264,9 +264,10 @@ top_build_prefix = @top_build_prefix@ top_builddir = @top_builddir@ top_srcdir = @top_srcdir@ lib_LTLIBRARIES = libffts.la -libffts_la_SOURCES = ffts.c ffts_nd.c ffts_real.c ffts_real_nd.c \ - patterns.c $(am__append_1) $(am__append_2) $(am__append_3) \ - $(am__append_4) $(am__append_5) $(am__append_6) +libffts_la_SOURCES = ffts.c ffts_small.c ffts_nd.c ffts_real.c \ + ffts_real_nd.c patterns.c $(am__append_1) $(am__append_2) \ + $(am__append_3) $(am__append_4) $(am__append_5) \ + $(am__append_6) libffts_includedir = $(includedir)/ffts libffts_include_HEADERS = ../include/ffts.h all: all-am @@ -351,6 +352,7 @@ distclean-compile: @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ffts_nd.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ffts_real.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ffts_real_nd.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ffts_small.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ffts_static.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/patterns.Plo@am__quote@ @@ -34,6 +34,7 @@ #include "macros.h" //#include "mini_macros.h" #include "patterns.h" +#include "ffts_small.h" #ifdef DYNAMIC_DISABLED #include "ffts_static.h" @@ -89,12 +90,12 @@ ffts_plan_t *ffts_init_1d(size_t N, int sign) { size_t i; #ifdef __arm__ -#ifdef HAVE_NEON +//#ifdef HAVE_NEON V MULI_SIGN; if(sign < 0) MULI_SIGN = VLIT4(-0.0f, 0.0f, -0.0f, 0.0f); else MULI_SIGN = VLIT4(0.0f, -0.0f, 0.0f, -0.0f); -#endif +//#endif #else V MULI_SIGN; @@ -230,22 +231,27 @@ ffts_plan_t *ffts_init_1d(size_t N, int sign) { float *fw = (float *)w; V temp0, temp1, temp2; for(j=0;j<n/4;j+=2) { - #ifdef HAVE_NEON + // #ifdef HAVE_NEON temp0 = VLD(fw0 + j*2); V re, im; re = VDUPRE(temp0); im = VDUPIM(temp0); - im = VXOR(im, MULI_SIGN); + #ifdef HAVE_NEON + im = VXOR(im, MULI_SIGN); + //im = IMULI(sign>0, im); + #else + im = MULI(sign>0, im); + #endif VST(fw + j*4 , re); VST(fw + j*4+4, im); - #endif + // #endif } w += n/4 * 2; }else{ //w = FFTS_MALLOC(n/4 * sizeof(cdata_t), 32); float *fw = (float *)w; - VS temp0, temp1, temp2; #ifdef HAVE_NEON + VS temp0, temp1, temp2; for(j=0;j<n/4;j+=4) { temp0 = VLD2(fw0 + j*2); temp0.val[1] = VXOR(temp0.val[1], neg); @@ -299,8 +305,8 @@ ffts_plan_t *ffts_init_1d(size_t N, int sign) { #ifdef __arm__ //w = FFTS_MALLOC(n/8 * 3 * sizeof(cdata_t), 32); float *fw = (float *)w; - VS temp0, temp1, temp2; #ifdef HAVE_NEON + VS temp0, temp1, temp2; for(j=0;j<n/8;j+=4) { temp0 = VLD2(fw0 + j*2); temp0.val[1] = VXOR(temp0.val[1], neg); diff --git a/src/ffts_nd.c b/src/ffts_nd.c index a23ad7f..e65fe7f 100644 --- a/src/ffts_nd.c +++ b/src/ffts_nd.c @@ -33,7 +33,7 @@ #include "ffts_nd.h" -#ifdef __ARM_NEON__ +#ifdef HAVE_NEON #include "neon.h" #endif @@ -62,7 +62,7 @@ void ffts_free_nd(ffts_plan_t *p) { #include <string.h> void ffts_transpose(uint64_t *in, uint64_t *out, int w, int h, uint64_t *buf) { -#ifdef __ARM_NEON__ +#ifdef HAVE_NEON size_t i,j,k; int linebytes = w*8; @@ -131,6 +131,7 @@ void ffts_transpose(uint64_t *in, uint64_t *out, int w, int h, uint64_t *buf) { } } #else +#ifdef HAVE_SSE uint64_t tmp[TSIZE*TSIZE] __attribute__((aligned(64))); int tx, ty; int x, y; @@ -212,6 +213,7 @@ void ffts_transpose(uint64_t *in, uint64_t *out, int w, int h, uint64_t *buf) { } */ #endif +#endif } diff --git a/src/ffts_nd.h b/src/ffts_nd.h index 7761f52..8f0c855 100644 --- a/src/ffts_nd.h +++ b/src/ffts_nd.h @@ -40,9 +40,10 @@ #include "ffts.h" -#ifdef __ARM_NEON__ +#ifdef HAVE_NEON #include <arm_neon.h> -#else +#endif +#ifdef HAVE_SSE #include <xmmintrin.h> #endif diff --git a/src/ffts_real.h b/src/ffts_real.h index e904b95..bf8834d 100644 --- a/src/ffts_real.h +++ b/src/ffts_real.h @@ -40,9 +40,10 @@ #include "ffts.h" -#ifdef __ARM_NEON__ +#ifdef HAVE_NEON #include <arm_neon.h> -#else +#endif +#ifdef HAVE_SSE #include <xmmintrin.h> #endif diff --git a/src/ffts_real_nd.h b/src/ffts_real_nd.h index 04def32..d777d42 100644 --- a/src/ffts_real_nd.h +++ b/src/ffts_real_nd.h @@ -42,12 +42,12 @@ #include "ffts_real.h" #include "ffts.h" -#ifdef __ARM_NEON__ +#ifdef HAVE_NEON #include <arm_neon.h> -#else +#endif +#ifdef HAVE_SSE #include <xmmintrin.h> #endif - #endif diff --git a/src/macros.h b/src/macros.h index effc87d..10bd1e3 100644 --- a/src/macros.h +++ b/src/macros.h @@ -2,8 +2,8 @@ This file is part of FFTS -- The Fastest Fourier Transform in the South - Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com> - Copyright (c) 2012, The University of Waikato + Copyright (c) 2013, Michael J. Cree <mcree@orcon.net.nz> + Copyright (c) 2012, 2013, Anthony M. Blake <amb@anthonix.com> All rights reserved. @@ -31,301 +31,131 @@ */ - #ifndef __MACROS_H__ #define __MACROS_H__ -#include "../config.h" -#include "types.h" - -#ifdef __ARM_NEON__ - //#include "neon_float.h" - #include "neon.h" -#include <arm_neon.h> - -typedef float32x4_t V; - -typedef float32x4x2_t VS; - -//#include <complex.h> -//#include <stdalign.h> - - -#define ADD vaddq_f32 -#define SUB vsubq_f32 -#define MUL vmulq_f32 -#define VADD vaddq_f32 -#define VSUB vsubq_f32 -#define VMUL vmulq_f32 -#define VXOR(x,y) (vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(x), vreinterpretq_u32_f32(y)))) -#define VST vst1q_f32 -#define VLD vld1q_f32 -#define VST2 vst2q_f32 -#define VLD2 vld2q_f32 - -#define VSWAPPAIRS(x) (vrev64q_f32(x)) - -#define VUNPACKHI(a,b) (vcombine_f32(vget_high_f32(a), vget_high_f32(b))) -#define VUNPACKLO(a,b) (vcombine_f32(vget_low_f32(a), vget_low_f32(b))) - -#define VBLEND(x,y) (vcombine_f32(vget_low_f32(x), vget_high_f32(y))) - -__INLINE V VLIT4(data_t f3, data_t f2, data_t f1, data_t f0) { - data_t __attribute__ ((aligned(16))) d[4] = {f0, f1, f2, f3}; - return VLD(d); -} - -#define VDUPRE(r) vcombine_f32(vdup_lane_f32(vget_low_f32(r),0), vdup_lane_f32(vget_high_f32(r),0)) -#define VDUPIM(r) vcombine_f32(vdup_lane_f32(vget_low_f32(r),1), vdup_lane_f32(vget_high_f32(r),1)) - -#define FFTS_MALLOC(d,a) (valloc(d)) -#define FFTS_FREE(d) (free(d)) +#ifdef HAVE_NEON +#include "macros-neon.h" +#else +#ifdef __alpha__ +#include "macros-alpha.h" +#else +#ifdef __powerpc__ +#include "macros-altivec.h" +#endif +#endif -__INLINE void STORESPR(data_t * addr, VS p) { +#endif - vst1q_f32(addr, p.val[0]); - vst1q_f32(addr + 4, p.val[1]); - -//__asm__ __volatile__ ("vst1.32 {%q1,%q2}, [%0, :128]\n\t" -// : -// : "r" (addr), "w" (p.val[0]), "w" (p.val[1]) -// : "memory"); -} -#else +#ifdef HAVE_VFP +#include "macros-alpha.h" +#endif +#ifdef HAVE_SSE #include "sse_float.h" #endif -#include "ffts.h" - - -//cdata_t SCALAR_MULI_SIGN; -//V MULI_SIGN; -//V LEAFLUT[12]; - -__INLINE V IMULI(int inv, V a) { - if(inv) return VSWAPPAIRS(VXOR(a, VLIT4(0.0f, -0.0f, 0.0f, -0.0f))); - else return VSWAPPAIRS(VXOR(a, VLIT4(-0.0f, 0.0f, -0.0f, 0.0f))); -} - -__INLINE void -S_4(V r0, V r1, V r2, V r3, data_t * restrict o0, data_t * restrict o1, data_t * restrict o2, data_t * restrict o3) { - V t0, t1, t2, t3; - VST(o0, r0); VST(o1, r1); VST(o2, r2); VST(o3, r3); -} - -__INLINE V IMUL(V d, V re, V im) { - re = VMUL(re, d); - im = VMUL(im, VSWAPPAIRS(d)); - return VSUB(re, im); -} - -__INLINE V IMULJ(V d, V re, V im) { - re = VMUL(re, d); - im = VMUL(im, VSWAPPAIRS(d)); - return VADD(re, im); -} - -__INLINE void -K_N(int inv, V re, V im, V *r0, V *r1, V *r2, V *r3) { - V uk, uk2, zk_p, zk_n, zk, zk_d; - uk = *r0; uk2 = *r1; - zk_p = IMUL(*r2, re, im); - zk_n = IMULJ(*r3, re, im); - - zk = VADD(zk_p, zk_n); - zk_d = IMULI(inv, VSUB(zk_p, zk_n)); - - *r2 = VSUB(uk, zk); - *r0 = VADD(uk, zk); - *r3 = VADD(uk2, zk_d); - *r1 = VSUB(uk2, zk_d); -} - -__INLINE void TX2(V *a, V *b) { +static inline void TX2(V *a, V *b) +{ V TX2_t0 = VUNPACKLO(*a, *b); - V TX2_t1 = VUNPACKHI(*a, *b); + V TX2_t1 = VUNPACKHI(*a, *b); *a = TX2_t0; *b = TX2_t1; } -__INLINE void -L_4_4(int inv, const data_t * restrict i0, const data_t * restrict i1, const data_t * restrict i2, const data_t * restrict i3, - V *r0, V *r1, V *r2, V *r3) { - V t0, t1, t2, t3, t4, t5, t6, t7; - t0 = VLD(i0); t1 = VLD(i1); t2 = VLD(i2); t3 = VLD(i3); - t4 = VADD(t0, t1); - t5 = VSUB(t0, t1); - t6 = VADD(t2, t3); - t7 = IMULI(inv, VSUB(t2, t3)); - t0 = VADD(t4, t6); - t2 = VSUB(t4, t6); - t1 = VSUB(t5, t7); - t3 = VADD(t5, t7); - TX2(&t0,&t1); - TX2(&t2,&t3); - *r0 = t0; *r2 = t1; *r1 = t2; *r3 = t3; } - -__INLINE void -L_2_2(const data_t * restrict i0, const data_t * restrict i1, const data_t * restrict i2, const data_t * restrict i3, - V *r0, V *r1, V *r2, V *r3) { - V t0, t1, t2, t3, t4, t5, t6, t7; - t0 = VLD(i0); t1 = VLD(i1); t2 = VLD(i2); t3 = VLD(i3); t4 = VADD(t0, t1); - t5 = VSUB(t0, t1); - t6 = VADD(t2, t3); - t7 = VSUB(t2, t3); - TX2(&t4,&t5); - TX2(&t6,&t7); - *r0 = t4; *r2 = t5; *r1 = t6; *r3 = t7; -} - -__INLINE void -L_2_4(int inv, const data_t * restrict i0, const data_t * restrict i1, const data_t * restrict i2, const data_t * restrict i3, - V *r0, V *r1, V *r2, V *r3) { - V t0, t1, t2, t3, t4, t5, t6, t7; - t0 = VLD(i0); t1 = VLD(i1); t2 = VLD(i2); t3 = VLD(i3); - t4 = VADD(t0, t1); - t5 = VSUB(t0, t1); - t6 = VADD(t2, t3); - t7 = VSUB(t2, t3); - *r0 = VUNPACKLO(t4, t5); - *r1 = VUNPACKLO(t6, t7); - t5 = IMULI(inv, t5); - t0 = VADD(t6, t4); - t2 = VSUB(t6, t4); - t1 = VSUB(t7, t5); - t3 = VADD(t7, t5); - *r3 = VUNPACKHI(t0, t1); - *r2 = VUNPACKHI(t2, t3); -} - -__INLINE void -L_4_2(int inv, const data_t * restrict i0, const data_t * restrict i1, const data_t * restrict i2, const data_t * restrict i3, - V *r0, V *r1, V *r2, V *r3) { - V t0, t1, t2, t3, t4, t5, t6, t7; - t0 = VLD(i0); t1 = VLD(i1); t6 = VLD(i2); t7 = VLD(i3); - t2 = VBLEND(t6, t7); - t3 = VBLEND(t7, t6); - t4 = VADD(t0, t1); - t5 = VSUB(t0, t1); - t6 = VADD(t2, t3); - t7 = VSUB(t2, t3); - *r2 = VUNPACKHI(t4, t5); - *r3 = VUNPACKHI(t6, t7); - t7 = IMULI(inv, t7); - t0 = VADD(t4, t6); - t2 = VSUB(t4, t6); - t1 = VSUB(t5, t7); - t3 = VADD(t5, t7); - *r0 = VUNPACKLO(t0, t1); - *r1 = VUNPACKLO(t2, t3); -} - -__INLINE void -firstpass_16_f(ffts_plan_t * p, const void * in, void * out) { - const data_t *din = (const data_t *)in; - data_t *dout = (data_t *)out; - V r0_1,r2_3,r4_5,r6_7,r8_9,r10_11,r12_13,r14_15; - float *LUT8 = p->ws ; - - L_4_4(0, din+0,din+16,din+8,din+24,&r0_1,&r2_3,&r8_9,&r10_11); - L_2_4(0, din+4,din+20,din+28,din+12,&r4_5,&r6_7,&r14_15,&r12_13); - K_N(0, VLD(LUT8),VLD(LUT8+4),&r0_1,&r2_3,&r4_5,&r6_7); - K_N(0, VLD(LUT8+8),VLD(LUT8+12),&r0_1,&r4_5,&r8_9,&r12_13); - S_4(r0_1,r4_5,r8_9,r12_13,dout+0,dout+8,dout+16,dout+24); - K_N(0, VLD(LUT8+16),VLD(LUT8+20),&r2_3,&r6_7,&r10_11,&r14_15); - S_4(r2_3,r6_7,r10_11,r14_15,dout+4,dout+12,dout+20,dout+28); -} +static inline void K_N(int inv, V re, V im, V *r0, V *r1, V *r2, V *r3) +{ + V uk, uk2, zk_p, zk_n, zk, zk_d; + uk = *r0; uk2 = *r1; + zk_p = IMUL(*r2, re, im); + zk_n = IMULJ(*r3, re, im); -__INLINE void -firstpass_16_b(ffts_plan_t * p, const void * in, void * out) { - const data_t *din = (const data_t *)in; - data_t *dout = (data_t *)out; - V r0_1,r2_3,r4_5,r6_7,r8_9,r10_11,r12_13,r14_15; - float *LUT8 = p->ws ; - - L_4_4(1, din+0,din+16,din+8,din+24,&r0_1,&r2_3,&r8_9,&r10_11); - L_2_4(1, din+4,din+20,din+28,din+12,&r4_5,&r6_7,&r14_15,&r12_13); - K_N(1, VLD(LUT8),VLD(LUT8+4),&r0_1,&r2_3,&r4_5,&r6_7); - K_N(1, VLD(LUT8+8),VLD(LUT8+12),&r0_1,&r4_5,&r8_9,&r12_13); - S_4(r0_1,r4_5,r8_9,r12_13,dout+0,dout+8,dout+16,dout+24); - K_N(1, VLD(LUT8+16),VLD(LUT8+20),&r2_3,&r6_7,&r10_11,&r14_15); - S_4(r2_3,r6_7,r10_11,r14_15,dout+4,dout+12,dout+20,dout+28); -} - -__INLINE void -firstpass_8_f(ffts_plan_t * p, const void * in, void * out) { - const data_t *din = (const data_t *)in; - data_t *dout = (data_t *)out; - V r0_1,r2_3,r4_5,r6_7; - float *LUT8 = p->ws + p->ws_is[0]; - L_4_2(0, din+0,din+8,din+4,din+12,&r0_1,&r2_3,&r4_5,&r6_7); - K_N(0, VLD(LUT8),VLD(LUT8+4),&r0_1,&r2_3,&r4_5,&r6_7); - S_4(r0_1,r2_3,r4_5,r6_7,dout+0,dout+4,dout+8,dout+12); -} - -__INLINE void -firstpass_8_b(ffts_plan_t * p, const void * in, void * out) { - const data_t *din = (const data_t *)in; - data_t *dout = (data_t *)out; - V r0_1,r2_3,r4_5,r6_7; - float *LUT8 = p->ws + p->ws_is[0]; - L_4_2(1, din+0,din+8,din+4,din+12,&r0_1,&r2_3,&r4_5,&r6_7); - K_N(1, VLD(LUT8),VLD(LUT8+4),&r0_1,&r2_3,&r4_5,&r6_7); - S_4(r0_1,r2_3,r4_5,r6_7,dout+0,dout+4,dout+8,dout+12); -} - -__INLINE void -firstpass_4_f(ffts_plan_t * p, const void * in, void * out) { - const data_t *din = (const data_t *)in; - data_t *dout = (data_t *)out; - cdata_t t0, t1, t2, t3, t4, t5, t6, t7; - t0[0] = din[0]; t0[1] = din[1]; - t1[0] = din[4]; t1[1] = din[5]; - t2[0] = din[2]; t2[1] = din[3]; - t3[0] = din[6]; t3[1] = din[7]; - - t4[0] = t0[0] + t1[0]; t4[1] = t0[1] + t1[1]; - t5[0] = t0[0] - t1[0]; t5[1] = t0[1] - t1[1]; - t6[0] = t2[0] + t3[0]; t6[1] = t2[1] + t3[1]; - t7[0] = t2[0] - t3[0]; t7[1] = t2[1] - t3[1]; - - dout[0] = t4[0] + t6[0]; dout[1] = t4[1] + t6[1]; - dout[4] = t4[0] - t6[0]; dout[5] = t4[1] - t6[1]; - dout[2] = t5[0] + t7[1]; dout[3] = t5[1] - t7[0]; - dout[6] = t5[0] - t7[1]; dout[7] = t5[1] + t7[0]; -} -__INLINE void -firstpass_4_b(ffts_plan_t * p, const void * in, void * out) { - const data_t *din = (const data_t *)in; - data_t *dout = (data_t *)out; - cdata_t t0, t1, t2, t3, t4, t5, t6, t7; - t0[0] = din[0]; t0[1] = din[1]; - t1[0] = din[4]; t1[1] = din[5]; - t2[0] = din[2]; t2[1] = din[3]; - t3[0] = din[6]; t3[1] = din[7]; - - t4[0] = t0[0] + t1[0]; t4[1] = t0[1] + t1[1]; - t5[0] = t0[0] - t1[0]; t5[1] = t0[1] - t1[1]; - t6[0] = t2[0] + t3[0]; t6[1] = t2[1] + t3[1]; - t7[0] = t2[0] - t3[0]; t7[1] = t2[1] - t3[1]; - - dout[0] = t4[0] + t6[0]; dout[1] = t4[1] + t6[1]; - dout[4] = t4[0] - t6[0]; dout[5] = t4[1] - t6[1]; - dout[2] = t5[0] - t7[1]; dout[3] = t5[1] + t7[0]; - dout[6] = t5[0] + t7[1]; dout[7] = t5[1] - t7[0]; -} -__INLINE void -firstpass_2(ffts_plan_t * p, const void * in, void * out) { - const data_t *din = (const data_t *)in; - data_t *dout = (data_t *)out; - cdata_t t0, t1, r0,r1; - t0[0] = din[0]; t0[1] = din[1]; - t1[0] = din[2]; t1[1] = din[3]; - r0[0] = t0[0] + t1[0]; - r0[1] = t0[1] + t1[1]; - r1[0] = t0[0] - t1[0]; - r1[1] = t0[1] - t1[1]; - dout[0] = r0[0]; dout[1] = r0[1]; - dout[2] = r1[0]; dout[3] = r1[1]; + zk = VADD(zk_p, zk_n); + zk_d = IMULI(inv, VSUB(zk_p, zk_n)); + + *r2 = VSUB(uk, zk); + *r0 = VADD(uk, zk); + *r3 = VADD(uk2, zk_d); + *r1 = VSUB(uk2, zk_d); +} + + +static inline void S_4(V r0, V r1, V r2, V r3, + data_t * restrict o0, data_t * restrict o1, + data_t * restrict o2, data_t * restrict o3) +{ + VST(o0, r0); VST(o1, r1); VST(o2, r2); VST(o3, r3); +} + + +static inline void L_2_4(int inv, + const data_t * restrict i0, const data_t * restrict i1, + const data_t * restrict i2, const data_t * restrict i3, + V *r0, V *r1, V *r2, V *r3) +{ + V t0, t1, t2, t3, t4, t5, t6, t7; + + t0 = VLD(i0); t1 = VLD(i1); t2 = VLD(i2); t3 = VLD(i3); + t4 = VADD(t0, t1); + t5 = VSUB(t0, t1); + t6 = VADD(t2, t3); + t7 = VSUB(t2, t3); + *r0 = VUNPACKLO(t4, t5); + *r1 = VUNPACKLO(t6, t7); + t5 = IMULI(inv, t5); + t0 = VADD(t6, t4); + t2 = VSUB(t6, t4); + t1 = VSUB(t7, t5); + t3 = VADD(t7, t5); + *r3 = VUNPACKHI(t0, t1); + *r2 = VUNPACKHI(t2, t3); +} + + +static inline void L_4_4(int inv, + const data_t * restrict i0, const data_t * restrict i1, + const data_t * restrict i2, const data_t * restrict i3, + V *r0, V *r1, V *r2, V *r3) +{ + V t0, t1, t2, t3, t4, t5, t6, t7; + + t0 = VLD(i0); t1 = VLD(i1); t2 = VLD(i2); t3 = VLD(i3); + t4 = VADD(t0, t1); + t5 = VSUB(t0, t1); + t6 = VADD(t2, t3); + t7 = IMULI(inv, VSUB(t2, t3)); + t0 = VADD(t4, t6); + t2 = VSUB(t4, t6); + t1 = VSUB(t5, t7); + t3 = VADD(t5, t7); + TX2(&t0, &t1); + TX2(&t2, &t3); + *r0 = t0; *r2 = t1; *r1 = t2; *r3 = t3; +} + + + +static inline void L_4_2(int inv, + const data_t * restrict i0, const data_t * restrict i1, + const data_t * restrict i2, const data_t * restrict i3, + V *r0, V *r1, V *r2, V *r3) +{ + V t0, t1, t2, t3, t4, t5, t6, t7; + + t0 = VLD(i0); t1 = VLD(i1); t6 = VLD(i2); t7 = VLD(i3); + t2 = VBLEND(t6, t7); + t3 = VBLEND(t7, t6); + t4 = VADD(t0, t1); + t5 = VSUB(t0, t1); + t6 = VADD(t2, t3); + t7 = VSUB(t2, t3); + *r2 = VUNPACKHI(t4, t5); + *r3 = VUNPACKHI(t6, t7); + t7 = IMULI(inv, t7); + t0 = VADD(t4, t6); + t2 = VSUB(t4, t6); + t1 = VSUB(t5, t7); + t3 = VADD(t5, t7); + *r0 = VUNPACKLO(t0, t1); + *r1 = VUNPACKLO(t2, t3); } #endif diff --git a/src/sse_float.h b/src/sse_float.h index 5b4eb92..229477c 100644 --- a/src/sse_float.h +++ b/src/sse_float.h @@ -63,4 +63,22 @@ typedef __m128 V; #define FFTS_MALLOC(d,a) (_mm_malloc(d,a)) #define FFTS_FREE(d) (_mm_free(d)) +__INLINE V IMULI(int inv, V a) { + if(inv) return VSWAPPAIRS(VXOR(a, VLIT4(0.0f, -0.0f, 0.0f, -0.0f))); + else return VSWAPPAIRS(VXOR(a, VLIT4(-0.0f, 0.0f, -0.0f, 0.0f))); +} + + +__INLINE V IMUL(V d, V re, V im) { + re = VMUL(re, d); + im = VMUL(im, VSWAPPAIRS(d)); + return VSUB(re, im); +} + +__INLINE V IMULJ(V d, V re, V im) { + re = VMUL(re, d); + im = VMUL(im, VSWAPPAIRS(d)); + return VADD(re, im); +} + #endif |