summaryrefslogtreecommitdiffstats
path: root/src/cp_sse.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/cp_sse.c')
-rw-r--r--src/cp_sse.c85
1 files changed, 8 insertions, 77 deletions
diff --git a/src/cp_sse.c b/src/cp_sse.c
index f36f90b..d32155f 100644
--- a/src/cp_sse.c
+++ b/src/cp_sse.c
@@ -2,74 +2,6 @@
#include "macros.h"
#include "patterns.h"
-__INLINE void
-firstpass_type_1(const float * restrict in, float * restrict out, ffts_plan_t * restrict p) {
- size_t i, ii0 = p->i0, ii1 = p->i1;
- size_t *offsets = (size_t *)p->offsets;
- size_t *is = (size_t *)p->is;
-#ifdef __ARM_NEON__
- const data_t *i0=in+is[0],*i1=in+is[1],*i2=in+is[2],*i3=in+is[3],*i4=in+is[4],*i5=in+is[5],*i6=in+is[6],*i7=in+is[7];
- for(i=ii0;i>0;--i) {
- neon_shl8_ee(out+offsets[0],out+offsets[1],&i0,&i1,&i2,&i3,&i4,&i5,&i6,&i7);
- offsets += 2;
- }
- for(i=ii1;i>0;--i) {
- neon_shl8_oo(out+offsets[0],out+offsets[1],&i0,&i1,&i2,&i3,&i6,&i7,&i4,&i5);
- offsets += 2;
- }
- neon_shl8_oe(out+offsets[0],out+offsets[1],&i0,&i1,&i2,&i3,&i6,&i7,&i4,&i5);
- offsets += 2;
- for(i=ii1;i>0;--i) {
- neon_shl8_ee(out+offsets[0],out+offsets[1],&i6,&i7,&i4,&i5,&i0,&i1,&i3,&i2);
- offsets += 2;
- }
-
-#else
- for(i=ii0;i>0;--i) LEAF_EE(&is, in, &offsets, out);
- for(i=ii1;i>0;--i) LEAF_OO(&is, in, &offsets, out);
- LEAF_OE(&is, in, &offsets, out);
- for(i=ii1;i>0;--i) LEAF_EE(&is, in, &offsets, out);
-#endif
-}
-
-__INLINE void
-firstpass_type_2(const float * restrict in, float * restrict out, ffts_plan_t * restrict p) {
- size_t i, ii0 = p->i0, ii1 = p->i1;
- size_t *offsets = (size_t *)p->offsets;
- size_t *is = (size_t *)p->is;
-#ifdef __ARM_NEON__
- const data_t *i0=in+is[0],*i1=in+is[1],*i2=in+is[2],*i3=in+is[3],*i4=in+is[4],*i5=in+is[5],*i6=in+is[6],*i7=in+is[7];
-
- for(i=ii0;i>0;--i) {
- neon_shl8_ee(out+offsets[0],out+offsets[1],&i0,&i1,&i2,&i3,&i4,&i5,&i6,&i7);
- offsets+=2;
- }
- neon_shl8_eo(out+offsets[0],out+offsets[1],&i0,&i1,&i2,&i3,&i4,&i5,&i6,&i7);
- offsets += 2;
- for(i=ii1;i>0;--i) {
- neon_shl8_oo(out+offsets[0],out+offsets[1],&i0,&i1,&i2,&i3,&i6,&i7,&i4,&i5);
- offsets += 2;
- }
- for(i=ii1;i>0;--i) {
- neon_shl8_ee(out+offsets[0],out+offsets[1],&i6,&i7,&i4,&i5,&i0,&i1,&i3,&i2);
- offsets += 2;
- }
-
-#else
- for(i=ii0;i>0;--i) LEAF_EE(&is, in, &offsets, out);
- LEAF_EO(&is, in, &offsets, out);
- for(i=ii1;i>0;--i) LEAF_OO(&is, in, &offsets, out);
- for(i=ii1;i>0;--i) LEAF_EE(&is, in, &offsets, out);
-#endif
-}
-
-__INLINE void
-firstpass_64(const float * restrict in, float * restrict out, ffts_plan_t * restrict p) {
- size_t *offsets = (size_t *)p->offsets;
- size_t *is = (size_t *)p->is;
- LEAF_EE(&is, in, &offsets, out);
- LEAF_OE(&is, in, &offsets, out);
-}
void ffts_execute(ffts_plan_t *p, const void * restrict in, void * restrict out) {
transform_index_t *ps = p->transforms;
@@ -119,8 +51,6 @@ ffts_plan_t *ffts_init(size_t N, int sign) {
// ffts_init_tree(p, N, leafN);
// if(N == 64) p->firstpass = &firstpass_64;
- if(__builtin_ctzl(N) & 1) p->firstpass = &firstpass_type_1;
- else p->firstpass = &firstpass_type_2;
LEAFLUT[0] = VLIT4(0.70710678118654757273731092936941,0.70710678118654757273731092936941,0.70710678118654757273731092936941,0.70710678118654757273731092936941);
LEAFLUT[1] = VLIT4(0.70710678118654746171500846685376,-0.70710678118654746171500846685376,0.70710678118654746171500846685376,-0.70710678118654746171500846685376);
@@ -157,12 +87,12 @@ ffts_plan_t *ffts_init(size_t N, int sign) {
p->transforms = malloc(2 * sizeof(transform_index_t));
p->transforms[0] = 0;
p->transforms[1] = 1;
- if(N == 2) p->firstpass = &firstpass_2;
- else if(N == 4 && sign == -1) p->firstpass = &firstpass_4_f;
- else if(N == 4 && sign == 1) p->firstpass = &firstpass_4_b;
- else if(N == 8) p->firstpass = &firstpass_8;
- else if(N == 16) p->firstpass = &firstpass_16;
- else if(N == 32) p->firstpass = &firstpass_32;
+// if(N == 2) p->firstpass = &firstpass_2;
+// else if(N == 4 && sign == -1) p->firstpass = &firstpass_4_f;
+// else if(N == 4 && sign == 1) p->firstpass = &firstpass_4_b;
+// else if(N == 8) p->firstpass = &firstpass_8;
+// else if(N == 16) p->firstpass = &firstpass_16;
+// else if(N == 32) p->firstpass = &firstpass_32;
p->is = NULL;
p->offsets = NULL;
@@ -362,7 +292,8 @@ ffts_plan_t *ffts_init(size_t N, int sign) {
// tmp[4], tmp[5], tmp[6], tmp[7]);
// tmp += 8;
//}
-
+
+ fprintf(stderr, "p0 %d p1 %d\n", p->i0, p->i1);
p->N = N;
p->lastlut = w;
p->n_luts = n_luts;
OpenPOWER on IntegriCloud