summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Anthony Blake <anthonix@me.com> 2012-08-01 14:48:29 +1200
committer: Anthony Blake <anthonix@me.com> 2012-08-01 14:48:29 +1200
commit: 9993b4b77bd21971c2e1a43dbb45567b692698c7 (patch)
tree: 8052860e84b199c0a5a8c67e222d58f4388a5bea
parent: 96ff9180191b360873a3b9c384902a0a39d5b37e (diff)
download: ffts-9993b4b77bd21971c2e1a43dbb45567b692698c7.zip
          ffts-9993b4b77bd21971c2e1a43dbb45567b692698c7.tar.gz
Forwards and backwards working
-rw-r--r--  Makefile.am      2
-rw-r--r--  Makefile.in      2
-rwxr-xr-x  configure        3
-rw-r--r--  configure.ac     1
-rw-r--r--  include/ffts.h   19
-rw-r--r--  src/Makefile.in  4
-rw-r--r--  src/cp_sse.c     125
-rw-r--r--  src/cp_sse.h     11
-rw-r--r--  src/macros.h     58
-rw-r--r--  src/patterns.c   2
10 files changed, 134 insertions, 93 deletions
diff --git a/Makefile.am b/Makefile.am
index 27f515f..33b171c 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -1,3 +1,3 @@
AUTOMAKE_OPTIONS = foreign
-SUBDIRS = src
+SUBDIRS = src tests
diff --git a/Makefile.in b/Makefile.in
index b63edd2..a889030 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -217,7 +217,7 @@ top_build_prefix = @top_build_prefix@
top_builddir = @top_builddir@
top_srcdir = @top_srcdir@
AUTOMAKE_OPTIONS = foreign
-SUBDIRS = src
+SUBDIRS = src tests
all: config.h
$(MAKE) $(AM_MAKEFLAGS) all-recursive
diff --git a/configure b/configure
index 81fd0eb..d412018 100755
--- a/configure
+++ b/configure
@@ -5049,7 +5049,7 @@ done
-ac_config_files="$ac_config_files Makefile src/Makefile"
+ac_config_files="$ac_config_files Makefile src/Makefile tests/Makefile"
cat >confcache <<\_ACEOF
# This file is a shell script that caches the results of configure
@@ -5784,6 +5784,7 @@ do
"depfiles") CONFIG_COMMANDS="$CONFIG_COMMANDS depfiles" ;;
"Makefile") CONFIG_FILES="$CONFIG_FILES Makefile" ;;
"src/Makefile") CONFIG_FILES="$CONFIG_FILES src/Makefile" ;;
+ "tests/Makefile") CONFIG_FILES="$CONFIG_FILES tests/Makefile" ;;
*) as_fn_error $? "invalid argument: \`$ac_config_target'" "$LINENO" 5;;
esac
diff --git a/configure.ac b/configure.ac
index 775ae7b..06cb913 100644
--- a/configure.ac
+++ b/configure.ac
@@ -55,5 +55,6 @@ AC_CHECK_FUNCS([gettimeofday pow])
AC_CONFIG_FILES([Makefile
src/Makefile
+ tests/Makefile
])
AC_OUTPUT
diff --git a/include/ffts.h b/include/ffts.h
index 1431cf6..e266491 100644
--- a/include/ffts.h
+++ b/include/ffts.h
@@ -37,11 +37,26 @@
#include <complex.h>
#include <math.h>
#include <stdint.h>
+#include <stddef.h>
-typedef struct ffts_plan_t;
+typedef size_t transform_index_t;
+
+struct _ffts_plan_t {
+ ptrdiff_t *is;
+ ptrdiff_t *offsets;
+ void __attribute__ ((aligned(32))) **ws;
+ void (*firstpass)(const float * restrict, float * restrict, size_t, struct _ffts_plan_t * restrict);
+ size_t i0, i1, i2;
+ uint64_t n_bits, leaftime;
+
+ transform_index_t *transforms;
+};
+
+
+typedef struct _ffts_plan_t ffts_plan_t;
void ffts_execute(ffts_plan_t * restrict, const void * restrict, const void * restrict);
-ffts_plan_t *ffts_init(size_t N);
+ffts_plan_t *ffts_init(size_t N, int sign);
#endif
diff --git a/src/Makefile.in b/src/Makefile.in
index 73fecf8..fbc26bb 100644
--- a/src/Makefile.in
+++ b/src/Makefile.in
@@ -171,9 +171,9 @@ $(srcdir)/Makefile.in: $(srcdir)/Makefile.am $(am__configure_deps)
exit 1;; \
esac; \
done; \
- echo ' cd $(top_srcdir) && $(AUTOMAKE) --foreign src/Makefile'; \
+ echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu src/Makefile'; \
$(am__cd) $(top_srcdir) && \
- $(AUTOMAKE) --foreign src/Makefile
+ $(AUTOMAKE) --gnu src/Makefile
.PRECIOUS: Makefile
Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
@case '$?' in \
diff --git a/src/cp_sse.c b/src/cp_sse.c
index f59c60b..8b09031 100644
--- a/src/cp_sse.c
+++ b/src/cp_sse.c
@@ -35,23 +35,26 @@ firstpass_64(const float * restrict in, float * restrict out, size_t N, ffts_pla
void
firstpass_32(const data_t * restrict in, data_t * restrict out, size_t N, ffts_plan_t * restrict p) {
__m128 r0_1,r2_3,r4_5,r6_7,r8_9,r10_11,r12_13,r14_15,r16_17,r18_19,r20_21,r22_23,r24_25,r26_27,r28_29,r30_31;
+ float *LUT8 = p->ws[0];
+ float *LUT16 = p->ws[1];
+ float *LUT32 = p->ws[2];
L_4_4(in+0,in+32,in+16,in+48,&r0_1,&r2_3,&r16_17,&r18_19);
L_2_2(in+8,in+40,in+56,in+24,&r4_5,&r6_7,&r20_21,&r22_23);
- K_N(VLIT4(0.70710678118654757273731092936941,0.70710678118654757273731092936941,1,1),VLIT4(0.70710678118654746171500846685376,-0.70710678118654746171500846685376,-0,0),&r0_1,&r2_3,&r4_5,&r6_7);
+ K_N(_mm_load_ps(LUT8),_mm_load_ps(LUT8+4),&r0_1,&r2_3,&r4_5,&r6_7);
L_4_2(in+4,in+36,in+20,in+52,&r8_9,&r10_11,&r28_29,&r30_31);
L_4_4(in+60,in+28,in+12,in+44,&r12_13,&r14_15,&r24_25,&r26_27);
- K_N(VLIT4(0.92387953251128673848313610506011,0.92387953251128673848313610506011,1,1),VLIT4(0.38268343236508978177923268049199,-0.38268343236508978177923268049199,-0,0),&r0_1,&r4_5,&r8_9,&r12_13);
- K_N(VLIT4(0.38268343236508983729038391174981,0.38268343236508983729038391174981,0.70710678118654757273731092936941,0.70710678118654757273731092936941),VLIT4(0.92387953251128673848313610506011,-0.92387953251128673848313610506011,0.70710678118654746171500846685376,-0.70710678118654746171500846685376),&r2_3,&r6_7,&r10_11,&r14_15);
- K_N(VLIT4(0.70710678118654757273731092936941,0.70710678118654757273731092936941,1,1),VLIT4(0.70710678118654746171500846685376,-0.70710678118654746171500846685376,-0,0),&r16_17,&r18_19,&r20_21,&r22_23);
- K_N(VLIT4(0.70710678118654757273731092936941,0.70710678118654757273731092936941,1,1),VLIT4(0.70710678118654746171500846685376,-0.70710678118654746171500846685376,-0,0),&r24_25,&r26_27,&r28_29,&r30_31);
- K_N(VLIT4(0.98078528040323043057924223830923,0.98078528040323043057924223830923,1,1),VLIT4(0.19509032201612824808378832130984,-0.19509032201612824808378832130984,-0,0),&r0_1,&r8_9,&r16_17,&r24_25);
+ K_N(_mm_load_ps(LUT16),_mm_load_ps(LUT16+4),&r0_1,&r4_5,&r8_9,&r12_13);
+ K_N(_mm_load_ps(LUT16+8),_mm_load_ps(LUT16+12),&r2_3,&r6_7,&r10_11,&r14_15);
+ K_N(_mm_load_ps(LUT8),_mm_load_ps(LUT8+4),&r16_17,&r18_19,&r20_21,&r22_23);
+ K_N(_mm_load_ps(LUT8),_mm_load_ps(LUT8+4),&r24_25,&r26_27,&r28_29,&r30_31);
+ K_N(_mm_load_ps(LUT32),_mm_load_ps(LUT32+4),&r0_1,&r8_9,&r16_17,&r24_25);
S_4(r0_1,r8_9,r16_17,r24_25,out+0,out+16,out+32,out+48);
- K_N(VLIT4(0.8314696123025452356714026791451,0.8314696123025452356714026791451,0.92387953251128673848313610506011,0.92387953251128673848313610506011),VLIT4(0.55557023301960217764872140833177,-0.55557023301960217764872140833177,0.38268343236508978177923268049199,-0.38268343236508978177923268049199),&r2_3,&r10_11,&r18_19,&r26_27);
+ K_N(_mm_load_ps(LUT32+8),_mm_load_ps(LUT32+12),&r2_3,&r10_11,&r18_19,&r26_27);
S_4(r2_3,r10_11,r18_19,r26_27,out+4,out+20,out+36,out+52);
- K_N(VLIT4(0.55557023301960228867102387084742,0.55557023301960228867102387084742,0.70710678118654757273731092936941,0.70710678118654757273731092936941),VLIT4(0.83146961230254512464910021662945,-0.83146961230254512464910021662945,0.70710678118654746171500846685376,-0.70710678118654746171500846685376),&r4_5,&r12_13,&r20_21,&r28_29);
+ K_N(_mm_load_ps(LUT32+16),_mm_load_ps(LUT32+20),&r4_5,&r12_13,&r20_21,&r28_29);
S_4(r4_5,r12_13,r20_21,r28_29,out+8,out+24,out+40,out+56);
- K_N(VLIT4(0.19509032201612830359493955256767,0.19509032201612830359493955256767,0.38268343236508983729038391174981,0.38268343236508983729038391174981),VLIT4(0.98078528040323043057924223830923,-0.98078528040323043057924223830923,0.92387953251128673848313610506011,-0.92387953251128673848313610506011),&r6_7,&r14_15,&r22_23,&r30_31);
+ K_N(_mm_load_ps(LUT32+24),_mm_load_ps(LUT32+28),&r6_7,&r14_15,&r22_23,&r30_31);
S_4(r6_7,r14_15,r22_23,r30_31,out+12,out+28,out+44,out+60);
}
@@ -59,21 +62,24 @@ firstpass_32(const data_t * restrict in, data_t * restrict out, size_t N, ffts_p
void
firstpass_16(const data_t * restrict in, data_t * restrict out, size_t N, ffts_plan_t * restrict p) {
__m128 r0_1,r2_3,r4_5,r6_7,r8_9,r10_11,r12_13,r14_15;
+ float *LUT8 = p->ws[0];
+ float *LUT16 = p->ws[1];
L_4_4(in+0,in+16,in+8,in+24,&r0_1,&r2_3,&r8_9,&r10_11);
L_2_4(in+4,in+20,in+28,in+12,&r4_5,&r6_7,&r14_15,&r12_13);
- K_N(VLIT4(0.70710678118654757273731092936941,0.70710678118654757273731092936941,1,1),VLIT4(0.70710678118654746171500846685376,-0.70710678118654746171500846685376,-0,0),&r0_1,&r2_3,&r4_5,&r6_7);
- K_N(VLIT4(0.92387953251128673848313610506011,0.92387953251128673848313610506011,1,1),VLIT4(0.38268343236508978177923268049199,-0.38268343236508978177923268049199,-0,0),&r0_1,&r4_5,&r8_9,&r12_13);
+ K_N(_mm_load_ps(LUT8),_mm_load_ps(LUT8+4),&r0_1,&r2_3,&r4_5,&r6_7);
+ K_N(_mm_load_ps(LUT16),_mm_load_ps(LUT16+4),&r0_1,&r4_5,&r8_9,&r12_13);
S_4(r0_1,r4_5,r8_9,r12_13,out+0,out+8,out+16,out+24);
- K_N(VLIT4(0.38268343236508983729038391174981,0.38268343236508983729038391174981,0.70710678118654757273731092936941,0.70710678118654757273731092936941),VLIT4(0.92387953251128673848313610506011,-0.92387953251128673848313610506011,0.70710678118654746171500846685376,-0.70710678118654746171500846685376),&r2_3,&r6_7,&r10_11,&r14_15);
+ K_N(_mm_load_ps(LUT16+8),_mm_load_ps(LUT16+12),&r2_3,&r6_7,&r10_11,&r14_15);
S_4(r2_3,r6_7,r10_11,r14_15,out+4,out+12,out+20,out+28);
-
}
+
void
firstpass_8(const data_t * restrict in, data_t * restrict out, size_t N, ffts_plan_t * restrict p) {
__m128 r0_1,r2_3,r4_5,r6_7;
- L_4_2(in+0,in+8,in+4,in+12,&r0_1,&r2_3,&r4_5,&r6_7);
- K_N(VLIT4(0.70710678118654757273731092936941,0.70710678118654757273731092936941,1,1),VLIT4(0.70710678118654746171500846685376,-0.70710678118654746171500846685376,-0,0),&r0_1,&r2_3,&r4_5,&r6_7);
+ float *LUT8 = p->ws[0];
+ L_4_2(in+0,in+8,in+4,in+12,&r0_1,&r2_3,&r4_5,&r6_7);
+ K_N(_mm_load_ps(LUT8),_mm_load_ps(LUT8+4),&r0_1,&r2_3,&r4_5,&r6_7);
S_4(r0_1,r2_3,r4_5,r6_7,out+0,out+4,out+8,out+12);
}
void
@@ -165,10 +171,13 @@ void ffts_execute(ffts_plan_t *p, const void * restrict in, void * restrict out,
}
-ffts_plan_t *ffts_init(size_t N) {
+ffts_plan_t *ffts_init(size_t N, int sign) {
ffts_plan_t *p = malloc(sizeof(ffts_plan_t));
size_t leafN = 16;
size_t i;
+
+ if(sign < 0) MULI_SIGN = _mm_set_ps(-0.0f, 0.0f, -0.0f, 0.0f);
+ else MULI_SIGN = _mm_set_ps(0.0f, -0.0f, 0.0f, -0.0f);
if(N > 32) {
init_offsets(p, N, leafN);
@@ -178,25 +187,68 @@ ffts_plan_t *ffts_init(size_t N) {
if(N == 64) p->firstpass = &firstpass_64;
else if(__builtin_ctzl(N) & 1) p->firstpass = &firstpass_type_2;
else p->firstpass = &firstpass_type_1;
- /* LUTS */
- size_t n_luts = __builtin_ctzl(N/leafN);
+ LEAFLUT[0] = _mm_set_ps(0.70710678118654757273731092936941,0.70710678118654757273731092936941,0.70710678118654757273731092936941,0.70710678118654757273731092936941);
+ LEAFLUT[1] = _mm_set_ps(0.70710678118654746171500846685376,-0.70710678118654746171500846685376,0.70710678118654746171500846685376,-0.70710678118654746171500846685376);
+ LEAFLUT[2] = _mm_set_ps(0.92387953251128673848313610506011,0.92387953251128673848313610506011,0.92387953251128673848313610506011,0.92387953251128673848313610506011);
+ LEAFLUT[3] = _mm_set_ps(0.38268343236508978177923268049199,-0.38268343236508978177923268049199,0.38268343236508978177923268049199,-0.38268343236508978177923268049199);
+ LEAFLUT[4] = _mm_set_ps(0.38268343236508983729038391174981,0.38268343236508983729038391174981,0.38268343236508983729038391174981,0.38268343236508983729038391174981);
+ LEAFLUT[5] = _mm_set_ps(0.92387953251128673848313610506011,-0.92387953251128673848313610506011,0.92387953251128673848313610506011,-0.92387953251128673848313610506011);
+
+ LEAFLUT[6] = _mm_set_ps(0.70710678118654757273731092936941,0.70710678118654757273731092936941,1,1);
+ LEAFLUT[7] = _mm_set_ps(0.70710678118654746171500846685376,-0.70710678118654746171500846685376,0,-0);
+ LEAFLUT[8] = _mm_set_ps(0.92387953251128673848313610506011,0.92387953251128673848313610506011,1,1);
+ LEAFLUT[9] = _mm_set_ps(0.38268343236508978177923268049199,-0.38268343236508978177923268049199,0,-0);
+ LEAFLUT[10] = _mm_set_ps(0.38268343236508983729038391174981,0.38268343236508983729038391174981,0.70710678118654757273731092936941,0.70710678118654757273731092936941);
+ LEAFLUT[11] = _mm_set_ps(0.92387953251128673848313610506011,-0.92387953251128673848313610506011,0.70710678118654746171500846685376,-0.70710678118654746171500846685376);
+
+ if(sign > 0) {
+ LEAFLUT[1] = _mm_xor_ps(LEAFLUT[1], _mm_set_ps(-0.0f,-0.0f,-0.0f,-0.0f));
+ LEAFLUT[3] = _mm_xor_ps(LEAFLUT[3], _mm_set_ps(-0.0f,-0.0f,-0.0f,-0.0f));
+ LEAFLUT[5] = _mm_xor_ps(LEAFLUT[5], _mm_set_ps(-0.0f,-0.0f,-0.0f,-0.0f));
+ LEAFLUT[7] = _mm_xor_ps(LEAFLUT[7], _mm_set_ps(-0.0f,-0.0f,-0.0f,-0.0f));
+ LEAFLUT[9] = _mm_xor_ps(LEAFLUT[9], _mm_set_ps(-0.0f,-0.0f,-0.0f,-0.0f));
+ LEAFLUT[11] = _mm_xor_ps(LEAFLUT[11], _mm_set_ps(-0.0f,-0.0f,-0.0f,-0.0f));
+ }
+
p->i0 = N/leafN/3+1;
p->i1 = N/leafN/3;
if((N/leafN) % 3 > 1) p->i1++;
p->i0/=2;
p->i1/=2;
+ }else{
+ p->transforms = malloc(2 * sizeof(transform_index_t));
+ p->transforms[0] = 0;
+ p->transforms[1] = 1;
+ if(N == 2) p->firstpass = &firstpass_2;
+ else if(N == 4) p->firstpass = &firstpass_4;
+ else if(N == 8) p->firstpass = &firstpass_8;
+ else if(N == 16) p->firstpass = &firstpass_16;
+ else if(N == 32) p->firstpass = &firstpass_32;
+
+ }
+
+ int hardcoded = 0;
- // printf("n_luts = %zu\n", n_luts);
+ /* LUTS */
+ size_t n_luts = __builtin_ctzl(N/leafN);
+ if(N <= 32) { n_luts = __builtin_ctzl(N/4); hardcoded = 1; }
+
+
+ //printf("n_luts = %zu\n", n_luts);
p->ws = malloc(n_luts * sizeof(data_t *));
cdata_t *w;
int n = leafN*2;
+ if(hardcoded) n = 8;
+
for(i=0;i<n_luts;i++) {
- // printf("LUT[%zu] = %d\n", i, n);
- if(!i) {
+
+ //printf("LUT[%zu] = %d\n", i, n);
+
+ if(!i || hardcoded) {
w = _mm_malloc(n/4 * 2 * sizeof(cdata_t), 32);
@@ -216,15 +268,15 @@ ffts_plan_t *ffts_init(size_t N) {
__m128 re, im;
re = _mm_shuffle_ps(temp0, temp0, _MM_SHUFFLE(2, 2, 0, 0));
im = _mm_shuffle_ps(temp0, temp0, _MM_SHUFFLE(3, 3, 1, 1));
- im = _mm_xor_ps(im, _mm_set_ps(-0.0f, 0.0f, -0.0f, 0.0f));
+ im = _mm_xor_ps(im, MULI_SIGN);
_mm_store_ps(fw + j*4 , re);
_mm_store_ps(fw + j*4+4, im);
}
- // for(j=0;j<n/2;j++) {
- // printf("%f %f\n", creal(w[j]), cimag(w[j]));
+ // for(j=0;j<n/2;j++) {
+ // printf("%f %f\n", creal(w[j]), cimag(w[j]));
- // }
+ // }
_mm_free(w0);
}else{
@@ -252,21 +304,21 @@ ffts_plan_t *ffts_init(size_t N) {
temp0 = _mm_load_ps(fw0 + j*2);
re = _mm_shuffle_ps(temp0, temp0, _MM_SHUFFLE(2, 2, 0, 0));
im = _mm_shuffle_ps(temp0, temp0, _MM_SHUFFLE(3, 3, 1, 1));
- im = _mm_xor_ps(im, _mm_set_ps(-0.0f, 0.0f, -0.0f, 0.0f));
+ im = _mm_xor_ps(im, MULI_SIGN);
_mm_store_ps(fw + j*2*6 , re);
_mm_store_ps(fw + j*2*6+4, im);
temp1 = _mm_load_ps(fw1 + j*2);
re = _mm_shuffle_ps(temp1, temp1, _MM_SHUFFLE(2, 2, 0, 0));
im = _mm_shuffle_ps(temp1, temp1, _MM_SHUFFLE(3, 3, 1, 1));
- im = _mm_xor_ps(im, _mm_set_ps(-0.0f, 0.0f, -0.0f, 0.0f));
+ im = _mm_xor_ps(im, MULI_SIGN);
_mm_store_ps(fw + j*2*6+8 , re);
_mm_store_ps(fw + j*2*6+12, im);
temp2 = _mm_load_ps(fw2 + j*2);
re = _mm_shuffle_ps(temp2, temp2, _MM_SHUFFLE(2, 2, 0, 0));
im = _mm_shuffle_ps(temp2, temp2, _MM_SHUFFLE(3, 3, 1, 1));
- im = _mm_xor_ps(im, _mm_set_ps(-0.0f, 0.0f, -0.0f, 0.0f));
+ im = _mm_xor_ps(im, MULI_SIGN);
_mm_store_ps(fw + j*2*6+16, re);
_mm_store_ps(fw + j*2*6+20, im);
}
@@ -280,22 +332,11 @@ ffts_plan_t *ffts_init(size_t N) {
n *= 2;
}
- p->n_bits = log(N)/log(2) - log(leafN*2)/log(2);
- }else{
- p->transforms = malloc(2 * sizeof(transform_index_t));
- p->transforms[0] = 0;
- p->transforms[1] = 1;
- if(N == 2) p->firstpass = &firstpass_2;
- else if(N == 4) p->firstpass = &firstpass_4;
- else if(N == 8) p->firstpass = &firstpass_8;
- else if(N == 16) p->firstpass = &firstpass_16;
- else if(N == 32) p->firstpass = &firstpass_32;
-
- }
+
return p;
}
-
+/*
int main(int argc, char *argv[]) {
int n = atoi(argv[1]);
int count = atoi(argv[2]);
@@ -341,4 +382,4 @@ int main(int argc, char *argv[]) {
printf("Time: %f seconds, CTGs: %f Leaftime: %f \n", tt, ctgs, lt);
return 0;
-}
+}*/
diff --git a/src/cp_sse.h b/src/cp_sse.h
index c6d15dc..2c6825f 100644
--- a/src/cp_sse.h
+++ b/src/cp_sse.h
@@ -8,7 +8,6 @@
#include <stddef.h>
#include <xmmintrin.h>
#include <stdint.h>
-#include <mach/mach_time.h>
typedef complex float cdata_t;
typedef float data_t;
@@ -22,19 +21,11 @@ struct _ffts_plan_t {
ptrdiff_t *offsets;
void __attribute__ ((aligned(32))) **ws;
void (*firstpass)(const float * restrict, float * restrict, size_t, struct _ffts_plan_t * restrict);
- size_t i0, i1, i2;
- uint64_t n_bits, leaftime;
+ size_t i0, i1;
transform_index_t *transforms;
};
-
typedef struct _ffts_plan_t ffts_plan_t;
-
-typedef struct _split_vec_t {
- __m128 re, im;
-} split_vec_t;
-
-
#endif
diff --git a/src/macros.h b/src/macros.h
index 0a84802..b2f44e6 100644
--- a/src/macros.h
+++ b/src/macros.h
@@ -7,8 +7,10 @@
#define VLIT4 _mm_set_ps
+__m128 MULI_SIGN;
+
__INLINE __m128 IMULI(__m128 a) {
- __m128 temp = _mm_xor_ps(a, _mm_set_ps(-0.0f, 0.0f, -0.0f, 0.0f));
+ __m128 temp = _mm_xor_ps(a, MULI_SIGN);//_mm_set_ps(-0.0f, 0.0f, -0.0f, 0.0f));
return _mm_shuffle_ps(temp, temp, _MM_SHUFFLE(2,3,0,1));
}
@@ -113,6 +115,8 @@ __INLINE void TX2(__m128 *a, __m128 *b) {
*a = TX2_t0; *b = TX2_t1;
}
+__m128 __attribute__((aligned(32))) LEAFLUT[12];
+
__INLINE void
LEAF_EE(size_t ** restrict is, const data_t * restrict in, size_t ** restrict out_offsets, data_t * restrict out) {
__m128 r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11,r12,r13,r14,r15;
@@ -124,23 +128,17 @@ LEAF_EE(size_t ** restrict is, const data_t * restrict in, size_t ** restrict ou
L_4(in+(*is)[0],in+(*is)[1],in+(*is)[2],in+(*is)[3],&r0,&r1,&r2,&r3);
L_2(in+(*is)[4],in+(*is)[5],in+(*is)[6],in+(*is)[7],&r4,&r5,&r6,&r7);
K_0(&r0,&r2,&r4,&r6);
- K_N(_mm_set_ps(0.70710678118654757273731092936941,0.70710678118654757273731092936941,0.70710678118654757273731092936941,0.70710678118654757273731092936941),_mm_set_ps(0.70710678118654746171500846685376,-0.70710678118654746171500846685376,0.70710678118654746171500846685376,-0.70710678118654746171500846685376),&r1,&r3,&r5,&r7);
+ K_N(LEAFLUT[0],LEAFLUT[1],&r1,&r3,&r5,&r7);
L_4(in+(*is)[8],in+(*is)[9],in+(*is)[10],in+(*is)[11],&r8,&r9,&r10,&r11);
L_4(in+(*is)[12],in+(*is)[13],in+(*is)[14],in+(*is)[15],&r12,&r13,&r14,&r15);
K_0(&r0,&r4,&r8,&r12);
- K_N(_mm_set_ps(0.92387953251128673848313610506011,0.92387953251128673848313610506011,0.92387953251128673848313610506011,0.92387953251128673848313610506011),_mm_set_ps(0.38268343236508978177923268049199,-0.38268343236508978177923268049199,0.38268343236508978177923268049199,-0.38268343236508978177923268049199),&r1,&r5,&r9,&r13);
- TX2(&r0,&r1);
- TX2(&r4,&r5);
- TX2(&r8,&r9);
- TX2(&r12,&r13);
+ K_N(LEAFLUT[2],LEAFLUT[3],&r1,&r5,&r9,&r13);
+ TX2(&r0,&r1); TX2(&r4,&r5); TX2(&r8,&r9); TX2(&r12,&r13);
S_4(r0,r4,r8,r12,out0+0,out0+8,out0+16,out0+24);
S_4(r1,r5,r9,r13,out1+0,out1+8,out1+16,out1+24);
- K_N(_mm_set_ps(0.70710678118654757273731092936941,0.70710678118654757273731092936941,0.70710678118654757273731092936941,0.70710678118654757273731092936941),_mm_set_ps(0.70710678118654746171500846685376,-0.70710678118654746171500846685376,0.70710678118654746171500846685376,-0.70710678118654746171500846685376),&r2,&r6,&r10,&r14);
- K_N(_mm_set_ps(0.38268343236508983729038391174981,0.38268343236508983729038391174981,0.38268343236508983729038391174981,0.38268343236508983729038391174981),_mm_set_ps(0.92387953251128673848313610506011,-0.92387953251128673848313610506011,0.92387953251128673848313610506011,-0.92387953251128673848313610506011),&r3,&r7,&r11,&r15);
- TX2(&r2,&r3);
- TX2(&r6,&r7);
- TX2(&r10,&r11);
- TX2(&r14,&r15);
+ K_N(LEAFLUT[0],LEAFLUT[1],&r2,&r6,&r10,&r14);
+ K_N(LEAFLUT[4],LEAFLUT[5],&r3,&r7,&r11,&r15);
+ TX2(&r2,&r3); TX2(&r6,&r7); TX2(&r10,&r11); TX2(&r14,&r15);
S_4(r2,r6,r10,r14,out0+4,out0+12,out0+20,out0+28);
S_4(r3,r7,r11,r15,out1+4,out1+12,out1+20,out1+28);
*is += 16;
@@ -158,21 +156,15 @@ LEAF_OO(size_t ** restrict is, const data_t * restrict in, size_t ** restrict ou
L_4(in+(*is)[0],in+(*is)[1],in+(*is)[2],in+(*is)[3],&r0,&r1,&r2,&r3);
L_2(in+(*is)[4],in+(*is)[5],in+(*is)[6],in+(*is)[7],&r4,&r5,&r6,&r7);
K_0(&r0,&r2,&r4,&r6);
- K_N(_mm_set_ps(0.70710678118654757273731092936941,0.70710678118654757273731092936941,0.70710678118654757273731092936941,0.70710678118654757273731092936941),_mm_set_ps(0.70710678118654746171500846685376,-0.70710678118654746171500846685376,0.70710678118654746171500846685376,-0.70710678118654746171500846685376),&r1,&r3,&r5,&r7);
- TX2(&r0,&r1);
- TX2(&r2,&r3);
- TX2(&r4,&r5);
- TX2(&r6,&r7);
+ K_N(LEAFLUT[0],LEAFLUT[1],&r1,&r3,&r5,&r7);
+ TX2(&r0,&r1); TX2(&r2,&r3); TX2(&r4,&r5); TX2(&r6,&r7);
S_4(r0,r2,r4,r6,out0+0,out0+4,out0+8,out0+12);
S_4(r1,r3,r5,r7,out1+0,out1+4,out1+8,out1+12);
L_4(in+(*is)[8],in+(*is)[9],in+(*is)[10],in+(*is)[11],&r8,&r9,&r10,&r11);
L_2(in+(*is)[12],in+(*is)[13],in+(*is)[14],in+(*is)[15],&r12,&r13,&r14,&r15);
K_0(&r8,&r10,&r12,&r14);
- K_N(_mm_set_ps(0.70710678118654757273731092936941,0.70710678118654757273731092936941,0.70710678118654757273731092936941,0.70710678118654757273731092936941),_mm_set_ps(0.70710678118654746171500846685376,-0.70710678118654746171500846685376,0.70710678118654746171500846685376,-0.70710678118654746171500846685376),&r9,&r11,&r13,&r15);
- TX2(&r8,&r9);
- TX2(&r10,&r11);
- TX2(&r12,&r13);
- TX2(&r14,&r15);
+ K_N(LEAFLUT[0],LEAFLUT[1],&r9,&r11,&r13,&r15);
+ TX2(&r8,&r9); TX2(&r10,&r11); TX2(&r12,&r13); TX2(&r14,&r15);
S_4(r8,r10,r12,r14,out0+16,out0+20,out0+24,out0+28);
S_4(r9,r11,r13,r15,out1+16,out1+20,out1+24,out1+28);
@@ -264,16 +256,16 @@ LEAF_OE(size_t ** restrict is, const data_t * restrict in, size_t ** restrict ou
L_4_4(in+(*is)[0],in+(*is)[1],in+(*is)[2],in+(*is)[3],&r0_1,&r2_3,&r24_25,&r26_27);
L_2_4(in+(*is)[4],in+(*is)[5],in+(*is)[6],in+(*is)[7],&r4_5,&r6_7,&r30_31,&r28_29);
- K_N(_mm_set_ps(0.70710678118654757273731092936941,0.70710678118654757273731092936941,1,1),_mm_set_ps(0.70710678118654746171500846685376,-0.70710678118654746171500846685376,0,-0),&r0_1,&r2_3,&r4_5,&r6_7);
+ K_N(LEAFLUT[6],LEAFLUT[7],&r0_1,&r2_3,&r4_5,&r6_7);
S_4(r0_1,r2_3,r4_5,r6_7,out0+0,out0+4,out0+8,out0+12);
L_4_4(in+(*is)[8],in+(*is)[9],in+(*is)[10],in+(*is)[11],&r8_9,&r10_11,&r16_17,&r18_19);
L_2_2(in+(*is)[12],in+(*is)[13],in+(*is)[14],in+(*is)[15],&r12_13,&r14_15,&r20_21,&r22_23);
- K_N(_mm_set_ps(0.70710678118654757273731092936941,0.70710678118654757273731092936941,1,1),_mm_set_ps(0.70710678118654746171500846685376,-0.70710678118654746171500846685376,0,-0),&r8_9,&r10_11,&r12_13,&r14_15);
+ K_N(LEAFLUT[6],LEAFLUT[7],&r8_9,&r10_11,&r12_13,&r14_15);
S_4(r8_9,r10_11,r12_13,r14_15,out0+16,out0+20,out0+24,out0+28);
- K_N(_mm_set_ps(0.70710678118654757273731092936941,0.70710678118654757273731092936941,1,1),_mm_set_ps(0.70710678118654746171500846685376,-0.70710678118654746171500846685376,0,-0),&r16_17,&r18_19,&r20_21,&r22_23);
- K_N(_mm_set_ps(0.92387953251128673848313610506011,0.92387953251128673848313610506011,1,1),_mm_set_ps(0.38268343236508978177923268049199,-0.38268343236508978177923268049199,0,-0),&r16_17,&r20_21,&r24_25,&r28_29);
+ K_N(LEAFLUT[6],LEAFLUT[7],&r16_17,&r18_19,&r20_21,&r22_23);
+ K_N(LEAFLUT[8],LEAFLUT[9],&r16_17,&r20_21,&r24_25,&r28_29);
S_4(r16_17,r20_21,r24_25,r28_29,out1+0,out1+8,out1+16,out1+24);
- K_N(_mm_set_ps(0.38268343236508983729038391174981,0.38268343236508983729038391174981,0.70710678118654757273731092936941,0.70710678118654757273731092936941),_mm_set_ps(0.92387953251128673848313610506011,-0.92387953251128673848313610506011,0.70710678118654746171500846685376,-0.70710678118654746171500846685376),&r18_19,&r22_23,&r26_27,&r30_31);
+ K_N(LEAFLUT[10],LEAFLUT[11],&r18_19,&r22_23,&r26_27,&r30_31);
S_4(r18_19,r22_23,r26_27,r30_31,out1+4,out1+12,out1+20,out1+28);
*is += 16;
@@ -289,16 +281,16 @@ LEAF_EO(size_t ** restrict is, const data_t * restrict in, size_t ** restrict ou
L_4_4(in+(*is)[0],in+(*is)[1],in+(*is)[2],in+(*is)[3],&r0_1,&r2_3,&r16_17,&r18_19);
L_2_2(in+(*is)[4],in+(*is)[5],in+(*is)[6],in+(*is)[7],&r4_5,&r6_7,&r20_21,&r22_23);
- K_N(_mm_set_ps(0.70710678118654757273731092936941,0.70710678118654757273731092936941,1,1),_mm_set_ps(0.70710678118654746171500846685376,-0.70710678118654746171500846685376,0,-0),&r0_1,&r2_3,&r4_5,&r6_7);
+ K_N(LEAFLUT[6],LEAFLUT[7],&r0_1,&r2_3,&r4_5,&r6_7);
L_4_2(in+(*is)[8],in+(*is)[9],in+(*is)[10],in+(*is)[11],&r8_9,&r10_11,&r28_29,&r30_31);
L_4_4(in+(*is)[12],in+(*is)[13],in+(*is)[14],in+(*is)[15],&r12_13,&r14_15,&r24_25,&r26_27);
- K_N(_mm_set_ps(0.92387953251128673848313610506011,0.92387953251128673848313610506011,1,1),_mm_set_ps(0.38268343236508978177923268049199,-0.38268343236508978177923268049199,0,-0),&r0_1,&r4_5,&r8_9,&r12_13);
+ K_N(LEAFLUT[8],LEAFLUT[9],&r0_1,&r4_5,&r8_9,&r12_13);
S_4(r0_1,r4_5,r8_9,r12_13,out0+0,out0+8,out0+16,out0+24);
- K_N(_mm_set_ps(0.38268343236508983729038391174981,0.38268343236508983729038391174981,0.70710678118654757273731092936941,0.70710678118654757273731092936941),_mm_set_ps(0.92387953251128673848313610506011,-0.92387953251128673848313610506011,0.70710678118654746171500846685376,-0.70710678118654746171500846685376),&r2_3,&r6_7,&r10_11,&r14_15);
+ K_N(LEAFLUT[10],LEAFLUT[11],&r2_3,&r6_7,&r10_11,&r14_15);
S_4(r2_3,r6_7,r10_11,r14_15,out0+4,out0+12,out0+20,out0+28);
- K_N(_mm_set_ps(0.70710678118654757273731092936941,0.70710678118654757273731092936941,1,1),_mm_set_ps(0.70710678118654746171500846685376,-0.70710678118654746171500846685376,0,-0),&r16_17,&r18_19,&r20_21,&r22_23);
+ K_N(LEAFLUT[6],LEAFLUT[7],&r16_17,&r18_19,&r20_21,&r22_23);
S_4(r16_17,r18_19,r20_21,r22_23,out1+0,out1+4,out1+8,out1+12);
- K_N(_mm_set_ps(0.70710678118654757273731092936941,0.70710678118654757273731092936941,1,1),_mm_set_ps(0.70710678118654746171500846685376,-0.70710678118654746171500846685376,0,-0),&r24_25,&r26_27,&r28_29,&r30_31);
+ K_N(LEAFLUT[6],LEAFLUT[7],&r24_25,&r26_27,&r28_29,&r30_31);
S_4(r24_25,r26_27,r28_29,r30_31,out1+16,out1+20,out1+24,out1+28);
*is += 16;
diff --git a/src/patterns.c b/src/patterns.c
index 1ab593f..12d9a4c 100644
--- a/src/patterns.c
+++ b/src/patterns.c
@@ -66,7 +66,7 @@ void init_is(ffts_plan_t *p, int N, int leafN, int VL) {
// if(i % 16 == 15) printf("\n");
//}
- p->i0 = i0; p->i1 = i1; p->i2 = i2;
+ p->i0 = i0; p->i1 = i1;
}
void elaborate_offsets(ptrdiff_t *offsets, int leafN, int N, int ioffset, int ooffset, int stride, int even) {
OpenPOWER on IntegriCloud