diff options
author | Anthony Blake <anthonix@me.com> | 2012-08-01 14:48:29 +1200 |
---|---|---|
committer | Anthony Blake <anthonix@me.com> | 2012-08-01 14:48:29 +1200 |
commit | 9993b4b77bd21971c2e1a43dbb45567b692698c7 (patch) | |
tree | 8052860e84b199c0a5a8c67e222d58f4388a5bea | |
parent | 96ff9180191b360873a3b9c384902a0a39d5b37e (diff) | |
download | ffts-9993b4b77bd21971c2e1a43dbb45567b692698c7.zip ffts-9993b4b77bd21971c2e1a43dbb45567b692698c7.tar.gz |
Forwards and backwards working
-rw-r--r-- | Makefile.am | 2 | ||||
-rw-r--r-- | Makefile.in | 2 | ||||
-rwxr-xr-x | configure | 3 | ||||
-rw-r--r-- | configure.ac | 1 | ||||
-rw-r--r-- | include/ffts.h | 19 | ||||
-rw-r--r-- | src/Makefile.in | 4 | ||||
-rw-r--r-- | src/cp_sse.c | 125 | ||||
-rw-r--r-- | src/cp_sse.h | 11 | ||||
-rw-r--r-- | src/macros.h | 58 | ||||
-rw-r--r-- | src/patterns.c | 2 |
10 files changed, 134 insertions, 93 deletions
diff --git a/Makefile.am b/Makefile.am index 27f515f..33b171c 100644 --- a/Makefile.am +++ b/Makefile.am @@ -1,3 +1,3 @@ AUTOMAKE_OPTIONS = foreign -SUBDIRS = src +SUBDIRS = src tests diff --git a/Makefile.in b/Makefile.in index b63edd2..a889030 100644 --- a/Makefile.in +++ b/Makefile.in @@ -217,7 +217,7 @@ top_build_prefix = @top_build_prefix@ top_builddir = @top_builddir@ top_srcdir = @top_srcdir@ AUTOMAKE_OPTIONS = foreign -SUBDIRS = src +SUBDIRS = src tests all: config.h $(MAKE) $(AM_MAKEFLAGS) all-recursive @@ -5049,7 +5049,7 @@ done -ac_config_files="$ac_config_files Makefile src/Makefile" +ac_config_files="$ac_config_files Makefile src/Makefile tests/Makefile" cat >confcache <<\_ACEOF # This file is a shell script that caches the results of configure @@ -5784,6 +5784,7 @@ do "depfiles") CONFIG_COMMANDS="$CONFIG_COMMANDS depfiles" ;; "Makefile") CONFIG_FILES="$CONFIG_FILES Makefile" ;; "src/Makefile") CONFIG_FILES="$CONFIG_FILES src/Makefile" ;; + "tests/Makefile") CONFIG_FILES="$CONFIG_FILES tests/Makefile" ;; *) as_fn_error $? "invalid argument: \`$ac_config_target'" "$LINENO" 5;; esac diff --git a/configure.ac b/configure.ac index 775ae7b..06cb913 100644 --- a/configure.ac +++ b/configure.ac @@ -55,5 +55,6 @@ AC_CHECK_FUNCS([gettimeofday pow]) AC_CONFIG_FILES([Makefile src/Makefile + tests/Makefile ]) AC_OUTPUT diff --git a/include/ffts.h b/include/ffts.h index 1431cf6..e266491 100644 --- a/include/ffts.h +++ b/include/ffts.h @@ -37,11 +37,26 @@ #include <complex.h> #include <math.h> #include <stdint.h> +#include <stddef.h> -typedef struct ffts_plan_t; +typedef size_t transform_index_t; + +struct _ffts_plan_t { + ptrdiff_t *is; + ptrdiff_t *offsets; + void __attribute__ ((aligned(32))) **ws; + void (*firstpass)(const float * restrict, float * restrict, size_t, struct _ffts_plan_t * restrict); + size_t i0, i1, i2; + uint64_t n_bits, leaftime; + + transform_index_t *transforms; +}; + + +typedef struct _ffts_plan_t ffts_plan_t; void ffts_execute(ffts_plan_t * restrict, const void * restrict, const void * restrict); -ffts_plan_t *ffts_init(size_t N); +ffts_plan_t *ffts_init(size_t N, int sign); #endif diff --git a/src/Makefile.in b/src/Makefile.in index 73fecf8..fbc26bb 100644 --- a/src/Makefile.in +++ b/src/Makefile.in @@ -171,9 +171,9 @@ $(srcdir)/Makefile.in: $(srcdir)/Makefile.am $(am__configure_deps) exit 1;; \ esac; \ done; \ - echo ' cd $(top_srcdir) && $(AUTOMAKE) --foreign src/Makefile'; \ + echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu src/Makefile'; \ $(am__cd) $(top_srcdir) && \ - $(AUTOMAKE) --foreign src/Makefile + $(AUTOMAKE) --gnu src/Makefile .PRECIOUS: Makefile Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status @case '$?' in \ diff --git a/src/cp_sse.c b/src/cp_sse.c index f59c60b..8b09031 100644 --- a/src/cp_sse.c +++ b/src/cp_sse.c @@ -35,23 +35,26 @@ firstpass_64(const float * restrict in, float * restrict out, size_t N, ffts_pla void firstpass_32(const data_t * restrict in, data_t * restrict out, size_t N, ffts_plan_t * restrict p) { __m128 r0_1,r2_3,r4_5,r6_7,r8_9,r10_11,r12_13,r14_15,r16_17,r18_19,r20_21,r22_23,r24_25,r26_27,r28_29,r30_31; + float *LUT8 = p->ws[0]; + float *LUT16 = p->ws[1]; + float *LUT32 = p->ws[2]; L_4_4(in+0,in+32,in+16,in+48,&r0_1,&r2_3,&r16_17,&r18_19); L_2_2(in+8,in+40,in+56,in+24,&r4_5,&r6_7,&r20_21,&r22_23); - K_N(VLIT4(0.70710678118654757273731092936941,0.70710678118654757273731092936941,1,1),VLIT4(0.70710678118654746171500846685376,-0.70710678118654746171500846685376,-0,0),&r0_1,&r2_3,&r4_5,&r6_7); + K_N(_mm_load_ps(LUT8),_mm_load_ps(LUT8+4),&r0_1,&r2_3,&r4_5,&r6_7); L_4_2(in+4,in+36,in+20,in+52,&r8_9,&r10_11,&r28_29,&r30_31); L_4_4(in+60,in+28,in+12,in+44,&r12_13,&r14_15,&r24_25,&r26_27); - K_N(VLIT4(0.92387953251128673848313610506011,0.92387953251128673848313610506011,1,1),VLIT4(0.38268343236508978177923268049199,-0.38268343236508978177923268049199,-0,0),&r0_1,&r4_5,&r8_9,&r12_13); - K_N(VLIT4(0.38268343236508983729038391174981,0.38268343236508983729038391174981,0.70710678118654757273731092936941,0.70710678118654757273731092936941),VLIT4(0.92387953251128673848313610506011,-0.92387953251128673848313610506011,0.70710678118654746171500846685376,-0.70710678118654746171500846685376),&r2_3,&r6_7,&r10_11,&r14_15); - K_N(VLIT4(0.70710678118654757273731092936941,0.70710678118654757273731092936941,1,1),VLIT4(0.70710678118654746171500846685376,-0.70710678118654746171500846685376,-0,0),&r16_17,&r18_19,&r20_21,&r22_23); - K_N(VLIT4(0.70710678118654757273731092936941,0.70710678118654757273731092936941,1,1),VLIT4(0.70710678118654746171500846685376,-0.70710678118654746171500846685376,-0,0),&r24_25,&r26_27,&r28_29,&r30_31); - K_N(VLIT4(0.98078528040323043057924223830923,0.98078528040323043057924223830923,1,1),VLIT4(0.19509032201612824808378832130984,-0.19509032201612824808378832130984,-0,0),&r0_1,&r8_9,&r16_17,&r24_25); + K_N(_mm_load_ps(LUT16),_mm_load_ps(LUT16+4),&r0_1,&r4_5,&r8_9,&r12_13); + K_N(_mm_load_ps(LUT16+8),_mm_load_ps(LUT16+12),&r2_3,&r6_7,&r10_11,&r14_15); + K_N(_mm_load_ps(LUT8),_mm_load_ps(LUT8+4),&r16_17,&r18_19,&r20_21,&r22_23); + K_N(_mm_load_ps(LUT8),_mm_load_ps(LUT8+4),&r24_25,&r26_27,&r28_29,&r30_31); + K_N(_mm_load_ps(LUT32),_mm_load_ps(LUT32+4),&r0_1,&r8_9,&r16_17,&r24_25); S_4(r0_1,r8_9,r16_17,r24_25,out+0,out+16,out+32,out+48); - K_N(VLIT4(0.8314696123025452356714026791451,0.8314696123025452356714026791451,0.92387953251128673848313610506011,0.92387953251128673848313610506011),VLIT4(0.55557023301960217764872140833177,-0.55557023301960217764872140833177,0.38268343236508978177923268049199,-0.38268343236508978177923268049199),&r2_3,&r10_11,&r18_19,&r26_27); + K_N(_mm_load_ps(LUT32+8),_mm_load_ps(LUT32+12),&r2_3,&r10_11,&r18_19,&r26_27); S_4(r2_3,r10_11,r18_19,r26_27,out+4,out+20,out+36,out+52); - K_N(VLIT4(0.55557023301960228867102387084742,0.55557023301960228867102387084742,0.70710678118654757273731092936941,0.70710678118654757273731092936941),VLIT4(0.83146961230254512464910021662945,-0.83146961230254512464910021662945,0.70710678118654746171500846685376,-0.70710678118654746171500846685376),&r4_5,&r12_13,&r20_21,&r28_29); + K_N(_mm_load_ps(LUT32+16),_mm_load_ps(LUT32+20),&r4_5,&r12_13,&r20_21,&r28_29); S_4(r4_5,r12_13,r20_21,r28_29,out+8,out+24,out+40,out+56); - K_N(VLIT4(0.19509032201612830359493955256767,0.19509032201612830359493955256767,0.38268343236508983729038391174981,0.38268343236508983729038391174981),VLIT4(0.98078528040323043057924223830923,-0.98078528040323043057924223830923,0.92387953251128673848313610506011,-0.92387953251128673848313610506011),&r6_7,&r14_15,&r22_23,&r30_31); + K_N(_mm_load_ps(LUT32+24),_mm_load_ps(LUT32+28),&r6_7,&r14_15,&r22_23,&r30_31); S_4(r6_7,r14_15,r22_23,r30_31,out+12,out+28,out+44,out+60); } @@ -59,21 +62,24 @@ firstpass_32(const data_t * restrict in, data_t * restrict out, size_t N, ffts_p void firstpass_16(const data_t * restrict in, data_t * restrict out, size_t N, ffts_plan_t * restrict p) { __m128 r0_1,r2_3,r4_5,r6_7,r8_9,r10_11,r12_13,r14_15; + float *LUT8 = p->ws[0]; + float *LUT16 = p->ws[1]; L_4_4(in+0,in+16,in+8,in+24,&r0_1,&r2_3,&r8_9,&r10_11); L_2_4(in+4,in+20,in+28,in+12,&r4_5,&r6_7,&r14_15,&r12_13); - K_N(VLIT4(0.70710678118654757273731092936941,0.70710678118654757273731092936941,1,1),VLIT4(0.70710678118654746171500846685376,-0.70710678118654746171500846685376,-0,0),&r0_1,&r2_3,&r4_5,&r6_7); - K_N(VLIT4(0.92387953251128673848313610506011,0.92387953251128673848313610506011,1,1),VLIT4(0.38268343236508978177923268049199,-0.38268343236508978177923268049199,-0,0),&r0_1,&r4_5,&r8_9,&r12_13); + K_N(_mm_load_ps(LUT8),_mm_load_ps(LUT8+4),&r0_1,&r2_3,&r4_5,&r6_7); + K_N(_mm_load_ps(LUT16),_mm_load_ps(LUT16+4),&r0_1,&r4_5,&r8_9,&r12_13); S_4(r0_1,r4_5,r8_9,r12_13,out+0,out+8,out+16,out+24); - K_N(VLIT4(0.38268343236508983729038391174981,0.38268343236508983729038391174981,0.70710678118654757273731092936941,0.70710678118654757273731092936941),VLIT4(0.92387953251128673848313610506011,-0.92387953251128673848313610506011,0.70710678118654746171500846685376,-0.70710678118654746171500846685376),&r2_3,&r6_7,&r10_11,&r14_15); + K_N(_mm_load_ps(LUT16+8),_mm_load_ps(LUT16+12),&r2_3,&r6_7,&r10_11,&r14_15); S_4(r2_3,r6_7,r10_11,r14_15,out+4,out+12,out+20,out+28); - } + void firstpass_8(const data_t * restrict in, data_t * restrict out, size_t N, ffts_plan_t * restrict p) { __m128 r0_1,r2_3,r4_5,r6_7; - L_4_2(in+0,in+8,in+4,in+12,&r0_1,&r2_3,&r4_5,&r6_7); - K_N(VLIT4(0.70710678118654757273731092936941,0.70710678118654757273731092936941,1,1),VLIT4(0.70710678118654746171500846685376,-0.70710678118654746171500846685376,-0,0),&r0_1,&r2_3,&r4_5,&r6_7); + float *LUT8 = p->ws[0]; + L_4_2(in+0,in+8,in+4,in+12,&r0_1,&r2_3,&r4_5,&r6_7); + K_N(_mm_load_ps(LUT8),_mm_load_ps(LUT8+4),&r0_1,&r2_3,&r4_5,&r6_7); S_4(r0_1,r2_3,r4_5,r6_7,out+0,out+4,out+8,out+12); } void @@ -165,10 +171,13 @@ void ffts_execute(ffts_plan_t *p, const void * restrict in, void * restrict out, } -ffts_plan_t *ffts_init(size_t N) { +ffts_plan_t *ffts_init(size_t N, int sign) { ffts_plan_t *p = malloc(sizeof(ffts_plan_t)); size_t leafN = 16; size_t i; + + if(sign < 0) MULI_SIGN = _mm_set_ps(-0.0f, 0.0f, -0.0f, 0.0f); + else MULI_SIGN = _mm_set_ps(0.0f, -0.0f, 0.0f, -0.0f); if(N > 32) { init_offsets(p, N, leafN); @@ -178,25 +187,68 @@ ffts_plan_t *ffts_init(size_t N) { if(N == 64) p->firstpass = &firstpass_64; else if(__builtin_ctzl(N) & 1) p->firstpass = &firstpass_type_2; else p->firstpass = &firstpass_type_1; - /* LUTS */ - size_t n_luts = __builtin_ctzl(N/leafN); + LEAFLUT[0] = _mm_set_ps(0.70710678118654757273731092936941,0.70710678118654757273731092936941,0.70710678118654757273731092936941,0.70710678118654757273731092936941); + LEAFLUT[1] = _mm_set_ps(0.70710678118654746171500846685376,-0.70710678118654746171500846685376,0.70710678118654746171500846685376,-0.70710678118654746171500846685376); + LEAFLUT[2] = _mm_set_ps(0.92387953251128673848313610506011,0.92387953251128673848313610506011,0.92387953251128673848313610506011,0.92387953251128673848313610506011); + LEAFLUT[3] = _mm_set_ps(0.38268343236508978177923268049199,-0.38268343236508978177923268049199,0.38268343236508978177923268049199,-0.38268343236508978177923268049199); + LEAFLUT[4] = _mm_set_ps(0.38268343236508983729038391174981,0.38268343236508983729038391174981,0.38268343236508983729038391174981,0.38268343236508983729038391174981); + LEAFLUT[5] = _mm_set_ps(0.92387953251128673848313610506011,-0.92387953251128673848313610506011,0.92387953251128673848313610506011,-0.92387953251128673848313610506011); + + LEAFLUT[6] = _mm_set_ps(0.70710678118654757273731092936941,0.70710678118654757273731092936941,1,1); + LEAFLUT[7] = _mm_set_ps(0.70710678118654746171500846685376,-0.70710678118654746171500846685376,0,-0); + LEAFLUT[8] = _mm_set_ps(0.92387953251128673848313610506011,0.92387953251128673848313610506011,1,1); + LEAFLUT[9] = _mm_set_ps(0.38268343236508978177923268049199,-0.38268343236508978177923268049199,0,-0); + LEAFLUT[10] = _mm_set_ps(0.38268343236508983729038391174981,0.38268343236508983729038391174981,0.70710678118654757273731092936941,0.70710678118654757273731092936941); + LEAFLUT[11] = _mm_set_ps(0.92387953251128673848313610506011,-0.92387953251128673848313610506011,0.70710678118654746171500846685376,-0.70710678118654746171500846685376); + + if(sign > 0) { + LEAFLUT[1] = _mm_xor_ps(LEAFLUT[1], _mm_set_ps(-0.0f,-0.0f,-0.0f,-0.0f)); + LEAFLUT[3] = _mm_xor_ps(LEAFLUT[3], _mm_set_ps(-0.0f,-0.0f,-0.0f,-0.0f)); + LEAFLUT[5] = _mm_xor_ps(LEAFLUT[5], _mm_set_ps(-0.0f,-0.0f,-0.0f,-0.0f)); + LEAFLUT[7] = _mm_xor_ps(LEAFLUT[7], _mm_set_ps(-0.0f,-0.0f,-0.0f,-0.0f)); + LEAFLUT[9] = _mm_xor_ps(LEAFLUT[9], _mm_set_ps(-0.0f,-0.0f,-0.0f,-0.0f)); + LEAFLUT[11] = _mm_xor_ps(LEAFLUT[11], _mm_set_ps(-0.0f,-0.0f,-0.0f,-0.0f)); + } + p->i0 = N/leafN/3+1; p->i1 = N/leafN/3; if((N/leafN) % 3 > 1) p->i1++; p->i0/=2; p->i1/=2; + }else{ + p->transforms = malloc(2 * sizeof(transform_index_t)); + p->transforms[0] = 0; + p->transforms[1] = 1; + if(N == 2) p->firstpass = &firstpass_2; + else if(N == 4) p->firstpass = &firstpass_4; + else if(N == 8) p->firstpass = &firstpass_8; + else if(N == 16) p->firstpass = &firstpass_16; + else if(N == 32) p->firstpass = &firstpass_32; + + } + + int hardcoded = 0; - // printf("n_luts = %zu\n", n_luts); + /* LUTS */ + size_t n_luts = __builtin_ctzl(N/leafN); + if(N <= 32) { n_luts = __builtin_ctzl(N/4); hardcoded = 1; } + + + //printf("n_luts = %zu\n", n_luts); p->ws = malloc(n_luts * sizeof(data_t *)); cdata_t *w; int n = leafN*2; + if(hardcoded) n = 8; + for(i=0;i<n_luts;i++) { - // printf("LUT[%zu] = %d\n", i, n); - if(!i) { + + //printf("LUT[%zu] = %d\n", i, n); + + if(!i || hardcoded) { w = _mm_malloc(n/4 * 2 * sizeof(cdata_t), 32); @@ -216,15 +268,15 @@ ffts_plan_t *ffts_init(size_t N) { __m128 re, im; re = _mm_shuffle_ps(temp0, temp0, _MM_SHUFFLE(2, 2, 0, 0)); im = _mm_shuffle_ps(temp0, temp0, _MM_SHUFFLE(3, 3, 1, 1)); - im = _mm_xor_ps(im, _mm_set_ps(-0.0f, 0.0f, -0.0f, 0.0f)); + im = _mm_xor_ps(im, MULI_SIGN); _mm_store_ps(fw + j*4 , re); _mm_store_ps(fw + j*4+4, im); } - // for(j=0;j<n/2;j++) { - // printf("%f %f\n", creal(w[j]), cimag(w[j])); + // for(j=0;j<n/2;j++) { + // printf("%f %f\n", creal(w[j]), cimag(w[j])); - // } + // } _mm_free(w0); }else{ @@ -252,21 +304,21 @@ ffts_plan_t *ffts_init(size_t N) { temp0 = _mm_load_ps(fw0 + j*2); re = _mm_shuffle_ps(temp0, temp0, _MM_SHUFFLE(2, 2, 0, 0)); im = _mm_shuffle_ps(temp0, temp0, _MM_SHUFFLE(3, 3, 1, 1)); - im = _mm_xor_ps(im, _mm_set_ps(-0.0f, 0.0f, -0.0f, 0.0f)); + im = _mm_xor_ps(im, MULI_SIGN); _mm_store_ps(fw + j*2*6 , re); _mm_store_ps(fw + j*2*6+4, im); temp1 = _mm_load_ps(fw1 + j*2); re = _mm_shuffle_ps(temp1, temp1, _MM_SHUFFLE(2, 2, 0, 0)); im = _mm_shuffle_ps(temp1, temp1, _MM_SHUFFLE(3, 3, 1, 1)); - im = _mm_xor_ps(im, _mm_set_ps(-0.0f, 0.0f, -0.0f, 0.0f)); + im = _mm_xor_ps(im, MULI_SIGN); _mm_store_ps(fw + j*2*6+8 , re); _mm_store_ps(fw + j*2*6+12, im); temp2 = _mm_load_ps(fw2 + j*2); re = _mm_shuffle_ps(temp2, temp2, _MM_SHUFFLE(2, 2, 0, 0)); im = _mm_shuffle_ps(temp2, temp2, _MM_SHUFFLE(3, 3, 1, 1)); - im = _mm_xor_ps(im, _mm_set_ps(-0.0f, 0.0f, -0.0f, 0.0f)); + im = _mm_xor_ps(im, MULI_SIGN); _mm_store_ps(fw + j*2*6+16, re); _mm_store_ps(fw + j*2*6+20, im); } @@ -280,22 +332,11 @@ ffts_plan_t *ffts_init(size_t N) { n *= 2; } - p->n_bits = log(N)/log(2) - log(leafN*2)/log(2); - }else{ - p->transforms = malloc(2 * sizeof(transform_index_t)); - p->transforms[0] = 0; - p->transforms[1] = 1; - if(N == 2) p->firstpass = &firstpass_2; - else if(N == 4) p->firstpass = &firstpass_4; - else if(N == 8) p->firstpass = &firstpass_8; - else if(N == 16) p->firstpass = &firstpass_16; - else if(N == 32) p->firstpass = &firstpass_32; - - } + return p; } - +/* int main(int argc, char *argv[]) { int n = atoi(argv[1]); int count = atoi(argv[2]); @@ -341,4 +382,4 @@ int main(int argc, char *argv[]) { printf("Time: %f seconds, CTGs: %f Leaftime: %f \n", tt, ctgs, lt); return 0; -} +}*/ diff --git a/src/cp_sse.h b/src/cp_sse.h index c6d15dc..2c6825f 100644 --- a/src/cp_sse.h +++ b/src/cp_sse.h @@ -8,7 +8,6 @@ #include <stddef.h> #include <xmmintrin.h> #include <stdint.h> -#include <mach/mach_time.h> typedef complex float cdata_t; typedef float data_t; @@ -22,19 +21,11 @@ struct _ffts_plan_t { ptrdiff_t *offsets; void __attribute__ ((aligned(32))) **ws; void (*firstpass)(const float * restrict, float * restrict, size_t, struct _ffts_plan_t * restrict); - size_t i0, i1, i2; - uint64_t n_bits, leaftime; + size_t i0, i1; transform_index_t *transforms; }; - typedef struct _ffts_plan_t ffts_plan_t; - -typedef struct _split_vec_t { - __m128 re, im; -} split_vec_t; - - #endif diff --git a/src/macros.h b/src/macros.h index 0a84802..b2f44e6 100644 --- a/src/macros.h +++ b/src/macros.h @@ -7,8 +7,10 @@ #define VLIT4 _mm_set_ps +__m128 MULI_SIGN; + __INLINE __m128 IMULI(__m128 a) { - __m128 temp = _mm_xor_ps(a, _mm_set_ps(-0.0f, 0.0f, -0.0f, 0.0f)); + __m128 temp = _mm_xor_ps(a, MULI_SIGN);//_mm_set_ps(-0.0f, 0.0f, -0.0f, 0.0f)); return _mm_shuffle_ps(temp, temp, _MM_SHUFFLE(2,3,0,1)); } @@ -113,6 +115,8 @@ __INLINE void TX2(__m128 *a, __m128 *b) { *a = TX2_t0; *b = TX2_t1; } +__m128 __attribute__((aligned(32))) LEAFLUT[12]; + __INLINE void LEAF_EE(size_t ** restrict is, const data_t * restrict in, size_t ** restrict out_offsets, data_t * restrict out) { __m128 r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11,r12,r13,r14,r15; @@ -124,23 +128,17 @@ LEAF_EE(size_t ** restrict is, const data_t * restrict in, size_t ** restrict ou L_4(in+(*is)[0],in+(*is)[1],in+(*is)[2],in+(*is)[3],&r0,&r1,&r2,&r3); L_2(in+(*is)[4],in+(*is)[5],in+(*is)[6],in+(*is)[7],&r4,&r5,&r6,&r7); K_0(&r0,&r2,&r4,&r6); - K_N(_mm_set_ps(0.70710678118654757273731092936941,0.70710678118654757273731092936941,0.70710678118654757273731092936941,0.70710678118654757273731092936941),_mm_set_ps(0.70710678118654746171500846685376,-0.70710678118654746171500846685376,0.70710678118654746171500846685376,-0.70710678118654746171500846685376),&r1,&r3,&r5,&r7); + K_N(LEAFLUT[0],LEAFLUT[1],&r1,&r3,&r5,&r7); L_4(in+(*is)[8],in+(*is)[9],in+(*is)[10],in+(*is)[11],&r8,&r9,&r10,&r11); L_4(in+(*is)[12],in+(*is)[13],in+(*is)[14],in+(*is)[15],&r12,&r13,&r14,&r15); K_0(&r0,&r4,&r8,&r12); - K_N(_mm_set_ps(0.92387953251128673848313610506011,0.92387953251128673848313610506011,0.92387953251128673848313610506011,0.92387953251128673848313610506011),_mm_set_ps(0.38268343236508978177923268049199,-0.38268343236508978177923268049199,0.38268343236508978177923268049199,-0.38268343236508978177923268049199),&r1,&r5,&r9,&r13); - TX2(&r0,&r1); - TX2(&r4,&r5); - TX2(&r8,&r9); - TX2(&r12,&r13); + K_N(LEAFLUT[2],LEAFLUT[3],&r1,&r5,&r9,&r13); + TX2(&r0,&r1); TX2(&r4,&r5); TX2(&r8,&r9); TX2(&r12,&r13); S_4(r0,r4,r8,r12,out0+0,out0+8,out0+16,out0+24); S_4(r1,r5,r9,r13,out1+0,out1+8,out1+16,out1+24); - K_N(_mm_set_ps(0.70710678118654757273731092936941,0.70710678118654757273731092936941,0.70710678118654757273731092936941,0.70710678118654757273731092936941),_mm_set_ps(0.70710678118654746171500846685376,-0.70710678118654746171500846685376,0.70710678118654746171500846685376,-0.70710678118654746171500846685376),&r2,&r6,&r10,&r14); - K_N(_mm_set_ps(0.38268343236508983729038391174981,0.38268343236508983729038391174981,0.38268343236508983729038391174981,0.38268343236508983729038391174981),_mm_set_ps(0.92387953251128673848313610506011,-0.92387953251128673848313610506011,0.92387953251128673848313610506011,-0.92387953251128673848313610506011),&r3,&r7,&r11,&r15); - TX2(&r2,&r3); - TX2(&r6,&r7); - TX2(&r10,&r11); - TX2(&r14,&r15); + K_N(LEAFLUT[0],LEAFLUT[1],&r2,&r6,&r10,&r14); + K_N(LEAFLUT[4],LEAFLUT[5],&r3,&r7,&r11,&r15); + TX2(&r2,&r3); TX2(&r6,&r7); TX2(&r10,&r11); TX2(&r14,&r15); S_4(r2,r6,r10,r14,out0+4,out0+12,out0+20,out0+28); S_4(r3,r7,r11,r15,out1+4,out1+12,out1+20,out1+28); *is += 16; @@ -158,21 +156,15 @@ LEAF_OO(size_t ** restrict is, const data_t * restrict in, size_t ** restrict ou L_4(in+(*is)[0],in+(*is)[1],in+(*is)[2],in+(*is)[3],&r0,&r1,&r2,&r3); L_2(in+(*is)[4],in+(*is)[5],in+(*is)[6],in+(*is)[7],&r4,&r5,&r6,&r7); K_0(&r0,&r2,&r4,&r6); - K_N(_mm_set_ps(0.70710678118654757273731092936941,0.70710678118654757273731092936941,0.70710678118654757273731092936941,0.70710678118654757273731092936941),_mm_set_ps(0.70710678118654746171500846685376,-0.70710678118654746171500846685376,0.70710678118654746171500846685376,-0.70710678118654746171500846685376),&r1,&r3,&r5,&r7); - TX2(&r0,&r1); - TX2(&r2,&r3); - TX2(&r4,&r5); - TX2(&r6,&r7); + K_N(LEAFLUT[0],LEAFLUT[1],&r1,&r3,&r5,&r7); + TX2(&r0,&r1); TX2(&r2,&r3); TX2(&r4,&r5); TX2(&r6,&r7); S_4(r0,r2,r4,r6,out0+0,out0+4,out0+8,out0+12); S_4(r1,r3,r5,r7,out1+0,out1+4,out1+8,out1+12); L_4(in+(*is)[8],in+(*is)[9],in+(*is)[10],in+(*is)[11],&r8,&r9,&r10,&r11); L_2(in+(*is)[12],in+(*is)[13],in+(*is)[14],in+(*is)[15],&r12,&r13,&r14,&r15); K_0(&r8,&r10,&r12,&r14); - K_N(_mm_set_ps(0.70710678118654757273731092936941,0.70710678118654757273731092936941,0.70710678118654757273731092936941,0.70710678118654757273731092936941),_mm_set_ps(0.70710678118654746171500846685376,-0.70710678118654746171500846685376,0.70710678118654746171500846685376,-0.70710678118654746171500846685376),&r9,&r11,&r13,&r15); - TX2(&r8,&r9); - TX2(&r10,&r11); - TX2(&r12,&r13); - TX2(&r14,&r15); + K_N(LEAFLUT[0],LEAFLUT[1],&r9,&r11,&r13,&r15); + TX2(&r8,&r9); TX2(&r10,&r11); TX2(&r12,&r13); TX2(&r14,&r15); S_4(r8,r10,r12,r14,out0+16,out0+20,out0+24,out0+28); S_4(r9,r11,r13,r15,out1+16,out1+20,out1+24,out1+28); @@ -264,16 +256,16 @@ LEAF_OE(size_t ** restrict is, const data_t * restrict in, size_t ** restrict ou L_4_4(in+(*is)[0],in+(*is)[1],in+(*is)[2],in+(*is)[3],&r0_1,&r2_3,&r24_25,&r26_27); L_2_4(in+(*is)[4],in+(*is)[5],in+(*is)[6],in+(*is)[7],&r4_5,&r6_7,&r30_31,&r28_29); - K_N(_mm_set_ps(0.70710678118654757273731092936941,0.70710678118654757273731092936941,1,1),_mm_set_ps(0.70710678118654746171500846685376,-0.70710678118654746171500846685376,0,-0),&r0_1,&r2_3,&r4_5,&r6_7); + K_N(LEAFLUT[6],LEAFLUT[7],&r0_1,&r2_3,&r4_5,&r6_7); S_4(r0_1,r2_3,r4_5,r6_7,out0+0,out0+4,out0+8,out0+12); L_4_4(in+(*is)[8],in+(*is)[9],in+(*is)[10],in+(*is)[11],&r8_9,&r10_11,&r16_17,&r18_19); L_2_2(in+(*is)[12],in+(*is)[13],in+(*is)[14],in+(*is)[15],&r12_13,&r14_15,&r20_21,&r22_23); - K_N(_mm_set_ps(0.70710678118654757273731092936941,0.70710678118654757273731092936941,1,1),_mm_set_ps(0.70710678118654746171500846685376,-0.70710678118654746171500846685376,0,-0),&r8_9,&r10_11,&r12_13,&r14_15); + K_N(LEAFLUT[6],LEAFLUT[7],&r8_9,&r10_11,&r12_13,&r14_15); S_4(r8_9,r10_11,r12_13,r14_15,out0+16,out0+20,out0+24,out0+28); - K_N(_mm_set_ps(0.70710678118654757273731092936941,0.70710678118654757273731092936941,1,1),_mm_set_ps(0.70710678118654746171500846685376,-0.70710678118654746171500846685376,0,-0),&r16_17,&r18_19,&r20_21,&r22_23); - K_N(_mm_set_ps(0.92387953251128673848313610506011,0.92387953251128673848313610506011,1,1),_mm_set_ps(0.38268343236508978177923268049199,-0.38268343236508978177923268049199,0,-0),&r16_17,&r20_21,&r24_25,&r28_29); + K_N(LEAFLUT[6],LEAFLUT[7],&r16_17,&r18_19,&r20_21,&r22_23); + K_N(LEAFLUT[8],LEAFLUT[9],&r16_17,&r20_21,&r24_25,&r28_29); S_4(r16_17,r20_21,r24_25,r28_29,out1+0,out1+8,out1+16,out1+24); - K_N(_mm_set_ps(0.38268343236508983729038391174981,0.38268343236508983729038391174981,0.70710678118654757273731092936941,0.70710678118654757273731092936941),_mm_set_ps(0.92387953251128673848313610506011,-0.92387953251128673848313610506011,0.70710678118654746171500846685376,-0.70710678118654746171500846685376),&r18_19,&r22_23,&r26_27,&r30_31); + K_N(LEAFLUT[10],LEAFLUT[11],&r18_19,&r22_23,&r26_27,&r30_31); S_4(r18_19,r22_23,r26_27,r30_31,out1+4,out1+12,out1+20,out1+28); *is += 16; @@ -289,16 +281,16 @@ LEAF_EO(size_t ** restrict is, const data_t * restrict in, size_t ** restrict ou L_4_4(in+(*is)[0],in+(*is)[1],in+(*is)[2],in+(*is)[3],&r0_1,&r2_3,&r16_17,&r18_19); L_2_2(in+(*is)[4],in+(*is)[5],in+(*is)[6],in+(*is)[7],&r4_5,&r6_7,&r20_21,&r22_23); - K_N(_mm_set_ps(0.70710678118654757273731092936941,0.70710678118654757273731092936941,1,1),_mm_set_ps(0.70710678118654746171500846685376,-0.70710678118654746171500846685376,0,-0),&r0_1,&r2_3,&r4_5,&r6_7); + K_N(LEAFLUT[6],LEAFLUT[7],&r0_1,&r2_3,&r4_5,&r6_7); L_4_2(in+(*is)[8],in+(*is)[9],in+(*is)[10],in+(*is)[11],&r8_9,&r10_11,&r28_29,&r30_31); L_4_4(in+(*is)[12],in+(*is)[13],in+(*is)[14],in+(*is)[15],&r12_13,&r14_15,&r24_25,&r26_27); - K_N(_mm_set_ps(0.92387953251128673848313610506011,0.92387953251128673848313610506011,1,1),_mm_set_ps(0.38268343236508978177923268049199,-0.38268343236508978177923268049199,0,-0),&r0_1,&r4_5,&r8_9,&r12_13); + K_N(LEAFLUT[8],LEAFLUT[9],&r0_1,&r4_5,&r8_9,&r12_13); S_4(r0_1,r4_5,r8_9,r12_13,out0+0,out0+8,out0+16,out0+24); - K_N(_mm_set_ps(0.38268343236508983729038391174981,0.38268343236508983729038391174981,0.70710678118654757273731092936941,0.70710678118654757273731092936941),_mm_set_ps(0.92387953251128673848313610506011,-0.92387953251128673848313610506011,0.70710678118654746171500846685376,-0.70710678118654746171500846685376),&r2_3,&r6_7,&r10_11,&r14_15); + K_N(LEAFLUT[10],LEAFLUT[11],&r2_3,&r6_7,&r10_11,&r14_15); S_4(r2_3,r6_7,r10_11,r14_15,out0+4,out0+12,out0+20,out0+28); - K_N(_mm_set_ps(0.70710678118654757273731092936941,0.70710678118654757273731092936941,1,1),_mm_set_ps(0.70710678118654746171500846685376,-0.70710678118654746171500846685376,0,-0),&r16_17,&r18_19,&r20_21,&r22_23); + K_N(LEAFLUT[6],LEAFLUT[7],&r16_17,&r18_19,&r20_21,&r22_23); S_4(r16_17,r18_19,r20_21,r22_23,out1+0,out1+4,out1+8,out1+12); - K_N(_mm_set_ps(0.70710678118654757273731092936941,0.70710678118654757273731092936941,1,1),_mm_set_ps(0.70710678118654746171500846685376,-0.70710678118654746171500846685376,0,-0),&r24_25,&r26_27,&r28_29,&r30_31); + K_N(LEAFLUT[6],LEAFLUT[7],&r24_25,&r26_27,&r28_29,&r30_31); S_4(r24_25,r26_27,r28_29,r30_31,out1+16,out1+20,out1+24,out1+28); *is += 16; diff --git a/src/patterns.c b/src/patterns.c index 1ab593f..12d9a4c 100644 --- a/src/patterns.c +++ b/src/patterns.c @@ -66,7 +66,7 @@ void init_is(ffts_plan_t *p, int N, int leafN, int VL) { // if(i % 16 == 15) printf("\n"); //} - p->i0 = i0; p->i1 = i1; p->i2 = i2; + p->i0 = i0; p->i1 = i1; } void elaborate_offsets(ptrdiff_t *offsets, int leafN, int N, int ioffset, int ooffset, int stride, int even) { |