diff options
-rw-r--r-- | src/codegen.c | 1 | ||||
-rw-r--r-- | src/cp_sse.c | 54 | ||||
-rw-r--r-- | src/cp_sse.h | 2 | ||||
-rw-r--r-- | src/macros.h | 2 |
4 files changed, 13 insertions, 46 deletions
diff --git a/src/codegen.c b/src/codegen.c index ab1f87a..8593f12 100644 --- a/src/codegen.c +++ b/src/codegen.c @@ -134,6 +134,7 @@ transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN) uint32_t *func = valloc(8192); uint32_t *fp = func; + //p->transform_base = func; uint32_t *x_8_addr = fp; memcpy(fp, neon_x8, neon_x8_t - neon_x8); diff --git a/src/cp_sse.c b/src/cp_sse.c index b35605c..1356c7b 100644 --- a/src/cp_sse.c +++ b/src/cp_sse.c @@ -73,45 +73,8 @@ firstpass_64(const float * restrict in, float * restrict out, ffts_plan_t * rest void ffts_execute(ffts_plan_t *p, const void * restrict in, void * restrict out) { transform_index_t *ps = p->transforms; - int leafN = 8; p->firstpass((const float *)in, (float *)out, p); - p->transform(out, p->N, p->ws); - - /* - size_t ps0_next = ps[0]; - while(ps0_next) { - size_t ps0 = ps0_next; - size_t ps1 = ps[1]; - ps0_next = ps[2]; - ps += 2; - - if(ps0 == 2*leafN) { - float *LUT = (float *)p->ws[0]; - float *data = (float *)(out) + ps1; - #ifdef __ARM_NEON__ - X_4_SPLIT(data, 16, LUT); - #else - X_4(data, 16, LUT); - #endif - - }else{ - int index = __builtin_ctzl(ps0)-4; - float *LUT = (float *)p->ws[__builtin_ctzl(ps0)-4]; - #ifdef __ARM_NEON__ - X_8_SPLIT(((float *)out) + ps1, ps0, LUT); - #else - X_8(((float *)out) + ps1, ps0, LUT); - #endif - } - - - } - - #ifdef __ARM_NEON__ - if(p->N>32) - X_8_SPLIT_T((float *)out, p->N, p->lastlut); - #endif -*/ + if(p->transform) p->transform(out, p->N, p->ws); } void ffts_free(ffts_plan_t *p) { @@ -119,15 +82,14 @@ void ffts_free(ffts_plan_t *p) { size_t i; if(p->ws) { -// for(i=0;i<p->n_luts;i++) { -// FFTS_FREE(p->ws[i]); -// } FFTS_FREE(p->ws); } if(p->is) free(p->is); if(p->offsets) free(p->offsets); //free(p->transforms); +// if(p->transform_base) free(p->transform_base); + free(p); } @@ -143,8 +105,9 @@ ffts_plan_t *ffts_init(size_t N, int sign) { else SCALAR_MULI_SIGN = -0.0f; p->transform = NULL; + p->transform_base = NULL; - if(N > 32) { + if(N >= 32) { ffts_init_offsets(p, N, leafN); ffts_init_is(p, N, leafN, 2); // ffts_init_tree(p, N, leafN); @@ -203,7 +166,7 @@ ffts_plan_t *ffts_init(size_t N, int sign) { /* LUTS */ size_t n_luts = __builtin_ctzl(N/leafN); - if(N <= 32) { n_luts = __builtin_ctzl(N/4); hardcoded = 1; } + if(N < 32) { n_luts = __builtin_ctzl(N/4); hardcoded = 1; } if(n_luts >= 32) n_luts = 0; @@ -245,6 +208,9 @@ ffts_plan_t *ffts_init(size_t N, int sign) { w = p->ws; n = leafN*2; + if(hardcoded) n = 8; + + for(i=0;i<n_luts;i++) { p->ws_is[i] = w - (cdata_t *)p->ws; fprintf(stderr, "LUT[%zu] = %d @ %08x - %zu\n", i, n, w, p->ws_is[i]); @@ -391,7 +357,7 @@ ffts_plan_t *ffts_init(size_t N, int sign) { p->N = N; p->lastlut = w; p->n_luts = n_luts; - if(N>32) + if(N>=32) p->transform = ffts_generate_func_code(p, N, leafN); // fprintf(stderr, "sizeof(size_t) == %lu\n", sizeof(size_t)); diff --git a/src/cp_sse.h b/src/cp_sse.h index d692fc9..6f793d8 100644 --- a/src/cp_sse.h +++ b/src/cp_sse.h @@ -30,7 +30,7 @@ struct _ffts_plan_t { size_t N; void *lastlut; transform_index_t *transforms; - transform_func_t transform; + transform_func_t transform, transform_base; }; typedef struct _ffts_plan_t ffts_plan_t; diff --git a/src/macros.h b/src/macros.h index 8dc2a8a..fcb2258 100644 --- a/src/macros.h +++ b/src/macros.h @@ -442,7 +442,7 @@ __INLINE void firstpass_16(const data_t * restrict in, data_t * restrict out, ffts_plan_t * restrict p) { V r0_1,r2_3,r4_5,r6_7,r8_9,r10_11,r12_13,r14_15; float *LUT8 = p->ws + p->ws_is[0]; - float *LUT16 = p->ws + p->ws_is[1]; + float *LUT16 = ((float *)p->ws) + 8;//(p->ws_is[1]*4); L_4_4(in+0,in+16,in+8,in+24,&r0_1,&r2_3,&r8_9,&r10_11); L_2_4(in+4,in+20,in+28,in+12,&r4_5,&r6_7,&r14_15,&r12_13); |