/*
 * FFTS plan construction (ffts_init), execution (ffts_execute), teardown
 * (ffts_free) and the leaf "first pass" routines for transforms with N > 32.
 *
 * NOTE(review): this copy of the file has lost its line breaks, and some text
 * inside ffts_init and the trailing commented-out main() looks truncated
 * (e.g. the loop headers that now read "for(i=0;iws = ..." and
 * "for(j=0;jws[i] = w;"). The code below is kept exactly as found; restore it
 * from version control before attempting to build or modify it.
 *
 * What the visible code does:
 *
 *   firstpass_type_1(in, out, p)
 *       Reads the loop counts p->i0 and p->i1 and the index tables p->offsets
 *       and p->is. Without __ARM_NEON__ it runs p->i0 LEAF_EE steps, p->i1
 *       LEAF_OO steps, a single LEAF_OE step, then p->i1 LEAF_EE steps. With
 *       __ARM_NEON__ it issues neon_shl8_ee / _oo / _oe / _ee calls in the
 *       same order, advancing offsets by 2 after every call; the order of the
 *       i0..i7 input pointers differs between the calls.
 *
 *   firstpass_type_2(in, out, p)
 *       Same inputs; the sequence is p->i0 EE steps, a single EO step, p->i1
 *       OO steps, then p->i1 EE steps.
 *
 *   firstpass_64(in, out, p)
 *       One LEAF_EE followed by one LEAF_OE. Its only visible reference is in
 *       a line-commented assignment inside ffts_init.
 *
 *   ffts_execute(p, in, out)
 *       Calls p->firstpass(in, out, p) and then p->transform(out, p->N, p->ws).
 *       The locals ps and leafN are referenced only by the block-commented
 *       loop that follows.
 *       NOTE(review): ps is initialised from p->transforms, which the N > 32
 *       branch of ffts_init does not visibly assign - confirm.
 *
 *   ffts_free(p)
 *       Releases p->ws with FFTS_FREE, p->is and p->offsets with free(), then
 *       p itself.
 *       NOTE(review): ffts_init mallocs p->ws_is and (for N <= 32)
 *       p->transforms, but no live free() for either is visible here; the
 *       free(p->transforms) call is line-commented. Looks like a leak - confirm.
 *
 *   ffts_init(N, sign)
 *       Allocates the plan with malloc and sets MULI_SIGN and SCALAR_MULI_SIGN
 *       according to whether sign < 0.
 *       For N > 32: calls ffts_init_offsets and ffts_init_is, selects
 *       firstpass_type_1 when __builtin_ctzl(N) is odd and firstpass_type_2
 *       otherwise, fills LEAFLUT[0..11], XORs LEAFLUT[1,3,5,7,9,11] with a
 *       vector of -0.0f when sign > 0, and sets
 *           p->i0 = (N/leafN/3 + 1) / 2
 *           p->i1 = (N/leafN/3 + ((N/leafN) % 3 > 1 ? 1 : 0)) / 2
 *       For N <= 32: allocates a two-entry p->transforms, selects one of the
 *       firstpass_2 / _4_f / _4_b / _8 / _16 / _32 routines, and sets p->is
 *       and p->offsets to NULL.
 *       It then sizes and fills the p->ws lookup tables (this part is garbled
 *       in this copy), stores N, lastlut and n_luts in the plan and, for
 *       N > 32, sets p->transform = ffts_generate_func_code(p, N, leafN).
 *       NOTE(review): the malloc results are used without a NULL check.
 *       NOTE(review): in the N <= 32 branch p->firstpass is left unassigned
 *       when N is not one of 2, 4, 8, 16, 32, or when N == 4 and sign is
 *       neither -1 nor 1 - confirm callers never pass such values.
 *       NOTE(review): for N == 2, N/4 is 0 and GCC documents __builtin_ctzl(0)
 *       as undefined; presumably the "n_luts >= 32" test is meant to catch
 *       this - verify.
 *       NOTE(review): the "LUT[%zu] = ..." fprintf passes the pointer w to a
 *       %08x conversion; looks like leftover debug output - confirm.
 */
#include "cp_sse.h" #include "macros.h" #include "patterns.h" __INLINE void firstpass_type_1(const float * restrict in, float * restrict out, ffts_plan_t * restrict p) { size_t i, ii0 = p->i0, ii1 = p->i1; size_t *offsets = (size_t *)p->offsets; size_t *is = (size_t *)p->is; #ifdef __ARM_NEON__ const data_t *i0=in+is[0],*i1=in+is[1],*i2=in+is[2],*i3=in+is[3],*i4=in+is[4],*i5=in+is[5],*i6=in+is[6],*i7=in+is[7]; for(i=ii0;i>0;--i) { neon_shl8_ee(out+offsets[0],out+offsets[1],&i0,&i1,&i2,&i3,&i4,&i5,&i6,&i7); offsets += 2; } for(i=ii1;i>0;--i) { neon_shl8_oo(out+offsets[0],out+offsets[1],&i0,&i1,&i2,&i3,&i6,&i7,&i4,&i5); offsets += 2; } neon_shl8_oe(out+offsets[0],out+offsets[1],&i0,&i1,&i2,&i3,&i6,&i7,&i4,&i5); offsets += 2; for(i=ii1;i>0;--i) { neon_shl8_ee(out+offsets[0],out+offsets[1],&i6,&i7,&i4,&i5,&i0,&i1,&i3,&i2); offsets += 2; } #else for(i=ii0;i>0;--i) LEAF_EE(&is, in, &offsets, out); for(i=ii1;i>0;--i) LEAF_OO(&is, in, &offsets, out); LEAF_OE(&is, in, &offsets, out); for(i=ii1;i>0;--i) LEAF_EE(&is, in, &offsets, out); #endif } __INLINE void firstpass_type_2(const float * restrict in, float * restrict out, ffts_plan_t * restrict p) { size_t i, ii0 = p->i0, ii1 = p->i1; size_t *offsets = (size_t *)p->offsets; size_t *is = (size_t *)p->is; #ifdef __ARM_NEON__ const data_t *i0=in+is[0],*i1=in+is[1],*i2=in+is[2],*i3=in+is[3],*i4=in+is[4],*i5=in+is[5],*i6=in+is[6],*i7=in+is[7]; for(i=ii0;i>0;--i) { neon_shl8_ee(out+offsets[0],out+offsets[1],&i0,&i1,&i2,&i3,&i4,&i5,&i6,&i7); offsets+=2; } neon_shl8_eo(out+offsets[0],out+offsets[1],&i0,&i1,&i2,&i3,&i4,&i5,&i6,&i7); offsets += 2; for(i=ii1;i>0;--i) { neon_shl8_oo(out+offsets[0],out+offsets[1],&i0,&i1,&i2,&i3,&i6,&i7,&i4,&i5); offsets += 2; } for(i=ii1;i>0;--i) { neon_shl8_ee(out+offsets[0],out+offsets[1],&i6,&i7,&i4,&i5,&i0,&i1,&i3,&i2); offsets += 2; } #else for(i=ii0;i>0;--i) LEAF_EE(&is, in, &offsets, out); LEAF_EO(&is, in, &offsets, out); for(i=ii1;i>0;--i) LEAF_OO(&is, in, &offsets, out); for(i=ii1;i>0;--i) 
LEAF_EE(&is, in, &offsets, out); #endif } __INLINE void firstpass_64(const float * restrict in, float * restrict out, ffts_plan_t * restrict p) { size_t *offsets = (size_t *)p->offsets; size_t *is = (size_t *)p->is; LEAF_EE(&is, in, &offsets, out); LEAF_OE(&is, in, &offsets, out); } void ffts_execute(ffts_plan_t *p, const void * restrict in, void * restrict out) { transform_index_t *ps = p->transforms; int leafN = 8; p->firstpass((const float *)in, (float *)out, p); p->transform(out, p->N, p->ws); /* size_t ps0_next = ps[0]; while(ps0_next) { size_t ps0 = ps0_next; size_t ps1 = ps[1]; ps0_next = ps[2]; ps += 2; if(ps0 == 2*leafN) { float *LUT = (float *)p->ws[0]; float *data = (float *)(out) + ps1; #ifdef __ARM_NEON__ X_4_SPLIT(data, 16, LUT); #else X_4(data, 16, LUT); #endif }else{ int index = __builtin_ctzl(ps0)-4; float *LUT = (float *)p->ws[__builtin_ctzl(ps0)-4]; #ifdef __ARM_NEON__ X_8_SPLIT(((float *)out) + ps1, ps0, LUT); #else X_8(((float *)out) + ps1, ps0, LUT); #endif } } #ifdef __ARM_NEON__ if(p->N>32) X_8_SPLIT_T((float *)out, p->N, p->lastlut); #endif */ } void ffts_free(ffts_plan_t *p) { size_t i; if(p->ws) { // for(i=0;in_luts;i++) { // FFTS_FREE(p->ws[i]); // } FFTS_FREE(p->ws); } if(p->is) free(p->is); if(p->offsets) free(p->offsets); //free(p->transforms); free(p); } ffts_plan_t *ffts_init(size_t N, int sign) { ffts_plan_t *p = malloc(sizeof(ffts_plan_t)); size_t leafN = 8; size_t i; if(sign < 0) MULI_SIGN = VLIT4(-0.0f, 0.0f, -0.0f, 0.0f); else MULI_SIGN = VLIT4(0.0f, -0.0f, 0.0f, -0.0f); if(sign < 0) SCALAR_MULI_SIGN = -0.0f*I; else SCALAR_MULI_SIGN = -0.0f; p->transform = NULL; if(N > 32) { ffts_init_offsets(p, N, leafN); ffts_init_is(p, N, leafN, 2); // ffts_init_tree(p, N, leafN); // if(N == 64) p->firstpass = &firstpass_64; if(__builtin_ctzl(N) & 1) p->firstpass = &firstpass_type_1; else p->firstpass = &firstpass_type_2; LEAFLUT[0] = 
VLIT4(0.70710678118654757273731092936941,0.70710678118654757273731092936941,0.70710678118654757273731092936941,0.70710678118654757273731092936941); LEAFLUT[1] = VLIT4(0.70710678118654746171500846685376,-0.70710678118654746171500846685376,0.70710678118654746171500846685376,-0.70710678118654746171500846685376); LEAFLUT[2] = VLIT4(0.92387953251128673848313610506011,0.92387953251128673848313610506011,0.92387953251128673848313610506011,0.92387953251128673848313610506011); LEAFLUT[3] = VLIT4(0.38268343236508978177923268049199,-0.38268343236508978177923268049199,0.38268343236508978177923268049199,-0.38268343236508978177923268049199); LEAFLUT[4] = VLIT4(0.38268343236508983729038391174981,0.38268343236508983729038391174981,0.38268343236508983729038391174981,0.38268343236508983729038391174981); LEAFLUT[5] = VLIT4(0.92387953251128673848313610506011,-0.92387953251128673848313610506011,0.92387953251128673848313610506011,-0.92387953251128673848313610506011); LEAFLUT[6] = VLIT4(0.70710678118654757273731092936941,0.70710678118654757273731092936941,1,1); LEAFLUT[7] = VLIT4(0.70710678118654746171500846685376,-0.70710678118654746171500846685376,0,-0); LEAFLUT[8] = VLIT4(0.92387953251128673848313610506011,0.92387953251128673848313610506011,1,1); LEAFLUT[9] = VLIT4(0.38268343236508978177923268049199,-0.38268343236508978177923268049199,0,-0); LEAFLUT[10] = VLIT4(0.38268343236508983729038391174981,0.38268343236508983729038391174981,0.70710678118654757273731092936941,0.70710678118654757273731092936941); LEAFLUT[11] = VLIT4(0.92387953251128673848313610506011,-0.92387953251128673848313610506011,0.70710678118654746171500846685376,-0.70710678118654746171500846685376); if(sign > 0) { V neg = VLIT4(-0.0f, -0.0f, -0.0f, -0.0f); LEAFLUT[1] = VXOR(LEAFLUT[1], neg); LEAFLUT[3] = VXOR(LEAFLUT[3], neg); LEAFLUT[5] = VXOR(LEAFLUT[5], neg); LEAFLUT[7] = VXOR(LEAFLUT[7], neg); LEAFLUT[9] = VXOR(LEAFLUT[9], neg); LEAFLUT[11] = VXOR(LEAFLUT[11], neg); } p->i0 = N/leafN/3+1; p->i1 = N/leafN/3; if((N/leafN) 
% 3 > 1) p->i1++; p->i0/=2; p->i1/=2; }else{ p->transforms = malloc(2 * sizeof(transform_index_t)); p->transforms[0] = 0; p->transforms[1] = 1; if(N == 2) p->firstpass = &firstpass_2; else if(N == 4 && sign == -1) p->firstpass = &firstpass_4_f; else if(N == 4 && sign == 1) p->firstpass = &firstpass_4_b; else if(N == 8) p->firstpass = &firstpass_8; else if(N == 16) p->firstpass = &firstpass_16; else if(N == 32) p->firstpass = &firstpass_32; p->is = NULL; p->offsets = NULL; } int hardcoded = 0; /* LUTS */ size_t n_luts = __builtin_ctzl(N/leafN); if(N <= 32) { n_luts = __builtin_ctzl(N/4); hardcoded = 1; } if(n_luts >= 32) n_luts = 0; // fprintf(stderr, "n_luts = %zu\n", n_luts); cdata_t *w; int n = leafN*2; if(hardcoded) n = 8; size_t lut_size = 0; for(i=0;iws = FFTS_MALLOC(lut_size,32); p->ws_is = malloc(n_luts * sizeof(size_t)); }else{ p->ws = NULL; p->ws_is = NULL; } w = p->ws; n = leafN*2; for(i=0;iws_is[i] = w - (cdata_t *)p->ws; fprintf(stderr, "LUT[%zu] = %d @ %08x - %zu\n", i, n, w, p->ws_is[i]); if(!i || hardcoded) { cdata_t *w0 = FFTS_MALLOC(n/4 * sizeof(cdata_t), 32); size_t j; for(j=0;jws[i] = w; n *= 2; } float *tmp = (float *)p->ws; for(i=0;iN = N; p->lastlut = w; p->n_luts = n_luts; if(N>32) p->transform = ffts_generate_func_code(p, N, leafN); // fprintf(stderr, "sizeof(size_t) == %lu\n", sizeof(size_t)); return p; } /* int main(int argc, char *argv[]) { int n = atoi(argv[1]); int count = atoi(argv[2]); ffts_plan_t *p = ffts_init(n); cdata_t *in = FFTS_MALLOC(n * sizeof(cdata_t), 32); cdata_t *out = FFTS_MALLOC(n * sizeof(cdata_t), 32); size_t i; for(i=0;ileaftime = 0; if(count>1){ for(i=0;ileaftime = 0; uint64_t start = mach_absolute_time(); for(i=0;ileaftime); // lt /= (double) count; for(i=0;i