/*
 * FFTS plan lifecycle: ffts_init() builds a plan, ffts_execute() runs it and
 * ffts_free() releases it.
 *
 * NOTE(review): this text appears to have lost its line breaks and some spans
 * that started with a less-than sign: the bare #include directives carry no
 * header name, and several "for(i=0;i..." loops have no condition or body.
 * As laid out here, every line comment also swallows the rest of its physical
 * line. Restore the file from version control before relying on the details
 * below; the notes describe only what is still visible.
 *
 * ffts_execute(p, in, out)
 *   Calls p->transform(p, (const float *)in, (float *)out).
 *   p->transform is not checked for NULL before the call.
 *   The local ps is loaded from p->transforms but never used.
 *
 * ffts_free(p)
 *   Releases p->ws through FFTS_FREE, then p->is, p->offsets and
 *   p->transforms through free(). If p->transform_base is set, the region of
 *   p->transform_size bytes is switched to PROT_READ | PROT_WRITE with
 *   mprotect() (perror() and exit(errno) on failure) and then passed to
 *   free(). Finally frees p itself. The local i is unused.
 *   NOTE(review): p->ws_is, which ffts_init() obtains from malloc(), is not
 *   released here - looks like a leak; confirm no other owner frees it.
 *   NOTE(review): transform_base is handed to free(), so it presumably comes
 *   from a malloc-family allocator rather than mmap(); verify that calling
 *   mprotect() on such memory is supported on the target platforms.
 *
 * ffts_init(N, sign)
 *   Allocates the plan with malloc() (result not checked), assigns MULI_SIGN
 *   and SCALAR_MULI_SIGN (defined outside this view) according to sign < 0,
 *   and sets transform, transform_base, transforms, is, ws and offsets to
 *   NULL. leafN is fixed at 8.
 *
 *   N >= 32: calls ffts_init_offsets() and ffts_init_is(), fills
 *   LEAFLUT[0..11] with constants, and when sign > 0 applies VXOR with a
 *   vector of -0.0f to the odd-indexed entries (presumably a sign flip;
 *   VXOR is defined elsewhere). Then derives p->i0 and p->i1 from N/leafN.
 *
 *   N < 32: allocates a two-entry p->transforms ({0, 1}) and picks a
 *   firstpass_* routine from N (and from sign when N == 4).
 *   NOTE(review): the "N == 32" test in this branch can never be true,
 *   because the branch is only entered when N < 32.
 *   NOTE(review): p->transform stays NULL when N == 4 and sign is neither
 *   1 nor -1, or when N is not one of 2, 4, 8, 16.
 *
 *   Lookup tables ("LUTS"): n_luts is __builtin_ctzl(N/leafN), or
 *   __builtin_ctzl(N/4) with hardcoded = 1 when N < 32; values >= 32 are
 *   reset to 0. When n_luts is non-zero, p->ws comes from
 *   FFTS_MALLOC(lut_size, 32) and p->ws_is from malloc(); otherwise both are
 *   NULL. The loops that compute lut_size and fill the tables are partly
 *   missing from this text. The local tmp is assigned but not visibly used.
 *   NOTE(review): for N == 2 the argument N/4 is 0, and the result of
 *   __builtin_ctzl(0) is undefined; the ">= 32" reset looks like an attempt
 *   to cover that case - confirm.
 *
 *   Finally stores N, lastlut and n_luts in the plan, calls
 *   ffts_generate_func_code() when N >= 32, and returns the plan.
 *
 * A disabled benchmark main() follows in a block comment; it is cut off at
 * the end of this view and calls ffts_init() with a single argument.
 */
#include "cp_sse.h" #include "macros.h" //#include "mini_macros.h" #include "patterns.h" #include #include #include #include #include /* for PAGESIZE */ void ffts_execute(ffts_plan_t *p, const void * restrict in, void * restrict out) { transform_index_t *ps = p->transforms; //p->firstpass((const float *)in, (float *)out, p); p->transform(p, (const float *)in, (float *)out); //if(p->transform) p->transform(out, p->N, p->ws); } void ffts_free(ffts_plan_t *p) { size_t i; if(p->ws) { FFTS_FREE(p->ws); } if(p->is) free(p->is); if(p->offsets) free(p->offsets); //free(p->transforms); if(p->transforms) free(p->transforms); if(p->transform_base) { if (mprotect(p->transform_base, p->transform_size, PROT_READ | PROT_WRITE)) { perror("Couldn't mprotect"); exit(errno); } free(p->transform_base); } free(p); } ffts_plan_t *ffts_init(size_t N, int sign) { ffts_plan_t *p = malloc(sizeof(ffts_plan_t)); size_t leafN = 8; size_t i; if(sign < 0) MULI_SIGN = VLIT4(-0.0f, 0.0f, -0.0f, 0.0f); else MULI_SIGN = VLIT4(0.0f, -0.0f, 0.0f, -0.0f); if(sign < 0) SCALAR_MULI_SIGN = -0.0f*I; else SCALAR_MULI_SIGN = -0.0f; p->transform = NULL; p->transform_base = NULL; p->transforms = NULL; p->is = NULL; p->ws = NULL; p->offsets = NULL; if(N >= 32) { ffts_init_offsets(p, N, leafN); ffts_init_is(p, N, leafN, 2); // ffts_init_tree(p, N, leafN); // if(N == 64) p->firstpass = &firstpass_64; LEAFLUT[0] = VLIT4(0.70710678118654757273731092936941,0.70710678118654757273731092936941,0.70710678118654757273731092936941,0.70710678118654757273731092936941); LEAFLUT[1] = VLIT4(0.70710678118654746171500846685376,-0.70710678118654746171500846685376,0.70710678118654746171500846685376,-0.70710678118654746171500846685376); LEAFLUT[2] = VLIT4(0.92387953251128673848313610506011,0.92387953251128673848313610506011,0.92387953251128673848313610506011,0.92387953251128673848313610506011); LEAFLUT[3] = 
VLIT4(0.38268343236508978177923268049199,-0.38268343236508978177923268049199,0.38268343236508978177923268049199,-0.38268343236508978177923268049199); LEAFLUT[4] = VLIT4(0.38268343236508983729038391174981,0.38268343236508983729038391174981,0.38268343236508983729038391174981,0.38268343236508983729038391174981); LEAFLUT[5] = VLIT4(0.92387953251128673848313610506011,-0.92387953251128673848313610506011,0.92387953251128673848313610506011,-0.92387953251128673848313610506011); LEAFLUT[6] = VLIT4(0.70710678118654757273731092936941,0.70710678118654757273731092936941,1,1); LEAFLUT[7] = VLIT4(0.70710678118654746171500846685376,-0.70710678118654746171500846685376,0,-0); LEAFLUT[8] = VLIT4(0.92387953251128673848313610506011,0.92387953251128673848313610506011,1,1); LEAFLUT[9] = VLIT4(0.38268343236508978177923268049199,-0.38268343236508978177923268049199,0,-0); LEAFLUT[10] = VLIT4(0.38268343236508983729038391174981,0.38268343236508983729038391174981,0.70710678118654757273731092936941,0.70710678118654757273731092936941); LEAFLUT[11] = VLIT4(0.92387953251128673848313610506011,-0.92387953251128673848313610506011,0.70710678118654746171500846685376,-0.70710678118654746171500846685376); if(sign > 0) { V neg = VLIT4(-0.0f, -0.0f, -0.0f, -0.0f); LEAFLUT[1] = VXOR(LEAFLUT[1], neg); LEAFLUT[3] = VXOR(LEAFLUT[3], neg); LEAFLUT[5] = VXOR(LEAFLUT[5], neg); LEAFLUT[7] = VXOR(LEAFLUT[7], neg); LEAFLUT[9] = VXOR(LEAFLUT[9], neg); LEAFLUT[11] = VXOR(LEAFLUT[11], neg); } p->i0 = N/leafN/3+1; p->i1 = N/leafN/3; if((N/leafN) % 3 > 1) p->i1++; p->i0/=2; p->i1/=2; }else{ p->transforms = malloc(2 * sizeof(transform_index_t)); p->transforms[0] = 0; p->transforms[1] = 1; if(N == 2) p->transform = &firstpass_2; else if(N == 4 && sign == -1) p->transform = &firstpass_4_f; else if(N == 4 && sign == 1) p->transform = &firstpass_4_b; else if(N == 8) p->transform = &firstpass_8; else if(N == 16) p->transform = &firstpass_16; else if(N == 32) p->transform = &firstpass_32; p->is = NULL; p->offsets = NULL; } int 
hardcoded = 0; /* LUTS */ size_t n_luts = __builtin_ctzl(N/leafN); if(N < 32) { n_luts = __builtin_ctzl(N/4); hardcoded = 1; } if(n_luts >= 32) n_luts = 0; // fprintf(stderr, "n_luts = %zu\n", n_luts); cdata_t *w; int n = leafN*2; if(hardcoded) n = 8; size_t lut_size = 0; for(i=0;iws = FFTS_MALLOC(lut_size,32); p->ws_is = malloc(n_luts * sizeof(size_t)); }else{ p->ws = NULL; p->ws_is = NULL; } w = p->ws; n = leafN*2; if(hardcoded) n = 8; for(i=0;iws_is[i] = w - (cdata_t *)p->ws; //fprintf(stderr, "LUT[%zu] = %d @ %08x - %zu\n", i, n, w, p->ws_is[i]); if(!i || hardcoded) { cdata_t *w0 = FFTS_MALLOC(n/4 * sizeof(cdata_t), 32); size_t j; for(j=0;jws[i] = w; n *= 2; } float *tmp = (float *)p->ws; //for(i=0;iN = N; p->lastlut = w; p->n_luts = n_luts; if(N>=32) ffts_generate_func_code(p, N, leafN); // fprintf(stderr, "sizeof(size_t) == %lu\n", sizeof(size_t)); return p; } /* int main(int argc, char *argv[]) { int n = atoi(argv[1]); int count = atoi(argv[2]); ffts_plan_t *p = ffts_init(n); cdata_t *in = FFTS_MALLOC(n * sizeof(cdata_t), 32); cdata_t *out = FFTS_MALLOC(n * sizeof(cdata_t), 32); size_t i; for(i=0;ileaftime = 0; if(count>1){ for(i=0;ileaftime = 0; uint64_t start = mach_absolute_time(); for(i=0;ileaftime); // lt /= (double) count; for(i=0;i