diff options
author | Jukka Ojanen <jukka.ojanen@linkotec.net> | 2014-10-29 16:13:33 +0200 |
---|---|---|
committer | Jukka Ojanen <jukka.ojanen@linkotec.net> | 2014-10-29 16:13:33 +0200 |
commit | c602cee1b51e8c532e4817d41d973deea8ab257a (patch) | |
tree | c70cbd83bc8aa2bc44ee99bd44a6f69979b22668 /src/ffts_real_nd.c | |
parent | cf01293c196926d9bfccc84bc050682240feae35 (diff) | |
download | ffts-c602cee1b51e8c532e4817d41d973deea8ab257a.zip ffts-c602cee1b51e8c532e4817d41d973deea8ab257a.tar.gz |
Cleaning and reorganizing
Diffstat (limited to 'src/ffts_real_nd.c')
-rw-r--r-- | src/ffts_real_nd.c | 432 |
1 files changed, 269 insertions, 163 deletions
diff --git a/src/ffts_real_nd.c b/src/ffts_real_nd.c index 151b72a..05bcc9c 100644 --- a/src/ffts_real_nd.c +++ b/src/ffts_real_nd.c @@ -32,199 +32,305 @@ */ #include "ffts_real_nd.h" +#include "ffts_real.h" #ifdef __ARM_NEON__ #include "neon.h" #endif -void ffts_free_nd_real(ffts_plan_t *p) { +#ifdef HAVE_NEON +#include <arm_neon.h> +#endif + +#ifdef HAVE_SSE +#include <xmmintrin.h> +#endif + +#include <stdio.h> + +static void ffts_free_nd_real(ffts_plan_t *p) +{ + if (p->plans) { + int i; + + for (i = 0; i < p->rank; i++) { + ffts_plan_t *plan = p->plans[i]; + + if (plan) { + int j; + + for (j = i + 1; j < p->rank; j++) { + if (plan == p->plans[j]) { + p->plans[j] = NULL; + } + } - int i; - for(i=0;i<p->rank;i++) { + ffts_free(plan); + } + } - ffts_plan_t *x = p->plans[i]; + free(p->plans); + } - int k; - for(k=i+1;k<p->rank;k++) { - if(x == p->plans[k]) p->plans[k] = NULL; - } + if (p->transpose_buf) { + ffts_aligned_free(p->transpose_buf); + } - if(x) ffts_free(x); - } + if (p->buf) { + ffts_aligned_free(p->buf); + } - free(p->Ns); - free(p->Ms); - free(p->plans); - free(p->buf); - free(p->transpose_buf); - free(p); + if (p->Ns) { + free(p->Ns); + } + + if (p->Ms) { + free(p->Ms); + } + + free(p); } -void ffts_scalar_transpose(uint64_t *src, uint64_t *dst, int w, int h, uint64_t *buf) { - int const bw = 1; - int const bh = 8; - int i = 0, j = 0; - for (; i <= h-bh; i += bh) { - for (j = 0; j <= w-bw; j += bw) { - uint64_t const * ib = &src[w*i + j]; - uint64_t * ob = &dst[h*j + i]; - - uint64_t s_0_0 = ib[0*w+0]; - uint64_t s_1_0 = ib[1*w+0]; - uint64_t s_2_0 = ib[2*w+0]; - uint64_t s_3_0 = ib[3*w+0]; - uint64_t s_4_0 = ib[4*w+0]; - uint64_t s_5_0 = ib[5*w+0]; - uint64_t s_6_0 = ib[6*w+0]; - uint64_t s_7_0 = ib[7*w+0]; - - ob[0*h+0] = s_0_0; - ob[0*h+1] = s_1_0; - ob[0*h+2] = s_2_0; - ob[0*h+3] = s_3_0; - ob[0*h+4] = s_4_0; - ob[0*h+5] = s_5_0; - ob[0*h+6] = s_6_0; - ob[0*h+7] = s_7_0; - } - } - if (i < h) { - for (int i1 = 0; i1 < w; i1++) { - for (int j = i; j < h; j++) { - dst[i1*h + j] = src[j*w + i1]; - } - } - } - if (j < w) { - for (int i = j; i < w; i++) { - for (int j1 = 0; j1 < h; j1++) { - dst[i*h + j1] = src[j1*w + i]; - } - } - } +static void ffts_scalar_transpose(uint64_t *src, uint64_t *dst, int w, int h, uint64_t *buf) +{ + const int bw = 1; + const int bh = 8; + int i = 0, j = 0; + + for (; i <= h - bh; i += bh) { + for (j = 0; j <= w - bw; j += bw) { + uint64_t const *ib = &src[w*i + j]; + uint64_t *ob = &dst[h*j + i]; + + uint64_t s_0_0 = ib[0*w + 0]; + uint64_t s_1_0 = ib[1*w + 0]; + uint64_t s_2_0 = ib[2*w + 0]; + uint64_t s_3_0 = ib[3*w + 0]; + uint64_t s_4_0 = ib[4*w + 0]; + uint64_t s_5_0 = ib[5*w + 0]; + uint64_t s_6_0 = ib[6*w + 0]; + uint64_t s_7_0 = ib[7*w + 0]; + + ob[0*h + 0] = s_0_0; + ob[0*h + 1] = s_1_0; + ob[0*h + 2] = s_2_0; + ob[0*h + 3] = s_3_0; + ob[0*h + 4] = s_4_0; + ob[0*h + 5] = s_5_0; + ob[0*h + 6] = s_6_0; + ob[0*h + 7] = s_7_0; + } + } + + if (i < h) { + int i1; + + for (i1 = 0; i1 < w; i1++) { + for (j = i; j < h; j++) { + dst[i1*h + j] = src[j*w + i1]; + } + } + } + + if (j < w) { + int j1; + + for (i = j; i < w; i++) { + for (j1 = 0; j1 < h; j1++) { + dst[i*h + j1] = src[j1*w + i]; + } + } + } } -void ffts_execute_nd_real(ffts_plan_t *p, const void * in, void * out) { +static void ffts_execute_nd_real(ffts_plan_t *p, const void *in, void *out) +{ + const size_t Ms0 = p->Ms[0]; + const size_t Ns0 = p->Ns[0]; + + uint32_t *din = (uint32_t*) in; + uint64_t *buf = p->buf; + uint64_t *dout = (uint64_t*) out; + uint64_t *transpose_buf = (uint64_t*) p->transpose_buf; + + ffts_plan_t *plan; + size_t i, j; - uint32_t *din = (uint32_t *)in; - uint64_t *buf = p->buf; - uint64_t *dout = (uint64_t *)out; + plan = p->plans[0]; + for (i = 0; i < Ns0; i++) { + plan->transform(plan, din + (i * Ms0), buf + (i * (Ms0 / 2 + 1))); + } - size_t i,j; - for(i=0;i<p->Ns[0];i++) { - p->plans[0]->transform(p->plans[0], din + (i * p->Ms[0]), buf + (i * (p->Ms[0] / 2 + 1))); - } - ffts_scalar_transpose(buf, dout, p->Ms[0] / 2 + 1, p->Ns[0], p->transpose_buf); + ffts_scalar_transpose(buf, dout, Ms0 / 2 + 1, Ns0, transpose_buf); - for(i=1;i<p->rank;i++) { - for(j=0;j<p->Ns[i];j++) { - p->plans[i]->transform(p->plans[i], dout + (j * p->Ms[i]), buf + (j * p->Ms[i])); - } - ffts_scalar_transpose(buf, dout, p->Ms[i], p->Ns[i], p->transpose_buf); - } + for (i = 1; i < p->rank; i++) { + const size_t Ms = p->Ms[i]; + const size_t Ns = p->Ns[i]; + + plan = p->plans[i]; + + for (j = 0; j < Ns; j++) { + plan->transform(plan, dout + (j * Ms), buf + (j * Ms)); + } + + ffts_scalar_transpose(buf, dout, Ms, Ns, transpose_buf); + } } -void ffts_execute_nd_real_inv(ffts_plan_t *p, const void * in, void * out) { +static void ffts_execute_nd_real_inv(ffts_plan_t *p, const void *in, void *out) +{ + const size_t Ms0 = p->Ms[0]; + const size_t Ms1 = p->Ms[1]; + const size_t Ns0 = p->Ns[0]; + const size_t Ns1 = p->Ns[1]; + + uint64_t *din = (uint64_t*) in; + uint64_t *buf = p->buf; + uint64_t *buf2; + uint64_t *transpose_buf = (uint64_t*) p->transpose_buf; + float *doutr = (float*) out; - uint64_t *din = (uint64_t *)in; - uint64_t *buf = p->buf; - uint64_t *buf2; - uint64_t *dout = (uint64_t *)out; - size_t vol = 1; + ffts_plan_t *plan; + size_t vol; - float *bufr = (float *)(p->buf); - float *doutr = (float *)out; + size_t i, j; - size_t i,j; + vol = p->Ns[0]; + for (i = 1; i < p->rank; i++) { + vol *= p->Ns[i]; + } - for(i=0;i<p->rank;i++) { - vol *= p->Ns[i]; - } + buf2 = buf + vol; - buf2 = buf + vol; + ffts_scalar_transpose(din, buf, Ms0, Ns0, transpose_buf); - ffts_scalar_transpose(din, buf, p->Ms[0], p->Ns[0], p->transpose_buf); + plan = p->plans[0]; + for (i = 0; i < Ms0; i++) { + plan->transform(plan, buf + (i * Ns0), buf2 + (i * Ns0)); + } - for(i=0;i<p->Ms[0];i++) { - p->plans[0]->transform(p->plans[0], buf + (i * p->Ns[0]), buf2 + (i * p->Ns[0])); - } + ffts_scalar_transpose(buf2, buf, Ns0, Ms0, transpose_buf); - ffts_scalar_transpose(buf2, buf, p->Ns[0], p->Ms[0], p->transpose_buf); - for(j=0;j<p->Ms[1];j++) { - p->plans[1]->transform(p->plans[1], buf + (j * (p->Ms[0])), &doutr[j * p->Ns[1]]); - } + plan = p->plans[1]; + for (j = 0; j < Ms1; j++) { + plan->transform(plan, buf + (j * Ms0), &doutr[j * Ns1]); + } } -ffts_plan_t *ffts_init_nd_real(int rank, size_t *Ns, int sign) { - size_t vol = 1; - size_t bufsize; - - ffts_plan_t *p = malloc(sizeof(ffts_plan_t)); - - if(sign < 0) p->transform = &ffts_execute_nd_real; - else p->transform = &ffts_execute_nd_real_inv; - - p->destroy = &ffts_free_nd_real; - - p->rank = rank; - p->Ns = malloc(sizeof(size_t) * rank); - p->Ms = malloc(sizeof(size_t) * rank); - p->plans = malloc(sizeof(ffts_plan_t **) * rank); - int i; - for(i=0;i<rank;i++) { - p->Ns[i] = Ns[i]; - vol *= Ns[i]; - } - - //There is probably a prettier way of doing this, but it works.. - if(sign < 0) { - bufsize = 2 * vol; - } - else { - bufsize = 2 * (Ns[0] * ((vol / Ns[0]) / 2 + 1) + vol); - } - - p->buf = valloc(sizeof(float) * bufsize); - - for(i=0;i<rank;i++) { - p->Ms[i] = vol / p->Ns[i]; - - p->plans[i] = NULL; - int k; - - if(sign < 0) { - for(k=1;k<i;k++) { - if(p->Ms[k] == p->Ms[i]) p->plans[i] = p->plans[k]; - } - if(!i) p->plans[i] = ffts_init_1d_real(p->Ms[i], sign); - else if(!p->plans[i]) p->plans[i] = ffts_init_1d(p->Ms[i], sign); - }else{ - for(k=0;k<i;k++) { - if(p->Ns[k] == p->Ns[i]) p->plans[i] = p->plans[k]; - } - if(i==rank-1) p->plans[i] = ffts_init_1d_real(p->Ns[i], sign); - else if(!p->plans[i]) p->plans[i] = ffts_init_1d(p->Ns[i], sign); - } - } - if(sign < 0) { - for(i=1;i<rank;i++) { - p->Ns[i] = p->Ns[i] / 2 + 1; - } - }else{ - for(i=0;i<rank-1;i++) { - p->Ms[i] = p->Ms[i] / 2 + 1; - } - } - - p->transpose_buf = valloc(sizeof(float) * 2 * 8 * 8); - return p; +ffts_plan_t *ffts_init_nd_real(int rank, size_t *Ns, int sign) +{ + int i; + size_t vol = 1; + size_t bufsize; + ffts_plan_t *p; + + p = (ffts_plan_t*) calloc(1, sizeof(*p)); + if (!p) { + return NULL; + } + + if (sign < 0) { + p->transform = &ffts_execute_nd_real; + } else { + p->transform = &ffts_execute_nd_real_inv; + } + + p->destroy = &ffts_free_nd_real; + p->rank = rank; + + p->Ms = (size_t*) malloc(rank * sizeof(*p->Ms)); + if (!p->Ms) { + goto cleanup; + } + + p->Ns = (size_t*) malloc(rank * sizeof(*p->Ns)); + if (!p->Ns) { + goto cleanup; + } + + for (i = 0; i < rank; i++) { + p->Ns[i] = Ns[i]; + vol *= Ns[i]; + } + + /* there is probably a prettier way of doing this, but it works.. */ + if (sign < 0) { + bufsize = 2 * vol; + } else { + bufsize = 2 * (Ns[0] * ((vol / Ns[0]) / 2 + 1) + vol); + } + + p->buf = ffts_aligned_malloc(bufsize * sizeof(float)); + if (!p->buf) { + goto cleanup; + } + + p->transpose_buf = ffts_aligned_malloc(2 * 8 * 8 * sizeof(float)); + if (!p->transpose_buf) { + goto cleanup; + } + + p->plans = (ffts_plan_t**) calloc(rank, sizeof(*p->plans)); + if (!p->plans) { + goto cleanup; + } + + for (i = 0; i < rank; i++) { + int k; + + p->Ms[i] = vol / p->Ns[i]; + + if (sign < 0) { + if (!i) { + p->plans[i] = ffts_init_1d_real(p->Ms[i], sign); + } else { + for (k = 1; k < i; k++) { + if (p->Ms[k] == p->Ms[i]) { + p->plans[i] = p->plans[k]; + break; + } + } + + if (!p->plans[i]) { + p->plans[i] = ffts_init_1d(p->Ms[i], sign); + p->Ns[i] = p->Ns[i] / 2 + 1; + } + } + } else { + if (i == rank - 1) { + p->plans[i] = ffts_init_1d_real(p->Ns[i], sign); + } else { + for (k = 0; k < i; k++) { + if (p->Ns[k] == p->Ns[i]) { + p->plans[i] = p->plans[k]; + break; + } + } + + if (!p->plans[i]) { + p->plans[i] = ffts_init_1d(p->Ns[i], sign); + p->Ms[i] = p->Ms[i] / 2 + 1; + } + } + } + + if (!p->plans[i]) { + goto cleanup; + } + } + + return p; + +cleanup: + ffts_free_nd_real(p); + return NULL; } +ffts_plan_t *ffts_init_2d_real(size_t N1, size_t N2, int sign) +{ + size_t Ns[2]; -ffts_plan_t *ffts_init_2d_real(size_t N1, size_t N2, int sign) { - size_t Ns[2]; - Ns[0] = N1; - Ns[1] = N2; - return ffts_init_nd_real(2, Ns, sign); + Ns[0] = N1; + Ns[1] = N2; + return ffts_init_nd_real(2, Ns, sign); } -// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3: |