summary | refs | log | tree | commit | diff | stats
path: root/src/ffts_real_nd.c
diff options
context:
space:
mode:
author: Jukka Ojanen <jukka.ojanen@linkotec.net>, 2014-10-29 16:13:33 +0200
committer: Jukka Ojanen <jukka.ojanen@linkotec.net>, 2014-10-29 16:13:33 +0200
commit: c602cee1b51e8c532e4817d41d973deea8ab257a (patch)
tree: c70cbd83bc8aa2bc44ee99bd44a6f69979b22668 /src/ffts_real_nd.c
parent: cf01293c196926d9bfccc84bc050682240feae35 (diff)
download: ffts-c602cee1b51e8c532e4817d41d973deea8ab257a.zip
download: ffts-c602cee1b51e8c532e4817d41d973deea8ab257a.tar.gz
Cleaning and reorganizing
Diffstat (limited to 'src/ffts_real_nd.c')
-rw-r--r-- src/ffts_real_nd.c 432
1 files changed, 269 insertions, 163 deletions
diff --git a/src/ffts_real_nd.c b/src/ffts_real_nd.c
index 151b72a..05bcc9c 100644
--- a/src/ffts_real_nd.c
+++ b/src/ffts_real_nd.c
@@ -32,199 +32,305 @@
*/
#include "ffts_real_nd.h"
+#include "ffts_real.h"
#ifdef __ARM_NEON__
#include "neon.h"
#endif
-void ffts_free_nd_real(ffts_plan_t *p) {
+#ifdef HAVE_NEON
+#include <arm_neon.h>
+#endif
+
+#ifdef HAVE_SSE
+#include <xmmintrin.h>
+#endif
+
+#include <stdio.h>
+
+static void ffts_free_nd_real(ffts_plan_t *p)
+{
+ if (p->plans) {
+ int i;
+
+ for (i = 0; i < p->rank; i++) {
+ ffts_plan_t *plan = p->plans[i];
+
+ if (plan) {
+ int j;
+
+ for (j = i + 1; j < p->rank; j++) {
+ if (plan == p->plans[j]) {
+ p->plans[j] = NULL;
+ }
+ }
- int i;
- for(i=0;i<p->rank;i++) {
+ ffts_free(plan);
+ }
+ }
- ffts_plan_t *x = p->plans[i];
+ free(p->plans);
+ }
- int k;
- for(k=i+1;k<p->rank;k++) {
- if(x == p->plans[k]) p->plans[k] = NULL;
- }
+ if (p->transpose_buf) {
+ ffts_aligned_free(p->transpose_buf);
+ }
- if(x) ffts_free(x);
- }
+ if (p->buf) {
+ ffts_aligned_free(p->buf);
+ }
- free(p->Ns);
- free(p->Ms);
- free(p->plans);
- free(p->buf);
- free(p->transpose_buf);
- free(p);
+ if (p->Ns) {
+ free(p->Ns);
+ }
+
+ if (p->Ms) {
+ free(p->Ms);
+ }
+
+ free(p);
}
-void ffts_scalar_transpose(uint64_t *src, uint64_t *dst, int w, int h, uint64_t *buf) {
- int const bw = 1;
- int const bh = 8;
- int i = 0, j = 0;
- for (; i <= h-bh; i += bh) {
- for (j = 0; j <= w-bw; j += bw) {
- uint64_t const * ib = &src[w*i + j];
- uint64_t * ob = &dst[h*j + i];
-
- uint64_t s_0_0 = ib[0*w+0];
- uint64_t s_1_0 = ib[1*w+0];
- uint64_t s_2_0 = ib[2*w+0];
- uint64_t s_3_0 = ib[3*w+0];
- uint64_t s_4_0 = ib[4*w+0];
- uint64_t s_5_0 = ib[5*w+0];
- uint64_t s_6_0 = ib[6*w+0];
- uint64_t s_7_0 = ib[7*w+0];
-
- ob[0*h+0] = s_0_0;
- ob[0*h+1] = s_1_0;
- ob[0*h+2] = s_2_0;
- ob[0*h+3] = s_3_0;
- ob[0*h+4] = s_4_0;
- ob[0*h+5] = s_5_0;
- ob[0*h+6] = s_6_0;
- ob[0*h+7] = s_7_0;
- }
- }
- if (i < h) {
- for (int i1 = 0; i1 < w; i1++) {
- for (int j = i; j < h; j++) {
- dst[i1*h + j] = src[j*w + i1];
- }
- }
- }
- if (j < w) {
- for (int i = j; i < w; i++) {
- for (int j1 = 0; j1 < h; j1++) {
- dst[i*h + j1] = src[j1*w + i];
- }
- }
- }
+static void ffts_scalar_transpose(uint64_t *src, uint64_t *dst, int w, int h, uint64_t *buf)
+{
+ const int bw = 1;
+ const int bh = 8;
+ int i = 0, j = 0;
+
+ for (; i <= h - bh; i += bh) {
+ for (j = 0; j <= w - bw; j += bw) {
+ uint64_t const *ib = &src[w*i + j];
+ uint64_t *ob = &dst[h*j + i];
+
+ uint64_t s_0_0 = ib[0*w + 0];
+ uint64_t s_1_0 = ib[1*w + 0];
+ uint64_t s_2_0 = ib[2*w + 0];
+ uint64_t s_3_0 = ib[3*w + 0];
+ uint64_t s_4_0 = ib[4*w + 0];
+ uint64_t s_5_0 = ib[5*w + 0];
+ uint64_t s_6_0 = ib[6*w + 0];
+ uint64_t s_7_0 = ib[7*w + 0];
+
+ ob[0*h + 0] = s_0_0;
+ ob[0*h + 1] = s_1_0;
+ ob[0*h + 2] = s_2_0;
+ ob[0*h + 3] = s_3_0;
+ ob[0*h + 4] = s_4_0;
+ ob[0*h + 5] = s_5_0;
+ ob[0*h + 6] = s_6_0;
+ ob[0*h + 7] = s_7_0;
+ }
+ }
+
+ if (i < h) {
+ int i1;
+
+ for (i1 = 0; i1 < w; i1++) {
+ for (j = i; j < h; j++) {
+ dst[i1*h + j] = src[j*w + i1];
+ }
+ }
+ }
+
+ if (j < w) {
+ int j1;
+
+ for (i = j; i < w; i++) {
+ for (j1 = 0; j1 < h; j1++) {
+ dst[i*h + j1] = src[j1*w + i];
+ }
+ }
+ }
}
-void ffts_execute_nd_real(ffts_plan_t *p, const void * in, void * out) {
+static void ffts_execute_nd_real(ffts_plan_t *p, const void *in, void *out)
+{
+ const size_t Ms0 = p->Ms[0];
+ const size_t Ns0 = p->Ns[0];
+
+ uint32_t *din = (uint32_t*) in;
+ uint64_t *buf = p->buf;
+ uint64_t *dout = (uint64_t*) out;
+ uint64_t *transpose_buf = (uint64_t*) p->transpose_buf;
+
+ ffts_plan_t *plan;
+ size_t i, j;
- uint32_t *din = (uint32_t *)in;
- uint64_t *buf = p->buf;
- uint64_t *dout = (uint64_t *)out;
+ plan = p->plans[0];
+ for (i = 0; i < Ns0; i++) {
+ plan->transform(plan, din + (i * Ms0), buf + (i * (Ms0 / 2 + 1)));
+ }
- size_t i,j;
- for(i=0;i<p->Ns[0];i++) {
- p->plans[0]->transform(p->plans[0], din + (i * p->Ms[0]), buf + (i * (p->Ms[0] / 2 + 1)));
- }
- ffts_scalar_transpose(buf, dout, p->Ms[0] / 2 + 1, p->Ns[0], p->transpose_buf);
+ ffts_scalar_transpose(buf, dout, Ms0 / 2 + 1, Ns0, transpose_buf);
- for(i=1;i<p->rank;i++) {
- for(j=0;j<p->Ns[i];j++) {
- p->plans[i]->transform(p->plans[i], dout + (j * p->Ms[i]), buf + (j * p->Ms[i]));
- }
- ffts_scalar_transpose(buf, dout, p->Ms[i], p->Ns[i], p->transpose_buf);
- }
+ for (i = 1; i < p->rank; i++) {
+ const size_t Ms = p->Ms[i];
+ const size_t Ns = p->Ns[i];
+
+ plan = p->plans[i];
+
+ for (j = 0; j < Ns; j++) {
+ plan->transform(plan, dout + (j * Ms), buf + (j * Ms));
+ }
+
+ ffts_scalar_transpose(buf, dout, Ms, Ns, transpose_buf);
+ }
}
-void ffts_execute_nd_real_inv(ffts_plan_t *p, const void * in, void * out) {
+static void ffts_execute_nd_real_inv(ffts_plan_t *p, const void *in, void *out)
+{
+ const size_t Ms0 = p->Ms[0];
+ const size_t Ms1 = p->Ms[1];
+ const size_t Ns0 = p->Ns[0];
+ const size_t Ns1 = p->Ns[1];
+
+ uint64_t *din = (uint64_t*) in;
+ uint64_t *buf = p->buf;
+ uint64_t *buf2;
+ uint64_t *transpose_buf = (uint64_t*) p->transpose_buf;
+ float *doutr = (float*) out;
- uint64_t *din = (uint64_t *)in;
- uint64_t *buf = p->buf;
- uint64_t *buf2;
- uint64_t *dout = (uint64_t *)out;
- size_t vol = 1;
+ ffts_plan_t *plan;
+ size_t vol;
- float *bufr = (float *)(p->buf);
- float *doutr = (float *)out;
+ size_t i, j;
- size_t i,j;
+ vol = p->Ns[0];
+ for (i = 1; i < p->rank; i++) {
+ vol *= p->Ns[i];
+ }
- for(i=0;i<p->rank;i++) {
- vol *= p->Ns[i];
- }
+ buf2 = buf + vol;
- buf2 = buf + vol;
+ ffts_scalar_transpose(din, buf, Ms0, Ns0, transpose_buf);
- ffts_scalar_transpose(din, buf, p->Ms[0], p->Ns[0], p->transpose_buf);
+ plan = p->plans[0];
+ for (i = 0; i < Ms0; i++) {
+ plan->transform(plan, buf + (i * Ns0), buf2 + (i * Ns0));
+ }
- for(i=0;i<p->Ms[0];i++) {
- p->plans[0]->transform(p->plans[0], buf + (i * p->Ns[0]), buf2 + (i * p->Ns[0]));
- }
+ ffts_scalar_transpose(buf2, buf, Ns0, Ms0, transpose_buf);
- ffts_scalar_transpose(buf2, buf, p->Ns[0], p->Ms[0], p->transpose_buf);
- for(j=0;j<p->Ms[1];j++) {
- p->plans[1]->transform(p->plans[1], buf + (j * (p->Ms[0])), &doutr[j * p->Ns[1]]);
- }
+ plan = p->plans[1];
+ for (j = 0; j < Ms1; j++) {
+ plan->transform(plan, buf + (j * Ms0), &doutr[j * Ns1]);
+ }
}
-ffts_plan_t *ffts_init_nd_real(int rank, size_t *Ns, int sign) {
- size_t vol = 1;
- size_t bufsize;
-
- ffts_plan_t *p = malloc(sizeof(ffts_plan_t));
-
- if(sign < 0) p->transform = &ffts_execute_nd_real;
- else p->transform = &ffts_execute_nd_real_inv;
-
- p->destroy = &ffts_free_nd_real;
-
- p->rank = rank;
- p->Ns = malloc(sizeof(size_t) * rank);
- p->Ms = malloc(sizeof(size_t) * rank);
- p->plans = malloc(sizeof(ffts_plan_t **) * rank);
- int i;
- for(i=0;i<rank;i++) {
- p->Ns[i] = Ns[i];
- vol *= Ns[i];
- }
-
- //There is probably a prettier way of doing this, but it works..
- if(sign < 0) {
- bufsize = 2 * vol;
- }
- else {
- bufsize = 2 * (Ns[0] * ((vol / Ns[0]) / 2 + 1) + vol);
- }
-
- p->buf = valloc(sizeof(float) * bufsize);
-
- for(i=0;i<rank;i++) {
- p->Ms[i] = vol / p->Ns[i];
-
- p->plans[i] = NULL;
- int k;
-
- if(sign < 0) {
- for(k=1;k<i;k++) {
- if(p->Ms[k] == p->Ms[i]) p->plans[i] = p->plans[k];
- }
- if(!i) p->plans[i] = ffts_init_1d_real(p->Ms[i], sign);
- else if(!p->plans[i]) p->plans[i] = ffts_init_1d(p->Ms[i], sign);
- }else{
- for(k=0;k<i;k++) {
- if(p->Ns[k] == p->Ns[i]) p->plans[i] = p->plans[k];
- }
- if(i==rank-1) p->plans[i] = ffts_init_1d_real(p->Ns[i], sign);
- else if(!p->plans[i]) p->plans[i] = ffts_init_1d(p->Ns[i], sign);
- }
- }
- if(sign < 0) {
- for(i=1;i<rank;i++) {
- p->Ns[i] = p->Ns[i] / 2 + 1;
- }
- }else{
- for(i=0;i<rank-1;i++) {
- p->Ms[i] = p->Ms[i] / 2 + 1;
- }
- }
-
- p->transpose_buf = valloc(sizeof(float) * 2 * 8 * 8);
- return p;
+ffts_plan_t *ffts_init_nd_real(int rank, size_t *Ns, int sign)
+{
+ int i;
+ size_t vol = 1;
+ size_t bufsize;
+ ffts_plan_t *p;
+
+ p = (ffts_plan_t*) calloc(1, sizeof(*p));
+ if (!p) {
+ return NULL;
+ }
+
+ if (sign < 0) {
+ p->transform = &ffts_execute_nd_real;
+ } else {
+ p->transform = &ffts_execute_nd_real_inv;
+ }
+
+ p->destroy = &ffts_free_nd_real;
+ p->rank = rank;
+
+ p->Ms = (size_t*) malloc(rank * sizeof(*p->Ms));
+ if (!p->Ms) {
+ goto cleanup;
+ }
+
+ p->Ns = (size_t*) malloc(rank * sizeof(*p->Ns));
+ if (!p->Ns) {
+ goto cleanup;
+ }
+
+ for (i = 0; i < rank; i++) {
+ p->Ns[i] = Ns[i];
+ vol *= Ns[i];
+ }
+
+ /* there is probably a prettier way of doing this, but it works.. */
+ if (sign < 0) {
+ bufsize = 2 * vol;
+ } else {
+ bufsize = 2 * (Ns[0] * ((vol / Ns[0]) / 2 + 1) + vol);
+ }
+
+ p->buf = ffts_aligned_malloc(bufsize * sizeof(float));
+ if (!p->buf) {
+ goto cleanup;
+ }
+
+ p->transpose_buf = ffts_aligned_malloc(2 * 8 * 8 * sizeof(float));
+ if (!p->transpose_buf) {
+ goto cleanup;
+ }
+
+ p->plans = (ffts_plan_t**) calloc(rank, sizeof(*p->plans));
+ if (!p->plans) {
+ goto cleanup;
+ }
+
+ for (i = 0; i < rank; i++) {
+ int k;
+
+ p->Ms[i] = vol / p->Ns[i];
+
+ if (sign < 0) {
+ if (!i) {
+ p->plans[i] = ffts_init_1d_real(p->Ms[i], sign);
+ } else {
+ for (k = 1; k < i; k++) {
+ if (p->Ms[k] == p->Ms[i]) {
+ p->plans[i] = p->plans[k];
+ break;
+ }
+ }
+
+ if (!p->plans[i]) {
+ p->plans[i] = ffts_init_1d(p->Ms[i], sign);
+ p->Ns[i] = p->Ns[i] / 2 + 1;
+ }
+ }
+ } else {
+ if (i == rank - 1) {
+ p->plans[i] = ffts_init_1d_real(p->Ns[i], sign);
+ } else {
+ for (k = 0; k < i; k++) {
+ if (p->Ns[k] == p->Ns[i]) {
+ p->plans[i] = p->plans[k];
+ break;
+ }
+ }
+
+ if (!p->plans[i]) {
+ p->plans[i] = ffts_init_1d(p->Ns[i], sign);
+ p->Ms[i] = p->Ms[i] / 2 + 1;
+ }
+ }
+ }
+
+ if (!p->plans[i]) {
+ goto cleanup;
+ }
+ }
+
+ return p;
+
+cleanup:
+ ffts_free_nd_real(p);
+ return NULL;
}
+ffts_plan_t *ffts_init_2d_real(size_t N1, size_t N2, int sign)
+{
+ size_t Ns[2];
-ffts_plan_t *ffts_init_2d_real(size_t N1, size_t N2, int sign) {
- size_t Ns[2];
- Ns[0] = N1;
- Ns[1] = N2;
- return ffts_init_nd_real(2, Ns, sign);
+ Ns[0] = N1;
+ Ns[1] = N2;
+ return ffts_init_nd_real(2, Ns, sign);
}
-// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:
OpenPOWER on IntegriCloud