diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/ffts_nd.c | 581 | ||||
-rw-r--r-- | src/ffts_nd.h | 36 | ||||
-rw-r--r-- | src/ffts_real.c | 452 | ||||
-rw-r--r-- | src/ffts_real.h | 30 | ||||
-rw-r--r-- | src/ffts_real_nd.c | 432 | ||||
-rw-r--r-- | src/ffts_real_nd.h | 33 | ||||
-rw-r--r-- | src/ffts_small.c | 208 | ||||
-rw-r--r-- | src/ffts_small.h | 22 |
8 files changed, 1015 insertions, 779 deletions
diff --git a/src/ffts_nd.c b/src/ffts_nd.c index 15fc4d1..f982403 100644 --- a/src/ffts_nd.c +++ b/src/ffts_nd.c @@ -1,33 +1,33 @@ /* - - This file is part of FFTS -- The Fastest Fourier Transform in the South - - Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com> - Copyright (c) 2012, The University of Waikato - - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - * Neither the name of the organization nor the - names of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY - DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +This file is part of FFTS -- The Fastest Fourier Transform in the South + +Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com> +Copyright (c) 2012, The University of Waikato + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: +* Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +* Neither the name of the organization nor the +names of its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ @@ -35,249 +35,330 @@ #ifdef HAVE_NEON #include "neon.h" +#include <arm_neon.h> +#endif + +#ifdef HAVE_SSE +#include <xmmintrin.h> #endif -void ffts_free_nd(ffts_plan_t *p) { - - int i; - for(i=0;i<p->rank;i++) { - - ffts_plan_t *x = p->plans[i]; - int k; - for(k=0;k<i;k++) { - if(p->Ms[i] == p->Ms[k]) x = NULL; - } - - if(x) ffts_free(x); - } - - free(p->Ns); - free(p->Ms); - free(p->plans); - free(p->buf); - free(p->transpose_buf); - free(p); -} #define TSIZE 8 -#include <string.h> -void ffts_transpose(uint64_t *in, uint64_t *out, int w, int h, uint64_t *buf) { - -#ifdef HAVE_NEON - size_t i,j,k; - int linebytes = w*8; - - for(j=0;j<h;j+=8) { - for(i=0;i<w;i+=8) { - neon_transpose_to_buf(in + j*w + i, buf, w); - - uint64_t *p = out + i*h + j; - uint64_t *pbuf = buf; - uint64_t *ptemp; - - __asm__ __volatile__( - "mov %[ptemp], %[p]\n\t" - "add %[p], %[p], %[w], lsl #3\n\t" - "vld1.32 {q8,q9}, [%[pbuf], :128]!\n\t" - "vld1.32 {q10,q11}, [%[pbuf], :128]!\n\t" - "vld1.32 {q12,q13}, [%[pbuf], :128]!\n\t" - "vld1.32 {q14,q15}, [%[pbuf], :128]!\n\t" - "vst1.32 {q8,q9}, [%[ptemp], :128]!\n\t" - "vst1.32 {q10,q11}, [%[ptemp], :128]!\n\t" - "mov %[ptemp], %[p]\n\t" - "add %[p], %[p], %[w], lsl #3\n\t" - "vst1.32 {q12,q13}, [%[ptemp], :128]!\n\t" - "vst1.32 {q14,q15}, [%[ptemp], :128]!\n\t" - "mov %[ptemp], %[p]\n\t" - "add %[p], %[p], %[w], lsl #3\n\t" - "vld1.32 {q8,q9}, [%[pbuf], :128]!\n\t" - "vld1.32 {q10,q11}, [%[pbuf], :128]!\n\t" - "vld1.32 {q12,q13}, [%[pbuf], :128]!\n\t" - "vld1.32 {q14,q15}, [%[pbuf], :128]!\n\t" - "vst1.32 {q8,q9}, [%[ptemp], :128]!\n\t" - "vst1.32 {q10,q11}, [%[ptemp], :128]!\n\t" - "mov %[ptemp], %[p]\n\t" - "add %[p], %[p], %[w], lsl #3\n\t" - "vst1.32 {q12,q13}, [%[ptemp], :128]!\n\t" - "vst1.32 {q14,q15}, [%[ptemp], :128]!\n\t" - "mov %[ptemp], %[p]\n\t" - "add %[p], %[p], %[w], lsl #3\n\t" - "vld1.32 {q8,q9}, [%[pbuf], :128]!\n\t" - "vld1.32 {q10,q11}, [%[pbuf], :128]!\n\t" - "vld1.32 {q12,q13}, [%[pbuf], :128]!\n\t" - "vld1.32 {q14,q15}, [%[pbuf], :128]!\n\t" - "vst1.32 {q8,q9}, [%[ptemp], :128]!\n\t" - "vst1.32 {q10,q11}, [%[ptemp], :128]!\n\t" - "mov %[ptemp], %[p]\n\t" - "add %[p], %[p], %[w], lsl #3\n\t" - "vst1.32 {q12,q13}, [%[ptemp], :128]!\n\t" - "vst1.32 {q14,q15}, [%[ptemp], :128]!\n\t" - "mov %[ptemp], %[p]\n\t" - "add %[p], %[p], %[w], lsl #3\n\t" - "vld1.32 {q8,q9}, [%[pbuf], :128]!\n\t" - "vld1.32 {q10,q11}, [%[pbuf], :128]!\n\t" - "vld1.32 {q12,q13}, [%[pbuf], :128]!\n\t" - "vld1.32 {q14,q15}, [%[pbuf], :128]!\n\t" - "vst1.32 {q8,q9}, [%[ptemp], :128]!\n\t" - "vst1.32 {q10,q11}, [%[ptemp], :128]!\n\t" - "mov %[ptemp], %[p]\n\t" - "vst1.32 {q12,q13}, [%[ptemp], :128]!\n\t" - "vst1.32 {q14,q15}, [%[ptemp], :128]!\n\t" - - : [p] "+r" (p), [pbuf] "+r" (pbuf), [ptemp] "+r" (ptemp) - : [w] "r" (w) - : "memory", "q8", "q9", "q10", "q11" - ); -// out[i*h + j] = in[j*w + i]; - } - } + +static void ffts_free_nd(ffts_plan_t *p) +{ + if (p->plans) { + int i; + + for (i = 0; i < p->rank; i++) { + ffts_plan_t *plan = p->plans[i]; + + if (plan) { + int k; + + for (k = 0; k < i; k++) { + if (p->Ms[i] == p->Ms[k]) { + plan = NULL; + break; + } + } + + if (plan) { + ffts_free(plan); + } + } + } + + free(p->plans); + } + + if (p->Ns) { + free(p->Ns); + } + + if (p->Ms) { + free(p->Ms); + } + + if (p->buf) { + ffts_aligned_free(p->buf); + } + + if (p->transpose_buf) { + ffts_aligned_free(p->transpose_buf); + } + + free(p); +} + +static void ffts_transpose(uint64_t *in, uint64_t *out, int w, int h, uint64_t *buf) +{ +#ifdef HAVE_NEON + size_t i, j, k; + int linebytes = 8 * w; + + for (j = 0; j < h; j += 8) { + for (i = 0; i < w; i += 8) { + neon_transpose_to_buf(in + j*w + i, buf, w); + + uint64_t *p = out + i*h + j; + uint64_t *pbuf = buf; + uint64_t *ptemp; + + __asm__ __volatile__( + "mov %[ptemp], %[p]\n\t" + "add %[p], %[p], %[w], lsl #3\n\t" + "vld1.32 {q8,q9}, [%[pbuf], :128]!\n\t" + "vld1.32 {q10,q11}, [%[pbuf], :128]!\n\t" + "vld1.32 {q12,q13}, [%[pbuf], :128]!\n\t" + "vld1.32 {q14,q15}, [%[pbuf], :128]!\n\t" + "vst1.32 {q8,q9}, [%[ptemp], :128]!\n\t" + "vst1.32 {q10,q11}, [%[ptemp], :128]!\n\t" + "mov %[ptemp], %[p]\n\t" + "add %[p], %[p], %[w], lsl #3\n\t" + "vst1.32 {q12,q13}, [%[ptemp], :128]!\n\t" + "vst1.32 {q14,q15}, [%[ptemp], :128]!\n\t" + "mov %[ptemp], %[p]\n\t" + "add %[p], %[p], %[w], lsl #3\n\t" + "vld1.32 {q8,q9}, [%[pbuf], :128]!\n\t" + "vld1.32 {q10,q11}, [%[pbuf], :128]!\n\t" + "vld1.32 {q12,q13}, [%[pbuf], :128]!\n\t" + "vld1.32 {q14,q15}, [%[pbuf], :128]!\n\t" + "vst1.32 {q8,q9}, [%[ptemp], :128]!\n\t" + "vst1.32 {q10,q11}, [%[ptemp], :128]!\n\t" + "mov %[ptemp], %[p]\n\t" + "add %[p], %[p], %[w], lsl #3\n\t" + "vst1.32 {q12,q13}, [%[ptemp], :128]!\n\t" + "vst1.32 {q14,q15}, [%[ptemp], :128]!\n\t" + "mov %[ptemp], %[p]\n\t" + "add %[p], %[p], %[w], lsl #3\n\t" + "vld1.32 {q8,q9}, [%[pbuf], :128]!\n\t" + "vld1.32 {q10,q11}, [%[pbuf], :128]!\n\t" + "vld1.32 {q12,q13}, [%[pbuf], :128]!\n\t" + "vld1.32 {q14,q15}, [%[pbuf], :128]!\n\t" + "vst1.32 {q8,q9}, [%[ptemp], :128]!\n\t" + "vst1.32 {q10,q11}, [%[ptemp], :128]!\n\t" + "mov %[ptemp], %[p]\n\t" + "add %[p], %[p], %[w], lsl #3\n\t" + "vst1.32 {q12,q13}, [%[ptemp], :128]!\n\t" + "vst1.32 {q14,q15}, [%[ptemp], :128]!\n\t" + "mov %[ptemp], %[p]\n\t" + "add %[p], %[p], %[w], lsl #3\n\t" + "vld1.32 {q8,q9}, [%[pbuf], :128]!\n\t" + "vld1.32 {q10,q11}, [%[pbuf], :128]!\n\t" + "vld1.32 {q12,q13}, [%[pbuf], :128]!\n\t" + "vld1.32 {q14,q15}, [%[pbuf], :128]!\n\t" + "vst1.32 {q8,q9}, [%[ptemp], :128]!\n\t" + "vst1.32 {q10,q11}, [%[ptemp], :128]!\n\t" + "mov %[ptemp], %[p]\n\t" + "vst1.32 {q12,q13}, [%[ptemp], :128]!\n\t" + "vst1.32 {q14,q15}, [%[ptemp], :128]!\n\t" + + : [p] "+r" (p), [pbuf] "+r" (pbuf), [ptemp] "+r" (ptemp) + : [w] "r" (w) + : "memory", "q8", "q9", "q10", "q11" + ); + + /* out[i*h + j] = in[j*w + i]; */ + } + } #else #ifdef HAVE_SSE - uint64_t tmp[TSIZE*TSIZE] __attribute__((aligned(64))); - int tx, ty; - int x, y; - int tw = w / TSIZE; - int th = h / TSIZE; - for (ty=0;ty<th;ty++) { - for (tx=0;tx<tw;tx++) { - uint64_t *ip0 = in + w*TSIZE*ty + tx * TSIZE; - uint64_t *op0 = tmp;//out + h*TSIZE*tx + ty*TSIZE; - - // Copy/transpose to tmp - for (y=0;y<TSIZE;y+=2) { - //for (x=0;x<TSIZE;x+=2) { - //op[x*TSIZE] = ip[x]; - __m128d q0 = _mm_load_pd((double *)(ip0 + 0*w)); - __m128d q1 = _mm_load_pd((double *)(ip0 + 1*w)); - __m128d q2 = _mm_load_pd((double *)(ip0 + 2*w)); - __m128d q3 = _mm_load_pd((double *)(ip0 + 3*w)); - __m128d q4 = _mm_load_pd((double *)(ip0 + 4*w)); - __m128d q5 = _mm_load_pd((double *)(ip0 + 5*w)); - __m128d q6 = _mm_load_pd((double *)(ip0 + 6*w)); - __m128d q7 = _mm_load_pd((double *)(ip0 + 7*w)); - ip0 += 2; - - __m128d t0 = _mm_shuffle_pd(q0, q1, _MM_SHUFFLE2(0, 0)); - __m128d t1 = _mm_shuffle_pd(q0, q1, _MM_SHUFFLE2(1, 1)); - __m128d t2 = _mm_shuffle_pd(q2, q3, _MM_SHUFFLE2(0, 0)); - __m128d t3 = _mm_shuffle_pd(q2, q3, _MM_SHUFFLE2(1, 1)); - __m128d t4 = _mm_shuffle_pd(q4, q5, _MM_SHUFFLE2(0, 0)); - __m128d t5 = _mm_shuffle_pd(q4, q5, _MM_SHUFFLE2(1, 1)); - __m128d t6 = _mm_shuffle_pd(q6, q7, _MM_SHUFFLE2(0, 0)); - __m128d t7 = _mm_shuffle_pd(q6, q7, _MM_SHUFFLE2(1, 1)); - //_mm_store_pd((double *)(op0 + y*h + x), t0); - //_mm_store_pd((double *)(op0 + y*h + x + h), t1); - _mm_store_pd((double *)(op0 + 0), t0); - _mm_store_pd((double *)(op0 + 0 + TSIZE), t1); - _mm_store_pd((double *)(op0 + 2 ), t2); - _mm_store_pd((double *)(op0 + 2 + TSIZE), t3); - _mm_store_pd((double *)(op0 + 4 ), t4); - _mm_store_pd((double *)(op0 + 4 + TSIZE), t5); - _mm_store_pd((double *)(op0 + 6 ), t6); - _mm_store_pd((double *)(op0 + 6 + TSIZE), t7); - //} - op0 += 2*TSIZE; - } - - op0 = out + h*tx*TSIZE + ty*TSIZE; - ip0 = tmp; - for (y=0;y<TSIZE;y+=1) { - // memcpy(op0, ip0, TSIZE * sizeof(*ip0)); - - __m128d q0 = _mm_load_pd((double *)(ip0 + 0)); - __m128d q1 = _mm_load_pd((double *)(ip0 + 2)); - __m128d q2 = _mm_load_pd((double *)(ip0 + 4)); - __m128d q3 = _mm_load_pd((double *)(ip0 + 6)); - _mm_store_pd((double *)(op0 + 0), q0); - _mm_store_pd((double *)(op0 + 2), q1); - _mm_store_pd((double *)(op0 + 4), q2); - _mm_store_pd((double *)(op0 + 6), q3); - - op0 += h; - ip0 += TSIZE; - } - - } - } -/* - size_t i,j; - for(i=0;i<w;i+=2) { - for(j=0;j<h;j+=2) { -// out[i*h + j] = in[j*w + i]; - __m128d q0 = _mm_load_pd((double *)(in + j*w + i)); - __m128d q1 = _mm_load_pd((double *)(in + j*w + i + w)); - __m128d t0 = _mm_shuffle_pd(q0, q1, _MM_SHUFFLE2(0, 0)); - __m128d t1 = _mm_shuffle_pd(q0, q1, _MM_SHUFFLE2(1, 1)); - _mm_store_pd((double *)(out + i*h + j), t0); - _mm_store_pd((double *)(out + i*h + j + h), t1); - } - } -*/ + uint64_t FFTS_ALIGN(64) tmp[TSIZE*TSIZE]; + int tx, ty; + /* int x; */ + int y; + int tw = w / TSIZE; + int th = h / TSIZE; + + for (ty = 0; ty < th; ty++) { + for (tx = 0; tx < tw; tx++) { + uint64_t *ip0 = in + w*TSIZE*ty + tx * TSIZE; + uint64_t *op0 = tmp; /* out + h*TSIZE*tx + ty*TSIZE; */ + + /* copy/transpose to tmp */ + for (y = 0; y < TSIZE; y += 2) { + /* for (x=0;x<TSIZE;x+=2) { + op[x*TSIZE] = ip[x]; + */ + __m128d q0 = _mm_load_pd((double*)(ip0 + 0*w)); + __m128d q1 = _mm_load_pd((double*)(ip0 + 1*w)); + __m128d q2 = _mm_load_pd((double*)(ip0 + 2*w)); + __m128d q3 = _mm_load_pd((double*)(ip0 + 3*w)); + __m128d q4 = _mm_load_pd((double*)(ip0 + 4*w)); + __m128d q5 = _mm_load_pd((double*)(ip0 + 5*w)); + __m128d q6 = _mm_load_pd((double*)(ip0 + 6*w)); + __m128d q7 = _mm_load_pd((double*)(ip0 + 7*w)); + + __m128d t0 = _mm_shuffle_pd(q0, q1, _MM_SHUFFLE2(0, 0)); + __m128d t1 = _mm_shuffle_pd(q0, q1, _MM_SHUFFLE2(1, 1)); + __m128d t2 = _mm_shuffle_pd(q2, q3, _MM_SHUFFLE2(0, 0)); + __m128d t3 = _mm_shuffle_pd(q2, q3, _MM_SHUFFLE2(1, 1)); + __m128d t4 = _mm_shuffle_pd(q4, q5, _MM_SHUFFLE2(0, 0)); + __m128d t5 = _mm_shuffle_pd(q4, q5, _MM_SHUFFLE2(1, 1)); + __m128d t6 = _mm_shuffle_pd(q6, q7, _MM_SHUFFLE2(0, 0)); + __m128d t7 = _mm_shuffle_pd(q6, q7, _MM_SHUFFLE2(1, 1)); + + ip0 += 2; + /* _mm_store_pd((double *)(op0 + y*h + x), t0); + _mm_store_pd((double *)(op0 + y*h + x + h), t1); + */ + + _mm_store_pd((double*)(op0 + 0 ), t0); + _mm_store_pd((double*)(op0 + 0 + TSIZE), t1); + _mm_store_pd((double*)(op0 + 2 ), t2); + _mm_store_pd((double*)(op0 + 2 + TSIZE), t3); + _mm_store_pd((double*)(op0 + 4 ), t4); + _mm_store_pd((double*)(op0 + 4 + TSIZE), t5); + _mm_store_pd((double*)(op0 + 6 ), t6); + _mm_store_pd((double*)(op0 + 6 + TSIZE), t7); + /* } */ + + op0 += 2*TSIZE; + } + + op0 = out + h*tx*TSIZE + ty*TSIZE; + ip0 = tmp; + for (y = 0; y < TSIZE; y += 1) { + /* memcpy(op0, ip0, TSIZE * sizeof(*ip0)); */ + + __m128d q0 = _mm_load_pd((double*)(ip0 + 0)); + __m128d q1 = _mm_load_pd((double*)(ip0 + 2)); + __m128d q2 = _mm_load_pd((double*)(ip0 + 4)); + __m128d q3 = _mm_load_pd((double*)(ip0 + 6)); + + _mm_store_pd((double*)(op0 + 0), q0); + _mm_store_pd((double*)(op0 + 2), q1); + _mm_store_pd((double*)(op0 + 4), q2); + _mm_store_pd((double*)(op0 + 6), q3); + + op0 += h; + ip0 += TSIZE; + } + } + } + /* + size_t i,j; + for(i=0;i<w;i+=2) { + for(j=0;j<h;j+=2) { + // out[i*h + j] = in[j*w + i]; + __m128d q0 = _mm_load_pd((double *)(in + j*w + i)); + __m128d q1 = _mm_load_pd((double *)(in + j*w + i + w)); + __m128d t0 = _mm_shuffle_pd(q0, q1, _MM_SHUFFLE2(0, 0)); + __m128d t1 = _mm_shuffle_pd(q0, q1, _MM_SHUFFLE2(1, 1)); + _mm_store_pd((double *)(out + i*h + j), t0); + _mm_store_pd((double *)(out + i*h + j + h), t1); + } + } + */ #endif #endif - } -void ffts_execute_nd(ffts_plan_t *p, const void * in, void * out) { +static void ffts_execute_nd(ffts_plan_t *p, const void *in, void *out) +{ + uint64_t *din = (uint64_t*) in; + uint64_t *buf = p->buf; + uint64_t *dout = (uint64_t*) out; + + ffts_plan_t *plan; + size_t i, j; - uint64_t *din = (uint64_t *)in; - uint64_t *buf = p->buf; - uint64_t *dout = (uint64_t *)out; + plan = p->plans[0]; + for (i = 0; i < p->Ns[0]; i++) { + plan->transform(plan, din + (i * p->Ms[0]), buf + (i * p->Ms[0])); + } - size_t i,j; - for(i=0;i<p->Ns[0];i++) { - p->plans[0]->transform(p->plans[0], din + (i * p->Ms[0]), buf + (i * p->Ms[0])); - } - ffts_transpose(buf, dout, p->Ms[0], p->Ns[0], p->transpose_buf); + ffts_transpose(buf, dout, p->Ms[0], p->Ns[0], p->transpose_buf); - for(i=1;i<p->rank;i++) { - for(j=0;j<p->Ns[i];j++) { - p->plans[i]->transform(p->plans[i], dout + (j * p->Ms[i]), buf + (j * p->Ms[i])); - } - ffts_transpose(buf, dout, p->Ms[i], p->Ns[i], p->transpose_buf); - } + for (i = 1; i < p->rank; i++) { + plan = p->plans[i]; + + for (j = 0; j < p->Ns[i]; j++) { + plan->transform(plan, dout + (j * p->Ms[i]), buf + (j * p->Ms[i])); + } + + ffts_transpose(buf, dout, p->Ms[i], p->Ns[i], p->transpose_buf); + } } -ffts_plan_t *ffts_init_nd(int rank, size_t *Ns, int sign) { - size_t vol = 1; +ffts_plan_t *ffts_init_nd(int rank, size_t *Ns, int sign) +{ + ffts_plan_t *p; + size_t vol; + int i; + + p = calloc(1, sizeof(*p)); + if (!p) { + return NULL; + } + + p->transform = &ffts_execute_nd; + p->destroy = &ffts_free_nd; + p->rank = rank; + + p->Ms = malloc(rank * sizeof(*p->Ms)); + if (!p->Ms) { + goto cleanup; + } + + p->Ns = malloc(rank * sizeof(*p->Ns)); + if (!p->Ns) { + goto cleanup; + } + + vol = p->Ns[0] = Ns[0]; + for (i = 1; i < rank; i++) { + p->Ns[i] = Ns[i]; + vol *= Ns[i]; + } + + p->buf = ffts_aligned_malloc(2 * vol * sizeof(float)); + if (!p->buf) { + goto cleanup; + } + + p->transpose_buf = ffts_aligned_malloc(2 * 8 * 8 * sizeof(float)); + if (!p->transpose_buf) { + goto cleanup; + } - ffts_plan_t *p = malloc(sizeof(ffts_plan_t)); + p->plans = calloc(rank, sizeof(*p->plans)); + if (!p->plans) { + goto cleanup; + } - p->transform = &ffts_execute_nd; - p->destroy = &ffts_free_nd; + for (i = 0; i < rank; i++) { + int k; - p->rank = rank; - p->Ns = malloc(sizeof(size_t) * rank); - p->Ms = malloc(sizeof(size_t) * rank); - p->plans = malloc(sizeof(ffts_plan_t **) * rank); - int i; - for(i=0;i<rank;i++) { - p->Ns[i] = Ns[i]; - vol *= Ns[i]; - } - p->buf = valloc(sizeof(float) * 2 * vol); + p->Ms[i] = vol / p->Ns[i]; - for(i=0;i<rank;i++) { - p->Ms[i] = vol / p->Ns[i]; + for (k = 0; k < i; k++) { + if (p->Ms[k] == p->Ms[i]) { + p->plans[i] = p->plans[k]; + break; + } + } - p->plans[i] = NULL; - int k; - for(k=0;k<i;k++) { - if(p->Ms[k] == p->Ms[i]) - p->plans[i] = p->plans[k]; - } + if (!p->plans[i]) { + p->plans[i] = ffts_init_1d(p->Ms[i], sign); + if (!p->plans) { + goto cleanup; + } + } + } - if(!p->plans[i]) p->plans[i] = ffts_init_1d(p->Ms[i], sign); - } + return p; - p->transpose_buf = valloc(sizeof(float) * 2 * 8 * 8); - return p; +cleanup: + ffts_free_nd(p); + return NULL; } +ffts_plan_t *ffts_init_2d(size_t N1, size_t N2, int sign) +{ + size_t Ns[2]; -ffts_plan_t *ffts_init_2d(size_t N1, size_t N2, int sign) { - size_t Ns[2]; - Ns[0] = N1; - Ns[1] = N2; - return ffts_init_nd(2, Ns, sign); + Ns[0] = N1; + Ns[1] = N2; + return ffts_init_nd(2, Ns, sign); } -// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3: diff --git a/src/ffts_nd.h b/src/ffts_nd.h index a9af3e2..a960cad 100644 --- a/src/ffts_nd.h +++ b/src/ffts_nd.h @@ -1,10 +1,10 @@ /* - + This file is part of FFTS -- The Fastest Fourier Transform in the South - + Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com> - Copyright (c) 2012, The University of Waikato - + Copyright (c) 2012, The University of Waikato + All rights reserved. Redistribution and use in source and binary forms, with or without @@ -31,29 +31,13 @@ */ -#ifndef __FFTS_ND_H__ -#define __FFTS_ND_H__ - -#include <stdint.h> -#include <stddef.h> -#include <stdio.h> +#ifndef FFTS_ND_H +#define FFTS_ND_H #include "ffts.h" +#include <stddef.h> -#ifdef HAVE_NEON - #include <arm_neon.h> -#endif -#ifdef HAVE_SSE - #include <xmmintrin.h> -#endif - -void ffts_free_nd(ffts_plan_t *p); -void ffts_transpose(uint64_t *in, uint64_t *out, int w, int h, uint64_t *buf); - -void ffts_execute_nd(ffts_plan_t *p, const void * in, void * out); -ffts_plan_t *ffts_init_nd(int rank, size_t *Ns, int sign); -ffts_plan_t *ffts_init_2d(size_t N1, size_t N2, int sign); - -#endif +ffts_plan_t *ffts_init_nd(int rank, size_t *Ns, int sign); +ffts_plan_t *ffts_init_2d(size_t N1, size_t N2, int sign); -// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3: +#endif /* FFTS_ND_H */ diff --git a/src/ffts_real.c b/src/ffts_real.c index 97ff942..77c57a0 100644 --- a/src/ffts_real.c +++ b/src/ffts_real.c @@ -1,227 +1,281 @@ /* - This file is part of FFTS -- The Fastest Fourier Transform in the South - - Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com> - Copyright (c) 2012, The University of Waikato - - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - * Neither the name of the organization nor the - names of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY - DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +This file is part of FFTS -- The Fastest Fourier Transform in the South + +Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com> +Copyright (c) 2012, The University of Waikato + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: +* Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +* Neither the name of the organization nor the +names of its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "ffts_real.h" -void ffts_free_1d_real(ffts_plan_t *p) { - ffts_free(p->plans[0]); - free(p->A); - free(p->B); - free(p->plans); - free(p->buf); - free(p); -} +#ifdef HAVE_NEON +#include <arm_neon.h> +#endif + +#ifdef HAVE_SSE +#include <xmmintrin.h> +#endif + +static void ffts_free_1d_real(ffts_plan_t *p) +{ + if (p->B) { + ffts_aligned_free(p->B); + } -void ffts_execute_1d_real(ffts_plan_t *p, const void *vin, void *vout) { - float *out = (float *)vout; - float *buf = (float *)p->buf; - float *A = p->A; - float *B = p->B; + if (p->A) { + ffts_aligned_free(p->A); + } - p->plans[0]->transform(p->plans[0], vin, buf); + if (p->buf) { + ffts_aligned_free(p->buf); + } - size_t N = p->N; - buf[N] = buf[0]; - buf[N+1] = buf[1]; + if (p->plans) { + ffts_free(p->plans[0]); + free(p->plans); + } - float *p_buf0 = buf; - float *p_buf1 = buf + N - 2; - float *p_out = out; + free(p); +} + +static void ffts_execute_1d_real(ffts_plan_t *p, const void *vin, void *vout) +{ + float *out = (float*) vout; + float *buf = (float*) p->buf; + float *A = p->A; + float *B = p->B; + size_t N = p->N; - size_t i; #ifdef __ARM_NEON__ - for(i=0;i<N/2;i+=2) { - __asm__ __volatile__ ( - "vld1.32 {q8}, [%[pa]]!\n\t" - "vld1.32 {q9}, [%[pb]]!\n\t" - "vld1.32 {q10}, [%[buf0]]!\n\t" - "vld1.32 {q11}, [%[buf1]]\n\t" - "sub %[buf1], %[buf1], #16\n\t" - - "vdup.32 d26, d16[1]\n\t" - "vdup.32 d27, d17[1]\n\t" - "vdup.32 d24, d16[0]\n\t" - "vdup.32 d25, d17[0]\n\t" - - "vdup.32 d30, d23[1]\n\t" - "vdup.32 d31, d22[1]\n\t" - "vdup.32 d28, d23[0]\n\t" - "vdup.32 d29, d22[0]\n\t" - - "vmul.f32 q13, q13, q10\n\t" - "vmul.f32 q15, q15, q9\n\t" - "vmul.f32 q12, q12, q10\n\t" - "vmul.f32 q14, q14, q9\n\t" - "vrev64.f32 q13, q13\n\t" - "vrev64.f32 q15, q15\n\t" - - "vtrn.32 d26, d27\n\t" - "vtrn.32 d30, d31\n\t" - "vneg.f32 d26, d26\n\t" - "vneg.f32 d31, d31\n\t" - "vtrn.32 d26, d27\n\t" - "vtrn.32 d30, d31\n\t" - - "vadd.f32 q12, q12, q14\n\t" - "vadd.f32 q13, q13, q15\n\t" - "vadd.f32 q12, q12, q13\n\t" - "vst1.32 {q12}, [%[pout]]!\n\t" - : [pa] "+r" (A), [pb] "+r" (B), [buf0] "+r" (p_buf0), [buf1] "+r" (p_buf1), - [pout] "+r" (p_out) - : - : "memory", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); -#else - for(i=0;i<N/2;i++) { - out[2*i] = buf[2*i]*A[2*i] - buf[2*i+1]*A[2*i+1] + buf[N-2*i]*B[2*i] + buf[N-2*i+1]*B[2*i+1]; - out[2*i+1] = buf[2*i+1]*A[2*i] + buf[2*i]*A[2*i+1] + buf[N-2*i]*B[2*i+1] - buf[N-2*i+1]*B[2*i]; + float *p_buf0 = buf; + float *p_buf1 = buf + N - 2; + float *p_out = out; +#endif -// out[2*N-2*i] = out[2*i]; -// out[2*N-2*i+1] = -out[2*i+1]; + size_t i; -#endif - } + p->plans[0]->transform(p->plans[0], vin, buf); - out[N] = buf[0] - buf[1]; - out[N+1] = 0.0f; + buf[N + 0] = buf[0]; + buf[N + 1] = buf[1]; +#ifdef __ARM_NEON__ + for (i = 0; i < N/2; i += 2) { + __asm__ __volatile__ ( + "vld1.32 {q8}, [%[pa]]!\n\t" + "vld1.32 {q9}, [%[pb]]!\n\t" + "vld1.32 {q10}, [%[buf0]]!\n\t" + "vld1.32 {q11}, [%[buf1]]\n\t" + "sub %[buf1], %[buf1], #16\n\t" + + "vdup.32 d26, d16[1]\n\t" + "vdup.32 d27, d17[1]\n\t" + "vdup.32 d24, d16[0]\n\t" + "vdup.32 d25, d17[0]\n\t" + + "vdup.32 d30, d23[1]\n\t" + "vdup.32 d31, d22[1]\n\t" + "vdup.32 d28, d23[0]\n\t" + "vdup.32 d29, d22[0]\n\t" + + "vmul.f32 q13, q13, q10\n\t" + "vmul.f32 q15, q15, q9\n\t" + "vmul.f32 q12, q12, q10\n\t" + "vmul.f32 q14, q14, q9\n\t" + "vrev64.f32 q13, q13\n\t" + "vrev64.f32 q15, q15\n\t" + + "vtrn.32 d26, d27\n\t" + "vtrn.32 d30, d31\n\t" + "vneg.f32 d26, d26\n\t" + "vneg.f32 d31, d31\n\t" + "vtrn.32 d26, d27\n\t" + "vtrn.32 d30, d31\n\t" + + "vadd.f32 q12, q12, q14\n\t" + "vadd.f32 q13, q13, q15\n\t" + "vadd.f32 q12, q12, q13\n\t" + "vst1.32 {q12}, [%[pout]]!\n\t" + : [pa] "+r" (A), [pb] "+r" (B), [buf0] "+r" (p_buf0), [buf1] "+r" (p_buf1), + [pout] "+r" (p_out) + : + : "memory", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); + } +#else + for (i = 0; i < N/2; i++) { + out[2*i + 0] = buf[2*i + 0] * A[2*i] - buf[2*i + 1] * A[2*i + 1] + buf[N - 2*i] * B[2*i + 0] + buf[N - 2*i + 1] * B[2*i + 1]; + out[2*i + 1] = buf[2*i + 1] * A[2*i] + buf[2*i + 0] * A[2*i + 1] + buf[N - 2*i] * B[2*i + 1] - buf[N - 2*i + 1] * B[2*i + 0]; + + /* out[2*N-2*i+0] = out[2*i+0]; + out[2*N-2*i+1] = -out[2*i+1]; + */ + } +#endif + + out[N + 0] = buf[0] - buf[1]; + out[N + 1] = 0.0f; } -void ffts_execute_1d_real_inv(ffts_plan_t *p, const void *vin, void *vout) { - float *out = (float *)vout; - float *in = (float *)vin; - float *buf = (float *)p->buf; - float *A = p->A; - float *B = p->B; - size_t N = p->N; +static void ffts_execute_1d_real_inv(ffts_plan_t *p, const void *vin, void *vout) +{ + float *out = (float*) vout; + float *in = (float*) vin; + float *buf = (float*) p->buf; + float *A = p->A; + float *B = p->B; + size_t N = p->N; - float *p_buf0 = in; - float *p_buf1 = in + N - 2; +#ifdef __ARM_NEON__ + float *p_buf0 = in; + float *p_buf1 = in + N - 2; + float *p_out = buf; +#endif - float *p_out = buf; + size_t i; - size_t i; #ifdef __ARM_NEON__ - for(i=0;i<N/2;i+=2) { - __asm__ __volatile__ ( - "vld1.32 {q8}, [%[pa]]!\n\t" - "vld1.32 {q9}, [%[pb]]!\n\t" - "vld1.32 {q10}, [%[buf0]]!\n\t" - "vld1.32 {q11}, [%[buf1]]\n\t" - "sub %[buf1], %[buf1], #16\n\t" - - "vdup.32 d26, d16[1]\n\t" - "vdup.32 d27, d17[1]\n\t" - "vdup.32 d24, d16[0]\n\t" - "vdup.32 d25, d17[0]\n\t" - - "vdup.32 d30, d23[1]\n\t" - "vdup.32 d31, d22[1]\n\t" - "vdup.32 d28, d23[0]\n\t" - "vdup.32 d29, d22[0]\n\t" - - "vmul.f32 q13, q13, q10\n\t" - "vmul.f32 q15, q15, q9\n\t" - "vmul.f32 q12, q12, q10\n\t" - "vmul.f32 q14, q14, q9\n\t" - "vrev64.f32 q13, q13\n\t" - "vrev64.f32 q15, q15\n\t" - - "vtrn.32 d26, d27\n\t" - "vtrn.32 d28, d29\n\t" - "vneg.f32 d27, d27\n\t" - "vneg.f32 d29, d29\n\t" - "vtrn.32 d26, d27\n\t" - "vtrn.32 d28, d29\n\t" - - "vadd.f32 q12, q12, q14\n\t" - "vsub.f32 q13, q13, q15\n\t" - "vadd.f32 q12, q12, q13\n\t" - "vst1.32 {q12}, [%[pout]]!\n\t" - : [pa] "+r" (A), [pb] "+r" (B), [buf0] "+r" (p_buf0), [buf1] "+r" (p_buf1), - [pout] "+r" (p_out) - : - : "memory", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); + for (i = 0; i < N/2; i += 2) { + __asm__ __volatile__ ( + "vld1.32 {q8}, [%[pa]]!\n\t" + "vld1.32 {q9}, [%[pb]]!\n\t" + "vld1.32 {q10}, [%[buf0]]!\n\t" + "vld1.32 {q11}, [%[buf1]]\n\t" + "sub %[buf1], %[buf1], #16\n\t" + + "vdup.32 d26, d16[1]\n\t" + "vdup.32 d27, d17[1]\n\t" + "vdup.32 d24, d16[0]\n\t" + "vdup.32 d25, d17[0]\n\t" + + "vdup.32 d30, d23[1]\n\t" + "vdup.32 d31, d22[1]\n\t" + "vdup.32 d28, d23[0]\n\t" + "vdup.32 d29, d22[0]\n\t" + + "vmul.f32 q13, q13, q10\n\t" + "vmul.f32 q15, q15, q9\n\t" + "vmul.f32 q12, q12, q10\n\t" + "vmul.f32 q14, q14, q9\n\t" + "vrev64.f32 q13, q13\n\t" + "vrev64.f32 q15, q15\n\t" + + "vtrn.32 d26, d27\n\t" + "vtrn.32 d28, d29\n\t" + "vneg.f32 d27, d27\n\t" + "vneg.f32 d29, d29\n\t" + "vtrn.32 d26, d27\n\t" + "vtrn.32 d28, d29\n\t" + + "vadd.f32 q12, q12, q14\n\t" + "vsub.f32 q13, q13, q15\n\t" + "vadd.f32 q12, q12, q13\n\t" + "vst1.32 {q12}, [%[pout]]!\n\t" + : [pa] "+r" (A), [pb] "+r" (B), [buf0] "+r" (p_buf0), [buf1] "+r" (p_buf1), + [pout] "+r" (p_out) + : + : "memory", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); + } #else - for(i=0;i<N/2;i++) { - buf[2*i] = in[2*i]*A[2*i] + in[2*i+1]*A[2*i+1] + in[N-2*i]*B[2*i] - in[N-2*i+1]*B[2*i+1]; - buf[2*i+1] = in[2*i+1]*A[2*i] - in[2*i]*A[2*i+1] - in[N-2*i]*B[2*i+1] - in[N-2*i+1]*B[2*i]; + for (i = 0; i < N/2; i++) { + buf[2*i + 0] = in[2*i + 0] * A[2*i] + in[2*i + 1] * A[2*i + 1] + in[N - 2*i] * B[2*i + 0] - in[N - 2*i + 1] * B[2*i + 1]; + buf[2*i + 1] = in[2*i + 1] * A[2*i] - in[2*i + 0] * A[2*i + 1] - in[N - 2*i] * B[2*i + 1] - in[N - 2*i + 1] * B[2*i + 0]; + } #endif - } - p->plans[0]->transform(p->plans[0], buf, out); - -} - -ffts_plan_t *ffts_init_1d_real(size_t N, int sign) { - ffts_plan_t *p = malloc(sizeof(ffts_plan_t)); - - if(sign < 0) p->transform = &ffts_execute_1d_real; - else p->transform = &ffts_execute_1d_real_inv; - - p->destroy = &ffts_free_1d_real; - p->N = N; - p->rank = 1; - p->plans = malloc(sizeof(ffts_plan_t **) * 1); - - p->plans[0] = ffts_init_1d(N/2, sign); - - p->buf = valloc(sizeof(float) * 2 * ((N/2) + 1)); - - p->A = valloc(sizeof(float) * N); - p->B = valloc(sizeof(float) * N); - - if(sign < 0) { - int i; - for (i = 0; i < N/2; i++) { - p->A[2 * i] = 0.5 * (1.0 - sin (2.0f * PI / (double) (N) * (double) i)); - p->A[2 * i + 1] = 0.5 * (-1.0 * cos (2.0f * PI / (double) (N) * (double) i)); - p->B[2 * i] = 0.5 * (1.0 + sin (2.0f * PI / (double) (N) * (double) i)); - p->B[2 * i + 1] = 0.5 * (1.0 * cos (2.0f * PI / (double) (N) * (double) i)); - } - }else{ - int i; - for (i = 0; i < N/2; i++) { - p->A[2 * i] = 1.0 * (1.0 - sin (2.0f * PI / (double) (N) * (double) i)); - p->A[2 * i + 1] = 1.0 * (-1.0 * cos (2.0f * PI / (double) (N) * (double) i)); - p->B[2 * i] = 1.0 * (1.0 + sin (2.0f * PI / (double) (N) * (double) i)); - p->B[2 * i + 1] = 1.0 * (1.0 * cos (2.0f * PI / (double) (N) * (double) i)); - } - } - - return p; + p->plans[0]->transform(p->plans[0], buf, out); } - -// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3: +ffts_plan_t *ffts_init_1d_real(size_t N, int sign) +{ + ffts_plan_t *p; + size_t i; + + p = (ffts_plan_t*) calloc(1, sizeof(*p)); + if (!p) { + return NULL; + } + + if (sign < 0) { + p->transform = &ffts_execute_1d_real; + } else { + p->transform = &ffts_execute_1d_real_inv; + } + + p->destroy = &ffts_free_1d_real; + p->N = N; + p->rank = 1; + + p->plans = (ffts_plan_t**) malloc(1 * sizeof(*p->plans)); + if (!p->plans) { + goto cleanup; + } + + p->plans[0] = ffts_init_1d(N/2, sign); + if (!p->plans[0]) { + goto cleanup; + } + + p->buf = ffts_aligned_malloc(2 * ((N/2) + 1) * sizeof(float)); + if (!p->buf) { + goto cleanup; + } + + p->A = (float*) ffts_aligned_malloc(N * sizeof(float)); + if (!p->A) { + goto cleanup; + } + + p->B = (float*) ffts_aligned_malloc(N * sizeof(float)); + if (!p->B) { + goto cleanup; + } + + if (sign < 0) { + for (i = 0; i < N/2; i++) { + p->A[2 * i + 0] = 0.5 * ( 1.0 - sin(2.0 * M_PI / (double) N * (double) i)); + p->A[2 * i + 1] = 0.5 * (-1.0 * cos(2.0 * M_PI / (double) N * (double) i)); + p->B[2 * i + 0] = 0.5 * ( 1.0 + sin(2.0 * M_PI / (double) N * (double) i)); + p->B[2 * i + 1] = 0.5 * ( 1.0 * cos(2.0 * M_PI / (double) N * (double) i)); + } + } else { + for (i = 0; i < N/2; i++) { + p->A[2 * i + 0] = 1.0 * ( 1.0 - sin(2.0 * M_PI / (double) N * (double) i)); + p->A[2 * i + 1] = 1.0 * (-1.0 * cos(2.0 * M_PI / (double) N * (double) i)); + p->B[2 * i + 0] = 1.0 * ( 1.0 + sin(2.0 * M_PI / (double) N * (double) i)); + p->B[2 * i + 1] = 1.0 * ( 1.0 * cos(2.0 * M_PI / (double) N * (double) i)); + } + } + + return p; + +cleanup: + ffts_free_1d_real(p); + return NULL; +}
\ No newline at end of file diff --git a/src/ffts_real.h b/src/ffts_real.h index d3f5316..81ca80f 100644 --- a/src/ffts_real.h +++ b/src/ffts_real.h @@ -1,10 +1,10 @@ /* - + This file is part of FFTS -- The Fastest Fourier Transform in the South - + Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com> - Copyright (c) 2012, The University of Waikato - + Copyright (c) 2012, The University of Waikato + All rights reserved. Redistribution and use in source and binary forms, with or without @@ -31,24 +31,16 @@ */ -#ifndef __FFTS_REAL_H__ -#define __FFTS_REAL_H__ +#ifndef FFTS_REAL_H +#define FFTS_REAL_H -#include <stdint.h> -#include <stddef.h> -#include <stdio.h> +#if defined (_MSC_VER) && (_MSC_VER >= 1020) +#pragma once +#endif #include "ffts.h" - -#ifdef HAVE_NEON - #include <arm_neon.h> -#endif -#ifdef HAVE_SSE - #include <xmmintrin.h> -#endif +#include <stddef.h> ffts_plan_t *ffts_init_1d_real(size_t N, int sign); -#endif - -// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3: +#endif /* FFTS_REAL_H */ diff --git a/src/ffts_real_nd.c b/src/ffts_real_nd.c index 151b72a..05bcc9c 100644 --- a/src/ffts_real_nd.c +++ b/src/ffts_real_nd.c @@ -32,199 +32,305 @@ */ #include "ffts_real_nd.h" +#include "ffts_real.h" #ifdef __ARM_NEON__ #include "neon.h" #endif -void ffts_free_nd_real(ffts_plan_t *p) { +#ifdef HAVE_NEON +#include <arm_neon.h> +#endif + +#ifdef HAVE_SSE +#include <xmmintrin.h> +#endif + +#include <stdio.h> + +static void ffts_free_nd_real(ffts_plan_t *p) +{ + if (p->plans) { + int i; + + for (i = 0; i < p->rank; i++) { + ffts_plan_t *plan = p->plans[i]; + + if (plan) { + int j; + + for (j = i + 1; j < p->rank; j++) { + if (plan == p->plans[j]) { + p->plans[j] = NULL; + } + } - int i; - for(i=0;i<p->rank;i++) { + ffts_free(plan); + } + } - ffts_plan_t *x = p->plans[i]; + free(p->plans); + } - int k; - for(k=i+1;k<p->rank;k++) { - if(x == p->plans[k]) p->plans[k] = NULL; - } + if (p->transpose_buf) { + ffts_aligned_free(p->transpose_buf); + } - if(x) ffts_free(x); - } + if (p->buf) { + ffts_aligned_free(p->buf); + } - free(p->Ns); - free(p->Ms); - free(p->plans); - free(p->buf); - free(p->transpose_buf); - free(p); + if (p->Ns) { + free(p->Ns); + } + + if (p->Ms) { + free(p->Ms); + } + + free(p); } -void ffts_scalar_transpose(uint64_t *src, uint64_t *dst, int w, int h, uint64_t *buf) { - int const bw = 1; - int const bh = 8; - int i = 0, j = 0; - for (; i <= h-bh; i += bh) { - for (j = 0; j <= w-bw; j += bw) { - uint64_t const * ib = &src[w*i + j]; - uint64_t * ob = &dst[h*j + i]; - - uint64_t s_0_0 = ib[0*w+0]; - uint64_t s_1_0 = ib[1*w+0]; - uint64_t s_2_0 = ib[2*w+0]; - uint64_t s_3_0 = ib[3*w+0]; - uint64_t s_4_0 = ib[4*w+0]; - uint64_t s_5_0 = ib[5*w+0]; - uint64_t s_6_0 = ib[6*w+0]; - uint64_t s_7_0 = ib[7*w+0]; - - ob[0*h+0] = s_0_0; - ob[0*h+1] = s_1_0; - ob[0*h+2] = s_2_0; - ob[0*h+3] = s_3_0; - ob[0*h+4] = s_4_0; - ob[0*h+5] = s_5_0; - ob[0*h+6] = s_6_0; - ob[0*h+7] = s_7_0; - } - } - if (i < h) { - for (int i1 = 0; i1 < w; i1++) { - for (int j = i; j < h; j++) { - dst[i1*h + j] = src[j*w + i1]; - } - } - } - if (j < w) { - for (int i = j; i < w; i++) { - for (int j1 = 0; j1 < h; j1++) { - dst[i*h + j1] = src[j1*w + i]; - } - } - } +static void ffts_scalar_transpose(uint64_t *src, uint64_t *dst, int w, int h, uint64_t *buf) +{ + const int bw = 1; + const int bh = 8; + int i = 0, j = 0; + + for (; i <= h - bh; i += bh) { + for (j = 0; j <= w - bw; j += bw) { + uint64_t const *ib = &src[w*i + j]; + uint64_t *ob = &dst[h*j + i]; + + uint64_t s_0_0 = ib[0*w + 0]; + uint64_t s_1_0 = ib[1*w + 0]; + uint64_t s_2_0 = ib[2*w + 0]; + uint64_t s_3_0 = ib[3*w + 0]; + uint64_t s_4_0 = ib[4*w + 0]; + uint64_t s_5_0 = ib[5*w + 0]; + uint64_t s_6_0 = ib[6*w + 0]; + uint64_t s_7_0 = ib[7*w + 0]; + + ob[0*h + 0] = s_0_0; + ob[0*h + 1] = s_1_0; + ob[0*h + 2] = s_2_0; + ob[0*h + 3] = s_3_0; + ob[0*h + 4] = s_4_0; + ob[0*h + 5] = s_5_0; + ob[0*h + 6] = s_6_0; + ob[0*h + 7] = s_7_0; + } + } + + if (i < h) { + int i1; + + for (i1 = 0; i1 < w; i1++) { + for (j = i; j < h; j++) { + dst[i1*h + j] = src[j*w + i1]; + } + } + } + + if (j < w) { + int j1; + + for (i = j; i < w; i++) { + for (j1 = 0; j1 < h; j1++) { + dst[i*h + j1] = src[j1*w + i]; + } + } + } } -void ffts_execute_nd_real(ffts_plan_t *p, const void * in, void * out) { +static void ffts_execute_nd_real(ffts_plan_t *p, const void *in, void *out) +{ + const size_t Ms0 = p->Ms[0]; + const size_t Ns0 = p->Ns[0]; + + uint32_t *din = (uint32_t*) in; + uint64_t *buf = p->buf; + uint64_t *dout = (uint64_t*) out; + uint64_t *transpose_buf = (uint64_t*) p->transpose_buf; + + ffts_plan_t *plan; + size_t i, j; - uint32_t *din = (uint32_t *)in; - uint64_t *buf = p->buf; - uint64_t *dout = (uint64_t *)out; + plan = p->plans[0]; + for (i = 0; i < Ns0; i++) { + plan->transform(plan, din + (i * Ms0), buf + (i * (Ms0 / 2 + 1))); + } - size_t i,j; - for(i=0;i<p->Ns[0];i++) { - p->plans[0]->transform(p->plans[0], din + (i * p->Ms[0]), buf + (i * (p->Ms[0] / 2 + 1))); - } - ffts_scalar_transpose(buf, dout, p->Ms[0] / 2 + 1, p->Ns[0], p->transpose_buf); + ffts_scalar_transpose(buf, dout, Ms0 / 2 + 1, Ns0, transpose_buf); - for(i=1;i<p->rank;i++) { - for(j=0;j<p->Ns[i];j++) { - p->plans[i]->transform(p->plans[i], dout + (j * p->Ms[i]), buf + (j * p->Ms[i])); - } - ffts_scalar_transpose(buf, dout, p->Ms[i], p->Ns[i], p->transpose_buf); - } + for (i = 1; i < p->rank; i++) { + const size_t Ms = p->Ms[i]; + const size_t Ns = p->Ns[i]; + + plan = p->plans[i]; + + for (j = 0; j < Ns; j++) { + plan->transform(plan, dout + (j * Ms), buf + (j * Ms)); + } + + ffts_scalar_transpose(buf, dout, Ms, Ns, transpose_buf); + } } -void ffts_execute_nd_real_inv(ffts_plan_t *p, const void * in, void * out) { +static void ffts_execute_nd_real_inv(ffts_plan_t *p, const void *in, void *out) +{ + const size_t Ms0 = p->Ms[0]; + const size_t Ms1 = p->Ms[1]; + const size_t Ns0 = p->Ns[0]; + const size_t Ns1 = p->Ns[1]; + + uint64_t *din = (uint64_t*) in; + uint64_t *buf = p->buf; + uint64_t *buf2; + uint64_t *transpose_buf = (uint64_t*) p->transpose_buf; + float *doutr = (float*) out; - uint64_t *din = (uint64_t *)in; - uint64_t *buf = p->buf; - uint64_t *buf2; - uint64_t *dout = (uint64_t *)out; - size_t vol = 1; + ffts_plan_t *plan; + size_t vol; - float *bufr = (float *)(p->buf); - float *doutr = (float *)out; + size_t i, j; - size_t i,j; + vol = p->Ns[0]; + for (i = 1; i < p->rank; i++) { + vol *= p->Ns[i]; + } - for(i=0;i<p->rank;i++) { - vol *= p->Ns[i]; - } + buf2 = buf + vol; - buf2 = buf + vol; + ffts_scalar_transpose(din, buf, Ms0, Ns0, transpose_buf); - ffts_scalar_transpose(din, buf, p->Ms[0], p->Ns[0], p->transpose_buf); + plan = p->plans[0]; + for (i = 0; i < Ms0; i++) { + plan->transform(plan, buf + (i * Ns0), buf2 + (i * Ns0)); + } - for(i=0;i<p->Ms[0];i++) { - p->plans[0]->transform(p->plans[0], buf + (i * p->Ns[0]), buf2 + (i * p->Ns[0])); - } + ffts_scalar_transpose(buf2, buf, Ns0, Ms0, transpose_buf); - ffts_scalar_transpose(buf2, buf, p->Ns[0], p->Ms[0], p->transpose_buf); - for(j=0;j<p->Ms[1];j++) { - p->plans[1]->transform(p->plans[1], buf + (j * (p->Ms[0])), &doutr[j * p->Ns[1]]); - } + plan = p->plans[1]; + for (j = 0; j < Ms1; j++) { + plan->transform(plan, buf + (j * Ms0), &doutr[j * Ns1]); + } } -ffts_plan_t *ffts_init_nd_real(int rank, size_t *Ns, int sign) { - size_t vol = 1; - size_t bufsize; - - ffts_plan_t *p = malloc(sizeof(ffts_plan_t)); - - if(sign < 0) p->transform = &ffts_execute_nd_real; - else p->transform = &ffts_execute_nd_real_inv; - - p->destroy = &ffts_free_nd_real; - - p->rank = rank; - p->Ns = malloc(sizeof(size_t) * rank); - p->Ms = malloc(sizeof(size_t) * rank); - p->plans = malloc(sizeof(ffts_plan_t **) * rank); - int i; - for(i=0;i<rank;i++) { - p->Ns[i] = Ns[i]; - vol *= Ns[i]; - } - - //There is probably a prettier way of doing this, but it works.. - if(sign < 0) { - bufsize = 2 * vol; - } - else { - bufsize = 2 * (Ns[0] * ((vol / Ns[0]) / 2 + 1) + vol); - } - - p->buf = valloc(sizeof(float) * bufsize); - - for(i=0;i<rank;i++) { - p->Ms[i] = vol / p->Ns[i]; - - p->plans[i] = NULL; - int k; - - if(sign < 0) { - for(k=1;k<i;k++) { - if(p->Ms[k] == p->Ms[i]) p->plans[i] = p->plans[k]; - } - if(!i) p->plans[i] = ffts_init_1d_real(p->Ms[i], sign); - else if(!p->plans[i]) p->plans[i] = ffts_init_1d(p->Ms[i], sign); - }else{ - for(k=0;k<i;k++) { - if(p->Ns[k] == p->Ns[i]) p->plans[i] = p->plans[k]; - } - if(i==rank-1) p->plans[i] = ffts_init_1d_real(p->Ns[i], sign); - else if(!p->plans[i]) p->plans[i] = ffts_init_1d(p->Ns[i], sign); - } - } - if(sign < 0) { - for(i=1;i<rank;i++) { - p->Ns[i] = p->Ns[i] / 2 + 1; - } - }else{ - for(i=0;i<rank-1;i++) { - p->Ms[i] = p->Ms[i] / 2 + 1; - } - } - - p->transpose_buf = valloc(sizeof(float) * 2 * 8 * 8); - return p; +ffts_plan_t *ffts_init_nd_real(int rank, size_t *Ns, int sign) +{ + int i; + size_t vol = 1; + size_t bufsize; + ffts_plan_t *p; + + p = (ffts_plan_t*) calloc(1, sizeof(*p)); + if (!p) { + return NULL; + } + + if (sign < 0) { + p->transform = &ffts_execute_nd_real; + } else { + p->transform = &ffts_execute_nd_real_inv; + } + + p->destroy = &ffts_free_nd_real; + p->rank = rank; + + p->Ms = (size_t*) malloc(rank * sizeof(*p->Ms)); + if (!p->Ms) { + goto cleanup; + } + + p->Ns = (size_t*) malloc(rank * sizeof(*p->Ns)); + if (!p->Ns) { + goto cleanup; + } + + for (i = 0; i < rank; i++) { + p->Ns[i] = Ns[i]; + vol *= Ns[i]; + } + + /* there is probably a prettier way of doing this, but it works.. */ + if (sign < 0) { + bufsize = 2 * vol; + } else { + bufsize = 2 * (Ns[0] * ((vol / Ns[0]) / 2 + 1) + vol); + } + + p->buf = ffts_aligned_malloc(bufsize * sizeof(float)); + if (!p->buf) { + goto cleanup; + } + + p->transpose_buf = ffts_aligned_malloc(2 * 8 * 8 * sizeof(float)); + if (!p->transpose_buf) { + goto cleanup; + } + + p->plans = (ffts_plan_t**) calloc(rank, sizeof(*p->plans)); + if (!p->plans) { + goto cleanup; + } + + for (i = 0; i < rank; i++) { + int k; + + p->Ms[i] = vol / p->Ns[i]; + + if (sign < 0) { + if (!i) { + p->plans[i] = ffts_init_1d_real(p->Ms[i], sign); + } else { + for (k = 1; k < i; k++) { + if (p->Ms[k] == p->Ms[i]) { + p->plans[i] = p->plans[k]; + break; + } + } + + if (!p->plans[i]) { + p->plans[i] = ffts_init_1d(p->Ms[i], sign); + p->Ns[i] = p->Ns[i] / 2 + 1; + } + } + } else { + if (i == rank - 1) { + p->plans[i] = ffts_init_1d_real(p->Ns[i], sign); + } else { + for (k = 0; k < i; k++) { + if (p->Ns[k] == p->Ns[i]) { + p->plans[i] = p->plans[k]; + break; + } + } + + if (!p->plans[i]) { + p->plans[i] = ffts_init_1d(p->Ns[i], sign); + p->Ms[i] = p->Ms[i] / 2 + 1; + } + } + } + + if (!p->plans[i]) { + goto cleanup; + } + } + + return p; + +cleanup: + ffts_free_nd_real(p); + return NULL; } +ffts_plan_t *ffts_init_2d_real(size_t N1, size_t N2, int sign) +{ + size_t Ns[2]; -ffts_plan_t *ffts_init_2d_real(size_t N1, size_t N2, int sign) { - size_t Ns[2]; - Ns[0] = N1; - Ns[1] = N2; - return ffts_init_nd_real(2, Ns, sign); + Ns[0] = N1; + Ns[1] = N2; + return ffts_init_nd_real(2, Ns, sign); } -// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3: diff --git a/src/ffts_real_nd.h b/src/ffts_real_nd.h index bc8ed75..d23a002 100644 --- a/src/ffts_real_nd.h +++ b/src/ffts_real_nd.h @@ -1,10 +1,10 @@ /* - + This file is part of FFTS -- The Fastest Fourier Transform in the South - + Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com> - Copyright (c) 2012, The University of Waikato - + Copyright (c) 2012, The University of Waikato + All rights reserved. Redistribution and use in source and binary forms, with or without @@ -31,24 +31,17 @@ */ -#ifndef __FFTS_REAL_ND_H__ -#define __FFTS_REAL_ND_H__ +#ifndef FFTS_REAL_ND_H +#define FFTS_REAL_ND_H -#include <stdint.h> -#include <stddef.h> -#include <stdio.h> +#if defined (_MSC_VER) && (_MSC_VER >= 1020) +#pragma once +#endif -#include "ffts_nd.h" -#include "ffts_real.h" #include "ffts.h" +#include <stddef.h> -#ifdef HAVE_NEON - #include <arm_neon.h> -#endif -#ifdef HAVE_SSE - #include <xmmintrin.h> -#endif - -#endif +ffts_plan_t *ffts_init_nd_real(int rank, size_t *Ns, int sign); +ffts_plan_t *ffts_init_2d_real(size_t N1, size_t N2, int sign); -// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3: +#endif /* FFTS_REAL_ND_H */
\ No newline at end of file diff --git a/src/ffts_small.c b/src/ffts_small.c index e53493c..6f700c6 100644 --- a/src/ffts_small.c +++ b/src/ffts_small.c @@ -1,10 +1,10 @@ /* - + This file is part of FFTS -- The Fastest Fourier Transform in the South - - Copyright (c) 2013, Michael J. Cree <mcree@orcon.net.nz> + + Copyright (c) 2013, Michael J. Cree <mcree@orcon.net.nz> Copyright (c) 2012, 2013, Anthony M. Blake <amb@anthonix.com> - + All rights reserved. Redistribution and use in source and binary forms, with or without @@ -31,127 +31,153 @@ */ -#include "ffts.h" +#include "ffts_small.h" #include "macros.h" #include <stdlib.h> -#define DEBUG(x) - -#include "ffts_small.h" - - void firstpass_16_f(ffts_plan_t * p, const void * in, void * out) +void ffts_firstpass_16_f(ffts_plan_t *p, const void *in, void *out) { - const data_t *din = (const data_t *)in; - data_t *dout = (data_t *)out; - V r0_1,r2_3,r4_5,r6_7,r8_9,r10_11,r12_13,r14_15; - float *LUT8 = p->ws; - - L_4_4(0, din+0,din+16,din+8,din+24,&r0_1,&r2_3,&r8_9,&r10_11); - L_2_4(0, din+4,din+20,din+28,din+12,&r4_5,&r6_7,&r14_15,&r12_13); - K_N(0, VLD(LUT8),VLD(LUT8+4),&r0_1,&r2_3,&r4_5,&r6_7); - K_N(0, VLD(LUT8+8),VLD(LUT8+12),&r0_1,&r4_5,&r8_9,&r12_13); - S_4(r0_1,r4_5,r8_9,r12_13,dout+0,dout+8,dout+16,dout+24); - K_N(0, VLD(LUT8+16),VLD(LUT8+20),&r2_3,&r6_7,&r10_11,&r14_15); - S_4(r2_3,r6_7,r10_11,r14_15,dout+4,dout+12,dout+20,dout+28); + const data_t *din = (const data_t*) in; + data_t *dout = (data_t*) out; + float *LUT8 = (float*) p->ws; + V r0_1, r2_3, r4_5, r6_7, r8_9, r10_11, r12_13, r14_15; + + L_4_4(0, din+0, din+16, din+8, din+24, &r0_1, &r2_3, &r8_9, &r10_11); + L_2_4(0, din+4, din+20, din+28, din+12, &r4_5, &r6_7, &r14_15, &r12_13); + K_N(0, VLD(LUT8), VLD(LUT8+4), &r0_1, &r2_3, &r4_5, &r6_7); + K_N(0, VLD(LUT8+8), VLD(LUT8+12), &r0_1, &r4_5, &r8_9, &r12_13); + S_4(r0_1, r4_5, r8_9, r12_13, dout+0, dout+8, dout+16, dout+24); + K_N(0, VLD(LUT8+16), VLD(LUT8+20), &r2_3, &r6_7, &r10_11, &r14_15); + S_4(r2_3, r6_7, r10_11, r14_15, dout+4, dout+12, dout+20, dout+28); } - void firstpass_16_b(ffts_plan_t * p, const void * in, void * out) +void ffts_firstpass_16_b(ffts_plan_t *p, const void *in, void *out) { - const data_t *din = (const data_t *)in; - data_t *dout = (data_t *)out; - V r0_1,r2_3,r4_5,r6_7,r8_9,r10_11,r12_13,r14_15; - float *LUT8 = p->ws; - - L_4_4(1, din+0,din+16,din+8,din+24,&r0_1,&r2_3,&r8_9,&r10_11); - L_2_4(1, din+4,din+20,din+28,din+12,&r4_5,&r6_7,&r14_15,&r12_13); - K_N(1, VLD(LUT8),VLD(LUT8+4),&r0_1,&r2_3,&r4_5,&r6_7); - K_N(1, VLD(LUT8+8),VLD(LUT8+12),&r0_1,&r4_5,&r8_9,&r12_13); - S_4(r0_1,r4_5,r8_9,r12_13,dout+0,dout+8,dout+16,dout+24); - K_N(1, VLD(LUT8+16),VLD(LUT8+20),&r2_3,&r6_7,&r10_11,&r14_15); - S_4(r2_3,r6_7,r10_11,r14_15,dout+4,dout+12,dout+20,dout+28); -} + const data_t *din = (const data_t*) in; + data_t *dout = (data_t*) out; + float *LUT8 = (float*) p->ws; + V r0_1, r2_3, r4_5, r6_7, r8_9, r10_11, r12_13, r14_15; + L_4_4(1, din+0, din+16, din+8, din+24, &r0_1, &r2_3, &r8_9, &r10_11); + L_2_4(1, din+4, din+20, din+28, din+12, &r4_5, &r6_7, &r14_15, &r12_13); + K_N(1, VLD(LUT8), VLD(LUT8+4), &r0_1, &r2_3, &r4_5, &r6_7); + K_N(1, VLD(LUT8+8), VLD(LUT8+12),&r0_1, &r4_5, &r8_9, &r12_13); + S_4(r0_1, r4_5, r8_9, r12_13, dout+0, dout+8, dout+16, dout+24); + K_N(1, VLD(LUT8+16), VLD(LUT8+20), &r2_3, &r6_7, &r10_11, &r14_15); + S_4(r2_3, r6_7, r10_11, r14_15, dout+4, dout+12, dout+20, dout+28); +} - void firstpass_8_f(ffts_plan_t *p, const void *in, void *out) +void ffts_firstpass_8_f(ffts_plan_t *p, const void *in, void *out) { - const data_t *din = (const data_t *)in; - data_t *dout = (data_t *)out; + const data_t *din = (const data_t*) in; + data_t *dout = (data_t*) out; V r0_1, r2_3, r4_5, r6_7; - float *LUT8 = p->ws + p->ws_is[0]; + float *LUT8 = (float*) p->ws + p->ws_is[0]; L_4_2(0, din, din+8, din+4, din+12, &r0_1, &r2_3, &r4_5, &r6_7); K_N(0, VLD(LUT8), VLD(LUT8+4), &r0_1, &r2_3, &r4_5, &r6_7); - S_4(r0_1,r2_3,r4_5,r6_7,dout+0,dout+4,dout+8,dout+12); + S_4(r0_1, r2_3, r4_5, r6_7, dout+0, dout+4, dout+8, dout+12); } - void firstpass_8_b(ffts_plan_t *p, const void *in, void *out) +void ffts_firstpass_8_b(ffts_plan_t *p, const void *in, void *out) { - const data_t *din = (const data_t *)in; - data_t *dout = (data_t *)out; + const data_t *din = (const data_t*) in; + data_t *dout = (data_t*) out; V r0_1, r2_3, r4_5, r6_7; - float *LUT8 = p->ws + p->ws_is[0]; + float *LUT8 = (float*) p->ws + p->ws_is[0]; L_4_2(1, din, din+8, din+4, din+12, &r0_1, &r2_3, &r4_5, &r6_7); K_N(1, VLD(LUT8), VLD(LUT8+4), &r0_1, &r2_3, &r4_5, &r6_7); - S_4(r0_1,r2_3,r4_5,r6_7,dout+0,dout+4,dout+8,dout+12); + S_4(r0_1, r2_3, r4_5, r6_7, dout+0, dout+4, dout+8, dout+12); } - - void firstpass_4_f(ffts_plan_t *p, const void *in, void *out) +void ffts_firstpass_4_f(ffts_plan_t *p, const void *in, void *out) { - const data_t *din = (const data_t *)in; - data_t *dout = (data_t *)out; + const data_t *din = (const data_t*) in; + data_t *dout = (data_t*) out; cdata_t t0, t1, t2, t3, t4, t5, t6, t7; - t0[0] = din[0]; t0[1] = din[1]; - t1[0] = din[4]; t1[1] = din[5]; - t2[0] = din[2]; t2[1] = din[3]; - t3[0] = din[6]; t3[1] = din[7]; - - t4[0] = t0[0] + t1[0]; t4[1] = t0[1] + t1[1]; - t5[0] = t0[0] - t1[0]; t5[1] = t0[1] - t1[1]; - t6[0] = t2[0] + t3[0]; t6[1] = t2[1] + t3[1]; - t7[0] = t2[0] - t3[0]; t7[1] = t2[1] - t3[1]; - - dout[0] = t4[0] + t6[0]; dout[1] = t4[1] + t6[1]; - dout[4] = t4[0] - t6[0]; dout[5] = t4[1] - t6[1]; - dout[2] = t5[0] + t7[1]; dout[3] = t5[1] - t7[0]; - dout[6] = t5[0] - t7[1]; dout[7] = t5[1] + t7[0]; + + t0[0] = din[0]; + t0[1] = din[1]; + t1[0] = din[4]; + t1[1] = din[5]; + t2[0] = din[2]; + t2[1] = din[3]; + t3[0] = din[6]; + t3[1] = din[7]; + + t4[0] = t0[0] + t1[0]; + t4[1] = t0[1] + t1[1]; + t5[0] = t0[0] - t1[0]; + t5[1] = t0[1] - t1[1]; + t6[0] = t2[0] + t3[0]; + t6[1] = t2[1] + t3[1]; + t7[0] = t2[0] - t3[0]; + t7[1] = t2[1] - t3[1]; + + dout[0] = t4[0] + t6[0]; + dout[1] = t4[1] + t6[1]; + dout[4] = t4[0] - t6[0]; + dout[5] = t4[1] - t6[1]; + dout[2] = t5[0] + t7[1]; + dout[3] = t5[1] - t7[0]; + dout[6] = t5[0] - t7[1]; + dout[7] = t5[1] + t7[0]; } - void firstpass_4_b(ffts_plan_t *p, const void *in, void *out) +void ffts_firstpass_4_b(ffts_plan_t *p, const void *in, void *out) { - const data_t *din = (const data_t *)in; - data_t *dout = (data_t *)out; + const data_t *din = (const data_t*) in; + data_t *dout = (data_t*) out; cdata_t t0, t1, t2, t3, t4, t5, t6, t7; - t0[0] = din[0]; t0[1] = din[1]; - t1[0] = din[4]; t1[1] = din[5]; - t2[0] = din[2]; t2[1] = din[3]; - t3[0] = din[6]; t3[1] = din[7]; - - t4[0] = t0[0] + t1[0]; t4[1] = t0[1] + t1[1]; - t5[0] = t0[0] - t1[0]; t5[1] = t0[1] - t1[1]; - t6[0] = t2[0] + t3[0]; t6[1] = t2[1] + t3[1]; - t7[0] = t2[0] - t3[0]; t7[1] = t2[1] - t3[1]; - - dout[0] = t4[0] + t6[0]; dout[1] = t4[1] + t6[1]; - dout[4] = t4[0] - t6[0]; dout[5] = t4[1] - t6[1]; - dout[2] = t5[0] - t7[1]; dout[3] = t5[1] + t7[0]; - dout[6] = t5[0] + t7[1]; dout[7] = t5[1] - t7[0]; + + t0[0] = din[0]; + t0[1] = din[1]; + t1[0] = din[4]; + t1[1] = din[5]; + t2[0] = din[2]; + t2[1] = din[3]; + t3[0] = din[6]; + t3[1] = din[7]; + + t4[0] = t0[0] + t1[0]; + t4[1] = t0[1] + t1[1]; + t5[0] = t0[0] - t1[0]; + t5[1] = t0[1] - t1[1]; + t6[0] = t2[0] + t3[0]; + t6[1] = t2[1] + t3[1]; + t7[0] = t2[0] - t3[0]; + t7[1] = t2[1] - t3[1]; + + dout[0] = t4[0] + t6[0]; + dout[1] = t4[1] + t6[1]; + dout[4] = t4[0] - t6[0]; + dout[5] = t4[1] - t6[1]; + dout[2] = t5[0] - t7[1]; + dout[3] = t5[1] + t7[0]; + dout[6] = t5[0] + t7[1]; + dout[7] = t5[1] - t7[0]; } - void firstpass_2(ffts_plan_t *p, const void *in, void *out) +void ffts_firstpass_2(ffts_plan_t *p, const void *in, void *out) { - const data_t *din = (const data_t *)in; - data_t *dout = (data_t *)out; - cdata_t t0, t1, r0,r1; - t0[0] = din[0]; t0[1] = din[1]; - t1[0] = din[2]; t1[1] = din[3]; + const data_t *din = (const data_t*) in; + data_t *dout = (data_t*) out; + cdata_t t0, t1, r0, r1; + + t0[0] = din[0]; + t0[1] = din[1]; + t1[0] = din[2]; + t1[1] = din[3]; + r0[0] = t0[0] + t1[0]; r0[1] = t0[1] + t1[1]; r1[0] = t0[0] - t1[0]; r1[1] = t0[1] - t1[1]; - dout[0] = r0[0]; dout[1] = r0[1]; - dout[2] = r1[0]; dout[3] = r1[1]; -} -// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3: + + dout[0] = r0[0]; + dout[1] = r0[1]; + dout[2] = r1[0]; + dout[3] = r1[1]; +}
\ No newline at end of file diff --git a/src/ffts_small.h b/src/ffts_small.h index 683537a..5ae48cc 100644 --- a/src/ffts_small.h +++ b/src/ffts_small.h @@ -1,14 +1,14 @@ -#ifndef __FFTS_SMALL_H__ -#define __FFTS_SMALL_H__ +#ifndef FFTS_SMALL_H +#define FFTS_SMALL_H +#include "ffts.h" -void firstpass_16_f(ffts_plan_t * p, const void * in, void * out); -void firstpass_16_b(ffts_plan_t * p, const void * in, void * out); -void firstpass_8_f(ffts_plan_t * p, const void * in, void * out); -void firstpass_8_b(ffts_plan_t * p, const void * in, void * out); -void firstpass_4_f(ffts_plan_t * p, const void * in, void * out); -void firstpass_4_b(ffts_plan_t * p, const void * in, void * out); -void firstpass_2(ffts_plan_t * p, const void * in, void * out); +void ffts_firstpass_16_f(ffts_plan_t *p, const void *in, void *out); +void ffts_firstpass_16_b(ffts_plan_t *p, const void *in, void *out); +void ffts_firstpass_8_f(ffts_plan_t *p, const void *in, void *out); +void ffts_firstpass_8_b(ffts_plan_t *p, const void *in, void *out); +void ffts_firstpass_4_f(ffts_plan_t *p, const void *in, void *out); +void ffts_firstpass_4_b(ffts_plan_t *p, const void *in, void *out); +void ffts_firstpass_2(ffts_plan_t *p, const void *in, void *out); -#endif -// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3: +#endif /* FFTS_SMALL_H */ |