From 835c5ab5b3d9f3104959dc6722b4bad600eae8fe Mon Sep 17 00:00:00 2001 From: Jukka Ojanen Date: Thu, 12 Mar 2015 18:03:00 +0200 Subject: Rename vector V as V4SF; vector of 4 single precision floats. Rename all vector V macros accordingly. Redefine ffts_constants as ffts_constants_32f and ffts_constants_64f. --- src/ffts.c | 66 +++--- src/ffts_small.c | 60 ++--- src/ffts_static.c | 635 ++++++++++++++++++++++++++++++----------------------- src/ffts_static.h | 63 +++--- src/macros-alpha.h | 167 ++++++++------ src/macros-neon.h | 153 ++++++------- src/macros-sse.h | 125 ++++++----- src/macros.h | 200 +++++++++-------- 8 files changed, 796 insertions(+), 673 deletions(-) diff --git a/src/ffts.c b/src/ffts.c index 94d6f1b..fd0b716 100644 --- a/src/ffts.c +++ b/src/ffts.c @@ -203,7 +203,7 @@ void ffts_free_1d(ffts_plan_t *p) static int ffts_generate_luts(ffts_plan_t *p, size_t N, size_t leaf_N, int sign) { - V MULI_SIGN; + V4SF MULI_SIGN; int hardcoded; size_t lut_size; size_t n_luts; @@ -212,9 +212,9 @@ static int ffts_generate_luts(ffts_plan_t *p, size_t N, size_t leaf_N, int sign) size_t n; if (sign < 0) { - MULI_SIGN = VLIT4(-0.0f, 0.0f, -0.0f, 0.0f); + MULI_SIGN = V4SF_LIT4(-0.0f, 0.0f, -0.0f, 0.0f); } else { - MULI_SIGN = VLIT4(0.0f, -0.0f, 0.0f, -0.0f); + MULI_SIGN = V4SF_LIT4(0.0f, -0.0f, 0.0f, -0.0f); } /* LUTS */ @@ -348,13 +348,13 @@ static int ffts_generate_luts(ffts_plan_t *p, size_t N, size_t leaf_N, int sign) #else //w = FFTS_MALLOC(n/4 * 2 * sizeof(ffts_cpx_32f), 32); for (j = 0; j < n/4; j += 2) { - V re, im, temp0; - temp0 = VLD(fw0 + j*2); - re = VDUPRE(temp0); - im = VDUPIM(temp0); - im = VXOR(im, MULI_SIGN); - VST(fw + j*4 + 0, re); - VST(fw + j*4 + 4, im); + V4SF re, im, temp0; + temp0 = V4SF_LD(fw0 + j*2); + re = V4SF_DUPLICATE_RE(temp0); + im = V4SF_DUPLICATE_IM(temp0); + im = V4SF_XOR(im, MULI_SIGN); + V4SF_ST(fw + j*4 + 0, re); + V4SF_ST(fw + j*4 + 4, im); } w += n/4 * 2; @@ -371,7 +371,7 @@ static int ffts_generate_luts(ffts_plan_t *p, size_t N, size_t leaf_N, int sign) float *fw2 = (float*) w2; float *fw = (float *)w; - V temp0, temp1, temp2, re, im; + V4SF temp0, temp1, temp2, re, im; size_t j; for (j = 0; j < n/8; j++) { @@ -413,26 +413,26 @@ static int ffts_generate_luts(ffts_plan_t *p, size_t N, size_t leaf_N, int sign) #else //w = FFTS_MALLOC(n/8 * 3 * 2 * sizeof(ffts_cpx_32f), 32); for (j = 0; j < n/8; j += 2) { - temp0 = VLD(fw0 + j*2); - re = VDUPRE(temp0); - im = VDUPIM(temp0); - im = VXOR(im, MULI_SIGN); - VST(fw + j*2*6 , re); - VST(fw + j*2*6+4, im); - - temp1 = VLD(fw1 + j*2); - re = VDUPRE(temp1); - im = VDUPIM(temp1); - im = VXOR(im, MULI_SIGN); - VST(fw + j*2*6+8 , re); - VST(fw + j*2*6+12, im); - - temp2 = VLD(fw2 + j*2); - re = VDUPRE(temp2); - im = VDUPIM(temp2); - im = VXOR(im, MULI_SIGN); - VST(fw + j*2*6+16, re); - VST(fw + j*2*6+20, im); + temp0 = V4SF_LD(fw0 + j*2); + re = V4SF_DUPLICATE_RE(temp0); + im = V4SF_DUPLICATE_IM(temp0); + im = V4SF_XOR(im, MULI_SIGN); + V4SF_ST(fw + j*2*6 , re); + V4SF_ST(fw + j*2*6+4, im); + + temp1 = V4SF_LD(fw1 + j*2); + re = V4SF_DUPLICATE_RE(temp1); + im = V4SF_DUPLICATE_IM(temp1); + im = V4SF_XOR(im, MULI_SIGN); + V4SF_ST(fw + j*2*6+8 , re); + V4SF_ST(fw + j*2*6+12, im); + + temp2 = V4SF_LD(fw2 + j*2); + re = V4SF_DUPLICATE_RE(temp2); + im = V4SF_DUPLICATE_IM(temp2); + im = V4SF_XOR(im, MULI_SIGN); + V4SF_ST(fw + j*2*6+16, re); + V4SF_ST(fw + j*2*6+20, im); } w += n/8 * 3 * 2; @@ -514,9 +514,9 @@ ffts_plan_t *ffts_init_1d(size_t N, int sign) #ifdef DYNAMIC_DISABLED if (sign < 0) { - p->transform = ffts_static_transform_f; + p->transform = ffts_static_transform_f_32f; } else { - p->transform = ffts_static_transform_i; + p->transform = ffts_static_transform_i_32f; } #else /* determinate transform size */ diff --git a/src/ffts_small.c b/src/ffts_small.c index 34be7af..5bcbfc6 100644 --- a/src/ffts_small.c +++ b/src/ffts_small.c @@ -245,12 +245,12 @@ ffts_small_forward8_32f(ffts_plan_t *p, const void *in, void *out) { const float *din = (const float*) in; float *dout = (float*) out; - V r0_1, r2_3, r4_5, r6_7; + V4SF r0_1, r2_3, r4_5, r6_7; float *LUT8 = (float*) p->ws + p->ws_is[0]; - L_4_2(0, din, din+8, din+4, din+12, &r0_1, &r2_3, &r4_5, &r6_7); - K_N(0, VLD(LUT8), VLD(LUT8+4), &r0_1, &r2_3, &r4_5, &r6_7); - S_4(r0_1, r2_3, r4_5, r6_7, dout+0, dout+4, dout+8, dout+12); + V4SF_L_4_2(0, din, din+8, din+4, din+12, &r0_1, &r2_3, &r4_5, &r6_7); + V4SF_K_N(0, V4SF_LD(LUT8), V4SF_LD(LUT8+4), &r0_1, &r2_3, &r4_5, &r6_7); + V4SF_S_4(r0_1, r2_3, r4_5, r6_7, dout+0, dout+4, dout+8, dout+12); } void @@ -258,7 +258,7 @@ ffts_small_forward8_64f(ffts_plan_t *p, const void *in, void *out) { const double *din = (const double*) in; double *dout = (double*) out; - V r0_1, r2_3, r4_5, r6_7; + V4SF r0_1, r2_3, r4_5, r6_7; double *LUT8 = (double*) p->ws + p->ws_is[0]; #if MACROS_READY @@ -273,12 +273,12 @@ ffts_small_backward8_32f(ffts_plan_t *p, const void *in, void *out) { const float *din = (const float*) in; float *dout = (float*) out; - V r0_1, r2_3, r4_5, r6_7; + V4SF r0_1, r2_3, r4_5, r6_7; float *LUT8 = (float*) p->ws + p->ws_is[0]; - L_4_2(1, din, din+8, din+4, din+12, &r0_1, &r2_3, &r4_5, &r6_7); - K_N(1, VLD(LUT8), VLD(LUT8+4), &r0_1, &r2_3, &r4_5, &r6_7); - S_4(r0_1, r2_3, r4_5, r6_7, dout+0, dout+4, dout+8, dout+12); + V4SF_L_4_2(1, din, din+8, din+4, din+12, &r0_1, &r2_3, &r4_5, &r6_7); + V4SF_K_N(1, V4SF_LD(LUT8), V4SF_LD(LUT8+4), &r0_1, &r2_3, &r4_5, &r6_7); + V4SF_S_4(r0_1, r2_3, r4_5, r6_7, dout+0, dout+4, dout+8, dout+12); } void @@ -286,7 +286,7 @@ ffts_small_backward8_64f(ffts_plan_t *p, const void *in, void *out) { const double *din = (const double*) in; double *dout = (double*) out; - V r0_1, r2_3, r4_5, r6_7; + V4SF r0_1, r2_3, r4_5, r6_7; double *LUT8 = (double*) p->ws + p->ws_is[0]; #if MACROS_READY @@ -302,15 +302,15 @@ ffts_small_forward16_32f(ffts_plan_t *p, const void *in, void *out) const float *din = (const float*) in; float *dout = (float*) out; float *LUT8 = (float*) p->ws; - V r0_1, r2_3, r4_5, r6_7, r8_9, r10_11, r12_13, r14_15; - - L_4_4(0, din+0, din+16, din+8, din+24, &r0_1, &r2_3, &r8_9, &r10_11); - L_2_4(0, din+4, din+20, din+28, din+12, &r4_5, &r6_7, &r14_15, &r12_13); - K_N(0, VLD(LUT8), VLD(LUT8+4), &r0_1, &r2_3, &r4_5, &r6_7); - K_N(0, VLD(LUT8+8), VLD(LUT8+12), &r0_1, &r4_5, &r8_9, &r12_13); - S_4(r0_1, r4_5, r8_9, r12_13, dout+0, dout+8, dout+16, dout+24); - K_N(0, VLD(LUT8+16), VLD(LUT8+20), &r2_3, &r6_7, &r10_11, &r14_15); - S_4(r2_3, r6_7, r10_11, r14_15, dout+4, dout+12, dout+20, dout+28); + V4SF r0_1, r2_3, r4_5, r6_7, r8_9, r10_11, r12_13, r14_15; + + V4SF_L_4_4(0, din+0, din+16, din+8, din+24, &r0_1, &r2_3, &r8_9, &r10_11); + V4SF_L_2_4(0, din+4, din+20, din+28, din+12, &r4_5, &r6_7, &r14_15, &r12_13); + V4SF_K_N(0, V4SF_LD(LUT8), V4SF_LD(LUT8+4), &r0_1, &r2_3, &r4_5, &r6_7); + V4SF_K_N(0, V4SF_LD(LUT8+8), V4SF_LD(LUT8+12), &r0_1, &r4_5, &r8_9, &r12_13); + V4SF_S_4(r0_1, r4_5, r8_9, r12_13, dout+0, dout+8, dout+16, dout+24); + V4SF_K_N(0, V4SF_LD(LUT8+16), V4SF_LD(LUT8+20), &r2_3, &r6_7, &r10_11, &r14_15); + V4SF_S_4(r2_3, r6_7, r10_11, r14_15, dout+4, dout+12, dout+20, dout+28); } void @@ -319,7 +319,7 @@ ffts_small_forward16_64f(ffts_plan_t *p, const void *in, void *out) const double *din = (const double*) in; double *dout = (double*) out; double *LUT8 = (double*) p->ws; - V r0_1, r2_3, r4_5, r6_7, r8_9, r10_11, r12_13, r14_15; + V4SF r0_1, r2_3, r4_5, r6_7, r8_9, r10_11, r12_13, r14_15; #ifdef MACROS_READY L_4_4(0, din+0, din+16, din+8, din+24, &r0_1, &r2_3, &r8_9, &r10_11); @@ -338,15 +338,15 @@ ffts_small_backward16_32f(ffts_plan_t *p, const void *in, void *out) const float *din = (const float*) in; float *dout = (float*) out; float *LUT8 = (float*) p->ws; - V r0_1, r2_3, r4_5, r6_7, r8_9, r10_11, r12_13, r14_15; - - L_4_4(1, din+0, din+16, din+8, din+24, &r0_1, &r2_3, &r8_9, &r10_11); - L_2_4(1, din+4, din+20, din+28, din+12, &r4_5, &r6_7, &r14_15, &r12_13); - K_N(1, VLD(LUT8), VLD(LUT8+4), &r0_1, &r2_3, &r4_5, &r6_7); - K_N(1, VLD(LUT8+8), VLD(LUT8+12),&r0_1, &r4_5, &r8_9, &r12_13); - S_4(r0_1, r4_5, r8_9, r12_13, dout+0, dout+8, dout+16, dout+24); - K_N(1, VLD(LUT8+16), VLD(LUT8+20), &r2_3, &r6_7, &r10_11, &r14_15); - S_4(r2_3, r6_7, r10_11, r14_15, dout+4, dout+12, dout+20, dout+28); + V4SF r0_1, r2_3, r4_5, r6_7, r8_9, r10_11, r12_13, r14_15; + + V4SF_L_4_4(1, din+0, din+16, din+8, din+24, &r0_1, &r2_3, &r8_9, &r10_11); + V4SF_L_2_4(1, din+4, din+20, din+28, din+12, &r4_5, &r6_7, &r14_15, &r12_13); + V4SF_K_N(1, V4SF_LD(LUT8+ 0), V4SF_LD(LUT8+4), &r0_1, &r2_3, &r4_5, &r6_7); + V4SF_K_N(1, V4SF_LD(LUT8+ 8), V4SF_LD(LUT8+12), &r0_1, &r4_5, &r8_9, &r12_13); + V4SF_S_4(r0_1, r4_5, r8_9, r12_13, dout+0, dout+8, dout+16, dout+24); + V4SF_K_N(1, V4SF_LD(LUT8+16), V4SF_LD(LUT8+20), &r2_3, &r6_7, &r10_11, &r14_15); + V4SF_S_4(r2_3, r6_7, r10_11, r14_15, dout+4, dout+12, dout+20, dout+28); } void @@ -355,7 +355,7 @@ ffts_small_backward16_64f(ffts_plan_t *p, const void *in, void *out) const double *din = (const double*) in; double *dout = (double*) out; double *LUT8 = (double*) p->ws; - V r0_1, r2_3, r4_5, r6_7, r8_9, r10_11, r12_13, r14_15; + V4SF r0_1, r2_3, r4_5, r6_7, r8_9, r10_11, r12_13, r14_15; #ifdef MACROS_READY L_4_4(1, din+0, din+16, din+8, din+24, &r0_1, &r2_3, &r8_9, &r10_11); diff --git a/src/ffts_static.c b/src/ffts_static.c index cdecf1b..7a0bf4a 100644 --- a/src/ffts_static.c +++ b/src/ffts_static.c @@ -1,393 +1,465 @@ /* - This file is part of FFTS -- The Fastest Fourier Transform in the South - - Copyright (c) 2012, Anthony M. Blake - Copyright (c) 2012, The University of Waikato - - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - * Neither the name of the organization nor the - names of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY - DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +This file is part of FFTS -- The Fastest Fourier Transform in the South + +Copyright (c) 2012, Anthony M. Blake +Copyright (c) 2012, The University of Waikato + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: +* Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +* Neither the name of the organization nor the +names of its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "ffts_static.h" + #include "ffts_internal.h" #include "macros.h" #include -static const FFTS_ALIGN(16) data_t ffts_constants[16] = { - 0.70710678118654757273731092936941, 0.70710678118654757273731092936941, - 0.70710678118654757273731092936941, 0.70710678118654757273731092936941, - -0.70710678118654746171500846685376, 0.70710678118654746171500846685376, - -0.70710678118654746171500846685376, 0.70710678118654746171500846685376, - 1.0, 1.0, - 0.70710678118654757273731092936941, 0.70710678118654757273731092936941, - 0.0, 0.0, - -0.70710678118654746171500846685376, 0.70710678118654746171500846685376 +static const FFTS_ALIGN(16) float ffts_constants_32f[16] = { + 0.70710678118654757273731092936941f, + 0.70710678118654757273731092936941f, + 0.70710678118654757273731092936941f, + 0.70710678118654757273731092936941f, + -0.70710678118654746171500846685376f, + 0.70710678118654746171500846685376f, + -0.70710678118654746171500846685376f, + 0.70710678118654746171500846685376f, + 1.0f, + 1.0f, + 0.70710678118654757273731092936941f, + 0.70710678118654757273731092936941f, + 0.0f, + 0.0f, + -0.70710678118654746171500846685376f, + 0.70710678118654746171500846685376f +}; + +static const FFTS_ALIGN(16) float ffts_constants_inv_32f[16] = { + 0.70710678118654757273731092936941f, + 0.70710678118654757273731092936941f, + 0.70710678118654757273731092936941f, + 0.70710678118654757273731092936941f, + 0.70710678118654746171500846685376f, + -0.70710678118654746171500846685376f, + 0.70710678118654746171500846685376f, + -0.70710678118654746171500846685376f, + 1.0f, + 1.0f, + 0.70710678118654757273731092936941f, + 0.70710678118654757273731092936941f, + 0.0f, + 0.0f, + 0.70710678118654746171500846685376f, + -0.70710678118654746171500846685376f +}; + +static const FFTS_ALIGN(16) double ffts_constants_64f[16] = { + 0.70710678118654757273731092936941, + 0.70710678118654757273731092936941, + 0.70710678118654757273731092936941, + 0.70710678118654757273731092936941, + -0.70710678118654746171500846685376, + 0.70710678118654746171500846685376, + -0.70710678118654746171500846685376, + 0.70710678118654746171500846685376, + 1.0, + 1.0, + 0.70710678118654757273731092936941, + 0.70710678118654757273731092936941, + 0.0, + 0.0, + -0.70710678118654746171500846685376, + 0.70710678118654746171500846685376 }; -static const FFTS_ALIGN(16) data_t ffts_constants_inv[16] = { - 0.70710678118654757273731092936941, 0.70710678118654757273731092936941, - 0.70710678118654757273731092936941, 0.70710678118654757273731092936941, - 0.70710678118654746171500846685376, -0.70710678118654746171500846685376, - 0.70710678118654746171500846685376, -0.70710678118654746171500846685376, - 1.0, 1.0, - 0.70710678118654757273731092936941, 0.70710678118654757273731092936941, - 0.0, 0.0, - 0.70710678118654746171500846685376, -0.70710678118654746171500846685376 +static const FFTS_ALIGN(16) double ffts_constants_inv_64f[16] = { + 0.70710678118654757273731092936941, + 0.70710678118654757273731092936941, + 0.70710678118654757273731092936941, + 0.70710678118654757273731092936941, + 0.70710678118654746171500846685376, + -0.70710678118654746171500846685376, + 0.70710678118654746171500846685376, + -0.70710678118654746171500846685376, + 1.0, + 1.0, + 0.70710678118654757273731092936941, + 0.70710678118654757273731092936941, + 0.0, + 0.0, + 0.70710678118654746171500846685376, + -0.70710678118654746171500846685376 }; -static FFTS_INLINE void K_0(int inv, V *r0, V *r1, V *r2, V *r3) +static FFTS_INLINE void +V4SF_K_0(int inv, + V4SF *r0, + V4SF *r1, + V4SF *r2, + V4SF *r3) { - V t0, t1, t2, t3; + V4SF t0, t1, t2, t3; t0 = *r0; t1 = *r1; - t2 = VADD(*r2, *r3); - t3 = IMULI(inv, VSUB(*r2, *r3)); + t2 = V4SF_ADD(*r2, *r3); + t3 = V4SF_IMULI(inv, V4SF_SUB(*r2, *r3)); - *r0 = VADD(t0, t2); - *r2 = VSUB(t0, t2); - *r1 = VSUB(t1, t3); - *r3 = VADD(t1, t3); + *r0 = V4SF_ADD(t0, t2); + *r2 = V4SF_SUB(t0, t2); + *r1 = V4SF_SUB(t1, t3); + *r3 = V4SF_ADD(t1, t3); } -static FFTS_INLINE void L_2(const data_t *FFTS_RESTRICT i0, - const data_t *FFTS_RESTRICT i1, - const data_t *FFTS_RESTRICT i2, - const data_t *FFTS_RESTRICT i3, - V *r0, - V *r1, - V *r2, - V *r3) +static FFTS_INLINE void +V4SF_L_2(const float *FFTS_RESTRICT i0, + const float *FFTS_RESTRICT i1, + const float *FFTS_RESTRICT i2, + const float *FFTS_RESTRICT i3, + V4SF *r0, + V4SF *r1, + V4SF *r2, + V4SF *r3) { - V t0, t1, t2, t3; + V4SF t0, t1, t2, t3; - t0 = VLD(i0); - t1 = VLD(i1); - t2 = VLD(i2); - t3 = VLD(i3); + t0 = V4SF_LD(i0); + t1 = V4SF_LD(i1); + t2 = V4SF_LD(i2); + t3 = V4SF_LD(i3); - *r0 = VADD(t0, t1); - *r1 = VSUB(t0, t1); - *r2 = VADD(t2, t3); - *r3 = VSUB(t2, t3); + *r0 = V4SF_ADD(t0, t1); + *r1 = V4SF_SUB(t0, t1); + *r2 = V4SF_ADD(t2, t3); + *r3 = V4SF_SUB(t2, t3); } -static FFTS_INLINE void L_4(int inv, - const data_t *FFTS_RESTRICT i0, - const data_t *FFTS_RESTRICT i1, - const data_t *FFTS_RESTRICT i2, - const data_t *FFTS_RESTRICT i3, - V *r0, - V *r1, - V *r2, - V *r3) +static FFTS_INLINE void +V4SF_L_4(int inv, + const float *FFTS_RESTRICT i0, + const float *FFTS_RESTRICT i1, + const float *FFTS_RESTRICT i2, + const float *FFTS_RESTRICT i3, + V4SF *r0, + V4SF *r1, + V4SF *r2, + V4SF *r3) { - V t0, t1, t2, t3, t4, t5, t6, t7; - - t0 = VLD(i0); - t1 = VLD(i1); - t2 = VLD(i2); - t3 = VLD(i3); - - t4 = VADD(t0, t1); - t5 = VSUB(t0, t1); - t6 = VADD(t2, t3); - t7 = IMULI(inv, VSUB(t2, t3)); - - *r0 = VADD(t4, t6); - *r2 = VSUB(t4, t6); - *r1 = VSUB(t5, t7); - *r3 = VADD(t5, t7); + V4SF t0, t1, t2, t3, t4, t5, t6, t7; + + t0 = V4SF_LD(i0); + t1 = V4SF_LD(i1); + t2 = V4SF_LD(i2); + t3 = V4SF_LD(i3); + + t4 = V4SF_ADD(t0, t1); + t5 = V4SF_SUB(t0, t1); + t6 = V4SF_ADD(t2, t3); + t7 = V4SF_IMULI(inv, V4SF_SUB(t2, t3)); + + *r0 = V4SF_ADD(t4, t6); + *r2 = V4SF_SUB(t4, t6); + *r1 = V4SF_SUB(t5, t7); + *r3 = V4SF_ADD(t5, t7); } -static FFTS_INLINE void LEAF_EE(data_t *const FFTS_RESTRICT out, - const ptrdiff_t *FFTS_RESTRICT os, - const data_t *FFTS_RESTRICT in, - const ptrdiff_t *FFTS_RESTRICT is, - int inv) +static FFTS_INLINE void +V4SF_LEAF_EE(float *const FFTS_RESTRICT out, + const ptrdiff_t *FFTS_RESTRICT os, + const float *FFTS_RESTRICT in, + const ptrdiff_t *FFTS_RESTRICT is, + int inv) { - const data_t *FFTS_RESTRICT LUT = inv ? ffts_constants_inv : ffts_constants; + const float *FFTS_RESTRICT LUT = inv ? ffts_constants_inv_32f : ffts_constants_32f; - V r0, r1, r2, r3, r4, r5, r6, r7; + V4SF r0, r1, r2, r3, r4, r5, r6, r7; - data_t *out0 = out + os[0]; - data_t *out1 = out + os[1]; + float *out0 = out + os[0]; + float *out1 = out + os[1]; - L_4(inv, in + is[0], in + is[1], in + is[2], in + is[3], &r0, &r1, &r2, &r3); - L_2(in + is[4], in + is[5], in + is[6], in + is[7], &r4, &r5, &r6, &r7); + V4SF_L_4(inv, in + is[0], in + is[1], in + is[2], in + is[3], &r0, &r1, &r2, &r3); + V4SF_L_2(in + is[4], in + is[5], in + is[6], in + is[7], &r4, &r5, &r6, &r7); - K_0(inv, &r0, &r2, &r4, &r6); - K_N(inv, VLD(LUT + 0), VLD(LUT + 4), &r1, &r3, &r5, &r7); - TX2(&r0, &r1); - TX2(&r2, &r3); - TX2(&r4, &r5); - TX2(&r6, &r7); + V4SF_K_0(inv, &r0, &r2, &r4, &r6); + V4SF_K_N(inv, V4SF_LD(LUT + 0), V4SF_LD(LUT + 4), &r1, &r3, &r5, &r7); + V4SF_TX2(&r0, &r1); + V4SF_TX2(&r2, &r3); + V4SF_TX2(&r4, &r5); + V4SF_TX2(&r6, &r7); - S_4(r0, r2, r4, r6, out0 + 0, out0 + 4, out0 + 8, out0 + 12); - S_4(r1, r3, r5, r7, out1 + 0, out1 + 4, out1 + 8, out1 + 12); + V4SF_S_4(r0, r2, r4, r6, out0 + 0, out0 + 4, out0 + 8, out0 + 12); + V4SF_S_4(r1, r3, r5, r7, out1 + 0, out1 + 4, out1 + 8, out1 + 12); } -static FFTS_INLINE void LEAF_EE2(data_t *const FFTS_RESTRICT out, - const ptrdiff_t *FFTS_RESTRICT os, - const data_t *FFTS_RESTRICT in, - const ptrdiff_t *FFTS_RESTRICT is, - int inv) +static FFTS_INLINE void +V4SF_LEAF_EE2(float *const FFTS_RESTRICT out, + const ptrdiff_t *FFTS_RESTRICT os, + const float *FFTS_RESTRICT in, + const ptrdiff_t *FFTS_RESTRICT is, + int inv) { - const data_t *FFTS_RESTRICT LUT = inv ? ffts_constants_inv : ffts_constants; + const float *FFTS_RESTRICT LUT = inv ? ffts_constants_inv_32f : ffts_constants_32f; - V r0, r1, r2, r3, r4, r5, r6, r7; + V4SF r0, r1, r2, r3, r4, r5, r6, r7; - data_t *out0 = out + os[0]; - data_t *out1 = out + os[1]; + float *out0 = out + os[0]; + float *out1 = out + os[1]; - L_4(inv, in + is[6], in + is[7], in + is[4], in + is[5], &r0, &r1, &r2, &r3); - L_2(in + is[0], in + is[1], in + is[3], in + is[2], &r4, &r5, &r6, &r7); + V4SF_L_4(inv, in + is[6], in + is[7], in + is[4], in + is[5], &r0, &r1, &r2, &r3); + V4SF_L_2(in + is[0], in + is[1], in + is[3], in + is[2], &r4, &r5, &r6, &r7); - K_0(inv, &r0, &r2, &r4, &r6); - K_N(inv, VLD(LUT + 0), VLD(LUT + 4), &r1, &r3, &r5, &r7); - TX2(&r0, &r1); - TX2(&r2, &r3); - TX2(&r4, &r5); - TX2(&r6, &r7); + V4SF_K_0(inv, &r0, &r2, &r4, &r6); + V4SF_K_N(inv, V4SF_LD(LUT + 0), V4SF_LD(LUT + 4), &r1, &r3, &r5, &r7); + V4SF_TX2(&r0, &r1); + V4SF_TX2(&r2, &r3); + V4SF_TX2(&r4, &r5); + V4SF_TX2(&r6, &r7); - S_4(r0, r2, r4, r6, out0 + 0, out0 + 4, out0 + 8, out0 + 12); - S_4(r1, r3, r5, r7, out1 + 0, out1 + 4, out1 + 8, out1 + 12); + V4SF_S_4(r0, r2, r4, r6, out0 + 0, out0 + 4, out0 + 8, out0 + 12); + V4SF_S_4(r1, r3, r5, r7, out1 + 0, out1 + 4, out1 + 8, out1 + 12); } -static FFTS_INLINE void LEAF_EO(data_t *const FFTS_RESTRICT out, - const ptrdiff_t *FFTS_RESTRICT os, - const data_t *FFTS_RESTRICT in, - const ptrdiff_t *FFTS_RESTRICT is, - int inv) +static FFTS_INLINE void +V4SF_LEAF_EO(float *const FFTS_RESTRICT out, + const ptrdiff_t *FFTS_RESTRICT os, + const float *FFTS_RESTRICT in, + const ptrdiff_t *FFTS_RESTRICT is, + int inv) { - const data_t *FFTS_RESTRICT LUT = inv ? ffts_constants_inv : ffts_constants; + const float *FFTS_RESTRICT LUT = inv ? ffts_constants_inv_32f : ffts_constants_32f; - V r0, r1, r2, r3, r4, r5, r6, r7; + V4SF r0, r1, r2, r3, r4, r5, r6, r7; - data_t *out0 = out + os[0]; - data_t *out1 = out + os[1]; + float *out0 = out + os[0]; + float *out1 = out + os[1]; - L_4_4(inv, in + is[0], in + is[1], in + is[2], in + is[3], &r0, &r1, &r2, &r3); - L_2_4(inv, in + is[4], in + is[5], in + is[6], in + is[7], &r4, &r5, &r6, &r7); + V4SF_L_4_4(inv, in + is[0], in + is[1], in + is[2], in + is[3], &r0, &r1, &r2, &r3); + V4SF_L_2_4(inv, in + is[4], in + is[5], in + is[6], in + is[7], &r4, &r5, &r6, &r7); - S_4(r2, r3, r7, r6, out1 + 0, out1 + 4, out1 + 8, out1 + 12); - K_N(inv, VLD(LUT + 8), VLD(LUT + 12), &r0, &r1, &r4, &r5); - S_4(r0, r1, r4, r5, out0 + 0, out0 + 4, out0 + 8, out0 + 12); + V4SF_S_4(r2, r3, r7, r6, out1 + 0, out1 + 4, out1 + 8, out1 + 12); + V4SF_K_N(inv, V4SF_LD(LUT + 8), V4SF_LD(LUT + 12), &r0, &r1, &r4, &r5); + V4SF_S_4(r0, r1, r4, r5, out0 + 0, out0 + 4, out0 + 8, out0 + 12); } -static FFTS_INLINE void LEAF_OE(data_t *const FFTS_RESTRICT out, - const ptrdiff_t *FFTS_RESTRICT os, - const data_t *FFTS_RESTRICT in, - const ptrdiff_t *FFTS_RESTRICT is, - int inv) +static FFTS_INLINE void +V4SF_LEAF_OE(float *const FFTS_RESTRICT out, + const ptrdiff_t *FFTS_RESTRICT os, + const float *FFTS_RESTRICT in, + const ptrdiff_t *FFTS_RESTRICT is, + int inv) { - const data_t *FFTS_RESTRICT LUT = inv ? ffts_constants_inv : ffts_constants; + const float *FFTS_RESTRICT LUT = inv ? ffts_constants_inv_32f : ffts_constants_32f; - V r0, r1, r2, r3, r4, r5, r6, r7; + V4SF r0, r1, r2, r3, r4, r5, r6, r7; - data_t *out0 = out + os[0]; - data_t *out1 = out + os[1]; + float *out0 = out + os[0]; + float *out1 = out + os[1]; - L_4_2(inv, in + is[0], in + is[1], in + is[2], in + is[3], &r0, &r1, &r2, &r3); - L_4_4(inv, in + is[6], in + is[7], in + is[4], in + is[5], &r4, &r5, &r6, &r7); + V4SF_L_4_2(inv, in + is[0], in + is[1], in + is[2], in + is[3], &r0, &r1, &r2, &r3); + V4SF_L_4_4(inv, in + is[6], in + is[7], in + is[4], in + is[5], &r4, &r5, &r6, &r7); - S_4(r0, r1, r4, r5, out0 + 0, out0 + 4, out0 + 8, out0 + 12); - K_N(inv, VLD(LUT + 8), VLD(LUT + 12), &r6, &r7, &r2, &r3); - S_4(r6, r7, r2, r3, out1 + 0, out1 + 4, out1 + 8, out1 + 12); + V4SF_S_4(r0, r1, r4, r5, out0 + 0, out0 + 4, out0 + 8, out0 + 12); + V4SF_K_N(inv, V4SF_LD(LUT + 8), V4SF_LD(LUT + 12), &r6, &r7, &r2, &r3); + V4SF_S_4(r6, r7, r2, r3, out1 + 0, out1 + 4, out1 + 8, out1 + 12); } -static FFTS_INLINE void LEAF_OO(data_t *const FFTS_RESTRICT out, - const ptrdiff_t *FFTS_RESTRICT os, - const data_t *FFTS_RESTRICT in, - const ptrdiff_t *FFTS_RESTRICT is, - int inv) +static FFTS_INLINE void +V4SF_LEAF_OO(float *const FFTS_RESTRICT out, + const ptrdiff_t *FFTS_RESTRICT os, + const float *FFTS_RESTRICT in, + const ptrdiff_t *FFTS_RESTRICT is, + int inv) { - V r0, r1, r2, r3, r4, r5, r6, r7; + V4SF r0, r1, r2, r3, r4, r5, r6, r7; - data_t *out0 = out + os[0]; - data_t *out1 = out + os[1]; + float *out0 = out + os[0]; + float *out1 = out + os[1]; - L_4_4(inv, in + is[0], in + is[1], in + is[2], in + is[3], &r0, &r1, &r2, &r3); - L_4_4(inv, in + is[6], in + is[7], in + is[4], in + is[5], &r4, &r5, &r6, &r7); + V4SF_L_4_4(inv, in + is[0], in + is[1], in + is[2], in + is[3], &r0, &r1, &r2, &r3); + V4SF_L_4_4(inv, in + is[6], in + is[7], in + is[4], in + is[5], &r4, &r5, &r6, &r7); - S_4(r0, r1, r4, r5, out0 + 0, out0 + 4, out0 + 8, out0 + 12); - S_4(r2, r3, r6, r7, out1 + 0, out1 + 4, out1 + 8, out1 + 12); + V4SF_S_4(r0, r1, r4, r5, out0 + 0, out0 + 4, out0 + 8, out0 + 12); + V4SF_S_4(r2, r3, r6, r7, out1 + 0, out1 + 4, out1 + 8, out1 + 12); } -static FFTS_INLINE void X_4(int inv, - data_t *FFTS_RESTRICT data, - size_t N, - const data_t *FFTS_RESTRICT LUT) +static FFTS_INLINE void +V4SF_X_4(int inv, + float *FFTS_RESTRICT data, + size_t N, + const float *FFTS_RESTRICT LUT) { size_t i; for (i = 0; i < N/8; i++) { - V r0 = VLD(data); - V r1 = VLD(data + 2*N/4); - V r2 = VLD(data + 4*N/4); - V r3 = VLD(data + 6*N/4); + V4SF r0 = V4SF_LD(data); + V4SF r1 = V4SF_LD(data + 2*N/4); + V4SF r2 = V4SF_LD(data + 4*N/4); + V4SF r3 = V4SF_LD(data + 6*N/4); - K_N(inv, VLD(LUT), VLD(LUT + 4), &r0, &r1, &r2, &r3); + V4SF_K_N(inv, V4SF_LD(LUT), V4SF_LD(LUT + 4), &r0, &r1, &r2, &r3); - VST(data , r0); - VST(data + 2*N/4, r1); - VST(data + 4*N/4, r2); - VST(data + 6*N/4, r3); + V4SF_ST(data , r0); + V4SF_ST(data + 2*N/4, r1); + V4SF_ST(data + 4*N/4, r2); + V4SF_ST(data + 6*N/4, r3); LUT += 8; data += 4; } } -static FFTS_INLINE void X_8(int inv, - data_t *FFTS_RESTRICT data0, - size_t N, - const data_t *FFTS_RESTRICT LUT) +static FFTS_INLINE void +V4SF_X_8(int inv, + float *FFTS_RESTRICT data0, + size_t N, + const float *FFTS_RESTRICT LUT) { - data_t *data1 = data0 + 1*N/4; - data_t *data2 = data0 + 2*N/4; - data_t *data3 = data0 + 3*N/4; - data_t *data4 = data0 + 4*N/4; - data_t *data5 = data0 + 5*N/4; - data_t *data6 = data0 + 6*N/4; - data_t *data7 = data0 + 7*N/4; + float *data1 = data0 + 1*N/4; + float *data2 = data0 + 2*N/4; + float *data3 = data0 + 3*N/4; + float *data4 = data0 + 4*N/4; + float *data5 = data0 + 5*N/4; + float *data6 = data0 + 6*N/4; + float *data7 = data0 + 7*N/4; size_t i; for (i = 0; i < N/16; i++) { - V r0, r1, r2, r3, r4, r5, r6, r7; + V4SF r0, r1, r2, r3, r4, r5, r6, r7; - r0 = VLD(data0); - r1 = VLD(data1); - r2 = VLD(data2); - r3 = VLD(data3); + r0 = V4SF_LD(data0); + r1 = V4SF_LD(data1); + r2 = V4SF_LD(data2); + r3 = V4SF_LD(data3); - K_N(inv, VLD(LUT), VLD(LUT + 4), &r0, &r1, &r2, &r3); - r4 = VLD(data4); - r6 = VLD(data6); + V4SF_K_N(inv, V4SF_LD(LUT), V4SF_LD(LUT + 4), &r0, &r1, &r2, &r3); + r4 = V4SF_LD(data4); + r6 = V4SF_LD(data6); - K_N(inv, VLD(LUT + 8), VLD(LUT + 12), &r0, &r2, &r4, &r6); - r5 = VLD(data5); - r7 = VLD(data7); + V4SF_K_N(inv, V4SF_LD(LUT + 8), V4SF_LD(LUT + 12), &r0, &r2, &r4, &r6); + r5 = V4SF_LD(data5); + r7 = V4SF_LD(data7); - K_N(inv, VLD(LUT + 16), VLD(LUT + 20), &r1, &r3, &r5, &r7); + V4SF_K_N(inv, V4SF_LD(LUT + 16), V4SF_LD(LUT + 20), &r1, &r3, &r5, &r7); LUT += 24; - VST(data0, r0); + V4SF_ST(data0, r0); data0 += 4; - VST(data1, r1); + V4SF_ST(data1, r1); data1 += 4; - VST(data2, r2); + V4SF_ST(data2, r2); data2 += 4; - VST(data3, r3); + V4SF_ST(data3, r3); data3 += 4; - VST(data4, r4); + V4SF_ST(data4, r4); data4 += 4; - VST(data5, r5); + V4SF_ST(data5, r5); data5 += 4; - VST(data6, r6); + V4SF_ST(data6, r6); data6 += 4; - VST(data7, r7); + V4SF_ST(data7, r7); data7 += 4; } } -static FFTS_INLINE void ffts_static_firstpass_odd(float *const FFTS_RESTRICT out, - const float *FFTS_RESTRICT in, - const ffts_plan_t *FFTS_RESTRICT p, - int inv) +static FFTS_INLINE void +ffts_static_firstpass_odd_32f(float *const FFTS_RESTRICT out, + const float *FFTS_RESTRICT in, + const ffts_plan_t *FFTS_RESTRICT p, + int inv) { size_t i, i0 = p->i0, i1 = p->i1; const ptrdiff_t *is = (const ptrdiff_t*) p->is; const ptrdiff_t *os = (const ptrdiff_t*) p->offsets; for (i = i0; i > 0; --i) { - LEAF_EE(out, os, in, is, inv); + V4SF_LEAF_EE(out, os, in, is, inv); in += 4; os += 2; } for (i = i1; i > 0; --i) { - LEAF_OO(out, os, in, is, inv); + V4SF_LEAF_OO(out, os, in, is, inv); in += 4; os += 2; } - LEAF_OE(out, os, in, is, inv); + V4SF_LEAF_OE(out, os, in, is, inv); in += 4; os += 2; for (i = i1; i > 0; --i) { - LEAF_EE2(out, os, in, is, inv); + V4SF_LEAF_EE2(out, os, in, is, inv); in += 4; os += 2; } } -static FFTS_INLINE void ffts_static_firstpass_even(float *FFTS_RESTRICT out, - const float *FFTS_RESTRICT in, - const ffts_plan_t *FFTS_RESTRICT p, - int inv) +static FFTS_INLINE void +ffts_static_firstpass_even_32f(float *FFTS_RESTRICT out, + const float *FFTS_RESTRICT in, + const ffts_plan_t *FFTS_RESTRICT p, + int inv) { size_t i, i0 = p->i0, i1 = p->i1; const ptrdiff_t *is = (const ptrdiff_t*) p->is; const ptrdiff_t *os = (const ptrdiff_t*) p->offsets; for(i = i0; i > 0; --i) { - LEAF_EE(out, os, in, is, inv); + V4SF_LEAF_EE(out, os, in, is, inv); in += 4; os += 2; } - LEAF_EO(out, os, in, is, inv); + V4SF_LEAF_EO(out, os, in, is, inv); in += 4; os += 2; for (i = i1; i > 0; --i) { - LEAF_OO(out, os, in, is, inv); + V4SF_LEAF_OO(out, os, in, is, inv); in += 4; os += 2; } for (i = i1; i > 0; --i) { - LEAF_EE2(out, os, in, is, inv); + V4SF_LEAF_EE2(out, os, in, is, inv); in += 4; os += 2; } } -void ffts_static_rec_f(ffts_plan_t *p, float *data, size_t N) +static void +ffts_static_rec_f_32f(ffts_plan_t *p, float *data, size_t N) { const float *ws = (float*) p->ws; @@ -396,40 +468,41 @@ void ffts_static_rec_f(ffts_plan_t *p, float *data, size_t N) size_t N2 = N >> 2; size_t N3 = N >> 3; - ffts_static_rec_f(p, data , N2); - ffts_static_rec_f(p, data + N1 , N3); - ffts_static_rec_f(p, data + N1 + N2, N3); - ffts_static_rec_f(p, data + N , N2); - ffts_static_rec_f(p, data + N + N1 , N2); + ffts_static_rec_f_32f(p, data , N2); + ffts_static_rec_f_32f(p, data + N1 , N3); + ffts_static_rec_f_32f(p, data + N1 + N2, N3); + ffts_static_rec_f_32f(p, data + N , N2); + ffts_static_rec_f_32f(p, data + N + N1 , N2); - X_8(0, data, N, ws + (p->ws_is[ffts_ctzl(N) - 4] << 1)); + V4SF_X_8(0, data, N, ws + (p->ws_is[ffts_ctzl(N) - 4] << 1)); } else if (N == 128) { const float *ws1 = ws + (p->ws_is[1] << 1); - X_8(0, data + 0, 32, ws1); + V4SF_X_8(0, data + 0, 32, ws1); - X_4(0, data + 64, 16, ws); - X_4(0, data + 96, 16, ws); + V4SF_X_4(0, data + 64, 16, ws); + V4SF_X_4(0, data + 96, 16, ws); - X_8(0, data + 128, 32, ws1); - X_8(0, data + 192, 32, ws1); + V4SF_X_8(0, data + 128, 32, ws1); + V4SF_X_8(0, data + 192, 32, ws1); - X_8(0, data, N, ws + (p->ws_is[3] << 1)); + V4SF_X_8(0, data, N, ws + (p->ws_is[3] << 1)); } else if (N == 64) { - X_4(0, data + 0, 16, ws); - X_4(0, data + 64, 16, ws); - X_4(0, data + 96, 16, ws); + V4SF_X_4(0, data + 0, 16, ws); + V4SF_X_4(0, data + 64, 16, ws); + V4SF_X_4(0, data + 96, 16, ws); - X_8(0, data + 0, N, ws + (p->ws_is[2] << 1)); + V4SF_X_8(0, data + 0, N, ws + (p->ws_is[2] << 1)); } else if (N == 32) { - X_8(0, data, N, ws + (p->ws_is[1] << 1)); + V4SF_X_8(0, data, N, ws + (p->ws_is[1] << 1)); } else { assert(N == 16); - X_4(0, data, N, ws); + V4SF_X_4(0, data, N, ws); } } -void ffts_static_rec_i(ffts_plan_t *p, float *data, size_t N) +static void +ffts_static_rec_i_32f(ffts_plan_t *p, float *data, size_t N) { float *ws = (float*) p->ws; @@ -438,57 +511,59 @@ void ffts_static_rec_i(ffts_plan_t *p, float *data, size_t N) size_t N2 = N >> 2; size_t N3 = N >> 3; - ffts_static_rec_i(p, data , N2); - ffts_static_rec_i(p, data + N1 , N3); - ffts_static_rec_i(p, data + N1 + N2, N3); - ffts_static_rec_i(p, data + N , N2); - ffts_static_rec_i(p, data + N + N1 , N2); + ffts_static_rec_i_32f(p, data , N2); + ffts_static_rec_i_32f(p, data + N1 , N3); + ffts_static_rec_i_32f(p, data + N1 + N2, N3); + ffts_static_rec_i_32f(p, data + N , N2); + ffts_static_rec_i_32f(p, data + N + N1 , N2); - X_8(1, data, N, ws + (p->ws_is[ffts_ctzl(N) - 4] << 1)); + V4SF_X_8(1, data, N, ws + (p->ws_is[ffts_ctzl(N) - 4] << 1)); } else if (N == 128) { const float *ws1 = ws + (p->ws_is[1] << 1); - X_8(1, data + 0, 32, ws1); + V4SF_X_8(1, data + 0, 32, ws1); - X_4(1, data + 64, 16, ws); - X_4(1, data + 96, 16, ws); + V4SF_X_4(1, data + 64, 16, ws); + V4SF_X_4(1, data + 96, 16, ws); - X_8(1, data + 128, 32, ws1); - X_8(1, data + 192, 32, ws1); + V4SF_X_8(1, data + 128, 32, ws1); + V4SF_X_8(1, data + 192, 32, ws1); - X_8(1, data, N, ws + (p->ws_is[3] << 1)); + V4SF_X_8(1, data, N, ws + (p->ws_is[3] << 1)); } else if (N == 64) { - X_4(1, data + 0, 16, ws); - X_4(1, data + 64, 16, ws); - X_4(1, data + 96, 16, ws); + V4SF_X_4(1, data + 0, 16, ws); + V4SF_X_4(1, data + 64, 16, ws); + V4SF_X_4(1, data + 96, 16, ws); - X_8(1, data + 0, N, ws + (p->ws_is[2] << 1)); + V4SF_X_8(1, data + 0, N, ws + (p->ws_is[2] << 1)); } else if (N == 32) { - X_8(1, data, N, ws + (p->ws_is[1] << 1)); + V4SF_X_8(1, data, N, ws + (p->ws_is[1] << 1)); } else { assert(N == 16); - X_4(1, data, N, ws); + V4SF_X_4(1, data, N, ws); } } -void ffts_static_transform_f(ffts_plan_t *p, const void *in, void *out) +void +ffts_static_transform_f_32f(ffts_plan_t *p, const void *in, void *out) { if (ffts_ctzl(p->N) & 1) { - ffts_static_firstpass_odd(out, in, p, 0); + ffts_static_firstpass_odd_32f((float*) out, (const float*) in, p, 0); } else { - ffts_static_firstpass_even(out, in, p, 0); + ffts_static_firstpass_even_32f((float*) out, (const float*) in, p, 0); } - ffts_static_rec_f(p, out, p->N); + ffts_static_rec_f_32f(p, (float*) out, p->N); } -void ffts_static_transform_i(ffts_plan_t *p, const void *in, void *out) +void +ffts_static_transform_i_32f(ffts_plan_t *p, const void *in, void *out) { if (ffts_ctzl(p->N) & 1) { - ffts_static_firstpass_odd(out, in, p, 1); + ffts_static_firstpass_odd_32f((float*) out, (const float*) in, p, 1); } else { - ffts_static_firstpass_even(out, in, p, 1); + ffts_static_firstpass_even_32f((float*) out, (const float*) in, p, 1); } - ffts_static_rec_i(p, out, p->N); + ffts_static_rec_i_32f(p, (float*) out, p->N); } \ No newline at end of file diff --git a/src/ffts_static.h b/src/ffts_static.h index e599d80..924c3e1 100644 --- a/src/ffts_static.h +++ b/src/ffts_static.h @@ -1,33 +1,33 @@ /* - This file is part of FFTS -- The Fastest Fourier Transform in the South - - Copyright (c) 2012, Anthony M. Blake - Copyright (c) 2012, The University of Waikato - - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - * Neither the name of the organization nor the - names of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY - DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +This file is part of FFTS -- The Fastest Fourier Transform in the South + +Copyright (c) 2012, Anthony M. Blake +Copyright (c) 2012, The University of Waikato + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: +* Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +* Neither the name of the organization nor the +names of its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ @@ -40,10 +40,7 @@ #include "ffts.h" -void ffts_static_rec_f(ffts_plan_t *p, float *data, size_t N) ; -void ffts_static_transform_f(ffts_plan_t *p, const void *in, void *out); - -void ffts_static_rec_i(ffts_plan_t *p, float *data, size_t N) ; -void ffts_static_transform_i(ffts_plan_t *p, const void *in, void *out); +void ffts_static_transform_f_32f(ffts_plan_t *p, const void *in, void *out); +void ffts_static_transform_i_32f(ffts_plan_t *p, const void *in, void *out); #endif /* FFTS_STATIC_H */ diff --git a/src/macros-alpha.h b/src/macros-alpha.h index f4efaf8..f7795d4 100644 --- a/src/macros-alpha.h +++ b/src/macros-alpha.h @@ -1,40 +1,52 @@ /* - This file is part of FFTS -- The Fastest Fourier Transform in the South - - Copyright (c) 2013, Michael J. Cree - Copyright (c) 2012, 2013, Anthony M. Blake - - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - * Neither the name of the organization nor the - names of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY - DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +This file is part of FFTS -- The Fastest Fourier Transform in the South + +Copyright (c) 2013, Michael J. Cree +Copyright (c) 2012, 2013, Anthony M. Blake + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: +* Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +* Neither the name of the organization nor the +names of its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef FFTS_MACROS_ALPHA_H #define FFTS_MACROS_ALPHA_H +#if defined (_MSC_VER) && (_MSC_VER >= 1020) +#pragma once +#endif + +#include "ffts_attributes.h" + +#ifdef HAVE_STRING_H #include +#endif + +#ifdef HAVE_STDLIB_H +#include +#endif typedef union { struct { @@ -44,14 +56,15 @@ typedef union { float i2; } r; uint32_t u[4]; -} V; +} V4SF; -#define FFTS_MALLOC(d,a) malloc(d) -#define FFTS_FREE(d) free(d) +#define FFTS_MALLOC(d,a) (malloc(d)) +#define FFTS_FREE(d) (free(d)) -static FFTS_ALWAYS_INLINE V VLIT4(float f3, float f2, float f1, float f0) +static FFTS_ALWAYS_INLINE V4SF +V4SF_LIT4(float f3, float f2, float f1, float f0) { - V z; + V4SF z; z.r.r1 = f0; z.r.i1 = f1; @@ -61,9 +74,10 @@ static FFTS_ALWAYS_INLINE V VLIT4(float f3, float f2, float f1, float f0) return z; } -static FFTS_ALWAYS_INLINE V VADD(V x, V y) +static FFTS_ALWAYS_INLINE V4SF +V4SF_ADD(V4SF x, V4SF y) { - V z; + V4SF z; z.r.r1 = x.r.r1 + y.r.r1; z.r.i1 = x.r.i1 + y.r.i1; @@ -73,9 +87,10 @@ static FFTS_ALWAYS_INLINE V VADD(V x, V y) return z; } -static FFTS_ALWAYS_INLINE V VSUB(V x, V y) +static FFTS_ALWAYS_INLINE V4SF +V4SF_SUB(V4SF x, V4SF y) { - V z; + V4SF z; z.r.r1 = x.r.r1 - y.r.r1; z.r.i1 = x.r.i1 - y.r.i1; @@ -85,9 +100,10 @@ static FFTS_ALWAYS_INLINE V VSUB(V x, V y) return z; } -static FFTS_ALWAYS_INLINE V VMUL(V x, V y) +static FFTS_ALWAYS_INLINE V4SF +V4SF_MUL(V4SF x, V4SF y) { - V z; + V4SF z; z.r.r1 = x.r.r1 * y.r.r1; z.r.i1 = x.r.i1 * y.r.i1; @@ -97,9 +113,10 @@ static FFTS_ALWAYS_INLINE V VMUL(V x, V y) return z; } -static FFTS_ALWAYS_INLINE V VXOR(V x, V y) +static FFTS_ALWAYS_INLINE V4SF +V4SF_XOR(V4SF x, V4SF y) { - V z; + V4SF z; z.u[0] = x.u[0] ^ y.u[0]; z.u[1] = x.u[1] ^ y.u[1]; @@ -109,9 +126,10 @@ static FFTS_ALWAYS_INLINE V VXOR(V x, V y) return z; } -static FFTS_ALWAYS_INLINE V VSWAPPAIRS(V x) +static FFTS_ALWAYS_INLINE V4SF +V4SF_SWAP_PAIRS(V4SF x) { - V z; + V4SF z; z.r.r1 = x.r.i1; z.r.i1 = x.r.r1; @@ -121,9 +139,10 @@ static FFTS_ALWAYS_INLINE V VSWAPPAIRS(V x) return z; } -static FFTS_ALWAYS_INLINE V VBLEND(V x, V y) +static FFTS_ALWAYS_INLINE V4SF +V4SF_BLEND(V4SF x, V4SF y) { - V z; + V4SF z; z.r.r1 = x.r.r1; z.r.i1 = x.r.i1; @@ -133,9 +152,10 @@ static FFTS_ALWAYS_INLINE V VBLEND(V x, V y) return z; } -static FFTS_ALWAYS_INLINE V VUNPACKHI(V x, V y) +static FFTS_ALWAYS_INLINE V4SF +V4SF_UNPACK_HI(V4SF x, V4SF y) { - V z; + V4SF z; z.r.r1 = x.r.r2; z.r.i1 = x.r.i2; @@ -145,9 +165,10 @@ static FFTS_ALWAYS_INLINE V VUNPACKHI(V x, V y) return z; } -static FFTS_ALWAYS_INLINE V VUNPACKLO(V x, V y) +static FFTS_ALWAYS_INLINE V4SF +V4SF_UNPACK_LO(V4SF x, V4SF y) { - V z; + V4SF z; z.r.r1 = x.r.r1; z.r.i1 = x.r.i1; @@ -157,9 +178,10 @@ static FFTS_ALWAYS_INLINE V VUNPACKLO(V x, V y) return z; } -static FFTS_ALWAYS_INLINE V VDUPRE(V x) +static FFTS_ALWAYS_INLINE V4SF +V4SF_DUPLICATE_RE(V4SF x) { - V z; + V4SF z; z.r.r1 = x.r.r1; z.r.i1 = x.r.r1; @@ -169,9 +191,10 @@ static FFTS_ALWAYS_INLINE V VDUPRE(V x) return z; } -static FFTS_ALWAYS_INLINE V VDUPIM(V x) +static FFTS_ALWAYS_INLINE V4SF +V4SF_DUPLICATE_IM(V4SF x) { - V z; + V4SF z; z.r.r1 = x.r.i1; z.r.i1 = x.r.i1; @@ -181,23 +204,26 @@ static FFTS_ALWAYS_INLINE V VDUPIM(V x) return z; } -static FFTS_ALWAYS_INLINE V IMUL(V d, V re, V im) +static FFTS_ALWAYS_INLINE V4SF +V4SF_IMUL(V4SF d, V4SF re, V4SF im) { - re = VMUL(re, d); - im = VMUL(im, VSWAPPAIRS(d)); - return VSUB(re, im); + re = V4SF_MUL(re, d); + im = V4SF_MUL(im, V4SF_SWAP_PAIRS(d)); + return V4SF_SUB(re, im); } -static FFTS_ALWAYS_INLINE V IMULJ(V d, V re, V im) +static FFTS_ALWAYS_INLINE V4SF +V4SF_IMULJ(V4SF d, V4SF re, V4SF im) { - re = VMUL(re, d); - im = VMUL(im, VSWAPPAIRS(d)); - return VADD(re, im); + re = V4SF_MUL(re, d); + im = V4SF_MUL(im, V4SF_SWAP_PAIRS(d)); + return V4SF_ADD(re, im); } -static FFTS_ALWAYS_INLINE V MULI(int inv, V x) +static FFTS_ALWAYS_INLINE V4SF +V4SF_MULI(int inv, V4SF x) { - V z; + V4SF z; if (inv) { z.r.r1 = -x.r.r1; @@ -214,21 +240,24 @@ static FFTS_ALWAYS_INLINE V MULI(int inv, V x) return z; } -static FFTS_ALWAYS_INLINE V IMULI(int inv, V x) +static FFTS_ALWAYS_INLINE V4SF +V4SF_IMULI(int inv, V4SF x) { - return VSWAPPAIRS(MULI(inv, x)); + return V4SF_SWAP_PAIRS(V4SF_MULI(inv, x)); } -static FFTS_ALWAYS_INLINE V VLD(const void *s) +static FFTS_ALWAYS_INLINE V4SF +V4SF_LD(const void *s) { - V z; + V4SF z; memcpy(&z, s, sizeof(z)); return z; } -static FFTS_ALWAYS_INLINE void VST(void *d, V s) +static FFTS_ALWAYS_INLINE void +V4SF_ST(void *d, V4SF s) { - V *r = (V*) d; + V4SF *r = (V4SF*) d; *r = s; } diff --git a/src/macros-neon.h b/src/macros-neon.h index 5663252..4ec92b3 100644 --- a/src/macros-neon.h +++ b/src/macros-neon.h @@ -1,116 +1,119 @@ /* - - This file is part of FFTS -- The Fastest Fourier Transform in the South - - Copyright (c) 2012, 2013, Anthony M. Blake - - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - * Neither the name of the organization nor the - names of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY - DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +This file is part of FFTS -- The Fastest Fourier Transform in the South + +Copyright (c) 2012, 2013, Anthony M. Blake + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: +* Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +* Neither the name of the organization nor the +names of its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ + #ifndef FFTS_MACROS_NEON_H #define FFTS_MACROS_NEON_H -#include "neon.h" #include -typedef float32x4_t V; -typedef float32x4x2_t VS; +#ifdef HAVE_STDLIB_H +#include +#endif + +#define FFTS_MALLOC(d,a) (valloc(d)) +#define FFTS_FREE(d) (free(d)) + +typedef float32x4_t V4SF; +typedef float32x4x2_t V4SF2; -#define ADD vaddq_f32 -#define SUB vsubq_f32 -#define MUL vmulq_f32 -#define VADD vaddq_f32 -#define VSUB vsubq_f32 -#define VMUL vmulq_f32 +#define V4SF_ADD vaddq_f32 +#define V4SF_SUB vsubq_f32 +#define V4SF_MUL vmulq_f32 -#define VXOR(x,y) \ +#define V4SF_XOR(x,y) \ (vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(x), vreinterpretq_u32_f32(y)))) -#define VST vst1q_f32 -#define VLD vld1q_f32 -#define VST2 vst2q_f32 -#define VLD2 vld2q_f32 +#define V4SF_ST vst1q_f32 +#define V4SF_LD vld1q_f32 -#define VSWAPPAIRS(x) (vrev64q_f32(x)) +#define V4SF_SWAP_PAIRS(x) \ + (vrev64q_f32(x)) -#define VUNPACKHI(a,b) \ +#define V4SF_UNPACK_HI(a,b) \ (vcombine_f32(vget_high_f32(a), vget_high_f32(b))) -#define VUNPACKLO(a,b) \ +#define V4SF_UNPACK_LO(a,b) \ (vcombine_f32(vget_low_f32(a), vget_low_f32(b))) -#define VBLEND(x,y) \ +#define V4SF_BLEND(x,y) \ (vcombine_f32(vget_low_f32(x), vget_high_f32(y))) -static FFTS_INLINE V -VLIT4(float f3, float f2, float f1, float f0) +static FFTS_ALWAYS_INLINE V4SF +V4SF_LIT4(float f3, float f2, float f1, float f0) { float FFTS_ALIGN(16) d[4] = {f0, f1, f2, f3}; - return VLD(d); + return V4SF_LD(d); } -#define VDUPRE(r) \ +#define V4SF_DUPLICATE_RE(r) \ vcombine_f32(vdup_lane_f32(vget_low_f32(r),0), vdup_lane_f32(vget_high_f32(r),0)) -#define VDUPIM(r) \ +#define V4SF_DUPLICATE_IM(r) \ vcombine_f32(vdup_lane_f32(vget_low_f32(r),1), vdup_lane_f32(vget_high_f32(r),1)) -#define FFTS_MALLOC(d,a) (valloc(d)) -#define FFTS_FREE(d) (free(d)) - -static FFTS_INLINE void -STORESPR(float *addr, VS p) -{ - vst1q_f32(addr, p.val[0]); - vst1q_f32(addr + 4, p.val[1]); -} - -static FFTS_INLINE V -IMULI(int inv, V a) +static FFTS_ALWAYS_INLINE V4SF +V4SF_IMULI(int inv, V a) { if (inv) { - return VSWAPPAIRS(VXOR(a, VLIT4(0.0f, -0.0f, 0.0f, -0.0f))); + return V4SF_SWAP_PAIRS(V4SF_XOR(a, V4SF_LIT4(0.0f, -0.0f, 0.0f, -0.0f))); } else { - return VSWAPPAIRS(VXOR(a, VLIT4(-0.0f, 0.0f, -0.0f, 0.0f))); + return V4SF_SWAP_PAIRS(V4SF_XOR(a, V4SF_LIT4(-0.0f, 0.0f, -0.0f, 0.0f))); } } -static FFTS_INLINE V -IMUL(V d, V re, V im) +static FFTS_ALWAYS_INLINE V4SF +V4SF_IMUL(V4SF d, V4SF re, V4SF im) +{ + re = V4SF_MUL(re, d); + im = V4SF_MUL(im, V4SF_SWAP_PAIRS(d)); + return V4SF_SUB(re, im); +} + +static FFTS_ALWAYS_INLINE V4SF +V4SF_IMULJ(V d, V re, V im) { - re = VMUL(re, d); - im = VMUL(im, VSWAPPAIRS(d)); - return VSUB(re, im); + re = V4SF_MUL(re, d); + im = V4SF_MUL(im, V4SF_SWAP_PAIRS(d)); + return V4SF_ADD(re, im); } -static FFTS_INLINE V -IMULJ(V d, V re, V im) +#define V4SF2_ST vst2q_f32 +#define V4SF2_LD vld2q_f32 + +static FFTS_ALWAYS_INLINE void +V4SF2_STORE_SPR(float *addr, V4SF2 p) { - re = VMUL(re, d); - im = VMUL(im, VSWAPPAIRS(d)); - return VADD(re, im); + vst1q_f32(addr, p.val[0]); + vst1q_f32(addr + 4, p.val[1]); } #endif /* FFTS_MACROS_NEON_H */ diff --git a/src/macros-sse.h b/src/macros-sse.h index cab822c..827aa67 100644 --- a/src/macros-sse.h +++ b/src/macros-sse.h @@ -1,87 +1,100 @@ /* - This file is part of FFTS -- The Fastest Fourier Transform in the South - - Copyright (c) 2012, Anthony M. Blake - Copyright (c) 2012, The University of Waikato - - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - * Neither the name of the organization nor the - names of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY - DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +This file is part of FFTS -- The Fastest Fourier Transform in the South + +Copyright (c) 2012, Anthony M. Blake +Copyright (c) 2012, The University of Waikato + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: +* Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +* Neither the name of the organization nor the +names of its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef FFTS_MACROS_SSE_H #define FFTS_MACROS_SSE_H +#if defined (_MSC_VER) && (_MSC_VER >= 1020) +#pragma once +#endif + #include -//#define VL 4 +#define FFTS_MALLOC(d,a) (_mm_malloc(d,a)) +#define FFTS_FREE(d) (_mm_free(d)) + +typedef __m128 V4SF; -typedef __m128 V; +#define V4SF_ADD _mm_add_ps +#define V4SF_SUB _mm_sub_ps +#define V4SF_MUL _mm_mul_ps +#define V4SF_LIT4 _mm_set_ps +#define V4SF_XOR _mm_xor_ps +#define V4SF_ST _mm_store_ps +#define V4SF_LD _mm_load_ps -#define VADD _mm_add_ps -#define VSUB _mm_sub_ps -#define VMUL _mm_mul_ps -#define VLIT4 _mm_set_ps -#define VXOR _mm_xor_ps -#define VST _mm_store_ps -#define VLD _mm_load_ps +#define V4SF_SWAP_PAIRS(x) \ + (_mm_shuffle_ps(x, x, _MM_SHUFFLE(2,3,0,1))) -#define VSWAPPAIRS(x) (_mm_shuffle_ps(x,x,_MM_SHUFFLE(2,3,0,1))) +#define V4SF_UNPACK_HI(x,y) \ + (_mm_shuffle_ps(x, y, _MM_SHUFFLE(3,2,3,2))) -#define VUNPACKHI(x,y) (_mm_shuffle_ps(x,y,_MM_SHUFFLE(3,2,3,2))) -#define VUNPACKLO(x,y) (_mm_movelh_ps(x,y)) +#define V4SF_UNPACK_LO(x,y) \ + (_mm_movelh_ps(x, y)) -#define VBLEND(x,y) (_mm_shuffle_ps(x,y,_MM_SHUFFLE(3,2,1,0))) +#define V4SF_BLEND(x, y) \ + (_mm_shuffle_ps(x, y, _MM_SHUFFLE(3,2,1,0))) -#define VDUPRE(r) (_mm_shuffle_ps(r,r,_MM_SHUFFLE(2,2,0,0))) -#define VDUPIM(r) (_mm_shuffle_ps(r,r,_MM_SHUFFLE(3,3,1,1))) +#define V4SF_DUPLICATE_RE(r) \ + (_mm_shuffle_ps(r, r, _MM_SHUFFLE(2,2,0,0))) -#define FFTS_MALLOC(d,a) (_mm_malloc(d,a)) -#define FFTS_FREE(d) (_mm_free(d)) +#define V4SF_DUPLICATE_IM(r) \ + (_mm_shuffle_ps(r, r, _MM_SHUFFLE(3,3,1,1))) -static FFTS_ALWAYS_INLINE V IMULI(int inv, V a) +static FFTS_ALWAYS_INLINE V4SF +V4SF_IMULI(int inv, V4SF a) { if (inv) { - return VSWAPPAIRS(VXOR(a, VLIT4(0.0f, -0.0f, 0.0f, -0.0f))); + return V4SF_SWAP_PAIRS(V4SF_XOR(a, V4SF_LIT4(0.0f, -0.0f, 0.0f, -0.0f))); } else { - return VSWAPPAIRS(VXOR(a, VLIT4(-0.0f, 0.0f, -0.0f, 0.0f))); + return V4SF_SWAP_PAIRS(V4SF_XOR(a, V4SF_LIT4(-0.0f, 0.0f, -0.0f, 0.0f))); } } -static FFTS_ALWAYS_INLINE V IMUL(V d, V re, V im) +static FFTS_ALWAYS_INLINE V4SF +V4SF_IMUL(V4SF d, V4SF re, V4SF im) { - re = VMUL(re, d); - im = VMUL(im, VSWAPPAIRS(d)); - return VSUB(re, im); + re = V4SF_MUL(re, d); + im = V4SF_MUL(im, V4SF_SWAP_PAIRS(d)); + return V4SF_SUB(re, im); } -static FFTS_ALWAYS_INLINE V IMULJ(V d, V re, V im) +static FFTS_ALWAYS_INLINE V4SF +V4SF_IMULJ(V4SF d, V4SF re, V4SF im) { - re = VMUL(re, d); - im = VMUL(im, VSWAPPAIRS(d)); - return VADD(re, im); + re = V4SF_MUL(re, d); + im = V4SF_MUL(im, V4SF_SWAP_PAIRS(d)); + return V4SF_ADD(re, im); } #endif /* FFTS_MACROS_SSE_H */ diff --git a/src/macros.h b/src/macros.h index fc53ae4..b755775 100644 --- a/src/macros.h +++ b/src/macros.h @@ -49,102 +49,108 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif static FFTS_INLINE void -TX2(V *a, V *b) +V4SF_TX2(V4SF *a, V4SF *b) { - V TX2_t0 = VUNPACKLO(*a, *b); - V TX2_t1 = VUNPACKHI(*a, *b); - *a = TX2_t0; - *b = TX2_t1; + V4SF t0 = V4SF_UNPACK_LO(*a, *b); + V4SF t1 = V4SF_UNPACK_HI(*a, *b); + *a = t0; + *b = t1; } static FFTS_INLINE void -K_N(int inv, V re, V im, V *r0, V *r1, V *r2, V *r3) +V4SF_K_N(int inv, + V4SF re, + V4SF im, + V4SF *r0, + V4SF *r1, + V4SF *r2, + V4SF *r3) { - V uk, uk2, zk_p, zk_n, zk, zk_d; + V4SF uk, uk2, zk_p, zk_n, zk, zk_d; uk = *r0; uk2 = *r1; - zk_p = IMUL(*r2, re, im); - zk_n = IMULJ(*r3, re, im); + zk_p = V4SF_IMUL(*r2, re, im); + zk_n = V4SF_IMULJ(*r3, re, im); - zk = VADD(zk_p, zk_n); - zk_d = IMULI(inv, VSUB(zk_p, zk_n)); + zk = V4SF_ADD(zk_p, zk_n); + zk_d = V4SF_IMULI(inv, V4SF_SUB(zk_p, zk_n)); - *r2 = VSUB(uk, zk); - *r0 = VADD(uk, zk); - *r3 = VADD(uk2, zk_d); - *r1 = VSUB(uk2, zk_d); + *r2 = V4SF_SUB(uk, zk); + *r0 = V4SF_ADD(uk, zk); + *r3 = V4SF_ADD(uk2, zk_d); + *r1 = V4SF_SUB(uk2, zk_d); } static FFTS_INLINE void -L_2_4(int inv, - const float *FFTS_RESTRICT i0, - const float *FFTS_RESTRICT i1, - const float *FFTS_RESTRICT i2, - const float *FFTS_RESTRICT i3, - V *r0, - V *r1, - V *r2, - V *r3) +V4SF_L_2_4(int inv, + const float *FFTS_RESTRICT i0, + const float *FFTS_RESTRICT i1, + const float *FFTS_RESTRICT i2, + const float *FFTS_RESTRICT i3, + V4SF *r0, + V4SF *r1, + V4SF *r2, + V4SF *r3) { - V t0, t1, t2, t3, t4, t5, t6, t7; + V4SF t0, t1, t2, t3, t4, t5, t6, t7; - t0 = VLD(i0); - t1 = VLD(i1); - t2 = VLD(i2); - t3 = VLD(i3); + t0 = V4SF_LD(i0); + t1 = V4SF_LD(i1); + t2 = V4SF_LD(i2); + t3 = V4SF_LD(i3); - t4 = VADD(t0, t1); - t5 = VSUB(t0, t1); - t6 = VADD(t2, t3); - t7 = VSUB(t2, t3); + t4 = V4SF_ADD(t0, t1); + t5 = V4SF_SUB(t0, t1); + t6 = V4SF_ADD(t2, t3); + t7 = V4SF_SUB(t2, t3); - *r0 = VUNPACKLO(t4, t5); - *r1 = VUNPACKLO(t6, t7); + *r0 = V4SF_UNPACK_LO(t4, t5); + *r1 = V4SF_UNPACK_LO(t6, t7); - t5 = IMULI(inv, t5); + t5 = V4SF_IMULI(inv, t5); - t0 = VADD(t6, t4); - t2 = VSUB(t6, t4); - t1 = VSUB(t7, t5); - t3 = VADD(t7, t5); + t0 = V4SF_ADD(t6, t4); + t2 = V4SF_SUB(t6, t4); + t1 = V4SF_SUB(t7, t5); + t3 = V4SF_ADD(t7, t5); - *r3 = VUNPACKHI(t0, t1); - *r2 = VUNPACKHI(t2, t3); + *r3 = V4SF_UNPACK_HI(t0, t1); + *r2 = V4SF_UNPACK_HI(t2, t3); } static FFTS_INLINE void -L_4_4(int inv, - const float *FFTS_RESTRICT i0, - const float *FFTS_RESTRICT i1, - const float *FFTS_RESTRICT i2, - const float *FFTS_RESTRICT i3, - V *r0, - V *r1, - V *r2, - V *r3) +V4SF_L_4_4(int inv, + const float *FFTS_RESTRICT i0, + const float *FFTS_RESTRICT i1, + const float *FFTS_RESTRICT i2, + const float *FFTS_RESTRICT i3, + V4SF *r0, + V4SF *r1, + V4SF *r2, + V4SF *r3) { - V t0, t1, t2, t3, t4, t5, t6, t7; + V4SF t0, t1, t2, t3, t4, t5, t6, t7; - t0 = VLD(i0); - t1 = VLD(i1); - t2 = VLD(i2); - t3 = VLD(i3); + t0 = V4SF_LD(i0); + t1 = V4SF_LD(i1); + t2 = V4SF_LD(i2); + t3 = V4SF_LD(i3); - t4 = VADD(t0, t1); - t5 = VSUB(t0, t1); - t6 = VADD(t2, t3); + t4 = V4SF_ADD(t0, t1); + t5 = V4SF_SUB(t0, t1); + t6 = V4SF_ADD(t2, t3); - t7 = IMULI(inv, VSUB(t2, t3)); + t7 = V4SF_IMULI(inv, V4SF_SUB(t2, t3)); - t0 = VADD(t4, t6); - t2 = VSUB(t4, t6); - t1 = VSUB(t5, t7); - t3 = VADD(t5, t7); + t0 = V4SF_ADD(t4, t6); + t2 = V4SF_SUB(t4, t6); + t1 = V4SF_SUB(t5, t7); + t3 = V4SF_ADD(t5, t7); - TX2(&t0, &t1); - TX2(&t2, &t3); + V4SF_TX2(&t0, &t1); + V4SF_TX2(&t2, &t3); *r0 = t0; *r2 = t1; @@ -153,46 +159,46 @@ L_4_4(int inv, } static FFTS_INLINE void -L_4_2(int inv, - const float *FFTS_RESTRICT i0, - const float *FFTS_RESTRICT i1, - const float *FFTS_RESTRICT i2, - const float *FFTS_RESTRICT i3, - V *r0, - V *r1, - V *r2, - V *r3) +V4SF_L_4_2(int inv, + const float *FFTS_RESTRICT i0, + const float *FFTS_RESTRICT i1, + const float *FFTS_RESTRICT i2, + const float *FFTS_RESTRICT i3, + V4SF *r0, + V4SF *r1, + V4SF *r2, + V4SF *r3) { - V t0, t1, t2, t3, t4, t5, t6, t7; + V4SF t0, t1, t2, t3, t4, t5, t6, t7; - t0 = VLD(i0); - t1 = VLD(i1); - t6 = VLD(i2); - t7 = VLD(i3); + t0 = V4SF_LD(i0); + t1 = V4SF_LD(i1); + t6 = V4SF_LD(i2); + t7 = V4SF_LD(i3); - t2 = VBLEND(t6, t7); - t3 = VBLEND(t7, t6); + t2 = V4SF_BLEND(t6, t7); + t3 = V4SF_BLEND(t7, t6); - t4 = VADD(t0, t1); - t5 = VSUB(t0, t1); - t6 = VADD(t2, t3); - t7 = VSUB(t2, t3); + t4 = V4SF_ADD(t0, t1); + t5 = V4SF_SUB(t0, t1); + t6 = V4SF_ADD(t2, t3); + t7 = V4SF_SUB(t2, t3); - *r2 = VUNPACKHI(t4, t5); - *r3 = VUNPACKHI(t6, t7); + *r2 = V4SF_UNPACK_HI(t4, t5); + *r3 = V4SF_UNPACK_HI(t6, t7); - t7 = IMULI(inv, t7); + t7 = V4SF_IMULI(inv, t7); - t0 = VADD(t4, t6); - t2 = VSUB(t4, t6); - t1 = VSUB(t5, t7); - t3 = VADD(t5, t7); + t0 = V4SF_ADD(t4, t6); + t2 = V4SF_SUB(t4, t6); + t1 = V4SF_SUB(t5, t7); + t3 = V4SF_ADD(t5, t7); - *r0 = VUNPACKLO(t0, t1); - *r1 = VUNPACKLO(t2, t3); + *r0 = V4SF_UNPACK_LO(t0, t1); + *r1 = V4SF_UNPACK_LO(t2, t3); } -#define S_4(r0, r1, r2, r3, o0, o1, o2, o3) \ - VST(o0, r0); VST(o1, r1); VST(o2, r2); VST(o3, r3); +#define V4SF_S_4(r0, r1, r2, r3, o0, o1, o2, o3) \ + V4SF_ST(o0, r0); V4SF_ST(o1, r1); V4SF_ST(o2, r2); V4SF_ST(o3, r3); #endif /* FFTS_MACROS_H */ \ No newline at end of file -- cgit v1.1