diff options
author | Anthony Blake <anthonix@me.com> | 2013-04-22 17:21:47 +1200 |
---|---|---|
committer | Anthony Blake <anthonix@me.com> | 2013-04-22 17:21:47 +1200 |
commit | 752031ba2441f5fef3617b05b9cd2d36cb3b30c4 (patch) | |
tree | 6f96efd68262ea89a2948b82255c05536b32f6d5 /src | |
parent | bd43284b757bd62f9d9f1f1108703b134efc16d7 (diff) | |
download | ffts-752031ba2441f5fef3617b05b9cd2d36cb3b30c4.zip ffts-752031ba2441f5fef3617b05b9cd2d36cb3b30c4.tar.gz |
Included new files I forgot to commit earlier -- thanks Michael Cree
Diffstat (limited to 'src')
-rw-r--r-- | src/ffts_small.c | 156 | ||||
-rw-r--r-- | src/ffts_small.h | 13 | ||||
-rw-r--r-- | src/macros-alpha.h | 206 | ||||
-rw-r--r-- | src/macros-altivec.h | 137 | ||||
-rw-r--r-- | src/macros-neon.h | 96 |
5 files changed, 608 insertions, 0 deletions
diff --git a/src/ffts_small.c b/src/ffts_small.c new file mode 100644 index 0000000..ddd2d3e --- /dev/null +++ b/src/ffts_small.c @@ -0,0 +1,156 @@ +/* + + This file is part of FFTS -- The Fastest Fourier Transform in the South + + Copyright (c) 2013, Michael J. Cree <mcree@orcon.net.nz> + Copyright (c) 2012, 2013, Anthony M. Blake <amb@anthonix.com> + + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the organization nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY + DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "ffts.h" +#include "macros.h" + +#include <stdlib.h> + +#define DEBUG(x) + +#include "ffts_small.h" + + void firstpass_16_f(ffts_plan_t * p, const void * in, void * out) +{ + const data_t *din = (const data_t *)in; + data_t *dout = (data_t *)out; + V r0_1,r2_3,r4_5,r6_7,r8_9,r10_11,r12_13,r14_15; + float *LUT8 = p->ws; + + L_4_4(0, din+0,din+16,din+8,din+24,&r0_1,&r2_3,&r8_9,&r10_11); + L_2_4(0, din+4,din+20,din+28,din+12,&r4_5,&r6_7,&r14_15,&r12_13); + K_N(0, VLD(LUT8),VLD(LUT8+4),&r0_1,&r2_3,&r4_5,&r6_7); + K_N(0, VLD(LUT8+8),VLD(LUT8+12),&r0_1,&r4_5,&r8_9,&r12_13); + S_4(r0_1,r4_5,r8_9,r12_13,dout+0,dout+8,dout+16,dout+24); + K_N(0, VLD(LUT8+16),VLD(LUT8+20),&r2_3,&r6_7,&r10_11,&r14_15); + S_4(r2_3,r6_7,r10_11,r14_15,dout+4,dout+12,dout+20,dout+28); +} + + void firstpass_16_b(ffts_plan_t * p, const void * in, void * out) +{ + const data_t *din = (const data_t *)in; + data_t *dout = (data_t *)out; + V r0_1,r2_3,r4_5,r6_7,r8_9,r10_11,r12_13,r14_15; + float *LUT8 = p->ws; + + L_4_4(1, din+0,din+16,din+8,din+24,&r0_1,&r2_3,&r8_9,&r10_11); + L_2_4(1, din+4,din+20,din+28,din+12,&r4_5,&r6_7,&r14_15,&r12_13); + K_N(1, VLD(LUT8),VLD(LUT8+4),&r0_1,&r2_3,&r4_5,&r6_7); + K_N(1, VLD(LUT8+8),VLD(LUT8+12),&r0_1,&r4_5,&r8_9,&r12_13); + S_4(r0_1,r4_5,r8_9,r12_13,dout+0,dout+8,dout+16,dout+24); + K_N(1, VLD(LUT8+16),VLD(LUT8+20),&r2_3,&r6_7,&r10_11,&r14_15); + S_4(r2_3,r6_7,r10_11,r14_15,dout+4,dout+12,dout+20,dout+28); +} + + + void firstpass_8_f(ffts_plan_t *p, const void *in, void *out) +{ + const data_t *din = (const data_t *)in; + data_t *dout = (data_t *)out; + V r0_1, r2_3, r4_5, r6_7; + float *LUT8 = p->ws + p->ws_is[0]; + + L_4_2(0, din, din+8, din+4, din+12, &r0_1, &r2_3, &r4_5, &r6_7); + K_N(0, VLD(LUT8), VLD(LUT8+4), &r0_1, &r2_3, &r4_5, &r6_7); + S_4(r0_1,r2_3,r4_5,r6_7,dout+0,dout+4,dout+8,dout+12); +} + + void firstpass_8_b(ffts_plan_t *p, const void *in, void *out) +{ + const data_t *din = (const data_t *)in; + data_t *dout = (data_t *)out; + V r0_1, r2_3, r4_5, r6_7; + float *LUT8 = p->ws + p->ws_is[0]; + + L_4_2(1, din, din+8, din+4, din+12, &r0_1, &r2_3, &r4_5, &r6_7); + K_N(1, VLD(LUT8), VLD(LUT8+4), &r0_1, &r2_3, &r4_5, &r6_7); + S_4(r0_1,r2_3,r4_5,r6_7,dout+0,dout+4,dout+8,dout+12); +} + + + void firstpass_4_f(ffts_plan_t *p, const void *in, void *out) +{ + const data_t *din = (const data_t *)in; + data_t *dout = (data_t *)out; + cdata_t t0, t1, t2, t3, t4, t5, t6, t7; + t0[0] = din[0]; t0[1] = din[1]; + t1[0] = din[4]; t1[1] = din[5]; + t2[0] = din[2]; t2[1] = din[3]; + t3[0] = din[6]; t3[1] = din[7]; + + t4[0] = t0[0] + t1[0]; t4[1] = t0[1] + t1[1]; + t5[0] = t0[0] - t1[0]; t5[1] = t0[1] - t1[1]; + t6[0] = t2[0] + t3[0]; t6[1] = t2[1] + t3[1]; + t7[0] = t2[0] - t3[0]; t7[1] = t2[1] - t3[1]; + + dout[0] = t4[0] + t6[0]; dout[1] = t4[1] + t6[1]; + dout[4] = t4[0] - t6[0]; dout[5] = t4[1] - t6[1]; + dout[2] = t5[0] + t7[1]; dout[3] = t5[1] - t7[0]; + dout[6] = t5[0] - t7[1]; dout[7] = t5[1] + t7[0]; +} + + void firstpass_4_b(ffts_plan_t *p, const void *in, void *out) +{ + const data_t *din = (const data_t *)in; + data_t *dout = (data_t *)out; + cdata_t t0, t1, t2, t3, t4, t5, t6, t7; + t0[0] = din[0]; t0[1] = din[1]; + t1[0] = din[4]; t1[1] = din[5]; + t2[0] = din[2]; t2[1] = din[3]; + t3[0] = din[6]; t3[1] = din[7]; + + t4[0] = t0[0] + t1[0]; t4[1] = t0[1] + t1[1]; + t5[0] = t0[0] - t1[0]; t5[1] = t0[1] - t1[1]; + t6[0] = t2[0] + t3[0]; t6[1] = t2[1] + t3[1]; + t7[0] = t2[0] - t3[0]; t7[1] = t2[1] - t3[1]; + + dout[0] = t4[0] + t6[0]; dout[1] = t4[1] + t6[1]; + dout[4] = t4[0] - t6[0]; dout[5] = t4[1] - t6[1]; + dout[2] = t5[0] - t7[1]; dout[3] = t5[1] + t7[0]; + dout[6] = t5[0] + t7[1]; dout[7] = t5[1] - t7[0]; +} + + void firstpass_2(ffts_plan_t *p, const void *in, void *out) +{ + const data_t *din = (const data_t *)in; + data_t *dout = (data_t *)out; + cdata_t t0, t1, r0,r1; + t0[0] = din[0]; t0[1] = din[1]; + t1[0] = din[2]; t1[1] = din[3]; + r0[0] = t0[0] + t1[0]; + r0[1] = t0[1] + t1[1]; + r1[0] = t0[0] - t1[0]; + r1[1] = t0[1] - t1[1]; + dout[0] = r0[0]; dout[1] = r0[1]; + dout[2] = r1[0]; dout[3] = r1[1]; +} diff --git a/src/ffts_small.h b/src/ffts_small.h new file mode 100644 index 0000000..76cadf5 --- /dev/null +++ b/src/ffts_small.h @@ -0,0 +1,13 @@ +#ifndef __FFTS_SMALL_H__ +#define __FFTS_SMALL_H__ + + +void firstpass_16_f(ffts_plan_t * p, const void * in, void * out); +void firstpass_16_b(ffts_plan_t * p, const void * in, void * out); +void firstpass_8_f(ffts_plan_t * p, const void * in, void * out); +void firstpass_8_b(ffts_plan_t * p, const void * in, void * out); +void firstpass_4_f(ffts_plan_t * p, const void * in, void * out); +void firstpass_4_b(ffts_plan_t * p, const void * in, void * out); +void firstpass_2(ffts_plan_t * p, const void * in, void * out); + +#endif diff --git a/src/macros-alpha.h b/src/macros-alpha.h new file mode 100644 index 0000000..06daf4a --- /dev/null +++ b/src/macros-alpha.h @@ -0,0 +1,206 @@ +/* + + This file is part of FFTS -- The Fastest Fourier Transform in the South + + Copyright (c) 2013, Michael J. Cree <mcree@orcon.net.nz> + Copyright (c) 2012, 2013, Anthony M. Blake <amb@anthonix.com> + + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the organization nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY + DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef __MACROS_ALPHA_H__ +#define __MACROS_ALPHA_H__ + +#include <math.h> + +#ifdef __alpha__ +#define restrict +#endif + +typedef struct {float r1, i1, r2, i2;} V; + +#define FFTS_MALLOC(d,a) malloc(d) +#define FFTS_FREE(d) free(d) + +#define VLIT4(f3,f2,f1,f0) ((V){f0,f1,f2,f3}) + +static inline V VADD(V x, V y) +{ + V z; + z.r1 = x.r1 + y.r1; + z.i1 = x.i1 + y.i1; + z.r2 = x.r2 + y.r2; + z.i2 = x.i2 + y.i2; + return z; +} + + +static inline V VSUB(V x, V y) +{ + V z; + z.r1 = x.r1 - y.r1; + z.i1 = x.i1 - y.i1; + z.r2 = x.r2 - y.r2; + z.i2 = x.i2 - y.i2; + return z; +} + + +static inline V VMUL(V x, V y) +{ + V z; + z.r1 = x.r1 * y.r1; + z.i1 = x.i1 * y.i1; + z.r2 = x.r2 * y.r2; + z.i2 = x.i2 * y.i2; + return z; +} + +static inline V VXOR(V x, V y) +{ + V r; + r.r1 = (uint32_t)x.r1 ^ (uint32_t)y.r1; + r.i1 = (uint32_t)x.i1 ^ (uint32_t)y.i1; + r.r2 = (uint32_t)x.r2 ^ (uint32_t)y.r2; + r.i2 = (uint32_t)x.i2 ^ (uint32_t)y.i2; + return r; +} + +static inline V VSWAPPAIRS(V x) +{ + V z; + z.r1 = x.i1; + z.i1 = x.r1; + z.r2 = x.i2; + z.i2 = x.r2; + return z; +} + + +static inline V VBLEND(V x, V y) +{ + V z; + z.r1 = x.r1; + z.i1 = x.i1; + z.r2 = y.r2; + z.i2 = y.i2; + return z; +} + +static inline V VUNPACKHI(V x, V y) +{ + V z; + z.r1 = x.r2; + z.i1 = x.i2; + z.r2 = y.r2; + z.i2 = y.i2; + return z; +} + +static inline V VUNPACKLO(V x, V y) +{ + V z; + z.r1 = x.r1; + z.i1 = x.i1; + z.r2 = y.r1; + z.i2 = y.i1; + return z; +} + +static inline V VDUPRE(V x) +{ + V z; + z.r1 = x.r1; + z.i1 = x.r1; + z.r2 = x.r2; + z.i2 = x.r2; + return z; +} + +static inline V VDUPIM(V x) +{ + V z; + z.r1 = x.i1; + z.i1 = x.i1; + z.r2 = x.i2; + z.i2 = x.i2; + return z; +} + +static inline V IMUL(V d, V re, V im) +{ + re = VMUL(re, d); + im = VMUL(im, VSWAPPAIRS(d)); + return VSUB(re, im); +} + + +static inline V IMULJ(V d, V re, V im) +{ + re = VMUL(re, d); + im = VMUL(im, VSWAPPAIRS(d)); + return VADD(re, im); +} + +static inline V MULI(int inv, V x) +{ + V z; + + if (inv) { + z.r1 = -x.r1; + z.i1 = x.i1; + z.r2 = -x.r2; + z.i2 = x.i2; + }else{ + z.r1 = x.r1; + z.i1 = -x.i1; + z.r2 = x.r2; + z.i2 = -x.i2; + } + return z; +} + + +static inline V IMULI(int inv, V x) +{ + return VSWAPPAIRS(MULI(inv, x)); +} + + +static inline V VLD(const void *s) +{ + V *d = (V *)s; + return *d; +} + + +static inline void VST(void *d, V s) +{ + V *r = (V *)d; + *r = s; +} + +#endif diff --git a/src/macros-altivec.h b/src/macros-altivec.h new file mode 100644 index 0000000..0d148a5 --- /dev/null +++ b/src/macros-altivec.h @@ -0,0 +1,137 @@ +/* + + This file is part of FFTS -- The Fastest Fourier Transform in the South + + Copyright (c) 2013, Michael J. Cree <mcree@orcon.net.nz> + Copyright (c) 2012, 2013, Anthony M. Blake <amb@anthonix.com> + + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the organization nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY + DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef __MACROS_ALTIVEC_H__ +#define __MACROS_ALTIVEC_H__ + +#include <math.h> +#include <altivec.h> + +#define restrict + +typedef vector float V; +typedef vector unsigned char VUC; + +#ifdef __apple__ +#define FFTS_MALLOC(d,a) vec_malloc(d) +#define FFTS_FREE(d) vec_free(d) +#else +/* It appears vec_malloc() and friends are not implemented on Linux */ +#include <malloc.h> +#define FFTS_MALLOC(d,a) memalign(16,d) +#define FFTS_FREE(d) free(d) +#endif + +#define VLIT4(f0,f1,f2,f3) ((V){f0, f1, f2, f3}) + +#define VADD(x,y) vec_add(x,y) +#define VSUB(x,y) vec_sub(x,y) +#define VMUL(x,y) vec_madd(x,y,(V){0}) +#define VMULADD(x,y,z) vec_madd(x,y,z) +#define VNMULSUB(x,y,z) vec_nmsub(x,y,z) +#define VXOR(x,y) vec_xor((x),(y)) +#define VSWAPPAIRS(x) \ + vec_perm(x,x,(VUC){0x04,0x05,0x06,0x07,0x00,0x01,0x02,0x03, \ + 0x0c,0x0d,0x0e,0x0f,0x08,0x09,0x0a,0x0b}) + +#define VBLEND(x,y) \ + vec_perm(x,y,(VUC){0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, \ + 0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f}) + +#define VUNPACKHI(x,y) \ + vec_perm(x,y,(VUC){0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f, \ + 0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f}) + +#define VUNPACKLO(x,y) \ + vec_perm(x,y,(VUC){0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, \ + 0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17}) + +#define VDUPRE(x) \ + vec_perm(x,x,(VUC){0x00,0x01,0x02,0x03,0x00,0x01,0x02,0x03, \ + 0x18,0x19,0x1a,0x1b,0x18,0x19,0x1a,0x1b}) + +#define VDUPIM(x) \ + vec_perm(x,x,(VUC){0x04,0x05,0x06,0x07,0x04,0x05,0x06,0x07, \ + 0x1c,0x1d,0x1e,0x1f,0x1c,0x1d,0x1e,0x1f}) + + +static inline V IMUL(V d, V re, V im) +{ + im = VMUL(im, VSWAPPAIRS(d)); + re = VMUL(re, d); + return VSUB(re, im); +} + + +static inline V IMULJ(V d, V re, V im) +{ + im = VMUL(im, VSWAPPAIRS(d)); + return VMULADD(re, d, im); +} + +#ifndef __GNUC__ +/* gcc (4.6 and 4.7) ICEs on this code! */ +static inline V MULI(int inv, V x) +{ + return VXOR(x, inv ? VLIT4(-0.0f,0.0f,-0.0f,0.0f) : VLIT4(0.0f,-0.0f,0.0f,-0.0f)); +} +#else +/* but compiles this fine... */ +static inline V MULI(int inv, V x) +{ + V t; + t = inv ? VLIT4(-0.0f,0.0f,-0.0f,0.0f) : VLIT4(0.0f,-0.0f,0.0f,-0.0f); + return VXOR(x, t); +} +#endif + + +static inline V IMULI(int inv, V x) +{ + return VSWAPPAIRS(MULI(inv, x)); +} + + +static inline V VLD(const void *s) +{ + V *d = (V *)s; + return *d; +} + + +static inline void VST(void *d, V s) +{ + V *r = (V *)d; + *r = s; +} +#endif diff --git a/src/macros-neon.h b/src/macros-neon.h new file mode 100644 index 0000000..0750b75 --- /dev/null +++ b/src/macros-neon.h @@ -0,0 +1,96 @@ +/* + + This file is part of FFTS -- The Fastest Fourier Transform in the South + + Copyright (c) 2012, 2013, Anthony M. Blake <amb@anthonix.com> + + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the organization nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY + DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ +#ifndef __MACROS_NEON_H__ +#define __MACROS_NEON_H__ + +#include "neon.h" +#include <arm_neon.h> + +typedef float32x4_t V; + +typedef float32x4x2_t VS; + +#define ADD vaddq_f32 +#define SUB vsubq_f32 +#define MUL vmulq_f32 +#define VADD vaddq_f32 +#define VSUB vsubq_f32 +#define VMUL vmulq_f32 +#define VXOR(x,y) (vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(x), vreinterpretq_u32_f32(y)))) +#define VST vst1q_f32 +#define VLD vld1q_f32 +#define VST2 vst2q_f32 +#define VLD2 vld2q_f32 + +#define VSWAPPAIRS(x) (vrev64q_f32(x)) + +#define VUNPACKHI(a,b) (vcombine_f32(vget_high_f32(a), vget_high_f32(b))) +#define VUNPACKLO(a,b) (vcombine_f32(vget_low_f32(a), vget_low_f32(b))) + +#define VBLEND(x,y) (vcombine_f32(vget_low_f32(x), vget_high_f32(y))) + +__INLINE V VLIT4(data_t f3, data_t f2, data_t f1, data_t f0) { + data_t __attribute__ ((aligned(16))) d[4] = {f0, f1, f2, f3}; + return VLD(d); +} + +#define VDUPRE(r) vcombine_f32(vdup_lane_f32(vget_low_f32(r),0), vdup_lane_f32(vget_high_f32(r),0)) +#define VDUPIM(r) vcombine_f32(vdup_lane_f32(vget_low_f32(r),1), vdup_lane_f32(vget_high_f32(r),1)) + +#define FFTS_MALLOC(d,a) (valloc(d)) +#define FFTS_FREE(d) (free(d)) + +__INLINE void STORESPR(data_t * addr, VS p) { + + vst1q_f32(addr, p.val[0]); + vst1q_f32(addr + 4, p.val[1]); + +} + +__INLINE V IMULI(int inv, V a) { + if(inv) return VSWAPPAIRS(VXOR(a, VLIT4(0.0f, -0.0f, 0.0f, -0.0f))); + else return VSWAPPAIRS(VXOR(a, VLIT4(-0.0f, 0.0f, -0.0f, 0.0f))); +} + +__INLINE V IMUL(V d, V re, V im) { + re = VMUL(re, d); + im = VMUL(im, VSWAPPAIRS(d)); + return VSUB(re, im); +} + +__INLINE V IMULJ(V d, V re, V im) { + re = VMUL(re, d); + im = VMUL(im, VSWAPPAIRS(d)); + return VADD(re, im); +} + +#endif |