author    Anthony Blake <anthonix@me.com>    2013-04-22 17:21:47 +1200
committer Anthony Blake <anthonix@me.com>    2013-04-22 17:21:47 +1200
commit    752031ba2441f5fef3617b05b9cd2d36cb3b30c4 (patch)
tree      6f96efd68262ea89a2948b82255c05536b32f6d5
parent    bd43284b757bd62f9d9f1f1108703b134efc16d7 (diff)
download  ffts-752031ba2441f5fef3617b05b9cd2d36cb3b30c4.zip
          ffts-752031ba2441f5fef3617b05b9cd2d36cb3b30c4.tar.gz
Included new files I forgot to commit earlier -- thanks Michael Cree
-rw-r--r--  src/ffts_small.c      156
-rw-r--r--  src/ffts_small.h       13
-rw-r--r--  src/macros-alpha.h    206
-rw-r--r--  src/macros-altivec.h  137
-rw-r--r--  src/macros-neon.h      96
5 files changed, 608 insertions, 0 deletions
diff --git a/src/ffts_small.c b/src/ffts_small.c
new file mode 100644
index 0000000..ddd2d3e
--- /dev/null
+++ b/src/ffts_small.c
@@ -0,0 +1,156 @@
+/*
+
+ This file is part of FFTS -- The Fastest Fourier Transform in the South
+
+ Copyright (c) 2013, Michael J. Cree <mcree@orcon.net.nz>
+ Copyright (c) 2012, 2013, Anthony M. Blake <amb@anthonix.com>
+
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the organization nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+ DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "ffts.h"
+#include "macros.h"
+
+#include <stdlib.h>
+
+#define DEBUG(x)
+
+#include "ffts_small.h"
+
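+/*
+ * Hard-coded first passes for small transform sizes (2, 4, 8 and 16 points).
+ * The _f variants compute the forward transform and the _b variants the
+ * backward (inverse) transform; firstpass_2 is direction-independent.
+ */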
+ void firstpass_16_f(ffts_plan_t * p, const void * in, void * out)
+{
+ const data_t *din = (const data_t *)in;
+ data_t *dout = (data_t *)out;
+ V r0_1,r2_3,r4_5,r6_7,r8_9,r10_11,r12_13,r14_15;
+ float *LUT8 = p->ws;
+
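+	/* Leaf loads (L_*), twiddle butterflies (K_N, twiddles read from the
+	   plan's ws table) and interleaved stores (S_4) come from macros.h;
+	   the first argument selects the direction (0 = forward, 1 = backward). */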
+ L_4_4(0, din+0,din+16,din+8,din+24,&r0_1,&r2_3,&r8_9,&r10_11);
+ L_2_4(0, din+4,din+20,din+28,din+12,&r4_5,&r6_7,&r14_15,&r12_13);
+ K_N(0, VLD(LUT8),VLD(LUT8+4),&r0_1,&r2_3,&r4_5,&r6_7);
+ K_N(0, VLD(LUT8+8),VLD(LUT8+12),&r0_1,&r4_5,&r8_9,&r12_13);
+ S_4(r0_1,r4_5,r8_9,r12_13,dout+0,dout+8,dout+16,dout+24);
+ K_N(0, VLD(LUT8+16),VLD(LUT8+20),&r2_3,&r6_7,&r10_11,&r14_15);
+ S_4(r2_3,r6_7,r10_11,r14_15,dout+4,dout+12,dout+20,dout+28);
+}
+
+ void firstpass_16_b(ffts_plan_t * p, const void * in, void * out)
+{
+ const data_t *din = (const data_t *)in;
+ data_t *dout = (data_t *)out;
+ V r0_1,r2_3,r4_5,r6_7,r8_9,r10_11,r12_13,r14_15;
+ float *LUT8 = p->ws;
+
+ L_4_4(1, din+0,din+16,din+8,din+24,&r0_1,&r2_3,&r8_9,&r10_11);
+ L_2_4(1, din+4,din+20,din+28,din+12,&r4_5,&r6_7,&r14_15,&r12_13);
+ K_N(1, VLD(LUT8),VLD(LUT8+4),&r0_1,&r2_3,&r4_5,&r6_7);
+ K_N(1, VLD(LUT8+8),VLD(LUT8+12),&r0_1,&r4_5,&r8_9,&r12_13);
+ S_4(r0_1,r4_5,r8_9,r12_13,dout+0,dout+8,dout+16,dout+24);
+ K_N(1, VLD(LUT8+16),VLD(LUT8+20),&r2_3,&r6_7,&r10_11,&r14_15);
+ S_4(r2_3,r6_7,r10_11,r14_15,dout+4,dout+12,dout+20,dout+28);
+}
+
+
+ void firstpass_8_f(ffts_plan_t *p, const void *in, void *out)
+{
+ const data_t *din = (const data_t *)in;
+ data_t *dout = (data_t *)out;
+ V r0_1, r2_3, r4_5, r6_7;
+ float *LUT8 = p->ws + p->ws_is[0];
+
+ L_4_2(0, din, din+8, din+4, din+12, &r0_1, &r2_3, &r4_5, &r6_7);
+ K_N(0, VLD(LUT8), VLD(LUT8+4), &r0_1, &r2_3, &r4_5, &r6_7);
+ S_4(r0_1,r2_3,r4_5,r6_7,dout+0,dout+4,dout+8,dout+12);
+}
+
+ void firstpass_8_b(ffts_plan_t *p, const void *in, void *out)
+{
+ const data_t *din = (const data_t *)in;
+ data_t *dout = (data_t *)out;
+ V r0_1, r2_3, r4_5, r6_7;
+ float *LUT8 = p->ws + p->ws_is[0];
+
+ L_4_2(1, din, din+8, din+4, din+12, &r0_1, &r2_3, &r4_5, &r6_7);
+ K_N(1, VLD(LUT8), VLD(LUT8+4), &r0_1, &r2_3, &r4_5, &r6_7);
+ S_4(r0_1,r2_3,r4_5,r6_7,dout+0,dout+4,dout+8,dout+12);
+}
+
+
+ void firstpass_4_f(ffts_plan_t *p, const void *in, void *out)
+{
+ const data_t *din = (const data_t *)in;
+ data_t *dout = (data_t *)out;
+ cdata_t t0, t1, t2, t3, t4, t5, t6, t7;
+ t0[0] = din[0]; t0[1] = din[1];
+ t1[0] = din[4]; t1[1] = din[5];
+ t2[0] = din[2]; t2[1] = din[3];
+ t3[0] = din[6]; t3[1] = din[7];
+
+ t4[0] = t0[0] + t1[0]; t4[1] = t0[1] + t1[1];
+ t5[0] = t0[0] - t1[0]; t5[1] = t0[1] - t1[1];
+ t6[0] = t2[0] + t3[0]; t6[1] = t2[1] + t3[1];
+ t7[0] = t2[0] - t3[0]; t7[1] = t2[1] - t3[1];
+
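+	/* Combine: X1/X3 multiply t7 by -i ((a+bi)*(-i) = b - ai), the
+	   forward-direction twiddle; firstpass_4_b below uses +i instead. */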
+ dout[0] = t4[0] + t6[0]; dout[1] = t4[1] + t6[1];
+ dout[4] = t4[0] - t6[0]; dout[5] = t4[1] - t6[1];
+ dout[2] = t5[0] + t7[1]; dout[3] = t5[1] - t7[0];
+ dout[6] = t5[0] - t7[1]; dout[7] = t5[1] + t7[0];
+}
+
+ void firstpass_4_b(ffts_plan_t *p, const void *in, void *out)
+{
+ const data_t *din = (const data_t *)in;
+ data_t *dout = (data_t *)out;
+ cdata_t t0, t1, t2, t3, t4, t5, t6, t7;
+ t0[0] = din[0]; t0[1] = din[1];
+ t1[0] = din[4]; t1[1] = din[5];
+ t2[0] = din[2]; t2[1] = din[3];
+ t3[0] = din[6]; t3[1] = din[7];
+
+ t4[0] = t0[0] + t1[0]; t4[1] = t0[1] + t1[1];
+ t5[0] = t0[0] - t1[0]; t5[1] = t0[1] - t1[1];
+ t6[0] = t2[0] + t3[0]; t6[1] = t2[1] + t3[1];
+ t7[0] = t2[0] - t3[0]; t7[1] = t2[1] - t3[1];
+
+ dout[0] = t4[0] + t6[0]; dout[1] = t4[1] + t6[1];
+ dout[4] = t4[0] - t6[0]; dout[5] = t4[1] - t6[1];
+ dout[2] = t5[0] - t7[1]; dout[3] = t5[1] + t7[0];
+ dout[6] = t5[0] + t7[1]; dout[7] = t5[1] - t7[0];
+}
+
+ void firstpass_2(ffts_plan_t *p, const void *in, void *out)
+{
+ const data_t *din = (const data_t *)in;
+ data_t *dout = (data_t *)out;
+ cdata_t t0, t1, r0,r1;
+ t0[0] = din[0]; t0[1] = din[1];
+ t1[0] = din[2]; t1[1] = din[3];
+ r0[0] = t0[0] + t1[0];
+ r0[1] = t0[1] + t1[1];
+ r1[0] = t0[0] - t1[0];
+ r1[1] = t0[1] - t1[1];
+ dout[0] = r0[0]; dout[1] = r0[1];
+ dout[2] = r1[0]; dout[3] = r1[1];
+}
diff --git a/src/ffts_small.h b/src/ffts_small.h
new file mode 100644
index 0000000..76cadf5
--- /dev/null
+++ b/src/ffts_small.h
@@ -0,0 +1,13 @@
+#ifndef __FFTS_SMALL_H__
+#define __FFTS_SMALL_H__
+
+
+void firstpass_16_f(ffts_plan_t * p, const void * in, void * out);
+void firstpass_16_b(ffts_plan_t * p, const void * in, void * out);
+void firstpass_8_f(ffts_plan_t * p, const void * in, void * out);
+void firstpass_8_b(ffts_plan_t * p, const void * in, void * out);
+void firstpass_4_f(ffts_plan_t * p, const void * in, void * out);
+void firstpass_4_b(ffts_plan_t * p, const void * in, void * out);
+void firstpass_2(ffts_plan_t * p, const void * in, void * out);
+
+#endif
diff --git a/src/macros-alpha.h b/src/macros-alpha.h
new file mode 100644
index 0000000..06daf4a
--- /dev/null
+++ b/src/macros-alpha.h
@@ -0,0 +1,206 @@
+/*
+
+ This file is part of FFTS -- The Fastest Fourier Transform in the South
+
+ Copyright (c) 2013, Michael J. Cree <mcree@orcon.net.nz>
+ Copyright (c) 2012, 2013, Anthony M. Blake <amb@anthonix.com>
+
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the organization nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+ DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef __MACROS_ALPHA_H__
+#define __MACROS_ALPHA_H__
+
+#include <math.h>
+#include <stdlib.h>	/* malloc()/free() used by FFTS_MALLOC/FFTS_FREE */
+#include <stdint.h>	/* uint32_t used in VXOR */
+
+#ifdef __alpha__
+#define restrict
+#endif
+
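+/* Scalar stand-in for a four-float SIMD register: two complex values,
+   (r1,i1) and (r2,i2), processed per "vector" operation. */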
+typedef struct {float r1, i1, r2, i2;} V;
+
+#define FFTS_MALLOC(d,a) malloc(d)
+#define FFTS_FREE(d) free(d)
+
+#define VLIT4(f3,f2,f1,f0) ((V){f0,f1,f2,f3})
+
+static inline V VADD(V x, V y)
+{
+ V z;
+ z.r1 = x.r1 + y.r1;
+ z.i1 = x.i1 + y.i1;
+ z.r2 = x.r2 + y.r2;
+ z.i2 = x.i2 + y.i2;
+ return z;
+}
+
+
+static inline V VSUB(V x, V y)
+{
+ V z;
+ z.r1 = x.r1 - y.r1;
+ z.i1 = x.i1 - y.i1;
+ z.r2 = x.r2 - y.r2;
+ z.i2 = x.i2 - y.i2;
+ return z;
+}
+
+
+static inline V VMUL(V x, V y)
+{
+ V z;
+ z.r1 = x.r1 * y.r1;
+ z.i1 = x.i1 * y.i1;
+ z.r2 = x.r2 * y.r2;
+ z.i2 = x.i2 * y.i2;
+ return z;
+}
+
+static inline V VXOR(V x, V y)
+{
+    /* XOR must act on the float bit patterns (e.g. for sign-flip masks), so
+       reinterpret the bits through a union rather than converting the values
+       to integers. */
+    union { float f; uint32_t u; } a, b;
+    V r;
+    a.f = x.r1; b.f = y.r1; a.u ^= b.u; r.r1 = a.f;
+    a.f = x.i1; b.f = y.i1; a.u ^= b.u; r.i1 = a.f;
+    a.f = x.r2; b.f = y.r2; a.u ^= b.u; r.r2 = a.f;
+    a.f = x.i2; b.f = y.i2; a.u ^= b.u; r.i2 = a.f;
+    return r;
+}
+
+static inline V VSWAPPAIRS(V x)
+{
+ V z;
+ z.r1 = x.i1;
+ z.i1 = x.r1;
+ z.r2 = x.i2;
+ z.i2 = x.r2;
+ return z;
+}
+
+
+static inline V VBLEND(V x, V y)
+{
+ V z;
+ z.r1 = x.r1;
+ z.i1 = x.i1;
+ z.r2 = y.r2;
+ z.i2 = y.i2;
+ return z;
+}
+
+static inline V VUNPACKHI(V x, V y)
+{
+ V z;
+ z.r1 = x.r2;
+ z.i1 = x.i2;
+ z.r2 = y.r2;
+ z.i2 = y.i2;
+ return z;
+}
+
+static inline V VUNPACKLO(V x, V y)
+{
+ V z;
+ z.r1 = x.r1;
+ z.i1 = x.i1;
+ z.r2 = y.r1;
+ z.i2 = y.i1;
+ return z;
+}
+
+static inline V VDUPRE(V x)
+{
+ V z;
+ z.r1 = x.r1;
+ z.i1 = x.r1;
+ z.r2 = x.r2;
+ z.i2 = x.r2;
+ return z;
+}
+
+static inline V VDUPIM(V x)
+{
+ V z;
+ z.r1 = x.i1;
+ z.i1 = x.i1;
+ z.r2 = x.i2;
+ z.i2 = x.i2;
+ return z;
+}
+
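+/* Complex multiplication of the data d by a twiddle factor supplied as
+   separate real-part (re) and imaginary-part (im) vectors; IMULJ flips the
+   sign of the imaginary contribution, i.e. multiplies by the conjugate. */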
+static inline V IMUL(V d, V re, V im)
+{
+ re = VMUL(re, d);
+ im = VMUL(im, VSWAPPAIRS(d));
+ return VSUB(re, im);
+}
+
+
+static inline V IMULJ(V d, V re, V im)
+{
+ re = VMUL(re, d);
+ im = VMUL(im, VSWAPPAIRS(d));
+ return VADD(re, im);
+}
+
+static inline V MULI(int inv, V x)
+{
+ V z;
+
+ if (inv) {
+ z.r1 = -x.r1;
+ z.i1 = x.i1;
+ z.r2 = -x.r2;
+ z.i2 = x.i2;
+ }else{
+ z.r1 = x.r1;
+ z.i1 = -x.i1;
+ z.r2 = x.r2;
+ z.i2 = -x.i2;
+ }
+ return z;
+}
+
+
+static inline V IMULI(int inv, V x)
+{
+ return VSWAPPAIRS(MULI(inv, x));
+}
+
+
+static inline V VLD(const void *s)
+{
+ V *d = (V *)s;
+ return *d;
+}
+
+
+static inline void VST(void *d, V s)
+{
+ V *r = (V *)d;
+ *r = s;
+}
+
+#endif
diff --git a/src/macros-altivec.h b/src/macros-altivec.h
new file mode 100644
index 0000000..0d148a5
--- /dev/null
+++ b/src/macros-altivec.h
@@ -0,0 +1,137 @@
+/*
+
+ This file is part of FFTS -- The Fastest Fourier Transform in the South
+
+ Copyright (c) 2013, Michael J. Cree <mcree@orcon.net.nz>
+ Copyright (c) 2012, 2013, Anthony M. Blake <amb@anthonix.com>
+
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the organization nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+ DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef __MACROS_ALTIVEC_H__
+#define __MACROS_ALTIVEC_H__
+
+#include <math.h>
+#include <altivec.h>
+
+#define restrict
+
+typedef vector float V;
+typedef vector unsigned char VUC;
+
+#ifdef __APPLE__
+#define FFTS_MALLOC(d,a) vec_malloc(d)
+#define FFTS_FREE(d) vec_free(d)
+#else
+/* It appears vec_malloc() and friends are not implemented on Linux */
+#include <malloc.h>
+#define FFTS_MALLOC(d,a) memalign(16,d)
+#define FFTS_FREE(d) free(d)
+#endif
+
+#define VLIT4(f0,f1,f2,f3) ((V){f0, f1, f2, f3})
+
+#define VADD(x,y) vec_add(x,y)
+#define VSUB(x,y) vec_sub(x,y)
+#define VMUL(x,y) vec_madd(x,y,(V){0})
+#define VMULADD(x,y,z) vec_madd(x,y,z)
+#define VNMULSUB(x,y,z) vec_nmsub(x,y,z)
+#define VXOR(x,y) vec_xor((x),(y))
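+/* The permutes below use vec_perm byte indices: 0x00-0x0f select bytes from
+   the first operand, 0x10-0x1f from the second. */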
+#define VSWAPPAIRS(x) \
+ vec_perm(x,x,(VUC){0x04,0x05,0x06,0x07,0x00,0x01,0x02,0x03, \
+ 0x0c,0x0d,0x0e,0x0f,0x08,0x09,0x0a,0x0b})
+
+#define VBLEND(x,y) \
+ vec_perm(x,y,(VUC){0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, \
+ 0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f})
+
+#define VUNPACKHI(x,y) \
+ vec_perm(x,y,(VUC){0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f, \
+ 0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f})
+
+#define VUNPACKLO(x,y) \
+ vec_perm(x,y,(VUC){0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, \
+ 0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17})
+
+#define VDUPRE(x) \
+ vec_perm(x,x,(VUC){0x00,0x01,0x02,0x03,0x00,0x01,0x02,0x03, \
+ 0x18,0x19,0x1a,0x1b,0x18,0x19,0x1a,0x1b})
+
+#define VDUPIM(x) \
+ vec_perm(x,x,(VUC){0x04,0x05,0x06,0x07,0x04,0x05,0x06,0x07, \
+ 0x1c,0x1d,0x1e,0x1f,0x1c,0x1d,0x1e,0x1f})
+
+
+static inline V IMUL(V d, V re, V im)
+{
+ im = VMUL(im, VSWAPPAIRS(d));
+ re = VMUL(re, d);
+ return VSUB(re, im);
+}
+
+
+static inline V IMULJ(V d, V re, V im)
+{
+ im = VMUL(im, VSWAPPAIRS(d));
+ return VMULADD(re, d, im);
+}
+
+#ifndef __GNUC__
+/* gcc (4.6 and 4.7) ICEs on this code! */
+static inline V MULI(int inv, V x)
+{
+ return VXOR(x, inv ? VLIT4(-0.0f,0.0f,-0.0f,0.0f) : VLIT4(0.0f,-0.0f,0.0f,-0.0f));
+}
+#else
+/* but compiles this fine... */
+static inline V MULI(int inv, V x)
+{
+ V t;
+ t = inv ? VLIT4(-0.0f,0.0f,-0.0f,0.0f) : VLIT4(0.0f,-0.0f,0.0f,-0.0f);
+ return VXOR(x, t);
+}
+#endif
+
+
+static inline V IMULI(int inv, V x)
+{
+ return VSWAPPAIRS(MULI(inv, x));
+}
+
+
+static inline V VLD(const void *s)
+{
+ V *d = (V *)s;
+ return *d;
+}
+
+
+static inline void VST(void *d, V s)
+{
+ V *r = (V *)d;
+ *r = s;
+}
+#endif
diff --git a/src/macros-neon.h b/src/macros-neon.h
new file mode 100644
index 0000000..0750b75
--- /dev/null
+++ b/src/macros-neon.h
@@ -0,0 +1,96 @@
+/*
+
+ This file is part of FFTS -- The Fastest Fourier Transform in the South
+
+ Copyright (c) 2012, 2013, Anthony M. Blake <amb@anthonix.com>
+
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the organization nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+ DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+#ifndef __MACROS_NEON_H__
+#define __MACROS_NEON_H__
+
+#include "neon.h"
+#include <arm_neon.h>
+
+typedef float32x4_t V;
+
+typedef float32x4x2_t VS;
+
+#define ADD vaddq_f32
+#define SUB vsubq_f32
+#define MUL vmulq_f32
+#define VADD vaddq_f32
+#define VSUB vsubq_f32
+#define VMUL vmulq_f32
+#define VXOR(x,y) (vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(x), vreinterpretq_u32_f32(y))))
+#define VST vst1q_f32
+#define VLD vld1q_f32
+#define VST2 vst2q_f32
+#define VLD2 vld2q_f32
+
+#define VSWAPPAIRS(x) (vrev64q_f32(x))
+
+#define VUNPACKHI(a,b) (vcombine_f32(vget_high_f32(a), vget_high_f32(b)))
+#define VUNPACKLO(a,b) (vcombine_f32(vget_low_f32(a), vget_low_f32(b)))
+
+#define VBLEND(x,y) (vcombine_f32(vget_low_f32(x), vget_high_f32(y)))
+
+__INLINE V VLIT4(data_t f3, data_t f2, data_t f1, data_t f0) {
+ data_t __attribute__ ((aligned(16))) d[4] = {f0, f1, f2, f3};
+ return VLD(d);
+}
+
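+/* VDUPRE/VDUPIM broadcast the real (lane 0) or imaginary (lane 1) element of
+   each complex pair across that pair, e.g. [r0,i0,r1,i1] -> [r0,r0,r1,r1]. */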
+#define VDUPRE(r) vcombine_f32(vdup_lane_f32(vget_low_f32(r),0), vdup_lane_f32(vget_high_f32(r),0))
+#define VDUPIM(r) vcombine_f32(vdup_lane_f32(vget_low_f32(r),1), vdup_lane_f32(vget_high_f32(r),1))
+
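+/* valloc() returns page-aligned memory; the requested alignment a is ignored. */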
+#define FFTS_MALLOC(d,a) (valloc(d))
+#define FFTS_FREE(d) (free(d))
+
+__INLINE void STORESPR(data_t * addr, VS p) {
+
+ vst1q_f32(addr, p.val[0]);
+ vst1q_f32(addr + 4, p.val[1]);
+
+}
+
+__INLINE V IMULI(int inv, V a) {
+ if(inv) return VSWAPPAIRS(VXOR(a, VLIT4(0.0f, -0.0f, 0.0f, -0.0f)));
+ else return VSWAPPAIRS(VXOR(a, VLIT4(-0.0f, 0.0f, -0.0f, 0.0f)));
+}
+
+__INLINE V IMUL(V d, V re, V im) {
+ re = VMUL(re, d);
+ im = VMUL(im, VSWAPPAIRS(d));
+ return VSUB(re, im);
+}
+
+__INLINE V IMULJ(V d, V re, V im) {
+ re = VMUL(re, d);
+ im = VMUL(im, VSWAPPAIRS(d));
+ return VADD(re, im);
+}
+
+#endif