summary | refs | log | tree | commit | diff | stats
path: root/src/ffts_real.c
diff options
context:
space:
mode:
author: Anthony Blake <anthonix@me.com> 2012-11-09 12:28:22 +1300
committer: Anthony Blake <anthonix@me.com> 2012-11-09 12:28:22 +1300
commit: 028def4e6ab0459cdb971fdb594b38ca013f1d68 (patch)
tree: 0b472150220164b69a1b6c2101bee4c42a5caa09 /src/ffts_real.c
parent: 121b25b7a3becf5ed1cfeae8948a818d00b2dd76 (diff)
download: ffts-028def4e6ab0459cdb971fdb594b38ca013f1d68.zip
ffts-028def4e6ab0459cdb971fdb594b38ca013f1d68.tar.gz
NEON optimization for inverse real-valued transforms
Diffstat (limited to 'src/ffts_real.c')
-rw-r--r-- src/ffts_real.c 51
1 file changed, 50 insertions, 1 deletion
diff --git a/src/ffts_real.c b/src/ffts_real.c
index fbe6514..8d29757 100644
--- a/src/ffts_real.c
+++ b/src/ffts_real.c
@@ -120,11 +120,60 @@ void ffts_execute_1d_real_inv(ffts_plan_t *p, const void *vin, void *vout) {
float *B = p->B;
size_t N = p->N;
+ float *p_buf0 = in;
+ float *p_buf1 = in + N - 2;
+ float *p_out = buf;
+
size_t i;
+#ifdef __ARM_NEON__
+ for(i=0;i<N/2;i+=2) {
+ __asm__ __volatile__ ("vld1.32 {q8}, [%[pa], :128]!\n\t"
+ "vld1.32 {q9}, [%[pb], :128]!\n\t"
+ "vld1.32 {q10}, [%[buf0], :128]!\n\t"
+ "vld1.32 {q11}, [%[buf1], :64]\n\t"
+ "sub %[buf1], %[buf1], #16\n\t"
+
+ "vdup.32 d26, d16[1]\n\t"
+ "vdup.32 d27, d17[1]\n\t"
+ "vdup.32 d24, d16[0]\n\t"
+ "vdup.32 d25, d17[0]\n\t"
+
+ "vdup.32 d30, d23[1]\n\t"
+ "vdup.32 d31, d22[1]\n\t"
+ "vdup.32 d28, d23[0]\n\t"
+ "vdup.32 d29, d22[0]\n\t"
+
+ "vmul.f32 q13, q13, q10\n\t"
+ "vmul.f32 q15, q15, q9\n\t"
+ "vmul.f32 q12, q12, q10\n\t"
+ "vmul.f32 q14, q14, q9\n\t"
+ "vrev64.f32 q13, q13\n\t"
+ "vrev64.f32 q15, q15\n\t"
+
+ "vtrn.32 d26, d27\n\t"
+ "vtrn.32 d28, d29\n\t"
+ "vneg.f32 d27, d27\n\t"
+ "vneg.f32 d29, d29\n\t"
+ "vtrn.32 d26, d27\n\t"
+ "vtrn.32 d28, d29\n\t"
+
+ "vadd.f32 q12, q12, q14\n\t"
+ "vsub.f32 q13, q13, q15\n\t"
+ "vadd.f32 q12, q12, q13\n\t"
+ "vst1.32 {q12}, [%[pout], :128]!\n\t"
+ : [pa] "+r" (A), [pb] "+r" (B), [buf0] "+r" (p_buf0), [buf1] "+r" (p_buf1),
+ [pout] "+r" (p_out)
+ :
+ : "memory", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+
+
+#else
for(i=0;i<N/2;i++) {
buf[2*i] = in[2*i]*A[2*i] + in[2*i+1]*A[2*i+1] + in[N-2*i]*B[2*i] - in[N-2*i+1]*B[2*i+1];
buf[2*i+1] = in[2*i+1]*A[2*i] - in[2*i]*A[2*i+1] - in[N-2*i]*B[2*i+1] - in[N-2*i+1]*B[2*i];
- }
+#endif
+}
p->plans[0]->transform(p->plans[0], buf, out);
OpenPOWER on IntegriCloud