summary | refs | log | tree | commit | diff | stats
path: root/src/ffts_nd.c
diff options
context:
space:
mode:
author Jukka Ojanen <jukka.ojanen@linkotec.net> 2016-03-28 13:57:47 +0300
committer Jukka Ojanen <jukka.ojanen@linkotec.net> 2016-03-28 13:57:47 +0300
commit 2cf68165a461a9faf7069e094436d16c22990aff (patch)
tree f91cf852ae9ac06d12ae84f64c66df6d5cb22385 /src/ffts_nd.c
parent a62bc084cad3ea82128c8060b17a488f2fce2587 (diff)
download ffts-2cf68165a461a9faf7069e094436d16c22990aff.zip
ffts-2cf68165a461a9faf7069e094436d16c22990aff.tar.gz
Improve performance of small complex 2D Neon transform by 15%
Diffstat (limited to 'src/ffts_nd.c')
-rw-r--r-- src/ffts_nd.c 69
1 files changed, 6 insertions, 63 deletions
diff --git a/src/ffts_nd.c b/src/ffts_nd.c
index 72e21e7..2bde9c4 100644
--- a/src/ffts_nd.c
+++ b/src/ffts_nd.c
@@ -92,74 +92,17 @@ static void ffts_free_nd(ffts_plan_t *p)
static void ffts_transpose(uint64_t *in, uint64_t *out, int w, int h, uint64_t *buf)
{
#ifdef HAVE_NEON
- size_t i, j, k;
- int linebytes = 8 * w;
+#if 0
+ neon_transpose(in, out, w, h);
+#else
+ size_t i, j;
for (j = 0; j < h; j += 8) {
for (i = 0; i < w; i += 8) {
- neon_transpose_to_buf(in + j*w + i, buf, w);
-
- uint64_t *p = out + i*h + j;
- uint64_t *pbuf = buf;
- uint64_t *ptemp;
-
- __asm__ __volatile__(
- "mov %[ptemp], %[p]\n\t"
- "add %[p], %[p], %[w], lsl #3\n\t"
- "vld1.32 {q8,q9}, [%[pbuf], :128]!\n\t"
- "vld1.32 {q10,q11}, [%[pbuf], :128]!\n\t"
- "vld1.32 {q12,q13}, [%[pbuf], :128]!\n\t"
- "vld1.32 {q14,q15}, [%[pbuf], :128]!\n\t"
- "vst1.32 {q8,q9}, [%[ptemp], :128]!\n\t"
- "vst1.32 {q10,q11}, [%[ptemp], :128]!\n\t"
- "mov %[ptemp], %[p]\n\t"
- "add %[p], %[p], %[w], lsl #3\n\t"
- "vst1.32 {q12,q13}, [%[ptemp], :128]!\n\t"
- "vst1.32 {q14,q15}, [%[ptemp], :128]!\n\t"
- "mov %[ptemp], %[p]\n\t"
- "add %[p], %[p], %[w], lsl #3\n\t"
- "vld1.32 {q8,q9}, [%[pbuf], :128]!\n\t"
- "vld1.32 {q10,q11}, [%[pbuf], :128]!\n\t"
- "vld1.32 {q12,q13}, [%[pbuf], :128]!\n\t"
- "vld1.32 {q14,q15}, [%[pbuf], :128]!\n\t"
- "vst1.32 {q8,q9}, [%[ptemp], :128]!\n\t"
- "vst1.32 {q10,q11}, [%[ptemp], :128]!\n\t"
- "mov %[ptemp], %[p]\n\t"
- "add %[p], %[p], %[w], lsl #3\n\t"
- "vst1.32 {q12,q13}, [%[ptemp], :128]!\n\t"
- "vst1.32 {q14,q15}, [%[ptemp], :128]!\n\t"
- "mov %[ptemp], %[p]\n\t"
- "add %[p], %[p], %[w], lsl #3\n\t"
- "vld1.32 {q8,q9}, [%[pbuf], :128]!\n\t"
- "vld1.32 {q10,q11}, [%[pbuf], :128]!\n\t"
- "vld1.32 {q12,q13}, [%[pbuf], :128]!\n\t"
- "vld1.32 {q14,q15}, [%[pbuf], :128]!\n\t"
- "vst1.32 {q8,q9}, [%[ptemp], :128]!\n\t"
- "vst1.32 {q10,q11}, [%[ptemp], :128]!\n\t"
- "mov %[ptemp], %[p]\n\t"
- "add %[p], %[p], %[w], lsl #3\n\t"
- "vst1.32 {q12,q13}, [%[ptemp], :128]!\n\t"
- "vst1.32 {q14,q15}, [%[ptemp], :128]!\n\t"
- "mov %[ptemp], %[p]\n\t"
- "add %[p], %[p], %[w], lsl #3\n\t"
- "vld1.32 {q8,q9}, [%[pbuf], :128]!\n\t"
- "vld1.32 {q10,q11}, [%[pbuf], :128]!\n\t"
- "vld1.32 {q12,q13}, [%[pbuf], :128]!\n\t"
- "vld1.32 {q14,q15}, [%[pbuf], :128]!\n\t"
- "vst1.32 {q8,q9}, [%[ptemp], :128]!\n\t"
- "vst1.32 {q10,q11}, [%[ptemp], :128]!\n\t"
- "mov %[ptemp], %[p]\n\t"
- "vst1.32 {q12,q13}, [%[ptemp], :128]!\n\t"
- "vst1.32 {q14,q15}, [%[ptemp], :128]!\n\t"
-
- : [p] "+r" (p), [pbuf] "+r" (pbuf), [ptemp] "+r" (ptemp)
- : [w] "r" (w)
- : "memory", "q8", "q9", "q10", "q11"
- );
-
- /* out[i*h + j] = in[j*w + i]; */
+ neon_transpose_to_buf(in + j*w + i, out + i*h + j, w);
}
}
+#endif
#else
#ifdef HAVE_SSE
uint64_t FFTS_ALIGN(64) tmp[TSIZE*TSIZE];
OpenPOWER on IntegriCloud