summary | refs | log | tree | commit | diff | stats
path: root/src/ffts_real_nd.c
diff options
context:
space:
mode:
author: Anthony Blake <anthonix@me.com> 2012-11-15 16:04:56 +1300
committer: Anthony Blake <anthonix@me.com> 2012-11-15 16:04:56 +1300
commit: 02e3467b75a6e205c6bd2e82d3a1c8daa3c81f6f (patch)
tree: 49a096a4592b4ce05ad071e8641373899c217300 /src/ffts_real_nd.c
parent: 763210e26805abdc61cc7c6efcacf70468dce3e6 (diff)
download: ffts-02e3467b75a6e205c6bd2e82d3a1c8daa3c81f6f.zip
download: ffts-02e3467b75a6e205c6bd2e82d3a1c8daa3c81f6f.tar.gz
2D real-valued transforms work
Diffstat (limited to 'src/ffts_real_nd.c')
-rw-r--r-- src/ffts_real_nd.c 130
1 files changed, 16 insertions, 114 deletions
diff --git a/src/ffts_real_nd.c b/src/ffts_real_nd.c
index 3e87af6..ff97562 100644
--- a/src/ffts_real_nd.c
+++ b/src/ffts_real_nd.c
@@ -36,119 +36,21 @@
#ifdef __ARM_NEON__
#include "neon.h"
#endif
-/*
-void ffts_free_nd_real(ffts_plan_t *p) {
- free(p->Ns);
- free(p->Ms);
-
- int i;
- for(i=0;i<p->rank;i++) {
-
- ffts_plan_t *x = p->plans[i];
- int k;
- for(k=0;k<i;k++) {
- if(x == p->plans[k]) x = NULL;
- }
-
- ffts_free(x);
- }
-
- free(p->plans);
- free(p->buf);
- free(p->transpose_buf);
- free(p);
-}
-void ffts_transpose(uint64_t *in, uint64_t *out, int w, int h, uint64_t *buf) {
+void ffts_scalar_transpose(uint64_t *in, uint64_t *out, int w, int h, uint64_t *buf) {
-#ifdef __ARM_NEON__
- size_t i,j,k;
- int linebytes = w*8;
-
- for(j=0;j<h;j+=8) {
- for(i=0;i<w;i+=8) {
- neon_transpose_to_buf(in + j*w + i, buf, w);
-
- uint64_t __attribute__((aligned(32))) *p = out + i*h + j;
- uint64_t __attribute__((aligned(32))) *pbuf = buf;
- uint64_t *ptemp;
-
- __asm__ __volatile__(
- "mov %[ptemp], %[p]\n\t"
- "add %[p], %[p], %[w], lsl #3\n\t"
- "vld1.32 {q8,q9}, [%[pbuf], :128]!\n\t"
- "vld1.32 {q10,q11}, [%[pbuf], :128]!\n\t"
- "vld1.32 {q12,q13}, [%[pbuf], :128]!\n\t"
- "vld1.32 {q14,q15}, [%[pbuf], :128]!\n\t"
- "vst1.32 {q8,q9}, [%[ptemp], :128]!\n\t"
- "vst1.32 {q10,q11}, [%[ptemp], :128]!\n\t"
- "mov %[ptemp], %[p]\n\t"
- "add %[p], %[p], %[w], lsl #3\n\t"
- "vst1.32 {q12,q13}, [%[ptemp], :128]!\n\t"
- "vst1.32 {q14,q15}, [%[ptemp], :128]!\n\t"
- "mov %[ptemp], %[p]\n\t"
- "add %[p], %[p], %[w], lsl #3\n\t"
- "vld1.32 {q8,q9}, [%[pbuf], :128]!\n\t"
- "vld1.32 {q10,q11}, [%[pbuf], :128]!\n\t"
- "vld1.32 {q12,q13}, [%[pbuf], :128]!\n\t"
- "vld1.32 {q14,q15}, [%[pbuf], :128]!\n\t"
- "vst1.32 {q8,q9}, [%[ptemp], :128]!\n\t"
- "vst1.32 {q10,q11}, [%[ptemp], :128]!\n\t"
- "mov %[ptemp], %[p]\n\t"
- "add %[p], %[p], %[w], lsl #3\n\t"
- "vst1.32 {q12,q13}, [%[ptemp], :128]!\n\t"
- "vst1.32 {q14,q15}, [%[ptemp], :128]!\n\t"
- "mov %[ptemp], %[p]\n\t"
- "add %[p], %[p], %[w], lsl #3\n\t"
- "vld1.32 {q8,q9}, [%[pbuf], :128]!\n\t"
- "vld1.32 {q10,q11}, [%[pbuf], :128]!\n\t"
- "vld1.32 {q12,q13}, [%[pbuf], :128]!\n\t"
- "vld1.32 {q14,q15}, [%[pbuf], :128]!\n\t"
- "vst1.32 {q8,q9}, [%[ptemp], :128]!\n\t"
- "vst1.32 {q10,q11}, [%[ptemp], :128]!\n\t"
- "mov %[ptemp], %[p]\n\t"
- "add %[p], %[p], %[w], lsl #3\n\t"
- "vst1.32 {q12,q13}, [%[ptemp], :128]!\n\t"
- "vst1.32 {q14,q15}, [%[ptemp], :128]!\n\t"
- "mov %[ptemp], %[p]\n\t"
- "add %[p], %[p], %[w], lsl #3\n\t"
- "vld1.32 {q8,q9}, [%[pbuf], :128]!\n\t"
- "vld1.32 {q10,q11}, [%[pbuf], :128]!\n\t"
- "vld1.32 {q12,q13}, [%[pbuf], :128]!\n\t"
- "vld1.32 {q14,q15}, [%[pbuf], :128]!\n\t"
- "vst1.32 {q8,q9}, [%[ptemp], :128]!\n\t"
- "vst1.32 {q10,q11}, [%[ptemp], :128]!\n\t"
- "mov %[ptemp], %[p]\n\t"
- "vst1.32 {q12,q13}, [%[ptemp], :128]!\n\t"
- "vst1.32 {q14,q15}, [%[ptemp], :128]!\n\t"
-
- : [p] "+r" (p), [pbuf] "+r" (pbuf), [ptemp] "+r" (ptemp)
- : [w] "r" (w)
- : "memory", "q8", "q9", "q10", "q11"
- );
-// out[i*h + j] = in[j*w + i];
- }
- }
-#else
size_t i,j;
- for(i=0;i<w;i+=2) {
- for(j=0;j<h;j+=2) {
-// out[i*h + j] = in[j*w + i];
- __m128d q0 = _mm_load_pd((double *)(in + j*w + i));
- __m128d q1 = _mm_load_pd((double *)(in + j*w + i + w));
- __m128d t0 = _mm_shuffle_pd(q0, q1, _MM_SHUFFLE2(0, 0));
- __m128d t1 = _mm_shuffle_pd(q0, q1, _MM_SHUFFLE2(1, 1));
- _mm_store_pd((double *)(out + i*h + j), t0);
- _mm_store_pd((double *)(out + i*h + j + h), t1);
+ for(i=0;i<w;i+=1) {
+ for(j=0;j<h;j+=1) {
+ out[i*h + j] = in[j*w + i];
}
}
-#endif
}
-void ffts_execute_nd_real(ffts_plan_t *p, const data_t * in, data_t * out) {
+void ffts_execute_nd_real(ffts_plan_t *p, const void * in, void * out) {
- uint64_t *din = (uint64_t *)in;
+ uint32_t *din = (uint32_t *)in;
uint64_t *buf = p->buf;
uint64_t *dout = (uint64_t *)out;
@@ -156,22 +58,22 @@ void ffts_execute_nd_real(ffts_plan_t *p, const data_t * in, data_t * out) {
for(i=0;i<p->Ns[0];i++) {
p->plans[0]->transform(p->plans[0], din + (i * p->Ms[0]), buf + (i * p->Ms[0]));
}
- ffts_transpose(buf, dout, p->Ms[0], p->Ns[0], p->transpose_buf);
+ ffts_scalar_transpose(buf, dout, p->Ms[0], p->Ns[0], p->transpose_buf);
for(i=1;i<p->rank;i++) {
for(j=0;j<p->Ns[i];j++) {
p->plans[i]->transform(p->plans[i], dout + (j * p->Ms[i]), buf + (j * p->Ms[i]));
}
- ffts_transpose(buf, dout, p->Ms[i], p->Ns[i], p->transpose_buf);
+ ffts_scalar_transpose(buf, dout, p->Ms[i], p->Ns[i], p->transpose_buf);
}
}
-*/
+
ffts_plan_t *ffts_init_nd_real(int rank, size_t *Ns, int sign) {
size_t vol = 1;
ffts_plan_t *p = malloc(sizeof(ffts_plan_t));
- p->transform = &ffts_execute_nd;
+ p->transform = &ffts_execute_nd_real;
p->destroy = &ffts_free_nd;
p->rank = rank;
@@ -190,13 +92,13 @@ ffts_plan_t *ffts_init_nd_real(int rank, size_t *Ns, int sign) {
p->plans[i] = NULL;
int k;
- for(k=1;k<i;k++) {
- if(p->Ms[k] == p->Ms[i])
- p->plans[i] = p->plans[k];
- }
+ for(k=1;k<i;k++) {
+ if(p->Ms[k] == p->Ms[i])
+ p->plans[i] = p->plans[k];
+ }
- if(!i) p->plans[i] = ffts_init_1d_real(p->Ms[i], sign);
- if(!p->plans[i]) p->plans[i] = ffts_init_1d(p->Ms[i], sign);
+ if(!i) p->plans[i] = ffts_init_1d_real(p->Ms[i], sign);
+ else if(!p->plans[i]) p->plans[i] = ffts_init_1d(p->Ms[i], sign);
}
OpenPOWER on IntegriCloud