diff options
author | Anthony Blake <anthonix@me.com> | 2012-11-15 16:04:56 +1300 |
---|---|---|
committer | Anthony Blake <anthonix@me.com> | 2012-11-15 16:04:56 +1300 |
commit | 02e3467b75a6e205c6bd2e82d3a1c8daa3c81f6f (patch) | |
tree | 49a096a4592b4ce05ad071e8641373899c217300 /src/ffts_real_nd.c | |
parent | 763210e26805abdc61cc7c6efcacf70468dce3e6 (diff) | |
download | ffts-02e3467b75a6e205c6bd2e82d3a1c8daa3c81f6f.zip ffts-02e3467b75a6e205c6bd2e82d3a1c8daa3c81f6f.tar.gz |
2D real-valued transforms work
Diffstat (limited to 'src/ffts_real_nd.c')
-rw-r--r-- | src/ffts_real_nd.c | 130 |
1 files changed, 16 insertions, 114 deletions
diff --git a/src/ffts_real_nd.c b/src/ffts_real_nd.c index 3e87af6..ff97562 100644 --- a/src/ffts_real_nd.c +++ b/src/ffts_real_nd.c @@ -36,119 +36,21 @@ #ifdef __ARM_NEON__ #include "neon.h" #endif -/* -void ffts_free_nd_real(ffts_plan_t *p) { - free(p->Ns); - free(p->Ms); - - int i; - for(i=0;i<p->rank;i++) { - - ffts_plan_t *x = p->plans[i]; - int k; - for(k=0;k<i;k++) { - if(x == p->plans[k]) x = NULL; - } - - ffts_free(x); - } - - free(p->plans); - free(p->buf); - free(p->transpose_buf); - free(p); -} -void ffts_transpose(uint64_t *in, uint64_t *out, int w, int h, uint64_t *buf) { +void ffts_scalar_transpose(uint64_t *in, uint64_t *out, int w, int h, uint64_t *buf) { -#ifdef __ARM_NEON__ - size_t i,j,k; - int linebytes = w*8; - - for(j=0;j<h;j+=8) { - for(i=0;i<w;i+=8) { - neon_transpose_to_buf(in + j*w + i, buf, w); - - uint64_t __attribute__((aligned(32))) *p = out + i*h + j; - uint64_t __attribute__((aligned(32))) *pbuf = buf; - uint64_t *ptemp; - - __asm__ __volatile__( - "mov %[ptemp], %[p]\n\t" - "add %[p], %[p], %[w], lsl #3\n\t" - "vld1.32 {q8,q9}, [%[pbuf], :128]!\n\t" - "vld1.32 {q10,q11}, [%[pbuf], :128]!\n\t" - "vld1.32 {q12,q13}, [%[pbuf], :128]!\n\t" - "vld1.32 {q14,q15}, [%[pbuf], :128]!\n\t" - "vst1.32 {q8,q9}, [%[ptemp], :128]!\n\t" - "vst1.32 {q10,q11}, [%[ptemp], :128]!\n\t" - "mov %[ptemp], %[p]\n\t" - "add %[p], %[p], %[w], lsl #3\n\t" - "vst1.32 {q12,q13}, [%[ptemp], :128]!\n\t" - "vst1.32 {q14,q15}, [%[ptemp], :128]!\n\t" - "mov %[ptemp], %[p]\n\t" - "add %[p], %[p], %[w], lsl #3\n\t" - "vld1.32 {q8,q9}, [%[pbuf], :128]!\n\t" - "vld1.32 {q10,q11}, [%[pbuf], :128]!\n\t" - "vld1.32 {q12,q13}, [%[pbuf], :128]!\n\t" - "vld1.32 {q14,q15}, [%[pbuf], :128]!\n\t" - "vst1.32 {q8,q9}, [%[ptemp], :128]!\n\t" - "vst1.32 {q10,q11}, [%[ptemp], :128]!\n\t" - "mov %[ptemp], %[p]\n\t" - "add %[p], %[p], %[w], lsl #3\n\t" - "vst1.32 {q12,q13}, [%[ptemp], :128]!\n\t" - "vst1.32 {q14,q15}, [%[ptemp], :128]!\n\t" - "mov %[ptemp], %[p]\n\t" - "add %[p], %[p], %[w], lsl #3\n\t" - "vld1.32 {q8,q9}, [%[pbuf], :128]!\n\t" - "vld1.32 {q10,q11}, [%[pbuf], :128]!\n\t" - "vld1.32 {q12,q13}, [%[pbuf], :128]!\n\t" - "vld1.32 {q14,q15}, [%[pbuf], :128]!\n\t" - "vst1.32 {q8,q9}, [%[ptemp], :128]!\n\t" - "vst1.32 {q10,q11}, [%[ptemp], :128]!\n\t" - "mov %[ptemp], %[p]\n\t" - "add %[p], %[p], %[w], lsl #3\n\t" - "vst1.32 {q12,q13}, [%[ptemp], :128]!\n\t" - "vst1.32 {q14,q15}, [%[ptemp], :128]!\n\t" - "mov %[ptemp], %[p]\n\t" - "add %[p], %[p], %[w], lsl #3\n\t" - "vld1.32 {q8,q9}, [%[pbuf], :128]!\n\t" - "vld1.32 {q10,q11}, [%[pbuf], :128]!\n\t" - "vld1.32 {q12,q13}, [%[pbuf], :128]!\n\t" - "vld1.32 {q14,q15}, [%[pbuf], :128]!\n\t" - "vst1.32 {q8,q9}, [%[ptemp], :128]!\n\t" - "vst1.32 {q10,q11}, [%[ptemp], :128]!\n\t" - "mov %[ptemp], %[p]\n\t" - "vst1.32 {q12,q13}, [%[ptemp], :128]!\n\t" - "vst1.32 {q14,q15}, [%[ptemp], :128]!\n\t" - - : [p] "+r" (p), [pbuf] "+r" (pbuf), [ptemp] "+r" (ptemp) - : [w] "r" (w) - : "memory", "q8", "q9", "q10", "q11" - ); -// out[i*h + j] = in[j*w + i]; - } - } -#else size_t i,j; - for(i=0;i<w;i+=2) { - for(j=0;j<h;j+=2) { -// out[i*h + j] = in[j*w + i]; - __m128d q0 = _mm_load_pd((double *)(in + j*w + i)); - __m128d q1 = _mm_load_pd((double *)(in + j*w + i + w)); - __m128d t0 = _mm_shuffle_pd(q0, q1, _MM_SHUFFLE2(0, 0)); - __m128d t1 = _mm_shuffle_pd(q0, q1, _MM_SHUFFLE2(1, 1)); - _mm_store_pd((double *)(out + i*h + j), t0); - _mm_store_pd((double *)(out + i*h + j + h), t1); + for(i=0;i<w;i+=1) { + for(j=0;j<h;j+=1) { + out[i*h + j] = in[j*w + i]; } } -#endif } -void ffts_execute_nd_real(ffts_plan_t *p, const data_t * in, data_t * out) { +void ffts_execute_nd_real(ffts_plan_t *p, const void * in, void * out) { - uint64_t *din = (uint64_t *)in; + uint32_t *din = (uint32_t *)in; uint64_t *buf = p->buf; uint64_t *dout = (uint64_t *)out; @@ -156,22 +58,22 @@ void ffts_execute_nd_real(ffts_plan_t *p, const data_t * in, data_t * out) { for(i=0;i<p->Ns[0];i++) { p->plans[0]->transform(p->plans[0], din + (i * p->Ms[0]), buf + (i * p->Ms[0])); } - ffts_transpose(buf, dout, p->Ms[0], p->Ns[0], p->transpose_buf); + ffts_scalar_transpose(buf, dout, p->Ms[0], p->Ns[0], p->transpose_buf); for(i=1;i<p->rank;i++) { for(j=0;j<p->Ns[i];j++) { p->plans[i]->transform(p->plans[i], dout + (j * p->Ms[i]), buf + (j * p->Ms[i])); } - ffts_transpose(buf, dout, p->Ms[i], p->Ns[i], p->transpose_buf); + ffts_scalar_transpose(buf, dout, p->Ms[i], p->Ns[i], p->transpose_buf); } } -*/ + ffts_plan_t *ffts_init_nd_real(int rank, size_t *Ns, int sign) { size_t vol = 1; ffts_plan_t *p = malloc(sizeof(ffts_plan_t)); - p->transform = &ffts_execute_nd; + p->transform = &ffts_execute_nd_real; p->destroy = &ffts_free_nd; p->rank = rank; @@ -190,13 +92,13 @@ ffts_plan_t *ffts_init_nd_real(int rank, size_t *Ns, int sign) { p->plans[i] = NULL; int k; - for(k=1;k<i;k++) { - if(p->Ms[k] == p->Ms[i]) - p->plans[i] = p->plans[k]; - } + for(k=1;k<i;k++) { + if(p->Ms[k] == p->Ms[i]) + p->plans[i] = p->plans[k]; + } - if(!i) p->plans[i] = ffts_init_1d_real(p->Ms[i], sign); - if(!p->plans[i]) p->plans[i] = ffts_init_1d(p->Ms[i], sign); + if(!i) p->plans[i] = ffts_init_1d_real(p->Ms[i], sign); + else if(!p->plans[i]) p->plans[i] = ffts_init_1d(p->Ms[i], sign); } |