summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJukka Ojanen <jukka.ojanen@linkotec.net>2016-04-05 17:52:13 +0300
committerJukka Ojanen <jukka.ojanen@linkotec.net>2016-04-05 17:52:13 +0300
commit8b2d55e5d6bd43eb45ca7da1595fccc401a22158 (patch)
tree11a9eacf66b67b1b9529e4662243336ffb2c1695
parentc71724a0dc7536ef160732b5ed6ec1f4ef2c0ff9 (diff)
downloadffts-8b2d55e5d6bd43eb45ca7da1595fccc401a22158.zip
ffts-8b2d55e5d6bd43eb45ca7da1595fccc401a22158.tar.gz
Combine ffts_tranpose_scalar and ffts_transpose, and use ffts_transpose_scalar as native C fallback
-rw-r--r--CMakeLists.txt2
-rw-r--r--src/ffts_nd.c112
-rw-r--r--src/ffts_nd.h67
-rw-r--r--src/ffts_real.c4
-rw-r--r--src/ffts_real.h59
-rw-r--r--src/ffts_real_nd.c174
-rw-r--r--src/ffts_real_nd.h67
-rw-r--r--src/ffts_transpose.c194
-rw-r--r--src/ffts_transpose.h46
9 files changed, 399 insertions, 326 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 58f402b..8c21185 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -351,6 +351,8 @@ set(FFTS_SOURCES
src/ffts_real.c
src/ffts_real_nd.c
src/ffts_real_nd.h
+ src/ffts_transpose.c
+ src/ffts_transpose.h
src/ffts_trig.c
src/ffts_trig.h
src/ffts_static.c
diff --git a/src/ffts_nd.c b/src/ffts_nd.c
index 49c6229..64220f1 100644
--- a/src/ffts_nd.c
+++ b/src/ffts_nd.c
@@ -34,15 +34,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "ffts_nd.h"
#include "ffts_internal.h"
-
-#ifdef HAVE_NEON
-#include "neon.h"
-#include <arm_neon.h>
-#elif HAVE_SSE2
-#include <emmintrin.h>
-#endif
-
-#define TSIZE 8
+#include "ffts_transpose.h"
static void
ffts_free_nd(ffts_plan_t *p)
@@ -86,108 +78,6 @@ ffts_free_nd(ffts_plan_t *p)
}
static void
-ffts_transpose(uint64_t *in, uint64_t *out, int w, int h)
-{
-#ifdef HAVE_NEON
-#if 0
- neon_transpose4(in, out, w, h);
-#else
- neon_transpose8(in, out, w, h);
-#endif
-#else
-#ifdef HAVE_SSE
- uint64_t FFTS_ALIGN(64) tmp[TSIZE*TSIZE];
- int tx, ty;
- /* int x; */
- int y;
- int tw = w / TSIZE;
- int th = h / TSIZE;
-
- for (ty = 0; ty < th; ty++) {
- for (tx = 0; tx < tw; tx++) {
- uint64_t *ip0 = in + w*TSIZE*ty + tx * TSIZE;
- uint64_t *op0 = tmp; /* out + h*TSIZE*tx + ty*TSIZE; */
-
- /* copy/transpose to tmp */
- for (y = 0; y < TSIZE; y += 2) {
- /* for (x=0;x<TSIZE;x+=2) {
- op[x*TSIZE] = ip[x];
- */
- __m128d q0 = _mm_load_pd((double*)(ip0 + 0*w));
- __m128d q1 = _mm_load_pd((double*)(ip0 + 1*w));
- __m128d q2 = _mm_load_pd((double*)(ip0 + 2*w));
- __m128d q3 = _mm_load_pd((double*)(ip0 + 3*w));
- __m128d q4 = _mm_load_pd((double*)(ip0 + 4*w));
- __m128d q5 = _mm_load_pd((double*)(ip0 + 5*w));
- __m128d q6 = _mm_load_pd((double*)(ip0 + 6*w));
- __m128d q7 = _mm_load_pd((double*)(ip0 + 7*w));
-
- __m128d t0 = _mm_shuffle_pd(q0, q1, _MM_SHUFFLE2(0, 0));
- __m128d t1 = _mm_shuffle_pd(q0, q1, _MM_SHUFFLE2(1, 1));
- __m128d t2 = _mm_shuffle_pd(q2, q3, _MM_SHUFFLE2(0, 0));
- __m128d t3 = _mm_shuffle_pd(q2, q3, _MM_SHUFFLE2(1, 1));
- __m128d t4 = _mm_shuffle_pd(q4, q5, _MM_SHUFFLE2(0, 0));
- __m128d t5 = _mm_shuffle_pd(q4, q5, _MM_SHUFFLE2(1, 1));
- __m128d t6 = _mm_shuffle_pd(q6, q7, _MM_SHUFFLE2(0, 0));
- __m128d t7 = _mm_shuffle_pd(q6, q7, _MM_SHUFFLE2(1, 1));
-
- ip0 += 2;
- /* _mm_store_pd((double *)(op0 + y*h + x), t0);
- _mm_store_pd((double *)(op0 + y*h + x + h), t1);
- */
-
- _mm_store_pd((double*)(op0 + 0 ), t0);
- _mm_store_pd((double*)(op0 + 0 + TSIZE), t1);
- _mm_store_pd((double*)(op0 + 2 ), t2);
- _mm_store_pd((double*)(op0 + 2 + TSIZE), t3);
- _mm_store_pd((double*)(op0 + 4 ), t4);
- _mm_store_pd((double*)(op0 + 4 + TSIZE), t5);
- _mm_store_pd((double*)(op0 + 6 ), t6);
- _mm_store_pd((double*)(op0 + 6 + TSIZE), t7);
- /* } */
-
- op0 += 2*TSIZE;
- }
-
- op0 = out + h*tx*TSIZE + ty*TSIZE;
- ip0 = tmp;
- for (y = 0; y < TSIZE; y += 1) {
- /* memcpy(op0, ip0, TSIZE * sizeof(*ip0)); */
-
- __m128d q0 = _mm_load_pd((double*)(ip0 + 0));
- __m128d q1 = _mm_load_pd((double*)(ip0 + 2));
- __m128d q2 = _mm_load_pd((double*)(ip0 + 4));
- __m128d q3 = _mm_load_pd((double*)(ip0 + 6));
-
- _mm_store_pd((double*)(op0 + 0), q0);
- _mm_store_pd((double*)(op0 + 2), q1);
- _mm_store_pd((double*)(op0 + 4), q2);
- _mm_store_pd((double*)(op0 + 6), q3);
-
- op0 += h;
- ip0 += TSIZE;
- }
- }
- }
- /*
- size_t i,j;
- for(i=0;i<w;i+=2) {
- for(j=0;j<h;j+=2) {
- // out[i*h + j] = in[j*w + i];
- __m128d q0 = _mm_load_pd((double *)(in + j*w + i));
- __m128d q1 = _mm_load_pd((double *)(in + j*w + i + w));
- __m128d t0 = _mm_shuffle_pd(q0, q1, _MM_SHUFFLE2(0, 0));
- __m128d t1 = _mm_shuffle_pd(q0, q1, _MM_SHUFFLE2(1, 1));
- _mm_store_pd((double *)(out + i*h + j), t0);
- _mm_store_pd((double *)(out + i*h + j + h), t1);
- }
- }
- */
-#endif
-#endif
-}
-
-static void
ffts_execute_nd(ffts_plan_t *p, const void *in, void *out)
{
uint64_t *din = (uint64_t*) in;
diff --git a/src/ffts_nd.h b/src/ffts_nd.h
index a960cad..1998cc6 100644
--- a/src/ffts_nd.h
+++ b/src/ffts_nd.h
@@ -1,43 +1,50 @@
/*
- This file is part of FFTS -- The Fastest Fourier Transform in the South
-
- Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
- Copyright (c) 2012, The University of Waikato
-
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
- * Neither the name of the organization nor the
- names of its contributors may be used to endorse or promote products
- derived from this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
- DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+This file is part of FFTS -- The Fastest Fourier Transform in the South
+
+Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
+Copyright (c) 2012, The University of Waikato
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+* Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+* Neither the name of the organization nor the
+names of its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef FFTS_ND_H
#define FFTS_ND_H
+#if defined (_MSC_VER) && (_MSC_VER >= 1020)
+#pragma once
+#endif
+
#include "ffts.h"
#include <stddef.h>
-ffts_plan_t *ffts_init_nd(int rank, size_t *Ns, int sign);
-ffts_plan_t *ffts_init_2d(size_t N1, size_t N2, int sign);
+ffts_plan_t*
+ffts_init_nd(int rank, size_t *Ns, int sign);
+
+ffts_plan_t*
+ffts_init_2d(size_t N1, size_t N2, int sign);
#endif /* FFTS_ND_H */
diff --git a/src/ffts_real.c b/src/ffts_real.c
index 7f41069..0f87a12 100644
--- a/src/ffts_real.c
+++ b/src/ffts_real.c
@@ -641,9 +641,9 @@ ffts_init_1d_real(size_t N, int sign)
}
#ifdef HAVE_SSE3
- ffts_generate_table_1d_real_32f(p, sign, 1);
+ ffts_generate_table_1d_real_32f(p, sign, 1);
#else
- ffts_generate_table_1d_real_32f(p, sign, 0);
+ ffts_generate_table_1d_real_32f(p, sign, 0);
#endif
return p;
diff --git a/src/ffts_real.h b/src/ffts_real.h
index 81ca80f..61d03d4 100644
--- a/src/ffts_real.h
+++ b/src/ffts_real.h
@@ -1,33 +1,33 @@
/*
- This file is part of FFTS -- The Fastest Fourier Transform in the South
-
- Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
- Copyright (c) 2012, The University of Waikato
-
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
- * Neither the name of the organization nor the
- names of its contributors may be used to endorse or promote products
- derived from this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
- DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+This file is part of FFTS -- The Fastest Fourier Transform in the South
+
+Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
+Copyright (c) 2012, The University of Waikato
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+* Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+* Neither the name of the organization nor the
+names of its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
@@ -41,6 +41,7 @@
#include "ffts.h"
#include <stddef.h>
-ffts_plan_t *ffts_init_1d_real(size_t N, int sign);
+ffts_plan_t*
+ffts_init_1d_real(size_t N, int sign);
#endif /* FFTS_REAL_H */
diff --git a/src/ffts_real_nd.c b/src/ffts_real_nd.c
index 545e8f0..89ef7f7 100644
--- a/src/ffts_real_nd.c
+++ b/src/ffts_real_nd.c
@@ -1,80 +1,67 @@
/*
- This file is part of FFTS -- The Fastest Fourier Transform in the South
-
- Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
- Copyright (c) 2012, The University of Waikato
-
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
- * Neither the name of the organization nor the
- names of its contributors may be used to endorse or promote products
- derived from this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
- DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+This file is part of FFTS -- The Fastest Fourier Transform in the South
+
+Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
+Copyright (c) 2012, The University of Waikato
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+* Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+* Neither the name of the organization nor the
+names of its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "ffts_real_nd.h"
#include "ffts_real.h"
#include "ffts_internal.h"
+#include "ffts_transpose.h"
-#ifdef __ARM_NEON__
-#include "neon.h"
-#endif
-
-#ifdef HAVE_NEON
-#include <arm_neon.h>
-#elif HAVE_SSE
-#include <xmmintrin.h>
-#endif
-
-#include <stdio.h>
-
-static void ffts_free_nd_real(ffts_plan_t *p)
+static void
+ffts_free_nd_real(ffts_plan_t *p)
{
if (p->plans) {
- int i;
+ int i, j;
for (i = 0; i < p->rank; i++) {
ffts_plan_t *plan = p->plans[i];
- if (plan) {
- int j;
-
- for (j = i + 1; j < p->rank; j++) {
- if (plan == p->plans[j]) {
- p->plans[j] = NULL;
- }
- }
-
- ffts_free(plan);
- }
+ if (plan) {
+ for (j = 0; j < i; j++) {
+ if (p->Ns[i] == p->Ns[j]) {
+ plan = NULL;
+ break;
+ }
+ }
+
+ if (plan) {
+ ffts_free(plan);
+ }
+ }
}
free(p->plans);
}
- if (p->transpose_buf) {
- ffts_aligned_free(p->transpose_buf);
- }
-
if (p->buf) {
ffts_aligned_free(p->buf);
}
@@ -90,59 +77,8 @@ static void ffts_free_nd_real(ffts_plan_t *p)
free(p);
}
-static void ffts_scalar_transpose(uint64_t *src, uint64_t *dst, int w, int h, uint64_t *buf)
-{
- const int bw = 1;
- const int bh = 8;
- int i = 0, j = 0;
-
- for (; i <= h - bh; i += bh) {
- for (j = 0; j <= w - bw; j += bw) {
- uint64_t const *ib = &src[w*i + j];
- uint64_t *ob = &dst[h*j + i];
-
- uint64_t s_0_0 = ib[0*w + 0];
- uint64_t s_1_0 = ib[1*w + 0];
- uint64_t s_2_0 = ib[2*w + 0];
- uint64_t s_3_0 = ib[3*w + 0];
- uint64_t s_4_0 = ib[4*w + 0];
- uint64_t s_5_0 = ib[5*w + 0];
- uint64_t s_6_0 = ib[6*w + 0];
- uint64_t s_7_0 = ib[7*w + 0];
-
- ob[0*h + 0] = s_0_0;
- ob[0*h + 1] = s_1_0;
- ob[0*h + 2] = s_2_0;
- ob[0*h + 3] = s_3_0;
- ob[0*h + 4] = s_4_0;
- ob[0*h + 5] = s_5_0;
- ob[0*h + 6] = s_6_0;
- ob[0*h + 7] = s_7_0;
- }
- }
-
- if (i < h) {
- int i1;
-
- for (i1 = 0; i1 < w; i1++) {
- for (j = i; j < h; j++) {
- dst[i1*h + j] = src[j*w + i1];
- }
- }
- }
-
- if (j < w) {
- int j1;
-
- for (i = j; i < w; i++) {
- for (j1 = 0; j1 < h; j1++) {
- dst[i*h + j1] = src[j1*w + i];
- }
- }
- }
-}
-
-static void ffts_execute_nd_real(ffts_plan_t *p, const void *in, void *out)
+static void
+ffts_execute_nd_real(ffts_plan_t *p, const void *in, void *out)
{
const size_t Ms0 = p->Ms[0];
const size_t Ns0 = p->Ns[0];
@@ -150,7 +86,6 @@ static void ffts_execute_nd_real(ffts_plan_t *p, const void *in, void *out)
uint32_t *din = (uint32_t*) in;
uint64_t *buf = p->buf;
uint64_t *dout = (uint64_t*) out;
- uint64_t *transpose_buf = (uint64_t*) p->transpose_buf;
ffts_plan_t *plan;
int i;
@@ -161,7 +96,7 @@ static void ffts_execute_nd_real(ffts_plan_t *p, const void *in, void *out)
plan->transform(plan, din + (j * Ms0), buf + (j * (Ms0 / 2 + 1)));
}
- ffts_scalar_transpose(buf, dout, Ms0 / 2 + 1, Ns0, transpose_buf);
+ ffts_transpose(buf, dout, Ms0 / 2 + 1, Ns0);
for (i = 1; i < p->rank; i++) {
const size_t Ms = p->Ms[i];
@@ -173,11 +108,12 @@ static void ffts_execute_nd_real(ffts_plan_t *p, const void *in, void *out)
plan->transform(plan, dout + (j * Ms), buf + (j * Ms));
}
- ffts_scalar_transpose(buf, dout, Ms, Ns, transpose_buf);
+ ffts_transpose(buf, dout, Ms, Ns);
}
}
-static void ffts_execute_nd_real_inv(ffts_plan_t *p, const void *in, void *out)
+static void
+ffts_execute_nd_real_inv(ffts_plan_t *p, const void *in, void *out)
{
const size_t Ms0 = p->Ms[0];
const size_t Ms1 = p->Ms[1];
@@ -187,7 +123,6 @@ static void ffts_execute_nd_real_inv(ffts_plan_t *p, const void *in, void *out)
uint64_t *din = (uint64_t*) in;
uint64_t *buf = p->buf;
uint64_t *buf2;
- uint64_t *transpose_buf = (uint64_t*) p->transpose_buf;
float *doutr = (float*) out;
ffts_plan_t *plan;
@@ -203,14 +138,14 @@ static void ffts_execute_nd_real_inv(ffts_plan_t *p, const void *in, void *out)
buf2 = buf + vol;
- ffts_scalar_transpose(din, buf, Ms0, Ns0, transpose_buf);
+ ffts_transpose(din, buf, Ms0, Ns0);
plan = p->plans[0];
for (j = 0; j < Ms0; j++) {
plan->transform(plan, buf + (j * Ns0), buf2 + (j * Ns0));
}
- ffts_scalar_transpose(buf2, buf, Ns0, Ms0, transpose_buf);
+ ffts_transpose(buf2, buf, Ns0, Ms0);
plan = p->plans[1];
for (j = 0; j < Ms1; j++) {
@@ -267,11 +202,6 @@ ffts_init_nd_real(int rank, size_t *Ns, int sign)
goto cleanup;
}
- p->transpose_buf = ffts_aligned_malloc(2 * 8 * 8 * sizeof(float));
- if (!p->transpose_buf) {
- goto cleanup;
- }
-
p->plans = (ffts_plan_t**) calloc(rank, sizeof(*p->plans));
if (!p->plans) {
goto cleanup;
diff --git a/src/ffts_real_nd.h b/src/ffts_real_nd.h
index 22a708d..fac607b 100644
--- a/src/ffts_real_nd.h
+++ b/src/ffts_real_nd.h
@@ -1,33 +1,33 @@
- /*
-
- This file is part of FFTS -- The Fastest Fourier Transform in the South
-
- Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
- Copyright (c) 2012, The University of Waikato
-
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
- * Neither the name of the organization nor the
- names of its contributors may be used to endorse or promote products
- derived from this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
- DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+/*
+
+This file is part of FFTS -- The Fastest Fourier Transform in the South
+
+Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
+Copyright (c) 2012, The University of Waikato
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+* Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+* Neither the name of the organization nor the
+names of its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
@@ -41,7 +41,10 @@
#include "ffts.h"
#include <stddef.h>
-ffts_plan_t *ffts_init_nd_real(int rank, size_t *Ns, int sign);
-ffts_plan_t *ffts_init_2d_real(size_t N1, size_t N2, int sign);
+ffts_plan_t*
+ffts_init_nd_real(int rank, size_t *Ns, int sign);
+
+ffts_plan_t*
+ffts_init_2d_real(size_t N1, size_t N2, int sign);
#endif /* FFTS_REAL_ND_H */ \ No newline at end of file
diff --git a/src/ffts_transpose.c b/src/ffts_transpose.c
new file mode 100644
index 0000000..272cb48
--- /dev/null
+++ b/src/ffts_transpose.c
@@ -0,0 +1,194 @@
+/*
+
+This file is part of FFTS -- The Fastest Fourier Transform in the South
+
+Copyright (c) 2016, Jukka Ojanen <jukka.ojanen@kolumbus.fi>
+Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
+Copyright (c) 2012, The University of Waikato
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+* Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+* Neither the name of the organization nor the
+names of its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "ffts_transpose.h"
+#include "ffts_internal.h"
+
+#ifdef HAVE_NEON
+#include "neon.h"
+#include <arm_neon.h>
+#elif HAVE_SSE2
+#include <emmintrin.h>
+#endif
+
+#define TSIZE 8
+
+void
+ffts_transpose(uint64_t *in, uint64_t *out, int w, int h)
+{
+#ifdef HAVE_NEON
+#if 0
+ neon_transpose4(in, out, w, h);
+#else
+ neon_transpose8(in, out, w, h);
+#endif
+#elif HAVE_SSE2
+ uint64_t FFTS_ALIGN(64) tmp[TSIZE*TSIZE];
+ int tx, ty;
+ /* int x; */
+ int y;
+ int tw = w / TSIZE;
+ int th = h / TSIZE;
+
+ for (ty = 0; ty < th; ty++) {
+ for (tx = 0; tx < tw; tx++) {
+ uint64_t *ip0 = in + w*TSIZE*ty + tx * TSIZE;
+ uint64_t *op0 = tmp; /* out + h*TSIZE*tx + ty*TSIZE; */
+
+ /* copy/transpose to tmp */
+ for (y = 0; y < TSIZE; y += 2) {
+ /* for (x=0;x<TSIZE;x+=2) {
+ op[x*TSIZE] = ip[x];
+ */
+ __m128d q0 = _mm_load_pd((double*)(ip0 + 0*w));
+ __m128d q1 = _mm_load_pd((double*)(ip0 + 1*w));
+ __m128d q2 = _mm_load_pd((double*)(ip0 + 2*w));
+ __m128d q3 = _mm_load_pd((double*)(ip0 + 3*w));
+ __m128d q4 = _mm_load_pd((double*)(ip0 + 4*w));
+ __m128d q5 = _mm_load_pd((double*)(ip0 + 5*w));
+ __m128d q6 = _mm_load_pd((double*)(ip0 + 6*w));
+ __m128d q7 = _mm_load_pd((double*)(ip0 + 7*w));
+
+ __m128d t0 = _mm_shuffle_pd(q0, q1, _MM_SHUFFLE2(0, 0));
+ __m128d t1 = _mm_shuffle_pd(q0, q1, _MM_SHUFFLE2(1, 1));
+ __m128d t2 = _mm_shuffle_pd(q2, q3, _MM_SHUFFLE2(0, 0));
+ __m128d t3 = _mm_shuffle_pd(q2, q3, _MM_SHUFFLE2(1, 1));
+ __m128d t4 = _mm_shuffle_pd(q4, q5, _MM_SHUFFLE2(0, 0));
+ __m128d t5 = _mm_shuffle_pd(q4, q5, _MM_SHUFFLE2(1, 1));
+ __m128d t6 = _mm_shuffle_pd(q6, q7, _MM_SHUFFLE2(0, 0));
+ __m128d t7 = _mm_shuffle_pd(q6, q7, _MM_SHUFFLE2(1, 1));
+
+ ip0 += 2;
+ /* _mm_store_pd((double *)(op0 + y*h + x), t0);
+ _mm_store_pd((double *)(op0 + y*h + x + h), t1);
+ */
+
+ _mm_store_pd((double*)(op0 + 0 ), t0);
+ _mm_store_pd((double*)(op0 + 0 + TSIZE), t1);
+ _mm_store_pd((double*)(op0 + 2 ), t2);
+ _mm_store_pd((double*)(op0 + 2 + TSIZE), t3);
+ _mm_store_pd((double*)(op0 + 4 ), t4);
+ _mm_store_pd((double*)(op0 + 4 + TSIZE), t5);
+ _mm_store_pd((double*)(op0 + 6 ), t6);
+ _mm_store_pd((double*)(op0 + 6 + TSIZE), t7);
+ /* } */
+
+ op0 += 2*TSIZE;
+ }
+
+ op0 = out + h*tx*TSIZE + ty*TSIZE;
+ ip0 = tmp;
+ for (y = 0; y < TSIZE; y += 1) {
+ /* memcpy(op0, ip0, TSIZE * sizeof(*ip0)); */
+
+ __m128d q0 = _mm_load_pd((double*)(ip0 + 0));
+ __m128d q1 = _mm_load_pd((double*)(ip0 + 2));
+ __m128d q2 = _mm_load_pd((double*)(ip0 + 4));
+ __m128d q3 = _mm_load_pd((double*)(ip0 + 6));
+
+ _mm_store_pd((double*)(op0 + 0), q0);
+ _mm_store_pd((double*)(op0 + 2), q1);
+ _mm_store_pd((double*)(op0 + 4), q2);
+ _mm_store_pd((double*)(op0 + 6), q3);
+
+ op0 += h;
+ ip0 += TSIZE;
+ }
+ }
+ }
+ /*
+ size_t i,j;
+ for(i=0;i<w;i+=2) {
+ for(j=0;j<h;j+=2) {
+ // out[i*h + j] = in[j*w + i];
+ __m128d q0 = _mm_load_pd((double *)(in + j*w + i));
+ __m128d q1 = _mm_load_pd((double *)(in + j*w + i + w));
+ __m128d t0 = _mm_shuffle_pd(q0, q1, _MM_SHUFFLE2(0, 0));
+ __m128d t1 = _mm_shuffle_pd(q0, q1, _MM_SHUFFLE2(1, 1));
+ _mm_store_pd((double *)(out + i*h + j), t0);
+ _mm_store_pd((double *)(out + i*h + j + h), t1);
+ }
+ }
+ */
+#else
+ const int bw = 1;
+ const int bh = 8;
+ int i = 0, j = 0;
+
+ for (; i <= h - bh; i += bh) {
+ for (j = 0; j <= w - bw; j += bw) {
+ uint64_t const *ib = &in[w*i + j];
+ uint64_t *ob = &out[h*j + i];
+
+ uint64_t s_0_0 = ib[0*w + 0];
+ uint64_t s_1_0 = ib[1*w + 0];
+ uint64_t s_2_0 = ib[2*w + 0];
+ uint64_t s_3_0 = ib[3*w + 0];
+ uint64_t s_4_0 = ib[4*w + 0];
+ uint64_t s_5_0 = ib[5*w + 0];
+ uint64_t s_6_0 = ib[6*w + 0];
+ uint64_t s_7_0 = ib[7*w + 0];
+
+ ob[0*h + 0] = s_0_0;
+ ob[0*h + 1] = s_1_0;
+ ob[0*h + 2] = s_2_0;
+ ob[0*h + 3] = s_3_0;
+ ob[0*h + 4] = s_4_0;
+ ob[0*h + 5] = s_5_0;
+ ob[0*h + 6] = s_6_0;
+ ob[0*h + 7] = s_7_0;
+ }
+ }
+
+ if (i < h) {
+ int i1;
+
+ for (i1 = 0; i1 < w; i1++) {
+ for (j = i; j < h; j++) {
+ out[i1*h + j] = in[j*w + i1];
+ }
+ }
+ }
+
+ if (j < w) {
+ int j1;
+
+ for (i = j; i < w; i++) {
+ for (j1 = 0; j1 < h; j1++) {
+ out[i*h + j1] = in[j1*w + i];
+ }
+ }
+ }
+#endif
+} \ No newline at end of file
diff --git a/src/ffts_transpose.h b/src/ffts_transpose.h
new file mode 100644
index 0000000..fe376cb
--- /dev/null
+++ b/src/ffts_transpose.h
@@ -0,0 +1,46 @@
+/*
+
+This file is part of FFTS -- The Fastest Fourier Transform in the South
+
+Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
+Copyright (c) 2012, The University of Waikato
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+* Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+* Neither the name of the organization nor the
+names of its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef FFTS_TRANSPOSE_H
+#define FFTS_TRANSPOSE_H
+
+#if defined (_MSC_VER) && (_MSC_VER >= 1020)
+#pragma once
+#endif
+
+#include "ffts_internal.h"
+
+void
+ffts_transpose(uint64_t *in, uint64_t *out, int w, int h);
+
+#endif /* FFTS_TRANSPOSE_H */
OpenPOWER on IntegriCloud