diff options
author | Jukka Ojanen <jukka.ojanen@linkotec.net> | 2016-04-05 17:52:13 +0300 |
---|---|---|
committer | Jukka Ojanen <jukka.ojanen@linkotec.net> | 2016-04-05 17:52:13 +0300 |
commit | 8b2d55e5d6bd43eb45ca7da1595fccc401a22158 (patch) | |
tree | 11a9eacf66b67b1b9529e4662243336ffb2c1695 | |
parent | c71724a0dc7536ef160732b5ed6ec1f4ef2c0ff9 (diff) | |
download | ffts-8b2d55e5d6bd43eb45ca7da1595fccc401a22158.zip ffts-8b2d55e5d6bd43eb45ca7da1595fccc401a22158.tar.gz |
Combine ffts_tranpose_scalar and ffts_transpose, and use ffts_transpose_scalar as native C fallback
-rw-r--r-- | CMakeLists.txt | 2 | ||||
-rw-r--r-- | src/ffts_nd.c | 112 | ||||
-rw-r--r-- | src/ffts_nd.h | 67 | ||||
-rw-r--r-- | src/ffts_real.c | 4 | ||||
-rw-r--r-- | src/ffts_real.h | 59 | ||||
-rw-r--r-- | src/ffts_real_nd.c | 174 | ||||
-rw-r--r-- | src/ffts_real_nd.h | 67 | ||||
-rw-r--r-- | src/ffts_transpose.c | 194 | ||||
-rw-r--r-- | src/ffts_transpose.h | 46 |
9 files changed, 399 insertions, 326 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt index 58f402b..8c21185 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -351,6 +351,8 @@ set(FFTS_SOURCES src/ffts_real.c src/ffts_real_nd.c src/ffts_real_nd.h + src/ffts_transpose.c + src/ffts_transpose.h src/ffts_trig.c src/ffts_trig.h src/ffts_static.c diff --git a/src/ffts_nd.c b/src/ffts_nd.c index 49c6229..64220f1 100644 --- a/src/ffts_nd.c +++ b/src/ffts_nd.c @@ -34,15 +34,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "ffts_nd.h" #include "ffts_internal.h" - -#ifdef HAVE_NEON -#include "neon.h" -#include <arm_neon.h> -#elif HAVE_SSE2 -#include <emmintrin.h> -#endif - -#define TSIZE 8 +#include "ffts_transpose.h" static void ffts_free_nd(ffts_plan_t *p) @@ -86,108 +78,6 @@ ffts_free_nd(ffts_plan_t *p) } static void -ffts_transpose(uint64_t *in, uint64_t *out, int w, int h) -{ -#ifdef HAVE_NEON -#if 0 - neon_transpose4(in, out, w, h); -#else - neon_transpose8(in, out, w, h); -#endif -#else -#ifdef HAVE_SSE - uint64_t FFTS_ALIGN(64) tmp[TSIZE*TSIZE]; - int tx, ty; - /* int x; */ - int y; - int tw = w / TSIZE; - int th = h / TSIZE; - - for (ty = 0; ty < th; ty++) { - for (tx = 0; tx < tw; tx++) { - uint64_t *ip0 = in + w*TSIZE*ty + tx * TSIZE; - uint64_t *op0 = tmp; /* out + h*TSIZE*tx + ty*TSIZE; */ - - /* copy/transpose to tmp */ - for (y = 0; y < TSIZE; y += 2) { - /* for (x=0;x<TSIZE;x+=2) { - op[x*TSIZE] = ip[x]; - */ - __m128d q0 = _mm_load_pd((double*)(ip0 + 0*w)); - __m128d q1 = _mm_load_pd((double*)(ip0 + 1*w)); - __m128d q2 = _mm_load_pd((double*)(ip0 + 2*w)); - __m128d q3 = _mm_load_pd((double*)(ip0 + 3*w)); - __m128d q4 = _mm_load_pd((double*)(ip0 + 4*w)); - __m128d q5 = _mm_load_pd((double*)(ip0 + 5*w)); - __m128d q6 = _mm_load_pd((double*)(ip0 + 6*w)); - __m128d q7 = _mm_load_pd((double*)(ip0 + 7*w)); - - __m128d t0 = _mm_shuffle_pd(q0, q1, _MM_SHUFFLE2(0, 0)); - __m128d t1 = _mm_shuffle_pd(q0, q1, _MM_SHUFFLE2(1, 1)); - __m128d t2 = _mm_shuffle_pd(q2, q3, _MM_SHUFFLE2(0, 0)); - __m128d t3 = _mm_shuffle_pd(q2, q3, _MM_SHUFFLE2(1, 1)); - __m128d t4 = _mm_shuffle_pd(q4, q5, _MM_SHUFFLE2(0, 0)); - __m128d t5 = _mm_shuffle_pd(q4, q5, _MM_SHUFFLE2(1, 1)); - __m128d t6 = _mm_shuffle_pd(q6, q7, _MM_SHUFFLE2(0, 0)); - __m128d t7 = _mm_shuffle_pd(q6, q7, _MM_SHUFFLE2(1, 1)); - - ip0 += 2; - /* _mm_store_pd((double *)(op0 + y*h + x), t0); - _mm_store_pd((double *)(op0 + y*h + x + h), t1); - */ - - _mm_store_pd((double*)(op0 + 0 ), t0); - _mm_store_pd((double*)(op0 + 0 + TSIZE), t1); - _mm_store_pd((double*)(op0 + 2 ), t2); - _mm_store_pd((double*)(op0 + 2 + TSIZE), t3); - _mm_store_pd((double*)(op0 + 4 ), t4); - _mm_store_pd((double*)(op0 + 4 + TSIZE), t5); - _mm_store_pd((double*)(op0 + 6 ), t6); - _mm_store_pd((double*)(op0 + 6 + TSIZE), t7); - /* } */ - - op0 += 2*TSIZE; - } - - op0 = out + h*tx*TSIZE + ty*TSIZE; - ip0 = tmp; - for (y = 0; y < TSIZE; y += 1) { - /* memcpy(op0, ip0, TSIZE * sizeof(*ip0)); */ - - __m128d q0 = _mm_load_pd((double*)(ip0 + 0)); - __m128d q1 = _mm_load_pd((double*)(ip0 + 2)); - __m128d q2 = _mm_load_pd((double*)(ip0 + 4)); - __m128d q3 = _mm_load_pd((double*)(ip0 + 6)); - - _mm_store_pd((double*)(op0 + 0), q0); - _mm_store_pd((double*)(op0 + 2), q1); - _mm_store_pd((double*)(op0 + 4), q2); - _mm_store_pd((double*)(op0 + 6), q3); - - op0 += h; - ip0 += TSIZE; - } - } - } - /* - size_t i,j; - for(i=0;i<w;i+=2) { - for(j=0;j<h;j+=2) { - // out[i*h + j] = in[j*w + i]; - __m128d q0 = _mm_load_pd((double *)(in + j*w + i)); - __m128d q1 = _mm_load_pd((double *)(in + j*w + i + w)); - __m128d t0 = _mm_shuffle_pd(q0, q1, _MM_SHUFFLE2(0, 0)); - __m128d t1 = _mm_shuffle_pd(q0, q1, _MM_SHUFFLE2(1, 1)); - _mm_store_pd((double *)(out + i*h + j), t0); - _mm_store_pd((double *)(out + i*h + j + h), t1); - } - } - */ -#endif -#endif -} - -static void ffts_execute_nd(ffts_plan_t *p, const void *in, void *out) { uint64_t *din = (uint64_t*) in; diff --git a/src/ffts_nd.h b/src/ffts_nd.h index a960cad..1998cc6 100644 --- a/src/ffts_nd.h +++ b/src/ffts_nd.h @@ -1,43 +1,50 @@ /* - This file is part of FFTS -- The Fastest Fourier Transform in the South - - Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com> - Copyright (c) 2012, The University of Waikato - - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - * Neither the name of the organization nor the - names of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY - DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +This file is part of FFTS -- The Fastest Fourier Transform in the South + +Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com> +Copyright (c) 2012, The University of Waikato + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: +* Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +* Neither the name of the organization nor the +names of its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef FFTS_ND_H #define FFTS_ND_H +#if defined (_MSC_VER) && (_MSC_VER >= 1020) +#pragma once +#endif + #include "ffts.h" #include <stddef.h> -ffts_plan_t *ffts_init_nd(int rank, size_t *Ns, int sign); -ffts_plan_t *ffts_init_2d(size_t N1, size_t N2, int sign); +ffts_plan_t* +ffts_init_nd(int rank, size_t *Ns, int sign); + +ffts_plan_t* +ffts_init_2d(size_t N1, size_t N2, int sign); #endif /* FFTS_ND_H */ diff --git a/src/ffts_real.c b/src/ffts_real.c index 7f41069..0f87a12 100644 --- a/src/ffts_real.c +++ b/src/ffts_real.c @@ -641,9 +641,9 @@ ffts_init_1d_real(size_t N, int sign) } #ifdef HAVE_SSE3 - ffts_generate_table_1d_real_32f(p, sign, 1); + ffts_generate_table_1d_real_32f(p, sign, 1); #else - ffts_generate_table_1d_real_32f(p, sign, 0); + ffts_generate_table_1d_real_32f(p, sign, 0); #endif return p; diff --git a/src/ffts_real.h b/src/ffts_real.h index 81ca80f..61d03d4 100644 --- a/src/ffts_real.h +++ b/src/ffts_real.h @@ -1,33 +1,33 @@ /* - This file is part of FFTS -- The Fastest Fourier Transform in the South - - Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com> - Copyright (c) 2012, The University of Waikato - - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - * Neither the name of the organization nor the - names of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY - DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +This file is part of FFTS -- The Fastest Fourier Transform in the South + +Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com> +Copyright (c) 2012, The University of Waikato + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: +* Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +* Neither the name of the organization nor the +names of its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ @@ -41,6 +41,7 @@ #include "ffts.h" #include <stddef.h> -ffts_plan_t *ffts_init_1d_real(size_t N, int sign); +ffts_plan_t* +ffts_init_1d_real(size_t N, int sign); #endif /* FFTS_REAL_H */ diff --git a/src/ffts_real_nd.c b/src/ffts_real_nd.c index 545e8f0..89ef7f7 100644 --- a/src/ffts_real_nd.c +++ b/src/ffts_real_nd.c @@ -1,80 +1,67 @@ /* - This file is part of FFTS -- The Fastest Fourier Transform in the South - - Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com> - Copyright (c) 2012, The University of Waikato - - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - * Neither the name of the organization nor the - names of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY - DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +This file is part of FFTS -- The Fastest Fourier Transform in the South + +Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com> +Copyright (c) 2012, The University of Waikato + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: +* Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +* Neither the name of the organization nor the +names of its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "ffts_real_nd.h" #include "ffts_real.h" #include "ffts_internal.h" +#include "ffts_transpose.h" -#ifdef __ARM_NEON__ -#include "neon.h" -#endif - -#ifdef HAVE_NEON -#include <arm_neon.h> -#elif HAVE_SSE -#include <xmmintrin.h> -#endif - -#include <stdio.h> - -static void ffts_free_nd_real(ffts_plan_t *p) +static void +ffts_free_nd_real(ffts_plan_t *p) { if (p->plans) { - int i; + int i, j; for (i = 0; i < p->rank; i++) { ffts_plan_t *plan = p->plans[i]; - if (plan) { - int j; - - for (j = i + 1; j < p->rank; j++) { - if (plan == p->plans[j]) { - p->plans[j] = NULL; - } - } - - ffts_free(plan); - } + if (plan) { + for (j = 0; j < i; j++) { + if (p->Ns[i] == p->Ns[j]) { + plan = NULL; + break; + } + } + + if (plan) { + ffts_free(plan); + } + } } free(p->plans); } - if (p->transpose_buf) { - ffts_aligned_free(p->transpose_buf); - } - if (p->buf) { ffts_aligned_free(p->buf); } @@ -90,59 +77,8 @@ static void ffts_free_nd_real(ffts_plan_t *p) free(p); } -static void ffts_scalar_transpose(uint64_t *src, uint64_t *dst, int w, int h, uint64_t *buf) -{ - const int bw = 1; - const int bh = 8; - int i = 0, j = 0; - - for (; i <= h - bh; i += bh) { - for (j = 0; j <= w - bw; j += bw) { - uint64_t const *ib = &src[w*i + j]; - uint64_t *ob = &dst[h*j + i]; - - uint64_t s_0_0 = ib[0*w + 0]; - uint64_t s_1_0 = ib[1*w + 0]; - uint64_t s_2_0 = ib[2*w + 0]; - uint64_t s_3_0 = ib[3*w + 0]; - uint64_t s_4_0 = ib[4*w + 0]; - uint64_t s_5_0 = ib[5*w + 0]; - uint64_t s_6_0 = ib[6*w + 0]; - uint64_t s_7_0 = ib[7*w + 0]; - - ob[0*h + 0] = s_0_0; - ob[0*h + 1] = s_1_0; - ob[0*h + 2] = s_2_0; - ob[0*h + 3] = s_3_0; - ob[0*h + 4] = s_4_0; - ob[0*h + 5] = s_5_0; - ob[0*h + 6] = s_6_0; - ob[0*h + 7] = s_7_0; - } - } - - if (i < h) { - int i1; - - for (i1 = 0; i1 < w; i1++) { - for (j = i; j < h; j++) { - dst[i1*h + j] = src[j*w + i1]; - } - } - } - - if (j < w) { - int j1; - - for (i = j; i < w; i++) { - for (j1 = 0; j1 < h; j1++) { - dst[i*h + j1] = src[j1*w + i]; - } - } - } -} - -static void ffts_execute_nd_real(ffts_plan_t *p, const void *in, void *out) +static void +ffts_execute_nd_real(ffts_plan_t *p, const void *in, void *out) { const size_t Ms0 = p->Ms[0]; const size_t Ns0 = p->Ns[0]; @@ -150,7 +86,6 @@ static void ffts_execute_nd_real(ffts_plan_t *p, const void *in, void *out) uint32_t *din = (uint32_t*) in; uint64_t *buf = p->buf; uint64_t *dout = (uint64_t*) out; - uint64_t *transpose_buf = (uint64_t*) p->transpose_buf; ffts_plan_t *plan; int i; @@ -161,7 +96,7 @@ static void ffts_execute_nd_real(ffts_plan_t *p, const void *in, void *out) plan->transform(plan, din + (j * Ms0), buf + (j * (Ms0 / 2 + 1))); } - ffts_scalar_transpose(buf, dout, Ms0 / 2 + 1, Ns0, transpose_buf); + ffts_transpose(buf, dout, Ms0 / 2 + 1, Ns0); for (i = 1; i < p->rank; i++) { const size_t Ms = p->Ms[i]; @@ -173,11 +108,12 @@ static void ffts_execute_nd_real(ffts_plan_t *p, const void *in, void *out) plan->transform(plan, dout + (j * Ms), buf + (j * Ms)); } - ffts_scalar_transpose(buf, dout, Ms, Ns, transpose_buf); + ffts_transpose(buf, dout, Ms, Ns); } } -static void ffts_execute_nd_real_inv(ffts_plan_t *p, const void *in, void *out) +static void +ffts_execute_nd_real_inv(ffts_plan_t *p, const void *in, void *out) { const size_t Ms0 = p->Ms[0]; const size_t Ms1 = p->Ms[1]; @@ -187,7 +123,6 @@ static void ffts_execute_nd_real_inv(ffts_plan_t *p, const void *in, void *out) uint64_t *din = (uint64_t*) in; uint64_t *buf = p->buf; uint64_t *buf2; - uint64_t *transpose_buf = (uint64_t*) p->transpose_buf; float *doutr = (float*) out; ffts_plan_t *plan; @@ -203,14 +138,14 @@ static void ffts_execute_nd_real_inv(ffts_plan_t *p, const void *in, void *out) buf2 = buf + vol; - ffts_scalar_transpose(din, buf, Ms0, Ns0, transpose_buf); + ffts_transpose(din, buf, Ms0, Ns0); plan = p->plans[0]; for (j = 0; j < Ms0; j++) { plan->transform(plan, buf + (j * Ns0), buf2 + (j * Ns0)); } - ffts_scalar_transpose(buf2, buf, Ns0, Ms0, transpose_buf); + ffts_transpose(buf2, buf, Ns0, Ms0); plan = p->plans[1]; for (j = 0; j < Ms1; j++) { @@ -267,11 +202,6 @@ ffts_init_nd_real(int rank, size_t *Ns, int sign) goto cleanup; } - p->transpose_buf = ffts_aligned_malloc(2 * 8 * 8 * sizeof(float)); - if (!p->transpose_buf) { - goto cleanup; - } - p->plans = (ffts_plan_t**) calloc(rank, sizeof(*p->plans)); if (!p->plans) { goto cleanup; diff --git a/src/ffts_real_nd.h b/src/ffts_real_nd.h index 22a708d..fac607b 100644 --- a/src/ffts_real_nd.h +++ b/src/ffts_real_nd.h @@ -1,33 +1,33 @@ - /* - - This file is part of FFTS -- The Fastest Fourier Transform in the South - - Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com> - Copyright (c) 2012, The University of Waikato - - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - * Neither the name of the organization nor the - names of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY - DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +/* + +This file is part of FFTS -- The Fastest Fourier Transform in the South + +Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com> +Copyright (c) 2012, The University of Waikato + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: +* Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +* Neither the name of the organization nor the +names of its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ @@ -41,7 +41,10 @@ #include "ffts.h" #include <stddef.h> -ffts_plan_t *ffts_init_nd_real(int rank, size_t *Ns, int sign); -ffts_plan_t *ffts_init_2d_real(size_t N1, size_t N2, int sign); +ffts_plan_t* +ffts_init_nd_real(int rank, size_t *Ns, int sign); + +ffts_plan_t* +ffts_init_2d_real(size_t N1, size_t N2, int sign); #endif /* FFTS_REAL_ND_H */
\ No newline at end of file diff --git a/src/ffts_transpose.c b/src/ffts_transpose.c new file mode 100644 index 0000000..272cb48 --- /dev/null +++ b/src/ffts_transpose.c @@ -0,0 +1,194 @@ +/* + +This file is part of FFTS -- The Fastest Fourier Transform in the South + +Copyright (c) 2016, Jukka Ojanen <jukka.ojanen@kolumbus.fi> +Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com> +Copyright (c) 2012, The University of Waikato + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: +* Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +* Neither the name of the organization nor the +names of its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "ffts_transpose.h" +#include "ffts_internal.h" + +#ifdef HAVE_NEON +#include "neon.h" +#include <arm_neon.h> +#elif HAVE_SSE2 +#include <emmintrin.h> +#endif + +#define TSIZE 8 + +void +ffts_transpose(uint64_t *in, uint64_t *out, int w, int h) +{ +#ifdef HAVE_NEON +#if 0 + neon_transpose4(in, out, w, h); +#else + neon_transpose8(in, out, w, h); +#endif +#elif HAVE_SSE2 + uint64_t FFTS_ALIGN(64) tmp[TSIZE*TSIZE]; + int tx, ty; + /* int x; */ + int y; + int tw = w / TSIZE; + int th = h / TSIZE; + + for (ty = 0; ty < th; ty++) { + for (tx = 0; tx < tw; tx++) { + uint64_t *ip0 = in + w*TSIZE*ty + tx * TSIZE; + uint64_t *op0 = tmp; /* out + h*TSIZE*tx + ty*TSIZE; */ + + /* copy/transpose to tmp */ + for (y = 0; y < TSIZE; y += 2) { + /* for (x=0;x<TSIZE;x+=2) { + op[x*TSIZE] = ip[x]; + */ + __m128d q0 = _mm_load_pd((double*)(ip0 + 0*w)); + __m128d q1 = _mm_load_pd((double*)(ip0 + 1*w)); + __m128d q2 = _mm_load_pd((double*)(ip0 + 2*w)); + __m128d q3 = _mm_load_pd((double*)(ip0 + 3*w)); + __m128d q4 = _mm_load_pd((double*)(ip0 + 4*w)); + __m128d q5 = _mm_load_pd((double*)(ip0 + 5*w)); + __m128d q6 = _mm_load_pd((double*)(ip0 + 6*w)); + __m128d q7 = _mm_load_pd((double*)(ip0 + 7*w)); + + __m128d t0 = _mm_shuffle_pd(q0, q1, _MM_SHUFFLE2(0, 0)); + __m128d t1 = _mm_shuffle_pd(q0, q1, _MM_SHUFFLE2(1, 1)); + __m128d t2 = _mm_shuffle_pd(q2, q3, _MM_SHUFFLE2(0, 0)); + __m128d t3 = _mm_shuffle_pd(q2, q3, _MM_SHUFFLE2(1, 1)); + __m128d t4 = _mm_shuffle_pd(q4, q5, _MM_SHUFFLE2(0, 0)); + __m128d t5 = _mm_shuffle_pd(q4, q5, _MM_SHUFFLE2(1, 1)); + __m128d t6 = _mm_shuffle_pd(q6, q7, _MM_SHUFFLE2(0, 0)); + __m128d t7 = _mm_shuffle_pd(q6, q7, _MM_SHUFFLE2(1, 1)); + + ip0 += 2; + /* _mm_store_pd((double *)(op0 + y*h + x), t0); + _mm_store_pd((double *)(op0 + y*h + x + h), t1); + */ + + _mm_store_pd((double*)(op0 + 0 ), t0); + _mm_store_pd((double*)(op0 + 0 + TSIZE), t1); + _mm_store_pd((double*)(op0 + 2 ), t2); + _mm_store_pd((double*)(op0 + 2 + TSIZE), t3); + _mm_store_pd((double*)(op0 + 4 ), t4); + _mm_store_pd((double*)(op0 + 4 + TSIZE), t5); + _mm_store_pd((double*)(op0 + 6 ), t6); + _mm_store_pd((double*)(op0 + 6 + TSIZE), t7); + /* } */ + + op0 += 2*TSIZE; + } + + op0 = out + h*tx*TSIZE + ty*TSIZE; + ip0 = tmp; + for (y = 0; y < TSIZE; y += 1) { + /* memcpy(op0, ip0, TSIZE * sizeof(*ip0)); */ + + __m128d q0 = _mm_load_pd((double*)(ip0 + 0)); + __m128d q1 = _mm_load_pd((double*)(ip0 + 2)); + __m128d q2 = _mm_load_pd((double*)(ip0 + 4)); + __m128d q3 = _mm_load_pd((double*)(ip0 + 6)); + + _mm_store_pd((double*)(op0 + 0), q0); + _mm_store_pd((double*)(op0 + 2), q1); + _mm_store_pd((double*)(op0 + 4), q2); + _mm_store_pd((double*)(op0 + 6), q3); + + op0 += h; + ip0 += TSIZE; + } + } + } + /* + size_t i,j; + for(i=0;i<w;i+=2) { + for(j=0;j<h;j+=2) { + // out[i*h + j] = in[j*w + i]; + __m128d q0 = _mm_load_pd((double *)(in + j*w + i)); + __m128d q1 = _mm_load_pd((double *)(in + j*w + i + w)); + __m128d t0 = _mm_shuffle_pd(q0, q1, _MM_SHUFFLE2(0, 0)); + __m128d t1 = _mm_shuffle_pd(q0, q1, _MM_SHUFFLE2(1, 1)); + _mm_store_pd((double *)(out + i*h + j), t0); + _mm_store_pd((double *)(out + i*h + j + h), t1); + } + } + */ +#else + const int bw = 1; + const int bh = 8; + int i = 0, j = 0; + + for (; i <= h - bh; i += bh) { + for (j = 0; j <= w - bw; j += bw) { + uint64_t const *ib = &in[w*i + j]; + uint64_t *ob = &out[h*j + i]; + + uint64_t s_0_0 = ib[0*w + 0]; + uint64_t s_1_0 = ib[1*w + 0]; + uint64_t s_2_0 = ib[2*w + 0]; + uint64_t s_3_0 = ib[3*w + 0]; + uint64_t s_4_0 = ib[4*w + 0]; + uint64_t s_5_0 = ib[5*w + 0]; + uint64_t s_6_0 = ib[6*w + 0]; + uint64_t s_7_0 = ib[7*w + 0]; + + ob[0*h + 0] = s_0_0; + ob[0*h + 1] = s_1_0; + ob[0*h + 2] = s_2_0; + ob[0*h + 3] = s_3_0; + ob[0*h + 4] = s_4_0; + ob[0*h + 5] = s_5_0; + ob[0*h + 6] = s_6_0; + ob[0*h + 7] = s_7_0; + } + } + + if (i < h) { + int i1; + + for (i1 = 0; i1 < w; i1++) { + for (j = i; j < h; j++) { + out[i1*h + j] = in[j*w + i1]; + } + } + } + + if (j < w) { + int j1; + + for (i = j; i < w; i++) { + for (j1 = 0; j1 < h; j1++) { + out[i*h + j1] = in[j1*w + i]; + } + } + } +#endif +}
\ No newline at end of file diff --git a/src/ffts_transpose.h b/src/ffts_transpose.h new file mode 100644 index 0000000..fe376cb --- /dev/null +++ b/src/ffts_transpose.h @@ -0,0 +1,46 @@ +/* + +This file is part of FFTS -- The Fastest Fourier Transform in the South + +Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com> +Copyright (c) 2012, The University of Waikato + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: +* Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +* Neither the name of the organization nor the +names of its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef FFTS_TRANSPOSE_H +#define FFTS_TRANSPOSE_H + +#if defined (_MSC_VER) && (_MSC_VER >= 1020) +#pragma once +#endif + +#include "ffts_internal.h" + +void +ffts_transpose(uint64_t *in, uint64_t *out, int w, int h); + +#endif /* FFTS_TRANSPOSE_H */ |