From 0bea449a662d4bcd228a2b6bc2f6c8f4162dda2c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Carretero?= Date: Tue, 30 Sep 2014 20:54:46 -0400 Subject: real: fix alignment issue in 1d execution (bug #30) Because of the size of M/2+1, you can't expect the data to be aligned at 128 bits. --- src/ffts_real.c | 160 ++++++++++++++++++++++++++++---------------------------- 1 file changed, 80 insertions(+), 80 deletions(-) (limited to 'src/ffts_real.c') diff --git a/src/ffts_real.c b/src/ffts_real.c index 7fad638..87eaab7 100644 --- a/src/ffts_real.c +++ b/src/ffts_real.c @@ -61,45 +61,46 @@ void ffts_execute_1d_real(ffts_plan_t *p, const void *vin, void *vout) { size_t i; #ifdef __ARM_NEON__ for(i=0;i Date: Tue, 30 Sep 2014 21:01:14 -0400 Subject: automatic trailing space removal ... because I use a real editor --- src/ffts_real.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) (limited to 'src/ffts_real.c') diff --git a/src/ffts_real.c b/src/ffts_real.c index 87eaab7..97ff942 100644 --- a/src/ffts_real.c +++ b/src/ffts_real.c @@ -1,10 +1,10 @@ /* - + This file is part of FFTS -- The Fastest Fourier Transform in the South - + Copyright (c) 2012, Anthony M. Blake - Copyright (c) 2012, The University of Waikato - + Copyright (c) 2012, The University of Waikato + All rights reserved. Redistribution and use in source and binary forms, with or without @@ -111,10 +111,10 @@ void ffts_execute_1d_real(ffts_plan_t *p, const void *vin, void *vout) { #endif } - + out[N] = buf[0] - buf[1]; out[N+1] = 0.0f; - + } void ffts_execute_1d_real_inv(ffts_plan_t *p, const void *vin, void *vout) { @@ -124,12 +124,12 @@ void ffts_execute_1d_real_inv(ffts_plan_t *p, const void *vin, void *vout) { float *A = p->A; float *B = p->B; size_t N = p->N; - + float *p_buf0 = in; float *p_buf1 = in + N - 2; - + float *p_out = buf; - + size_t i; #ifdef __ARM_NEON__ for(i=0;iplans[0]->transform(p->plans[0], buf, out); - + } ffts_plan_t *ffts_init_1d_real(size_t N, int sign) { @@ -189,13 +189,13 @@ ffts_plan_t *ffts_init_1d_real(size_t N, int sign) { if(sign < 0) p->transform = &ffts_execute_1d_real; else p->transform = &ffts_execute_1d_real_inv; - + p->destroy = &ffts_free_1d_real; p->N = N; p->rank = 1; p->plans = malloc(sizeof(ffts_plan_t **) * 1); - p->plans[0] = ffts_init_1d(N/2, sign); + p->plans[0] = ffts_init_1d(N/2, sign); p->buf = valloc(sizeof(float) * 2 * ((N/2) + 1)); @@ -219,7 +219,7 @@ ffts_plan_t *ffts_init_1d_real(size_t N, int sign) { p->B[2 * i + 1] = 1.0 * (1.0 * cos (2.0f * PI / (double) (N) * (double) i)); } } - + return p; } -- cgit v1.1 From c602cee1b51e8c532e4817d41d973deea8ab257a Mon Sep 17 00:00:00 2001 From: Jukka Ojanen Date: Wed, 29 Oct 2014 16:13:33 +0200 Subject: Cleaning and reorganizing --- src/ffts_real.c | 452 +++++++++++++++++++++++++++++++------------------------- 1 file changed, 253 insertions(+), 199 deletions(-) (limited to 'src/ffts_real.c') diff --git a/src/ffts_real.c b/src/ffts_real.c index 97ff942..77c57a0 100644 --- a/src/ffts_real.c +++ b/src/ffts_real.c @@ -1,227 +1,281 @@ /* - This file is part of FFTS -- The Fastest Fourier Transform in the South - - Copyright (c) 2012, Anthony M. Blake - Copyright (c) 2012, The University of Waikato - - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - * Neither the name of the organization nor the - names of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY - DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +This file is part of FFTS -- The Fastest Fourier Transform in the South + +Copyright (c) 2012, Anthony M. Blake +Copyright (c) 2012, The University of Waikato + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: +* Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +* Neither the name of the organization nor the +names of its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "ffts_real.h" -void ffts_free_1d_real(ffts_plan_t *p) { - ffts_free(p->plans[0]); - free(p->A); - free(p->B); - free(p->plans); - free(p->buf); - free(p); -} +#ifdef HAVE_NEON +#include +#endif + +#ifdef HAVE_SSE +#include +#endif + +static void ffts_free_1d_real(ffts_plan_t *p) +{ + if (p->B) { + ffts_aligned_free(p->B); + } -void ffts_execute_1d_real(ffts_plan_t *p, const void *vin, void *vout) { - float *out = (float *)vout; - float *buf = (float *)p->buf; - float *A = p->A; - float *B = p->B; + if (p->A) { + ffts_aligned_free(p->A); + } - p->plans[0]->transform(p->plans[0], vin, buf); + if (p->buf) { + ffts_aligned_free(p->buf); + } - size_t N = p->N; - buf[N] = buf[0]; - buf[N+1] = buf[1]; + if (p->plans) { + ffts_free(p->plans[0]); + free(p->plans); + } - float *p_buf0 = buf; - float *p_buf1 = buf + N - 2; - float *p_out = out; + free(p); +} + +static void ffts_execute_1d_real(ffts_plan_t *p, const void *vin, void *vout) +{ + float *out = (float*) vout; + float *buf = (float*) p->buf; + float *A = p->A; + float *B = p->B; + size_t N = p->N; - size_t i; #ifdef __ARM_NEON__ - for(i=0;iplans[0]->transform(p->plans[0], vin, buf); - out[N] = buf[0] - buf[1]; - out[N+1] = 0.0f; + buf[N + 0] = buf[0]; + buf[N + 1] = buf[1]; +#ifdef __ARM_NEON__ + for (i = 0; i < N/2; i += 2) { + __asm__ __volatile__ ( + "vld1.32 {q8}, [%[pa]]!\n\t" + "vld1.32 {q9}, [%[pb]]!\n\t" + "vld1.32 {q10}, [%[buf0]]!\n\t" + "vld1.32 {q11}, [%[buf1]]\n\t" + "sub %[buf1], %[buf1], #16\n\t" + + "vdup.32 d26, d16[1]\n\t" + "vdup.32 d27, d17[1]\n\t" + "vdup.32 d24, d16[0]\n\t" + "vdup.32 d25, d17[0]\n\t" + + "vdup.32 d30, d23[1]\n\t" + "vdup.32 d31, d22[1]\n\t" + "vdup.32 d28, d23[0]\n\t" + "vdup.32 d29, d22[0]\n\t" + + "vmul.f32 q13, q13, q10\n\t" + "vmul.f32 q15, q15, q9\n\t" + "vmul.f32 q12, q12, q10\n\t" + "vmul.f32 q14, q14, q9\n\t" + "vrev64.f32 q13, q13\n\t" + "vrev64.f32 q15, q15\n\t" + + "vtrn.32 d26, d27\n\t" + "vtrn.32 d30, d31\n\t" + "vneg.f32 d26, d26\n\t" + "vneg.f32 d31, d31\n\t" + "vtrn.32 d26, d27\n\t" + "vtrn.32 d30, d31\n\t" + + "vadd.f32 q12, q12, q14\n\t" + "vadd.f32 q13, q13, q15\n\t" + "vadd.f32 q12, q12, q13\n\t" + "vst1.32 {q12}, [%[pout]]!\n\t" + : [pa] "+r" (A), [pb] "+r" (B), [buf0] "+r" (p_buf0), [buf1] "+r" (p_buf1), + [pout] "+r" (p_out) + : + : "memory", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); + } +#else + for (i = 0; i < N/2; i++) { + out[2*i + 0] = buf[2*i + 0] * A[2*i] - buf[2*i + 1] * A[2*i + 1] + buf[N - 2*i] * B[2*i + 0] + buf[N - 2*i + 1] * B[2*i + 1]; + out[2*i + 1] = buf[2*i + 1] * A[2*i] + buf[2*i + 0] * A[2*i + 1] + buf[N - 2*i] * B[2*i + 1] - buf[N - 2*i + 1] * B[2*i + 0]; + + /* out[2*N-2*i+0] = out[2*i+0]; + out[2*N-2*i+1] = -out[2*i+1]; + */ + } +#endif + + out[N + 0] = buf[0] - buf[1]; + out[N + 1] = 0.0f; } -void ffts_execute_1d_real_inv(ffts_plan_t *p, const void *vin, void *vout) { - float *out = (float *)vout; - float *in = (float *)vin; - float *buf = (float *)p->buf; - float *A = p->A; - float *B = p->B; - size_t N = p->N; +static void ffts_execute_1d_real_inv(ffts_plan_t *p, const void *vin, void *vout) +{ + float *out = (float*) vout; + float *in = (float*) vin; + float *buf = (float*) p->buf; + float *A = p->A; + float *B = p->B; + size_t N = p->N; - float *p_buf0 = in; - float *p_buf1 = in + N - 2; +#ifdef __ARM_NEON__ + float *p_buf0 = in; + float *p_buf1 = in + N - 2; + float *p_out = buf; +#endif - float *p_out = buf; + size_t i; - size_t i; #ifdef __ARM_NEON__ - for(i=0;iplans[0]->transform(p->plans[0], buf, out); - -} - -ffts_plan_t *ffts_init_1d_real(size_t N, int sign) { - ffts_plan_t *p = malloc(sizeof(ffts_plan_t)); - - if(sign < 0) p->transform = &ffts_execute_1d_real; - else p->transform = &ffts_execute_1d_real_inv; - - p->destroy = &ffts_free_1d_real; - p->N = N; - p->rank = 1; - p->plans = malloc(sizeof(ffts_plan_t **) * 1); - - p->plans[0] = ffts_init_1d(N/2, sign); - - p->buf = valloc(sizeof(float) * 2 * ((N/2) + 1)); - - p->A = valloc(sizeof(float) * N); - p->B = valloc(sizeof(float) * N); - - if(sign < 0) { - int i; - for (i = 0; i < N/2; i++) { - p->A[2 * i] = 0.5 * (1.0 - sin (2.0f * PI / (double) (N) * (double) i)); - p->A[2 * i + 1] = 0.5 * (-1.0 * cos (2.0f * PI / (double) (N) * (double) i)); - p->B[2 * i] = 0.5 * (1.0 + sin (2.0f * PI / (double) (N) * (double) i)); - p->B[2 * i + 1] = 0.5 * (1.0 * cos (2.0f * PI / (double) (N) * (double) i)); - } - }else{ - int i; - for (i = 0; i < N/2; i++) { - p->A[2 * i] = 1.0 * (1.0 - sin (2.0f * PI / (double) (N) * (double) i)); - p->A[2 * i + 1] = 1.0 * (-1.0 * cos (2.0f * PI / (double) (N) * (double) i)); - p->B[2 * i] = 1.0 * (1.0 + sin (2.0f * PI / (double) (N) * (double) i)); - p->B[2 * i + 1] = 1.0 * (1.0 * cos (2.0f * PI / (double) (N) * (double) i)); - } - } - - return p; + p->plans[0]->transform(p->plans[0], buf, out); } - -// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3: +ffts_plan_t *ffts_init_1d_real(size_t N, int sign) +{ + ffts_plan_t *p; + size_t i; + + p = (ffts_plan_t*) calloc(1, sizeof(*p)); + if (!p) { + return NULL; + } + + if (sign < 0) { + p->transform = &ffts_execute_1d_real; + } else { + p->transform = &ffts_execute_1d_real_inv; + } + + p->destroy = &ffts_free_1d_real; + p->N = N; + p->rank = 1; + + p->plans = (ffts_plan_t**) malloc(1 * sizeof(*p->plans)); + if (!p->plans) { + goto cleanup; + } + + p->plans[0] = ffts_init_1d(N/2, sign); + if (!p->plans[0]) { + goto cleanup; + } + + p->buf = ffts_aligned_malloc(2 * ((N/2) + 1) * sizeof(float)); + if (!p->buf) { + goto cleanup; + } + + p->A = (float*) ffts_aligned_malloc(N * sizeof(float)); + if (!p->A) { + goto cleanup; + } + + p->B = (float*) ffts_aligned_malloc(N * sizeof(float)); + if (!p->B) { + goto cleanup; + } + + if (sign < 0) { + for (i = 0; i < N/2; i++) { + p->A[2 * i + 0] = 0.5 * ( 1.0 - sin(2.0 * M_PI / (double) N * (double) i)); + p->A[2 * i + 1] = 0.5 * (-1.0 * cos(2.0 * M_PI / (double) N * (double) i)); + p->B[2 * i + 0] = 0.5 * ( 1.0 + sin(2.0 * M_PI / (double) N * (double) i)); + p->B[2 * i + 1] = 0.5 * ( 1.0 * cos(2.0 * M_PI / (double) N * (double) i)); + } + } else { + for (i = 0; i < N/2; i++) { + p->A[2 * i + 0] = 1.0 * ( 1.0 - sin(2.0 * M_PI / (double) N * (double) i)); + p->A[2 * i + 1] = 1.0 * (-1.0 * cos(2.0 * M_PI / (double) N * (double) i)); + p->B[2 * i + 0] = 1.0 * ( 1.0 + sin(2.0 * M_PI / (double) N * (double) i)); + p->B[2 * i + 1] = 1.0 * ( 1.0 * cos(2.0 * M_PI / (double) N * (double) i)); + } + } + + return p; + +cleanup: + ffts_free_1d_real(p); + return NULL; +} \ No newline at end of file -- cgit v1.1 From 71f1f4dae77c2f6b335c3e06c13a3ecedf73ccda Mon Sep 17 00:00:00 2001 From: Jukka Ojanen Date: Mon, 17 Nov 2014 15:39:46 +0200 Subject: Fix redefinition of ffts_plan_t --- src/ffts_real.c | 1 + 1 file changed, 1 insertion(+) (limited to 'src/ffts_real.c') diff --git a/src/ffts_real.c b/src/ffts_real.c index 77c57a0..c2f03b7 100644 --- a/src/ffts_real.c +++ b/src/ffts_real.c @@ -32,6 +32,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "ffts_real.h" +#include "ffts_internal.h" #ifdef HAVE_NEON #include -- cgit v1.1 From f16baa9919e28a57363e974e4adfff6c7dce9e74 Mon Sep 17 00:00:00 2001 From: Jukka Ojanen Date: Sat, 6 Dec 2014 16:33:46 +0200 Subject: Definitions HAVE_NEON and HAVE_SSE cannot coexist --- src/ffts_real.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'src/ffts_real.c') diff --git a/src/ffts_real.c b/src/ffts_real.c index c2f03b7..12c02b9 100644 --- a/src/ffts_real.c +++ b/src/ffts_real.c @@ -36,9 +36,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifdef HAVE_NEON #include -#endif - -#ifdef HAVE_SSE +#elif HAVE_SSE #include #endif @@ -132,10 +130,6 @@ static void ffts_execute_1d_real(ffts_plan_t *p, const void *vin, void *vout) for (i = 0; i < N/2; i++) { out[2*i + 0] = buf[2*i + 0] * A[2*i] - buf[2*i + 1] * A[2*i + 1] + buf[N - 2*i] * B[2*i + 0] + buf[N - 2*i + 1] * B[2*i + 1]; out[2*i + 1] = buf[2*i + 1] * A[2*i] + buf[2*i + 0] * A[2*i + 1] + buf[N - 2*i] * B[2*i + 1] - buf[N - 2*i + 1] * B[2*i + 0]; - - /* out[2*N-2*i+0] = out[2*i+0]; - out[2*N-2*i+1] = -out[2*i+1]; - */ } #endif -- cgit v1.1 From ceb8e6aef7f0e406ff4724896a8138bf72911a68 Mon Sep 17 00:00:00 2001 From: Jukka Ojanen Date: Mon, 6 Jul 2015 12:02:33 +0300 Subject: Avoid allocating array of single pointer --- src/ffts_real.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) (limited to 'src/ffts_real.c') diff --git a/src/ffts_real.c b/src/ffts_real.c index 12c02b9..5522f6b 100644 --- a/src/ffts_real.c +++ b/src/ffts_real.c @@ -4,6 +4,7 @@ This file is part of FFTS -- The Fastest Fourier Transform in the South Copyright (c) 2012, Anthony M. Blake Copyright (c) 2012, The University of Waikato +Copyright (c) 2015, Jukka Ojanen All rights reserved. @@ -40,7 +41,8 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #endif -static void ffts_free_1d_real(ffts_plan_t *p) +static void +ffts_free_1d_real(ffts_plan_t *p) { if (p->B) { ffts_aligned_free(p->B); @@ -54,9 +56,8 @@ static void ffts_free_1d_real(ffts_plan_t *p) ffts_aligned_free(p->buf); } - if (p->plans) { + if (p->plans[0]) { ffts_free(p->plans[0]); - free(p->plans); } free(p); @@ -207,12 +208,13 @@ static void ffts_execute_1d_real_inv(ffts_plan_t *p, const void *vin, void *vout p->plans[0]->transform(p->plans[0], buf, out); } -ffts_plan_t *ffts_init_1d_real(size_t N, int sign) +ffts_plan_t* +ffts_init_1d_real(size_t N, int sign) { ffts_plan_t *p; size_t i; - p = (ffts_plan_t*) calloc(1, sizeof(*p)); + p = (ffts_plan_t*) calloc(1, sizeof(*p) + sizeof(*p->plans)); if (!p) { return NULL; } @@ -226,11 +228,7 @@ ffts_plan_t *ffts_init_1d_real(size_t N, int sign) p->destroy = &ffts_free_1d_real; p->N = N; p->rank = 1; - - p->plans = (ffts_plan_t**) malloc(1 * sizeof(*p->plans)); - if (!p->plans) { - goto cleanup; - } + p->plans = (ffts_plan_t**) &p[1]; p->plans[0] = ffts_init_1d(N/2, sign); if (!p->plans[0]) { -- cgit v1.1 From fbcfb21e9de85b6443848c721523d3793ae668ff Mon Sep 17 00:00:00 2001 From: Jukka Ojanen Date: Mon, 6 Jul 2015 12:08:32 +0300 Subject: Add new attributes to help auto-vectorization --- src/ffts_real.c | 47 ++++++++++++++++++++++++++++++----------------- 1 file changed, 30 insertions(+), 17 deletions(-) (limited to 'src/ffts_real.c') diff --git a/src/ffts_real.c b/src/ffts_real.c index 5522f6b..82a9e79 100644 --- a/src/ffts_real.c +++ b/src/ffts_real.c @@ -63,13 +63,19 @@ ffts_free_1d_real(ffts_plan_t *p) free(p); } -static void ffts_execute_1d_real(ffts_plan_t *p, const void *vin, void *vout) +static void +ffts_execute_1d_real(ffts_plan_t *p, const void *input, void *output) { - float *out = (float*) vout; - float *buf = (float*) p->buf; - float *A = p->A; - float *B = p->B; - size_t N = p->N; + float *const FFTS_RESTRICT out = + (float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_16(output); + float *const FFTS_RESTRICT buf = + (float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->buf); + const float *const FFTS_RESTRICT A = + (const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->A); + const float *const FFTS_RESTRICT B = + (const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->B); + const int N = (const int) p->N; + int i; #ifdef __ARM_NEON__ float *p_buf0 = buf; @@ -77,9 +83,10 @@ static void ffts_execute_1d_real(ffts_plan_t *p, const void *vin, void *vout) float *p_out = out; #endif - size_t i; + /* we know this */ + FFTS_ASSUME(N/2 > 0); - p->plans[0]->transform(p->plans[0], vin, buf); + p->plans[0]->transform(p->plans[0], input, buf); buf[N + 0] = buf[0]; buf[N + 1] = buf[1]; @@ -138,14 +145,19 @@ static void ffts_execute_1d_real(ffts_plan_t *p, const void *vin, void *vout) out[N + 1] = 0.0f; } -static void ffts_execute_1d_real_inv(ffts_plan_t *p, const void *vin, void *vout) +static void +ffts_execute_1d_real_inv(ffts_plan_t *p, const void *input, void *output) { - float *out = (float*) vout; - float *in = (float*) vin; - float *buf = (float*) p->buf; - float *A = p->A; - float *B = p->B; - size_t N = p->N; + float *const FFTS_RESTRICT in = + (float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_16(input); + float *const FFTS_RESTRICT buf = + (float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->buf); + const float *const FFTS_RESTRICT A = + (const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->A); + const float *const FFTS_RESTRICT B = + (const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->B); + const int N = (const int) p->N; + int i; #ifdef __ARM_NEON__ float *p_buf0 = in; @@ -153,7 +165,8 @@ static void ffts_execute_1d_real_inv(ffts_plan_t *p, const void *vin, void *vout float *p_out = buf; #endif - size_t i; + /* we know this */ + FFTS_ASSUME(N/2 > 0); #ifdef __ARM_NEON__ for (i = 0; i < N/2; i += 2) { @@ -205,7 +218,7 @@ static void ffts_execute_1d_real_inv(ffts_plan_t *p, const void *vin, void *vout } #endif - p->plans[0]->transform(p->plans[0], buf, out); + p->plans[0]->transform(p->plans[0], buf, output); } ffts_plan_t* -- cgit v1.1 From 6bf4e36dd29a12136f018c208f830dbaac05f182 Mon Sep 17 00:00:00 2001 From: Jukka Ojanen Date: Mon, 6 Jul 2015 12:10:17 +0300 Subject: SSE optimized versions of ffts_execute_1d_real and ffts_execute_1d_real_inv --- src/ffts_real.c | 104 +++++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 100 insertions(+), 4 deletions(-) (limited to 'src/ffts_real.c') diff --git a/src/ffts_real.c b/src/ffts_real.c index 82a9e79..0dd24d8 100644 --- a/src/ffts_real.c +++ b/src/ffts_real.c @@ -134,10 +134,58 @@ ffts_execute_1d_real(ffts_plan_t *p, const void *input, void *output) : "memory", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); } +#elif HAVE_SSE + if (N < 8) { + for (i = 0; i < N/2; i++) { + out[2*i + 0] = + buf[ 2*i + 0] * A[2*i + 0] - buf[ 2*i + 1] * A[2*i + 1] + + buf[N - 2*i + 0] * B[2*i + 0] + buf[N - 2*i + 1] * B[2*i + 1]; + out[2*i + 1] = + buf[ 2*i + 1] * A[2*i + 0] + buf[ 2*i + 0] * A[2*i + 1] + + buf[N - 2*i + 0] * B[2*i + 1] - buf[N - 2*i + 1] * B[2*i + 0]; + } + } else { + const __m128 c0 = _mm_set_ps(0.0f, -0.0f, 0.0f, -0.0f); + __m128 t0 = _mm_load_ps(buf); + + for (i = 0; i < N; i += 8) { + __m128 t1 = _mm_load_ps(buf + i); + __m128 t2 = _mm_load_ps(buf + N - i - 4); + __m128 t3 = _mm_load_ps(A + i); + __m128 t4 = _mm_load_ps(B + i); + + _mm_store_ps(out + i, _mm_add_ps(_mm_add_ps(_mm_add_ps( + _mm_mul_ps(t1, _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(2,2,0,0))), + _mm_mul_ps(_mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,3,0,1)), + _mm_xor_ps(_mm_shuffle_ps(t3, t3, _MM_SHUFFLE(3,3,1,1)), c0))), + _mm_mul_ps(_mm_shuffle_ps(t0, t2, _MM_SHUFFLE(2,2,0,0)), t4)), + _mm_mul_ps(_mm_shuffle_ps(t0, t2, _MM_SHUFFLE(3,3,1,1)), + _mm_shuffle_ps(_mm_xor_ps(t4, c0), _mm_xor_ps(t4, c0), + _MM_SHUFFLE(2,3,0,1))))); + + t0 = _mm_load_ps(buf + N - i - 8); + t1 = _mm_load_ps(buf + i + 4); + t3 = _mm_load_ps(A + i + 4); + t4 = _mm_load_ps(B + i + 4); + + _mm_store_ps(out + i + 4, _mm_add_ps(_mm_add_ps(_mm_add_ps( + _mm_mul_ps(t1, _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(2,2,0,0))), + _mm_mul_ps(_mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,3,0,1)), + _mm_xor_ps(_mm_shuffle_ps(t3, t3, _MM_SHUFFLE(3,3,1,1)), c0))), + _mm_mul_ps(_mm_shuffle_ps(t2, t0, _MM_SHUFFLE(2,2,0,0)), t4)), + _mm_mul_ps(_mm_shuffle_ps(t2, t0, _MM_SHUFFLE(3,3,1,1)), + _mm_shuffle_ps(_mm_xor_ps(t4, c0), _mm_xor_ps(t4, c0), + _MM_SHUFFLE(2,3,0,1))))); + } + } #else for (i = 0; i < N/2; i++) { - out[2*i + 0] = buf[2*i + 0] * A[2*i] - buf[2*i + 1] * A[2*i + 1] + buf[N - 2*i] * B[2*i + 0] + buf[N - 2*i + 1] * B[2*i + 1]; - out[2*i + 1] = buf[2*i + 1] * A[2*i] + buf[2*i + 0] * A[2*i + 1] + buf[N - 2*i] * B[2*i + 1] - buf[N - 2*i + 1] * B[2*i + 0]; + out[2*i + 0] = + buf[ 2*i + 0] * A[2*i + 0] - buf[ 2*i + 1] * A[2*i + 1] + + buf[N - 2*i + 0] * B[2*i + 0] + buf[N - 2*i + 1] * B[2*i + 1]; + out[2*i + 1] = + buf[ 2*i + 1] * A[2*i + 0] + buf[ 2*i + 0] * A[2*i + 1] + + buf[N - 2*i + 0] * B[2*i + 1] - buf[N - 2*i + 1] * B[2*i + 0]; } #endif @@ -211,10 +259,58 @@ ffts_execute_1d_real_inv(ffts_plan_t *p, const void *input, void *output) : "memory", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); } +#elif HAVE_SSE + if (N < 8) { + for (i = 0; i < N/2; i++) { + buf[2*i + 0] = + in[ 2*i + 0] * A[2*i + 0] + in[ 2*i + 1] * A[2*i + 1] + + in[N - 2*i + 0] * B[2*i + 0] - in[N - 2*i + 1] * B[2*i + 1]; + buf[2*i + 1] = + in[ 2*i + 1] * A[2*i + 0] - in[ 2*i + 0] * A[2*i + 1] - + in[N - 2*i + 0] * B[2*i + 1] - in[N - 2*i + 1] * B[2*i + 0]; + } + } else { + const __m128 c0 = _mm_set_ps(-0.0f, 0.0f, -0.0f, 0.0f); + __m128 t0 = _mm_loadl_pi(_mm_setzero_ps(), (const __m64*) &in[N]); + + for (i = 0; i < N; i += 8) { + __m128 t1 = _mm_load_ps(in + i); + __m128 t2 = _mm_load_ps(in + N - i - 4); + __m128 t3 = _mm_load_ps(A + i); + __m128 t4 = _mm_load_ps(B + i); + + _mm_store_ps(buf + i, _mm_add_ps(_mm_sub_ps(_mm_add_ps( + _mm_mul_ps(t1, _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(2,2,0,0))), + _mm_mul_ps(_mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,3,0,1)), + _mm_xor_ps(_mm_shuffle_ps(t3, t3, _MM_SHUFFLE(3,3,1,1)), c0))), + _mm_mul_ps(_mm_shuffle_ps(t0, t2, _MM_SHUFFLE(3,3,1,1)), + _mm_shuffle_ps(t4, t4, _MM_SHUFFLE(2,3,0,1)))), + _mm_mul_ps(_mm_shuffle_ps(t0, t2, _MM_SHUFFLE(2,2,0,0)), + _mm_xor_ps(t4, c0)))); + + t0 = _mm_load_ps(in + N - i - 8); + t1 = _mm_load_ps(in + i + 4); + t3 = _mm_load_ps(A + i + 4); + t4 = _mm_load_ps(B + i + 4); + + _mm_store_ps(buf + i + 4, _mm_add_ps(_mm_sub_ps(_mm_add_ps( + _mm_mul_ps(t1, _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(2,2,0,0))), + _mm_mul_ps(_mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,3,0,1)), + _mm_xor_ps(_mm_shuffle_ps(t3, t3, _MM_SHUFFLE(3,3,1,1)), c0))), + _mm_mul_ps(_mm_shuffle_ps(t2, t0, _MM_SHUFFLE(3,3,1,1)), + _mm_shuffle_ps(t4, t4, _MM_SHUFFLE(2,3,0,1)))), + _mm_mul_ps(_mm_shuffle_ps(t2, t0, _MM_SHUFFLE(2,2,0,0)), + _mm_xor_ps(t4, c0)))); + } + } #else for (i = 0; i < N/2; i++) { - buf[2*i + 0] = in[2*i + 0] * A[2*i] + in[2*i + 1] * A[2*i + 1] + in[N - 2*i] * B[2*i + 0] - in[N - 2*i + 1] * B[2*i + 1]; - buf[2*i + 1] = in[2*i + 1] * A[2*i] - in[2*i + 0] * A[2*i + 1] - in[N - 2*i] * B[2*i + 1] - in[N - 2*i + 1] * B[2*i + 0]; + buf[2*i + 0] = + in[ 2*i + 0] * A[2*i + 0] + in[ 2*i + 1] * A[2*i + 1] + + in[N - 2*i + 0] * B[2*i + 0] - in[N - 2*i + 1] * B[2*i + 1]; + buf[2*i + 1] = + in[ 2*i + 1] * A[2*i + 0] - in[ 2*i + 0] * A[2*i + 1] - + in[N - 2*i + 0] * B[2*i + 1] - in[N - 2*i + 1] * B[2*i + 0]; } #endif -- cgit v1.1 From a0aa64fdb9a5fcbdf8ad3edfe3f2b1bc4e37c770 Mon Sep 17 00:00:00 2001 From: Jukka Ojanen Date: Mon, 6 Jul 2015 14:28:45 +0300 Subject: To silence warning 'possible loss of data', use explicit casting to float --- src/ffts_real.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'src/ffts_real.c') diff --git a/src/ffts_real.c b/src/ffts_real.c index 0dd24d8..f3b5126 100644 --- a/src/ffts_real.c +++ b/src/ffts_real.c @@ -361,17 +361,17 @@ ffts_init_1d_real(size_t N, int sign) if (sign < 0) { for (i = 0; i < N/2; i++) { - p->A[2 * i + 0] = 0.5 * ( 1.0 - sin(2.0 * M_PI / (double) N * (double) i)); - p->A[2 * i + 1] = 0.5 * (-1.0 * cos(2.0 * M_PI / (double) N * (double) i)); - p->B[2 * i + 0] = 0.5 * ( 1.0 + sin(2.0 * M_PI / (double) N * (double) i)); - p->B[2 * i + 1] = 0.5 * ( 1.0 * cos(2.0 * M_PI / (double) N * (double) i)); + p->A[2 * i + 0] = (float) (0.5 * ( 1.0 - sin(2.0 * M_PI / (double) N * (double) i))); + p->A[2 * i + 1] = (float) (0.5 * (-1.0 * cos(2.0 * M_PI / (double) N * (double) i))); + p->B[2 * i + 0] = (float) (0.5 * ( 1.0 + sin(2.0 * M_PI / (double) N * (double) i))); + p->B[2 * i + 1] = (float) (0.5 * ( 1.0 * cos(2.0 * M_PI / (double) N * (double) i))); } } else { for (i = 0; i < N/2; i++) { - p->A[2 * i + 0] = 1.0 * ( 1.0 - sin(2.0 * M_PI / (double) N * (double) i)); - p->A[2 * i + 1] = 1.0 * (-1.0 * cos(2.0 * M_PI / (double) N * (double) i)); - p->B[2 * i + 0] = 1.0 * ( 1.0 + sin(2.0 * M_PI / (double) N * (double) i)); - p->B[2 * i + 1] = 1.0 * ( 1.0 * cos(2.0 * M_PI / (double) N * (double) i)); + p->A[2 * i + 0] = (float) (1.0 * ( 1.0 - sin(2.0 * M_PI / (double) N * (double) i))); + p->A[2 * i + 1] = (float) (1.0 * (-1.0 * cos(2.0 * M_PI / (double) N * (double) i))); + p->B[2 * i + 0] = (float) (1.0 * ( 1.0 + sin(2.0 * M_PI / (double) N * (double) i))); + p->B[2 * i + 1] = (float) (1.0 * ( 1.0 * cos(2.0 * M_PI / (double) N * (double) i))); } } -- cgit v1.1 From ea0c10a22b233af7ef9ddd9bd6b71d3ab9208cff Mon Sep 17 00:00:00 2001 From: Jukka Ojanen Date: Tue, 7 Jul 2015 12:50:30 +0300 Subject: Add SSE3 optimized version of ffts_execute_1d_real --- src/ffts_real.c | 93 +++++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 80 insertions(+), 13 deletions(-) (limited to 'src/ffts_real.c') diff --git a/src/ffts_real.c b/src/ffts_real.c index f3b5126..f3fbaae 100644 --- a/src/ffts_real.c +++ b/src/ffts_real.c @@ -39,6 +39,18 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #elif HAVE_SSE #include + +/* check if have SSE3 intrinsics */ +#ifdef HAVE_PMMINTRIN_H +#include +#elif HAVE_INTRIN_H +#include +#else +/* avoid using negative zero as some configurations have problems with those */ +static const FFTS_ALIGN(16) unsigned int sign_mask[4] = { + 0x80000000, 0, 0x80000000, 0 +}; +#endif #endif static void @@ -88,8 +100,10 @@ ffts_execute_1d_real(ffts_plan_t *p, const void *input, void *output) p->plans[0]->transform(p->plans[0], input, buf); +#ifndef HAVE_SSE buf[N + 0] = buf[0]; buf[N + 1] = buf[1]; +#endif #ifdef __ARM_NEON__ for (i = 0; i < N/2; i += 2) { @@ -134,18 +148,67 @@ ffts_execute_1d_real(ffts_plan_t *p, const void *input, void *output) : "memory", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); } -#elif HAVE_SSE +#elif HAVE_SSE3 if (N < 8) { - for (i = 0; i < N/2; i++) { - out[2*i + 0] = - buf[ 2*i + 0] * A[2*i + 0] - buf[ 2*i + 1] * A[2*i + 1] + - buf[N - 2*i + 0] * B[2*i + 0] + buf[N - 2*i + 1] * B[2*i + 1]; - out[2*i + 1] = - buf[ 2*i + 1] * A[2*i + 0] + buf[ 2*i + 0] * A[2*i + 1] + - buf[N - 2*i + 0] * B[2*i + 1] - buf[N - 2*i + 1] * B[2*i + 0]; + const __m128 t0 = _mm_load_ps(buf); + const __m128 t1 = _mm_load_ps(A); + const __m128 t2 = _mm_load_ps(B); + + _mm_store_ps(out, _mm_add_ps(_mm_addsub_ps( + _mm_mul_ps(t0, _mm_moveldup_ps(t1)), + _mm_mul_ps(_mm_shuffle_ps(t0, t0, _MM_SHUFFLE(2,3,0,1)), + _mm_movehdup_ps(t1))), _mm_addsub_ps( + _mm_mul_ps(_mm_shuffle_ps(t0, t0, _MM_SHUFFLE(3,3,1,1)), + _mm_shuffle_ps(t2, t2, _MM_SHUFFLE(2,3,0,1))), + _mm_mul_ps(_mm_shuffle_ps(t0, t0, _MM_SHUFFLE(2,2,0,0)), t2)))); + } else { + __m128 t0 = _mm_load_ps(buf); + + for (i = 0; i < N; i += 8) { + __m128 t1 = _mm_load_ps(buf + i); + __m128 t2 = _mm_load_ps(buf + N - i - 4); + __m128 t3 = _mm_load_ps(A + i); + __m128 t4 = _mm_load_ps(B + i); + + _mm_store_ps(out + i, _mm_add_ps(_mm_addsub_ps( + _mm_mul_ps(t1, _mm_moveldup_ps(t3)), + _mm_mul_ps(_mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,3,0,1)), + _mm_movehdup_ps(t3))), _mm_addsub_ps( + _mm_mul_ps(_mm_shuffle_ps(t0, t2, _MM_SHUFFLE(3,3,1,1)), + _mm_shuffle_ps(t4, t4, _MM_SHUFFLE(2,3,0,1))), + _mm_mul_ps(_mm_shuffle_ps(t0, t2, _MM_SHUFFLE(2,2,0,0)), t4)))); + + t0 = _mm_load_ps(buf + N - i - 8); + t1 = _mm_load_ps(buf + i + 4); + t3 = _mm_load_ps(A + i + 4); + t4 = _mm_load_ps(B + i + 4); + + _mm_store_ps(out + i + 4, _mm_add_ps(_mm_addsub_ps( + _mm_mul_ps(t1, _mm_moveldup_ps(t3)), + _mm_mul_ps(_mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,3,0,1)), + _mm_movehdup_ps(t3))), _mm_addsub_ps( + _mm_mul_ps(_mm_shuffle_ps(t2, t0, _MM_SHUFFLE(3,3,1,1)), + _mm_shuffle_ps(t4, t4, _MM_SHUFFLE(2,3,0,1))), + _mm_mul_ps(_mm_shuffle_ps(t2, t0, _MM_SHUFFLE(2,2,0,0)), t4)))); } + } +#elif HAVE_SSE + if (N < 8) { + const __m128 c0 = _mm_load_ps((const float*) sign_mask); + const __m128 t0 = _mm_load_ps(buf); + const __m128 t1 = _mm_load_ps(A); + const __m128 t2 = _mm_load_ps(B); + + _mm_store_ps(out, _mm_add_ps(_mm_add_ps(_mm_add_ps( + _mm_mul_ps(t0, _mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,2,0,0))), + _mm_mul_ps(_mm_shuffle_ps(t0, t0, _MM_SHUFFLE(2,3,0,1)), + _mm_xor_ps(_mm_shuffle_ps(t1, t1, _MM_SHUFFLE(3,3,1,1)), c0))), + _mm_mul_ps(_mm_shuffle_ps(t0, t0, _MM_SHUFFLE(2,2,0,0)), t2)), + _mm_mul_ps(_mm_shuffle_ps(t0, t0, _MM_SHUFFLE(3,3,1,1)), + _mm_shuffle_ps(_mm_xor_ps(t2, c0), _mm_xor_ps(t2, c0), + _MM_SHUFFLE(2,3,0,1))))); } else { - const __m128 c0 = _mm_set_ps(0.0f, -0.0f, 0.0f, -0.0f); + const __m128 c0 = _mm_load_ps((const float*) sign_mask); __m128 t0 = _mm_load_ps(buf); for (i = 0; i < N; i += 8) { @@ -278,7 +341,7 @@ ffts_execute_1d_real_inv(ffts_plan_t *p, const void *input, void *output) __m128 t2 = _mm_load_ps(in + N - i - 4); __m128 t3 = _mm_load_ps(A + i); __m128 t4 = _mm_load_ps(B + i); - + _mm_store_ps(buf + i, _mm_add_ps(_mm_sub_ps(_mm_add_ps( _mm_mul_ps(t1, _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(2,2,0,0))), _mm_mul_ps(_mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,3,0,1)), @@ -361,10 +424,14 @@ ffts_init_1d_real(size_t N, int sign) if (sign < 0) { for (i = 0; i < N/2; i++) { - p->A[2 * i + 0] = (float) (0.5 * ( 1.0 - sin(2.0 * M_PI / (double) N * (double) i))); - p->A[2 * i + 1] = (float) (0.5 * (-1.0 * cos(2.0 * M_PI / (double) N * (double) i))); + p->A[2 * i + 0] = (float) ( 0.5 * ( 1.0 - sin(2.0 * M_PI / (double) N * (double) i))); + p->A[2 * i + 1] = (float) ( 0.5 * (-1.0 * cos(2.0 * M_PI / (double) N * (double) i))); +#ifdef HAVE_SSE3 + p->B[2 * i + 0] = (float) (-0.5 * ( 1.0 + sin(2.0 * M_PI / (double) N * (double) i))); +#else p->B[2 * i + 0] = (float) (0.5 * ( 1.0 + sin(2.0 * M_PI / (double) N * (double) i))); - p->B[2 * i + 1] = (float) (0.5 * ( 1.0 * cos(2.0 * M_PI / (double) N * (double) i))); +#endif + p->B[2 * i + 1] = (float) ( 0.5 * ( 1.0 * cos(2.0 * M_PI / (double) N * (double) i))); } } else { for (i = 0; i < N/2; i++) { -- cgit v1.1 From 95c783a2afd9a2e299812be2623fecd415bd1c41 Mon Sep 17 00:00:00 2001 From: Jukka Ojanen Date: Tue, 7 Jul 2015 19:14:22 +0300 Subject: Add SSE3 optimized version of ffts_execute_1d_real_inv --- src/ffts_real.c | 98 +++++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 78 insertions(+), 20 deletions(-) (limited to 'src/ffts_real.c') diff --git a/src/ffts_real.c b/src/ffts_real.c index f3fbaae..5c01103 100644 --- a/src/ffts_real.c +++ b/src/ffts_real.c @@ -47,9 +47,12 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #else /* avoid using negative zero as some configurations have problems with those */ -static const FFTS_ALIGN(16) unsigned int sign_mask[4] = { +static const FFTS_ALIGN(16) unsigned int sign_mask_even[4] = { 0x80000000, 0, 0x80000000, 0 }; +static const FFTS_ALIGN(16) unsigned int sign_mask_odd[4] = { + 0, 0x80000000, 0, 0x80000000 +}; #endif #endif @@ -150,9 +153,9 @@ ffts_execute_1d_real(ffts_plan_t *p, const void *input, void *output) } #elif HAVE_SSE3 if (N < 8) { - const __m128 t0 = _mm_load_ps(buf); - const __m128 t1 = _mm_load_ps(A); - const __m128 t2 = _mm_load_ps(B); + __m128 t0 = _mm_load_ps(buf); + __m128 t1 = _mm_load_ps(A); + __m128 t2 = _mm_load_ps(B); _mm_store_ps(out, _mm_add_ps(_mm_addsub_ps( _mm_mul_ps(t0, _mm_moveldup_ps(t1)), @@ -194,10 +197,10 @@ ffts_execute_1d_real(ffts_plan_t *p, const void *input, void *output) } #elif HAVE_SSE if (N < 8) { - const __m128 c0 = _mm_load_ps((const float*) sign_mask); - const __m128 t0 = _mm_load_ps(buf); - const __m128 t1 = _mm_load_ps(A); - const __m128 t2 = _mm_load_ps(B); + __m128 c0 = _mm_load_ps((const float*) sign_mask_even); + __m128 t0 = _mm_load_ps(buf); + __m128 t1 = _mm_load_ps(A); + __m128 t2 = _mm_load_ps(B); _mm_store_ps(out, _mm_add_ps(_mm_add_ps(_mm_add_ps( _mm_mul_ps(t0, _mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,2,0,0))), @@ -208,7 +211,7 @@ ffts_execute_1d_real(ffts_plan_t *p, const void *input, void *output) _mm_shuffle_ps(_mm_xor_ps(t2, c0), _mm_xor_ps(t2, c0), _MM_SHUFFLE(2,3,0,1))))); } else { - const __m128 c0 = _mm_load_ps((const float*) sign_mask); + __m128 c0 = _mm_load_ps((const float*) sign_mask_even); __m128 t0 = _mm_load_ps(buf); for (i = 0; i < N; i += 8) { @@ -322,18 +325,69 @@ ffts_execute_1d_real_inv(ffts_plan_t *p, const void *input, void *output) : "memory", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); } -#elif HAVE_SSE +#elif HAVE_SSE3 if (N < 8) { - for (i = 0; i < N/2; i++) { - buf[2*i + 0] = - in[ 2*i + 0] * A[2*i + 0] + in[ 2*i + 1] * A[2*i + 1] + - in[N - 2*i + 0] * B[2*i + 0] - in[N - 2*i + 1] * B[2*i + 1]; - buf[2*i + 1] = - in[ 2*i + 1] * A[2*i + 0] - in[ 2*i + 0] * A[2*i + 1] - - in[N - 2*i + 0] * B[2*i + 1] - in[N - 2*i + 1] * B[2*i + 0]; + __m128 t0 = _mm_loadl_pi(_mm_setzero_ps(), (const __m64*) &in[4]); + __m128 t1 = _mm_load_ps(in); + __m128 t2 = _mm_load_ps(A); + __m128 t3 = _mm_load_ps(B); + + _mm_store_ps(buf, _mm_sub_ps(_mm_addsub_ps( + _mm_mul_ps(t1, _mm_moveldup_ps(t2)), + _mm_mul_ps(_mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,3,0,1)), + _mm_movehdup_ps(t2))), _mm_addsub_ps( + _mm_mul_ps(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,3,1,1)), + _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(2,3,0,1))), + _mm_mul_ps(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,2,0,0)), t3)))); + } else { + __m128 t0 = _mm_loadl_pi(_mm_setzero_ps(), (const __m64*) &in[N]); + + for (i = 0; i < N; i += 8) { + __m128 t1 = _mm_load_ps(in + i); + __m128 t2 = _mm_load_ps(in + N - i - 4); + __m128 t3 = _mm_load_ps(A + i); + __m128 t4 = _mm_load_ps(B + i); + + _mm_store_ps(buf + i, _mm_sub_ps(_mm_addsub_ps( + _mm_mul_ps(t1, _mm_moveldup_ps(t3)), + _mm_mul_ps(_mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,3,0,1)), + _mm_movehdup_ps(t3))), _mm_addsub_ps( + _mm_mul_ps(_mm_shuffle_ps(t0, t2, _MM_SHUFFLE(3,3,1,1)), + _mm_shuffle_ps(t4, t4, _MM_SHUFFLE(2,3,0,1))), + _mm_mul_ps(_mm_shuffle_ps(t0, t2, _MM_SHUFFLE(2,2,0,0)), t4)))); + + t0 = _mm_load_ps(in + N - i - 8); + t1 = _mm_load_ps(in + i + 4); + t3 = _mm_load_ps(A + i + 4); + t4 = _mm_load_ps(B + i + 4); + + _mm_store_ps(buf + i + 4, _mm_sub_ps(_mm_addsub_ps( + _mm_mul_ps(t1, _mm_moveldup_ps(t3)), + _mm_mul_ps(_mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,3,0,1)), + _mm_movehdup_ps(t3))), _mm_addsub_ps( + _mm_mul_ps(_mm_shuffle_ps(t2, t0, _MM_SHUFFLE(3,3,1,1)), + _mm_shuffle_ps(t4, t4, _MM_SHUFFLE(2,3,0,1))), + _mm_mul_ps(_mm_shuffle_ps(t2, t0, _MM_SHUFFLE(2,2,0,0)), t4)))); } + } +#elif HAVE_SSE + if (N < 8) { + __m128 c0 = _mm_load_ps((const float*) sign_mask_odd); + __m128 t0 = _mm_loadl_pi(_mm_setzero_ps(), (const __m64*) &in[4]); + __m128 t1 = _mm_load_ps(in); + __m128 t2 = _mm_load_ps(A); + __m128 t3 = _mm_load_ps(B); + + _mm_store_ps(buf, _mm_add_ps(_mm_sub_ps(_mm_add_ps( + _mm_mul_ps(t1, _mm_shuffle_ps(t2, t2, _MM_SHUFFLE(2,2,0,0))), + _mm_mul_ps(_mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,3,0,1)), + _mm_xor_ps(_mm_shuffle_ps(t2, t2, _MM_SHUFFLE(3,3,1,1)), c0))), + _mm_mul_ps(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,3,1,1)), + _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(2,3,0,1)))), + _mm_mul_ps(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,2,0,0)), + _mm_xor_ps(t3, c0)))); } else { - const __m128 c0 = _mm_set_ps(-0.0f, 0.0f, -0.0f, 0.0f); + __m128 c0 = _mm_load_ps((const float*) sign_mask_odd); __m128 t0 = _mm_loadl_pi(_mm_setzero_ps(), (const __m64*) &in[N]); for (i = 0; i < N; i += 8) { @@ -341,7 +395,7 @@ ffts_execute_1d_real_inv(ffts_plan_t *p, const void *input, void *output) __m128 t2 = _mm_load_ps(in + N - i - 4); __m128 t3 = _mm_load_ps(A + i); __m128 t4 = _mm_load_ps(B + i); - + _mm_store_ps(buf + i, _mm_add_ps(_mm_sub_ps(_mm_add_ps( _mm_mul_ps(t1, _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(2,2,0,0))), _mm_mul_ps(_mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,3,0,1)), @@ -429,14 +483,18 @@ ffts_init_1d_real(size_t N, int sign) #ifdef HAVE_SSE3 p->B[2 * i + 0] = (float) (-0.5 * ( 1.0 + sin(2.0 * M_PI / (double) N * (double) i))); #else - p->B[2 * i + 0] = (float) (0.5 * ( 1.0 + sin(2.0 * M_PI / (double) N * (double) i))); + p->B[2 * i + 0] = (float) ( 0.5 * ( 1.0 + sin(2.0 * M_PI / (double) N * (double) i))); #endif p->B[2 * i + 1] = (float) ( 0.5 * ( 1.0 * cos(2.0 * M_PI / (double) N * (double) i))); } } else { for (i = 0; i < N/2; i++) { p->A[2 * i + 0] = (float) (1.0 * ( 1.0 - sin(2.0 * M_PI / (double) N * (double) i))); +#ifdef HAVE_SSE3 + p->A[2 * i + 1] = (float) (1.0 * ( 1.0 * cos(2.0 * M_PI / (double) N * (double) i))); +#else p->A[2 * i + 1] = (float) (1.0 * (-1.0 * cos(2.0 * M_PI / (double) N * (double) i))); +#endif p->B[2 * i + 0] = (float) (1.0 * ( 1.0 + sin(2.0 * M_PI / (double) N * (double) i))); p->B[2 * i + 1] = (float) (1.0 * ( 1.0 * cos(2.0 * M_PI / (double) N * (double) i))); } -- cgit v1.1 From ed8a12ca33ffa69604bc261e65a17ea6c04fbeb8 Mon Sep 17 00:00:00 2001 From: Jukka Ojanen Date: Wed, 8 Jul 2015 16:38:58 +0300 Subject: Half the number of calls to sin/cos functions in ffts_init_1d_real --- src/ffts_real.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 68 insertions(+), 12 deletions(-) (limited to 'src/ffts_real.c') diff --git a/src/ffts_real.c b/src/ffts_real.c index 5c01103..a737696 100644 --- a/src/ffts_real.c +++ b/src/ffts_real.c @@ -477,27 +477,83 @@ ffts_init_1d_real(size_t N, int sign) } if (sign < 0) { - for (i = 0; i < N/2; i++) { - p->A[2 * i + 0] = (float) ( 0.5 * ( 1.0 - sin(2.0 * M_PI / (double) N * (double) i))); - p->A[2 * i + 1] = (float) ( 0.5 * (-1.0 * cos(2.0 * M_PI / (double) N * (double) i))); + /* peel off the first */ + p->A[0] = 0.5f; + p->A[1] = -0.5f; #ifdef HAVE_SSE3 - p->B[2 * i + 0] = (float) (-0.5 * ( 1.0 + sin(2.0 * M_PI / (double) N * (double) i))); + p->B[0] = -0.5f; #else - p->B[2 * i + 0] = (float) ( 0.5 * ( 1.0 + sin(2.0 * M_PI / (double) N * (double) i))); + p->B[0] = 0.5f; +#endif + p->B[1] = 0.5f; + + for (i = 1; i < N/4; i++) { + float t0 = (float) (0.5 * (1.0 - sin(2.0 * M_PI / N * i))); + float t1 = (float) (0.5 * (1.0 * cos(2.0 * M_PI / N * i))); + float t2 = (float) (0.5 * (1.0 + sin(2.0 * M_PI / N * i))); + + p->A[ 2 * i + 0] = t0; + p->A[N - 2 * i + 0] = t0; + p->A[ 2 * i + 1] = -t1; + p->A[N - 2 * i + 1] = t1; + +#ifdef HAVE_SSE3 + p->B[ 2 * i + 0] = -t2; + p->B[N - 2 * i + 0] = -t2; +#else + p->B[ 2 * i + 0] = t2; + p->B[N - 2 * i + 0] = t2; #endif - p->B[2 * i + 1] = (float) ( 0.5 * ( 1.0 * cos(2.0 * M_PI / (double) N * (double) i))); + p->B[ 2 * i + 1] = t1; + p->B[N - 2 * i + 1] = -t1; } + + /* and the last */ + p->A[2 * i + 0] = 0.0f; + p->A[2 * i + 1] = 0.0f; +#ifdef HAVE_SSE3 + p->B[2 * i + 0] = -1.0f; +#else + p->B[2 * i + 0] = 1.0f; +#endif + p->B[2 * i + 1] = 0.0f; } else { - for (i = 0; i < N/2; i++) { - p->A[2 * i + 0] = (float) (1.0 * ( 1.0 - sin(2.0 * M_PI / (double) N * (double) i))); + /* peel of the first */ + p->A[0] = 1.0f; +#ifdef HAVE_SSE3 + p->A[1] = 1.0f; +#else + p->A[1] = -1.0f; +#endif + p->B[0] = 1.0f; + p->B[1] = 1.0f; + + for (i = 1; i < N/4; i++) { + float t0 = (float) (1.0 * (1.0 - sin(2.0 * M_PI / N * i))); + float t1 = (float) (1.0 * (1.0 * cos(2.0 * M_PI / N * i))); + float t2 = (float) (1.0 * (1.0 + sin(2.0 * M_PI / N * i))); + + p->A[ 2 * i + 0] = t0; + p->A[N - 2 * i + 0] = t0; #ifdef HAVE_SSE3 - p->A[2 * i + 1] = (float) (1.0 * ( 1.0 * cos(2.0 * M_PI / (double) N * (double) i))); + p->A[ 2 * i + 1] = t1; + p->A[N - 2 * i + 1] = -t1; #else - p->A[2 * i + 1] = (float) (1.0 * (-1.0 * cos(2.0 * M_PI / (double) N * (double) i))); + p->A[ 2 * i + 1] = -t1; + p->A[N - 2 * i + 1] = t1; #endif - p->B[2 * i + 0] = (float) (1.0 * ( 1.0 + sin(2.0 * M_PI / (double) N * (double) i))); - p->B[2 * i + 1] = (float) (1.0 * ( 1.0 * cos(2.0 * M_PI / (double) N * (double) i))); + + p->B[ 2 * i + 0] = t2; + p->B[N - 2 * i + 0] = t2; + p->B[ 2 * i + 1] = t1; + p->B[N - 2 * i + 1] = -t1; } + + /* and the last */ + p->A[2 * i + 0] = 0.0f; + p->A[2 * i + 1] = 0.0f; + p->B[2 * i + 0] = 2.0f; + p->B[2 * i + 1] = 0.0f; } return p; -- cgit v1.1 From 7e018bb933d5291155739614e422773c4c2d8781 Mon Sep 17 00:00:00 2001 From: Jukka Ojanen Date: Thu, 9 Jul 2015 15:37:53 +0300 Subject: Unroll loops to process 64 byte cache line per iteration --- src/ffts_real.c | 244 +++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 205 insertions(+), 39 deletions(-) (limited to 'src/ffts_real.c') diff --git a/src/ffts_real.c b/src/ffts_real.c index a737696..0327f15 100644 --- a/src/ffts_real.c +++ b/src/ffts_real.c @@ -152,22 +152,36 @@ ffts_execute_1d_real(ffts_plan_t *p, const void *input, void *output) ); } #elif HAVE_SSE3 - if (N < 8) { + if (FFTS_UNLIKELY(N <= 8)) { __m128 t0 = _mm_load_ps(buf); - __m128 t1 = _mm_load_ps(A); - __m128 t2 = _mm_load_ps(B); + __m128 t1 = _mm_load_ps(buf + N - 4); + __m128 t2 = _mm_load_ps(A); + __m128 t3 = _mm_load_ps(B); _mm_store_ps(out, _mm_add_ps(_mm_addsub_ps( - _mm_mul_ps(t0, _mm_moveldup_ps(t1)), + _mm_mul_ps(t0, _mm_moveldup_ps(t2)), _mm_mul_ps(_mm_shuffle_ps(t0, t0, _MM_SHUFFLE(2,3,0,1)), - _mm_movehdup_ps(t1))), _mm_addsub_ps( - _mm_mul_ps(_mm_shuffle_ps(t0, t0, _MM_SHUFFLE(3,3,1,1)), - _mm_shuffle_ps(t2, t2, _MM_SHUFFLE(2,3,0,1))), - _mm_mul_ps(_mm_shuffle_ps(t0, t0, _MM_SHUFFLE(2,2,0,0)), t2)))); + _mm_movehdup_ps(t2))), _mm_addsub_ps( + _mm_mul_ps(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,3,1,1)), + _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(2,3,0,1))), + _mm_mul_ps(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,2,0,0)), t3)))); + + if (N == 8) { + t2 = _mm_load_ps(A + 4); + t3 = _mm_load_ps(B + 4); + + _mm_store_ps(out + 4, _mm_add_ps(_mm_addsub_ps( + _mm_mul_ps(t1, _mm_moveldup_ps(t2)), + _mm_mul_ps(_mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,3,0,1)), + _mm_movehdup_ps(t2))), _mm_addsub_ps( + _mm_mul_ps(_mm_shuffle_ps(t1, t0, _MM_SHUFFLE(3,3,1,1)), + _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(2,3,0,1))), + _mm_mul_ps(_mm_shuffle_ps(t1, t0, _MM_SHUFFLE(2,2,0,0)), t3)))); + } } else { __m128 t0 = _mm_load_ps(buf); - for (i = 0; i < N; i += 8) { + for (i = 0; i < N; i += 16) { __m128 t1 = _mm_load_ps(buf + i); __m128 t2 = _mm_load_ps(buf + N - i - 4); __m128 t3 = _mm_load_ps(A + i); @@ -193,28 +207,69 @@ ffts_execute_1d_real(ffts_plan_t *p, const void *input, void *output) _mm_mul_ps(_mm_shuffle_ps(t2, t0, _MM_SHUFFLE(3,3,1,1)), _mm_shuffle_ps(t4, t4, _MM_SHUFFLE(2,3,0,1))), _mm_mul_ps(_mm_shuffle_ps(t2, t0, _MM_SHUFFLE(2,2,0,0)), t4)))); + + t1 = _mm_load_ps(buf + i + 8); + t2 = _mm_load_ps(buf + N - i - 12); + t3 = _mm_load_ps(A + i + 8); + t4 = _mm_load_ps(B + i + 8); + + _mm_store_ps(out + i + 8, _mm_add_ps(_mm_addsub_ps( + _mm_mul_ps(t1, _mm_moveldup_ps(t3)), + _mm_mul_ps(_mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,3,0,1)), + _mm_movehdup_ps(t3))), _mm_addsub_ps( + _mm_mul_ps(_mm_shuffle_ps(t0, t2, _MM_SHUFFLE(3,3,1,1)), + _mm_shuffle_ps(t4, t4, _MM_SHUFFLE(2,3,0,1))), + _mm_mul_ps(_mm_shuffle_ps(t0, t2, _MM_SHUFFLE(2,2,0,0)), t4)))); + + t0 = _mm_load_ps(buf + N - i - 16); + t1 = _mm_load_ps(buf + i + 12); + t3 = _mm_load_ps(A + i + 12); + t4 = _mm_load_ps(B + i + 12); + + _mm_store_ps(out + i + 12, _mm_add_ps(_mm_addsub_ps( + _mm_mul_ps(t1, _mm_moveldup_ps(t3)), + _mm_mul_ps(_mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,3,0,1)), + _mm_movehdup_ps(t3))), _mm_addsub_ps( + _mm_mul_ps(_mm_shuffle_ps(t2, t0, _MM_SHUFFLE(3,3,1,1)), + _mm_shuffle_ps(t4, t4, _MM_SHUFFLE(2,3,0,1))), + _mm_mul_ps(_mm_shuffle_ps(t2, t0, _MM_SHUFFLE(2,2,0,0)), t4)))); } } #elif HAVE_SSE - if (N < 8) { + if (FFTS_UNLIKELY(N <= 8)) { __m128 c0 = _mm_load_ps((const float*) sign_mask_even); __m128 t0 = _mm_load_ps(buf); - __m128 t1 = _mm_load_ps(A); - __m128 t2 = _mm_load_ps(B); + __m128 t1 = _mm_load_ps(buf + N - 4); + __m128 t2 = _mm_load_ps(A); + __m128 t3 = _mm_load_ps(B); _mm_store_ps(out, _mm_add_ps(_mm_add_ps(_mm_add_ps( - _mm_mul_ps(t0, _mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,2,0,0))), + _mm_mul_ps(t0, _mm_shuffle_ps(t2, t2, _MM_SHUFFLE(2,2,0,0))), _mm_mul_ps(_mm_shuffle_ps(t0, t0, _MM_SHUFFLE(2,3,0,1)), - _mm_xor_ps(_mm_shuffle_ps(t1, t1, _MM_SHUFFLE(3,3,1,1)), c0))), - _mm_mul_ps(_mm_shuffle_ps(t0, t0, _MM_SHUFFLE(2,2,0,0)), t2)), - _mm_mul_ps(_mm_shuffle_ps(t0, t0, _MM_SHUFFLE(3,3,1,1)), - _mm_shuffle_ps(_mm_xor_ps(t2, c0), _mm_xor_ps(t2, c0), + _mm_xor_ps(_mm_shuffle_ps(t2, t2, _MM_SHUFFLE(3,3,1,1)), c0))), + _mm_mul_ps(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,2,0,0)), t3)), + _mm_mul_ps(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,3,1,1)), + _mm_shuffle_ps(_mm_xor_ps(t3, c0), _mm_xor_ps(t3, c0), _MM_SHUFFLE(2,3,0,1))))); + + if (N == 8) { + t2 = _mm_load_ps(A + 4); + t3 = _mm_load_ps(B + 4); + + _mm_store_ps(out + 4, _mm_add_ps(_mm_add_ps(_mm_add_ps( + _mm_mul_ps(t1, _mm_shuffle_ps(t2, t2, _MM_SHUFFLE(2,2,0,0))), + _mm_mul_ps(_mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,3,0,1)), + _mm_xor_ps(_mm_shuffle_ps(t2, t2, _MM_SHUFFLE(3,3,1,1)), c0))), + _mm_mul_ps(_mm_shuffle_ps(t1, t0, _MM_SHUFFLE(2,2,0,0)), t3)), + _mm_mul_ps(_mm_shuffle_ps(t1, t0, _MM_SHUFFLE(3,3,1,1)), + _mm_shuffle_ps(_mm_xor_ps(t3, c0), _mm_xor_ps(t3, c0), + _MM_SHUFFLE(2,3,0,1))))); + } } else { __m128 c0 = _mm_load_ps((const float*) sign_mask_even); __m128 t0 = _mm_load_ps(buf); - for (i = 0; i < N; i += 8) { + for (i = 0; i < N; i += 16) { __m128 t1 = _mm_load_ps(buf + i); __m128 t2 = _mm_load_ps(buf + N - i - 4); __m128 t3 = _mm_load_ps(A + i); @@ -242,6 +297,34 @@ ffts_execute_1d_real(ffts_plan_t *p, const void *input, void *output) _mm_mul_ps(_mm_shuffle_ps(t2, t0, _MM_SHUFFLE(3,3,1,1)), _mm_shuffle_ps(_mm_xor_ps(t4, c0), _mm_xor_ps(t4, c0), _MM_SHUFFLE(2,3,0,1))))); + + t1 = _mm_load_ps(buf + i + 8); + t2 = _mm_load_ps(buf + N - i - 12); + t3 = _mm_load_ps(A + i + 8); + t4 = _mm_load_ps(B + i + 8); + + _mm_store_ps(out + i + 8, _mm_add_ps(_mm_add_ps(_mm_add_ps( + _mm_mul_ps(t1, _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(2,2,0,0))), + _mm_mul_ps(_mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,3,0,1)), + _mm_xor_ps(_mm_shuffle_ps(t3, t3, _MM_SHUFFLE(3,3,1,1)), c0))), + _mm_mul_ps(_mm_shuffle_ps(t0, t2, _MM_SHUFFLE(2,2,0,0)), t4)), + _mm_mul_ps(_mm_shuffle_ps(t0, t2, _MM_SHUFFLE(3,3,1,1)), + _mm_shuffle_ps(_mm_xor_ps(t4, c0), _mm_xor_ps(t4, c0), + _MM_SHUFFLE(2,3,0,1))))); + + t0 = _mm_load_ps(buf + N - i - 16); + t1 = _mm_load_ps(buf + i + 12); + t3 = _mm_load_ps(A + i + 12); + t4 = _mm_load_ps(B + i + 12); + + _mm_store_ps(out + i + 12, _mm_add_ps(_mm_add_ps(_mm_add_ps( + _mm_mul_ps(t1, _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(2,2,0,0))), + _mm_mul_ps(_mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,3,0,1)), + _mm_xor_ps(_mm_shuffle_ps(t3, t3, _MM_SHUFFLE(3,3,1,1)), c0))), + _mm_mul_ps(_mm_shuffle_ps(t2, t0, _MM_SHUFFLE(2,2,0,0)), t4)), + _mm_mul_ps(_mm_shuffle_ps(t2, t0, _MM_SHUFFLE(3,3,1,1)), + _mm_shuffle_ps(_mm_xor_ps(t4, c0), _mm_xor_ps(t4, c0), + _MM_SHUFFLE(2,3,0,1))))); } } #else @@ -326,23 +409,37 @@ ffts_execute_1d_real_inv(ffts_plan_t *p, const void *input, void *output) ); } #elif HAVE_SSE3 - if (N < 8) { - __m128 t0 = _mm_loadl_pi(_mm_setzero_ps(), (const __m64*) &in[4]); + if (FFTS_UNLIKELY(N <= 8)) { + __m128 t0 = _mm_loadl_pi(_mm_setzero_ps(), (const __m64*) &in[N]); __m128 t1 = _mm_load_ps(in); - __m128 t2 = _mm_load_ps(A); - __m128 t3 = _mm_load_ps(B); + __m128 t2 = _mm_load_ps(in + N - 4); + __m128 t3 = _mm_load_ps(A); + __m128 t4 = _mm_load_ps(B); _mm_store_ps(buf, _mm_sub_ps(_mm_addsub_ps( - _mm_mul_ps(t1, _mm_moveldup_ps(t2)), + _mm_mul_ps(t1, _mm_moveldup_ps(t3)), _mm_mul_ps(_mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,3,0,1)), - _mm_movehdup_ps(t2))), _mm_addsub_ps( - _mm_mul_ps(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,3,1,1)), - _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(2,3,0,1))), - _mm_mul_ps(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,2,0,0)), t3)))); + _mm_movehdup_ps(t3))), _mm_addsub_ps( + _mm_mul_ps(_mm_shuffle_ps(t0, t2, _MM_SHUFFLE(3,3,1,1)), + _mm_shuffle_ps(t4, t4, _MM_SHUFFLE(2,3,0,1))), + _mm_mul_ps(_mm_shuffle_ps(t0, t2, _MM_SHUFFLE(2,2,0,0)), t4)))); + + if (N == 8) { + t3 = _mm_load_ps(A + 4); + t4 = _mm_load_ps(B + 4); + + _mm_store_ps(buf + 4, _mm_sub_ps(_mm_addsub_ps( + _mm_mul_ps(t2, _mm_moveldup_ps(t3)), + _mm_mul_ps(_mm_shuffle_ps(t2, t2, _MM_SHUFFLE(2,3,0,1)), + _mm_movehdup_ps(t3))), _mm_addsub_ps( + _mm_mul_ps(_mm_shuffle_ps(t2, t1, _MM_SHUFFLE(3,3,1,1)), + _mm_shuffle_ps(t4, t4, _MM_SHUFFLE(2,3,0,1))), + _mm_mul_ps(_mm_shuffle_ps(t2, t1, _MM_SHUFFLE(2,2,0,0)), t4)))); + } } else { __m128 t0 = _mm_loadl_pi(_mm_setzero_ps(), (const __m64*) &in[N]); - for (i = 0; i < N; i += 8) { + for (i = 0; i < N; i += 16) { __m128 t1 = _mm_load_ps(in + i); __m128 t2 = _mm_load_ps(in + N - i - 4); __m128 t3 = _mm_load_ps(A + i); @@ -368,29 +465,70 @@ ffts_execute_1d_real_inv(ffts_plan_t *p, const void *input, void *output) _mm_mul_ps(_mm_shuffle_ps(t2, t0, _MM_SHUFFLE(3,3,1,1)), _mm_shuffle_ps(t4, t4, _MM_SHUFFLE(2,3,0,1))), _mm_mul_ps(_mm_shuffle_ps(t2, t0, _MM_SHUFFLE(2,2,0,0)), t4)))); + + t1 = _mm_load_ps(in + i + 8); + t2 = _mm_load_ps(in + N - i - 12); + t3 = _mm_load_ps(A + i + 8); + t4 = _mm_load_ps(B + i + 8); + + _mm_store_ps(buf + i + 8, _mm_sub_ps(_mm_addsub_ps( + _mm_mul_ps(t1, _mm_moveldup_ps(t3)), + _mm_mul_ps(_mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,3,0,1)), + _mm_movehdup_ps(t3))), _mm_addsub_ps( + _mm_mul_ps(_mm_shuffle_ps(t0, t2, _MM_SHUFFLE(3,3,1,1)), + _mm_shuffle_ps(t4, t4, _MM_SHUFFLE(2,3,0,1))), + _mm_mul_ps(_mm_shuffle_ps(t0, t2, _MM_SHUFFLE(2,2,0,0)), t4)))); + + t0 = _mm_load_ps(in + N - i - 16); + t1 = _mm_load_ps(in + i + 12); + t3 = _mm_load_ps(A + i + 12); + t4 = _mm_load_ps(B + i + 12); + + _mm_store_ps(buf + i + 12, _mm_sub_ps(_mm_addsub_ps( + _mm_mul_ps(t1, _mm_moveldup_ps(t3)), + _mm_mul_ps(_mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,3,0,1)), + _mm_movehdup_ps(t3))), _mm_addsub_ps( + _mm_mul_ps(_mm_shuffle_ps(t2, t0, _MM_SHUFFLE(3,3,1,1)), + _mm_shuffle_ps(t4, t4, _MM_SHUFFLE(2,3,0,1))), + _mm_mul_ps(_mm_shuffle_ps(t2, t0, _MM_SHUFFLE(2,2,0,0)), t4)))); } } #elif HAVE_SSE - if (N < 8) { + if (FFTS_UNLIKELY(N <= 8)) { __m128 c0 = _mm_load_ps((const float*) sign_mask_odd); - __m128 t0 = _mm_loadl_pi(_mm_setzero_ps(), (const __m64*) &in[4]); + __m128 t0 = _mm_loadl_pi(_mm_setzero_ps(), (const __m64*) &in[N]); __m128 t1 = _mm_load_ps(in); - __m128 t2 = _mm_load_ps(A); - __m128 t3 = _mm_load_ps(B); + __m128 t2 = _mm_load_ps(in + N - 4); + __m128 t3 = _mm_load_ps(A); + __m128 t4 = _mm_load_ps(B); _mm_store_ps(buf, _mm_add_ps(_mm_sub_ps(_mm_add_ps( - _mm_mul_ps(t1, _mm_shuffle_ps(t2, t2, _MM_SHUFFLE(2,2,0,0))), + _mm_mul_ps(t1, _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(2,2,0,0))), _mm_mul_ps(_mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,3,0,1)), - _mm_xor_ps(_mm_shuffle_ps(t2, t2, _MM_SHUFFLE(3,3,1,1)), c0))), - _mm_mul_ps(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,3,1,1)), - _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(2,3,0,1)))), - _mm_mul_ps(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,2,0,0)), - _mm_xor_ps(t3, c0)))); + _mm_xor_ps(_mm_shuffle_ps(t3, t3, _MM_SHUFFLE(3,3,1,1)), c0))), + _mm_mul_ps(_mm_shuffle_ps(t0, t2, _MM_SHUFFLE(3,3,1,1)), + _mm_shuffle_ps(t4, t4, _MM_SHUFFLE(2,3,0,1)))), + _mm_mul_ps(_mm_shuffle_ps(t0, t2, _MM_SHUFFLE(2,2,0,0)), + _mm_xor_ps(t4, c0)))); + + if (N == 8) { + t3 = _mm_load_ps(A + 4); + t4 = _mm_load_ps(B + 4); + + _mm_store_ps(buf + 4, _mm_add_ps(_mm_sub_ps(_mm_add_ps( + _mm_mul_ps(t2, _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(2,2,0,0))), + _mm_mul_ps(_mm_shuffle_ps(t2, t2, _MM_SHUFFLE(2,3,0,1)), + _mm_xor_ps(_mm_shuffle_ps(t3, t3, _MM_SHUFFLE(3,3,1,1)), c0))), + _mm_mul_ps(_mm_shuffle_ps(t2, t1, _MM_SHUFFLE(3,3,1,1)), + _mm_shuffle_ps(t4, t4, _MM_SHUFFLE(2,3,0,1)))), + _mm_mul_ps(_mm_shuffle_ps(t2, t1, _MM_SHUFFLE(2,2,0,0)), + _mm_xor_ps(t4, c0)))); + } } else { __m128 c0 = _mm_load_ps((const float*) sign_mask_odd); __m128 t0 = _mm_loadl_pi(_mm_setzero_ps(), (const __m64*) &in[N]); - for (i = 0; i < N; i += 8) { + for (i = 0; i < N; i += 16) { __m128 t1 = _mm_load_ps(in + i); __m128 t2 = _mm_load_ps(in + N - i - 4); __m128 t3 = _mm_load_ps(A + i); @@ -418,6 +556,34 @@ ffts_execute_1d_real_inv(ffts_plan_t *p, const void *input, void *output) _mm_shuffle_ps(t4, t4, _MM_SHUFFLE(2,3,0,1)))), _mm_mul_ps(_mm_shuffle_ps(t2, t0, _MM_SHUFFLE(2,2,0,0)), _mm_xor_ps(t4, c0)))); + + t1 = _mm_load_ps(in + i + 8); + t2 = _mm_load_ps(in + N - i - 12); + t3 = _mm_load_ps(A + i + 8); + t4 = _mm_load_ps(B + i + 8); + + _mm_store_ps(buf + i + 8, _mm_add_ps(_mm_sub_ps(_mm_add_ps( + _mm_mul_ps(t1, _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(2,2,0,0))), + _mm_mul_ps(_mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,3,0,1)), + _mm_xor_ps(_mm_shuffle_ps(t3, t3, _MM_SHUFFLE(3,3,1,1)), c0))), + _mm_mul_ps(_mm_shuffle_ps(t0, t2, _MM_SHUFFLE(3,3,1,1)), + _mm_shuffle_ps(t4, t4, _MM_SHUFFLE(2,3,0,1)))), + _mm_mul_ps(_mm_shuffle_ps(t0, t2, _MM_SHUFFLE(2,2,0,0)), + _mm_xor_ps(t4, c0)))); + + t0 = _mm_load_ps(in + N - i - 16); + t1 = _mm_load_ps(in + i + 12); + t3 = _mm_load_ps(A + i + 12); + t4 = _mm_load_ps(B + i + 12); + + _mm_store_ps(buf + i + 12, _mm_add_ps(_mm_sub_ps(_mm_add_ps( + _mm_mul_ps(t1, _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(2,2,0,0))), + _mm_mul_ps(_mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,3,0,1)), + _mm_xor_ps(_mm_shuffle_ps(t3, t3, _MM_SHUFFLE(3,3,1,1)), c0))), + _mm_mul_ps(_mm_shuffle_ps(t2, t0, _MM_SHUFFLE(3,3,1,1)), + _mm_shuffle_ps(t4, t4, _MM_SHUFFLE(2,3,0,1)))), + _mm_mul_ps(_mm_shuffle_ps(t2, t0, _MM_SHUFFLE(2,2,0,0)), + _mm_xor_ps(t4, c0)))); } } #else -- cgit v1.1 From 9885d87c6335d5b688bdf6b90e78de1add605d63 Mon Sep 17 00:00:00 2001 From: Jukka Ojanen Date: Tue, 14 Jul 2015 17:08:14 +0300 Subject: Move trigonometric stuff to separate file. Implemented Oscar Buneman's method for generating a sequence of sines and cosines. --- src/ffts_real.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'src/ffts_real.c') diff --git a/src/ffts_real.c b/src/ffts_real.c index 0327f15..f1355c7 100644 --- a/src/ffts_real.c +++ b/src/ffts_real.c @@ -650,7 +650,7 @@ ffts_init_1d_real(size_t N, int sign) p->B[0] = -0.5f; #else p->B[0] = 0.5f; -#endif +#endif p->B[1] = 0.5f; for (i = 1; i < N/4; i++) { -- cgit v1.1 From f571c435ceccc56e79c024f626a91c57f52d94ff Mon Sep 17 00:00:00 2001 From: Jukka Ojanen Date: Tue, 14 Jul 2015 23:51:35 +0300 Subject: FFTS is no longer depended on any other math library, and this should help to verify its numerical accuracy. --- src/ffts_real.c | 80 +++------------------------------------------------------ 1 file changed, 3 insertions(+), 77 deletions(-) (limited to 'src/ffts_real.c') diff --git a/src/ffts_real.c b/src/ffts_real.c index f1355c7..f6e6127 100644 --- a/src/ffts_real.c +++ b/src/ffts_real.c @@ -34,6 +34,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "ffts_real.h" #include "ffts_internal.h" +#include "ffts_trig.h" #ifdef HAVE_NEON #include @@ -604,7 +605,6 @@ ffts_plan_t* ffts_init_1d_real(size_t N, int sign) { ffts_plan_t *p; - size_t i; p = (ffts_plan_t*) calloc(1, sizeof(*p) + sizeof(*p->plans)); if (!p) { @@ -642,85 +642,11 @@ ffts_init_1d_real(size_t N, int sign) goto cleanup; } - if (sign < 0) { - /* peel off the first */ - p->A[0] = 0.5f; - p->A[1] = -0.5f; -#ifdef HAVE_SSE3 - p->B[0] = -0.5f; -#else - p->B[0] = 0.5f; -#endif - p->B[1] = 0.5f; - - for (i = 1; i < N/4; i++) { - float t0 = (float) (0.5 * (1.0 - sin(2.0 * M_PI / N * i))); - float t1 = (float) (0.5 * (1.0 * cos(2.0 * M_PI / N * i))); - float t2 = (float) (0.5 * (1.0 + sin(2.0 * M_PI / N * i))); - - p->A[ 2 * i + 0] = t0; - p->A[N - 2 * i + 0] = t0; - p->A[ 2 * i + 1] = -t1; - p->A[N - 2 * i + 1] = t1; - #ifdef HAVE_SSE3 - p->B[ 2 * i + 0] = -t2; - p->B[N - 2 * i + 0] = -t2; + ffts_generate_table_1d_real_32f(p, sign, 1); #else - p->B[ 2 * i + 0] = t2; - p->B[N - 2 * i + 0] = t2; + ffts_generate_table_1d_real_32f(p, sign, 0); #endif - p->B[ 2 * i + 1] = t1; - p->B[N - 2 * i + 1] = -t1; - } - - /* and the last */ - p->A[2 * i + 0] = 0.0f; - p->A[2 * i + 1] = 0.0f; -#ifdef HAVE_SSE3 - p->B[2 * i + 0] = -1.0f; -#else - p->B[2 * i + 0] = 1.0f; -#endif - p->B[2 * i + 1] = 0.0f; - } else { - /* peel of the first */ - p->A[0] = 1.0f; -#ifdef HAVE_SSE3 - p->A[1] = 1.0f; -#else - p->A[1] = -1.0f; -#endif - p->B[0] = 1.0f; - p->B[1] = 1.0f; - - for (i = 1; i < N/4; i++) { - float t0 = (float) (1.0 * (1.0 - sin(2.0 * M_PI / N * i))); - float t1 = (float) (1.0 * (1.0 * cos(2.0 * M_PI / N * i))); - float t2 = (float) (1.0 * (1.0 + sin(2.0 * M_PI / N * i))); - - p->A[ 2 * i + 0] = t0; - p->A[N - 2 * i + 0] = t0; -#ifdef HAVE_SSE3 - p->A[ 2 * i + 1] = t1; - p->A[N - 2 * i + 1] = -t1; -#else - p->A[ 2 * i + 1] = -t1; - p->A[N - 2 * i + 1] = t1; -#endif - - p->B[ 2 * i + 0] = t2; - p->B[N - 2 * i + 0] = t2; - p->B[ 2 * i + 1] = t1; - p->B[N - 2 * i + 1] = -t1; - } - - /* and the last */ - p->A[2 * i + 0] = 0.0f; - p->A[2 * i + 1] = 0.0f; - p->B[2 * i + 0] = 2.0f; - p->B[2 * i + 1] = 0.0f; - } return p; -- cgit v1.1 From cb35f8927bc8c6992d41efcc3b972f2d8ee318dc Mon Sep 17 00:00:00 2001 From: Jukka Ojanen Date: Thu, 16 Jul 2015 10:45:32 +0300 Subject: Define [pa] and [pb] as constant input variables, not writable outputs --- src/ffts_real.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'src/ffts_real.c') diff --git a/src/ffts_real.c b/src/ffts_real.c index f6e6127..6650d07 100644 --- a/src/ffts_real.c +++ b/src/ffts_real.c @@ -110,7 +110,7 @@ ffts_execute_1d_real(ffts_plan_t *p, const void *input, void *output) #endif #ifdef __ARM_NEON__ - for (i = 0; i < N/2; i += 2) { + for (i = 0; i < N; i += 4) { __asm__ __volatile__ ( "vld1.32 {q8}, [%[pa]]!\n\t" "vld1.32 {q9}, [%[pb]]!\n\t" @@ -146,9 +146,8 @@ ffts_execute_1d_real(ffts_plan_t *p, const void *input, void *output) "vadd.f32 q13, q13, q15\n\t" "vadd.f32 q12, q12, q13\n\t" "vst1.32 {q12}, [%[pout]]!\n\t" - : [pa] "+r" (A), [pb] "+r" (B), [buf0] "+r" (p_buf0), [buf1] "+r" (p_buf1), - [pout] "+r" (p_out) - : + : [buf0] "+r" (p_buf0), [buf1] "+r" (p_buf1), [pout] "+r" (p_out) + : [pa] "r" (A), [pb] "r" (B) : "memory", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); } @@ -403,9 +402,8 @@ ffts_execute_1d_real_inv(ffts_plan_t *p, const void *input, void *output) "vsub.f32 q13, q13, q15\n\t" "vadd.f32 q12, q12, q13\n\t" "vst1.32 {q12}, [%[pout]]!\n\t" - : [pa] "+r" (A), [pb] "+r" (B), [buf0] "+r" (p_buf0), [buf1] "+r" (p_buf1), - [pout] "+r" (p_out) - : + : [buf0] "+r" (p_buf0), [buf1] "+r" (p_buf1), [pout] "+r" (p_out) + : [pa] "r" (A), [pb] "r" (B) : "memory", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); } -- cgit v1.1 From ae1b59ddd07cb66b0807bc2c7c981ce96c69acea Mon Sep 17 00:00:00 2001 From: Jukka Ojanen Date: Mon, 30 Nov 2015 17:16:01 +0200 Subject: Enable building shared library and start version numbering from 0.9.0. On Windows when using FFTS as a DLL, define FFTS_SHARED. This is not mandatory, but it offers a little performance increase. Hide symbols when possible to improve compiler optimization and sizeof binary. Use CMake target alias "ffts" to choose between static and shared library, preferring static --- src/ffts_real.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'src/ffts_real.c') diff --git a/src/ffts_real.c b/src/ffts_real.c index 6650d07..7f41069 100644 --- a/src/ffts_real.c +++ b/src/ffts_real.c @@ -599,7 +599,7 @@ ffts_execute_1d_real_inv(ffts_plan_t *p, const void *input, void *output) p->plans[0]->transform(p->plans[0], buf, output); } -ffts_plan_t* +FFTS_API ffts_plan_t* ffts_init_1d_real(size_t N, int sign) { ffts_plan_t *p; -- cgit v1.1 From 8b2d55e5d6bd43eb45ca7da1595fccc401a22158 Mon Sep 17 00:00:00 2001 From: Jukka Ojanen Date: Tue, 5 Apr 2016 17:52:13 +0300 Subject: Combine ffts_tranpose_scalar and ffts_transpose, and use ffts_transpose_scalar as native C fallback --- src/ffts_real.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'src/ffts_real.c') diff --git a/src/ffts_real.c b/src/ffts_real.c index 7f41069..0f87a12 100644 --- a/src/ffts_real.c +++ b/src/ffts_real.c @@ -641,9 +641,9 @@ ffts_init_1d_real(size_t N, int sign) } #ifdef HAVE_SSE3 - ffts_generate_table_1d_real_32f(p, sign, 1); + ffts_generate_table_1d_real_32f(p, sign, 1); #else - ffts_generate_table_1d_real_32f(p, sign, 0); + ffts_generate_table_1d_real_32f(p, sign, 0); #endif return p; -- cgit v1.1