From e013b85d38101abc0c449dff509aaf8a0057d321 Mon Sep 17 00:00:00 2001 From: Jukka Ojanen Date: Thu, 17 Sep 2015 16:49:58 +0300 Subject: Add SSE2 optimized ffts_generate_cosine_sine_pow2_32f --- src/ffts_trig.c | 98 ++++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 84 insertions(+), 14 deletions(-) diff --git a/src/ffts_trig.c b/src/ffts_trig.c index 1ca9c98..cdd2d05 100644 --- a/src/ffts_trig.c +++ b/src/ffts_trig.c @@ -221,12 +221,15 @@ exit: * O. Buneman, Stable on–line creation of sines and cosines of * successive angles, Proc. IEEE 75, 1434 – 1435 (1987). */ +#if HAVE_SSE2 int ffts_generate_cosine_sine_pow2_32f(ffts_cpx_32f *const table, int table_size) { - const ffts_cpx_64f *FFTS_RESTRICT ct; + static const __m128d sign_swap = { 0.0, -0.0 }; + const __m128d *FFTS_RESTRICT ct; const double *FFTS_RESTRICT hs; - ffts_cpx_64f FFTS_ALIGN(16) w[32]; + __m128d FFTS_ALIGN(16) w[32]; + __m128d FFTS_ALIGN(16) h[32]; int i, log_2, offset; /* size must be a power of two */ @@ -253,14 +256,16 @@ ffts_generate_cosine_sine_pow2_32f(ffts_cpx_32f *const table, int table_size) log_2 = ffts_ctzl(table_size); FFTS_ASSUME(log_2 > 1); offset = 32 - log_2; - ct = (const ffts_cpx_64f*) + ct = (const __m128d*) FFTS_ASSUME_ALIGNED_32(&cos_sin_pi_table[8 * offset]); hs = (const double*) &half_secant[4 * offset]; /* initialize from lookup table */ for (i = 0; i <= log_2; i++) { - w[i][0] = ct[2*i][0]; - w[i][1] = ct[2*i][1]; + w[i] = ct[2*i]; + + /* duplicate the high part */ + h[i] = _mm_set1_pd(hs[2*i]); } /* generate sine and cosine tables with maximum error less than 0.5 ULP */ @@ -268,15 +273,15 @@ ffts_generate_cosine_sine_pow2_32f(ffts_cpx_32f *const table, int table_size) /* calculate trailing zeros in index */ log_2 = ffts_ctzl(i); - table[i + 0][0] = (float) w[log_2][0]; - table[i + 0][1] = (float) -w[log_2][1]; - table[table_size - i][0] = (float) w[log_2][1]; - table[table_size - i][1] = (float) -w[log_2][0]; + /* note that storing is not 16 byte aligned */ + _mm_storel_pi((__m64*) &table[i + 0], + _mm_cvtpd_ps(_mm_or_pd(w[log_2], sign_swap))); + _mm_storel_pi((__m64*) &table[table_size - i], _mm_cvtpd_ps( + _mm_or_pd(_mm_shuffle_pd(w[log_2], w[log_2], 1), sign_swap))); /* skip and find next trailing zero */ offset = (log_2 + 2 + ffts_ctzl(~i >> (log_2 + 2))); - w[log_2][0] = hs[2 * log_2] * (w[log_2 + 1][0] + w[offset][0]); - w[log_2][1] = hs[2 * log_2] * (w[log_2 + 1][1] + w[offset][1]); + w[log_2] = _mm_mul_pd(h[log_2], _mm_add_pd(w[log_2 + 1], w[offset])); } mid_point: @@ -287,7 +292,6 @@ exit: return 0; } -#if HAVE_SSE2 int ffts_generate_cosine_sine_pow2_64f(ffts_cpx_64f *const table, int table_size) { @@ -304,8 +308,8 @@ ffts_generate_cosine_sine_pow2_64f(ffts_cpx_64f *const table, int table_size) } /* the first */ - table[0][0] = 1.0; - table[0][1] = -0.0; + table[0][0] = 1.0; + table[0][1] = -0.0; if (FFTS_UNLIKELY(table_size == 1)) { goto exit; @@ -361,6 +365,72 @@ exit: } #else int +ffts_generate_cosine_sine_pow2_32f(ffts_cpx_32f *const table, int table_size) +{ + const ffts_cpx_64f *FFTS_RESTRICT ct; + const double *FFTS_RESTRICT hs; + ffts_cpx_64f FFTS_ALIGN(16) w[32]; + int i, log_2, offset; + + /* size must be a power of two */ + if (!table || !table_size || (table_size & (table_size - 1))) { + return -1; + } + + /* the first */ + table[0][0] = 1.0f; + table[0][1] = -0.0f; + + if (FFTS_UNLIKELY(table_size == 1)) { + goto exit; + } + + if (FFTS_UNLIKELY(table_size == 2)) { + /* skip over */ + i = 1; + goto mid_point; + } + + /* calculate table offset */ + FFTS_ASSUME(table_size/2 > 1); + log_2 = ffts_ctzl(table_size); + FFTS_ASSUME(log_2 > 1); + offset = 32 - log_2; + ct = (const ffts_cpx_64f*) + FFTS_ASSUME_ALIGNED_32(&cos_sin_pi_table[8 * offset]); + hs = (const double*) &half_secant[4 * offset]; + + /* initialize from lookup table */ + for (i = 0; i <= log_2; i++) { + w[i][0] = ct[2*i][0]; + w[i][1] = ct[2*i][1]; + } + + /* generate sine and cosine tables with maximum error less than 0.5 ULP */ + for (i = 1; i < table_size/2; i++) { + /* calculate trailing zeros in index */ + log_2 = ffts_ctzl(i); + + table[i + 0][0] = (float) w[log_2][0]; + table[i + 0][1] = (float) -w[log_2][1]; + table[table_size - i][0] = (float) w[log_2][1]; + table[table_size - i][1] = (float) -w[log_2][0]; + + /* skip and find next trailing zero */ + offset = (log_2 + 2 + ffts_ctzl(~i >> (log_2 + 2))); + w[log_2][0] = hs[2 * log_2] * (w[log_2 + 1][0] + w[offset][0]); + w[log_2][1] = hs[2 * log_2] * (w[log_2 + 1][1] + w[offset][1]); + } + +mid_point: + table[i][0] = 0.70710677f; + table[i][1] = -0.70710677f; + +exit: + return 0; +} + +int ffts_generate_cosine_sine_pow2_64f(ffts_cpx_64f *const table, int table_size) { const struct ffts_dd_t *FFTS_RESTRICT ct; -- cgit v1.1