summaryrefslogtreecommitdiffstats
path: root/src/ffts_trig.c
diff options
context:
space:
mode:
authorJukka Ojanen <jukka.ojanen@linkotec.net>2015-09-17 16:49:58 +0300
committerJukka Ojanen <jukka.ojanen@linkotec.net>2015-09-17 16:49:58 +0300
commite013b85d38101abc0c449dff509aaf8a0057d321 (patch)
tree15992183ee9f2cf8c8b4e25a9886c5ecc7f6aa8a /src/ffts_trig.c
parentf4e533c64e0c005e567b3fa76bc3456facbe5484 (diff)
downloadffts-e013b85d38101abc0c449dff509aaf8a0057d321.zip
ffts-e013b85d38101abc0c449dff509aaf8a0057d321.tar.gz
Add SSE2 optimized ffts_generate_cosine_sine_pow2_32f
Diffstat (limited to 'src/ffts_trig.c')
-rw-r--r--src/ffts_trig.c98
1 files changed, 84 insertions, 14 deletions
diff --git a/src/ffts_trig.c b/src/ffts_trig.c
index 1ca9c98..cdd2d05 100644
--- a/src/ffts_trig.c
+++ b/src/ffts_trig.c
@@ -221,12 +221,15 @@ exit:
* O. Buneman, Stable on–line creation of sines and cosines of
* successive angles, Proc. IEEE 75, 1434 – 1435 (1987).
*/
+#if HAVE_SSE2
int
ffts_generate_cosine_sine_pow2_32f(ffts_cpx_32f *const table, int table_size)
{
- const ffts_cpx_64f *FFTS_RESTRICT ct;
+ static const __m128d sign_swap = { 0.0, -0.0 };
+ const __m128d *FFTS_RESTRICT ct;
const double *FFTS_RESTRICT hs;
- ffts_cpx_64f FFTS_ALIGN(16) w[32];
+ __m128d FFTS_ALIGN(16) w[32];
+ __m128d FFTS_ALIGN(16) h[32];
int i, log_2, offset;
/* size must be a power of two */
@@ -253,14 +256,16 @@ ffts_generate_cosine_sine_pow2_32f(ffts_cpx_32f *const table, int table_size)
log_2 = ffts_ctzl(table_size);
FFTS_ASSUME(log_2 > 1);
offset = 32 - log_2;
- ct = (const ffts_cpx_64f*)
+ ct = (const __m128d*)
FFTS_ASSUME_ALIGNED_32(&cos_sin_pi_table[8 * offset]);
hs = (const double*) &half_secant[4 * offset];
/* initialize from lookup table */
for (i = 0; i <= log_2; i++) {
- w[i][0] = ct[2*i][0];
- w[i][1] = ct[2*i][1];
+ w[i] = ct[2*i];
+
+ /* duplicate the high part */
+ h[i] = _mm_set1_pd(hs[2*i]);
}
/* generate sine and cosine tables with maximum error less than 0.5 ULP */
@@ -268,15 +273,15 @@ ffts_generate_cosine_sine_pow2_32f(ffts_cpx_32f *const table, int table_size)
/* calculate trailing zeros in index */
log_2 = ffts_ctzl(i);
- table[i + 0][0] = (float) w[log_2][0];
- table[i + 0][1] = (float) -w[log_2][1];
- table[table_size - i][0] = (float) w[log_2][1];
- table[table_size - i][1] = (float) -w[log_2][0];
+ /* note that storing is not 16 byte aligned */
+ _mm_storel_pi((__m64*) &table[i + 0],
+ _mm_cvtpd_ps(_mm_or_pd(w[log_2], sign_swap)));
+ _mm_storel_pi((__m64*) &table[table_size - i], _mm_cvtpd_ps(
+ _mm_or_pd(_mm_shuffle_pd(w[log_2], w[log_2], 1), sign_swap)));
/* skip and find next trailing zero */
offset = (log_2 + 2 + ffts_ctzl(~i >> (log_2 + 2)));
- w[log_2][0] = hs[2 * log_2] * (w[log_2 + 1][0] + w[offset][0]);
- w[log_2][1] = hs[2 * log_2] * (w[log_2 + 1][1] + w[offset][1]);
+ w[log_2] = _mm_mul_pd(h[log_2], _mm_add_pd(w[log_2 + 1], w[offset]));
}
mid_point:
@@ -287,7 +292,6 @@ exit:
return 0;
}
-#if HAVE_SSE2
int
ffts_generate_cosine_sine_pow2_64f(ffts_cpx_64f *const table, int table_size)
{
@@ -304,8 +308,8 @@ ffts_generate_cosine_sine_pow2_64f(ffts_cpx_64f *const table, int table_size)
}
/* the first */
- table[0][0] = 1.0;
- table[0][1] = -0.0;
+ table[0][0] = 1.0;
+ table[0][1] = -0.0;
if (FFTS_UNLIKELY(table_size == 1)) {
goto exit;
@@ -361,6 +365,72 @@ exit:
}
#else
int
+ffts_generate_cosine_sine_pow2_32f(ffts_cpx_32f *const table, int table_size)
+{
+ const ffts_cpx_64f *FFTS_RESTRICT ct;
+ const double *FFTS_RESTRICT hs;
+ ffts_cpx_64f FFTS_ALIGN(16) w[32];
+ int i, log_2, offset;
+
+ /* size must be a power of two */
+ if (!table || !table_size || (table_size & (table_size - 1))) {
+ return -1;
+ }
+
+ /* the first */
+ table[0][0] = 1.0f;
+ table[0][1] = -0.0f;
+
+ if (FFTS_UNLIKELY(table_size == 1)) {
+ goto exit;
+ }
+
+ if (FFTS_UNLIKELY(table_size == 2)) {
+ /* skip over */
+ i = 1;
+ goto mid_point;
+ }
+
+ /* calculate table offset */
+ FFTS_ASSUME(table_size/2 > 1);
+ log_2 = ffts_ctzl(table_size);
+ FFTS_ASSUME(log_2 > 1);
+ offset = 32 - log_2;
+ ct = (const ffts_cpx_64f*)
+ FFTS_ASSUME_ALIGNED_32(&cos_sin_pi_table[8 * offset]);
+ hs = (const double*) &half_secant[4 * offset];
+
+ /* initialize from lookup table */
+ for (i = 0; i <= log_2; i++) {
+ w[i][0] = ct[2*i][0];
+ w[i][1] = ct[2*i][1];
+ }
+
+ /* generate sine and cosine tables with maximum error less than 0.5 ULP */
+ for (i = 1; i < table_size/2; i++) {
+ /* calculate trailing zeros in index */
+ log_2 = ffts_ctzl(i);
+
+ table[i + 0][0] = (float) w[log_2][0];
+ table[i + 0][1] = (float) -w[log_2][1];
+ table[table_size - i][0] = (float) w[log_2][1];
+ table[table_size - i][1] = (float) -w[log_2][0];
+
+ /* skip and find next trailing zero */
+ offset = (log_2 + 2 + ffts_ctzl(~i >> (log_2 + 2)));
+ w[log_2][0] = hs[2 * log_2] * (w[log_2 + 1][0] + w[offset][0]);
+ w[log_2][1] = hs[2 * log_2] * (w[log_2 + 1][1] + w[offset][1]);
+ }
+
+mid_point:
+ table[i][0] = 0.70710677f;
+ table[i][1] = -0.70710677f;
+
+exit:
+ return 0;
+}
+
+int
ffts_generate_cosine_sine_pow2_64f(ffts_cpx_64f *const table, int table_size)
{
const struct ffts_dd_t *FFTS_RESTRICT ct;
OpenPOWER on IntegriCloud