From e013b85d38101abc0c449dff509aaf8a0057d321 Mon Sep 17 00:00:00 2001
From: Jukka Ojanen <jukka.ojanen@linkotec.net>
Date: Thu, 17 Sep 2015 16:49:58 +0300
Subject: Add SSE2 optimized ffts_generate_cosine_sine_pow2_32f

---
 src/ffts_trig.c | 98 ++++++++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 84 insertions(+), 14 deletions(-)

(limited to 'src/ffts_trig.c')

diff --git a/src/ffts_trig.c b/src/ffts_trig.c
index 1ca9c98..cdd2d05 100644
--- a/src/ffts_trig.c
+++ b/src/ffts_trig.c
@@ -221,12 +221,15 @@ exit:
 *  O. Buneman, Stable on–line creation of sines and cosines of
 *  successive angles, Proc. IEEE 75, 1434 – 1435 (1987).
 */
+#if HAVE_SSE2
 int
 ffts_generate_cosine_sine_pow2_32f(ffts_cpx_32f *const table, int table_size)
 {
-    const ffts_cpx_64f *FFTS_RESTRICT ct;
+    static const __m128d sign_swap = { 0.0, -0.0 };
+    const __m128d *FFTS_RESTRICT ct;
     const double *FFTS_RESTRICT hs;
-    ffts_cpx_64f FFTS_ALIGN(16) w[32];
+    __m128d FFTS_ALIGN(16) w[32];
+    __m128d FFTS_ALIGN(16) h[32];
     int i, log_2, offset;
 
     /* size must be a power of two */
@@ -253,14 +256,16 @@ ffts_generate_cosine_sine_pow2_32f(ffts_cpx_32f *const table, int table_size)
     log_2 = ffts_ctzl(table_size);
     FFTS_ASSUME(log_2 > 1);
     offset = 32 - log_2;
-    ct = (const ffts_cpx_64f*)
+    ct = (const __m128d*)
         FFTS_ASSUME_ALIGNED_32(&cos_sin_pi_table[8 * offset]);
     hs = (const double*) &half_secant[4 * offset];
 
     /* initialize from lookup table */
     for (i = 0; i <= log_2; i++) {
-        w[i][0] = ct[2*i][0];
-        w[i][1] = ct[2*i][1];
+        w[i] = ct[2*i];
+
+        /* duplicate the high part */
+        h[i] = _mm_set1_pd(hs[2*i]);
     }
 
     /* generate sine and cosine tables with maximum error less than 0.5 ULP */
@@ -268,15 +273,15 @@ ffts_generate_cosine_sine_pow2_32f(ffts_cpx_32f *const table, int table_size)
         /* calculate trailing zeros in index */
         log_2 = ffts_ctzl(i);
 
-        table[i          + 0][0] = (float)  w[log_2][0];
-        table[i          + 0][1] = (float) -w[log_2][1];
-        table[table_size - i][0] = (float)  w[log_2][1];
-        table[table_size - i][1] = (float) -w[log_2][0];
+        /* note that storing is not 16 byte aligned */
+        _mm_storel_pi((__m64*) &table[i + 0],
+            _mm_cvtpd_ps(_mm_or_pd(w[log_2], sign_swap)));
+        _mm_storel_pi((__m64*) &table[table_size - i], _mm_cvtpd_ps(
+            _mm_or_pd(_mm_shuffle_pd(w[log_2], w[log_2], 1), sign_swap)));
 
         /* skip and find next trailing zero */
         offset = (log_2 + 2 + ffts_ctzl(~i >> (log_2 + 2)));
-        w[log_2][0] = hs[2 * log_2] * (w[log_2 + 1][0] + w[offset][0]);
-        w[log_2][1] = hs[2 * log_2] * (w[log_2 + 1][1] + w[offset][1]);
+        w[log_2] = _mm_mul_pd(h[log_2], _mm_add_pd(w[log_2 + 1], w[offset]));
     }
 
 mid_point:
@@ -287,7 +292,6 @@ exit:
     return 0;
 }
 
-#if HAVE_SSE2
 int
 ffts_generate_cosine_sine_pow2_64f(ffts_cpx_64f *const table, int table_size)
 {
@@ -304,8 +308,8 @@ ffts_generate_cosine_sine_pow2_64f(ffts_cpx_64f *const table, int table_size)
     }
 
     /* the first */
-    table[0][0] =   1.0;
-    table[0][1] =  -0.0;
+    table[0][0] =  1.0;
+    table[0][1] = -0.0;
 
     if (FFTS_UNLIKELY(table_size == 1)) {
         goto exit;
@@ -361,6 +365,72 @@ exit:
 }
 #else
 int
+ffts_generate_cosine_sine_pow2_32f(ffts_cpx_32f *const table, int table_size)
+{
+    const ffts_cpx_64f *FFTS_RESTRICT ct;
+    const double *FFTS_RESTRICT hs;
+    ffts_cpx_64f FFTS_ALIGN(16) w[32];
+    int i, log_2, offset;
+
+    /* size must be a power of two */
+    if (!table || !table_size || (table_size & (table_size - 1))) {
+        return -1;
+    }
+
+    /* the first */
+    table[0][0] =  1.0f;
+    table[0][1] = -0.0f;
+
+    if (FFTS_UNLIKELY(table_size == 1)) {
+        goto exit;
+    }
+
+    if (FFTS_UNLIKELY(table_size == 2)) {
+        /* skip over */
+        i = 1;
+        goto mid_point;
+    }
+
+    /* calculate table offset */
+    FFTS_ASSUME(table_size/2 > 1);
+    log_2 = ffts_ctzl(table_size);
+    FFTS_ASSUME(log_2 > 1);
+    offset = 32 - log_2;
+    ct = (const ffts_cpx_64f*)
+        FFTS_ASSUME_ALIGNED_32(&cos_sin_pi_table[8 * offset]);
+    hs = (const double*) &half_secant[4 * offset];
+
+    /* initialize from lookup table */
+    for (i = 0; i <= log_2; i++) {
+        w[i][0] = ct[2*i][0];
+        w[i][1] = ct[2*i][1];
+    }
+
+    /* generate sine and cosine tables with maximum error less than 0.5 ULP */
+    for (i = 1; i < table_size/2; i++) {
+        /* calculate trailing zeros in index */
+        log_2 = ffts_ctzl(i);
+
+        table[i          + 0][0] = (float)  w[log_2][0];
+        table[i          + 0][1] = (float) -w[log_2][1];
+        table[table_size - i][0] = (float)  w[log_2][1];
+        table[table_size - i][1] = (float) -w[log_2][0];
+
+        /* skip and find next trailing zero */
+        offset = (log_2 + 2 + ffts_ctzl(~i >> (log_2 + 2)));
+        w[log_2][0] = hs[2 * log_2] * (w[log_2 + 1][0] + w[offset][0]);
+        w[log_2][1] = hs[2 * log_2] * (w[log_2 + 1][1] + w[offset][1]);
+    }
+
+mid_point:
+    table[i][0] =  0.70710677f;
+    table[i][1] = -0.70710677f;
+
+exit:
+    return 0;
+}
+
+int
 ffts_generate_cosine_sine_pow2_64f(ffts_cpx_64f *const table, int table_size)
 {
     const struct ffts_dd_t *FFTS_RESTRICT ct;
-- 
cgit v1.1