Add double-double arithmetic to generate "exact" double precision cosine and sine tables. Correct rounding verified using MPFR upto 2^28. SSE2 optimized ffts_generate_cosine_sine_pow2_64f takes twice as long as ffts_generate_cosine_sine_pow2_32f.

author: Jukka Ojanen <jukka.ojanen@linkotec.net> 2015-09-16 17:52:55 +0300
committer: Jukka Ojanen <jukka.ojanen@linkotec.net> 2015-09-16 17:52:55 +0300
commit: f4e533c64e0c005e567b3fa76bc3456facbe5484 (patch)
tree: 2ee9201c2f17045eaf204717dc7a8f722674395c /src/ffts_trig.c
parent: 7ae74d3d18c9113fffd6530c891add60046c7ee1 (diff)
download: ffts-f4e533c64e0c005e567b3fa76bc3456facbe5484.zip
ffts-f4e533c64e0c005e567b3fa76bc3456facbe5484.tar.gz
1 files changed, 146 insertions, 0 deletions
diff --git a/src/ffts_trig.c b/src/ffts_trig.c
index 624d2c3..1ca9c98 100644
--- a/src/ffts_trig.c
+++ b/src/ffts_trig.c
@@ -31,6 +31,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 
 #include "ffts_trig.h"
+#include "ffts_dd.h"
 
 /* 1/(2*cos(pow(2,-p)*pi)) */
 static const FFTS_ALIGN(16) unsigned int half_secant[132] = {
@@ -286,6 +287,151 @@ exit:
     return 0;
 }
 
+#if HAVE_SSE2
+int
+ffts_generate_cosine_sine_pow2_64f(ffts_cpx_64f *const table, int table_size)
+{
+    static const __m128d sign_swap = { 0.0, -0.0 };
+    const struct ffts_dd2_t *FFTS_RESTRICT ct;
+    const double *FFTS_RESTRICT hs;
+    struct ffts_dd2_t FFTS_ALIGN(16) w[32];
+    struct ffts_dd2_t FFTS_ALIGN(16) h[32];
+    int i, log_2, offset;
+
+    /* size must be a power of two */
+    if (!table || !table_size || (table_size & (table_size - 1))) {
+        return -1;
+    }
+
+    /* the first */
+    table[0][0] =   1.0;
+    table[0][1] =  -0.0;
+
+    if (FFTS_UNLIKELY(table_size == 1)) {
+        goto exit;
+    }
+
+    if (FFTS_UNLIKELY(table_size == 2)) {
+        /* skip over */
+        i = 1;
+        goto mid_point;
+    }
+
+    /* calculate table offset */
+    FFTS_ASSUME(table_size/2 > 1);
+    log_2 = ffts_ctzl(table_size);
+    FFTS_ASSUME(log_2 > 1);
+    offset = 32 - log_2;
+    ct = (const struct ffts_dd2_t*)
+        FFTS_ASSUME_ALIGNED_32(&cos_sin_pi_table[8 * offset]);
+    hs = (const double*) &half_secant[4 * offset];
+
+    /* initialize from lookup table */
+    for (i = 0; i <= log_2; i++) {
+        w[i] = ct[i];
+
+        /* duplicate the high and low parts */
+        h[i].hi = _mm_set1_pd(hs[2*i + 0]);
+        h[i].lo = _mm_set1_pd(hs[2*i + 1]);
+    }
+
+    /* generate sine and cosine tables with maximum error less than 0.5 ULP */
+    for (i = 1; i < table_size/2; i++) {
+        /* calculate trailing zeros in index */
+        log_2 = ffts_ctzl(i);
+
+        /* result of ffts_dd_mul_dd is normalized */
+        _mm_store_pd((double*) &table[i + 0],
+            _mm_or_pd(w[log_2].hi, sign_swap));
+        _mm_store_pd((double*) &table[table_size - i],
+            _mm_or_pd(_mm_shuffle_pd(w[log_2].hi, w[log_2].hi, 1), sign_swap));
+
+        /* skip and find next trailing zero */
+        offset = (log_2 + 2 + ffts_ctzl(~i >> (log_2 + 2)));
+        w[log_2] = ffts_dd2_mul_dd2(h[log_2],
+            ffts_dd2_add_dd2_unnormalized(w[log_2 + 1], w[offset]));
+    }
+
+mid_point:
+    table[i][0] =  0.707106781186547524;
+    table[i][1] = -0.707106781186547524;
+
+exit:
+    return 0;
+}
+#else
+int
+ffts_generate_cosine_sine_pow2_64f(ffts_cpx_64f *const table, int table_size)
+{
+    const struct ffts_dd_t *FFTS_RESTRICT ct;
+    const struct ffts_dd_t *FFTS_RESTRICT hs;
+    struct ffts_dd_t FFTS_ALIGN(16) w[32][2];
+    int i, log_2, offset;
+
+    /* size must be a power of two */
+    if (!table || !table_size || (table_size & (table_size - 1))) {
+        return -1;
+    }
+
+    /* the first */
+    table[0][0] =  1.0;
+    table[0][1] = -0.0;
+
+    if (FFTS_UNLIKELY(table_size == 1)) {
+        goto exit;
+    }
+
+    if (FFTS_UNLIKELY(table_size == 2)) {
+        /* skip over */
+        i = 1;
+        goto mid_point;
+    }
+
+    /* calculate table offset */
+    FFTS_ASSUME(table_size/2 > 1);
+    log_2 = ffts_ctzl(table_size);
+    FFTS_ASSUME(log_2 > 1);
+    offset = 32 - log_2;
+    ct = (const struct ffts_dd_t*)
+        FFTS_ASSUME_ALIGNED_32(&cos_sin_pi_table[8 * offset]);
+    hs = (const struct ffts_dd_t*) &half_secant[4 * offset];
+
+    /* initialize from lookup table */
+    for (i = 0; i <= log_2; i++) {
+        w[i][0].hi = ct[2*i + 0].hi;
+        w[i][0].lo = ct[2*i + 1].hi;
+        w[i][1].hi = ct[2*i + 0].lo;
+        w[i][1].lo = ct[2*i + 1].lo;
+    }
+
+    /* generate sine and cosine tables with maximum error less than 0.5 ULP */
+    for (i = 1; i < table_size/2; i++) {
+        /* calculate trailing zeros in index */
+        log_2 = ffts_ctzl(i);
+
+        /* result of ffts_dd_mul_dd is normalized */
+        table[i          + 0][0] =  w[log_2][0].hi;
+        table[i          + 0][1] = -w[log_2][1].hi;
+        table[table_size - i][0] =  w[log_2][1].hi;
+        table[table_size - i][1] = -w[log_2][0].hi;
+
+        /* skip and find next trailing zero */
+        offset = (log_2 + 2 + ffts_ctzl(~i >> (log_2 + 2)));
+        w[log_2][0] = ffts_dd_mul_dd(hs[log_2],
+            ffts_dd_add_dd_unnormalized(w[log_2 + 1][0], w[offset][0]));
+        w[log_2][1] = ffts_dd_mul_dd(hs[log_2],
+            ffts_dd_add_dd_unnormalized(w[log_2 + 1][1], w[offset][1]));
+    }
+
+mid_point:
+    table[i][0] =  0.707106781186547524;
+    table[i][1] = -0.707106781186547524;
+
+exit:
+    return 0;
+}
+#endif
+
 int
 ffts_generate_table_1d_real_32f(struct _ffts_plan_t *const p,
                                 int sign,
author	Jukka Ojanen <jukka.ojanen@linkotec.net>	2015-09-16 17:52:55 +0300
committer	Jukka Ojanen <jukka.ojanen@linkotec.net>	2015-09-16 17:52:55 +0300
commit	f4e533c64e0c005e567b3fa76bc3456facbe5484 (patch)
tree	2ee9201c2f17045eaf204717dc7a8f722674395c /src/ffts_trig.c
parent	7ae74d3d18c9113fffd6530c891add60046c7ee1 (diff)
download	ffts-f4e533c64e0c005e567b3fa76bc3456facbe5484.zip ffts-f4e533c64e0c005e567b3fa76bc3456facbe5484.tar.gz