From f4e533c64e0c005e567b3fa76bc3456facbe5484 Mon Sep 17 00:00:00 2001
From: Jukka Ojanen <jukka.ojanen@linkotec.net>
Date: Wed, 16 Sep 2015 17:52:55 +0300
Subject: Add double-double arithmetic to generate "exact" double precision
 cosine and sine tables. Correct rounding verified using MPFR upto 2^28. SSE2
 optimized ffts_generate_cosine_sine_pow2_64f takes twice as long as
 ffts_generate_cosine_sine_pow2_32f.

---
 src/ffts_dd.h   | 230 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 src/ffts_trig.c | 146 +++++++++++++++++++++++++++++++++++
 src/ffts_trig.h |   3 +
 3 files changed, 379 insertions(+)
 create mode 100644 src/ffts_dd.h

diff --git a/src/ffts_dd.h b/src/ffts_dd.h
new file mode 100644
index 0000000..e9402c6
--- /dev/null
+++ b/src/ffts_dd.h
@@ -0,0 +1,230 @@
+/*
+
+This file is part of FFTS -- The Fastest Fourier Transform in the South
+
+Copyright (c) 2015, Jukka Ojanen <jukka.ojanen@kolumbus.fi>
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+* Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+* Neither the name of the organization nor the
+names of its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef FFTS_DD_H
+#define FFTS_DD_H
+
+#if defined (_MSC_VER) && (_MSC_VER >= 1020)
+#pragma once
+#endif
+
+#include "ffts_attributes.h"
+
+#if HAVE_SSE2
+#include <emmintrin.h>
+#endif
+
+/* double-double number */
+struct ffts_dd_t
+{
+    double hi;
+    double lo;
+};
+
+#if HAVE_SSE2
+/* double-double vector */
+struct ffts_dd2_t {
+    __m128d hi;
+    __m128d lo;
+};
+#endif
+
+static FFTS_INLINE struct ffts_dd_t
+ffts_dd_add_dd_unnormalized(const struct ffts_dd_t a,
+                            const struct ffts_dd_t b);
+
+static FFTS_INLINE struct ffts_dd_t
+ffts_dd_mul_dd_unnormalized(const struct ffts_dd_t a,
+                            const struct ffts_dd_t b);
+
+static FFTS_INLINE struct ffts_dd_t
+ffts_dd_split(double a);
+
+/* aka quick-two-sum */
+static FFTS_INLINE struct ffts_dd_t
+ffts_dd_add(double a, double b)
+{
+    struct ffts_dd_t dd;
+    dd.hi = a + b;
+    dd.lo = b - (dd.hi - a);
+    return dd;
+}
+
+static FFTS_INLINE struct ffts_dd_t
+ffts_dd_add_dd(const struct ffts_dd_t a,
+               const struct ffts_dd_t b)
+{
+    struct ffts_dd_t t1 = ffts_dd_add_dd_unnormalized(a, b);
+    return ffts_dd_add(t1.hi, t1.lo);
+}
+
+static FFTS_INLINE struct ffts_dd_t
+ffts_dd_add_dd_unnormalized(const struct ffts_dd_t a,
+                            const struct ffts_dd_t b)
+{
+    struct ffts_dd_t dd;
+    double e1;
+    dd.hi = a.hi + b.hi;
+    e1 = dd.hi - a.hi;
+    dd.lo = ((a.hi - (dd.hi - e1)) + (b.hi - e1)) + (a.lo + b.lo);
+    return dd;
+}
+
+static FFTS_INLINE struct ffts_dd_t
+ffts_dd_mul(const double a, const double b)
+{
+    struct ffts_dd_t dd;
+    struct ffts_dd_t t1 = ffts_dd_split(a);
+    struct ffts_dd_t t2 = ffts_dd_split(b);
+    dd.hi = a * b;
+    dd.lo = (t1.hi * t2.hi - dd.hi);
+    dd.lo += (t1.hi * t2.lo + t1.lo * t2.hi);
+    dd.lo += t1.lo * t2.lo;
+    return dd;
+}
+
+static FFTS_INLINE struct ffts_dd_t
+ffts_dd_mul_dd(const struct ffts_dd_t a,
+               const struct ffts_dd_t b)
+{
+    struct ffts_dd_t dd = ffts_dd_mul_dd_unnormalized(a, b);
+    return ffts_dd_add(dd.hi, dd.lo);
+}
+
+static FFTS_INLINE struct ffts_dd_t
+ffts_dd_mul_dd_unnormalized(const struct ffts_dd_t a,
+                            const struct ffts_dd_t b)
+{
+    struct ffts_dd_t dd = ffts_dd_mul(a.hi, b.hi);
+    dd.lo += (a.hi * b.lo + a.lo * b.hi);
+    return dd;
+}
+
+static FFTS_INLINE struct ffts_dd_t
+ffts_dd_split(double a)
+{
+    /* 2^27+1 = 134217729 */
+    struct ffts_dd_t dd;
+    double t = 134217729.0 * a;
+    dd.hi = t - (t - a);
+    dd.lo = a - dd.hi;
+    return dd;
+}
+
+#if HAVE_SSE2
+static FFTS_INLINE struct ffts_dd2_t
+ffts_dd2_add_dd2_unnormalized(const struct ffts_dd2_t a,
+                              const struct ffts_dd2_t b);
+
+static FFTS_INLINE struct ffts_dd2_t
+ffts_dd2_mul_dd2_unnormalized(const struct ffts_dd2_t a,
+                              const struct ffts_dd2_t b);
+
+static FFTS_INLINE struct ffts_dd2_t
+ffts_dd2_split(__m128d a);
+
+static FFTS_INLINE struct ffts_dd2_t
+ffts_dd2_add(__m128d a, __m128d b)
+{
+    struct ffts_dd2_t dd2;
+    dd2.hi = _mm_add_pd(a, b);
+    dd2.lo = _mm_sub_pd(b, _mm_sub_pd(dd2.hi, a));
+    return dd2;
+}
+
+static FFTS_INLINE struct ffts_dd2_t
+ffts_dd2_add_dd2(const struct ffts_dd2_t a,
+                 const struct ffts_dd2_t b)
+{
+    struct ffts_dd2_t t1 = ffts_dd2_add_dd2_unnormalized(a, b);
+    return ffts_dd2_add(t1.hi, t1.lo);
+}
+
+static FFTS_INLINE struct ffts_dd2_t
+ffts_dd2_add_dd2_unnormalized(const struct ffts_dd2_t a,
+                              const struct ffts_dd2_t b)
+{
+    struct ffts_dd2_t dd2;
+    __m128d e1;
+    dd2.hi = _mm_add_pd(a.hi, b.hi);
+    e1 = _mm_sub_pd(dd2.hi, a.hi);
+    dd2.lo = _mm_add_pd(_mm_add_pd(_mm_sub_pd(a.hi, _mm_sub_pd(dd2.hi, e1)),
+        _mm_sub_pd(b.hi, e1)), _mm_add_pd(a.lo, b.lo));
+    return dd2;
+}
+
+static FFTS_INLINE struct ffts_dd2_t
+ffts_dd2_mul(const __m128d a, const __m128d b)
+{
+    struct ffts_dd2_t dd2;
+    struct ffts_dd2_t t1 = ffts_dd2_split(a);
+    struct ffts_dd2_t t2 = ffts_dd2_split(b);
+    dd2.hi = _mm_mul_pd(a, b);
+    dd2.lo = _mm_add_pd(_mm_add_pd(_mm_sub_pd(
+        _mm_mul_pd(t1.hi, t2.hi), dd2.hi),
+        _mm_add_pd(_mm_mul_pd(t1.hi, t2.lo),
+        _mm_mul_pd(t1.lo, t2.hi))),
+        _mm_mul_pd(t1.lo, t2.lo));
+    return dd2;
+}
+
+static FFTS_INLINE struct ffts_dd2_t
+ffts_dd2_mul_dd2(const struct ffts_dd2_t a,
+                 const struct ffts_dd2_t b)
+{
+    struct ffts_dd2_t dd2 = ffts_dd2_mul_dd2_unnormalized(a, b);
+    return ffts_dd2_add(dd2.hi, dd2.lo);
+}
+
+static FFTS_INLINE struct ffts_dd2_t
+ffts_dd2_mul_dd2_unnormalized(const struct ffts_dd2_t a,
+                              const struct ffts_dd2_t b)
+{
+    struct ffts_dd2_t dd2 = ffts_dd2_mul(a.hi, b.hi);
+    dd2.lo = _mm_add_pd(dd2.lo, _mm_add_pd(
+        _mm_mul_pd(a.hi, b.lo), _mm_mul_pd(a.lo, b.hi)));
+    return dd2;
+}
+
+static FFTS_INLINE struct ffts_dd2_t
+ffts_dd2_split(__m128d a)
+{
+    /* 2^27+1 = 134217729 */
+    struct ffts_dd2_t dd2;
+    __m128d t = _mm_mul_pd(a, _mm_set1_pd(134217729.0));
+    dd2.hi = _mm_sub_pd(t, _mm_sub_pd(t, a));
+    dd2.lo = _mm_sub_pd(a, dd2.hi);
+    return dd2;
+}
+#endif /* HAVE_SSE2 */
+
+#endif /* FFTS_DD_H */
diff --git a/src/ffts_trig.c b/src/ffts_trig.c
index 624d2c3..1ca9c98 100644
--- a/src/ffts_trig.c
+++ b/src/ffts_trig.c
@@ -31,6 +31,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 
 #include "ffts_trig.h"
+#include "ffts_dd.h"
 
 /* 1/(2*cos(pow(2,-p)*pi)) */
 static const FFTS_ALIGN(16) unsigned int half_secant[132] = {
@@ -286,6 +287,151 @@ exit:
     return 0;
 }
 
+#if HAVE_SSE2
+int
+ffts_generate_cosine_sine_pow2_64f(ffts_cpx_64f *const table, int table_size)
+{
+    static const __m128d sign_swap = { 0.0, -0.0 };
+    const struct ffts_dd2_t *FFTS_RESTRICT ct;
+    const double *FFTS_RESTRICT hs;
+    struct ffts_dd2_t FFTS_ALIGN(16) w[32];
+    struct ffts_dd2_t FFTS_ALIGN(16) h[32];
+    int i, log_2, offset;
+
+    /* size must be a power of two */
+    if (!table || !table_size || (table_size & (table_size - 1))) {
+        return -1;
+    }
+
+    /* the first */
+    table[0][0] =   1.0;
+    table[0][1] =  -0.0;
+
+    if (FFTS_UNLIKELY(table_size == 1)) {
+        goto exit;
+    }
+
+    if (FFTS_UNLIKELY(table_size == 2)) {
+        /* skip over */
+        i = 1;
+        goto mid_point;
+    }
+
+    /* calculate table offset */
+    FFTS_ASSUME(table_size/2 > 1);
+    log_2 = ffts_ctzl(table_size);
+    FFTS_ASSUME(log_2 > 1);
+    offset = 32 - log_2;
+    ct = (const struct ffts_dd2_t*)
+        FFTS_ASSUME_ALIGNED_32(&cos_sin_pi_table[8 * offset]);
+    hs = (const double*) &half_secant[4 * offset];
+
+    /* initialize from lookup table */
+    for (i = 0; i <= log_2; i++) {
+        w[i] = ct[i];
+
+        /* duplicate the high and low parts */
+        h[i].hi = _mm_set1_pd(hs[2*i + 0]);
+        h[i].lo = _mm_set1_pd(hs[2*i + 1]);
+    }
+
+    /* generate sine and cosine tables with maximum error less than 0.5 ULP */
+    for (i = 1; i < table_size/2; i++) {
+        /* calculate trailing zeros in index */
+        log_2 = ffts_ctzl(i);
+
+        /* result of ffts_dd_mul_dd is normalized */
+        _mm_store_pd((double*) &table[i + 0],
+            _mm_or_pd(w[log_2].hi, sign_swap));
+        _mm_store_pd((double*) &table[table_size - i],
+            _mm_or_pd(_mm_shuffle_pd(w[log_2].hi, w[log_2].hi, 1), sign_swap));
+
+        /* skip and find next trailing zero */
+        offset = (log_2 + 2 + ffts_ctzl(~i >> (log_2 + 2)));
+        w[log_2] = ffts_dd2_mul_dd2(h[log_2],
+            ffts_dd2_add_dd2_unnormalized(w[log_2 + 1], w[offset]));
+    }
+
+mid_point:
+    table[i][0] =  0.707106781186547524;
+    table[i][1] = -0.707106781186547524;
+
+exit:
+    return 0;
+}
+#else
+int
+ffts_generate_cosine_sine_pow2_64f(ffts_cpx_64f *const table, int table_size)
+{
+    const struct ffts_dd_t *FFTS_RESTRICT ct;
+    const struct ffts_dd_t *FFTS_RESTRICT hs;
+    struct ffts_dd_t FFTS_ALIGN(16) w[32][2];
+    int i, log_2, offset;
+
+    /* size must be a power of two */
+    if (!table || !table_size || (table_size & (table_size - 1))) {
+        return -1;
+    }
+
+    /* the first */
+    table[0][0] =  1.0;
+    table[0][1] = -0.0;
+
+    if (FFTS_UNLIKELY(table_size == 1)) {
+        goto exit;
+    }
+
+    if (FFTS_UNLIKELY(table_size == 2)) {
+        /* skip over */
+        i = 1;
+        goto mid_point;
+    }
+
+    /* calculate table offset */
+    FFTS_ASSUME(table_size/2 > 1);
+    log_2 = ffts_ctzl(table_size);
+    FFTS_ASSUME(log_2 > 1);
+    offset = 32 - log_2;
+    ct = (const struct ffts_dd_t*)
+        FFTS_ASSUME_ALIGNED_32(&cos_sin_pi_table[8 * offset]);
+    hs = (const struct ffts_dd_t*) &half_secant[4 * offset];
+
+    /* initialize from lookup table */
+    for (i = 0; i <= log_2; i++) {
+        w[i][0].hi = ct[2*i + 0].hi;
+        w[i][0].lo = ct[2*i + 1].hi;
+        w[i][1].hi = ct[2*i + 0].lo;
+        w[i][1].lo = ct[2*i + 1].lo;
+    }
+
+    /* generate sine and cosine tables with maximum error less than 0.5 ULP */
+    for (i = 1; i < table_size/2; i++) {
+        /* calculate trailing zeros in index */
+        log_2 = ffts_ctzl(i);
+
+        /* result of ffts_dd_mul_dd is normalized */
+        table[i          + 0][0] =  w[log_2][0].hi;
+        table[i          + 0][1] = -w[log_2][1].hi;
+        table[table_size - i][0] =  w[log_2][1].hi;
+        table[table_size - i][1] = -w[log_2][0].hi;
+
+        /* skip and find next trailing zero */
+        offset = (log_2 + 2 + ffts_ctzl(~i >> (log_2 + 2)));
+        w[log_2][0] = ffts_dd_mul_dd(hs[log_2],
+            ffts_dd_add_dd_unnormalized(w[log_2 + 1][0], w[offset][0]));
+        w[log_2][1] = ffts_dd_mul_dd(hs[log_2],
+            ffts_dd_add_dd_unnormalized(w[log_2 + 1][1], w[offset][1]));
+    }
+
+mid_point:
+    table[i][0] =  0.707106781186547524;
+    table[i][1] = -0.707106781186547524;
+
+exit:
+    return 0;
+}
+#endif
+
 int
 ffts_generate_table_1d_real_32f(struct _ffts_plan_t *const p,
                                 int sign,
diff --git a/src/ffts_trig.h b/src/ffts_trig.h
index cfed2fb..0b22738 100644
--- a/src/ffts_trig.h
+++ b/src/ffts_trig.h
@@ -46,6 +46,9 @@ int
 ffts_generate_cosine_sine_pow2_32f(ffts_cpx_32f *const table, int table_size);
 
 int
+ffts_generate_cosine_sine_pow2_64f(ffts_cpx_64f *const table, int table_size);
+
+int
 ffts_generate_table_1d_real_32f(struct _ffts_plan_t *const p,
                                 int sign,
                                 int invert);
-- 
cgit v1.1