summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJukka Ojanen <jukka.ojanen@linkotec.net>2015-09-16 17:52:55 +0300
committerJukka Ojanen <jukka.ojanen@linkotec.net>2015-09-16 17:52:55 +0300
commitf4e533c64e0c005e567b3fa76bc3456facbe5484 (patch)
tree2ee9201c2f17045eaf204717dc7a8f722674395c
parent7ae74d3d18c9113fffd6530c891add60046c7ee1 (diff)
downloadffts-f4e533c64e0c005e567b3fa76bc3456facbe5484.zip
ffts-f4e533c64e0c005e567b3fa76bc3456facbe5484.tar.gz
Add double-double arithmetic to generate "exact" double precision cosine and sine tables. Correct rounding verified using MPFR upto 2^28. SSE2 optimized ffts_generate_cosine_sine_pow2_64f takes twice as long as ffts_generate_cosine_sine_pow2_32f.
-rw-r--r--src/ffts_dd.h230
-rw-r--r--src/ffts_trig.c146
-rw-r--r--src/ffts_trig.h3
3 files changed, 379 insertions, 0 deletions
diff --git a/src/ffts_dd.h b/src/ffts_dd.h
new file mode 100644
index 0000000..e9402c6
--- /dev/null
+++ b/src/ffts_dd.h
@@ -0,0 +1,230 @@
+/*
+
+This file is part of FFTS -- The Fastest Fourier Transform in the South
+
+Copyright (c) 2015, Jukka Ojanen <jukka.ojanen@kolumbus.fi>
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+* Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+* Neither the name of the organization nor the
+names of its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef FFTS_DD_H
+#define FFTS_DD_H
+
+#if defined (_MSC_VER) && (_MSC_VER >= 1020)
+#pragma once
+#endif
+
+#include "ffts_attributes.h"
+
+#if HAVE_SSE2
+#include <emmintrin.h>
+#endif
+
+/* double-double number */
+struct ffts_dd_t
+{
+ double hi;
+ double lo;
+};
+
+#if HAVE_SSE2
+/* double-double vector */
+struct ffts_dd2_t {
+ __m128d hi;
+ __m128d lo;
+};
+#endif
+
+static FFTS_INLINE struct ffts_dd_t
+ffts_dd_add_dd_unnormalized(const struct ffts_dd_t a,
+ const struct ffts_dd_t b);
+
+static FFTS_INLINE struct ffts_dd_t
+ffts_dd_mul_dd_unnormalized(const struct ffts_dd_t a,
+ const struct ffts_dd_t b);
+
+static FFTS_INLINE struct ffts_dd_t
+ffts_dd_split(double a);
+
+/* aka quick-two-sum */
+static FFTS_INLINE struct ffts_dd_t
+ffts_dd_add(double a, double b)
+{
+ struct ffts_dd_t dd;
+ dd.hi = a + b;
+ dd.lo = b - (dd.hi - a);
+ return dd;
+}
+
+static FFTS_INLINE struct ffts_dd_t
+ffts_dd_add_dd(const struct ffts_dd_t a,
+ const struct ffts_dd_t b)
+{
+ struct ffts_dd_t t1 = ffts_dd_add_dd_unnormalized(a, b);
+ return ffts_dd_add(t1.hi, t1.lo);
+}
+
+static FFTS_INLINE struct ffts_dd_t
+ffts_dd_add_dd_unnormalized(const struct ffts_dd_t a,
+ const struct ffts_dd_t b)
+{
+ struct ffts_dd_t dd;
+ double e1;
+ dd.hi = a.hi + b.hi;
+ e1 = dd.hi - a.hi;
+ dd.lo = ((a.hi - (dd.hi - e1)) + (b.hi - e1)) + (a.lo + b.lo);
+ return dd;
+}
+
+static FFTS_INLINE struct ffts_dd_t
+ffts_dd_mul(const double a, const double b)
+{
+ struct ffts_dd_t dd;
+ struct ffts_dd_t t1 = ffts_dd_split(a);
+ struct ffts_dd_t t2 = ffts_dd_split(b);
+ dd.hi = a * b;
+ dd.lo = (t1.hi * t2.hi - dd.hi);
+ dd.lo += (t1.hi * t2.lo + t1.lo * t2.hi);
+ dd.lo += t1.lo * t2.lo;
+ return dd;
+}
+
+static FFTS_INLINE struct ffts_dd_t
+ffts_dd_mul_dd(const struct ffts_dd_t a,
+ const struct ffts_dd_t b)
+{
+ struct ffts_dd_t dd = ffts_dd_mul_dd_unnormalized(a, b);
+ return ffts_dd_add(dd.hi, dd.lo);
+}
+
+static FFTS_INLINE struct ffts_dd_t
+ffts_dd_mul_dd_unnormalized(const struct ffts_dd_t a,
+ const struct ffts_dd_t b)
+{
+ struct ffts_dd_t dd = ffts_dd_mul(a.hi, b.hi);
+ dd.lo += (a.hi * b.lo + a.lo * b.hi);
+ return dd;
+}
+
+static FFTS_INLINE struct ffts_dd_t
+ffts_dd_split(double a)
+{
+ /* 2^27+1 = 134217729 */
+ struct ffts_dd_t dd;
+ double t = 134217729.0 * a;
+ dd.hi = t - (t - a);
+ dd.lo = a - dd.hi;
+ return dd;
+}
+
+#if HAVE_SSE2
+static FFTS_INLINE struct ffts_dd2_t
+ffts_dd2_add_dd2_unnormalized(const struct ffts_dd2_t a,
+ const struct ffts_dd2_t b);
+
+static FFTS_INLINE struct ffts_dd2_t
+ffts_dd2_mul_dd2_unnormalized(const struct ffts_dd2_t a,
+ const struct ffts_dd2_t b);
+
+static FFTS_INLINE struct ffts_dd2_t
+ffts_dd2_split(__m128d a);
+
+static FFTS_INLINE struct ffts_dd2_t
+ffts_dd2_add(__m128d a, __m128d b)
+{
+ struct ffts_dd2_t dd2;
+ dd2.hi = _mm_add_pd(a, b);
+ dd2.lo = _mm_sub_pd(b, _mm_sub_pd(dd2.hi, a));
+ return dd2;
+}
+
+static FFTS_INLINE struct ffts_dd2_t
+ffts_dd2_add_dd2(const struct ffts_dd2_t a,
+ const struct ffts_dd2_t b)
+{
+ struct ffts_dd2_t t1 = ffts_dd2_add_dd2_unnormalized(a, b);
+ return ffts_dd2_add(t1.hi, t1.lo);
+}
+
+static FFTS_INLINE struct ffts_dd2_t
+ffts_dd2_add_dd2_unnormalized(const struct ffts_dd2_t a,
+ const struct ffts_dd2_t b)
+{
+ struct ffts_dd2_t dd2;
+ __m128d e1;
+ dd2.hi = _mm_add_pd(a.hi, b.hi);
+ e1 = _mm_sub_pd(dd2.hi, a.hi);
+ dd2.lo = _mm_add_pd(_mm_add_pd(_mm_sub_pd(a.hi, _mm_sub_pd(dd2.hi, e1)),
+ _mm_sub_pd(b.hi, e1)), _mm_add_pd(a.lo, b.lo));
+ return dd2;
+}
+
+static FFTS_INLINE struct ffts_dd2_t
+ffts_dd2_mul(const __m128d a, const __m128d b)
+{
+ struct ffts_dd2_t dd2;
+ struct ffts_dd2_t t1 = ffts_dd2_split(a);
+ struct ffts_dd2_t t2 = ffts_dd2_split(b);
+ dd2.hi = _mm_mul_pd(a, b);
+ dd2.lo = _mm_add_pd(_mm_add_pd(_mm_sub_pd(
+ _mm_mul_pd(t1.hi, t2.hi), dd2.hi),
+ _mm_add_pd(_mm_mul_pd(t1.hi, t2.lo),
+ _mm_mul_pd(t1.lo, t2.hi))),
+ _mm_mul_pd(t1.lo, t2.lo));
+ return dd2;
+}
+
+static FFTS_INLINE struct ffts_dd2_t
+ffts_dd2_mul_dd2(const struct ffts_dd2_t a,
+ const struct ffts_dd2_t b)
+{
+ struct ffts_dd2_t dd2 = ffts_dd2_mul_dd2_unnormalized(a, b);
+ return ffts_dd2_add(dd2.hi, dd2.lo);
+}
+
+static FFTS_INLINE struct ffts_dd2_t
+ffts_dd2_mul_dd2_unnormalized(const struct ffts_dd2_t a,
+ const struct ffts_dd2_t b)
+{
+ struct ffts_dd2_t dd2 = ffts_dd2_mul(a.hi, b.hi);
+ dd2.lo = _mm_add_pd(dd2.lo, _mm_add_pd(
+ _mm_mul_pd(a.hi, b.lo), _mm_mul_pd(a.lo, b.hi)));
+ return dd2;
+}
+
+static FFTS_INLINE struct ffts_dd2_t
+ffts_dd2_split(__m128d a)
+{
+ /* 2^27+1 = 134217729 */
+ struct ffts_dd2_t dd2;
+ __m128d t = _mm_mul_pd(a, _mm_set1_pd(134217729.0));
+ dd2.hi = _mm_sub_pd(t, _mm_sub_pd(t, a));
+ dd2.lo = _mm_sub_pd(a, dd2.hi);
+ return dd2;
+}
+#endif /* HAVE_SSE2 */
+
+#endif /* FFTS_DD_H */
diff --git a/src/ffts_trig.c b/src/ffts_trig.c
index 624d2c3..1ca9c98 100644
--- a/src/ffts_trig.c
+++ b/src/ffts_trig.c
@@ -31,6 +31,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "ffts_trig.h"
+#include "ffts_dd.h"
/* 1/(2*cos(pow(2,-p)*pi)) */
static const FFTS_ALIGN(16) unsigned int half_secant[132] = {
@@ -286,6 +287,151 @@ exit:
return 0;
}
+#if HAVE_SSE2
+int
+ffts_generate_cosine_sine_pow2_64f(ffts_cpx_64f *const table, int table_size)
+{
+ static const __m128d sign_swap = { 0.0, -0.0 };
+ const struct ffts_dd2_t *FFTS_RESTRICT ct;
+ const double *FFTS_RESTRICT hs;
+ struct ffts_dd2_t FFTS_ALIGN(16) w[32];
+ struct ffts_dd2_t FFTS_ALIGN(16) h[32];
+ int i, log_2, offset;
+
+ /* size must be a power of two */
+ if (!table || !table_size || (table_size & (table_size - 1))) {
+ return -1;
+ }
+
+ /* the first */
+ table[0][0] = 1.0;
+ table[0][1] = -0.0;
+
+ if (FFTS_UNLIKELY(table_size == 1)) {
+ goto exit;
+ }
+
+ if (FFTS_UNLIKELY(table_size == 2)) {
+ /* skip over */
+ i = 1;
+ goto mid_point;
+ }
+
+ /* calculate table offset */
+ FFTS_ASSUME(table_size/2 > 1);
+ log_2 = ffts_ctzl(table_size);
+ FFTS_ASSUME(log_2 > 1);
+ offset = 32 - log_2;
+ ct = (const struct ffts_dd2_t*)
+ FFTS_ASSUME_ALIGNED_32(&cos_sin_pi_table[8 * offset]);
+ hs = (const double*) &half_secant[4 * offset];
+
+ /* initialize from lookup table */
+ for (i = 0; i <= log_2; i++) {
+ w[i] = ct[i];
+
+ /* duplicate the high and low parts */
+ h[i].hi = _mm_set1_pd(hs[2*i + 0]);
+ h[i].lo = _mm_set1_pd(hs[2*i + 1]);
+ }
+
+ /* generate sine and cosine tables with maximum error less than 0.5 ULP */
+ for (i = 1; i < table_size/2; i++) {
+ /* calculate trailing zeros in index */
+ log_2 = ffts_ctzl(i);
+
+ /* result of ffts_dd_mul_dd is normalized */
+ _mm_store_pd((double*) &table[i + 0],
+ _mm_or_pd(w[log_2].hi, sign_swap));
+ _mm_store_pd((double*) &table[table_size - i],
+ _mm_or_pd(_mm_shuffle_pd(w[log_2].hi, w[log_2].hi, 1), sign_swap));
+
+ /* skip and find next trailing zero */
+ offset = (log_2 + 2 + ffts_ctzl(~i >> (log_2 + 2)));
+ w[log_2] = ffts_dd2_mul_dd2(h[log_2],
+ ffts_dd2_add_dd2_unnormalized(w[log_2 + 1], w[offset]));
+ }
+
+mid_point:
+ table[i][0] = 0.707106781186547524;
+ table[i][1] = -0.707106781186547524;
+
+exit:
+ return 0;
+}
+#else
+int
+ffts_generate_cosine_sine_pow2_64f(ffts_cpx_64f *const table, int table_size)
+{
+ const struct ffts_dd_t *FFTS_RESTRICT ct;
+ const struct ffts_dd_t *FFTS_RESTRICT hs;
+ struct ffts_dd_t FFTS_ALIGN(16) w[32][2];
+ int i, log_2, offset;
+
+ /* size must be a power of two */
+ if (!table || !table_size || (table_size & (table_size - 1))) {
+ return -1;
+ }
+
+ /* the first */
+ table[0][0] = 1.0;
+ table[0][1] = -0.0;
+
+ if (FFTS_UNLIKELY(table_size == 1)) {
+ goto exit;
+ }
+
+ if (FFTS_UNLIKELY(table_size == 2)) {
+ /* skip over */
+ i = 1;
+ goto mid_point;
+ }
+
+ /* calculate table offset */
+ FFTS_ASSUME(table_size/2 > 1);
+ log_2 = ffts_ctzl(table_size);
+ FFTS_ASSUME(log_2 > 1);
+ offset = 32 - log_2;
+ ct = (const struct ffts_dd_t*)
+ FFTS_ASSUME_ALIGNED_32(&cos_sin_pi_table[8 * offset]);
+ hs = (const struct ffts_dd_t*) &half_secant[4 * offset];
+
+ /* initialize from lookup table */
+ for (i = 0; i <= log_2; i++) {
+ w[i][0].hi = ct[2*i + 0].hi;
+ w[i][0].lo = ct[2*i + 1].hi;
+ w[i][1].hi = ct[2*i + 0].lo;
+ w[i][1].lo = ct[2*i + 1].lo;
+ }
+
+ /* generate sine and cosine tables with maximum error less than 0.5 ULP */
+ for (i = 1; i < table_size/2; i++) {
+ /* calculate trailing zeros in index */
+ log_2 = ffts_ctzl(i);
+
+ /* result of ffts_dd_mul_dd is normalized */
+ table[i + 0][0] = w[log_2][0].hi;
+ table[i + 0][1] = -w[log_2][1].hi;
+ table[table_size - i][0] = w[log_2][1].hi;
+ table[table_size - i][1] = -w[log_2][0].hi;
+
+ /* skip and find next trailing zero */
+ offset = (log_2 + 2 + ffts_ctzl(~i >> (log_2 + 2)));
+ w[log_2][0] = ffts_dd_mul_dd(hs[log_2],
+ ffts_dd_add_dd_unnormalized(w[log_2 + 1][0], w[offset][0]));
+ w[log_2][1] = ffts_dd_mul_dd(hs[log_2],
+ ffts_dd_add_dd_unnormalized(w[log_2 + 1][1], w[offset][1]));
+ }
+
+mid_point:
+ table[i][0] = 0.707106781186547524;
+ table[i][1] = -0.707106781186547524;
+
+exit:
+ return 0;
+}
+#endif
+
int
ffts_generate_table_1d_real_32f(struct _ffts_plan_t *const p,
int sign,
diff --git a/src/ffts_trig.h b/src/ffts_trig.h
index cfed2fb..0b22738 100644
--- a/src/ffts_trig.h
+++ b/src/ffts_trig.h
@@ -46,6 +46,9 @@ int
ffts_generate_cosine_sine_pow2_32f(ffts_cpx_32f *const table, int table_size);
int
+ffts_generate_cosine_sine_pow2_64f(ffts_cpx_64f *const table, int table_size);
+
+int
ffts_generate_table_1d_real_32f(struct _ffts_plan_t *const p,
int sign,
int invert);
OpenPOWER on IntegriCloud