diff options
author | Jukka Ojanen <jukka.ojanen@linkotec.net> | 2015-09-16 17:52:55 +0300 |
---|---|---|
committer | Jukka Ojanen <jukka.ojanen@linkotec.net> | 2015-09-16 17:52:55 +0300 |
commit | f4e533c64e0c005e567b3fa76bc3456facbe5484 (patch) | |
tree | 2ee9201c2f17045eaf204717dc7a8f722674395c | |
parent | 7ae74d3d18c9113fffd6530c891add60046c7ee1 (diff) | |
download | ffts-f4e533c64e0c005e567b3fa76bc3456facbe5484.zip ffts-f4e533c64e0c005e567b3fa76bc3456facbe5484.tar.gz |
Add double-double arithmetic to generate "exact" double precision cosine and sine tables. Correct rounding verified using MPFR upto 2^28. SSE2 optimized ffts_generate_cosine_sine_pow2_64f takes twice as long as ffts_generate_cosine_sine_pow2_32f.
-rw-r--r-- | src/ffts_dd.h | 230 | ||||
-rw-r--r-- | src/ffts_trig.c | 146 | ||||
-rw-r--r-- | src/ffts_trig.h | 3 |
3 files changed, 379 insertions, 0 deletions
diff --git a/src/ffts_dd.h b/src/ffts_dd.h new file mode 100644 index 0000000..e9402c6 --- /dev/null +++ b/src/ffts_dd.h @@ -0,0 +1,230 @@ +/* + +This file is part of FFTS -- The Fastest Fourier Transform in the South + +Copyright (c) 2015, Jukka Ojanen <jukka.ojanen@kolumbus.fi> + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: +* Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +* Neither the name of the organization nor the +names of its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef FFTS_DD_H +#define FFTS_DD_H + +#if defined (_MSC_VER) && (_MSC_VER >= 1020) +#pragma once +#endif + +#include "ffts_attributes.h" + +#if HAVE_SSE2 +#include <emmintrin.h> +#endif + +/* double-double number */ +struct ffts_dd_t +{ + double hi; + double lo; +}; + +#if HAVE_SSE2 +/* double-double vector */ +struct ffts_dd2_t { + __m128d hi; + __m128d lo; +}; +#endif + +static FFTS_INLINE struct ffts_dd_t +ffts_dd_add_dd_unnormalized(const struct ffts_dd_t a, + const struct ffts_dd_t b); + +static FFTS_INLINE struct ffts_dd_t +ffts_dd_mul_dd_unnormalized(const struct ffts_dd_t a, + const struct ffts_dd_t b); + +static FFTS_INLINE struct ffts_dd_t +ffts_dd_split(double a); + +/* aka quick-two-sum */ +static FFTS_INLINE struct ffts_dd_t +ffts_dd_add(double a, double b) +{ + struct ffts_dd_t dd; + dd.hi = a + b; + dd.lo = b - (dd.hi - a); + return dd; +} + +static FFTS_INLINE struct ffts_dd_t +ffts_dd_add_dd(const struct ffts_dd_t a, + const struct ffts_dd_t b) +{ + struct ffts_dd_t t1 = ffts_dd_add_dd_unnormalized(a, b); + return ffts_dd_add(t1.hi, t1.lo); +} + +static FFTS_INLINE struct ffts_dd_t +ffts_dd_add_dd_unnormalized(const struct ffts_dd_t a, + const struct ffts_dd_t b) +{ + struct ffts_dd_t dd; + double e1; + dd.hi = a.hi + b.hi; + e1 = dd.hi - a.hi; + dd.lo = ((a.hi - (dd.hi - e1)) + (b.hi - e1)) + (a.lo + b.lo); + return dd; +} + +static FFTS_INLINE struct ffts_dd_t +ffts_dd_mul(const double a, const double b) +{ + struct ffts_dd_t dd; + struct ffts_dd_t t1 = ffts_dd_split(a); + struct ffts_dd_t t2 = ffts_dd_split(b); + dd.hi = a * b; + dd.lo = (t1.hi * t2.hi - dd.hi); + dd.lo += (t1.hi * t2.lo + t1.lo * t2.hi); + dd.lo += t1.lo * t2.lo; + return dd; +} + +static FFTS_INLINE struct ffts_dd_t +ffts_dd_mul_dd(const struct ffts_dd_t a, + const struct ffts_dd_t b) +{ + struct ffts_dd_t dd = ffts_dd_mul_dd_unnormalized(a, b); + return ffts_dd_add(dd.hi, dd.lo); +} + +static FFTS_INLINE struct ffts_dd_t +ffts_dd_mul_dd_unnormalized(const struct ffts_dd_t a, + const struct ffts_dd_t b) +{ + struct ffts_dd_t dd = ffts_dd_mul(a.hi, b.hi); + dd.lo += (a.hi * b.lo + a.lo * b.hi); + return dd; +} + +static FFTS_INLINE struct ffts_dd_t +ffts_dd_split(double a) +{ + /* 2^27+1 = 134217729 */ + struct ffts_dd_t dd; + double t = 134217729.0 * a; + dd.hi = t - (t - a); + dd.lo = a - dd.hi; + return dd; +} + +#if HAVE_SSE2 +static FFTS_INLINE struct ffts_dd2_t +ffts_dd2_add_dd2_unnormalized(const struct ffts_dd2_t a, + const struct ffts_dd2_t b); + +static FFTS_INLINE struct ffts_dd2_t +ffts_dd2_mul_dd2_unnormalized(const struct ffts_dd2_t a, + const struct ffts_dd2_t b); + +static FFTS_INLINE struct ffts_dd2_t +ffts_dd2_split(__m128d a); + +static FFTS_INLINE struct ffts_dd2_t +ffts_dd2_add(__m128d a, __m128d b) +{ + struct ffts_dd2_t dd2; + dd2.hi = _mm_add_pd(a, b); + dd2.lo = _mm_sub_pd(b, _mm_sub_pd(dd2.hi, a)); + return dd2; +} + +static FFTS_INLINE struct ffts_dd2_t +ffts_dd2_add_dd2(const struct ffts_dd2_t a, + const struct ffts_dd2_t b) +{ + struct ffts_dd2_t t1 = ffts_dd2_add_dd2_unnormalized(a, b); + return ffts_dd2_add(t1.hi, t1.lo); +} + +static FFTS_INLINE struct ffts_dd2_t +ffts_dd2_add_dd2_unnormalized(const struct ffts_dd2_t a, + const struct ffts_dd2_t b) +{ + struct ffts_dd2_t dd2; + __m128d e1; + dd2.hi = _mm_add_pd(a.hi, b.hi); + e1 = _mm_sub_pd(dd2.hi, a.hi); + dd2.lo = _mm_add_pd(_mm_add_pd(_mm_sub_pd(a.hi, _mm_sub_pd(dd2.hi, e1)), + _mm_sub_pd(b.hi, e1)), _mm_add_pd(a.lo, b.lo)); + return dd2; +} + +static FFTS_INLINE struct ffts_dd2_t +ffts_dd2_mul(const __m128d a, const __m128d b) +{ + struct ffts_dd2_t dd2; + struct ffts_dd2_t t1 = ffts_dd2_split(a); + struct ffts_dd2_t t2 = ffts_dd2_split(b); + dd2.hi = _mm_mul_pd(a, b); + dd2.lo = _mm_add_pd(_mm_add_pd(_mm_sub_pd( + _mm_mul_pd(t1.hi, t2.hi), dd2.hi), + _mm_add_pd(_mm_mul_pd(t1.hi, t2.lo), + _mm_mul_pd(t1.lo, t2.hi))), + _mm_mul_pd(t1.lo, t2.lo)); + return dd2; +} + +static FFTS_INLINE struct ffts_dd2_t +ffts_dd2_mul_dd2(const struct ffts_dd2_t a, + const struct ffts_dd2_t b) +{ + struct ffts_dd2_t dd2 = ffts_dd2_mul_dd2_unnormalized(a, b); + return ffts_dd2_add(dd2.hi, dd2.lo); +} + +static FFTS_INLINE struct ffts_dd2_t +ffts_dd2_mul_dd2_unnormalized(const struct ffts_dd2_t a, + const struct ffts_dd2_t b) +{ + struct ffts_dd2_t dd2 = ffts_dd2_mul(a.hi, b.hi); + dd2.lo = _mm_add_pd(dd2.lo, _mm_add_pd( + _mm_mul_pd(a.hi, b.lo), _mm_mul_pd(a.lo, b.hi))); + return dd2; +} + +static FFTS_INLINE struct ffts_dd2_t +ffts_dd2_split(__m128d a) +{ + /* 2^27+1 = 134217729 */ + struct ffts_dd2_t dd2; + __m128d t = _mm_mul_pd(a, _mm_set1_pd(134217729.0)); + dd2.hi = _mm_sub_pd(t, _mm_sub_pd(t, a)); + dd2.lo = _mm_sub_pd(a, dd2.hi); + return dd2; +} +#endif /* HAVE_SSE2 */ + +#endif /* FFTS_DD_H */ diff --git a/src/ffts_trig.c b/src/ffts_trig.c index 624d2c3..1ca9c98 100644 --- a/src/ffts_trig.c +++ b/src/ffts_trig.c @@ -31,6 +31,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "ffts_trig.h" +#include "ffts_dd.h" /* 1/(2*cos(pow(2,-p)*pi)) */ static const FFTS_ALIGN(16) unsigned int half_secant[132] = { @@ -286,6 +287,151 @@ exit: return 0; } +#if HAVE_SSE2 +int +ffts_generate_cosine_sine_pow2_64f(ffts_cpx_64f *const table, int table_size) +{ + static const __m128d sign_swap = { 0.0, -0.0 }; + const struct ffts_dd2_t *FFTS_RESTRICT ct; + const double *FFTS_RESTRICT hs; + struct ffts_dd2_t FFTS_ALIGN(16) w[32]; + struct ffts_dd2_t FFTS_ALIGN(16) h[32]; + int i, log_2, offset; + + /* size must be a power of two */ + if (!table || !table_size || (table_size & (table_size - 1))) { + return -1; + } + + /* the first */ + table[0][0] = 1.0; + table[0][1] = -0.0; + + if (FFTS_UNLIKELY(table_size == 1)) { + goto exit; + } + + if (FFTS_UNLIKELY(table_size == 2)) { + /* skip over */ + i = 1; + goto mid_point; + } + + /* calculate table offset */ + FFTS_ASSUME(table_size/2 > 1); + log_2 = ffts_ctzl(table_size); + FFTS_ASSUME(log_2 > 1); + offset = 32 - log_2; + ct = (const struct ffts_dd2_t*) + FFTS_ASSUME_ALIGNED_32(&cos_sin_pi_table[8 * offset]); + hs = (const double*) &half_secant[4 * offset]; + + /* initialize from lookup table */ + for (i = 0; i <= log_2; i++) { + w[i] = ct[i]; + + /* duplicate the high and low parts */ + h[i].hi = _mm_set1_pd(hs[2*i + 0]); + h[i].lo = _mm_set1_pd(hs[2*i + 1]); + } + + /* generate sine and cosine tables with maximum error less than 0.5 ULP */ + for (i = 1; i < table_size/2; i++) { + /* calculate trailing zeros in index */ + log_2 = ffts_ctzl(i); + + /* result of ffts_dd_mul_dd is normalized */ + _mm_store_pd((double*) &table[i + 0], + _mm_or_pd(w[log_2].hi, sign_swap)); + _mm_store_pd((double*) &table[table_size - i], + _mm_or_pd(_mm_shuffle_pd(w[log_2].hi, w[log_2].hi, 1), sign_swap)); + + /* skip and find next trailing zero */ + offset = (log_2 + 2 + ffts_ctzl(~i >> (log_2 + 2))); + w[log_2] = ffts_dd2_mul_dd2(h[log_2], + ffts_dd2_add_dd2_unnormalized(w[log_2 + 1], w[offset])); + } + +mid_point: + table[i][0] = 0.707106781186547524; + table[i][1] = -0.707106781186547524; + +exit: + return 0; +} +#else +int +ffts_generate_cosine_sine_pow2_64f(ffts_cpx_64f *const table, int table_size) +{ + const struct ffts_dd_t *FFTS_RESTRICT ct; + const struct ffts_dd_t *FFTS_RESTRICT hs; + struct ffts_dd_t FFTS_ALIGN(16) w[32][2]; + int i, log_2, offset; + + /* size must be a power of two */ + if (!table || !table_size || (table_size & (table_size - 1))) { + return -1; + } + + /* the first */ + table[0][0] = 1.0; + table[0][1] = -0.0; + + if (FFTS_UNLIKELY(table_size == 1)) { + goto exit; + } + + if (FFTS_UNLIKELY(table_size == 2)) { + /* skip over */ + i = 1; + goto mid_point; + } + + /* calculate table offset */ + FFTS_ASSUME(table_size/2 > 1); + log_2 = ffts_ctzl(table_size); + FFTS_ASSUME(log_2 > 1); + offset = 32 - log_2; + ct = (const struct ffts_dd_t*) + FFTS_ASSUME_ALIGNED_32(&cos_sin_pi_table[8 * offset]); + hs = (const struct ffts_dd_t*) &half_secant[4 * offset]; + + /* initialize from lookup table */ + for (i = 0; i <= log_2; i++) { + w[i][0].hi = ct[2*i + 0].hi; + w[i][0].lo = ct[2*i + 1].hi; + w[i][1].hi = ct[2*i + 0].lo; + w[i][1].lo = ct[2*i + 1].lo; + } + + /* generate sine and cosine tables with maximum error less than 0.5 ULP */ + for (i = 1; i < table_size/2; i++) { + /* calculate trailing zeros in index */ + log_2 = ffts_ctzl(i); + + /* result of ffts_dd_mul_dd is normalized */ + table[i + 0][0] = w[log_2][0].hi; + table[i + 0][1] = -w[log_2][1].hi; + table[table_size - i][0] = w[log_2][1].hi; + table[table_size - i][1] = -w[log_2][0].hi; + + /* skip and find next trailing zero */ + offset = (log_2 + 2 + ffts_ctzl(~i >> (log_2 + 2))); + w[log_2][0] = ffts_dd_mul_dd(hs[log_2], + ffts_dd_add_dd_unnormalized(w[log_2 + 1][0], w[offset][0])); + w[log_2][1] = ffts_dd_mul_dd(hs[log_2], + ffts_dd_add_dd_unnormalized(w[log_2 + 1][1], w[offset][1])); + } + +mid_point: + table[i][0] = 0.707106781186547524; + table[i][1] = -0.707106781186547524; + +exit: + return 0; +} +#endif + int ffts_generate_table_1d_real_32f(struct _ffts_plan_t *const p, int sign, diff --git a/src/ffts_trig.h b/src/ffts_trig.h index cfed2fb..0b22738 100644 --- a/src/ffts_trig.h +++ b/src/ffts_trig.h @@ -46,6 +46,9 @@ int ffts_generate_cosine_sine_pow2_32f(ffts_cpx_32f *const table, int table_size); int +ffts_generate_cosine_sine_pow2_64f(ffts_cpx_64f *const table, int table_size); + +int ffts_generate_table_1d_real_32f(struct _ffts_plan_t *const p, int sign, int invert); |