summaryrefslogtreecommitdiffstats
path: root/src/ffts_trig.c
diff options
context:
space:
mode:
authorJukka Ojanen <jukka.ojanen@linkotec.net>2015-09-16 17:52:55 +0300
committerJukka Ojanen <jukka.ojanen@linkotec.net>2015-09-16 17:52:55 +0300
commitf4e533c64e0c005e567b3fa76bc3456facbe5484 (patch)
tree2ee9201c2f17045eaf204717dc7a8f722674395c /src/ffts_trig.c
parent7ae74d3d18c9113fffd6530c891add60046c7ee1 (diff)
downloadffts-f4e533c64e0c005e567b3fa76bc3456facbe5484.zip
ffts-f4e533c64e0c005e567b3fa76bc3456facbe5484.tar.gz
Add double-double arithmetic to generate "exact" double precision cosine and sine tables. Correct rounding verified using MPFR upto 2^28. SSE2 optimized ffts_generate_cosine_sine_pow2_64f takes twice as long as ffts_generate_cosine_sine_pow2_32f.
Diffstat (limited to 'src/ffts_trig.c')
-rw-r--r--src/ffts_trig.c146
1 files changed, 146 insertions, 0 deletions
diff --git a/src/ffts_trig.c b/src/ffts_trig.c
index 624d2c3..1ca9c98 100644
--- a/src/ffts_trig.c
+++ b/src/ffts_trig.c
@@ -31,6 +31,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "ffts_trig.h"
+#include "ffts_dd.h"
/* 1/(2*cos(pow(2,-p)*pi)) */
static const FFTS_ALIGN(16) unsigned int half_secant[132] = {
@@ -286,6 +287,151 @@ exit:
return 0;
}
+#if HAVE_SSE2
+int
+ffts_generate_cosine_sine_pow2_64f(ffts_cpx_64f *const table, int table_size)
+{
+ static const __m128d sign_swap = { 0.0, -0.0 };
+ const struct ffts_dd2_t *FFTS_RESTRICT ct;
+ const double *FFTS_RESTRICT hs;
+ struct ffts_dd2_t FFTS_ALIGN(16) w[32];
+ struct ffts_dd2_t FFTS_ALIGN(16) h[32];
+ int i, log_2, offset;
+
+ /* size must be a power of two */
+ if (!table || !table_size || (table_size & (table_size - 1))) {
+ return -1;
+ }
+
+ /* the first */
+ table[0][0] = 1.0;
+ table[0][1] = -0.0;
+
+ if (FFTS_UNLIKELY(table_size == 1)) {
+ goto exit;
+ }
+
+ if (FFTS_UNLIKELY(table_size == 2)) {
+ /* skip over */
+ i = 1;
+ goto mid_point;
+ }
+
+ /* calculate table offset */
+ FFTS_ASSUME(table_size/2 > 1);
+ log_2 = ffts_ctzl(table_size);
+ FFTS_ASSUME(log_2 > 1);
+ offset = 32 - log_2;
+ ct = (const struct ffts_dd2_t*)
+ FFTS_ASSUME_ALIGNED_32(&cos_sin_pi_table[8 * offset]);
+ hs = (const double*) &half_secant[4 * offset];
+
+ /* initialize from lookup table */
+ for (i = 0; i <= log_2; i++) {
+ w[i] = ct[i];
+
+ /* duplicate the high and low parts */
+ h[i].hi = _mm_set1_pd(hs[2*i + 0]);
+ h[i].lo = _mm_set1_pd(hs[2*i + 1]);
+ }
+
+ /* generate sine and cosine tables with maximum error less than 0.5 ULP */
+ for (i = 1; i < table_size/2; i++) {
+ /* calculate trailing zeros in index */
+ log_2 = ffts_ctzl(i);
+
+ /* result of ffts_dd_mul_dd is normalized */
+ _mm_store_pd((double*) &table[i + 0],
+ _mm_or_pd(w[log_2].hi, sign_swap));
+ _mm_store_pd((double*) &table[table_size - i],
+ _mm_or_pd(_mm_shuffle_pd(w[log_2].hi, w[log_2].hi, 1), sign_swap));
+
+ /* skip and find next trailing zero */
+ offset = (log_2 + 2 + ffts_ctzl(~i >> (log_2 + 2)));
+ w[log_2] = ffts_dd2_mul_dd2(h[log_2],
+ ffts_dd2_add_dd2_unnormalized(w[log_2 + 1], w[offset]));
+ }
+
+mid_point:
+ table[i][0] = 0.707106781186547524;
+ table[i][1] = -0.707106781186547524;
+
+exit:
+ return 0;
+}
+#else
+int
+ffts_generate_cosine_sine_pow2_64f(ffts_cpx_64f *const table, int table_size)
+{
+ const struct ffts_dd_t *FFTS_RESTRICT ct;
+ const struct ffts_dd_t *FFTS_RESTRICT hs;
+ struct ffts_dd_t FFTS_ALIGN(16) w[32][2];
+ int i, log_2, offset;
+
+ /* size must be a power of two */
+ if (!table || !table_size || (table_size & (table_size - 1))) {
+ return -1;
+ }
+
+ /* the first */
+ table[0][0] = 1.0;
+ table[0][1] = -0.0;
+
+ if (FFTS_UNLIKELY(table_size == 1)) {
+ goto exit;
+ }
+
+ if (FFTS_UNLIKELY(table_size == 2)) {
+ /* skip over */
+ i = 1;
+ goto mid_point;
+ }
+
+ /* calculate table offset */
+ FFTS_ASSUME(table_size/2 > 1);
+ log_2 = ffts_ctzl(table_size);
+ FFTS_ASSUME(log_2 > 1);
+ offset = 32 - log_2;
+ ct = (const struct ffts_dd_t*)
+ FFTS_ASSUME_ALIGNED_32(&cos_sin_pi_table[8 * offset]);
+ hs = (const struct ffts_dd_t*) &half_secant[4 * offset];
+
+ /* initialize from lookup table */
+ for (i = 0; i <= log_2; i++) {
+ w[i][0].hi = ct[2*i + 0].hi;
+ w[i][0].lo = ct[2*i + 1].hi;
+ w[i][1].hi = ct[2*i + 0].lo;
+ w[i][1].lo = ct[2*i + 1].lo;
+ }
+
+ /* generate sine and cosine tables with maximum error less than 0.5 ULP */
+ for (i = 1; i < table_size/2; i++) {
+ /* calculate trailing zeros in index */
+ log_2 = ffts_ctzl(i);
+
+ /* result of ffts_dd_mul_dd is normalized */
+ table[i + 0][0] = w[log_2][0].hi;
+ table[i + 0][1] = -w[log_2][1].hi;
+ table[table_size - i][0] = w[log_2][1].hi;
+ table[table_size - i][1] = -w[log_2][0].hi;
+
+ /* skip and find next trailing zero */
+ offset = (log_2 + 2 + ffts_ctzl(~i >> (log_2 + 2)));
+ w[log_2][0] = ffts_dd_mul_dd(hs[log_2],
+ ffts_dd_add_dd_unnormalized(w[log_2 + 1][0], w[offset][0]));
+ w[log_2][1] = ffts_dd_mul_dd(hs[log_2],
+ ffts_dd_add_dd_unnormalized(w[log_2 + 1][1], w[offset][1]));
+ }
+
+mid_point:
+ table[i][0] = 0.707106781186547524;
+ table[i][1] = -0.707106781186547524;
+
+exit:
+ return 0;
+}
+#endif
+
int
ffts_generate_table_1d_real_32f(struct _ffts_plan_t *const p,
int sign,
OpenPOWER on IntegriCloud