From 3c30b9e7c87a56aec5a47a100806844e2950cf86 Mon Sep 17 00:00:00 2001
From: "Jesse W. Towner" <jessetowner@lavabit.com>
Date: Mon, 11 Feb 2013 04:10:00 -0800
Subject: Added optimized versions of sin and cos.

The new functions have been added to the mathfuncs template class,
and are named vml_sin_chebyshev_single, vml_sin_chebyshev_double,
vml_cos_chebyshev_single, and vml_cos_chebyshev_double. The
corresponding sin and cos member functions in the vector template
structs have been updated to call into the new implementations.

These functions use float optimized minimaxed Chebyshev polynomial
approximations. They have good relative error distributions for
IEEE-754 floating point numbers, as the highest contributing
coefficient is selected to precisely map to either a 32-bit or
64-bit IEEE number for the _single and _double function variants
respectively.

The _single variants produce approximately ~30-bits of precision
in the mantissa, and the _double variants produce around ~60-bits,
which is more than enough to produce accurate values.

The vml_tan function hasn't been updated, so it calls both sin
and cos as it used to, and thus relies on the compiler to factor
out common code. It's possible to implement a sincos function
using these polynomials that interleaves the fmas, and since
the fma instructions in both the sin and cos paths don't have
any dependencies on one another, one of the paths is computed
for essentially free on x86-64 platforms due to instruction
parallelism. Alternatiely, tan can be implemented in terms of
a specifically optimized Chebyshev rational function with good
performance and properties.
---
 mathfuncs_base.h  |   4 ++
 mathfuncs_sin.h   | 154 ++++++++++++++++++++++++++++++++++++++++++++++++++++--
 vec_double.h      |   4 +-
 vec_double_avx.h  |   4 +-
 vec_double_sse2.h |   4 +-
 vec_float.h       |   4 +-
 vec_float_avx.h   |   4 +-
 vec_float_sse2.h  |   4 +-
 8 files changed, 165 insertions(+), 17 deletions(-)
diff --git a/mathfuncs_base.h b/mathfuncs_base.h
index ea552fc..99ff9fc 100644
--- a/mathfuncs_base.h
+++ b/mathfuncs_base.h
@@ -94,6 +94,10 @@ namespace vecmathlib {
     static realvec_t vml_cos(realvec_t x);
     static realvec_t vml_sin(realvec_t x);
     static realvec_t vml_tan(realvec_t x);
+    static realvec_t vml_cos_chebyshev_single(realvec_t x);
+    static realvec_t vml_cos_chebyshev_double(realvec_t x);
+    static realvec_t vml_sin_chebyshev_single(realvec_t x);
+    static realvec_t vml_sin_chebyshev_double(realvec_t x);
     
     // sinh
     static realvec_t vml_cosh(realvec_t x);
diff --git a/mathfuncs_sin.h b/mathfuncs_sin.h
index 2079603..107e6eb 100644
--- a/mathfuncs_sin.h
+++ b/mathfuncs_sin.h
@@ -55,13 +55,11 @@ namespace vecmathlib {
     
     return r;
   }
-  
-  
-  
+
   template<typename realvec_t>
   realvec_t mathfuncs<realvec_t>::vml_cos(realvec_t x)
   {
-    return sin(x + RV(M_PI_2));
+    return vml_sin(x + RV(M_PI_2));
   }
   
   template<typename realvec_t>
@@ -92,7 +90,153 @@ namespace vecmathlib {
     
     return r;
   }
-  
+
+  template<typename realvec_t>
+  realvec_t mathfuncs<realvec_t>::vml_sin_chebyshev_single(realvec_t x)
+  {
+    typename realvec_t::scalar_t const eps __attribute__((__unused__)) = 0.000001;
+    realvec_t r, s, t;
+
+    // Reduce range: sin(x) = sin(x + 2PI)
+    t = x + RV(M_PI);
+    t = t * RV(1.0 / (2.0 * M_PI));
+    t = floor(t);
+    t = x - (t * RV(2.0 * M_PI));
+    assert(all(t >= RV(-(1.0 + eps) * M_PI) && t <= RV((1.0 + eps) * M_PI)));
+
+    // Reduce range: -sin(x) = -sin(-x)
+    s = copysign(RV(1.0), t);
+    t = fabs(t);
+
+    // Reduce range: sin(x) = sin(pi-x)
+    x = ifthen(t > RV(M_PI_2), RV(M_PI) - t, t);
+    assert(all(x >= RV(0.0 - eps) && x <= RV(M_PI_2)));
+
+    // Evaluate Chebyshev polynomial expansion r = s*T(x)
+    // where T(x) = x + c1*x^3 + c2*x^5 + c3*x^7 + ...
+    t = x * x;
+    x = x * s;
+    s = t * x;
+    r = fma(RV(-2.2636462059494963e-09), t, RV(1.9761453001926614e-08));
+    r = fma(r, t, RV(-9.1621870444937360e-08));
+    r = fma(r, t, RV(2.8673509297532770e-06));
+    r = fma(r, t, RV(-1.9850777498822076e-04));
+    r = fma(r, t, RV(8.3333708144611930e-03));
+    r = fma(r, t, RV(-1.6666667163372040e-01));
+    r = fma(r, s, x);
+    return r;
+  }
+
+  template<typename realvec_t>
+  realvec_t mathfuncs<realvec_t>::vml_sin_chebyshev_double(realvec_t x)
+  {
+    typename realvec_t::scalar_t const eps __attribute__((__unused__)) = 0.000001;
+    realvec_t r, s, t;
+
+    // Reduce range: sin(x) = sin(x + 2PI)
+    t = x + RV(M_PI);
+    t = t * RV(1.0 / (2.0 * M_PI));
+    t = floor(t);
+    t = x - (t * RV(2.0 * M_PI));
+    assert(all(t >= RV(-(1.0 + eps) * M_PI) && t <= RV((1.0 + eps) * M_PI)));
+
+    // Reduce range: -sin(x) = -sin(-x)
+    s = copysign(RV(1.0), t);
+    t = fabs(t);
+
+    // Reduce range: sin(x) = sin(pi-x)
+    x = ifthen(t > RV(M_PI_2), RV(M_PI) - t, t);
+    assert(all(x >= RV(0.0 - eps) && x <= RV(M_PI_2)));
+
+    // Evaluate Chebyshev polynomial expansion r = s*T(x)
+    // where T(x) = x + c1*x^3 + c2*x^5 + c3*x^7 + ...
+    t = x * x;
+    x = x * s;
+    s = t * x;
+    r = fma(RV(1.1888871779171205e-23), t, RV(-1.6213346583948200e-21));
+    r = fma(r, t, RV(9.4674830124704450e-19));
+    r = fma(r, t, RV(-3.6586864533554100e-16));
+    r = fma(r, t, RV(-7.5815669263036780e-13));
+    r = fma(r, t, RV(1.6058175109947732e-10));
+    r = fma(r, t, RV(-2.5052101017560582e-08));
+    r = fma(r, t, RV(2.7557319185249400e-06));
+    r = fma(r, t, RV(-1.9841269841152493e-04));
+    r = fma(r, t, RV(8.3333333333331560e-03));
+    r = fma(r, t, RV(-1.6666666666666666e-01));
+    r = fma(r, s, x);
+    return r;
+  }
+
+  template<typename realvec_t>
+  realvec_t mathfuncs<realvec_t>::vml_cos_chebyshev_single(realvec_t x)
+  {
+    typename realvec_t::boolvec_t m;
+    realvec_t r, s, t;
+
+    // Reduce range: cos(x) = cos(x + 2PI)
+    t = x + RV(M_PI);
+    t = t * RV(1.0 / (2.0 * M_PI));
+    t = floor(t);
+    t = x - (t * RV(2.0 * M_PI));
+
+    // Reduce range: cos(x) = sin(PI/2 - x)
+    t = fabs(t);
+    m = t > RV(M_PI_2);
+    x = RV(M_PI) - t;
+    s = ifthen(m, RV(-1.0), RV(1.0));
+    x = ifthen(m, x, t);
+
+    // Evaluate Chebyshev polynomial s*T(x) where
+    // T(x) = 1 + c1*x^2 + c2*x^4 + c3*x^6 + ...
+    t = x * x;
+    x = t * s;
+    r = fma(RV(-1.0986536764839979e-11), t, RV(2.0856702467901100e-09));
+    r = fma(r, t, RV(-2.7556891974788950e-07));
+    r = fma(r, t, RV(2.4801582421938170e-05));
+    r = fma(r, t, RV(-1.3888888860984269e-03));
+    r = fma(r, t, RV(4.1666666666056330e-02));
+    r = fma(r, t, RV(-5.0000000000000000e-01));
+    r = fma(r, x, s);
+    return r;
+  }
+
+  template<typename realvec_t>
+  realvec_t mathfuncs<realvec_t>::vml_cos_chebyshev_double(realvec_t x)
+  {
+    typename realvec_t::boolvec_t m;
+    realvec_t r, s, t;
+
+    // Reduce range: cos(x) = cos(x + 2PI)
+    t = x + RV(M_PI);
+    t = t * RV(1.0 / (2.0 * M_PI));
+    t = floor(t);
+    t = x - (t * RV(2.0 * M_PI));
+
+    // Reduce range: cos(x) = sin(PI/2 - x)
+    t = fabs(t);
+    m = t > RV(M_PI_2);
+    x = RV(M_PI) - t;
+    s = ifthen(m, RV(-1.0), RV(1.0));
+    x = ifthen(m, x, t);
+
+    // Evaluate Chebyshev polynomial s*T(x) where
+    // T(x) = 1 + c1*x^2 + c2*x^4 + c3*x^6 + ...
+    t = x * x;
+    x = t * s;
+    r = fma(RV(-8.6512994843471700e-22), t, RV(4.1086675770914360e-19));
+    r = fma(r, t, RV(-1.5619143199049570e-16));
+    r = fma(r, t, RV(4.7794771764282040e-14));
+    r = fma(r, t, RV(-1.1470745595224050e-11));
+    r = fma(r, t, RV(2.0876756987841530e-09));
+    r = fma(r, t, RV(-2.7557319223985710e-07));
+    r = fma(r, t, RV(2.4801587301587300e-05));
+    r = fma(r, t, RV(-1.3888888888888890e-03));
+    r = fma(r, t, RV(4.1666666666666664e-02));
+    r = fma(r, t, RV(-5.0000000000000000e-01));
+    r = fma(r, x, s);
+    return r;
+  }
+
 }; // namespace vecmathlib
 
 #endif  // #ifndef MATHFUNCS_SIN_H
diff --git a/vec_double.h b/vec_double.h
index 2392756..d19126b 100644
--- a/vec_double.h
+++ b/vec_double.h
@@ -290,7 +290,7 @@ namespace vecmathlib {
     realvec atanh() const { return MF::vml_atanh(*this); }
     realvec ceil() const { return MF::vml_ceil(*this); }
     realvec copysign(realvec y) const { return MF::vml_copysign(*this, y); }
-    realvec cos() const { return MF::vml_cos(*this); }
+    realvec cos() const { return MF::vml_cos_chebyshev_double(*this); }
     realvec cosh() const { return MF::vml_cosh(*this); }
     realvec exp() const { return MF::vml_exp(*this); }
     realvec exp10() const { return MF::vml_exp10(*this); }
@@ -319,7 +319,7 @@ namespace vecmathlib {
     realvec rsqrt() const { return MF::vml_rsqrt(*this); }
     realvec scalbn(intvec_t n) const { return MF::vml_scalbn(*this, n); }
     boolvec_t signbit() const { return MF::vml_signbit(*this); }
-    realvec sin() const { return MF::vml_sin(*this); }
+    realvec sin() const { return MF::vml_sin_chebyshev_double(*this); }
     realvec sinh() const { return MF::vml_sinh(*this); }
     realvec sqrt() const { return MF::vml_sqrt(*this); }
     realvec tan() const { return MF::vml_tan(*this); }
diff --git a/vec_double_avx.h b/vec_double_avx.h
index 214959b..5ecb2a5 100644
--- a/vec_double_avx.h
+++ b/vec_double_avx.h
@@ -598,7 +598,7 @@ namespace vecmathlib {
     realvec atanh() const { return MF::vml_atanh(*this); }
     realvec ceil() const { return _mm256_ceil_pd(v); }
     realvec copysign(realvec y) const { return MF::vml_copysign(*this, y); }
-    realvec cos() const { return MF::vml_cos(*this); }
+    realvec cos() const { return MF::vml_cos_chebyshev_double(*this); }
     realvec cosh() const { return MF::vml_cosh(*this); }
     realvec exp() const { return MF::vml_exp(*this); }
     realvec exp10() const { return MF::vml_exp10(*this); }
@@ -627,7 +627,7 @@ namespace vecmathlib {
     realvec rsqrt() const { return MF::vml_rsqrt(*this); }
     realvec scalbn(intvec_t n) const { return MF::vml_scalbn(*this, n); }
     boolvec_t signbit() const { return v; }
-    realvec sin() const { return MF::vml_sin(*this); }
+    realvec sin() const { return MF::vml_sin_chebyshev_double(*this); }
     realvec sinh() const { return MF::vml_sinh(*this); }
     realvec sqrt() const { return _mm256_sqrt_pd(v); }
     realvec tan() const { return MF::vml_tan(*this); }
diff --git a/vec_double_sse2.h b/vec_double_sse2.h
index 40c789e..636ef47 100644
--- a/vec_double_sse2.h
+++ b/vec_double_sse2.h
@@ -496,7 +496,7 @@ namespace vecmathlib {
     realvec atanh() const { return MF::vml_atanh(*this); }
     realvec ceil() const { return _mm_ceil_pd(v); }
     realvec copysign(realvec y) const { return MF::vml_copysign(*this, y); }
-    realvec cos() const { return MF::vml_cos(*this); }
+    realvec cos() const { return MF::vml_cos_chebyshev_double(*this); }
     realvec cosh() const { return MF::vml_cosh(*this); }
     realvec exp() const { return MF::vml_exp(*this); }
     realvec exp10() const { return MF::vml_exp10(*this); }
@@ -525,7 +525,7 @@ namespace vecmathlib {
     realvec rsqrt() const { return MF::vml_rsqrt(*this); }
     realvec scalbn(intvec_t n) const { return MF::vml_scalbn(*this, n); }
     boolvec_t signbit() const { return v; }
-    realvec sin() const { return MF::vml_sin(*this); }
+    realvec sin() const { return MF::vml_sin_chebyshev_double(*this); }
     realvec sinh() const { return MF::vml_sinh(*this); }
     realvec sqrt() const { return _mm_sqrt_pd(v); }
     realvec tan() const { return MF::vml_tan(*this); }
diff --git a/vec_float.h b/vec_float.h
index 5424b05..08bcd4f 100644
--- a/vec_float.h
+++ b/vec_float.h
@@ -290,7 +290,7 @@ namespace vecmathlib {
     realvec atanh() const { return MF::vml_atanh(*this); }
     realvec ceil() const { return MF::vml_ceil(*this); }
     realvec copysign(realvec y) const { return MF::vml_copysign(*this, y); }
-    realvec cos() const { return MF::vml_cos(*this); }
+    realvec cos() const { return MF::vml_cos_chebyshev_single(*this); }
     realvec cosh() const { return MF::vml_cosh(*this); }
     realvec exp() const { return MF::vml_exp(*this); }
     realvec exp10() const { return MF::vml_exp10(*this); }
@@ -319,7 +319,7 @@ namespace vecmathlib {
     realvec rsqrt() const { return MF::vml_rsqrt(*this); }
     realvec scalbn(intvec_t n) const { return MF::vml_scalbn(*this, n); }
     boolvec_t signbit() const { return MF::vml_signbit(*this); }
-    realvec sin() const { return MF::vml_sin(*this); }
+    realvec sin() const { return MF::vml_sin_chebyshev_single(*this); }
     realvec sinh() const { return MF::vml_sinh(*this); }
     realvec sqrt() const { return MF::vml_sqrt(*this); }
     realvec tan() const { return MF::vml_tan(*this); }
diff --git a/vec_float_avx.h b/vec_float_avx.h
index 08cdb08..ae6ab40 100644
--- a/vec_float_avx.h
+++ b/vec_float_avx.h
@@ -569,7 +569,7 @@ namespace vecmathlib {
     realvec atanh() const { return MF::vml_atanh(*this); }
     realvec ceil() const { return _mm256_ceil_ps(v); }
     realvec copysign(realvec y) const { return MF::vml_copysign(*this, y); }
-    realvec cos() const { return MF::vml_cos(*this); }
+    realvec cos() const { return MF::vml_cos_chebyshev_single(*this); }
     realvec cosh() const { return MF::vml_cosh(*this); }
     realvec exp() const { return MF::vml_exp(*this); }
     realvec exp10() const { return MF::vml_exp10(*this); }
@@ -610,7 +610,7 @@ namespace vecmathlib {
     }
     realvec scalbn(intvec_t n) const { return MF::vml_scalbn(*this, n); }
     boolvec_t signbit() const { return v; }
-    realvec sin() const { return MF::vml_sin(*this); }
+    realvec sin() const { return MF::vml_sin_chebyshev_single(*this); }
     realvec sinh() const { return MF::vml_sinh(*this); }
     realvec sqrt() const { return _mm256_sqrt_ps(v); }
     realvec tan() const { return MF::vml_tan(*this); }
diff --git a/vec_float_sse2.h b/vec_float_sse2.h
index 38c42b3..c870751 100644
--- a/vec_float_sse2.h
+++ b/vec_float_sse2.h
@@ -472,7 +472,7 @@ namespace vecmathlib {
     realvec atanh() const { return MF::vml_atanh(*this); }
     realvec ceil() const { return _mm_ceil_ps(v); }
     realvec copysign(realvec y) const { return MF::vml_copysign(*this, y); }
-    realvec cos() const { return MF::vml_cos(*this); }
+    realvec cos() const { return MF::vml_cos_chebyshev_single(*this); }
     realvec cosh() const { return MF::vml_cosh(*this); }
     realvec exp() const { return MF::vml_exp(*this); }
     realvec exp10() const { return MF::vml_exp10(*this); }
@@ -513,7 +513,7 @@ namespace vecmathlib {
     }
     realvec scalbn(intvec_t n) const { return MF::vml_scalbn(*this, n); }
     boolvec_t signbit() const { return v; }
-    realvec sin() const { return MF::vml_sin(*this); }
+    realvec sin() const { return MF::vml_sin_chebyshev_single(*this); }
     realvec sinh() const { return MF::vml_sinh(*this); }
     realvec sqrt() const { return _mm_sqrt_ps(v); }
     realvec tan() const { return MF::vml_tan(*this); }
-- 
cgit v1.1