// -*-C++-*-
// mathfuncs_sqrt.h

#ifndef MATHFUNCS_SQRT_H
#define MATHFUNCS_SQRT_H

#include "mathfuncs_base.h"

#include <cmath>


namespace vecmathlib {
  
  // Square root, evaluated lane-wise as x * rsqrt(x).
  //
  // x: vector of arguments.
  // Returns: vector of square roots. Lanes holding a zero return that zero
  // with its sign intact, so sqrt(-0.0) is -0.0 as IEEE 754 prescribes.
  //
  // NOTE(review): only zero is special-cased. For x = +inf the product
  // x * rsqrt(x) is inf * 0 (= NaN) if rsqrt returns the exact limit, and
  // negative x is not handled here either -- confirm whether callers need
  // those cases.
  template<typename realvec_t>
  realvec_t mathfuncs<realvec_t>::vml_sqrt(realvec_t x)
  {
#if 0
    // Disabled reference implementation: Newton iteration directly on
    // sqrt. It declares its own `r`, so re-enabling it requires removing
    // the rsqrt-based code after the #endif.
    
    // Handle special case: zero
    boolvec_t is_zero = x <= RV(0.0);
    x = ifthen(is_zero, RV(1.0), x);
    
    // Initial guess
    VML_ASSERT(all(x > RV(0.0)));
#if 0
    intvec_t ilogb_x = ilogb(x);
    realvec_t r = ldexp(RV(M_SQRT2), ilogb_x >> 1);
    // TODO: divide by M_SQRT2 if ilogb_x % 2 == 1 ?
#else
    real_t correction =
      vml_std::ldexp(R(FP::exponent_offset & 1 ? M_SQRT2 : 1.0),
                     FP::exponent_offset >> 1);
    realvec_t r = lsr(x.as_int(), 1).as_float() * RV(correction);
#endif
    
    // Iterate
    // nmax iterations give an accuracy of 2^nmax binary digits. 4
    // iterations suffice for double precision with its 53 digits.
    int const nmax = sizeof(real_t)==4 ? 2 : 4;
    for (int n=0; n<nmax; ++n) {
      // Step
      VML_ASSERT(all(r > RV(0.0)));
      // Newton method:
      // Solve   f(r) = 0   for   f(r) = x - r^2
      //    r <- r - f(r) / f'(r)
      //    r <- (r + x / r) / 2
      r = RV(0.5) * (r + x / r);
    }
    
    // Handle special case: zero
    r = ifthen(is_zero, RV(0.0), r);
#endif
    
    // sqrt(x) = x / sqrt(x) = x * rsqrt(x)
    realvec_t r = x * rsqrt(x);
    // Handle special case: zero. The product above is not meaningful
    // there, so substitute x itself: it already is the correct result,
    // including the sign of a negative zero (-0.0 == 0.0 compares true).
    r = ifthen(x == RV(0.0), x, r);
    
    return r;
  }
  
  
  // Cube root, evaluated lane-wise.
  //
  // x: vector of arguments; negative values are valid.
  // Returns: vector of real cube roots, carrying the sign of x.
  //
  // The cube root is defined for negative arguments (cbrt(-8) == -2),
  // whereas a power with the non-integer exponent 1/3 is not. The power is
  // therefore taken of |x| and the sign is put back afterwards.
  //
  // TODO: Use "Halley's method with cubic convergence":
  // <http://press.mcs.anl.gov/gswjanuary12/files/2012/01/Optimizing-Single-Node-Performance-on-BlueGene.pdf>
  template<typename realvec_t>
  realvec_t mathfuncs<realvec_t>::vml_cbrt(realvec_t x)
  {
    // Only lanes with x < 0 are rewritten; 0 - x is exactly |x| there.
    // Zeros (of either sign) and NaNs do not compare less than zero and
    // reach pow() unchanged.
    realvec_t r = pow(ifthen(x < RV(0.0), RV(0.0) - x, x), RV(1.0/3.0));
    // Restore the sign in the lanes that were negated above.
    return ifthen(x < RV(0.0), RV(0.0) - r, r);
  }
  
  
  // Reciprocal square root 1/sqrt(x), evaluated lane-wise.
  //
  // An exponent-derived starting value is refined by a fixed number of
  // Newton steps. A "magic constant" variant is kept for reference but is
  // compiled out.
  //
  // x: vector of arguments; the iteration asserts that its estimate stays
  //    positive, so positive input is expected.
  // Returns: vector of approximations to 1/sqrt(x).
  template<typename realvec_t>
  realvec_t mathfuncs<realvec_t>::vml_rsqrt(realvec_t x)
  {
#if 0
    // Disabled variant, see
    // <http://en.wikipedia.org/wiki/Fast_inverse_square_root>
    realvec_t x_2 = RV(0.5) * x;
    realvec_t r = x;
    intvec_t i = as_int(r);
    int_t magic = sizeof(real_t)==4 ? I(0x5f375a86) : I(0x5fe6eb50c7b537a9);
    i = IV(magic) - (i >> I(1));
    r = as_float(i);
    r += r * (RV(0.5) - (x_2 * r * r));
    r += r * (RV(0.5) - (x_2 * r * r));
    r += r * (RV(0.5) - (x_2 * r * r));
    return r;
#else
    // VML_ASSERT(all(x > RV(0.0)));
    
    // x enters the Newton update only as x/2
    realvec_t half_x = RV(0.5) * x;
    
    // Starting value. Writing x = m * 2^e with 1 <= m < 2 (assuming
    // ilogb yields e), 1/sqrt(x) = m^(-1/2) * 2^(-e/2). The shift e>>1
    // rounds e/2 down, so for odd e a further factor 2^(-1/2) belongs to
    // the mantissa part: 0.583 approximates the range (0.5, 0.707] for
    // odd e, 0.824 the range (0.707, 1] for even e.
    intvec_t expo = ilogb(x);
    realvec_t mant =
      ifthen(convert_bool(expo & IV(I(1))), RV(R(0.583)), RV(R(0.824)));
    realvec_t est = ldexp(mant, -(expo >> I(1)));
    
    // Number of Newton steps: 4 when real_t is 4 bytes wide, 5 otherwise.
    // Every step roughly doubles the number of correct binary digits.
    int const nsteps = sizeof(real_t)==4 ? 4 : 5;
    for (int left=nsteps; left>0; --left) {
      VML_ASSERT(all(est > RV(0.0)));
      // Newton's method for   f(r) = x - 1/r^2 = 0 :
      //    r <- r - f(r) / f'(r)
      //      =  (3 r - r^3 x) / 2
      //      =  r + r (1/2 - r^2 x/2)
      //
      // The update is deliberately written as an increment of the
      // estimate; the algebraically equal product form
      //    est *= RV(1.5) - half_x * est*est;
      // was avoided by the original author over cancellation concerns.
      est += est * (RV(0.5) - half_x * est*est);
    }
    
    return est;
#endif
  }
  
  
  // Euclidean norm sqrt(x^2 + y^2), evaluated lane-wise.
  //
  // x, y: vectors of arguments (any sign).
  // Returns: vector of hypotenuse lengths.
  //
  // Squaring the arguments directly overflows to infinity (or underflows
  // to zero) for inputs whose hypotenuse is perfectly representable. To
  // avoid that, the larger magnitude `hi` is factored out:
  //    hypot = hi * sqrt(1 + (lo/hi)^2),   0 <= lo/hi <= 1
  // This costs one division more than the direct formula.
  template<typename realvec_t>
  realvec_t mathfuncs<realvec_t>::vml_hypot(realvec_t x, realvec_t y)
  {
    // Magnitudes. Written as 0 - x for all non-positive lanes so that a
    // negative zero also becomes +0.0.
    realvec_t ax = ifthen(x > RV(0.0), x, RV(0.0) - x);
    realvec_t ay = ifthen(y > RV(0.0), y, RV(0.0) - y);
    
    // Order the magnitudes
    realvec_t hi = ifthen(ax < ay, ay, ax);
    realvec_t lo = ifthen(ax < ay, ax, ay);
    
    // Ratio in [0, 1]. Equal magnitudes give exactly 1; testing for
    // equality first also keeps 0/0 and inf/inf (both NaN) out of the
    // result, so hypot(0, 0) stays 0 and hypot(inf, inf) stays inf.
    realvec_t q = ifthen(lo == hi, RV(1.0), lo / hi);
    
    return hi * sqrt(RV(1.0) + q*q);
  }
  
}; // namespace vecmathlib

#endif  // #ifndef MATHFUNCS_SQRT_H