mathfuncs_sqrt.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118

// -*-C++-*-

#ifndef MATHFUNCS_SQRT_H
#define MATHFUNCS_SQRT_H

#include "mathfuncs_base.h"

#include <cmath>

namespace vecmathlib {

template <typename realvec_t>
realvec_t mathfuncs<realvec_t>::vml_sqrt(realvec_t x) {
#if 0
    // Handle special case: zero
    boolvec_t is_zero = x <= RV(0.0);
    x = ifthen(is_zero, RV(1.0), x);
    
    // Initial guess
    VML_ASSERT(all(x > RV(0.0)));
#if 0
    intvec_t ilogb_x = ilogb(x);
    realvec_t r = ldexp(RV(M_SQRT2), ilogb_x >> 1);
    // TODO: divide by M_SQRT2 if ilogb_x % 2 == 1 ?
#else
    real_t correction =
      vml_std::ldexp(R(FP::exponent_offset & 1 ? M_SQRT2 : 1.0),
                     FP::exponent_offset >> 1);
    realvec_t r = lsr(x.as_int(), 1).as_float() * RV(correction);
#endif
    
    // Iterate
    // nmax iterations give an accuracy of 2^nmax binary digits. 4
    // iterations suffice for double precision with its 53 digits.
    int const nmax = sizeof(real_t)==4 ? 2 : 4;
    for (int n=0; n<nmax; ++n) {
      // Step
      VML_ASSERT(all(r > RV(0.0)));
      // Newton method:
      // Solve   f(r) = 0   for   f(r) = x - r^2
      //    r <- r - f(r) / f'(r)
      //    r <- (r + x / r) / 2
      r = RV(0.5) * (r + x / r);
    }
    
    // Handle special case: zero
    r = ifthen(is_zero, RV(0.0), r);
#endif

  realvec_t r = x * rsqrt(x);
  // Handle special case: zero
  r = ifthen(x == RV(0.0), RV(0.0), r);

  return r;
}

// TODO: Use "Halley's method with cubic convergence":
// <http://press.mcs.anl.gov/gswjanuary12/files/2012/01/Optimizing-Single-Node-Performance-on-BlueGene.pdf>
template <typename realvec_t>
realvec_t mathfuncs<realvec_t>::vml_cbrt(realvec_t x) {
  return pow(x, RV(1.0 / 3.0));
}

template <typename realvec_t>
realvec_t mathfuncs<realvec_t>::vml_rsqrt(realvec_t x) {
#if 0
    // See <http://en.wikipedia.org/wiki/Fast_inverse_square_root>
    realvec_t x_2 = RV(0.5) * x;
    realvec_t r = x;
    intvec_t i = as_int(r);
    int_t magic = sizeof(real_t)==4 ? I(0x5f375a86) : I(0x5fe6eb50c7b537a9);
    i = IV(magic) - (i >> I(1));
    r = as_float(i);
    r += r * (RV(0.5) - (x_2 * r * r));
    r += r * (RV(0.5) - (x_2 * r * r));
    r += r * (RV(0.5) - (x_2 * r * r));
    return r;
#else
  // Initial guess
  // VML_ASSERT(all(x > RV(0.0)));
  intvec_t ilogb_x = ilogb(x);
  realvec_t s =
      ifthen(convert_bool(ilogb_x & IV(I(1))), RV(R(0.583)), RV(R(0.824)));
  realvec_t r = ldexp(s, -(ilogb_x >> I(1)));

  realvec_t x_2 = RV(0.5) * x;

  // Iterate
  // nmax iterations give an accuracy of 2^nmax binary digits. 5
  // iterations suffice for double precision with its 53 digits.
  int const nmax = sizeof(real_t) == 4 ? 4 : 5;
  for (int n = 0; n < nmax; ++n) {
    // Step
    VML_ASSERT(all(r > RV(0.0)));
    // Newton method:
    // Solve   f(r) = 0   for   f(r) = x - 1/r^2
    //    r <- r - f(r) / f'(r)
    //    r <- (3 r - r^3 x) / 2
    //    r <- r (3/2 - r^2 x/2)

    // Note: don't rewrite this expression, this may introduce
    // cancellation errors (says who?)
    // r *= RV(1.5) - x_2 * r*r;
    r += r * (RV(0.5) - x_2 * r * r);
  }

  return r;
#endif
}

template <typename realvec_t>
realvec_t mathfuncs<realvec_t>::vml_hypot(realvec_t x, realvec_t y) {
  return sqrt(x * x + y * y);
}

}; // namespace vecmathlib

#endif // #ifndef MATHFUNCS_SQRT_H