blob: 51a86cadb8024a0a8e51c5daba5f26bb324f05c4 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
|
// -*-C++-*-
#ifndef MATHFUNCS_SQRT_H
#define MATHFUNCS_SQRT_H
#include "mathfuncs_base.h"
#include <cassert>
#include <cmath>
// For cbrt: Use "Halley's method with cubic convergence":
// <http://press.mcs.anl.gov/gswjanuary12/files/2012/01/Optimizing-Single-Node-Performance-on-BlueGene.pdf>
namespace vecmathlib {
template<typename realvec_t>
realvec_t mathfuncs<realvec_t>::vml_sqrt(realvec_t x)
{
// Handle special case: zero
boolvec_t is_zero = x <= RV(0.0);
x = ifthen(is_zero, RV(1.0), x);
// Initial guess
assert(all(x > RV(0.0)));
#if 0
intvec_t ilogb_x = ilogb(x);
realvec_t r = scalbn(RV(M_SQRT2), ilogb_x >> 1);
// TODO: divide by M_SQRT2 if ilogb_x % 2 == 1 ?
#else
real_t correction =
std::scalbn(R(FP::exponent_offset & 1 ? M_SQRT2 : 1.0),
FP::exponent_offset >> 1);
realvec_t r = lsr(x.as_int(), 1).as_float() * RV(correction);
#endif
// Iterate
// nmax iterations give an accuracy of 2^nmax binary digits. 5
// iterations suffice for double precision with its 53 digits.
int const nmax = 5;
for (int n=1; n<nmax; ++n) {
// Step
assert(all(x > RV(0.0)));
// Newton method:
// Solve f(r) = 0 for f(r) = x - r^2
// r <- r - f(r) / f'(r)
// r <- (r + x / r) / 2
r = RV(0.5) * (r + x / r);
}
// Handle special case: zero
r = ifthen(is_zero, RV(0.0), r);
return r;
}
template<typename realvec_t>
realvec_t mathfuncs<realvec_t>::vml_rsqrt(realvec_t x)
{
#if 0
double const x_2 = 0.5 * x; // x/2
// Newton method:
// Solve f(r) = 0 for f(r) = x - 1/r^2
// r <- r - f(r) / f'(r)
// r <- (3 r - r^3 x) / 2
return r * (1.5 - r*r * x_2);
#endif
return rcp(sqrt(x));
}
}; // namespace vecmathlib
#endif // #ifndef MATHFUNCS_SQRT_H
|