summaryrefslogtreecommitdiffstats
path: root/lib/msun/src
diff options
context:
space:
mode:
Diffstat (limited to 'lib/msun/src')
-rw-r--r--lib/msun/src/s_cbrt.c47
-rw-r--r--lib/msun/src/s_cbrtf.c20
2 files changed, 30 insertions, 37 deletions
diff --git a/lib/msun/src/s_cbrt.c b/lib/msun/src/s_cbrt.c
index e9ed330..29e53e2 100644
--- a/lib/msun/src/s_cbrt.c
+++ b/lib/msun/src/s_cbrt.c
@@ -26,12 +26,13 @@ static const u_int32_t
B1 = 715094163, /* B1 = (1023-1023/3-0.03306235651)*2**20 */
B2 = 696219795; /* B2 = (1023-1023/3-54/3-0.03306235651)*2**20 */
+/* |1/cbrt(x) - p(x)| < 2**-23.5 (~[-7.93e-8, 7.929e-8]). */
static const double
-C = 5.42857142857142815906e-01, /* 19/35 = 0x3FE15F15, 0xF15F15F1 */
-D = -7.05306122448979611050e-01, /* -864/1225 = 0xBFE691DE, 0x2532C834 */
-E = 1.41428571428571436819e+00, /* 99/70 = 0x3FF6A0EA, 0x0EA0EA0F */
-F = 1.60714285714285720630e+00, /* 45/28 = 0x3FF9B6DB, 0x6DB6DB6E */
-G = 3.57142857142857150787e-01; /* 5/14 = 0x3FD6DB6D, 0xB6DB6DB7 */
+P0 = 1.87595182427177009643, /* 0x3ffe03e6, 0x0f61e692 */
+P1 = -1.88497979543377169875, /* 0xbffe28e0, 0x92f02420 */
+P2 = 1.621429720105354466140, /* 0x3ff9f160, 0x4a49d6c2 */
+P3 = -0.758397934778766047437, /* 0xbfe844cb, 0xbee751d9 */
+P4 = 0.145996192886612446982; /* 0x3fc2b000, 0xd4e4edd7 */
double
cbrt(double x)
@@ -78,36 +79,30 @@ cbrt(double x)
SET_HIGH_WORD(t,sign|(hx/3+B1));
/*
- * New cbrt to 25 bits:
- * cbrt(x) = t*cbrt(x/t**3) ~= t*R(x/t**3)
- * where R(r) = (14*r**2 + 35*r + 5)/(5*r**2 + 35*r + 14) is the
- * (2,2) Pade approximation to cbrt(r) at r = 1. We replace
- * r = x/t**3 by 1/r = t**3/x since the latter can be evaluated
- * more efficiently, and rearrange the expression for R(r) to use
- * 4 additions and 2 divisions instead of the 4 additions, 4
- * multiplications and 1 division that would be required using
- * Horner's rule on the numerator and denominator. t being good
- * to 32 bits means that |t/cbrt(x)-1| < 1/32, so |x/t**3-1| < 0.1
- * and for R(r) we can use any approximation to cbrt(r) that is good
- * to 20 bits on [0.9, 1.1]. The (2,2) Pade approximation is not an
- * especially good choice.
+ * New cbrt to 23 bits:
+ * cbrt(x) = t*cbrt(x/t**3) ~= t*P(t**3/x)
+ * where P(r) is a polynomial of degree 4 that approximates 1/cbrt(r)
+ * to within 2**-23.5 when |r - 1| < 1/10. The rough approximation
+ * has produced t such than |t/cbrt(x) - 1| ~< 1/32, and cubing this
+ * gives us bounds for r = t**3/x.
+ *
+ * Try to optimize for parallel evaluation as in k_tanf.c.
*/
- r=t*t/x;
- s=C+r*t;
- t*=G+F/(s+E+D/s);
+ r=(t*t)*(t/x);
+ t=t*((P0+r*(P1+r*P2))+((r*r)*r)*(P3+r*P4));
/*
- * Round t away from zero to 25 bits (sloppily except for ensuring that
+ * Round t away from zero to 23 bits (sloppily except for ensuring that
* the result is larger in magnitude than cbrt(x) but not much more than
- * 2 25-bit ulps larger). With rounding towards zero, the error bound
- * would be ~5/6 instead of ~4/6. With a maximum error of 1 25-bit ulps
+ * 2 23-bit ulps larger). With rounding towards zero, the error bound
+ * would be ~5/6 instead of ~4/6. With a maximum error of 2 23-bit ulps
* in the rounded t, the infinite-precision error in the Newton
* approximation barely affects third digit in the the final error
- * 0.667; the error in the rounded t can be up to about 12 25-bit ulps
+ * 0.667; the error in the rounded t can be up to about 3 23-bit ulps
* before the final error is larger than 0.667 ulps.
*/
u.value=t;
- u.bits=(u.bits+0x20000000)&0xfffffffff0000000ULL;
+ u.bits=(u.bits+0x80000000)&0xffffffffc0000000ULL;
t=u.value;
/* one step Newton iteration to 53 bits with error < 0.667 ulps */
diff --git a/lib/msun/src/s_cbrtf.c b/lib/msun/src/s_cbrtf.c
index 89f3507..89398af 100644
--- a/lib/msun/src/s_cbrtf.c
+++ b/lib/msun/src/s_cbrtf.c
@@ -28,12 +28,11 @@ static const unsigned
B1 = 709958130, /* B1 = (127-127.0/3-0.03306235651)*2**23 */
B2 = 642849266; /* B2 = (127-127.0/3-24/3-0.03306235651)*2**23 */
+/* |1/cbrt(x) - p(x)| < 2**-14.5 (~[-4.37e-4, 4.366e-5]). */
static const float
-C = 5.4285717010e-01, /* 19/35 = 0x3f0af8b0 */
-D = -7.0530611277e-01, /* -864/1225 = 0xbf348ef1 */
-E = 1.4142856598e+00, /* 99/70 = 0x3fb50750 */
-F = 1.6071428061e+00, /* 45/28 = 0x3fcdb6db */
-G = 3.5714286566e-01; /* 5/14 = 0x3eb6db6e */
+P0 = 1.5586718321, /* 0x3fc7828f */
+P1 = -0.78271341324, /* -0xbf485fe8 */
+P2 = 0.22403796017; /* 0x3e656a35 */
float
cbrtf(float x)
@@ -59,10 +58,9 @@ cbrtf(float x)
} else
SET_FLOAT_WORD(t,sign|(hx/3+B1));
- /* new cbrt to 23 bits */
- r=t*t/x;
- s=C+r*t;
- t*=G+F/(s+E+D/s);
+ /* new cbrt to 14 bits */
+ r=(t*t)*(t/x);
+ t=t*((P0+r*P1)+(r*r)*P2);
/*
* Round t away from zero to 12 bits (sloppily except for ensuring that
@@ -73,9 +71,9 @@ cbrtf(float x)
* the second digit instead of the third digit of 4/6 = 0.666..., etc.
*/
GET_FLOAT_WORD(high,t);
- SET_FLOAT_WORD(t,(high+0x1002)&0xfffff000);
+ SET_FLOAT_WORD(t,(high+0x1800)&0xfffff000);
- /* one step Newton iteration to 24 bits with error < 0.667 ulps */
+ /* one step Newton iteration to 24 bits with error < 0.669 ulps */
s=t*t; /* t*t is exact */
r=x/s; /* error <= 0.5 ulps; |r| < |t| */
w=t+t; /* t+t is exact */
OpenPOWER on IntegriCloud