summaryrefslogtreecommitdiffstats
path: root/lib/msun
diff options
context:
space:
mode:
authorbde <bde@FreeBSD.org>2005-12-19 00:22:03 +0000
committerbde <bde@FreeBSD.org>2005-12-19 00:22:03 +0000
commitce5f09f38d1a047b71ee0447a7d9efd1ce8cb183 (patch)
treec79ab807fed96e896f06a4d68cd39de8ffa8329b /lib/msun
parentcb3a140c1cbaeeafffc172372c7475d6a0311e85 (diff)
downloadFreeBSD-src-ce5f09f38d1a047b71ee0447a7d9efd1ce8cb183.zip
FreeBSD-src-ce5f09f38d1a047b71ee0447a7d9efd1ce8cb183.tar.gz
Use a minimax polynomial approximation instead of a Pade rational
function approximation for the second step. The polynomial has degree 2 for cbrtf() and 4 for cbrt(). These degrees are minimal for the final accuracy to be essentially the same as before (slightly smaller). Adjust the rounding between steps 2 and 3 to match. Unfortunately, for cbrt(), this breaks the claimed accuracy slightly although incorrect rounding doesn't. Claim less accuracy since its not worth pessimizing the polynomial or relying on exhaustive testing to get insignificantly more accuracy. This saves about 30 cycles on Athlons (mainly by avoiding 2 divisions) so it gives an overall optimization in the 10-25% range (a larger percentage for float precision, especially in 32-bit mode, since other overheads are more dominant for double precision, surprisingly more in 32-bit mode).
Diffstat (limited to 'lib/msun')
-rw-r--r--lib/msun/src/s_cbrt.c47
-rw-r--r--lib/msun/src/s_cbrtf.c20
2 files changed, 30 insertions, 37 deletions
diff --git a/lib/msun/src/s_cbrt.c b/lib/msun/src/s_cbrt.c
index e9ed330..29e53e2 100644
--- a/lib/msun/src/s_cbrt.c
+++ b/lib/msun/src/s_cbrt.c
@@ -26,12 +26,13 @@ static const u_int32_t
B1 = 715094163, /* B1 = (1023-1023/3-0.03306235651)*2**20 */
B2 = 696219795; /* B2 = (1023-1023/3-54/3-0.03306235651)*2**20 */
+/* |1/cbrt(x) - p(x)| < 2**-23.5 (~[-7.93e-8, 7.929e-8]). */
static const double
-C = 5.42857142857142815906e-01, /* 19/35 = 0x3FE15F15, 0xF15F15F1 */
-D = -7.05306122448979611050e-01, /* -864/1225 = 0xBFE691DE, 0x2532C834 */
-E = 1.41428571428571436819e+00, /* 99/70 = 0x3FF6A0EA, 0x0EA0EA0F */
-F = 1.60714285714285720630e+00, /* 45/28 = 0x3FF9B6DB, 0x6DB6DB6E */
-G = 3.57142857142857150787e-01; /* 5/14 = 0x3FD6DB6D, 0xB6DB6DB7 */
+P0 = 1.87595182427177009643, /* 0x3ffe03e6, 0x0f61e692 */
+P1 = -1.88497979543377169875, /* 0xbffe28e0, 0x92f02420 */
+P2 = 1.621429720105354466140, /* 0x3ff9f160, 0x4a49d6c2 */
+P3 = -0.758397934778766047437, /* 0xbfe844cb, 0xbee751d9 */
+P4 = 0.145996192886612446982; /* 0x3fc2b000, 0xd4e4edd7 */
double
cbrt(double x)
@@ -78,36 +79,30 @@ cbrt(double x)
SET_HIGH_WORD(t,sign|(hx/3+B1));
/*
- * New cbrt to 25 bits:
- * cbrt(x) = t*cbrt(x/t**3) ~= t*R(x/t**3)
- * where R(r) = (14*r**2 + 35*r + 5)/(5*r**2 + 35*r + 14) is the
- * (2,2) Pade approximation to cbrt(r) at r = 1. We replace
- * r = x/t**3 by 1/r = t**3/x since the latter can be evaluated
- * more efficiently, and rearrange the expression for R(r) to use
- * 4 additions and 2 divisions instead of the 4 additions, 4
- * multiplications and 1 division that would be required using
- * Horner's rule on the numerator and denominator. t being good
- * to 32 bits means that |t/cbrt(x)-1| < 1/32, so |x/t**3-1| < 0.1
- * and for R(r) we can use any approximation to cbrt(r) that is good
- * to 20 bits on [0.9, 1.1]. The (2,2) Pade approximation is not an
- * especially good choice.
+ * New cbrt to 23 bits:
+ * cbrt(x) = t*cbrt(x/t**3) ~= t*P(t**3/x)
+ * where P(r) is a polynomial of degree 4 that approximates 1/cbrt(r)
+ * to within 2**-23.5 when |r - 1| < 1/10. The rough approximation
+ * has produced t such than |t/cbrt(x) - 1| ~< 1/32, and cubing this
+ * gives us bounds for r = t**3/x.
+ *
+ * Try to optimize for parallel evaluation as in k_tanf.c.
*/
- r=t*t/x;
- s=C+r*t;
- t*=G+F/(s+E+D/s);
+ r=(t*t)*(t/x);
+ t=t*((P0+r*(P1+r*P2))+((r*r)*r)*(P3+r*P4));
/*
- * Round t away from zero to 25 bits (sloppily except for ensuring that
+ * Round t away from zero to 23 bits (sloppily except for ensuring that
* the result is larger in magnitude than cbrt(x) but not much more than
- * 2 25-bit ulps larger). With rounding towards zero, the error bound
- * would be ~5/6 instead of ~4/6. With a maximum error of 1 25-bit ulps
+ * 2 23-bit ulps larger). With rounding towards zero, the error bound
+ * would be ~5/6 instead of ~4/6. With a maximum error of 2 23-bit ulps
* in the rounded t, the infinite-precision error in the Newton
* approximation barely affects third digit in the the final error
- * 0.667; the error in the rounded t can be up to about 12 25-bit ulps
+ * 0.667; the error in the rounded t can be up to about 3 23-bit ulps
* before the final error is larger than 0.667 ulps.
*/
u.value=t;
- u.bits=(u.bits+0x20000000)&0xfffffffff0000000ULL;
+ u.bits=(u.bits+0x80000000)&0xffffffffc0000000ULL;
t=u.value;
/* one step Newton iteration to 53 bits with error < 0.667 ulps */
diff --git a/lib/msun/src/s_cbrtf.c b/lib/msun/src/s_cbrtf.c
index 89f3507..89398af 100644
--- a/lib/msun/src/s_cbrtf.c
+++ b/lib/msun/src/s_cbrtf.c
@@ -28,12 +28,11 @@ static const unsigned
B1 = 709958130, /* B1 = (127-127.0/3-0.03306235651)*2**23 */
B2 = 642849266; /* B2 = (127-127.0/3-24/3-0.03306235651)*2**23 */
+/* |1/cbrt(x) - p(x)| < 2**-14.5 (~[-4.37e-4, 4.366e-5]). */
static const float
-C = 5.4285717010e-01, /* 19/35 = 0x3f0af8b0 */
-D = -7.0530611277e-01, /* -864/1225 = 0xbf348ef1 */
-E = 1.4142856598e+00, /* 99/70 = 0x3fb50750 */
-F = 1.6071428061e+00, /* 45/28 = 0x3fcdb6db */
-G = 3.5714286566e-01; /* 5/14 = 0x3eb6db6e */
+P0 = 1.5586718321, /* 0x3fc7828f */
+P1 = -0.78271341324, /* -0xbf485fe8 */
+P2 = 0.22403796017; /* 0x3e656a35 */
float
cbrtf(float x)
@@ -59,10 +58,9 @@ cbrtf(float x)
} else
SET_FLOAT_WORD(t,sign|(hx/3+B1));
- /* new cbrt to 23 bits */
- r=t*t/x;
- s=C+r*t;
- t*=G+F/(s+E+D/s);
+ /* new cbrt to 14 bits */
+ r=(t*t)*(t/x);
+ t=t*((P0+r*P1)+(r*r)*P2);
/*
* Round t away from zero to 12 bits (sloppily except for ensuring that
@@ -73,9 +71,9 @@ cbrtf(float x)
* the second digit instead of the third digit of 4/6 = 0.666..., etc.
*/
GET_FLOAT_WORD(high,t);
- SET_FLOAT_WORD(t,(high+0x1002)&0xfffff000);
+ SET_FLOAT_WORD(t,(high+0x1800)&0xfffff000);
- /* one step Newton iteration to 24 bits with error < 0.667 ulps */
+ /* one step Newton iteration to 24 bits with error < 0.669 ulps */
s=t*t; /* t*t is exact */
r=x/s; /* error <= 0.5 ulps; |r| < |t| */
w=t+t; /* t+t is exact */
OpenPOWER on IntegriCloud