Use a minimax polynomial approximation instead of a Pade rational

function approximation for the second step. The polynomial has degree 2 for cbrtf() and 4 for cbrt(). These degrees are minimal for the final accuracy to be essentially the same as before (slightly smaller). Adjust the rounding between steps 2 and 3 to match. Unfortunately, for cbrt(), this breaks the claimed accuracy slightly although incorrect rounding doesn't. Claim less accuracy since its not worth pessimizing the polynomial or relying on exhaustive testing to get insignificantly more accuracy. This saves about 30 cycles on Athlons (mainly by avoiding 2 divisions) so it gives an overall optimization in the 10-25% range (a larger percentage for float precision, especially in 32-bit mode, since other overheads are more dominant for double precision, surprisingly more in 32-bit mode).
author: bde <bde@FreeBSD.org> 2005-12-19 00:22:03 +0000
committer: bde <bde@FreeBSD.org> 2005-12-19 00:22:03 +0000
commit: ce5f09f38d1a047b71ee0447a7d9efd1ce8cb183 (patch)
tree: c79ab807fed96e896f06a4d68cd39de8ffa8329b /lib/msun
parent: cb3a140c1cbaeeafffc172372c7475d6a0311e85 (diff)
download: FreeBSD-src-ce5f09f38d1a047b71ee0447a7d9efd1ce8cb183.zip
FreeBSD-src-ce5f09f38d1a047b71ee0447a7d9efd1ce8cb183.tar.gz
2 files changed, 30 insertions, 37 deletions
diff --git a/lib/msun/src/s_cbrt.c b/lib/msun/src/s_cbrt.c
index e9ed330..29e53e2 100644
--- a/lib/msun/src/s_cbrt.c
+++ b/lib/msun/src/s_cbrt.c
@@ -26,12 +26,13 @@ static const u_int32_t
 	B1 = 715094163, /* B1 = (1023-1023/3-0.03306235651)*2**20 */
 	B2 = 696219795; /* B2 = (1023-1023/3-54/3-0.03306235651)*2**20 */
 
+/* |1/cbrt(x) - p(x)| < 2**-23.5 (~[-7.93e-8, 7.929e-8]). */
 static const double
-C =  5.42857142857142815906e-01, /* 19/35     = 0x3FE15F15, 0xF15F15F1 */
-D = -7.05306122448979611050e-01, /* -864/1225 = 0xBFE691DE, 0x2532C834 */
-E =  1.41428571428571436819e+00, /* 99/70     = 0x3FF6A0EA, 0x0EA0EA0F */
-F =  1.60714285714285720630e+00, /* 45/28     = 0x3FF9B6DB, 0x6DB6DB6E */
-G =  3.57142857142857150787e-01; /* 5/14      = 0x3FD6DB6D, 0xB6DB6DB7 */
+P0 =  1.87595182427177009643,		/* 0x3ffe03e6, 0x0f61e692 */
+P1 = -1.88497979543377169875,		/* 0xbffe28e0, 0x92f02420 */
+P2 =  1.621429720105354466140,		/* 0x3ff9f160, 0x4a49d6c2 */
+P3 = -0.758397934778766047437,		/* 0xbfe844cb, 0xbee751d9 */
+P4 =  0.145996192886612446982;		/* 0x3fc2b000, 0xd4e4edd7 */
 
 double
 cbrt(double x)
@@ -78,36 +79,30 @@ cbrt(double x)
 	    SET_HIGH_WORD(t,sign|(hx/3+B1));
 
     /*
-     * New cbrt to 25 bits:
-     *    cbrt(x) = t*cbrt(x/t**3) ~= t*R(x/t**3)
-     * where R(r) = (14*r**2 + 35*r + 5)/(5*r**2 + 35*r + 14) is the
-     * (2,2) Pade approximation to cbrt(r) at r = 1.  We replace
-     * r = x/t**3 by 1/r = t**3/x since the latter can be evaluated
-     * more efficiently, and rearrange the expression for R(r) to use
-     * 4 additions and 2 divisions instead of the 4 additions, 4
-     * multiplications and 1 division that would be required using
-     * Horner's rule on the numerator and denominator.  t being good
-     * to 32 bits means that |t/cbrt(x)-1| < 1/32, so |x/t**3-1| < 0.1
-     * and for R(r) we can use any approximation to cbrt(r) that is good
-     * to 20 bits on [0.9, 1.1].  The (2,2) Pade approximation is not an
-     * especially good choice.
+     * New cbrt to 23 bits:
+     *    cbrt(x) = t*cbrt(x/t**3) ~= t*P(t**3/x)
+     * where P(r) is a polynomial of degree 4 that approximates 1/cbrt(r)
+     * to within 2**-23.5 when |r - 1| < 1/10.  The rough approximation
+     * has produced t such than |t/cbrt(x) - 1| ~< 1/32, and cubing this
+     * gives us bounds for r = t**3/x.
+     *
+     * Try to optimize for parallel evaluation as in k_tanf.c.
      */
-	r=t*t/x;
-	s=C+r*t;
-	t*=G+F/(s+E+D/s);
+	r=(t*t)*(t/x);
+	t=t*((P0+r*(P1+r*P2))+((r*r)*r)*(P3+r*P4));
 
     /*
-     * Round t away from zero to 25 bits (sloppily except for ensuring that
+     * Round t away from zero to 23 bits (sloppily except for ensuring that
      * the result is larger in magnitude than cbrt(x) but not much more than
-     * 2 25-bit ulps larger).  With rounding towards zero, the error bound
-     * would be ~5/6 instead of ~4/6.  With a maximum error of 1 25-bit ulps
+     * 2 23-bit ulps larger).  With rounding towards zero, the error bound
+     * would be ~5/6 instead of ~4/6.  With a maximum error of 2 23-bit ulps
      * in the rounded t, the infinite-precision error in the Newton
      * approximation barely affects third digit in the the final error
-     * 0.667; the error in the rounded t can be up to about 12 25-bit ulps
+     * 0.667; the error in the rounded t can be up to about 3 23-bit ulps
      * before the final error is larger than 0.667 ulps.
      */
 	u.value=t;
-	u.bits=(u.bits+0x20000000)&0xfffffffff0000000ULL;
+	u.bits=(u.bits+0x80000000)&0xffffffffc0000000ULL;
 	t=u.value;
 
     /* one step Newton iteration to 53 bits with error < 0.667 ulps */
diff --git a/lib/msun/src/s_cbrtf.c b/lib/msun/src/s_cbrtf.c
index 89f3507..89398af 100644
--- a/lib/msun/src/s_cbrtf.c
+++ b/lib/msun/src/s_cbrtf.c
@@ -28,12 +28,11 @@ static const unsigned
 	B1 = 709958130, /* B1 = (127-127.0/3-0.03306235651)*2**23 */
 	B2 = 642849266; /* B2 = (127-127.0/3-24/3-0.03306235651)*2**23 */
 
+/* |1/cbrt(x) - p(x)| < 2**-14.5 (~[-4.37e-4, 4.366e-5]). */
 static const float
-C =  5.4285717010e-01, /* 19/35     = 0x3f0af8b0 */
-D = -7.0530611277e-01, /* -864/1225 = 0xbf348ef1 */
-E =  1.4142856598e+00, /* 99/70     = 0x3fb50750 */
-F =  1.6071428061e+00, /* 45/28     = 0x3fcdb6db */
-G =  3.5714286566e-01; /* 5/14      = 0x3eb6db6e */
+P0 =  1.5586718321,			/*  0x3fc7828f */
+P1 = -0.78271341324,			/* -0xbf485fe8 */
+P2 =  0.22403796017;			/*  0x3e656a35 */
 
 float
 cbrtf(float x)
@@ -59,10 +58,9 @@ cbrtf(float x)
 	} else
 	    SET_FLOAT_WORD(t,sign|(hx/3+B1));
 
-    /* new cbrt to 23 bits */
-	r=t*t/x;
-	s=C+r*t;
-	t*=G+F/(s+E+D/s);
+    /* new cbrt to 14 bits */
+	r=(t*t)*(t/x);
+	t=t*((P0+r*P1)+(r*r)*P2);
 
     /*
      * Round t away from zero to 12 bits (sloppily except for ensuring that
@@ -73,9 +71,9 @@ cbrtf(float x)
      * the second digit instead of the third digit of 4/6 = 0.666..., etc.
      */
 	GET_FLOAT_WORD(high,t);
-	SET_FLOAT_WORD(t,(high+0x1002)&0xfffff000);
+	SET_FLOAT_WORD(t,(high+0x1800)&0xfffff000);
 
-    /* one step Newton iteration to 24 bits with error < 0.667 ulps */
+    /* one step Newton iteration to 24 bits with error < 0.669 ulps */
 	s=t*t;				/* t*t is exact */
 	r=x/s;				/* error <= 0.5 ulps; |r| < |t| */
 	w=t+t;				/* t+t is exact */
author	bde <bde@FreeBSD.org>	2005-12-19 00:22:03 +0000
committer	bde <bde@FreeBSD.org>	2005-12-19 00:22:03 +0000
commit	ce5f09f38d1a047b71ee0447a7d9efd1ce8cb183 (patch)
tree	c79ab807fed96e896f06a4d68cd39de8ffa8329b /lib/msun
parent	cb3a140c1cbaeeafffc172372c7475d6a0311e85 (diff)
download	FreeBSD-src-ce5f09f38d1a047b71ee0447a7d9efd1ce8cb183.zip FreeBSD-src-ce5f09f38d1a047b71ee0447a7d9efd1ce8cb183.tar.gz