2 files changed, 30 insertions, 37 deletions
diff --git a/lib/msun/src/s_cbrt.c b/lib/msun/src/s_cbrt.c
index e9ed330..29e53e2 100644
--- a/lib/msun/src/s_cbrt.c
+++ b/lib/msun/src/s_cbrt.c
@@ -26,12 +26,13 @@ static const u_int32_t
 	B1 = 715094163, /* B1 = (1023-1023/3-0.03306235651)*2**20 */
 	B2 = 696219795; /* B2 = (1023-1023/3-54/3-0.03306235651)*2**20 */
 
+/* |1/cbrt(x) - p(x)| < 2**-23.5 (~[-7.93e-8, 7.929e-8]). */
 static const double
-C =  5.42857142857142815906e-01, /* 19/35     = 0x3FE15F15, 0xF15F15F1 */
-D = -7.05306122448979611050e-01, /* -864/1225 = 0xBFE691DE, 0x2532C834 */
-E =  1.41428571428571436819e+00, /* 99/70     = 0x3FF6A0EA, 0x0EA0EA0F */
-F =  1.60714285714285720630e+00, /* 45/28     = 0x3FF9B6DB, 0x6DB6DB6E */
-G =  3.57142857142857150787e-01; /* 5/14      = 0x3FD6DB6D, 0xB6DB6DB7 */
+P0 =  1.87595182427177009643,		/* 0x3ffe03e6, 0x0f61e692 */
+P1 = -1.88497979543377169875,		/* 0xbffe28e0, 0x92f02420 */
+P2 =  1.621429720105354466140,		/* 0x3ff9f160, 0x4a49d6c2 */
+P3 = -0.758397934778766047437,		/* 0xbfe844cb, 0xbee751d9 */
+P4 =  0.145996192886612446982;		/* 0x3fc2b000, 0xd4e4edd7 */
 
 double
 cbrt(double x)
@@ -78,36 +79,30 @@ cbrt(double x)
 	    SET_HIGH_WORD(t,sign|(hx/3+B1));
 
     /*
-     * New cbrt to 25 bits:
-     *    cbrt(x) = t*cbrt(x/t**3) ~= t*R(x/t**3)
-     * where R(r) = (14*r**2 + 35*r + 5)/(5*r**2 + 35*r + 14) is the
-     * (2,2) Pade approximation to cbrt(r) at r = 1.  We replace
-     * r = x/t**3 by 1/r = t**3/x since the latter can be evaluated
-     * more efficiently, and rearrange the expression for R(r) to use
-     * 4 additions and 2 divisions instead of the 4 additions, 4
-     * multiplications and 1 division that would be required using
-     * Horner's rule on the numerator and denominator.  t being good
-     * to 32 bits means that |t/cbrt(x)-1| < 1/32, so |x/t**3-1| < 0.1
-     * and for R(r) we can use any approximation to cbrt(r) that is good
-     * to 20 bits on [0.9, 1.1].  The (2,2) Pade approximation is not an
-     * especially good choice.
+     * New cbrt to 23 bits:
+     *    cbrt(x) = t*cbrt(x/t**3) ~= t*P(t**3/x)
+     * where P(r) is a polynomial of degree 4 that approximates 1/cbrt(r)
+     * to within 2**-23.5 when |r - 1| < 1/10.  The rough approximation
+     * has produced t such than |t/cbrt(x) - 1| ~< 1/32, and cubing this
+     * gives us bounds for r = t**3/x.
+     *
+     * Try to optimize for parallel evaluation as in k_tanf.c.
      */
-	r=t*t/x;
-	s=C+r*t;
-	t*=G+F/(s+E+D/s);
+	r=(t*t)*(t/x);
+	t=t*((P0+r*(P1+r*P2))+((r*r)*r)*(P3+r*P4));
 
     /*
-     * Round t away from zero to 25 bits (sloppily except for ensuring that
+     * Round t away from zero to 23 bits (sloppily except for ensuring that
      * the result is larger in magnitude than cbrt(x) but not much more than
-     * 2 25-bit ulps larger).  With rounding towards zero, the error bound
-     * would be ~5/6 instead of ~4/6.  With a maximum error of 1 25-bit ulps
+     * 2 23-bit ulps larger).  With rounding towards zero, the error bound
+     * would be ~5/6 instead of ~4/6.  With a maximum error of 2 23-bit ulps
      * in the rounded t, the infinite-precision error in the Newton
      * approximation barely affects third digit in the the final error
-     * 0.667; the error in the rounded t can be up to about 12 25-bit ulps
+     * 0.667; the error in the rounded t can be up to about 3 23-bit ulps
      * before the final error is larger than 0.667 ulps.
      */
 	u.value=t;
-	u.bits=(u.bits+0x20000000)&0xfffffffff0000000ULL;
+	u.bits=(u.bits+0x80000000)&0xffffffffc0000000ULL;
 	t=u.value;
 
     /* one step Newton iteration to 53 bits with error < 0.667 ulps */
diff --git a/lib/msun/src/s_cbrtf.c b/lib/msun/src/s_cbrtf.c
index 89f3507..89398af 100644
--- a/lib/msun/src/s_cbrtf.c
+++ b/lib/msun/src/s_cbrtf.c
@@ -28,12 +28,11 @@ static const unsigned
 	B1 = 709958130, /* B1 = (127-127.0/3-0.03306235651)*2**23 */
 	B2 = 642849266; /* B2 = (127-127.0/3-24/3-0.03306235651)*2**23 */
 
+/* |1/cbrt(x) - p(x)| < 2**-14.5 (~[-4.37e-4, 4.366e-5]). */
 static const float
-C =  5.4285717010e-01, /* 19/35     = 0x3f0af8b0 */
-D = -7.0530611277e-01, /* -864/1225 = 0xbf348ef1 */
-E =  1.4142856598e+00, /* 99/70     = 0x3fb50750 */
-F =  1.6071428061e+00, /* 45/28     = 0x3fcdb6db */
-G =  3.5714286566e-01; /* 5/14      = 0x3eb6db6e */
+P0 =  1.5586718321,			/*  0x3fc7828f */
+P1 = -0.78271341324,			/* -0xbf485fe8 */
+P2 =  0.22403796017;			/*  0x3e656a35 */
 
 float
 cbrtf(float x)
@@ -59,10 +58,9 @@ cbrtf(float x)
 	} else
 	    SET_FLOAT_WORD(t,sign|(hx/3+B1));
 
-    /* new cbrt to 23 bits */
-	r=t*t/x;
-	s=C+r*t;
-	t*=G+F/(s+E+D/s);
+    /* new cbrt to 14 bits */
+	r=(t*t)*(t/x);
+	t=t*((P0+r*P1)+(r*r)*P2);
 
     /*
      * Round t away from zero to 12 bits (sloppily except for ensuring that
@@ -73,9 +71,9 @@ cbrtf(float x)
      * the second digit instead of the third digit of 4/6 = 0.666..., etc.
      */
 	GET_FLOAT_WORD(high,t);
-	SET_FLOAT_WORD(t,(high+0x1002)&0xfffff000);
+	SET_FLOAT_WORD(t,(high+0x1800)&0xfffff000);
 
-    /* one step Newton iteration to 24 bits with error < 0.667 ulps */
+    /* one step Newton iteration to 24 bits with error < 0.669 ulps */
 	s=t*t;				/* t*t is exact */
 	r=x/s;				/* error <= 0.5 ulps; |r| < |t| */
 	w=t+t;				/* t+t is exact */