diff options
Diffstat (limited to 'lib/msun/src')
-rw-r--r-- | lib/msun/src/s_cbrt.c | 47 | ||||
-rw-r--r-- | lib/msun/src/s_cbrtf.c | 20 |
2 files changed, 30 insertions, 37 deletions
diff --git a/lib/msun/src/s_cbrt.c b/lib/msun/src/s_cbrt.c index e9ed330..29e53e2 100644 --- a/lib/msun/src/s_cbrt.c +++ b/lib/msun/src/s_cbrt.c @@ -26,12 +26,13 @@ static const u_int32_t B1 = 715094163, /* B1 = (1023-1023/3-0.03306235651)*2**20 */ B2 = 696219795; /* B2 = (1023-1023/3-54/3-0.03306235651)*2**20 */ +/* |1/cbrt(x) - p(x)| < 2**-23.5 (~[-7.93e-8, 7.929e-8]). */ static const double -C = 5.42857142857142815906e-01, /* 19/35 = 0x3FE15F15, 0xF15F15F1 */ -D = -7.05306122448979611050e-01, /* -864/1225 = 0xBFE691DE, 0x2532C834 */ -E = 1.41428571428571436819e+00, /* 99/70 = 0x3FF6A0EA, 0x0EA0EA0F */ -F = 1.60714285714285720630e+00, /* 45/28 = 0x3FF9B6DB, 0x6DB6DB6E */ -G = 3.57142857142857150787e-01; /* 5/14 = 0x3FD6DB6D, 0xB6DB6DB7 */ +P0 = 1.87595182427177009643, /* 0x3ffe03e6, 0x0f61e692 */ +P1 = -1.88497979543377169875, /* 0xbffe28e0, 0x92f02420 */ +P2 = 1.621429720105354466140, /* 0x3ff9f160, 0x4a49d6c2 */ +P3 = -0.758397934778766047437, /* 0xbfe844cb, 0xbee751d9 */ +P4 = 0.145996192886612446982; /* 0x3fc2b000, 0xd4e4edd7 */ double cbrt(double x) @@ -78,36 +79,30 @@ cbrt(double x) SET_HIGH_WORD(t,sign|(hx/3+B1)); /* - * New cbrt to 25 bits: - * cbrt(x) = t*cbrt(x/t**3) ~= t*R(x/t**3) - * where R(r) = (14*r**2 + 35*r + 5)/(5*r**2 + 35*r + 14) is the - * (2,2) Pade approximation to cbrt(r) at r = 1. We replace - * r = x/t**3 by 1/r = t**3/x since the latter can be evaluated - * more efficiently, and rearrange the expression for R(r) to use - * 4 additions and 2 divisions instead of the 4 additions, 4 - * multiplications and 1 division that would be required using - * Horner's rule on the numerator and denominator. t being good - * to 32 bits means that |t/cbrt(x)-1| < 1/32, so |x/t**3-1| < 0.1 - * and for R(r) we can use any approximation to cbrt(r) that is good - * to 20 bits on [0.9, 1.1]. The (2,2) Pade approximation is not an - * especially good choice. + * New cbrt to 23 bits: + * cbrt(x) = t*cbrt(x/t**3) ~= t*P(t**3/x) + * where P(r) is a polynomial of degree 4 that approximates 1/cbrt(r) + * to within 2**-23.5 when |r - 1| < 1/10. The rough approximation + * has produced t such than |t/cbrt(x) - 1| ~< 1/32, and cubing this + * gives us bounds for r = t**3/x. + * + * Try to optimize for parallel evaluation as in k_tanf.c. */ - r=t*t/x; - s=C+r*t; - t*=G+F/(s+E+D/s); + r=(t*t)*(t/x); + t=t*((P0+r*(P1+r*P2))+((r*r)*r)*(P3+r*P4)); /* - * Round t away from zero to 25 bits (sloppily except for ensuring that + * Round t away from zero to 23 bits (sloppily except for ensuring that * the result is larger in magnitude than cbrt(x) but not much more than - * 2 25-bit ulps larger). With rounding towards zero, the error bound - * would be ~5/6 instead of ~4/6. With a maximum error of 1 25-bit ulps + * 2 23-bit ulps larger). With rounding towards zero, the error bound + * would be ~5/6 instead of ~4/6. With a maximum error of 2 23-bit ulps * in the rounded t, the infinite-precision error in the Newton * approximation barely affects third digit in the the final error - * 0.667; the error in the rounded t can be up to about 12 25-bit ulps + * 0.667; the error in the rounded t can be up to about 3 23-bit ulps * before the final error is larger than 0.667 ulps. */ u.value=t; - u.bits=(u.bits+0x20000000)&0xfffffffff0000000ULL; + u.bits=(u.bits+0x80000000)&0xffffffffc0000000ULL; t=u.value; /* one step Newton iteration to 53 bits with error < 0.667 ulps */ diff --git a/lib/msun/src/s_cbrtf.c b/lib/msun/src/s_cbrtf.c index 89f3507..89398af 100644 --- a/lib/msun/src/s_cbrtf.c +++ b/lib/msun/src/s_cbrtf.c @@ -28,12 +28,11 @@ static const unsigned B1 = 709958130, /* B1 = (127-127.0/3-0.03306235651)*2**23 */ B2 = 642849266; /* B2 = (127-127.0/3-24/3-0.03306235651)*2**23 */ +/* |1/cbrt(x) - p(x)| < 2**-14.5 (~[-4.37e-4, 4.366e-5]). */ static const float -C = 5.4285717010e-01, /* 19/35 = 0x3f0af8b0 */ -D = -7.0530611277e-01, /* -864/1225 = 0xbf348ef1 */ -E = 1.4142856598e+00, /* 99/70 = 0x3fb50750 */ -F = 1.6071428061e+00, /* 45/28 = 0x3fcdb6db */ -G = 3.5714286566e-01; /* 5/14 = 0x3eb6db6e */ +P0 = 1.5586718321, /* 0x3fc7828f */ +P1 = -0.78271341324, /* -0xbf485fe8 */ +P2 = 0.22403796017; /* 0x3e656a35 */ float cbrtf(float x) @@ -59,10 +58,9 @@ cbrtf(float x) } else SET_FLOAT_WORD(t,sign|(hx/3+B1)); - /* new cbrt to 23 bits */ - r=t*t/x; - s=C+r*t; - t*=G+F/(s+E+D/s); + /* new cbrt to 14 bits */ + r=(t*t)*(t/x); + t=t*((P0+r*P1)+(r*r)*P2); /* * Round t away from zero to 12 bits (sloppily except for ensuring that @@ -73,9 +71,9 @@ cbrtf(float x) * the second digit instead of the third digit of 4/6 = 0.666..., etc. */ GET_FLOAT_WORD(high,t); - SET_FLOAT_WORD(t,(high+0x1002)&0xfffff000); + SET_FLOAT_WORD(t,(high+0x1800)&0xfffff000); - /* one step Newton iteration to 24 bits with error < 0.667 ulps */ + /* one step Newton iteration to 24 bits with error < 0.669 ulps */ s=t*t; /* t*t is exact */ r=x/s; /* error <= 0.5 ulps; |r| < |t| */ w=t+t; /* t+t is exact */ |