summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorbde <bde@FreeBSD.org>2006-01-05 07:57:31 +0000
committerbde <bde@FreeBSD.org>2006-01-05 07:57:31 +0000
commitcbc5231d535bf105f68c0a6fba979cbc75938915 (patch)
tree121f03122f4bc7a49fdc51b535b6316de85f3897
parentc34b73d7df7891d6a89b0b914829c5b1405e226c (diff)
downloadFreeBSD-src-cbc5231d535bf105f68c0a6fba979cbc75938915.zip
FreeBSD-src-cbc5231d535bf105f68c0a6fba979cbc75938915.tar.gz
Use double precision internally to optimize cbrtf(), and change the
algorithm for the second step significantly to also get a perfectly rounded result in round-to-nearest mode. The resulting optimization is about 25% on Athlon64's and 30% on Athlon XP's (about 25 cycles out of 100 on the former). Using extra precision, we don't need to do anything special to avoid large rounding errors in the third step (Newton's method), so we can regroup terms to avoid a division, increase clarity, and increase opportunities for parallelism. Rearrangement for parallelism loses the increase in clarity. We end up with the same number of operations but with a division reduced to a multiplication. Using specifically double precision, there is enough extra precision for the third step to give enough precision for perfect rounding to float precision provided the previous steps are accurate to 16 bits. (They were accurate to 12 bits, which was almost minimal for imperfect rounding in the old version but would be more than enough for imperfect rounding in this version (9 bits would be enough now).) I couldn't find any significant time optimizations from optimizing the previous steps, so I decided to optimize for accuracy instead. The second step needed a division although a previous commit optimized it to use a polynomial approximation for its main detail, and this division dominated the time for the second step. Use the same Newton's method for the second step as for the third step since this is insignificantly slower than the division plus the polynomial (now that Newton's method only needs 1 division), significantly more accurate, and simpler. Single precision would be precise enough for the second step, but doesn't have enough exponent range to handle denormals without the special grouping of terms (as in previous versions) that requires another division, so we use double precision for both the second and third steps.
-rw-r--r--lib/msun/src/s_cbrtf.c41
1 files changed, 13 insertions, 28 deletions
diff --git a/lib/msun/src/s_cbrtf.c b/lib/msun/src/s_cbrtf.c
index 89398af..3506943 100644
--- a/lib/msun/src/s_cbrtf.c
+++ b/lib/msun/src/s_cbrtf.c
@@ -28,16 +28,11 @@ static const unsigned
B1 = 709958130, /* B1 = (127-127.0/3-0.03306235651)*2**23 */
B2 = 642849266; /* B2 = (127-127.0/3-24/3-0.03306235651)*2**23 */
-/* |1/cbrt(x) - p(x)| < 2**-14.5 (~[-4.37e-4, 4.366e-5]). */
-static const float
-P0 = 1.5586718321, /* 0x3fc7828f */
-P1 = -0.78271341324, /* -0xbf485fe8 */
-P2 = 0.22403796017; /* 0x3e656a35 */
-
float
cbrtf(float x)
{
- float r,s,t,w;
+ double r,T;
+ float t;
int32_t hx;
u_int32_t sign;
u_int32_t high;
@@ -58,27 +53,17 @@ cbrtf(float x)
} else
SET_FLOAT_WORD(t,sign|(hx/3+B1));
- /* new cbrt to 14 bits */
- r=(t*t)*(t/x);
- t=t*((P0+r*P1)+(r*r)*P2);
-
- /*
- * Round t away from zero to 12 bits (sloppily except for ensuring that
- * the result is larger in magnitude than cbrt(x) but not much more than
- * 1 12-bit ulp larger). With rounding towards zero, the error bound
- * would be ~5/6 instead of ~4/6, and with t 2 12-bit ulps larger the
- * infinite-precision error in the Newton approximation would affect
- * the second digit instead of the third digit of 4/6 = 0.666..., etc.
- */
- GET_FLOAT_WORD(high,t);
- SET_FLOAT_WORD(t,(high+0x1800)&0xfffff000);
+ /* first step Newton iteration (solving t*t-x/t == 0) to 16 bits */
+ /* in double precision to avoid problems with denormals */
+ T=t;
+ r=T*T*T;
+ T=T*(x+x+r)/(x+r+r);
- /* one step Newton iteration to 24 bits with error < 0.669 ulps */
- s=t*t; /* t*t is exact */
- r=x/s; /* error <= 0.5 ulps; |r| < |t| */
- w=t+t; /* t+t is exact */
- r=(r-t)/(w+r); /* r-t is exact; w+r ~= 3*t */
- t=t+t*r; /* error <= 0.5 + 0.5/3 + epsilon */
+ /* second step Newton iteration to 47 bits */
+ /* in double precision for accuracy */
+ r=T*T*T;
+ T=T*(x+x+r)/(x+r+r);
- return(t);
+ /* rounding to 24 bits is perfect in round-to-nearest mode */
+ return(T);
}
OpenPOWER on IntegriCloud