summaryrefslogtreecommitdiffstats
path: root/lib
diff options
context:
space:
mode:
authorbde <bde@FreeBSD.org>2005-12-13 20:17:23 +0000
committerbde <bde@FreeBSD.org>2005-12-13 20:17:23 +0000
commite1faa6b5ba7aef38278abc4c0ae7df0b06550019 (patch)
tree863b0262daf05ea6ebff7492ecafc906bac56ec4 /lib
parent9ecb89a9e0f77b0027093db08c96970e9739666c (diff)
downloadFreeBSD-src-e1faa6b5ba7aef38278abc4c0ae7df0b06550019.zip
FreeBSD-src-e1faa6b5ba7aef38278abc4c0ae7df0b06550019.tar.gz
Optimize by not doing excessive conversions for handling the sign bit.
This gives an optimization of between 9 and 22% on Athlons (largest for cbrt() on amd64 -- from 205 to 159 cycles). We extracted the sign bit and worked with |x|, and restored the sign bit as the last step. We avoided branches to a fault by using accesses to FP values as bits to clear and restore the sign bit. Avoiding branches is usually good, but the bit access macros are not so good (especially for setting FP values), and here they always caused pipeline stalls on Athlons. Even using branches would be faster except on args that give perfect branch misprediction, since only mispredicted branches cause stalls, but it possible to avoid touching the sign bit in FP values at all (except to preserve it in conversions from bits to FP not related to the sign bit). Do this. The results are identical except in 2 of the 3 unsupported rounding modes, since all the approximations use odd rational functions so they work right on strictly negative values, and the special case of -0 doesn't use an approximation.
Diffstat (limited to 'lib')
-rw-r--r--lib/msun/src/s_cbrt.c12
-rw-r--r--lib/msun/src/s_cbrtf.c12
2 files changed, 9 insertions, 15 deletions
diff --git a/lib/msun/src/s_cbrt.c b/lib/msun/src/s_cbrt.c
index 853337b..4e7eaa8 100644
--- a/lib/msun/src/s_cbrt.c
+++ b/lib/msun/src/s_cbrt.c
@@ -8,6 +8,8 @@
* software is freely granted, provided that this notice
* is preserved.
* ====================================================
+ *
+ * Optimized by Bruce D. Evans.
*/
#ifndef lint
@@ -47,7 +49,6 @@ cbrt(double x)
if((hx|low)==0)
return(x); /* cbrt(0) is itself */
- SET_HIGH_WORD(x,hx); /* x <- |x| */
/*
* Rough cbrt to 5 bits:
* cbrt(2**e*(1+m) ~= 2**(e/3)*(1+(e%3+m)/3)
@@ -67,16 +68,16 @@ cbrt(double x)
SET_HIGH_WORD(t,0x43500000); /* set t= 2**54 */
t*=x;
GET_HIGH_WORD(high,t);
- SET_HIGH_WORD(t,high/3+B2);
+ SET_HIGH_WORD(t,sign|((high&0x7fffffff)/3+B2));
} else
- SET_HIGH_WORD(t,hx/3+B1);
+ SET_HIGH_WORD(t,sign|(hx/3+B1));
/* new cbrt to 23 bits; may be implemented in single precision */
r=t*t/x;
s=C+r*t;
t*=G+F/(s+E+D/s);
- /* chop t to 20 bits and make it larger than cbrt(x) */
+ /* chop t to 20 bits and make it larger in magnitude than cbrt(x) */
GET_HIGH_WORD(high,t);
INSERT_WORDS(t,high+0x00000001,0);
@@ -87,8 +88,5 @@ cbrt(double x)
r=(r-t)/(w+r); /* r-t is exact */
t=t+t*r;
- /* restore the sign bit */
- GET_HIGH_WORD(high,t);
- SET_HIGH_WORD(t,high|sign);
return(t);
}
diff --git a/lib/msun/src/s_cbrtf.c b/lib/msun/src/s_cbrtf.c
index 3af0e40..d6f6d73 100644
--- a/lib/msun/src/s_cbrtf.c
+++ b/lib/msun/src/s_cbrtf.c
@@ -1,6 +1,6 @@
/* s_cbrtf.c -- float version of s_cbrt.c.
* Conversion to float by Ian Lance Taylor, Cygnus Support, ian@cygnus.com.
- * Debugged by Bruce D. Evans.
+ * Debugged and optimized by Bruce D. Evans.
*/
/*
@@ -50,22 +50,21 @@ cbrtf(float x)
if(hx==0)
return(x); /* cbrt(0) is itself */
- SET_FLOAT_WORD(x,hx); /* x <- |x| */
/* rough cbrt to 5 bits */
if(hx<0x00800000) { /* subnormal number */
SET_FLOAT_WORD(t,0x4b800000); /* set t= 2**24 */
t*=x;
GET_FLOAT_WORD(high,t);
- SET_FLOAT_WORD(t,high/3+B2);
+ SET_FLOAT_WORD(t,sign|((high&0x7fffffff)/3+B2));
} else
- SET_FLOAT_WORD(t,hx/3+B1);
+ SET_FLOAT_WORD(t,sign|(hx/3+B1));
/* new cbrt to 23 bits */
r=t*t/x;
s=C+r*t;
t*=G+F/(s+E+D/s);
- /* chop t to 12 bits and make it larger than cbrt(x) */
+ /* chop t to 12 bits and make it larger in magnitude than cbrt(x) */
GET_FLOAT_WORD(high,t);
SET_FLOAT_WORD(t,(high&0xfffff000)+0x00001000);
@@ -76,8 +75,5 @@ cbrtf(float x)
r=(r-t)/(w+r); /* r-t is exact */
t=t+t*r;
- /* restore the sign bit */
- GET_FLOAT_WORD(high,t);
- SET_FLOAT_WORD(t,high|sign);
return(t);
}
OpenPOWER on IntegriCloud