Use a 53-bit approximation to pi/2 instead of a 33+53 bit one for the

special case pi/4 <= |x| < 3*pi/4. This gives a tiny optimization (it saves 2 subtractions, which are scheduled well so they take a whole 1 cycle extra on an AthlonXP), and simplifies the code so that the following optimization is not so ugly. Optimize for the range 3*pi/4 < |x| < 9*Pi/2 in the same way. On Athlon{XP,64} systems, this gives a 25-40% optimization (depending a lot on CFLAGS) for the cosf() and sinf() consumers on this range. Relative to i387 hardware fcos and fsin, it makes the software versions faster in most cases instead of slower in most cases. The relative optimization is smaller for tanf() the inefficient part is elsewhere. The 53-bit approximation to pi/2 is good enough for pi/4 <= |x| < 3*pi/4 because after losing up to 24 bits to subtraction, we still have 29 bits of precision and only need 25 bits. Even with only 5 extra bits, it is possible to get perfectly rounded results starting with the reduced x, since if x is nearly a multiple of pi/2 then x is not near a half-way case and if x is not nearly a multiple of pi/2 then we don't lose many bits. With our intentionally imperfect rounding we get the same results for cosf(), sinf() and tanf() as without this optimization.
author: bde <bde@FreeBSD.org> 2005-11-06 17:48:02 +0000
committer: bde <bde@FreeBSD.org> 2005-11-06 17:48:02 +0000
commit: e016ebc9a18997b6b2285eca5d9a9b03bc1c8cc8 (patch)
tree: fe6afcf2e6deb5ade4395295176996f7fd31d6e4 /lib
parent: 192c946ba5d70acf20a985ff4934bafb240df963 (diff)
download: FreeBSD-src-e016ebc9a18997b6b2285eca5d9a9b03bc1c8cc8.zip
FreeBSD-src-e016ebc9a18997b6b2285eca5d9a9b03bc1c8cc8.tar.gz
1 files changed, 39 insertions, 9 deletions
diff --git a/lib/msun/src/e_rem_pio2f.c b/lib/msun/src/e_rem_pio2f.c
index eaedc48..3d17158 100644
--- a/lib/msun/src/e_rem_pio2f.c
+++ b/lib/msun/src/e_rem_pio2f.c
@@ -55,6 +55,7 @@ zero =  0.00000000000000000000e+00, /* 0x00000000, 0x00000000 */
 half =  5.00000000000000000000e-01, /* 0x3FE00000, 0x00000000 */
 two24 =  1.67772160000000000000e+07, /* 0x41700000, 0x00000000 */
 invpio2 =  6.36619772367581382433e-01, /* 0x3FE45F30, 0x6DC9C883 */
+pio2    =  1.57079632679489655700e+00, /* 0x3FF921FB, 0x54442d18 */
 pio2_1  =  1.57079632673412561417e+00, /* 0x3FF921FB, 0x54400000 */
 pio2_1t =  6.07710050650619224932e-11; /* 0x3DD0B461, 0x1A626331 */
 
@@ -71,16 +72,45 @@ pio2_1t =  6.07710050650619224932e-11; /* 0x3DD0B461, 0x1A626331 */
 	/* 33+53 bit pi is good enough for special and medium size cases */
 	if(ix<0x4016cbe4) {  /* |x| < 3pi/4, special case with n=+-1 */
 	    if(hx>0) { 
-		z = x - pio2_1;
-		y[0] = z - pio2_1t;
-		y[1] = (z-y[0])-pio2_1t;
-		return 1;
-	    } else {	/* negative x */
-		z = x + pio2_1;
-		y[0] = z + pio2_1t;
-		y[1] = (z-y[0])+pio2_1t;
-		return -1;
+		z = x - pio2;
+		n = 1;
+	    } else {
+		z = x + pio2;
+		n = 3;
 	    }
+	    y[0] = z;
+	    y[1] = z - y[0];
+	    return n;
+	}
+	if(ix<0x407b53d1) {  /* |x| < 5*pi/4, special case with n=+-2 */
+	    if(hx>0)
+		z = x - 2*pio2;
+	    else
+		z = x + 2*pio2;
+	    y[0] = z;
+	    y[1] = z - y[0];
+	    return 2;
+	}
+	if(ix<0x40afeddf) {  /* |x| < 7*pi/4, special case with n=+-3 */
+	    if(hx>0) { 
+		z = x - 3*pio2;
+		n = 3;
+	    } else {
+		z = x + 3*pio2;
+		n = 1;
+	    }
+	    y[0] = z;
+	    y[1] = z - y[0];
+	    return n;
+	}
+	if(ix<0x40e231d6) {  /* |x| < 9*pi/4, special case with n=+-4 */
+	    if(hx>0)
+		z = x - 4*pio2;
+	    else
+		z = x + 4*pio2;
+	    y[0] = z;
+	    y[1] = z - y[0];
+	    return 0;
 	}
 	if(ix<=0x49490f80) { /* |x| ~<= 2^19*(pi/2), medium size */
 	    t  = fabsf(x);
author	bde <bde@FreeBSD.org>	2005-11-06 17:48:02 +0000
committer	bde <bde@FreeBSD.org>	2005-11-06 17:48:02 +0000
commit	e016ebc9a18997b6b2285eca5d9a9b03bc1c8cc8 (patch)
tree	fe6afcf2e6deb5ade4395295176996f7fd31d6e4 /lib
parent	192c946ba5d70acf20a985ff4934bafb240df963 (diff)
download	FreeBSD-src-e016ebc9a18997b6b2285eca5d9a9b03bc1c8cc8.zip FreeBSD-src-e016ebc9a18997b6b2285eca5d9a9b03bc1c8cc8.tar.gz