Optimize for 3pi/4 <= |x| <= 9pi/4 in much the same way as for

pi/4 <= |x| <= 3pi/4. Use the same branch ladder as for float precision. Remove the optimization for |x| near pi/2 and don't do it near the multiples of pi/2 in the newly optimized range, since it requires fairly large code to handle only relativley few cases. Ifdef out optimization for |x| <= pi/4 since this case can't occur because it is done in callers. On amd64 (A64), for cos() and sin() with uniformly distributed args, no cache misses, some parallelism in the caller, and good but not great CC and CFLAGS, etc., this saves about 40 cycles or 38% in the newly optimized range, or about 27% on average across the range |x| <= 2pi (~65 cycles for most args, while the A64 hardware fcos and fsin take ~75 cycles for half the args and 125 cycles for the other half). The speedup for tan() is much smaller, especially relatively. The speedup on i386 (A64) is slightly smaller, especially relatively. i386 is still much slower than amd64 here (unlike in the float case where it is slightly faster).
author: bde <bde@FreeBSD.org> 2008-02-19 15:30:58 +0000
committer: bde <bde@FreeBSD.org> 2008-02-19 15:30:58 +0000
commit: 30565c600e032f4ffa87a25756cc0bef6e9f2f0d (patch)
tree: 2e36b01874df4a4d753d355f47ab0645349f9d1f /lib/msun
parent: e508bf1279c21ed622306af43844139988972c81 (diff)
download: FreeBSD-src-30565c600e032f4ffa87a25756cc0bef6e9f2f0d.zip
FreeBSD-src-30565c600e032f4ffa87a25756cc0bef6e9f2f0d.tar.gz
1 files changed, 56 insertions, 18 deletions
diff --git a/lib/msun/src/e_rem_pio2.c b/lib/msun/src/e_rem_pio2.c
index 5467744..c7d2064 100644
--- a/lib/msun/src/e_rem_pio2.c
+++ b/lib/msun/src/e_rem_pio2.c
@@ -68,34 +68,72 @@ __ieee754_rem_pio2(double x, double *y)
 
 	GET_HIGH_WORD(hx,x);		/* high word of x */
 	ix = hx&0x7fffffff;
+#if 0 /* Must be handled in caller. */
 	if(ix<=0x3fe921fb)   /* |x| ~<= pi/4 , no need for reduction */
 	    {y[0] = x; y[1] = 0; return 0;}
-	if(ix<0x4002d97c) {  /* |x| < 3pi/4, special case with n=+-1 */
-	    if(hx>0) { 
-		z = x - pio2_1;
-		if(ix!=0x3ff921fb) { 	/* 33+53 bit pi is good enough */
+#endif
+	if (ix <= 0x400f6a7a) {		/* |x| ~<= 5pi/4 */
+	    if ((ix & 0xfffff) == 0x921fb)  /* |x| ~= pi/2 or 2pi/2 */
+		goto medium;		/* cancellation -- use medium case */
+	    if (ix <= 0x4002d97c) {	/* |x| ~<= 3pi/4 */
+		if (hx > 0) {
+		    z = x - pio2_1;	/* one round good to 85 bits */
 		    y[0] = z - pio2_1t;
 		    y[1] = (z-y[0])-pio2_1t;
-		} else {		/* near pi/2, use 33+33+53 bit pi */
-		    z -= pio2_2;
-		    y[0] = z - pio2_2t;
-		    y[1] = (z-y[0])-pio2_2t;
-		}
-		return 1;
-	    } else {	/* negative x */
-		z = x + pio2_1;
-		if(ix!=0x3ff921fb) { 	/* 33+53 bit pi is good enough */
+		    return 1;
+		} else {
+		    z = x + pio2_1;
 		    y[0] = z + pio2_1t;
 		    y[1] = (z-y[0])+pio2_1t;
-		} else {		/* near pi/2, use 33+33+53 bit pi */
-		    z += pio2_2;
-		    y[0] = z + pio2_2t;
-		    y[1] = (z-y[0])+pio2_2t;
+		    return -1;
+		}
+	    } else {
+		if (hx > 0) {
+		    z = x - 2*pio2_1;
+		    y[0] = z - 2*pio2_1t;
+		    y[1] = (z-y[0])-2*pio2_1t;
+		    return 2;
+		} else {
+		    z = x + 2*pio2_1;
+		    y[0] = z + 2*pio2_1t;
+		    y[1] = (z-y[0])+2*pio2_1t;
+		    return -2;
+		}
+	    }
+	}
+	if (ix <= 0x401c463b) {		/* |x| ~<= 9pi/4 */
+	    if (ix <= 0x4015fdbc) {	/* |x| ~<= 7pi/4 */
+		if (ix == 0x4012d97c)	/* |x| ~= 3pi/2 */
+		    goto medium;
+		if (hx > 0) {
+		    z = x - 3*pio2_1;
+		    y[0] = z - 3*pio2_1t;
+		    y[1] = (z-y[0])-3*pio2_1t;
+		    return 3;
+		} else {
+		    z = x + 3*pio2_1;
+		    y[0] = z + 3*pio2_1t;
+		    y[1] = (z-y[0])+3*pio2_1t;
+		    return -3;
+		}
+	    } else {
+		if (ix == 0x401921fb)	/* |x| ~= 4pi/2 */
+		    goto medium;
+		if (hx > 0) {
+		    z = x - 4*pio2_1;
+		    y[0] = z - 4*pio2_1t;
+		    y[1] = (z-y[0])-4*pio2_1t;
+		    return 4;
+		} else {
+		    z = x + 4*pio2_1;
+		    y[0] = z + 4*pio2_1t;
+		    y[1] = (z-y[0])+4*pio2_1t;
+		    return -4;
 		}
-		return -1;
 	    }
 	}
 	if(ix<=0x413921fb) { /* |x| ~<= 2^19*(pi/2), medium size */
+medium:
 	    t  = fabs(x);
 	    n  = (int32_t) (t*invpio2+half);
 	    fn = (double)n;
author	bde <bde@FreeBSD.org>	2008-02-19 15:30:58 +0000
committer	bde <bde@FreeBSD.org>	2008-02-19 15:30:58 +0000
commit	30565c600e032f4ffa87a25756cc0bef6e9f2f0d (patch)
tree	2e36b01874df4a4d753d355f47ab0645349f9d1f /lib/msun
parent	e508bf1279c21ed622306af43844139988972c81 (diff)
download	FreeBSD-src-30565c600e032f4ffa87a25756cc0bef6e9f2f0d.zip FreeBSD-src-30565c600e032f4ffa87a25756cc0bef6e9f2f0d.tar.gz