summaryrefslogtreecommitdiffstats
path: root/lib/msun/src/e_rem_pio2f.c
diff options
context:
space:
mode:
authorbde <bde@FreeBSD.org>2005-11-19 02:38:27 +0000
committerbde <bde@FreeBSD.org>2005-11-19 02:38:27 +0000
commit558fb238b170a8cea89a078ae31ee7cf11dc49a6 (patch)
tree06f77ae5b835dd0091c1d286f4c63858859a2f15 /lib/msun/src/e_rem_pio2f.c
parent666e602c465a2f1c8965ed92a47086e3d5e98ecf (diff)
downloadFreeBSD-src-558fb238b170a8cea89a078ae31ee7cf11dc49a6.zip
FreeBSD-src-558fb238b170a8cea89a078ae31ee7cf11dc49a6.tar.gz
Moved all the optimizations for |x| <= 9pi/4 from
__ieee754_rem_pio2f() to its 3 callers and manually inline them. On Athlons, with favourable compiler flags and optimizations and favourable pipeline conditions, this gives a speedup of 30-40 cycles for cosf(), sinf() and tanf() on the range pi/4 < |x| <= 9pi/4, so these functions are now significantly faster than the hardware trig functions in many cases. E.g., in a benchmark with uniformly distributed x in [-2pi, 2pi], A64 hardware fcos took 72-129 cycles and cosf() took 37-55 cycles. Out-of-order execution is needed to get both of these times. The optimizations in this commit apparently work more by removing 1 serialization point than by reducing latency.
Diffstat (limited to 'lib/msun/src/e_rem_pio2f.c')
-rw-r--r--lib/msun/src/e_rem_pio2f.c55
1 files changed, 0 insertions, 55 deletions
diff --git a/lib/msun/src/e_rem_pio2f.c b/lib/msun/src/e_rem_pio2f.c
index a1d741c..6e939c1 100644
--- a/lib/msun/src/e_rem_pio2f.c
+++ b/lib/msun/src/e_rem_pio2f.c
@@ -47,10 +47,6 @@ static const int32_t two_over_pi[] = {
/*
* invpio2: 53 bits of 2/pi
- * e1pio2: 1*pi/2 rounded to 53 bits
- * e2pio2: 2*pi/2 rounded to 53 bits
- * e3pio2: 3*pi/2 rounded to 53 bits
- * e4pio2: 4*pi/2 rounded to 53 bits
* pio2_1: first 33 bit of pi/2
* pio2_1t: pi/2 - pio2_1
*/
@@ -60,10 +56,6 @@ zero = 0.00000000000000000000e+00, /* 0x00000000, 0x00000000 */
half = 5.00000000000000000000e-01, /* 0x3FE00000, 0x00000000 */
two24 = 1.67772160000000000000e+07, /* 0x41700000, 0x00000000 */
invpio2 = 6.36619772367581382433e-01, /* 0x3FE45F30, 0x6DC9C883 */
-e1pio2 = 1*M_PI_2, /* 0x3FF921FB, 0x54442D18 */
-e2pio2 = 2*M_PI_2, /* 0x400921FB, 0x54442D18 */
-e3pio2 = 3*M_PI_2, /* 0x4012D97C, 0x7F3321D2 */
-e4pio2 = 4*M_PI_2, /* 0x401921FB, 0x54442D18 */
pio2_1 = 1.57079632673412561417e+00, /* 0x3FF921FB, 0x54400000 */
pio2_1t = 6.07710050650619224932e-11; /* 0x3DD0B461, 0x1A626331 */
@@ -75,53 +67,6 @@ pio2_1t = 6.07710050650619224932e-11; /* 0x3DD0B461, 0x1A626331 */
GET_FLOAT_WORD(hx,x);
ix = hx&0x7fffffff;
- if(ix<=0x3f490fda) /* |x| ~<= pi/4, reduction is null */
- {y[0] = x; y[1] = 0; return 0;}
- /* 53 bit pi is good enough for special cases */
- if(ix<=0x407b53d1) { /* |x| ~<= 5*pi/4 */
- if(ix<=0x4016cbe3) { /* |x| ~<= 3*pi/4 */
- if(hx>0) {
- z = x - e1pio2;
- n = 1;
- } else {
- z = x + e1pio2;
- n = 3;
- }
- y[0] = z;
- y[1] = z - y[0];
- return n;
- } else {
- if(hx>0)
- z = x - e2pio2;
- else
- z = x + e2pio2;
- y[0] = z;
- y[1] = z - y[0];
- return 2;
- }
- }
- if(ix<=0x40e231d5) { /* |x| ~<= 9*pi/4*/
- if(ix<=0x40afeddf) { /* |x| ~<= 7*pi/4 */
- if(hx>0) {
- z = x - e3pio2;
- n = 3;
- } else {
- z = x + e3pio2;
- n = 1;
- }
- y[0] = z;
- y[1] = z - y[0];
- return n;
- } else {
- if(hx>0)
- z = x - e4pio2;
- else
- z = x + e4pio2;
- y[0] = z;
- y[1] = z - y[0];
- return 0;
- }
- }
/* 33+53 bit pi is good enough for medium size */
if(ix<=0x49490f80) { /* |x| ~<= 2^19*(pi/2), medium size */
t = fabsf(x);
OpenPOWER on IntegriCloud