diff options
author | bde <bde@FreeBSD.org> | 2005-11-19 02:38:27 +0000 |
---|---|---|
committer | bde <bde@FreeBSD.org> | 2005-11-19 02:38:27 +0000 |
commit | 558fb238b170a8cea89a078ae31ee7cf11dc49a6 (patch) | |
tree | 06f77ae5b835dd0091c1d286f4c63858859a2f15 /lib/msun/src | |
parent | 666e602c465a2f1c8965ed92a47086e3d5e98ecf (diff) | |
download | FreeBSD-src-558fb238b170a8cea89a078ae31ee7cf11dc49a6.zip FreeBSD-src-558fb238b170a8cea89a078ae31ee7cf11dc49a6.tar.gz |
Moved all the optimizations for |x| <= 9pi/4 from
__ieee754_rem_pio2f() to its 3 callers and manually inline them.
On Athlons, with favourable compiler flags and optimizations and
favourable pipeline conditions, this gives a speedup of 30-40 cycles
for cosf(), sinf() and tanf() on the range pi/4 < |x| <= 9pi/4, so
these functions are now significantly faster than the hardware trig
functions in many cases. E.g., in a benchmark with uniformly distributed
x in [-2pi, 2pi], A64 hardware fcos took 72-129 cycles and cosf() took
37-55 cycles. Out-of-order execution is needed to get both of these
times. The optimizations in this commit apparently work more by
removing 1 serialization point than by reducing latency.
Diffstat (limited to 'lib/msun/src')
-rw-r--r-- | lib/msun/src/e_rem_pio2f.c | 55 | ||||
-rw-r--r-- | lib/msun/src/s_cosf.c | 35 | ||||
-rw-r--r-- | lib/msun/src/s_sinf.c | 45 | ||||
-rw-r--r-- | lib/msun/src/s_tanf.c | 37 |
4 files changed, 105 insertions, 67 deletions
diff --git a/lib/msun/src/e_rem_pio2f.c b/lib/msun/src/e_rem_pio2f.c index a1d741c..6e939c1 100644 --- a/lib/msun/src/e_rem_pio2f.c +++ b/lib/msun/src/e_rem_pio2f.c @@ -47,10 +47,6 @@ static const int32_t two_over_pi[] = { /* * invpio2: 53 bits of 2/pi - * e1pio2: 1*pi/2 rounded to 53 bits - * e2pio2: 2*pi/2 rounded to 53 bits - * e3pio2: 3*pi/2 rounded to 53 bits - * e4pio2: 4*pi/2 rounded to 53 bits * pio2_1: first 33 bit of pi/2 * pio2_1t: pi/2 - pio2_1 */ @@ -60,10 +56,6 @@ zero = 0.00000000000000000000e+00, /* 0x00000000, 0x00000000 */ half = 5.00000000000000000000e-01, /* 0x3FE00000, 0x00000000 */ two24 = 1.67772160000000000000e+07, /* 0x41700000, 0x00000000 */ invpio2 = 6.36619772367581382433e-01, /* 0x3FE45F30, 0x6DC9C883 */ -e1pio2 = 1*M_PI_2, /* 0x3FF921FB, 0x54442D18 */ -e2pio2 = 2*M_PI_2, /* 0x400921FB, 0x54442D18 */ -e3pio2 = 3*M_PI_2, /* 0x4012D97C, 0x7F3321D2 */ -e4pio2 = 4*M_PI_2, /* 0x401921FB, 0x54442D18 */ pio2_1 = 1.57079632673412561417e+00, /* 0x3FF921FB, 0x54400000 */ pio2_1t = 6.07710050650619224932e-11; /* 0x3DD0B461, 0x1A626331 */ @@ -75,53 +67,6 @@ pio2_1t = 6.07710050650619224932e-11; /* 0x3DD0B461, 0x1A626331 */ GET_FLOAT_WORD(hx,x); ix = hx&0x7fffffff; - if(ix<=0x3f490fda) /* |x| ~<= pi/4, reduction is null */ - {y[0] = x; y[1] = 0; return 0;} - /* 53 bit pi is good enough for special cases */ - if(ix<=0x407b53d1) { /* |x| ~<= 5*pi/4 */ - if(ix<=0x4016cbe3) { /* |x| ~<= 3*pi/4 */ - if(hx>0) { - z = x - e1pio2; - n = 1; - } else { - z = x + e1pio2; - n = 3; - } - y[0] = z; - y[1] = z - y[0]; - return n; - } else { - if(hx>0) - z = x - e2pio2; - else - z = x + e2pio2; - y[0] = z; - y[1] = z - y[0]; - return 2; - } - } - if(ix<=0x40e231d5) { /* |x| ~<= 9*pi/4*/ - if(ix<=0x40afeddf) { /* |x| ~<= 7*pi/4 */ - if(hx>0) { - z = x - e3pio2; - n = 3; - } else { - z = x + e3pio2; - n = 1; - } - y[0] = z; - y[1] = z - y[0]; - return n; - } else { - if(hx>0) - z = x - e4pio2; - else - z = x + e4pio2; - y[0] = z; - y[1] = z - y[0]; - return 0; - } - 
} /* 33+53 bit pi is good enough for medium size */ if(ix<=0x49490f80) { /* |x| ~<= 2^19*(pi/2), medium size */ t = fabsf(x); diff --git a/lib/msun/src/s_cosf.c b/lib/msun/src/s_cosf.c index 3a854b0..ec7f0cf 100644 --- a/lib/msun/src/s_cosf.c +++ b/lib/msun/src/s_cosf.c @@ -20,25 +20,56 @@ static char rcsid[] = "$FreeBSD$"; #include "math.h" #include "math_private.h" +/* Small multiples of pi/2 rounded to double precision. */ +static const double +c1pio2 = 1*M_PI_2, /* 0x3FF921FB, 0x54442D18 */ +c2pio2 = 2*M_PI_2, /* 0x400921FB, 0x54442D18 */ +c3pio2 = 3*M_PI_2, /* 0x4012D97C, 0x7F3321D2 */ +c4pio2 = 4*M_PI_2; /* 0x401921FB, 0x54442D18 */ + +static inline float +__kernel_cosdf(double x) +{ + return __kernel_cosf((float)x, x - (float)x); +} + +static inline float +__kernel_sindf(double x) +{ + return __kernel_sinf((float)x, x - (float)x, 1); +} + float cosf(float x) { float y[2]; int32_t n,ix; + x = fabsf(x); GET_FLOAT_WORD(ix,x); - ix &= 0x7fffffff; if(ix <= 0x3f490fda) { /* |x| ~<= pi/4 */ if(ix<0x39800000) /* |x| < 2**-12 */ if(((int)x)==0) return 1.0; /* 1 with inexact if x != 0 */ return __kernel_cosf(x,0.0); } + if(ix<=0x407b53d1) { /* |x| <= ~5*pi/4 */ + if(ix<=0x4016cbe3) /* |x| <= ~3pi/4 */ + return -__kernel_sindf(x - c1pio2); + else + return -__kernel_cosdf(x - c2pio2); + } + if(ix<=0x40e231d5) { /* |x| <= ~9*pi/4 */ + if(ix<=0x40afeddf) /* |x| <= ~7*pi/4 */ + return __kernel_sindf(x - c3pio2); + else + return __kernel_cosdf(x - c4pio2); + } /* cos(Inf or NaN) is NaN */ else if (ix>=0x7f800000) return x-x; - /* argument reduction needed */ + /* general argument reduction needed */ else { n = __ieee754_rem_pio2f(x,y); switch(n&3) { diff --git a/lib/msun/src/s_sinf.c b/lib/msun/src/s_sinf.c index 7ddb8b6..cac6e26 100644 --- a/lib/msun/src/s_sinf.c +++ b/lib/msun/src/s_sinf.c @@ -20,25 +20,62 @@ static char rcsid[] = "$FreeBSD$"; #include "math.h" #include "math_private.h" +/* Small multiples of pi/2 rounded to double precision. 
*/ +static const double +s1pio2 = 1*M_PI_2, /* 0x3FF921FB, 0x54442D18 */ +s2pio2 = 2*M_PI_2, /* 0x400921FB, 0x54442D18 */ +s3pio2 = 3*M_PI_2, /* 0x4012D97C, 0x7F3321D2 */ +s4pio2 = 4*M_PI_2; /* 0x401921FB, 0x54442D18 */ + +static inline float +__kernel_cosdf(double x) +{ + return __kernel_cosf((float)x, x - (float)x); +} + +static inline float +__kernel_sindf(double x) +{ + return __kernel_sinf((float)x, x - (float)x, 1); +} + float sinf(float x) { float y[2]; - int32_t n, ix; + int32_t n, hx, ix; - GET_FLOAT_WORD(ix,x); - ix &= 0x7fffffff; + GET_FLOAT_WORD(hx,x); + ix = hx & 0x7fffffff; if(ix <= 0x3f490fda) { /* |x| ~<= pi/4 */ if(ix<0x39800000) /* |x| < 2**-12 */ if(((int)x)==0) return x; /* x with inexact if x != 0 */ return __kernel_sinf(x,0.0,0); } + if(ix<=0x407b53d1) { /* |x| <= ~5*pi/4 */ + if(ix<=0x4016cbe3) { /* |x| <= ~3pi/4 */ + if(hx>0) + return __kernel_cosdf(x - s1pio2); + else + return -__kernel_cosdf(x + s1pio2); + } else + return -__kernel_sindf(x + (hx > 0 ? -s2pio2 : s2pio2)); + } + if(ix<=0x40e231d5) { /* |x| <= ~9*pi/4 */ + if(ix<=0x40afeddf) { /* |x| <= ~7*pi/4 */ + if(hx>0) + return -__kernel_cosdf(x - s3pio2); + else + return __kernel_cosdf(x + s3pio2); + } else + return __kernel_sindf(x + (hx > 0 ? -s4pio2 : s4pio2)); + } /* sin(Inf or NaN) is NaN */ else if (ix>=0x7f800000) return x-x; - /* argument reduction needed */ + /* general argument reduction needed */ else { n = __ieee754_rem_pio2f(x,y); switch(n&3) { diff --git a/lib/msun/src/s_tanf.c b/lib/msun/src/s_tanf.c index 7092226..f97c4f6 100644 --- a/lib/msun/src/s_tanf.c +++ b/lib/msun/src/s_tanf.c @@ -20,25 +20,50 @@ static char rcsid[] = "$FreeBSD$"; #include "math.h" #include "math_private.h" +/* Small multiples of pi/2 rounded to double precision. 
*/ +static const double +t1pio2 = 1*M_PI_2, /* 0x3FF921FB, 0x54442D18 */ +t2pio2 = 2*M_PI_2, /* 0x400921FB, 0x54442D18 */ +t3pio2 = 3*M_PI_2, /* 0x4012D97C, 0x7F3321D2 */ +t4pio2 = 4*M_PI_2; /* 0x401921FB, 0x54442D18 */ + +static inline float +__kernel_tandf(double x, int iy) +{ + return __kernel_tanf((float)x, x - (float)x, iy); +} + float tanf(float x) { - float y[2]; - int32_t n, ix; + float y[2],z=0.0; + int32_t n, hx, ix; - GET_FLOAT_WORD(ix,x); - ix &= 0x7fffffff; + GET_FLOAT_WORD(hx,x); + ix = hx & 0x7fffffff; if(ix <= 0x3f490fda) { /* |x| ~<= pi/4 */ if(ix<0x39800000) /* |x| < 2**-12 */ if(((int)x)==0) return x; /* x with inexact if x != 0 */ - return __kernel_tanf(x,0.0,1); + return __kernel_tanf(x,z,1); + } + if(ix<=0x407b53d1) { /* |x| ~<= 5*pi/4 */ + if(ix<=0x4016cbe3) /* |x| ~<= 3pi/4 */ + return __kernel_tandf(x + (hx>0 ? -t1pio2 : t1pio2), -1); + else + return __kernel_tandf(x + (hx>0 ? -t2pio2 : t2pio2), 1); + } + if(ix<=0x40e231d5) { /* |x| ~<= 9*pi/4 */ + if(ix<=0x40afeddf) /* |x| ~<= 7*pi/4 */ + return __kernel_tandf(x + (hx>0 ? -t3pio2 : t3pio2), -1); + else + return __kernel_tandf(x + (hx>0 ? -t4pio2 : t4pio2), 1); } /* tan(Inf or NaN) is NaN */ else if (ix>=0x7f800000) return x-x; - /* argument reduction needed */ + /* general argument reduction needed */ else { n = __ieee754_rem_pio2f(x,y); /* integer parameter: 1 -- n even; -1 -- n odd */ |