summaryrefslogtreecommitdiffstats
path: root/sys/amd64/include/ieeefp.h
diff options
context:
space:
mode:
authorbde <bde@FreeBSD.org>2008-01-11 17:11:32 +0000
committerbde <bde@FreeBSD.org>2008-01-11 17:11:32 +0000
commit369b6ad6b5135717ecaa2a37fe06b76f38f19948 (patch)
tree72300e360553c07432bd4306de5d2b71969f4a99 /sys/amd64/include/ieeefp.h
parent508466854a8b80a9c20126b0aee844486b4a2b3d (diff)
downloadFreeBSD-src-369b6ad6b5135717ecaa2a37fe06b76f38f19948.zip
FreeBSD-src-369b6ad6b5135717ecaa2a37fe06b76f38f19948.tar.gz
Fix fpset*() to not trap if there is a currently unmasked exception.
Unmasked exceptions (which can be fixed up using fpset*() before they trap) are very rare, especially on amd64 since SSE exceptions trap synchronously, but I want to merge the faster amd64 implementations of fpset*() back to i386 without introducing the bug on i386. The i386 implementation has always avoided the trap automatically by changing things using load/store of the FP environment, but this is very slow. Most changes only affect the control word, so they can usually be done much more efficiently, and amd64 has always done this, but loading the control word can trap. This version uses the fast method only in the usual case where it will not trap. This only costs a couple of integer instructions (including one branch which I haven't optimized carefully yet) in the usual case, but bloats the inlines a lot. The inlines were already a bit too large to handle both the FPU and SSE.
Diffstat (limited to 'sys/amd64/include/ieeefp.h')
-rw-r--r--sys/amd64/include/ieeefp.h54
1 file changed, 42 insertions(+), 12 deletions(-)
diff --git a/sys/amd64/include/ieeefp.h b/sys/amd64/include/ieeefp.h
index ce03cd3..1244492 100644
--- a/sys/amd64/include/ieeefp.h
+++ b/sys/amd64/include/ieeefp.h
@@ -125,12 +125,42 @@ typedef enum {
#ifdef __GNUCLIKE_ASM
#define __fldcw(addr) __asm __volatile("fldcw %0" : : "m" (*(addr)))
+#define __fldenv(addr) __asm __volatile("fldenv %0" : : "m" (*(addr)))
#define __fnstcw(addr) __asm __volatile("fnstcw %0" : "=m" (*(addr)))
+#define __fnstenv(addr) __asm __volatile("fnstenv %0" : "=m" (*(addr)))
#define __fnstsw(addr) __asm __volatile("fnstsw %0" : "=m" (*(addr)))
#define __ldmxcsr(addr) __asm __volatile("ldmxcsr %0" : : "m" (*(addr)))
#define __stmxcsr(addr) __asm __volatile("stmxcsr %0" : "=m" (*(addr)))
/*
+ * Load the control word. Be careful not to trap if there is a currently
+ * unmasked exception (ones that will become freshly unmasked are not a
+ * problem). This case must be handled by a save/restore of the
+ * environment or even of the full x87 state. Accessing the environment
+ * is very inefficient, so only do it when necessary.
+ */
+static __inline void
+__fnldcw(unsigned short _cw, unsigned short _newcw)
+{
+ struct {
+ unsigned _cw;
+ unsigned _other[6];
+ } _env;
+ unsigned short _sw;
+
+ if ((_cw & FP_MSKS_FLD) != FP_MSKS_FLD) {
+ __fnstsw(&_sw);
+ if (((_sw & ~_cw) & FP_STKY_FLD) != 0) {
+ __fnstenv(&_env);
+ _env._cw = _newcw;
+ __fldenv(&_env);
+ return;
+ }
+ }
+ __fldcw(&_newcw);
+}
+
+/*
* General notes about conflicting SSE vs FP status bits.
* This code assumes that software will not fiddle with the control
* bits of the SSE and x87 in such a way to get them out of sync and
@@ -155,13 +185,13 @@ __fpsetround(fp_rnd_t _m)
{
fp_rnd_t _p;
unsigned _mxcsr;
- unsigned short _cw;
+ unsigned short _cw, _newcw;
__fnstcw(&_cw);
_p = (fp_rnd_t)((_cw & FP_RND_FLD) >> FP_RND_OFF);
- _cw &= ~FP_RND_FLD;
- _cw |= (_m << FP_RND_OFF) & FP_RND_FLD;
- __fldcw(&_cw);
+ _newcw = _cw & ~FP_RND_FLD;
+ _newcw |= (_m << FP_RND_OFF) & FP_RND_FLD;
+ __fnldcw(_cw, _newcw);
__stmxcsr(&_mxcsr);
_mxcsr &= ~SSE_RND_FLD;
_mxcsr |= (_m << SSE_RND_OFF) & SSE_RND_FLD;
@@ -187,13 +217,13 @@ static __inline fp_prec_t
__fpsetprec(fp_prec_t _m)
{
fp_prec_t _p;
- unsigned short _cw;
+ unsigned short _cw, _newcw;
__fnstcw(&_cw);
_p = (fp_prec_t)((_cw & FP_PRC_FLD) >> FP_PRC_OFF);
- _cw &= ~FP_PRC_FLD;
- _cw |= (_m << FP_PRC_OFF) & FP_PRC_FLD;
- __fldcw(&_cw);
+ _newcw = _cw & ~FP_PRC_FLD;
+ _newcw |= (_m << FP_PRC_OFF) & FP_PRC_FLD;
+ __fnldcw(_cw, _newcw);
return (_p);
}
@@ -217,13 +247,13 @@ __fpsetmask(fp_except_t _m)
{
fp_except_t _p;
unsigned _mxcsr;
- unsigned short _cw;
+ unsigned short _cw, _newcw;
__fnstcw(&_cw);
_p = (~_cw & FP_MSKS_FLD) >> FP_MSKS_OFF;
- _cw &= ~FP_MSKS_FLD;
- _cw |= (~_m << FP_MSKS_OFF) & FP_MSKS_FLD;
- __fldcw(&_cw);
+ _newcw = _cw & ~FP_MSKS_FLD;
+ _newcw |= (~_m << FP_MSKS_OFF) & FP_MSKS_FLD;
+ __fnldcw(_cw, _newcw);
__stmxcsr(&_mxcsr);
/* XXX should we clear non-ieee SSE_DAZ_FLD and SSE_FZ_FLD ? */
_mxcsr &= ~SSE_MSKS_FLD;
OpenPOWER on IntegriCloud