summaryrefslogtreecommitdiffstats
path: root/sys/amd64
diff options
context:
space:
mode:
authorpeter <peter@FreeBSD.org>2001-07-12 06:32:51 +0000
committerpeter <peter@FreeBSD.org>2001-07-12 06:32:51 +0000
commite00129231d89ad2ab6ec8862931dd864ff079d0c (patch)
treeacb374c639798d8213f972fae7e8609dec013b5f /sys/amd64
parentb5164c6585fabc3c848ff92ada3f6ffb64a9e8f2 (diff)
downloadFreeBSD-src-e00129231d89ad2ab6ec8862931dd864ff079d0c.zip
FreeBSD-src-e00129231d89ad2ab6ec8862931dd864ff079d0c.tar.gz
Activate SSE/SIMD. This is the extra context switching support that
we are required to do if we let user processes use the extra 128-bit
registers etc.

This is the base part of the diff I got from:
http://www.issei.org/issei/FreeBSD/sse.html
I believe this is by: Mr. SUZUKI Issei <issei@issei.org>
SMP support apparently by: Takekazu KATO <kato@chino.it.okayama-u.ac.jp>
Test code by: NAKAMURA Kazushi <kaz@kobe1995.net>, see
http://kobe1995.net/~kaz/FreeBSD/SSE.en.html

I have fixed a couple of style(9) deviations. I have some follow-up
commits to fix a couple of non-style things.
Diffstat (limited to 'sys/amd64')
-rw-r--r--sys/amd64/amd64/exception.S3
-rw-r--r--sys/amd64/amd64/exception.s3
-rw-r--r--sys/amd64/amd64/fpu.c93
-rw-r--r--sys/amd64/amd64/genassym.c5
-rw-r--r--sys/amd64/amd64/initcpu.c22
-rw-r--r--sys/amd64/amd64/locore.S3
-rw-r--r--sys/amd64/amd64/locore.s3
-rw-r--r--sys/amd64/amd64/machdep.c83
-rw-r--r--sys/amd64/amd64/mp_machdep.c5
-rw-r--r--sys/amd64/amd64/mptable.c5
-rw-r--r--sys/amd64/amd64/support.S8
-rw-r--r--sys/amd64/amd64/support.s8
-rw-r--r--sys/amd64/amd64/trap.c5
-rw-r--r--sys/amd64/amd64/vm_machdep.c2
-rw-r--r--sys/amd64/include/fpu.h38
-rw-r--r--sys/amd64/include/md_var.h1
-rw-r--r--sys/amd64/include/mptable.h5
-rw-r--r--sys/amd64/include/npx.h38
-rw-r--r--sys/amd64/include/pcb.h3
-rw-r--r--sys/amd64/include/specialreg.h2
-rw-r--r--sys/amd64/include/trap.h3
-rw-r--r--sys/amd64/isa/npx.c93
22 files changed, 387 insertions, 44 deletions
diff --git a/sys/amd64/amd64/exception.S b/sys/amd64/amd64/exception.S
index 3ccbc23..419fbd2 100644
--- a/sys/amd64/amd64/exception.S
+++ b/sys/amd64/amd64/exception.S
@@ -153,6 +153,9 @@ IDTVEC(fpu)
IDTVEC(align)
TRAP(T_ALIGNFLT)
+IDTVEC(xmm)
+ pushl $0; TRAP(T_XMMFLT)
+
/*
* alltraps entry point. Interrupts are enabled if this was a trap
* gate (TGT), else disabled if this was an interrupt gate (IGT).
diff --git a/sys/amd64/amd64/exception.s b/sys/amd64/amd64/exception.s
index 3ccbc23..419fbd2 100644
--- a/sys/amd64/amd64/exception.s
+++ b/sys/amd64/amd64/exception.s
@@ -153,6 +153,9 @@ IDTVEC(fpu)
IDTVEC(align)
TRAP(T_ALIGNFLT)
+IDTVEC(xmm)
+ pushl $0; TRAP(T_XMMFLT)
+
/*
* alltraps entry point. Interrupts are enabled if this was a trap
* gate (TGT), else disabled if this was an interrupt gate (IGT).
diff --git a/sys/amd64/amd64/fpu.c b/sys/amd64/amd64/fpu.c
index b6c69a0..f6410e9 100644
--- a/sys/amd64/amd64/fpu.c
+++ b/sys/amd64/amd64/fpu.c
@@ -35,6 +35,7 @@
* $FreeBSD$
*/
+#include "opt_cpu.h"
#include "opt_debug_npx.h"
#include "opt_math_emulate.h"
@@ -99,6 +100,8 @@
#define fnstsw(addr) __asm __volatile("fnstsw %0" : "=m" (*(addr)))
#define fp_divide_by_0() __asm("fldz; fld1; fdiv %st,%st(1); fnop")
#define frstor(addr) __asm("frstor %0" : : "m" (*(addr)))
+#define fxrstor(addr) __asm("fxrstor %0" : : "m" (*(addr)))
+#define fxsave(addr) __asm __volatile("fxsave %0" : "=m" (*(addr)))
#define start_emulating() __asm("smsw %%ax; orb %0,%%al; lmsw %%ax" \
: : "n" (CR0_TS) : "ax")
#define stop_emulating() __asm("clts")
@@ -113,11 +116,41 @@ void fnstcw __P((caddr_t addr));
void fnstsw __P((caddr_t addr));
void fp_divide_by_0 __P((void));
void frstor __P((caddr_t addr));
+void fxsave __P((caddr_t addr));
+void fxrstor __P((caddr_t addr));
void start_emulating __P((void));
void stop_emulating __P((void));
#endif /* __GNUC__ */
+/*
+ * Accessors for the FPU words saved in a process's pcb_save area.
+ * With CPU_ENABLE_SSE, cpu_fxsr picks the sv_xmm or the sv_87 view at
+ * run time; without it only the sv_87 view is used.
+ *
+ * GET_FPU_CW(proc)	  saved control word
+ * GET_FPU_SW(proc)	  saved status word
+ * MASK_FPU_SW(proc, m)	  AND m into the saved status word, in place
+ * GET_FPU_EXSW_PTR(pcb)  pointer to the sv_ex_sw field
+ *
+ * MASK_FPU_SW must assign: npxtrap() invokes it as a statement to clear
+ * bits in the saved status word, so a plain '&' would have no effect.
+ * Each arm is parenthesized because an assignment cannot appear
+ * unparenthesized after the ':' of a conditional expression.
+ */
+#ifdef CPU_ENABLE_SSE
+#define GET_FPU_CW(proc) \
+ (cpu_fxsr ? \
+ (proc)->p_addr->u_pcb.pcb_save.sv_xmm.sv_env.en_cw : \
+ (proc)->p_addr->u_pcb.pcb_save.sv_87.sv_env.en_cw)
+#define GET_FPU_SW(proc) \
+ (cpu_fxsr ? \
+ (proc)->p_addr->u_pcb.pcb_save.sv_xmm.sv_env.en_sw : \
+ (proc)->p_addr->u_pcb.pcb_save.sv_87.sv_env.en_sw)
+#define MASK_FPU_SW(proc, mask) \
+ (cpu_fxsr ? \
+ ((proc)->p_addr->u_pcb.pcb_save.sv_xmm.sv_env.en_sw &= (mask)) : \
+ ((proc)->p_addr->u_pcb.pcb_save.sv_87.sv_env.en_sw &= (mask)))
+#define GET_FPU_EXSW_PTR(pcb) \
+ (cpu_fxsr ? \
+ &(pcb)->pcb_save.sv_xmm.sv_ex_sw : \
+ &(pcb)->pcb_save.sv_87.sv_ex_sw)
+#else /* CPU_ENABLE_SSE */
+#define GET_FPU_CW(proc) \
+ ((proc)->p_addr->u_pcb.pcb_save.sv_87.sv_env.en_cw)
+#define GET_FPU_SW(proc) \
+ ((proc)->p_addr->u_pcb.pcb_save.sv_87.sv_env.en_sw)
+#define MASK_FPU_SW(proc, mask) \
+ ((proc)->p_addr->u_pcb.pcb_save.sv_87.sv_env.en_sw &= (mask))
+#define GET_FPU_EXSW_PTR(pcb) \
+ (&(pcb)->pcb_save.sv_87.sv_ex_sw)
+#endif /* CPU_ENABLE_SSE */
+
typedef u_char bool_t;
static int npx_attach __P((device_t dev));
@@ -127,6 +160,8 @@ static void npx_intr __P((void *));
#endif
static int npx_probe __P((device_t dev));
static int npx_probe1 __P((device_t dev));
+static void fpusave __P((union savefpu *, u_char));
+static void fpurstor __P((union savefpu *, u_char));
#ifdef I586_CPU_XXX
static long timezero __P((const char *funcname,
void (*func)(void *buf, size_t len)));
@@ -529,7 +564,7 @@ void
npxinit(control)
u_short control;
{
- struct save87 dummy;
+ union savefpu dummy;
critical_t savecrit;
if (!npx_exists)
@@ -544,7 +579,7 @@ npxinit(control)
stop_emulating();
fldcw(&control);
if (PCPU_GET(curpcb) != NULL)
- fnsave(&PCPU_GET(curpcb)->pcb_savefpu);
+ fpusave(&PCPU_GET(curpcb)->pcb_save, curproc->p_oncpu);
start_emulating();
critical_exit(savecrit);
}
@@ -560,7 +595,7 @@ npxexit(p)
savecrit = critical_enter();
if (p == PCPU_GET(npxproc))
- npxsave(&PCPU_GET(curpcb)->pcb_savefpu);
+ npxsave(&PCPU_GET(curpcb)->pcb_save);
critical_exit(savecrit);
#ifdef NPX_DEBUG
if (npx_exists) {
@@ -773,6 +808,7 @@ npxtrap()
{
critical_t savecrit;
u_short control, status;
+ u_long *exstat;
if (!npx_exists) {
printf("npxtrap: npxproc = %p, curproc = %p, npx_exists = %d\n",
@@ -787,16 +823,17 @@ npxtrap()
* wherever they are.
*/
if (PCPU_GET(npxproc) != curproc) {
- control = curproc->p_addr->u_pcb.pcb_savefpu.sv_env.en_cw;
- status = curproc->p_addr->u_pcb.pcb_savefpu.sv_env.en_sw;
+ control = GET_FPU_CW(curproc);
+ status = GET_FPU_SW(curproc);
} else {
fnstcw(&control);
fnstsw(&status);
}
- curproc->p_addr->u_pcb.pcb_savefpu.sv_ex_sw = status;
+ exstat = GET_FPU_EXSW_PTR(&curproc->p_addr->u_pcb);
+ *exstat = status;
if (PCPU_GET(npxproc) != curproc)
- curproc->p_addr->u_pcb.pcb_savefpu.sv_env.en_sw &= ~0x80bf;
+ MASK_FPU_SW(curproc, ~0x80bf);
else
fnclex();
critical_exit(savecrit);
@@ -813,6 +850,7 @@ npxtrap()
int
npxdna()
{
+ u_long *exstat;
critical_t s;
if (!npx_exists)
@@ -828,7 +866,9 @@ npxdna()
* Record new context early in case frstor causes an IRQ13.
*/
PCPU_SET(npxproc, CURPROC);
- PCPU_GET(curpcb)->pcb_savefpu.sv_ex_sw = 0;
+
+ exstat = GET_FPU_EXSW_PTR(PCPU_GET(curpcb));
+ *exstat = 0;
/*
* The following frstor may cause an IRQ13 when the state being
* restored has a pending error. The error will appear to have been
@@ -841,7 +881,7 @@ npxdna()
* fnsave are broken, so our treatment breaks fnclex if it is the
* first FPU instruction after a context switch.
*/
- frstor(&PCPU_GET(curpcb)->pcb_savefpu);
+ fpurstor(&PCPU_GET(curpcb)->pcb_save, curproc->p_oncpu);
critical_exit(s);
return (1);
@@ -872,15 +912,46 @@ npxdna()
*/
void
npxsave(addr)
- struct save87 *addr;
+ union savefpu *addr;
{
stop_emulating();
- fnsave(addr);
+ fpusave(addr, curproc->p_oncpu);
+
start_emulating();
PCPU_SET(npxproc, NULL);
}
+static void /* save the FPU state into *addr: fnsave, or fxsave when cpu_fxsr is set */
+fpusave(addr, oncpu)
+ union savefpu *addr; /* destination save area */
+ u_char oncpu; /* index of the svxmm[] slot to stage through */
+{
+ static struct savexmm svxmm[MAXCPU]; /* staging buffers, MAXCPU slots; private to this function */
+ /* NOTE(review): staging presumably exists so fxsave gets a 16-byte aligned operand (struct savexmm is declared aligned(16)) -- confirm */
+ if (!cpu_fxsr)
+ fnsave(addr); /* legacy path: store straight into *addr */
+ else {
+ fxsave(&svxmm[oncpu]); /* store into the staging slot first ... */
+ bcopy(&svxmm[oncpu], addr, sizeof(struct savexmm)); /* ... then copy it out to *addr */
+ }
+}
+
+static void /* reload the FPU state from *addr: frstor, or fxrstor when cpu_fxsr is set */
+fpurstor(addr, oncpu)
+ union savefpu *addr; /* save area to restore from */
+ u_char oncpu; /* index of the svxmm[] slot to stage through */
+{
+ static struct savexmm svxmm[MAXCPU]; /* staging buffers, MAXCPU slots; distinct from the array in fpusave() */
+ /* NOTE(review): staging presumably exists so fxrstor gets a 16-byte aligned operand (struct savexmm is declared aligned(16)) -- confirm */
+ if (!cpu_fxsr)
+ frstor(addr); /* legacy path: load straight from *addr */
+ else {
+ bcopy(addr, &svxmm[oncpu], sizeof (struct savexmm)); /* copy *addr into the staging slot ... */
+ fxrstor(&svxmm[oncpu]); /* ... then load from the slot */
+ }
+}
+
#ifdef I586_CPU_XXX
static long
timezero(funcname, func)
diff --git a/sys/amd64/amd64/genassym.c b/sys/amd64/amd64/genassym.c
index c6f0970..ac664e8 100644
--- a/sys/amd64/amd64/genassym.c
+++ b/sys/amd64/amd64/genassym.c
@@ -126,8 +126,9 @@ ASSYM(PCB_EXT, offsetof(struct pcb, pcb_ext));
ASSYM(PCB_SPARE, offsetof(struct pcb, __pcb_spare));
ASSYM(PCB_FLAGS, offsetof(struct pcb, pcb_flags));
-ASSYM(PCB_SAVEFPU, offsetof(struct pcb, pcb_savefpu));
-ASSYM(PCB_SAVEFPU_SIZE, sizeof(struct save87));
+ASSYM(PCB_SAVEFPU, offsetof(struct pcb, pcb_save));
+ASSYM(PCB_SAVEFPU_SIZE, sizeof(union savefpu));
+ASSYM(PCB_SAVE87_SIZE, sizeof(struct save87));
ASSYM(PCB_ONFAULT, offsetof(struct pcb, pcb_onfault));
#ifdef SMP
diff --git a/sys/amd64/amd64/initcpu.c b/sys/amd64/amd64/initcpu.c
index 8b39b44..7fb56fa 100644
--- a/sys/amd64/amd64/initcpu.c
+++ b/sys/amd64/amd64/initcpu.c
@@ -34,6 +34,7 @@
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/systm.h>
+#include <sys/sysctl.h>
#include <machine/cputypes.h>
#include <machine/md_var.h>
@@ -61,8 +62,14 @@ static void init_6x86(void);
static void init_6x86MX(void);
static void init_ppro(void);
static void init_mendocino(void);
+void enable_sse();
#endif
+int hw_instruction_sse = 0;
+SYSCTL_INT(_hw, OID_AUTO, instruction_sse, CTLFLAG_RD,
+ &hw_instruction_sse, 0,
+ "SIMD/MMX2 instructions available in CPU");
+
#ifdef I486_CPU
/*
* IBM Blue Lightning
@@ -501,6 +508,20 @@ init_mendocino(void)
#endif /* CPU_PPRO2CELERON */
}
+/*
+ * Initialize CR4 (Control register 4) to enable SSE instructions.
+ *
+ * Does nothing unless the kernel is built with CPU_ENABLE_SSE and
+ * cpu_feature has both the CPUID_XMM and CPUID_FXSR bits set. In that
+ * case CR4_FXSR and CR4_XMM are ORed into CR4, and cpu_fxsr and
+ * hw_instruction_sse are both set to 1.
+ */
+void
+enable_sse(void)
+{
+#if defined(CPU_ENABLE_SSE)
+ if ((cpu_feature & CPUID_XMM) && (cpu_feature & CPUID_FXSR)) {
+ load_cr4(rcr4() | CR4_FXSR | CR4_XMM); /* keep the CR4 bits already set */
+ cpu_fxsr = hw_instruction_sse = 1;
+ }
+#endif
+}
+
#endif /* I686_CPU */
void
@@ -544,6 +565,7 @@ initializecpu(void)
init_mendocino();
break;
}
+ enable_sse();
}
break;
#endif
diff --git a/sys/amd64/amd64/locore.S b/sys/amd64/amd64/locore.S
index 379af45..cdfd799 100644
--- a/sys/amd64/amd64/locore.S
+++ b/sys/amd64/amd64/locore.S
@@ -113,12 +113,13 @@ HIDENAME(tmpstk):
.globl boothowto,bootdev
.globl cpu,cpu_vendor,cpu_id,bootinfo
- .globl cpu_high, cpu_feature
+ .globl cpu_high, cpu_feature, cpu_fxsr
cpu: .long 0 /* are we 386, 386sx, or 486 */
cpu_id: .long 0 /* stepping ID */
cpu_high: .long 0 /* highest arg to CPUID */
cpu_feature: .long 0 /* features */
+cpu_fxsr: .long 0 /* use fxsave/fxrstor instruction */
cpu_vendor: .space 20 /* CPU origin code */
bootinfo: .space BOOTINFO_SIZE /* bootinfo that we can handle */
diff --git a/sys/amd64/amd64/locore.s b/sys/amd64/amd64/locore.s
index 379af45..cdfd799 100644
--- a/sys/amd64/amd64/locore.s
+++ b/sys/amd64/amd64/locore.s
@@ -113,12 +113,13 @@ HIDENAME(tmpstk):
.globl boothowto,bootdev
.globl cpu,cpu_vendor,cpu_id,bootinfo
- .globl cpu_high, cpu_feature
+ .globl cpu_high, cpu_feature, cpu_fxsr
cpu: .long 0 /* are we 386, 386sx, or 486 */
cpu_id: .long 0 /* stepping ID */
cpu_high: .long 0 /* highest arg to CPUID */
cpu_feature: .long 0 /* features */
+cpu_fxsr: .long 0 /* use fxsave/fxrstor instruction */
cpu_vendor: .space 20 /* CPU origin code */
bootinfo: .space BOOTINFO_SIZE /* bootinfo that we can handle */
diff --git a/sys/amd64/amd64/machdep.c b/sys/amd64/amd64/machdep.c
index 544aff5..7e9a4dd 100644
--- a/sys/amd64/amd64/machdep.c
+++ b/sys/amd64/amd64/machdep.c
@@ -127,6 +127,10 @@ extern void initializecpu(void);
#define EFL_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
static void cpu_startup __P((void *));
+#ifdef CPU_ENABLE_SSE
+static void set_fpregs_xmm __P((struct save87 *, struct savexmm *));
+static void fill_fpregs_xmm __P((struct savexmm *, struct save87 *));
+#endif /* CPU_ENABLE_SSE */
SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL)
int _udatasel, _ucodesel;
@@ -1361,7 +1365,7 @@ extern inthand_t
IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
- IDTVEC(lcall_syscall), IDTVEC(int0x80_syscall);
+ IDTVEC(xmm), IDTVEC(lcall_syscall), IDTVEC(int0x80_syscall);
void
sdtossd(sd, ssd)
@@ -1900,6 +1904,7 @@ init386(first)
setidt(16, &IDTVEC(fpu), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
setidt(17, &IDTVEC(align), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
setidt(18, &IDTVEC(mchk), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
+ setidt(19, &IDTVEC(xmm), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
setidt(0x80, &IDTVEC(int0x80_syscall),
SDT_SYS386TGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL));
@@ -2092,8 +2097,8 @@ int ptrace_write_u(p, off, data)
*(int*)((char *)p->p_addr + off) = data;
return (0);
}
- min = offsetof(struct user, u_pcb) + offsetof(struct pcb, pcb_savefpu);
- if (off >= min && off <= min + sizeof(struct save87) - sizeof(int)) {
+ min = offsetof(struct user, u_pcb) + offsetof(struct pcb, pcb_save);
+ if (off >= min && off <= min + sizeof(union savefpu) - sizeof(int)) {
*(int*)((char *)p->p_addr + off) = data;
return (0);
}
@@ -2161,12 +2166,73 @@ set_regs(p, regs)
return (0);
}
+#ifdef CPU_ENABLE_SSE
+static void /* convert saved state from the savexmm layout to the save87 layout */
+fill_fpregs_xmm(sv_xmm, sv_87)
+ struct savexmm *sv_xmm; /* source (read only here) */
+ struct save87 *sv_87; /* destination */
+{
+ register struct env87 *penv_87 = &sv_87->sv_env;
+ register struct envxmm *penv_xmm = &sv_xmm->sv_env;
+ int i;
+
+ /* FPU control/status: copied field by field; en_mxcsr is not copied */
+ penv_87->en_cw = penv_xmm->en_cw;
+ penv_87->en_sw = penv_xmm->en_sw;
+ penv_87->en_tw = penv_xmm->en_tw; /* NOTE(review): copied verbatim -- confirm both layouts encode the tag word the same way */
+ penv_87->en_fip = penv_xmm->en_fip;
+ penv_87->en_fcs = penv_xmm->en_fcs;
+ penv_87->en_opcode = penv_xmm->en_opcode;
+ penv_87->en_foo = penv_xmm->en_foo;
+ penv_87->en_fos = penv_xmm->en_fos;
+
+ /* FPU registers: the fp_acc part of each sv_fp[] entry; the sv_xmm[] registers are not copied */
+ for (i = 0; i < 8; ++i)
+ sv_87->sv_ac[i] = sv_xmm->sv_fp[i].fp_acc;
+
+ sv_87->sv_ex_sw = sv_xmm->sv_ex_sw; /* status word for last exception */
+}
+
+static void /* convert saved state from the save87 layout to the savexmm layout */
+set_fpregs_xmm(sv_87, sv_xmm)
+ struct save87 *sv_87; /* source (read only here) */
+ struct savexmm *sv_xmm; /* destination; fields not listed below are left as they were */
+{
+ register struct env87 *penv_87 = &sv_87->sv_env;
+ register struct envxmm *penv_xmm = &sv_xmm->sv_env;
+ int i;
+
+ /* FPU control/status: copied field by field; en_mxcsr is left untouched */
+ penv_xmm->en_cw = penv_87->en_cw;
+ penv_xmm->en_sw = penv_87->en_sw;
+ penv_xmm->en_tw = penv_87->en_tw; /* NOTE(review): copied verbatim -- confirm both layouts encode the tag word the same way */
+ penv_xmm->en_fip = penv_87->en_fip;
+ penv_xmm->en_fcs = penv_87->en_fcs;
+ penv_xmm->en_opcode = penv_87->en_opcode;
+ penv_xmm->en_foo = penv_87->en_foo;
+ penv_xmm->en_fos = penv_87->en_fos;
+
+ /* FPU registers: written to the fp_acc part of each sv_fp[] entry; sv_xmm[] is left untouched */
+ for (i = 0; i < 8; ++i)
+ sv_xmm->sv_fp[i].fp_acc = sv_87->sv_ac[i];
+
+ sv_xmm->sv_ex_sw = sv_87->sv_ex_sw; /* status word for last exception */
+}
+#endif /* CPU_ENABLE_SSE */
+
int
fill_fpregs(p, fpregs)
struct proc *p;
struct fpreg *fpregs;
{
- bcopy(&p->p_addr->u_pcb.pcb_savefpu, fpregs, sizeof *fpregs);
+#ifdef CPU_ENABLE_SSE
+ if (cpu_fxsr) {
+ fill_fpregs_xmm(&p->p_addr->u_pcb.pcb_save.sv_xmm,
+ (struct save87 *)fpregs);
+ return (0);
+ }
+#endif /* CPU_ENABLE_SSE */
+ bcopy(&p->p_addr->u_pcb.pcb_save.sv_87, fpregs, sizeof *fpregs);
return (0);
}
@@ -2175,7 +2241,14 @@ set_fpregs(p, fpregs)
struct proc *p;
struct fpreg *fpregs;
{
- bcopy(fpregs, &p->p_addr->u_pcb.pcb_savefpu, sizeof *fpregs);
+#ifdef CPU_ENABLE_SSE
+ if (cpu_fxsr) {
+ set_fpregs_xmm((struct save87 *)fpregs,
+ &p->p_addr->u_pcb.pcb_save.sv_xmm);
+ return (0);
+ }
+#endif /* CPU_ENABLE_SSE */
+ bcopy(fpregs, &p->p_addr->u_pcb.pcb_save.sv_87, sizeof *fpregs);
return (0);
}
diff --git a/sys/amd64/amd64/mp_machdep.c b/sys/amd64/amd64/mp_machdep.c
index 0a0de69..d5af7b3 100644
--- a/sys/amd64/amd64/mp_machdep.c
+++ b/sys/amd64/amd64/mp_machdep.c
@@ -2221,6 +2221,8 @@ invltlb(void)
* This is called once the rest of the system is up and running and we're
* ready to let the AP's out of the pen.
*/
+extern void enable_sse(void);
+
void
ap_init(void)
{
@@ -2260,6 +2262,9 @@ ap_init(void)
/* set up FPU state on the AP */
npxinit(__INITIAL_NPXCW__);
+ /* set up SSE registers */
+ enable_sse();
+
/* A quick check from sanity claus */
apic_id = (apic_id_to_logical[(lapic.id & 0x0f000000) >> 24]);
if (PCPU_GET(cpuid) != apic_id) {
diff --git a/sys/amd64/amd64/mptable.c b/sys/amd64/amd64/mptable.c
index 0a0de69..d5af7b3 100644
--- a/sys/amd64/amd64/mptable.c
+++ b/sys/amd64/amd64/mptable.c
@@ -2221,6 +2221,8 @@ invltlb(void)
* This is called once the rest of the system is up and running and we're
* ready to let the AP's out of the pen.
*/
+extern void enable_sse(void);
+
void
ap_init(void)
{
@@ -2260,6 +2262,9 @@ ap_init(void)
/* set up FPU state on the AP */
npxinit(__INITIAL_NPXCW__);
+ /* set up SSE registers */
+ enable_sse();
+
/* A quick check from sanity claus */
apic_id = (apic_id_to_logical[(lapic.id & 0x0f000000) >> 24]);
if (PCPU_GET(cpuid) != apic_id) {
diff --git a/sys/amd64/amd64/support.S b/sys/amd64/amd64/support.S
index 3218774..55bc29c 100644
--- a/sys/amd64/amd64/support.S
+++ b/sys/amd64/amd64/support.S
@@ -976,7 +976,7 @@ ENTRY(i586_copyin)
ENTRY(fastmove)
pushl %ebp
movl %esp,%ebp
- subl $PCB_SAVEFPU_SIZE+3*4,%esp
+ subl $PCB_SAVE87_SIZE+3*4,%esp
movl 8(%ebp),%ecx
cmpl $63,%ecx
@@ -1018,7 +1018,7 @@ ENTRY(fastmove)
movl PCPU(CURPCB),%esi
addl $PCB_SAVEFPU,%esi
cld
- movl $PCB_SAVEFPU_SIZE>>2,%ecx
+ movl $PCB_SAVE87_SIZE>>2,%ecx
rep
movsl
movl -12(%ebp),%ecx
@@ -1102,7 +1102,7 @@ fastmove_loop:
addl $PCB_SAVEFPU,%edi
movl %esp,%esi
cld
- movl $PCB_SAVEFPU_SIZE>>2,%ecx
+ movl $PCB_SAVE87_SIZE>>2,%ecx
rep
movsl
movl -12(%ebp),%ecx
@@ -1147,7 +1147,7 @@ fastmove_fault:
addl $PCB_SAVEFPU,%edi
movl %esp,%esi
cld
- movl $PCB_SAVEFPU_SIZE>>2,%ecx
+ movl $PCB_SAVE87_SIZE>>2,%ecx
rep
movsl
diff --git a/sys/amd64/amd64/support.s b/sys/amd64/amd64/support.s
index 3218774..55bc29c 100644
--- a/sys/amd64/amd64/support.s
+++ b/sys/amd64/amd64/support.s
@@ -976,7 +976,7 @@ ENTRY(i586_copyin)
ENTRY(fastmove)
pushl %ebp
movl %esp,%ebp
- subl $PCB_SAVEFPU_SIZE+3*4,%esp
+ subl $PCB_SAVE87_SIZE+3*4,%esp
movl 8(%ebp),%ecx
cmpl $63,%ecx
@@ -1018,7 +1018,7 @@ ENTRY(fastmove)
movl PCPU(CURPCB),%esi
addl $PCB_SAVEFPU,%esi
cld
- movl $PCB_SAVEFPU_SIZE>>2,%ecx
+ movl $PCB_SAVE87_SIZE>>2,%ecx
rep
movsl
movl -12(%ebp),%ecx
@@ -1102,7 +1102,7 @@ fastmove_loop:
addl $PCB_SAVEFPU,%edi
movl %esp,%esi
cld
- movl $PCB_SAVEFPU_SIZE>>2,%ecx
+ movl $PCB_SAVE87_SIZE>>2,%ecx
rep
movsl
movl -12(%ebp),%ecx
@@ -1147,7 +1147,7 @@ fastmove_fault:
addl $PCB_SAVEFPU,%edi
movl %esp,%esi
cld
- movl $PCB_SAVEFPU_SIZE>>2,%ecx
+ movl $PCB_SAVE87_SIZE>>2,%ecx
rep
movsl
diff --git a/sys/amd64/amd64/trap.c b/sys/amd64/amd64/trap.c
index b1ab3bd..0431e0e 100644
--- a/sys/amd64/amd64/trap.c
+++ b/sys/amd64/amd64/trap.c
@@ -386,6 +386,11 @@ restart:
ucode = T_FPOPFLT;
i = SIGILL;
break;
+
+ case T_XMMFLT: /* SIMD floating-point exception */
+ ucode = 0; /* XXX */
+ i = SIGFPE;
+ break;
}
} else {
/* kernel trap */
diff --git a/sys/amd64/amd64/vm_machdep.c b/sys/amd64/amd64/vm_machdep.c
index e5e5ea6..05efb4d 100644
--- a/sys/amd64/amd64/vm_machdep.c
+++ b/sys/amd64/amd64/vm_machdep.c
@@ -148,7 +148,7 @@ cpu_fork(p1, p2, flags)
p1->p_addr->u_pcb.pcb_gs = rgs();
savecrit = critical_enter();
if (PCPU_GET(npxproc) == p1)
- npxsave(&p1->p_addr->u_pcb.pcb_savefpu);
+ npxsave(&p1->p_addr->u_pcb.pcb_save);
critical_exit(savecrit);
#endif
diff --git a/sys/amd64/include/fpu.h b/sys/amd64/include/fpu.h
index 11f0478..1474f2f 100644
--- a/sys/amd64/include/fpu.h
+++ b/sys/amd64/include/fpu.h
@@ -85,6 +85,42 @@ struct save87 {
u_char sv_pad[64]; /* padding; used by emulators */
};
+struct envxmm { /* FPU environment part of struct savexmm */
+ u_int16_t en_cw; /* control word (16bits) */
+ u_int16_t en_sw; /* status word (16bits) */
+ u_int16_t en_tw; /* tag word (16bits) */
+ u_int16_t en_opcode; /* opcode last executed (11 bits) */
+ u_int32_t en_fip; /* floating point instruction pointer */
+ u_int16_t en_fcs; /* floating code segment selector */
+ u_int16_t en_pad0; /* padding */
+ u_int32_t en_foo; /* floating operand offset */
+ u_int16_t en_fos; /* floating operand segment selector */
+ u_int16_t en_pad1; /* padding */
+ u_int32_t en_mxcsr; /* SSE control/status register */
+ u_int32_t en_pad2; /* padding */
+};
+
+/* Contents of each SSE extended accumulator */
+struct xmmacc {
+ u_char xmm_bytes[16]; /* raw register contents, 16 bytes */
+};
+
+struct savexmm { /* save area layout used with fxsave/fxrstor */
+ struct envxmm sv_env; /* control/status environment */
+ struct {
+ struct fpacc87 fp_acc; /* one x87 accumulator */
+ u_char fp_pad[6]; /* padding */
+ } sv_fp[8];
+ struct xmmacc sv_xmm[8]; /* the eight SSE accumulators */
+ u_long sv_ex_sw; /* status word for last exception */
+ u_char sv_pad[220]; /* padding */
+} __attribute__((aligned(16)));
+
+union savefpu { /* one FPU save area, viewed in either layout */
+ struct save87 sv_87; /* layout used with fnsave/frstor */
+ struct savexmm sv_xmm; /* layout used with fxsave/fxrstor */
+};
+
/*
* The hardware default control word for i387's and later coprocessors is
* 0x37F, giving:
@@ -108,7 +144,7 @@ struct save87 {
int npxdna __P((void));
void npxexit __P((struct proc *p));
void npxinit __P((int control));
-void npxsave __P((struct save87 *addr));
+void npxsave __P((union savefpu *addr));
int npxtrap __P((void));
#endif
diff --git a/sys/amd64/include/md_var.h b/sys/amd64/include/md_var.h
index 5a2ed26..6c81a96 100644
--- a/sys/amd64/include/md_var.h
+++ b/sys/amd64/include/md_var.h
@@ -47,6 +47,7 @@ extern int (*copyout_vector) __P((const void *kaddr, void *udaddr,
extern u_int cpu_feature;
extern u_int cpu_high;
extern u_int cpu_id;
+extern u_int cpu_fxsr;
extern char cpu_vendor[];
extern u_int cyrix_did;
extern char kstack[];
diff --git a/sys/amd64/include/mptable.h b/sys/amd64/include/mptable.h
index 0a0de69..d5af7b3 100644
--- a/sys/amd64/include/mptable.h
+++ b/sys/amd64/include/mptable.h
@@ -2221,6 +2221,8 @@ invltlb(void)
* This is called once the rest of the system is up and running and we're
* ready to let the AP's out of the pen.
*/
+extern void enable_sse(void);
+
void
ap_init(void)
{
@@ -2260,6 +2262,9 @@ ap_init(void)
/* set up FPU state on the AP */
npxinit(__INITIAL_NPXCW__);
+ /* set up SSE registers */
+ enable_sse();
+
/* A quick check from sanity claus */
apic_id = (apic_id_to_logical[(lapic.id & 0x0f000000) >> 24]);
if (PCPU_GET(cpuid) != apic_id) {
diff --git a/sys/amd64/include/npx.h b/sys/amd64/include/npx.h
index 11f0478..1474f2f 100644
--- a/sys/amd64/include/npx.h
+++ b/sys/amd64/include/npx.h
@@ -85,6 +85,42 @@ struct save87 {
u_char sv_pad[64]; /* padding; used by emulators */
};
+struct envxmm { /* FPU environment part of struct savexmm */
+ u_int16_t en_cw; /* control word (16bits) */
+ u_int16_t en_sw; /* status word (16bits) */
+ u_int16_t en_tw; /* tag word (16bits) */
+ u_int16_t en_opcode; /* opcode last executed (11 bits) */
+ u_int32_t en_fip; /* floating point instruction pointer */
+ u_int16_t en_fcs; /* floating code segment selector */
+ u_int16_t en_pad0; /* padding */
+ u_int32_t en_foo; /* floating operand offset */
+ u_int16_t en_fos; /* floating operand segment selector */
+ u_int16_t en_pad1; /* padding */
+ u_int32_t en_mxcsr; /* SSE control/status register */
+ u_int32_t en_pad2; /* padding */
+};
+
+/* Contents of each SSE extended accumulator */
+struct xmmacc {
+ u_char xmm_bytes[16]; /* raw register contents, 16 bytes */
+};
+
+struct savexmm { /* save area layout used with fxsave/fxrstor */
+ struct envxmm sv_env; /* control/status environment */
+ struct {
+ struct fpacc87 fp_acc; /* one x87 accumulator */
+ u_char fp_pad[6]; /* padding */
+ } sv_fp[8];
+ struct xmmacc sv_xmm[8]; /* the eight SSE accumulators */
+ u_long sv_ex_sw; /* status word for last exception */
+ u_char sv_pad[220]; /* padding */
+} __attribute__((aligned(16)));
+
+union savefpu { /* one FPU save area, viewed in either layout */
+ struct save87 sv_87; /* layout used with fnsave/frstor */
+ struct savexmm sv_xmm; /* layout used with fxsave/fxrstor */
+};
+
/*
* The hardware default control word for i387's and later coprocessors is
* 0x37F, giving:
@@ -108,7 +144,7 @@ struct save87 {
int npxdna __P((void));
void npxexit __P((struct proc *p));
void npxinit __P((int control));
-void npxsave __P((struct save87 *addr));
+void npxsave __P((union savefpu *addr));
int npxtrap __P((void));
#endif
diff --git a/sys/amd64/include/pcb.h b/sys/amd64/include/pcb.h
index 962fc6f..6ea7c3d 100644
--- a/sys/amd64/include/pcb.h
+++ b/sys/amd64/include/pcb.h
@@ -62,7 +62,8 @@ struct pcb {
int pcb_dr7;
struct pcb_ldt *pcb_ldt; /* per process (user) LDT */
- struct save87 pcb_savefpu; /* floating point state for 287/387 */
+ union savefpu pcb_save;
+#define pcb_savefpu pcb_save.sv_87
u_char pcb_flags;
#define FP_SOFTFP 0x01 /* process using software fltng pnt emulator */
#define PCB_DBREGS 0x02 /* process using debug registers */
diff --git a/sys/amd64/include/specialreg.h b/sys/amd64/include/specialreg.h
index 937cab0..02440c9 100644
--- a/sys/amd64/include/specialreg.h
+++ b/sys/amd64/include/specialreg.h
@@ -93,6 +93,8 @@
#define CPUID_PGE 0x2000
#define CPUID_MCA 0x4000
#define CPUID_CMOV 0x8000
+#define CPUID_FXSR 0x01000000
+#define CPUID_XMM 0x02000000
/*
* Model-specific registers for the i386 family
diff --git a/sys/amd64/include/trap.h b/sys/amd64/include/trap.h
index 6db97ec..67becb3 100644
--- a/sys/amd64/include/trap.h
+++ b/sys/amd64/include/trap.h
@@ -64,7 +64,8 @@
#define T_SEGNPFLT 26 /* segment not present fault */
#define T_STKFLT 27 /* stack fault */
#define T_MCHK 28 /* machine check trap */
-#define T_RESERVED 29 /* reserved (unknown) */
+#define T_XMMFLT 29 /* SIMD floating-point exception */
+#define T_RESERVED 30 /* reserved (unknown) */
/* XXX most of the following codes aren't used, but could be. */
diff --git a/sys/amd64/isa/npx.c b/sys/amd64/isa/npx.c
index b6c69a0..f6410e9 100644
--- a/sys/amd64/isa/npx.c
+++ b/sys/amd64/isa/npx.c
@@ -35,6 +35,7 @@
* $FreeBSD$
*/
+#include "opt_cpu.h"
#include "opt_debug_npx.h"
#include "opt_math_emulate.h"
@@ -99,6 +100,8 @@
#define fnstsw(addr) __asm __volatile("fnstsw %0" : "=m" (*(addr)))
#define fp_divide_by_0() __asm("fldz; fld1; fdiv %st,%st(1); fnop")
#define frstor(addr) __asm("frstor %0" : : "m" (*(addr)))
+#define fxrstor(addr) __asm("fxrstor %0" : : "m" (*(addr)))
+#define fxsave(addr) __asm __volatile("fxsave %0" : "=m" (*(addr)))
#define start_emulating() __asm("smsw %%ax; orb %0,%%al; lmsw %%ax" \
: : "n" (CR0_TS) : "ax")
#define stop_emulating() __asm("clts")
@@ -113,11 +116,41 @@ void fnstcw __P((caddr_t addr));
void fnstsw __P((caddr_t addr));
void fp_divide_by_0 __P((void));
void frstor __P((caddr_t addr));
+void fxsave __P((caddr_t addr));
+void fxrstor __P((caddr_t addr));
void start_emulating __P((void));
void stop_emulating __P((void));
#endif /* __GNUC__ */
+/*
+ * Accessors for the FPU words saved in a process's pcb_save area.
+ * With CPU_ENABLE_SSE, cpu_fxsr picks the sv_xmm or the sv_87 view at
+ * run time; without it only the sv_87 view is used.
+ *
+ * GET_FPU_CW(proc)	  saved control word
+ * GET_FPU_SW(proc)	  saved status word
+ * MASK_FPU_SW(proc, m)	  AND m into the saved status word, in place
+ * GET_FPU_EXSW_PTR(pcb)  pointer to the sv_ex_sw field
+ *
+ * MASK_FPU_SW must assign: npxtrap() invokes it as a statement to clear
+ * bits in the saved status word, so a plain '&' would have no effect.
+ * Each arm is parenthesized because an assignment cannot appear
+ * unparenthesized after the ':' of a conditional expression.
+ */
+#ifdef CPU_ENABLE_SSE
+#define GET_FPU_CW(proc) \
+ (cpu_fxsr ? \
+ (proc)->p_addr->u_pcb.pcb_save.sv_xmm.sv_env.en_cw : \
+ (proc)->p_addr->u_pcb.pcb_save.sv_87.sv_env.en_cw)
+#define GET_FPU_SW(proc) \
+ (cpu_fxsr ? \
+ (proc)->p_addr->u_pcb.pcb_save.sv_xmm.sv_env.en_sw : \
+ (proc)->p_addr->u_pcb.pcb_save.sv_87.sv_env.en_sw)
+#define MASK_FPU_SW(proc, mask) \
+ (cpu_fxsr ? \
+ ((proc)->p_addr->u_pcb.pcb_save.sv_xmm.sv_env.en_sw &= (mask)) : \
+ ((proc)->p_addr->u_pcb.pcb_save.sv_87.sv_env.en_sw &= (mask)))
+#define GET_FPU_EXSW_PTR(pcb) \
+ (cpu_fxsr ? \
+ &(pcb)->pcb_save.sv_xmm.sv_ex_sw : \
+ &(pcb)->pcb_save.sv_87.sv_ex_sw)
+#else /* CPU_ENABLE_SSE */
+#define GET_FPU_CW(proc) \
+ ((proc)->p_addr->u_pcb.pcb_save.sv_87.sv_env.en_cw)
+#define GET_FPU_SW(proc) \
+ ((proc)->p_addr->u_pcb.pcb_save.sv_87.sv_env.en_sw)
+#define MASK_FPU_SW(proc, mask) \
+ ((proc)->p_addr->u_pcb.pcb_save.sv_87.sv_env.en_sw &= (mask))
+#define GET_FPU_EXSW_PTR(pcb) \
+ (&(pcb)->pcb_save.sv_87.sv_ex_sw)
+#endif /* CPU_ENABLE_SSE */
+
typedef u_char bool_t;
static int npx_attach __P((device_t dev));
@@ -127,6 +160,8 @@ static void npx_intr __P((void *));
#endif
static int npx_probe __P((device_t dev));
static int npx_probe1 __P((device_t dev));
+static void fpusave __P((union savefpu *, u_char));
+static void fpurstor __P((union savefpu *, u_char));
#ifdef I586_CPU_XXX
static long timezero __P((const char *funcname,
void (*func)(void *buf, size_t len)));
@@ -529,7 +564,7 @@ void
npxinit(control)
u_short control;
{
- struct save87 dummy;
+ union savefpu dummy;
critical_t savecrit;
if (!npx_exists)
@@ -544,7 +579,7 @@ npxinit(control)
stop_emulating();
fldcw(&control);
if (PCPU_GET(curpcb) != NULL)
- fnsave(&PCPU_GET(curpcb)->pcb_savefpu);
+ fpusave(&PCPU_GET(curpcb)->pcb_save, curproc->p_oncpu);
start_emulating();
critical_exit(savecrit);
}
@@ -560,7 +595,7 @@ npxexit(p)
savecrit = critical_enter();
if (p == PCPU_GET(npxproc))
- npxsave(&PCPU_GET(curpcb)->pcb_savefpu);
+ npxsave(&PCPU_GET(curpcb)->pcb_save);
critical_exit(savecrit);
#ifdef NPX_DEBUG
if (npx_exists) {
@@ -773,6 +808,7 @@ npxtrap()
{
critical_t savecrit;
u_short control, status;
+ u_long *exstat;
if (!npx_exists) {
printf("npxtrap: npxproc = %p, curproc = %p, npx_exists = %d\n",
@@ -787,16 +823,17 @@ npxtrap()
* wherever they are.
*/
if (PCPU_GET(npxproc) != curproc) {
- control = curproc->p_addr->u_pcb.pcb_savefpu.sv_env.en_cw;
- status = curproc->p_addr->u_pcb.pcb_savefpu.sv_env.en_sw;
+ control = GET_FPU_CW(curproc);
+ status = GET_FPU_SW(curproc);
} else {
fnstcw(&control);
fnstsw(&status);
}
- curproc->p_addr->u_pcb.pcb_savefpu.sv_ex_sw = status;
+ exstat = GET_FPU_EXSW_PTR(&curproc->p_addr->u_pcb);
+ *exstat = status;
if (PCPU_GET(npxproc) != curproc)
- curproc->p_addr->u_pcb.pcb_savefpu.sv_env.en_sw &= ~0x80bf;
+ MASK_FPU_SW(curproc, ~0x80bf);
else
fnclex();
critical_exit(savecrit);
@@ -813,6 +850,7 @@ npxtrap()
int
npxdna()
{
+ u_long *exstat;
critical_t s;
if (!npx_exists)
@@ -828,7 +866,9 @@ npxdna()
* Record new context early in case frstor causes an IRQ13.
*/
PCPU_SET(npxproc, CURPROC);
- PCPU_GET(curpcb)->pcb_savefpu.sv_ex_sw = 0;
+
+ exstat = GET_FPU_EXSW_PTR(PCPU_GET(curpcb));
+ *exstat = 0;
/*
* The following frstor may cause an IRQ13 when the state being
* restored has a pending error. The error will appear to have been
@@ -841,7 +881,7 @@ npxdna()
* fnsave are broken, so our treatment breaks fnclex if it is the
* first FPU instruction after a context switch.
*/
- frstor(&PCPU_GET(curpcb)->pcb_savefpu);
+ fpurstor(&PCPU_GET(curpcb)->pcb_save, curproc->p_oncpu);
critical_exit(s);
return (1);
@@ -872,15 +912,46 @@ npxdna()
*/
void
npxsave(addr)
- struct save87 *addr;
+ union savefpu *addr;
{
stop_emulating();
- fnsave(addr);
+ fpusave(addr, curproc->p_oncpu);
+
start_emulating();
PCPU_SET(npxproc, NULL);
}
+static void /* save the FPU state into *addr: fnsave, or fxsave when cpu_fxsr is set */
+fpusave(addr, oncpu)
+ union savefpu *addr; /* destination save area */
+ u_char oncpu; /* index of the svxmm[] slot to stage through */
+{
+ static struct savexmm svxmm[MAXCPU]; /* staging buffers, MAXCPU slots; private to this function */
+ /* NOTE(review): staging presumably exists so fxsave gets a 16-byte aligned operand (struct savexmm is declared aligned(16)) -- confirm */
+ if (!cpu_fxsr)
+ fnsave(addr); /* legacy path: store straight into *addr */
+ else {
+ fxsave(&svxmm[oncpu]); /* store into the staging slot first ... */
+ bcopy(&svxmm[oncpu], addr, sizeof(struct savexmm)); /* ... then copy it out to *addr */
+ }
+}
+
+static void /* reload the FPU state from *addr: frstor, or fxrstor when cpu_fxsr is set */
+fpurstor(addr, oncpu)
+ union savefpu *addr; /* save area to restore from */
+ u_char oncpu; /* index of the svxmm[] slot to stage through */
+{
+ static struct savexmm svxmm[MAXCPU]; /* staging buffers, MAXCPU slots; distinct from the array in fpusave() */
+ /* NOTE(review): staging presumably exists so fxrstor gets a 16-byte aligned operand (struct savexmm is declared aligned(16)) -- confirm */
+ if (!cpu_fxsr)
+ frstor(addr); /* legacy path: load straight from *addr */
+ else {
+ bcopy(addr, &svxmm[oncpu], sizeof (struct savexmm)); /* copy *addr into the staging slot ... */
+ fxrstor(&svxmm[oncpu]); /* ... then load from the slot */
+ }
+}
+
#ifdef I586_CPU_XXX
static long
timezero(funcname, func)
OpenPOWER on IntegriCloud