diff options
42 files changed, 653 insertions, 71 deletions
diff --git a/sys/amd64/amd64/exception.S b/sys/amd64/amd64/exception.S index 3ccbc23..419fbd2 100644 --- a/sys/amd64/amd64/exception.S +++ b/sys/amd64/amd64/exception.S @@ -153,6 +153,9 @@ IDTVEC(fpu) IDTVEC(align) TRAP(T_ALIGNFLT) +IDTVEC(xmm) + pushl $0; TRAP(T_XMMFLT) + /* * alltraps entry point. Interrupts are enabled if this was a trap * gate (TGT), else disabled if this was an interrupt gate (IGT). diff --git a/sys/amd64/amd64/exception.s b/sys/amd64/amd64/exception.s index 3ccbc23..419fbd2 100644 --- a/sys/amd64/amd64/exception.s +++ b/sys/amd64/amd64/exception.s @@ -153,6 +153,9 @@ IDTVEC(fpu) IDTVEC(align) TRAP(T_ALIGNFLT) +IDTVEC(xmm) + pushl $0; TRAP(T_XMMFLT) + /* * alltraps entry point. Interrupts are enabled if this was a trap * gate (TGT), else disabled if this was an interrupt gate (IGT). diff --git a/sys/amd64/amd64/fpu.c b/sys/amd64/amd64/fpu.c index b6c69a0..f6410e9 100644 --- a/sys/amd64/amd64/fpu.c +++ b/sys/amd64/amd64/fpu.c @@ -35,6 +35,7 @@ * $FreeBSD$ */ +#include "opt_cpu.h" #include "opt_debug_npx.h" #include "opt_math_emulate.h" @@ -99,6 +100,8 @@ #define fnstsw(addr) __asm __volatile("fnstsw %0" : "=m" (*(addr))) #define fp_divide_by_0() __asm("fldz; fld1; fdiv %st,%st(1); fnop") #define frstor(addr) __asm("frstor %0" : : "m" (*(addr))) +#define fxrstor(addr) __asm("fxrstor %0" : : "m" (*(addr))) +#define fxsave(addr) __asm __volatile("fxsave %0" : "=m" (*(addr))) #define start_emulating() __asm("smsw %%ax; orb %0,%%al; lmsw %%ax" \ : : "n" (CR0_TS) : "ax") #define stop_emulating() __asm("clts") @@ -113,11 +116,41 @@ void fnstcw __P((caddr_t addr)); void fnstsw __P((caddr_t addr)); void fp_divide_by_0 __P((void)); void frstor __P((caddr_t addr)); +void fxsave __P((caddr_t addr)); +void fxrstor __P((caddr_t addr)); void start_emulating __P((void)); void stop_emulating __P((void)); #endif /* __GNUC__ */ +#ifdef CPU_ENABLE_SSE +#define GET_FPU_CW(proc) \ + (cpu_fxsr ? 
\ + (proc)->p_addr->u_pcb.pcb_save.sv_xmm.sv_env.en_cw : \ + (proc)->p_addr->u_pcb.pcb_save.sv_87.sv_env.en_cw) +#define GET_FPU_SW(proc) \ + (cpu_fxsr ? \ + (proc)->p_addr->u_pcb.pcb_save.sv_xmm.sv_env.en_sw : \ + (proc)->p_addr->u_pcb.pcb_save.sv_87.sv_env.en_sw) +#define MASK_FPU_SW(proc, mask) \ + (cpu_fxsr ? \ + ((proc)->p_addr->u_pcb.pcb_save.sv_xmm.sv_env.en_sw &= (mask)) : \ + ((proc)->p_addr->u_pcb.pcb_save.sv_87.sv_env.en_sw &= (mask))) /* writes the masked value back to the saved en_sw */ +#define GET_FPU_EXSW_PTR(pcb) \ + (cpu_fxsr ? \ + &(pcb)->pcb_save.sv_xmm.sv_ex_sw : \ + &(pcb)->pcb_save.sv_87.sv_ex_sw) +#else /* CPU_ENABLE_SSE */ +#define GET_FPU_CW(proc) \ + ((proc)->p_addr->u_pcb.pcb_save.sv_87.sv_env.en_cw) +#define GET_FPU_SW(proc) \ + ((proc)->p_addr->u_pcb.pcb_save.sv_87.sv_env.en_sw) +#define MASK_FPU_SW(proc, mask) \ + ((proc)->p_addr->u_pcb.pcb_save.sv_87.sv_env.en_sw &= (mask)) /* writes the masked value back to the saved en_sw */ +#define GET_FPU_EXSW_PTR(pcb) \ + (&(pcb)->pcb_save.sv_87.sv_ex_sw) +#endif /* CPU_ENABLE_SSE */ + typedef u_char bool_t; static int npx_attach __P((device_t dev)); @@ -127,6 +160,8 @@ static void npx_intr __P((void *)); #endif static int npx_probe __P((device_t dev)); static int npx_probe1 __P((device_t dev)); +static void fpusave __P((union savefpu *, u_char)); +static void fpurstor __P((union savefpu *, u_char)); #ifdef I586_CPU_XXX static long timezero __P((const char *funcname, void (*func)(void *buf, size_t len))); @@ -529,7 +564,7 @@ void npxinit(control) u_short control; { - struct save87 dummy; + union savefpu dummy; critical_t savecrit; if (!npx_exists) @@ -544,7 +579,7 @@ npxinit(control) stop_emulating(); fldcw(&control); if (PCPU_GET(curpcb) != NULL) - fnsave(&PCPU_GET(curpcb)->pcb_savefpu); + fpusave(&PCPU_GET(curpcb)->pcb_save, curproc->p_oncpu); start_emulating(); critical_exit(savecrit); } @@ -560,7 +595,7 @@ npxexit(p) savecrit = critical_enter(); if (p == PCPU_GET(npxproc)) - npxsave(&PCPU_GET(curpcb)->pcb_savefpu); + npxsave(&PCPU_GET(curpcb)->pcb_save); critical_exit(savecrit); #ifdef NPX_DEBUG if 
(npx_exists) { @@ -773,6 +808,7 @@ npxtrap() { critical_t savecrit; u_short control, status; + u_long *exstat; if (!npx_exists) { printf("npxtrap: npxproc = %p, curproc = %p, npx_exists = %d\n", @@ -787,16 +823,17 @@ npxtrap() * wherever they are. */ if (PCPU_GET(npxproc) != curproc) { - control = curproc->p_addr->u_pcb.pcb_savefpu.sv_env.en_cw; - status = curproc->p_addr->u_pcb.pcb_savefpu.sv_env.en_sw; + control = GET_FPU_CW(curproc); + status = GET_FPU_SW(curproc); } else { fnstcw(&control); fnstsw(&status); } - curproc->p_addr->u_pcb.pcb_savefpu.sv_ex_sw = status; + exstat = GET_FPU_EXSW_PTR(&curproc->p_addr->u_pcb); + *exstat = status; if (PCPU_GET(npxproc) != curproc) - curproc->p_addr->u_pcb.pcb_savefpu.sv_env.en_sw &= ~0x80bf; + MASK_FPU_SW(curproc, ~0x80bf); else fnclex(); critical_exit(savecrit); @@ -813,6 +850,7 @@ npxtrap() int npxdna() { + u_long *exstat; critical_t s; if (!npx_exists) @@ -828,7 +866,9 @@ npxdna() * Record new context early in case frstor causes an IRQ13. */ PCPU_SET(npxproc, CURPROC); - PCPU_GET(curpcb)->pcb_savefpu.sv_ex_sw = 0; + + exstat = GET_FPU_EXSW_PTR(PCPU_GET(curpcb)); + *exstat = 0; /* * The following frstor may cause an IRQ13 when the state being * restored has a pending error. The error will appear to have been @@ -841,7 +881,7 @@ npxdna() * fnsave are broken, so our treatment breaks fnclex if it is the * first FPU instruction after a context switch. 
*/ - frstor(&PCPU_GET(curpcb)->pcb_savefpu); + fpurstor(&PCPU_GET(curpcb)->pcb_save, curproc->p_oncpu); critical_exit(s); return (1); @@ -872,15 +912,46 @@ npxdna() */ void npxsave(addr) - struct save87 *addr; + union savefpu *addr; { stop_emulating(); - fnsave(addr); + fpusave(addr, curproc->p_oncpu); + start_emulating(); PCPU_SET(npxproc, NULL); } +static void +fpusave(addr, oncpu) + union savefpu *addr; + u_char oncpu; +{ + static struct savexmm svxmm[MAXCPU]; + + if (!cpu_fxsr) + fnsave(addr); + else { + fxsave(&svxmm[oncpu]); + bcopy(&svxmm[oncpu], addr, sizeof(struct savexmm)); + } +} + +static void +fpurstor(addr, oncpu) + union savefpu *addr; + u_char oncpu; +{ + static struct savexmm svxmm[MAXCPU]; + + if (!cpu_fxsr) + frstor(addr); + else { + bcopy(addr, &svxmm[oncpu], sizeof (struct savexmm)); + fxrstor(&svxmm[oncpu]); + } +} + #ifdef I586_CPU_XXX static long timezero(funcname, func) diff --git a/sys/amd64/amd64/genassym.c b/sys/amd64/amd64/genassym.c index c6f0970..ac664e8 100644 --- a/sys/amd64/amd64/genassym.c +++ b/sys/amd64/amd64/genassym.c @@ -126,8 +126,9 @@ ASSYM(PCB_EXT, offsetof(struct pcb, pcb_ext)); ASSYM(PCB_SPARE, offsetof(struct pcb, __pcb_spare)); ASSYM(PCB_FLAGS, offsetof(struct pcb, pcb_flags)); -ASSYM(PCB_SAVEFPU, offsetof(struct pcb, pcb_savefpu)); -ASSYM(PCB_SAVEFPU_SIZE, sizeof(struct save87)); +ASSYM(PCB_SAVEFPU, offsetof(struct pcb, pcb_save)); +ASSYM(PCB_SAVEFPU_SIZE, sizeof(union savefpu)); +ASSYM(PCB_SAVE87_SIZE, sizeof(struct save87)); ASSYM(PCB_ONFAULT, offsetof(struct pcb, pcb_onfault)); #ifdef SMP diff --git a/sys/amd64/amd64/initcpu.c b/sys/amd64/amd64/initcpu.c index 8b39b44..7fb56fa 100644 --- a/sys/amd64/amd64/initcpu.c +++ b/sys/amd64/amd64/initcpu.c @@ -34,6 +34,7 @@ #include <sys/param.h> #include <sys/kernel.h> #include <sys/systm.h> +#include <sys/sysctl.h> #include <machine/cputypes.h> #include <machine/md_var.h> @@ -61,8 +62,14 @@ static void init_6x86(void); static void init_6x86MX(void); static void 
init_ppro(void); static void init_mendocino(void); +void enable_sse(); #endif +int hw_instruction_sse = 0; +SYSCTL_INT(_hw, OID_AUTO, instruction_sse, CTLFLAG_RD, + &hw_instruction_sse, 0, + "SIMD/MMX2 instructions available in CPU"); + #ifdef I486_CPU /* * IBM Blue Lightning @@ -501,6 +508,20 @@ init_mendocino(void) #endif /* CPU_PPRO2CELERON */ } +/* + * Initialize CR4 (Control register 4) to enable SSE instructions. + */ +void +enable_sse(void) +{ +#if defined(CPU_ENABLE_SSE) + if ((cpu_feature & CPUID_XMM) && (cpu_feature & CPUID_FXSR)) { + load_cr4(rcr4() | CR4_FXSR | CR4_XMM); + cpu_fxsr = hw_instruction_sse = 1; + } +#endif +} + #endif /* I686_CPU */ void @@ -544,6 +565,7 @@ initializecpu(void) init_mendocino(); break; } + enable_sse(); } break; #endif diff --git a/sys/amd64/amd64/locore.S b/sys/amd64/amd64/locore.S index 379af45..cdfd799 100644 --- a/sys/amd64/amd64/locore.S +++ b/sys/amd64/amd64/locore.S @@ -113,12 +113,13 @@ HIDENAME(tmpstk): .globl boothowto,bootdev .globl cpu,cpu_vendor,cpu_id,bootinfo - .globl cpu_high, cpu_feature + .globl cpu_high, cpu_feature, cpu_fxsr cpu: .long 0 /* are we 386, 386sx, or 486 */ cpu_id: .long 0 /* stepping ID */ cpu_high: .long 0 /* highest arg to CPUID */ cpu_feature: .long 0 /* features */ +cpu_fxsr: .long 0 /* use fxsave/fxrstor instruction */ cpu_vendor: .space 20 /* CPU origin code */ bootinfo: .space BOOTINFO_SIZE /* bootinfo that we can handle */ diff --git a/sys/amd64/amd64/locore.s b/sys/amd64/amd64/locore.s index 379af45..cdfd799 100644 --- a/sys/amd64/amd64/locore.s +++ b/sys/amd64/amd64/locore.s @@ -113,12 +113,13 @@ HIDENAME(tmpstk): .globl boothowto,bootdev .globl cpu,cpu_vendor,cpu_id,bootinfo - .globl cpu_high, cpu_feature + .globl cpu_high, cpu_feature, cpu_fxsr cpu: .long 0 /* are we 386, 386sx, or 486 */ cpu_id: .long 0 /* stepping ID */ cpu_high: .long 0 /* highest arg to CPUID */ cpu_feature: .long 0 /* features */ +cpu_fxsr: .long 0 /* use fxsave/fxrstor instruction */ cpu_vendor: .space 20 /* 
CPU origin code */ bootinfo: .space BOOTINFO_SIZE /* bootinfo that we can handle */ diff --git a/sys/amd64/amd64/machdep.c b/sys/amd64/amd64/machdep.c index 544aff5..7e9a4dd 100644 --- a/sys/amd64/amd64/machdep.c +++ b/sys/amd64/amd64/machdep.c @@ -127,6 +127,10 @@ extern void initializecpu(void); #define EFL_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0) static void cpu_startup __P((void *)); +#ifdef CPU_ENABLE_SSE +static void set_fpregs_xmm __P((struct save87 *, struct savexmm *)); +static void fill_fpregs_xmm __P((struct savexmm *, struct save87 *)); +#endif /* CPU_ENABLE_SSE */ SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL) int _udatasel, _ucodesel; @@ -1361,7 +1365,7 @@ extern inthand_t IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm), IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot), IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align), - IDTVEC(lcall_syscall), IDTVEC(int0x80_syscall); + IDTVEC(xmm), IDTVEC(lcall_syscall), IDTVEC(int0x80_syscall); void sdtossd(sd, ssd) @@ -1900,6 +1904,7 @@ init386(first) setidt(16, &IDTVEC(fpu), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(17, &IDTVEC(align), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(18, &IDTVEC(mchk), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); + setidt(19, &IDTVEC(xmm), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(0x80, &IDTVEC(int0x80_syscall), SDT_SYS386TGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); @@ -2092,8 +2097,8 @@ int ptrace_write_u(p, off, data) *(int*)((char *)p->p_addr + off) = data; return (0); } - min = offsetof(struct user, u_pcb) + offsetof(struct pcb, pcb_savefpu); - if (off >= min && off <= min + sizeof(struct save87) - sizeof(int)) { + min = offsetof(struct user, u_pcb) + offsetof(struct pcb, pcb_save); + if (off >= min && off <= min + sizeof(union savefpu) - sizeof(int)) { *(int*)((char *)p->p_addr + off) = data; return (0); } @@ -2161,12 +2166,73 @@ set_regs(p, regs) return (0); } +#ifdef 
CPU_ENABLE_SSE +static void +fill_fpregs_xmm(sv_xmm, sv_87) + struct savexmm *sv_xmm; + struct save87 *sv_87; +{ + register struct env87 *penv_87 = &sv_87->sv_env; + register struct envxmm *penv_xmm = &sv_xmm->sv_env; + int i; + + /* FPU control/status */ + penv_87->en_cw = penv_xmm->en_cw; + penv_87->en_sw = penv_xmm->en_sw; + penv_87->en_tw = penv_xmm->en_tw; + penv_87->en_fip = penv_xmm->en_fip; + penv_87->en_fcs = penv_xmm->en_fcs; + penv_87->en_opcode = penv_xmm->en_opcode; + penv_87->en_foo = penv_xmm->en_foo; + penv_87->en_fos = penv_xmm->en_fos; + + /* FPU registers */ + for (i = 0; i < 8; ++i) + sv_87->sv_ac[i] = sv_xmm->sv_fp[i].fp_acc; + + sv_87->sv_ex_sw = sv_xmm->sv_ex_sw; +} + +static void +set_fpregs_xmm(sv_87, sv_xmm) + struct save87 *sv_87; + struct savexmm *sv_xmm; +{ + register struct env87 *penv_87 = &sv_87->sv_env; + register struct envxmm *penv_xmm = &sv_xmm->sv_env; + int i; + + /* FPU control/status */ + penv_xmm->en_cw = penv_87->en_cw; + penv_xmm->en_sw = penv_87->en_sw; + penv_xmm->en_tw = penv_87->en_tw; + penv_xmm->en_fip = penv_87->en_fip; + penv_xmm->en_fcs = penv_87->en_fcs; + penv_xmm->en_opcode = penv_87->en_opcode; + penv_xmm->en_foo = penv_87->en_foo; + penv_xmm->en_fos = penv_87->en_fos; + + /* FPU registers */ + for (i = 0; i < 8; ++i) + sv_xmm->sv_fp[i].fp_acc = sv_87->sv_ac[i]; + + sv_xmm->sv_ex_sw = sv_87->sv_ex_sw; +} +#endif /* CPU_ENABLE_SSE */ + int fill_fpregs(p, fpregs) struct proc *p; struct fpreg *fpregs; { - bcopy(&p->p_addr->u_pcb.pcb_savefpu, fpregs, sizeof *fpregs); +#ifdef CPU_ENABLE_SSE + if (cpu_fxsr) { + fill_fpregs_xmm(&p->p_addr->u_pcb.pcb_save.sv_xmm, + (struct save87 *)fpregs); + return (0); + } +#endif /* CPU_ENABLE_SSE */ + bcopy(&p->p_addr->u_pcb.pcb_save.sv_87, fpregs, sizeof *fpregs); return (0); } @@ -2175,7 +2241,14 @@ set_fpregs(p, fpregs) struct proc *p; struct fpreg *fpregs; { - bcopy(fpregs, &p->p_addr->u_pcb.pcb_savefpu, sizeof *fpregs); +#ifdef CPU_ENABLE_SSE + if (cpu_fxsr) { + 
set_fpregs_xmm((struct save87 *)fpregs, + &p->p_addr->u_pcb.pcb_save.sv_xmm); + return (0); + } +#endif /* CPU_ENABLE_SSE */ + bcopy(fpregs, &p->p_addr->u_pcb.pcb_save.sv_87, sizeof *fpregs); return (0); } diff --git a/sys/amd64/amd64/mp_machdep.c b/sys/amd64/amd64/mp_machdep.c index 0a0de69..d5af7b3 100644 --- a/sys/amd64/amd64/mp_machdep.c +++ b/sys/amd64/amd64/mp_machdep.c @@ -2221,6 +2221,8 @@ invltlb(void) * This is called once the rest of the system is up and running and we're * ready to let the AP's out of the pen. */ +extern void enable_sse(void); + void ap_init(void) { @@ -2260,6 +2262,9 @@ ap_init(void) /* set up FPU state on the AP */ npxinit(__INITIAL_NPXCW__); + /* set up SSE registers */ + enable_sse(); + /* A quick check from sanity claus */ apic_id = (apic_id_to_logical[(lapic.id & 0x0f000000) >> 24]); if (PCPU_GET(cpuid) != apic_id) { diff --git a/sys/amd64/amd64/mptable.c b/sys/amd64/amd64/mptable.c index 0a0de69..d5af7b3 100644 --- a/sys/amd64/amd64/mptable.c +++ b/sys/amd64/amd64/mptable.c @@ -2221,6 +2221,8 @@ invltlb(void) * This is called once the rest of the system is up and running and we're * ready to let the AP's out of the pen. 
*/ +extern void enable_sse(void); + void ap_init(void) { @@ -2260,6 +2262,9 @@ ap_init(void) /* set up FPU state on the AP */ npxinit(__INITIAL_NPXCW__); + /* set up SSE registers */ + enable_sse(); + /* A quick check from sanity claus */ apic_id = (apic_id_to_logical[(lapic.id & 0x0f000000) >> 24]); if (PCPU_GET(cpuid) != apic_id) { diff --git a/sys/amd64/amd64/support.S b/sys/amd64/amd64/support.S index 3218774..55bc29c 100644 --- a/sys/amd64/amd64/support.S +++ b/sys/amd64/amd64/support.S @@ -976,7 +976,7 @@ ENTRY(i586_copyin) ENTRY(fastmove) pushl %ebp movl %esp,%ebp - subl $PCB_SAVEFPU_SIZE+3*4,%esp + subl $PCB_SAVE87_SIZE+3*4,%esp movl 8(%ebp),%ecx cmpl $63,%ecx @@ -1018,7 +1018,7 @@ ENTRY(fastmove) movl PCPU(CURPCB),%esi addl $PCB_SAVEFPU,%esi cld - movl $PCB_SAVEFPU_SIZE>>2,%ecx + movl $PCB_SAVE87_SIZE>>2,%ecx rep movsl movl -12(%ebp),%ecx @@ -1102,7 +1102,7 @@ fastmove_loop: addl $PCB_SAVEFPU,%edi movl %esp,%esi cld - movl $PCB_SAVEFPU_SIZE>>2,%ecx + movl $PCB_SAVE87_SIZE>>2,%ecx rep movsl movl -12(%ebp),%ecx @@ -1147,7 +1147,7 @@ fastmove_fault: addl $PCB_SAVEFPU,%edi movl %esp,%esi cld - movl $PCB_SAVEFPU_SIZE>>2,%ecx + movl $PCB_SAVE87_SIZE>>2,%ecx rep movsl diff --git a/sys/amd64/amd64/support.s b/sys/amd64/amd64/support.s index 3218774..55bc29c 100644 --- a/sys/amd64/amd64/support.s +++ b/sys/amd64/amd64/support.s @@ -976,7 +976,7 @@ ENTRY(i586_copyin) ENTRY(fastmove) pushl %ebp movl %esp,%ebp - subl $PCB_SAVEFPU_SIZE+3*4,%esp + subl $PCB_SAVE87_SIZE+3*4,%esp movl 8(%ebp),%ecx cmpl $63,%ecx @@ -1018,7 +1018,7 @@ ENTRY(fastmove) movl PCPU(CURPCB),%esi addl $PCB_SAVEFPU,%esi cld - movl $PCB_SAVEFPU_SIZE>>2,%ecx + movl $PCB_SAVE87_SIZE>>2,%ecx rep movsl movl -12(%ebp),%ecx @@ -1102,7 +1102,7 @@ fastmove_loop: addl $PCB_SAVEFPU,%edi movl %esp,%esi cld - movl $PCB_SAVEFPU_SIZE>>2,%ecx + movl $PCB_SAVE87_SIZE>>2,%ecx rep movsl movl -12(%ebp),%ecx @@ -1147,7 +1147,7 @@ fastmove_fault: addl $PCB_SAVEFPU,%edi movl %esp,%esi cld - movl $PCB_SAVEFPU_SIZE>>2,%ecx 
+ movl $PCB_SAVE87_SIZE>>2,%ecx rep movsl diff --git a/sys/amd64/amd64/trap.c b/sys/amd64/amd64/trap.c index b1ab3bd..0431e0e 100644 --- a/sys/amd64/amd64/trap.c +++ b/sys/amd64/amd64/trap.c @@ -386,6 +386,11 @@ restart: ucode = T_FPOPFLT; i = SIGILL; break; + + case T_XMMFLT: /* SIMD floating-point exception */ + ucode = 0; /* XXX */ + i = SIGFPE; + break; } } else { /* kernel trap */ diff --git a/sys/amd64/amd64/vm_machdep.c b/sys/amd64/amd64/vm_machdep.c index e5e5ea6..05efb4d 100644 --- a/sys/amd64/amd64/vm_machdep.c +++ b/sys/amd64/amd64/vm_machdep.c @@ -148,7 +148,7 @@ cpu_fork(p1, p2, flags) p1->p_addr->u_pcb.pcb_gs = rgs(); savecrit = critical_enter(); if (PCPU_GET(npxproc) == p1) - npxsave(&p1->p_addr->u_pcb.pcb_savefpu); + npxsave(&p1->p_addr->u_pcb.pcb_save); critical_exit(savecrit); #endif diff --git a/sys/amd64/include/fpu.h b/sys/amd64/include/fpu.h index 11f0478..1474f2f 100644 --- a/sys/amd64/include/fpu.h +++ b/sys/amd64/include/fpu.h @@ -85,6 +85,42 @@ struct save87 { u_char sv_pad[64]; /* padding; used by emulators */ }; +struct envxmm { + u_int16_t en_cw; /* control word (16bits) */ + u_int16_t en_sw; /* status word (16bits) */ + u_int16_t en_tw; /* tag word (16bits) */ + u_int16_t en_opcode; /* opcode last executed (11 bits) */ + u_int32_t en_fip; /* floating point instruction pointer */ + u_int16_t en_fcs; /* floating code segment selector */ + u_int16_t en_pad0; /* padding */ + u_int32_t en_foo; /* floating operand offset */ + u_int16_t en_fos; /* floating operand segment selector */ + u_int16_t en_pad1; /* padding */ + u_int32_t en_mxcsr; /* SSE control/status register */ + u_int32_t en_pad2; /* padding */ +}; + +/* Contents of each SSE extended accumulator */ +struct xmmacc { + u_char xmm_bytes[16]; +}; + +struct savexmm { + struct envxmm sv_env; + struct { + struct fpacc87 fp_acc; + u_char fp_pad[6]; /* padding */ + } sv_fp[8]; + struct xmmacc sv_xmm[8]; + u_long sv_ex_sw; /* status word for last exception */ + u_char sv_pad[220]; +} 
__attribute__((aligned(16))); + +union savefpu { + struct save87 sv_87; + struct savexmm sv_xmm; +}; + /* * The hardware default control word for i387's and later coprocessors is * 0x37F, giving: @@ -108,7 +144,7 @@ struct save87 { int npxdna __P((void)); void npxexit __P((struct proc *p)); void npxinit __P((int control)); -void npxsave __P((struct save87 *addr)); +void npxsave __P((union savefpu *addr)); int npxtrap __P((void)); #endif diff --git a/sys/amd64/include/md_var.h b/sys/amd64/include/md_var.h index 5a2ed26..6c81a96 100644 --- a/sys/amd64/include/md_var.h +++ b/sys/amd64/include/md_var.h @@ -47,6 +47,7 @@ extern int (*copyout_vector) __P((const void *kaddr, void *udaddr, extern u_int cpu_feature; extern u_int cpu_high; extern u_int cpu_id; +extern u_int cpu_fxsr; extern char cpu_vendor[]; extern u_int cyrix_did; extern char kstack[]; diff --git a/sys/amd64/include/mptable.h b/sys/amd64/include/mptable.h index 0a0de69..d5af7b3 100644 --- a/sys/amd64/include/mptable.h +++ b/sys/amd64/include/mptable.h @@ -2221,6 +2221,8 @@ invltlb(void) * This is called once the rest of the system is up and running and we're * ready to let the AP's out of the pen. 
*/ +extern void enable_sse(void); + void ap_init(void) { @@ -2260,6 +2262,9 @@ ap_init(void) /* set up FPU state on the AP */ npxinit(__INITIAL_NPXCW__); + /* set up SSE registers */ + enable_sse(); + /* A quick check from sanity claus */ apic_id = (apic_id_to_logical[(lapic.id & 0x0f000000) >> 24]); if (PCPU_GET(cpuid) != apic_id) { diff --git a/sys/amd64/include/npx.h b/sys/amd64/include/npx.h index 11f0478..1474f2f 100644 --- a/sys/amd64/include/npx.h +++ b/sys/amd64/include/npx.h @@ -85,6 +85,42 @@ struct save87 { u_char sv_pad[64]; /* padding; used by emulators */ }; +struct envxmm { + u_int16_t en_cw; /* control word (16bits) */ + u_int16_t en_sw; /* status word (16bits) */ + u_int16_t en_tw; /* tag word (16bits) */ + u_int16_t en_opcode; /* opcode last executed (11 bits) */ + u_int32_t en_fip; /* floating point instruction pointer */ + u_int16_t en_fcs; /* floating code segment selector */ + u_int16_t en_pad0; /* padding */ + u_int32_t en_foo; /* floating operand offset */ + u_int16_t en_fos; /* floating operand segment selector */ + u_int16_t en_pad1; /* padding */ + u_int32_t en_mxcsr; /* SSE control/status register */ + u_int32_t en_pad2; /* padding */ +}; + +/* Contents of each SSE extended accumulator */ +struct xmmacc { + u_char xmm_bytes[16]; +}; + +struct savexmm { + struct envxmm sv_env; + struct { + struct fpacc87 fp_acc; + u_char fp_pad[6]; /* padding */ + } sv_fp[8]; + struct xmmacc sv_xmm[8]; + u_long sv_ex_sw; /* status word for last exception */ + u_char sv_pad[220]; +} __attribute__((aligned(16))); + +union savefpu { + struct save87 sv_87; + struct savexmm sv_xmm; +}; + /* * The hardware default control word for i387's and later coprocessors is * 0x37F, giving: @@ -108,7 +144,7 @@ struct save87 { int npxdna __P((void)); void npxexit __P((struct proc *p)); void npxinit __P((int control)); -void npxsave __P((struct save87 *addr)); +void npxsave __P((union savefpu *addr)); int npxtrap __P((void)); #endif diff --git a/sys/amd64/include/pcb.h 
b/sys/amd64/include/pcb.h index 962fc6f..6ea7c3d 100644 --- a/sys/amd64/include/pcb.h +++ b/sys/amd64/include/pcb.h @@ -62,7 +62,8 @@ struct pcb { int pcb_dr7; struct pcb_ldt *pcb_ldt; /* per process (user) LDT */ - struct save87 pcb_savefpu; /* floating point state for 287/387 */ + union savefpu pcb_save; +#define pcb_savefpu pcb_save.sv_87 u_char pcb_flags; #define FP_SOFTFP 0x01 /* process using software fltng pnt emulator */ #define PCB_DBREGS 0x02 /* process using debug registers */ diff --git a/sys/amd64/include/specialreg.h b/sys/amd64/include/specialreg.h index 937cab0..02440c9 100644 --- a/sys/amd64/include/specialreg.h +++ b/sys/amd64/include/specialreg.h @@ -93,6 +93,8 @@ #define CPUID_PGE 0x2000 #define CPUID_MCA 0x4000 #define CPUID_CMOV 0x8000 +#define CPUID_FXSR 0x01000000 +#define CPUID_XMM 0x02000000 /* * Model-specific registers for the i386 family diff --git a/sys/amd64/include/trap.h b/sys/amd64/include/trap.h index 6db97ec..67becb3 100644 --- a/sys/amd64/include/trap.h +++ b/sys/amd64/include/trap.h @@ -64,7 +64,8 @@ #define T_SEGNPFLT 26 /* segment not present fault */ #define T_STKFLT 27 /* stack fault */ #define T_MCHK 28 /* machine check trap */ -#define T_RESERVED 29 /* reserved (unknown) */ +#define T_XMMFLT 29 /* SIMD floating-point exception */ +#define T_RESERVED 30 /* reserved (unknown) */ /* XXX most of the following codes aren't used, but could be. 
*/ diff --git a/sys/amd64/isa/npx.c b/sys/amd64/isa/npx.c index b6c69a0..f6410e9 100644 --- a/sys/amd64/isa/npx.c +++ b/sys/amd64/isa/npx.c @@ -35,6 +35,7 @@ * $FreeBSD$ */ +#include "opt_cpu.h" #include "opt_debug_npx.h" #include "opt_math_emulate.h" @@ -99,6 +100,8 @@ #define fnstsw(addr) __asm __volatile("fnstsw %0" : "=m" (*(addr))) #define fp_divide_by_0() __asm("fldz; fld1; fdiv %st,%st(1); fnop") #define frstor(addr) __asm("frstor %0" : : "m" (*(addr))) +#define fxrstor(addr) __asm("fxrstor %0" : : "m" (*(addr))) +#define fxsave(addr) __asm __volatile("fxsave %0" : "=m" (*(addr))) #define start_emulating() __asm("smsw %%ax; orb %0,%%al; lmsw %%ax" \ : : "n" (CR0_TS) : "ax") #define stop_emulating() __asm("clts") @@ -113,11 +116,41 @@ void fnstcw __P((caddr_t addr)); void fnstsw __P((caddr_t addr)); void fp_divide_by_0 __P((void)); void frstor __P((caddr_t addr)); +void fxsave __P((caddr_t addr)); +void fxrstor __P((caddr_t addr)); void start_emulating __P((void)); void stop_emulating __P((void)); #endif /* __GNUC__ */ +#ifdef CPU_ENABLE_SSE +#define GET_FPU_CW(proc) \ + (cpu_fxsr ? \ + (proc)->p_addr->u_pcb.pcb_save.sv_xmm.sv_env.en_cw : \ + (proc)->p_addr->u_pcb.pcb_save.sv_87.sv_env.en_cw) +#define GET_FPU_SW(proc) \ + (cpu_fxsr ? \ + (proc)->p_addr->u_pcb.pcb_save.sv_xmm.sv_env.en_sw : \ + (proc)->p_addr->u_pcb.pcb_save.sv_87.sv_env.en_sw) +#define MASK_FPU_SW(proc, mask) \ + (cpu_fxsr ? \ + ((proc)->p_addr->u_pcb.pcb_save.sv_xmm.sv_env.en_sw &= (mask)) : \ + ((proc)->p_addr->u_pcb.pcb_save.sv_87.sv_env.en_sw &= (mask))) /* writes the masked value back to the saved en_sw */ +#define GET_FPU_EXSW_PTR(pcb) \ + (cpu_fxsr ? 
\ + &(pcb)->pcb_save.sv_xmm.sv_ex_sw : \ + &(pcb)->pcb_save.sv_87.sv_ex_sw) +#else /* CPU_ENABLE_SSE */ +#define GET_FPU_CW(proc) \ + ((proc)->p_addr->u_pcb.pcb_save.sv_87.sv_env.en_cw) +#define GET_FPU_SW(proc) \ + ((proc)->p_addr->u_pcb.pcb_save.sv_87.sv_env.en_sw) +#define MASK_FPU_SW(proc, mask) \ + ((proc)->p_addr->u_pcb.pcb_save.sv_87.sv_env.en_sw &= (mask)) /* writes the masked value back to the saved en_sw */ +#define GET_FPU_EXSW_PTR(pcb) \ + (&(pcb)->pcb_save.sv_87.sv_ex_sw) +#endif /* CPU_ENABLE_SSE */ + typedef u_char bool_t; static int npx_attach __P((device_t dev)); @@ -127,6 +160,8 @@ static void npx_intr __P((void *)); #endif static int npx_probe __P((device_t dev)); static int npx_probe1 __P((device_t dev)); +static void fpusave __P((union savefpu *, u_char)); +static void fpurstor __P((union savefpu *, u_char)); #ifdef I586_CPU_XXX static long timezero __P((const char *funcname, void (*func)(void *buf, size_t len))); @@ -529,7 +564,7 @@ void npxinit(control) u_short control; { - struct save87 dummy; + union savefpu dummy; critical_t savecrit; if (!npx_exists) @@ -544,7 +579,7 @@ npxinit(control) stop_emulating(); fldcw(&control); if (PCPU_GET(curpcb) != NULL) - fnsave(&PCPU_GET(curpcb)->pcb_savefpu); + fpusave(&PCPU_GET(curpcb)->pcb_save, curproc->p_oncpu); start_emulating(); critical_exit(savecrit); } @@ -560,7 +595,7 @@ npxexit(p) savecrit = critical_enter(); if (p == PCPU_GET(npxproc)) - npxsave(&PCPU_GET(curpcb)->pcb_savefpu); + npxsave(&PCPU_GET(curpcb)->pcb_save); critical_exit(savecrit); #ifdef NPX_DEBUG if (npx_exists) { @@ -773,6 +808,7 @@ npxtrap() { critical_t savecrit; u_short control, status; + u_long *exstat; if (!npx_exists) { printf("npxtrap: npxproc = %p, curproc = %p, npx_exists = %d\n", @@ -787,16 +823,17 @@ npxtrap() * wherever they are. 
*/ if (PCPU_GET(npxproc) != curproc) { - control = curproc->p_addr->u_pcb.pcb_savefpu.sv_env.en_cw; - status = curproc->p_addr->u_pcb.pcb_savefpu.sv_env.en_sw; + control = GET_FPU_CW(curproc); + status = GET_FPU_SW(curproc); } else { fnstcw(&control); fnstsw(&status); } - curproc->p_addr->u_pcb.pcb_savefpu.sv_ex_sw = status; + exstat = GET_FPU_EXSW_PTR(&curproc->p_addr->u_pcb); + *exstat = status; if (PCPU_GET(npxproc) != curproc) - curproc->p_addr->u_pcb.pcb_savefpu.sv_env.en_sw &= ~0x80bf; + MASK_FPU_SW(curproc, ~0x80bf); else fnclex(); critical_exit(savecrit); @@ -813,6 +850,7 @@ npxtrap() int npxdna() { + u_long *exstat; critical_t s; if (!npx_exists) @@ -828,7 +866,9 @@ npxdna() * Record new context early in case frstor causes an IRQ13. */ PCPU_SET(npxproc, CURPROC); - PCPU_GET(curpcb)->pcb_savefpu.sv_ex_sw = 0; + + exstat = GET_FPU_EXSW_PTR(PCPU_GET(curpcb)); + *exstat = 0; /* * The following frstor may cause an IRQ13 when the state being * restored has a pending error. The error will appear to have been @@ -841,7 +881,7 @@ npxdna() * fnsave are broken, so our treatment breaks fnclex if it is the * first FPU instruction after a context switch. 
*/ - frstor(&PCPU_GET(curpcb)->pcb_savefpu); + fpurstor(&PCPU_GET(curpcb)->pcb_save, curproc->p_oncpu); critical_exit(s); return (1); @@ -872,15 +912,46 @@ npxdna() */ void npxsave(addr) - struct save87 *addr; + union savefpu *addr; { stop_emulating(); - fnsave(addr); + fpusave(addr, curproc->p_oncpu); + start_emulating(); PCPU_SET(npxproc, NULL); } +static void +fpusave(addr, oncpu) + union savefpu *addr; + u_char oncpu; +{ + static struct savexmm svxmm[MAXCPU]; + + if (!cpu_fxsr) + fnsave(addr); + else { + fxsave(&svxmm[oncpu]); + bcopy(&svxmm[oncpu], addr, sizeof(struct savexmm)); + } +} + +static void +fpurstor(addr, oncpu) + union savefpu *addr; + u_char oncpu; +{ + static struct savexmm svxmm[MAXCPU]; + + if (!cpu_fxsr) + frstor(addr); + else { + bcopy(addr, &svxmm[oncpu], sizeof (struct savexmm)); + fxrstor(&svxmm[oncpu]); + } +} + #ifdef I586_CPU_XXX static long timezero(funcname, func) diff --git a/sys/conf/NOTES b/sys/conf/NOTES index 6933609..e92da02 100644 --- a/sys/conf/NOTES +++ b/sys/conf/NOTES @@ -187,6 +187,8 @@ cpu I686_CPU # aka Pentium Pro(tm) # reorder). This option should not be used if you use memory mapped # I/O device(s). # +# CPU_ENABLE_SSE enables SSE/MMX2 instructions support. +# # CPU_FASTER_5X86_FPU enables faster FPU exception handler. # # CPU_I486_ON_386 enables CPU cache on i486 based CPU upgrade products @@ -248,6 +250,7 @@ options CPU_BLUELIGHTNING_3X options CPU_BTB_EN options CPU_DIRECT_MAPPED_CACHE options CPU_DISABLE_5X86_LSSER +options CPU_ENABLE_SSE options CPU_FASTER_5X86_FPU options CPU_I486_ON_386 options CPU_IORT diff --git a/sys/conf/options.i386 b/sys/conf/options.i386 index e37882c..6cc4ebc 100644 --- a/sys/conf/options.i386 +++ b/sys/conf/options.i386 @@ -59,6 +59,7 @@ CPU_WT_ALLOC opt_cpu.h CYRIX_CACHE_WORKS opt_cpu.h CYRIX_CACHE_REALLY_WORKS opt_cpu.h NO_MEMORY_HOLE opt_cpu.h +CPU_ENABLE_SSE opt_cpu.h # The CPU type affects the endian conversion functions all over the kernel. 
I386_CPU opt_global.h diff --git a/sys/i386/conf/NOTES b/sys/i386/conf/NOTES index 6933609..e92da02 100644 --- a/sys/i386/conf/NOTES +++ b/sys/i386/conf/NOTES @@ -187,6 +187,8 @@ cpu I686_CPU # aka Pentium Pro(tm) # reorder). This option should not be used if you use memory mapped # I/O device(s). # +# CPU_ENABLE_SSE enables SSE/MMX2 instructions support. +# # CPU_FASTER_5X86_FPU enables faster FPU exception handler. # # CPU_I486_ON_386 enables CPU cache on i486 based CPU upgrade products @@ -248,6 +250,7 @@ options CPU_BLUELIGHTNING_3X options CPU_BTB_EN options CPU_DIRECT_MAPPED_CACHE options CPU_DISABLE_5X86_LSSER +options CPU_ENABLE_SSE options CPU_FASTER_5X86_FPU options CPU_I486_ON_386 options CPU_IORT diff --git a/sys/i386/i386/exception.s b/sys/i386/i386/exception.s index 3ccbc23..419fbd2 100644 --- a/sys/i386/i386/exception.s +++ b/sys/i386/i386/exception.s @@ -153,6 +153,9 @@ IDTVEC(fpu) IDTVEC(align) TRAP(T_ALIGNFLT) +IDTVEC(xmm) + pushl $0; TRAP(T_XMMFLT) + /* * alltraps entry point. Interrupts are enabled if this was a trap * gate (TGT), else disabled if this was an interrupt gate (IGT). 
diff --git a/sys/i386/i386/genassym.c b/sys/i386/i386/genassym.c index c6f0970..ac664e8 100644 --- a/sys/i386/i386/genassym.c +++ b/sys/i386/i386/genassym.c @@ -126,8 +126,9 @@ ASSYM(PCB_EXT, offsetof(struct pcb, pcb_ext)); ASSYM(PCB_SPARE, offsetof(struct pcb, __pcb_spare)); ASSYM(PCB_FLAGS, offsetof(struct pcb, pcb_flags)); -ASSYM(PCB_SAVEFPU, offsetof(struct pcb, pcb_savefpu)); -ASSYM(PCB_SAVEFPU_SIZE, sizeof(struct save87)); +ASSYM(PCB_SAVEFPU, offsetof(struct pcb, pcb_save)); +ASSYM(PCB_SAVEFPU_SIZE, sizeof(union savefpu)); +ASSYM(PCB_SAVE87_SIZE, sizeof(struct save87)); ASSYM(PCB_ONFAULT, offsetof(struct pcb, pcb_onfault)); #ifdef SMP diff --git a/sys/i386/i386/initcpu.c b/sys/i386/i386/initcpu.c index 8b39b44..7fb56fa 100644 --- a/sys/i386/i386/initcpu.c +++ b/sys/i386/i386/initcpu.c @@ -34,6 +34,7 @@ #include <sys/param.h> #include <sys/kernel.h> #include <sys/systm.h> +#include <sys/sysctl.h> #include <machine/cputypes.h> #include <machine/md_var.h> @@ -61,8 +62,14 @@ static void init_6x86(void); static void init_6x86MX(void); static void init_ppro(void); static void init_mendocino(void); +void enable_sse(); #endif +int hw_instruction_sse = 0; +SYSCTL_INT(_hw, OID_AUTO, instruction_sse, CTLFLAG_RD, + &hw_instruction_sse, 0, + "SIMD/MMX2 instructions available in CPU"); + #ifdef I486_CPU /* * IBM Blue Lightning @@ -501,6 +508,20 @@ init_mendocino(void) #endif /* CPU_PPRO2CELERON */ } +/* + * Initialize CR4 (Control register 4) to enable SSE instructions. 
+ */ +void +enable_sse(void) +{ +#if defined(CPU_ENABLE_SSE) + if ((cpu_feature & CPUID_XMM) && (cpu_feature & CPUID_FXSR)) { + load_cr4(rcr4() | CR4_FXSR | CR4_XMM); + cpu_fxsr = hw_instruction_sse = 1; + } +#endif +} + #endif /* I686_CPU */ void @@ -544,6 +565,7 @@ initializecpu(void) init_mendocino(); break; } + enable_sse(); } break; #endif diff --git a/sys/i386/i386/locore.s b/sys/i386/i386/locore.s index 379af45..cdfd799 100644 --- a/sys/i386/i386/locore.s +++ b/sys/i386/i386/locore.s @@ -113,12 +113,13 @@ HIDENAME(tmpstk): .globl boothowto,bootdev .globl cpu,cpu_vendor,cpu_id,bootinfo - .globl cpu_high, cpu_feature + .globl cpu_high, cpu_feature, cpu_fxsr cpu: .long 0 /* are we 386, 386sx, or 486 */ cpu_id: .long 0 /* stepping ID */ cpu_high: .long 0 /* highest arg to CPUID */ cpu_feature: .long 0 /* features */ +cpu_fxsr: .long 0 /* use fxsave/fxrstor instruction */ cpu_vendor: .space 20 /* CPU origin code */ bootinfo: .space BOOTINFO_SIZE /* bootinfo that we can handle */ diff --git a/sys/i386/i386/machdep.c b/sys/i386/i386/machdep.c index 544aff5..7e9a4dd 100644 --- a/sys/i386/i386/machdep.c +++ b/sys/i386/i386/machdep.c @@ -127,6 +127,10 @@ extern void initializecpu(void); #define EFL_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0) static void cpu_startup __P((void *)); +#ifdef CPU_ENABLE_SSE +static void set_fpregs_xmm __P((struct save87 *, struct savexmm *)); +static void fill_fpregs_xmm __P((struct savexmm *, struct save87 *)); +#endif /* CPU_ENABLE_SSE */ SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL) int _udatasel, _ucodesel; @@ -1361,7 +1365,7 @@ extern inthand_t IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm), IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot), IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align), - IDTVEC(lcall_syscall), IDTVEC(int0x80_syscall); + IDTVEC(xmm), IDTVEC(lcall_syscall), IDTVEC(int0x80_syscall); void sdtossd(sd, ssd) @@ -1900,6 +1904,7 @@ init386(first) setidt(16, 
&IDTVEC(fpu), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(17, &IDTVEC(align), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(18, &IDTVEC(mchk), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); + setidt(19, &IDTVEC(xmm), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(0x80, &IDTVEC(int0x80_syscall), SDT_SYS386TGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); @@ -2092,8 +2097,8 @@ int ptrace_write_u(p, off, data) *(int*)((char *)p->p_addr + off) = data; return (0); } - min = offsetof(struct user, u_pcb) + offsetof(struct pcb, pcb_savefpu); - if (off >= min && off <= min + sizeof(struct save87) - sizeof(int)) { + min = offsetof(struct user, u_pcb) + offsetof(struct pcb, pcb_save); + if (off >= min && off <= min + sizeof(union savefpu) - sizeof(int)) { *(int*)((char *)p->p_addr + off) = data; return (0); } @@ -2161,12 +2166,73 @@ set_regs(p, regs) return (0); } +#ifdef CPU_ENABLE_SSE +static void +fill_fpregs_xmm(sv_xmm, sv_87) + struct savexmm *sv_xmm; + struct save87 *sv_87; +{ + register struct env87 *penv_87 = &sv_87->sv_env; + register struct envxmm *penv_xmm = &sv_xmm->sv_env; + int i; + + /* FPU control/status */ + penv_87->en_cw = penv_xmm->en_cw; + penv_87->en_sw = penv_xmm->en_sw; + penv_87->en_tw = penv_xmm->en_tw; + penv_87->en_fip = penv_xmm->en_fip; + penv_87->en_fcs = penv_xmm->en_fcs; + penv_87->en_opcode = penv_xmm->en_opcode; + penv_87->en_foo = penv_xmm->en_foo; + penv_87->en_fos = penv_xmm->en_fos; + + /* FPU registers */ + for (i = 0; i < 8; ++i) + sv_87->sv_ac[i] = sv_xmm->sv_fp[i].fp_acc; + + sv_87->sv_ex_sw = sv_xmm->sv_ex_sw; +} + +static void +set_fpregs_xmm(sv_87, sv_xmm) + struct save87 *sv_87; + struct savexmm *sv_xmm; +{ + register struct env87 *penv_87 = &sv_87->sv_env; + register struct envxmm *penv_xmm = &sv_xmm->sv_env; + int i; + + /* FPU control/status */ + penv_xmm->en_cw = penv_87->en_cw; + penv_xmm->en_sw = penv_87->en_sw; + penv_xmm->en_tw = penv_87->en_tw; + penv_xmm->en_fip = penv_87->en_fip; + 
penv_xmm->en_fcs = penv_87->en_fcs; + penv_xmm->en_opcode = penv_87->en_opcode; + penv_xmm->en_foo = penv_87->en_foo; + penv_xmm->en_fos = penv_87->en_fos; + + /* FPU registers */ + for (i = 0; i < 8; ++i) + sv_xmm->sv_fp[i].fp_acc = sv_87->sv_ac[i]; + + sv_xmm->sv_ex_sw = sv_87->sv_ex_sw; +} +#endif /* CPU_ENABLE_SSE */ + int fill_fpregs(p, fpregs) struct proc *p; struct fpreg *fpregs; { - bcopy(&p->p_addr->u_pcb.pcb_savefpu, fpregs, sizeof *fpregs); +#ifdef CPU_ENABLE_SSE + if (cpu_fxsr) { + fill_fpregs_xmm(&p->p_addr->u_pcb.pcb_save.sv_xmm, + (struct save87 *)fpregs); + return (0); + } +#endif /* CPU_ENABLE_SSE */ + bcopy(&p->p_addr->u_pcb.pcb_save.sv_87, fpregs, sizeof *fpregs); return (0); } @@ -2175,7 +2241,14 @@ set_fpregs(p, fpregs) struct proc *p; struct fpreg *fpregs; { - bcopy(fpregs, &p->p_addr->u_pcb.pcb_savefpu, sizeof *fpregs); +#ifdef CPU_ENABLE_SSE + if (cpu_fxsr) { + set_fpregs_xmm((struct save87 *)fpregs, + &p->p_addr->u_pcb.pcb_save.sv_xmm); + return (0); + } +#endif /* CPU_ENABLE_SSE */ + bcopy(fpregs, &p->p_addr->u_pcb.pcb_save.sv_87, sizeof *fpregs); return (0); } diff --git a/sys/i386/i386/mp_machdep.c b/sys/i386/i386/mp_machdep.c index 0a0de69..d5af7b3 100644 --- a/sys/i386/i386/mp_machdep.c +++ b/sys/i386/i386/mp_machdep.c @@ -2221,6 +2221,8 @@ invltlb(void) * This is called once the rest of the system is up and running and we're * ready to let the AP's out of the pen. 
*/ +extern void enable_sse(void); + void ap_init(void) { @@ -2260,6 +2262,9 @@ ap_init(void) /* set up FPU state on the AP */ npxinit(__INITIAL_NPXCW__); + /* set up SSE registers */ + enable_sse(); + /* A quick check from sanity claus */ apic_id = (apic_id_to_logical[(lapic.id & 0x0f000000) >> 24]); if (PCPU_GET(cpuid) != apic_id) { diff --git a/sys/i386/i386/mptable.c b/sys/i386/i386/mptable.c index 0a0de69..d5af7b3 100644 --- a/sys/i386/i386/mptable.c +++ b/sys/i386/i386/mptable.c @@ -2221,6 +2221,8 @@ invltlb(void) * This is called once the rest of the system is up and running and we're * ready to let the AP's out of the pen. */ +extern void enable_sse(void); + void ap_init(void) { @@ -2260,6 +2262,9 @@ ap_init(void) /* set up FPU state on the AP */ npxinit(__INITIAL_NPXCW__); + /* set up SSE registers */ + enable_sse(); + /* A quick check from sanity claus */ apic_id = (apic_id_to_logical[(lapic.id & 0x0f000000) >> 24]); if (PCPU_GET(cpuid) != apic_id) { diff --git a/sys/i386/i386/support.s b/sys/i386/i386/support.s index 3218774..55bc29c 100644 --- a/sys/i386/i386/support.s +++ b/sys/i386/i386/support.s @@ -976,7 +976,7 @@ ENTRY(i586_copyin) ENTRY(fastmove) pushl %ebp movl %esp,%ebp - subl $PCB_SAVEFPU_SIZE+3*4,%esp + subl $PCB_SAVE87_SIZE+3*4,%esp movl 8(%ebp),%ecx cmpl $63,%ecx @@ -1018,7 +1018,7 @@ ENTRY(fastmove) movl PCPU(CURPCB),%esi addl $PCB_SAVEFPU,%esi cld - movl $PCB_SAVEFPU_SIZE>>2,%ecx + movl $PCB_SAVE87_SIZE>>2,%ecx rep movsl movl -12(%ebp),%ecx @@ -1102,7 +1102,7 @@ fastmove_loop: addl $PCB_SAVEFPU,%edi movl %esp,%esi cld - movl $PCB_SAVEFPU_SIZE>>2,%ecx + movl $PCB_SAVE87_SIZE>>2,%ecx rep movsl movl -12(%ebp),%ecx @@ -1147,7 +1147,7 @@ fastmove_fault: addl $PCB_SAVEFPU,%edi movl %esp,%esi cld - movl $PCB_SAVEFPU_SIZE>>2,%ecx + movl $PCB_SAVE87_SIZE>>2,%ecx rep movsl diff --git a/sys/i386/i386/trap.c b/sys/i386/i386/trap.c index b1ab3bd..0431e0e 100644 --- a/sys/i386/i386/trap.c +++ b/sys/i386/i386/trap.c @@ -386,6 +386,11 @@ restart: ucode = 
T_FPOPFLT; i = SIGILL; break; + + case T_XMMFLT: /* SIMD floating-point exception */ + ucode = 0; /* XXX */ + i = SIGFPE; + break; } } else { /* kernel trap */ diff --git a/sys/i386/i386/vm_machdep.c b/sys/i386/i386/vm_machdep.c index e5e5ea6..05efb4d 100644 --- a/sys/i386/i386/vm_machdep.c +++ b/sys/i386/i386/vm_machdep.c @@ -148,7 +148,7 @@ cpu_fork(p1, p2, flags) p1->p_addr->u_pcb.pcb_gs = rgs(); savecrit = critical_enter(); if (PCPU_GET(npxproc) == p1) - npxsave(&p1->p_addr->u_pcb.pcb_savefpu); + npxsave(&p1->p_addr->u_pcb.pcb_save); critical_exit(savecrit); #endif diff --git a/sys/i386/include/md_var.h b/sys/i386/include/md_var.h index 5a2ed26..6c81a96 100644 --- a/sys/i386/include/md_var.h +++ b/sys/i386/include/md_var.h @@ -47,6 +47,7 @@ extern int (*copyout_vector) __P((const void *kaddr, void *udaddr, extern u_int cpu_feature; extern u_int cpu_high; extern u_int cpu_id; +extern u_int cpu_fxsr; extern char cpu_vendor[]; extern u_int cyrix_did; extern char kstack[]; diff --git a/sys/i386/include/mptable.h b/sys/i386/include/mptable.h index 0a0de69..d5af7b3 100644 --- a/sys/i386/include/mptable.h +++ b/sys/i386/include/mptable.h @@ -2221,6 +2221,8 @@ invltlb(void) * This is called once the rest of the system is up and running and we're * ready to let the AP's out of the pen. 
*/ +extern void enable_sse(void); + void ap_init(void) { @@ -2260,6 +2262,9 @@ ap_init(void) /* set up FPU state on the AP */ npxinit(__INITIAL_NPXCW__); + /* set up SSE registers */ + enable_sse(); + /* A quick check from sanity claus */ apic_id = (apic_id_to_logical[(lapic.id & 0x0f000000) >> 24]); if (PCPU_GET(cpuid) != apic_id) { diff --git a/sys/i386/include/npx.h b/sys/i386/include/npx.h index 11f0478..1474f2f 100644 --- a/sys/i386/include/npx.h +++ b/sys/i386/include/npx.h @@ -85,6 +85,42 @@ struct save87 { u_char sv_pad[64]; /* padding; used by emulators */ }; +struct envxmm { + u_int16_t en_cw; /* control word (16bits) */ + u_int16_t en_sw; /* status word (16bits) */ + u_int16_t en_tw; /* tag word (16bits) */ + u_int16_t en_opcode; /* opcode last executed (11 bits ) */ + u_int32_t en_fip; /* floating point instruction pointer */ + u_int16_t en_fcs; /* floating code segment selector */ + u_int16_t en_pad0; /* padding */ + u_int32_t en_foo; /* floating operand offset */ + u_int16_t en_fos; /* floating operand segment selector */ + u_int16_t en_pad1; /* padding */ + u_int32_t en_mxcsr; /* SSE control/status register */ + u_int32_t en_pad2; /* padding */ +}; + +/* Contents of each SSE extended accumulator */ +struct xmmacc { + u_char xmm_bytes[16]; +}; + +struct savexmm { + struct envxmm sv_env; + struct { + struct fpacc87 fp_acc; + u_char fp_pad[6]; /* padding */ + } sv_fp[8]; + struct xmmacc sv_xmm[8]; + u_long sv_ex_sw; /* status word for last exception */ + u_char sv_pad[220]; +} __attribute__((aligned(16))); + +union savefpu { + struct save87 sv_87; + struct savexmm sv_xmm; +}; + /* * The hardware default control word for i387's and later coprocessors is * 0x37F, giving: @@ -108,7 +144,7 @@ struct save87 { int npxdna __P((void)); void npxexit __P((struct proc *p)); void npxinit __P((int control)); -void npxsave __P((struct save87 *addr)); +void npxsave __P((union savefpu *addr)); int npxtrap __P((void)); #endif diff --git a/sys/i386/include/pcb.h 
b/sys/i386/include/pcb.h index 962fc6f..6ea7c3d 100644 --- a/sys/i386/include/pcb.h +++ b/sys/i386/include/pcb.h @@ -62,7 +62,8 @@ struct pcb { int pcb_dr7; struct pcb_ldt *pcb_ldt; /* per process (user) LDT */ - struct save87 pcb_savefpu; /* floating point state for 287/387 */ + union savefpu pcb_save; +#define pcb_savefpu pcb_save.sv_87 u_char pcb_flags; #define FP_SOFTFP 0x01 /* process using software fltng pnt emulator */ #define PCB_DBREGS 0x02 /* process using debug registers */ diff --git a/sys/i386/include/specialreg.h b/sys/i386/include/specialreg.h index 937cab0..02440c9 100644 --- a/sys/i386/include/specialreg.h +++ b/sys/i386/include/specialreg.h @@ -93,6 +93,8 @@ #define CPUID_PGE 0x2000 #define CPUID_MCA 0x4000 #define CPUID_CMOV 0x8000 +#define CPUID_FXSR 0x01000000 +#define CPUID_XMM 0x02000000 /* * Model-specific registers for the i386 family diff --git a/sys/i386/include/trap.h b/sys/i386/include/trap.h index 6db97ec..67becb3 100644 --- a/sys/i386/include/trap.h +++ b/sys/i386/include/trap.h @@ -64,7 +64,8 @@ #define T_SEGNPFLT 26 /* segment not present fault */ #define T_STKFLT 27 /* stack fault */ #define T_MCHK 28 /* machine check trap */ -#define T_RESERVED 29 /* reserved (unknown) */ +#define T_XMMFLT 29 /* SIMD floating-point exception */ +#define T_RESERVED 30 /* reserved (unknown) */ /* XXX most of the following codes aren't used, but could be. 
*/ diff --git a/sys/i386/isa/npx.c b/sys/i386/isa/npx.c index b6c69a0..f6410e9 100644 --- a/sys/i386/isa/npx.c +++ b/sys/i386/isa/npx.c @@ -35,6 +35,7 @@ * $FreeBSD$ */ +#include "opt_cpu.h" #include "opt_debug_npx.h" #include "opt_math_emulate.h" @@ -99,6 +100,8 @@ #define fnstsw(addr) __asm __volatile("fnstsw %0" : "=m" (*(addr))) #define fp_divide_by_0() __asm("fldz; fld1; fdiv %st,%st(1); fnop") #define frstor(addr) __asm("frstor %0" : : "m" (*(addr))) +#define fxrstor(addr) __asm("fxrstor %0" : : "m" (*(addr))) +#define fxsave(addr) __asm __volatile("fxsave %0" : "=m" (*(addr))) #define start_emulating() __asm("smsw %%ax; orb %0,%%al; lmsw %%ax" \ : : "n" (CR0_TS) : "ax") #define stop_emulating() __asm("clts") @@ -113,11 +116,41 @@ void fnstcw __P((caddr_t addr)); void fnstsw __P((caddr_t addr)); void fp_divide_by_0 __P((void)); void frstor __P((caddr_t addr)); +void fxsave __P((caddr_t addr)); +void fxrstor __P((caddr_t addr)); void start_emulating __P((void)); void stop_emulating __P((void)); #endif /* __GNUC__ */ +#ifdef CPU_ENABLE_SSE +#define GET_FPU_CW(proc) \ + (cpu_fxsr ? \ + (proc)->p_addr->u_pcb.pcb_save.sv_xmm.sv_env.en_cw : \ + (proc)->p_addr->u_pcb.pcb_save.sv_87.sv_env.en_cw) +#define GET_FPU_SW(proc) \ + (cpu_fxsr ? \ + (proc)->p_addr->u_pcb.pcb_save.sv_xmm.sv_env.en_sw : \ + (proc)->p_addr->u_pcb.pcb_save.sv_87.sv_env.en_sw) +#define MASK_FPU_SW(proc, mask) \ + (cpu_fxsr ? \ + (proc)->p_addr->u_pcb.pcb_save.sv_xmm.sv_env.en_sw & (mask) : \ + (proc)->p_addr->u_pcb.pcb_save.sv_87.sv_env.en_sw & (mask)) +#define GET_FPU_EXSW_PTR(pcb) \ + (cpu_fxsr ? 
\ + &(pcb)->pcb_save.sv_xmm.sv_ex_sw : \ + &(pcb)->pcb_save.sv_87.sv_ex_sw) +#else /* CPU_ENABLE_SSE */ +#define GET_FPU_CW(proc) \ + (proc->p_addr->u_pcb.pcb_save.sv_87.sv_env.en_cw) +#define GET_FPU_SW(proc) \ + (proc->p_addr->u_pcb.pcb_save.sv_87.sv_env.en_sw) +#define MASK_FPU_SW(proc, mask) \ + ((proc)->p_addr->u_pcb.pcb_save.sv_87.sv_env.en_sw & (mask)) +#define GET_FPU_EXSW_PTR(pcb) \ + (&(pcb)->pcb_save.sv_87.sv_ex_sw) +#endif /* CPU_ENABLE_SSE */ + typedef u_char bool_t; static int npx_attach __P((device_t dev)); @@ -127,6 +160,8 @@ static void npx_intr __P((void *)); #endif static int npx_probe __P((device_t dev)); static int npx_probe1 __P((device_t dev)); +static void fpusave __P((union savefpu *, u_char)); +static void fpurstor __P((union savefpu *, u_char)); #ifdef I586_CPU_XXX static long timezero __P((const char *funcname, void (*func)(void *buf, size_t len))); @@ -529,7 +564,7 @@ void npxinit(control) u_short control; { - struct save87 dummy; + union savefpu dummy; critical_t savecrit; if (!npx_exists) @@ -544,7 +579,7 @@ npxinit(control) stop_emulating(); fldcw(&control); if (PCPU_GET(curpcb) != NULL) - fnsave(&PCPU_GET(curpcb)->pcb_savefpu); + fpusave(&PCPU_GET(curpcb)->pcb_save, curproc->p_oncpu); start_emulating(); critical_exit(savecrit); } @@ -560,7 +595,7 @@ npxexit(p) savecrit = critical_enter(); if (p == PCPU_GET(npxproc)) - npxsave(&PCPU_GET(curpcb)->pcb_savefpu); + npxsave(&PCPU_GET(curpcb)->pcb_save); critical_exit(savecrit); #ifdef NPX_DEBUG if (npx_exists) { @@ -773,6 +808,7 @@ npxtrap() { critical_t savecrit; u_short control, status; + u_long *exstat; if (!npx_exists) { printf("npxtrap: npxproc = %p, curproc = %p, npx_exists = %d\n", @@ -787,16 +823,17 @@ npxtrap() * wherever they are. 
*/ if (PCPU_GET(npxproc) != curproc) { - control = curproc->p_addr->u_pcb.pcb_savefpu.sv_env.en_cw; - status = curproc->p_addr->u_pcb.pcb_savefpu.sv_env.en_sw; + control = GET_FPU_CW(curproc); + status = GET_FPU_SW(curproc); } else { fnstcw(&control); fnstsw(&status); } - curproc->p_addr->u_pcb.pcb_savefpu.sv_ex_sw = status; + exstat = GET_FPU_EXSW_PTR(&curproc->p_addr->u_pcb); + *exstat = status; if (PCPU_GET(npxproc) != curproc) - curproc->p_addr->u_pcb.pcb_savefpu.sv_env.en_sw &= ~0x80bf; + MASK_FPU_SW(curproc, ~0x80bf); else fnclex(); critical_exit(savecrit); @@ -813,6 +850,7 @@ npxtrap() int npxdna() { + u_long *exstat; critical_t s; if (!npx_exists) @@ -828,7 +866,9 @@ npxdna() * Record new context early in case frstor causes an IRQ13. */ PCPU_SET(npxproc, CURPROC); - PCPU_GET(curpcb)->pcb_savefpu.sv_ex_sw = 0; + + exstat = GET_FPU_EXSW_PTR(PCPU_GET(curpcb)); + *exstat = 0; /* * The following frstor may cause an IRQ13 when the state being * restored has a pending error. The error will appear to have been @@ -841,7 +881,7 @@ npxdna() * fnsave are broken, so our treatment breaks fnclex if it is the * first FPU instruction after a context switch. 
*/ - frstor(&PCPU_GET(curpcb)->pcb_savefpu); + fpurstor(&PCPU_GET(curpcb)->pcb_save, curproc->p_oncpu); critical_exit(s); return (1); @@ -872,15 +912,46 @@ npxdna() */ void npxsave(addr) - struct save87 *addr; + union savefpu *addr; { stop_emulating(); - fnsave(addr); + fpusave(addr, curproc->p_oncpu); + start_emulating(); PCPU_SET(npxproc, NULL); } +static void +fpusave(addr, oncpu) + union savefpu *addr; + u_char oncpu; +{ + static struct savexmm svxmm[MAXCPU]; + + if (!cpu_fxsr) + fnsave(addr); + else { + fxsave(&svxmm[oncpu]); + bcopy(&svxmm[oncpu], addr, sizeof(struct savexmm)); + } +} + +static void +fpurstor(addr, oncpu) + union savefpu *addr; + u_char oncpu; +{ + static struct savexmm svxmm[MAXCPU]; + + if (!cpu_fxsr) + frstor(addr); + else { + bcopy(addr, &svxmm[oncpu], sizeof (struct savexmm)); + fxrstor(&svxmm[oncpu]); + } +} + #ifdef I586_CPU_XXX static long timezero(funcname, func) |