From 4d88d6566a61c3b7598a583389954ccba701acb4 Mon Sep 17 00:00:00 2001
From: peter
Date: Fri, 12 Jul 2002 07:56:11 +0000
Subject: Revive backed-out pmap-related changes from Feb 2002.

The highlights are:
- It actually works this time, honest!
- Fine-grained TLB shootdowns for SMP on i386.  IPIs are very expensive,
  so try to optimize things where possible.
- Introduce ranged shootdowns that can be done as a single IPI.
- PG_G support for i386.
- Specific-cpu targeted shootdowns.  For example, there is no sense in
  globally purging the TLB cache when we are stealing a page from a local,
  unshared process on the local cpu.  Use pm_active to track this.
- Add some instrumentation for the TLB shootdown code.
- Rip out the SMP code from <machine/cpufunc.h>.
- Try to fix some very bogus PG_G and PG_PS interactions that were bad
  enough to cause vm86 BIOS calls to break.  vm86 depended on our existing
  bugs, and this was the cause of the VESA panics last time.
- Fix the silly one-line error that caused the 'panic: bad pte' last time.
- Fix a couple of other silly one-line errors that should have caused more
  pain than they did.

Some more work is needed:
- pmap_{zero,copy}_page[_idle].  These can be done without IPIs if we have
  a hook in cpu_switch.
- The IPI handlers need some cleanup.  I have a bogus %ds load that can be
  avoided.
- APTD handling is rather bogus and appears to be a large source of global
  TLB IPI shootdowns for no good reason.

I see speedups of between 1.5% and ~4% on buildworlds run in a 'while 1'
loop.  I expect to see a bigger difference when there is significant
pageout activity or the system otherwise has memory shortages.

I have backed out a few optimizations that I had been using over the last
few days in order to be a little more conservative.  I'll revisit these
over the next few days as the dust settles.

New option: DISABLE_PG_G - in case I missed something.
---
 sys/i386/include/cpufunc.h | 168 ++++++++++++++++--------------
 sys/i386/include/mptable.h | 252 +++++++++++++++++++++++++++++++++++++++++----
 sys/i386/include/pmap.h    |   9 +-
 sys/i386/include/smp.h     |  10 +-
 4 files changed, 337 insertions(+), 102 deletions(-)

(limited to 'sys/i386/include')

diff --git a/sys/i386/include/cpufunc.h b/sys/i386/include/cpufunc.h
index 2e64138..0896659 100644
--- a/sys/i386/include/cpufunc.h
+++ b/sys/i386/include/cpufunc.h
@@ -237,62 +237,6 @@ invd(void)
 	__asm __volatile("invd");
 }
 
-#if defined(SMP) && defined(_KERNEL)
-
-/*
- * When using APIC IPI's, invlpg() is not simply the invlpg instruction
- * (this is a bug) and the inlining cost is prohibitive since the call
- * executes into the IPI transmission system.
- */
-void	invlpg(u_int addr);
-void	invltlb(void);
-
-static __inline void
-cpu_invlpg(void *addr)
-{
-	__asm __volatile("invlpg %0" : : "m" (*(char *)addr) : "memory");
-}
-
-static __inline void
-cpu_invltlb(void)
-{
-	u_int	temp;
-	/*
-	 * This should be implemented as load_cr3(rcr3()) when load_cr3()
-	 * is inlined.
-	 */
-	__asm __volatile("movl %%cr3, %0; movl %0, %%cr3" : "=r" (temp)
-			 : : "memory");
-#if defined(SWTCH_OPTIM_STATS)
-	++tlb_flush_count;
-#endif
-}
-
-#else /* !(SMP && _KERNEL) */
-
-static __inline void
-invlpg(u_int addr)
-{
-	__asm __volatile("invlpg %0" : : "m" (*(char *)addr) : "memory");
-}
-
-static __inline void
-invltlb(void)
-{
-	u_int	temp;
-	/*
-	 * This should be implemented as load_cr3(rcr3()) when load_cr3()
-	 * is inlined.
-	 */
-	__asm __volatile("movl %%cr3, %0; movl %0, %%cr3" : "=r" (temp)
-			 : : "memory");
-#ifdef SWTCH_OPTIM_STATS
-	++tlb_flush_count;
-#endif
-}
-
-#endif /* SMP && _KERNEL */
-
 static __inline u_short
 inw(u_int port)
 {
@@ -364,15 +308,6 @@ ia32_pause(void)
 }
 
 static __inline u_int
-rcr2(void)
-{
-	u_int	data;
-
-	__asm __volatile("movl %%cr2,%0" : "=r" (data));
-	return (data);
-}
-
-static __inline u_int
 read_eflags(void)
 {
 	u_int	ef;
@@ -426,6 +361,86 @@ wrmsr(u_int msr, u_int64_t newval)
 	__asm __volatile("wrmsr" : : "A" (newval), "c" (msr));
 }
 
+static __inline void
+load_cr0(u_int data)
+{
+
+	__asm __volatile("movl %0,%%cr0" : : "r" (data));
+}
+
+static __inline u_int
+rcr0(void)
+{
+	u_int	data;
+
+	__asm __volatile("movl %%cr0,%0" : "=r" (data));
+	return (data);
+}
+
+static __inline u_int
+rcr2(void)
+{
+	u_int	data;
+
+	__asm __volatile("movl %%cr2,%0" : "=r" (data));
+	return (data);
+}
+
+static __inline void
+load_cr3(u_int data)
+{
+
+	__asm __volatile("movl %0,%%cr3" : : "r" (data) : "memory");
+#if defined(SWTCH_OPTIM_STATS)
+	++tlb_flush_count;
+#endif
+}
+
+static __inline u_int
+rcr3(void)
+{
+	u_int	data;
+
+	__asm __volatile("movl %%cr3,%0" : "=r" (data));
+	return (data);
+}
+
+static __inline void
+load_cr4(u_int data)
+{
+	__asm __volatile("movl %0,%%cr4" : : "r" (data));
+}
+
+static __inline u_int
+rcr4(void)
+{
+	u_int	data;
+
+	__asm __volatile("movl %%cr4,%0" : "=r" (data));
+	return (data);
+}
+
+/*
+ * Global TLB flush (except for entries for pages marked PG_G).
+ */
+static __inline void
+invltlb(void)
+{
+
+	load_cr3(rcr3());
+}
+
+/*
+ * TLB flush for an individual page (even if it has PG_G).
+ * Only works on 486+ CPUs (i386 does not have PG_G).
+ */
+static __inline void
+invlpg(u_int addr)
+{
+
+	__asm __volatile("invlpg %0" : : "m" (*(char *)addr) : "memory");
+}
+
 static __inline u_int
 rfs(void)
 {
@@ -587,6 +602,8 @@ intr_restore(register_t eflags)
 int	breakpoint(void);
 u_int	bsfl(u_int mask);
 u_int	bsrl(u_int mask);
+void	cpu_invlpg(u_int addr);
+void	cpu_invlpg_range(u_int start, u_int end);
 void	disable_intr(void);
 void	do_cpuid(u_int ax, u_int *p);
 void	enable_intr(void);
@@ -597,8 +614,14 @@ void	insl(u_int port, void *addr, size_t cnt);
 void	insw(u_int port, void *addr, size_t cnt);
 void	invd(void);
 void	invlpg(u_int addr);
+void	invlpg_range(u_int start, u_int end);
 void	invltlb(void);
 u_short	inw(u_int port);
+void	load_cr0(u_int cr0);
+void	load_cr3(u_int cr3);
+void	load_cr4(u_int cr4);
+void	load_fs(u_int sel);
+void	load_gs(u_int sel);
 void	outb(u_int port, u_char data);
 void	outl(u_int port, u_int data);
 void	outsb(u_int port, void *addr, size_t cnt);
@@ -606,7 +629,12 @@ void	outsl(u_int port, void *addr, size_t cnt);
 void	outsw(u_int port, void *addr, size_t cnt);
 void	outw(u_int port, u_short data);
 void	ia32_pause(void);
+u_int	rcr0(void);
 u_int	rcr2(void);
+u_int	rcr3(void);
+u_int	rcr4(void);
+u_int	rfs(void);
+u_int	rgs(void);
 u_int64_t	rdmsr(u_int msr);
 u_int64_t	rdpmc(u_int pmc);
 u_int64_t	rdtsc(void);
@@ -614,10 +642,6 @@ u_int	read_eflags(void);
 void	wbinvd(void);
 void	write_eflags(u_int ef);
 void	wrmsr(u_int msr, u_int64_t newval);
-u_int	rfs(void);
-u_int	rgs(void);
-void	load_fs(u_int sel);
-void	load_gs(u_int sel);
 u_int	rdr0(void);
 void	load_dr0(u_int dr0);
 u_int	rdr1(void);
@@ -639,13 +663,7 @@ void	intr_restore(register_t ef);
 
 #endif /* __GNUC__ */
 
-void	load_cr0(u_int cr0);
-void	load_cr3(u_int cr3);
-void	load_cr4(u_int cr4);
 void	ltr(u_short sel);
-u_int	rcr0(void);
-u_int	rcr3(void);
-u_int	rcr4(void);
 void	reset_dbregs(void);
 
 __END_DECLS

diff --git a/sys/i386/include/mptable.h b/sys/i386/include/mptable.h
index 63fec0e..29e9c6e 100644
--- a/sys/i386/include/mptable.h
+++ b/sys/i386/include/mptable.h
@@ -288,6 +288,14 @@ extern pt_entry_t *SMPpt;
 
 struct pcb stoppcbs[MAXCPU];
 
+#ifdef APIC_IO
+/* Variables needed for SMP tlb shootdown. */
+vm_offset_t smp_tlb_addr1;
+vm_offset_t smp_tlb_addr2;
+volatile int smp_tlb_wait;
+static struct mtx smp_tlb_mtx;
+#endif
+
 /*
  * Local data and functions.
  */
@@ -336,6 +344,9 @@ init_locks(void)
 #ifdef USE_COMLOCK
 	mtx_init(&com_mtx, "com", NULL, MTX_SPIN);
 #endif /* USE_COMLOCK */
+#ifdef APIC_IO
+	mtx_init(&smp_tlb_mtx, "tlb", NULL, MTX_SPIN);
+#endif
 }
 
 /*
@@ -605,6 +616,10 @@ mp_enable(u_int boot_addr)
 	/* install an inter-CPU IPI for TLB invalidation */
 	setidt(XINVLTLB_OFFSET, Xinvltlb,
 	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
+	setidt(XINVLPG_OFFSET, Xinvlpg,
+	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
+	setidt(XINVLRNG_OFFSET, Xinvlrng,
+	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
 
 	/* install an inter-CPU IPI for forwarding hardclock() */
 	setidt(XHARDCLOCK_OFFSET, Xhardclock,
@@ -2190,48 +2205,237 @@ start_ap(int logical_cpu, u_int boot_addr)
 	return 0;		/* return FAILURE */
 }
 
-#if defined(APIC_IO) && defined(COUNT_XINVLTLB_HITS)
-u_int xhits[MAXCPU];
-SYSCTL_OPAQUE(_debug, OID_AUTO, xhits, CTLFLAG_RW, &xhits, sizeof(xhits),
-    "IU", "");
+#if defined(APIC_IO)
+
+#ifdef COUNT_XINVLTLB_HITS
+u_int xhits_gbl[MAXCPU];
+u_int xhits_pg[MAXCPU];
+u_int xhits_rng[MAXCPU];
+SYSCTL_NODE(_debug, OID_AUTO, xhits, CTLFLAG_RW, 0, "");
+SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, global, CTLFLAG_RW, &xhits_gbl,
+    sizeof(xhits_gbl), "IU", "");
+SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, page, CTLFLAG_RW, &xhits_pg,
+    sizeof(xhits_pg), "IU", "");
+SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, range, CTLFLAG_RW, &xhits_rng,
+    sizeof(xhits_rng), "IU", "");
+
+u_int ipi_global;
+u_int ipi_page;
+u_int ipi_range;
+u_int ipi_range_size;
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_global, CTLFLAG_RW, &ipi_global, 0, "");
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_page, CTLFLAG_RW, &ipi_page, 0, "");
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range, CTLFLAG_RW, &ipi_range, 0, "");
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range_size, CTLFLAG_RW, &ipi_range_size,
+    0, "");
+
+u_int ipi_masked_global;
+u_int ipi_masked_page;
+u_int ipi_masked_range;
+u_int ipi_masked_range_size;
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_global, CTLFLAG_RW,
+    &ipi_masked_global, 0, "");
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_page, CTLFLAG_RW,
+    &ipi_masked_page, 0, "");
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_range, CTLFLAG_RW,
+    &ipi_masked_range, 0, "");
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_range_size, CTLFLAG_RW,
+    &ipi_masked_range_size, 0, "");
+#endif
 
 /*
  * Flush the TLB on all other CPU's
+ */
+static void
+smp_tlb_shootdown(u_int vector, vm_offset_t addr1, vm_offset_t addr2)
+{
+	u_int ncpu;
+	register_t eflags;
+
+	ncpu = mp_ncpus - 1;	/* does not shootdown self */
+	if (ncpu < 1)
+		return;		/* no other cpus */
+	eflags = read_eflags();
+	if ((eflags & PSL_I) == 0)
+		panic("absolutely cannot call smp_ipi_shootdown with interrupts already disabled");
+	mtx_lock_spin(&smp_tlb_mtx);
+	smp_tlb_addr1 = addr1;
+	smp_tlb_addr2 = addr2;
+	atomic_store_rel_int(&smp_tlb_wait, 0);
+	ipi_all_but_self(vector);
+	while (smp_tlb_wait < ncpu)
+		ia32_pause();
+	mtx_unlock_spin(&smp_tlb_mtx);
+}
+
+/*
+ * This is about as magic as it gets.  fortune(1) has got similar code
+ * for reversing bits in a word.  Who thinks up this stuff??
+ *
+ * Yes, it does appear to be consistently faster than:
+ *	while (i = ffs(m)) {
+ *		m >>= i;
+ *		bits++;
+ *	}
+ * and
+ *	while (lsb = (m & -m)) {	// This is magic too
+ *		m &= ~lsb;		// or: m ^= lsb
+ *		bits++;
+ *	}
+ * Both of these latter forms do some very strange things on gcc-3.1 with
+ * -mcpu=pentiumpro and/or -march=pentiumpro and/or -O or -O2.
+ * There is probably an SSE or MMX popcnt instruction.
  *
- * XXX: Needs to handshake and wait for completion before proceding.
+ * I wonder if this should be in libkern?
+ *
+ * XXX Stop the presses!  Another one:
+ *	static __inline u_int32_t
+ *	popcnt1(u_int32_t v)
+ *	{
+ *		v -= ((v >> 1) & 0x55555555);
+ *		v = (v & 0x33333333) + ((v >> 2) & 0x33333333);
+ *		v = (v + (v >> 4)) & 0x0F0F0F0F;
+ *		return (v * 0x01010101) >> 24;
+ *	}
+ * The downside is that it has a multiply.  With a pentium3 with
+ * -mcpu=pentiumpro and -march=pentiumpro then gcc-3.1 will use
+ * an imull, and in that case it is faster.  In most other cases
+ * it appears slightly slower.
  */
+static __inline u_int32_t
+popcnt(u_int32_t m)
+{
+
+	m = (m & 0x55555555) + ((m & 0xaaaaaaaa) >> 1);
+	m = (m & 0x33333333) + ((m & 0xcccccccc) >> 2);
+	m = (m & 0x0f0f0f0f) + ((m & 0xf0f0f0f0) >> 4);
+	m = (m & 0x00ff00ff) + ((m & 0xff00ff00) >> 8);
+	m = (m & 0x0000ffff) + ((m & 0xffff0000) >> 16);
+	return m;
+}
+
+static void
+smp_targeted_tlb_shootdown(u_int mask, u_int vector, vm_offset_t addr1, vm_offset_t addr2)
+{
+	int ncpu, othercpus;
+	register_t eflags;
+
+	othercpus = mp_ncpus - 1;
+	if (mask == (u_int)-1) {
+		ncpu = othercpus;
+		if (ncpu < 1)
+			return;
+	} else {
+		/* XXX there should be a pcpu self mask */
+		mask &= ~(1 << PCPU_GET(cpuid));
+		if (mask == 0)
+			return;
+		ncpu = popcnt(mask);
+		if (ncpu > othercpus) {
+			/* XXX this should be a panic offence */
+			printf("SMP: tlb shootdown to %d other cpus (only have %d)\n",
+			    ncpu, othercpus);
+			ncpu = othercpus;
+		}
+		/* XXX should be a panic, implied by mask == 0 above */
+		if (ncpu < 1)
+			return;
+	}
+	eflags = read_eflags();
+	if ((eflags & PSL_I) == 0)
+		panic("absolutely cannot call smp_targeted_ipi_shootdown with interrupts already disabled");
+	mtx_lock_spin(&smp_tlb_mtx);
+	smp_tlb_addr1 = addr1;
+	smp_tlb_addr2 = addr2;
+	atomic_store_rel_int(&smp_tlb_wait, 0);
+	if (mask == (u_int)-1)
+		ipi_all_but_self(vector);
+	else
+		ipi_selected(mask, vector);
+	while (smp_tlb_wait < ncpu)
+		ia32_pause();
+	mtx_unlock_spin(&smp_tlb_mtx);
+}
+#endif
+
 void
 smp_invltlb(void)
 {
 #if defined(APIC_IO)
-	if (smp_started)
-		ipi_all_but_self(IPI_INVLTLB);
+	if (smp_started) {
+		smp_tlb_shootdown(IPI_INVLTLB, 0, 0);
+#ifdef COUNT_XINVLTLB_HITS
+		ipi_global++;
+#endif
+	}
 #endif  /* APIC_IO */
 }
 
 void
-invlpg(u_int addr)
+smp_invlpg(vm_offset_t addr)
 {
-	__asm __volatile("invlpg (%0)"::"r"(addr):"memory");
+#if defined(APIC_IO)
+	if (smp_started) {
+		smp_tlb_shootdown(IPI_INVLPG, addr, 0);
+#ifdef COUNT_XINVLTLB_HITS
+		ipi_page++;
+#endif
+	}
+#endif  /* APIC_IO */
+}
 
-	/* send a message to the other CPUs */
-	smp_invltlb();
+void
+smp_invlpg_range(vm_offset_t addr1, vm_offset_t addr2)
+{
+#if defined(APIC_IO)
+	if (smp_started) {
+		smp_tlb_shootdown(IPI_INVLRNG, addr1, addr2);
+#ifdef COUNT_XINVLTLB_HITS
+		ipi_range++;
+		ipi_range_size += (addr2 - addr1) / PAGE_SIZE;
+#endif
+	}
+#endif  /* APIC_IO */
 }
 
 void
-invltlb(void)
+smp_masked_invltlb(u_int mask)
 {
-	u_long  temp;
+#if defined(APIC_IO)
+	if (smp_started) {
+		smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, 0, 0);
+#ifdef COUNT_XINVLTLB_HITS
+		ipi_masked_global++;
+#endif
+	}
+#endif  /* APIC_IO */
+}
 
-	/*
-	 * This should be implemented as load_cr3(rcr3()) when load_cr3() is
-	 * inlined.
-	 */
-	__asm __volatile("movl %%cr3, %0; movl %0, %%cr3":"=r"(temp) :: "memory");
+void
+smp_masked_invlpg(u_int mask, vm_offset_t addr)
+{
+#if defined(APIC_IO)
+	if (smp_started) {
+		smp_targeted_tlb_shootdown(mask, IPI_INVLPG, addr, 0);
+#ifdef COUNT_XINVLTLB_HITS
+		ipi_masked_page++;
+#endif
+	}
+#endif  /* APIC_IO */
+}
 
-	/* send a message to the other CPUs */
-	smp_invltlb();
+void
+smp_masked_invlpg_range(u_int mask, vm_offset_t addr1, vm_offset_t addr2)
+{
+#if defined(APIC_IO)
+	if (smp_started) {
+		smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, addr1, addr2);
+#ifdef COUNT_XINVLTLB_HITS
+		ipi_masked_range++;
+		ipi_masked_range_size += (addr2 - addr1) / PAGE_SIZE;
+#endif
+	}
+#endif  /* APIC_IO */
 }
 
 
@@ -2251,7 +2455,7 @@ ap_init(void)
 		/* spin */ ;
 
 	/* BSP may have changed PTD while we were waiting */
-	cpu_invltlb();
+	invltlb();
 
 #if defined(I586_CPU) && !defined(NO_F00F_HACK)
 	lidt(&r_idt);
@@ -2290,6 +2494,9 @@ ap_init(void)
 	/* Build our map of 'other' CPUs. */
 	PCPU_SET(other_cpus, all_cpus & ~PCPU_GET(cpumask));
 
+	if (bootverbose)
+		apic_dump("ap_init()");
+
 	printf("SMP: AP CPU #%d Launched!\n", PCPU_GET(cpuid));
 
 	if (smp_cpus == mp_ncpus) {
@@ -2325,7 +2532,8 @@ forwarded_statclock(struct trapframe frame)
 {
 
 	mtx_lock_spin(&sched_lock);
-	statclock_process(curthread->td_kse, TRAPF_PC(&frame), TRAPF_USERMODE(&frame));
+	statclock_process(curthread->td_kse, TRAPF_PC(&frame),
+	    TRAPF_USERMODE(&frame));
 	mtx_unlock_spin(&sched_lock);
 }
 
diff --git a/sys/i386/include/pmap.h b/sys/i386/include/pmap.h
index e6ac669..e0789fc 100644
--- a/sys/i386/include/pmap.h
+++ b/sys/i386/include/pmap.h
@@ -151,7 +151,7 @@ extern pt_entry_t PTmap[], APTmap[];
 extern pd_entry_t PTD[], APTD[];
 extern pd_entry_t PTDpde, APTDpde;
 
-extern pd_entry_t IdlePTD;	/* physical address of "Idle" state directory */
+extern pd_entry_t *IdlePTD;	/* physical address of "Idle" state directory */
 #endif
 
 #ifdef _KERNEL
@@ -253,14 +253,15 @@ extern char *ptvmmap;		/* poor name! */
 extern vm_offset_t virtual_avail;
 extern vm_offset_t virtual_end;
 
-void	pmap_bootstrap( vm_offset_t, vm_offset_t);
+void	pmap_bootstrap(vm_offset_t, vm_offset_t);
 void	*pmap_mapdev(vm_offset_t, vm_size_t);
 void	pmap_unmapdev(vm_offset_t, vm_size_t);
 pt_entry_t *pmap_pte(pmap_t, vm_offset_t) __pure2;
 vm_page_t pmap_use_pt(pmap_t, vm_offset_t);
-#ifdef SMP
 void	pmap_set_opt(void);
-#endif
+void	pmap_invalidate_page(pmap_t, vm_offset_t);
+void	pmap_invalidate_range(pmap_t, vm_offset_t, vm_offset_t);
+void	pmap_invalidate_all(pmap_t);
 
 #endif /* _KERNEL */
 
diff --git a/sys/i386/include/smp.h b/sys/i386/include/smp.h
index 872c5ec..d669c51 100644
--- a/sys/i386/include/smp.h
+++ b/sys/i386/include/smp.h
@@ -51,6 +51,8 @@ extern int current_postcode;	/** XXX currently in mp_machdep.c */
  * Interprocessor interrupts for SMP.
  */
 #define IPI_INVLTLB		XINVLTLB_OFFSET
+#define IPI_INVLPG		XINVLPG_OFFSET
+#define IPI_INVLRNG		XINVLRNG_OFFSET
 #define IPI_RENDEZVOUS		XRENDEZVOUS_OFFSET
 #define IPI_AST			XCPUAST_OFFSET
 #define IPI_STOP		XCPUSTOP_OFFSET
@@ -107,7 +109,6 @@ void	assign_apic_irq(int apic, int intpin, int irq);
 void	revoke_apic_irq(int irq);
 void	bsp_apic_configure(void);
 void	init_secondary(void);
-void	smp_invltlb(void);
 void	forward_statclock(void);
 void	forwarded_statclock(struct trapframe frame);
 void	forward_hardclock(void);
@@ -119,6 +120,13 @@ void	ipi_self(u_int ipi);
 #ifdef APIC_INTR_REORDER
 void	set_lapic_isrloc(int, int);
 #endif /* APIC_INTR_REORDER */
+void	smp_invlpg(vm_offset_t addr);
+void	smp_masked_invlpg(u_int mask, vm_offset_t addr);
+void	smp_invlpg_range(vm_offset_t startva, vm_offset_t endva);
+void	smp_masked_invlpg_range(u_int mask, vm_offset_t startva,
+	    vm_offset_t endva);
+void	smp_invltlb(void);
+void	smp_masked_invltlb(u_int mask);
 
 /* global data in mpapic.c */
 extern volatile lapic_t lapic;
-- 
cgit v1.1
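
For context, the consumers of these new primitives live in sys/i386/i386/pmap.c,
which is outside this filtered view.  The sketch below is illustrative only, not
the committed code: it assumes an SMP (APIC_IO) kernel and that pm_active is the
bitmask of cpus on which the pmap is currently active, as described in the
highlights above.  It shows the intended call pattern: flush the local TLB only
when this cpu has the pmap active, and send a single ranged IPI to just the
other cpus that do, instead of one global IPI per page.

/*
 * Hypothetical sketch of pmap_invalidate_range() -- the real version is
 * in sys/i386/i386/pmap.c.  Assumes pm_active is a cpu bitmask.
 */
void
pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
	vm_offset_t addr;
	u_int other_cpus;

	if (pmap->pm_active == all_cpus) {
		/* Active everywhere: flush locally, one ranged IPI for the rest. */
		for (addr = sva; addr < eva; addr += PAGE_SIZE)
			invlpg(addr);
		smp_invlpg_range(sva, eva);
	} else {
		/* Flush the local TLB only if this cpu has the pmap active. */
		if (pmap->pm_active & PCPU_GET(cpumask))
			for (addr = sva; addr < eva; addr += PAGE_SIZE)
				invlpg(addr);
		/* Shoot down only the other cpus that have it active. */
		other_cpus = pmap->pm_active & PCPU_GET(other_cpus);
		if (other_cpus != 0)
			smp_masked_invlpg_range(other_cpus, sva, eva);
	}
}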