From 4d88d6566a61c3b7598a583389954ccba701acb4 Mon Sep 17 00:00:00 2001
From: peter
Date: Fri, 12 Jul 2002 07:56:11 +0000
Subject: Revive backed out pmap related changes from Feb 2002.

The highlights are:
- It actually works this time, honest!
- Fine-grained TLB shootdowns for SMP on i386.  IPIs are very expensive,
  so try to optimize things where possible.
- Introduce ranged shootdowns that can be done as a single IPI.
- PG_G support for i386.
- Specific-cpu targeted shootdowns.  For example, there is no sense in
  globally purging the TLB cache when we are stealing a page from a
  local, unshared process on the local cpu.  Use pm_active to track this.
- Add some instrumentation for the TLB shootdown code.
- Rip out SMP code from <machine/cpufunc.h>.
- Try to fix some very bogus PG_G and PG_PS interactions that were bad
  enough to cause vm86 bios calls to break.  vm86 depended on our
  existing bugs and this was the cause of the VESA panics last time.
- Fix the silly one-line error that caused the 'panic: bad pte' last time.
- Fix a couple of other silly one-line errors that should have caused
  more pain than they did.

Some more work is needed:
- pmap_{zero,copy}_page[_idle].  These can be done without IPIs if we
  have a hook in cpu_switch.
- The IPI handlers need some cleanup.  I have a bogus %ds load that can
  be avoided.
- APTD handling is rather bogus and appears to be a large source of
  global TLB IPI shootdowns for no really good reason.

I see speedups of between 1.5% and ~4% on buildworlds in a while-1 loop.
I expect to see a bigger difference when there is significant pageout
activity or the system otherwise has memory shortages.

I have backed out a few optimizations that I had been using over the
last few days in order to be a little more conservative.  I'll revisit
these again over the next few days as the dust settles.

New option: DISABLE_PG_G - in case I missed something.
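The new shootdown path is split between C and assembly: the initiator
(smp_tlb_shootdown() / smp_targeted_tlb_shootdown() in mp_machdep.c)
publishes smp_tlb_addr1/smp_tlb_addr2, resets smp_tlb_wait, sends the IPI
and spins until every targeted cpu has acknowledged, while the receivers
are the Xinvltlb/Xinvlpg/Xinvlrng handlers added to apic_vector.s, which
do the invalidation and bump smp_tlb_wait.  For reference, below is a
rough C sketch of what the new ranged handler (Xinvlrng) does; the
function name is illustrative only, the real handler is the assembly in
this patch, and the segment-register and stack handling is omitted.

	/*
	 * Illustrative C rendering of the Xinvlrng range-shootdown IPI
	 * handler (a sketch, not the committed code).
	 */
	static void
	xinvlrng_sketch(void)
	{
		vm_offset_t va;

		/* Invalidate each page in [smp_tlb_addr1, smp_tlb_addr2). */
		for (va = smp_tlb_addr1; va < smp_tlb_addr2; va += PAGE_SIZE)
			invlpg(va);

		/* EOI the local APIC; the assembly does: movl $0, lapic+LA_EOI */

		/* Acknowledge, so the initiating cpu can stop spinning. */
		atomic_add_int((volatile u_int *)&smp_tlb_wait, 1);	/* "lock; incl" */
	}

The spin-and-count handshake (rather than a fire-and-forget IPI) is what
lets the pmap layer assume the shootdown has completed on all cpus before
it reuses or frees the underlying page.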
--- sys/amd64/amd64/apic_vector.S | 87 ++++++- sys/amd64/amd64/bios.c | 15 +- sys/amd64/amd64/db_interface.c | 4 +- sys/amd64/amd64/locore.S | 24 +- sys/amd64/amd64/locore.s | 24 +- sys/amd64/amd64/mp_machdep.c | 252 ++++++++++++++++++-- sys/amd64/amd64/mptable.c | 252 ++++++++++++++++++-- sys/amd64/amd64/pmap.c | 510 +++++++++++++++++++++-------------------- sys/amd64/amd64/support.S | 36 --- sys/amd64/amd64/support.s | 36 --- sys/amd64/include/cpufunc.h | 168 ++++++++------ sys/amd64/include/mptable.h | 252 ++++++++++++++++++-- sys/amd64/include/pmap.h | 9 +- sys/amd64/include/smp.h | 10 +- sys/amd64/isa/intr_machdep.h | 20 +- sys/conf/options.i386 | 3 +- sys/conf/options.pc98 | 3 +- sys/i386/i386/apic_vector.s | 87 ++++++- sys/i386/i386/bios.c | 15 +- sys/i386/i386/db_interface.c | 4 +- sys/i386/i386/locore.s | 24 +- sys/i386/i386/mp_machdep.c | 252 ++++++++++++++++++-- sys/i386/i386/mpapic.c | 3 - sys/i386/i386/mptable.c | 252 ++++++++++++++++++-- sys/i386/i386/pmap.c | 510 +++++++++++++++++++++-------------------- sys/i386/i386/support.s | 36 --- sys/i386/i386/vm86.c | 2 + sys/i386/include/cpufunc.h | 168 ++++++++------ sys/i386/include/mptable.h | 252 ++++++++++++++++++-- sys/i386/include/pmap.h | 9 +- sys/i386/include/smp.h | 10 +- sys/i386/isa/apic_vector.s | 87 ++++++- sys/i386/isa/intr_machdep.h | 20 +- sys/kern/subr_witness.c | 3 + 34 files changed, 2426 insertions(+), 1013 deletions(-) diff --git a/sys/amd64/amd64/apic_vector.S b/sys/amd64/amd64/apic_vector.S index 8490b1b..569ed50 100644 --- a/sys/amd64/amd64/apic_vector.S +++ b/sys/amd64/amd64/apic_vector.S @@ -260,30 +260,107 @@ Xspuriousint: iret /* - * Handle TLB shootdowns. + * Global address space TLB shootdown. */ .text SUPERALIGN_TEXT .globl Xinvltlb Xinvltlb: pushl %eax + pushl %ds + movl $KDSEL, %eax /* Kernel data selector */ + mov %ax, %ds #ifdef COUNT_XINVLTLB_HITS pushl %fs - movl $KPSEL, %eax + movl $KPSEL, %eax /* Private space selector */ mov %ax, %fs movl PCPU(CPUID), %eax popl %fs - ss - incl xhits(,%eax,4) + incl xhits_gbl(,%eax,4) #endif /* COUNT_XINVLTLB_HITS */ movl %cr3, %eax /* invalidate the TLB */ movl %eax, %cr3 - ss /* stack segment, avoid %ds load */ movl $0, lapic+LA_EOI /* End Of Interrupt to APIC */ + lock + incl smp_tlb_wait + + popl %ds + popl %eax + iret + +/* + * Single page TLB shootdown + */ + .text + SUPERALIGN_TEXT + .globl Xinvlpg +Xinvlpg: + pushl %eax + pushl %ds + movl $KDSEL, %eax /* Kernel data selector */ + mov %ax, %ds + +#ifdef COUNT_XINVLTLB_HITS + pushl %fs + movl $KPSEL, %eax /* Private space selector */ + mov %ax, %fs + movl PCPU(CPUID), %eax + popl %fs + incl xhits_pg(,%eax,4) +#endif /* COUNT_XINVLTLB_HITS */ + + movl smp_tlb_addr1, %eax + invlpg (%eax) /* invalidate single page */ + + movl $0, lapic+LA_EOI /* End Of Interrupt to APIC */ + + lock + incl smp_tlb_wait + + popl %ds + popl %eax + iret + +/* + * Page range TLB shootdown. 
+ */ + .text + SUPERALIGN_TEXT + .globl Xinvlrng +Xinvlrng: + pushl %eax + pushl %edx + pushl %ds + movl $KDSEL, %eax /* Kernel data selector */ + mov %ax, %ds + +#ifdef COUNT_XINVLTLB_HITS + pushl %fs + movl $KPSEL, %eax /* Private space selector */ + mov %ax, %fs + movl PCPU(CPUID), %eax + popl %fs + incl xhits_rng(,%eax,4) +#endif /* COUNT_XINVLTLB_HITS */ + + movl smp_tlb_addr1, %edx + movl smp_tlb_addr2, %eax +1: invlpg (%edx) /* invalidate single page */ + addl $PAGE_SIZE, %edx + cmpl %edx, %eax + jb 1b + + movl $0, lapic+LA_EOI /* End Of Interrupt to APIC */ + + lock + incl smp_tlb_wait + + popl %ds + popl %edx popl %eax iret diff --git a/sys/amd64/amd64/bios.c b/sys/amd64/amd64/bios.c index 0312adf..6e0837c 100644 --- a/sys/amd64/amd64/bios.c +++ b/sys/amd64/amd64/bios.c @@ -323,7 +323,8 @@ bios16(struct bios_args *args, char *fmt, ...) va_list ap; int flags = BIOSCODE_FLAG | BIOSDATA_FLAG; u_int i, arg_start, arg_end; - u_int *pte, *ptd; + pt_entry_t *pte; + pd_entry_t *ptd; arg_start = 0xffffffff; arg_end = 0; @@ -382,19 +383,19 @@ bios16(struct bios_args *args, char *fmt, ...) args->seg.code32.base = (u_int)&bios16_jmp & PG_FRAME; args->seg.code32.limit = 0xffff; - ptd = (u_int *)rcr3(); + ptd = (pd_entry_t *)rcr3(); if (ptd == (u_int *)IdlePTD) { /* * no page table, so create one and install it. */ - pte = (u_int *)malloc(PAGE_SIZE, M_TEMP, M_WAITOK); - ptd = (u_int *)((u_int)ptd + KERNBASE); + pte = (pt_entry_t *)malloc(PAGE_SIZE, M_TEMP, M_WAITOK); + ptd = (pd_entry_t *)((u_int)ptd + KERNBASE); *ptd = vtophys(pte) | PG_RW | PG_V; } else { /* * this is a user-level page table */ - pte = (u_int *)&PTmap; + pte = PTmap; } /* * install pointer to page 0. we don't need to flush the tlb, @@ -451,7 +452,7 @@ bios16(struct bios_args *args, char *fmt, ...) i = bios16_call(&args->r, stack_top); - if (pte == (u_int *)&PTmap) { + if (pte == PTmap) { *pte = 0; /* remove entry */ } else { *ptd = 0; /* remove page table */ @@ -461,7 +462,7 @@ bios16(struct bios_args *args, char *fmt, ...) 
/* * XXX only needs to be invlpg(0) but that doesn't work on the 386 */ - invltlb(); + pmap_invalidate_all(kernel_pmap); return (i); } diff --git a/sys/amd64/amd64/db_interface.c b/sys/amd64/amd64/db_interface.c index 2ba81da..ec32a58 100644 --- a/sys/amd64/amd64/db_interface.c +++ b/sys/amd64/amd64/db_interface.c @@ -276,7 +276,7 @@ db_write_bytes(addr, size, data) } } - invltlb(); + pmap_invalidate_all(kernel_pmap); } dst = (char *)addr; @@ -292,7 +292,7 @@ db_write_bytes(addr, size, data) if (ptep1) *ptep1 = oldmap1; - invltlb(); + pmap_invalidate_all(kernel_pmap); } } diff --git a/sys/amd64/amd64/locore.S b/sys/amd64/amd64/locore.S index d06065d..94a3a10 100644 --- a/sys/amd64/amd64/locore.S +++ b/sys/amd64/amd64/locore.S @@ -127,6 +127,7 @@ HIDENAME(tmpstk): .globl bootinfo bootinfo: .space BOOTINFO_SIZE /* bootinfo that we can handle */ + .globl KERNend KERNend: .long 0 /* phys addr end of kernel (just after bss) */ physfree: .long 0 /* phys addr of next free page */ @@ -381,12 +382,6 @@ begin: movl IdlePTD,%esi movl %esi,(KSTACK_PAGES*PAGE_SIZE-PCB_SIZE+PCB_CR3)(%eax) - testl $CPUID_PGE, R(cpu_feature) - jz 1f - movl %cr4, %eax - orl $CR4_PGE, %eax - movl %eax, %cr4 -1: pushl physfree /* value of first for init386(first) */ call init386 /* wire 386 chip for unix operation */ @@ -809,14 +804,7 @@ no_kernend: jne map_read_write #endif xorl %edx,%edx - -#if !defined(SMP) - testl $CPUID_PGE, R(cpu_feature) - jz 2f - orl $PG_G,%edx -#endif - -2: movl $R(etext),%ecx + movl $R(etext),%ecx addl $PAGE_MASK,%ecx shrl $PAGE_SHIFT,%ecx fillkptphys(%edx) @@ -827,13 +815,7 @@ no_kernend: andl $~PAGE_MASK, %eax map_read_write: movl $PG_RW,%edx -#if !defined(SMP) - testl $CPUID_PGE, R(cpu_feature) - jz 1f - orl $PG_G,%edx -#endif - -1: movl R(KERNend),%ecx + movl R(KERNend),%ecx subl %eax,%ecx shrl $PAGE_SHIFT,%ecx fillkptphys(%edx) diff --git a/sys/amd64/amd64/locore.s b/sys/amd64/amd64/locore.s index d06065d..94a3a10 100644 --- a/sys/amd64/amd64/locore.s +++ b/sys/amd64/amd64/locore.s @@ -127,6 +127,7 @@ HIDENAME(tmpstk): .globl bootinfo bootinfo: .space BOOTINFO_SIZE /* bootinfo that we can handle */ + .globl KERNend KERNend: .long 0 /* phys addr end of kernel (just after bss) */ physfree: .long 0 /* phys addr of next free page */ @@ -381,12 +382,6 @@ begin: movl IdlePTD,%esi movl %esi,(KSTACK_PAGES*PAGE_SIZE-PCB_SIZE+PCB_CR3)(%eax) - testl $CPUID_PGE, R(cpu_feature) - jz 1f - movl %cr4, %eax - orl $CR4_PGE, %eax - movl %eax, %cr4 -1: pushl physfree /* value of first for init386(first) */ call init386 /* wire 386 chip for unix operation */ @@ -809,14 +804,7 @@ no_kernend: jne map_read_write #endif xorl %edx,%edx - -#if !defined(SMP) - testl $CPUID_PGE, R(cpu_feature) - jz 2f - orl $PG_G,%edx -#endif - -2: movl $R(etext),%ecx + movl $R(etext),%ecx addl $PAGE_MASK,%ecx shrl $PAGE_SHIFT,%ecx fillkptphys(%edx) @@ -827,13 +815,7 @@ no_kernend: andl $~PAGE_MASK, %eax map_read_write: movl $PG_RW,%edx -#if !defined(SMP) - testl $CPUID_PGE, R(cpu_feature) - jz 1f - orl $PG_G,%edx -#endif - -1: movl R(KERNend),%ecx + movl R(KERNend),%ecx subl %eax,%ecx shrl $PAGE_SHIFT,%ecx fillkptphys(%edx) diff --git a/sys/amd64/amd64/mp_machdep.c b/sys/amd64/amd64/mp_machdep.c index 63fec0e..29e9c6e 100644 --- a/sys/amd64/amd64/mp_machdep.c +++ b/sys/amd64/amd64/mp_machdep.c @@ -288,6 +288,14 @@ extern pt_entry_t *SMPpt; struct pcb stoppcbs[MAXCPU]; +#ifdef APIC_IO +/* Variables needed for SMP tlb shootdown. 
*/ +vm_offset_t smp_tlb_addr1; +vm_offset_t smp_tlb_addr2; +volatile int smp_tlb_wait; +static struct mtx smp_tlb_mtx; +#endif + /* * Local data and functions. */ @@ -336,6 +344,9 @@ init_locks(void) #ifdef USE_COMLOCK mtx_init(&com_mtx, "com", NULL, MTX_SPIN); #endif /* USE_COMLOCK */ +#ifdef APIC_IO + mtx_init(&smp_tlb_mtx, "tlb", NULL, MTX_SPIN); +#endif } /* @@ -605,6 +616,10 @@ mp_enable(u_int boot_addr) /* install an inter-CPU IPI for TLB invalidation */ setidt(XINVLTLB_OFFSET, Xinvltlb, SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); + setidt(XINVLPG_OFFSET, Xinvlpg, + SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); + setidt(XINVLRNG_OFFSET, Xinvlrng, + SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); /* install an inter-CPU IPI for forwarding hardclock() */ setidt(XHARDCLOCK_OFFSET, Xhardclock, @@ -2190,48 +2205,237 @@ start_ap(int logical_cpu, u_int boot_addr) return 0; /* return FAILURE */ } -#if defined(APIC_IO) && defined(COUNT_XINVLTLB_HITS) -u_int xhits[MAXCPU]; -SYSCTL_OPAQUE(_debug, OID_AUTO, xhits, CTLFLAG_RW, &xhits, sizeof(xhits), - "IU", ""); +#if defined(APIC_IO) + +#ifdef COUNT_XINVLTLB_HITS +u_int xhits_gbl[MAXCPU]; +u_int xhits_pg[MAXCPU]; +u_int xhits_rng[MAXCPU]; +SYSCTL_NODE(_debug, OID_AUTO, xhits, CTLFLAG_RW, 0, ""); +SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, global, CTLFLAG_RW, &xhits_gbl, + sizeof(xhits_gbl), "IU", ""); +SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, page, CTLFLAG_RW, &xhits_pg, + sizeof(xhits_pg), "IU", ""); +SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, range, CTLFLAG_RW, &xhits_rng, + sizeof(xhits_rng), "IU", ""); + +u_int ipi_global; +u_int ipi_page; +u_int ipi_range; +u_int ipi_range_size; +SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_global, CTLFLAG_RW, &ipi_global, 0, ""); +SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_page, CTLFLAG_RW, &ipi_page, 0, ""); +SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range, CTLFLAG_RW, &ipi_range, 0, ""); +SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range_size, CTLFLAG_RW, &ipi_range_size, + 0, ""); + +u_int ipi_masked_global; +u_int ipi_masked_page; +u_int ipi_masked_range; +u_int ipi_masked_range_size; +SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_global, CTLFLAG_RW, + &ipi_masked_global, 0, ""); +SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_page, CTLFLAG_RW, + &ipi_masked_page, 0, ""); +SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_range, CTLFLAG_RW, + &ipi_masked_range, 0, ""); +SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_range_size, CTLFLAG_RW, + &ipi_masked_range_size, 0, ""); #endif /* * Flush the TLB on all other CPU's + */ +static void +smp_tlb_shootdown(u_int vector, vm_offset_t addr1, vm_offset_t addr2) +{ + u_int ncpu; + register_t eflags; + + ncpu = mp_ncpus - 1; /* does not shootdown self */ + if (ncpu < 1) + return; /* no other cpus */ + eflags = read_eflags(); + if ((eflags & PSL_I) == 0) + panic("absolutely cannot call smp_ipi_shootdown with interrupts already disabled"); + mtx_lock_spin(&smp_tlb_mtx); + smp_tlb_addr1 = addr1; + smp_tlb_addr2 = addr2; + atomic_store_rel_int(&smp_tlb_wait, 0); + ipi_all_but_self(vector); + while (smp_tlb_wait < ncpu) + ia32_pause(); + mtx_unlock_spin(&smp_tlb_mtx); +} + +/* + * This is about as magic as it gets. fortune(1) has got similar code + * for reversing bits in a word. Who thinks up this stuff?? 
+ * + * Yes, it does appear to be consistently faster than: + * while (i = ffs(m)) { + * m >>= i; + * bits++; + * } + * and + * while (lsb = (m & -m)) { // This is magic too + * m &= ~lsb; // or: m ^= lsb + * bits++; + * } + * Both of these latter forms do some very strange things on gcc-3.1 with + * -mcpu=pentiumpro and/or -march=pentiumpro and/or -O or -O2. + * There is probably an SSE or MMX popcnt instruction. * - * XXX: Needs to handshake and wait for completion before proceding. + * I wonder if this should be in libkern? + * + * XXX Stop the presses! Another one: + * static __inline u_int32_t + * popcnt1(u_int32_t v) + * { + * v -= ((v >> 1) & 0x55555555); + * v = (v & 0x33333333) + ((v >> 2) & 0x33333333); + * v = (v + (v >> 4)) & 0x0F0F0F0F; + * return (v * 0x01010101) >> 24; + * } + * The downside is that it has a multiply. With a pentium3 with + * -mcpu=pentiumpro and -march=pentiumpro then gcc-3.1 will use + * an imull, and in that case it is faster. In most other cases + * it appears slightly slower. */ +static __inline u_int32_t +popcnt(u_int32_t m) +{ + + m = (m & 0x55555555) + ((m & 0xaaaaaaaa) >> 1); + m = (m & 0x33333333) + ((m & 0xcccccccc) >> 2); + m = (m & 0x0f0f0f0f) + ((m & 0xf0f0f0f0) >> 4); + m = (m & 0x00ff00ff) + ((m & 0xff00ff00) >> 8); + m = (m & 0x0000ffff) + ((m & 0xffff0000) >> 16); + return m; +} + +static void +smp_targeted_tlb_shootdown(u_int mask, u_int vector, vm_offset_t addr1, vm_offset_t addr2) +{ + int ncpu, othercpus; + register_t eflags; + + othercpus = mp_ncpus - 1; + if (mask == (u_int)-1) { + ncpu = othercpus; + if (ncpu < 1) + return; + } else { + /* XXX there should be a pcpu self mask */ + mask &= ~(1 << PCPU_GET(cpuid)); + if (mask == 0) + return; + ncpu = popcnt(mask); + if (ncpu > othercpus) { + /* XXX this should be a panic offence */ + printf("SMP: tlb shootdown to %d other cpus (only have %d)\n", + ncpu, othercpus); + ncpu = othercpus; + } + /* XXX should be a panic, implied by mask == 0 above */ + if (ncpu < 1) + return; + } + eflags = read_eflags(); + if ((eflags & PSL_I) == 0) + panic("absolutely cannot call smp_targeted_ipi_shootdown with interrupts already disabled"); + mtx_lock_spin(&smp_tlb_mtx); + smp_tlb_addr1 = addr1; + smp_tlb_addr2 = addr2; + atomic_store_rel_int(&smp_tlb_wait, 0); + if (mask == (u_int)-1) + ipi_all_but_self(vector); + else + ipi_selected(mask, vector); + while (smp_tlb_wait < ncpu) + ia32_pause(); + mtx_unlock_spin(&smp_tlb_mtx); +} +#endif + void smp_invltlb(void) { #if defined(APIC_IO) - if (smp_started) - ipi_all_but_self(IPI_INVLTLB); + if (smp_started) { + smp_tlb_shootdown(IPI_INVLTLB, 0, 0); +#ifdef COUNT_XINVLTLB_HITS + ipi_global++; +#endif + } #endif /* APIC_IO */ } void -invlpg(u_int addr) +smp_invlpg(vm_offset_t addr) { - __asm __volatile("invlpg (%0)"::"r"(addr):"memory"); +#if defined(APIC_IO) + if (smp_started) { + smp_tlb_shootdown(IPI_INVLPG, addr, 0); +#ifdef COUNT_XINVLTLB_HITS + ipi_page++; +#endif + } +#endif /* APIC_IO */ +} - /* send a message to the other CPUs */ - smp_invltlb(); +void +smp_invlpg_range(vm_offset_t addr1, vm_offset_t addr2) +{ +#if defined(APIC_IO) + if (smp_started) { + smp_tlb_shootdown(IPI_INVLRNG, addr1, addr2); +#ifdef COUNT_XINVLTLB_HITS + ipi_range++; + ipi_range_size += (addr2 - addr1) / PAGE_SIZE; +#endif + } +#endif /* APIC_IO */ } void -invltlb(void) +smp_masked_invltlb(u_int mask) { - u_long temp; +#if defined(APIC_IO) + if (smp_started) { + smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, 0, 0); +#ifdef COUNT_XINVLTLB_HITS + ipi_masked_global++; +#endif + } 
+#endif /* APIC_IO */ +} - /* - * This should be implemented as load_cr3(rcr3()) when load_cr3() is - * inlined. - */ - __asm __volatile("movl %%cr3, %0; movl %0, %%cr3":"=r"(temp) :: "memory"); +void +smp_masked_invlpg(u_int mask, vm_offset_t addr) +{ +#if defined(APIC_IO) + if (smp_started) { + smp_targeted_tlb_shootdown(mask, IPI_INVLPG, addr, 0); +#ifdef COUNT_XINVLTLB_HITS + ipi_masked_page++; +#endif + } +#endif /* APIC_IO */ +} - /* send a message to the other CPUs */ - smp_invltlb(); +void +smp_masked_invlpg_range(u_int mask, vm_offset_t addr1, vm_offset_t addr2) +{ +#if defined(APIC_IO) + if (smp_started) { + smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, addr1, addr2); +#ifdef COUNT_XINVLTLB_HITS + ipi_masked_range++; + ipi_masked_range_size += (addr2 - addr1) / PAGE_SIZE; +#endif + } +#endif /* APIC_IO */ } @@ -2251,7 +2455,7 @@ ap_init(void) /* spin */ ; /* BSP may have changed PTD while we were waiting */ - cpu_invltlb(); + invltlb(); #if defined(I586_CPU) && !defined(NO_F00F_HACK) lidt(&r_idt); @@ -2290,6 +2494,9 @@ ap_init(void) /* Build our map of 'other' CPUs. */ PCPU_SET(other_cpus, all_cpus & ~PCPU_GET(cpumask)); + if (bootverbose) + apic_dump("ap_init()"); + printf("SMP: AP CPU #%d Launched!\n", PCPU_GET(cpuid)); if (smp_cpus == mp_ncpus) { @@ -2325,7 +2532,8 @@ forwarded_statclock(struct trapframe frame) { mtx_lock_spin(&sched_lock); - statclock_process(curthread->td_kse, TRAPF_PC(&frame), TRAPF_USERMODE(&frame)); + statclock_process(curthread->td_kse, TRAPF_PC(&frame), + TRAPF_USERMODE(&frame)); mtx_unlock_spin(&sched_lock); } diff --git a/sys/amd64/amd64/mptable.c b/sys/amd64/amd64/mptable.c index 63fec0e..29e9c6e 100644 --- a/sys/amd64/amd64/mptable.c +++ b/sys/amd64/amd64/mptable.c @@ -288,6 +288,14 @@ extern pt_entry_t *SMPpt; struct pcb stoppcbs[MAXCPU]; +#ifdef APIC_IO +/* Variables needed for SMP tlb shootdown. */ +vm_offset_t smp_tlb_addr1; +vm_offset_t smp_tlb_addr2; +volatile int smp_tlb_wait; +static struct mtx smp_tlb_mtx; +#endif + /* * Local data and functions. 
*/ @@ -336,6 +344,9 @@ init_locks(void) #ifdef USE_COMLOCK mtx_init(&com_mtx, "com", NULL, MTX_SPIN); #endif /* USE_COMLOCK */ +#ifdef APIC_IO + mtx_init(&smp_tlb_mtx, "tlb", NULL, MTX_SPIN); +#endif } /* @@ -605,6 +616,10 @@ mp_enable(u_int boot_addr) /* install an inter-CPU IPI for TLB invalidation */ setidt(XINVLTLB_OFFSET, Xinvltlb, SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); + setidt(XINVLPG_OFFSET, Xinvlpg, + SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); + setidt(XINVLRNG_OFFSET, Xinvlrng, + SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); /* install an inter-CPU IPI for forwarding hardclock() */ setidt(XHARDCLOCK_OFFSET, Xhardclock, @@ -2190,48 +2205,237 @@ start_ap(int logical_cpu, u_int boot_addr) return 0; /* return FAILURE */ } -#if defined(APIC_IO) && defined(COUNT_XINVLTLB_HITS) -u_int xhits[MAXCPU]; -SYSCTL_OPAQUE(_debug, OID_AUTO, xhits, CTLFLAG_RW, &xhits, sizeof(xhits), - "IU", ""); +#if defined(APIC_IO) + +#ifdef COUNT_XINVLTLB_HITS +u_int xhits_gbl[MAXCPU]; +u_int xhits_pg[MAXCPU]; +u_int xhits_rng[MAXCPU]; +SYSCTL_NODE(_debug, OID_AUTO, xhits, CTLFLAG_RW, 0, ""); +SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, global, CTLFLAG_RW, &xhits_gbl, + sizeof(xhits_gbl), "IU", ""); +SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, page, CTLFLAG_RW, &xhits_pg, + sizeof(xhits_pg), "IU", ""); +SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, range, CTLFLAG_RW, &xhits_rng, + sizeof(xhits_rng), "IU", ""); + +u_int ipi_global; +u_int ipi_page; +u_int ipi_range; +u_int ipi_range_size; +SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_global, CTLFLAG_RW, &ipi_global, 0, ""); +SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_page, CTLFLAG_RW, &ipi_page, 0, ""); +SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range, CTLFLAG_RW, &ipi_range, 0, ""); +SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range_size, CTLFLAG_RW, &ipi_range_size, + 0, ""); + +u_int ipi_masked_global; +u_int ipi_masked_page; +u_int ipi_masked_range; +u_int ipi_masked_range_size; +SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_global, CTLFLAG_RW, + &ipi_masked_global, 0, ""); +SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_page, CTLFLAG_RW, + &ipi_masked_page, 0, ""); +SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_range, CTLFLAG_RW, + &ipi_masked_range, 0, ""); +SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_range_size, CTLFLAG_RW, + &ipi_masked_range_size, 0, ""); #endif /* * Flush the TLB on all other CPU's + */ +static void +smp_tlb_shootdown(u_int vector, vm_offset_t addr1, vm_offset_t addr2) +{ + u_int ncpu; + register_t eflags; + + ncpu = mp_ncpus - 1; /* does not shootdown self */ + if (ncpu < 1) + return; /* no other cpus */ + eflags = read_eflags(); + if ((eflags & PSL_I) == 0) + panic("absolutely cannot call smp_ipi_shootdown with interrupts already disabled"); + mtx_lock_spin(&smp_tlb_mtx); + smp_tlb_addr1 = addr1; + smp_tlb_addr2 = addr2; + atomic_store_rel_int(&smp_tlb_wait, 0); + ipi_all_but_self(vector); + while (smp_tlb_wait < ncpu) + ia32_pause(); + mtx_unlock_spin(&smp_tlb_mtx); +} + +/* + * This is about as magic as it gets. fortune(1) has got similar code + * for reversing bits in a word. Who thinks up this stuff?? + * + * Yes, it does appear to be consistently faster than: + * while (i = ffs(m)) { + * m >>= i; + * bits++; + * } + * and + * while (lsb = (m & -m)) { // This is magic too + * m &= ~lsb; // or: m ^= lsb + * bits++; + * } + * Both of these latter forms do some very strange things on gcc-3.1 with + * -mcpu=pentiumpro and/or -march=pentiumpro and/or -O or -O2. + * There is probably an SSE or MMX popcnt instruction. 
* - * XXX: Needs to handshake and wait for completion before proceding. + * I wonder if this should be in libkern? + * + * XXX Stop the presses! Another one: + * static __inline u_int32_t + * popcnt1(u_int32_t v) + * { + * v -= ((v >> 1) & 0x55555555); + * v = (v & 0x33333333) + ((v >> 2) & 0x33333333); + * v = (v + (v >> 4)) & 0x0F0F0F0F; + * return (v * 0x01010101) >> 24; + * } + * The downside is that it has a multiply. With a pentium3 with + * -mcpu=pentiumpro and -march=pentiumpro then gcc-3.1 will use + * an imull, and in that case it is faster. In most other cases + * it appears slightly slower. */ +static __inline u_int32_t +popcnt(u_int32_t m) +{ + + m = (m & 0x55555555) + ((m & 0xaaaaaaaa) >> 1); + m = (m & 0x33333333) + ((m & 0xcccccccc) >> 2); + m = (m & 0x0f0f0f0f) + ((m & 0xf0f0f0f0) >> 4); + m = (m & 0x00ff00ff) + ((m & 0xff00ff00) >> 8); + m = (m & 0x0000ffff) + ((m & 0xffff0000) >> 16); + return m; +} + +static void +smp_targeted_tlb_shootdown(u_int mask, u_int vector, vm_offset_t addr1, vm_offset_t addr2) +{ + int ncpu, othercpus; + register_t eflags; + + othercpus = mp_ncpus - 1; + if (mask == (u_int)-1) { + ncpu = othercpus; + if (ncpu < 1) + return; + } else { + /* XXX there should be a pcpu self mask */ + mask &= ~(1 << PCPU_GET(cpuid)); + if (mask == 0) + return; + ncpu = popcnt(mask); + if (ncpu > othercpus) { + /* XXX this should be a panic offence */ + printf("SMP: tlb shootdown to %d other cpus (only have %d)\n", + ncpu, othercpus); + ncpu = othercpus; + } + /* XXX should be a panic, implied by mask == 0 above */ + if (ncpu < 1) + return; + } + eflags = read_eflags(); + if ((eflags & PSL_I) == 0) + panic("absolutely cannot call smp_targeted_ipi_shootdown with interrupts already disabled"); + mtx_lock_spin(&smp_tlb_mtx); + smp_tlb_addr1 = addr1; + smp_tlb_addr2 = addr2; + atomic_store_rel_int(&smp_tlb_wait, 0); + if (mask == (u_int)-1) + ipi_all_but_self(vector); + else + ipi_selected(mask, vector); + while (smp_tlb_wait < ncpu) + ia32_pause(); + mtx_unlock_spin(&smp_tlb_mtx); +} +#endif + void smp_invltlb(void) { #if defined(APIC_IO) - if (smp_started) - ipi_all_but_self(IPI_INVLTLB); + if (smp_started) { + smp_tlb_shootdown(IPI_INVLTLB, 0, 0); +#ifdef COUNT_XINVLTLB_HITS + ipi_global++; +#endif + } #endif /* APIC_IO */ } void -invlpg(u_int addr) +smp_invlpg(vm_offset_t addr) { - __asm __volatile("invlpg (%0)"::"r"(addr):"memory"); +#if defined(APIC_IO) + if (smp_started) { + smp_tlb_shootdown(IPI_INVLPG, addr, 0); +#ifdef COUNT_XINVLTLB_HITS + ipi_page++; +#endif + } +#endif /* APIC_IO */ +} - /* send a message to the other CPUs */ - smp_invltlb(); +void +smp_invlpg_range(vm_offset_t addr1, vm_offset_t addr2) +{ +#if defined(APIC_IO) + if (smp_started) { + smp_tlb_shootdown(IPI_INVLRNG, addr1, addr2); +#ifdef COUNT_XINVLTLB_HITS + ipi_range++; + ipi_range_size += (addr2 - addr1) / PAGE_SIZE; +#endif + } +#endif /* APIC_IO */ } void -invltlb(void) +smp_masked_invltlb(u_int mask) { - u_long temp; +#if defined(APIC_IO) + if (smp_started) { + smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, 0, 0); +#ifdef COUNT_XINVLTLB_HITS + ipi_masked_global++; +#endif + } +#endif /* APIC_IO */ +} - /* - * This should be implemented as load_cr3(rcr3()) when load_cr3() is - * inlined. 
- */ - __asm __volatile("movl %%cr3, %0; movl %0, %%cr3":"=r"(temp) :: "memory"); +void +smp_masked_invlpg(u_int mask, vm_offset_t addr) +{ +#if defined(APIC_IO) + if (smp_started) { + smp_targeted_tlb_shootdown(mask, IPI_INVLPG, addr, 0); +#ifdef COUNT_XINVLTLB_HITS + ipi_masked_page++; +#endif + } +#endif /* APIC_IO */ +} - /* send a message to the other CPUs */ - smp_invltlb(); +void +smp_masked_invlpg_range(u_int mask, vm_offset_t addr1, vm_offset_t addr2) +{ +#if defined(APIC_IO) + if (smp_started) { + smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, addr1, addr2); +#ifdef COUNT_XINVLTLB_HITS + ipi_masked_range++; + ipi_masked_range_size += (addr2 - addr1) / PAGE_SIZE; +#endif + } +#endif /* APIC_IO */ } @@ -2251,7 +2455,7 @@ ap_init(void) /* spin */ ; /* BSP may have changed PTD while we were waiting */ - cpu_invltlb(); + invltlb(); #if defined(I586_CPU) && !defined(NO_F00F_HACK) lidt(&r_idt); @@ -2290,6 +2494,9 @@ ap_init(void) /* Build our map of 'other' CPUs. */ PCPU_SET(other_cpus, all_cpus & ~PCPU_GET(cpumask)); + if (bootverbose) + apic_dump("ap_init()"); + printf("SMP: AP CPU #%d Launched!\n", PCPU_GET(cpuid)); if (smp_cpus == mp_ncpus) { @@ -2325,7 +2532,8 @@ forwarded_statclock(struct trapframe frame) { mtx_lock_spin(&sched_lock); - statclock_process(curthread->td_kse, TRAPF_PC(&frame), TRAPF_USERMODE(&frame)); + statclock_process(curthread->td_kse, TRAPF_PC(&frame), + TRAPF_USERMODE(&frame)); mtx_unlock_spin(&sched_lock); } diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c index 87cd8b9..5de1707 100644 --- a/sys/amd64/amd64/pmap.c +++ b/sys/amd64/amd64/pmap.c @@ -68,7 +68,6 @@ * and to when physical maps must be made correct. */ -#include "opt_disable_pse.h" #include "opt_pmap.h" #include "opt_msgbuf.h" #include "opt_kstack_pages.h" @@ -85,6 +84,9 @@ #include #include #include +#ifdef SMP +#include +#endif #include #include @@ -97,6 +99,7 @@ #include #include +#include #include #include #include @@ -162,6 +165,7 @@ static vm_object_t kptobj; static int nkpt; vm_offset_t kernel_vm_end; +extern u_int32_t KERNend; /* * Data for the pv entry allocation mechanism @@ -257,10 +261,10 @@ static vm_offset_t pmap_kmem_choose(vm_offset_t addr) { vm_offset_t newaddr = addr; + #ifndef DISABLE_PSE - if (cpu_feature & CPUID_PSE) { + if (cpu_feature & CPUID_PSE) newaddr = (addr + (NBPDR - 1)) & ~(NBPDR - 1); - } #endif return newaddr; } @@ -362,10 +366,9 @@ pmap_bootstrap(firstaddr, loadaddr) PTD[i] = 0; pgeflag = 0; -#if !defined(SMP) /* XXX - see also mp_machdep.c */ - if (cpu_feature & CPUID_PGE) { +#ifndef DISABLE_PG_G + if (cpu_feature & CPUID_PGE) pgeflag = PG_G; - } #endif /* @@ -378,7 +381,7 @@ pmap_bootstrap(firstaddr, loadaddr) */ pdir4mb = 0; -#if !defined(DISABLE_PSE) +#ifndef DISABLE_PSE if (cpu_feature & CPUID_PSE) { pd_entry_t ptditmp; /* @@ -389,29 +392,16 @@ pmap_bootstrap(firstaddr, loadaddr) ptditmp &= ~(NBPDR - 1); ptditmp |= PG_V | PG_RW | PG_PS | PG_U | pgeflag; pdir4mb = ptditmp; - -#if !defined(SMP) - /* - * Enable the PSE mode. - */ - load_cr4(rcr4() | CR4_PSE); - - /* - * We can do the mapping here for the single processor - * case. We simply ignore the old page table page from - * now on. - */ - /* - * For SMP, we still need 4K pages to bootstrap APs, - * PSE will be enabled as soon as all APs are up. - */ - PTD[KPTDI] = (pd_entry_t) ptditmp; - kernel_pmap->pm_pdir[KPTDI] = (pd_entry_t) ptditmp; - invltlb(); -#endif } #endif - +#ifndef SMP + /* + * Turn on PGE/PSE. SMP does this later on since the + * 4K page tables are required for AP boot (for now). 
+ * XXX fixme. + */ + pmap_set_opt(); +#endif #ifdef SMP if (cpu_apic_address == 0) panic("pmap_bootstrap: no local apic! (non-SMP hardware?)"); @@ -420,26 +410,55 @@ pmap_bootstrap(firstaddr, loadaddr) SMPpt[NPTEPG - 1] = (pt_entry_t)(PG_V | PG_RW | PG_N | pgeflag | (cpu_apic_address & PG_FRAME)); #endif - invltlb(); } -#ifdef SMP /* - * Set 4mb pdir for mp startup + * Enable 4MB page mode for MP startup. Turn on PG_G support. + * BSP will run this after all the AP's have started up. */ void pmap_set_opt(void) { + pt_entry_t *pte; + vm_offset_t va, endva; + + if (pgeflag && (cpu_feature & CPUID_PGE)) { + load_cr4(rcr4() | CR4_PGE); + invltlb(); /* Insurance */ + } +#ifndef DISABLE_PSE if (pseflag && (cpu_feature & CPUID_PSE)) { load_cr4(rcr4() | CR4_PSE); - if (pdir4mb && PCPU_GET(cpuid) == 0) { /* only on BSP */ + invltlb(); /* Insurance */ + } +#endif + if (PCPU_GET(cpuid) == 0) { +#ifndef DISABLE_PSE + if (pdir4mb) { kernel_pmap->pm_pdir[KPTDI] = PTD[KPTDI] = pdir4mb; - cpu_invltlb(); + invltlb(); /* Insurance */ } +#endif + if (pgeflag) { + /* Turn on PG_G for text, data, bss pages. */ + va = (vm_offset_t)btext; + endva = KERNBASE + KERNend; + while (va < endva) { + pte = vtopte(va); + if (*pte) + *pte |= pgeflag; + va += PAGE_SIZE; + } + invltlb(); /* Insurance */ + } + /* + * We do not need to broadcast the invltlb here, because + * each AP does it the moment it is released from the boot + * lock. See ap_init(). + */ } } -#endif void * pmap_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait) @@ -553,43 +572,151 @@ pmap_track_modified(vm_offset_t va) return 0; } -static PMAP_INLINE void -invltlb_1pg(vm_offset_t va) -{ #ifdef I386_CPU - invltlb(); -#else - invlpg(va); -#endif +/* + * i386 only has "invalidate everything" and no SMP to worry about. + */ +PMAP_INLINE void +pmap_invalidate_page(pmap_t pmap, vm_offset_t va) +{ + + if (pmap == kernel_pmap || pmap->pm_active) + invltlb(); } -static __inline void +PMAP_INLINE void +pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) +{ + + if (pmap == kernel_pmap || pmap->pm_active) + invltlb(); +} + +PMAP_INLINE void +pmap_invalidate_all(pmap_t pmap) +{ + + if (pmap == kernel_pmap || pmap->pm_active) + invltlb(); +} +#else /* !I386_CPU */ +#ifdef SMP +/* + * For SMP, these functions have to use the IPI mechanism for coherence. + */ +void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { -#if defined(SMP) - if (pmap->pm_active & PCPU_GET(cpumask)) - cpu_invlpg((void *)va); - if (pmap->pm_active & PCPU_GET(other_cpus)) - smp_invltlb(); -#else - if (pmap->pm_active) - invltlb_1pg(va); -#endif + u_int cpumask; + u_int other_cpus; + + critical_enter(); + /* + * We need to disable interrupt preemption but MUST NOT have + * interrupts disabled here. + * XXX we may need to hold schedlock to get a coherent pm_active + */ + if (pmap->pm_active == -1 || pmap->pm_active == all_cpus) { + invlpg(va); + smp_invlpg(va); + } else { + cpumask = PCPU_GET(cpumask); + other_cpus = PCPU_GET(other_cpus); + if (pmap->pm_active & cpumask) + invlpg(va); + if (pmap->pm_active & other_cpus) + smp_masked_invlpg(pmap->pm_active & other_cpus, va); + } + critical_exit(); } -static __inline void +void +pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) +{ + u_int cpumask; + u_int other_cpus; + vm_offset_t addr; + + critical_enter(); + /* + * We need to disable interrupt preemption but MUST NOT have + * interrupts disabled here. 
+ * XXX we may need to hold schedlock to get a coherent pm_active + */ + if (pmap->pm_active == -1 || pmap->pm_active == all_cpus) { + for (addr = sva; addr < eva; addr += PAGE_SIZE) + invlpg(addr); + smp_invlpg_range(sva, eva); + } else { + cpumask = PCPU_GET(cpumask); + other_cpus = PCPU_GET(other_cpus); + if (pmap->pm_active & cpumask) + for (addr = sva; addr < eva; addr += PAGE_SIZE) + invlpg(addr); + if (pmap->pm_active & other_cpus) + smp_masked_invlpg_range(pmap->pm_active & other_cpus, + sva, eva); + } + critical_exit(); +} + +void pmap_invalidate_all(pmap_t pmap) { -#if defined(SMP) - if (pmap->pm_active & PCPU_GET(cpumask)) - cpu_invltlb(); - if (pmap->pm_active & PCPU_GET(other_cpus)) + u_int cpumask; + u_int other_cpus; + + critical_enter(); + /* + * We need to disable interrupt preemption but MUST NOT have + * interrupts disabled here. + * XXX we may need to hold schedlock to get a coherent pm_active + */ + if (pmap->pm_active == -1 || pmap->pm_active == all_cpus) { + invltlb(); smp_invltlb(); -#else - if (pmap->pm_active) + } else { + cpumask = PCPU_GET(cpumask); + other_cpus = PCPU_GET(other_cpus); + if (pmap->pm_active & cpumask) + invltlb(); + if (pmap->pm_active & other_cpus) + smp_masked_invltlb(pmap->pm_active & other_cpus); + } + critical_exit(); +} +#else /* !SMP */ +/* + * Normal, non-SMP, 486+ invalidation functions. + * We inline these within pmap.c for speed. + */ +PMAP_INLINE void +pmap_invalidate_page(pmap_t pmap, vm_offset_t va) +{ + + if (pmap == kernel_pmap || pmap->pm_active) + invlpg(va); +} + +PMAP_INLINE void +pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) +{ + vm_offset_t addr; + + if (pmap == kernel_pmap || pmap->pm_active) + for (addr = sva; addr < eva; addr += PAGE_SIZE) + invlpg(addr); +} + +PMAP_INLINE void +pmap_invalidate_all(pmap_t pmap) +{ + + if (pmap == kernel_pmap || pmap->pm_active) invltlb(); -#endif } +#endif /* !SMP */ +#endif /* !I386_CPU */ /* * Return an address which is the base of the Virtual mapping of @@ -613,12 +740,7 @@ get_ptbase(pmap) /* otherwise, we are alternate address space */ if (frame != (APTDpde & PG_FRAME)) { APTDpde = (pd_entry_t) (frame | PG_RW | PG_V); -#if defined(SMP) - /* The page directory is not shared between CPUs */ - cpu_invltlb(); -#else invltlb(); -#endif } return APTmap; } @@ -647,7 +769,7 @@ pmap_pte_quick(pmap, va) newpf = pde & PG_FRAME; if (((*PMAP1) & PG_FRAME) != newpf) { *PMAP1 = newpf | PG_RW | PG_V; - invltlb_1pg((vm_offset_t) PADDR1); + pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR1); } return PADDR1 + (index & (NPTEPG - 1)); } @@ -692,34 +814,29 @@ pmap_extract(pmap, va) ***************************************************/ /* - * add a wired page to the kva - * note that in order for the mapping to take effect -- you - * should do a invltlb after doing the pmap_kenter... + * Add a wired page to the kva. + * Note: not SMP coherent. */ PMAP_INLINE void pmap_kenter(vm_offset_t va, vm_offset_t pa) { pt_entry_t *pte; - pt_entry_t npte, opte; - npte = pa | PG_RW | PG_V | pgeflag; pte = vtopte(va); - opte = *pte; - *pte = npte; - invltlb_1pg(va); + *pte = pa | PG_RW | PG_V | pgeflag; } /* - * remove a page from the kernel pagetables + * Remove a page from the kernel pagetables. + * Note: not SMP coherent. 
*/ PMAP_INLINE void pmap_kremove(vm_offset_t va) { - register pt_entry_t *pte; + pt_entry_t *pte; pte = vtopte(va); *pte = 0; - invltlb_1pg(va); } /* @@ -737,13 +854,15 @@ pmap_kremove(vm_offset_t va) vm_offset_t pmap_map(vm_offset_t *virt, vm_offset_t start, vm_offset_t end, int prot) { - vm_offset_t sva = *virt; - vm_offset_t va = sva; + vm_offset_t va, sva; + + va = sva = *virt; while (start < end) { pmap_kenter(va, start); va += PAGE_SIZE; start += PAGE_SIZE; } + pmap_invalidate_range(kernel_pmap, sva, va); *virt = va; return (sva); } @@ -756,64 +875,45 @@ pmap_map(vm_offset_t *virt, vm_offset_t start, vm_offset_t end, int prot) * page modification or references recorded. * Note that old mappings are simply written * over. The page *must* be wired. + * Note: SMP coherent. Uses a ranged shootdown IPI. */ void -pmap_qenter(vm_offset_t va, vm_page_t *m, int count) +pmap_qenter(vm_offset_t sva, vm_page_t *m, int count) { - vm_offset_t end_va; - - end_va = va + count * PAGE_SIZE; - - while (va < end_va) { - pt_entry_t *pte; + vm_offset_t va; - pte = vtopte(va); - *pte = VM_PAGE_TO_PHYS(*m) | PG_RW | PG_V | pgeflag; -#ifdef SMP - cpu_invlpg((void *)va); -#else - invltlb_1pg(va); -#endif + va = sva; + while (count-- > 0) { + pmap_kenter(va, VM_PAGE_TO_PHYS(*m)); va += PAGE_SIZE; m++; } -#ifdef SMP - smp_invltlb(); -#endif + pmap_invalidate_range(kernel_pmap, sva, va); } /* - * this routine jerks page mappings from the + * This routine tears out page mappings from the * kernel -- it is meant only for temporary mappings. + * Note: SMP coherent. Uses a ranged shootdown IPI. */ void -pmap_qremove(vm_offset_t va, int count) +pmap_qremove(vm_offset_t sva, int count) { - vm_offset_t end_va; - - end_va = va + count*PAGE_SIZE; - - while (va < end_va) { - pt_entry_t *pte; + vm_offset_t va; - pte = vtopte(va); - *pte = 0; -#ifdef SMP - cpu_invlpg((void *)va); -#else - invltlb_1pg(va); -#endif + va = sva; + while (count-- > 0) { + pmap_kremove(va); va += PAGE_SIZE; } -#ifdef SMP - smp_invltlb(); -#endif + pmap_invalidate_range(kernel_pmap, sva, va); } static vm_page_t pmap_page_lookup(vm_object_t object, vm_pindex_t pindex) { vm_page_t m; + retry: m = vm_page_lookup(object, pindex); if (m && vm_page_sleep_busy(m, FALSE, "pplookp")) @@ -829,14 +929,11 @@ retry: void pmap_new_thread(struct thread *td) { -#ifdef I386_CPU - int updateneeded = 0; -#endif int i; + vm_page_t ma[KSTACK_PAGES]; vm_object_t ksobj; vm_page_t m; vm_offset_t ks; - pt_entry_t *ptek, oldpte; /* * allocate object for the kstack @@ -844,39 +941,21 @@ pmap_new_thread(struct thread *td) ksobj = vm_object_allocate(OBJT_DEFAULT, KSTACK_PAGES); td->td_kstack_obj = ksobj; -#ifdef KSTACK_GUARD /* get a kernel virtual address for the kstack for this thread */ +#ifdef KSTACK_GUARD ks = kmem_alloc_nofault(kernel_map, (KSTACK_PAGES + 1) * PAGE_SIZE); if (ks == 0) panic("pmap_new_thread: kstack allocation failed"); - - /* - * Set the first page to be the unmapped guard page. - */ - ptek = vtopte(ks); - oldpte = *ptek; - *ptek = 0; - if (oldpte) { -#ifdef I386_CPU - updateneeded = 1; -#else - invlpg(ks); -#endif - } - - /* - * move to the next page, which is where the real stack starts. 
- */ + if (*vtopte(ks) != 0) + pmap_qremove(ks, 1); ks += PAGE_SIZE; td->td_kstack = ks; - ptek++; #else /* get a kernel virtual address for the kstack for this thread */ ks = kmem_alloc_nofault(kernel_map, KSTACK_PAGES * PAGE_SIZE); if (ks == 0) panic("pmap_new_thread: kstack allocation failed"); td->td_kstack = ks; - ptek = vtopte(ks); #endif /* * For the length of the stack, link in a real page of ram for each @@ -887,6 +966,7 @@ pmap_new_thread(struct thread *td) * Get a kernel stack page */ m = vm_page_grab(ksobj, i, VM_ALLOC_NORMAL | VM_ALLOC_RETRY); + ma[i] = m; /* * Wire the page @@ -894,28 +974,12 @@ pmap_new_thread(struct thread *td) m->wire_count++; cnt.v_wire_count++; - /* - * Enter the page into the kernel address space. - */ - oldpte = ptek[i]; - ptek[i] = VM_PAGE_TO_PHYS(m) | PG_RW | PG_V | pgeflag; - if (oldpte) { -#ifdef I386_CPU - updateneeded = 1; -#else - invlpg(ks + (i * PAGE_SIZE)); -#endif - } - vm_page_wakeup(m); vm_page_flag_clear(m, PG_ZERO); vm_page_flag_set(m, PG_MAPPED | PG_WRITEABLE); m->valid = VM_PAGE_BITS_ALL; } -#ifdef I386_CPU - if (updateneeded) - invltlb(); -#endif + pmap_qenter(ks, ma, KSTACK_PAGES); } /* @@ -930,26 +994,18 @@ pmap_dispose_thread(td) vm_object_t ksobj; vm_offset_t ks; vm_page_t m; - pt_entry_t *ptek; ksobj = td->td_kstack_obj; ks = td->td_kstack; - ptek = vtopte(ks); + pmap_qremove(ks, KSTACK_PAGES); for (i = 0; i < KSTACK_PAGES; i++) { m = vm_page_lookup(ksobj, i); if (m == NULL) panic("pmap_dispose_thread: kstack already missing?"); vm_page_busy(m); - ptek[i] = 0; -#ifndef I386_CPU - invlpg(ks + (i * PAGE_SIZE)); -#endif vm_page_unwire(m, 0); vm_page_free(m); } -#ifdef I386_CPU - invltlb(); -#endif /* * Free the space that this stack was mapped to in the kernel * address map. @@ -976,13 +1032,13 @@ pmap_swapout_thread(td) ksobj = td->td_kstack_obj; ks = td->td_kstack; + pmap_qremove(ks, KSTACK_PAGES); for (i = 0; i < KSTACK_PAGES; i++) { m = vm_page_lookup(ksobj, i); if (m == NULL) panic("pmap_swapout_thread: kstack already missing?"); vm_page_dirty(m); vm_page_unwire(m, 0); - pmap_kremove(ks + i * PAGE_SIZE); } } @@ -994,6 +1050,7 @@ pmap_swapin_thread(td) struct thread *td; { int i, rv; + vm_page_t ma[KSTACK_PAGES]; vm_object_t ksobj; vm_offset_t ks; vm_page_t m; @@ -1002,7 +1059,6 @@ pmap_swapin_thread(td) ks = td->td_kstack; for (i = 0; i < KSTACK_PAGES; i++) { m = vm_page_grab(ksobj, i, VM_ALLOC_NORMAL | VM_ALLOC_RETRY); - pmap_kenter(ks + i * PAGE_SIZE, VM_PAGE_TO_PHYS(m)); if (m->valid != VM_PAGE_BITS_ALL) { rv = vm_pager_get_pages(ksobj, &m, 1, 0); if (rv != VM_PAGER_OK) @@ -1010,10 +1066,12 @@ pmap_swapin_thread(td) m = vm_page_lookup(ksobj, i); m->valid = VM_PAGE_BITS_ALL; } + ma[i] = m; vm_page_wire(m); vm_page_wakeup(m); vm_page_flag_set(m, PG_MAPPED | PG_WRITEABLE); } + pmap_qenter(ks, ma, KSTACK_PAGES); } /*************************************************** @@ -1108,7 +1166,8 @@ pmap_pinit0(pmap) { pmap->pm_pdir = (pd_entry_t *)kmem_alloc_pageable(kernel_map, PAGE_SIZE); - pmap_kenter((vm_offset_t) pmap->pm_pdir, (vm_offset_t) IdlePTD); + pmap_kenter((vm_offset_t)pmap->pm_pdir, (vm_offset_t)IdlePTD); + invlpg((vm_offset_t)pmap->pm_pdir); pmap->pm_ptphint = NULL; pmap->pm_active = 0; TAILQ_INIT(&pmap->pm_pvlist); @@ -1153,7 +1212,7 @@ pmap_pinit(pmap) vm_page_flag_clear(ptdpg, PG_MAPPED | PG_BUSY); /* not usually mapped*/ ptdpg->valid = VM_PAGE_BITS_ALL; - pmap_kenter((vm_offset_t) pmap->pm_pdir, VM_PAGE_TO_PHYS(ptdpg)); + pmap_qenter((vm_offset_t) pmap->pm_pdir, &ptdpg, 1); if ((ptdpg->flags & PG_ZERO) == 0) 
bzero(pmap->pm_pdir, PAGE_SIZE); @@ -1616,7 +1675,7 @@ pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va) * PG_G. */ if (oldpte & PG_G) - invlpg(va); + pmap_invalidate_page(kernel_pmap, va); pmap->pm_stats.resident_count -= 1; if (oldpte & PG_MANAGED) { m = PHYS_TO_VM_PAGE(oldpte); @@ -2028,13 +2087,7 @@ pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, if ((prot & VM_PROT_WRITE) && (origpte & PG_V)) { if ((origpte & PG_RW) == 0) { *pte |= PG_RW; -#ifdef SMP - cpu_invlpg((void *)va); - if (pmap->pm_active & PCPU_GET(other_cpus)) - smp_invltlb(); -#else - invltlb_1pg(va); -#endif + pmap_invalidate_page(pmap, va); } return; } @@ -2102,13 +2155,7 @@ validate: if ((origpte & ~(PG_M|PG_A)) != newpte) { *pte = newpte | PG_A; /*if (origpte)*/ { -#ifdef SMP - cpu_invlpg((void *)va); - if (pmap->pm_active & PCPU_GET(other_cpus)) - smp_invltlb(); -#else - invltlb_1pg(va); -#endif + pmap_invalidate_page(pmap, va); } } } @@ -2222,7 +2269,11 @@ retry: void * pmap_kenter_temporary(vm_offset_t pa, int i) { - pmap_kenter((vm_offset_t)crashdumpmap + (i * PAGE_SIZE), pa); + vm_offset_t va; + + va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE); + pmap_kenter(va, pa); + invlpg(va); return ((void *)crashdumpmap); } @@ -2527,7 +2578,6 @@ pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, vm_offset_t pdnxt; pd_entry_t src_frame, dst_frame; vm_page_t m; - pd_entry_t saved_pde; if (dst_addr != src_addr) return; @@ -2537,17 +2587,7 @@ pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, return; dst_frame = dst_pmap->pm_pdir[PTDPTDI] & PG_FRAME; - if (dst_frame != (APTDpde & PG_FRAME)) { - APTDpde = dst_frame | PG_RW | PG_V; -#if defined(SMP) - /* The page directory is not shared between CPUs */ - cpu_invltlb(); -#else - invltlb(); -#endif - } - saved_pde = APTDpde & (PG_FRAME | PG_RW | PG_V); - for(addr = src_addr; addr < end_addr; addr = pdnxt) { + for (addr = src_addr; addr < end_addr; addr = pdnxt) { pt_entry_t *src_pte, *dst_pte; vm_page_t dstmpte, srcmpte; pd_entry_t srcptepaddr; @@ -2588,6 +2628,14 @@ pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, if (pdnxt > end_addr) pdnxt = end_addr; + /* + * Have to recheck this before every avtopte() call below + * in case we have blocked and something else used APTDpde. + */ + if (dst_frame != (APTDpde & PG_FRAME)) { + APTDpde = dst_frame | PG_RW | PG_V; + invltlb(); + } src_pte = vtopte(addr); dst_pte = avtopte(addr); while (addr < pdnxt) { @@ -2603,16 +2651,6 @@ pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, * block. 
*/ dstmpte = pmap_allocpte(dst_pmap, addr); - if ((APTDpde & PG_FRAME) != - (saved_pde & PG_FRAME)) { - APTDpde = saved_pde; -printf ("IT HAPPENNED!"); -#if defined(SMP) - cpu_invltlb(); -#else - invltlb(); -#endif - } if ((*dst_pte == 0) && (ptetemp = *src_pte)) { /* * Clear the modified and @@ -2644,14 +2682,13 @@ printf ("IT HAPPENNED!"); void pmap_zero_page(vm_page_t m) { - vm_offset_t phys = VM_PAGE_TO_PHYS(m); + vm_offset_t phys; + phys = VM_PAGE_TO_PHYS(m); if (*CMAP2) panic("pmap_zero_page: CMAP2 busy"); - *CMAP2 = PG_V | PG_RW | phys | PG_A | PG_M; - invltlb_1pg((vm_offset_t)CADDR2); - + pmap_invalidate_page(kernel_pmap, (vm_offset_t)CADDR2); #if defined(I686_CPU) if (cpu_class == CPUCLASS_686) i686_pagezero(CADDR2); @@ -2670,14 +2707,13 @@ pmap_zero_page(vm_page_t m) void pmap_zero_page_area(vm_page_t m, int off, int size) { - vm_offset_t phys = VM_PAGE_TO_PHYS(m); + vm_offset_t phys; + phys = VM_PAGE_TO_PHYS(m); if (*CMAP2) panic("pmap_zero_page: CMAP2 busy"); - *CMAP2 = PG_V | PG_RW | phys | PG_A | PG_M; - invltlb_1pg((vm_offset_t)CADDR2); - + pmap_invalidate_page(kernel_pmap, (vm_offset_t)CADDR2); #if defined(I686_CPU) if (cpu_class == CPUCLASS_686 && off == 0 && size == PAGE_SIZE) i686_pagezero(CADDR2); @@ -2696,20 +2732,13 @@ pmap_zero_page_area(vm_page_t m, int off, int size) void pmap_zero_page_idle(vm_page_t m) { - vm_offset_t phys = VM_PAGE_TO_PHYS(m); + vm_offset_t phys; + phys = VM_PAGE_TO_PHYS(m); if (*CMAP3) panic("pmap_zero_page: CMAP3 busy"); - *CMAP3 = PG_V | PG_RW | phys | PG_A | PG_M; -#ifdef SMP - mtx_lock(&Giant); /* IPI sender not MPSAFE */ -#endif - invltlb_1pg((vm_offset_t)CADDR3); -#ifdef SMP - mtx_unlock(&Giant); -#endif - + invlpg((vm_offset_t)CADDR3); /* SMP: local cpu only */ #if defined(I686_CPU) if (cpu_class == CPUCLASS_686) i686_pagezero(CADDR3); @@ -2733,18 +2762,15 @@ pmap_copy_page(vm_page_t src, vm_page_t dst) panic("pmap_copy_page: CMAP1 busy"); if (*CMAP2) panic("pmap_copy_page: CMAP2 busy"); - *CMAP1 = PG_V | VM_PAGE_TO_PHYS(src) | PG_A; *CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(dst) | PG_A | PG_M; -#ifdef I386_CPU - invltlb(); -#else - invlpg((u_int)CADDR1); - invlpg((u_int)CADDR2); -#endif - + /* + * XXX we "know" that CADDR2 immediately follows CADDR1 and use + * that to save an IPI on SMP systems. 
+ */ + pmap_invalidate_range(kernel_pmap, (vm_offset_t)CADDR1, + (vm_offset_t)CADDR2 + PAGE_SIZE); bcopy(CADDR1, CADDR2, PAGE_SIZE); - *CMAP1 = 0; *CMAP2 = 0; } @@ -3176,18 +3202,11 @@ pmap_mapdev(pa, size) for (tmpva = va; size > 0; ) { pte = vtopte(tmpva); *pte = pa | PG_RW | PG_V | pgeflag; -#ifdef SMP - cpu_invlpg((void *)tmpva); -#else - invltlb_1pg(tmpva); -#endif size -= PAGE_SIZE; tmpva += PAGE_SIZE; pa += PAGE_SIZE; } -#ifdef SMP - smp_invltlb(); -#endif + pmap_invalidate_range(kernel_pmap, va, tmpva); return ((void *)(va + offset)); } @@ -3205,15 +3224,8 @@ pmap_unmapdev(va, size) for (tmpva = base; tmpva < (base + size); tmpva += PAGE_SIZE) { pte = vtopte(tmpva); *pte = 0; -#ifdef SMP - cpu_invlpg((void *)tmpva); -#else - invltlb_1pg(tmpva); -#endif } -#ifdef SMP - smp_invltlb(); -#endif + pmap_invalidate_range(kernel_pmap, va, tmpva); kmem_free(kernel_map, base, size); } diff --git a/sys/amd64/amd64/support.S b/sys/amd64/amd64/support.S index c1f3899..23c611c 100644 --- a/sys/amd64/amd64/support.S +++ b/sys/amd64/amd64/support.S @@ -1596,42 +1596,6 @@ ENTRY(ssdtosd) popl %ebx ret -/* load_cr0(cr0) */ -ENTRY(load_cr0) - movl 4(%esp),%eax - movl %eax,%cr0 - ret - -/* rcr0() */ -ENTRY(rcr0) - movl %cr0,%eax - ret - -/* rcr3() */ -ENTRY(rcr3) - movl %cr3,%eax - ret - -/* void load_cr3(caddr_t cr3) */ -ENTRY(load_cr3) -#ifdef SWTCH_OPTIM_STATS - incl tlb_flush_count -#endif - movl 4(%esp),%eax - movl %eax,%cr3 - ret - -/* rcr4() */ -ENTRY(rcr4) - movl %cr4,%eax - ret - -/* void load_cr4(caddr_t cr4) */ -ENTRY(load_cr4) - movl 4(%esp),%eax - movl %eax,%cr4 - ret - /* void reset_dbregs() */ ENTRY(reset_dbregs) movl $0,%eax diff --git a/sys/amd64/amd64/support.s b/sys/amd64/amd64/support.s index c1f3899..23c611c 100644 --- a/sys/amd64/amd64/support.s +++ b/sys/amd64/amd64/support.s @@ -1596,42 +1596,6 @@ ENTRY(ssdtosd) popl %ebx ret -/* load_cr0(cr0) */ -ENTRY(load_cr0) - movl 4(%esp),%eax - movl %eax,%cr0 - ret - -/* rcr0() */ -ENTRY(rcr0) - movl %cr0,%eax - ret - -/* rcr3() */ -ENTRY(rcr3) - movl %cr3,%eax - ret - -/* void load_cr3(caddr_t cr3) */ -ENTRY(load_cr3) -#ifdef SWTCH_OPTIM_STATS - incl tlb_flush_count -#endif - movl 4(%esp),%eax - movl %eax,%cr3 - ret - -/* rcr4() */ -ENTRY(rcr4) - movl %cr4,%eax - ret - -/* void load_cr4(caddr_t cr4) */ -ENTRY(load_cr4) - movl 4(%esp),%eax - movl %eax,%cr4 - ret - /* void reset_dbregs() */ ENTRY(reset_dbregs) movl $0,%eax diff --git a/sys/amd64/include/cpufunc.h b/sys/amd64/include/cpufunc.h index 2e64138..0896659 100644 --- a/sys/amd64/include/cpufunc.h +++ b/sys/amd64/include/cpufunc.h @@ -237,62 +237,6 @@ invd(void) __asm __volatile("invd"); } -#if defined(SMP) && defined(_KERNEL) - -/* - * When using APIC IPI's, invlpg() is not simply the invlpg instruction - * (this is a bug) and the inlining cost is prohibitive since the call - * executes into the IPI transmission system. - */ -void invlpg(u_int addr); -void invltlb(void); - -static __inline void -cpu_invlpg(void *addr) -{ - __asm __volatile("invlpg %0" : : "m" (*(char *)addr) : "memory"); -} - -static __inline void -cpu_invltlb(void) -{ - u_int temp; - /* - * This should be implemented as load_cr3(rcr3()) when load_cr3() - * is inlined. 
- */ - __asm __volatile("movl %%cr3, %0; movl %0, %%cr3" : "=r" (temp) - : : "memory"); -#if defined(SWTCH_OPTIM_STATS) - ++tlb_flush_count; -#endif -} - -#else /* !(SMP && _KERNEL) */ - -static __inline void -invlpg(u_int addr) -{ - __asm __volatile("invlpg %0" : : "m" (*(char *)addr) : "memory"); -} - -static __inline void -invltlb(void) -{ - u_int temp; - /* - * This should be implemented as load_cr3(rcr3()) when load_cr3() - * is inlined. - */ - __asm __volatile("movl %%cr3, %0; movl %0, %%cr3" : "=r" (temp) - : : "memory"); -#ifdef SWTCH_OPTIM_STATS - ++tlb_flush_count; -#endif -} - -#endif /* SMP && _KERNEL */ - static __inline u_short inw(u_int port) { @@ -364,15 +308,6 @@ ia32_pause(void) } static __inline u_int -rcr2(void) -{ - u_int data; - - __asm __volatile("movl %%cr2,%0" : "=r" (data)); - return (data); -} - -static __inline u_int read_eflags(void) { u_int ef; @@ -426,6 +361,86 @@ wrmsr(u_int msr, u_int64_t newval) __asm __volatile("wrmsr" : : "A" (newval), "c" (msr)); } +static __inline void +load_cr0(u_int data) +{ + + __asm __volatile("movl %0,%%cr0" : : "r" (data)); +} + +static __inline u_int +rcr0(void) +{ + u_int data; + + __asm __volatile("movl %%cr0,%0" : "=r" (data)); + return (data); +} + +static __inline u_int +rcr2(void) +{ + u_int data; + + __asm __volatile("movl %%cr2,%0" : "=r" (data)); + return (data); +} + +static __inline void +load_cr3(u_int data) +{ + + __asm __volatile("movl %0,%%cr3" : : "r" (data) : "memory"); +#if defined(SWTCH_OPTIM_STATS) + ++tlb_flush_count; +#endif +} + +static __inline u_int +rcr3(void) +{ + u_int data; + + __asm __volatile("movl %%cr3,%0" : "=r" (data)); + return (data); +} + +static __inline void +load_cr4(u_int data) +{ + __asm __volatile("movl %0,%%cr4" : : "r" (data)); +} + +static __inline u_int +rcr4(void) +{ + u_int data; + + __asm __volatile("movl %%cr4,%0" : "=r" (data)); + return (data); +} + +/* + * Global TLB flush (except for thise for pages marked PG_G) + */ +static __inline void +invltlb(void) +{ + + load_cr3(rcr3()); +} + +/* + * TLB flush for an individual page (even if it has PG_G). + * Only works on 486+ CPUs (i386 does not have PG_G). 
+ */ +static __inline void +invlpg(u_int addr) +{ + + __asm __volatile("invlpg %0" : : "m" (*(char *)addr) : "memory"); +} + static __inline u_int rfs(void) { @@ -587,6 +602,8 @@ intr_restore(register_t eflags) int breakpoint(void); u_int bsfl(u_int mask); u_int bsrl(u_int mask); +void cpu_invlpg(u_int addr); +void cpu_invlpg_range(u_int start, u_int end); void disable_intr(void); void do_cpuid(u_int ax, u_int *p); void enable_intr(void); @@ -597,8 +614,14 @@ void insl(u_int port, void *addr, size_t cnt); void insw(u_int port, void *addr, size_t cnt); void invd(void); void invlpg(u_int addr); +void invlpg_range(u_int start, u_int end); void invltlb(void); u_short inw(u_int port); +void load_cr0(u_int cr0); +void load_cr3(u_int cr3); +void load_cr4(u_int cr4); +void load_fs(u_int sel); +void load_gs(u_int sel); void outb(u_int port, u_char data); void outl(u_int port, u_int data); void outsb(u_int port, void *addr, size_t cnt); @@ -606,7 +629,12 @@ void outsl(u_int port, void *addr, size_t cnt); void outsw(u_int port, void *addr, size_t cnt); void outw(u_int port, u_short data); void ia32_pause(void); +u_int rcr0(void); u_int rcr2(void); +u_int rcr3(void); +u_int rcr4(void); +u_int rfs(void); +u_int rgs(void); u_int64_t rdmsr(u_int msr); u_int64_t rdpmc(u_int pmc); u_int64_t rdtsc(void); @@ -614,10 +642,6 @@ u_int read_eflags(void); void wbinvd(void); void write_eflags(u_int ef); void wrmsr(u_int msr, u_int64_t newval); -u_int rfs(void); -u_int rgs(void); -void load_fs(u_int sel); -void load_gs(u_int sel); u_int rdr0(void); void load_dr0(u_int dr0); u_int rdr1(void); @@ -639,13 +663,7 @@ void intr_restore(register_t ef); #endif /* __GNUC__ */ -void load_cr0(u_int cr0); -void load_cr3(u_int cr3); -void load_cr4(u_int cr4); void ltr(u_short sel); -u_int rcr0(void); -u_int rcr3(void); -u_int rcr4(void); void reset_dbregs(void); __END_DECLS diff --git a/sys/amd64/include/mptable.h b/sys/amd64/include/mptable.h index 63fec0e..29e9c6e 100644 --- a/sys/amd64/include/mptable.h +++ b/sys/amd64/include/mptable.h @@ -288,6 +288,14 @@ extern pt_entry_t *SMPpt; struct pcb stoppcbs[MAXCPU]; +#ifdef APIC_IO +/* Variables needed for SMP tlb shootdown. */ +vm_offset_t smp_tlb_addr1; +vm_offset_t smp_tlb_addr2; +volatile int smp_tlb_wait; +static struct mtx smp_tlb_mtx; +#endif + /* * Local data and functions. 
*/ @@ -336,6 +344,9 @@ init_locks(void) #ifdef USE_COMLOCK mtx_init(&com_mtx, "com", NULL, MTX_SPIN); #endif /* USE_COMLOCK */ +#ifdef APIC_IO + mtx_init(&smp_tlb_mtx, "tlb", NULL, MTX_SPIN); +#endif } /* @@ -605,6 +616,10 @@ mp_enable(u_int boot_addr) /* install an inter-CPU IPI for TLB invalidation */ setidt(XINVLTLB_OFFSET, Xinvltlb, SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); + setidt(XINVLPG_OFFSET, Xinvlpg, + SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); + setidt(XINVLRNG_OFFSET, Xinvlrng, + SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); /* install an inter-CPU IPI for forwarding hardclock() */ setidt(XHARDCLOCK_OFFSET, Xhardclock, @@ -2190,48 +2205,237 @@ start_ap(int logical_cpu, u_int boot_addr) return 0; /* return FAILURE */ } -#if defined(APIC_IO) && defined(COUNT_XINVLTLB_HITS) -u_int xhits[MAXCPU]; -SYSCTL_OPAQUE(_debug, OID_AUTO, xhits, CTLFLAG_RW, &xhits, sizeof(xhits), - "IU", ""); +#if defined(APIC_IO) + +#ifdef COUNT_XINVLTLB_HITS +u_int xhits_gbl[MAXCPU]; +u_int xhits_pg[MAXCPU]; +u_int xhits_rng[MAXCPU]; +SYSCTL_NODE(_debug, OID_AUTO, xhits, CTLFLAG_RW, 0, ""); +SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, global, CTLFLAG_RW, &xhits_gbl, + sizeof(xhits_gbl), "IU", ""); +SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, page, CTLFLAG_RW, &xhits_pg, + sizeof(xhits_pg), "IU", ""); +SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, range, CTLFLAG_RW, &xhits_rng, + sizeof(xhits_rng), "IU", ""); + +u_int ipi_global; +u_int ipi_page; +u_int ipi_range; +u_int ipi_range_size; +SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_global, CTLFLAG_RW, &ipi_global, 0, ""); +SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_page, CTLFLAG_RW, &ipi_page, 0, ""); +SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range, CTLFLAG_RW, &ipi_range, 0, ""); +SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range_size, CTLFLAG_RW, &ipi_range_size, + 0, ""); + +u_int ipi_masked_global; +u_int ipi_masked_page; +u_int ipi_masked_range; +u_int ipi_masked_range_size; +SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_global, CTLFLAG_RW, + &ipi_masked_global, 0, ""); +SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_page, CTLFLAG_RW, + &ipi_masked_page, 0, ""); +SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_range, CTLFLAG_RW, + &ipi_masked_range, 0, ""); +SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_range_size, CTLFLAG_RW, + &ipi_masked_range_size, 0, ""); #endif /* * Flush the TLB on all other CPU's + */ +static void +smp_tlb_shootdown(u_int vector, vm_offset_t addr1, vm_offset_t addr2) +{ + u_int ncpu; + register_t eflags; + + ncpu = mp_ncpus - 1; /* does not shootdown self */ + if (ncpu < 1) + return; /* no other cpus */ + eflags = read_eflags(); + if ((eflags & PSL_I) == 0) + panic("absolutely cannot call smp_ipi_shootdown with interrupts already disabled"); + mtx_lock_spin(&smp_tlb_mtx); + smp_tlb_addr1 = addr1; + smp_tlb_addr2 = addr2; + atomic_store_rel_int(&smp_tlb_wait, 0); + ipi_all_but_self(vector); + while (smp_tlb_wait < ncpu) + ia32_pause(); + mtx_unlock_spin(&smp_tlb_mtx); +} + +/* + * This is about as magic as it gets. fortune(1) has got similar code + * for reversing bits in a word. Who thinks up this stuff?? + * + * Yes, it does appear to be consistently faster than: + * while (i = ffs(m)) { + * m >>= i; + * bits++; + * } + * and + * while (lsb = (m & -m)) { // This is magic too + * m &= ~lsb; // or: m ^= lsb + * bits++; + * } + * Both of these latter forms do some very strange things on gcc-3.1 with + * -mcpu=pentiumpro and/or -march=pentiumpro and/or -O or -O2. + * There is probably an SSE or MMX popcnt instruction. 
* - * XXX: Needs to handshake and wait for completion before proceding. + * I wonder if this should be in libkern? + * + * XXX Stop the presses! Another one: + * static __inline u_int32_t + * popcnt1(u_int32_t v) + * { + * v -= ((v >> 1) & 0x55555555); + * v = (v & 0x33333333) + ((v >> 2) & 0x33333333); + * v = (v + (v >> 4)) & 0x0F0F0F0F; + * return (v * 0x01010101) >> 24; + * } + * The downside is that it has a multiply. With a pentium3 with + * -mcpu=pentiumpro and -march=pentiumpro then gcc-3.1 will use + * an imull, and in that case it is faster. In most other cases + * it appears slightly slower. */ +static __inline u_int32_t +popcnt(u_int32_t m) +{ + + m = (m & 0x55555555) + ((m & 0xaaaaaaaa) >> 1); + m = (m & 0x33333333) + ((m & 0xcccccccc) >> 2); + m = (m & 0x0f0f0f0f) + ((m & 0xf0f0f0f0) >> 4); + m = (m & 0x00ff00ff) + ((m & 0xff00ff00) >> 8); + m = (m & 0x0000ffff) + ((m & 0xffff0000) >> 16); + return m; +} + +static void +smp_targeted_tlb_shootdown(u_int mask, u_int vector, vm_offset_t addr1, vm_offset_t addr2) +{ + int ncpu, othercpus; + register_t eflags; + + othercpus = mp_ncpus - 1; + if (mask == (u_int)-1) { + ncpu = othercpus; + if (ncpu < 1) + return; + } else { + /* XXX there should be a pcpu self mask */ + mask &= ~(1 << PCPU_GET(cpuid)); + if (mask == 0) + return; + ncpu = popcnt(mask); + if (ncpu > othercpus) { + /* XXX this should be a panic offence */ + printf("SMP: tlb shootdown to %d other cpus (only have %d)\n", + ncpu, othercpus); + ncpu = othercpus; + } + /* XXX should be a panic, implied by mask == 0 above */ + if (ncpu < 1) + return; + } + eflags = read_eflags(); + if ((eflags & PSL_I) == 0) + panic("absolutely cannot call smp_targeted_ipi_shootdown with interrupts already disabled"); + mtx_lock_spin(&smp_tlb_mtx); + smp_tlb_addr1 = addr1; + smp_tlb_addr2 = addr2; + atomic_store_rel_int(&smp_tlb_wait, 0); + if (mask == (u_int)-1) + ipi_all_but_self(vector); + else + ipi_selected(mask, vector); + while (smp_tlb_wait < ncpu) + ia32_pause(); + mtx_unlock_spin(&smp_tlb_mtx); +} +#endif + void smp_invltlb(void) { #if defined(APIC_IO) - if (smp_started) - ipi_all_but_self(IPI_INVLTLB); + if (smp_started) { + smp_tlb_shootdown(IPI_INVLTLB, 0, 0); +#ifdef COUNT_XINVLTLB_HITS + ipi_global++; +#endif + } #endif /* APIC_IO */ } void -invlpg(u_int addr) +smp_invlpg(vm_offset_t addr) { - __asm __volatile("invlpg (%0)"::"r"(addr):"memory"); +#if defined(APIC_IO) + if (smp_started) { + smp_tlb_shootdown(IPI_INVLPG, addr, 0); +#ifdef COUNT_XINVLTLB_HITS + ipi_page++; +#endif + } +#endif /* APIC_IO */ +} - /* send a message to the other CPUs */ - smp_invltlb(); +void +smp_invlpg_range(vm_offset_t addr1, vm_offset_t addr2) +{ +#if defined(APIC_IO) + if (smp_started) { + smp_tlb_shootdown(IPI_INVLRNG, addr1, addr2); +#ifdef COUNT_XINVLTLB_HITS + ipi_range++; + ipi_range_size += (addr2 - addr1) / PAGE_SIZE; +#endif + } +#endif /* APIC_IO */ } void -invltlb(void) +smp_masked_invltlb(u_int mask) { - u_long temp; +#if defined(APIC_IO) + if (smp_started) { + smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, 0, 0); +#ifdef COUNT_XINVLTLB_HITS + ipi_masked_global++; +#endif + } +#endif /* APIC_IO */ +} - /* - * This should be implemented as load_cr3(rcr3()) when load_cr3() is - * inlined. 
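Both counting tricks discussed in the comment above are easy to cross-check against a naive loop. The standalone program below (not part of the patch) compares the shift-and-mask popcnt() used by the targeted shootdown code and the multiply-based popcnt1() variant quoted in the comment over a million random words:

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

/* Shift-and-mask version, as used to size the target CPU set. */
static uint32_t
popcnt(uint32_t m)
{

        m = (m & 0x55555555) + ((m & 0xaaaaaaaa) >> 1);
        m = (m & 0x33333333) + ((m & 0xcccccccc) >> 2);
        m = (m & 0x0f0f0f0f) + ((m & 0xf0f0f0f0) >> 4);
        m = (m & 0x00ff00ff) + ((m & 0xff00ff00) >> 8);
        m = (m & 0x0000ffff) + ((m & 0xffff0000) >> 16);
        return (m);
}

/* Multiply-based version quoted in the comment. */
static uint32_t
popcnt1(uint32_t v)
{

        v -= ((v >> 1) & 0x55555555);
        v = (v & 0x33333333) + ((v >> 2) & 0x33333333);
        v = (v + (v >> 4)) & 0x0F0F0F0F;
        return ((v * 0x01010101) >> 24);
}

/* Reference: count the set bits one at a time. */
static uint32_t
slowcnt(uint32_t v)
{
        uint32_t bits = 0;

        while (v != 0) {
                bits += v & 1;
                v >>= 1;
        }
        return (bits);
}

int
main(void)
{
        uint32_t i, v;

        for (i = 0; i < 1000000; i++) {
                v = (uint32_t)rand() ^ ((uint32_t)rand() << 16);
                if (popcnt(v) != slowcnt(v) || popcnt1(v) != slowcnt(v)) {
                        printf("mismatch at 0x%08x\n", v);
                        return (1);
                }
        }
        printf("popcnt() and popcnt1() agree with the naive count\n");
        return (0);
}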
- */ - __asm __volatile("movl %%cr3, %0; movl %0, %%cr3":"=r"(temp) :: "memory"); +void +smp_masked_invlpg(u_int mask, vm_offset_t addr) +{ +#if defined(APIC_IO) + if (smp_started) { + smp_targeted_tlb_shootdown(mask, IPI_INVLPG, addr, 0); +#ifdef COUNT_XINVLTLB_HITS + ipi_masked_page++; +#endif + } +#endif /* APIC_IO */ +} - /* send a message to the other CPUs */ - smp_invltlb(); +void +smp_masked_invlpg_range(u_int mask, vm_offset_t addr1, vm_offset_t addr2) +{ +#if defined(APIC_IO) + if (smp_started) { + smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, addr1, addr2); +#ifdef COUNT_XINVLTLB_HITS + ipi_masked_range++; + ipi_masked_range_size += (addr2 - addr1) / PAGE_SIZE; +#endif + } +#endif /* APIC_IO */ } @@ -2251,7 +2455,7 @@ ap_init(void) /* spin */ ; /* BSP may have changed PTD while we were waiting */ - cpu_invltlb(); + invltlb(); #if defined(I586_CPU) && !defined(NO_F00F_HACK) lidt(&r_idt); @@ -2290,6 +2494,9 @@ ap_init(void) /* Build our map of 'other' CPUs. */ PCPU_SET(other_cpus, all_cpus & ~PCPU_GET(cpumask)); + if (bootverbose) + apic_dump("ap_init()"); + printf("SMP: AP CPU #%d Launched!\n", PCPU_GET(cpuid)); if (smp_cpus == mp_ncpus) { @@ -2325,7 +2532,8 @@ forwarded_statclock(struct trapframe frame) { mtx_lock_spin(&sched_lock); - statclock_process(curthread->td_kse, TRAPF_PC(&frame), TRAPF_USERMODE(&frame)); + statclock_process(curthread->td_kse, TRAPF_PC(&frame), + TRAPF_USERMODE(&frame)); mtx_unlock_spin(&sched_lock); } diff --git a/sys/amd64/include/pmap.h b/sys/amd64/include/pmap.h index e6ac669..e0789fc 100644 --- a/sys/amd64/include/pmap.h +++ b/sys/amd64/include/pmap.h @@ -151,7 +151,7 @@ extern pt_entry_t PTmap[], APTmap[]; extern pd_entry_t PTD[], APTD[]; extern pd_entry_t PTDpde, APTDpde; -extern pd_entry_t IdlePTD; /* physical address of "Idle" state directory */ +extern pd_entry_t *IdlePTD; /* physical address of "Idle" state directory */ #endif #ifdef _KERNEL @@ -253,14 +253,15 @@ extern char *ptvmmap; /* poor name! */ extern vm_offset_t virtual_avail; extern vm_offset_t virtual_end; -void pmap_bootstrap( vm_offset_t, vm_offset_t); +void pmap_bootstrap(vm_offset_t, vm_offset_t); void *pmap_mapdev(vm_offset_t, vm_size_t); void pmap_unmapdev(vm_offset_t, vm_size_t); pt_entry_t *pmap_pte(pmap_t, vm_offset_t) __pure2; vm_page_t pmap_use_pt(pmap_t, vm_offset_t); -#ifdef SMP void pmap_set_opt(void); -#endif +void pmap_invalidate_page(pmap_t, vm_offset_t); +void pmap_invalidate_range(pmap_t, vm_offset_t, vm_offset_t); +void pmap_invalidate_all(pmap_t); #endif /* _KERNEL */ diff --git a/sys/amd64/include/smp.h b/sys/amd64/include/smp.h index 872c5ec..d669c51 100644 --- a/sys/amd64/include/smp.h +++ b/sys/amd64/include/smp.h @@ -51,6 +51,8 @@ extern int current_postcode; /** XXX currently in mp_machdep.c */ * Interprocessor interrupts for SMP. 
*/ #define IPI_INVLTLB XINVLTLB_OFFSET +#define IPI_INVLPG XINVLPG_OFFSET +#define IPI_INVLRNG XINVLRNG_OFFSET #define IPI_RENDEZVOUS XRENDEZVOUS_OFFSET #define IPI_AST XCPUAST_OFFSET #define IPI_STOP XCPUSTOP_OFFSET @@ -107,7 +109,6 @@ void assign_apic_irq(int apic, int intpin, int irq); void revoke_apic_irq(int irq); void bsp_apic_configure(void); void init_secondary(void); -void smp_invltlb(void); void forward_statclock(void); void forwarded_statclock(struct trapframe frame); void forward_hardclock(void); @@ -119,6 +120,13 @@ void ipi_self(u_int ipi); #ifdef APIC_INTR_REORDER void set_lapic_isrloc(int, int); #endif /* APIC_INTR_REORDER */ +void smp_invlpg(vm_offset_t addr); +void smp_masked_invlpg(u_int mask, vm_offset_t addr); +void smp_invlpg_range(vm_offset_t startva, vm_offset_t endva); +void smp_masked_invlpg_range(u_int mask, vm_offset_t startva, + vm_offset_t endva); +void smp_invltlb(void); +void smp_masked_invltlb(u_int mask); /* global data in mpapic.c */ extern volatile lapic_t lapic; diff --git a/sys/amd64/isa/intr_machdep.h b/sys/amd64/isa/intr_machdep.h index 41542d0..7179268 100644 --- a/sys/amd64/isa/intr_machdep.h +++ b/sys/amd64/isa/intr_machdep.h @@ -88,6 +88,7 @@ /* IDT vector base for regular (aka. slow) and fast interrupts */ #define TPR_SLOW_INTS 0x20 #define TPR_FAST_INTS 0x60 +/* XXX note that the AST interrupt is at 0x50 */ /* blocking values for local APIC Task Priority Register */ #define TPR_BLOCK_HWI 0x4f /* hardware INTs */ @@ -104,20 +105,23 @@ #endif /** TEST_TEST1 */ /* TLB shootdowns */ -#define XINVLTLB_OFFSET (ICU_OFFSET + 112) +#define XINVLTLB_OFFSET (ICU_OFFSET + 112) /* 0x90 */ +#define XINVLPG_OFFSET (ICU_OFFSET + 113) /* 0x91 */ +#define XINVLRNG_OFFSET (ICU_OFFSET + 114) /* 0x92 */ /* inter-cpu clock handling */ -#define XHARDCLOCK_OFFSET (ICU_OFFSET + 113) -#define XSTATCLOCK_OFFSET (ICU_OFFSET + 114) +#define XHARDCLOCK_OFFSET (ICU_OFFSET + 120) /* 0x98 */ +#define XSTATCLOCK_OFFSET (ICU_OFFSET + 121) /* 0x99 */ /* inter-CPU rendezvous */ -#define XRENDEZVOUS_OFFSET (ICU_OFFSET + 115) +#define XRENDEZVOUS_OFFSET (ICU_OFFSET + 122) /* 0x9A */ /* IPI to generate an additional software trap at the target CPU */ -#define XCPUAST_OFFSET (ICU_OFFSET + 48) +/* XXX in the middle of the interrupt range, overlapping IRQ48 */ +#define XCPUAST_OFFSET (ICU_OFFSET + 48) /* 0x50 */ /* IPI to signal CPUs to stop and wait for another CPU to restart them */ -#define XCPUSTOP_OFFSET (ICU_OFFSET + 128) +#define XCPUSTOP_OFFSET (ICU_OFFSET + 128) /* 0xA0 */ /* * Note: this vector MUST be xxxx1111, 32 + 223 = 255 = 0xff: @@ -194,7 +198,9 @@ inthand_t IDTVEC(intr28), IDTVEC(intr29), IDTVEC(intr30), IDTVEC(intr31); inthand_t - Xinvltlb, /* TLB shootdowns */ + Xinvltlb, /* TLB shootdowns - global */ + Xinvlpg, /* TLB shootdowns - 1 page */ + Xinvlrng, /* TLB shootdowns - page range */ Xhardclock, /* Forward hardclock() */ Xstatclock, /* Forward statclock() */ Xcpuast, /* Additional software trap on other cpu */ diff --git a/sys/conf/options.i386 b/sys/conf/options.i386 index 9f0d22d..ad82c05 100644 --- a/sys/conf/options.i386 +++ b/sys/conf/options.i386 @@ -1,10 +1,11 @@ # $FreeBSD$ # Options specific to the i386 platform kernels -DISABLE_PSE MATH_EMULATE opt_math_emulate.h GPL_MATH_EMULATE opt_math_emulate.h +DISABLE_PSE opt_pmap.h PMAP_SHPGPERPROC opt_pmap.h +DISABLE_PG_G opt_pmap.h PPC_PROBE_CHIPSET opt_ppc.h PPC_DEBUG opt_ppc.h SHOW_BUSYBUFS diff --git a/sys/conf/options.pc98 b/sys/conf/options.pc98 index 49325cb..ed2e2c6 100644 --- a/sys/conf/options.pc98 +++ 
b/sys/conf/options.pc98 @@ -1,10 +1,11 @@ # $FreeBSD$ # Options specific to the pc98 platform kernels -DISABLE_PSE MATH_EMULATE opt_math_emulate.h GPL_MATH_EMULATE opt_math_emulate.h +DISABLE_PSE opt_pmap.h PMAP_SHPGPERPROC opt_pmap.h +DISABLE_PG_G opt_pmap.h PPC_PROBE_CHIPSET opt_ppc.h PPC_DEBUG opt_ppc.h SHOW_BUSYBUFS diff --git a/sys/i386/i386/apic_vector.s b/sys/i386/i386/apic_vector.s index 8490b1b..569ed50 100644 --- a/sys/i386/i386/apic_vector.s +++ b/sys/i386/i386/apic_vector.s @@ -260,30 +260,107 @@ Xspuriousint: iret /* - * Handle TLB shootdowns. + * Global address space TLB shootdown. */ .text SUPERALIGN_TEXT .globl Xinvltlb Xinvltlb: pushl %eax + pushl %ds + movl $KDSEL, %eax /* Kernel data selector */ + mov %ax, %ds #ifdef COUNT_XINVLTLB_HITS pushl %fs - movl $KPSEL, %eax + movl $KPSEL, %eax /* Private space selector */ mov %ax, %fs movl PCPU(CPUID), %eax popl %fs - ss - incl xhits(,%eax,4) + incl xhits_gbl(,%eax,4) #endif /* COUNT_XINVLTLB_HITS */ movl %cr3, %eax /* invalidate the TLB */ movl %eax, %cr3 - ss /* stack segment, avoid %ds load */ movl $0, lapic+LA_EOI /* End Of Interrupt to APIC */ + lock + incl smp_tlb_wait + + popl %ds + popl %eax + iret + +/* + * Single page TLB shootdown + */ + .text + SUPERALIGN_TEXT + .globl Xinvlpg +Xinvlpg: + pushl %eax + pushl %ds + movl $KDSEL, %eax /* Kernel data selector */ + mov %ax, %ds + +#ifdef COUNT_XINVLTLB_HITS + pushl %fs + movl $KPSEL, %eax /* Private space selector */ + mov %ax, %fs + movl PCPU(CPUID), %eax + popl %fs + incl xhits_pg(,%eax,4) +#endif /* COUNT_XINVLTLB_HITS */ + + movl smp_tlb_addr1, %eax + invlpg (%eax) /* invalidate single page */ + + movl $0, lapic+LA_EOI /* End Of Interrupt to APIC */ + + lock + incl smp_tlb_wait + + popl %ds + popl %eax + iret + +/* + * Page range TLB shootdown. + */ + .text + SUPERALIGN_TEXT + .globl Xinvlrng +Xinvlrng: + pushl %eax + pushl %edx + pushl %ds + movl $KDSEL, %eax /* Kernel data selector */ + mov %ax, %ds + +#ifdef COUNT_XINVLTLB_HITS + pushl %fs + movl $KPSEL, %eax /* Private space selector */ + mov %ax, %fs + movl PCPU(CPUID), %eax + popl %fs + incl xhits_rng(,%eax,4) +#endif /* COUNT_XINVLTLB_HITS */ + + movl smp_tlb_addr1, %edx + movl smp_tlb_addr2, %eax +1: invlpg (%edx) /* invalidate single page */ + addl $PAGE_SIZE, %edx + cmpl %edx, %eax + jb 1b + + movl $0, lapic+LA_EOI /* End Of Interrupt to APIC */ + + lock + incl smp_tlb_wait + + popl %ds + popl %edx popl %eax iret diff --git a/sys/i386/i386/bios.c b/sys/i386/i386/bios.c index 0312adf..6e0837c 100644 --- a/sys/i386/i386/bios.c +++ b/sys/i386/i386/bios.c @@ -323,7 +323,8 @@ bios16(struct bios_args *args, char *fmt, ...) va_list ap; int flags = BIOSCODE_FLAG | BIOSDATA_FLAG; u_int i, arg_start, arg_end; - u_int *pte, *ptd; + pt_entry_t *pte; + pd_entry_t *ptd; arg_start = 0xffffffff; arg_end = 0; @@ -382,19 +383,19 @@ bios16(struct bios_args *args, char *fmt, ...) args->seg.code32.base = (u_int)&bios16_jmp & PG_FRAME; args->seg.code32.limit = 0xffff; - ptd = (u_int *)rcr3(); + ptd = (pd_entry_t *)rcr3(); if (ptd == (u_int *)IdlePTD) { /* * no page table, so create one and install it. */ - pte = (u_int *)malloc(PAGE_SIZE, M_TEMP, M_WAITOK); - ptd = (u_int *)((u_int)ptd + KERNBASE); + pte = (pt_entry_t *)malloc(PAGE_SIZE, M_TEMP, M_WAITOK); + ptd = (pd_entry_t *)((u_int)ptd + KERNBASE); *ptd = vtophys(pte) | PG_RW | PG_V; } else { /* * this is a user-level page table */ - pte = (u_int *)&PTmap; + pte = PTmap; } /* * install pointer to page 0. 
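For anyone chasing the vector comments in intr_machdep.h above: assuming ICU_OFFSET is 0x20, as the hex annotations imply, the shootdown vectors land at 0x90 through 0x92, the forwarded clock IPIs move up to 0x98 and 0x99, rendezvous to 0x9a, the AST IPI stays at 0x50 in the middle of the hardware interrupt range, and the stop IPI remains at 0xa0. A throwaway check of that arithmetic:

#include <stdio.h>

#define ICU_OFFSET      0x20    /* assumed IDT base for IRQ vectors on i386 */

int
main(void)
{

        printf("XINVLTLB    = 0x%02x\n", ICU_OFFSET + 112);     /* 0x90 */
        printf("XINVLPG     = 0x%02x\n", ICU_OFFSET + 113);     /* 0x91 */
        printf("XINVLRNG    = 0x%02x\n", ICU_OFFSET + 114);     /* 0x92 */
        printf("XHARDCLOCK  = 0x%02x\n", ICU_OFFSET + 120);     /* 0x98 */
        printf("XSTATCLOCK  = 0x%02x\n", ICU_OFFSET + 121);     /* 0x99 */
        printf("XRENDEZVOUS = 0x%02x\n", ICU_OFFSET + 122);     /* 0x9a */
        printf("XCPUAST     = 0x%02x\n", ICU_OFFSET + 48);      /* 0x50 */
        printf("XCPUSTOP    = 0x%02x\n", ICU_OFFSET + 128);     /* 0xa0 */
        return (0);
}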
we don't need to flush the tlb, @@ -451,7 +452,7 @@ bios16(struct bios_args *args, char *fmt, ...) i = bios16_call(&args->r, stack_top); - if (pte == (u_int *)&PTmap) { + if (pte == PTmap) { *pte = 0; /* remove entry */ } else { *ptd = 0; /* remove page table */ @@ -461,7 +462,7 @@ bios16(struct bios_args *args, char *fmt, ...) /* * XXX only needs to be invlpg(0) but that doesn't work on the 386 */ - invltlb(); + pmap_invalidate_all(kernel_pmap); return (i); } diff --git a/sys/i386/i386/db_interface.c b/sys/i386/i386/db_interface.c index 2ba81da..ec32a58 100644 --- a/sys/i386/i386/db_interface.c +++ b/sys/i386/i386/db_interface.c @@ -276,7 +276,7 @@ db_write_bytes(addr, size, data) } } - invltlb(); + pmap_invalidate_all(kernel_pmap); } dst = (char *)addr; @@ -292,7 +292,7 @@ db_write_bytes(addr, size, data) if (ptep1) *ptep1 = oldmap1; - invltlb(); + pmap_invalidate_all(kernel_pmap); } } diff --git a/sys/i386/i386/locore.s b/sys/i386/i386/locore.s index d06065d..94a3a10 100644 --- a/sys/i386/i386/locore.s +++ b/sys/i386/i386/locore.s @@ -127,6 +127,7 @@ HIDENAME(tmpstk): .globl bootinfo bootinfo: .space BOOTINFO_SIZE /* bootinfo that we can handle */ + .globl KERNend KERNend: .long 0 /* phys addr end of kernel (just after bss) */ physfree: .long 0 /* phys addr of next free page */ @@ -381,12 +382,6 @@ begin: movl IdlePTD,%esi movl %esi,(KSTACK_PAGES*PAGE_SIZE-PCB_SIZE+PCB_CR3)(%eax) - testl $CPUID_PGE, R(cpu_feature) - jz 1f - movl %cr4, %eax - orl $CR4_PGE, %eax - movl %eax, %cr4 -1: pushl physfree /* value of first for init386(first) */ call init386 /* wire 386 chip for unix operation */ @@ -809,14 +804,7 @@ no_kernend: jne map_read_write #endif xorl %edx,%edx - -#if !defined(SMP) - testl $CPUID_PGE, R(cpu_feature) - jz 2f - orl $PG_G,%edx -#endif - -2: movl $R(etext),%ecx + movl $R(etext),%ecx addl $PAGE_MASK,%ecx shrl $PAGE_SHIFT,%ecx fillkptphys(%edx) @@ -827,13 +815,7 @@ no_kernend: andl $~PAGE_MASK, %eax map_read_write: movl $PG_RW,%edx -#if !defined(SMP) - testl $CPUID_PGE, R(cpu_feature) - jz 1f - orl $PG_G,%edx -#endif - -1: movl R(KERNend),%ecx + movl R(KERNend),%ecx subl %eax,%ecx shrl $PAGE_SHIFT,%ecx fillkptphys(%edx) diff --git a/sys/i386/i386/mp_machdep.c b/sys/i386/i386/mp_machdep.c index 63fec0e..29e9c6e 100644 --- a/sys/i386/i386/mp_machdep.c +++ b/sys/i386/i386/mp_machdep.c @@ -288,6 +288,14 @@ extern pt_entry_t *SMPpt; struct pcb stoppcbs[MAXCPU]; +#ifdef APIC_IO +/* Variables needed for SMP tlb shootdown. */ +vm_offset_t smp_tlb_addr1; +vm_offset_t smp_tlb_addr2; +volatile int smp_tlb_wait; +static struct mtx smp_tlb_mtx; +#endif + /* * Local data and functions. 
*/ @@ -336,6 +344,9 @@ init_locks(void) #ifdef USE_COMLOCK mtx_init(&com_mtx, "com", NULL, MTX_SPIN); #endif /* USE_COMLOCK */ +#ifdef APIC_IO + mtx_init(&smp_tlb_mtx, "tlb", NULL, MTX_SPIN); +#endif } /* @@ -605,6 +616,10 @@ mp_enable(u_int boot_addr) /* install an inter-CPU IPI for TLB invalidation */ setidt(XINVLTLB_OFFSET, Xinvltlb, SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); + setidt(XINVLPG_OFFSET, Xinvlpg, + SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); + setidt(XINVLRNG_OFFSET, Xinvlrng, + SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); /* install an inter-CPU IPI for forwarding hardclock() */ setidt(XHARDCLOCK_OFFSET, Xhardclock, @@ -2190,48 +2205,237 @@ start_ap(int logical_cpu, u_int boot_addr) return 0; /* return FAILURE */ } -#if defined(APIC_IO) && defined(COUNT_XINVLTLB_HITS) -u_int xhits[MAXCPU]; -SYSCTL_OPAQUE(_debug, OID_AUTO, xhits, CTLFLAG_RW, &xhits, sizeof(xhits), - "IU", ""); +#if defined(APIC_IO) + +#ifdef COUNT_XINVLTLB_HITS +u_int xhits_gbl[MAXCPU]; +u_int xhits_pg[MAXCPU]; +u_int xhits_rng[MAXCPU]; +SYSCTL_NODE(_debug, OID_AUTO, xhits, CTLFLAG_RW, 0, ""); +SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, global, CTLFLAG_RW, &xhits_gbl, + sizeof(xhits_gbl), "IU", ""); +SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, page, CTLFLAG_RW, &xhits_pg, + sizeof(xhits_pg), "IU", ""); +SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, range, CTLFLAG_RW, &xhits_rng, + sizeof(xhits_rng), "IU", ""); + +u_int ipi_global; +u_int ipi_page; +u_int ipi_range; +u_int ipi_range_size; +SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_global, CTLFLAG_RW, &ipi_global, 0, ""); +SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_page, CTLFLAG_RW, &ipi_page, 0, ""); +SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range, CTLFLAG_RW, &ipi_range, 0, ""); +SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range_size, CTLFLAG_RW, &ipi_range_size, + 0, ""); + +u_int ipi_masked_global; +u_int ipi_masked_page; +u_int ipi_masked_range; +u_int ipi_masked_range_size; +SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_global, CTLFLAG_RW, + &ipi_masked_global, 0, ""); +SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_page, CTLFLAG_RW, + &ipi_masked_page, 0, ""); +SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_range, CTLFLAG_RW, + &ipi_masked_range, 0, ""); +SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_range_size, CTLFLAG_RW, + &ipi_masked_range_size, 0, ""); #endif /* * Flush the TLB on all other CPU's + */ +static void +smp_tlb_shootdown(u_int vector, vm_offset_t addr1, vm_offset_t addr2) +{ + u_int ncpu; + register_t eflags; + + ncpu = mp_ncpus - 1; /* does not shootdown self */ + if (ncpu < 1) + return; /* no other cpus */ + eflags = read_eflags(); + if ((eflags & PSL_I) == 0) + panic("absolutely cannot call smp_ipi_shootdown with interrupts already disabled"); + mtx_lock_spin(&smp_tlb_mtx); + smp_tlb_addr1 = addr1; + smp_tlb_addr2 = addr2; + atomic_store_rel_int(&smp_tlb_wait, 0); + ipi_all_but_self(vector); + while (smp_tlb_wait < ncpu) + ia32_pause(); + mtx_unlock_spin(&smp_tlb_mtx); +} + +/* + * This is about as magic as it gets. fortune(1) has got similar code + * for reversing bits in a word. Who thinks up this stuff?? + * + * Yes, it does appear to be consistently faster than: + * while (i = ffs(m)) { + * m >>= i; + * bits++; + * } + * and + * while (lsb = (m & -m)) { // This is magic too + * m &= ~lsb; // or: m ^= lsb + * bits++; + * } + * Both of these latter forms do some very strange things on gcc-3.1 with + * -mcpu=pentiumpro and/or -march=pentiumpro and/or -O or -O2. + * There is probably an SSE or MMX popcnt instruction. 
* - * XXX: Needs to handshake and wait for completion before proceding. + * I wonder if this should be in libkern? + * + * XXX Stop the presses! Another one: + * static __inline u_int32_t + * popcnt1(u_int32_t v) + * { + * v -= ((v >> 1) & 0x55555555); + * v = (v & 0x33333333) + ((v >> 2) & 0x33333333); + * v = (v + (v >> 4)) & 0x0F0F0F0F; + * return (v * 0x01010101) >> 24; + * } + * The downside is that it has a multiply. With a pentium3 with + * -mcpu=pentiumpro and -march=pentiumpro then gcc-3.1 will use + * an imull, and in that case it is faster. In most other cases + * it appears slightly slower. */ +static __inline u_int32_t +popcnt(u_int32_t m) +{ + + m = (m & 0x55555555) + ((m & 0xaaaaaaaa) >> 1); + m = (m & 0x33333333) + ((m & 0xcccccccc) >> 2); + m = (m & 0x0f0f0f0f) + ((m & 0xf0f0f0f0) >> 4); + m = (m & 0x00ff00ff) + ((m & 0xff00ff00) >> 8); + m = (m & 0x0000ffff) + ((m & 0xffff0000) >> 16); + return m; +} + +static void +smp_targeted_tlb_shootdown(u_int mask, u_int vector, vm_offset_t addr1, vm_offset_t addr2) +{ + int ncpu, othercpus; + register_t eflags; + + othercpus = mp_ncpus - 1; + if (mask == (u_int)-1) { + ncpu = othercpus; + if (ncpu < 1) + return; + } else { + /* XXX there should be a pcpu self mask */ + mask &= ~(1 << PCPU_GET(cpuid)); + if (mask == 0) + return; + ncpu = popcnt(mask); + if (ncpu > othercpus) { + /* XXX this should be a panic offence */ + printf("SMP: tlb shootdown to %d other cpus (only have %d)\n", + ncpu, othercpus); + ncpu = othercpus; + } + /* XXX should be a panic, implied by mask == 0 above */ + if (ncpu < 1) + return; + } + eflags = read_eflags(); + if ((eflags & PSL_I) == 0) + panic("absolutely cannot call smp_targeted_ipi_shootdown with interrupts already disabled"); + mtx_lock_spin(&smp_tlb_mtx); + smp_tlb_addr1 = addr1; + smp_tlb_addr2 = addr2; + atomic_store_rel_int(&smp_tlb_wait, 0); + if (mask == (u_int)-1) + ipi_all_but_self(vector); + else + ipi_selected(mask, vector); + while (smp_tlb_wait < ncpu) + ia32_pause(); + mtx_unlock_spin(&smp_tlb_mtx); +} +#endif + void smp_invltlb(void) { #if defined(APIC_IO) - if (smp_started) - ipi_all_but_self(IPI_INVLTLB); + if (smp_started) { + smp_tlb_shootdown(IPI_INVLTLB, 0, 0); +#ifdef COUNT_XINVLTLB_HITS + ipi_global++; +#endif + } #endif /* APIC_IO */ } void -invlpg(u_int addr) +smp_invlpg(vm_offset_t addr) { - __asm __volatile("invlpg (%0)"::"r"(addr):"memory"); +#if defined(APIC_IO) + if (smp_started) { + smp_tlb_shootdown(IPI_INVLPG, addr, 0); +#ifdef COUNT_XINVLTLB_HITS + ipi_page++; +#endif + } +#endif /* APIC_IO */ +} - /* send a message to the other CPUs */ - smp_invltlb(); +void +smp_invlpg_range(vm_offset_t addr1, vm_offset_t addr2) +{ +#if defined(APIC_IO) + if (smp_started) { + smp_tlb_shootdown(IPI_INVLRNG, addr1, addr2); +#ifdef COUNT_XINVLTLB_HITS + ipi_range++; + ipi_range_size += (addr2 - addr1) / PAGE_SIZE; +#endif + } +#endif /* APIC_IO */ } void -invltlb(void) +smp_masked_invltlb(u_int mask) { - u_long temp; +#if defined(APIC_IO) + if (smp_started) { + smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, 0, 0); +#ifdef COUNT_XINVLTLB_HITS + ipi_masked_global++; +#endif + } +#endif /* APIC_IO */ +} - /* - * This should be implemented as load_cr3(rcr3()) when load_cr3() is - * inlined. 
- */ - __asm __volatile("movl %%cr3, %0; movl %0, %%cr3":"=r"(temp) :: "memory"); +void +smp_masked_invlpg(u_int mask, vm_offset_t addr) +{ +#if defined(APIC_IO) + if (smp_started) { + smp_targeted_tlb_shootdown(mask, IPI_INVLPG, addr, 0); +#ifdef COUNT_XINVLTLB_HITS + ipi_masked_page++; +#endif + } +#endif /* APIC_IO */ +} - /* send a message to the other CPUs */ - smp_invltlb(); +void +smp_masked_invlpg_range(u_int mask, vm_offset_t addr1, vm_offset_t addr2) +{ +#if defined(APIC_IO) + if (smp_started) { + smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, addr1, addr2); +#ifdef COUNT_XINVLTLB_HITS + ipi_masked_range++; + ipi_masked_range_size += (addr2 - addr1) / PAGE_SIZE; +#endif + } +#endif /* APIC_IO */ } @@ -2251,7 +2455,7 @@ ap_init(void) /* spin */ ; /* BSP may have changed PTD while we were waiting */ - cpu_invltlb(); + invltlb(); #if defined(I586_CPU) && !defined(NO_F00F_HACK) lidt(&r_idt); @@ -2290,6 +2494,9 @@ ap_init(void) /* Build our map of 'other' CPUs. */ PCPU_SET(other_cpus, all_cpus & ~PCPU_GET(cpumask)); + if (bootverbose) + apic_dump("ap_init()"); + printf("SMP: AP CPU #%d Launched!\n", PCPU_GET(cpuid)); if (smp_cpus == mp_ncpus) { @@ -2325,7 +2532,8 @@ forwarded_statclock(struct trapframe frame) { mtx_lock_spin(&sched_lock); - statclock_process(curthread->td_kse, TRAPF_PC(&frame), TRAPF_USERMODE(&frame)); + statclock_process(curthread->td_kse, TRAPF_PC(&frame), + TRAPF_USERMODE(&frame)); mtx_unlock_spin(&sched_lock); } diff --git a/sys/i386/i386/mpapic.c b/sys/i386/i386/mpapic.c index c42373b..85346bf 100644 --- a/sys/i386/i386/mpapic.c +++ b/sys/i386/i386/mpapic.c @@ -101,9 +101,6 @@ apic_initialize(void) #endif /** TEST_TEST1 */ lapic.svr = temp; - - if (bootverbose) - apic_dump("apic_initialize()"); } diff --git a/sys/i386/i386/mptable.c b/sys/i386/i386/mptable.c index 63fec0e..29e9c6e 100644 --- a/sys/i386/i386/mptable.c +++ b/sys/i386/i386/mptable.c @@ -288,6 +288,14 @@ extern pt_entry_t *SMPpt; struct pcb stoppcbs[MAXCPU]; +#ifdef APIC_IO +/* Variables needed for SMP tlb shootdown. */ +vm_offset_t smp_tlb_addr1; +vm_offset_t smp_tlb_addr2; +volatile int smp_tlb_wait; +static struct mtx smp_tlb_mtx; +#endif + /* * Local data and functions. 
*/ @@ -336,6 +344,9 @@ init_locks(void) #ifdef USE_COMLOCK mtx_init(&com_mtx, "com", NULL, MTX_SPIN); #endif /* USE_COMLOCK */ +#ifdef APIC_IO + mtx_init(&smp_tlb_mtx, "tlb", NULL, MTX_SPIN); +#endif } /* @@ -605,6 +616,10 @@ mp_enable(u_int boot_addr) /* install an inter-CPU IPI for TLB invalidation */ setidt(XINVLTLB_OFFSET, Xinvltlb, SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); + setidt(XINVLPG_OFFSET, Xinvlpg, + SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); + setidt(XINVLRNG_OFFSET, Xinvlrng, + SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); /* install an inter-CPU IPI for forwarding hardclock() */ setidt(XHARDCLOCK_OFFSET, Xhardclock, @@ -2190,48 +2205,237 @@ start_ap(int logical_cpu, u_int boot_addr) return 0; /* return FAILURE */ } -#if defined(APIC_IO) && defined(COUNT_XINVLTLB_HITS) -u_int xhits[MAXCPU]; -SYSCTL_OPAQUE(_debug, OID_AUTO, xhits, CTLFLAG_RW, &xhits, sizeof(xhits), - "IU", ""); +#if defined(APIC_IO) + +#ifdef COUNT_XINVLTLB_HITS +u_int xhits_gbl[MAXCPU]; +u_int xhits_pg[MAXCPU]; +u_int xhits_rng[MAXCPU]; +SYSCTL_NODE(_debug, OID_AUTO, xhits, CTLFLAG_RW, 0, ""); +SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, global, CTLFLAG_RW, &xhits_gbl, + sizeof(xhits_gbl), "IU", ""); +SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, page, CTLFLAG_RW, &xhits_pg, + sizeof(xhits_pg), "IU", ""); +SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, range, CTLFLAG_RW, &xhits_rng, + sizeof(xhits_rng), "IU", ""); + +u_int ipi_global; +u_int ipi_page; +u_int ipi_range; +u_int ipi_range_size; +SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_global, CTLFLAG_RW, &ipi_global, 0, ""); +SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_page, CTLFLAG_RW, &ipi_page, 0, ""); +SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range, CTLFLAG_RW, &ipi_range, 0, ""); +SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range_size, CTLFLAG_RW, &ipi_range_size, + 0, ""); + +u_int ipi_masked_global; +u_int ipi_masked_page; +u_int ipi_masked_range; +u_int ipi_masked_range_size; +SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_global, CTLFLAG_RW, + &ipi_masked_global, 0, ""); +SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_page, CTLFLAG_RW, + &ipi_masked_page, 0, ""); +SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_range, CTLFLAG_RW, + &ipi_masked_range, 0, ""); +SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_range_size, CTLFLAG_RW, + &ipi_masked_range_size, 0, ""); #endif /* * Flush the TLB on all other CPU's + */ +static void +smp_tlb_shootdown(u_int vector, vm_offset_t addr1, vm_offset_t addr2) +{ + u_int ncpu; + register_t eflags; + + ncpu = mp_ncpus - 1; /* does not shootdown self */ + if (ncpu < 1) + return; /* no other cpus */ + eflags = read_eflags(); + if ((eflags & PSL_I) == 0) + panic("absolutely cannot call smp_ipi_shootdown with interrupts already disabled"); + mtx_lock_spin(&smp_tlb_mtx); + smp_tlb_addr1 = addr1; + smp_tlb_addr2 = addr2; + atomic_store_rel_int(&smp_tlb_wait, 0); + ipi_all_but_self(vector); + while (smp_tlb_wait < ncpu) + ia32_pause(); + mtx_unlock_spin(&smp_tlb_mtx); +} + +/* + * This is about as magic as it gets. fortune(1) has got similar code + * for reversing bits in a word. Who thinks up this stuff?? + * + * Yes, it does appear to be consistently faster than: + * while (i = ffs(m)) { + * m >>= i; + * bits++; + * } + * and + * while (lsb = (m & -m)) { // This is magic too + * m &= ~lsb; // or: m ^= lsb + * bits++; + * } + * Both of these latter forms do some very strange things on gcc-3.1 with + * -mcpu=pentiumpro and/or -march=pentiumpro and/or -O or -O2. + * There is probably an SSE or MMX popcnt instruction. 
* - * XXX: Needs to handshake and wait for completion before proceding. + * I wonder if this should be in libkern? + * + * XXX Stop the presses! Another one: + * static __inline u_int32_t + * popcnt1(u_int32_t v) + * { + * v -= ((v >> 1) & 0x55555555); + * v = (v & 0x33333333) + ((v >> 2) & 0x33333333); + * v = (v + (v >> 4)) & 0x0F0F0F0F; + * return (v * 0x01010101) >> 24; + * } + * The downside is that it has a multiply. With a pentium3 with + * -mcpu=pentiumpro and -march=pentiumpro then gcc-3.1 will use + * an imull, and in that case it is faster. In most other cases + * it appears slightly slower. */ +static __inline u_int32_t +popcnt(u_int32_t m) +{ + + m = (m & 0x55555555) + ((m & 0xaaaaaaaa) >> 1); + m = (m & 0x33333333) + ((m & 0xcccccccc) >> 2); + m = (m & 0x0f0f0f0f) + ((m & 0xf0f0f0f0) >> 4); + m = (m & 0x00ff00ff) + ((m & 0xff00ff00) >> 8); + m = (m & 0x0000ffff) + ((m & 0xffff0000) >> 16); + return m; +} + +static void +smp_targeted_tlb_shootdown(u_int mask, u_int vector, vm_offset_t addr1, vm_offset_t addr2) +{ + int ncpu, othercpus; + register_t eflags; + + othercpus = mp_ncpus - 1; + if (mask == (u_int)-1) { + ncpu = othercpus; + if (ncpu < 1) + return; + } else { + /* XXX there should be a pcpu self mask */ + mask &= ~(1 << PCPU_GET(cpuid)); + if (mask == 0) + return; + ncpu = popcnt(mask); + if (ncpu > othercpus) { + /* XXX this should be a panic offence */ + printf("SMP: tlb shootdown to %d other cpus (only have %d)\n", + ncpu, othercpus); + ncpu = othercpus; + } + /* XXX should be a panic, implied by mask == 0 above */ + if (ncpu < 1) + return; + } + eflags = read_eflags(); + if ((eflags & PSL_I) == 0) + panic("absolutely cannot call smp_targeted_ipi_shootdown with interrupts already disabled"); + mtx_lock_spin(&smp_tlb_mtx); + smp_tlb_addr1 = addr1; + smp_tlb_addr2 = addr2; + atomic_store_rel_int(&smp_tlb_wait, 0); + if (mask == (u_int)-1) + ipi_all_but_self(vector); + else + ipi_selected(mask, vector); + while (smp_tlb_wait < ncpu) + ia32_pause(); + mtx_unlock_spin(&smp_tlb_mtx); +} +#endif + void smp_invltlb(void) { #if defined(APIC_IO) - if (smp_started) - ipi_all_but_self(IPI_INVLTLB); + if (smp_started) { + smp_tlb_shootdown(IPI_INVLTLB, 0, 0); +#ifdef COUNT_XINVLTLB_HITS + ipi_global++; +#endif + } #endif /* APIC_IO */ } void -invlpg(u_int addr) +smp_invlpg(vm_offset_t addr) { - __asm __volatile("invlpg (%0)"::"r"(addr):"memory"); +#if defined(APIC_IO) + if (smp_started) { + smp_tlb_shootdown(IPI_INVLPG, addr, 0); +#ifdef COUNT_XINVLTLB_HITS + ipi_page++; +#endif + } +#endif /* APIC_IO */ +} - /* send a message to the other CPUs */ - smp_invltlb(); +void +smp_invlpg_range(vm_offset_t addr1, vm_offset_t addr2) +{ +#if defined(APIC_IO) + if (smp_started) { + smp_tlb_shootdown(IPI_INVLRNG, addr1, addr2); +#ifdef COUNT_XINVLTLB_HITS + ipi_range++; + ipi_range_size += (addr2 - addr1) / PAGE_SIZE; +#endif + } +#endif /* APIC_IO */ } void -invltlb(void) +smp_masked_invltlb(u_int mask) { - u_long temp; +#if defined(APIC_IO) + if (smp_started) { + smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, 0, 0); +#ifdef COUNT_XINVLTLB_HITS + ipi_masked_global++; +#endif + } +#endif /* APIC_IO */ +} - /* - * This should be implemented as load_cr3(rcr3()) when load_cr3() is - * inlined. 
- */ - __asm __volatile("movl %%cr3, %0; movl %0, %%cr3":"=r"(temp) :: "memory"); +void +smp_masked_invlpg(u_int mask, vm_offset_t addr) +{ +#if defined(APIC_IO) + if (smp_started) { + smp_targeted_tlb_shootdown(mask, IPI_INVLPG, addr, 0); +#ifdef COUNT_XINVLTLB_HITS + ipi_masked_page++; +#endif + } +#endif /* APIC_IO */ +} - /* send a message to the other CPUs */ - smp_invltlb(); +void +smp_masked_invlpg_range(u_int mask, vm_offset_t addr1, vm_offset_t addr2) +{ +#if defined(APIC_IO) + if (smp_started) { + smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, addr1, addr2); +#ifdef COUNT_XINVLTLB_HITS + ipi_masked_range++; + ipi_masked_range_size += (addr2 - addr1) / PAGE_SIZE; +#endif + } +#endif /* APIC_IO */ } @@ -2251,7 +2455,7 @@ ap_init(void) /* spin */ ; /* BSP may have changed PTD while we were waiting */ - cpu_invltlb(); + invltlb(); #if defined(I586_CPU) && !defined(NO_F00F_HACK) lidt(&r_idt); @@ -2290,6 +2494,9 @@ ap_init(void) /* Build our map of 'other' CPUs. */ PCPU_SET(other_cpus, all_cpus & ~PCPU_GET(cpumask)); + if (bootverbose) + apic_dump("ap_init()"); + printf("SMP: AP CPU #%d Launched!\n", PCPU_GET(cpuid)); if (smp_cpus == mp_ncpus) { @@ -2325,7 +2532,8 @@ forwarded_statclock(struct trapframe frame) { mtx_lock_spin(&sched_lock); - statclock_process(curthread->td_kse, TRAPF_PC(&frame), TRAPF_USERMODE(&frame)); + statclock_process(curthread->td_kse, TRAPF_PC(&frame), + TRAPF_USERMODE(&frame)); mtx_unlock_spin(&sched_lock); } diff --git a/sys/i386/i386/pmap.c b/sys/i386/i386/pmap.c index 87cd8b9..5de1707 100644 --- a/sys/i386/i386/pmap.c +++ b/sys/i386/i386/pmap.c @@ -68,7 +68,6 @@ * and to when physical maps must be made correct. */ -#include "opt_disable_pse.h" #include "opt_pmap.h" #include "opt_msgbuf.h" #include "opt_kstack_pages.h" @@ -85,6 +84,9 @@ #include #include #include +#ifdef SMP +#include +#endif #include #include @@ -97,6 +99,7 @@ #include #include +#include #include #include #include @@ -162,6 +165,7 @@ static vm_object_t kptobj; static int nkpt; vm_offset_t kernel_vm_end; +extern u_int32_t KERNend; /* * Data for the pv entry allocation mechanism @@ -257,10 +261,10 @@ static vm_offset_t pmap_kmem_choose(vm_offset_t addr) { vm_offset_t newaddr = addr; + #ifndef DISABLE_PSE - if (cpu_feature & CPUID_PSE) { + if (cpu_feature & CPUID_PSE) newaddr = (addr + (NBPDR - 1)) & ~(NBPDR - 1); - } #endif return newaddr; } @@ -362,10 +366,9 @@ pmap_bootstrap(firstaddr, loadaddr) PTD[i] = 0; pgeflag = 0; -#if !defined(SMP) /* XXX - see also mp_machdep.c */ - if (cpu_feature & CPUID_PGE) { +#ifndef DISABLE_PG_G + if (cpu_feature & CPUID_PGE) pgeflag = PG_G; - } #endif /* @@ -378,7 +381,7 @@ pmap_bootstrap(firstaddr, loadaddr) */ pdir4mb = 0; -#if !defined(DISABLE_PSE) +#ifndef DISABLE_PSE if (cpu_feature & CPUID_PSE) { pd_entry_t ptditmp; /* @@ -389,29 +392,16 @@ pmap_bootstrap(firstaddr, loadaddr) ptditmp &= ~(NBPDR - 1); ptditmp |= PG_V | PG_RW | PG_PS | PG_U | pgeflag; pdir4mb = ptditmp; - -#if !defined(SMP) - /* - * Enable the PSE mode. - */ - load_cr4(rcr4() | CR4_PSE); - - /* - * We can do the mapping here for the single processor - * case. We simply ignore the old page table page from - * now on. - */ - /* - * For SMP, we still need 4K pages to bootstrap APs, - * PSE will be enabled as soon as all APs are up. - */ - PTD[KPTDI] = (pd_entry_t) ptditmp; - kernel_pmap->pm_pdir[KPTDI] = (pd_entry_t) ptditmp; - invltlb(); -#endif } #endif - +#ifndef SMP + /* + * Turn on PGE/PSE. SMP does this later on since the + * 4K page tables are required for AP boot (for now). 
+ * XXX fixme. + */ + pmap_set_opt(); +#endif #ifdef SMP if (cpu_apic_address == 0) panic("pmap_bootstrap: no local apic! (non-SMP hardware?)"); @@ -420,26 +410,55 @@ pmap_bootstrap(firstaddr, loadaddr) SMPpt[NPTEPG - 1] = (pt_entry_t)(PG_V | PG_RW | PG_N | pgeflag | (cpu_apic_address & PG_FRAME)); #endif - invltlb(); } -#ifdef SMP /* - * Set 4mb pdir for mp startup + * Enable 4MB page mode for MP startup. Turn on PG_G support. + * BSP will run this after all the AP's have started up. */ void pmap_set_opt(void) { + pt_entry_t *pte; + vm_offset_t va, endva; + + if (pgeflag && (cpu_feature & CPUID_PGE)) { + load_cr4(rcr4() | CR4_PGE); + invltlb(); /* Insurance */ + } +#ifndef DISABLE_PSE if (pseflag && (cpu_feature & CPUID_PSE)) { load_cr4(rcr4() | CR4_PSE); - if (pdir4mb && PCPU_GET(cpuid) == 0) { /* only on BSP */ + invltlb(); /* Insurance */ + } +#endif + if (PCPU_GET(cpuid) == 0) { +#ifndef DISABLE_PSE + if (pdir4mb) { kernel_pmap->pm_pdir[KPTDI] = PTD[KPTDI] = pdir4mb; - cpu_invltlb(); + invltlb(); /* Insurance */ } +#endif + if (pgeflag) { + /* Turn on PG_G for text, data, bss pages. */ + va = (vm_offset_t)btext; + endva = KERNBASE + KERNend; + while (va < endva) { + pte = vtopte(va); + if (*pte) + *pte |= pgeflag; + va += PAGE_SIZE; + } + invltlb(); /* Insurance */ + } + /* + * We do not need to broadcast the invltlb here, because + * each AP does it the moment it is released from the boot + * lock. See ap_init(). + */ } } -#endif void * pmap_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait) @@ -553,43 +572,151 @@ pmap_track_modified(vm_offset_t va) return 0; } -static PMAP_INLINE void -invltlb_1pg(vm_offset_t va) -{ #ifdef I386_CPU - invltlb(); -#else - invlpg(va); -#endif +/* + * i386 only has "invalidate everything" and no SMP to worry about. + */ +PMAP_INLINE void +pmap_invalidate_page(pmap_t pmap, vm_offset_t va) +{ + + if (pmap == kernel_pmap || pmap->pm_active) + invltlb(); } -static __inline void +PMAP_INLINE void +pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) +{ + + if (pmap == kernel_pmap || pmap->pm_active) + invltlb(); +} + +PMAP_INLINE void +pmap_invalidate_all(pmap_t pmap) +{ + + if (pmap == kernel_pmap || pmap->pm_active) + invltlb(); +} +#else /* !I386_CPU */ +#ifdef SMP +/* + * For SMP, these functions have to use the IPI mechanism for coherence. + */ +void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { -#if defined(SMP) - if (pmap->pm_active & PCPU_GET(cpumask)) - cpu_invlpg((void *)va); - if (pmap->pm_active & PCPU_GET(other_cpus)) - smp_invltlb(); -#else - if (pmap->pm_active) - invltlb_1pg(va); -#endif + u_int cpumask; + u_int other_cpus; + + critical_enter(); + /* + * We need to disable interrupt preemption but MUST NOT have + * interrupts disabled here. + * XXX we may need to hold schedlock to get a coherent pm_active + */ + if (pmap->pm_active == -1 || pmap->pm_active == all_cpus) { + invlpg(va); + smp_invlpg(va); + } else { + cpumask = PCPU_GET(cpumask); + other_cpus = PCPU_GET(other_cpus); + if (pmap->pm_active & cpumask) + invlpg(va); + if (pmap->pm_active & other_cpus) + smp_masked_invlpg(pmap->pm_active & other_cpus, va); + } + critical_exit(); } -static __inline void +void +pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) +{ + u_int cpumask; + u_int other_cpus; + vm_offset_t addr; + + critical_enter(); + /* + * We need to disable interrupt preemption but MUST NOT have + * interrupts disabled here. 
+ * XXX we may need to hold schedlock to get a coherent pm_active + */ + if (pmap->pm_active == -1 || pmap->pm_active == all_cpus) { + for (addr = sva; addr < eva; addr += PAGE_SIZE) + invlpg(addr); + smp_invlpg_range(sva, eva); + } else { + cpumask = PCPU_GET(cpumask); + other_cpus = PCPU_GET(other_cpus); + if (pmap->pm_active & cpumask) + for (addr = sva; addr < eva; addr += PAGE_SIZE) + invlpg(addr); + if (pmap->pm_active & other_cpus) + smp_masked_invlpg_range(pmap->pm_active & other_cpus, + sva, eva); + } + critical_exit(); +} + +void pmap_invalidate_all(pmap_t pmap) { -#if defined(SMP) - if (pmap->pm_active & PCPU_GET(cpumask)) - cpu_invltlb(); - if (pmap->pm_active & PCPU_GET(other_cpus)) + u_int cpumask; + u_int other_cpus; + + critical_enter(); + /* + * We need to disable interrupt preemption but MUST NOT have + * interrupts disabled here. + * XXX we may need to hold schedlock to get a coherent pm_active + */ + if (pmap->pm_active == -1 || pmap->pm_active == all_cpus) { + invltlb(); smp_invltlb(); -#else - if (pmap->pm_active) + } else { + cpumask = PCPU_GET(cpumask); + other_cpus = PCPU_GET(other_cpus); + if (pmap->pm_active & cpumask) + invltlb(); + if (pmap->pm_active & other_cpus) + smp_masked_invltlb(pmap->pm_active & other_cpus); + } + critical_exit(); +} +#else /* !SMP */ +/* + * Normal, non-SMP, 486+ invalidation functions. + * We inline these within pmap.c for speed. + */ +PMAP_INLINE void +pmap_invalidate_page(pmap_t pmap, vm_offset_t va) +{ + + if (pmap == kernel_pmap || pmap->pm_active) + invlpg(va); +} + +PMAP_INLINE void +pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) +{ + vm_offset_t addr; + + if (pmap == kernel_pmap || pmap->pm_active) + for (addr = sva; addr < eva; addr += PAGE_SIZE) + invlpg(addr); +} + +PMAP_INLINE void +pmap_invalidate_all(pmap_t pmap) +{ + + if (pmap == kernel_pmap || pmap->pm_active) invltlb(); -#endif } +#endif /* !SMP */ +#endif /* !I386_CPU */ /* * Return an address which is the base of the Virtual mapping of @@ -613,12 +740,7 @@ get_ptbase(pmap) /* otherwise, we are alternate address space */ if (frame != (APTDpde & PG_FRAME)) { APTDpde = (pd_entry_t) (frame | PG_RW | PG_V); -#if defined(SMP) - /* The page directory is not shared between CPUs */ - cpu_invltlb(); -#else invltlb(); -#endif } return APTmap; } @@ -647,7 +769,7 @@ pmap_pte_quick(pmap, va) newpf = pde & PG_FRAME; if (((*PMAP1) & PG_FRAME) != newpf) { *PMAP1 = newpf | PG_RW | PG_V; - invltlb_1pg((vm_offset_t) PADDR1); + pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR1); } return PADDR1 + (index & (NPTEPG - 1)); } @@ -692,34 +814,29 @@ pmap_extract(pmap, va) ***************************************************/ /* - * add a wired page to the kva - * note that in order for the mapping to take effect -- you - * should do a invltlb after doing the pmap_kenter... + * Add a wired page to the kva. + * Note: not SMP coherent. */ PMAP_INLINE void pmap_kenter(vm_offset_t va, vm_offset_t pa) { pt_entry_t *pte; - pt_entry_t npte, opte; - npte = pa | PG_RW | PG_V | pgeflag; pte = vtopte(va); - opte = *pte; - *pte = npte; - invltlb_1pg(va); + *pte = pa | PG_RW | PG_V | pgeflag; } /* - * remove a page from the kernel pagetables + * Remove a page from the kernel pagetables. + * Note: not SMP coherent. 
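Before the kernel-VA helpers below, it is worth spelling out the targeting rule that the SMP versions of pmap_invalidate_page(), pmap_invalidate_range() and pmap_invalidate_all() above share: flush locally only if the calling CPU is in pm_active, IPI only the other CPUs in pm_active, and fall back to the all-but-self broadcast when pm_active is -1 or covers every CPU. The toy model below exercises just that decision with plain integers; the masks and the flush/IPI outcomes are simulated stand-ins, not kernel calls:

#include <assert.h>
#include <stdio.h>

#define ALL_CPUS        0x0f                    /* pretend 4-CPU system */

struct outcome {
        int             local_flush;            /* flush on the calling CPU? */
        unsigned int    ipi_mask;               /* CPUs to IPI individually */
        int             broadcast;              /* use the all-but-self IPI? */
};

/* Mirrors the targeting decision in the SMP pmap_invalidate_*() bodies. */
static struct outcome
dispatch(unsigned int pm_active, unsigned int cpumask, unsigned int other_cpus)
{
        struct outcome o = { 0, 0, 0 };

        if (pm_active == (unsigned int)-1 || pm_active == ALL_CPUS) {
                o.local_flush = 1;
                o.broadcast = 1;
        } else {
                if (pm_active & cpumask)
                        o.local_flush = 1;
                if (pm_active & other_cpus)
                        o.ipi_mask = pm_active & other_cpus;
        }
        return (o);
}

int
main(void)
{
        struct outcome o;

        /* Private pmap active only on the calling CPU: no IPI at all. */
        o = dispatch(0x01, 0x01, 0x0e);
        assert(o.local_flush && o.ipi_mask == 0 && !o.broadcast);

        /* Active on CPUs 0 and 2, called from CPU 0: IPI only CPU 2. */
        o = dispatch(0x05, 0x01, 0x0e);
        assert(o.local_flush && o.ipi_mask == 0x04 && !o.broadcast);

        /* Kernel pmap (pm_active == -1): local flush plus broadcast. */
        o = dispatch((unsigned int)-1, 0x01, 0x0e);
        assert(o.local_flush && o.broadcast);

        printf("targeting decisions match the pmap_invalidate_*() logic\n");
        return (0);
}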
*/ PMAP_INLINE void pmap_kremove(vm_offset_t va) { - register pt_entry_t *pte; + pt_entry_t *pte; pte = vtopte(va); *pte = 0; - invltlb_1pg(va); } /* @@ -737,13 +854,15 @@ pmap_kremove(vm_offset_t va) vm_offset_t pmap_map(vm_offset_t *virt, vm_offset_t start, vm_offset_t end, int prot) { - vm_offset_t sva = *virt; - vm_offset_t va = sva; + vm_offset_t va, sva; + + va = sva = *virt; while (start < end) { pmap_kenter(va, start); va += PAGE_SIZE; start += PAGE_SIZE; } + pmap_invalidate_range(kernel_pmap, sva, va); *virt = va; return (sva); } @@ -756,64 +875,45 @@ pmap_map(vm_offset_t *virt, vm_offset_t start, vm_offset_t end, int prot) * page modification or references recorded. * Note that old mappings are simply written * over. The page *must* be wired. + * Note: SMP coherent. Uses a ranged shootdown IPI. */ void -pmap_qenter(vm_offset_t va, vm_page_t *m, int count) +pmap_qenter(vm_offset_t sva, vm_page_t *m, int count) { - vm_offset_t end_va; - - end_va = va + count * PAGE_SIZE; - - while (va < end_va) { - pt_entry_t *pte; + vm_offset_t va; - pte = vtopte(va); - *pte = VM_PAGE_TO_PHYS(*m) | PG_RW | PG_V | pgeflag; -#ifdef SMP - cpu_invlpg((void *)va); -#else - invltlb_1pg(va); -#endif + va = sva; + while (count-- > 0) { + pmap_kenter(va, VM_PAGE_TO_PHYS(*m)); va += PAGE_SIZE; m++; } -#ifdef SMP - smp_invltlb(); -#endif + pmap_invalidate_range(kernel_pmap, sva, va); } /* - * this routine jerks page mappings from the + * This routine tears out page mappings from the * kernel -- it is meant only for temporary mappings. + * Note: SMP coherent. Uses a ranged shootdown IPI. */ void -pmap_qremove(vm_offset_t va, int count) +pmap_qremove(vm_offset_t sva, int count) { - vm_offset_t end_va; - - end_va = va + count*PAGE_SIZE; - - while (va < end_va) { - pt_entry_t *pte; + vm_offset_t va; - pte = vtopte(va); - *pte = 0; -#ifdef SMP - cpu_invlpg((void *)va); -#else - invltlb_1pg(va); -#endif + va = sva; + while (count-- > 0) { + pmap_kremove(va); va += PAGE_SIZE; } -#ifdef SMP - smp_invltlb(); -#endif + pmap_invalidate_range(kernel_pmap, sva, va); } static vm_page_t pmap_page_lookup(vm_object_t object, vm_pindex_t pindex) { vm_page_t m; + retry: m = vm_page_lookup(object, pindex); if (m && vm_page_sleep_busy(m, FALSE, "pplookp")) @@ -829,14 +929,11 @@ retry: void pmap_new_thread(struct thread *td) { -#ifdef I386_CPU - int updateneeded = 0; -#endif int i; + vm_page_t ma[KSTACK_PAGES]; vm_object_t ksobj; vm_page_t m; vm_offset_t ks; - pt_entry_t *ptek, oldpte; /* * allocate object for the kstack @@ -844,39 +941,21 @@ pmap_new_thread(struct thread *td) ksobj = vm_object_allocate(OBJT_DEFAULT, KSTACK_PAGES); td->td_kstack_obj = ksobj; -#ifdef KSTACK_GUARD /* get a kernel virtual address for the kstack for this thread */ +#ifdef KSTACK_GUARD ks = kmem_alloc_nofault(kernel_map, (KSTACK_PAGES + 1) * PAGE_SIZE); if (ks == 0) panic("pmap_new_thread: kstack allocation failed"); - - /* - * Set the first page to be the unmapped guard page. - */ - ptek = vtopte(ks); - oldpte = *ptek; - *ptek = 0; - if (oldpte) { -#ifdef I386_CPU - updateneeded = 1; -#else - invlpg(ks); -#endif - } - - /* - * move to the next page, which is where the real stack starts. 
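With pmap_kenter() and pmap_kremove() no longer doing any invalidation of their own, pmap_qenter() and pmap_qremove() above make the trade explicit: write all the PTEs first, then issue a single ranged invalidation for the whole run instead of one flush (and, on SMP, one IPI) per page. The small model below only counts flush requests to show the effect; the PTE array and both invalidate functions are stand-ins, not the kernel routines:

#include <stdio.h>

#define PAGE_SIZE       4096
#define NPAGES          16

static unsigned int     pte[NPAGES];    /* stand-in kernel page table */
static int              flush_ops;      /* counts invalidation requests */

static void
kenter(int idx, unsigned int pa)
{

        pte[idx] = pa | 0x3;            /* PG_V | PG_RW, no flush here */
}

static void
invalidate_page(int idx)
{

        (void)idx;
        flush_ops++;                    /* one shootdown per page */
}

static void
invalidate_range(int first, int count)
{

        (void)first;
        (void)count;
        flush_ops++;                    /* one ranged shootdown, any size */
}

int
main(void)
{
        int i;

        /* Old style: flush after every single page. */
        flush_ops = 0;
        for (i = 0; i < NPAGES; i++) {
                kenter(i, i * PAGE_SIZE);
                invalidate_page(i);
        }
        printf("per-page flushing: %d operations\n", flush_ops);

        /* New pmap_qenter() style: enter everything, then one ranged flush. */
        flush_ops = 0;
        for (i = 0; i < NPAGES; i++)
                kenter(i, i * PAGE_SIZE);
        invalidate_range(0, NPAGES);
        printf("batched flushing:  %d operation\n", flush_ops);
        return (0);
}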
- */ + if (*vtopte(ks) != 0) + pmap_qremove(ks, 1); ks += PAGE_SIZE; td->td_kstack = ks; - ptek++; #else /* get a kernel virtual address for the kstack for this thread */ ks = kmem_alloc_nofault(kernel_map, KSTACK_PAGES * PAGE_SIZE); if (ks == 0) panic("pmap_new_thread: kstack allocation failed"); td->td_kstack = ks; - ptek = vtopte(ks); #endif /* * For the length of the stack, link in a real page of ram for each @@ -887,6 +966,7 @@ pmap_new_thread(struct thread *td) * Get a kernel stack page */ m = vm_page_grab(ksobj, i, VM_ALLOC_NORMAL | VM_ALLOC_RETRY); + ma[i] = m; /* * Wire the page @@ -894,28 +974,12 @@ pmap_new_thread(struct thread *td) m->wire_count++; cnt.v_wire_count++; - /* - * Enter the page into the kernel address space. - */ - oldpte = ptek[i]; - ptek[i] = VM_PAGE_TO_PHYS(m) | PG_RW | PG_V | pgeflag; - if (oldpte) { -#ifdef I386_CPU - updateneeded = 1; -#else - invlpg(ks + (i * PAGE_SIZE)); -#endif - } - vm_page_wakeup(m); vm_page_flag_clear(m, PG_ZERO); vm_page_flag_set(m, PG_MAPPED | PG_WRITEABLE); m->valid = VM_PAGE_BITS_ALL; } -#ifdef I386_CPU - if (updateneeded) - invltlb(); -#endif + pmap_qenter(ks, ma, KSTACK_PAGES); } /* @@ -930,26 +994,18 @@ pmap_dispose_thread(td) vm_object_t ksobj; vm_offset_t ks; vm_page_t m; - pt_entry_t *ptek; ksobj = td->td_kstack_obj; ks = td->td_kstack; - ptek = vtopte(ks); + pmap_qremove(ks, KSTACK_PAGES); for (i = 0; i < KSTACK_PAGES; i++) { m = vm_page_lookup(ksobj, i); if (m == NULL) panic("pmap_dispose_thread: kstack already missing?"); vm_page_busy(m); - ptek[i] = 0; -#ifndef I386_CPU - invlpg(ks + (i * PAGE_SIZE)); -#endif vm_page_unwire(m, 0); vm_page_free(m); } -#ifdef I386_CPU - invltlb(); -#endif /* * Free the space that this stack was mapped to in the kernel * address map. @@ -976,13 +1032,13 @@ pmap_swapout_thread(td) ksobj = td->td_kstack_obj; ks = td->td_kstack; + pmap_qremove(ks, KSTACK_PAGES); for (i = 0; i < KSTACK_PAGES; i++) { m = vm_page_lookup(ksobj, i); if (m == NULL) panic("pmap_swapout_thread: kstack already missing?"); vm_page_dirty(m); vm_page_unwire(m, 0); - pmap_kremove(ks + i * PAGE_SIZE); } } @@ -994,6 +1050,7 @@ pmap_swapin_thread(td) struct thread *td; { int i, rv; + vm_page_t ma[KSTACK_PAGES]; vm_object_t ksobj; vm_offset_t ks; vm_page_t m; @@ -1002,7 +1059,6 @@ pmap_swapin_thread(td) ks = td->td_kstack; for (i = 0; i < KSTACK_PAGES; i++) { m = vm_page_grab(ksobj, i, VM_ALLOC_NORMAL | VM_ALLOC_RETRY); - pmap_kenter(ks + i * PAGE_SIZE, VM_PAGE_TO_PHYS(m)); if (m->valid != VM_PAGE_BITS_ALL) { rv = vm_pager_get_pages(ksobj, &m, 1, 0); if (rv != VM_PAGER_OK) @@ -1010,10 +1066,12 @@ pmap_swapin_thread(td) m = vm_page_lookup(ksobj, i); m->valid = VM_PAGE_BITS_ALL; } + ma[i] = m; vm_page_wire(m); vm_page_wakeup(m); vm_page_flag_set(m, PG_MAPPED | PG_WRITEABLE); } + pmap_qenter(ks, ma, KSTACK_PAGES); } /*************************************************** @@ -1108,7 +1166,8 @@ pmap_pinit0(pmap) { pmap->pm_pdir = (pd_entry_t *)kmem_alloc_pageable(kernel_map, PAGE_SIZE); - pmap_kenter((vm_offset_t) pmap->pm_pdir, (vm_offset_t) IdlePTD); + pmap_kenter((vm_offset_t)pmap->pm_pdir, (vm_offset_t)IdlePTD); + invlpg((vm_offset_t)pmap->pm_pdir); pmap->pm_ptphint = NULL; pmap->pm_active = 0; TAILQ_INIT(&pmap->pm_pvlist); @@ -1153,7 +1212,7 @@ pmap_pinit(pmap) vm_page_flag_clear(ptdpg, PG_MAPPED | PG_BUSY); /* not usually mapped*/ ptdpg->valid = VM_PAGE_BITS_ALL; - pmap_kenter((vm_offset_t) pmap->pm_pdir, VM_PAGE_TO_PHYS(ptdpg)); + pmap_qenter((vm_offset_t) pmap->pm_pdir, &ptdpg, 1); if ((ptdpg->flags & PG_ZERO) == 0) 
bzero(pmap->pm_pdir, PAGE_SIZE); @@ -1616,7 +1675,7 @@ pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va) * PG_G. */ if (oldpte & PG_G) - invlpg(va); + pmap_invalidate_page(kernel_pmap, va); pmap->pm_stats.resident_count -= 1; if (oldpte & PG_MANAGED) { m = PHYS_TO_VM_PAGE(oldpte); @@ -2028,13 +2087,7 @@ pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, if ((prot & VM_PROT_WRITE) && (origpte & PG_V)) { if ((origpte & PG_RW) == 0) { *pte |= PG_RW; -#ifdef SMP - cpu_invlpg((void *)va); - if (pmap->pm_active & PCPU_GET(other_cpus)) - smp_invltlb(); -#else - invltlb_1pg(va); -#endif + pmap_invalidate_page(pmap, va); } return; } @@ -2102,13 +2155,7 @@ validate: if ((origpte & ~(PG_M|PG_A)) != newpte) { *pte = newpte | PG_A; /*if (origpte)*/ { -#ifdef SMP - cpu_invlpg((void *)va); - if (pmap->pm_active & PCPU_GET(other_cpus)) - smp_invltlb(); -#else - invltlb_1pg(va); -#endif + pmap_invalidate_page(pmap, va); } } } @@ -2222,7 +2269,11 @@ retry: void * pmap_kenter_temporary(vm_offset_t pa, int i) { - pmap_kenter((vm_offset_t)crashdumpmap + (i * PAGE_SIZE), pa); + vm_offset_t va; + + va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE); + pmap_kenter(va, pa); + invlpg(va); return ((void *)crashdumpmap); } @@ -2527,7 +2578,6 @@ pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, vm_offset_t pdnxt; pd_entry_t src_frame, dst_frame; vm_page_t m; - pd_entry_t saved_pde; if (dst_addr != src_addr) return; @@ -2537,17 +2587,7 @@ pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, return; dst_frame = dst_pmap->pm_pdir[PTDPTDI] & PG_FRAME; - if (dst_frame != (APTDpde & PG_FRAME)) { - APTDpde = dst_frame | PG_RW | PG_V; -#if defined(SMP) - /* The page directory is not shared between CPUs */ - cpu_invltlb(); -#else - invltlb(); -#endif - } - saved_pde = APTDpde & (PG_FRAME | PG_RW | PG_V); - for(addr = src_addr; addr < end_addr; addr = pdnxt) { + for (addr = src_addr; addr < end_addr; addr = pdnxt) { pt_entry_t *src_pte, *dst_pte; vm_page_t dstmpte, srcmpte; pd_entry_t srcptepaddr; @@ -2588,6 +2628,14 @@ pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, if (pdnxt > end_addr) pdnxt = end_addr; + /* + * Have to recheck this before every avtopte() call below + * in case we have blocked and something else used APTDpde. + */ + if (dst_frame != (APTDpde & PG_FRAME)) { + APTDpde = dst_frame | PG_RW | PG_V; + invltlb(); + } src_pte = vtopte(addr); dst_pte = avtopte(addr); while (addr < pdnxt) { @@ -2603,16 +2651,6 @@ pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, * block. 
*/ dstmpte = pmap_allocpte(dst_pmap, addr); - if ((APTDpde & PG_FRAME) != - (saved_pde & PG_FRAME)) { - APTDpde = saved_pde; -printf ("IT HAPPENNED!"); -#if defined(SMP) - cpu_invltlb(); -#else - invltlb(); -#endif - } if ((*dst_pte == 0) && (ptetemp = *src_pte)) { /* * Clear the modified and @@ -2644,14 +2682,13 @@ printf ("IT HAPPENNED!"); void pmap_zero_page(vm_page_t m) { - vm_offset_t phys = VM_PAGE_TO_PHYS(m); + vm_offset_t phys; + phys = VM_PAGE_TO_PHYS(m); if (*CMAP2) panic("pmap_zero_page: CMAP2 busy"); - *CMAP2 = PG_V | PG_RW | phys | PG_A | PG_M; - invltlb_1pg((vm_offset_t)CADDR2); - + pmap_invalidate_page(kernel_pmap, (vm_offset_t)CADDR2); #if defined(I686_CPU) if (cpu_class == CPUCLASS_686) i686_pagezero(CADDR2); @@ -2670,14 +2707,13 @@ pmap_zero_page(vm_page_t m) void pmap_zero_page_area(vm_page_t m, int off, int size) { - vm_offset_t phys = VM_PAGE_TO_PHYS(m); + vm_offset_t phys; + phys = VM_PAGE_TO_PHYS(m); if (*CMAP2) panic("pmap_zero_page: CMAP2 busy"); - *CMAP2 = PG_V | PG_RW | phys | PG_A | PG_M; - invltlb_1pg((vm_offset_t)CADDR2); - + pmap_invalidate_page(kernel_pmap, (vm_offset_t)CADDR2); #if defined(I686_CPU) if (cpu_class == CPUCLASS_686 && off == 0 && size == PAGE_SIZE) i686_pagezero(CADDR2); @@ -2696,20 +2732,13 @@ pmap_zero_page_area(vm_page_t m, int off, int size) void pmap_zero_page_idle(vm_page_t m) { - vm_offset_t phys = VM_PAGE_TO_PHYS(m); + vm_offset_t phys; + phys = VM_PAGE_TO_PHYS(m); if (*CMAP3) panic("pmap_zero_page: CMAP3 busy"); - *CMAP3 = PG_V | PG_RW | phys | PG_A | PG_M; -#ifdef SMP - mtx_lock(&Giant); /* IPI sender not MPSAFE */ -#endif - invltlb_1pg((vm_offset_t)CADDR3); -#ifdef SMP - mtx_unlock(&Giant); -#endif - + invlpg((vm_offset_t)CADDR3); /* SMP: local cpu only */ #if defined(I686_CPU) if (cpu_class == CPUCLASS_686) i686_pagezero(CADDR3); @@ -2733,18 +2762,15 @@ pmap_copy_page(vm_page_t src, vm_page_t dst) panic("pmap_copy_page: CMAP1 busy"); if (*CMAP2) panic("pmap_copy_page: CMAP2 busy"); - *CMAP1 = PG_V | VM_PAGE_TO_PHYS(src) | PG_A; *CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(dst) | PG_A | PG_M; -#ifdef I386_CPU - invltlb(); -#else - invlpg((u_int)CADDR1); - invlpg((u_int)CADDR2); -#endif - + /* + * XXX we "know" that CADDR2 immediately follows CADDR1 and use + * that to save an IPI on SMP systems. 
+ */ + pmap_invalidate_range(kernel_pmap, (vm_offset_t)CADDR1, + (vm_offset_t)CADDR2 + PAGE_SIZE); bcopy(CADDR1, CADDR2, PAGE_SIZE); - *CMAP1 = 0; *CMAP2 = 0; } @@ -3176,18 +3202,11 @@ pmap_mapdev(pa, size) for (tmpva = va; size > 0; ) { pte = vtopte(tmpva); *pte = pa | PG_RW | PG_V | pgeflag; -#ifdef SMP - cpu_invlpg((void *)tmpva); -#else - invltlb_1pg(tmpva); -#endif size -= PAGE_SIZE; tmpva += PAGE_SIZE; pa += PAGE_SIZE; } -#ifdef SMP - smp_invltlb(); -#endif + pmap_invalidate_range(kernel_pmap, va, tmpva); return ((void *)(va + offset)); } @@ -3205,15 +3224,8 @@ pmap_unmapdev(va, size) for (tmpva = base; tmpva < (base + size); tmpva += PAGE_SIZE) { pte = vtopte(tmpva); *pte = 0; -#ifdef SMP - cpu_invlpg((void *)tmpva); -#else - invltlb_1pg(tmpva); -#endif } -#ifdef SMP - smp_invltlb(); -#endif + pmap_invalidate_range(kernel_pmap, va, tmpva); kmem_free(kernel_map, base, size); } diff --git a/sys/i386/i386/support.s b/sys/i386/i386/support.s index c1f3899..23c611c 100644 --- a/sys/i386/i386/support.s +++ b/sys/i386/i386/support.s @@ -1596,42 +1596,6 @@ ENTRY(ssdtosd) popl %ebx ret -/* load_cr0(cr0) */ -ENTRY(load_cr0) - movl 4(%esp),%eax - movl %eax,%cr0 - ret - -/* rcr0() */ -ENTRY(rcr0) - movl %cr0,%eax - ret - -/* rcr3() */ -ENTRY(rcr3) - movl %cr3,%eax - ret - -/* void load_cr3(caddr_t cr3) */ -ENTRY(load_cr3) -#ifdef SWTCH_OPTIM_STATS - incl tlb_flush_count -#endif - movl 4(%esp),%eax - movl %eax,%cr3 - ret - -/* rcr4() */ -ENTRY(rcr4) - movl %cr4,%eax - ret - -/* void load_cr4(caddr_t cr4) */ -ENTRY(load_cr4) - movl 4(%esp),%eax - movl %eax,%cr4 - ret - /* void reset_dbregs() */ ENTRY(reset_dbregs) movl $0,%eax diff --git a/sys/i386/i386/vm86.c b/sys/i386/i386/vm86.c index eb0c98b..c03757f 100644 --- a/sys/i386/i386/vm86.c +++ b/sys/i386/i386/vm86.c @@ -603,6 +603,7 @@ vm86_datacall(intnum, vmf, vmc) entry = vmc->pmap[i].pte_num; vmc->pmap[i].old_pte = pte[entry]; pte[entry] = page | PG_V | PG_RW | PG_U; + pmap_invalidate_page(kernel_pmap, vmc->pmap[i].kva); } vmf->vmf_trapno = intnum; @@ -611,6 +612,7 @@ vm86_datacall(intnum, vmf, vmc) for (i = 0; i < vmc->npages; i++) { entry = vmc->pmap[i].pte_num; pte[entry] = vmc->pmap[i].old_pte; + pmap_invalidate_page(kernel_pmap, vmc->pmap[i].kva); } mtx_unlock(&vm86_lock); diff --git a/sys/i386/include/cpufunc.h b/sys/i386/include/cpufunc.h index 2e64138..0896659 100644 --- a/sys/i386/include/cpufunc.h +++ b/sys/i386/include/cpufunc.h @@ -237,62 +237,6 @@ invd(void) __asm __volatile("invd"); } -#if defined(SMP) && defined(_KERNEL) - -/* - * When using APIC IPI's, invlpg() is not simply the invlpg instruction - * (this is a bug) and the inlining cost is prohibitive since the call - * executes into the IPI transmission system. - */ -void invlpg(u_int addr); -void invltlb(void); - -static __inline void -cpu_invlpg(void *addr) -{ - __asm __volatile("invlpg %0" : : "m" (*(char *)addr) : "memory"); -} - -static __inline void -cpu_invltlb(void) -{ - u_int temp; - /* - * This should be implemented as load_cr3(rcr3()) when load_cr3() - * is inlined. - */ - __asm __volatile("movl %%cr3, %0; movl %0, %%cr3" : "=r" (temp) - : : "memory"); -#if defined(SWTCH_OPTIM_STATS) - ++tlb_flush_count; -#endif -} - -#else /* !(SMP && _KERNEL) */ - -static __inline void -invlpg(u_int addr) -{ - __asm __volatile("invlpg %0" : : "m" (*(char *)addr) : "memory"); -} - -static __inline void -invltlb(void) -{ - u_int temp; - /* - * This should be implemented as load_cr3(rcr3()) when load_cr3() - * is inlined. 
- */ - __asm __volatile("movl %%cr3, %0; movl %0, %%cr3" : "=r" (temp) - : : "memory"); -#ifdef SWTCH_OPTIM_STATS - ++tlb_flush_count; -#endif -} - -#endif /* SMP && _KERNEL */ - static __inline u_short inw(u_int port) { @@ -364,15 +308,6 @@ ia32_pause(void) } static __inline u_int -rcr2(void) -{ - u_int data; - - __asm __volatile("movl %%cr2,%0" : "=r" (data)); - return (data); -} - -static __inline u_int read_eflags(void) { u_int ef; @@ -426,6 +361,86 @@ wrmsr(u_int msr, u_int64_t newval) __asm __volatile("wrmsr" : : "A" (newval), "c" (msr)); } +static __inline void +load_cr0(u_int data) +{ + + __asm __volatile("movl %0,%%cr0" : : "r" (data)); +} + +static __inline u_int +rcr0(void) +{ + u_int data; + + __asm __volatile("movl %%cr0,%0" : "=r" (data)); + return (data); +} + +static __inline u_int +rcr2(void) +{ + u_int data; + + __asm __volatile("movl %%cr2,%0" : "=r" (data)); + return (data); +} + +static __inline void +load_cr3(u_int data) +{ + + __asm __volatile("movl %0,%%cr3" : : "r" (data) : "memory"); +#if defined(SWTCH_OPTIM_STATS) + ++tlb_flush_count; +#endif +} + +static __inline u_int +rcr3(void) +{ + u_int data; + + __asm __volatile("movl %%cr3,%0" : "=r" (data)); + return (data); +} + +static __inline void +load_cr4(u_int data) +{ + __asm __volatile("movl %0,%%cr4" : : "r" (data)); +} + +static __inline u_int +rcr4(void) +{ + u_int data; + + __asm __volatile("movl %%cr4,%0" : "=r" (data)); + return (data); +} + +/* + * Global TLB flush (except for thise for pages marked PG_G) + */ +static __inline void +invltlb(void) +{ + + load_cr3(rcr3()); +} + +/* + * TLB flush for an individual page (even if it has PG_G). + * Only works on 486+ CPUs (i386 does not have PG_G). + */ +static __inline void +invlpg(u_int addr) +{ + + __asm __volatile("invlpg %0" : : "m" (*(char *)addr) : "memory"); +} + static __inline u_int rfs(void) { @@ -587,6 +602,8 @@ intr_restore(register_t eflags) int breakpoint(void); u_int bsfl(u_int mask); u_int bsrl(u_int mask); +void cpu_invlpg(u_int addr); +void cpu_invlpg_range(u_int start, u_int end); void disable_intr(void); void do_cpuid(u_int ax, u_int *p); void enable_intr(void); @@ -597,8 +614,14 @@ void insl(u_int port, void *addr, size_t cnt); void insw(u_int port, void *addr, size_t cnt); void invd(void); void invlpg(u_int addr); +void invlpg_range(u_int start, u_int end); void invltlb(void); u_short inw(u_int port); +void load_cr0(u_int cr0); +void load_cr3(u_int cr3); +void load_cr4(u_int cr4); +void load_fs(u_int sel); +void load_gs(u_int sel); void outb(u_int port, u_char data); void outl(u_int port, u_int data); void outsb(u_int port, void *addr, size_t cnt); @@ -606,7 +629,12 @@ void outsl(u_int port, void *addr, size_t cnt); void outsw(u_int port, void *addr, size_t cnt); void outw(u_int port, u_short data); void ia32_pause(void); +u_int rcr0(void); u_int rcr2(void); +u_int rcr3(void); +u_int rcr4(void); +u_int rfs(void); +u_int rgs(void); u_int64_t rdmsr(u_int msr); u_int64_t rdpmc(u_int pmc); u_int64_t rdtsc(void); @@ -614,10 +642,6 @@ u_int read_eflags(void); void wbinvd(void); void write_eflags(u_int ef); void wrmsr(u_int msr, u_int64_t newval); -u_int rfs(void); -u_int rgs(void); -void load_fs(u_int sel); -void load_gs(u_int sel); u_int rdr0(void); void load_dr0(u_int dr0); u_int rdr1(void); @@ -639,13 +663,7 @@ void intr_restore(register_t ef); #endif /* __GNUC__ */ -void load_cr0(u_int cr0); -void load_cr3(u_int cr3); -void load_cr4(u_int cr4); void ltr(u_short sel); -u_int rcr0(void); -u_int rcr3(void); -u_int rcr4(void); void 
reset_dbregs(void); __END_DECLS diff --git a/sys/i386/include/mptable.h b/sys/i386/include/mptable.h index 63fec0e..29e9c6e 100644 --- a/sys/i386/include/mptable.h +++ b/sys/i386/include/mptable.h @@ -288,6 +288,14 @@ extern pt_entry_t *SMPpt; struct pcb stoppcbs[MAXCPU]; +#ifdef APIC_IO +/* Variables needed for SMP tlb shootdown. */ +vm_offset_t smp_tlb_addr1; +vm_offset_t smp_tlb_addr2; +volatile int smp_tlb_wait; +static struct mtx smp_tlb_mtx; +#endif + /* * Local data and functions. */ @@ -336,6 +344,9 @@ init_locks(void) #ifdef USE_COMLOCK mtx_init(&com_mtx, "com", NULL, MTX_SPIN); #endif /* USE_COMLOCK */ +#ifdef APIC_IO + mtx_init(&smp_tlb_mtx, "tlb", NULL, MTX_SPIN); +#endif } /* @@ -605,6 +616,10 @@ mp_enable(u_int boot_addr) /* install an inter-CPU IPI for TLB invalidation */ setidt(XINVLTLB_OFFSET, Xinvltlb, SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); + setidt(XINVLPG_OFFSET, Xinvlpg, + SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); + setidt(XINVLRNG_OFFSET, Xinvlrng, + SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); /* install an inter-CPU IPI for forwarding hardclock() */ setidt(XHARDCLOCK_OFFSET, Xhardclock, @@ -2190,48 +2205,237 @@ start_ap(int logical_cpu, u_int boot_addr) return 0; /* return FAILURE */ } -#if defined(APIC_IO) && defined(COUNT_XINVLTLB_HITS) -u_int xhits[MAXCPU]; -SYSCTL_OPAQUE(_debug, OID_AUTO, xhits, CTLFLAG_RW, &xhits, sizeof(xhits), - "IU", ""); +#if defined(APIC_IO) + +#ifdef COUNT_XINVLTLB_HITS +u_int xhits_gbl[MAXCPU]; +u_int xhits_pg[MAXCPU]; +u_int xhits_rng[MAXCPU]; +SYSCTL_NODE(_debug, OID_AUTO, xhits, CTLFLAG_RW, 0, ""); +SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, global, CTLFLAG_RW, &xhits_gbl, + sizeof(xhits_gbl), "IU", ""); +SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, page, CTLFLAG_RW, &xhits_pg, + sizeof(xhits_pg), "IU", ""); +SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, range, CTLFLAG_RW, &xhits_rng, + sizeof(xhits_rng), "IU", ""); + +u_int ipi_global; +u_int ipi_page; +u_int ipi_range; +u_int ipi_range_size; +SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_global, CTLFLAG_RW, &ipi_global, 0, ""); +SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_page, CTLFLAG_RW, &ipi_page, 0, ""); +SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range, CTLFLAG_RW, &ipi_range, 0, ""); +SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range_size, CTLFLAG_RW, &ipi_range_size, + 0, ""); + +u_int ipi_masked_global; +u_int ipi_masked_page; +u_int ipi_masked_range; +u_int ipi_masked_range_size; +SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_global, CTLFLAG_RW, + &ipi_masked_global, 0, ""); +SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_page, CTLFLAG_RW, + &ipi_masked_page, 0, ""); +SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_range, CTLFLAG_RW, + &ipi_masked_range, 0, ""); +SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_range_size, CTLFLAG_RW, + &ipi_masked_range_size, 0, ""); #endif /* * Flush the TLB on all other CPU's + */ +static void +smp_tlb_shootdown(u_int vector, vm_offset_t addr1, vm_offset_t addr2) +{ + u_int ncpu; + register_t eflags; + + ncpu = mp_ncpus - 1; /* does not shootdown self */ + if (ncpu < 1) + return; /* no other cpus */ + eflags = read_eflags(); + if ((eflags & PSL_I) == 0) + panic("absolutely cannot call smp_ipi_shootdown with interrupts already disabled"); + mtx_lock_spin(&smp_tlb_mtx); + smp_tlb_addr1 = addr1; + smp_tlb_addr2 = addr2; + atomic_store_rel_int(&smp_tlb_wait, 0); + ipi_all_but_self(vector); + while (smp_tlb_wait < ncpu) + ia32_pause(); + mtx_unlock_spin(&smp_tlb_mtx); +} + +/* + * This is about as magic as it gets. 
fortune(1) has got similar code + * for reversing bits in a word. Who thinks up this stuff?? + * + * Yes, it does appear to be consistently faster than: + * while (i = ffs(m)) { + * m >>= i; + * bits++; + * } + * and + * while (lsb = (m & -m)) { // This is magic too + * m &= ~lsb; // or: m ^= lsb + * bits++; + * } + * Both of these latter forms do some very strange things on gcc-3.1 with + * -mcpu=pentiumpro and/or -march=pentiumpro and/or -O or -O2. + * There is probably an SSE or MMX popcnt instruction. * - * XXX: Needs to handshake and wait for completion before proceding. + * I wonder if this should be in libkern? + * + * XXX Stop the presses! Another one: + * static __inline u_int32_t + * popcnt1(u_int32_t v) + * { + * v -= ((v >> 1) & 0x55555555); + * v = (v & 0x33333333) + ((v >> 2) & 0x33333333); + * v = (v + (v >> 4)) & 0x0F0F0F0F; + * return (v * 0x01010101) >> 24; + * } + * The downside is that it has a multiply. With a pentium3 with + * -mcpu=pentiumpro and -march=pentiumpro then gcc-3.1 will use + * an imull, and in that case it is faster. In most other cases + * it appears slightly slower. */ +static __inline u_int32_t +popcnt(u_int32_t m) +{ + + m = (m & 0x55555555) + ((m & 0xaaaaaaaa) >> 1); + m = (m & 0x33333333) + ((m & 0xcccccccc) >> 2); + m = (m & 0x0f0f0f0f) + ((m & 0xf0f0f0f0) >> 4); + m = (m & 0x00ff00ff) + ((m & 0xff00ff00) >> 8); + m = (m & 0x0000ffff) + ((m & 0xffff0000) >> 16); + return m; +} + +static void +smp_targeted_tlb_shootdown(u_int mask, u_int vector, vm_offset_t addr1, vm_offset_t addr2) +{ + int ncpu, othercpus; + register_t eflags; + + othercpus = mp_ncpus - 1; + if (mask == (u_int)-1) { + ncpu = othercpus; + if (ncpu < 1) + return; + } else { + /* XXX there should be a pcpu self mask */ + mask &= ~(1 << PCPU_GET(cpuid)); + if (mask == 0) + return; + ncpu = popcnt(mask); + if (ncpu > othercpus) { + /* XXX this should be a panic offence */ + printf("SMP: tlb shootdown to %d other cpus (only have %d)\n", + ncpu, othercpus); + ncpu = othercpus; + } + /* XXX should be a panic, implied by mask == 0 above */ + if (ncpu < 1) + return; + } + eflags = read_eflags(); + if ((eflags & PSL_I) == 0) + panic("absolutely cannot call smp_targeted_ipi_shootdown with interrupts already disabled"); + mtx_lock_spin(&smp_tlb_mtx); + smp_tlb_addr1 = addr1; + smp_tlb_addr2 = addr2; + atomic_store_rel_int(&smp_tlb_wait, 0); + if (mask == (u_int)-1) + ipi_all_but_self(vector); + else + ipi_selected(mask, vector); + while (smp_tlb_wait < ncpu) + ia32_pause(); + mtx_unlock_spin(&smp_tlb_mtx); +} +#endif + void smp_invltlb(void) { #if defined(APIC_IO) - if (smp_started) - ipi_all_but_self(IPI_INVLTLB); + if (smp_started) { + smp_tlb_shootdown(IPI_INVLTLB, 0, 0); +#ifdef COUNT_XINVLTLB_HITS + ipi_global++; +#endif + } #endif /* APIC_IO */ } void -invlpg(u_int addr) +smp_invlpg(vm_offset_t addr) { - __asm __volatile("invlpg (%0)"::"r"(addr):"memory"); +#if defined(APIC_IO) + if (smp_started) { + smp_tlb_shootdown(IPI_INVLPG, addr, 0); +#ifdef COUNT_XINVLTLB_HITS + ipi_page++; +#endif + } +#endif /* APIC_IO */ +} - /* send a message to the other CPUs */ - smp_invltlb(); +void +smp_invlpg_range(vm_offset_t addr1, vm_offset_t addr2) +{ +#if defined(APIC_IO) + if (smp_started) { + smp_tlb_shootdown(IPI_INVLRNG, addr1, addr2); +#ifdef COUNT_XINVLTLB_HITS + ipi_range++; + ipi_range_size += (addr2 - addr1) / PAGE_SIZE; +#endif + } +#endif /* APIC_IO */ } void -invltlb(void) +smp_masked_invltlb(u_int mask) { - u_long temp; +#if defined(APIC_IO) + if (smp_started) { + 
smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, 0, 0); +#ifdef COUNT_XINVLTLB_HITS + ipi_masked_global++; +#endif + } +#endif /* APIC_IO */ +} - /* - * This should be implemented as load_cr3(rcr3()) when load_cr3() is - * inlined. - */ - __asm __volatile("movl %%cr3, %0; movl %0, %%cr3":"=r"(temp) :: "memory"); +void +smp_masked_invlpg(u_int mask, vm_offset_t addr) +{ +#if defined(APIC_IO) + if (smp_started) { + smp_targeted_tlb_shootdown(mask, IPI_INVLPG, addr, 0); +#ifdef COUNT_XINVLTLB_HITS + ipi_masked_page++; +#endif + } +#endif /* APIC_IO */ +} - /* send a message to the other CPUs */ - smp_invltlb(); +void +smp_masked_invlpg_range(u_int mask, vm_offset_t addr1, vm_offset_t addr2) +{ +#if defined(APIC_IO) + if (smp_started) { + smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, addr1, addr2); +#ifdef COUNT_XINVLTLB_HITS + ipi_masked_range++; + ipi_masked_range_size += (addr2 - addr1) / PAGE_SIZE; +#endif + } +#endif /* APIC_IO */ } @@ -2251,7 +2455,7 @@ ap_init(void) /* spin */ ; /* BSP may have changed PTD while we were waiting */ - cpu_invltlb(); + invltlb(); #if defined(I586_CPU) && !defined(NO_F00F_HACK) lidt(&r_idt); @@ -2290,6 +2494,9 @@ ap_init(void) /* Build our map of 'other' CPUs. */ PCPU_SET(other_cpus, all_cpus & ~PCPU_GET(cpumask)); + if (bootverbose) + apic_dump("ap_init()"); + printf("SMP: AP CPU #%d Launched!\n", PCPU_GET(cpuid)); if (smp_cpus == mp_ncpus) { @@ -2325,7 +2532,8 @@ forwarded_statclock(struct trapframe frame) { mtx_lock_spin(&sched_lock); - statclock_process(curthread->td_kse, TRAPF_PC(&frame), TRAPF_USERMODE(&frame)); + statclock_process(curthread->td_kse, TRAPF_PC(&frame), + TRAPF_USERMODE(&frame)); mtx_unlock_spin(&sched_lock); } diff --git a/sys/i386/include/pmap.h b/sys/i386/include/pmap.h index e6ac669..e0789fc 100644 --- a/sys/i386/include/pmap.h +++ b/sys/i386/include/pmap.h @@ -151,7 +151,7 @@ extern pt_entry_t PTmap[], APTmap[]; extern pd_entry_t PTD[], APTD[]; extern pd_entry_t PTDpde, APTDpde; -extern pd_entry_t IdlePTD; /* physical address of "Idle" state directory */ +extern pd_entry_t *IdlePTD; /* physical address of "Idle" state directory */ #endif #ifdef _KERNEL @@ -253,14 +253,15 @@ extern char *ptvmmap; /* poor name! */ extern vm_offset_t virtual_avail; extern vm_offset_t virtual_end; -void pmap_bootstrap( vm_offset_t, vm_offset_t); +void pmap_bootstrap(vm_offset_t, vm_offset_t); void *pmap_mapdev(vm_offset_t, vm_size_t); void pmap_unmapdev(vm_offset_t, vm_size_t); pt_entry_t *pmap_pte(pmap_t, vm_offset_t) __pure2; vm_page_t pmap_use_pt(pmap_t, vm_offset_t); -#ifdef SMP void pmap_set_opt(void); -#endif +void pmap_invalidate_page(pmap_t, vm_offset_t); +void pmap_invalidate_range(pmap_t, vm_offset_t, vm_offset_t); +void pmap_invalidate_all(pmap_t); #endif /* _KERNEL */ diff --git a/sys/i386/include/smp.h b/sys/i386/include/smp.h index 872c5ec..d669c51 100644 --- a/sys/i386/include/smp.h +++ b/sys/i386/include/smp.h @@ -51,6 +51,8 @@ extern int current_postcode; /** XXX currently in mp_machdep.c */ * Interprocessor interrupts for SMP. 
*/ #define IPI_INVLTLB XINVLTLB_OFFSET +#define IPI_INVLPG XINVLPG_OFFSET +#define IPI_INVLRNG XINVLRNG_OFFSET #define IPI_RENDEZVOUS XRENDEZVOUS_OFFSET #define IPI_AST XCPUAST_OFFSET #define IPI_STOP XCPUSTOP_OFFSET @@ -107,7 +109,6 @@ void assign_apic_irq(int apic, int intpin, int irq); void revoke_apic_irq(int irq); void bsp_apic_configure(void); void init_secondary(void); -void smp_invltlb(void); void forward_statclock(void); void forwarded_statclock(struct trapframe frame); void forward_hardclock(void); @@ -119,6 +120,13 @@ void ipi_self(u_int ipi); #ifdef APIC_INTR_REORDER void set_lapic_isrloc(int, int); #endif /* APIC_INTR_REORDER */ +void smp_invlpg(vm_offset_t addr); +void smp_masked_invlpg(u_int mask, vm_offset_t addr); +void smp_invlpg_range(vm_offset_t startva, vm_offset_t endva); +void smp_masked_invlpg_range(u_int mask, vm_offset_t startva, + vm_offset_t endva); +void smp_invltlb(void); +void smp_masked_invltlb(u_int mask); /* global data in mpapic.c */ extern volatile lapic_t lapic; diff --git a/sys/i386/isa/apic_vector.s b/sys/i386/isa/apic_vector.s index 8490b1b..569ed50 100644 --- a/sys/i386/isa/apic_vector.s +++ b/sys/i386/isa/apic_vector.s @@ -260,30 +260,107 @@ Xspuriousint: iret /* - * Handle TLB shootdowns. + * Global address space TLB shootdown. */ .text SUPERALIGN_TEXT .globl Xinvltlb Xinvltlb: pushl %eax + pushl %ds + movl $KDSEL, %eax /* Kernel data selector */ + mov %ax, %ds #ifdef COUNT_XINVLTLB_HITS pushl %fs - movl $KPSEL, %eax + movl $KPSEL, %eax /* Private space selector */ mov %ax, %fs movl PCPU(CPUID), %eax popl %fs - ss - incl xhits(,%eax,4) + incl xhits_gbl(,%eax,4) #endif /* COUNT_XINVLTLB_HITS */ movl %cr3, %eax /* invalidate the TLB */ movl %eax, %cr3 - ss /* stack segment, avoid %ds load */ movl $0, lapic+LA_EOI /* End Of Interrupt to APIC */ + lock + incl smp_tlb_wait + + popl %ds + popl %eax + iret + +/* + * Single page TLB shootdown + */ + .text + SUPERALIGN_TEXT + .globl Xinvlpg +Xinvlpg: + pushl %eax + pushl %ds + movl $KDSEL, %eax /* Kernel data selector */ + mov %ax, %ds + +#ifdef COUNT_XINVLTLB_HITS + pushl %fs + movl $KPSEL, %eax /* Private space selector */ + mov %ax, %fs + movl PCPU(CPUID), %eax + popl %fs + incl xhits_pg(,%eax,4) +#endif /* COUNT_XINVLTLB_HITS */ + + movl smp_tlb_addr1, %eax + invlpg (%eax) /* invalidate single page */ + + movl $0, lapic+LA_EOI /* End Of Interrupt to APIC */ + + lock + incl smp_tlb_wait + + popl %ds + popl %eax + iret + +/* + * Page range TLB shootdown. + */ + .text + SUPERALIGN_TEXT + .globl Xinvlrng +Xinvlrng: + pushl %eax + pushl %edx + pushl %ds + movl $KDSEL, %eax /* Kernel data selector */ + mov %ax, %ds + +#ifdef COUNT_XINVLTLB_HITS + pushl %fs + movl $KPSEL, %eax /* Private space selector */ + mov %ax, %fs + movl PCPU(CPUID), %eax + popl %fs + incl xhits_rng(,%eax,4) +#endif /* COUNT_XINVLTLB_HITS */ + + movl smp_tlb_addr1, %edx + movl smp_tlb_addr2, %eax +1: invlpg (%edx) /* invalidate single page */ + addl $PAGE_SIZE, %edx + cmpl %edx, %eax + jb 1b + + movl $0, lapic+LA_EOI /* End Of Interrupt to APIC */ + + lock + incl smp_tlb_wait + + popl %ds + popl %edx popl %eax iret diff --git a/sys/i386/isa/intr_machdep.h b/sys/i386/isa/intr_machdep.h index 41542d0..7179268 100644 --- a/sys/i386/isa/intr_machdep.h +++ b/sys/i386/isa/intr_machdep.h @@ -88,6 +88,7 @@ /* IDT vector base for regular (aka. 
slow) and fast interrupts */ #define TPR_SLOW_INTS 0x20 #define TPR_FAST_INTS 0x60 +/* XXX note that the AST interrupt is at 0x50 */ /* blocking values for local APIC Task Priority Register */ #define TPR_BLOCK_HWI 0x4f /* hardware INTs */ @@ -104,20 +105,23 @@ #endif /** TEST_TEST1 */ /* TLB shootdowns */ -#define XINVLTLB_OFFSET (ICU_OFFSET + 112) +#define XINVLTLB_OFFSET (ICU_OFFSET + 112) /* 0x90 */ +#define XINVLPG_OFFSET (ICU_OFFSET + 113) /* 0x91 */ +#define XINVLRNG_OFFSET (ICU_OFFSET + 114) /* 0x92 */ /* inter-cpu clock handling */ -#define XHARDCLOCK_OFFSET (ICU_OFFSET + 113) -#define XSTATCLOCK_OFFSET (ICU_OFFSET + 114) +#define XHARDCLOCK_OFFSET (ICU_OFFSET + 120) /* 0x98 */ +#define XSTATCLOCK_OFFSET (ICU_OFFSET + 121) /* 0x99 */ /* inter-CPU rendezvous */ -#define XRENDEZVOUS_OFFSET (ICU_OFFSET + 115) +#define XRENDEZVOUS_OFFSET (ICU_OFFSET + 122) /* 0x9A */ /* IPI to generate an additional software trap at the target CPU */ -#define XCPUAST_OFFSET (ICU_OFFSET + 48) +/* XXX in the middle of the interrupt range, overlapping IRQ48 */ +#define XCPUAST_OFFSET (ICU_OFFSET + 48) /* 0x50 */ /* IPI to signal CPUs to stop and wait for another CPU to restart them */ -#define XCPUSTOP_OFFSET (ICU_OFFSET + 128) +#define XCPUSTOP_OFFSET (ICU_OFFSET + 128) /* 0xA0 */ /* * Note: this vector MUST be xxxx1111, 32 + 223 = 255 = 0xff: @@ -194,7 +198,9 @@ inthand_t IDTVEC(intr28), IDTVEC(intr29), IDTVEC(intr30), IDTVEC(intr31); inthand_t - Xinvltlb, /* TLB shootdowns */ + Xinvltlb, /* TLB shootdowns - global */ + Xinvlpg, /* TLB shootdowns - 1 page */ + Xinvlrng, /* TLB shootdowns - page range */ Xhardclock, /* Forward hardclock() */ Xstatclock, /* Forward statclock() */ Xcpuast, /* Additional software trap on other cpu */ diff --git a/sys/kern/subr_witness.c b/sys/kern/subr_witness.c index 73934cb..444b087 100644 --- a/sys/kern/subr_witness.c +++ b/sys/kern/subr_witness.c @@ -223,6 +223,9 @@ static struct witness_order_list_entry order_lists[] = { { "icu", &lock_class_mtx_spin }, #ifdef SMP { "smp rendezvous", &lock_class_mtx_spin }, +#if defined(__i386__) && defined(APIC_IO) + { "tlb", &lock_class_mtx_spin }, +#endif #endif { "clk", &lock_class_mtx_spin }, { "mutex profiling lock", &lock_class_mtx_spin }, -- cgit v1.1
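
Editor's note on the popcnt() discussion in the mptable.h hunk above: the comment compares the shift-and-mask (SWAR) reduction actually used by smp_targeted_tlb_shootdown() with a multiply-based variant and with loop-based forms. The stand-alone harness below (illustrative only, not part of the patch; the popcnt_swar/popcnt_mul/popcnt_naive names are invented for this example) cross-checks those variants so a reader can convince themselves they agree. It builds with any C compiler, e.g. cc -O2 popcnt_check.c.

	#include <stdint.h>
	#include <stdio.h>

	static uint32_t
	popcnt_swar(uint32_t m)
	{
		/* Same pairwise-sum reduction as the popcnt() added in mptable.h. */
		m = (m & 0x55555555) + ((m & 0xaaaaaaaa) >> 1);
		m = (m & 0x33333333) + ((m & 0xcccccccc) >> 2);
		m = (m & 0x0f0f0f0f) + ((m & 0xf0f0f0f0) >> 4);
		m = (m & 0x00ff00ff) + ((m & 0xff00ff00) >> 8);
		m = (m & 0x0000ffff) + ((m & 0xffff0000) >> 16);
		return (m);
	}

	static uint32_t
	popcnt_mul(uint32_t v)
	{
		/* The multiply-based form quoted in the patch comment. */
		v -= (v >> 1) & 0x55555555;
		v = (v & 0x33333333) + ((v >> 2) & 0x33333333);
		v = (v + (v >> 4)) & 0x0f0f0f0f;
		return ((v * 0x01010101) >> 24);
	}

	static uint32_t
	popcnt_naive(uint32_t m)
	{
		uint32_t bits;

		for (bits = 0; m != 0; m &= m - 1)	/* clear lowest set bit */
			bits++;
		return (bits);
	}

	int
	main(void)
	{
		uint32_t masks[] = { 0, 1, 0x80000000, 0xffffffff, 0xaaaaaaaa,
		    0x12345678, (1u << 4) | (1u << 9) };
		size_t i;

		for (i = 0; i < sizeof(masks) / sizeof(masks[0]); i++)
			printf("0x%08x: swar=%u mul=%u naive=%u\n",
			    (unsigned)masks[i],
			    (unsigned)popcnt_swar(masks[i]),
			    (unsigned)popcnt_mul(masks[i]),
			    (unsigned)popcnt_naive(masks[i]));
		return (0);
	}

All three return the same count for every mask; which one is fastest is exactly the compiler/CPU-dependent question the original comment raises.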
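
A second editor's sketch, on the shootdown handshake itself: smp_tlb_shootdown() and smp_targeted_tlb_shootdown() above use a counter rendezvous — the initiator takes the spin lock, publishes smp_tlb_addr1/addr2, clears smp_tlb_wait with a release store, sends the IPI, and spins (with ia32_pause()) until every targeted cpu has incremented the counter from its Xinvltlb/Xinvlpg/Xinvlrng handler. The program below models that handshake in user space with pthreads standing in for the other cpus; it is an assumption-laden illustration of the protocol, not kernel code, and error returns are ignored for brevity. Build with cc -pthread.

	#include <pthread.h>
	#include <stdatomic.h>
	#include <stdio.h>

	#define NCPU	4			/* pretend we have four cpus */

	static atomic_int smp_tlb_wait;		/* mirrors the patch's counter */

	/* Stand-in for an IPI handler: do the invalidation, then acknowledge. */
	static void *
	ipi_handler(void *arg)
	{
		(void)arg;
		/* a real handler would invlpg or reload %cr3 here */
		atomic_fetch_add_explicit(&smp_tlb_wait, 1, memory_order_release);
		return (NULL);
	}

	int
	main(void)
	{
		pthread_t t[NCPU - 1];
		int i, ncpu;

		ncpu = NCPU - 1;		/* does not shoot down self */
		atomic_store_explicit(&smp_tlb_wait, 0, memory_order_release);
		for (i = 0; i < ncpu; i++)	/* "ipi_all_but_self()" */
			pthread_create(&t[i], NULL, ipi_handler, NULL);
		while (atomic_load_explicit(&smp_tlb_wait,
		    memory_order_acquire) < ncpu)
			;			/* initiator spins; cf. ia32_pause() */
		printf("all %d remote cpus completed the shootdown\n", ncpu);
		for (i = 0; i < ncpu; i++)
			pthread_join(t[i], NULL);
		return (0);
	}

The release increment in the handler paired with the acquire load in the spin loop plays the same role as the lock-prefixed incl in the assembly handlers: by the time the initiator sees the final count, all acknowledging cpus have finished their invalidations.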