diff options
author | kib <kib@FreeBSD.org> | 2013-08-30 07:59:49 +0000 |
---|---|---|
committer | kib <kib@FreeBSD.org> | 2013-08-30 07:59:49 +0000 |
commit | a2b5da0090b331918b7db2ece8b9ca5d545d4a6c (patch) | |
tree | 04770c540ba8145f9288bad14ac5d4d59ac30c95 | |
parent | 748f95c68727abdadaf3ea8816cc19784d05411d (diff) | |
download | FreeBSD-src-a2b5da0090b331918b7db2ece8b9ca5d545d4a6c.zip FreeBSD-src-a2b5da0090b331918b7db2ece8b9ca5d545d4a6c.tar.gz |
Implement support for process-context identifiers ('PCID') on
Intel CPUs. The feature tags TLB entries with the ID of the address
space, which makes it possible to avoid TLB invalidation on a context
switch; it is available only in long mode. In microbenchmarks, using
PCID decreased the latency of context switches by ~30% on SandyBridge
class desktop CPUs, as measured with the lat_ctx program from lmbench.
If available, use the INVPCID instruction when a TLB entry in a
non-current address space needs to be invalidated. The instruction is
typically available on Haswell CPUs.
If needed, the use of PCID can be turned off by setting the
vm.pmap.pcid_enabled loader tunable to 0. The state of the
feature is reported by the vm.pmap.pcid_enabled sysctl. The sysctl
vm.pmap.pcid_save_cnt reports the number of context switches that
avoided invalidating the TLB; compare it with the total number of
context switches, available as the sysctl vm.stats.sys.v_swtch.
Sponsored by: The FreeBSD Foundation
Reviewed by: alc
Tested by: pho, bf
-rw-r--r-- | sys/amd64/amd64/apic_vector.S | 241 | ||||
-rw-r--r-- | sys/amd64/amd64/cpu_switch.S | 34 | ||||
-rw-r--r-- | sys/amd64/amd64/genassym.c | 4 | ||||
-rw-r--r-- | sys/amd64/amd64/machdep.c | 2 | ||||
-rw-r--r-- | sys/amd64/amd64/mp_machdep.c | 62 | ||||
-rw-r--r-- | sys/amd64/amd64/pmap.c | 276 | ||||
-rw-r--r-- | sys/amd64/amd64/vm_machdep.c | 2 | ||||
-rw-r--r-- | sys/amd64/include/pcpu.h | 2 | ||||
-rw-r--r-- | sys/amd64/include/pmap.h | 2 | ||||
-rw-r--r-- | sys/amd64/include/smp.h | 15 |
10 files changed, 527 insertions, 113 deletions
diff --git a/sys/amd64/amd64/apic_vector.S b/sys/amd64/amd64/apic_vector.S index 7551cc5..e868cf5 100644 --- a/sys/amd64/amd64/apic_vector.S +++ b/sys/amd64/amd64/apic_vector.S @@ -43,6 +43,12 @@ #include "assym.s" +#ifdef SMP +#define LK lock ; +#else +#define LK +#endif + /* * I/O Interrupt Entry Point. Rather than having one entry point for * each interrupt source, we use one entry point for each 32-bit word @@ -149,6 +155,38 @@ IDTVEC(xen_intr_upcall) * Global address space TLB shootdown. */ .text + +#define NAKE_INTR_CS 24 + + SUPERALIGN_TEXT +global_invltlb: + movl %cr4,%eax + andl $~0x80,%eax + movl %eax,%cr4 + orl $0x80,%eax + movl %eax,%cr4 +invltlb_ret_clear_pm_save: + movq smp_tlb_pmap,%rdx + testq %rdx,%rdx + jz invltlb_ret + testb $SEL_RPL_MASK,NAKE_INTR_CS(%rsp) + jz 1f + swapgs +1: + movl PCPU(CPUID),%eax + jz 2f + swapgs +2: + LK btcl %eax,PM_SAVE(%rdx) + SUPERALIGN_TEXT +invltlb_ret: + movq lapic, %rax + movl $0, LA_EOI(%rax) /* End Of Interrupt to APIC */ + LK incl smp_tlb_wait + popq %rdx + popq %rax + jmp doreti_iret + SUPERALIGN_TEXT IDTVEC(invltlb) #if defined(COUNT_XINVLTLB_HITS) || defined(COUNT_IPIS) @@ -165,18 +203,44 @@ IDTVEC(invltlb) #endif pushq %rax + pushq %rdx - movq %cr3, %rax /* invalidate the TLB */ - movq %rax, %cr3 - - movq lapic, %rax - movl $0, LA_EOI(%rax) /* End Of Interrupt to APIC */ - - lock - incl smp_tlb_wait - - popq %rax - jmp doreti_iret + movq %cr3,%rax + cmpl $0,pmap_pcid_enabled + je 2f + + movq $smp_tlb_invpcid,%rdx + cmpl $0,(%rdx) + je global_invltlb + cmpl $-1,(%rdx) + je global_invltlb + + /* + * Non-zero smp_tlb_invpcid, only invalidate TLB for entries with + * current PCID. + */ + cmpl $0,invpcid_works + je 1f + /* Use invpcid if available. */ + movl $1,%eax /* INVPCID_CTX */ + /* invpcid (%rdx),%rax */ + .byte 0x66,0x0f,0x38,0x82,0x02 + jmp invltlb_ret_clear_pm_save +1: + /* Otherwise reload %cr3 twice. */ + movq pcid_cr3,%rdx + cmpq %rax,%rdx + je 2f + movq %rdx,%cr3 /* Invalidate, bit 63 is zero. 
*/ + btsq $63,%rax + + /* + * Invalidate the TLB if PCID is not enabled. + * Restore the old address space. + */ +2: + movq %rax,%cr3 + jmp invltlb_ret_clear_pm_save /* * Single page TLB shootdown @@ -198,18 +262,54 @@ IDTVEC(invlpg) #endif pushq %rax - - movq smp_tlb_addr1, %rax - invlpg (%rax) /* invalidate single page */ - - movq lapic, %rax - movl $0, LA_EOI(%rax) /* End Of Interrupt to APIC */ - - lock - incl smp_tlb_wait - - popq %rax - jmp doreti_iret + pushq %rdx + movq $smp_tlb_invpcid,%rdx + cmpl $0,pmap_pcid_enabled + je 3f + cmpl $0,invpcid_works + jne 2f + + /* kernel pmap - use invlpg to invalidate global mapping */ + cmpl $0,(%rdx) + je 3f + cmpl $-1,(%rdx) + je global_invltlb + + /* + * PCID supported, but INVPCID is not. + * Temporarily switch to the target address space and do INVLPG. + */ + pushq %rcx + movq %cr3,%rcx + movq pcid_cr3,%rax + cmp %rcx,%rax + je 1f + btsq $63,%rax + movq %rax,%cr3 +1: movq 8(%rdx),%rax + invlpg (%rax) + btsq $63,%rcx + movq %rcx,%cr3 + popq %rcx + jmp invltlb_ret + + /* + * Invalidate the TLB entry using INVPCID_ADDR. + */ +2: + xorl %eax,%eax +/* invpcid (%rdx),%rax */ + .byte 0x66,0x0f,0x38,0x82,0x02 + jmp invltlb_ret + + /* + * PCID is not supported or kernel pmap. + * Invalidate single page using INVLPG. + */ +3: + movq 8(%rdx),%rax + invlpg (%rax) + jmp invltlb_ret /* * Page range TLB shootdown. 
@@ -232,23 +332,76 @@ IDTVEC(invlrng) pushq %rax pushq %rdx - - movq smp_tlb_addr1, %rdx - movq smp_tlb_addr2, %rax + movq $smp_tlb_invpcid,%rdx + cmpl $0,pmap_pcid_enabled + jne invlrng_single_page + cmpl $0,invpcid_works + jne invlrng_invpcid + + /* kernel pmap - use invlpg to invalidate global mapping */ + cmpl $0,(%rdx) + je invlrng_single_page + cmpl $-1,(%rdx) + je global_invltlb + + pushq %rcx + movq %cr3,%rcx + movq pcid_cr3,%rax + cmpq %rcx,%rax + je 1f + btsq $63,%rax + movq %rax,%cr3 +1: + movq 8(%rdx),%rdx + movq smp_tlb_addr2,%rax +2: + invlpg (%rdx) + addq $PAGE_SIZE,%rdx + cmpq %rax,%rdx + jb 2b + btsq $63,%rcx + movq %rcx,%cr3 + popq %rcx + jmp invltlb_ret + +invlrng_invpcid: + testb $SEL_RPL_MASK,NAKE_INTR_CS(%rsp) + jz 1f + swapgs +1: + pushq %rcx + movq (%rdx),%rcx + movq %rcx,PCPU(INVPCID_DESCR) + movq 8(%rdx),%rax + movq %rax,PCPU(INVPCID_DESCR)+8 + movq smp_tlb_addr2,%rcx + xorl %eax,%eax + movq $PC_INVPCID_DESCR,%rdx + gs + subq 8(%rdx),%rcx + shrq $PAGE_SHIFT,%rcx +2: + gs +// invpcid (%rdx),%rax + .byte 0x66,0x0f,0x38,0x82,0x02 + gs + addq $PAGE_SIZE,8(%rdx) + dec %rcx + jne 2b + popq %rcx + testb $SEL_RPL_MASK,NAKE_INTR_CS(%rsp) + jz invltlb_ret + swapgs + jmp invltlb_ret + +invlrng_single_page: + movq 8(%rdx),%rdx + movq smp_tlb_addr2,%rax 1: invlpg (%rdx) /* invalidate single page */ - addq $PAGE_SIZE, %rdx - cmpq %rax, %rdx + addq $PAGE_SIZE,%rdx + cmpq %rax,%rdx jb 1b - - movq lapic, %rax - movl $0, LA_EOI(%rax) /* End Of Interrupt to APIC */ - - lock - incl smp_tlb_wait - - popq %rdx - popq %rax - jmp doreti_iret + jmp invltlb_ret /* * Invalidate cache. @@ -265,17 +418,9 @@ IDTVEC(invlcache) #endif pushq %rax - + pushq %rdx wbinvd - - movq lapic, %rax - movl $0, LA_EOI(%rax) /* End Of Interrupt to APIC */ - - lock - incl smp_tlb_wait - - popq %rax - jmp doreti_iret + jmp invltlb_ret /* * Handler for IPIs sent via the per-cpu IPI bitmap. 
diff --git a/sys/amd64/amd64/cpu_switch.S b/sys/amd64/amd64/cpu_switch.S index ed1ccb5..ac30990 100644 --- a/sys/amd64/amd64/cpu_switch.S +++ b/sys/amd64/amd64/cpu_switch.S @@ -77,8 +77,7 @@ ENTRY(cpu_throw) LK btrl %eax,PM_ACTIVE(%rdx) /* clear old */ 1: movq TD_PCB(%rsi),%r8 /* newtd->td_pcb */ - movq PCB_CR3(%r8),%rdx - movq %rdx,%cr3 /* new address space */ + movq PCB_CR3(%r8),%rcx /* new address space */ jmp swact END(cpu_throw) @@ -145,20 +144,41 @@ ctx_switch_xsave: SETLK %rdx, TD_LOCK(%rdi) /* Release the old thread */ jmp sw1 swinact: - movq %rcx,%cr3 /* new address space */ - movl PCPU(CPUID), %eax + movl PCPU(CPUID),%eax /* Release bit from old pmap->pm_active */ - movq PCPU(CURPMAP),%rcx - LK btrl %eax,PM_ACTIVE(%rcx) /* clear old */ - SETLK %rdx, TD_LOCK(%rdi) /* Release the old thread */ + movq PCPU(CURPMAP),%r12 + LK btrl %eax,PM_ACTIVE(%r12) /* clear old */ + SETLK %rdx,TD_LOCK(%rdi) /* Release the old thread */ swact: /* Set bit in new pmap->pm_active */ movq TD_PROC(%rsi),%rdx /* newproc */ movq P_VMSPACE(%rdx), %rdx addq $VM_PMAP,%rdx + cmpl $-1,PM_PCID(%rdx) + je 1f + LK btsl %eax,PM_SAVE(%rdx) + jnc 1f + btsq $63,%rcx /* CR3_PCID_SAVE */ + incq PCPU(PM_SAVE_CNT) +1: + movq %rcx,%cr3 /* new address space */ LK btsl %eax,PM_ACTIVE(%rdx) /* set new */ movq %rdx,PCPU(CURPMAP) + /* + * We might lose the race and other CPU might have changed + * the pmap after we set our bit in pmap->pm_save. Recheck. + * Reload %cr3 with CR3_PCID_SAVE bit cleared if pmap was + * modified, causing TLB flush for this pcid. 
+ */ + btrq $63,%rcx + jnc 1f + LK btsl %eax,PM_SAVE(%rdx) + jc 1f + decq PCPU(PM_SAVE_CNT) + movq %rcx,%cr3 +1: + sw1: #if defined(SCHED_ULE) && defined(SMP) /* Wait for the new thread to become unblocked */ diff --git a/sys/amd64/amd64/genassym.c b/sys/amd64/amd64/genassym.c index 3043bb5..62017e7 100644 --- a/sys/amd64/amd64/genassym.c +++ b/sys/amd64/amd64/genassym.c @@ -76,6 +76,8 @@ __FBSDID("$FreeBSD$"); ASSYM(P_VMSPACE, offsetof(struct proc, p_vmspace)); ASSYM(VM_PMAP, offsetof(struct vmspace, vm_pmap)); ASSYM(PM_ACTIVE, offsetof(struct pmap, pm_active)); +ASSYM(PM_SAVE, offsetof(struct pmap, pm_save)); +ASSYM(PM_PCID, offsetof(struct pmap, pm_pcid)); ASSYM(P_MD, offsetof(struct proc, p_md)); ASSYM(MD_LDT, offsetof(struct mdproc, md_ldt)); @@ -225,6 +227,8 @@ ASSYM(PC_GS32P, offsetof(struct pcpu, pc_gs32p)); ASSYM(PC_LDT, offsetof(struct pcpu, pc_ldt)); ASSYM(PC_COMMONTSSP, offsetof(struct pcpu, pc_commontssp)); ASSYM(PC_TSS, offsetof(struct pcpu, pc_tss)); +ASSYM(PC_PM_SAVE_CNT, offsetof(struct pcpu, pc_pm_save_cnt)); +ASSYM(PC_INVPCID_DESCR, offsetof(struct pcpu, pc_invpcid_descr)); ASSYM(LA_VER, offsetof(struct LAPIC, version)); ASSYM(LA_TPR, offsetof(struct LAPIC, tpr)); diff --git a/sys/amd64/amd64/machdep.c b/sys/amd64/amd64/machdep.c index 7f7e54a..f3969d3 100644 --- a/sys/amd64/amd64/machdep.c +++ b/sys/amd64/amd64/machdep.c @@ -1909,7 +1909,7 @@ hammer_time(u_int64_t modulep, u_int64_t physfree) /* setup proc 0's pcb */ thread0.td_pcb->pcb_flags = 0; - thread0.td_pcb->pcb_cr3 = KPML4phys; + thread0.td_pcb->pcb_cr3 = KPML4phys; /* PCID 0 is reserved for kernel */ thread0.td_frame = &proc0_tf; env = getenv("kernelname"); diff --git a/sys/amd64/amd64/mp_machdep.c b/sys/amd64/amd64/mp_machdep.c index 267b933..530aa61 100644 --- a/sys/amd64/amd64/mp_machdep.c +++ b/sys/amd64/amd64/mp_machdep.c @@ -107,9 +107,11 @@ struct pcb stoppcbs[MAXCPU]; struct pcb **susppcbs; /* Variables needed for SMP tlb shootdown. 
*/ -vm_offset_t smp_tlb_addr1; vm_offset_t smp_tlb_addr2; +struct invpcid_descr smp_tlb_invpcid; volatile int smp_tlb_wait; +uint64_t pcid_cr3; +pmap_t smp_tlb_pmap; #ifdef COUNT_IPIS /* Interrupt counts. */ @@ -603,6 +605,8 @@ cpu_mp_announce(void) } } +extern int pmap_pcid_enabled; + /* * AP CPU's call this to initialize themselves. */ @@ -768,6 +772,8 @@ init_secondary(void) */ load_cr4(rcr4() | CR4_PGE); + if (pmap_pcid_enabled) + load_cr4(rcr4() | CR4_PCIDE); load_ds(_udatasel); load_es(_udatasel); load_fs(_ufssel); @@ -1119,7 +1125,8 @@ ipi_send_cpu(int cpu, u_int ipi) * Flush the TLB on all other CPU's */ static void -smp_tlb_shootdown(u_int vector, vm_offset_t addr1, vm_offset_t addr2) +smp_tlb_shootdown(u_int vector, pmap_t pmap, vm_offset_t addr1, + vm_offset_t addr2) { u_int ncpu; @@ -1129,8 +1136,16 @@ smp_tlb_shootdown(u_int vector, vm_offset_t addr1, vm_offset_t addr2) if (!(read_rflags() & PSL_I)) panic("%s: interrupts disabled", __func__); mtx_lock_spin(&smp_ipi_mtx); - smp_tlb_addr1 = addr1; + smp_tlb_invpcid.addr = addr1; + if (pmap == NULL) { + smp_tlb_invpcid.pcid = 0; + } else { + smp_tlb_invpcid.pcid = pmap->pm_pcid; + pcid_cr3 = DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4) | + (pmap->pm_pcid == -1 ? 
0 : pmap->pm_pcid); + } smp_tlb_addr2 = addr2; + smp_tlb_pmap = pmap; atomic_store_rel_int(&smp_tlb_wait, 0); ipi_all_but_self(vector); while (smp_tlb_wait < ncpu) @@ -1139,7 +1154,8 @@ smp_tlb_shootdown(u_int vector, vm_offset_t addr1, vm_offset_t addr2) } static void -smp_targeted_tlb_shootdown(cpuset_t mask, u_int vector, vm_offset_t addr1, vm_offset_t addr2) +smp_targeted_tlb_shootdown(cpuset_t mask, u_int vector, pmap_t pmap, + vm_offset_t addr1, vm_offset_t addr2) { int cpu, ncpu, othercpus; @@ -1155,8 +1171,16 @@ smp_targeted_tlb_shootdown(cpuset_t mask, u_int vector, vm_offset_t addr1, vm_of if (!(read_rflags() & PSL_I)) panic("%s: interrupts disabled", __func__); mtx_lock_spin(&smp_ipi_mtx); - smp_tlb_addr1 = addr1; + smp_tlb_invpcid.addr = addr1; + if (pmap == NULL) { + smp_tlb_invpcid.pcid = 0; + } else { + smp_tlb_invpcid.pcid = pmap->pm_pcid; + pcid_cr3 = DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4) | + (pmap->pm_pcid == -1 ? 0 : pmap->pm_pcid); + } smp_tlb_addr2 = addr2; + smp_tlb_pmap = pmap; atomic_store_rel_int(&smp_tlb_wait, 0); if (CPU_ISFULLSET(&mask)) { ncpu = othercpus; @@ -1182,15 +1206,15 @@ smp_cache_flush(void) { if (smp_started) - smp_tlb_shootdown(IPI_INVLCACHE, 0, 0); + smp_tlb_shootdown(IPI_INVLCACHE, NULL, 0, 0); } void -smp_invltlb(void) +smp_invltlb(pmap_t pmap) { if (smp_started) { - smp_tlb_shootdown(IPI_INVLTLB, 0, 0); + smp_tlb_shootdown(IPI_INVLTLB, pmap, 0, 0); #ifdef COUNT_XINVLTLB_HITS ipi_global++; #endif @@ -1198,11 +1222,11 @@ smp_invltlb(void) } void -smp_invlpg(vm_offset_t addr) +smp_invlpg(pmap_t pmap, vm_offset_t addr) { if (smp_started) { - smp_tlb_shootdown(IPI_INVLPG, addr, 0); + smp_tlb_shootdown(IPI_INVLPG, pmap, addr, 0); #ifdef COUNT_XINVLTLB_HITS ipi_page++; #endif @@ -1210,11 +1234,11 @@ smp_invlpg(vm_offset_t addr) } void -smp_invlpg_range(vm_offset_t addr1, vm_offset_t addr2) +smp_invlpg_range(pmap_t pmap, vm_offset_t addr1, vm_offset_t addr2) { if (smp_started) { - smp_tlb_shootdown(IPI_INVLRNG, addr1, addr2); 
+ smp_tlb_shootdown(IPI_INVLRNG, pmap, addr1, addr2); #ifdef COUNT_XINVLTLB_HITS ipi_range++; ipi_range_size += (addr2 - addr1) / PAGE_SIZE; @@ -1223,11 +1247,11 @@ smp_invlpg_range(vm_offset_t addr1, vm_offset_t addr2) } void -smp_masked_invltlb(cpuset_t mask) +smp_masked_invltlb(cpuset_t mask, pmap_t pmap) { if (smp_started) { - smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, 0, 0); + smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, NULL, 0, 0); #ifdef COUNT_XINVLTLB_HITS ipi_masked_global++; #endif @@ -1235,11 +1259,11 @@ smp_masked_invltlb(cpuset_t mask) } void -smp_masked_invlpg(cpuset_t mask, vm_offset_t addr) +smp_masked_invlpg(cpuset_t mask, pmap_t pmap, vm_offset_t addr) { if (smp_started) { - smp_targeted_tlb_shootdown(mask, IPI_INVLPG, addr, 0); + smp_targeted_tlb_shootdown(mask, IPI_INVLPG, pmap, addr, 0); #ifdef COUNT_XINVLTLB_HITS ipi_masked_page++; #endif @@ -1247,11 +1271,13 @@ smp_masked_invlpg(cpuset_t mask, vm_offset_t addr) } void -smp_masked_invlpg_range(cpuset_t mask, vm_offset_t addr1, vm_offset_t addr2) +smp_masked_invlpg_range(cpuset_t mask, pmap_t pmap, vm_offset_t addr1, + vm_offset_t addr2) { if (smp_started) { - smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, addr1, addr2); + smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, pmap, addr1, + addr2); #ifdef COUNT_XINVLTLB_HITS ipi_masked_range++; ipi_masked_range_size += (addr2 - addr1) / PAGE_SIZE; diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c index 851f92a..bca40f0 100644 --- a/sys/amd64/amd64/pmap.c +++ b/sys/amd64/amd64/pmap.c @@ -116,11 +116,8 @@ __FBSDID("$FreeBSD$"); #include <sys/vmmeter.h> #include <sys/sched.h> #include <sys/sysctl.h> -#ifdef SMP +#include <sys/_unrhdr.h> #include <sys/smp.h> -#else -#include <sys/cpuset.h> -#endif #include <vm/vm.h> #include <vm/vm_param.h> @@ -250,6 +247,53 @@ static struct md_page *pv_table; pt_entry_t *CMAP1 = 0; caddr_t CADDR1 = 0; +static struct unrhdr pcid_unr; +static struct mtx pcid_mtx; +int pmap_pcid_enabled = 1; 
+SYSCTL_INT(_vm_pmap, OID_AUTO, pcid_enabled, CTLFLAG_RDTUN, &pmap_pcid_enabled, + 0, "Is TLB Context ID enabled ?"); +int invpcid_works = 0; + +/* + * Perform the guaranteed invalidation of all TLB entries. This + * includes the global entries, and entries in all PCIDs, not only the + * current context. The function works both on non-PCID CPUs and CPUs + * with the PCID turned off or on. See IA-32 SDM Vol. 3a 4.10.4.1 + * Operations that Invalidate TLBs and Paging-Structure Caches. + */ +static __inline void +invltlb_globpcid(void) +{ + uint64_t cr4; + + cr4 = rcr4(); + load_cr4(cr4 & ~CR4_PGE); + /* + * Although preemption at this point could be detrimental to + * performance, it would not lead to an error. PG_G is simply + * ignored if CR4.PGE is clear. Moreover, in case this block + * is re-entered, the load_cr4() either above or below will + * modify CR4.PGE flushing the TLB. + */ + load_cr4(cr4 | CR4_PGE); +} + +static int +pmap_pcid_save_cnt_proc(SYSCTL_HANDLER_ARGS) +{ + int i; + uint64_t res; + + res = 0; + CPU_FOREACH(i) { + res += cpuid_to_pcpu[i]->pc_pm_save_cnt; + } + return (sysctl_handle_64(oidp, &res, 0, req)); +} +SYSCTL_PROC(_vm_pmap, OID_AUTO, pcid_save_cnt, CTLTYPE_U64 | CTLFLAG_RW | + CTLFLAG_MPSAFE, NULL, 0, pmap_pcid_save_cnt_proc, "QU", + "Count of saved TLB context on switch"); + /* * Crashdump maps. */ @@ -685,6 +729,7 @@ pmap_bootstrap(vm_paddr_t *firstaddr) PMAP_LOCK_INIT(kernel_pmap); kernel_pmap->pm_pml4 = (pdp_entry_t *)PHYS_TO_DMAP(KPML4phys); CPU_FILL(&kernel_pmap->pm_active); /* don't allow deactivation */ + CPU_ZERO(&kernel_pmap->pm_save); TAILQ_INIT(&kernel_pmap->pm_pvchunk); /* @@ -716,6 +761,21 @@ pmap_bootstrap(vm_paddr_t *firstaddr) /* Initialize the PAT MSR. */ pmap_init_pat(); + +#ifdef SMP + /* Initialize TLB Context Id. 
*/ + TUNABLE_INT_FETCH("vm.pmap.pcid_enabled", &pmap_pcid_enabled); + if ((cpu_feature2 & CPUID2_PCID) != 0 && pmap_pcid_enabled) { + load_cr4(rcr4() | CR4_PCIDE); + mtx_init(&pcid_mtx, "pcid", NULL, MTX_DEF); + init_unrhdr(&pcid_unr, 1, (1 << 12) - 1, &pcid_mtx); + /* Check for INVPCID support */ + invpcid_works = (cpu_stdext_feature & CPUID_STDEXT_INVPCID) + != 0; + kernel_pmap->pm_pcid = 0; + } else +#endif + pmap_pcid_enabled = 0; } /* @@ -952,7 +1012,6 @@ pmap_cache_bits(int mode, boolean_t is_pde) static void pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde) { - u_long cr4; if ((newpde & PG_PS) == 0) /* Demotion: flush a specific 2MB page mapping. */ @@ -968,19 +1027,34 @@ pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde) * Promotion: flush every 4KB page mapping from the TLB, * including any global (PG_G) mappings. */ - cr4 = rcr4(); - load_cr4(cr4 & ~CR4_PGE); - /* - * Although preemption at this point could be detrimental to - * performance, it would not lead to an error. PG_G is simply - * ignored if CR4.PGE is clear. Moreover, in case this block - * is re-entered, the load_cr4() either above or below will - * modify CR4.PGE flushing the TLB. - */ - load_cr4(cr4 | CR4_PGE); + invltlb_globpcid(); } } #ifdef SMP + +static void +pmap_invalidate_page_pcid(pmap_t pmap, vm_offset_t va) +{ + struct invpcid_descr d; + uint64_t cr3; + + if (invpcid_works) { + d.pcid = pmap->pm_pcid; + d.pad = 0; + d.addr = va; + invpcid(&d, INVPCID_ADDR); + return; + } + + cr3 = rcr3(); + critical_enter(); + load_cr3(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4) | pmap->pm_pcid | + CR3_PCID_SAVE); + invlpg(va); + load_cr3(cr3 | CR3_PCID_SAVE); + critical_exit(); +} + /* * For SMP, these functions have to use the IPI mechanism for coherence. 
* @@ -1008,21 +1082,68 @@ pmap_invalidate_page(pmap_t pmap, vm_offset_t va) sched_pin(); if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) { - invlpg(va); - smp_invlpg(va); + if (!pmap_pcid_enabled) { + invlpg(va); + } else { + if (pmap->pm_pcid != -1 && pmap->pm_pcid != 0) { + if (pmap == PCPU_GET(curpmap)) + invlpg(va); + else + pmap_invalidate_page_pcid(pmap, va); + } else { + invltlb_globpcid(); + } + } + smp_invlpg(pmap, va); } else { cpuid = PCPU_GET(cpuid); other_cpus = all_cpus; CPU_CLR(cpuid, &other_cpus); if (CPU_ISSET(cpuid, &pmap->pm_active)) invlpg(va); - CPU_AND(&other_cpus, &pmap->pm_active); + else if (pmap_pcid_enabled) { + if (pmap->pm_pcid != -1 && pmap->pm_pcid != 0) + pmap_invalidate_page_pcid(pmap, va); + else + invltlb_globpcid(); + } + if (pmap_pcid_enabled) + CPU_AND(&other_cpus, &pmap->pm_save); + else + CPU_AND(&other_cpus, &pmap->pm_active); if (!CPU_EMPTY(&other_cpus)) - smp_masked_invlpg(other_cpus, va); + smp_masked_invlpg(other_cpus, pmap, va); } sched_unpin(); } +static void +pmap_invalidate_range_pcid(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) +{ + struct invpcid_descr d; + uint64_t cr3; + vm_offset_t addr; + + if (invpcid_works) { + d.pcid = pmap->pm_pcid; + d.pad = 0; + for (addr = sva; addr < eva; addr += PAGE_SIZE) { + d.addr = addr; + invpcid(&d, INVPCID_ADDR); + } + return; + } + + cr3 = rcr3(); + critical_enter(); + load_cr3(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4) | pmap->pm_pcid | + CR3_PCID_SAVE); + for (addr = sva; addr < eva; addr += PAGE_SIZE) + invlpg(addr); + load_cr3(cr3 | CR3_PCID_SAVE); + critical_exit(); +} + void pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { @@ -1032,19 +1153,43 @@ pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) sched_pin(); if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) { - for (addr = sva; addr < eva; addr += PAGE_SIZE) - invlpg(addr); - smp_invlpg_range(sva, eva); + if (!pmap_pcid_enabled) { + for (addr = 
sva; addr < eva; addr += PAGE_SIZE) + invlpg(addr); + } else { + if (pmap->pm_pcid != -1 && pmap->pm_pcid != 0) { + if (pmap == PCPU_GET(curpmap)) { + for (addr = sva; addr < eva; + addr += PAGE_SIZE) + invlpg(addr); + } else { + pmap_invalidate_range_pcid(pmap, + sva, eva); + } + } else { + invltlb_globpcid(); + } + } + smp_invlpg_range(pmap, sva, eva); } else { cpuid = PCPU_GET(cpuid); other_cpus = all_cpus; CPU_CLR(cpuid, &other_cpus); - if (CPU_ISSET(cpuid, &pmap->pm_active)) + if (CPU_ISSET(cpuid, &pmap->pm_active)) { for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); - CPU_AND(&other_cpus, &pmap->pm_active); + } else if (pmap_pcid_enabled) { + if (pmap->pm_pcid != -1 && pmap->pm_pcid != 0) + pmap_invalidate_range_pcid(pmap, sva, eva); + else + invltlb_globpcid(); + } + if (pmap_pcid_enabled) + CPU_AND(&other_cpus, &pmap->pm_save); + else + CPU_AND(&other_cpus, &pmap->pm_active); if (!CPU_EMPTY(&other_cpus)) - smp_masked_invlpg_range(other_cpus, sva, eva); + smp_masked_invlpg_range(other_cpus, pmap, sva, eva); } sched_unpin(); } @@ -1053,21 +1198,63 @@ void pmap_invalidate_all(pmap_t pmap) { cpuset_t other_cpus; + struct invpcid_descr d; + uint64_t cr3; u_int cpuid; sched_pin(); - if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) { - invltlb(); - smp_invltlb(); + cpuid = PCPU_GET(cpuid); + if (pmap == kernel_pmap || + (pmap_pcid_enabled && !CPU_CMP(&pmap->pm_save, &all_cpus)) || + !CPU_CMP(&pmap->pm_active, &all_cpus)) { + if (invpcid_works) { + bzero(&d, sizeof(d)); + invpcid(&d, INVPCID_CTXGLOB); + } else { + invltlb_globpcid(); + } + CPU_CLR_ATOMIC(cpuid, &pmap->pm_save); + smp_invltlb(pmap); } else { - cpuid = PCPU_GET(cpuid); other_cpus = all_cpus; CPU_CLR(cpuid, &other_cpus); - if (CPU_ISSET(cpuid, &pmap->pm_active)) + + /* + * This logic is duplicated in the Xinvltlb shootdown + * IPI handler. 
+ */ + if (pmap_pcid_enabled) { + if (pmap->pm_pcid != -1 && pmap->pm_pcid != 0) { + if (invpcid_works) { + d.pcid = pmap->pm_pcid; + d.pad = 0; + d.addr = 0; + invpcid(&d, INVPCID_CTX); + } else { + cr3 = rcr3(); + critical_enter(); + + /* + * Bit 63 is clear, pcid TLB + * entries are invalidated. + */ + load_cr3(DMAP_TO_PHYS((vm_offset_t) + pmap->pm_pml4) | pmap->pm_pcid); + load_cr3(cr3 | CR3_PCID_SAVE); + critical_exit(); + } + } else { + invltlb_globpcid(); + } + } else if (CPU_ISSET(cpuid, &pmap->pm_active)) invltlb(); - CPU_AND(&other_cpus, &pmap->pm_active); + CPU_CLR_ATOMIC(cpuid, &pmap->pm_save); + if (pmap_pcid_enabled) + CPU_AND(&other_cpus, &pmap->pm_save); + else + CPU_AND(&other_cpus, &pmap->pm_active); if (!CPU_EMPTY(&other_cpus)) - smp_masked_invltlb(other_cpus); + smp_masked_invltlb(other_cpus, pmap); } sched_unpin(); } @@ -1129,8 +1316,10 @@ pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde) CPU_CLR(cpuid, &other_cpus); if (pmap == kernel_pmap) active = all_cpus; - else + else { active = pmap->pm_active; + CPU_AND_ATOMIC(&pmap->pm_save, &active); + } if (CPU_OVERLAP(&active, &other_cpus)) { act.store = cpuid; act.invalidate = active; @@ -1193,6 +1382,8 @@ pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde) pde_store(pde, newpde); if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) pmap_update_pde_invalidate(va, newpde); + else + CPU_ZERO(&pmap->pm_save); } #endif /* !SMP */ @@ -1675,6 +1866,8 @@ pmap_pinit0(pmap_t pmap) PCPU_SET(curpmap, pmap); TAILQ_INIT(&pmap->pm_pvchunk); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); + pmap->pm_pcid = pmap_pcid_enabled ? 0 : -1; + CPU_ZERO(&pmap->pm_save); } /* @@ -1716,6 +1909,8 @@ pmap_pinit(pmap_t pmap) CPU_ZERO(&pmap->pm_active); TAILQ_INIT(&pmap->pm_pvchunk); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); + pmap->pm_pcid = pmap_pcid_enabled ? 
alloc_unr(&pcid_unr) : -1; + CPU_ZERO(&pmap->pm_save); return (1); } @@ -1957,6 +2152,14 @@ pmap_release(pmap_t pmap) KASSERT(vm_radix_is_empty(&pmap->pm_root), ("pmap_release: pmap has reserved page table page(s)")); + if (pmap_pcid_enabled) { + /* + * Invalidate any left TLB entries, to allow the reuse + * of the pcid. + */ + pmap_invalidate_all(pmap); + } + m = PHYS_TO_VM_PAGE(pmap->pm_pml4[PML4PML4I] & PG_FRAME); for (i = 0; i < NKPML4E; i++) /* KVA */ @@ -1968,6 +2171,8 @@ pmap_release(pmap_t pmap) m->wire_count--; atomic_subtract_int(&cnt.v_wire_count, 1); vm_page_free_zero(m); + if (pmap->pm_pcid != -1) + free_unr(&pcid_unr, pmap->pm_pcid); } static int @@ -5734,15 +5939,20 @@ pmap_activate(struct thread *td) critical_enter(); pmap = vmspace_pmap(td->td_proc->p_vmspace); oldpmap = PCPU_GET(curpmap); + CPU_ZERO(&pmap->pm_save); cpuid = PCPU_GET(cpuid); #ifdef SMP CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active); CPU_SET_ATOMIC(cpuid, &pmap->pm_active); + CPU_SET_ATOMIC(cpuid, &pmap->pm_save); #else CPU_CLR(cpuid, &oldpmap->pm_active); CPU_SET(cpuid, &pmap->pm_active); + CPU_SET(cpuid, &pmap->pm_save); #endif cr3 = DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4); + if (pmap->pm_pcid != -1) + cr3 |= pmap->pm_pcid; td->td_pcb->pcb_cr3 = cr3; load_cr3(cr3); PCPU_SET(curpmap, pmap); diff --git a/sys/amd64/amd64/vm_machdep.c b/sys/amd64/amd64/vm_machdep.c index ed0e7e9..3764f72 100644 --- a/sys/amd64/amd64/vm_machdep.c +++ b/sys/amd64/amd64/vm_machdep.c @@ -221,6 +221,8 @@ cpu_fork(td1, p2, td2, flags) */ pmap2 = vmspace_pmap(p2->p_vmspace); pcb2->pcb_cr3 = DMAP_TO_PHYS((vm_offset_t)pmap2->pm_pml4); + if (pmap2->pm_pcid != -1) + pcb2->pcb_cr3 |= pmap2->pm_pcid; pcb2->pcb_r12 = (register_t)fork_return; /* fork_trampoline argument */ pcb2->pcb_rbp = 0; pcb2->pcb_rsp = (register_t)td2->td_frame - sizeof(void *); diff --git a/sys/amd64/include/pcpu.h b/sys/amd64/include/pcpu.h index 1c83c2a..0e11975 100644 --- a/sys/amd64/include/pcpu.h +++ b/sys/amd64/include/pcpu.h @@ -67,6 +67,8 
@@ struct system_segment_descriptor *pc_ldt; \ /* Pointer to the CPU TSS descriptor */ \ struct system_segment_descriptor *pc_tss; \ + uint64_t pc_pm_save_cnt; \ + char pc_invpcid_descr[16]; \ u_int pc_cmci_mask; /* MCx banks for CMCI */ \ uint64_t pc_dbreg[16]; /* ddb debugging regs */ \ int pc_dbreg_cmd; /* ddb debugging reg cmd */ \ diff --git a/sys/amd64/include/pmap.h b/sys/amd64/include/pmap.h index aacb9ba..fa42389 100644 --- a/sys/amd64/include/pmap.h +++ b/sys/amd64/include/pmap.h @@ -240,6 +240,8 @@ struct pmap { pml4_entry_t *pm_pml4; /* KVA of level 4 page table */ TAILQ_HEAD(,pv_chunk) pm_pvchunk; /* list of mappings in pmap */ cpuset_t pm_active; /* active on cpus */ + cpuset_t pm_save; /* Context valid on cpus mask */ + int pm_pcid; /* context id */ /* spare u_int here due to padding */ struct pmap_statistics pm_stats; /* pmap statistics */ struct vm_radix pm_root; /* spare page table pages */ diff --git a/sys/amd64/include/smp.h b/sys/amd64/include/smp.h index 16d87ea..d6cd476 100644 --- a/sys/amd64/include/smp.h +++ b/sys/amd64/include/smp.h @@ -54,6 +54,8 @@ inthand_t IDTVEC(cpususpend), /* CPU suspends & waits to be resumed */ IDTVEC(rendezvous); /* handle CPU rendezvous */ +struct pmap; + /* functions in mp_machdep.c */ void cpu_add(u_int apic_id, char boot_cpu); void cpustop_handler(void); @@ -67,13 +69,14 @@ int ipi_nmi_handler(void); void ipi_selected(cpuset_t cpus, u_int ipi); u_int mp_bootaddress(u_int); void smp_cache_flush(void); -void smp_invlpg(vm_offset_t addr); -void smp_masked_invlpg(cpuset_t mask, vm_offset_t addr); -void smp_invlpg_range(vm_offset_t startva, vm_offset_t endva); -void smp_masked_invlpg_range(cpuset_t mask, vm_offset_t startva, +void smp_invlpg(struct pmap *pmap, vm_offset_t addr); +void smp_masked_invlpg(cpuset_t mask, struct pmap *pmap, vm_offset_t addr); +void smp_invlpg_range(struct pmap *pmap, vm_offset_t startva, vm_offset_t endva); -void smp_invltlb(void); -void smp_masked_invltlb(cpuset_t mask); +void 
smp_masked_invlpg_range(cpuset_t mask, struct pmap *pmap, + vm_offset_t startva, vm_offset_t endva); +void smp_invltlb(struct pmap *pmap); +void smp_masked_invltlb(cpuset_t mask, struct pmap *pmap); #endif /* !LOCORE */ #endif /* SMP */ |