From 6c690ee1039b251e583fc65b28da30e97d6a7385 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 12 Jun 2017 10:26:14 -0700 Subject: x86/mm: Split read_cr3() into read_cr3_pa() and __read_cr3() The kernel has several code paths that read CR3. Most of them assume that CR3 contains the PGD's physical address, whereas some of them awkwardly use PHYSICAL_PAGE_MASK to mask off low bits. Add explicit mask macros for CR3 and convert all of the CR3 readers. This will keep them from breaking when PCID is enabled. Signed-off-by: Andy Lutomirski Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tom Lendacky Cc: xen-devel Link: http://lkml.kernel.org/r/883f8fb121f4616c1c1427ad87350bb2f5ffeca1.1497288170.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/boot/compressed/pagetable.c | 2 +- arch/x86/include/asm/efi.h | 2 +- arch/x86/include/asm/mmu_context.h | 4 ++-- arch/x86/include/asm/paravirt.h | 2 +- arch/x86/include/asm/processor-flags.h | 36 ++++++++++++++++++++++++++++++++++ arch/x86/include/asm/processor.h | 8 ++++++++ arch/x86/include/asm/special_insns.h | 10 +++++++--- arch/x86/include/asm/tlbflush.h | 4 ++-- arch/x86/kernel/head64.c | 3 ++- arch/x86/kernel/paravirt.c | 2 +- arch/x86/kernel/process_32.c | 2 +- arch/x86/kernel/process_64.c | 2 +- arch/x86/kvm/vmx.c | 2 +- arch/x86/mm/fault.c | 10 +++++----- arch/x86/mm/ioremap.c | 2 +- arch/x86/platform/efi/efi_64.c | 4 ++-- arch/x86/platform/olpc/olpc-xo1-pm.c | 2 +- arch/x86/power/cpu.c | 2 +- arch/x86/power/hibernate_64.c | 3 ++- arch/x86/xen/mmu_pv.c | 6 +++--- 20 files changed, 79 insertions(+), 29 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/pagetable.c b/arch/x86/boot/compressed/pagetable.c index 1d78f17..8e69df9 100644 --- a/arch/x86/boot/compressed/pagetable.c +++ b/arch/x86/boot/compressed/pagetable.c @@ -92,7 +92,7 @@ void initialize_identity_maps(void) * and we must append to the existing area instead of entirely * overwriting it. */ - level4p = read_cr3(); + level4p = read_cr3_pa(); if (level4p == (unsigned long)_pgtable) { debug_putstr("booted via startup_32()\n"); pgt_data.pgt_buf = _pgtable + BOOT_INIT_PGT_SIZE; diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 2f77bce..d2ff779 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -74,7 +74,7 @@ struct efi_scratch { __kernel_fpu_begin(); \ \ if (efi_scratch.use_pgd) { \ - efi_scratch.prev_cr3 = read_cr3(); \ + efi_scratch.prev_cr3 = __read_cr3(); \ write_cr3((unsigned long)efi_scratch.efi_pgt); \ __flush_tlb_all(); \ } \ diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 5a93f62..cfe6034 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -269,7 +269,7 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, /* * This can be used from process context to figure out what the value of - * CR3 is without needing to do a (slow) read_cr3(). + * CR3 is without needing to do a (slow) __read_cr3(). * * It's intended to be used for code like KVM that sneakily changes CR3 * and needs to restore it. It needs to be used very carefully. @@ -281,7 +281,7 @@ static inline unsigned long __get_current_cr3_fast(void) /* For now, be very restrictive about when this can be called. */ VM_WARN_ON(in_nmi() || !in_atomic()); - VM_BUG_ON(cr3 != read_cr3()); + VM_BUG_ON(cr3 != __read_cr3()); return cr3; } diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 9a15739..a63e77f 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -61,7 +61,7 @@ static inline void write_cr2(unsigned long x) PVOP_VCALL1(pv_mmu_ops.write_cr2, x); } -static inline unsigned long read_cr3(void) +static inline unsigned long __read_cr3(void) { return PVOP_CALL0(unsigned long, pv_mmu_ops.read_cr3); } diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h index 39fb618..79aa2f9 100644 --- a/arch/x86/include/asm/processor-flags.h +++ b/arch/x86/include/asm/processor-flags.h @@ -8,4 +8,40 @@ #else #define X86_VM_MASK 0 /* No VM86 support */ #endif + +/* + * CR3's layout varies depending on several things. + * + * If CR4.PCIDE is set (64-bit only), then CR3[11:0] is the address space ID. + * If PAE is enabled, then CR3[11:5] is part of the PDPT address + * (i.e. it's 32-byte aligned, not page-aligned) and CR3[4:0] is ignored. + * Otherwise (non-PAE, non-PCID), CR3[3] is PWT, CR3[4] is PCD, and + * CR3[2:0] and CR3[11:5] are ignored. + * + * In all cases, Linux puts zeros in the low ignored bits and in PWT and PCD. + * + * CR3[63] is always read as zero. If CR4.PCIDE is set, then CR3[63] may be + * written as 1 to prevent the write to CR3 from flushing the TLB. + * + * On systems with SME, one bit (in a variable position!) is stolen to indicate + * that the top-level paging structure is encrypted. + * + * All of the remaining bits indicate the physical address of the top-level + * paging structure. + * + * CR3_ADDR_MASK is the mask used by read_cr3_pa(). + */ +#ifdef CONFIG_X86_64 +/* Mask off the address space ID bits. */ +#define CR3_ADDR_MASK 0x7FFFFFFFFFFFF000ull +#define CR3_PCID_MASK 0xFFFull +#else +/* + * CR3_ADDR_MASK needs at least bits 31:5 set on PAE systems, and we save + * a tiny bit of code size by setting all the bits. + */ +#define CR3_ADDR_MASK 0xFFFFFFFFull +#define CR3_PCID_MASK 0ull +#endif + #endif /* _ASM_X86_PROCESSOR_FLAGS_H */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 3cada99..9de02c9 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -231,6 +231,14 @@ native_cpuid_reg(ebx) native_cpuid_reg(ecx) native_cpuid_reg(edx) +/* + * Friendlier CR3 helpers. + */ +static inline unsigned long read_cr3_pa(void) +{ + return __read_cr3() & CR3_ADDR_MASK; +} + static inline void load_cr3(pgd_t *pgdir) { write_cr3(__pa(pgdir)); diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index 12af3e3..9efaabf 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -39,7 +39,7 @@ static inline void native_write_cr2(unsigned long val) asm volatile("mov %0,%%cr2": : "r" (val), "m" (__force_order)); } -static inline unsigned long native_read_cr3(void) +static inline unsigned long __native_read_cr3(void) { unsigned long val; asm volatile("mov %%cr3,%0\n\t" : "=r" (val), "=m" (__force_order)); @@ -159,9 +159,13 @@ static inline void write_cr2(unsigned long x) native_write_cr2(x); } -static inline unsigned long read_cr3(void) +/* + * Careful! CR3 contains more than just an address. You probably want + * read_cr3_pa() instead. + */ +static inline unsigned long __read_cr3(void) { - return native_read_cr3(); + return __native_read_cr3(); } static inline void write_cr3(unsigned long x) diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 388c246..5f78c6a 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -156,7 +156,7 @@ static inline void __native_flush_tlb(void) * back: */ preempt_disable(); - native_write_cr3(native_read_cr3()); + native_write_cr3(__native_read_cr3()); preempt_enable(); } @@ -264,7 +264,7 @@ static inline void reset_lazy_tlbstate(void) this_cpu_write(cpu_tlbstate.state, 0); this_cpu_write(cpu_tlbstate.loaded_mm, &init_mm); - WARN_ON(read_cr3() != __pa_symbol(swapper_pg_dir)); + WARN_ON(read_cr3_pa() != __pa_symbol(swapper_pg_dir)); } static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch, diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 43b7002..794e8f5 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -55,7 +55,8 @@ int __init early_make_pgtable(unsigned long address) pmdval_t pmd, *pmd_p; /* Invalid address or early pgt is done ? */ - if (physaddr >= MAXMEM || read_cr3() != __pa_nodebug(early_level4_pgt)) + if (physaddr >= MAXMEM || + read_cr3_pa() != __pa_nodebug(early_level4_pgt)) return -1; again: diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 3586996..bc0a849 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -391,7 +391,7 @@ struct pv_mmu_ops pv_mmu_ops __ro_after_init = { .read_cr2 = native_read_cr2, .write_cr2 = native_write_cr2, - .read_cr3 = native_read_cr3, + .read_cr3 = __native_read_cr3, .write_cr3 = native_write_cr3, .flush_tlb_user = native_flush_tlb, diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index ffeae81..c6d6dc5 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -92,7 +92,7 @@ void __show_regs(struct pt_regs *regs, int all) cr0 = read_cr0(); cr2 = read_cr2(); - cr3 = read_cr3(); + cr3 = __read_cr3(); cr4 = __read_cr4(); printk(KERN_DEFAULT "CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", cr0, cr2, cr3, cr4); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 9c39ab8..c3169be 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -104,7 +104,7 @@ void __show_regs(struct pt_regs *regs, int all) cr0 = read_cr0(); cr2 = read_cr2(); - cr3 = read_cr3(); + cr3 = __read_cr3(); cr4 = __read_cr4(); printk(KERN_DEFAULT "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 19cde55..d143dd3 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5024,7 +5024,7 @@ static void vmx_set_constant_host_state(struct vcpu_vmx *vmx) * Save the most likely value for this task's CR3 in the VMCS. * We can't use __get_current_cr3_fast() because we're not atomic. */ - cr3 = read_cr3(); + cr3 = __read_cr3(); vmcs_writel(HOST_CR3, cr3); /* 22.2.3 FIXME: shadow tables */ vmx->host_state.vmcs_host_cr3 = cr3; diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 8ad91a0..2a1fa10c 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -346,7 +346,7 @@ static noinline int vmalloc_fault(unsigned long address) * Do _not_ use "current" here. We might be inside * an interrupt in the middle of a task switch.. */ - pgd_paddr = read_cr3(); + pgd_paddr = read_cr3_pa(); pmd_k = vmalloc_sync_one(__va(pgd_paddr), address); if (!pmd_k) return -1; @@ -388,7 +388,7 @@ static bool low_pfn(unsigned long pfn) static void dump_pagetable(unsigned long address) { - pgd_t *base = __va(read_cr3()); + pgd_t *base = __va(read_cr3_pa()); pgd_t *pgd = &base[pgd_index(address)]; p4d_t *p4d; pud_t *pud; @@ -451,7 +451,7 @@ static noinline int vmalloc_fault(unsigned long address) * happen within a race in page table update. In the later * case just flush: */ - pgd = (pgd_t *)__va(read_cr3()) + pgd_index(address); + pgd = (pgd_t *)__va(read_cr3_pa()) + pgd_index(address); pgd_ref = pgd_offset_k(address); if (pgd_none(*pgd_ref)) return -1; @@ -555,7 +555,7 @@ static int bad_address(void *p) static void dump_pagetable(unsigned long address) { - pgd_t *base = __va(read_cr3() & PHYSICAL_PAGE_MASK); + pgd_t *base = __va(read_cr3_pa()); pgd_t *pgd = base + pgd_index(address); p4d_t *p4d; pud_t *pud; @@ -700,7 +700,7 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code, pgd_t *pgd; pte_t *pte; - pgd = __va(read_cr3() & PHYSICAL_PAGE_MASK); + pgd = __va(read_cr3_pa()); pgd += pgd_index(address); pte = lookup_address_in_pgd(pgd, address, &level); diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index bbc558b..4c1b5fd 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -424,7 +424,7 @@ static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss; static inline pmd_t * __init early_ioremap_pmd(unsigned long addr) { /* Don't assume we're using swapper_pg_dir at this point */ - pgd_t *base = __va(read_cr3()); + pgd_t *base = __va(read_cr3_pa()); pgd_t *pgd = &base[pgd_index(addr)]; p4d_t *p4d = p4d_offset(pgd, addr); pud_t *pud = pud_offset(p4d, addr); diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index eb8dff1..f40bf62 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -80,7 +80,7 @@ pgd_t * __init efi_call_phys_prolog(void) int n_pgds, i, j; if (!efi_enabled(EFI_OLD_MEMMAP)) { - save_pgd = (pgd_t *)read_cr3(); + save_pgd = (pgd_t *)__read_cr3(); write_cr3((unsigned long)efi_scratch.efi_pgt); goto out; } @@ -646,7 +646,7 @@ efi_status_t efi_thunk_set_virtual_address_map( efi_sync_low_kernel_mappings(); local_irq_save(flags); - efi_scratch.prev_cr3 = read_cr3(); + efi_scratch.prev_cr3 = __read_cr3(); write_cr3((unsigned long)efi_scratch.efi_pgt); __flush_tlb_all(); diff --git a/arch/x86/platform/olpc/olpc-xo1-pm.c b/arch/x86/platform/olpc/olpc-xo1-pm.c index c5350fd..0668aaf 100644 --- a/arch/x86/platform/olpc/olpc-xo1-pm.c +++ b/arch/x86/platform/olpc/olpc-xo1-pm.c @@ -77,7 +77,7 @@ static int xo1_power_state_enter(suspend_state_t pm_state) asmlinkage __visible int xo1_do_sleep(u8 sleep_state) { - void *pgd_addr = __va(read_cr3()); + void *pgd_addr = __va(read_cr3_pa()); /* Program wakeup mask (using dword access to CS5536_PM1_EN) */ outl(wakeup_mask << 16, acpi_base + CS5536_PM1_STS); diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index 6b05a92..78459a6 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -129,7 +129,7 @@ static void __save_processor_state(struct saved_context *ctxt) */ ctxt->cr0 = read_cr0(); ctxt->cr2 = read_cr2(); - ctxt->cr3 = read_cr3(); + ctxt->cr3 = __read_cr3(); ctxt->cr4 = __read_cr4(); #ifdef CONFIG_X86_64 ctxt->cr8 = read_cr8(); diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c index a6e21fe..e3e62c8 100644 --- a/arch/x86/power/hibernate_64.c +++ b/arch/x86/power/hibernate_64.c @@ -150,7 +150,8 @@ static int relocate_restore_code(void) memcpy((void *)relocated_restore_code, &core_restore_code, PAGE_SIZE); /* Make the page containing the relocated code executable */ - pgd = (pgd_t *)__va(read_cr3()) + pgd_index(relocated_restore_code); + pgd = (pgd_t *)__va(read_cr3_pa()) + + pgd_index(relocated_restore_code); p4d = p4d_offset(pgd, relocated_restore_code); if (p4d_large(*p4d)) { set_p4d(p4d, __p4d(p4d_val(*p4d) & ~_PAGE_NX)); diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index 21beb37..4f63830 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -2017,7 +2017,7 @@ static phys_addr_t __init xen_early_virt_to_phys(unsigned long vaddr) pmd_t pmd; pte_t pte; - pa = read_cr3(); + pa = read_cr3_pa(); pgd = native_make_pgd(xen_read_phys_ulong(pa + pgd_index(vaddr) * sizeof(pgd))); if (!pgd_present(pgd)) @@ -2097,7 +2097,7 @@ void __init xen_relocate_p2m(void) pt_phys = pmd_phys + PFN_PHYS(n_pmd); p2m_pfn = PFN_DOWN(pt_phys) + n_pt; - pgd = __va(read_cr3()); + pgd = __va(read_cr3_pa()); new_p2m = (unsigned long *)(2 * PGDIR_SIZE); idx_p4d = 0; save_pud = n_pud; @@ -2204,7 +2204,7 @@ static void __init xen_write_cr3_init(unsigned long cr3) { unsigned long pfn = PFN_DOWN(__pa(swapper_pg_dir)); - BUG_ON(read_cr3() != __pa(initial_page_table)); + BUG_ON(read_cr3_pa() != __pa(initial_page_table)); BUG_ON(cr3 != __pa(swapper_pg_dir)); /* -- cgit v1.1