diff options
Diffstat (limited to 'arch/x86')
64 files changed, 1727 insertions, 286 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ded8a67..666ac66 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -144,7 +144,7 @@ config INSTRUCTION_DECODER config PERF_EVENTS_INTEL_UNCORE def_bool y - depends on PERF_EVENTS && SUP_SUP_INTEL && PCI + depends on PERF_EVENTS && CPU_SUP_INTEL && PCI config OUTPUT_FORMAT string @@ -248,6 +248,10 @@ config HAVE_INTEL_TXT def_bool y depends on INTEL_IOMMU && ACPI +config X86_INTEL_MPX + def_bool y + depends on CPU_SUP_INTEL + config X86_32_SMP def_bool y depends on X86_32 && SMP diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 8a39181..65516ab 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -88,8 +88,10 @@ suffix-$(CONFIG_KERNEL_XZ) := xz suffix-$(CONFIG_KERNEL_LZO) := lzo suffix-$(CONFIG_KERNEL_LZ4) := lz4 +RUN_SIZE = $(shell $(OBJDUMP) -h vmlinux | \ + perl $(srctree)/arch/x86/tools/calc_run_size.pl) quiet_cmd_mkpiggy = MKPIGGY $@ - cmd_mkpiggy = $(obj)/mkpiggy $< > $@ || ( rm -f $@ ; false ) + cmd_mkpiggy = $(obj)/mkpiggy $< $(RUN_SIZE) > $@ || ( rm -f $@ ; false ) targets += piggy.S $(obj)/piggy.S: $(obj)/vmlinux.bin.$(suffix-y) $(obj)/mkpiggy FORCE diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index cbed140..1d7fbbc 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S @@ -207,7 +207,8 @@ relocated: * Do the decompression, and jump to the new kernel.. */ /* push arguments for decompress_kernel: */ - pushl $z_output_len /* decompressed length */ + pushl $z_run_size /* size of kernel with .bss and .brk */ + pushl $z_output_len /* decompressed length, end of relocs */ leal z_extract_offset_negative(%ebx), %ebp pushl %ebp /* output address */ pushl $z_input_len /* input_len */ @@ -217,7 +218,7 @@ relocated: pushl %eax /* heap area */ pushl %esi /* real mode pointer */ call decompress_kernel /* returns kernel location in %eax */ - addl $24, %esp + addl $28, %esp /* * Jump to the decompressed kernel. diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 2884e0c..6b1766c 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -402,13 +402,16 @@ relocated: * Do the decompression, and jump to the new kernel.. */ pushq %rsi /* Save the real mode argument */ + movq $z_run_size, %r9 /* size of kernel with .bss and .brk */ + pushq %r9 movq %rsi, %rdi /* real mode address */ leaq boot_heap(%rip), %rsi /* malloc area for uncompression */ leaq input_data(%rip), %rdx /* input_data */ movl $z_input_len, %ecx /* input_len */ movq %rbp, %r8 /* output target address */ - movq $z_output_len, %r9 /* decompressed length */ + movq $z_output_len, %r9 /* decompressed length, end of relocs */ call decompress_kernel /* returns kernel location in %rax */ + popq %r9 popq %rsi /* diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 644abd7..dcc1c53 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -370,7 +370,8 @@ asmlinkage __visible void *decompress_kernel(void *rmode, memptr heap, unsigned char *input_data, unsigned long input_len, unsigned char *output, - unsigned long output_len) + unsigned long output_len, + unsigned long run_size) { real_mode = rmode; @@ -393,8 +394,14 @@ asmlinkage __visible void *decompress_kernel(void *rmode, memptr heap, free_mem_ptr = heap; /* Heap */ free_mem_end_ptr = heap + BOOT_HEAP_SIZE; - output = choose_kernel_location(input_data, input_len, - output, output_len); + /* + * The memory hole needed for the kernel is the larger of either + * the entire decompressed kernel plus relocation table, or the + * entire decompressed kernel plus .bss and .brk sections. + */ + output = choose_kernel_location(input_data, input_len, output, + output_len > run_size ? output_len + : run_size); /* Validate memory location choices. */ if ((unsigned long)output & (MIN_KERNEL_ALIGN - 1)) diff --git a/arch/x86/boot/compressed/mkpiggy.c b/arch/x86/boot/compressed/mkpiggy.c index b669ab6..d8222f2 100644 --- a/arch/x86/boot/compressed/mkpiggy.c +++ b/arch/x86/boot/compressed/mkpiggy.c @@ -36,11 +36,13 @@ int main(int argc, char *argv[]) uint32_t olen; long ilen; unsigned long offs; + unsigned long run_size; FILE *f = NULL; int retval = 1; - if (argc < 2) { - fprintf(stderr, "Usage: %s compressed_file\n", argv[0]); + if (argc < 3) { + fprintf(stderr, "Usage: %s compressed_file run_size\n", + argv[0]); goto bail; } @@ -74,6 +76,7 @@ int main(int argc, char *argv[]) offs += olen >> 12; /* Add 8 bytes for each 32K block */ offs += 64*1024 + 128; /* Add 64K + 128 bytes slack */ offs = (offs+4095) & ~4095; /* Round to a 4K boundary */ + run_size = atoi(argv[2]); printf(".section \".rodata..compressed\",\"a\",@progbits\n"); printf(".globl z_input_len\n"); @@ -85,6 +88,8 @@ int main(int argc, char *argv[]) /* z_extract_offset_negative allows simplification of head_32.S */ printf(".globl z_extract_offset_negative\n"); printf("z_extract_offset_negative = -0x%lx\n", offs); + printf(".globl z_run_size\n"); + printf("z_run_size = %lu\n", run_size); printf(".globl input_data, input_data_end\n"); printf("input_data:\n"); diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h index 97534a7..f226df0 100644 --- a/arch/x86/include/asm/disabled-features.h +++ b/arch/x86/include/asm/disabled-features.h @@ -10,6 +10,12 @@ * cpu_feature_enabled(). */ +#ifdef CONFIG_X86_INTEL_MPX +# define DISABLE_MPX 0 +#else +# define DISABLE_MPX (1<<(X86_FEATURE_MPX & 31)) +#endif + #ifdef CONFIG_X86_64 # define DISABLE_VME (1<<(X86_FEATURE_VME & 31)) # define DISABLE_K6_MTRR (1<<(X86_FEATURE_K6_MTRR & 31)) @@ -34,6 +40,6 @@ #define DISABLED_MASK6 0 #define DISABLED_MASK7 0 #define DISABLED_MASK8 0 -#define DISABLED_MASK9 0 +#define DISABLED_MASK9 (DISABLE_MPX) #endif /* _ASM_X86_DISABLED_FEATURES_H */ diff --git a/arch/x86/include/asm/insn.h b/arch/x86/include/asm/insn.h index 48eb30a..47f29b1 100644 --- a/arch/x86/include/asm/insn.h +++ b/arch/x86/include/asm/insn.h @@ -65,6 +65,7 @@ struct insn { unsigned char x86_64; const insn_byte_t *kaddr; /* kernel address of insn to analyze */ + const insn_byte_t *end_kaddr; /* kernel address of last insn in buffer */ const insn_byte_t *next_byte; }; @@ -96,7 +97,7 @@ struct insn { #define X86_VEX_P(vex) ((vex) & 0x03) /* VEX3 Byte2, VEX2 Byte1 */ #define X86_VEX_M_MAX 0x1f /* VEX3.M Maximum value */ -extern void insn_init(struct insn *insn, const void *kaddr, int x86_64); +extern void insn_init(struct insn *insn, const void *kaddr, int buf_len, int x86_64); extern void insn_get_prefixes(struct insn *insn); extern void insn_get_opcode(struct insn *insn); extern void insn_get_modrm(struct insn *insn); @@ -115,12 +116,13 @@ static inline void insn_get_attribute(struct insn *insn) extern int insn_rip_relative(struct insn *insn); /* Init insn for kernel text */ -static inline void kernel_insn_init(struct insn *insn, const void *kaddr) +static inline void kernel_insn_init(struct insn *insn, + const void *kaddr, int buf_len) { #ifdef CONFIG_X86_64 - insn_init(insn, kaddr, 1); + insn_init(insn, kaddr, buf_len, 1); #else /* CONFIG_X86_32 */ - insn_init(insn, kaddr, 0); + insn_init(insn, kaddr, buf_len, 0); #endif } diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index b8237d8..0cdbe6e 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -74,6 +74,9 @@ build_mmio_write(__writel, "l", unsigned int, "r", ) #define __raw_readw __readw #define __raw_readl __readl +#define writeb_relaxed(v, a) __writeb(v, a) +#define writew_relaxed(v, a) __writew(v, a) +#define writel_relaxed(v, a) __writel(v, a) #define __raw_writeb __writeb #define __raw_writew __writew #define __raw_writel __writel @@ -86,6 +89,7 @@ build_mmio_read(readq, "q", unsigned long, "=r", :"memory") build_mmio_write(writeq, "q", unsigned long, "r", :"memory") #define readq_relaxed(a) readq(a) +#define writeq_relaxed(v, a) writeq(v, a) #define __raw_readq(a) readq(a) #define __raw_writeq(val, addr) writeq(val, addr) @@ -310,8 +314,8 @@ BUILDIO(b, b, char) BUILDIO(w, w, short) BUILDIO(l, , int) -extern void *xlate_dev_mem_ptr(unsigned long phys); -extern void unxlate_dev_mem_ptr(unsigned long phys, void *addr); +extern void *xlate_dev_mem_ptr(phys_addr_t phys); +extern void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr); extern int ioremap_change_attr(unsigned long vaddr, unsigned long size, unsigned long prot_val); diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 166af2a..40269a2 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -10,9 +10,8 @@ #include <asm/pgalloc.h> #include <asm/tlbflush.h> #include <asm/paravirt.h> +#include <asm/mpx.h> #ifndef CONFIG_PARAVIRT -#include <asm-generic/mm_hooks.h> - static inline void paravirt_activate_mm(struct mm_struct *prev, struct mm_struct *next) { @@ -53,7 +52,16 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, /* Stop flush ipis for the previous mm */ cpumask_clear_cpu(cpu, mm_cpumask(prev)); - /* Load the LDT, if the LDT is different: */ + /* + * Load the LDT, if the LDT is different. + * + * It's possible leave_mm(prev) has been called. If so, + * then prev->context.ldt could be out of sync with the + * LDT descriptor or the LDT register. This can only happen + * if prev->context.ldt is non-null, since we never free + * an LDT. But LDTs can't be shared across mms, so + * prev->context.ldt won't be equal to next->context.ldt. + */ if (unlikely(prev->context.ldt != next->context.ldt)) load_LDT_nolock(&next->context); } @@ -102,4 +110,27 @@ do { \ } while (0) #endif +static inline void arch_dup_mmap(struct mm_struct *oldmm, + struct mm_struct *mm) +{ + paravirt_arch_dup_mmap(oldmm, mm); +} + +static inline void arch_exit_mmap(struct mm_struct *mm) +{ + paravirt_arch_exit_mmap(mm); +} + +static inline void arch_bprm_mm_init(struct mm_struct *mm, + struct vm_area_struct *vma) +{ + mpx_mm_init(mm); +} + +static inline void arch_unmap(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + mpx_notify_unmap(mm, vma, start, end); +} + #endif /* _ASM_X86_MMU_CONTEXT_H */ diff --git a/arch/x86/include/asm/mpx.h b/arch/x86/include/asm/mpx.h new file mode 100644 index 0000000..a952a13d --- /dev/null +++ b/arch/x86/include/asm/mpx.h @@ -0,0 +1,103 @@ +#ifndef _ASM_X86_MPX_H +#define _ASM_X86_MPX_H + +#include <linux/types.h> +#include <asm/ptrace.h> +#include <asm/insn.h> + +/* + * NULL is theoretically a valid place to put the bounds + * directory, so point this at an invalid address. + */ +#define MPX_INVALID_BOUNDS_DIR ((void __user *)-1) +#define MPX_BNDCFG_ENABLE_FLAG 0x1 +#define MPX_BD_ENTRY_VALID_FLAG 0x1 + +#ifdef CONFIG_X86_64 + +/* upper 28 bits [47:20] of the virtual address in 64-bit used to + * index into bounds directory (BD). + */ +#define MPX_BD_ENTRY_OFFSET 28 +#define MPX_BD_ENTRY_SHIFT 3 +/* bits [19:3] of the virtual address in 64-bit used to index into + * bounds table (BT). + */ +#define MPX_BT_ENTRY_OFFSET 17 +#define MPX_BT_ENTRY_SHIFT 5 +#define MPX_IGN_BITS 3 +#define MPX_BD_ENTRY_TAIL 3 + +#else + +#define MPX_BD_ENTRY_OFFSET 20 +#define MPX_BD_ENTRY_SHIFT 2 +#define MPX_BT_ENTRY_OFFSET 10 +#define MPX_BT_ENTRY_SHIFT 4 +#define MPX_IGN_BITS 2 +#define MPX_BD_ENTRY_TAIL 2 + +#endif + +#define MPX_BD_SIZE_BYTES (1UL<<(MPX_BD_ENTRY_OFFSET+MPX_BD_ENTRY_SHIFT)) +#define MPX_BT_SIZE_BYTES (1UL<<(MPX_BT_ENTRY_OFFSET+MPX_BT_ENTRY_SHIFT)) + +#define MPX_BNDSTA_TAIL 2 +#define MPX_BNDCFG_TAIL 12 +#define MPX_BNDSTA_ADDR_MASK (~((1UL<<MPX_BNDSTA_TAIL)-1)) +#define MPX_BNDCFG_ADDR_MASK (~((1UL<<MPX_BNDCFG_TAIL)-1)) +#define MPX_BT_ADDR_MASK (~((1UL<<MPX_BD_ENTRY_TAIL)-1)) + +#define MPX_BNDCFG_ADDR_MASK (~((1UL<<MPX_BNDCFG_TAIL)-1)) +#define MPX_BNDSTA_ERROR_CODE 0x3 + +#define MPX_BD_ENTRY_MASK ((1<<MPX_BD_ENTRY_OFFSET)-1) +#define MPX_BT_ENTRY_MASK ((1<<MPX_BT_ENTRY_OFFSET)-1) +#define MPX_GET_BD_ENTRY_OFFSET(addr) ((((addr)>>(MPX_BT_ENTRY_OFFSET+ \ + MPX_IGN_BITS)) & MPX_BD_ENTRY_MASK) << MPX_BD_ENTRY_SHIFT) +#define MPX_GET_BT_ENTRY_OFFSET(addr) ((((addr)>>MPX_IGN_BITS) & \ + MPX_BT_ENTRY_MASK) << MPX_BT_ENTRY_SHIFT) + +#ifdef CONFIG_X86_INTEL_MPX +siginfo_t *mpx_generate_siginfo(struct pt_regs *regs, + struct xsave_struct *xsave_buf); +int mpx_handle_bd_fault(struct xsave_struct *xsave_buf); +static inline int kernel_managing_mpx_tables(struct mm_struct *mm) +{ + return (mm->bd_addr != MPX_INVALID_BOUNDS_DIR); +} +static inline void mpx_mm_init(struct mm_struct *mm) +{ + /* + * NULL is theoretically a valid place to put the bounds + * directory, so point this at an invalid address. + */ + mm->bd_addr = MPX_INVALID_BOUNDS_DIR; +} +void mpx_notify_unmap(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long start, unsigned long end); +#else +static inline siginfo_t *mpx_generate_siginfo(struct pt_regs *regs, + struct xsave_struct *xsave_buf) +{ + return NULL; +} +static inline int mpx_handle_bd_fault(struct xsave_struct *xsave_buf) +{ + return -EINVAL; +} +static inline int kernel_managing_mpx_tables(struct mm_struct *mm) +{ + return 0; +} +static inline void mpx_mm_init(struct mm_struct *mm) +{ +} +static inline void mpx_notify_unmap(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ +} +#endif /* CONFIG_X86_INTEL_MPX */ + +#endif /* _ASM_X86_MPX_H */ diff --git a/arch/x86/include/asm/page_32_types.h b/arch/x86/include/asm/page_32_types.h index f48b17d..3a52ee0 100644 --- a/arch/x86/include/asm/page_32_types.h +++ b/arch/x86/include/asm/page_32_types.h @@ -20,7 +20,6 @@ #define THREAD_SIZE_ORDER 1 #define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER) -#define STACKFAULT_STACK 0 #define DOUBLEFAULT_STACK 1 #define NMI_STACK 0 #define DEBUG_STACK 0 diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index 6782051..75450b2 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -14,12 +14,11 @@ #define IRQ_STACK_ORDER 2 #define IRQ_STACK_SIZE (PAGE_SIZE << IRQ_STACK_ORDER) -#define STACKFAULT_STACK 1 -#define DOUBLEFAULT_STACK 2 -#define NMI_STACK 3 -#define DEBUG_STACK 4 -#define MCE_STACK 5 -#define N_EXCEPTION_STACKS 5 /* hw limit: 7 */ +#define DOUBLEFAULT_STACK 1 +#define NMI_STACK 2 +#define DEBUG_STACK 3 +#define MCE_STACK 4 +#define N_EXCEPTION_STACKS 4 /* hw limit: 7 */ #define PUD_PAGE_SIZE (_AC(1, UL) << PUD_SHIFT) #define PUD_PAGE_MASK (~(PUD_PAGE_SIZE-1)) diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index cd6e161..32444ae 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -330,13 +330,13 @@ static inline void paravirt_activate_mm(struct mm_struct *prev, PVOP_VCALL2(pv_mmu_ops.activate_mm, prev, next); } -static inline void arch_dup_mmap(struct mm_struct *oldmm, - struct mm_struct *mm) +static inline void paravirt_arch_dup_mmap(struct mm_struct *oldmm, + struct mm_struct *mm) { PVOP_VCALL2(pv_mmu_ops.dup_mmap, oldmm, mm); } -static inline void arch_exit_mmap(struct mm_struct *mm) +static inline void paravirt_arch_exit_mmap(struct mm_struct *mm) { PVOP_VCALL1(pv_mmu_ops.exit_mmap, mm); } @@ -986,5 +986,15 @@ extern void default_banner(void); #endif /* __ASSEMBLY__ */ #else /* CONFIG_PARAVIRT */ # define default_banner x86_init_noop +#ifndef __ASSEMBLY__ +static inline void paravirt_arch_dup_mmap(struct mm_struct *oldmm, + struct mm_struct *mm) +{ +} + +static inline void paravirt_arch_exit_mmap(struct mm_struct *mm) +{ +} +#endif /* __ASSEMBLY__ */ #endif /* !CONFIG_PARAVIRT */ #endif /* _ASM_X86_PARAVIRT_H */ diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 8dfc9fd..dc0f6ed 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -177,6 +177,9 @@ struct x86_pmu_capability { #define IBS_CAPS_BRNTRGT (1U<<5) #define IBS_CAPS_OPCNTEXT (1U<<6) #define IBS_CAPS_RIPINVALIDCHK (1U<<7) +#define IBS_CAPS_OPBRNFUSE (1U<<8) +#define IBS_CAPS_FETCHCTLEXTD (1U<<9) +#define IBS_CAPS_OPDATA4 (1U<<10) #define IBS_CAPS_DEFAULT (IBS_CAPS_AVAIL \ | IBS_CAPS_FETCHSAM \ diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h index 4008734..8f327184 100644 --- a/arch/x86/include/asm/preempt.h +++ b/arch/x86/include/asm/preempt.h @@ -30,9 +30,6 @@ static __always_inline void preempt_count_set(int pc) /* * must be macros to avoid header recursion hell */ -#define task_preempt_count(p) \ - (task_thread_info(p)->saved_preempt_count & ~PREEMPT_NEED_RESCHED) - #define init_task_preempt_count(p) do { \ task_thread_info(p)->saved_preempt_count = PREEMPT_DISABLED; \ } while (0) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 3aeb31c..25b8de0 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -374,13 +374,14 @@ struct lwp_struct { u8 reserved[128]; }; -struct bndregs_struct { - u64 bndregs[8]; +struct bndreg { + u64 lower_bound; + u64 upper_bound; } __packed; -struct bndcsr_struct { - u64 cfg_reg_u; - u64 status_reg; +struct bndcsr { + u64 bndcfgu; + u64 bndstatus; } __packed; struct xsave_hdr_struct { @@ -394,8 +395,8 @@ struct xsave_struct { struct xsave_hdr_struct xsave_hdr; struct ymmh_struct ymmh; struct lwp_struct lwp; - struct bndregs_struct bndregs; - struct bndcsr_struct bndcsr; + struct bndreg bndreg[4]; + struct bndcsr bndcsr; /* new processor state extensions will go here */ } __attribute__ ((packed, aligned (64))); @@ -953,6 +954,24 @@ extern void start_thread(struct pt_regs *regs, unsigned long new_ip, extern int get_tsc_mode(unsigned long adr); extern int set_tsc_mode(unsigned int val); +/* Register/unregister a process' MPX related resource */ +#define MPX_ENABLE_MANAGEMENT(tsk) mpx_enable_management((tsk)) +#define MPX_DISABLE_MANAGEMENT(tsk) mpx_disable_management((tsk)) + +#ifdef CONFIG_X86_INTEL_MPX +extern int mpx_enable_management(struct task_struct *tsk); +extern int mpx_disable_management(struct task_struct *tsk); +#else +static inline int mpx_enable_management(struct task_struct *tsk) +{ + return -EINVAL; +} +static inline int mpx_disable_management(struct task_struct *tsk) +{ + return -EINVAL; +} +#endif /* CONFIG_X86_INTEL_MPX */ + extern u16 amd_get_nb_id(int cpu); static inline uint32_t hypervisor_cpuid_base(const char *sig, uint32_t leaves) diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index 8cd27e0..8cd1cc3 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -150,6 +150,7 @@ static inline void arch_send_call_function_ipi_mask(const struct cpumask *mask) } void cpu_disable_common(void); +void cpu_die_common(unsigned int cpu); void native_smp_prepare_boot_cpu(void); void native_smp_prepare_cpus(unsigned int max_cpus); void native_smp_cpus_done(unsigned int max_cpus); diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h index 9295016..a4efe47 100644 --- a/arch/x86/include/asm/spinlock.h +++ b/arch/x86/include/asm/spinlock.h @@ -183,8 +183,20 @@ static __always_inline void arch_spin_lock_flags(arch_spinlock_t *lock, static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) { - while (arch_spin_is_locked(lock)) + __ticket_t head = ACCESS_ONCE(lock->tickets.head); + + for (;;) { + struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets); + /* + * We need to check "unlocked" in a loop, tmp.head == head + * can be false positive because of overflow. + */ + if (tmp.head == (tmp.tail & ~TICKET_SLOWPATH_FLAG) || + tmp.head != head) + break; + cpu_relax(); + } } /* diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index d7f3b3b..751bf4b 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -79,12 +79,12 @@ do { \ #else /* CONFIG_X86_32 */ /* frame pointer must be last for get_wchan */ -#define SAVE_CONTEXT "pushf ; pushq %%rbp ; movq %%rsi,%%rbp\n\t" -#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp ; popf\t" +#define SAVE_CONTEXT "pushq %%rbp ; movq %%rsi,%%rbp\n\t" +#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp\t" #define __EXTRA_CLOBBER \ , "rcx", "rbx", "rdx", "r8", "r9", "r10", "r11", \ - "r12", "r13", "r14", "r15" + "r12", "r13", "r14", "r15", "flags" #ifdef CONFIG_CC_STACKPROTECTOR #define __switch_canary \ @@ -100,7 +100,11 @@ do { \ #define __switch_canary_iparam #endif /* CC_STACKPROTECTOR */ -/* Save restore flags to clear handle leaking NT */ +/* + * There is no need to save or restore flags, because flags are always + * clean in kernel mode, with the possible exception of IOPL. Kernel IOPL + * has no effect. + */ #define switch_to(prev, next, last) \ asm volatile(SAVE_CONTEXT \ "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \ diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 8540538..547e344 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -141,7 +141,7 @@ struct thread_info { /* Only used for 64 bit */ #define _TIF_DO_NOTIFY_MASK \ (_TIF_SIGPENDING | _TIF_MCE_NOTIFY | _TIF_NOTIFY_RESUME | \ - _TIF_USER_RETURN_NOTIFY) + _TIF_USER_RETURN_NOTIFY | _TIF_UPROBE) /* flags to check in __switch_to() */ #define _TIF_WORK_CTXSW \ diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index bc8352e..707adc6 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -39,6 +39,7 @@ asmlinkage void simd_coprocessor_error(void); #ifdef CONFIG_TRACING asmlinkage void trace_page_fault(void); +#define trace_stack_segment stack_segment #define trace_divide_error divide_error #define trace_bounds bounds #define trace_invalid_op invalid_op diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index e45e4da..f58a9c7 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -172,7 +172,6 @@ struct x86_platform_ops { struct pci_dev; struct msi_msg; -struct msi_desc; struct x86_msi_ops { int (*setup_msi_irqs)(struct pci_dev *dev, int nvec, int type); @@ -183,8 +182,6 @@ struct x86_msi_ops { void (*teardown_msi_irqs)(struct pci_dev *dev); void (*restore_msi_irqs)(struct pci_dev *dev); int (*setup_hpet_msi)(unsigned int irq, unsigned int id); - u32 (*msi_mask_irq)(struct msi_desc *desc, u32 mask, u32 flag); - u32 (*msix_mask_irq)(struct msi_desc *desc, u32 flag); }; struct IO_APIC_route_entry; diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h index e21331ce..8f02f69 100644 --- a/arch/x86/include/uapi/asm/msr-index.h +++ b/arch/x86/include/uapi/asm/msr-index.h @@ -206,6 +206,7 @@ #define MSR_AMD64_IBSOP_REG_MASK ((1UL<<MSR_AMD64_IBSOP_REG_COUNT)-1) #define MSR_AMD64_IBSCTL 0xc001103a #define MSR_AMD64_IBSBRTARGET 0xc001103b +#define MSR_AMD64_IBSOPDATA4 0xc001103d #define MSR_AMD64_IBS_REG_COUNT_MAX 8 /* includes MSR_AMD64_IBSBRTARGET */ /* Fam 16h MSRs */ diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index f04dbb3..5caed1d 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -21,6 +21,7 @@ const struct pci_device_id amd_nb_misc_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M10H_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M30H_NB_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M60H_NB_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F3) }, {} @@ -30,6 +31,7 @@ EXPORT_SYMBOL(amd_nb_misc_ids); static const struct pci_device_id amd_nb_link_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M30H_NB_F4) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M60H_NB_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F4) }, {} diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 1183d54..7ffe0a2 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3158,7 +3158,7 @@ msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; msg.address_lo |= MSI_ADDR_DEST_ID(dest); - __write_msi_msg(data->msi_desc, &msg); + __pci_write_msi_msg(data->msi_desc, &msg); return IRQ_SET_MASK_OK_NOCOPY; } @@ -3169,8 +3169,8 @@ msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) */ static struct irq_chip msi_chip = { .name = "PCI-MSI", - .irq_unmask = unmask_msi_irq, - .irq_mask = mask_msi_irq, + .irq_unmask = pci_msi_unmask_irq, + .irq_mask = pci_msi_mask_irq, .irq_ack = ack_apic_edge, .irq_set_affinity = msi_set_affinity, .irq_retrigger = ioapic_retrigger_irq, @@ -3196,7 +3196,7 @@ int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, * MSI message denotes a contiguous group of IRQs, written for 0th IRQ. */ if (!irq_offset) - write_msi_msg(irq, &msg); + pci_write_msi_msg(irq, &msg); setup_remapped_irq(irq, irq_cfg(irq), chip); diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index e7c798b..4f9359f 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -48,7 +48,6 @@ int main(void) #define ENTRY(entry) OFFSET(pt_regs_ ## entry, pt_regs, entry) ENTRY(bx); - ENTRY(bx); ENTRY(cx); ENTRY(dx); ENTRY(sp); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 4b4f78c..cfa9b5b 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -146,6 +146,8 @@ EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); static int __init x86_xsave_setup(char *s) { + if (strlen(s)) + return 0; setup_clear_cpu_cap(X86_FEATURE_XSAVE); setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); setup_clear_cpu_cap(X86_FEATURE_XSAVES); diff --git a/arch/x86/kernel/cpu/microcode/amd_early.c b/arch/x86/kernel/cpu/microcode/amd_early.c index 7aa1acc..0667447 100644 --- a/arch/x86/kernel/cpu/microcode/amd_early.c +++ b/arch/x86/kernel/cpu/microcode/amd_early.c @@ -108,12 +108,13 @@ static size_t compute_container_size(u8 *data, u32 total_size) * load_microcode_amd() to save equivalent cpu table and microcode patches in * kernel heap memory. */ -static void apply_ucode_in_initrd(void *ucode, size_t size) +static void apply_ucode_in_initrd(void *ucode, size_t size, bool save_patch) { struct equiv_cpu_entry *eq; size_t *cont_sz; u32 *header; u8 *data, **cont; + u8 (*patch)[PATCH_MAX_SIZE]; u16 eq_id = 0; int offset, left; u32 rev, eax, ebx, ecx, edx; @@ -123,10 +124,12 @@ static void apply_ucode_in_initrd(void *ucode, size_t size) new_rev = (u32 *)__pa_nodebug(&ucode_new_rev); cont_sz = (size_t *)__pa_nodebug(&container_size); cont = (u8 **)__pa_nodebug(&container); + patch = (u8 (*)[PATCH_MAX_SIZE])__pa_nodebug(&amd_ucode_patch); #else new_rev = &ucode_new_rev; cont_sz = &container_size; cont = &container; + patch = &amd_ucode_patch; #endif data = ucode; @@ -213,9 +216,9 @@ static void apply_ucode_in_initrd(void *ucode, size_t size) rev = mc->hdr.patch_id; *new_rev = rev; - /* save ucode patch */ - memcpy(amd_ucode_patch, mc, - min_t(u32, header[1], PATCH_MAX_SIZE)); + if (save_patch) + memcpy(patch, mc, + min_t(u32, header[1], PATCH_MAX_SIZE)); } } @@ -246,7 +249,7 @@ void __init load_ucode_amd_bsp(void) *data = cp.data; *size = cp.size; - apply_ucode_in_initrd(cp.data, cp.size); + apply_ucode_in_initrd(cp.data, cp.size, true); } #ifdef CONFIG_X86_32 @@ -263,7 +266,7 @@ void load_ucode_amd_ap(void) size_t *usize; void **ucode; - mc = (struct microcode_amd *)__pa(amd_ucode_patch); + mc = (struct microcode_amd *)__pa_nodebug(amd_ucode_patch); if (mc->hdr.patch_id && mc->hdr.processor_rev_id) { __apply_microcode_amd(mc); return; @@ -275,7 +278,7 @@ void load_ucode_amd_ap(void) if (!*ucode || !*usize) return; - apply_ucode_in_initrd(*ucode, *usize); + apply_ucode_in_initrd(*ucode, *usize, false); } static void __init collect_cpu_sig_on_bsp(void *arg) @@ -339,7 +342,7 @@ void load_ucode_amd_ap(void) * AP has a different equivalence ID than BSP, looks like * mixed-steppings silicon so go through the ucode blob anew. */ - apply_ucode_in_initrd(ucode_cpio.data, ucode_cpio.size); + apply_ucode_in_initrd(ucode_cpio.data, ucode_cpio.size, false); } } #endif @@ -347,7 +350,9 @@ void load_ucode_amd_ap(void) int __init save_microcode_in_initrd_amd(void) { unsigned long cont; + int retval = 0; enum ucode_state ret; + u8 *cont_va; u32 eax; if (!container) @@ -355,13 +360,15 @@ int __init save_microcode_in_initrd_amd(void) #ifdef CONFIG_X86_32 get_bsp_sig(); - cont = (unsigned long)container; + cont = (unsigned long)container; + cont_va = __va(container); #else /* * We need the physical address of the container for both bitness since * boot_params.hdr.ramdisk_image is a physical address. */ - cont = __pa(container); + cont = __pa(container); + cont_va = container; #endif /* @@ -372,6 +379,8 @@ int __init save_microcode_in_initrd_amd(void) if (relocated_ramdisk) container = (u8 *)(__va(relocated_ramdisk) + (cont - boot_params.hdr.ramdisk_image)); + else + container = cont_va; if (ucode_new_rev) pr_info("microcode: updated early to new patch_level=0x%08x\n", @@ -382,7 +391,7 @@ int __init save_microcode_in_initrd_amd(void) ret = load_microcode_amd(eax, container, container_size); if (ret != UCODE_OK) - return -EINVAL; + retval = -EINVAL; /* * This will be freed any msec now, stash patches for the current @@ -391,5 +400,5 @@ int __init save_microcode_in_initrd_amd(void) container = NULL; container_size = 0; - return 0; + return retval; } diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index dd9d619..08fe6e8 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -465,6 +465,16 @@ static void mc_bp_resume(void) if (uci->valid && uci->mc) microcode_ops->apply_microcode(cpu); +#ifdef CONFIG_X86_64 + else if (!uci->mc) + /* + * We might resume and not have applied late microcode but still + * have a newer patch stashed from the early loader. We don't + * have it in uci->mc so we have to load it the same way we're + * applying patches early on the APs. + */ + load_ucode_ap(); +#endif } static struct syscore_ops mc_syscore_ops = { diff --git a/arch/x86/kernel/cpu/microcode/core_early.c b/arch/x86/kernel/cpu/microcode/core_early.c index 5f28a64..2c017f2 100644 --- a/arch/x86/kernel/cpu/microcode/core_early.c +++ b/arch/x86/kernel/cpu/microcode/core_early.c @@ -124,7 +124,7 @@ void __init load_ucode_bsp(void) static bool check_loader_disabled_ap(void) { #ifdef CONFIG_X86_32 - return __pa_nodebug(dis_ucode_ldr); + return *((bool *)__pa_nodebug(&dis_ucode_ldr)); #else return dis_ucode_ldr; #endif diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index fc5eb39..4e6cdb0 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -253,6 +253,10 @@ struct cpu_hw_events { #define INTEL_UEVENT_CONSTRAINT(c, n) \ EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK) +/* Like UEVENT_CONSTRAINT, but match flags too */ +#define INTEL_FLAGS_UEVENT_CONSTRAINT(c, n) \ + EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS) + #define INTEL_PLD_CONSTRAINT(c, n) \ __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LDLAT) diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c index cbb1be3e..a61f5c6 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c +++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c @@ -565,6 +565,21 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) perf_ibs->offset_max, offset + 1); } while (offset < offset_max); + if (event->attr.sample_type & PERF_SAMPLE_RAW) { + /* + * Read IbsBrTarget and IbsOpData4 separately + * depending on their availability. + * Can't add to offset_max as they are staggered + */ + if (ibs_caps & IBS_CAPS_BRNTRGT) { + rdmsrl(MSR_AMD64_IBSBRTARGET, *buf++); + size++; + } + if (ibs_caps & IBS_CAPS_OPDATA4) { + rdmsrl(MSR_AMD64_IBSOPDATA4, *buf++); + size++; + } + } ibs_data.size = sizeof(u64) * size; regs = *iregs; diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 46211bc..3c895d4 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -552,18 +552,18 @@ int intel_pmu_drain_bts_buffer(void) * PEBS */ struct event_constraint intel_core2_pebs_event_constraints[] = { - INTEL_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */ - INTEL_UEVENT_CONSTRAINT(0xfec1, 0x1), /* X87_OPS_RETIRED.ANY */ - INTEL_UEVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_RETIRED.MISPRED */ - INTEL_UEVENT_CONSTRAINT(0x1fc7, 0x1), /* SIMD_INST_RETURED.ANY */ - INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0xfec1, 0x1), /* X87_OPS_RETIRED.ANY */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_RETIRED.MISPRED */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0x1fc7, 0x1), /* SIMD_INST_RETURED.ANY */ + INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */ EVENT_CONSTRAINT_END }; struct event_constraint intel_atom_pebs_event_constraints[] = { - INTEL_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */ - INTEL_UEVENT_CONSTRAINT(0x00c5, 0x1), /* MISPREDICTED_BRANCH_RETIRED */ - INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c5, 0x1), /* MISPREDICTED_BRANCH_RETIRED */ + INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */ EVENT_CONSTRAINT_END }; @@ -577,36 +577,36 @@ struct event_constraint intel_slm_pebs_event_constraints[] = { struct event_constraint intel_nehalem_pebs_event_constraints[] = { INTEL_PLD_CONSTRAINT(0x100b, 0xf), /* MEM_INST_RETIRED.* */ - INTEL_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */ - INTEL_UEVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */ - INTEL_EVENT_CONSTRAINT(0xc0, 0xf), /* INST_RETIRED.ANY */ + INTEL_FLAGS_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */ + INTEL_FLAGS_EVENT_CONSTRAINT(0xc0, 0xf), /* INST_RETIRED.ANY */ INTEL_EVENT_CONSTRAINT(0xc2, 0xf), /* UOPS_RETIRED.* */ - INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */ - INTEL_UEVENT_CONSTRAINT(0x02c5, 0xf), /* BR_MISP_RETIRED.NEAR_CALL */ - INTEL_EVENT_CONSTRAINT(0xc7, 0xf), /* SSEX_UOPS_RETIRED.* */ - INTEL_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */ - INTEL_EVENT_CONSTRAINT(0xcb, 0xf), /* MEM_LOAD_RETIRED.* */ - INTEL_EVENT_CONSTRAINT(0xf7, 0xf), /* FP_ASSIST.* */ + INTEL_FLAGS_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0x02c5, 0xf), /* BR_MISP_RETIRED.NEAR_CALL */ + INTEL_FLAGS_EVENT_CONSTRAINT(0xc7, 0xf), /* SSEX_UOPS_RETIRED.* */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */ + INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0xf), /* MEM_LOAD_RETIRED.* */ + INTEL_FLAGS_EVENT_CONSTRAINT(0xf7, 0xf), /* FP_ASSIST.* */ EVENT_CONSTRAINT_END }; struct event_constraint intel_westmere_pebs_event_constraints[] = { INTEL_PLD_CONSTRAINT(0x100b, 0xf), /* MEM_INST_RETIRED.* */ - INTEL_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */ - INTEL_UEVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */ - INTEL_EVENT_CONSTRAINT(0xc0, 0xf), /* INSTR_RETIRED.* */ + INTEL_FLAGS_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */ + INTEL_FLAGS_EVENT_CONSTRAINT(0xc0, 0xf), /* INSTR_RETIRED.* */ INTEL_EVENT_CONSTRAINT(0xc2, 0xf), /* UOPS_RETIRED.* */ - INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */ - INTEL_EVENT_CONSTRAINT(0xc5, 0xf), /* BR_MISP_RETIRED.* */ - INTEL_EVENT_CONSTRAINT(0xc7, 0xf), /* SSEX_UOPS_RETIRED.* */ - INTEL_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */ - INTEL_EVENT_CONSTRAINT(0xcb, 0xf), /* MEM_LOAD_RETIRED.* */ - INTEL_EVENT_CONSTRAINT(0xf7, 0xf), /* FP_ASSIST.* */ + INTEL_FLAGS_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */ + INTEL_FLAGS_EVENT_CONSTRAINT(0xc5, 0xf), /* BR_MISP_RETIRED.* */ + INTEL_FLAGS_EVENT_CONSTRAINT(0xc7, 0xf), /* SSEX_UOPS_RETIRED.* */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */ + INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0xf), /* MEM_LOAD_RETIRED.* */ + INTEL_FLAGS_EVENT_CONSTRAINT(0xf7, 0xf), /* FP_ASSIST.* */ EVENT_CONSTRAINT_END }; struct event_constraint intel_snb_pebs_event_constraints[] = { - INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */ INTEL_PLD_CONSTRAINT(0x01cd, 0x8), /* MEM_TRANS_RETIRED.LAT_ABOVE_THR */ INTEL_PST_CONSTRAINT(0x02cd, 0x8), /* MEM_TRANS_RETIRED.PRECISE_STORES */ /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */ @@ -617,7 +617,7 @@ struct event_constraint intel_snb_pebs_event_constraints[] = { }; struct event_constraint intel_ivb_pebs_event_constraints[] = { - INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */ INTEL_PLD_CONSTRAINT(0x01cd, 0x8), /* MEM_TRANS_RETIRED.LAT_ABOVE_THR */ INTEL_PST_CONSTRAINT(0x02cd, 0x8), /* MEM_TRANS_RETIRED.PRECISE_STORES */ /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */ @@ -628,7 +628,7 @@ struct event_constraint intel_ivb_pebs_event_constraints[] = { }; struct event_constraint intel_hsw_pebs_event_constraints[] = { - INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */ INTEL_PLD_CONSTRAINT(0x01cd, 0xf), /* MEM_TRANS_RETIRED.* */ /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */ INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf), @@ -724,6 +724,7 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs) unsigned long ip = regs->ip; int is_64bit = 0; void *kaddr; + int size; /* * We don't need to fixup if the PEBS assist is fault like @@ -758,11 +759,12 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs) return 1; } + size = ip - to; if (!kernel_ip(ip)) { - int size, bytes; + int bytes; u8 *buf = this_cpu_read(insn_buffer); - size = ip - to; /* Must fit our buffer, see above */ + /* 'size' must fit our buffer, see above */ bytes = copy_from_user_nmi(buf, (void __user *)to, size); if (bytes != 0) return 0; @@ -780,11 +782,20 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs) #ifdef CONFIG_X86_64 is_64bit = kernel_ip(to) || !test_thread_flag(TIF_IA32); #endif - insn_init(&insn, kaddr, is_64bit); + insn_init(&insn, kaddr, size, is_64bit); insn_get_length(&insn); + /* + * Make sure there was not a problem decoding the + * instruction and getting the length. This is + * doubly important because we have an infinite + * loop if insn.length=0. + */ + if (!insn.length) + break; to += insn.length; kaddr += insn.length; + size -= insn.length; } while (to < ip); if (to == ip) { @@ -886,6 +897,29 @@ static void __intel_pmu_pebs_event(struct perf_event *event, regs.bp = pebs->bp; regs.sp = pebs->sp; + if (sample_type & PERF_SAMPLE_REGS_INTR) { + regs.ax = pebs->ax; + regs.bx = pebs->bx; + regs.cx = pebs->cx; + regs.dx = pebs->dx; + regs.si = pebs->si; + regs.di = pebs->di; + regs.bp = pebs->bp; + regs.sp = pebs->sp; + + regs.flags = pebs->flags; +#ifndef CONFIG_X86_32 + regs.r8 = pebs->r8; + regs.r9 = pebs->r9; + regs.r10 = pebs->r10; + regs.r11 = pebs->r11; + regs.r12 = pebs->r12; + regs.r13 = pebs->r13; + regs.r14 = pebs->r14; + regs.r15 = pebs->r15; +#endif + } + if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format >= 2) { regs.ip = pebs->real_ip; regs.flags |= PERF_EFLAGS_EXACT; diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c index 45fa730..58f1a94 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c @@ -465,7 +465,7 @@ static int branch_type(unsigned long from, unsigned long to, int abort) { struct insn insn; void *addr; - int bytes, size = MAX_INSN_SIZE; + int bytes_read, bytes_left; int ret = X86_BR_NONE; int ext, to_plm, from_plm; u8 buf[MAX_INSN_SIZE]; @@ -493,8 +493,10 @@ static int branch_type(unsigned long from, unsigned long to, int abort) return X86_BR_NONE; /* may fail if text not present */ - bytes = copy_from_user_nmi(buf, (void __user *)from, size); - if (bytes != 0) + bytes_left = copy_from_user_nmi(buf, (void __user *)from, + MAX_INSN_SIZE); + bytes_read = MAX_INSN_SIZE - bytes_left; + if (!bytes_read) return X86_BR_NONE; addr = buf; @@ -505,10 +507,19 @@ static int branch_type(unsigned long from, unsigned long to, int abort) * Ensure we don't blindy read any address by validating it is * a known text address. */ - if (kernel_text_address(from)) + if (kernel_text_address(from)) { addr = (void *)from; - else + /* + * Assume we can get the maximum possible size + * when grabbing kernel data. This is not + * _strictly_ true since we could possibly be + * executing up next to a memory hole, but + * it is very unlikely to be a problem. + */ + bytes_read = MAX_INSN_SIZE; + } else { return X86_BR_NONE; + } } /* @@ -518,8 +529,10 @@ static int branch_type(unsigned long from, unsigned long to, int abort) #ifdef CONFIG_X86_64 is64 = kernel_ip((unsigned long)addr) || !test_thread_flag(TIF_IA32); #endif - insn_init(&insn, addr, is64); + insn_init(&insn, addr, bytes_read, is64); insn_get_opcode(&insn); + if (!insn.opcode.got) + return X86_BR_ABORT; switch (insn.opcode.bytes[0]) { case 0xf: diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c index adf138e..745b158 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c @@ -449,7 +449,11 @@ static struct attribute *snbep_uncore_qpi_formats_attr[] = { static struct uncore_event_desc snbep_uncore_imc_events[] = { INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"), INTEL_UNCORE_EVENT_DESC(cas_count_read, "event=0x04,umask=0x03"), + INTEL_UNCORE_EVENT_DESC(cas_count_read.scale, "6.103515625e-5"), + INTEL_UNCORE_EVENT_DESC(cas_count_read.unit, "MiB"), INTEL_UNCORE_EVENT_DESC(cas_count_write, "event=0x04,umask=0x0c"), + INTEL_UNCORE_EVENT_DESC(cas_count_write.scale, "6.103515625e-5"), + INTEL_UNCORE_EVENT_DESC(cas_count_write.unit, "MiB"), { /* end: all zeroes */ }, }; @@ -486,14 +490,17 @@ static struct attribute_group snbep_uncore_qpi_format_group = { .attrs = snbep_uncore_qpi_formats_attr, }; -#define SNBEP_UNCORE_MSR_OPS_COMMON_INIT() \ - .init_box = snbep_uncore_msr_init_box, \ +#define __SNBEP_UNCORE_MSR_OPS_COMMON_INIT() \ .disable_box = snbep_uncore_msr_disable_box, \ .enable_box = snbep_uncore_msr_enable_box, \ .disable_event = snbep_uncore_msr_disable_event, \ .enable_event = snbep_uncore_msr_enable_event, \ .read_counter = uncore_msr_read_counter +#define SNBEP_UNCORE_MSR_OPS_COMMON_INIT() \ + __SNBEP_UNCORE_MSR_OPS_COMMON_INIT(), \ + .init_box = snbep_uncore_msr_init_box \ + static struct intel_uncore_ops snbep_uncore_msr_ops = { SNBEP_UNCORE_MSR_OPS_COMMON_INIT(), }; @@ -1919,6 +1926,30 @@ static struct intel_uncore_type hswep_uncore_cbox = { .format_group = &hswep_uncore_cbox_format_group, }; +/* + * Write SBOX Initialization register bit by bit to avoid spurious #GPs + */ +static void hswep_uncore_sbox_msr_init_box(struct intel_uncore_box *box) +{ + unsigned msr = uncore_msr_box_ctl(box); + + if (msr) { + u64 init = SNBEP_PMON_BOX_CTL_INT; + u64 flags = 0; + int i; + + for_each_set_bit(i, (unsigned long *)&init, 64) { + flags |= (1ULL << i); + wrmsrl(msr, flags); + } + } +} + +static struct intel_uncore_ops hswep_uncore_sbox_msr_ops = { + __SNBEP_UNCORE_MSR_OPS_COMMON_INIT(), + .init_box = hswep_uncore_sbox_msr_init_box +}; + static struct attribute *hswep_uncore_sbox_formats_attr[] = { &format_attr_event.attr, &format_attr_umask.attr, @@ -1944,7 +1975,7 @@ static struct intel_uncore_type hswep_uncore_sbox = { .event_mask = HSWEP_S_MSR_PMON_RAW_EVENT_MASK, .box_ctl = HSWEP_S0_MSR_PMON_BOX_CTL, .msr_offset = HSWEP_SBOX_MSR_OFFSET, - .ops = &snbep_uncore_msr_ops, + .ops = &hswep_uncore_sbox_msr_ops, .format_group = &hswep_uncore_sbox_format_group, }; @@ -2009,7 +2040,11 @@ static struct intel_uncore_type hswep_uncore_ha = { static struct uncore_event_desc hswep_uncore_imc_events[] = { INTEL_UNCORE_EVENT_DESC(clockticks, "event=0x00,umask=0x00"), INTEL_UNCORE_EVENT_DESC(cas_count_read, "event=0x04,umask=0x03"), + INTEL_UNCORE_EVENT_DESC(cas_count_read.scale, "6.103515625e-5"), + INTEL_UNCORE_EVENT_DESC(cas_count_read.unit, "MiB"), INTEL_UNCORE_EVENT_DESC(cas_count_write, "event=0x04,umask=0x0c"), + INTEL_UNCORE_EVENT_DESC(cas_count_write.scale, "6.103515625e-5"), + INTEL_UNCORE_EVENT_DESC(cas_count_write.unit, "MiB"), { /* end: all zeroes */ }, }; @@ -2025,13 +2060,27 @@ static struct intel_uncore_type hswep_uncore_imc = { SNBEP_UNCORE_PCI_COMMON_INIT(), }; +static unsigned hswep_uncore_irp_ctrs[] = {0xa0, 0xa8, 0xb0, 0xb8}; + +static u64 hswep_uncore_irp_read_counter(struct intel_uncore_box *box, struct perf_event *event) +{ + struct pci_dev *pdev = box->pci_dev; + struct hw_perf_event *hwc = &event->hw; + u64 count = 0; + + pci_read_config_dword(pdev, hswep_uncore_irp_ctrs[hwc->idx], (u32 *)&count); + pci_read_config_dword(pdev, hswep_uncore_irp_ctrs[hwc->idx] + 4, (u32 *)&count + 1); + + return count; +} + static struct intel_uncore_ops hswep_uncore_irp_ops = { .init_box = snbep_uncore_pci_init_box, .disable_box = snbep_uncore_pci_disable_box, .enable_box = snbep_uncore_pci_enable_box, .disable_event = ivbep_uncore_irp_disable_event, .enable_event = ivbep_uncore_irp_enable_event, - .read_counter = ivbep_uncore_irp_read_counter, + .read_counter = hswep_uncore_irp_read_counter, }; static struct intel_uncore_type hswep_uncore_irp = { diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index 5433658..e7d8c76 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -72,7 +72,7 @@ static int show_cpuinfo(struct seq_file *m, void *v) if (c->x86_mask || c->cpuid_level >= 0) seq_printf(m, "stepping\t: %d\n", c->x86_mask); else - seq_printf(m, "stepping\t: unknown\n"); + seq_puts(m, "stepping\t: unknown\n"); if (c->microcode) seq_printf(m, "microcode\t: 0x%x\n", c->microcode); @@ -92,12 +92,12 @@ static int show_cpuinfo(struct seq_file *m, void *v) show_cpuinfo_core(m, c, cpu); show_cpuinfo_misc(m, c); - seq_printf(m, "flags\t\t:"); + seq_puts(m, "flags\t\t:"); for (i = 0; i < 32*NCAPINTS; i++) if (cpu_has(c, i) && x86_cap_flags[i] != NULL) seq_printf(m, " %s", x86_cap_flags[i]); - seq_printf(m, "\nbugs\t\t:"); + seq_puts(m, "\nbugs\t\t:"); for (i = 0; i < 32*NBUGINTS; i++) { unsigned int bug_bit = 32*NCAPINTS + i; @@ -118,7 +118,7 @@ static int show_cpuinfo(struct seq_file *m, void *v) seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n", c->x86_phys_bits, c->x86_virt_bits); - seq_printf(m, "power management:"); + seq_puts(m, "power management:"); for (i = 0; i < 32; i++) { if (c->x86_power & (1 << i)) { if (i < ARRAY_SIZE(x86_power_flags) && @@ -131,7 +131,7 @@ static int show_cpuinfo(struct seq_file *m, void *v) } } - seq_printf(m, "\n\n"); + seq_puts(m, "\n\n"); return 0; } diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 1abcb50..ff86f19 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -24,7 +24,6 @@ static char x86_stack_ids[][8] = { [ DEBUG_STACK-1 ] = "#DB", [ NMI_STACK-1 ] = "NMI", [ DOUBLEFAULT_STACK-1 ] = "#DF", - [ STACKFAULT_STACK-1 ] = "#SS", [ MCE_STACK-1 ] = "#MC", #if DEBUG_STKSZ > EXCEPTION_STKSZ [ N_EXCEPTION_STACKS ... diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index df088bb..c0226ab 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -828,9 +828,15 @@ ENTRY(native_iret) jnz native_irq_return_ldt #endif +.global native_irq_return_iret native_irq_return_iret: + /* + * This may fault. Non-paranoid faults on return to userspace are + * handled by fixup_bad_iret. These include #SS, #GP, and #NP. + * Double-faults due to espfix64 are handled in do_double_fault. + * Other faults here are fatal. + */ iretq - _ASM_EXTABLE(native_irq_return_iret, bad_iret) #ifdef CONFIG_X86_ESPFIX64 native_irq_return_ldt: @@ -858,25 +864,6 @@ native_irq_return_ldt: jmp native_irq_return_iret #endif - .section .fixup,"ax" -bad_iret: - /* - * The iret traps when the %cs or %ss being restored is bogus. - * We've lost the original trap vector and error code. - * #GPF is the most likely one to get for an invalid selector. - * So pretend we completed the iret and took the #GPF in user mode. - * - * We are now running with the kernel GS after exception recovery. - * But error_entry expects us to have user GS to match the user %cs, - * so swap back. - */ - pushq $0 - - SWAPGS - jmp general_protection - - .previous - /* edi: workmask, edx: work */ retint_careful: CFI_RESTORE_STATE @@ -922,37 +909,6 @@ ENTRY(retint_kernel) CFI_ENDPROC END(common_interrupt) - /* - * If IRET takes a fault on the espfix stack, then we - * end up promoting it to a doublefault. In that case, - * modify the stack to make it look like we just entered - * the #GP handler from user space, similar to bad_iret. - */ -#ifdef CONFIG_X86_ESPFIX64 - ALIGN -__do_double_fault: - XCPT_FRAME 1 RDI+8 - movq RSP(%rdi),%rax /* Trap on the espfix stack? */ - sarq $PGDIR_SHIFT,%rax - cmpl $ESPFIX_PGD_ENTRY,%eax - jne do_double_fault /* No, just deliver the fault */ - cmpl $__KERNEL_CS,CS(%rdi) - jne do_double_fault - movq RIP(%rdi),%rax - cmpq $native_irq_return_iret,%rax - jne do_double_fault /* This shouldn't happen... */ - movq PER_CPU_VAR(kernel_stack),%rax - subq $(6*8-KERNEL_STACK_OFFSET),%rax /* Reset to original stack */ - movq %rax,RSP(%rdi) - movq $0,(%rax) /* Missing (lost) #GP error code */ - movq $general_protection,RIP(%rdi) - retq - CFI_ENDPROC -END(__do_double_fault) -#else -# define __do_double_fault do_double_fault -#endif - /* * APIC interrupts. */ @@ -1124,7 +1080,7 @@ idtentry overflow do_overflow has_error_code=0 idtentry bounds do_bounds has_error_code=0 idtentry invalid_op do_invalid_op has_error_code=0 idtentry device_not_available do_device_not_available has_error_code=0 -idtentry double_fault __do_double_fault has_error_code=1 paranoid=1 +idtentry double_fault do_double_fault has_error_code=1 paranoid=1 idtentry coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0 idtentry invalid_TSS do_invalid_TSS has_error_code=1 idtentry segment_not_present do_segment_not_present has_error_code=1 @@ -1289,7 +1245,7 @@ apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \ idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK idtentry int3 do_int3 has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK -idtentry stack_segment do_stack_segment has_error_code=1 paranoid=1 +idtentry stack_segment do_stack_segment has_error_code=1 #ifdef CONFIG_XEN idtentry xen_debug do_debug has_error_code=0 idtentry xen_int3 do_int3 has_error_code=0 @@ -1399,17 +1355,16 @@ error_sti: /* * There are two places in the kernel that can potentially fault with - * usergs. Handle them here. The exception handlers after iret run with - * kernel gs again, so don't set the user space flag. B stepping K8s - * sometimes report an truncated RIP for IRET exceptions returning to - * compat mode. Check for these here too. + * usergs. Handle them here. B stepping K8s sometimes report a + * truncated RIP for IRET exceptions returning to compat mode. Check + * for these here too. */ error_kernelspace: CFI_REL_OFFSET rcx, RCX+8 incl %ebx leaq native_irq_return_iret(%rip),%rcx cmpq %rcx,RIP+8(%rsp) - je error_swapgs + je error_bad_iret movl %ecx,%eax /* zero extend */ cmpq %rax,RIP+8(%rsp) je bstep_iret @@ -1420,7 +1375,15 @@ error_kernelspace: bstep_iret: /* Fix truncated RIP */ movq %rcx,RIP+8(%rsp) - jmp error_swapgs + /* fall through */ + +error_bad_iret: + SWAPGS + mov %rsp,%rdi + call fixup_bad_iret + mov %rax,%rsp + decl %ebx /* Return to usergs */ + jmp error_sti CFI_ENDPROC END(error_entry) diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 922d285..6307a0f 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -59,78 +59,78 @@ int arch_show_interrupts(struct seq_file *p, int prec) seq_printf(p, "%*s: ", prec, "NMI"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->__nmi_count); - seq_printf(p, " Non-maskable interrupts\n"); + seq_puts(p, " Non-maskable interrupts\n"); #ifdef CONFIG_X86_LOCAL_APIC seq_printf(p, "%*s: ", prec, "LOC"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->apic_timer_irqs); - seq_printf(p, " Local timer interrupts\n"); + seq_puts(p, " Local timer interrupts\n"); seq_printf(p, "%*s: ", prec, "SPU"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count); - seq_printf(p, " Spurious interrupts\n"); + seq_puts(p, " Spurious interrupts\n"); seq_printf(p, "%*s: ", prec, "PMI"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs); - seq_printf(p, " Performance monitoring interrupts\n"); + seq_puts(p, " Performance monitoring interrupts\n"); seq_printf(p, "%*s: ", prec, "IWI"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->apic_irq_work_irqs); - seq_printf(p, " IRQ work interrupts\n"); + seq_puts(p, " IRQ work interrupts\n"); seq_printf(p, "%*s: ", prec, "RTR"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->icr_read_retry_count); - seq_printf(p, " APIC ICR read retries\n"); + seq_puts(p, " APIC ICR read retries\n"); #endif if (x86_platform_ipi_callback) { seq_printf(p, "%*s: ", prec, "PLT"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->x86_platform_ipis); - seq_printf(p, " Platform interrupts\n"); + seq_puts(p, " Platform interrupts\n"); } #ifdef CONFIG_SMP seq_printf(p, "%*s: ", prec, "RES"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_resched_count); - seq_printf(p, " Rescheduling interrupts\n"); + seq_puts(p, " Rescheduling interrupts\n"); seq_printf(p, "%*s: ", prec, "CAL"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_call_count - irq_stats(j)->irq_tlb_count); - seq_printf(p, " Function call interrupts\n"); + seq_puts(p, " Function call interrupts\n"); seq_printf(p, "%*s: ", prec, "TLB"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_tlb_count); - seq_printf(p, " TLB shootdowns\n"); + seq_puts(p, " TLB shootdowns\n"); #endif #ifdef CONFIG_X86_THERMAL_VECTOR seq_printf(p, "%*s: ", prec, "TRM"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count); - seq_printf(p, " Thermal event interrupts\n"); + seq_puts(p, " Thermal event interrupts\n"); #endif #ifdef CONFIG_X86_MCE_THRESHOLD seq_printf(p, "%*s: ", prec, "THR"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count); - seq_printf(p, " Threshold APIC interrupts\n"); + seq_puts(p, " Threshold APIC interrupts\n"); #endif #ifdef CONFIG_X86_MCE seq_printf(p, "%*s: ", prec, "MCE"); for_each_online_cpu(j) seq_printf(p, "%10u ", per_cpu(mce_exception_count, j)); - seq_printf(p, " Machine check exceptions\n"); + seq_puts(p, " Machine check exceptions\n"); seq_printf(p, "%*s: ", prec, "MCP"); for_each_online_cpu(j) seq_printf(p, "%10u ", per_cpu(mce_poll_count, j)); - seq_printf(p, " Machine check polls\n"); + seq_puts(p, " Machine check polls\n"); #endif #if IS_ENABLED(CONFIG_HYPERV) || defined(CONFIG_XEN) seq_printf(p, "%*s: ", prec, "THR"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_hv_callback_count); - seq_printf(p, " Hypervisor callback interrupts\n"); + seq_puts(p, " Hypervisor callback interrupts\n"); #endif seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count)); #if defined(CONFIG_X86_IO_APIC) diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 67e6d19e..f7e3cd5 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -285,7 +285,7 @@ static int can_probe(unsigned long paddr) * normally used, we just go through if there is no kprobe. */ __addr = recover_probed_instruction(buf, addr); - kernel_insn_init(&insn, (void *)__addr); + kernel_insn_init(&insn, (void *)__addr, MAX_INSN_SIZE); insn_get_length(&insn); /* @@ -330,8 +330,10 @@ int __copy_instruction(u8 *dest, u8 *src) { struct insn insn; kprobe_opcode_t buf[MAX_INSN_SIZE]; + unsigned long recovered_insn = + recover_probed_instruction(buf, (unsigned long)src); - kernel_insn_init(&insn, (void *)recover_probed_instruction(buf, (unsigned long)src)); + kernel_insn_init(&insn, (void *)recovered_insn, MAX_INSN_SIZE); insn_get_length(&insn); /* Another subsystem puts a breakpoint, failed to recover */ if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) @@ -342,7 +344,7 @@ int __copy_instruction(u8 *dest, u8 *src) if (insn_rip_relative(&insn)) { s64 newdisp; u8 *disp; - kernel_insn_init(&insn, dest); + kernel_insn_init(&insn, dest, insn.length); insn_get_displacement(&insn); /* * The copied instruction uses the %rip-relative addressing diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index f1314d0..7c523bb 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -251,13 +251,15 @@ static int can_optimize(unsigned long paddr) /* Decode instructions */ addr = paddr - offset; while (addr < paddr - offset + size) { /* Decode until function end */ + unsigned long recovered_insn; if (search_exception_tables(addr)) /* * Since some fixup code will jumps into this function, * we can't optimize kprobe in this function. */ return 0; - kernel_insn_init(&insn, (void *)recover_probed_instruction(buf, addr)); + recovered_insn = recover_probed_instruction(buf, addr); + kernel_insn_init(&insn, (void *)recovered_insn, MAX_INSN_SIZE); insn_get_length(&insn); /* Another subsystem puts a breakpoint */ if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 749b0e4..e510618 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -1484,7 +1484,7 @@ unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch) */ if (work & _TIF_NOHZ) { user_exit(); - work &= ~TIF_NOHZ; + work &= ~_TIF_NOHZ; } #ifdef CONFIG_SECCOMP diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index ab08aa2..214245d 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -960,6 +960,8 @@ void __init setup_arch(char **cmdline_p) init_mm.end_data = (unsigned long) _edata; init_mm.brk = _brk_end; + mpx_mm_init(&init_mm); + code_resource.start = __pa_symbol(_text); code_resource.end = __pa_symbol(_etext)-1; data_resource.start = __pa_symbol(_etext); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index a03ec604..7a8f584 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1303,10 +1303,14 @@ static void __ref remove_cpu_from_maps(int cpu) numa_remove_cpu(cpu); } +static DEFINE_PER_CPU(struct completion, die_complete); + void cpu_disable_common(void) { int cpu = smp_processor_id(); + init_completion(&per_cpu(die_complete, smp_processor_id())); + remove_siblinginfo(cpu); /* It's now safe to remove this processor from the online map */ @@ -1316,8 +1320,6 @@ void cpu_disable_common(void) fixup_irqs(); } -static DEFINE_PER_CPU(struct completion, die_complete); - int native_cpu_disable(void) { int ret; @@ -1327,16 +1329,21 @@ int native_cpu_disable(void) return ret; clear_local_APIC(); - init_completion(&per_cpu(die_complete, smp_processor_id())); cpu_disable_common(); return 0; } +void cpu_die_common(unsigned int cpu) +{ + wait_for_completion_timeout(&per_cpu(die_complete, cpu), HZ); +} + void native_cpu_die(unsigned int cpu) { /* We don't do anything here: idle task is faking death itself. */ - wait_for_completion_timeout(&per_cpu(die_complete, cpu), HZ); + + cpu_die_common(cpu); /* They ack this in play_dead() by setting CPU_DEAD */ if (per_cpu(cpu_state, cpu) == CPU_DEAD) { diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 0d0e922..a9ae205 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -60,6 +60,7 @@ #include <asm/fixmap.h> #include <asm/mach_traps.h> #include <asm/alternative.h> +#include <asm/mpx.h> #ifdef CONFIG_X86_64 #include <asm/x86_init.h> @@ -228,37 +229,44 @@ dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ DO_ERROR(X86_TRAP_DE, SIGFPE, "divide error", divide_error) DO_ERROR(X86_TRAP_OF, SIGSEGV, "overflow", overflow) -DO_ERROR(X86_TRAP_BR, SIGSEGV, "bounds", bounds) DO_ERROR(X86_TRAP_UD, SIGILL, "invalid opcode", invalid_op) DO_ERROR(X86_TRAP_OLD_MF, SIGFPE, "coprocessor segment overrun",coprocessor_segment_overrun) DO_ERROR(X86_TRAP_TS, SIGSEGV, "invalid TSS", invalid_TSS) DO_ERROR(X86_TRAP_NP, SIGBUS, "segment not present", segment_not_present) -#ifdef CONFIG_X86_32 DO_ERROR(X86_TRAP_SS, SIGBUS, "stack segment", stack_segment) -#endif DO_ERROR(X86_TRAP_AC, SIGBUS, "alignment check", alignment_check) #ifdef CONFIG_X86_64 /* Runs on IST stack */ -dotraplinkage void do_stack_segment(struct pt_regs *regs, long error_code) -{ - enum ctx_state prev_state; - - prev_state = exception_enter(); - if (notify_die(DIE_TRAP, "stack segment", regs, error_code, - X86_TRAP_SS, SIGBUS) != NOTIFY_STOP) { - preempt_conditional_sti(regs); - do_trap(X86_TRAP_SS, SIGBUS, "stack segment", regs, error_code, NULL); - preempt_conditional_cli(regs); - } - exception_exit(prev_state); -} - dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) { static const char str[] = "double fault"; struct task_struct *tsk = current; +#ifdef CONFIG_X86_ESPFIX64 + extern unsigned char native_irq_return_iret[]; + + /* + * If IRET takes a non-IST fault on the espfix64 stack, then we + * end up promoting it to a doublefault. In that case, modify + * the stack to make it look like we just entered the #GP + * handler from user space, similar to bad_iret. + */ + if (((long)regs->sp >> PGDIR_SHIFT) == ESPFIX_PGD_ENTRY && + regs->cs == __KERNEL_CS && + regs->ip == (unsigned long)native_irq_return_iret) + { + struct pt_regs *normal_regs = task_pt_regs(current); + + /* Fake a #GP(0) from userspace. */ + memmove(&normal_regs->ip, (void *)regs->sp, 5*8); + normal_regs->orig_ax = 0; /* Missing (lost) #GP error code */ + regs->ip = (unsigned long)general_protection; + regs->sp = (unsigned long)&normal_regs->orig_ax; + return; + } +#endif + exception_enter(); /* Return not checked because double check cannot be ignored */ notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV); @@ -278,6 +286,89 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) } #endif +dotraplinkage void do_bounds(struct pt_regs *regs, long error_code) +{ + struct task_struct *tsk = current; + struct xsave_struct *xsave_buf; + enum ctx_state prev_state; + struct bndcsr *bndcsr; + siginfo_t *info; + + prev_state = exception_enter(); + if (notify_die(DIE_TRAP, "bounds", regs, error_code, + X86_TRAP_BR, SIGSEGV) == NOTIFY_STOP) + goto exit; + conditional_sti(regs); + + if (!user_mode(regs)) + die("bounds", regs, error_code); + + if (!cpu_feature_enabled(X86_FEATURE_MPX)) { + /* The exception is not from Intel MPX */ + goto exit_trap; + } + + /* + * We need to look at BNDSTATUS to resolve this exception. + * It is not directly accessible, though, so we need to + * do an xsave and then pull it out of the xsave buffer. + */ + fpu_save_init(&tsk->thread.fpu); + xsave_buf = &(tsk->thread.fpu.state->xsave); + bndcsr = get_xsave_addr(xsave_buf, XSTATE_BNDCSR); + if (!bndcsr) + goto exit_trap; + + /* + * The error code field of the BNDSTATUS register communicates status + * information of a bound range exception #BR or operation involving + * bound directory. + */ + switch (bndcsr->bndstatus & MPX_BNDSTA_ERROR_CODE) { + case 2: /* Bound directory has invalid entry. */ + if (mpx_handle_bd_fault(xsave_buf)) + goto exit_trap; + break; /* Success, it was handled */ + case 1: /* Bound violation. */ + info = mpx_generate_siginfo(regs, xsave_buf); + if (PTR_ERR(info)) { + /* + * We failed to decode the MPX instruction. Act as if + * the exception was not caused by MPX. + */ + goto exit_trap; + } + /* + * Success, we decoded the instruction and retrieved + * an 'info' containing the address being accessed + * which caused the exception. This information + * allows and application to possibly handle the + * #BR exception itself. + */ + do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, error_code, info); + kfree(info); + break; + case 0: /* No exception caused by Intel MPX operations. */ + goto exit_trap; + default: + die("bounds", regs, error_code); + } + +exit: + exception_exit(prev_state); + return; +exit_trap: + /* + * This path out is for all the cases where we could not + * handle the exception in some way (like allocating a + * table or telling userspace about it. We will also end + * up here if the kernel has MPX turned off at compile + * time.. + */ + do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, error_code, NULL); + exception_exit(prev_state); +} + dotraplinkage void do_general_protection(struct pt_regs *regs, long error_code) { @@ -379,7 +470,7 @@ NOKPROBE_SYMBOL(do_int3); * for scheduling or signal handling. The actual stack switch is done in * entry.S */ -asmlinkage __visible struct pt_regs *sync_regs(struct pt_regs *eregs) +asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs) { struct pt_regs *regs = eregs; /* Did already sync */ @@ -399,6 +490,36 @@ asmlinkage __visible struct pt_regs *sync_regs(struct pt_regs *eregs) return regs; } NOKPROBE_SYMBOL(sync_regs); + +struct bad_iret_stack { + void *error_entry_ret; + struct pt_regs regs; +}; + +asmlinkage __visible notrace +struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s) +{ + /* + * This is called from entry_64.S early in handling a fault + * caused by a bad iret to user mode. To handle the fault + * correctly, we want move our stack frame to task_pt_regs + * and we want to pretend that the exception came from the + * iret target. + */ + struct bad_iret_stack *new_stack = + container_of(task_pt_regs(current), + struct bad_iret_stack, regs); + + /* Copy the IRET target to the new stack. */ + memmove(&new_stack->regs.ip, (void *)s->regs.sp, 5*8); + + /* Copy the remainder of the stack from the current stack. */ + memmove(new_stack, s, offsetof(struct bad_iret_stack, regs.ip)); + + BUG_ON(!user_mode_vm(&new_stack->regs)); + return new_stack; +} +NOKPROBE_SYMBOL(fixup_bad_iret); #endif /* @@ -778,7 +899,7 @@ void __init trap_init(void) set_intr_gate(X86_TRAP_OLD_MF, coprocessor_segment_overrun); set_intr_gate(X86_TRAP_TS, invalid_TSS); set_intr_gate(X86_TRAP_NP, segment_not_present); - set_intr_gate_ist(X86_TRAP_SS, &stack_segment, STACKFAULT_STACK); + set_intr_gate(X86_TRAP_SS, stack_segment); set_intr_gate(X86_TRAP_GP, general_protection); set_intr_gate(X86_TRAP_SPURIOUS, spurious_interrupt_bug); set_intr_gate(X86_TRAP_MF, coprocessor_error); diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 5d1cbfe..8b96a94 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -219,7 +219,7 @@ static int uprobe_init_insn(struct arch_uprobe *auprobe, struct insn *insn, bool { u32 volatile *good_insns; - insn_init(insn, auprobe->insn, x86_64); + insn_init(insn, auprobe->insn, sizeof(auprobe->insn), x86_64); /* has the side-effect of processing the entire instruction */ insn_get_length(insn); if (WARN_ON_ONCE(!insn_complete(insn))) diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index e48b674..234b072 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -116,8 +116,6 @@ struct x86_msi_ops x86_msi = { .teardown_msi_irqs = default_teardown_msi_irqs, .restore_msi_irqs = default_restore_msi_irqs, .setup_hpet_msi = default_setup_hpet_msi, - .msi_mask_irq = default_msi_mask_irq, - .msix_mask_irq = default_msix_mask_irq, }; /* MSI arch specific hooks */ @@ -140,14 +138,6 @@ void arch_restore_msi_irqs(struct pci_dev *dev) { x86_msi.restore_msi_irqs(dev); } -u32 arch_msi_mask_irq(struct msi_desc *desc, u32 mask, u32 flag) -{ - return x86_msi.msi_mask_irq(desc, mask, flag); -} -u32 arch_msix_mask_irq(struct msi_desc *desc, u32 flag) -{ - return x86_msi.msix_mask_irq(desc, flag); -} #endif struct x86_io_apic_ops x86_io_apic_ops = { diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 749f9fa..9f8a2fa 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -574,12 +574,14 @@ static inline int assign_eip_far(struct x86_emulate_ctxt *ctxt, ulong dst, case 4: ctxt->_eip = (u32)dst; break; +#ifdef CONFIG_X86_64 case 8: if ((cs_l && is_noncanonical_address(dst)) || - (!cs_l && (dst & ~(u32)-1))) + (!cs_l && (dst >> 32) != 0)) return emulate_gp(ctxt, 0); ctxt->_eip = dst; break; +#endif default: WARN(1, "unsupported eip assignment size\n"); } @@ -641,7 +643,8 @@ static bool insn_aligned(struct x86_emulate_ctxt *ctxt, unsigned size) static int __linearize(struct x86_emulate_ctxt *ctxt, struct segmented_address addr, - unsigned size, bool write, bool fetch, + unsigned *max_size, unsigned size, + bool write, bool fetch, ulong *linear) { struct desc_struct desc; @@ -652,10 +655,15 @@ static int __linearize(struct x86_emulate_ctxt *ctxt, unsigned cpl; la = seg_base(ctxt, addr.seg) + addr.ea; + *max_size = 0; switch (ctxt->mode) { case X86EMUL_MODE_PROT64: if (((signed long)la << 16) >> 16 != la) return emulate_gp(ctxt, 0); + + *max_size = min_t(u64, ~0u, (1ull << 48) - la); + if (size > *max_size) + goto bad; break; default: usable = ctxt->ops->get_segment(ctxt, &sel, &desc, NULL, @@ -673,20 +681,25 @@ static int __linearize(struct x86_emulate_ctxt *ctxt, if ((ctxt->mode == X86EMUL_MODE_REAL) && !fetch && (ctxt->d & NoBigReal)) { /* la is between zero and 0xffff */ - if (la > 0xffff || (u32)(la + size - 1) > 0xffff) + if (la > 0xffff) goto bad; + *max_size = 0x10000 - la; } else if ((desc.type & 8) || !(desc.type & 4)) { /* expand-up segment */ - if (addr.ea > lim || (u32)(addr.ea + size - 1) > lim) + if (addr.ea > lim) goto bad; + *max_size = min_t(u64, ~0u, (u64)lim + 1 - addr.ea); } else { /* expand-down segment */ - if (addr.ea <= lim || (u32)(addr.ea + size - 1) <= lim) + if (addr.ea <= lim) goto bad; lim = desc.d ? 0xffffffff : 0xffff; - if (addr.ea > lim || (u32)(addr.ea + size - 1) > lim) + if (addr.ea > lim) goto bad; + *max_size = min_t(u64, ~0u, (u64)lim + 1 - addr.ea); } + if (size > *max_size) + goto bad; cpl = ctxt->ops->cpl(ctxt); if (!(desc.type & 8)) { /* data segment */ @@ -711,9 +724,9 @@ static int __linearize(struct x86_emulate_ctxt *ctxt, return X86EMUL_CONTINUE; bad: if (addr.seg == VCPU_SREG_SS) - return emulate_ss(ctxt, sel); + return emulate_ss(ctxt, 0); else - return emulate_gp(ctxt, sel); + return emulate_gp(ctxt, 0); } static int linearize(struct x86_emulate_ctxt *ctxt, @@ -721,7 +734,8 @@ static int linearize(struct x86_emulate_ctxt *ctxt, unsigned size, bool write, ulong *linear) { - return __linearize(ctxt, addr, size, write, false, linear); + unsigned max_size; + return __linearize(ctxt, addr, &max_size, size, write, false, linear); } @@ -746,17 +760,27 @@ static int segmented_read_std(struct x86_emulate_ctxt *ctxt, static int __do_insn_fetch_bytes(struct x86_emulate_ctxt *ctxt, int op_size) { int rc; - unsigned size; + unsigned size, max_size; unsigned long linear; int cur_size = ctxt->fetch.end - ctxt->fetch.data; struct segmented_address addr = { .seg = VCPU_SREG_CS, .ea = ctxt->eip + cur_size }; - size = 15UL ^ cur_size; - rc = __linearize(ctxt, addr, size, false, true, &linear); + /* + * We do not know exactly how many bytes will be needed, and + * __linearize is expensive, so fetch as much as possible. We + * just have to avoid going beyond the 15 byte limit, the end + * of the segment, or the end of the page. + * + * __linearize is called with size 0 so that it does not do any + * boundary check itself. Instead, we use max_size to check + * against op_size. + */ + rc = __linearize(ctxt, addr, &max_size, 0, false, true, &linear); if (unlikely(rc != X86EMUL_CONTINUE)) return rc; + size = min_t(unsigned, 15UL ^ cur_size, max_size); size = min_t(unsigned, size, PAGE_SIZE - offset_in_page(linear)); /* @@ -766,7 +790,8 @@ static int __do_insn_fetch_bytes(struct x86_emulate_ctxt *ctxt, int op_size) * still, we must have hit the 15-byte boundary. */ if (unlikely(size < op_size)) - return X86EMUL_UNHANDLEABLE; + return emulate_gp(ctxt, 0); + rc = ctxt->ops->fetch(ctxt, linear, ctxt->fetch.end, size, &ctxt->exception); if (unlikely(rc != X86EMUL_CONTINUE)) @@ -2012,7 +2037,7 @@ static int em_jmp_far(struct x86_emulate_ctxt *ctxt) rc = assign_eip_far(ctxt, ctxt->src.val, new_desc.l); if (rc != X86EMUL_CONTINUE) { - WARN_ON(!ctxt->mode != X86EMUL_MODE_PROT64); + WARN_ON(ctxt->mode != X86EMUL_MODE_PROT64); /* assigning eip failed; restore the old cs */ ops->set_segment(ctxt, old_sel, &old_desc, 0, VCPU_SREG_CS); return rc; @@ -2109,7 +2134,7 @@ static int em_ret_far(struct x86_emulate_ctxt *ctxt) return rc; rc = assign_eip_far(ctxt, eip, new_desc.l); if (rc != X86EMUL_CONTINUE) { - WARN_ON(!ctxt->mode != X86EMUL_MODE_PROT64); + WARN_ON(ctxt->mode != X86EMUL_MODE_PROT64); ops->set_segment(ctxt, old_cs, &old_desc, 0, VCPU_SREG_CS); } return rc; @@ -4262,6 +4287,7 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, fetch_register_operand(op); break; case OpCL: + op->type = OP_IMM; op->bytes = 1; op->val = reg_read(ctxt, VCPU_REGS_RCX) & 0xff; break; @@ -4269,6 +4295,7 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, rc = decode_imm(ctxt, op, 1, true); break; case OpOne: + op->type = OP_IMM; op->bytes = 1; op->val = 1; break; @@ -4327,21 +4354,27 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, ctxt->memop.bytes = ctxt->op_bytes + 2; goto mem_common; case OpES: + op->type = OP_IMM; op->val = VCPU_SREG_ES; break; case OpCS: + op->type = OP_IMM; op->val = VCPU_SREG_CS; break; case OpSS: + op->type = OP_IMM; op->val = VCPU_SREG_SS; break; case OpDS: + op->type = OP_IMM; op->val = VCPU_SREG_DS; break; case OpFS: + op->type = OP_IMM; op->val = VCPU_SREG_FS; break; case OpGS: + op->type = OP_IMM; op->val = VCPU_SREG_GS; break; case OpImplicit: diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index ac1c4de..978f402 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -630,7 +630,7 @@ static int mmu_spte_clear_track_bits(u64 *sptep) * kvm mmu, before reclaiming the page, we should * unmap it from mmu first. */ - WARN_ON(!kvm_is_mmio_pfn(pfn) && !page_count(pfn_to_page(pfn))); + WARN_ON(!kvm_is_reserved_pfn(pfn) && !page_count(pfn_to_page(pfn))); if (!shadow_accessed_mask || old_spte & shadow_accessed_mask) kvm_set_pfn_accessed(pfn); @@ -2461,7 +2461,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, spte |= PT_PAGE_SIZE_MASK; if (tdp_enabled) spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn, - kvm_is_mmio_pfn(pfn)); + kvm_is_reserved_pfn(pfn)); if (host_writable) spte |= SPTE_HOST_WRITEABLE; @@ -2737,7 +2737,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, * PT_PAGE_TABLE_LEVEL and there would be no adjustment done * here. */ - if (!is_error_noslot_pfn(pfn) && !kvm_is_mmio_pfn(pfn) && + if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) && level == PT_PAGE_TABLE_LEVEL && PageTransCompound(pfn_to_page(pfn)) && !has_wrprotected_page(vcpu->kvm, gfn, PT_DIRECTORY_LEVEL)) { diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a8b76c4..3e556c6 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4579,7 +4579,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmcs_write32(TPR_THRESHOLD, 0); } - kvm_vcpu_reload_apic_access_page(vcpu); + kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); if (vmx_vm_has_apicv(vcpu->kvm)) memset(&vmx->pi_desc, 0, sizeof(struct pi_desc)); @@ -6426,6 +6426,8 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) const unsigned long *fields = shadow_read_write_fields; const int num_fields = max_shadow_read_write_fields; + preempt_disable(); + vmcs_load(shadow_vmcs); for (i = 0; i < num_fields; i++) { @@ -6449,6 +6451,8 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) vmcs_clear(shadow_vmcs); vmcs_load(vmx->loaded_vmcs->vmcs); + + preempt_enable(); } static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) diff --git a/arch/x86/lib/csum-wrappers_64.c b/arch/x86/lib/csum-wrappers_64.c index 7609e0e..1318f75 100644 --- a/arch/x86/lib/csum-wrappers_64.c +++ b/arch/x86/lib/csum-wrappers_64.c @@ -41,9 +41,8 @@ csum_partial_copy_from_user(const void __user *src, void *dst, while (((unsigned long)src & 6) && len >= 2) { __u16 val16; - *errp = __get_user(val16, (const __u16 __user *)src); - if (*errp) - return isum; + if (__get_user(val16, (const __u16 __user *)src)) + goto out_err; *(__u16 *)dst = val16; isum = (__force __wsum)add32_with_carry( diff --git a/arch/x86/lib/insn.c b/arch/x86/lib/insn.c index 54fcffe..2480978b 100644 --- a/arch/x86/lib/insn.c +++ b/arch/x86/lib/insn.c @@ -28,7 +28,7 @@ /* Verify next sizeof(t) bytes can be on the same instruction */ #define validate_next(t, insn, n) \ - ((insn)->next_byte + sizeof(t) + n - (insn)->kaddr <= MAX_INSN_SIZE) + ((insn)->next_byte + sizeof(t) + n < (insn)->end_kaddr) #define __get_next(t, insn) \ ({ t r = *(t*)insn->next_byte; insn->next_byte += sizeof(t); r; }) @@ -50,10 +50,11 @@ * @kaddr: address (in kernel memory) of instruction (or copy thereof) * @x86_64: !0 for 64-bit kernel or 64-bit app */ -void insn_init(struct insn *insn, const void *kaddr, int x86_64) +void insn_init(struct insn *insn, const void *kaddr, int buf_len, int x86_64) { memset(insn, 0, sizeof(*insn)); insn->kaddr = kaddr; + insn->end_kaddr = kaddr + buf_len; insn->next_byte = kaddr; insn->x86_64 = x86_64 ? 1 : 0; insn->opnd_bytes = 4; diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 6a19ad9..ecfdc46 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -30,3 +30,5 @@ obj-$(CONFIG_ACPI_NUMA) += srat.o obj-$(CONFIG_NUMA_EMU) += numa_emulation.o obj-$(CONFIG_MEMTEST) += memtest.o + +obj-$(CONFIG_X86_INTEL_MPX) += mpx.o diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 4cb8763..4e5dfec 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1123,7 +1123,7 @@ void mark_rodata_ro(void) unsigned long end = (unsigned long) &__end_rodata_hpage_align; unsigned long text_end = PFN_ALIGN(&__stop___ex_table); unsigned long rodata_end = PFN_ALIGN(&__end_rodata); - unsigned long all_end = PFN_ALIGN(&_end); + unsigned long all_end; printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", (end - start) >> 10); @@ -1134,7 +1134,16 @@ void mark_rodata_ro(void) /* * The rodata/data/bss/brk section (but not the kernel text!) * should also be not-executable. + * + * We align all_end to PMD_SIZE because the existing mapping + * is a full PMD. If we would align _brk_end to PAGE_SIZE we + * split the PMD and the reminder between _brk_end and the end + * of the PMD will remain mapped executable. + * + * Any PMD which was setup after the one which covers _brk_end + * has been zapped already via cleanup_highmem(). */ + all_end = roundup((unsigned long)_brk_end, PMD_SIZE); set_memory_nx(rodata_start, (all_end - rodata_start) >> PAGE_SHIFT); rodata_test(); diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index af78e50..b12f43c 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -327,7 +327,7 @@ EXPORT_SYMBOL(iounmap); * Convert a physical pointer to a virtual kernel pointer for /dev/mem * access */ -void *xlate_dev_mem_ptr(unsigned long phys) +void *xlate_dev_mem_ptr(phys_addr_t phys) { void *addr; unsigned long start = phys & PAGE_MASK; @@ -343,7 +343,7 @@ void *xlate_dev_mem_ptr(unsigned long phys) return addr; } -void unxlate_dev_mem_ptr(unsigned long phys, void *addr) +void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr) { if (page_is_ram(phys >> PAGE_SHIFT)) return; diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c new file mode 100644 index 0000000..67ebf57 --- /dev/null +++ b/arch/x86/mm/mpx.c @@ -0,0 +1,928 @@ +/* + * mpx.c - Memory Protection eXtensions + * + * Copyright (c) 2014, Intel Corporation. + * Qiaowei Ren <qiaowei.ren@intel.com> + * Dave Hansen <dave.hansen@intel.com> + */ +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/syscalls.h> +#include <linux/sched/sysctl.h> + +#include <asm/i387.h> +#include <asm/insn.h> +#include <asm/mman.h> +#include <asm/mmu_context.h> +#include <asm/mpx.h> +#include <asm/processor.h> +#include <asm/fpu-internal.h> + +static const char *mpx_mapping_name(struct vm_area_struct *vma) +{ + return "[mpx]"; +} + +static struct vm_operations_struct mpx_vma_ops = { + .name = mpx_mapping_name, +}; + +static int is_mpx_vma(struct vm_area_struct *vma) +{ + return (vma->vm_ops == &mpx_vma_ops); +} + +/* + * This is really a simplified "vm_mmap". it only handles MPX + * bounds tables (the bounds directory is user-allocated). + * + * Later on, we use the vma->vm_ops to uniquely identify these + * VMAs. + */ +static unsigned long mpx_mmap(unsigned long len) +{ + unsigned long ret; + unsigned long addr, pgoff; + struct mm_struct *mm = current->mm; + vm_flags_t vm_flags; + struct vm_area_struct *vma; + + /* Only bounds table and bounds directory can be allocated here */ + if (len != MPX_BD_SIZE_BYTES && len != MPX_BT_SIZE_BYTES) + return -EINVAL; + + down_write(&mm->mmap_sem); + + /* Too many mappings? */ + if (mm->map_count > sysctl_max_map_count) { + ret = -ENOMEM; + goto out; + } + + /* Obtain the address to map to. we verify (or select) it and ensure + * that it represents a valid section of the address space. + */ + addr = get_unmapped_area(NULL, 0, len, 0, MAP_ANONYMOUS | MAP_PRIVATE); + if (addr & ~PAGE_MASK) { + ret = addr; + goto out; + } + + vm_flags = VM_READ | VM_WRITE | VM_MPX | + mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; + + /* Set pgoff according to addr for anon_vma */ + pgoff = addr >> PAGE_SHIFT; + + ret = mmap_region(NULL, addr, len, vm_flags, pgoff); + if (IS_ERR_VALUE(ret)) + goto out; + + vma = find_vma(mm, ret); + if (!vma) { + ret = -ENOMEM; + goto out; + } + vma->vm_ops = &mpx_vma_ops; + + if (vm_flags & VM_LOCKED) { + up_write(&mm->mmap_sem); + mm_populate(ret, len); + return ret; + } + +out: + up_write(&mm->mmap_sem); + return ret; +} + +enum reg_type { + REG_TYPE_RM = 0, + REG_TYPE_INDEX, + REG_TYPE_BASE, +}; + +static int get_reg_offset(struct insn *insn, struct pt_regs *regs, + enum reg_type type) +{ + int regno = 0; + + static const int regoff[] = { + offsetof(struct pt_regs, ax), + offsetof(struct pt_regs, cx), + offsetof(struct pt_regs, dx), + offsetof(struct pt_regs, bx), + offsetof(struct pt_regs, sp), + offsetof(struct pt_regs, bp), + offsetof(struct pt_regs, si), + offsetof(struct pt_regs, di), +#ifdef CONFIG_X86_64 + offsetof(struct pt_regs, r8), + offsetof(struct pt_regs, r9), + offsetof(struct pt_regs, r10), + offsetof(struct pt_regs, r11), + offsetof(struct pt_regs, r12), + offsetof(struct pt_regs, r13), + offsetof(struct pt_regs, r14), + offsetof(struct pt_regs, r15), +#endif + }; + int nr_registers = ARRAY_SIZE(regoff); + /* + * Don't possibly decode a 32-bit instructions as + * reading a 64-bit-only register. + */ + if (IS_ENABLED(CONFIG_X86_64) && !insn->x86_64) + nr_registers -= 8; + + switch (type) { + case REG_TYPE_RM: + regno = X86_MODRM_RM(insn->modrm.value); + if (X86_REX_B(insn->rex_prefix.value) == 1) + regno += 8; + break; + + case REG_TYPE_INDEX: + regno = X86_SIB_INDEX(insn->sib.value); + if (X86_REX_X(insn->rex_prefix.value) == 1) + regno += 8; + break; + + case REG_TYPE_BASE: + regno = X86_SIB_BASE(insn->sib.value); + if (X86_REX_B(insn->rex_prefix.value) == 1) + regno += 8; + break; + + default: + pr_err("invalid register type"); + BUG(); + break; + } + + if (regno > nr_registers) { + WARN_ONCE(1, "decoded an instruction with an invalid register"); + return -EINVAL; + } + return regoff[regno]; +} + +/* + * return the address being referenced be instruction + * for rm=3 returning the content of the rm reg + * for rm!=3 calculates the address using SIB and Disp + */ +static void __user *mpx_get_addr_ref(struct insn *insn, struct pt_regs *regs) +{ + unsigned long addr, base, indx; + int addr_offset, base_offset, indx_offset; + insn_byte_t sib; + + insn_get_modrm(insn); + insn_get_sib(insn); + sib = insn->sib.value; + + if (X86_MODRM_MOD(insn->modrm.value) == 3) { + addr_offset = get_reg_offset(insn, regs, REG_TYPE_RM); + if (addr_offset < 0) + goto out_err; + addr = regs_get_register(regs, addr_offset); + } else { + if (insn->sib.nbytes) { + base_offset = get_reg_offset(insn, regs, REG_TYPE_BASE); + if (base_offset < 0) + goto out_err; + + indx_offset = get_reg_offset(insn, regs, REG_TYPE_INDEX); + if (indx_offset < 0) + goto out_err; + + base = regs_get_register(regs, base_offset); + indx = regs_get_register(regs, indx_offset); + addr = base + indx * (1 << X86_SIB_SCALE(sib)); + } else { + addr_offset = get_reg_offset(insn, regs, REG_TYPE_RM); + if (addr_offset < 0) + goto out_err; + addr = regs_get_register(regs, addr_offset); + } + addr += insn->displacement.value; + } + return (void __user *)addr; +out_err: + return (void __user *)-1; +} + +static int mpx_insn_decode(struct insn *insn, + struct pt_regs *regs) +{ + unsigned char buf[MAX_INSN_SIZE]; + int x86_64 = !test_thread_flag(TIF_IA32); + int not_copied; + int nr_copied; + + not_copied = copy_from_user(buf, (void __user *)regs->ip, sizeof(buf)); + nr_copied = sizeof(buf) - not_copied; + /* + * The decoder _should_ fail nicely if we pass it a short buffer. + * But, let's not depend on that implementation detail. If we + * did not get anything, just error out now. + */ + if (!nr_copied) + return -EFAULT; + insn_init(insn, buf, nr_copied, x86_64); + insn_get_length(insn); + /* + * copy_from_user() tries to get as many bytes as we could see in + * the largest possible instruction. If the instruction we are + * after is shorter than that _and_ we attempt to copy from + * something unreadable, we might get a short read. This is OK + * as long as the read did not stop in the middle of the + * instruction. Check to see if we got a partial instruction. + */ + if (nr_copied < insn->length) + return -EFAULT; + + insn_get_opcode(insn); + /* + * We only _really_ need to decode bndcl/bndcn/bndcu + * Error out on anything else. + */ + if (insn->opcode.bytes[0] != 0x0f) + goto bad_opcode; + if ((insn->opcode.bytes[1] != 0x1a) && + (insn->opcode.bytes[1] != 0x1b)) + goto bad_opcode; + + return 0; +bad_opcode: + return -EINVAL; +} + +/* + * If a bounds overflow occurs then a #BR is generated. This + * function decodes MPX instructions to get violation address + * and set this address into extended struct siginfo. + * + * Note that this is not a super precise way of doing this. + * Userspace could have, by the time we get here, written + * anything it wants in to the instructions. We can not + * trust anything about it. They might not be valid + * instructions or might encode invalid registers, etc... + * + * The caller is expected to kfree() the returned siginfo_t. + */ +siginfo_t *mpx_generate_siginfo(struct pt_regs *regs, + struct xsave_struct *xsave_buf) +{ + struct bndreg *bndregs, *bndreg; + siginfo_t *info = NULL; + struct insn insn; + uint8_t bndregno; + int err; + + err = mpx_insn_decode(&insn, regs); + if (err) + goto err_out; + + /* + * We know at this point that we are only dealing with + * MPX instructions. + */ + insn_get_modrm(&insn); + bndregno = X86_MODRM_REG(insn.modrm.value); + if (bndregno > 3) { + err = -EINVAL; + goto err_out; + } + /* get the bndregs _area_ of the xsave structure */ + bndregs = get_xsave_addr(xsave_buf, XSTATE_BNDREGS); + if (!bndregs) { + err = -EINVAL; + goto err_out; + } + /* now go select the individual register in the set of 4 */ + bndreg = &bndregs[bndregno]; + + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) { + err = -ENOMEM; + goto err_out; + } + /* + * The registers are always 64-bit, but the upper 32 + * bits are ignored in 32-bit mode. Also, note that the + * upper bounds are architecturally represented in 1's + * complement form. + * + * The 'unsigned long' cast is because the compiler + * complains when casting from integers to different-size + * pointers. + */ + info->si_lower = (void __user *)(unsigned long)bndreg->lower_bound; + info->si_upper = (void __user *)(unsigned long)~bndreg->upper_bound; + info->si_addr_lsb = 0; + info->si_signo = SIGSEGV; + info->si_errno = 0; + info->si_code = SEGV_BNDERR; + info->si_addr = mpx_get_addr_ref(&insn, regs); + /* + * We were not able to extract an address from the instruction, + * probably because there was something invalid in it. + */ + if (info->si_addr == (void *)-1) { + err = -EINVAL; + goto err_out; + } + return info; +err_out: + /* info might be NULL, but kfree() handles that */ + kfree(info); + return ERR_PTR(err); +} + +static __user void *task_get_bounds_dir(struct task_struct *tsk) +{ + struct bndcsr *bndcsr; + + if (!cpu_feature_enabled(X86_FEATURE_MPX)) + return MPX_INVALID_BOUNDS_DIR; + + /* + * The bounds directory pointer is stored in a register + * only accessible if we first do an xsave. + */ + fpu_save_init(&tsk->thread.fpu); + bndcsr = get_xsave_addr(&tsk->thread.fpu.state->xsave, XSTATE_BNDCSR); + if (!bndcsr) + return MPX_INVALID_BOUNDS_DIR; + + /* + * Make sure the register looks valid by checking the + * enable bit. + */ + if (!(bndcsr->bndcfgu & MPX_BNDCFG_ENABLE_FLAG)) + return MPX_INVALID_BOUNDS_DIR; + + /* + * Lastly, mask off the low bits used for configuration + * flags, and return the address of the bounds table. + */ + return (void __user *)(unsigned long) + (bndcsr->bndcfgu & MPX_BNDCFG_ADDR_MASK); +} + +int mpx_enable_management(struct task_struct *tsk) +{ + void __user *bd_base = MPX_INVALID_BOUNDS_DIR; + struct mm_struct *mm = tsk->mm; + int ret = 0; + + /* + * runtime in the userspace will be responsible for allocation of + * the bounds directory. Then, it will save the base of the bounds + * directory into XSAVE/XRSTOR Save Area and enable MPX through + * XRSTOR instruction. + * + * fpu_xsave() is expected to be very expensive. Storing the bounds + * directory here means that we do not have to do xsave in the unmap + * path; we can just use mm->bd_addr instead. + */ + bd_base = task_get_bounds_dir(tsk); + down_write(&mm->mmap_sem); + mm->bd_addr = bd_base; + if (mm->bd_addr == MPX_INVALID_BOUNDS_DIR) + ret = -ENXIO; + + up_write(&mm->mmap_sem); + return ret; +} + +int mpx_disable_management(struct task_struct *tsk) +{ + struct mm_struct *mm = current->mm; + + if (!cpu_feature_enabled(X86_FEATURE_MPX)) + return -ENXIO; + + down_write(&mm->mmap_sem); + mm->bd_addr = MPX_INVALID_BOUNDS_DIR; + up_write(&mm->mmap_sem); + return 0; +} + +/* + * With 32-bit mode, MPX_BT_SIZE_BYTES is 4MB, and the size of each + * bounds table is 16KB. With 64-bit mode, MPX_BT_SIZE_BYTES is 2GB, + * and the size of each bounds table is 4MB. + */ +static int allocate_bt(long __user *bd_entry) +{ + unsigned long expected_old_val = 0; + unsigned long actual_old_val = 0; + unsigned long bt_addr; + int ret = 0; + + /* + * Carve the virtual space out of userspace for the new + * bounds table: + */ + bt_addr = mpx_mmap(MPX_BT_SIZE_BYTES); + if (IS_ERR((void *)bt_addr)) + return PTR_ERR((void *)bt_addr); + /* + * Set the valid flag (kinda like _PAGE_PRESENT in a pte) + */ + bt_addr = bt_addr | MPX_BD_ENTRY_VALID_FLAG; + + /* + * Go poke the address of the new bounds table in to the + * bounds directory entry out in userspace memory. Note: + * we may race with another CPU instantiating the same table. + * In that case the cmpxchg will see an unexpected + * 'actual_old_val'. + * + * This can fault, but that's OK because we do not hold + * mmap_sem at this point, unlike some of the other part + * of the MPX code that have to pagefault_disable(). + */ + ret = user_atomic_cmpxchg_inatomic(&actual_old_val, bd_entry, + expected_old_val, bt_addr); + if (ret) + goto out_unmap; + + /* + * The user_atomic_cmpxchg_inatomic() will only return nonzero + * for faults, *not* if the cmpxchg itself fails. Now we must + * verify that the cmpxchg itself completed successfully. + */ + /* + * We expected an empty 'expected_old_val', but instead found + * an apparently valid entry. Assume we raced with another + * thread to instantiate this table and desclare succecss. + */ + if (actual_old_val & MPX_BD_ENTRY_VALID_FLAG) { + ret = 0; + goto out_unmap; + } + /* + * We found a non-empty bd_entry but it did not have the + * VALID_FLAG set. Return an error which will result in + * a SEGV since this probably means that somebody scribbled + * some invalid data in to a bounds table. + */ + if (expected_old_val != actual_old_val) { + ret = -EINVAL; + goto out_unmap; + } + return 0; +out_unmap: + vm_munmap(bt_addr & MPX_BT_ADDR_MASK, MPX_BT_SIZE_BYTES); + return ret; +} + +/* + * When a BNDSTX instruction attempts to save bounds to a bounds + * table, it will first attempt to look up the table in the + * first-level bounds directory. If it does not find a table in + * the directory, a #BR is generated and we get here in order to + * allocate a new table. + * + * With 32-bit mode, the size of BD is 4MB, and the size of each + * bound table is 16KB. With 64-bit mode, the size of BD is 2GB, + * and the size of each bound table is 4MB. + */ +static int do_mpx_bt_fault(struct xsave_struct *xsave_buf) +{ + unsigned long bd_entry, bd_base; + struct bndcsr *bndcsr; + + bndcsr = get_xsave_addr(xsave_buf, XSTATE_BNDCSR); + if (!bndcsr) + return -EINVAL; + /* + * Mask off the preserve and enable bits + */ + bd_base = bndcsr->bndcfgu & MPX_BNDCFG_ADDR_MASK; + /* + * The hardware provides the address of the missing or invalid + * entry via BNDSTATUS, so we don't have to go look it up. + */ + bd_entry = bndcsr->bndstatus & MPX_BNDSTA_ADDR_MASK; + /* + * Make sure the directory entry is within where we think + * the directory is. + */ + if ((bd_entry < bd_base) || + (bd_entry >= bd_base + MPX_BD_SIZE_BYTES)) + return -EINVAL; + + return allocate_bt((long __user *)bd_entry); +} + +int mpx_handle_bd_fault(struct xsave_struct *xsave_buf) +{ + /* + * Userspace never asked us to manage the bounds tables, + * so refuse to help. + */ + if (!kernel_managing_mpx_tables(current->mm)) + return -EINVAL; + + if (do_mpx_bt_fault(xsave_buf)) { + force_sig(SIGSEGV, current); + /* + * The force_sig() is essentially "handling" this + * exception, so we do not pass up the error + * from do_mpx_bt_fault(). + */ + } + return 0; +} + +/* + * A thin wrapper around get_user_pages(). Returns 0 if the + * fault was resolved or -errno if not. + */ +static int mpx_resolve_fault(long __user *addr, int write) +{ + long gup_ret; + int nr_pages = 1; + int force = 0; + + gup_ret = get_user_pages(current, current->mm, (unsigned long)addr, + nr_pages, write, force, NULL, NULL); + /* + * get_user_pages() returns number of pages gotten. + * 0 means we failed to fault in and get anything, + * probably because 'addr' is bad. + */ + if (!gup_ret) + return -EFAULT; + /* Other error, return it */ + if (gup_ret < 0) + return gup_ret; + /* must have gup'd a page and gup_ret>0, success */ + return 0; +} + +/* + * Get the base of bounds tables pointed by specific bounds + * directory entry. + */ +static int get_bt_addr(struct mm_struct *mm, + long __user *bd_entry, unsigned long *bt_addr) +{ + int ret; + int valid_bit; + + if (!access_ok(VERIFY_READ, (bd_entry), sizeof(*bd_entry))) + return -EFAULT; + + while (1) { + int need_write = 0; + + pagefault_disable(); + ret = get_user(*bt_addr, bd_entry); + pagefault_enable(); + if (!ret) + break; + if (ret == -EFAULT) + ret = mpx_resolve_fault(bd_entry, need_write); + /* + * If we could not resolve the fault, consider it + * userspace's fault and error out. + */ + if (ret) + return ret; + } + + valid_bit = *bt_addr & MPX_BD_ENTRY_VALID_FLAG; + *bt_addr &= MPX_BT_ADDR_MASK; + + /* + * When the kernel is managing bounds tables, a bounds directory + * entry will either have a valid address (plus the valid bit) + * *OR* be completely empty. If we see a !valid entry *and* some + * data in the address field, we know something is wrong. This + * -EINVAL return will cause a SIGSEGV. + */ + if (!valid_bit && *bt_addr) + return -EINVAL; + /* + * Do we have an completely zeroed bt entry? That is OK. It + * just means there was no bounds table for this memory. Make + * sure to distinguish this from -EINVAL, which will cause + * a SEGV. + */ + if (!valid_bit) + return -ENOENT; + + return 0; +} + +/* + * Free the backing physical pages of bounds table 'bt_addr'. + * Assume start...end is within that bounds table. + */ +static int zap_bt_entries(struct mm_struct *mm, + unsigned long bt_addr, + unsigned long start, unsigned long end) +{ + struct vm_area_struct *vma; + unsigned long addr, len; + + /* + * Find the first overlapping vma. If vma->vm_start > start, there + * will be a hole in the bounds table. This -EINVAL return will + * cause a SIGSEGV. + */ + vma = find_vma(mm, start); + if (!vma || vma->vm_start > start) + return -EINVAL; + + /* + * A NUMA policy on a VM_MPX VMA could cause this bouds table to + * be split. So we need to look across the entire 'start -> end' + * range of this bounds table, find all of the VM_MPX VMAs, and + * zap only those. + */ + addr = start; + while (vma && vma->vm_start < end) { + /* + * We followed a bounds directory entry down + * here. If we find a non-MPX VMA, that's bad, + * so stop immediately and return an error. This + * probably results in a SIGSEGV. + */ + if (!is_mpx_vma(vma)) + return -EINVAL; + + len = min(vma->vm_end, end) - addr; + zap_page_range(vma, addr, len, NULL); + + vma = vma->vm_next; + addr = vma->vm_start; + } + + return 0; +} + +static int unmap_single_bt(struct mm_struct *mm, + long __user *bd_entry, unsigned long bt_addr) +{ + unsigned long expected_old_val = bt_addr | MPX_BD_ENTRY_VALID_FLAG; + unsigned long actual_old_val = 0; + int ret; + + while (1) { + int need_write = 1; + + pagefault_disable(); + ret = user_atomic_cmpxchg_inatomic(&actual_old_val, bd_entry, + expected_old_val, 0); + pagefault_enable(); + if (!ret) + break; + if (ret == -EFAULT) + ret = mpx_resolve_fault(bd_entry, need_write); + /* + * If we could not resolve the fault, consider it + * userspace's fault and error out. + */ + if (ret) + return ret; + } + /* + * The cmpxchg was performed, check the results. + */ + if (actual_old_val != expected_old_val) { + /* + * Someone else raced with us to unmap the table. + * There was no bounds table pointed to by the + * directory, so declare success. Somebody freed + * it. + */ + if (!actual_old_val) + return 0; + /* + * Something messed with the bounds directory + * entry. We hold mmap_sem for read or write + * here, so it could not be a _new_ bounds table + * that someone just allocated. Something is + * wrong, so pass up the error and SIGSEGV. + */ + return -EINVAL; + } + + /* + * Note, we are likely being called under do_munmap() already. To + * avoid recursion, do_munmap() will check whether it comes + * from one bounds table through VM_MPX flag. + */ + return do_munmap(mm, bt_addr, MPX_BT_SIZE_BYTES); +} + +/* + * If the bounds table pointed by bounds directory 'bd_entry' is + * not shared, unmap this whole bounds table. Otherwise, only free + * those backing physical pages of bounds table entries covered + * in this virtual address region start...end. + */ +static int unmap_shared_bt(struct mm_struct *mm, + long __user *bd_entry, unsigned long start, + unsigned long end, bool prev_shared, bool next_shared) +{ + unsigned long bt_addr; + int ret; + + ret = get_bt_addr(mm, bd_entry, &bt_addr); + /* + * We could see an "error" ret for not-present bounds + * tables (not really an error), or actual errors, but + * stop unmapping either way. + */ + if (ret) + return ret; + + if (prev_shared && next_shared) + ret = zap_bt_entries(mm, bt_addr, + bt_addr+MPX_GET_BT_ENTRY_OFFSET(start), + bt_addr+MPX_GET_BT_ENTRY_OFFSET(end)); + else if (prev_shared) + ret = zap_bt_entries(mm, bt_addr, + bt_addr+MPX_GET_BT_ENTRY_OFFSET(start), + bt_addr+MPX_BT_SIZE_BYTES); + else if (next_shared) + ret = zap_bt_entries(mm, bt_addr, bt_addr, + bt_addr+MPX_GET_BT_ENTRY_OFFSET(end)); + else + ret = unmap_single_bt(mm, bd_entry, bt_addr); + + return ret; +} + +/* + * A virtual address region being munmap()ed might share bounds table + * with adjacent VMAs. We only need to free the backing physical + * memory of these shared bounds tables entries covered in this virtual + * address region. + */ +static int unmap_edge_bts(struct mm_struct *mm, + unsigned long start, unsigned long end) +{ + int ret; + long __user *bde_start, *bde_end; + struct vm_area_struct *prev, *next; + bool prev_shared = false, next_shared = false; + + bde_start = mm->bd_addr + MPX_GET_BD_ENTRY_OFFSET(start); + bde_end = mm->bd_addr + MPX_GET_BD_ENTRY_OFFSET(end-1); + + /* + * Check whether bde_start and bde_end are shared with adjacent + * VMAs. + * + * We already unliked the VMAs from the mm's rbtree so 'start' + * is guaranteed to be in a hole. This gets us the first VMA + * before the hole in to 'prev' and the next VMA after the hole + * in to 'next'. + */ + next = find_vma_prev(mm, start, &prev); + if (prev && (mm->bd_addr + MPX_GET_BD_ENTRY_OFFSET(prev->vm_end-1)) + == bde_start) + prev_shared = true; + if (next && (mm->bd_addr + MPX_GET_BD_ENTRY_OFFSET(next->vm_start)) + == bde_end) + next_shared = true; + + /* + * This virtual address region being munmap()ed is only + * covered by one bounds table. + * + * In this case, if this table is also shared with adjacent + * VMAs, only part of the backing physical memory of the bounds + * table need be freeed. Otherwise the whole bounds table need + * be unmapped. + */ + if (bde_start == bde_end) { + return unmap_shared_bt(mm, bde_start, start, end, + prev_shared, next_shared); + } + + /* + * If more than one bounds tables are covered in this virtual + * address region being munmap()ed, we need to separately check + * whether bde_start and bde_end are shared with adjacent VMAs. + */ + ret = unmap_shared_bt(mm, bde_start, start, end, prev_shared, false); + if (ret) + return ret; + ret = unmap_shared_bt(mm, bde_end, start, end, false, next_shared); + if (ret) + return ret; + + return 0; +} + +static int mpx_unmap_tables(struct mm_struct *mm, + unsigned long start, unsigned long end) +{ + int ret; + long __user *bd_entry, *bde_start, *bde_end; + unsigned long bt_addr; + + /* + * "Edge" bounds tables are those which are being used by the region + * (start -> end), but that may be shared with adjacent areas. If they + * turn out to be completely unshared, they will be freed. If they are + * shared, we will free the backing store (like an MADV_DONTNEED) for + * areas used by this region. + */ + ret = unmap_edge_bts(mm, start, end); + switch (ret) { + /* non-present tables are OK */ + case 0: + case -ENOENT: + /* Success, or no tables to unmap */ + break; + case -EINVAL: + case -EFAULT: + default: + return ret; + } + + /* + * Only unmap the bounds table that are + * 1. fully covered + * 2. not at the edges of the mapping, even if full aligned + */ + bde_start = mm->bd_addr + MPX_GET_BD_ENTRY_OFFSET(start); + bde_end = mm->bd_addr + MPX_GET_BD_ENTRY_OFFSET(end-1); + for (bd_entry = bde_start + 1; bd_entry < bde_end; bd_entry++) { + ret = get_bt_addr(mm, bd_entry, &bt_addr); + switch (ret) { + case 0: + break; + case -ENOENT: + /* No table here, try the next one */ + continue; + case -EINVAL: + case -EFAULT: + default: + /* + * Note: we are being strict here. + * Any time we run in to an issue + * unmapping tables, we stop and + * SIGSEGV. + */ + return ret; + } + + ret = unmap_single_bt(mm, bd_entry, bt_addr); + if (ret) + return ret; + } + + return 0; +} + +/* + * Free unused bounds tables covered in a virtual address region being + * munmap()ed. Assume end > start. + * + * This function will be called by do_munmap(), and the VMAs covering + * the virtual address region start...end have already been split if + * necessary, and the 'vma' is the first vma in this range (start -> end). + */ +void mpx_notify_unmap(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + int ret; + + /* + * Refuse to do anything unless userspace has asked + * the kernel to help manage the bounds tables, + */ + if (!kernel_managing_mpx_tables(current->mm)) + return; + /* + * This will look across the entire 'start -> end' range, + * and find all of the non-VM_MPX VMAs. + * + * To avoid recursion, if a VM_MPX vma is found in the range + * (start->end), we will not continue follow-up work. This + * recursion represents having bounds tables for bounds tables, + * which should not occur normally. Being strict about it here + * helps ensure that we do not have an exploitable stack overflow. + */ + do { + if (vma->vm_flags & VM_MPX) + return; + vma = vma->vm_next; + } while (vma && vma->vm_start < end); + + ret = mpx_unmap_tables(mm, start, end); + if (ret) + force_sig(SIGSEGV, current); +} diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 6574388..c7eddbe 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -824,7 +824,7 @@ static void *memtype_seq_start(struct seq_file *seq, loff_t *pos) { if (*pos == 0) { ++*pos; - seq_printf(seq, "PAT memtype list:\n"); + seq_puts(seq, "PAT memtype list:\n"); } return memtype_get_idx(*pos); diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index 093f5f4..1819a91 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -229,7 +229,7 @@ static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) return 1; list_for_each_entry(msidesc, &dev->msi_list, list) { - __read_msi_msg(msidesc, &msg); + __pci_read_msi_msg(msidesc, &msg); pirq = MSI_ADDR_EXT_DEST_ID(msg.address_hi) | ((msg.address_lo >> MSI_ADDR_DEST_ID_SHIFT) & 0xff); if (msg.data != XEN_PIRQ_MSI_DATA || @@ -240,7 +240,7 @@ static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) goto error; } xen_msi_compose_msg(dev, pirq, &msg); - __write_msi_msg(msidesc, &msg); + __pci_write_msi_msg(msidesc, &msg); dev_dbg(&dev->dev, "xen: msi bound to pirq=%d\n", pirq); } else { dev_dbg(&dev->dev, @@ -394,14 +394,7 @@ static void xen_teardown_msi_irq(unsigned int irq) { xen_destroy_irq(irq); } -static u32 xen_nop_msi_mask_irq(struct msi_desc *desc, u32 mask, u32 flag) -{ - return 0; -} -static u32 xen_nop_msix_mask_irq(struct msi_desc *desc, u32 flag) -{ - return 0; -} + #endif int __init pci_xen_init(void) @@ -425,8 +418,7 @@ int __init pci_xen_init(void) x86_msi.setup_msi_irqs = xen_setup_msi_irqs; x86_msi.teardown_msi_irq = xen_teardown_msi_irq; x86_msi.teardown_msi_irqs = xen_teardown_msi_irqs; - x86_msi.msi_mask_irq = xen_nop_msi_mask_irq; - x86_msi.msix_mask_irq = xen_nop_msix_mask_irq; + pci_msi_ignore_mask = 1; #endif return 0; } @@ -506,8 +498,7 @@ int __init pci_xen_initial_domain(void) x86_msi.setup_msi_irqs = xen_initdom_setup_msi_irqs; x86_msi.teardown_msi_irq = xen_teardown_msi_irq; x86_msi.restore_msi_irqs = xen_initdom_restore_msi_irqs; - x86_msi.msi_mask_irq = xen_nop_msi_mask_irq; - x86_msi.msix_mask_irq = xen_nop_msix_mask_irq; + pci_msi_ignore_mask = 1; #endif xen_setup_acpi_sci(); __acpi_register_gsi = acpi_register_gsi_xen; diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index 3968d67..ab4062c 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -1367,20 +1367,18 @@ static int ptc_seq_show(struct seq_file *file, void *data) cpu = *(loff_t *)data; if (!cpu) { - seq_printf(file, - "# cpu bauoff sent stime self locals remotes ncpus localhub "); - seq_printf(file, - "remotehub numuvhubs numuvhubs16 numuvhubs8 "); - seq_printf(file, - "numuvhubs4 numuvhubs2 numuvhubs1 dto snacks retries "); - seq_printf(file, - "rok resetp resett giveup sto bz throt disable "); - seq_printf(file, - "enable wars warshw warwaits enters ipidis plugged "); - seq_printf(file, - "ipiover glim cong swack recv rtime all one mult "); - seq_printf(file, - "none retry canc nocan reset rcan\n"); + seq_puts(file, + "# cpu bauoff sent stime self locals remotes ncpus localhub "); + seq_puts(file, "remotehub numuvhubs numuvhubs16 numuvhubs8 "); + seq_puts(file, + "numuvhubs4 numuvhubs2 numuvhubs1 dto snacks retries "); + seq_puts(file, + "rok resetp resett giveup sto bz throt disable "); + seq_puts(file, + "enable wars warshw warwaits enters ipidis plugged "); + seq_puts(file, + "ipiover glim cong swack recv rtime all one mult "); + seq_puts(file, "none retry canc nocan reset rcan\n"); } if (cpu < num_possible_cpus() && cpu_online(cpu)) { bcp = &per_cpu(bau_control, cpu); diff --git a/arch/x86/tools/calc_run_size.pl b/arch/x86/tools/calc_run_size.pl new file mode 100644 index 0000000..23210ba --- /dev/null +++ b/arch/x86/tools/calc_run_size.pl @@ -0,0 +1,39 @@ +#!/usr/bin/perl +# +# Calculate the amount of space needed to run the kernel, including room for +# the .bss and .brk sections. +# +# Usage: +# objdump -h a.out | perl calc_run_size.pl +use strict; + +my $mem_size = 0; +my $file_offset = 0; + +my $sections=" *[0-9]+ \.(?:bss|brk) +"; +while (<>) { + if (/^$sections([0-9a-f]+) +(?:[0-9a-f]+ +){2}([0-9a-f]+)/) { + my $size = hex($1); + my $offset = hex($2); + $mem_size += $size; + if ($file_offset == 0) { + $file_offset = $offset; + } elsif ($file_offset != $offset) { + # BFD linker shows the same file offset in ELF. + # Gold linker shows them as consecutive. + next if ($file_offset + $mem_size == $offset + $size); + + printf STDERR "file_offset: 0x%lx\n", $file_offset; + printf STDERR "mem_size: 0x%lx\n", $mem_size; + printf STDERR "offset: 0x%lx\n", $offset; + printf STDERR "size: 0x%lx\n", $size; + + die ".bss and .brk are non-contiguous\n"; + } + } +} + +if ($file_offset == 0) { + die "Never found .bss or .brk file offset\n"; +} +printf("%d\n", $mem_size + $file_offset); diff --git a/arch/x86/tools/insn_sanity.c b/arch/x86/tools/insn_sanity.c index 872eb60..ba70ff2 100644 --- a/arch/x86/tools/insn_sanity.c +++ b/arch/x86/tools/insn_sanity.c @@ -254,7 +254,7 @@ int main(int argc, char **argv) continue; /* Decode an instruction */ - insn_init(&insn, insn_buf, x86_64); + insn_init(&insn, insn_buf, sizeof(insn_buf), x86_64); insn_get_length(&insn); if (insn.next_byte <= insn.kaddr || diff --git a/arch/x86/tools/test_get_len.c b/arch/x86/tools/test_get_len.c index 13403fc..56f04db 100644 --- a/arch/x86/tools/test_get_len.c +++ b/arch/x86/tools/test_get_len.c @@ -149,7 +149,7 @@ int main(int argc, char **argv) break; } /* Decode an instruction */ - insn_init(&insn, insn_buf, x86_64); + insn_init(&insn, insn_buf, sizeof(insn_buf), x86_64); insn_get_length(&insn); if (insn.length != nb) { warnings++; diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 8650cdb..4c071ae 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -510,6 +510,9 @@ static void xen_cpu_die(unsigned int cpu) current->state = TASK_UNINTERRUPTIBLE; schedule_timeout(HZ/10); } + + cpu_die_common(cpu); + xen_smp_intr_free(cpu); xen_uninit_lock_cpu(cpu); xen_teardown_timer(cpu); |