diff options
author | H. Peter Anvin <hpa@linux.intel.com> | 2014-06-18 15:26:19 -0700 |
---|---|---|
committer | H. Peter Anvin <hpa@linux.intel.com> | 2014-06-18 15:26:19 -0700 |
commit | 03ab3da3b215bac4ebb093c808d54596e03e3225 (patch) | |
tree | a42534bb7f314b561b362ad0b5af7eff8dbb9726 /arch/x86/kernel | |
parent | 6229ad278ca74acdbc8bd3a3d469322a3de91039 (diff) | |
parent | 7171511eaec5bf23fb06078f59784a3a0626b38f (diff) | |
download | op-kernel-dev-03ab3da3b215bac4ebb093c808d54596e03e3225.zip op-kernel-dev-03ab3da3b215bac4ebb093c808d54596e03e3225.tar.gz |
Merge tag 'v3.16-rc1' into x86/cpufeature
Linux 3.16-rc1
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
Diffstat (limited to 'arch/x86/kernel')
47 files changed, 1728 insertions, 1163 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index f4d9600..047f9ff 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -26,9 +26,11 @@ obj-$(CONFIG_IRQ_WORK) += irq_work.o obj-y += probe_roms.o obj-$(CONFIG_X86_32) += i386_ksyms_32.o obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o +obj-$(CONFIG_X86_64) += mcount_64.o obj-y += syscall_$(BITS).o vsyscall_gtod.o obj-$(CONFIG_X86_64) += vsyscall_64.o obj-$(CONFIG_X86_64) += vsyscall_emu_64.o +obj-$(CONFIG_X86_ESPFIX64) += espfix_64.o obj-$(CONFIG_SYSFS) += ksysfs.o obj-y += bootflag.o e820.o obj-y += pci-dma.o quirks.o topology.o kdebugfs.o diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index df94598..703130f 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -5,7 +5,6 @@ #include <linux/mutex.h> #include <linux/list.h> #include <linux/stringify.h> -#include <linux/kprobes.h> #include <linux/mm.h> #include <linux/vmalloc.h> #include <linux/memory.h> @@ -551,7 +550,7 @@ void *__init_or_module text_poke_early(void *addr, const void *opcode, * * Note: Must be called under text_mutex. */ -void *__kprobes text_poke(void *addr, const void *opcode, size_t len) +void *text_poke(void *addr, const void *opcode, size_t len) { unsigned long flags; char *vaddr; diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c index b574b29..8e3842f 100644 --- a/arch/x86/kernel/amd_gart_64.c +++ b/arch/x86/kernel/amd_gart_64.c @@ -512,7 +512,7 @@ gart_free_coherent(struct device *dev, size_t size, void *vaddr, dma_addr_t dma_addr, struct dma_attrs *attrs) { gart_unmap_page(dev, dma_addr, size, DMA_BIDIRECTIONAL, NULL); - free_pages((unsigned long)vaddr, get_order(size)); + dma_generic_free_coherent(dev, size, vaddr, dma_addr, attrs); } static int gart_mapping_error(struct device *dev, dma_addr_t dma_addr) diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 9fa8aa0..76164e1 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -10,6 +10,8 @@ * * Copyright 2002 Andi Kleen, SuSE Labs. */ +#define pr_fmt(fmt) "AGP: " fmt + #include <linux/kernel.h> #include <linux/types.h> #include <linux/init.h> @@ -75,14 +77,13 @@ static u32 __init allocate_aperture(void) addr = memblock_find_in_range(GART_MIN_ADDR, GART_MAX_ADDR, aper_size, aper_size); if (!addr) { - printk(KERN_ERR - "Cannot allocate aperture memory hole (%lx,%uK)\n", - addr, aper_size>>10); + pr_err("Cannot allocate aperture memory hole [mem %#010lx-%#010lx] (%uKB)\n", + addr, addr + aper_size - 1, aper_size >> 10); return 0; } memblock_reserve(addr, aper_size); - printk(KERN_INFO "Mapping aperture over %d KB of RAM @ %lx\n", - aper_size >> 10, addr); + pr_info("Mapping aperture over RAM [mem %#010lx-%#010lx] (%uKB)\n", + addr, addr + aper_size - 1, aper_size >> 10); register_nosave_region(addr >> PAGE_SHIFT, (addr+aper_size) >> PAGE_SHIFT); @@ -126,10 +127,11 @@ static u32 __init read_agp(int bus, int slot, int func, int cap, u32 *order) u64 aper; u32 old_order; - printk(KERN_INFO "AGP bridge at %02x:%02x:%02x\n", bus, slot, func); + pr_info("pci 0000:%02x:%02x:%02x: AGP bridge\n", bus, slot, func); apsizereg = read_pci_config_16(bus, slot, func, cap + 0x14); if (apsizereg == 0xffffffff) { - printk(KERN_ERR "APSIZE in AGP bridge unreadable\n"); + pr_err("pci 0000:%02x:%02x.%d: APSIZE unreadable\n", + bus, slot, func); return 0; } @@ -153,16 +155,18 @@ static u32 __init read_agp(int bus, int slot, int func, int cap, u32 *order) * On some sick chips, APSIZE is 0. It means it wants 4G * so let double check that order, and lets trust AMD NB settings: */ - printk(KERN_INFO "Aperture from AGP @ %Lx old size %u MB\n", - aper, 32 << old_order); + pr_info("pci 0000:%02x:%02x.%d: AGP aperture [bus addr %#010Lx-%#010Lx] (old size %uMB)\n", + bus, slot, func, aper, aper + (32ULL << (old_order + 20)) - 1, + 32 << old_order); if (aper + (32ULL<<(20 + *order)) > 0x100000000ULL) { - printk(KERN_INFO "Aperture size %u MB (APSIZE %x) is not right, using settings from NB\n", - 32 << *order, apsizereg); + pr_info("pci 0000:%02x:%02x.%d: AGP aperture size %uMB (APSIZE %#x) is not right, using settings from NB\n", + bus, slot, func, 32 << *order, apsizereg); *order = old_order; } - printk(KERN_INFO "Aperture from AGP @ %Lx size %u MB (APSIZE %x)\n", - aper, 32 << *order, apsizereg); + pr_info("pci 0000:%02x:%02x.%d: AGP aperture [bus addr %#010Lx-%#010Lx] (%uMB, APSIZE %#x)\n", + bus, slot, func, aper, aper + (32ULL << (*order + 20)) - 1, + 32 << *order, apsizereg); if (!aperture_valid(aper, (32*1024*1024) << *order, 32<<20)) return 0; @@ -218,7 +222,7 @@ static u32 __init search_agp_bridge(u32 *order, int *valid_agp) } } } - printk(KERN_INFO "No AGP bridge found\n"); + pr_info("No AGP bridge found\n"); return 0; } @@ -310,7 +314,8 @@ void __init early_gart_iommu_check(void) if (e820_any_mapped(aper_base, aper_base + aper_size, E820_RAM)) { /* reserve it, so we can reuse it in second kernel */ - printk(KERN_INFO "update e820 for GART\n"); + pr_info("e820: reserve [mem %#010Lx-%#010Lx] for GART\n", + aper_base, aper_base + aper_size - 1); e820_add_region(aper_base, aper_size, E820_RESERVED); update_e820(); } @@ -354,7 +359,7 @@ int __init gart_iommu_hole_init(void) !early_pci_allowed()) return -ENODEV; - printk(KERN_INFO "Checking aperture...\n"); + pr_info("Checking aperture...\n"); if (!fallback_aper_force) agp_aper_base = search_agp_bridge(&agp_aper_order, &valid_agp); @@ -395,8 +400,9 @@ int __init gart_iommu_hole_init(void) aper_base = read_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE) & 0x7fff; aper_base <<= 25; - printk(KERN_INFO "Node %d: aperture @ %Lx size %u MB\n", - node, aper_base, aper_size >> 20); + pr_info("Node %d: aperture [bus addr %#010Lx-%#010Lx] (%uMB)\n", + node, aper_base, aper_base + aper_size - 1, + aper_size >> 20); node++; if (!aperture_valid(aper_base, aper_size, 64<<20)) { @@ -407,9 +413,9 @@ int __init gart_iommu_hole_init(void) if (!no_iommu && max_pfn > MAX_DMA32_PFN && !printed_gart_size_msg) { - printk(KERN_ERR "you are using iommu with agp, but GART size is less than 64M\n"); - printk(KERN_ERR "please increase GART size in your BIOS setup\n"); - printk(KERN_ERR "if BIOS doesn't have that option, contact your HW vendor!\n"); + pr_err("you are using iommu with agp, but GART size is less than 64MB\n"); + pr_err("please increase GART size in your BIOS setup\n"); + pr_err("if BIOS doesn't have that option, contact your HW vendor!\n"); printed_gart_size_msg = 1; } } else { @@ -446,13 +452,10 @@ out: force_iommu || valid_agp || fallback_aper_force) { - printk(KERN_INFO - "Your BIOS doesn't leave a aperture memory hole\n"); - printk(KERN_INFO - "Please enable the IOMMU option in the BIOS setup\n"); - printk(KERN_INFO - "This costs you %d MB of RAM\n", - 32 << fallback_aper_order); + pr_info("Your BIOS doesn't leave a aperture memory hole\n"); + pr_info("Please enable the IOMMU option in the BIOS setup\n"); + pr_info("This costs you %dMB of RAM\n", + 32 << fallback_aper_order); aper_order = fallback_aper_order; aper_alloc = allocate_aperture(); diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index a698d71..c3fcb5d 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -57,10 +57,10 @@ void arch_trigger_all_cpu_backtrace(void) } clear_bit(0, &backtrace_flag); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); } -static int __kprobes +static int arch_trigger_all_cpu_backtrace_handler(unsigned int cmd, struct pt_regs *regs) { int cpu; @@ -80,6 +80,7 @@ arch_trigger_all_cpu_backtrace_handler(unsigned int cmd, struct pt_regs *regs) return NMI_DONE; } +NOKPROBE_SYMBOL(arch_trigger_all_cpu_backtrace_handler); static int __init register_trigger_all_cpu_backtrace(void) { diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 992060e..81e08ef 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -206,9 +206,6 @@ int __init arch_early_irq_init(void) count = ARRAY_SIZE(irq_cfgx); node = cpu_to_node(0); - /* Make sure the legacy interrupts are marked in the bitmap */ - irq_reserve_irqs(0, legacy_pic->nr_legacy_irqs); - for (i = 0; i < count; i++) { irq_set_chip_data(i, &cfg[i]); zalloc_cpumask_var_node(&cfg[i].domain, GFP_KERNEL, node); @@ -281,18 +278,6 @@ static struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node) return cfg; } -static int alloc_irqs_from(unsigned int from, unsigned int count, int node) -{ - return irq_alloc_descs_from(from, count, node); -} - -static void free_irq_at(unsigned int at, struct irq_cfg *cfg) -{ - free_irq_cfg(at, cfg); - irq_free_desc(at); -} - - struct io_apic { unsigned int index; unsigned int unused[3]; @@ -2312,7 +2297,7 @@ int __ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask, int err; if (!config_enabled(CONFIG_SMP)) - return -1; + return -EPERM; if (!cpumask_intersects(mask, cpu_online_mask)) return -EINVAL; @@ -2343,7 +2328,7 @@ int native_ioapic_set_affinity(struct irq_data *data, int ret; if (!config_enabled(CONFIG_SMP)) - return -1; + return -EPERM; raw_spin_lock_irqsave(&ioapic_lock, flags); ret = __ioapic_set_affinity(data, mask, &dest); @@ -2916,98 +2901,39 @@ static int __init ioapic_init_ops(void) device_initcall(ioapic_init_ops); /* - * Dynamic irq allocate and deallocation + * Dynamic irq allocate and deallocation. Should be replaced by irq domains! */ -unsigned int __create_irqs(unsigned int from, unsigned int count, int node) +int arch_setup_hwirq(unsigned int irq, int node) { - struct irq_cfg **cfg; + struct irq_cfg *cfg; unsigned long flags; - int irq, i; - - if (from < nr_irqs_gsi) - from = nr_irqs_gsi; + int ret; - cfg = kzalloc_node(count * sizeof(cfg[0]), GFP_KERNEL, node); + cfg = alloc_irq_cfg(irq, node); if (!cfg) - return 0; - - irq = alloc_irqs_from(from, count, node); - if (irq < 0) - goto out_cfgs; - - for (i = 0; i < count; i++) { - cfg[i] = alloc_irq_cfg(irq + i, node); - if (!cfg[i]) - goto out_irqs; - } + return -ENOMEM; raw_spin_lock_irqsave(&vector_lock, flags); - for (i = 0; i < count; i++) - if (__assign_irq_vector(irq + i, cfg[i], apic->target_cpus())) - goto out_vecs; + ret = __assign_irq_vector(irq, cfg, apic->target_cpus()); raw_spin_unlock_irqrestore(&vector_lock, flags); - for (i = 0; i < count; i++) { - irq_set_chip_data(irq + i, cfg[i]); - irq_clear_status_flags(irq + i, IRQ_NOREQUEST); - } - - kfree(cfg); - return irq; - -out_vecs: - for (i--; i >= 0; i--) - __clear_irq_vector(irq + i, cfg[i]); - raw_spin_unlock_irqrestore(&vector_lock, flags); -out_irqs: - for (i = 0; i < count; i++) - free_irq_at(irq + i, cfg[i]); -out_cfgs: - kfree(cfg); - return 0; -} - -unsigned int create_irq_nr(unsigned int from, int node) -{ - return __create_irqs(from, 1, node); -} - -int create_irq(void) -{ - int node = cpu_to_node(0); - unsigned int irq_want; - int irq; - - irq_want = nr_irqs_gsi; - irq = create_irq_nr(irq_want, node); - - if (irq == 0) - irq = -1; - - return irq; + if (!ret) + irq_set_chip_data(irq, cfg); + else + free_irq_cfg(irq, cfg); + return ret; } -void destroy_irq(unsigned int irq) +void arch_teardown_hwirq(unsigned int irq) { struct irq_cfg *cfg = irq_get_chip_data(irq); unsigned long flags; - irq_set_status_flags(irq, IRQ_NOREQUEST|IRQ_NOPROBE); - free_remapped_irq(irq); - raw_spin_lock_irqsave(&vector_lock, flags); __clear_irq_vector(irq, cfg); raw_spin_unlock_irqrestore(&vector_lock, flags); - free_irq_at(irq, cfg); -} - -void destroy_irqs(unsigned int irq, unsigned int count) -{ - unsigned int i; - - for (i = 0; i < count; i++) - destroy_irq(irq + i); + free_irq_cfg(irq, cfg); } /* @@ -3075,9 +3001,11 @@ msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) struct irq_cfg *cfg = data->chip_data; struct msi_msg msg; unsigned int dest; + int ret; - if (__ioapic_set_affinity(data, mask, &dest)) - return -1; + ret = __ioapic_set_affinity(data, mask, &dest); + if (ret) + return ret; __get_cached_msi_msg(data->msi_desc, &msg); @@ -3136,8 +3064,8 @@ int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) { - unsigned int irq, irq_want; struct msi_desc *msidesc; + unsigned int irq; int node, ret; /* Multiple MSI vectors only supported with interrupt remapping */ @@ -3145,28 +3073,25 @@ int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) return 1; node = dev_to_node(&dev->dev); - irq_want = nr_irqs_gsi; + list_for_each_entry(msidesc, &dev->msi_list, list) { - irq = create_irq_nr(irq_want, node); - if (irq == 0) + irq = irq_alloc_hwirq(node); + if (!irq) return -ENOSPC; - irq_want = irq + 1; - ret = setup_msi_irq(dev, msidesc, irq, 0); - if (ret < 0) - goto error; + if (ret < 0) { + irq_free_hwirq(irq); + return ret; + } + } return 0; - -error: - destroy_irq(irq); - return ret; } void native_teardown_msi_irq(unsigned int irq) { - destroy_irq(irq); + irq_free_hwirq(irq); } #ifdef CONFIG_DMAR_TABLE @@ -3177,9 +3102,11 @@ dmar_msi_set_affinity(struct irq_data *data, const struct cpumask *mask, struct irq_cfg *cfg = data->chip_data; unsigned int dest, irq = data->irq; struct msi_msg msg; + int ret; - if (__ioapic_set_affinity(data, mask, &dest)) - return -1; + ret = __ioapic_set_affinity(data, mask, &dest); + if (ret) + return ret; dmar_msi_read(irq, &msg); @@ -3226,9 +3153,11 @@ static int hpet_msi_set_affinity(struct irq_data *data, struct irq_cfg *cfg = data->chip_data; struct msi_msg msg; unsigned int dest; + int ret; - if (__ioapic_set_affinity(data, mask, &dest)) - return -1; + ret = __ioapic_set_affinity(data, mask, &dest); + if (ret) + return ret; hpet_msi_read(data->handler_data, &msg); @@ -3295,9 +3224,11 @@ ht_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) { struct irq_cfg *cfg = data->chip_data; unsigned int dest; + int ret; - if (__ioapic_set_affinity(data, mask, &dest)) - return -1; + ret = __ioapic_set_affinity(data, mask, &dest); + if (ret) + return ret; target_ht_irq(data->irq, dest, cfg->vector); return IRQ_SET_MASK_OK_NOCOPY; @@ -3420,11 +3351,6 @@ static void __init probe_nr_irqs_gsi(void) printk(KERN_DEBUG "nr_irqs_gsi: %d\n", nr_irqs_gsi); } -int get_nr_irqs_gsi(void) -{ - return nr_irqs_gsi; -} - unsigned int arch_dynirq_lower_bound(unsigned int from) { return from < nr_irqs_gsi ? nr_irqs_gsi : from; diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 7834389..293b41d 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -5,7 +5,7 @@ * * SGI UV APIC functions (note: not an Intel compatible APIC) * - * Copyright (C) 2007-2013 Silicon Graphics, Inc. All rights reserved. + * Copyright (C) 2007-2014 Silicon Graphics, Inc. All rights reserved. */ #include <linux/cpumask.h> #include <linux/hardirq.h> @@ -440,6 +440,20 @@ static __initdata struct redir_addr redir_addrs[] = { {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR, UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR}, }; +static unsigned char get_n_lshift(int m_val) +{ + union uv3h_gr0_gam_gr_config_u m_gr_config; + + if (is_uv1_hub()) + return m_val; + + if (is_uv2_hub()) + return m_val == 40 ? 40 : 39; + + m_gr_config.v = uv_read_local_mmr(UV3H_GR0_GAM_GR_CONFIG); + return m_gr_config.s3.m_skt; +} + static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size) { union uvh_rh_gam_alias210_overlay_config_2_mmr_u alias; @@ -849,6 +863,7 @@ void __init uv_system_init(void) int gnode_extra, min_pnode = 999999, max_pnode = -1; unsigned long mmr_base, present, paddr; unsigned short pnode_mask; + unsigned char n_lshift; char *hub = (is_uv1_hub() ? "UV1" : (is_uv2_hub() ? "UV2" : "UV3")); @@ -860,6 +875,7 @@ void __init uv_system_init(void) m_val = m_n_config.s.m_skt; n_val = m_n_config.s.n_skt; pnode_mask = (1 << n_val) - 1; + n_lshift = get_n_lshift(m_val); mmr_base = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) & ~UV_MMR_ENABLE; @@ -867,8 +883,9 @@ void __init uv_system_init(void) node_id.v = uv_read_local_mmr(UVH_NODE_ID); gnode_extra = (node_id.s.node_id & ~((1 << n_val) - 1)) >> 1; gnode_upper = ((unsigned long)gnode_extra << m_val); - pr_info("UV: N:%d M:%d pnode_mask:0x%x gnode_upper/extra:0x%lx/0x%x\n", - n_val, m_val, pnode_mask, gnode_upper, gnode_extra); + pr_info("UV: N:%d M:%d pnode_mask:0x%x gnode_upper/extra:0x%lx/0x%x n_lshift 0x%x\n", + n_val, m_val, pnode_mask, gnode_upper, gnode_extra, + n_lshift); pr_info("UV: global MMR base 0x%lx\n", mmr_base); @@ -935,8 +952,7 @@ void __init uv_system_init(void) uv_cpu_hub_info(cpu)->hub_revision = uv_hub_info->hub_revision; uv_cpu_hub_info(cpu)->m_shift = 64 - m_val; - uv_cpu_hub_info(cpu)->n_lshift = is_uv2_1_hub() ? - (m_val == 40 ? 40 : 39) : m_val; + uv_cpu_hub_info(cpu)->n_lshift = n_lshift; pnode = uv_apicid_to_pnode(apicid); blade = boot_pnode_to_blade(pnode); diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 3ab0343..f3a1f04 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -844,21 +844,10 @@ static int apm_do_idle(void) int polling; int err = 0; - polling = !!(current_thread_info()->status & TS_POLLING); - if (polling) { - current_thread_info()->status &= ~TS_POLLING; - /* - * TS_POLLING-cleared state must be visible before we - * test NEED_RESCHED: - */ - smp_mb(); - } if (!need_resched()) { idled = 1; ret = apm_bios_call_simple(APM_FUNC_IDLE, 0, 0, &eax, &err); } - if (polling) - current_thread_info()->status |= TS_POLLING; if (!idled) return 0; diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index e7c4b97..188a8c5 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -8,6 +8,7 @@ #include <linux/delay.h> #include <linux/sched.h> #include <linux/init.h> +#include <linux/kprobes.h> #include <linux/kgdb.h> #include <linux/smp.h> #include <linux/io.h> @@ -20,6 +21,7 @@ #include <asm/processor.h> #include <asm/debugreg.h> #include <asm/sections.h> +#include <asm/vsyscall.h> #include <linux/topology.h> #include <linux/cpumask.h> #include <asm/pgtable.h> @@ -962,6 +964,38 @@ static void vgetcpu_set_mode(void) else vgetcpu_mode = VGETCPU_LSL; } + +/* May not be __init: called during resume */ +static void syscall32_cpu_init(void) +{ + /* Load these always in case some future AMD CPU supports + SYSENTER from compat mode too. */ + wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); + wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL); + wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target); + + wrmsrl(MSR_CSTAR, ia32_cstar_target); +} +#endif + +#ifdef CONFIG_X86_32 +void enable_sep_cpu(void) +{ + int cpu = get_cpu(); + struct tss_struct *tss = &per_cpu(init_tss, cpu); + + if (!boot_cpu_has(X86_FEATURE_SEP)) { + put_cpu(); + return; + } + + tss->x86_tss.ss1 = __KERNEL_CS; + tss->x86_tss.sp1 = sizeof(struct tss_struct) + (unsigned long) tss; + wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0); + wrmsr(MSR_IA32_SYSENTER_ESP, tss->x86_tss.sp1, 0); + wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long) ia32_sysenter_target, 0); + put_cpu(); +} #endif void __init identify_boot_cpu(void) @@ -1169,6 +1203,7 @@ int is_debug_stack(unsigned long addr) (addr <= __get_cpu_var(debug_stack_addr) && addr > (__get_cpu_var(debug_stack_addr) - DEBUG_STKSZ)); } +NOKPROBE_SYMBOL(is_debug_stack); DEFINE_PER_CPU(u32, debug_idt_ctr); @@ -1177,6 +1212,7 @@ void debug_stack_set_zero(void) this_cpu_inc(debug_idt_ctr); load_current_idt(); } +NOKPROBE_SYMBOL(debug_stack_set_zero); void debug_stack_reset(void) { @@ -1185,6 +1221,7 @@ void debug_stack_reset(void) if (this_cpu_dec_return(debug_idt_ctr) == 0) load_current_idt(); } +NOKPROBE_SYMBOL(debug_stack_reset); #else /* CONFIG_X86_64 */ diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 68317c8..bb92f38 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -60,8 +60,6 @@ static DEFINE_MUTEX(mce_chrdev_read_mutex); #define SPINUNIT 100 /* 100ns */ -atomic_t mce_entry; - DEFINE_PER_CPU(unsigned, mce_exception_count); struct mce_bank *mce_banks __read_mostly; @@ -704,8 +702,7 @@ static int mce_timed_out(u64 *t) if (!mca_cfg.monarch_timeout) goto out; if ((s64)*t < SPINUNIT) { - /* CHECKME: Make panic default for 1 too? */ - if (mca_cfg.tolerant < 1) + if (mca_cfg.tolerant <= 1) mce_panic("Timeout synchronizing machine check over CPUs", NULL, NULL); cpu_missing = 1; @@ -1041,8 +1038,6 @@ void do_machine_check(struct pt_regs *regs, long error_code) DECLARE_BITMAP(valid_banks, MAX_NR_BANKS); char *msg = "Unknown"; - atomic_inc(&mce_entry); - this_cpu_inc(mce_exception_count); if (!cfg->banks) @@ -1172,7 +1167,6 @@ void do_machine_check(struct pt_regs *regs, long error_code) mce_report_event(regs); mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); out: - atomic_dec(&mce_entry); sync_core(); } EXPORT_SYMBOL_GPL(do_machine_check); @@ -2437,32 +2431,65 @@ static __init int mcheck_init_device(void) int err; int i = 0; - if (!mce_available(&boot_cpu_data)) - return -EIO; + if (!mce_available(&boot_cpu_data)) { + err = -EIO; + goto err_out; + } - zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL); + if (!zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL)) { + err = -ENOMEM; + goto err_out; + } mce_init_banks(); err = subsys_system_register(&mce_subsys, NULL); if (err) - return err; + goto err_out_mem; cpu_notifier_register_begin(); for_each_online_cpu(i) { err = mce_device_create(i); if (err) { cpu_notifier_register_done(); - return err; + goto err_device_create; } } - register_syscore_ops(&mce_syscore_ops); __register_hotcpu_notifier(&mce_cpu_notifier); cpu_notifier_register_done(); + register_syscore_ops(&mce_syscore_ops); + /* register character device /dev/mcelog */ - misc_register(&mce_chrdev_device); + err = misc_register(&mce_chrdev_device); + if (err) + goto err_register; + + return 0; + +err_register: + unregister_syscore_ops(&mce_syscore_ops); + + cpu_notifier_register_begin(); + __unregister_hotcpu_notifier(&mce_cpu_notifier); + cpu_notifier_register_done(); + +err_device_create: + /* + * We didn't keep track of which devices were created above, but + * even if we had, the set of online cpus might have changed. + * Play safe and remove for every possible cpu, since + * mce_device_remove() will do the right thing. + */ + for_each_possible_cpu(i) + mce_device_remove(i); + +err_out_mem: + free_cpumask_var(mce_device_initialized); + +err_out: + pr_err("Unable to init device /dev/mcelog (rc: %d)\n", err); return err; } diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index 15c9876..dd9d619 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -97,6 +97,9 @@ MODULE_LICENSE("GPL"); static struct microcode_ops *microcode_ops; +bool dis_ucode_ldr; +module_param(dis_ucode_ldr, bool, 0); + /* * Synchronization. * @@ -546,6 +549,9 @@ static int __init microcode_init(void) struct cpuinfo_x86 *c = &cpu_data(0); int error; + if (dis_ucode_ldr) + return 0; + if (c->x86_vendor == X86_VENDOR_INTEL) microcode_ops = init_intel_microcode(); else if (c->x86_vendor == X86_VENDOR_AMD) diff --git a/arch/x86/kernel/cpu/microcode/core_early.c b/arch/x86/kernel/cpu/microcode/core_early.c index be7f851..5f28a64 100644 --- a/arch/x86/kernel/cpu/microcode/core_early.c +++ b/arch/x86/kernel/cpu/microcode/core_early.c @@ -17,9 +17,11 @@ * 2 of the License, or (at your option) any later version. */ #include <linux/module.h> +#include <asm/microcode.h> #include <asm/microcode_intel.h> #include <asm/microcode_amd.h> #include <asm/processor.h> +#include <asm/cmdline.h> #define QCHAR(a, b, c, d) ((a) + ((b) << 8) + ((c) << 16) + ((d) << 24)) #define CPUID_INTEL1 QCHAR('G', 'e', 'n', 'u') @@ -72,10 +74,33 @@ static int x86_family(void) return x86; } +static bool __init check_loader_disabled_bsp(void) +{ +#ifdef CONFIG_X86_32 + const char *cmdline = (const char *)__pa_nodebug(boot_command_line); + const char *opt = "dis_ucode_ldr"; + const char *option = (const char *)__pa_nodebug(opt); + bool *res = (bool *)__pa_nodebug(&dis_ucode_ldr); + +#else /* CONFIG_X86_64 */ + const char *cmdline = boot_command_line; + const char *option = "dis_ucode_ldr"; + bool *res = &dis_ucode_ldr; +#endif + + if (cmdline_find_option_bool(cmdline, option)) + *res = true; + + return *res; +} + void __init load_ucode_bsp(void) { int vendor, x86; + if (check_loader_disabled_bsp()) + return; + if (!have_cpuid_p()) return; @@ -96,10 +121,22 @@ void __init load_ucode_bsp(void) } } +static bool check_loader_disabled_ap(void) +{ +#ifdef CONFIG_X86_32 + return __pa_nodebug(dis_ucode_ldr); +#else + return dis_ucode_ldr; +#endif +} + void load_ucode_ap(void) { int vendor, x86; + if (check_loader_disabled_ap()) + return; + if (!have_cpuid_p()) return; diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 76f98fe..a450373 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -132,15 +132,6 @@ static void __init ms_hyperv_init_platform(void) lapic_timer_frequency = hv_lapic_frequency; printk(KERN_INFO "HyperV: LAPIC Timer Frequency: %#x\n", lapic_timer_frequency); - - /* - * On Hyper-V, when we are booting off an EFI firmware stack, - * we do not have many legacy devices including PIC, PIT etc. - */ - if (efi_enabled(EFI_BOOT)) { - printk(KERN_INFO "HyperV: Using null_legacy_pic\n"); - legacy_pic = &null_legacy_pic; - } } #endif diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index ae407f7..2bdfbff 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -303,15 +303,6 @@ int x86_setup_perfctr(struct perf_event *event) hwc->sample_period = x86_pmu.max_period; hwc->last_period = hwc->sample_period; local64_set(&hwc->period_left, hwc->sample_period); - } else { - /* - * If we have a PMU initialized but no APIC - * interrupts, we cannot sample hardware - * events (user-space has to fall back and - * sample via a hrtimer based software event): - */ - if (!x86_pmu.apic) - return -EOPNOTSUPP; } if (attr->type == PERF_TYPE_RAW) @@ -721,6 +712,7 @@ int perf_assign_events(struct perf_event **events, int n, return sched.state.unassigned; } +EXPORT_SYMBOL_GPL(perf_assign_events); int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) { @@ -1292,7 +1284,7 @@ void perf_events_lapic_init(void) apic_write(APIC_LVTPC, APIC_DM_NMI); } -static int __kprobes +static int perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs) { u64 start_clock; @@ -1310,6 +1302,7 @@ perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs) return ret; } +NOKPROBE_SYMBOL(perf_event_nmi_handler); struct event_constraint emptyconstraint; struct event_constraint unconstrained; @@ -1365,6 +1358,15 @@ static void __init pmu_check_apic(void) x86_pmu.apic = 0; pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n"); pr_info("no hardware sampling interrupt available.\n"); + + /* + * If we have a PMU initialized but no APIC + * interrupts, we cannot sample hardware + * events (user-space has to fall back and + * sample via a hrtimer based software event): + */ + pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT; + } static struct attribute_group x86_pmu_format_group = { diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c index 4c36bbe..cbb1be3e 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c +++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c @@ -593,7 +593,7 @@ out: return 1; } -static int __kprobes +static int perf_ibs_nmi_handler(unsigned int cmd, struct pt_regs *regs) { int handled = 0; @@ -606,6 +606,7 @@ perf_ibs_nmi_handler(unsigned int cmd, struct pt_regs *regs) return handled; } +NOKPROBE_SYMBOL(perf_ibs_nmi_handler); static __init int perf_ibs_pmu_init(struct perf_ibs *perf_ibs, char *name) { diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index ae96cfa..980970c 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -108,15 +108,31 @@ static u64 precise_store_data(u64 status) return val; } -static u64 precise_store_data_hsw(u64 status) +static u64 precise_store_data_hsw(struct perf_event *event, u64 status) { union perf_mem_data_src dse; + u64 cfg = event->hw.config & INTEL_ARCH_EVENT_MASK; dse.val = 0; dse.mem_op = PERF_MEM_OP_STORE; dse.mem_lvl = PERF_MEM_LVL_NA; + + /* + * L1 info only valid for following events: + * + * MEM_UOPS_RETIRED.STLB_MISS_STORES + * MEM_UOPS_RETIRED.LOCK_STORES + * MEM_UOPS_RETIRED.SPLIT_STORES + * MEM_UOPS_RETIRED.ALL_STORES + */ + if (cfg != 0x12d0 && cfg != 0x22d0 && cfg != 0x42d0 && cfg != 0x82d0) + return dse.mem_lvl; + if (status & 1) - dse.mem_lvl = PERF_MEM_LVL_L1; + dse.mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT; + else + dse.mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_MISS; + /* Nothing else supported. Sorry. */ return dse.val; } @@ -887,7 +903,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event, data.data_src.val = load_latency_data(pebs->dse); else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST_HSW) data.data_src.val = - precise_store_data_hsw(pebs->dse); + precise_store_data_hsw(event, pebs->dse); else data.data_src.val = precise_store_data(pebs->dse); } diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c index d82d155..9dd2459 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c @@ -384,6 +384,9 @@ static void intel_pmu_setup_sw_lbr_filter(struct perf_event *event) if (br_type & PERF_SAMPLE_BRANCH_NO_TX) mask |= X86_BR_NO_TX; + if (br_type & PERF_SAMPLE_BRANCH_COND) + mask |= X86_BR_JCC; + /* * stash actual user request into reg, it may * be used by fixup code for some CPU @@ -678,6 +681,7 @@ static const int nhm_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX] = { * NHM/WSM erratum: must include IND_JMP to capture IND_CALL */ [PERF_SAMPLE_BRANCH_IND_CALL] = LBR_IND_CALL | LBR_IND_JMP, + [PERF_SAMPLE_BRANCH_COND] = LBR_JCC, }; static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX] = { @@ -689,6 +693,7 @@ static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX] = { [PERF_SAMPLE_BRANCH_ANY_CALL] = LBR_REL_CALL | LBR_IND_CALL | LBR_FAR, [PERF_SAMPLE_BRANCH_IND_CALL] = LBR_IND_CALL, + [PERF_SAMPLE_BRANCH_COND] = LBR_JCC, }; /* core */ diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index d35078e..7db54b5 100644 --- a/arch/x86/kernel/devicetree.c +++ b/arch/x86/kernel/devicetree.c @@ -206,23 +206,21 @@ static void __init dtb_apic_setup(void) static void __init x86_flattree_get_config(void) { u32 size, map_len; - struct boot_param_header *dt; + void *dt; if (!initial_dtb) return; - map_len = max(PAGE_SIZE - (initial_dtb & ~PAGE_MASK), - (u64)sizeof(struct boot_param_header)); + map_len = max(PAGE_SIZE - (initial_dtb & ~PAGE_MASK), (u64)128); - dt = early_memremap(initial_dtb, map_len); - size = be32_to_cpu(dt->totalsize); + initial_boot_params = dt = early_memremap(initial_dtb, map_len); + size = of_get_flat_dt_size(); if (map_len < size) { early_iounmap(dt, map_len); - dt = early_memremap(initial_dtb, size); + initial_boot_params = dt = early_memremap(initial_dtb, size); map_len = size; } - initial_boot_params = dt; unflatten_and_copy_device_tree(); early_iounmap(dt, map_len); } diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index d9c12d3..b74ebc7 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -200,7 +200,7 @@ static arch_spinlock_t die_lock = __ARCH_SPIN_LOCK_UNLOCKED; static int die_owner = -1; static unsigned int die_nest_count; -unsigned __kprobes long oops_begin(void) +unsigned long oops_begin(void) { int cpu; unsigned long flags; @@ -223,8 +223,9 @@ unsigned __kprobes long oops_begin(void) return flags; } EXPORT_SYMBOL_GPL(oops_begin); +NOKPROBE_SYMBOL(oops_begin); -void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) +void oops_end(unsigned long flags, struct pt_regs *regs, int signr) { if (regs && kexec_should_crash(current)) crash_kexec(regs); @@ -247,8 +248,9 @@ void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) panic("Fatal exception"); do_exit(signr); } +NOKPROBE_SYMBOL(oops_end); -int __kprobes __die(const char *str, struct pt_regs *regs, long err) +int __die(const char *str, struct pt_regs *regs, long err) { #ifdef CONFIG_X86_32 unsigned short ss; @@ -291,6 +293,7 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err) #endif return 0; } +NOKPROBE_SYMBOL(__die); /* * This is gone through when something in the kernel has done something bad diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index 6cda0ba..2e1a685 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -419,7 +419,7 @@ static size_t __init gen6_stolen_size(int num, int slot, int func) return gmch_ctrl << 25; /* 32 MB units */ } -static size_t gen8_stolen_size(int num, int slot, int func) +static size_t __init gen8_stolen_size(int num, int slot, int func) { u16 gmch_ctrl; @@ -429,48 +429,73 @@ static size_t gen8_stolen_size(int num, int slot, int func) return gmch_ctrl << 25; /* 32 MB units */ } +static size_t __init chv_stolen_size(int num, int slot, int func) +{ + u16 gmch_ctrl; + + gmch_ctrl = read_pci_config_16(num, slot, func, SNB_GMCH_CTRL); + gmch_ctrl >>= SNB_GMCH_GMS_SHIFT; + gmch_ctrl &= SNB_GMCH_GMS_MASK; + + /* + * 0x0 to 0x10: 32MB increments starting at 0MB + * 0x11 to 0x16: 4MB increments starting at 8MB + * 0x17 to 0x1d: 4MB increments start at 36MB + */ + if (gmch_ctrl < 0x11) + return gmch_ctrl << 25; + else if (gmch_ctrl < 0x17) + return (gmch_ctrl - 0x11 + 2) << 22; + else + return (gmch_ctrl - 0x17 + 9) << 22; +} struct intel_stolen_funcs { size_t (*size)(int num, int slot, int func); u32 (*base)(int num, int slot, int func, size_t size); }; -static const struct intel_stolen_funcs i830_stolen_funcs = { +static const struct intel_stolen_funcs i830_stolen_funcs __initconst = { .base = i830_stolen_base, .size = i830_stolen_size, }; -static const struct intel_stolen_funcs i845_stolen_funcs = { +static const struct intel_stolen_funcs i845_stolen_funcs __initconst = { .base = i845_stolen_base, .size = i830_stolen_size, }; -static const struct intel_stolen_funcs i85x_stolen_funcs = { +static const struct intel_stolen_funcs i85x_stolen_funcs __initconst = { .base = i85x_stolen_base, .size = gen3_stolen_size, }; -static const struct intel_stolen_funcs i865_stolen_funcs = { +static const struct intel_stolen_funcs i865_stolen_funcs __initconst = { .base = i865_stolen_base, .size = gen3_stolen_size, }; -static const struct intel_stolen_funcs gen3_stolen_funcs = { +static const struct intel_stolen_funcs gen3_stolen_funcs __initconst = { .base = intel_stolen_base, .size = gen3_stolen_size, }; -static const struct intel_stolen_funcs gen6_stolen_funcs = { +static const struct intel_stolen_funcs gen6_stolen_funcs __initconst = { .base = intel_stolen_base, .size = gen6_stolen_size, }; -static const struct intel_stolen_funcs gen8_stolen_funcs = { +static const struct intel_stolen_funcs gen8_stolen_funcs __initconst = { .base = intel_stolen_base, .size = gen8_stolen_size, }; -static struct pci_device_id intel_stolen_ids[] __initdata = { +static const struct intel_stolen_funcs chv_stolen_funcs __initconst = { + .base = intel_stolen_base, + .size = chv_stolen_size, +}; + +static const struct pci_device_id intel_stolen_ids[] __initconst = { INTEL_I830_IDS(&i830_stolen_funcs), INTEL_I845G_IDS(&i845_stolen_funcs), INTEL_I85X_IDS(&i85x_stolen_funcs), @@ -496,7 +521,8 @@ static struct pci_device_id intel_stolen_ids[] __initdata = { INTEL_HSW_D_IDS(&gen6_stolen_funcs), INTEL_HSW_M_IDS(&gen6_stolen_funcs), INTEL_BDW_M_IDS(&gen8_stolen_funcs), - INTEL_BDW_D_IDS(&gen8_stolen_funcs) + INTEL_BDW_D_IDS(&gen8_stolen_funcs), + INTEL_CHV_IDS(&chv_stolen_funcs), }; static void __init intel_graphics_stolen(int num, int slot, int func) diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index a2a4f46..f0da82b 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -315,10 +315,6 @@ ENTRY(ret_from_kernel_thread) ENDPROC(ret_from_kernel_thread) /* - * Interrupt exit functions should be protected against kprobes - */ - .pushsection .kprobes.text, "ax" -/* * Return to user mode is not as complex as all this looks, * but we want the default path for a system call return to * go as quickly as possible which is why some of this is @@ -372,10 +368,6 @@ need_resched: END(resume_kernel) #endif CFI_ENDPROC -/* - * End of kprobes section - */ - .popsection /* SYSENTER_RETURN points to after the "sysenter" instruction in the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */ @@ -495,10 +487,6 @@ sysexit_audit: PTGS_TO_GS_EX ENDPROC(ia32_sysenter_target) -/* - * syscall stub including irq exit should be protected against kprobes - */ - .pushsection .kprobes.text, "ax" # system call handler stub ENTRY(system_call) RING0_INT_FRAME # can't unwind into user space anyway @@ -527,6 +515,7 @@ syscall_exit: restore_all: TRACE_IRQS_IRET restore_all_notrace: +#ifdef CONFIG_X86_ESPFIX32 movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS # Warning: PT_OLDSS(%esp) contains the wrong/random values if we # are returning to the kernel. @@ -537,6 +526,7 @@ restore_all_notrace: cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax CFI_REMEMBER_STATE je ldt_ss # returning to user-space with LDT SS +#endif restore_nocheck: RESTORE_REGS 4 # skip orig_eax/error_code irq_return: @@ -549,13 +539,9 @@ ENTRY(iret_exc) .previous _ASM_EXTABLE(irq_return,iret_exc) +#ifdef CONFIG_X86_ESPFIX32 CFI_RESTORE_STATE ldt_ss: - larl PT_OLDSS(%esp), %eax - jnz restore_nocheck - testl $0x00400000, %eax # returning to 32bit stack? - jnz restore_nocheck # allright, normal return - #ifdef CONFIG_PARAVIRT /* * The kernel can't run on a non-flat stack if paravirt mode @@ -597,6 +583,7 @@ ldt_ss: lss (%esp), %esp /* switch to espfix segment */ CFI_ADJUST_CFA_OFFSET -8 jmp restore_nocheck +#endif CFI_ENDPROC ENDPROC(system_call) @@ -691,10 +678,6 @@ syscall_badsys: jmp resume_userspace END(syscall_badsys) CFI_ENDPROC -/* - * End of kprobes section - */ - .popsection .macro FIXUP_ESPFIX_STACK /* @@ -704,6 +687,7 @@ END(syscall_badsys) * the high word of the segment base from the GDT and swiches to the * normal stack and adjusts ESP with the matching offset. */ +#ifdef CONFIG_X86_ESPFIX32 /* fixup the stack */ mov GDT_ESPFIX_SS + 4, %al /* bits 16..23 */ mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */ @@ -713,8 +697,10 @@ END(syscall_badsys) pushl_cfi %eax lss (%esp), %esp /* switch to the normal stack segment */ CFI_ADJUST_CFA_OFFSET -8 +#endif .endm .macro UNWIND_ESPFIX_STACK +#ifdef CONFIG_X86_ESPFIX32 movl %ss, %eax /* see if on espfix stack */ cmpw $__ESPFIX_SS, %ax @@ -725,6 +711,7 @@ END(syscall_badsys) /* switch to normal stack */ FIXUP_ESPFIX_STACK 27: +#endif .endm /* @@ -781,10 +768,6 @@ common_interrupt: ENDPROC(common_interrupt) CFI_ENDPROC -/* - * Irq entries should be protected against kprobes - */ - .pushsection .kprobes.text, "ax" #define BUILD_INTERRUPT3(name, nr, fn) \ ENTRY(name) \ RING0_INT_FRAME; \ @@ -961,10 +944,6 @@ ENTRY(spurious_interrupt_bug) jmp error_code CFI_ENDPROC END(spurious_interrupt_bug) -/* - * End of kprobes section - */ - .popsection #ifdef CONFIG_XEN /* Xen doesn't set %esp to be precisely what the normal sysenter @@ -1239,11 +1218,6 @@ return_to_handler: jmp *%ecx #endif -/* - * Some functions should be protected against kprobes - */ - .pushsection .kprobes.text, "ax" - #ifdef CONFIG_TRACING ENTRY(trace_page_fault) RING0_EC_FRAME @@ -1355,11 +1329,13 @@ END(debug) ENTRY(nmi) RING0_INT_FRAME ASM_CLAC +#ifdef CONFIG_X86_ESPFIX32 pushl_cfi %eax movl %ss, %eax cmpw $__ESPFIX_SS, %ax popl_cfi %eax je nmi_espfix_stack +#endif cmpl $ia32_sysenter_target,(%esp) je nmi_stack_fixup pushl_cfi %eax @@ -1399,6 +1375,7 @@ nmi_debug_stack_check: FIX_STACK 24, nmi_stack_correct, 1 jmp nmi_stack_correct +#ifdef CONFIG_X86_ESPFIX32 nmi_espfix_stack: /* We have a RING0_INT_FRAME here. * @@ -1420,6 +1397,7 @@ nmi_espfix_stack: lss 12+4(%esp), %esp # back to espfix stack CFI_ADJUST_CFA_OFFSET -24 jmp irq_return +#endif CFI_ENDPROC END(nmi) @@ -1453,7 +1431,3 @@ ENTRY(async_page_fault) END(async_page_fault) #endif -/* - * End of kprobes section - */ - .popsection diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 1e96c36..b25ca96 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -36,7 +36,7 @@ * - FIXUP_TOP_OF_STACK/RESTORE_TOP_OF_STACK - Fix up the hardware stack * frame that is otherwise undefined after a SYSCALL * - TRACE_IRQ_* - Trace hard interrupt state for lock debugging. - * - errorentry/paranoidentry/zeroentry - Define exception entry points. + * - idtentry - Define exception entry points. */ #include <linux/linkage.h> @@ -53,11 +53,11 @@ #include <asm/page_types.h> #include <asm/irqflags.h> #include <asm/paravirt.h> -#include <asm/ftrace.h> #include <asm/percpu.h> #include <asm/asm.h> #include <asm/context_tracking.h> #include <asm/smap.h> +#include <asm/pgtable_types.h> #include <linux/err.h> /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */ @@ -69,209 +69,6 @@ .code64 .section .entry.text, "ax" -#ifdef CONFIG_FUNCTION_TRACER - -#ifdef CC_USING_FENTRY -# define function_hook __fentry__ -#else -# define function_hook mcount -#endif - -#ifdef CONFIG_DYNAMIC_FTRACE - -ENTRY(function_hook) - retq -END(function_hook) - -/* skip is set if stack has been adjusted */ -.macro ftrace_caller_setup skip=0 - MCOUNT_SAVE_FRAME \skip - - /* Load the ftrace_ops into the 3rd parameter */ - movq function_trace_op(%rip), %rdx - - /* Load ip into the first parameter */ - movq RIP(%rsp), %rdi - subq $MCOUNT_INSN_SIZE, %rdi - /* Load the parent_ip into the second parameter */ -#ifdef CC_USING_FENTRY - movq SS+16(%rsp), %rsi -#else - movq 8(%rbp), %rsi -#endif -.endm - -ENTRY(ftrace_caller) - /* Check if tracing was disabled (quick check) */ - cmpl $0, function_trace_stop - jne ftrace_stub - - ftrace_caller_setup - /* regs go into 4th parameter (but make it NULL) */ - movq $0, %rcx - -GLOBAL(ftrace_call) - call ftrace_stub - - MCOUNT_RESTORE_FRAME -ftrace_return: - -#ifdef CONFIG_FUNCTION_GRAPH_TRACER -GLOBAL(ftrace_graph_call) - jmp ftrace_stub -#endif - -GLOBAL(ftrace_stub) - retq -END(ftrace_caller) - -ENTRY(ftrace_regs_caller) - /* Save the current flags before compare (in SS location)*/ - pushfq - - /* Check if tracing was disabled (quick check) */ - cmpl $0, function_trace_stop - jne ftrace_restore_flags - - /* skip=8 to skip flags saved in SS */ - ftrace_caller_setup 8 - - /* Save the rest of pt_regs */ - movq %r15, R15(%rsp) - movq %r14, R14(%rsp) - movq %r13, R13(%rsp) - movq %r12, R12(%rsp) - movq %r11, R11(%rsp) - movq %r10, R10(%rsp) - movq %rbp, RBP(%rsp) - movq %rbx, RBX(%rsp) - /* Copy saved flags */ - movq SS(%rsp), %rcx - movq %rcx, EFLAGS(%rsp) - /* Kernel segments */ - movq $__KERNEL_DS, %rcx - movq %rcx, SS(%rsp) - movq $__KERNEL_CS, %rcx - movq %rcx, CS(%rsp) - /* Stack - skipping return address */ - leaq SS+16(%rsp), %rcx - movq %rcx, RSP(%rsp) - - /* regs go into 4th parameter */ - leaq (%rsp), %rcx - -GLOBAL(ftrace_regs_call) - call ftrace_stub - - /* Copy flags back to SS, to restore them */ - movq EFLAGS(%rsp), %rax - movq %rax, SS(%rsp) - - /* Handlers can change the RIP */ - movq RIP(%rsp), %rax - movq %rax, SS+8(%rsp) - - /* restore the rest of pt_regs */ - movq R15(%rsp), %r15 - movq R14(%rsp), %r14 - movq R13(%rsp), %r13 - movq R12(%rsp), %r12 - movq R10(%rsp), %r10 - movq RBP(%rsp), %rbp - movq RBX(%rsp), %rbx - - /* skip=8 to skip flags saved in SS */ - MCOUNT_RESTORE_FRAME 8 - - /* Restore flags */ - popfq - - jmp ftrace_return -ftrace_restore_flags: - popfq - jmp ftrace_stub - -END(ftrace_regs_caller) - - -#else /* ! CONFIG_DYNAMIC_FTRACE */ - -ENTRY(function_hook) - cmpl $0, function_trace_stop - jne ftrace_stub - - cmpq $ftrace_stub, ftrace_trace_function - jnz trace - -#ifdef CONFIG_FUNCTION_GRAPH_TRACER - cmpq $ftrace_stub, ftrace_graph_return - jnz ftrace_graph_caller - - cmpq $ftrace_graph_entry_stub, ftrace_graph_entry - jnz ftrace_graph_caller -#endif - -GLOBAL(ftrace_stub) - retq - -trace: - MCOUNT_SAVE_FRAME - - movq RIP(%rsp), %rdi -#ifdef CC_USING_FENTRY - movq SS+16(%rsp), %rsi -#else - movq 8(%rbp), %rsi -#endif - subq $MCOUNT_INSN_SIZE, %rdi - - call *ftrace_trace_function - - MCOUNT_RESTORE_FRAME - - jmp ftrace_stub -END(function_hook) -#endif /* CONFIG_DYNAMIC_FTRACE */ -#endif /* CONFIG_FUNCTION_TRACER */ - -#ifdef CONFIG_FUNCTION_GRAPH_TRACER -ENTRY(ftrace_graph_caller) - MCOUNT_SAVE_FRAME - -#ifdef CC_USING_FENTRY - leaq SS+16(%rsp), %rdi - movq $0, %rdx /* No framepointers needed */ -#else - leaq 8(%rbp), %rdi - movq (%rbp), %rdx -#endif - movq RIP(%rsp), %rsi - subq $MCOUNT_INSN_SIZE, %rsi - - call prepare_ftrace_return - - MCOUNT_RESTORE_FRAME - - retq -END(ftrace_graph_caller) - -GLOBAL(return_to_handler) - subq $24, %rsp - - /* Save the return values */ - movq %rax, (%rsp) - movq %rdx, 8(%rsp) - movq %rbp, %rdi - - call ftrace_return_to_handler - - movq %rax, %rdi - movq 8(%rsp), %rdx - movq (%rsp), %rax - addq $24, %rsp - jmp *%rdi -#endif - #ifndef CONFIG_PREEMPT #define retint_kernel retint_restore_args @@ -487,8 +284,6 @@ ENDPROC(native_usergs_sysret64) TRACE_IRQS_OFF .endm -/* save complete stack frame */ - .pushsection .kprobes.text, "ax" ENTRY(save_paranoid) XCPT_FRAME 1 RDI+8 cld @@ -517,7 +312,6 @@ ENTRY(save_paranoid) 1: ret CFI_ENDPROC END(save_paranoid) - .popsection /* * A newly forked process directly context switches into this address. @@ -975,10 +769,6 @@ END(interrupt) call \func .endm -/* - * Interrupt entry/exit should be protected against kprobes - */ - .pushsection .kprobes.text, "ax" /* * The interrupt stubs push (~vector+0x80) onto the stack and * then jump to common_interrupt. @@ -1040,8 +830,18 @@ restore_args: RESTORE_ARGS 1,8,1 irq_return: + /* + * Are we returning to a stack segment from the LDT? Note: in + * 64-bit mode SS:RSP on the exception stack is always valid. + */ +#ifdef CONFIG_X86_ESPFIX64 + testb $4,(SS-RIP)(%rsp) + jnz irq_return_ldt +#endif + +irq_return_iret: INTERRUPT_RETURN - _ASM_EXTABLE(irq_return, bad_iret) + _ASM_EXTABLE(irq_return_iret, bad_iret) #ifdef CONFIG_PARAVIRT ENTRY(native_iret) @@ -1049,6 +849,32 @@ ENTRY(native_iret) _ASM_EXTABLE(native_iret, bad_iret) #endif +#ifdef CONFIG_X86_ESPFIX64 +irq_return_ldt: + pushq_cfi %rax + pushq_cfi %rdi + SWAPGS + movq PER_CPU_VAR(espfix_waddr),%rdi + movq %rax,(0*8)(%rdi) /* RAX */ + movq (2*8)(%rsp),%rax /* RIP */ + movq %rax,(1*8)(%rdi) + movq (3*8)(%rsp),%rax /* CS */ + movq %rax,(2*8)(%rdi) + movq (4*8)(%rsp),%rax /* RFLAGS */ + movq %rax,(3*8)(%rdi) + movq (6*8)(%rsp),%rax /* SS */ + movq %rax,(5*8)(%rdi) + movq (5*8)(%rsp),%rax /* RSP */ + movq %rax,(4*8)(%rdi) + andl $0xffff0000,%eax + popq_cfi %rdi + orq PER_CPU_VAR(espfix_stack),%rax + SWAPGS + movq %rax,%rsp + popq_cfi %rax + jmp irq_return_iret +#endif + .section .fixup,"ax" bad_iret: /* @@ -1110,13 +936,44 @@ ENTRY(retint_kernel) call preempt_schedule_irq jmp exit_intr #endif - CFI_ENDPROC END(common_interrupt) -/* - * End of kprobes section - */ - .popsection + + /* + * If IRET takes a fault on the espfix stack, then we + * end up promoting it to a doublefault. In that case, + * modify the stack to make it look like we just entered + * the #GP handler from user space, similar to bad_iret. + */ +#ifdef CONFIG_X86_ESPFIX64 + ALIGN +__do_double_fault: + XCPT_FRAME 1 RDI+8 + movq RSP(%rdi),%rax /* Trap on the espfix stack? */ + sarq $PGDIR_SHIFT,%rax + cmpl $ESPFIX_PGD_ENTRY,%eax + jne do_double_fault /* No, just deliver the fault */ + cmpl $__KERNEL_CS,CS(%rdi) + jne do_double_fault + movq RIP(%rdi),%rax + cmpq $irq_return_iret,%rax +#ifdef CONFIG_PARAVIRT + je 1f + cmpq $native_iret,%rax +#endif + jne do_double_fault /* This shouldn't happen... */ +1: + movq PER_CPU_VAR(kernel_stack),%rax + subq $(6*8-KERNEL_STACK_OFFSET),%rax /* Reset to original stack */ + movq %rax,RSP(%rdi) + movq $0,(%rax) /* Missing (lost) #GP error code */ + movq $general_protection,RIP(%rdi) + retq + CFI_ENDPROC +END(__do_double_fault) +#else +# define __do_double_fault do_double_fault +#endif /* * APIC interrupts. @@ -1203,125 +1060,100 @@ apicinterrupt IRQ_WORK_VECTOR \ /* * Exception entry points. */ -.macro zeroentry sym do_sym -ENTRY(\sym) - INTR_FRAME - ASM_CLAC - PARAVIRT_ADJUST_EXCEPTION_FRAME - pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ - subq $ORIG_RAX-R15, %rsp - CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 - call error_entry - DEFAULT_FRAME 0 - movq %rsp,%rdi /* pt_regs pointer */ - xorl %esi,%esi /* no error code */ - call \do_sym - jmp error_exit /* %ebx: no swapgs flag */ - CFI_ENDPROC -END(\sym) -.endm +#define INIT_TSS_IST(x) PER_CPU_VAR(init_tss) + (TSS_ist + ((x) - 1) * 8) -.macro paranoidzeroentry sym do_sym +.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 ENTRY(\sym) - INTR_FRAME - ASM_CLAC - PARAVIRT_ADJUST_EXCEPTION_FRAME - pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ - subq $ORIG_RAX-R15, %rsp - CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 - call save_paranoid - TRACE_IRQS_OFF - movq %rsp,%rdi /* pt_regs pointer */ - xorl %esi,%esi /* no error code */ - call \do_sym - jmp paranoid_exit /* %ebx: no swapgs flag */ - CFI_ENDPROC -END(\sym) -.endm + /* Sanity check */ + .if \shift_ist != -1 && \paranoid == 0 + .error "using shift_ist requires paranoid=1" + .endif -#define INIT_TSS_IST(x) PER_CPU_VAR(init_tss) + (TSS_ist + ((x) - 1) * 8) -.macro paranoidzeroentry_ist sym do_sym ist -ENTRY(\sym) + .if \has_error_code + XCPT_FRAME + .else INTR_FRAME - ASM_CLAC - PARAVIRT_ADJUST_EXCEPTION_FRAME - pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ - subq $ORIG_RAX-R15, %rsp - CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 - call save_paranoid - TRACE_IRQS_OFF_DEBUG - movq %rsp,%rdi /* pt_regs pointer */ - xorl %esi,%esi /* no error code */ - subq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist) - call \do_sym - addq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist) - jmp paranoid_exit /* %ebx: no swapgs flag */ - CFI_ENDPROC -END(\sym) -.endm + .endif -.macro errorentry sym do_sym -ENTRY(\sym) - XCPT_FRAME ASM_CLAC PARAVIRT_ADJUST_EXCEPTION_FRAME + + .ifeq \has_error_code + pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ + .endif + subq $ORIG_RAX-R15, %rsp CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 + + .if \paranoid + call save_paranoid + .else call error_entry + .endif + DEFAULT_FRAME 0 + + .if \paranoid + .if \shift_ist != -1 + TRACE_IRQS_OFF_DEBUG /* reload IDT in case of recursion */ + .else + TRACE_IRQS_OFF + .endif + .endif + movq %rsp,%rdi /* pt_regs pointer */ + + .if \has_error_code movq ORIG_RAX(%rsp),%rsi /* get error code */ movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */ + .else + xorl %esi,%esi /* no error code */ + .endif + + .if \shift_ist != -1 + subq $EXCEPTION_STKSZ, INIT_TSS_IST(\shift_ist) + .endif + call \do_sym + + .if \shift_ist != -1 + addq $EXCEPTION_STKSZ, INIT_TSS_IST(\shift_ist) + .endif + + .if \paranoid + jmp paranoid_exit /* %ebx: no swapgs flag */ + .else jmp error_exit /* %ebx: no swapgs flag */ + .endif + CFI_ENDPROC END(\sym) .endm #ifdef CONFIG_TRACING -.macro trace_errorentry sym do_sym -errorentry trace(\sym) trace(\do_sym) -errorentry \sym \do_sym +.macro trace_idtentry sym do_sym has_error_code:req +idtentry trace(\sym) trace(\do_sym) has_error_code=\has_error_code +idtentry \sym \do_sym has_error_code=\has_error_code .endm #else -.macro trace_errorentry sym do_sym -errorentry \sym \do_sym +.macro trace_idtentry sym do_sym has_error_code:req +idtentry \sym \do_sym has_error_code=\has_error_code .endm #endif - /* error code is on the stack already */ -.macro paranoiderrorentry sym do_sym -ENTRY(\sym) - XCPT_FRAME - ASM_CLAC - PARAVIRT_ADJUST_EXCEPTION_FRAME - subq $ORIG_RAX-R15, %rsp - CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 - call save_paranoid - DEFAULT_FRAME 0 - TRACE_IRQS_OFF - movq %rsp,%rdi /* pt_regs pointer */ - movq ORIG_RAX(%rsp),%rsi /* get error code */ - movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */ - call \do_sym - jmp paranoid_exit /* %ebx: no swapgs flag */ - CFI_ENDPROC -END(\sym) -.endm - -zeroentry divide_error do_divide_error -zeroentry overflow do_overflow -zeroentry bounds do_bounds -zeroentry invalid_op do_invalid_op -zeroentry device_not_available do_device_not_available -paranoiderrorentry double_fault do_double_fault -zeroentry coprocessor_segment_overrun do_coprocessor_segment_overrun -errorentry invalid_TSS do_invalid_TSS -errorentry segment_not_present do_segment_not_present -zeroentry spurious_interrupt_bug do_spurious_interrupt_bug -zeroentry coprocessor_error do_coprocessor_error -errorentry alignment_check do_alignment_check -zeroentry simd_coprocessor_error do_simd_coprocessor_error +idtentry divide_error do_divide_error has_error_code=0 +idtentry overflow do_overflow has_error_code=0 +idtentry bounds do_bounds has_error_code=0 +idtentry invalid_op do_invalid_op has_error_code=0 +idtentry device_not_available do_device_not_available has_error_code=0 +idtentry double_fault __do_double_fault has_error_code=1 paranoid=1 +idtentry coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0 +idtentry invalid_TSS do_invalid_TSS has_error_code=1 +idtentry segment_not_present do_segment_not_present has_error_code=1 +idtentry spurious_interrupt_bug do_spurious_interrupt_bug has_error_code=0 +idtentry coprocessor_error do_coprocessor_error has_error_code=0 +idtentry alignment_check do_alignment_check has_error_code=1 +idtentry simd_coprocessor_error do_simd_coprocessor_error has_error_code=0 /* Reload gs selector with exception handling */ @@ -1371,7 +1203,7 @@ ENTRY(do_softirq_own_stack) END(do_softirq_own_stack) #ifdef CONFIG_XEN -zeroentry xen_hypervisor_callback xen_do_hypervisor_callback +idtentry xen_hypervisor_callback xen_do_hypervisor_callback has_error_code=0 /* * A note on the "critical region" in our callback handler. @@ -1477,26 +1309,21 @@ apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \ hyperv_callback_vector hyperv_vector_handler #endif /* CONFIG_HYPERV */ -/* - * Some functions should be protected against kprobes - */ - .pushsection .kprobes.text, "ax" - -paranoidzeroentry_ist debug do_debug DEBUG_STACK -paranoidzeroentry_ist int3 do_int3 DEBUG_STACK -paranoiderrorentry stack_segment do_stack_segment +idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK +idtentry int3 do_int3 has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK +idtentry stack_segment do_stack_segment has_error_code=1 paranoid=1 #ifdef CONFIG_XEN -zeroentry xen_debug do_debug -zeroentry xen_int3 do_int3 -errorentry xen_stack_segment do_stack_segment +idtentry xen_debug do_debug has_error_code=0 +idtentry xen_int3 do_int3 has_error_code=0 +idtentry xen_stack_segment do_stack_segment has_error_code=1 #endif -errorentry general_protection do_general_protection -trace_errorentry page_fault do_page_fault +idtentry general_protection do_general_protection has_error_code=1 +trace_idtentry page_fault do_page_fault has_error_code=1 #ifdef CONFIG_KVM_GUEST -errorentry async_page_fault do_async_page_fault +idtentry async_page_fault do_async_page_fault has_error_code=1 #endif #ifdef CONFIG_X86_MCE -paranoidzeroentry machine_check *machine_check_vector(%rip) +idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vector(%rip) #endif /* @@ -1601,7 +1428,7 @@ error_sti: */ error_kernelspace: incl %ebx - leaq irq_return(%rip),%rcx + leaq irq_return_iret(%rip),%rcx cmpq %rcx,RIP+8(%rsp) je error_swapgs movl %ecx,%eax /* zero extend */ @@ -1898,7 +1725,3 @@ ENTRY(ignore_sysret) CFI_ENDPROC END(ignore_sysret) -/* - * End of kprobes section - */ - .popsection diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c new file mode 100644 index 0000000..6afbb16 --- /dev/null +++ b/arch/x86/kernel/espfix_64.c @@ -0,0 +1,209 @@ +/* ----------------------------------------------------------------------- * + * + * Copyright 2014 Intel Corporation; author: H. Peter Anvin + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * ----------------------------------------------------------------------- */ + +/* + * The IRET instruction, when returning to a 16-bit segment, only + * restores the bottom 16 bits of the user space stack pointer. This + * causes some 16-bit software to break, but it also leaks kernel state + * to user space. + * + * This works around this by creating percpu "ministacks", each of which + * is mapped 2^16 times 64K apart. When we detect that the return SS is + * on the LDT, we copy the IRET frame to the ministack and use the + * relevant alias to return to userspace. The ministacks are mapped + * readonly, so if the IRET fault we promote #GP to #DF which is an IST + * vector and thus has its own stack; we then do the fixup in the #DF + * handler. + * + * This file sets up the ministacks and the related page tables. The + * actual ministack invocation is in entry_64.S. + */ + +#include <linux/init.h> +#include <linux/init_task.h> +#include <linux/kernel.h> +#include <linux/percpu.h> +#include <linux/gfp.h> +#include <linux/random.h> +#include <asm/pgtable.h> +#include <asm/pgalloc.h> +#include <asm/setup.h> +#include <asm/espfix.h> + +/* + * Note: we only need 6*8 = 48 bytes for the espfix stack, but round + * it up to a cache line to avoid unnecessary sharing. + */ +#define ESPFIX_STACK_SIZE (8*8UL) +#define ESPFIX_STACKS_PER_PAGE (PAGE_SIZE/ESPFIX_STACK_SIZE) + +/* There is address space for how many espfix pages? */ +#define ESPFIX_PAGE_SPACE (1UL << (PGDIR_SHIFT-PAGE_SHIFT-16)) + +#define ESPFIX_MAX_CPUS (ESPFIX_STACKS_PER_PAGE * ESPFIX_PAGE_SPACE) +#if CONFIG_NR_CPUS > ESPFIX_MAX_CPUS +# error "Need more than one PGD for the ESPFIX hack" +#endif + +#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO) + +/* This contains the *bottom* address of the espfix stack */ +DEFINE_PER_CPU_READ_MOSTLY(unsigned long, espfix_stack); +DEFINE_PER_CPU_READ_MOSTLY(unsigned long, espfix_waddr); + +/* Initialization mutex - should this be a spinlock? */ +static DEFINE_MUTEX(espfix_init_mutex); + +/* Page allocation bitmap - each page serves ESPFIX_STACKS_PER_PAGE CPUs */ +#define ESPFIX_MAX_PAGES DIV_ROUND_UP(CONFIG_NR_CPUS, ESPFIX_STACKS_PER_PAGE) +static void *espfix_pages[ESPFIX_MAX_PAGES]; + +static __page_aligned_bss pud_t espfix_pud_page[PTRS_PER_PUD] + __aligned(PAGE_SIZE); + +static unsigned int page_random, slot_random; + +/* + * This returns the bottom address of the espfix stack for a specific CPU. + * The math allows for a non-power-of-two ESPFIX_STACK_SIZE, in which case + * we have to account for some amount of padding at the end of each page. + */ +static inline unsigned long espfix_base_addr(unsigned int cpu) +{ + unsigned long page, slot; + unsigned long addr; + + page = (cpu / ESPFIX_STACKS_PER_PAGE) ^ page_random; + slot = (cpu + slot_random) % ESPFIX_STACKS_PER_PAGE; + addr = (page << PAGE_SHIFT) + (slot * ESPFIX_STACK_SIZE); + addr = (addr & 0xffffUL) | ((addr & ~0xffffUL) << 16); + addr += ESPFIX_BASE_ADDR; + return addr; +} + +#define PTE_STRIDE (65536/PAGE_SIZE) +#define ESPFIX_PTE_CLONES (PTRS_PER_PTE/PTE_STRIDE) +#define ESPFIX_PMD_CLONES PTRS_PER_PMD +#define ESPFIX_PUD_CLONES (65536/(ESPFIX_PTE_CLONES*ESPFIX_PMD_CLONES)) + +#define PGTABLE_PROT ((_KERNPG_TABLE & ~_PAGE_RW) | _PAGE_NX) + +static void init_espfix_random(void) +{ + unsigned long rand; + + /* + * This is run before the entropy pools are initialized, + * but this is hopefully better than nothing. + */ + if (!arch_get_random_long(&rand)) { + /* The constant is an arbitrary large prime */ + rdtscll(rand); + rand *= 0xc345c6b72fd16123UL; + } + + slot_random = rand % ESPFIX_STACKS_PER_PAGE; + page_random = (rand / ESPFIX_STACKS_PER_PAGE) + & (ESPFIX_PAGE_SPACE - 1); +} + +void __init init_espfix_bsp(void) +{ + pgd_t *pgd_p; + pteval_t ptemask; + + ptemask = __supported_pte_mask; + + /* Install the espfix pud into the kernel page directory */ + pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)]; + pgd_populate(&init_mm, pgd_p, (pud_t *)espfix_pud_page); + + /* Randomize the locations */ + init_espfix_random(); + + /* The rest is the same as for any other processor */ + init_espfix_ap(); +} + +void init_espfix_ap(void) +{ + unsigned int cpu, page; + unsigned long addr; + pud_t pud, *pud_p; + pmd_t pmd, *pmd_p; + pte_t pte, *pte_p; + int n; + void *stack_page; + pteval_t ptemask; + + /* We only have to do this once... */ + if (likely(this_cpu_read(espfix_stack))) + return; /* Already initialized */ + + cpu = smp_processor_id(); + addr = espfix_base_addr(cpu); + page = cpu/ESPFIX_STACKS_PER_PAGE; + + /* Did another CPU already set this up? */ + stack_page = ACCESS_ONCE(espfix_pages[page]); + if (likely(stack_page)) + goto done; + + mutex_lock(&espfix_init_mutex); + + /* Did we race on the lock? */ + stack_page = ACCESS_ONCE(espfix_pages[page]); + if (stack_page) + goto unlock_done; + + ptemask = __supported_pte_mask; + + pud_p = &espfix_pud_page[pud_index(addr)]; + pud = *pud_p; + if (!pud_present(pud)) { + pmd_p = (pmd_t *)__get_free_page(PGALLOC_GFP); + pud = __pud(__pa(pmd_p) | (PGTABLE_PROT & ptemask)); + paravirt_alloc_pud(&init_mm, __pa(pmd_p) >> PAGE_SHIFT); + for (n = 0; n < ESPFIX_PUD_CLONES; n++) + set_pud(&pud_p[n], pud); + } + + pmd_p = pmd_offset(&pud, addr); + pmd = *pmd_p; + if (!pmd_present(pmd)) { + pte_p = (pte_t *)__get_free_page(PGALLOC_GFP); + pmd = __pmd(__pa(pte_p) | (PGTABLE_PROT & ptemask)); + paravirt_alloc_pmd(&init_mm, __pa(pte_p) >> PAGE_SHIFT); + for (n = 0; n < ESPFIX_PMD_CLONES; n++) + set_pmd(&pmd_p[n], pmd); + } + + pte_p = pte_offset_kernel(&pmd, addr); + stack_page = (void *)__get_free_page(GFP_KERNEL); + pte = __pte(__pa(stack_page) | (__PAGE_KERNEL_RO & ptemask)); + paravirt_alloc_pte(&init_mm, __pa(stack_page) >> PAGE_SHIFT); + for (n = 0; n < ESPFIX_PTE_CLONES; n++) + set_pte(&pte_p[n*PTE_STRIDE], pte); + + /* Job is done for this CPU and any CPU which shares this page */ + ACCESS_ONCE(espfix_pages[page]) = stack_page; + +unlock_done: + mutex_unlock(&espfix_init_mutex); +done: + this_cpu_write(espfix_stack, addr); + this_cpu_write(espfix_waddr, (unsigned long)stack_page + + (addr & ~PAGE_MASK)); +} diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 52819e8..cbc4a91 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -297,16 +297,7 @@ int ftrace_int3_handler(struct pt_regs *regs) static int ftrace_write(unsigned long ip, const char *val, int size) { - /* - * On x86_64, kernel text mappings are mapped read-only with - * CONFIG_DEBUG_RODATA. So we use the kernel identity mapping instead - * of the kernel text mapping to modify the kernel text. - * - * For 32bit kernels, these mappings are same and we can use - * kernel identity mapping to modify code. - */ - if (within(ip, (unsigned long)_text, (unsigned long)_etext)) - ip = (unsigned long)__va(__pa_symbol(ip)); + ip = text_ip_addr(ip); if (probe_kernel_write((void *)ip, val, size)) return -EPERM; @@ -349,40 +340,14 @@ static int add_brk_on_nop(struct dyn_ftrace *rec) return add_break(rec->ip, old); } -/* - * If the record has the FTRACE_FL_REGS set, that means that it - * wants to convert to a callback that saves all regs. If FTRACE_FL_REGS - * is not not set, then it wants to convert to the normal callback. - */ -static unsigned long get_ftrace_addr(struct dyn_ftrace *rec) -{ - if (rec->flags & FTRACE_FL_REGS) - return (unsigned long)FTRACE_REGS_ADDR; - else - return (unsigned long)FTRACE_ADDR; -} - -/* - * The FTRACE_FL_REGS_EN is set when the record already points to - * a function that saves all the regs. Basically the '_EN' version - * represents the current state of the function. - */ -static unsigned long get_ftrace_old_addr(struct dyn_ftrace *rec) -{ - if (rec->flags & FTRACE_FL_REGS_EN) - return (unsigned long)FTRACE_REGS_ADDR; - else - return (unsigned long)FTRACE_ADDR; -} - static int add_breakpoints(struct dyn_ftrace *rec, int enable) { unsigned long ftrace_addr; int ret; - ret = ftrace_test_record(rec, enable); + ftrace_addr = ftrace_get_addr_curr(rec); - ftrace_addr = get_ftrace_addr(rec); + ret = ftrace_test_record(rec, enable); switch (ret) { case FTRACE_UPDATE_IGNORE: @@ -392,10 +357,7 @@ static int add_breakpoints(struct dyn_ftrace *rec, int enable) /* converting nop to call */ return add_brk_on_nop(rec); - case FTRACE_UPDATE_MODIFY_CALL_REGS: case FTRACE_UPDATE_MODIFY_CALL: - ftrace_addr = get_ftrace_old_addr(rec); - /* fall through */ case FTRACE_UPDATE_MAKE_NOP: /* converting a call to a nop */ return add_brk_on_call(rec, ftrace_addr); @@ -440,14 +402,14 @@ static int remove_breakpoint(struct dyn_ftrace *rec) * If not, don't touch the breakpoint, we make just create * a disaster. */ - ftrace_addr = get_ftrace_addr(rec); + ftrace_addr = ftrace_get_addr_new(rec); nop = ftrace_call_replace(ip, ftrace_addr); if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) == 0) goto update; /* Check both ftrace_addr and ftrace_old_addr */ - ftrace_addr = get_ftrace_old_addr(rec); + ftrace_addr = ftrace_get_addr_curr(rec); nop = ftrace_call_replace(ip, ftrace_addr); if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) != 0) @@ -491,13 +453,12 @@ static int add_update(struct dyn_ftrace *rec, int enable) ret = ftrace_test_record(rec, enable); - ftrace_addr = get_ftrace_addr(rec); + ftrace_addr = ftrace_get_addr_new(rec); switch (ret) { case FTRACE_UPDATE_IGNORE: return 0; - case FTRACE_UPDATE_MODIFY_CALL_REGS: case FTRACE_UPDATE_MODIFY_CALL: case FTRACE_UPDATE_MAKE_CALL: /* converting nop to call */ @@ -538,13 +499,12 @@ static int finish_update(struct dyn_ftrace *rec, int enable) ret = ftrace_update_record(rec, enable); - ftrace_addr = get_ftrace_addr(rec); + ftrace_addr = ftrace_get_addr_new(rec); switch (ret) { case FTRACE_UPDATE_IGNORE: return 0; - case FTRACE_UPDATE_MODIFY_CALL_REGS: case FTRACE_UPDATE_MODIFY_CALL: case FTRACE_UPDATE_MAKE_CALL: /* converting nop to call */ @@ -621,8 +581,8 @@ void ftrace_replace_code(int enable) return; remove_breakpoints: + pr_warn("Failed on %s (%d):\n", report, count); ftrace_bug(ret, rec ? rec->ip : 0); - printk(KERN_WARNING "Failed on %s (%d):\n", report, count); for_ftrace_rec_iter(iter) { rec = ftrace_rec_iter_record(iter); /* diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 068054f..eda1a86 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -172,7 +172,7 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) */ load_ucode_bsp(); - if (console_loglevel == 10) + if (console_loglevel >= CONSOLE_LOGLEVEL_DEBUG) early_printk("Kernel alive\n"); clear_page(init_level4_pgt); diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 4177bfb..319bcb9 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -74,9 +74,6 @@ static inline void hpet_writel(unsigned int d, unsigned int a) static inline void hpet_set_mapping(void) { hpet_virt_address = ioremap_nocache(hpet_address, HPET_MMAP_SIZE); -#ifdef CONFIG_X86_64 - __set_fixmap(VSYSCALL_HPET, hpet_address, PAGE_KERNEL_VVAR_NOCACHE); -#endif } static inline void hpet_clear_mapping(void) @@ -479,7 +476,7 @@ static int hpet_msi_next_event(unsigned long delta, static int hpet_setup_msi_irq(unsigned int irq) { if (x86_msi.setup_hpet_msi(irq, hpet_blockid)) { - destroy_irq(irq); + irq_free_hwirq(irq); return -EINVAL; } return 0; @@ -487,9 +484,8 @@ static int hpet_setup_msi_irq(unsigned int irq) static int hpet_assign_irq(struct hpet_dev *dev) { - unsigned int irq; + unsigned int irq = irq_alloc_hwirq(-1); - irq = create_irq_nr(0, -1); if (!irq) return -EINVAL; diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index a67b47c..5f9cf20 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -32,7 +32,6 @@ #include <linux/irqflags.h> #include <linux/notifier.h> #include <linux/kallsyms.h> -#include <linux/kprobes.h> #include <linux/percpu.h> #include <linux/kdebug.h> #include <linux/kernel.h> @@ -424,7 +423,7 @@ EXPORT_SYMBOL_GPL(hw_breakpoint_restore); * NOTIFY_STOP returned for all other cases * */ -static int __kprobes hw_breakpoint_handler(struct die_args *args) +static int hw_breakpoint_handler(struct die_args *args) { int i, cpu, rc = NOTIFY_STOP; struct perf_event *bp; @@ -511,7 +510,7 @@ static int __kprobes hw_breakpoint_handler(struct die_args *args) /* * Handle debug exception notifications. */ -int __kprobes hw_breakpoint_exceptions_notify( +int hw_breakpoint_exceptions_notify( struct notifier_block *unused, unsigned long val, void *data) { if (val != DIE_DEBUG) diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index 2e977b5..8af81710 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -299,13 +299,31 @@ static void unmask_8259A(void) static void init_8259A(int auto_eoi) { unsigned long flags; + unsigned char probe_val = ~(1 << PIC_CASCADE_IR); + unsigned char new_val; i8259A_auto_eoi = auto_eoi; raw_spin_lock_irqsave(&i8259A_lock, flags); - outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ + /* + * Check to see if we have a PIC. + * Mask all except the cascade and read + * back the value we just wrote. If we don't + * have a PIC, we will read 0xff as opposed to the + * value we wrote. + */ outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */ + outb(probe_val, PIC_MASTER_IMR); + new_val = inb(PIC_MASTER_IMR); + if (new_val != probe_val) { + printk(KERN_INFO "Using NULL legacy PIC\n"); + legacy_pic = &null_legacy_pic; + raw_spin_unlock_irqrestore(&i8259A_lock, flags); + return; + } + + outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ /* * outb_pic - this has to work on a wide range of PC hardware. diff --git a/arch/x86/kernel/iosf_mbi.c b/arch/x86/kernel/iosf_mbi.c index c3aae66..d30acdc 100644 --- a/arch/x86/kernel/iosf_mbi.c +++ b/arch/x86/kernel/iosf_mbi.c @@ -25,6 +25,9 @@ #include <asm/iosf_mbi.h> +#define PCI_DEVICE_ID_BAYTRAIL 0x0F00 +#define PCI_DEVICE_ID_QUARK_X1000 0x0958 + static DEFINE_SPINLOCK(iosf_mbi_lock); static inline u32 iosf_mbi_form_mcr(u8 op, u8 port, u8 offset) @@ -177,6 +180,13 @@ int iosf_mbi_modify(u8 port, u8 opcode, u32 offset, u32 mdr, u32 mask) } EXPORT_SYMBOL(iosf_mbi_modify); +bool iosf_mbi_available(void) +{ + /* Mbi isn't hot-pluggable. No remove routine is provided */ + return mbi_pdev; +} +EXPORT_SYMBOL(iosf_mbi_available); + static int iosf_mbi_probe(struct pci_dev *pdev, const struct pci_device_id *unused) { @@ -193,7 +203,8 @@ static int iosf_mbi_probe(struct pci_dev *pdev, } static DEFINE_PCI_DEVICE_TABLE(iosf_mbi_pci_ids) = { - { PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x0F00) }, + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_BAYTRAIL) }, + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_QUARK_X1000) }, { 0, }, }; MODULE_DEVICE_TABLE(pci, iosf_mbi_pci_ids); diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 283a76a..922d285 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -17,6 +17,7 @@ #include <asm/idle.h> #include <asm/mce.h> #include <asm/hw_irq.h> +#include <asm/desc.h> #define CREATE_TRACE_POINTS #include <asm/trace/irq_vectors.h> @@ -334,10 +335,17 @@ int check_irq_vectors_for_cpu_disable(void) for_each_online_cpu(cpu) { if (cpu == this_cpu) continue; - for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; - vector++) { - if (per_cpu(vector_irq, cpu)[vector] < 0) - count++; + /* + * We scan from FIRST_EXTERNAL_VECTOR to first system + * vector. If the vector is marked in the used vectors + * bitmap or an irq is assigned to it, we don't count + * it as available. + */ + for (vector = FIRST_EXTERNAL_VECTOR; + vector < first_system_vector; vector++) { + if (!test_bit(vector, used_vectors) && + per_cpu(vector_irq, cpu)[vector] < 0) + count++; } } @@ -357,6 +365,7 @@ void fixup_irqs(void) struct irq_desc *desc; struct irq_data *data; struct irq_chip *chip; + int ret; for_each_irq_desc(irq, desc) { int break_affinity = 0; @@ -395,10 +404,14 @@ void fixup_irqs(void) if (!irqd_can_move_in_process_context(data) && chip->irq_mask) chip->irq_mask(data); - if (chip->irq_set_affinity) - chip->irq_set_affinity(data, affinity, true); - else if (!(warned++)) - set_affinity = 0; + if (chip->irq_set_affinity) { + ret = chip->irq_set_affinity(data, affinity, true); + if (ret == -ENOSPC) + pr_crit("IRQ %d set affinity failed because there are no available vectors. The device assigned to this IRQ is unstable.\n", irq); + } else { + if (!(warned++)) + set_affinity = 0; + } /* * We unmask if the irq was not marked masked by the diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 61b17dc..7596df6 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -112,7 +112,8 @@ struct kretprobe_blackpoint kretprobe_blacklist[] = { const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist); -static void __kprobes __synthesize_relative_insn(void *from, void *to, u8 op) +static nokprobe_inline void +__synthesize_relative_insn(void *from, void *to, u8 op) { struct __arch_relative_insn { u8 op; @@ -125,21 +126,23 @@ static void __kprobes __synthesize_relative_insn(void *from, void *to, u8 op) } /* Insert a jump instruction at address 'from', which jumps to address 'to'.*/ -void __kprobes synthesize_reljump(void *from, void *to) +void synthesize_reljump(void *from, void *to) { __synthesize_relative_insn(from, to, RELATIVEJUMP_OPCODE); } +NOKPROBE_SYMBOL(synthesize_reljump); /* Insert a call instruction at address 'from', which calls address 'to'.*/ -void __kprobes synthesize_relcall(void *from, void *to) +void synthesize_relcall(void *from, void *to) { __synthesize_relative_insn(from, to, RELATIVECALL_OPCODE); } +NOKPROBE_SYMBOL(synthesize_relcall); /* * Skip the prefixes of the instruction. */ -static kprobe_opcode_t *__kprobes skip_prefixes(kprobe_opcode_t *insn) +static kprobe_opcode_t *skip_prefixes(kprobe_opcode_t *insn) { insn_attr_t attr; @@ -154,12 +157,13 @@ static kprobe_opcode_t *__kprobes skip_prefixes(kprobe_opcode_t *insn) #endif return insn; } +NOKPROBE_SYMBOL(skip_prefixes); /* * Returns non-zero if opcode is boostable. * RIP relative instructions are adjusted at copying time in 64 bits mode */ -int __kprobes can_boost(kprobe_opcode_t *opcodes) +int can_boost(kprobe_opcode_t *opcodes) { kprobe_opcode_t opcode; kprobe_opcode_t *orig_opcodes = opcodes; @@ -260,7 +264,7 @@ unsigned long recover_probed_instruction(kprobe_opcode_t *buf, unsigned long add } /* Check if paddr is at an instruction boundary */ -static int __kprobes can_probe(unsigned long paddr) +static int can_probe(unsigned long paddr) { unsigned long addr, __addr, offset = 0; struct insn insn; @@ -299,7 +303,7 @@ static int __kprobes can_probe(unsigned long paddr) /* * Returns non-zero if opcode modifies the interrupt flag. */ -static int __kprobes is_IF_modifier(kprobe_opcode_t *insn) +static int is_IF_modifier(kprobe_opcode_t *insn) { /* Skip prefixes */ insn = skip_prefixes(insn); @@ -322,7 +326,7 @@ static int __kprobes is_IF_modifier(kprobe_opcode_t *insn) * If not, return null. * Only applicable to 64-bit x86. */ -int __kprobes __copy_instruction(u8 *dest, u8 *src) +int __copy_instruction(u8 *dest, u8 *src) { struct insn insn; kprobe_opcode_t buf[MAX_INSN_SIZE]; @@ -365,7 +369,7 @@ int __kprobes __copy_instruction(u8 *dest, u8 *src) return insn.length; } -static int __kprobes arch_copy_kprobe(struct kprobe *p) +static int arch_copy_kprobe(struct kprobe *p) { int ret; @@ -392,7 +396,7 @@ static int __kprobes arch_copy_kprobe(struct kprobe *p) return 0; } -int __kprobes arch_prepare_kprobe(struct kprobe *p) +int arch_prepare_kprobe(struct kprobe *p) { if (alternatives_text_reserved(p->addr, p->addr)) return -EINVAL; @@ -407,17 +411,17 @@ int __kprobes arch_prepare_kprobe(struct kprobe *p) return arch_copy_kprobe(p); } -void __kprobes arch_arm_kprobe(struct kprobe *p) +void arch_arm_kprobe(struct kprobe *p) { text_poke(p->addr, ((unsigned char []){BREAKPOINT_INSTRUCTION}), 1); } -void __kprobes arch_disarm_kprobe(struct kprobe *p) +void arch_disarm_kprobe(struct kprobe *p) { text_poke(p->addr, &p->opcode, 1); } -void __kprobes arch_remove_kprobe(struct kprobe *p) +void arch_remove_kprobe(struct kprobe *p) { if (p->ainsn.insn) { free_insn_slot(p->ainsn.insn, (p->ainsn.boostable == 1)); @@ -425,7 +429,8 @@ void __kprobes arch_remove_kprobe(struct kprobe *p) } } -static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb) +static nokprobe_inline void +save_previous_kprobe(struct kprobe_ctlblk *kcb) { kcb->prev_kprobe.kp = kprobe_running(); kcb->prev_kprobe.status = kcb->kprobe_status; @@ -433,7 +438,8 @@ static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb) kcb->prev_kprobe.saved_flags = kcb->kprobe_saved_flags; } -static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb) +static nokprobe_inline void +restore_previous_kprobe(struct kprobe_ctlblk *kcb) { __this_cpu_write(current_kprobe, kcb->prev_kprobe.kp); kcb->kprobe_status = kcb->prev_kprobe.status; @@ -441,8 +447,9 @@ static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb) kcb->kprobe_saved_flags = kcb->prev_kprobe.saved_flags; } -static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs, - struct kprobe_ctlblk *kcb) +static nokprobe_inline void +set_current_kprobe(struct kprobe *p, struct pt_regs *regs, + struct kprobe_ctlblk *kcb) { __this_cpu_write(current_kprobe, p); kcb->kprobe_saved_flags = kcb->kprobe_old_flags @@ -451,7 +458,7 @@ static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs, kcb->kprobe_saved_flags &= ~X86_EFLAGS_IF; } -static void __kprobes clear_btf(void) +static nokprobe_inline void clear_btf(void) { if (test_thread_flag(TIF_BLOCKSTEP)) { unsigned long debugctl = get_debugctlmsr(); @@ -461,7 +468,7 @@ static void __kprobes clear_btf(void) } } -static void __kprobes restore_btf(void) +static nokprobe_inline void restore_btf(void) { if (test_thread_flag(TIF_BLOCKSTEP)) { unsigned long debugctl = get_debugctlmsr(); @@ -471,8 +478,7 @@ static void __kprobes restore_btf(void) } } -void __kprobes -arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs) +void arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs) { unsigned long *sara = stack_addr(regs); @@ -481,9 +487,10 @@ arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs) /* Replace the return addr with trampoline addr */ *sara = (unsigned long) &kretprobe_trampoline; } +NOKPROBE_SYMBOL(arch_prepare_kretprobe); -static void __kprobes -setup_singlestep(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb, int reenter) +static void setup_singlestep(struct kprobe *p, struct pt_regs *regs, + struct kprobe_ctlblk *kcb, int reenter) { if (setup_detour_execution(p, regs, reenter)) return; @@ -519,22 +526,24 @@ setup_singlestep(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *k else regs->ip = (unsigned long)p->ainsn.insn; } +NOKPROBE_SYMBOL(setup_singlestep); /* * We have reentered the kprobe_handler(), since another probe was hit while * within the handler. We save the original kprobes variables and just single * step on the instruction of the new probe without calling any user handlers. */ -static int __kprobes -reenter_kprobe(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb) +static int reenter_kprobe(struct kprobe *p, struct pt_regs *regs, + struct kprobe_ctlblk *kcb) { switch (kcb->kprobe_status) { case KPROBE_HIT_SSDONE: case KPROBE_HIT_ACTIVE: + case KPROBE_HIT_SS: kprobes_inc_nmissed_count(p); setup_singlestep(p, regs, kcb, 1); break; - case KPROBE_HIT_SS: + case KPROBE_REENTER: /* A probe has been hit in the codepath leading up to, or just * after, single-stepping of a probed instruction. This entire * codepath should strictly reside in .kprobes.text section. @@ -553,12 +562,13 @@ reenter_kprobe(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb return 1; } +NOKPROBE_SYMBOL(reenter_kprobe); /* * Interrupts are disabled on entry as trap3 is an interrupt gate and they * remain disabled throughout this function. */ -static int __kprobes kprobe_handler(struct pt_regs *regs) +int kprobe_int3_handler(struct pt_regs *regs) { kprobe_opcode_t *addr; struct kprobe *p; @@ -621,12 +631,13 @@ static int __kprobes kprobe_handler(struct pt_regs *regs) preempt_enable_no_resched(); return 0; } +NOKPROBE_SYMBOL(kprobe_int3_handler); /* * When a retprobed function returns, this code saves registers and * calls trampoline_handler() runs, which calls the kretprobe's handler. */ -static void __used __kprobes kretprobe_trampoline_holder(void) +static void __used kretprobe_trampoline_holder(void) { asm volatile ( ".global kretprobe_trampoline\n" @@ -657,11 +668,13 @@ static void __used __kprobes kretprobe_trampoline_holder(void) #endif " ret\n"); } +NOKPROBE_SYMBOL(kretprobe_trampoline_holder); +NOKPROBE_SYMBOL(kretprobe_trampoline); /* * Called from kretprobe_trampoline */ -__visible __used __kprobes void *trampoline_handler(struct pt_regs *regs) +__visible __used void *trampoline_handler(struct pt_regs *regs) { struct kretprobe_instance *ri = NULL; struct hlist_head *head, empty_rp; @@ -747,6 +760,7 @@ __visible __used __kprobes void *trampoline_handler(struct pt_regs *regs) } return (void *)orig_ret_address; } +NOKPROBE_SYMBOL(trampoline_handler); /* * Called after single-stepping. p->addr is the address of the @@ -775,8 +789,8 @@ __visible __used __kprobes void *trampoline_handler(struct pt_regs *regs) * jump instruction after the copied instruction, that jumps to the next * instruction after the probepoint. */ -static void __kprobes -resume_execution(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb) +static void resume_execution(struct kprobe *p, struct pt_regs *regs, + struct kprobe_ctlblk *kcb) { unsigned long *tos = stack_addr(regs); unsigned long copy_ip = (unsigned long)p->ainsn.insn; @@ -851,12 +865,13 @@ resume_execution(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *k no_change: restore_btf(); } +NOKPROBE_SYMBOL(resume_execution); /* * Interrupts are disabled on entry as trap1 is an interrupt gate and they * remain disabled throughout this function. */ -static int __kprobes post_kprobe_handler(struct pt_regs *regs) +int kprobe_debug_handler(struct pt_regs *regs) { struct kprobe *cur = kprobe_running(); struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); @@ -891,8 +906,9 @@ out: return 1; } +NOKPROBE_SYMBOL(kprobe_debug_handler); -int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr) +int kprobe_fault_handler(struct pt_regs *regs, int trapnr) { struct kprobe *cur = kprobe_running(); struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); @@ -949,12 +965,13 @@ int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr) return 0; } +NOKPROBE_SYMBOL(kprobe_fault_handler); /* * Wrapper routine for handling exceptions. */ -int __kprobes -kprobe_exceptions_notify(struct notifier_block *self, unsigned long val, void *data) +int kprobe_exceptions_notify(struct notifier_block *self, unsigned long val, + void *data) { struct die_args *args = data; int ret = NOTIFY_DONE; @@ -962,22 +979,7 @@ kprobe_exceptions_notify(struct notifier_block *self, unsigned long val, void *d if (args->regs && user_mode_vm(args->regs)) return ret; - switch (val) { - case DIE_INT3: - if (kprobe_handler(args->regs)) - ret = NOTIFY_STOP; - break; - case DIE_DEBUG: - if (post_kprobe_handler(args->regs)) { - /* - * Reset the BS bit in dr6 (pointed by args->err) to - * denote completion of processing - */ - (*(unsigned long *)ERR_PTR(args->err)) &= ~DR_STEP; - ret = NOTIFY_STOP; - } - break; - case DIE_GPF: + if (val == DIE_GPF) { /* * To be potentially processing a kprobe fault and to * trust the result from kprobe_running(), we have @@ -986,14 +988,12 @@ kprobe_exceptions_notify(struct notifier_block *self, unsigned long val, void *d if (!preemptible() && kprobe_running() && kprobe_fault_handler(args->regs, args->trapnr)) ret = NOTIFY_STOP; - break; - default: - break; } return ret; } +NOKPROBE_SYMBOL(kprobe_exceptions_notify); -int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs) +int setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs) { struct jprobe *jp = container_of(p, struct jprobe, kp); unsigned long addr; @@ -1017,8 +1017,9 @@ int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs) regs->ip = (unsigned long)(jp->entry); return 1; } +NOKPROBE_SYMBOL(setjmp_pre_handler); -void __kprobes jprobe_return(void) +void jprobe_return(void) { struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); @@ -1034,8 +1035,10 @@ void __kprobes jprobe_return(void) " nop \n"::"b" (kcb->jprobe_saved_sp):"memory"); } +NOKPROBE_SYMBOL(jprobe_return); +NOKPROBE_SYMBOL(jprobe_return_end); -int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) +int longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) { struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); u8 *addr = (u8 *) (regs->ip - 1); @@ -1063,13 +1066,22 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) } return 0; } +NOKPROBE_SYMBOL(longjmp_break_handler); + +bool arch_within_kprobe_blacklist(unsigned long addr) +{ + return (addr >= (unsigned long)__kprobes_text_start && + addr < (unsigned long)__kprobes_text_end) || + (addr >= (unsigned long)__entry_text_start && + addr < (unsigned long)__entry_text_end); +} int __init arch_init_kprobes(void) { return 0; } -int __kprobes arch_trampoline_kprobe(struct kprobe *p) +int arch_trampoline_kprobe(struct kprobe *p) { return 0; } diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c index 23ef5c5..717b02a 100644 --- a/arch/x86/kernel/kprobes/ftrace.c +++ b/arch/x86/kernel/kprobes/ftrace.c @@ -25,8 +25,9 @@ #include "common.h" -static int __skip_singlestep(struct kprobe *p, struct pt_regs *regs, - struct kprobe_ctlblk *kcb) +static nokprobe_inline +int __skip_singlestep(struct kprobe *p, struct pt_regs *regs, + struct kprobe_ctlblk *kcb) { /* * Emulate singlestep (and also recover regs->ip) @@ -41,18 +42,19 @@ static int __skip_singlestep(struct kprobe *p, struct pt_regs *regs, return 1; } -int __kprobes skip_singlestep(struct kprobe *p, struct pt_regs *regs, - struct kprobe_ctlblk *kcb) +int skip_singlestep(struct kprobe *p, struct pt_regs *regs, + struct kprobe_ctlblk *kcb) { if (kprobe_ftrace(p)) return __skip_singlestep(p, regs, kcb); else return 0; } +NOKPROBE_SYMBOL(skip_singlestep); /* Ftrace callback handler for kprobes */ -void __kprobes kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *ops, struct pt_regs *regs) +void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *ops, struct pt_regs *regs) { struct kprobe *p; struct kprobe_ctlblk *kcb; @@ -84,8 +86,9 @@ void __kprobes kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, end: local_irq_restore(flags); } +NOKPROBE_SYMBOL(kprobe_ftrace_handler); -int __kprobes arch_prepare_kprobe_ftrace(struct kprobe *p) +int arch_prepare_kprobe_ftrace(struct kprobe *p) { p->ainsn.insn = NULL; p->ainsn.boostable = -1; diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 898160b..f304773 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -77,7 +77,7 @@ found: } /* Insert a move instruction which sets a pointer to eax/rdi (1st arg). */ -static void __kprobes synthesize_set_arg1(kprobe_opcode_t *addr, unsigned long val) +static void synthesize_set_arg1(kprobe_opcode_t *addr, unsigned long val) { #ifdef CONFIG_X86_64 *addr++ = 0x48; @@ -138,7 +138,8 @@ asm ( #define INT3_SIZE sizeof(kprobe_opcode_t) /* Optimized kprobe call back function: called from optinsn */ -static void __kprobes optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs) +static void +optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs) { struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); unsigned long flags; @@ -168,8 +169,9 @@ static void __kprobes optimized_callback(struct optimized_kprobe *op, struct pt_ } local_irq_restore(flags); } +NOKPROBE_SYMBOL(optimized_callback); -static int __kprobes copy_optimized_instructions(u8 *dest, u8 *src) +static int copy_optimized_instructions(u8 *dest, u8 *src) { int len = 0, ret; @@ -189,7 +191,7 @@ static int __kprobes copy_optimized_instructions(u8 *dest, u8 *src) } /* Check whether insn is indirect jump */ -static int __kprobes insn_is_indirect_jump(struct insn *insn) +static int insn_is_indirect_jump(struct insn *insn) { return ((insn->opcode.bytes[0] == 0xff && (X86_MODRM_REG(insn->modrm.value) & 6) == 4) || /* Jump */ @@ -224,7 +226,7 @@ static int insn_jump_into_range(struct insn *insn, unsigned long start, int len) } /* Decode whole function to ensure any instructions don't jump into target */ -static int __kprobes can_optimize(unsigned long paddr) +static int can_optimize(unsigned long paddr) { unsigned long addr, size = 0, offset = 0; struct insn insn; @@ -275,7 +277,7 @@ static int __kprobes can_optimize(unsigned long paddr) } /* Check optimized_kprobe can actually be optimized. */ -int __kprobes arch_check_optimized_kprobe(struct optimized_kprobe *op) +int arch_check_optimized_kprobe(struct optimized_kprobe *op) { int i; struct kprobe *p; @@ -290,15 +292,15 @@ int __kprobes arch_check_optimized_kprobe(struct optimized_kprobe *op) } /* Check the addr is within the optimized instructions. */ -int __kprobes -arch_within_optimized_kprobe(struct optimized_kprobe *op, unsigned long addr) +int arch_within_optimized_kprobe(struct optimized_kprobe *op, + unsigned long addr) { return ((unsigned long)op->kp.addr <= addr && (unsigned long)op->kp.addr + op->optinsn.size > addr); } /* Free optimized instruction slot */ -static __kprobes +static void __arch_remove_optimized_kprobe(struct optimized_kprobe *op, int dirty) { if (op->optinsn.insn) { @@ -308,7 +310,7 @@ void __arch_remove_optimized_kprobe(struct optimized_kprobe *op, int dirty) } } -void __kprobes arch_remove_optimized_kprobe(struct optimized_kprobe *op) +void arch_remove_optimized_kprobe(struct optimized_kprobe *op) { __arch_remove_optimized_kprobe(op, 1); } @@ -318,7 +320,7 @@ void __kprobes arch_remove_optimized_kprobe(struct optimized_kprobe *op) * Target instructions MUST be relocatable (checked inside) * This is called when new aggr(opt)probe is allocated or reused. */ -int __kprobes arch_prepare_optimized_kprobe(struct optimized_kprobe *op) +int arch_prepare_optimized_kprobe(struct optimized_kprobe *op) { u8 *buf; int ret; @@ -372,7 +374,7 @@ int __kprobes arch_prepare_optimized_kprobe(struct optimized_kprobe *op) * Replace breakpoints (int3) with relative jumps. * Caller must call with locking kprobe_mutex and text_mutex. */ -void __kprobes arch_optimize_kprobes(struct list_head *oplist) +void arch_optimize_kprobes(struct list_head *oplist) { struct optimized_kprobe *op, *tmp; u8 insn_buf[RELATIVEJUMP_SIZE]; @@ -398,7 +400,7 @@ void __kprobes arch_optimize_kprobes(struct list_head *oplist) } /* Replace a relative jump with a breakpoint (int3). */ -void __kprobes arch_unoptimize_kprobe(struct optimized_kprobe *op) +void arch_unoptimize_kprobe(struct optimized_kprobe *op) { u8 insn_buf[RELATIVEJUMP_SIZE]; @@ -424,8 +426,7 @@ extern void arch_unoptimize_kprobes(struct list_head *oplist, } } -int __kprobes -setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter) +int setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter) { struct optimized_kprobe *op; @@ -441,3 +442,4 @@ setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter) } return 0; } +NOKPROBE_SYMBOL(setup_detour_execution); diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 0331cb3..3dd8e2c 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -251,15 +251,16 @@ u32 kvm_read_and_reset_pf_reason(void) return reason; } EXPORT_SYMBOL_GPL(kvm_read_and_reset_pf_reason); +NOKPROBE_SYMBOL(kvm_read_and_reset_pf_reason); -dotraplinkage void __kprobes +dotraplinkage void do_async_page_fault(struct pt_regs *regs, unsigned long error_code) { enum ctx_state prev_state; switch (kvm_read_and_reset_pf_reason()) { default: - do_page_fault(regs, error_code); + trace_do_page_fault(regs, error_code); break; case KVM_PV_REASON_PAGE_NOT_PRESENT: /* page is swapped out by the host. */ @@ -276,6 +277,7 @@ do_async_page_fault(struct pt_regs *regs, unsigned long error_code) break; } } +NOKPROBE_SYMBOL(do_async_page_fault); static void __init paravirt_ops_setup(void) { diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index dcbbaa1..c37886d 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -20,8 +20,6 @@ #include <asm/mmu_context.h> #include <asm/syscalls.h> -int sysctl_ldt16 = 0; - #ifdef CONFIG_SMP static void flush_ldt(void *current_mm) { @@ -231,16 +229,10 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) } } - /* - * On x86-64 we do not support 16-bit segments due to - * IRET leaking the high bits of the kernel stack address. - */ -#ifdef CONFIG_X86_64 - if (!ldt_info.seg_32bit && !sysctl_ldt16) { + if (!IS_ENABLED(CONFIG_X86_16BIT) && !ldt_info.seg_32bit) { error = -EINVAL; goto out_unlock; } -#endif fill_ldt(&ldt, &ldt_info); if (oldmode) diff --git a/arch/x86/kernel/mcount_64.S b/arch/x86/kernel/mcount_64.S new file mode 100644 index 0000000..c050a01 --- /dev/null +++ b/arch/x86/kernel/mcount_64.S @@ -0,0 +1,217 @@ +/* + * linux/arch/x86_64/mcount_64.S + * + * Copyright (C) 2014 Steven Rostedt, Red Hat Inc + */ + +#include <linux/linkage.h> +#include <asm/ptrace.h> +#include <asm/ftrace.h> + + + .code64 + .section .entry.text, "ax" + + +#ifdef CONFIG_FUNCTION_TRACER + +#ifdef CC_USING_FENTRY +# define function_hook __fentry__ +#else +# define function_hook mcount +#endif + +#ifdef CONFIG_DYNAMIC_FTRACE + +ENTRY(function_hook) + retq +END(function_hook) + +/* skip is set if stack has been adjusted */ +.macro ftrace_caller_setup skip=0 + MCOUNT_SAVE_FRAME \skip + + /* Load the ftrace_ops into the 3rd parameter */ + movq function_trace_op(%rip), %rdx + + /* Load ip into the first parameter */ + movq RIP(%rsp), %rdi + subq $MCOUNT_INSN_SIZE, %rdi + /* Load the parent_ip into the second parameter */ +#ifdef CC_USING_FENTRY + movq SS+16(%rsp), %rsi +#else + movq 8(%rbp), %rsi +#endif +.endm + +ENTRY(ftrace_caller) + /* Check if tracing was disabled (quick check) */ + cmpl $0, function_trace_stop + jne ftrace_stub + + ftrace_caller_setup + /* regs go into 4th parameter (but make it NULL) */ + movq $0, %rcx + +GLOBAL(ftrace_call) + call ftrace_stub + + MCOUNT_RESTORE_FRAME +ftrace_return: + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +GLOBAL(ftrace_graph_call) + jmp ftrace_stub +#endif + +GLOBAL(ftrace_stub) + retq +END(ftrace_caller) + +ENTRY(ftrace_regs_caller) + /* Save the current flags before compare (in SS location)*/ + pushfq + + /* Check if tracing was disabled (quick check) */ + cmpl $0, function_trace_stop + jne ftrace_restore_flags + + /* skip=8 to skip flags saved in SS */ + ftrace_caller_setup 8 + + /* Save the rest of pt_regs */ + movq %r15, R15(%rsp) + movq %r14, R14(%rsp) + movq %r13, R13(%rsp) + movq %r12, R12(%rsp) + movq %r11, R11(%rsp) + movq %r10, R10(%rsp) + movq %rbp, RBP(%rsp) + movq %rbx, RBX(%rsp) + /* Copy saved flags */ + movq SS(%rsp), %rcx + movq %rcx, EFLAGS(%rsp) + /* Kernel segments */ + movq $__KERNEL_DS, %rcx + movq %rcx, SS(%rsp) + movq $__KERNEL_CS, %rcx + movq %rcx, CS(%rsp) + /* Stack - skipping return address */ + leaq SS+16(%rsp), %rcx + movq %rcx, RSP(%rsp) + + /* regs go into 4th parameter */ + leaq (%rsp), %rcx + +GLOBAL(ftrace_regs_call) + call ftrace_stub + + /* Copy flags back to SS, to restore them */ + movq EFLAGS(%rsp), %rax + movq %rax, SS(%rsp) + + /* Handlers can change the RIP */ + movq RIP(%rsp), %rax + movq %rax, SS+8(%rsp) + + /* restore the rest of pt_regs */ + movq R15(%rsp), %r15 + movq R14(%rsp), %r14 + movq R13(%rsp), %r13 + movq R12(%rsp), %r12 + movq R10(%rsp), %r10 + movq RBP(%rsp), %rbp + movq RBX(%rsp), %rbx + + /* skip=8 to skip flags saved in SS */ + MCOUNT_RESTORE_FRAME 8 + + /* Restore flags */ + popfq + + jmp ftrace_return +ftrace_restore_flags: + popfq + jmp ftrace_stub + +END(ftrace_regs_caller) + + +#else /* ! CONFIG_DYNAMIC_FTRACE */ + +ENTRY(function_hook) + cmpl $0, function_trace_stop + jne ftrace_stub + + cmpq $ftrace_stub, ftrace_trace_function + jnz trace + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + cmpq $ftrace_stub, ftrace_graph_return + jnz ftrace_graph_caller + + cmpq $ftrace_graph_entry_stub, ftrace_graph_entry + jnz ftrace_graph_caller +#endif + +GLOBAL(ftrace_stub) + retq + +trace: + MCOUNT_SAVE_FRAME + + movq RIP(%rsp), %rdi +#ifdef CC_USING_FENTRY + movq SS+16(%rsp), %rsi +#else + movq 8(%rbp), %rsi +#endif + subq $MCOUNT_INSN_SIZE, %rdi + + call *ftrace_trace_function + + MCOUNT_RESTORE_FRAME + + jmp ftrace_stub +END(function_hook) +#endif /* CONFIG_DYNAMIC_FTRACE */ +#endif /* CONFIG_FUNCTION_TRACER */ + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +ENTRY(ftrace_graph_caller) + MCOUNT_SAVE_FRAME + +#ifdef CC_USING_FENTRY + leaq SS+16(%rsp), %rdi + movq $0, %rdx /* No framepointers needed */ +#else + leaq 8(%rbp), %rdi + movq (%rbp), %rdx +#endif + movq RIP(%rsp), %rsi + subq $MCOUNT_INSN_SIZE, %rsi + + call prepare_ftrace_return + + MCOUNT_RESTORE_FRAME + + retq +END(ftrace_graph_caller) + +GLOBAL(return_to_handler) + subq $24, %rsp + + /* Save the return values */ + movq %rax, (%rsp) + movq %rdx, 8(%rsp) + movq %rbp, %rdi + + call ftrace_return_to_handler + + movq %rax, %rdi + movq 8(%rsp), %rdx + movq (%rsp), %rax + addq $24, %rsp + jmp *%rdi +#endif diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index b4872b9..c3e985d 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -110,7 +110,7 @@ static void nmi_max_handler(struct irq_work *w) a->handler, whole_msecs, decimal_msecs); } -static int __kprobes nmi_handle(unsigned int type, struct pt_regs *regs, bool b2b) +static int nmi_handle(unsigned int type, struct pt_regs *regs, bool b2b) { struct nmi_desc *desc = nmi_to_desc(type); struct nmiaction *a; @@ -146,6 +146,7 @@ static int __kprobes nmi_handle(unsigned int type, struct pt_regs *regs, bool b2 /* return total number of NMI events handled */ return handled; } +NOKPROBE_SYMBOL(nmi_handle); int __register_nmi_handler(unsigned int type, struct nmiaction *action) { @@ -208,7 +209,7 @@ void unregister_nmi_handler(unsigned int type, const char *name) } EXPORT_SYMBOL_GPL(unregister_nmi_handler); -static __kprobes void +static void pci_serr_error(unsigned char reason, struct pt_regs *regs) { /* check to see if anyone registered against these types of errors */ @@ -238,8 +239,9 @@ pci_serr_error(unsigned char reason, struct pt_regs *regs) reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_SERR; outb(reason, NMI_REASON_PORT); } +NOKPROBE_SYMBOL(pci_serr_error); -static __kprobes void +static void io_check_error(unsigned char reason, struct pt_regs *regs) { unsigned long i; @@ -269,8 +271,9 @@ io_check_error(unsigned char reason, struct pt_regs *regs) reason &= ~NMI_REASON_CLEAR_IOCHK; outb(reason, NMI_REASON_PORT); } +NOKPROBE_SYMBOL(io_check_error); -static __kprobes void +static void unknown_nmi_error(unsigned char reason, struct pt_regs *regs) { int handled; @@ -298,11 +301,12 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs) pr_emerg("Dazed and confused, but trying to continue\n"); } +NOKPROBE_SYMBOL(unknown_nmi_error); static DEFINE_PER_CPU(bool, swallow_nmi); static DEFINE_PER_CPU(unsigned long, last_nmi_rip); -static __kprobes void default_do_nmi(struct pt_regs *regs) +static void default_do_nmi(struct pt_regs *regs) { unsigned char reason = 0; int handled; @@ -401,6 +405,7 @@ static __kprobes void default_do_nmi(struct pt_regs *regs) else unknown_nmi_error(reason, regs); } +NOKPROBE_SYMBOL(default_do_nmi); /* * NMIs can hit breakpoints which will cause it to lose its @@ -520,7 +525,7 @@ static inline void nmi_nesting_postprocess(void) } #endif -dotraplinkage notrace __kprobes void +dotraplinkage notrace void do_nmi(struct pt_regs *regs, long error_code) { nmi_nesting_preprocess(regs); @@ -537,6 +542,7 @@ do_nmi(struct pt_regs *regs, long error_code) /* On i386, may loop back to preprocess */ nmi_nesting_postprocess(); } +NOKPROBE_SYMBOL(do_nmi); void stop_nmi(void) { diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 1b10af8..548d25f 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -23,6 +23,7 @@ #include <linux/efi.h> #include <linux/bcd.h> #include <linux/highmem.h> +#include <linux/kprobes.h> #include <asm/bug.h> #include <asm/paravirt.h> @@ -389,6 +390,11 @@ __visible struct pv_cpu_ops pv_cpu_ops = { .end_context_switch = paravirt_nop, }; +/* At this point, native_get/set_debugreg has real function entries */ +NOKPROBE_SYMBOL(native_get_debugreg); +NOKPROBE_SYMBOL(native_set_debugreg); +NOKPROBE_SYMBOL(native_load_idt); + struct pv_apic_ops pv_apic_ops = { #ifdef CONFIG_X86_LOCAL_APIC .startup_ipi_hook = paravirt_nop, diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index f7d0672..a25e202 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -97,12 +97,17 @@ void *dma_generic_alloc_coherent(struct device *dev, size_t size, dma_mask = dma_alloc_coherent_mask(dev, flag); - flag |= __GFP_ZERO; + flag &= ~__GFP_ZERO; again: page = NULL; /* CMA can be used only in the context which permits sleeping */ - if (flag & __GFP_WAIT) + if (flag & __GFP_WAIT) { page = dma_alloc_from_contiguous(dev, count, get_order(size)); + if (page && page_to_phys(page) + size > dma_mask) { + dma_release_from_contiguous(dev, page, count); + page = NULL; + } + } /* fallback */ if (!page) page = alloc_pages_node(dev_to_node(dev), flag, get_order(size)); @@ -120,7 +125,7 @@ again: return NULL; } - + memset(page_address(page), 0, size); *dma_addr = addr; return page_address(page); } diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index 6c483ba..77dd0ad 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c @@ -14,7 +14,7 @@ #include <asm/iommu_table.h> int swiotlb __read_mostly; -static void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size, +void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size, dma_addr_t *dma_handle, gfp_t flags, struct dma_attrs *attrs) { @@ -28,11 +28,14 @@ static void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size, return swiotlb_alloc_coherent(hwdev, size, dma_handle, flags); } -static void x86_swiotlb_free_coherent(struct device *dev, size_t size, +void x86_swiotlb_free_coherent(struct device *dev, size_t size, void *vaddr, dma_addr_t dma_addr, struct dma_attrs *attrs) { - swiotlb_free_coherent(dev, size, vaddr, dma_addr); + if (is_swiotlb_buffer(dma_to_phys(dev, dma_addr))) + swiotlb_free_coherent(dev, size, vaddr, dma_addr); + else + dma_generic_free_coherent(dev, size, vaddr, dma_addr, attrs); } static struct dma_map_ops swiotlb_dma_ops = { diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 898d077..ca5b02d 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -413,12 +413,11 @@ void set_personality_ia32(bool x32) set_thread_flag(TIF_ADDR32); /* Mark the associated mm as containing 32-bit tasks. */ - if (current->mm) - current->mm->context.ia32_compat = 1; - if (x32) { clear_thread_flag(TIF_IA32); set_thread_flag(TIF_X32); + if (current->mm) + current->mm->context.ia32_compat = TIF_X32; current->personality &= ~READ_IMPLIES_EXEC; /* is_compat_task() uses the presence of the x32 syscall bit flag to determine compat status */ @@ -426,6 +425,8 @@ void set_personality_ia32(bool x32) } else { set_thread_flag(TIF_IA32); clear_thread_flag(TIF_X32); + if (current->mm) + current->mm->context.ia32_compat = TIF_IA32; current->personality |= force_personality32; /* Prepare the first "return" to user space */ current_thread_info()->status |= TS_COMPAT; diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 09c76d2..78a0e62 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1119,7 +1119,7 @@ void __init setup_arch(char **cmdline_p) setup_real_mode(); memblock_set_current_limit(get_max_mapped()); - dma_contiguous_reserve(0); + dma_contiguous_reserve(max_pfn_mapped << PAGE_SHIFT); /* * NOTE: On x86-32, only from this point on, fixmaps are ready for use. diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 9e5de68..a0da58d 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -298,7 +298,8 @@ __setup_frame(int sig, struct ksignal *ksig, sigset_t *set, } if (current->mm->context.vdso) - restorer = VDSO32_SYMBOL(current->mm->context.vdso, sigreturn); + restorer = current->mm->context.vdso + + selected_vdso32->sym___kernel_sigreturn; else restorer = &frame->retcode; if (ksig->ka.sa.sa_flags & SA_RESTORER) @@ -361,7 +362,8 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, save_altstack_ex(&frame->uc.uc_stack, regs->sp); /* Set up to return from userspace. */ - restorer = VDSO32_SYMBOL(current->mm->context.vdso, rt_sigreturn); + restorer = current->mm->context.vdso + + selected_vdso32->sym___kernel_sigreturn; if (ksig->ka.sa.sa_flags & SA_RESTORER) restorer = ksig->ka.sa.sa_restorer; put_user_ex(restorer, &frame->pretcode); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 3482693..5492798 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -244,6 +244,13 @@ static void notrace start_secondary(void *unused) check_tsc_sync_target(); /* + * Enable the espfix hack for this CPU + */ +#ifdef CONFIG_X86_ESPFIX64 + init_espfix_ap(); +#endif + + /* * We need to hold vector_lock so there the set of online cpus * does not change while we are assigning vectors to cpus. Holding * this lock ensures we don't half assign or remove an irq from a cpu. @@ -859,9 +866,6 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle) /* was set by cpu_init() */ cpumask_clear_cpu(cpu, cpu_initialized_mask); - - set_cpu_present(cpu, false); - per_cpu(x86_cpu_to_apicid, cpu) = BAD_APICID; } /* mark "stuck" area as not stuck */ @@ -921,7 +925,7 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle) err = do_boot_cpu(apicid, cpu, tidle); if (err) { - pr_debug("do_boot_cpu failed %d\n", err); + pr_err("do_boot_cpu failed(%d) to wakeup CPU#%u\n", err, cpu); return -EIO; } diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index f73b5d4..c6eb418 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -23,6 +23,7 @@ #include <linux/kernel.h> #include <linux/module.h> #include <linux/ptrace.h> +#include <linux/uprobes.h> #include <linux/string.h> #include <linux/delay.h> #include <linux/errno.h> @@ -106,7 +107,7 @@ static inline void preempt_conditional_cli(struct pt_regs *regs) preempt_count_dec(); } -static int __kprobes +static nokprobe_inline int do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str, struct pt_regs *regs, long error_code) { @@ -136,7 +137,38 @@ do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str, return -1; } -static void __kprobes +static siginfo_t *fill_trap_info(struct pt_regs *regs, int signr, int trapnr, + siginfo_t *info) +{ + unsigned long siaddr; + int sicode; + + switch (trapnr) { + default: + return SEND_SIG_PRIV; + + case X86_TRAP_DE: + sicode = FPE_INTDIV; + siaddr = uprobe_get_trap_addr(regs); + break; + case X86_TRAP_UD: + sicode = ILL_ILLOPN; + siaddr = uprobe_get_trap_addr(regs); + break; + case X86_TRAP_AC: + sicode = BUS_ADRALN; + siaddr = 0; + break; + } + + info->si_signo = signr; + info->si_errno = 0; + info->si_code = sicode; + info->si_addr = (void __user *)siaddr; + return info; +} + +static void do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, long error_code, siginfo_t *info) { @@ -168,60 +200,43 @@ do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, } #endif - if (info) - force_sig_info(signr, info, tsk); - else - force_sig(signr, tsk); + force_sig_info(signr, info ?: SEND_SIG_PRIV, tsk); } +NOKPROBE_SYMBOL(do_trap); -#define DO_ERROR(trapnr, signr, str, name) \ -dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ -{ \ - enum ctx_state prev_state; \ - \ - prev_state = exception_enter(); \ - if (notify_die(DIE_TRAP, str, regs, error_code, \ - trapnr, signr) == NOTIFY_STOP) { \ - exception_exit(prev_state); \ - return; \ - } \ - conditional_sti(regs); \ - do_trap(trapnr, signr, str, regs, error_code, NULL); \ - exception_exit(prev_state); \ +static void do_error_trap(struct pt_regs *regs, long error_code, char *str, + unsigned long trapnr, int signr) +{ + enum ctx_state prev_state = exception_enter(); + siginfo_t info; + + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) != + NOTIFY_STOP) { + conditional_sti(regs); + do_trap(trapnr, signr, str, regs, error_code, + fill_trap_info(regs, signr, trapnr, &info)); + } + + exception_exit(prev_state); } -#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ +#define DO_ERROR(trapnr, signr, str, name) \ dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ { \ - siginfo_t info; \ - enum ctx_state prev_state; \ - \ - info.si_signo = signr; \ - info.si_errno = 0; \ - info.si_code = sicode; \ - info.si_addr = (void __user *)siaddr; \ - prev_state = exception_enter(); \ - if (notify_die(DIE_TRAP, str, regs, error_code, \ - trapnr, signr) == NOTIFY_STOP) { \ - exception_exit(prev_state); \ - return; \ - } \ - conditional_sti(regs); \ - do_trap(trapnr, signr, str, regs, error_code, &info); \ - exception_exit(prev_state); \ + do_error_trap(regs, error_code, str, trapnr, signr); \ } -DO_ERROR_INFO(X86_TRAP_DE, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip ) -DO_ERROR (X86_TRAP_OF, SIGSEGV, "overflow", overflow ) -DO_ERROR (X86_TRAP_BR, SIGSEGV, "bounds", bounds ) -DO_ERROR_INFO(X86_TRAP_UD, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip ) -DO_ERROR (X86_TRAP_OLD_MF, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun ) -DO_ERROR (X86_TRAP_TS, SIGSEGV, "invalid TSS", invalid_TSS ) -DO_ERROR (X86_TRAP_NP, SIGBUS, "segment not present", segment_not_present ) +DO_ERROR(X86_TRAP_DE, SIGFPE, "divide error", divide_error) +DO_ERROR(X86_TRAP_OF, SIGSEGV, "overflow", overflow) +DO_ERROR(X86_TRAP_BR, SIGSEGV, "bounds", bounds) +DO_ERROR(X86_TRAP_UD, SIGILL, "invalid opcode", invalid_op) +DO_ERROR(X86_TRAP_OLD_MF, SIGFPE, "coprocessor segment overrun",coprocessor_segment_overrun) +DO_ERROR(X86_TRAP_TS, SIGSEGV, "invalid TSS", invalid_TSS) +DO_ERROR(X86_TRAP_NP, SIGBUS, "segment not present", segment_not_present) #ifdef CONFIG_X86_32 -DO_ERROR (X86_TRAP_SS, SIGBUS, "stack segment", stack_segment ) +DO_ERROR(X86_TRAP_SS, SIGBUS, "stack segment", stack_segment) #endif -DO_ERROR_INFO(X86_TRAP_AC, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0 ) +DO_ERROR(X86_TRAP_AC, SIGBUS, "alignment check", alignment_check) #ifdef CONFIG_X86_64 /* Runs on IST stack */ @@ -263,7 +278,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) } #endif -dotraplinkage void __kprobes +dotraplinkage void do_general_protection(struct pt_regs *regs, long error_code) { struct task_struct *tsk; @@ -305,13 +320,14 @@ do_general_protection(struct pt_regs *regs, long error_code) pr_cont("\n"); } - force_sig(SIGSEGV, tsk); + force_sig_info(SIGSEGV, SEND_SIG_PRIV, tsk); exit: exception_exit(prev_state); } +NOKPROBE_SYMBOL(do_general_protection); /* May run on IST stack. */ -dotraplinkage void __kprobes notrace do_int3(struct pt_regs *regs, long error_code) +dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code) { enum ctx_state prev_state; @@ -327,13 +343,18 @@ dotraplinkage void __kprobes notrace do_int3(struct pt_regs *regs, long error_co if (poke_int3_handler(regs)) return; - prev_state = exception_enter(); #ifdef CONFIG_KGDB_LOW_LEVEL_TRAP if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP, SIGTRAP) == NOTIFY_STOP) goto exit; #endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */ +#ifdef CONFIG_KPROBES + if (kprobe_int3_handler(regs)) + return; +#endif + prev_state = exception_enter(); + if (notify_die(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP, SIGTRAP) == NOTIFY_STOP) goto exit; @@ -350,6 +371,7 @@ dotraplinkage void __kprobes notrace do_int3(struct pt_regs *regs, long error_co exit: exception_exit(prev_state); } +NOKPROBE_SYMBOL(do_int3); #ifdef CONFIG_X86_64 /* @@ -357,7 +379,7 @@ exit: * for scheduling or signal handling. The actual stack switch is done in * entry.S */ -asmlinkage __visible __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs) +asmlinkage __visible struct pt_regs *sync_regs(struct pt_regs *eregs) { struct pt_regs *regs = eregs; /* Did already sync */ @@ -376,6 +398,7 @@ asmlinkage __visible __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs) *regs = *eregs; return regs; } +NOKPROBE_SYMBOL(sync_regs); #endif /* @@ -402,7 +425,7 @@ asmlinkage __visible __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs) * * May run on IST stack. */ -dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) +dotraplinkage void do_debug(struct pt_regs *regs, long error_code) { struct task_struct *tsk = current; enum ctx_state prev_state; @@ -410,8 +433,6 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) unsigned long dr6; int si_code; - prev_state = exception_enter(); - get_debugreg(dr6, 6); /* Filter out all the reserved bits which are preset to 1 */ @@ -440,6 +461,12 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) /* Store the virtualized DR6 value */ tsk->thread.debugreg6 = dr6; +#ifdef CONFIG_KPROBES + if (kprobe_debug_handler(regs)) + goto exit; +#endif + prev_state = exception_enter(); + if (notify_die(DIE_DEBUG, "debug", regs, (long)&dr6, error_code, SIGTRAP) == NOTIFY_STOP) goto exit; @@ -482,13 +509,14 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) exit: exception_exit(prev_state); } +NOKPROBE_SYMBOL(do_debug); /* * Note that we play around with the 'TS' bit in an attempt to get * the correct behaviour even in the presence of the asynchronous * IRQ13 behaviour */ -void math_error(struct pt_regs *regs, int error_code, int trapnr) +static void math_error(struct pt_regs *regs, int error_code, int trapnr) { struct task_struct *task = current; siginfo_t info; @@ -518,7 +546,7 @@ void math_error(struct pt_regs *regs, int error_code, int trapnr) task->thread.error_code = error_code; info.si_signo = SIGFPE; info.si_errno = 0; - info.si_addr = (void __user *)regs->ip; + info.si_addr = (void __user *)uprobe_get_trap_addr(regs); if (trapnr == X86_TRAP_MF) { unsigned short cwd, swd; /* @@ -645,7 +673,7 @@ void math_state_restore(void) */ if (unlikely(restore_fpu_checking(tsk))) { drop_init_fpu(tsk); - force_sig(SIGSEGV, tsk); + force_sig_info(SIGSEGV, SEND_SIG_PRIV, tsk); return; } @@ -653,7 +681,7 @@ void math_state_restore(void) } EXPORT_SYMBOL_GPL(math_state_restore); -dotraplinkage void __kprobes +dotraplinkage void do_device_not_available(struct pt_regs *regs, long error_code) { enum ctx_state prev_state; @@ -679,6 +707,7 @@ do_device_not_available(struct pt_regs *regs, long error_code) #endif exception_exit(prev_state); } +NOKPROBE_SYMBOL(do_device_not_available); #ifdef CONFIG_X86_32 dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 2ed8459..5d1cbfe 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -32,20 +32,20 @@ /* Post-execution fixups. */ -/* No fixup needed */ -#define UPROBE_FIX_NONE 0x0 - /* Adjust IP back to vicinity of actual insn */ -#define UPROBE_FIX_IP 0x1 +#define UPROBE_FIX_IP 0x01 /* Adjust the return address of a call insn */ -#define UPROBE_FIX_CALL 0x2 +#define UPROBE_FIX_CALL 0x02 /* Instruction will modify TF, don't change it */ -#define UPROBE_FIX_SETF 0x4 +#define UPROBE_FIX_SETF 0x04 -#define UPROBE_FIX_RIP_AX 0x8000 -#define UPROBE_FIX_RIP_CX 0x4000 +#define UPROBE_FIX_RIP_SI 0x08 +#define UPROBE_FIX_RIP_DI 0x10 +#define UPROBE_FIX_RIP_BX 0x20 +#define UPROBE_FIX_RIP_MASK \ + (UPROBE_FIX_RIP_SI | UPROBE_FIX_RIP_DI | UPROBE_FIX_RIP_BX) #define UPROBE_TRAP_NR UINT_MAX @@ -53,7 +53,7 @@ #define OPCODE1(insn) ((insn)->opcode.bytes[0]) #define OPCODE2(insn) ((insn)->opcode.bytes[1]) #define OPCODE3(insn) ((insn)->opcode.bytes[2]) -#define MODRM_REG(insn) X86_MODRM_REG(insn->modrm.value) +#define MODRM_REG(insn) X86_MODRM_REG((insn)->modrm.value) #define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\ (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \ @@ -67,6 +67,7 @@ * to keep gcc from statically optimizing it out, as variable_test_bit makes * some versions of gcc to think only *(unsigned long*) is used. */ +#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) static volatile u32 good_insns_32[256 / 32] = { /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ /* ---------------------------------------------- */ @@ -89,33 +90,12 @@ static volatile u32 good_insns_32[256 / 32] = { /* ---------------------------------------------- */ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ }; +#else +#define good_insns_32 NULL +#endif -/* Using this for both 64-bit and 32-bit apps */ -static volatile u32 good_2byte_insns[256 / 32] = { - /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ - /* ---------------------------------------------- */ - W(0x00, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1) | /* 00 */ - W(0x10, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1) , /* 10 */ - W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* 20 */ - W(0x30, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 30 */ - W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */ - W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */ - W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 60 */ - W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1) , /* 70 */ - W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */ - W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */ - W(0xa0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1) | /* a0 */ - W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1) , /* b0 */ - W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* c0 */ - W(0xd0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */ - W(0xe0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* e0 */ - W(0xf0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0) /* f0 */ - /* ---------------------------------------------- */ - /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ -}; - -#ifdef CONFIG_X86_64 /* Good-instruction tables for 64-bit apps */ +#if defined(CONFIG_X86_64) static volatile u32 good_insns_64[256 / 32] = { /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ /* ---------------------------------------------- */ @@ -138,7 +118,33 @@ static volatile u32 good_insns_64[256 / 32] = { /* ---------------------------------------------- */ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ }; +#else +#define good_insns_64 NULL #endif + +/* Using this for both 64-bit and 32-bit apps */ +static volatile u32 good_2byte_insns[256 / 32] = { + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ + /* ---------------------------------------------- */ + W(0x00, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1) | /* 00 */ + W(0x10, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1) , /* 10 */ + W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* 20 */ + W(0x30, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 30 */ + W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */ + W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */ + W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 60 */ + W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1) , /* 70 */ + W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */ + W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */ + W(0xa0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1) | /* a0 */ + W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1) , /* b0 */ + W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* c0 */ + W(0xd0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */ + W(0xe0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* e0 */ + W(0xf0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0) /* f0 */ + /* ---------------------------------------------- */ + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ +}; #undef W /* @@ -209,16 +215,25 @@ static bool is_prefix_bad(struct insn *insn) return false; } -static int validate_insn_32bits(struct arch_uprobe *auprobe, struct insn *insn) +static int uprobe_init_insn(struct arch_uprobe *auprobe, struct insn *insn, bool x86_64) { - insn_init(insn, auprobe->insn, false); + u32 volatile *good_insns; + + insn_init(insn, auprobe->insn, x86_64); + /* has the side-effect of processing the entire instruction */ + insn_get_length(insn); + if (WARN_ON_ONCE(!insn_complete(insn))) + return -ENOEXEC; - /* Skip good instruction prefixes; reject "bad" ones. */ - insn_get_opcode(insn); if (is_prefix_bad(insn)) return -ENOTSUPP; - if (test_bit(OPCODE1(insn), (unsigned long *)good_insns_32)) + if (x86_64) + good_insns = good_insns_64; + else + good_insns = good_insns_32; + + if (test_bit(OPCODE1(insn), (unsigned long *)good_insns)) return 0; if (insn->opcode.nbytes == 2) { @@ -229,72 +244,19 @@ static int validate_insn_32bits(struct arch_uprobe *auprobe, struct insn *insn) return -ENOTSUPP; } -/* - * Figure out which fixups arch_uprobe_post_xol() will need to perform, and - * annotate arch_uprobe->fixups accordingly. To start with, - * arch_uprobe->fixups is either zero or it reflects rip-related fixups. - */ -static void prepare_fixups(struct arch_uprobe *auprobe, struct insn *insn) +#ifdef CONFIG_X86_64 +static inline bool is_64bit_mm(struct mm_struct *mm) { - bool fix_ip = true, fix_call = false; /* defaults */ - int reg; - - insn_get_opcode(insn); /* should be a nop */ - - switch (OPCODE1(insn)) { - case 0x9d: - /* popf */ - auprobe->fixups |= UPROBE_FIX_SETF; - break; - case 0xc3: /* ret/lret */ - case 0xcb: - case 0xc2: - case 0xca: - /* ip is correct */ - fix_ip = false; - break; - case 0xe8: /* call relative - Fix return addr */ - fix_call = true; - break; - case 0x9a: /* call absolute - Fix return addr, not ip */ - fix_call = true; - fix_ip = false; - break; - case 0xff: - insn_get_modrm(insn); - reg = MODRM_REG(insn); - if (reg == 2 || reg == 3) { - /* call or lcall, indirect */ - /* Fix return addr; ip is correct. */ - fix_call = true; - fix_ip = false; - } else if (reg == 4 || reg == 5) { - /* jmp or ljmp, indirect */ - /* ip is correct. */ - fix_ip = false; - } - break; - case 0xea: /* jmp absolute -- ip is correct */ - fix_ip = false; - break; - default: - break; - } - if (fix_ip) - auprobe->fixups |= UPROBE_FIX_IP; - if (fix_call) - auprobe->fixups |= UPROBE_FIX_CALL; + return !config_enabled(CONFIG_IA32_EMULATION) || + !(mm->context.ia32_compat == TIF_IA32); } - -#ifdef CONFIG_X86_64 /* * If arch_uprobe->insn doesn't use rip-relative addressing, return * immediately. Otherwise, rewrite the instruction so that it accesses * its memory operand indirectly through a scratch register. Set - * arch_uprobe->fixups and arch_uprobe->rip_rela_target_address - * accordingly. (The contents of the scratch register will be saved - * before we single-step the modified instruction, and restored - * afterward.) + * defparam->fixups accordingly. (The contents of the scratch register + * will be saved before we single-step the modified instruction, + * and restored afterward). * * We do this because a rip-relative instruction can access only a * relatively small area (+/- 2 GB from the instruction), and the XOL @@ -305,248 +267,513 @@ static void prepare_fixups(struct arch_uprobe *auprobe, struct insn *insn) * * Some useful facts about rip-relative instructions: * - * - There's always a modrm byte. + * - There's always a modrm byte with bit layout "00 reg 101". * - There's never a SIB byte. * - The displacement is always 4 bytes. + * - REX.B=1 bit in REX prefix, which normally extends r/m field, + * has no effect on rip-relative mode. It doesn't make modrm byte + * with r/m=101 refer to register 1101 = R13. */ -static void -handle_riprel_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, struct insn *insn) +static void riprel_analyze(struct arch_uprobe *auprobe, struct insn *insn) { u8 *cursor; u8 reg; + u8 reg2; - if (mm->context.ia32_compat) - return; - - auprobe->rip_rela_target_address = 0x0; if (!insn_rip_relative(insn)) return; /* - * insn_rip_relative() would have decoded rex_prefix, modrm. + * insn_rip_relative() would have decoded rex_prefix, vex_prefix, modrm. * Clear REX.b bit (extension of MODRM.rm field): - * we want to encode rax/rcx, not r8/r9. + * we want to encode low numbered reg, not r8+. */ if (insn->rex_prefix.nbytes) { cursor = auprobe->insn + insn_offset_rex_prefix(insn); - *cursor &= 0xfe; /* Clearing REX.B bit */ + /* REX byte has 0100wrxb layout, clearing REX.b bit */ + *cursor &= 0xfe; + } + /* + * Similar treatment for VEX3 prefix. + * TODO: add XOP/EVEX treatment when insn decoder supports them + */ + if (insn->vex_prefix.nbytes == 3) { + /* + * vex2: c5 rvvvvLpp (has no b bit) + * vex3/xop: c4/8f rxbmmmmm wvvvvLpp + * evex: 62 rxbR00mm wvvvv1pp zllBVaaa + * (evex will need setting of both b and x since + * in non-sib encoding evex.x is 4th bit of MODRM.rm) + * Setting VEX3.b (setting because it has inverted meaning): + */ + cursor = auprobe->insn + insn_offset_vex_prefix(insn) + 1; + *cursor |= 0x20; } /* + * Convert from rip-relative addressing to register-relative addressing + * via a scratch register. + * + * This is tricky since there are insns with modrm byte + * which also use registers not encoded in modrm byte: + * [i]div/[i]mul: implicitly use dx:ax + * shift ops: implicitly use cx + * cmpxchg: implicitly uses ax + * cmpxchg8/16b: implicitly uses dx:ax and bx:cx + * Encoding: 0f c7/1 modrm + * The code below thinks that reg=1 (cx), chooses si as scratch. + * mulx: implicitly uses dx: mulx r/m,r1,r2 does r1:r2 = dx * r/m. + * First appeared in Haswell (BMI2 insn). It is vex-encoded. + * Example where none of bx,cx,dx can be used as scratch reg: + * c4 e2 63 f6 0d disp32 mulx disp32(%rip),%ebx,%ecx + * [v]pcmpistri: implicitly uses cx, xmm0 + * [v]pcmpistrm: implicitly uses xmm0 + * [v]pcmpestri: implicitly uses ax, dx, cx, xmm0 + * [v]pcmpestrm: implicitly uses ax, dx, xmm0 + * Evil SSE4.2 string comparison ops from hell. + * maskmovq/[v]maskmovdqu: implicitly uses (ds:rdi) as destination. + * Encoding: 0f f7 modrm, 66 0f f7 modrm, vex-encoded: c5 f9 f7 modrm. + * Store op1, byte-masked by op2 msb's in each byte, to (ds:rdi). + * AMD says it has no 3-operand form (vex.vvvv must be 1111) + * and that it can have only register operands, not mem + * (its modrm byte must have mode=11). + * If these restrictions will ever be lifted, + * we'll need code to prevent selection of di as scratch reg! + * + * Summary: I don't know any insns with modrm byte which + * use SI register implicitly. DI register is used only + * by one insn (maskmovq) and BX register is used + * only by one too (cmpxchg8b). + * BP is stack-segment based (may be a problem?). + * AX, DX, CX are off-limits (many implicit users). + * SP is unusable (it's stack pointer - think about "pop mem"; + * also, rsp+disp32 needs sib encoding -> insn length change). + */ + + reg = MODRM_REG(insn); /* Fetch modrm.reg */ + reg2 = 0xff; /* Fetch vex.vvvv */ + if (insn->vex_prefix.nbytes == 2) + reg2 = insn->vex_prefix.bytes[1]; + else if (insn->vex_prefix.nbytes == 3) + reg2 = insn->vex_prefix.bytes[2]; + /* + * TODO: add XOP, EXEV vvvv reading. + * + * vex.vvvv field is in bits 6-3, bits are inverted. + * But in 32-bit mode, high-order bit may be ignored. + * Therefore, let's consider only 3 low-order bits. + */ + reg2 = ((reg2 >> 3) & 0x7) ^ 0x7; + /* + * Register numbering is ax,cx,dx,bx, sp,bp,si,di, r8..r15. + * + * Choose scratch reg. Order is important: must not select bx + * if we can use si (cmpxchg8b case!) + */ + if (reg != 6 && reg2 != 6) { + reg2 = 6; + auprobe->defparam.fixups |= UPROBE_FIX_RIP_SI; + } else if (reg != 7 && reg2 != 7) { + reg2 = 7; + auprobe->defparam.fixups |= UPROBE_FIX_RIP_DI; + /* TODO (paranoia): force maskmovq to not use di */ + } else { + reg2 = 3; + auprobe->defparam.fixups |= UPROBE_FIX_RIP_BX; + } + /* * Point cursor at the modrm byte. The next 4 bytes are the * displacement. Beyond the displacement, for some instructions, * is the immediate operand. */ cursor = auprobe->insn + insn_offset_modrm(insn); - insn_get_length(insn); - /* - * Convert from rip-relative addressing to indirect addressing - * via a scratch register. Change the r/m field from 0x5 (%rip) - * to 0x0 (%rax) or 0x1 (%rcx), and squeeze out the offset field. + * Change modrm from "00 reg 101" to "10 reg reg2". Example: + * 89 05 disp32 mov %eax,disp32(%rip) becomes + * 89 86 disp32 mov %eax,disp32(%rsi) */ - reg = MODRM_REG(insn); - if (reg == 0) { - /* - * The register operand (if any) is either the A register - * (%rax, %eax, etc.) or (if the 0x4 bit is set in the - * REX prefix) %r8. In any case, we know the C register - * is NOT the register operand, so we use %rcx (register - * #1) for the scratch register. - */ - auprobe->fixups = UPROBE_FIX_RIP_CX; - /* Change modrm from 00 000 101 to 00 000 001. */ - *cursor = 0x1; - } else { - /* Use %rax (register #0) for the scratch register. */ - auprobe->fixups = UPROBE_FIX_RIP_AX; - /* Change modrm from 00 xxx 101 to 00 xxx 000 */ - *cursor = (reg << 3); - } - - /* Target address = address of next instruction + (signed) offset */ - auprobe->rip_rela_target_address = (long)insn->length + insn->displacement.value; - - /* Displacement field is gone; slide immediate field (if any) over. */ - if (insn->immediate.nbytes) { - cursor++; - memmove(cursor, cursor + insn->displacement.nbytes, insn->immediate.nbytes); - } - return; + *cursor = 0x80 | (reg << 3) | reg2; } -static int validate_insn_64bits(struct arch_uprobe *auprobe, struct insn *insn) +static inline unsigned long * +scratch_reg(struct arch_uprobe *auprobe, struct pt_regs *regs) { - insn_init(insn, auprobe->insn, true); - - /* Skip good instruction prefixes; reject "bad" ones. */ - insn_get_opcode(insn); - if (is_prefix_bad(insn)) - return -ENOTSUPP; + if (auprobe->defparam.fixups & UPROBE_FIX_RIP_SI) + return ®s->si; + if (auprobe->defparam.fixups & UPROBE_FIX_RIP_DI) + return ®s->di; + return ®s->bx; +} - if (test_bit(OPCODE1(insn), (unsigned long *)good_insns_64)) - return 0; +/* + * If we're emulating a rip-relative instruction, save the contents + * of the scratch register and store the target address in that register. + */ +static void riprel_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + if (auprobe->defparam.fixups & UPROBE_FIX_RIP_MASK) { + struct uprobe_task *utask = current->utask; + unsigned long *sr = scratch_reg(auprobe, regs); - if (insn->opcode.nbytes == 2) { - if (test_bit(OPCODE2(insn), (unsigned long *)good_2byte_insns)) - return 0; + utask->autask.saved_scratch_register = *sr; + *sr = utask->vaddr + auprobe->defparam.ilen; } - return -ENOTSUPP; } -static int validate_insn_bits(struct arch_uprobe *auprobe, struct mm_struct *mm, struct insn *insn) +static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) { - if (mm->context.ia32_compat) - return validate_insn_32bits(auprobe, insn); - return validate_insn_64bits(auprobe, insn); + if (auprobe->defparam.fixups & UPROBE_FIX_RIP_MASK) { + struct uprobe_task *utask = current->utask; + unsigned long *sr = scratch_reg(auprobe, regs); + + *sr = utask->autask.saved_scratch_register; + } } #else /* 32-bit: */ -static void handle_riprel_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, struct insn *insn) +static inline bool is_64bit_mm(struct mm_struct *mm) { - /* No RIP-relative addressing on 32-bit */ + return false; } - -static int validate_insn_bits(struct arch_uprobe *auprobe, struct mm_struct *mm, struct insn *insn) +/* + * No RIP-relative addressing on 32-bit + */ +static void riprel_analyze(struct arch_uprobe *auprobe, struct insn *insn) +{ +} +static void riprel_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ +} +static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) { - return validate_insn_32bits(auprobe, insn); } #endif /* CONFIG_X86_64 */ -/** - * arch_uprobe_analyze_insn - instruction analysis including validity and fixups. - * @mm: the probed address space. - * @arch_uprobe: the probepoint information. - * @addr: virtual address at which to install the probepoint - * Return 0 on success or a -ve number on error. - */ -int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long addr) +struct uprobe_xol_ops { + bool (*emulate)(struct arch_uprobe *, struct pt_regs *); + int (*pre_xol)(struct arch_uprobe *, struct pt_regs *); + int (*post_xol)(struct arch_uprobe *, struct pt_regs *); + void (*abort)(struct arch_uprobe *, struct pt_regs *); +}; + +static inline int sizeof_long(void) { - int ret; - struct insn insn; + return is_ia32_task() ? 4 : 8; +} - auprobe->fixups = 0; - ret = validate_insn_bits(auprobe, mm, &insn); - if (ret != 0) - return ret; +static int default_pre_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + riprel_pre_xol(auprobe, regs); + return 0; +} - handle_riprel_insn(auprobe, mm, &insn); - prepare_fixups(auprobe, &insn); +static int push_ret_address(struct pt_regs *regs, unsigned long ip) +{ + unsigned long new_sp = regs->sp - sizeof_long(); + if (copy_to_user((void __user *)new_sp, &ip, sizeof_long())) + return -EFAULT; + + regs->sp = new_sp; return 0; } -#ifdef CONFIG_X86_64 /* - * If we're emulating a rip-relative instruction, save the contents - * of the scratch register and store the target address in that register. + * We have to fix things up as follows: + * + * Typically, the new ip is relative to the copied instruction. We need + * to make it relative to the original instruction (FIX_IP). Exceptions + * are return instructions and absolute or indirect jump or call instructions. + * + * If the single-stepped instruction was a call, the return address that + * is atop the stack is the address following the copied instruction. We + * need to make it the address following the original instruction (FIX_CALL). + * + * If the original instruction was a rip-relative instruction such as + * "movl %edx,0xnnnn(%rip)", we have instead executed an equivalent + * instruction using a scratch register -- e.g., "movl %edx,0xnnnn(%rsi)". + * We need to restore the contents of the scratch register + * (FIX_RIP_reg). */ -static void -pre_xol_rip_insn(struct arch_uprobe *auprobe, struct pt_regs *regs, - struct arch_uprobe_task *autask) -{ - if (auprobe->fixups & UPROBE_FIX_RIP_AX) { - autask->saved_scratch_register = regs->ax; - regs->ax = current->utask->vaddr; - regs->ax += auprobe->rip_rela_target_address; - } else if (auprobe->fixups & UPROBE_FIX_RIP_CX) { - autask->saved_scratch_register = regs->cx; - regs->cx = current->utask->vaddr; - regs->cx += auprobe->rip_rela_target_address; +static int default_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + struct uprobe_task *utask = current->utask; + + riprel_post_xol(auprobe, regs); + if (auprobe->defparam.fixups & UPROBE_FIX_IP) { + long correction = utask->vaddr - utask->xol_vaddr; + regs->ip += correction; + } else if (auprobe->defparam.fixups & UPROBE_FIX_CALL) { + regs->sp += sizeof_long(); /* Pop incorrect return address */ + if (push_ret_address(regs, utask->vaddr + auprobe->defparam.ilen)) + return -ERESTART; } + /* popf; tell the caller to not touch TF */ + if (auprobe->defparam.fixups & UPROBE_FIX_SETF) + utask->autask.saved_tf = true; + + return 0; } -#else -static void -pre_xol_rip_insn(struct arch_uprobe *auprobe, struct pt_regs *regs, - struct arch_uprobe_task *autask) + +static void default_abort_op(struct arch_uprobe *auprobe, struct pt_regs *regs) { - /* No RIP-relative addressing on 32-bit */ + riprel_post_xol(auprobe, regs); } -#endif -/* - * arch_uprobe_pre_xol - prepare to execute out of line. - * @auprobe: the probepoint information. - * @regs: reflects the saved user state of current task. - */ -int arch_uprobe_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) +static struct uprobe_xol_ops default_xol_ops = { + .pre_xol = default_pre_xol_op, + .post_xol = default_post_xol_op, + .abort = default_abort_op, +}; + +static bool branch_is_call(struct arch_uprobe *auprobe) { - struct arch_uprobe_task *autask; + return auprobe->branch.opc1 == 0xe8; +} - autask = ¤t->utask->autask; - autask->saved_trap_nr = current->thread.trap_nr; - current->thread.trap_nr = UPROBE_TRAP_NR; - regs->ip = current->utask->xol_vaddr; - pre_xol_rip_insn(auprobe, regs, autask); +#define CASE_COND \ + COND(70, 71, XF(OF)) \ + COND(72, 73, XF(CF)) \ + COND(74, 75, XF(ZF)) \ + COND(78, 79, XF(SF)) \ + COND(7a, 7b, XF(PF)) \ + COND(76, 77, XF(CF) || XF(ZF)) \ + COND(7c, 7d, XF(SF) != XF(OF)) \ + COND(7e, 7f, XF(ZF) || XF(SF) != XF(OF)) - autask->saved_tf = !!(regs->flags & X86_EFLAGS_TF); - regs->flags |= X86_EFLAGS_TF; - if (test_tsk_thread_flag(current, TIF_BLOCKSTEP)) - set_task_blockstep(current, false); +#define COND(op_y, op_n, expr) \ + case 0x ## op_y: DO((expr) != 0) \ + case 0x ## op_n: DO((expr) == 0) - return 0; +#define XF(xf) (!!(flags & X86_EFLAGS_ ## xf)) + +static bool is_cond_jmp_opcode(u8 opcode) +{ + switch (opcode) { + #define DO(expr) \ + return true; + CASE_COND + #undef DO + + default: + return false; + } } -/* - * This function is called by arch_uprobe_post_xol() to adjust the return - * address pushed by a call instruction executed out of line. - */ -static int adjust_ret_addr(unsigned long sp, long correction) +static bool check_jmp_cond(struct arch_uprobe *auprobe, struct pt_regs *regs) { - int rasize, ncopied; - long ra = 0; + unsigned long flags = regs->flags; - if (is_ia32_task()) - rasize = 4; - else - rasize = 8; + switch (auprobe->branch.opc1) { + #define DO(expr) \ + return expr; + CASE_COND + #undef DO - ncopied = copy_from_user(&ra, (void __user *)sp, rasize); - if (unlikely(ncopied)) - return -EFAULT; + default: /* not a conditional jmp */ + return true; + } +} - ra += correction; - ncopied = copy_to_user((void __user *)sp, &ra, rasize); - if (unlikely(ncopied)) - return -EFAULT; +#undef XF +#undef COND +#undef CASE_COND - return 0; +static bool branch_emulate_op(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + unsigned long new_ip = regs->ip += auprobe->branch.ilen; + unsigned long offs = (long)auprobe->branch.offs; + + if (branch_is_call(auprobe)) { + /* + * If it fails we execute this (mangled, see the comment in + * branch_clear_offset) insn out-of-line. In the likely case + * this should trigger the trap, and the probed application + * should die or restart the same insn after it handles the + * signal, arch_uprobe_post_xol() won't be even called. + * + * But there is corner case, see the comment in ->post_xol(). + */ + if (push_ret_address(regs, new_ip)) + return false; + } else if (!check_jmp_cond(auprobe, regs)) { + offs = 0; + } + + regs->ip = new_ip + offs; + return true; } -#ifdef CONFIG_X86_64 -static bool is_riprel_insn(struct arch_uprobe *auprobe) +static int branch_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs) { - return ((auprobe->fixups & (UPROBE_FIX_RIP_AX | UPROBE_FIX_RIP_CX)) != 0); + BUG_ON(!branch_is_call(auprobe)); + /* + * We can only get here if branch_emulate_op() failed to push the ret + * address _and_ another thread expanded our stack before the (mangled) + * "call" insn was executed out-of-line. Just restore ->sp and restart. + * We could also restore ->ip and try to call branch_emulate_op() again. + */ + regs->sp += sizeof_long(); + return -ERESTART; +} + +static void branch_clear_offset(struct arch_uprobe *auprobe, struct insn *insn) +{ + /* + * Turn this insn into "call 1f; 1:", this is what we will execute + * out-of-line if ->emulate() fails. We only need this to generate + * a trap, so that the probed task receives the correct signal with + * the properly filled siginfo. + * + * But see the comment in ->post_xol(), in the unlikely case it can + * succeed. So we need to ensure that the new ->ip can not fall into + * the non-canonical area and trigger #GP. + * + * We could turn it into (say) "pushf", but then we would need to + * divorce ->insn[] and ->ixol[]. We need to preserve the 1st byte + * of ->insn[] for set_orig_insn(). + */ + memset(auprobe->insn + insn_offset_immediate(insn), + 0, insn->immediate.nbytes); } -static void -handle_riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs, long *correction) +static struct uprobe_xol_ops branch_xol_ops = { + .emulate = branch_emulate_op, + .post_xol = branch_post_xol_op, +}; + +/* Returns -ENOSYS if branch_xol_ops doesn't handle this insn */ +static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn) { - if (is_riprel_insn(auprobe)) { - struct arch_uprobe_task *autask; + u8 opc1 = OPCODE1(insn); + int i; - autask = ¤t->utask->autask; - if (auprobe->fixups & UPROBE_FIX_RIP_AX) - regs->ax = autask->saved_scratch_register; - else - regs->cx = autask->saved_scratch_register; + switch (opc1) { + case 0xeb: /* jmp 8 */ + case 0xe9: /* jmp 32 */ + case 0x90: /* prefix* + nop; same as jmp with .offs = 0 */ + break; + + case 0xe8: /* call relative */ + branch_clear_offset(auprobe, insn); + break; + case 0x0f: + if (insn->opcode.nbytes != 2) + return -ENOSYS; /* - * The original instruction includes a displacement, and so - * is 4 bytes longer than what we've just single-stepped. - * Fall through to handle stuff like "jmpq *...(%rip)" and - * "callq *...(%rip)". + * If it is a "near" conditional jmp, OPCODE2() - 0x10 matches + * OPCODE1() of the "short" jmp which checks the same condition. */ - if (correction) - *correction += 4; + opc1 = OPCODE2(insn) - 0x10; + default: + if (!is_cond_jmp_opcode(opc1)) + return -ENOSYS; + } + + /* + * 16-bit overrides such as CALLW (66 e8 nn nn) are not supported. + * Intel and AMD behavior differ in 64-bit mode: Intel ignores 66 prefix. + * No one uses these insns, reject any branch insns with such prefix. + */ + for (i = 0; i < insn->prefixes.nbytes; i++) { + if (insn->prefixes.bytes[i] == 0x66) + return -ENOTSUPP; } + + auprobe->branch.opc1 = opc1; + auprobe->branch.ilen = insn->length; + auprobe->branch.offs = insn->immediate.value; + + auprobe->ops = &branch_xol_ops; + return 0; } -#else -static void -handle_riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs, long *correction) + +/** + * arch_uprobe_analyze_insn - instruction analysis including validity and fixups. + * @mm: the probed address space. + * @arch_uprobe: the probepoint information. + * @addr: virtual address at which to install the probepoint + * Return 0 on success or a -ve number on error. + */ +int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long addr) { - /* No RIP-relative addressing on 32-bit */ + struct insn insn; + u8 fix_ip_or_call = UPROBE_FIX_IP; + int ret; + + ret = uprobe_init_insn(auprobe, &insn, is_64bit_mm(mm)); + if (ret) + return ret; + + ret = branch_setup_xol_ops(auprobe, &insn); + if (ret != -ENOSYS) + return ret; + + /* + * Figure out which fixups default_post_xol_op() will need to perform, + * and annotate defparam->fixups accordingly. + */ + switch (OPCODE1(&insn)) { + case 0x9d: /* popf */ + auprobe->defparam.fixups |= UPROBE_FIX_SETF; + break; + case 0xc3: /* ret or lret -- ip is correct */ + case 0xcb: + case 0xc2: + case 0xca: + case 0xea: /* jmp absolute -- ip is correct */ + fix_ip_or_call = 0; + break; + case 0x9a: /* call absolute - Fix return addr, not ip */ + fix_ip_or_call = UPROBE_FIX_CALL; + break; + case 0xff: + switch (MODRM_REG(&insn)) { + case 2: case 3: /* call or lcall, indirect */ + fix_ip_or_call = UPROBE_FIX_CALL; + break; + case 4: case 5: /* jmp or ljmp, indirect */ + fix_ip_or_call = 0; + break; + } + /* fall through */ + default: + riprel_analyze(auprobe, &insn); + } + + auprobe->defparam.ilen = insn.length; + auprobe->defparam.fixups |= fix_ip_or_call; + + auprobe->ops = &default_xol_ops; + return 0; +} + +/* + * arch_uprobe_pre_xol - prepare to execute out of line. + * @auprobe: the probepoint information. + * @regs: reflects the saved user state of current task. + */ +int arch_uprobe_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + struct uprobe_task *utask = current->utask; + + if (auprobe->ops->pre_xol) { + int err = auprobe->ops->pre_xol(auprobe, regs); + if (err) + return err; + } + + regs->ip = utask->xol_vaddr; + utask->autask.saved_trap_nr = current->thread.trap_nr; + current->thread.trap_nr = UPROBE_TRAP_NR; + + utask->autask.saved_tf = !!(regs->flags & X86_EFLAGS_TF); + regs->flags |= X86_EFLAGS_TF; + if (test_tsk_thread_flag(current, TIF_BLOCKSTEP)) + set_task_blockstep(current, false); + + return 0; } -#endif /* * If xol insn itself traps and generates a signal(Say, @@ -572,53 +799,42 @@ bool arch_uprobe_xol_was_trapped(struct task_struct *t) * single-step, we single-stepped a copy of the instruction. * * This function prepares to resume execution after the single-step. - * We have to fix things up as follows: - * - * Typically, the new ip is relative to the copied instruction. We need - * to make it relative to the original instruction (FIX_IP). Exceptions - * are return instructions and absolute or indirect jump or call instructions. - * - * If the single-stepped instruction was a call, the return address that - * is atop the stack is the address following the copied instruction. We - * need to make it the address following the original instruction (FIX_CALL). - * - * If the original instruction was a rip-relative instruction such as - * "movl %edx,0xnnnn(%rip)", we have instead executed an equivalent - * instruction using a scratch register -- e.g., "movl %edx,(%rax)". - * We need to restore the contents of the scratch register and adjust - * the ip, keeping in mind that the instruction we executed is 4 bytes - * shorter than the original instruction (since we squeezed out the offset - * field). (FIX_RIP_AX or FIX_RIP_CX) */ int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) { - struct uprobe_task *utask; - long correction; - int result = 0; + struct uprobe_task *utask = current->utask; + bool send_sigtrap = utask->autask.saved_tf; + int err = 0; WARN_ON_ONCE(current->thread.trap_nr != UPROBE_TRAP_NR); - - utask = current->utask; current->thread.trap_nr = utask->autask.saved_trap_nr; - correction = (long)(utask->vaddr - utask->xol_vaddr); - handle_riprel_post_xol(auprobe, regs, &correction); - if (auprobe->fixups & UPROBE_FIX_IP) - regs->ip += correction; - - if (auprobe->fixups & UPROBE_FIX_CALL) - result = adjust_ret_addr(regs->sp, correction); + if (auprobe->ops->post_xol) { + err = auprobe->ops->post_xol(auprobe, regs); + if (err) { + /* + * Restore ->ip for restart or post mortem analysis. + * ->post_xol() must not return -ERESTART unless this + * is really possible. + */ + regs->ip = utask->vaddr; + if (err == -ERESTART) + err = 0; + send_sigtrap = false; + } + } /* * arch_uprobe_pre_xol() doesn't save the state of TIF_BLOCKSTEP * so we can get an extra SIGTRAP if we do not clear TF. We need * to examine the opcode to make it right. */ - if (utask->autask.saved_tf) + if (send_sigtrap) send_sig(SIGTRAP, current, 0); - else if (!(auprobe->fixups & UPROBE_FIX_SETF)) + + if (!utask->autask.saved_tf) regs->flags &= ~X86_EFLAGS_TF; - return result; + return err; } /* callback routine for handling exceptions. */ @@ -652,41 +868,27 @@ int arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val, /* * This function gets called when XOL instruction either gets trapped or - * the thread has a fatal signal, so reset the instruction pointer to its - * probed address. + * the thread has a fatal signal. Reset the instruction pointer to its + * probed address for the potential restart or for post mortem analysis. */ void arch_uprobe_abort_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) { struct uprobe_task *utask = current->utask; - current->thread.trap_nr = utask->autask.saved_trap_nr; - handle_riprel_post_xol(auprobe, regs, NULL); - instruction_pointer_set(regs, utask->vaddr); + if (auprobe->ops->abort) + auprobe->ops->abort(auprobe, regs); + current->thread.trap_nr = utask->autask.saved_trap_nr; + regs->ip = utask->vaddr; /* clear TF if it was set by us in arch_uprobe_pre_xol() */ if (!utask->autask.saved_tf) regs->flags &= ~X86_EFLAGS_TF; } -/* - * Skip these instructions as per the currently known x86 ISA. - * rep=0x66*; nop=0x90 - */ static bool __skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs) { - int i; - - for (i = 0; i < MAX_UINSN_BYTES; i++) { - if (auprobe->insn[i] == 0x66) - continue; - - if (auprobe->insn[i] == 0x90) { - regs->ip += i + 1; - return true; - } - - break; - } + if (auprobe->ops->emulate) + return auprobe->ops->emulate(auprobe, regs); return false; } @@ -701,23 +903,21 @@ bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs) unsigned long arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs *regs) { - int rasize, ncopied; + int rasize = sizeof_long(), nleft; unsigned long orig_ret_vaddr = 0; /* clear high bits for 32-bit apps */ - rasize = is_ia32_task() ? 4 : 8; - ncopied = copy_from_user(&orig_ret_vaddr, (void __user *)regs->sp, rasize); - if (unlikely(ncopied)) + if (copy_from_user(&orig_ret_vaddr, (void __user *)regs->sp, rasize)) return -1; /* check whether address has been already hijacked */ if (orig_ret_vaddr == trampoline_vaddr) return orig_ret_vaddr; - ncopied = copy_to_user((void __user *)regs->sp, &trampoline_vaddr, rasize); - if (likely(!ncopied)) + nleft = copy_to_user((void __user *)regs->sp, &trampoline_vaddr, rasize); + if (likely(!nleft)) return orig_ret_vaddr; - if (ncopied != rasize) { + if (nleft != rasize) { pr_err("uprobe: return address clobbered: pid=%d, %%sp=%#lx, " "%%ip=%#lx\n", current->pid, regs->sp, regs->ip); diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 8b3b3eb..ea5b570 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -91,7 +91,7 @@ static int addr_to_vsyscall_nr(unsigned long addr) { int nr; - if ((addr & ~0xC00UL) != VSYSCALL_START) + if ((addr & ~0xC00UL) != VSYSCALL_ADDR) return -EINVAL; nr = (addr & 0xC00UL) >> 10; @@ -330,24 +330,17 @@ void __init map_vsyscall(void) { extern char __vsyscall_page; unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page); - unsigned long physaddr_vvar_page = __pa_symbol(&__vvar_page); - __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_vsyscall, + __set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall, vsyscall_mode == NATIVE ? PAGE_KERNEL_VSYSCALL : PAGE_KERNEL_VVAR); - BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_FIRST_PAGE) != - (unsigned long)VSYSCALL_START); - - __set_fixmap(VVAR_PAGE, physaddr_vvar_page, PAGE_KERNEL_VVAR); - BUILD_BUG_ON((unsigned long)__fix_to_virt(VVAR_PAGE) != - (unsigned long)VVAR_ADDRESS); + BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) != + (unsigned long)VSYSCALL_ADDR); } static int __init vsyscall_init(void) { - BUG_ON(VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE)); - cpu_notifier_register_begin(); on_each_cpu(cpu_vsyscall_init, NULL, 1); |