diff options
author | Ingo Molnar <mingo@kernel.org> | 2012-05-14 14:43:40 +0200 |
---|---|---|
committer | Ingo Molnar <mingo@kernel.org> | 2012-05-14 14:43:40 +0200 |
commit | 9cba26e66d09bf394ae5a739627a1dc8b7cae6f4 (patch) | |
tree | f03743d576a0c7826b9921ad47e70370ebe80a22 /arch/x86 | |
parent | ec83db0f78cd44c3b586ec1c3a348d1a8a389797 (diff) | |
parent | 73eff9f56e15598c8399c0b86899fd889b97f085 (diff) | |
download | op-kernel-dev-9cba26e66d09bf394ae5a739627a1dc8b7cae6f4.zip op-kernel-dev-9cba26e66d09bf394ae5a739627a1dc8b7cae6f4.tar.gz |
Merge branch 'perf/uprobes' of git://git.kernel.org/pub/scm/linux/kernel/git/acme/linux into perf/uprobes
Diffstat (limited to 'arch/x86')
29 files changed, 1193 insertions, 546 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 76f5a46..8443c50 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -40,7 +40,6 @@ config X86 select HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_GRAPH_FP_TEST select HAVE_FUNCTION_TRACE_MCOUNT_TEST - select HAVE_FTRACE_NMI_ENTER if DYNAMIC_FTRACE select HAVE_SYSCALL_TRACEPOINTS select HAVE_KVM select HAVE_ARCH_KGDB diff --git a/arch/x86/Makefile.um b/arch/x86/Makefile.um index 4be406a..36b62bc 100644 --- a/arch/x86/Makefile.um +++ b/arch/x86/Makefile.um @@ -14,6 +14,9 @@ LINK-y += $(call cc-option,-m32) export LDFLAGS +LDS_EXTRA := -Ui386 +export LDS_EXTRA + # First of all, tune CFLAGS for the specific CPU. This actually sets cflags-y. include $(srctree)/arch/x86/Makefile_32.cpu diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c index d511d95..4824fb4 100644 --- a/arch/x86/ia32/ia32_aout.c +++ b/arch/x86/ia32/ia32_aout.c @@ -119,9 +119,7 @@ static void set_brk(unsigned long start, unsigned long end) end = PAGE_ALIGN(end); if (end <= start) return; - down_write(¤t->mm->mmap_sem); - do_brk(start, end - start); - up_write(¤t->mm->mmap_sem); + vm_brk(start, end - start); } #ifdef CORE_DUMP @@ -332,9 +330,7 @@ static int load_aout_binary(struct linux_binprm *bprm, struct pt_regs *regs) pos = 32; map_size = ex.a_text+ex.a_data; - down_write(¤t->mm->mmap_sem); - error = do_brk(text_addr & PAGE_MASK, map_size); - up_write(¤t->mm->mmap_sem); + error = vm_brk(text_addr & PAGE_MASK, map_size); if (error != (text_addr & PAGE_MASK)) { send_sig(SIGKILL, current, 0); @@ -373,9 +369,7 @@ static int load_aout_binary(struct linux_binprm *bprm, struct pt_regs *regs) if (!bprm->file->f_op->mmap || (fd_offset & ~PAGE_MASK) != 0) { loff_t pos = fd_offset; - down_write(¤t->mm->mmap_sem); - do_brk(N_TXTADDR(ex), ex.a_text+ex.a_data); - up_write(¤t->mm->mmap_sem); + vm_brk(N_TXTADDR(ex), ex.a_text+ex.a_data); bprm->file->f_op->read(bprm->file, (char __user *)N_TXTADDR(ex), ex.a_text+ex.a_data, &pos); @@ -385,26 +379,22 @@ static int load_aout_binary(struct linux_binprm *bprm, struct pt_regs *regs) goto beyond_if; } - down_write(¤t->mm->mmap_sem); - error = do_mmap(bprm->file, N_TXTADDR(ex), ex.a_text, + error = vm_mmap(bprm->file, N_TXTADDR(ex), ex.a_text, PROT_READ | PROT_EXEC, MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE | MAP_32BIT, fd_offset); - up_write(¤t->mm->mmap_sem); if (error != N_TXTADDR(ex)) { send_sig(SIGKILL, current, 0); return error; } - down_write(¤t->mm->mmap_sem); - error = do_mmap(bprm->file, N_DATADDR(ex), ex.a_data, + error = vm_mmap(bprm->file, N_DATADDR(ex), ex.a_data, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE | MAP_32BIT, fd_offset + ex.a_text); - up_write(¤t->mm->mmap_sem); if (error != N_DATADDR(ex)) { send_sig(SIGKILL, current, 0); return error; @@ -476,9 +466,7 @@ static int load_aout_library(struct file *file) error_time = jiffies; } #endif - down_write(¤t->mm->mmap_sem); - do_brk(start_addr, ex.a_text + ex.a_data + ex.a_bss); - up_write(¤t->mm->mmap_sem); + vm_brk(start_addr, ex.a_text + ex.a_data + ex.a_bss); file->f_op->read(file, (char __user *)start_addr, ex.a_text + ex.a_data, &pos); @@ -490,12 +478,10 @@ static int load_aout_library(struct file *file) goto out; } /* Now use mmap to map the library into memory. */ - down_write(¤t->mm->mmap_sem); - error = do_mmap(file, start_addr, ex.a_text + ex.a_data, + error = vm_mmap(file, start_addr, ex.a_text + ex.a_data, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_32BIT, N_TXTOFF(ex)); - up_write(¤t->mm->mmap_sem); retval = error; if (error != start_addr) goto out; @@ -503,9 +489,7 @@ static int load_aout_library(struct file *file) len = PAGE_ALIGN(ex.a_text + ex.a_data); bss = ex.a_text + ex.a_data + ex.a_bss; if (bss > len) { - down_write(¤t->mm->mmap_sem); - error = do_brk(start_addr + len, bss - len); - up_write(¤t->mm->mmap_sem); + error = vm_brk(start_addr + len, bss - len); retval = error; if (error != start_addr + len) goto out; diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h index b3b7332..99480e5 100644 --- a/arch/x86/include/asm/cmpxchg.h +++ b/arch/x86/include/asm/cmpxchg.h @@ -43,7 +43,7 @@ extern void __add_wrong_size(void) switch (sizeof(*(ptr))) { \ case __X86_CASE_B: \ asm volatile (lock #op "b %b0, %1\n" \ - : "+r" (__ret), "+m" (*(ptr)) \ + : "+q" (__ret), "+m" (*(ptr)) \ : : "memory", "cc"); \ break; \ case __X86_CASE_W: \ @@ -173,7 +173,7 @@ extern void __add_wrong_size(void) switch (sizeof(*(ptr))) { \ case __X86_CASE_B: \ asm volatile (lock "addb %b1, %0\n" \ - : "+m" (*(ptr)) : "ri" (inc) \ + : "+m" (*(ptr)) : "qi" (inc) \ : "memory", "cc"); \ break; \ case __X86_CASE_W: \ diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index 268c783..18d9005 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -34,6 +34,7 @@ #ifndef __ASSEMBLY__ extern void mcount(void); +extern int modifying_ftrace_code; static inline unsigned long ftrace_call_adjust(unsigned long addr) { @@ -50,6 +51,8 @@ struct dyn_arch_ftrace { /* No extra data needed for x86 */ }; +int ftrace_int3_handler(struct pt_regs *regs); + #endif /* CONFIG_DYNAMIC_FTRACE */ #endif /* __ASSEMBLY__ */ #endif /* CONFIG_FUNCTION_TRACER */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index ccb8059..957ec87 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -134,6 +134,8 @@ #define MSR_AMD64_IBSFETCHCTL 0xc0011030 #define MSR_AMD64_IBSFETCHLINAD 0xc0011031 #define MSR_AMD64_IBSFETCHPHYSAD 0xc0011032 +#define MSR_AMD64_IBSFETCH_REG_COUNT 3 +#define MSR_AMD64_IBSFETCH_REG_MASK ((1UL<<MSR_AMD64_IBSFETCH_REG_COUNT)-1) #define MSR_AMD64_IBSOPCTL 0xc0011033 #define MSR_AMD64_IBSOPRIP 0xc0011034 #define MSR_AMD64_IBSOPDATA 0xc0011035 @@ -141,8 +143,11 @@ #define MSR_AMD64_IBSOPDATA3 0xc0011037 #define MSR_AMD64_IBSDCLINAD 0xc0011038 #define MSR_AMD64_IBSDCPHYSAD 0xc0011039 +#define MSR_AMD64_IBSOP_REG_COUNT 7 +#define MSR_AMD64_IBSOP_REG_MASK ((1UL<<MSR_AMD64_IBSOP_REG_COUNT)-1) #define MSR_AMD64_IBSCTL 0xc001103a #define MSR_AMD64_IBSBRTARGET 0xc001103b +#define MSR_AMD64_IBS_REG_COUNT_MAX 8 /* includes MSR_AMD64_IBSBRTARGET */ /* Fam 15h MSRs */ #define MSR_F15H_PERF_CTL 0xc0010200 diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 2291895..4e40a64 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -158,6 +158,7 @@ struct x86_pmu_capability { #define IBS_CAPS_OPCNT (1U<<4) #define IBS_CAPS_BRNTRGT (1U<<5) #define IBS_CAPS_OPCNTEXT (1U<<6) +#define IBS_CAPS_RIPINVALIDCHK (1U<<7) #define IBS_CAPS_DEFAULT (IBS_CAPS_AVAIL \ | IBS_CAPS_FETCHSAM \ @@ -170,19 +171,22 @@ struct x86_pmu_capability { #define IBSCTL_LVT_OFFSET_VALID (1ULL<<8) #define IBSCTL_LVT_OFFSET_MASK 0x0F -/* IbsFetchCtl bits/masks */ +/* ibs fetch bits/masks */ #define IBS_FETCH_RAND_EN (1ULL<<57) #define IBS_FETCH_VAL (1ULL<<49) #define IBS_FETCH_ENABLE (1ULL<<48) #define IBS_FETCH_CNT 0xFFFF0000ULL #define IBS_FETCH_MAX_CNT 0x0000FFFFULL -/* IbsOpCtl bits */ +/* ibs op bits/masks */ +/* lower 4 bits of the current count are ignored: */ +#define IBS_OP_CUR_CNT (0xFFFF0ULL<<32) #define IBS_OP_CNT_CTL (1ULL<<19) #define IBS_OP_VAL (1ULL<<18) #define IBS_OP_ENABLE (1ULL<<17) #define IBS_OP_MAX_CNT 0x0000FFFFULL #define IBS_OP_MAX_CNT_EXT 0x007FFFFFULL /* not a register bit mask */ +#define IBS_RIP_INVALID (1ULL<<38) extern u32 get_ibs_caps(void); diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 8be5f54..e054459 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -557,6 +557,8 @@ struct __large_struct { unsigned long buf[100]; }; extern unsigned long copy_from_user_nmi(void *to, const void __user *from, unsigned long n); +extern __must_check long +strncpy_from_user(char *dst, const char __user *src, long count); /* * movsl can be slow when source and dest are not both 8-byte aligned diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h index 566e803..8084bc73 100644 --- a/arch/x86/include/asm/uaccess_32.h +++ b/arch/x86/include/asm/uaccess_32.h @@ -213,11 +213,6 @@ static inline unsigned long __must_check copy_from_user(void *to, return n; } -long __must_check strncpy_from_user(char *dst, const char __user *src, - long count); -long __must_check __strncpy_from_user(char *dst, - const char __user *src, long count); - /** * strlen_user: - Get the size of a string in user space. * @str: The string to measure. diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index 1c66d30..fcd4b6f 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -208,10 +208,6 @@ int __copy_in_user(void __user *dst, const void __user *src, unsigned size) } } -__must_check long -strncpy_from_user(char *dst, const char __user *src, long count); -__must_check long -__strncpy_from_user(char *dst, const char __user *src, long count); __must_check long strnlen_user(const char __user *str, long n); __must_check long __strnlen_user(const char __user *str, long n); __must_check long strlen_user(const char __user *str); diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index bb8e034..e049d6d 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -484,9 +484,6 @@ static int __x86_pmu_event_init(struct perf_event *event) /* mark unused */ event->hw.extra_reg.idx = EXTRA_REG_NONE; - - /* mark not used */ - event->hw.extra_reg.idx = EXTRA_REG_NONE; event->hw.branch_reg.idx = EXTRA_REG_NONE; return x86_pmu.hw_config(event); @@ -1186,8 +1183,6 @@ int x86_pmu_handle_irq(struct pt_regs *regs) int idx, handled = 0; u64 val; - perf_sample_data_init(&data, 0); - cpuc = &__get_cpu_var(cpu_hw_events); /* @@ -1222,7 +1217,7 @@ int x86_pmu_handle_irq(struct pt_regs *regs) * event overflow */ handled++; - data.period = event->hw.last_period; + perf_sample_data_init(&data, 0, event->hw.last_period); if (!x86_perf_event_set_period(event)) continue; diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index 95e7fe1..6565226 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -134,8 +134,13 @@ static u64 amd_pmu_event_map(int hw_event) static int amd_pmu_hw_config(struct perf_event *event) { - int ret = x86_pmu_hw_config(event); + int ret; + /* pass precise event sampling to ibs: */ + if (event->attr.precise_ip && get_ibs_caps()) + return -ENOENT; + + ret = x86_pmu_hw_config(event); if (ret) return ret; @@ -205,10 +210,8 @@ static void amd_put_event_constraints(struct cpu_hw_events *cpuc, * when we come here */ for (i = 0; i < x86_pmu.num_counters; i++) { - if (nb->owners[i] == event) { - cmpxchg(nb->owners+i, event, NULL); + if (cmpxchg(nb->owners + i, event, NULL) == event) break; - } } } diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c index 3b8a2d3..da9bcdc 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c +++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c @@ -9,6 +9,7 @@ #include <linux/perf_event.h> #include <linux/module.h> #include <linux/pci.h> +#include <linux/ptrace.h> #include <asm/apic.h> @@ -16,36 +17,591 @@ static u32 ibs_caps; #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD) -static struct pmu perf_ibs; +#include <linux/kprobes.h> +#include <linux/hardirq.h> + +#include <asm/nmi.h> + +#define IBS_FETCH_CONFIG_MASK (IBS_FETCH_RAND_EN | IBS_FETCH_MAX_CNT) +#define IBS_OP_CONFIG_MASK IBS_OP_MAX_CNT + +enum ibs_states { + IBS_ENABLED = 0, + IBS_STARTED = 1, + IBS_STOPPING = 2, + + IBS_MAX_STATES, +}; + +struct cpu_perf_ibs { + struct perf_event *event; + unsigned long state[BITS_TO_LONGS(IBS_MAX_STATES)]; +}; + +struct perf_ibs { + struct pmu pmu; + unsigned int msr; + u64 config_mask; + u64 cnt_mask; + u64 enable_mask; + u64 valid_mask; + u64 max_period; + unsigned long offset_mask[1]; + int offset_max; + struct cpu_perf_ibs __percpu *pcpu; + u64 (*get_count)(u64 config); +}; + +struct perf_ibs_data { + u32 size; + union { + u32 data[0]; /* data buffer starts here */ + u32 caps; + }; + u64 regs[MSR_AMD64_IBS_REG_COUNT_MAX]; +}; + +static int +perf_event_set_period(struct hw_perf_event *hwc, u64 min, u64 max, u64 *hw_period) +{ + s64 left = local64_read(&hwc->period_left); + s64 period = hwc->sample_period; + int overflow = 0; + + /* + * If we are way outside a reasonable range then just skip forward: + */ + if (unlikely(left <= -period)) { + left = period; + local64_set(&hwc->period_left, left); + hwc->last_period = period; + overflow = 1; + } + + if (unlikely(left < (s64)min)) { + left += period; + local64_set(&hwc->period_left, left); + hwc->last_period = period; + overflow = 1; + } + + /* + * If the hw period that triggers the sw overflow is too short + * we might hit the irq handler. This biases the results. + * Thus we shorten the next-to-last period and set the last + * period to the max period. + */ + if (left > max) { + left -= max; + if (left > max) + left = max; + else if (left < min) + left = min; + } + + *hw_period = (u64)left; + + return overflow; +} + +static int +perf_event_try_update(struct perf_event *event, u64 new_raw_count, int width) +{ + struct hw_perf_event *hwc = &event->hw; + int shift = 64 - width; + u64 prev_raw_count; + u64 delta; + + /* + * Careful: an NMI might modify the previous event value. + * + * Our tactic to handle this is to first atomically read and + * exchange a new raw count - then add that new-prev delta + * count to the generic event atomically: + */ + prev_raw_count = local64_read(&hwc->prev_count); + if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, + new_raw_count) != prev_raw_count) + return 0; + + /* + * Now we have the new raw value and have updated the prev + * timestamp already. We can now calculate the elapsed delta + * (event-)time and add that to the generic event. + * + * Careful, not all hw sign-extends above the physical width + * of the count. + */ + delta = (new_raw_count << shift) - (prev_raw_count << shift); + delta >>= shift; + + local64_add(delta, &event->count); + local64_sub(delta, &hwc->period_left); + + return 1; +} + +static struct perf_ibs perf_ibs_fetch; +static struct perf_ibs perf_ibs_op; + +static struct perf_ibs *get_ibs_pmu(int type) +{ + if (perf_ibs_fetch.pmu.type == type) + return &perf_ibs_fetch; + if (perf_ibs_op.pmu.type == type) + return &perf_ibs_op; + return NULL; +} + +/* + * Use IBS for precise event sampling: + * + * perf record -a -e cpu-cycles:p ... # use ibs op counting cycle count + * perf record -a -e r076:p ... # same as -e cpu-cycles:p + * perf record -a -e r0C1:p ... # use ibs op counting micro-ops + * + * IbsOpCntCtl (bit 19) of IBS Execution Control Register (IbsOpCtl, + * MSRC001_1033) is used to select either cycle or micro-ops counting + * mode. + * + * The rip of IBS samples has skid 0. Thus, IBS supports precise + * levels 1 and 2 and the PERF_EFLAGS_EXACT is set. In rare cases the + * rip is invalid when IBS was not able to record the rip correctly. + * We clear PERF_EFLAGS_EXACT and take the rip from pt_regs then. + * + */ +static int perf_ibs_precise_event(struct perf_event *event, u64 *config) +{ + switch (event->attr.precise_ip) { + case 0: + return -ENOENT; + case 1: + case 2: + break; + default: + return -EOPNOTSUPP; + } + + switch (event->attr.type) { + case PERF_TYPE_HARDWARE: + switch (event->attr.config) { + case PERF_COUNT_HW_CPU_CYCLES: + *config = 0; + return 0; + } + break; + case PERF_TYPE_RAW: + switch (event->attr.config) { + case 0x0076: + *config = 0; + return 0; + case 0x00C1: + *config = IBS_OP_CNT_CTL; + return 0; + } + break; + default: + return -ENOENT; + } + + return -EOPNOTSUPP; +} static int perf_ibs_init(struct perf_event *event) { - if (perf_ibs.type != event->attr.type) + struct hw_perf_event *hwc = &event->hw; + struct perf_ibs *perf_ibs; + u64 max_cnt, config; + int ret; + + perf_ibs = get_ibs_pmu(event->attr.type); + if (perf_ibs) { + config = event->attr.config; + } else { + perf_ibs = &perf_ibs_op; + ret = perf_ibs_precise_event(event, &config); + if (ret) + return ret; + } + + if (event->pmu != &perf_ibs->pmu) return -ENOENT; + + if (config & ~perf_ibs->config_mask) + return -EINVAL; + + if (hwc->sample_period) { + if (config & perf_ibs->cnt_mask) + /* raw max_cnt may not be set */ + return -EINVAL; + if (!event->attr.sample_freq && hwc->sample_period & 0x0f) + /* + * lower 4 bits can not be set in ibs max cnt, + * but allowing it in case we adjust the + * sample period to set a frequency. + */ + return -EINVAL; + hwc->sample_period &= ~0x0FULL; + if (!hwc->sample_period) + hwc->sample_period = 0x10; + } else { + max_cnt = config & perf_ibs->cnt_mask; + config &= ~perf_ibs->cnt_mask; + event->attr.sample_period = max_cnt << 4; + hwc->sample_period = event->attr.sample_period; + } + + if (!hwc->sample_period) + return -EINVAL; + + /* + * If we modify hwc->sample_period, we also need to update + * hwc->last_period and hwc->period_left. + */ + hwc->last_period = hwc->sample_period; + local64_set(&hwc->period_left, hwc->sample_period); + + hwc->config_base = perf_ibs->msr; + hwc->config = config; + return 0; } +static int perf_ibs_set_period(struct perf_ibs *perf_ibs, + struct hw_perf_event *hwc, u64 *period) +{ + int overflow; + + /* ignore lower 4 bits in min count: */ + overflow = perf_event_set_period(hwc, 1<<4, perf_ibs->max_period, period); + local64_set(&hwc->prev_count, 0); + + return overflow; +} + +static u64 get_ibs_fetch_count(u64 config) +{ + return (config & IBS_FETCH_CNT) >> 12; +} + +static u64 get_ibs_op_count(u64 config) +{ + u64 count = 0; + + if (config & IBS_OP_VAL) + count += (config & IBS_OP_MAX_CNT) << 4; /* cnt rolled over */ + + if (ibs_caps & IBS_CAPS_RDWROPCNT) + count += (config & IBS_OP_CUR_CNT) >> 32; + + return count; +} + +static void +perf_ibs_event_update(struct perf_ibs *perf_ibs, struct perf_event *event, + u64 *config) +{ + u64 count = perf_ibs->get_count(*config); + + /* + * Set width to 64 since we do not overflow on max width but + * instead on max count. In perf_ibs_set_period() we clear + * prev count manually on overflow. + */ + while (!perf_event_try_update(event, count, 64)) { + rdmsrl(event->hw.config_base, *config); + count = perf_ibs->get_count(*config); + } +} + +static inline void perf_ibs_enable_event(struct perf_ibs *perf_ibs, + struct hw_perf_event *hwc, u64 config) +{ + wrmsrl(hwc->config_base, hwc->config | config | perf_ibs->enable_mask); +} + +/* + * Erratum #420 Instruction-Based Sampling Engine May Generate + * Interrupt that Cannot Be Cleared: + * + * Must clear counter mask first, then clear the enable bit. See + * Revision Guide for AMD Family 10h Processors, Publication #41322. + */ +static inline void perf_ibs_disable_event(struct perf_ibs *perf_ibs, + struct hw_perf_event *hwc, u64 config) +{ + config &= ~perf_ibs->cnt_mask; + wrmsrl(hwc->config_base, config); + config &= ~perf_ibs->enable_mask; + wrmsrl(hwc->config_base, config); +} + +/* + * We cannot restore the ibs pmu state, so we always needs to update + * the event while stopping it and then reset the state when starting + * again. Thus, ignoring PERF_EF_RELOAD and PERF_EF_UPDATE flags in + * perf_ibs_start()/perf_ibs_stop() and instead always do it. + */ +static void perf_ibs_start(struct perf_event *event, int flags) +{ + struct hw_perf_event *hwc = &event->hw; + struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu); + struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); + u64 period; + + if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED))) + return; + + WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE)); + hwc->state = 0; + + perf_ibs_set_period(perf_ibs, hwc, &period); + set_bit(IBS_STARTED, pcpu->state); + perf_ibs_enable_event(perf_ibs, hwc, period >> 4); + + perf_event_update_userpage(event); +} + +static void perf_ibs_stop(struct perf_event *event, int flags) +{ + struct hw_perf_event *hwc = &event->hw; + struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu); + struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); + u64 config; + int stopping; + + stopping = test_and_clear_bit(IBS_STARTED, pcpu->state); + + if (!stopping && (hwc->state & PERF_HES_UPTODATE)) + return; + + rdmsrl(hwc->config_base, config); + + if (stopping) { + set_bit(IBS_STOPPING, pcpu->state); + perf_ibs_disable_event(perf_ibs, hwc, config); + WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); + hwc->state |= PERF_HES_STOPPED; + } + + if (hwc->state & PERF_HES_UPTODATE) + return; + + /* + * Clear valid bit to not count rollovers on update, rollovers + * are only updated in the irq handler. + */ + config &= ~perf_ibs->valid_mask; + + perf_ibs_event_update(perf_ibs, event, &config); + hwc->state |= PERF_HES_UPTODATE; +} + static int perf_ibs_add(struct perf_event *event, int flags) { + struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu); + struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); + + if (test_and_set_bit(IBS_ENABLED, pcpu->state)) + return -ENOSPC; + + event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED; + + pcpu->event = event; + + if (flags & PERF_EF_START) + perf_ibs_start(event, PERF_EF_RELOAD); + return 0; } static void perf_ibs_del(struct perf_event *event, int flags) { + struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu); + struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); + + if (!test_and_clear_bit(IBS_ENABLED, pcpu->state)) + return; + + perf_ibs_stop(event, PERF_EF_UPDATE); + + pcpu->event = NULL; + + perf_event_update_userpage(event); } -static struct pmu perf_ibs = { - .event_init= perf_ibs_init, - .add= perf_ibs_add, - .del= perf_ibs_del, +static void perf_ibs_read(struct perf_event *event) { } + +static struct perf_ibs perf_ibs_fetch = { + .pmu = { + .task_ctx_nr = perf_invalid_context, + + .event_init = perf_ibs_init, + .add = perf_ibs_add, + .del = perf_ibs_del, + .start = perf_ibs_start, + .stop = perf_ibs_stop, + .read = perf_ibs_read, + }, + .msr = MSR_AMD64_IBSFETCHCTL, + .config_mask = IBS_FETCH_CONFIG_MASK, + .cnt_mask = IBS_FETCH_MAX_CNT, + .enable_mask = IBS_FETCH_ENABLE, + .valid_mask = IBS_FETCH_VAL, + .max_period = IBS_FETCH_MAX_CNT << 4, + .offset_mask = { MSR_AMD64_IBSFETCH_REG_MASK }, + .offset_max = MSR_AMD64_IBSFETCH_REG_COUNT, + + .get_count = get_ibs_fetch_count, }; +static struct perf_ibs perf_ibs_op = { + .pmu = { + .task_ctx_nr = perf_invalid_context, + + .event_init = perf_ibs_init, + .add = perf_ibs_add, + .del = perf_ibs_del, + .start = perf_ibs_start, + .stop = perf_ibs_stop, + .read = perf_ibs_read, + }, + .msr = MSR_AMD64_IBSOPCTL, + .config_mask = IBS_OP_CONFIG_MASK, + .cnt_mask = IBS_OP_MAX_CNT, + .enable_mask = IBS_OP_ENABLE, + .valid_mask = IBS_OP_VAL, + .max_period = IBS_OP_MAX_CNT << 4, + .offset_mask = { MSR_AMD64_IBSOP_REG_MASK }, + .offset_max = MSR_AMD64_IBSOP_REG_COUNT, + + .get_count = get_ibs_op_count, +}; + +static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) +{ + struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); + struct perf_event *event = pcpu->event; + struct hw_perf_event *hwc = &event->hw; + struct perf_sample_data data; + struct perf_raw_record raw; + struct pt_regs regs; + struct perf_ibs_data ibs_data; + int offset, size, check_rip, offset_max, throttle = 0; + unsigned int msr; + u64 *buf, *config, period; + + if (!test_bit(IBS_STARTED, pcpu->state)) { + /* + * Catch spurious interrupts after stopping IBS: After + * disabling IBS there could be still incomming NMIs + * with samples that even have the valid bit cleared. + * Mark all this NMIs as handled. + */ + return test_and_clear_bit(IBS_STOPPING, pcpu->state) ? 1 : 0; + } + + msr = hwc->config_base; + buf = ibs_data.regs; + rdmsrl(msr, *buf); + if (!(*buf++ & perf_ibs->valid_mask)) + return 0; + + config = &ibs_data.regs[0]; + perf_ibs_event_update(perf_ibs, event, config); + perf_sample_data_init(&data, 0, hwc->last_period); + if (!perf_ibs_set_period(perf_ibs, hwc, &period)) + goto out; /* no sw counter overflow */ + + ibs_data.caps = ibs_caps; + size = 1; + offset = 1; + check_rip = (perf_ibs == &perf_ibs_op && (ibs_caps & IBS_CAPS_RIPINVALIDCHK)); + if (event->attr.sample_type & PERF_SAMPLE_RAW) + offset_max = perf_ibs->offset_max; + else if (check_rip) + offset_max = 2; + else + offset_max = 1; + do { + rdmsrl(msr + offset, *buf++); + size++; + offset = find_next_bit(perf_ibs->offset_mask, + perf_ibs->offset_max, + offset + 1); + } while (offset < offset_max); + ibs_data.size = sizeof(u64) * size; + + regs = *iregs; + if (check_rip && (ibs_data.regs[2] & IBS_RIP_INVALID)) { + regs.flags &= ~PERF_EFLAGS_EXACT; + } else { + instruction_pointer_set(®s, ibs_data.regs[1]); + regs.flags |= PERF_EFLAGS_EXACT; + } + + if (event->attr.sample_type & PERF_SAMPLE_RAW) { + raw.size = sizeof(u32) + ibs_data.size; + raw.data = ibs_data.data; + data.raw = &raw; + } + + throttle = perf_event_overflow(event, &data, ®s); +out: + if (throttle) + perf_ibs_disable_event(perf_ibs, hwc, *config); + else + perf_ibs_enable_event(perf_ibs, hwc, period >> 4); + + perf_event_update_userpage(event); + + return 1; +} + +static int __kprobes +perf_ibs_nmi_handler(unsigned int cmd, struct pt_regs *regs) +{ + int handled = 0; + + handled += perf_ibs_handle_irq(&perf_ibs_fetch, regs); + handled += perf_ibs_handle_irq(&perf_ibs_op, regs); + + if (handled) + inc_irq_stat(apic_perf_irqs); + + return handled; +} + +static __init int perf_ibs_pmu_init(struct perf_ibs *perf_ibs, char *name) +{ + struct cpu_perf_ibs __percpu *pcpu; + int ret; + + pcpu = alloc_percpu(struct cpu_perf_ibs); + if (!pcpu) + return -ENOMEM; + + perf_ibs->pcpu = pcpu; + + ret = perf_pmu_register(&perf_ibs->pmu, name, -1); + if (ret) { + perf_ibs->pcpu = NULL; + free_percpu(pcpu); + } + + return ret; +} + static __init int perf_event_ibs_init(void) { if (!ibs_caps) return -ENODEV; /* ibs not supported by the cpu */ - perf_pmu_register(&perf_ibs, "ibs", -1); + perf_ibs_pmu_init(&perf_ibs_fetch, "ibs_fetch"); + if (ibs_caps & IBS_CAPS_OPCNT) + perf_ibs_op.config_mask |= IBS_OP_CNT_CTL; + perf_ibs_pmu_init(&perf_ibs_op, "ibs_op"); + register_nmi_handler(NMI_LOCAL, perf_ibs_nmi_handler, 0, "perf_ibs"); printk(KERN_INFO "perf: AMD IBS detected (0x%08x)\n", ibs_caps); return 0; diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 26b3e2f..166546e 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1027,8 +1027,6 @@ static int intel_pmu_handle_irq(struct pt_regs *regs) u64 status; int handled; - perf_sample_data_init(&data, 0); - cpuc = &__get_cpu_var(cpu_hw_events); /* @@ -1082,7 +1080,7 @@ again: if (!intel_pmu_save_and_restart(event)) continue; - data.period = event->hw.last_period; + perf_sample_data_init(&data, 0, event->hw.last_period); if (has_branch_stack(event)) data.br_stack = &cpuc->lbr_stack; diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 7f64df1..5a3edc2 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -316,8 +316,7 @@ int intel_pmu_drain_bts_buffer(void) ds->bts_index = ds->bts_buffer_base; - perf_sample_data_init(&data, 0); - data.period = event->hw.last_period; + perf_sample_data_init(&data, 0, event->hw.last_period); regs.ip = 0; /* @@ -564,8 +563,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event, if (!intel_pmu_save_and_restart(event)) return; - perf_sample_data_init(&data, 0); - data.period = event->hw.last_period; + perf_sample_data_init(&data, 0, event->hw.last_period); /* * We use the interrupt regs as a base because the PEBS record diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c index a2dfacf..47124a7 100644 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ b/arch/x86/kernel/cpu/perf_event_p4.c @@ -1005,8 +1005,6 @@ static int p4_pmu_handle_irq(struct pt_regs *regs) int idx, handled = 0; u64 val; - perf_sample_data_init(&data, 0); - cpuc = &__get_cpu_var(cpu_hw_events); for (idx = 0; idx < x86_pmu.num_counters; idx++) { @@ -1034,10 +1032,12 @@ static int p4_pmu_handle_irq(struct pt_regs *regs) handled += overflow; /* event overflow for sure */ - data.period = event->hw.last_period; + perf_sample_data_init(&data, 0, hwc->last_period); if (!x86_perf_event_set_period(event)) continue; + + if (perf_event_overflow(event, &data, regs)) x86_pmu_stop(event, 0); } diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index c9a281f..4243e8b 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -24,40 +24,21 @@ #include <trace/syscall.h> #include <asm/cacheflush.h> +#include <asm/kprobes.h> #include <asm/ftrace.h> #include <asm/nops.h> -#include <asm/nmi.h> - #ifdef CONFIG_DYNAMIC_FTRACE -/* - * modifying_code is set to notify NMIs that they need to use - * memory barriers when entering or exiting. But we don't want - * to burden NMIs with unnecessary memory barriers when code - * modification is not being done (which is most of the time). - * - * A mutex is already held when ftrace_arch_code_modify_prepare - * and post_process are called. No locks need to be taken here. - * - * Stop machine will make sure currently running NMIs are done - * and new NMIs will see the updated variable before we need - * to worry about NMIs doing memory barriers. - */ -static int modifying_code __read_mostly; -static DEFINE_PER_CPU(int, save_modifying_code); - int ftrace_arch_code_modify_prepare(void) { set_kernel_text_rw(); set_all_modules_text_rw(); - modifying_code = 1; return 0; } int ftrace_arch_code_modify_post_process(void) { - modifying_code = 0; set_all_modules_text_ro(); set_kernel_text_ro(); return 0; @@ -90,134 +71,6 @@ static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr) return calc.code; } -/* - * Modifying code must take extra care. On an SMP machine, if - * the code being modified is also being executed on another CPU - * that CPU will have undefined results and possibly take a GPF. - * We use kstop_machine to stop other CPUS from exectuing code. - * But this does not stop NMIs from happening. We still need - * to protect against that. We separate out the modification of - * the code to take care of this. - * - * Two buffers are added: An IP buffer and a "code" buffer. - * - * 1) Put the instruction pointer into the IP buffer - * and the new code into the "code" buffer. - * 2) Wait for any running NMIs to finish and set a flag that says - * we are modifying code, it is done in an atomic operation. - * 3) Write the code - * 4) clear the flag. - * 5) Wait for any running NMIs to finish. - * - * If an NMI is executed, the first thing it does is to call - * "ftrace_nmi_enter". This will check if the flag is set to write - * and if it is, it will write what is in the IP and "code" buffers. - * - * The trick is, it does not matter if everyone is writing the same - * content to the code location. Also, if a CPU is executing code - * it is OK to write to that code location if the contents being written - * are the same as what exists. - */ - -#define MOD_CODE_WRITE_FLAG (1 << 31) /* set when NMI should do the write */ -static atomic_t nmi_running = ATOMIC_INIT(0); -static int mod_code_status; /* holds return value of text write */ -static void *mod_code_ip; /* holds the IP to write to */ -static const void *mod_code_newcode; /* holds the text to write to the IP */ - -static unsigned nmi_wait_count; -static atomic_t nmi_update_count = ATOMIC_INIT(0); - -int ftrace_arch_read_dyn_info(char *buf, int size) -{ - int r; - - r = snprintf(buf, size, "%u %u", - nmi_wait_count, - atomic_read(&nmi_update_count)); - return r; -} - -static void clear_mod_flag(void) -{ - int old = atomic_read(&nmi_running); - - for (;;) { - int new = old & ~MOD_CODE_WRITE_FLAG; - - if (old == new) - break; - - old = atomic_cmpxchg(&nmi_running, old, new); - } -} - -static void ftrace_mod_code(void) -{ - /* - * Yes, more than one CPU process can be writing to mod_code_status. - * (and the code itself) - * But if one were to fail, then they all should, and if one were - * to succeed, then they all should. - */ - mod_code_status = probe_kernel_write(mod_code_ip, mod_code_newcode, - MCOUNT_INSN_SIZE); - - /* if we fail, then kill any new writers */ - if (mod_code_status) - clear_mod_flag(); -} - -void ftrace_nmi_enter(void) -{ - __this_cpu_write(save_modifying_code, modifying_code); - - if (!__this_cpu_read(save_modifying_code)) - return; - - if (atomic_inc_return(&nmi_running) & MOD_CODE_WRITE_FLAG) { - smp_rmb(); - ftrace_mod_code(); - atomic_inc(&nmi_update_count); - } - /* Must have previous changes seen before executions */ - smp_mb(); -} - -void ftrace_nmi_exit(void) -{ - if (!__this_cpu_read(save_modifying_code)) - return; - - /* Finish all executions before clearing nmi_running */ - smp_mb(); - atomic_dec(&nmi_running); -} - -static void wait_for_nmi_and_set_mod_flag(void) -{ - if (!atomic_cmpxchg(&nmi_running, 0, MOD_CODE_WRITE_FLAG)) - return; - - do { - cpu_relax(); - } while (atomic_cmpxchg(&nmi_running, 0, MOD_CODE_WRITE_FLAG)); - - nmi_wait_count++; -} - -static void wait_for_nmi(void) -{ - if (!atomic_read(&nmi_running)) - return; - - do { - cpu_relax(); - } while (atomic_read(&nmi_running)); - - nmi_wait_count++; -} - static inline int within(unsigned long addr, unsigned long start, unsigned long end) { @@ -238,26 +91,7 @@ do_ftrace_mod_code(unsigned long ip, const void *new_code) if (within(ip, (unsigned long)_text, (unsigned long)_etext)) ip = (unsigned long)__va(__pa(ip)); - mod_code_ip = (void *)ip; - mod_code_newcode = new_code; - - /* The buffers need to be visible before we let NMIs write them */ - smp_mb(); - - wait_for_nmi_and_set_mod_flag(); - - /* Make sure all running NMIs have finished before we write the code */ - smp_mb(); - - ftrace_mod_code(); - - /* Make sure the write happens before clearing the bit */ - smp_mb(); - - clear_mod_flag(); - wait_for_nmi(); - - return mod_code_status; + return probe_kernel_write((void *)ip, new_code, MCOUNT_INSN_SIZE); } static const unsigned char *ftrace_nop_replace(void) @@ -334,6 +168,347 @@ int ftrace_update_ftrace_func(ftrace_func_t func) return ret; } +int modifying_ftrace_code __read_mostly; + +/* + * A breakpoint was added to the code address we are about to + * modify, and this is the handle that will just skip over it. + * We are either changing a nop into a trace call, or a trace + * call to a nop. While the change is taking place, we treat + * it just like it was a nop. + */ +int ftrace_int3_handler(struct pt_regs *regs) +{ + if (WARN_ON_ONCE(!regs)) + return 0; + + if (!ftrace_location(regs->ip - 1)) + return 0; + + regs->ip += MCOUNT_INSN_SIZE - 1; + + return 1; +} + +static int ftrace_write(unsigned long ip, const char *val, int size) +{ + /* + * On x86_64, kernel text mappings are mapped read-only with + * CONFIG_DEBUG_RODATA. So we use the kernel identity mapping instead + * of the kernel text mapping to modify the kernel text. + * + * For 32bit kernels, these mappings are same and we can use + * kernel identity mapping to modify code. + */ + if (within(ip, (unsigned long)_text, (unsigned long)_etext)) + ip = (unsigned long)__va(__pa(ip)); + + return probe_kernel_write((void *)ip, val, size); +} + +static int add_break(unsigned long ip, const char *old) +{ + unsigned char replaced[MCOUNT_INSN_SIZE]; + unsigned char brk = BREAKPOINT_INSTRUCTION; + + if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE)) + return -EFAULT; + + /* Make sure it is what we expect it to be */ + if (memcmp(replaced, old, MCOUNT_INSN_SIZE) != 0) + return -EINVAL; + + if (ftrace_write(ip, &brk, 1)) + return -EPERM; + + return 0; +} + +static int add_brk_on_call(struct dyn_ftrace *rec, unsigned long addr) +{ + unsigned const char *old; + unsigned long ip = rec->ip; + + old = ftrace_call_replace(ip, addr); + + return add_break(rec->ip, old); +} + + +static int add_brk_on_nop(struct dyn_ftrace *rec) +{ + unsigned const char *old; + + old = ftrace_nop_replace(); + + return add_break(rec->ip, old); +} + +static int add_breakpoints(struct dyn_ftrace *rec, int enable) +{ + unsigned long ftrace_addr; + int ret; + + ret = ftrace_test_record(rec, enable); + + ftrace_addr = (unsigned long)FTRACE_ADDR; + + switch (ret) { + case FTRACE_UPDATE_IGNORE: + return 0; + + case FTRACE_UPDATE_MAKE_CALL: + /* converting nop to call */ + return add_brk_on_nop(rec); + + case FTRACE_UPDATE_MAKE_NOP: + /* converting a call to a nop */ + return add_brk_on_call(rec, ftrace_addr); + } + return 0; +} + +/* + * On error, we need to remove breakpoints. This needs to + * be done caefully. If the address does not currently have a + * breakpoint, we know we are done. Otherwise, we look at the + * remaining 4 bytes of the instruction. If it matches a nop + * we replace the breakpoint with the nop. Otherwise we replace + * it with the call instruction. + */ +static int remove_breakpoint(struct dyn_ftrace *rec) +{ + unsigned char ins[MCOUNT_INSN_SIZE]; + unsigned char brk = BREAKPOINT_INSTRUCTION; + const unsigned char *nop; + unsigned long ftrace_addr; + unsigned long ip = rec->ip; + + /* If we fail the read, just give up */ + if (probe_kernel_read(ins, (void *)ip, MCOUNT_INSN_SIZE)) + return -EFAULT; + + /* If this does not have a breakpoint, we are done */ + if (ins[0] != brk) + return -1; + + nop = ftrace_nop_replace(); + + /* + * If the last 4 bytes of the instruction do not match + * a nop, then we assume that this is a call to ftrace_addr. + */ + if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) != 0) { + /* + * For extra paranoidism, we check if the breakpoint is on + * a call that would actually jump to the ftrace_addr. + * If not, don't touch the breakpoint, we make just create + * a disaster. + */ + ftrace_addr = (unsigned long)FTRACE_ADDR; + nop = ftrace_call_replace(ip, ftrace_addr); + + if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) != 0) + return -EINVAL; + } + + return probe_kernel_write((void *)ip, &nop[0], 1); +} + +static int add_update_code(unsigned long ip, unsigned const char *new) +{ + /* skip breakpoint */ + ip++; + new++; + if (ftrace_write(ip, new, MCOUNT_INSN_SIZE - 1)) + return -EPERM; + return 0; +} + +static int add_update_call(struct dyn_ftrace *rec, unsigned long addr) +{ + unsigned long ip = rec->ip; + unsigned const char *new; + + new = ftrace_call_replace(ip, addr); + return add_update_code(ip, new); +} + +static int add_update_nop(struct dyn_ftrace *rec) +{ + unsigned long ip = rec->ip; + unsigned const char *new; + + new = ftrace_nop_replace(); + return add_update_code(ip, new); +} + +static int add_update(struct dyn_ftrace *rec, int enable) +{ + unsigned long ftrace_addr; + int ret; + + ret = ftrace_test_record(rec, enable); + + ftrace_addr = (unsigned long)FTRACE_ADDR; + + switch (ret) { + case FTRACE_UPDATE_IGNORE: + return 0; + + case FTRACE_UPDATE_MAKE_CALL: + /* converting nop to call */ + return add_update_call(rec, ftrace_addr); + + case FTRACE_UPDATE_MAKE_NOP: + /* converting a call to a nop */ + return add_update_nop(rec); + } + + return 0; +} + +static int finish_update_call(struct dyn_ftrace *rec, unsigned long addr) +{ + unsigned long ip = rec->ip; + unsigned const char *new; + + new = ftrace_call_replace(ip, addr); + + if (ftrace_write(ip, new, 1)) + return -EPERM; + + return 0; +} + +static int finish_update_nop(struct dyn_ftrace *rec) +{ + unsigned long ip = rec->ip; + unsigned const char *new; + + new = ftrace_nop_replace(); + + if (ftrace_write(ip, new, 1)) + return -EPERM; + return 0; +} + +static int finish_update(struct dyn_ftrace *rec, int enable) +{ + unsigned long ftrace_addr; + int ret; + + ret = ftrace_update_record(rec, enable); + + ftrace_addr = (unsigned long)FTRACE_ADDR; + + switch (ret) { + case FTRACE_UPDATE_IGNORE: + return 0; + + case FTRACE_UPDATE_MAKE_CALL: + /* converting nop to call */ + return finish_update_call(rec, ftrace_addr); + + case FTRACE_UPDATE_MAKE_NOP: + /* converting a call to a nop */ + return finish_update_nop(rec); + } + + return 0; +} + +static void do_sync_core(void *data) +{ + sync_core(); +} + +static void run_sync(void) +{ + int enable_irqs = irqs_disabled(); + + /* We may be called with interrupts disbled (on bootup). */ + if (enable_irqs) + local_irq_enable(); + on_each_cpu(do_sync_core, NULL, 1); + if (enable_irqs) + local_irq_disable(); +} + +static void ftrace_replace_code(int enable) +{ + struct ftrace_rec_iter *iter; + struct dyn_ftrace *rec; + const char *report = "adding breakpoints"; + int count = 0; + int ret; + + for_ftrace_rec_iter(iter) { + rec = ftrace_rec_iter_record(iter); + + ret = add_breakpoints(rec, enable); + if (ret) + goto remove_breakpoints; + count++; + } + + run_sync(); + + report = "updating code"; + + for_ftrace_rec_iter(iter) { + rec = ftrace_rec_iter_record(iter); + + ret = add_update(rec, enable); + if (ret) + goto remove_breakpoints; + } + + run_sync(); + + report = "removing breakpoints"; + + for_ftrace_rec_iter(iter) { + rec = ftrace_rec_iter_record(iter); + + ret = finish_update(rec, enable); + if (ret) + goto remove_breakpoints; + } + + run_sync(); + + return; + + remove_breakpoints: + ftrace_bug(ret, rec ? rec->ip : 0); + printk(KERN_WARNING "Failed on %s (%d):\n", report, count); + for_ftrace_rec_iter(iter) { + rec = ftrace_rec_iter_record(iter); + remove_breakpoint(rec); + } +} + +void arch_ftrace_update_code(int command) +{ + modifying_ftrace_code++; + + if (command & FTRACE_UPDATE_CALLS) + ftrace_replace_code(1); + else if (command & FTRACE_DISABLE_CALLS) + ftrace_replace_code(0); + + if (command & FTRACE_UPDATE_TRACE_FUNC) + ftrace_update_ftrace_func(ftrace_trace_function); + + if (command & FTRACE_START_FUNC_RET) + ftrace_enable_ftrace_graph_caller(); + else if (command & FTRACE_STOP_FUNC_RET) + ftrace_disable_ftrace_graph_caller(); + + modifying_ftrace_code--; +} + int __init ftrace_dyn_arch_init(void *data) { /* The return code is retured via data */ diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 47acaf3..eb1539e 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -84,7 +84,7 @@ __setup("unknown_nmi_panic", setup_unknown_nmi_panic); #define nmi_to_desc(type) (&nmi_desc[type]) -static int notrace __kprobes nmi_handle(unsigned int type, struct pt_regs *regs, bool b2b) +static int __kprobes nmi_handle(unsigned int type, struct pt_regs *regs, bool b2b) { struct nmi_desc *desc = nmi_to_desc(type); struct nmiaction *a; @@ -209,7 +209,7 @@ void unregister_nmi_handler(unsigned int type, const char *name) EXPORT_SYMBOL_GPL(unregister_nmi_handler); -static notrace __kprobes void +static __kprobes void pci_serr_error(unsigned char reason, struct pt_regs *regs) { pr_emerg("NMI: PCI system error (SERR) for reason %02x on CPU %d.\n", @@ -236,7 +236,7 @@ pci_serr_error(unsigned char reason, struct pt_regs *regs) outb(reason, NMI_REASON_PORT); } -static notrace __kprobes void +static __kprobes void io_check_error(unsigned char reason, struct pt_regs *regs) { unsigned long i; @@ -263,7 +263,7 @@ io_check_error(unsigned char reason, struct pt_regs *regs) outb(reason, NMI_REASON_PORT); } -static notrace __kprobes void +static __kprobes void unknown_nmi_error(unsigned char reason, struct pt_regs *regs) { int handled; @@ -305,7 +305,7 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs) static DEFINE_PER_CPU(bool, swallow_nmi); static DEFINE_PER_CPU(unsigned long, last_nmi_rip); -static notrace __kprobes void default_do_nmi(struct pt_regs *regs) +static __kprobes void default_do_nmi(struct pt_regs *regs) { unsigned char reason = 0; int handled; diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index ff9281f1..92d5756 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -50,6 +50,7 @@ #include <asm/processor.h> #include <asm/debugreg.h> #include <linux/atomic.h> +#include <asm/ftrace.h> #include <asm/traps.h> #include <asm/desc.h> #include <asm/i387.h> @@ -303,8 +304,13 @@ gp_in_kernel: } /* May run on IST stack. */ -dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code) +dotraplinkage void __kprobes notrace do_int3(struct pt_regs *regs, long error_code) { +#ifdef CONFIG_DYNAMIC_FTRACE + /* ftrace must be first, everything else may cause a recursive crash */ + if (unlikely(modifying_ftrace_code) && ftrace_int3_handler(regs)) + return; +#endif #ifdef CONFIG_KGDB_LOW_LEVEL_TRAP if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP, SIGTRAP) == NOTIFY_STOP) diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index f386dc4..7515cf0e 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -216,9 +216,9 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) current_thread_info()->sig_on_uaccess_error = 1; /* - * 0 is a valid user pointer (in the access_ok sense) on 32-bit and + * NULL is a valid user pointer (in the access_ok sense) on 32-bit and * 64-bit, so we don't need to special-case it here. For all the - * vsyscalls, 0 means "don't write anything" not "write it at + * vsyscalls, NULL means "don't write anything" not "write it at * address 0". */ ret = -EFAULT; @@ -247,7 +247,7 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) ret = sys_getcpu((unsigned __user *)regs->di, (unsigned __user *)regs->si, - 0); + NULL); break; } diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 173df38..2e88438 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -459,17 +459,17 @@ void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu) pmu->available_event_types = ~entry->ebx & ((1ull << bitmap_len) - 1); if (pmu->version == 1) { - pmu->global_ctrl = (1 << pmu->nr_arch_gp_counters) - 1; - return; + pmu->nr_arch_fixed_counters = 0; + } else { + pmu->nr_arch_fixed_counters = min((int)(entry->edx & 0x1f), + X86_PMC_MAX_FIXED); + pmu->counter_bitmask[KVM_PMC_FIXED] = + ((u64)1 << ((entry->edx >> 5) & 0xff)) - 1; } - pmu->nr_arch_fixed_counters = min((int)(entry->edx & 0x1f), - X86_PMC_MAX_FIXED); - pmu->counter_bitmask[KVM_PMC_FIXED] = - ((u64)1 << ((entry->edx >> 5) & 0xff)) - 1; - pmu->global_ctrl_mask = ~(((1 << pmu->nr_arch_gp_counters) - 1) - | (((1ull << pmu->nr_arch_fixed_counters) - 1) - << X86_PMC_IDX_FIXED)); + pmu->global_ctrl = ((1 << pmu->nr_arch_gp_counters) - 1) | + (((1ull << pmu->nr_arch_fixed_counters) - 1) << X86_PMC_IDX_FIXED); + pmu->global_ctrl_mask = ~pmu->global_ctrl; } void kvm_pmu_init(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index ad85adf..4ff0ab9 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2210,9 +2210,12 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) msr = find_msr_entry(vmx, msr_index); if (msr) { msr->data = data; - if (msr - vmx->guest_msrs < vmx->save_nmsrs) + if (msr - vmx->guest_msrs < vmx->save_nmsrs) { + preempt_disable(); kvm_set_shared_msr(msr->index, msr->data, msr->mask); + preempt_enable(); + } break; } ret = kvm_set_msr_common(vcpu, msr_index, data); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4044ce0..91a5e98 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6336,13 +6336,11 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, if (npages && !old.rmap) { unsigned long userspace_addr; - down_write(¤t->mm->mmap_sem); - userspace_addr = do_mmap(NULL, 0, + userspace_addr = vm_mmap(NULL, 0, npages * PAGE_SIZE, PROT_READ | PROT_WRITE, map_flags, 0); - up_write(¤t->mm->mmap_sem); if (IS_ERR((void *)userspace_addr)) return PTR_ERR((void *)userspace_addr); @@ -6366,10 +6364,8 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, if (!user_alloc && !old.user_alloc && old.rmap && !npages) { int ret; - down_write(¤t->mm->mmap_sem); - ret = do_munmap(current->mm, old.userspace_addr, + ret = vm_munmap(old.userspace_addr, old.npages * PAGE_SIZE); - up_write(¤t->mm->mmap_sem); if (ret < 0) printk(KERN_WARNING "kvm_vm_ioctl_set_memory_region: " diff --git a/arch/x86/lib/insn.c b/arch/x86/lib/insn.c index 25feb1a..b1e6c4b 100644 --- a/arch/x86/lib/insn.c +++ b/arch/x86/lib/insn.c @@ -379,8 +379,8 @@ err_out: return; } -/* Decode moffset16/32/64 */ -static void __get_moffset(struct insn *insn) +/* Decode moffset16/32/64. Return 0 if failed */ +static int __get_moffset(struct insn *insn) { switch (insn->addr_bytes) { case 2: @@ -397,15 +397,19 @@ static void __get_moffset(struct insn *insn) insn->moffset2.value = get_next(int, insn); insn->moffset2.nbytes = 4; break; + default: /* opnd_bytes must be modified manually */ + goto err_out; } insn->moffset1.got = insn->moffset2.got = 1; + return 1; + err_out: - return; + return 0; } -/* Decode imm v32(Iz) */ -static void __get_immv32(struct insn *insn) +/* Decode imm v32(Iz). Return 0 if failed */ +static int __get_immv32(struct insn *insn) { switch (insn->opnd_bytes) { case 2: @@ -417,14 +421,18 @@ static void __get_immv32(struct insn *insn) insn->immediate.value = get_next(int, insn); insn->immediate.nbytes = 4; break; + default: /* opnd_bytes must be modified manually */ + goto err_out; } + return 1; + err_out: - return; + return 0; } -/* Decode imm v64(Iv/Ov) */ -static void __get_immv(struct insn *insn) +/* Decode imm v64(Iv/Ov), Return 0 if failed */ +static int __get_immv(struct insn *insn) { switch (insn->opnd_bytes) { case 2: @@ -441,15 +449,18 @@ static void __get_immv(struct insn *insn) insn->immediate2.value = get_next(int, insn); insn->immediate2.nbytes = 4; break; + default: /* opnd_bytes must be modified manually */ + goto err_out; } insn->immediate1.got = insn->immediate2.got = 1; + return 1; err_out: - return; + return 0; } /* Decode ptr16:16/32(Ap) */ -static void __get_immptr(struct insn *insn) +static int __get_immptr(struct insn *insn) { switch (insn->opnd_bytes) { case 2: @@ -462,14 +473,17 @@ static void __get_immptr(struct insn *insn) break; case 8: /* ptr16:64 is not exist (no segment) */ - return; + return 0; + default: /* opnd_bytes must be modified manually */ + goto err_out; } insn->immediate2.value = get_next(unsigned short, insn); insn->immediate2.nbytes = 2; insn->immediate1.got = insn->immediate2.got = 1; + return 1; err_out: - return; + return 0; } /** @@ -489,7 +503,8 @@ void insn_get_immediate(struct insn *insn) insn_get_displacement(insn); if (inat_has_moffset(insn->attr)) { - __get_moffset(insn); + if (!__get_moffset(insn)) + goto err_out; goto done; } @@ -517,16 +532,20 @@ void insn_get_immediate(struct insn *insn) insn->immediate2.nbytes = 4; break; case INAT_IMM_PTR: - __get_immptr(insn); + if (!__get_immptr(insn)) + goto err_out; break; case INAT_IMM_VWORD32: - __get_immv32(insn); + if (!__get_immv32(insn)) + goto err_out; break; case INAT_IMM_VWORD: - __get_immv(insn); + if (!__get_immv(insn)) + goto err_out; break; default: - break; + /* Here, insn must have an immediate, but failed */ + goto err_out; } if (inat_has_second_immediate(insn->attr)) { insn->immediate2.value = get_next(char, insn); diff --git a/arch/x86/lib/usercopy.c b/arch/x86/lib/usercopy.c index 97be9cb..d6ae30b 100644 --- a/arch/x86/lib/usercopy.c +++ b/arch/x86/lib/usercopy.c @@ -7,6 +7,8 @@ #include <linux/highmem.h> #include <linux/module.h> +#include <asm/word-at-a-time.h> + /* * best effort, GUP based copy_from_user() that is NMI-safe */ @@ -41,3 +43,104 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n) return len; } EXPORT_SYMBOL_GPL(copy_from_user_nmi); + +static inline unsigned long count_bytes(unsigned long mask) +{ + mask = (mask - 1) & ~mask; + mask >>= 7; + return count_masked_bytes(mask); +} + +/* + * Do a strncpy, return length of string without final '\0'. + * 'count' is the user-supplied count (return 'count' if we + * hit it), 'max' is the address space maximum (and we return + * -EFAULT if we hit it). + */ +static inline long do_strncpy_from_user(char *dst, const char __user *src, long count, unsigned long max) +{ + long res = 0; + + /* + * Truncate 'max' to the user-specified limit, so that + * we only have one limit we need to check in the loop + */ + if (max > count) + max = count; + + while (max >= sizeof(unsigned long)) { + unsigned long c; + + /* Fall back to byte-at-a-time if we get a page fault */ + if (unlikely(__get_user(c,(unsigned long __user *)(src+res)))) + break; + /* This can write a few bytes past the NUL character, but that's ok */ + *(unsigned long *)(dst+res) = c; + c = has_zero(c); + if (c) + return res + count_bytes(c); + res += sizeof(unsigned long); + max -= sizeof(unsigned long); + } + + while (max) { + char c; + + if (unlikely(__get_user(c,src+res))) + return -EFAULT; + dst[res] = c; + if (!c) + return res; + res++; + max--; + } + + /* + * Uhhuh. We hit 'max'. But was that the user-specified maximum + * too? If so, that's ok - we got as much as the user asked for. + */ + if (res >= count) + return res; + + /* + * Nope: we hit the address space limit, and we still had more + * characters the caller would have wanted. That's an EFAULT. + */ + return -EFAULT; +} + +/** + * strncpy_from_user: - Copy a NUL terminated string from userspace. + * @dst: Destination address, in kernel space. This buffer must be at + * least @count bytes long. + * @src: Source address, in user space. + * @count: Maximum number of bytes to copy, including the trailing NUL. + * + * Copies a NUL-terminated string from userspace to kernel space. + * + * On success, returns the length of the string (not including the trailing + * NUL). + * + * If access to userspace fails, returns -EFAULT (some data may have been + * copied). + * + * If @count is smaller than the length of the string, copies @count bytes + * and returns @count. + */ +long +strncpy_from_user(char *dst, const char __user *src, long count) +{ + unsigned long max_addr, src_addr; + + if (unlikely(count <= 0)) + return 0; + + max_addr = current_thread_info()->addr_limit.seg; + src_addr = (unsigned long)src; + if (likely(src_addr < max_addr)) { + unsigned long max = max_addr - src_addr; + return do_strncpy_from_user(dst, src, count, max); + } + return -EFAULT; +} +EXPORT_SYMBOL(strncpy_from_user); diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c index d9b094c..ef2a6a5 100644 --- a/arch/x86/lib/usercopy_32.c +++ b/arch/x86/lib/usercopy_32.c @@ -33,93 +33,6 @@ static inline int __movsl_is_ok(unsigned long a1, unsigned long a2, unsigned lon __movsl_is_ok((unsigned long)(a1), (unsigned long)(a2), (n)) /* - * Copy a null terminated string from userspace. - */ - -#define __do_strncpy_from_user(dst, src, count, res) \ -do { \ - int __d0, __d1, __d2; \ - might_fault(); \ - __asm__ __volatile__( \ - " testl %1,%1\n" \ - " jz 2f\n" \ - "0: lodsb\n" \ - " stosb\n" \ - " testb %%al,%%al\n" \ - " jz 1f\n" \ - " decl %1\n" \ - " jnz 0b\n" \ - "1: subl %1,%0\n" \ - "2:\n" \ - ".section .fixup,\"ax\"\n" \ - "3: movl %5,%0\n" \ - " jmp 2b\n" \ - ".previous\n" \ - _ASM_EXTABLE(0b,3b) \ - : "=&d"(res), "=&c"(count), "=&a" (__d0), "=&S" (__d1), \ - "=&D" (__d2) \ - : "i"(-EFAULT), "0"(count), "1"(count), "3"(src), "4"(dst) \ - : "memory"); \ -} while (0) - -/** - * __strncpy_from_user: - Copy a NUL terminated string from userspace, with less checking. - * @dst: Destination address, in kernel space. This buffer must be at - * least @count bytes long. - * @src: Source address, in user space. - * @count: Maximum number of bytes to copy, including the trailing NUL. - * - * Copies a NUL-terminated string from userspace to kernel space. - * Caller must check the specified block with access_ok() before calling - * this function. - * - * On success, returns the length of the string (not including the trailing - * NUL). - * - * If access to userspace fails, returns -EFAULT (some data may have been - * copied). - * - * If @count is smaller than the length of the string, copies @count bytes - * and returns @count. - */ -long -__strncpy_from_user(char *dst, const char __user *src, long count) -{ - long res; - __do_strncpy_from_user(dst, src, count, res); - return res; -} -EXPORT_SYMBOL(__strncpy_from_user); - -/** - * strncpy_from_user: - Copy a NUL terminated string from userspace. - * @dst: Destination address, in kernel space. This buffer must be at - * least @count bytes long. - * @src: Source address, in user space. - * @count: Maximum number of bytes to copy, including the trailing NUL. - * - * Copies a NUL-terminated string from userspace to kernel space. - * - * On success, returns the length of the string (not including the trailing - * NUL). - * - * If access to userspace fails, returns -EFAULT (some data may have been - * copied). - * - * If @count is smaller than the length of the string, copies @count bytes - * and returns @count. - */ -long -strncpy_from_user(char *dst, const char __user *src, long count) -{ - long res = -EFAULT; - if (access_ok(VERIFY_READ, src, 1)) - __do_strncpy_from_user(dst, src, count, res); - return res; -} -EXPORT_SYMBOL(strncpy_from_user); - -/* * Zero Userspace */ diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c index b7c2849..0d0326f 100644 --- a/arch/x86/lib/usercopy_64.c +++ b/arch/x86/lib/usercopy_64.c @@ -9,55 +9,6 @@ #include <asm/uaccess.h> /* - * Copy a null terminated string from userspace. - */ - -#define __do_strncpy_from_user(dst,src,count,res) \ -do { \ - long __d0, __d1, __d2; \ - might_fault(); \ - __asm__ __volatile__( \ - " testq %1,%1\n" \ - " jz 2f\n" \ - "0: lodsb\n" \ - " stosb\n" \ - " testb %%al,%%al\n" \ - " jz 1f\n" \ - " decq %1\n" \ - " jnz 0b\n" \ - "1: subq %1,%0\n" \ - "2:\n" \ - ".section .fixup,\"ax\"\n" \ - "3: movq %5,%0\n" \ - " jmp 2b\n" \ - ".previous\n" \ - _ASM_EXTABLE(0b,3b) \ - : "=&r"(res), "=&c"(count), "=&a" (__d0), "=&S" (__d1), \ - "=&D" (__d2) \ - : "i"(-EFAULT), "0"(count), "1"(count), "3"(src), "4"(dst) \ - : "memory"); \ -} while (0) - -long -__strncpy_from_user(char *dst, const char __user *src, long count) -{ - long res; - __do_strncpy_from_user(dst, src, count, res); - return res; -} -EXPORT_SYMBOL(__strncpy_from_user); - -long -strncpy_from_user(char *dst, const char __user *src, long count) -{ - long res = -EFAULT; - if (access_ok(VERIFY_READ, src, 1)) - return __strncpy_from_user(dst, src, count); - return res; -} -EXPORT_SYMBOL(strncpy_from_user); - -/* * Zero Userspace */ diff --git a/arch/x86/um/asm/barrier.h b/arch/x86/um/asm/barrier.h new file mode 100644 index 0000000..7d01b8c --- /dev/null +++ b/arch/x86/um/asm/barrier.h @@ -0,0 +1,75 @@ +#ifndef _ASM_UM_BARRIER_H_ +#define _ASM_UM_BARRIER_H_ + +#include <asm/asm.h> +#include <asm/segment.h> +#include <asm/cpufeature.h> +#include <asm/cmpxchg.h> +#include <asm/nops.h> + +#include <linux/kernel.h> +#include <linux/irqflags.h> + +/* + * Force strict CPU ordering. + * And yes, this is required on UP too when we're talking + * to devices. + */ +#ifdef CONFIG_X86_32 + +#define mb() alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2) +#define rmb() alternative("lock; addl $0,0(%%esp)", "lfence", X86_FEATURE_XMM2) +#define wmb() alternative("lock; addl $0,0(%%esp)", "sfence", X86_FEATURE_XMM) + +#else /* CONFIG_X86_32 */ + +#define mb() asm volatile("mfence" : : : "memory") +#define rmb() asm volatile("lfence" : : : "memory") +#define wmb() asm volatile("sfence" : : : "memory") + +#endif /* CONFIG_X86_32 */ + +#define read_barrier_depends() do { } while (0) + +#ifdef CONFIG_SMP + +#define smp_mb() mb() +#ifdef CONFIG_X86_PPRO_FENCE +#define smp_rmb() rmb() +#else /* CONFIG_X86_PPRO_FENCE */ +#define smp_rmb() barrier() +#endif /* CONFIG_X86_PPRO_FENCE */ + +#ifdef CONFIG_X86_OOSTORE +#define smp_wmb() wmb() +#else /* CONFIG_X86_OOSTORE */ +#define smp_wmb() barrier() +#endif /* CONFIG_X86_OOSTORE */ + +#define smp_read_barrier_depends() read_barrier_depends() +#define set_mb(var, value) do { (void)xchg(&var, value); } while (0) + +#else /* CONFIG_SMP */ + +#define smp_mb() barrier() +#define smp_rmb() barrier() +#define smp_wmb() barrier() +#define smp_read_barrier_depends() do { } while (0) +#define set_mb(var, value) do { var = value; barrier(); } while (0) + +#endif /* CONFIG_SMP */ + +/* + * Stop RDTSC speculation. This is needed when you need to use RDTSC + * (or get_cycles or vread that possibly accesses the TSC) in a defined + * code region. + * + * (Could use an alternative three way for this if there was one.) + */ +static inline void rdtsc_barrier(void) +{ + alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC); + alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC); +} + +#endif diff --git a/arch/x86/um/asm/system.h b/arch/x86/um/asm/system.h deleted file mode 100644 index a459fd9..0000000 --- a/arch/x86/um/asm/system.h +++ /dev/null @@ -1,135 +0,0 @@ -#ifndef _ASM_X86_SYSTEM_H_ -#define _ASM_X86_SYSTEM_H_ - -#include <asm/asm.h> -#include <asm/segment.h> -#include <asm/cpufeature.h> -#include <asm/cmpxchg.h> -#include <asm/nops.h> - -#include <linux/kernel.h> -#include <linux/irqflags.h> - -/* entries in ARCH_DLINFO: */ -#ifdef CONFIG_IA32_EMULATION -# define AT_VECTOR_SIZE_ARCH 2 -#else -# define AT_VECTOR_SIZE_ARCH 1 -#endif - -extern unsigned long arch_align_stack(unsigned long sp); - -void default_idle(void); - -/* - * Force strict CPU ordering. - * And yes, this is required on UP too when we're talking - * to devices. - */ -#ifdef CONFIG_X86_32 -/* - * Some non-Intel clones support out of order store. wmb() ceases to be a - * nop for these. - */ -#define mb() alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2) -#define rmb() alternative("lock; addl $0,0(%%esp)", "lfence", X86_FEATURE_XMM2) -#define wmb() alternative("lock; addl $0,0(%%esp)", "sfence", X86_FEATURE_XMM) -#else -#define mb() asm volatile("mfence":::"memory") -#define rmb() asm volatile("lfence":::"memory") -#define wmb() asm volatile("sfence" ::: "memory") -#endif - -/** - * read_barrier_depends - Flush all pending reads that subsequents reads - * depend on. - * - * No data-dependent reads from memory-like regions are ever reordered - * over this barrier. All reads preceding this primitive are guaranteed - * to access memory (but not necessarily other CPUs' caches) before any - * reads following this primitive that depend on the data return by - * any of the preceding reads. This primitive is much lighter weight than - * rmb() on most CPUs, and is never heavier weight than is - * rmb(). - * - * These ordering constraints are respected by both the local CPU - * and the compiler. - * - * Ordering is not guaranteed by anything other than these primitives, - * not even by data dependencies. See the documentation for - * memory_barrier() for examples and URLs to more information. - * - * For example, the following code would force ordering (the initial - * value of "a" is zero, "b" is one, and "p" is "&a"): - * - * <programlisting> - * CPU 0 CPU 1 - * - * b = 2; - * memory_barrier(); - * p = &b; q = p; - * read_barrier_depends(); - * d = *q; - * </programlisting> - * - * because the read of "*q" depends on the read of "p" and these - * two reads are separated by a read_barrier_depends(). However, - * the following code, with the same initial values for "a" and "b": - * - * <programlisting> - * CPU 0 CPU 1 - * - * a = 2; - * memory_barrier(); - * b = 3; y = b; - * read_barrier_depends(); - * x = a; - * </programlisting> - * - * does not enforce ordering, since there is no data dependency between - * the read of "a" and the read of "b". Therefore, on some CPUs, such - * as Alpha, "y" could be set to 3 and "x" to 0. Use rmb() - * in cases like this where there are no data dependencies. - **/ - -#define read_barrier_depends() do { } while (0) - -#ifdef CONFIG_SMP -#define smp_mb() mb() -#ifdef CONFIG_X86_PPRO_FENCE -# define smp_rmb() rmb() -#else -# define smp_rmb() barrier() -#endif -#ifdef CONFIG_X86_OOSTORE -# define smp_wmb() wmb() -#else -# define smp_wmb() barrier() -#endif -#define smp_read_barrier_depends() read_barrier_depends() -#define set_mb(var, value) do { (void)xchg(&var, value); } while (0) -#else -#define smp_mb() barrier() -#define smp_rmb() barrier() -#define smp_wmb() barrier() -#define smp_read_barrier_depends() do { } while (0) -#define set_mb(var, value) do { var = value; barrier(); } while (0) -#endif - -/* - * Stop RDTSC speculation. This is needed when you need to use RDTSC - * (or get_cycles or vread that possibly accesses the TSC) in a defined - * code region. - * - * (Could use an alternative three way for this if there was one.) - */ -static inline void rdtsc_barrier(void) -{ - alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC); - alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC); -} - -extern void *_switch_to(void *prev, void *next, void *last); -#define switch_to(prev, next, last) prev = _switch_to(prev, next, last) - -#endif |