From 241771ef016b5c0c83cd7a4372a74321c973c1e6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 3 Dec 2008 10:39:53 +0100 Subject: performance counters: x86 support Implement performance counters for x86 Intel CPUs. It's simplified right now: the PERFMON CPU feature is assumed, which is available in Core2 and later Intel CPUs. The design is flexible to be extended to more CPU types as well. Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic.c | 2 + arch/x86/kernel/cpu/Makefile | 12 +- arch/x86/kernel/cpu/common.c | 2 + arch/x86/kernel/cpu/perf_counter.c | 571 +++++++++++++++++++++++++++++++++++++ arch/x86/kernel/entry_64.S | 5 + arch/x86/kernel/irq.c | 5 + arch/x86/kernel/irqinit_32.c | 3 + arch/x86/kernel/irqinit_64.c | 5 + arch/x86/kernel/signal.c | 7 +- arch/x86/kernel/syscall_table_32.S | 1 + 10 files changed, 607 insertions(+), 6 deletions(-) create mode 100644 arch/x86/kernel/cpu/perf_counter.c (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c index 16f9487..8ab8c18 100644 --- a/arch/x86/kernel/apic.c +++ b/arch/x86/kernel/apic.c @@ -31,6 +31,7 @@ #include #include +#include #include #include #include @@ -1147,6 +1148,7 @@ void __cpuinit setup_local_APIC(void) apic_write(APIC_ESR, 0); } #endif + perf_counters_lapic_init(0); preempt_disable(); diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 82ec607..89e5336 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -1,5 +1,5 @@ # -# Makefile for x86-compatible CPU details and quirks +# Makefile for x86-compatible CPU details, features and quirks # obj-y := intel_cacheinfo.o addon_cpuid_features.o @@ -16,11 +16,13 @@ obj-$(CONFIG_CPU_SUP_CENTAUR_64) += centaur_64.o obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o -obj-$(CONFIG_X86_MCE) += mcheck/ -obj-$(CONFIG_MTRR) += mtrr/ -obj-$(CONFIG_CPU_FREQ) += cpufreq/ +obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o -obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o +obj-$(CONFIG_X86_MCE) += mcheck/ +obj-$(CONFIG_MTRR) += mtrr/ +obj-$(CONFIG_CPU_FREQ) += cpufreq/ + +obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o quiet_cmd_mkcapflags = MKCAP $@ cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index b9c9ea0..4461011 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -750,6 +751,7 @@ void __init identify_boot_cpu(void) #else vgetcpu_set_mode(); #endif + init_hw_perf_counters(); } void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c new file mode 100644 index 0000000..82440cb --- /dev/null +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -0,0 +1,571 @@ +/* + * Performance counter x86 architecture code + * + * Copyright(C) 2008 Thomas Gleixner + * Copyright(C) 2008 Red Hat, Inc., Ingo Molnar + * + * For licencing details see kernel-base/COPYING + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +static bool perf_counters_initialized __read_mostly; + +/* + * Number of (generic) HW counters: + */ +static int nr_hw_counters __read_mostly; +static u32 perf_counter_mask __read_mostly; + +/* No support for fixed function counters yet */ + +#define MAX_HW_COUNTERS 8 + +struct cpu_hw_counters { + struct perf_counter *counters[MAX_HW_COUNTERS]; + unsigned long used[BITS_TO_LONGS(MAX_HW_COUNTERS)]; + int enable_all; +}; + +/* + * Intel PerfMon v3. Used on Core2 and later. + */ +static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters); + +const int intel_perfmon_event_map[] = +{ + [PERF_COUNT_CYCLES] = 0x003c, + [PERF_COUNT_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_CACHE_REFERENCES] = 0x4f2e, + [PERF_COUNT_CACHE_MISSES] = 0x412e, + [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x00c4, + [PERF_COUNT_BRANCH_MISSES] = 0x00c5, +}; + +const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map); + +/* + * Setup the hardware configuration for a given hw_event_type + */ +int hw_perf_counter_init(struct perf_counter *counter, s32 hw_event_type) +{ + struct hw_perf_counter *hwc = &counter->hw; + + if (unlikely(!perf_counters_initialized)) + return -EINVAL; + + /* + * Count user events, and generate PMC IRQs: + * (keep 'enabled' bit clear for now) + */ + hwc->config = ARCH_PERFMON_EVENTSEL_USR | ARCH_PERFMON_EVENTSEL_INT; + + /* + * If privileged enough, count OS events too, and allow + * NMI events as well: + */ + hwc->nmi = 0; + if (capable(CAP_SYS_ADMIN)) { + hwc->config |= ARCH_PERFMON_EVENTSEL_OS; + if (hw_event_type & PERF_COUNT_NMI) + hwc->nmi = 1; + } + + hwc->config_base = MSR_ARCH_PERFMON_EVENTSEL0; + hwc->counter_base = MSR_ARCH_PERFMON_PERFCTR0; + + hwc->irq_period = counter->__irq_period; + /* + * Intel PMCs cannot be accessed sanely above 32 bit width, + * so we install an artificial 1<<31 period regardless of + * the generic counter period: + */ + if (!hwc->irq_period) + hwc->irq_period = 0x7FFFFFFF; + + hwc->next_count = -((s32) hwc->irq_period); + + /* + * Negative event types mean raw encoded event+umask values: + */ + if (hw_event_type < 0) { + counter->hw_event_type = -hw_event_type; + counter->hw_event_type &= ~PERF_COUNT_NMI; + } else { + hw_event_type &= ~PERF_COUNT_NMI; + if (hw_event_type >= max_intel_perfmon_events) + return -EINVAL; + /* + * The generic map: + */ + counter->hw_event_type = intel_perfmon_event_map[hw_event_type]; + } + hwc->config |= counter->hw_event_type; + counter->wakeup_pending = 0; + + return 0; +} + +static void __hw_perf_enable_all(void) +{ + wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask, 0); +} + +void hw_perf_enable_all(void) +{ + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + + cpuc->enable_all = 1; + __hw_perf_enable_all(); +} + +void hw_perf_disable_all(void) +{ + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + + cpuc->enable_all = 0; + wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0); +} + +static DEFINE_PER_CPU(u64, prev_next_count[MAX_HW_COUNTERS]); + +static void __hw_perf_counter_enable(struct hw_perf_counter *hwc, int idx) +{ + per_cpu(prev_next_count[idx], smp_processor_id()) = hwc->next_count; + + wrmsr(hwc->counter_base + idx, hwc->next_count, 0); + wrmsr(hwc->config_base + idx, hwc->config, 0); +} + +void hw_perf_counter_enable(struct perf_counter *counter) +{ + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + struct hw_perf_counter *hwc = &counter->hw; + int idx = hwc->idx; + + /* Try to get the previous counter again */ + if (test_and_set_bit(idx, cpuc->used)) { + idx = find_first_zero_bit(cpuc->used, nr_hw_counters); + set_bit(idx, cpuc->used); + hwc->idx = idx; + } + + perf_counters_lapic_init(hwc->nmi); + + wrmsr(hwc->config_base + idx, + hwc->config & ~ARCH_PERFMON_EVENTSEL0_ENABLE, 0); + + cpuc->counters[idx] = counter; + counter->hw.config |= ARCH_PERFMON_EVENTSEL0_ENABLE; + __hw_perf_counter_enable(hwc, idx); +} + +#ifdef CONFIG_X86_64 +static inline void atomic64_counter_set(struct perf_counter *counter, u64 val) +{ + atomic64_set(&counter->count, val); +} + +static inline u64 atomic64_counter_read(struct perf_counter *counter) +{ + return atomic64_read(&counter->count); +} +#else +/* + * Todo: add proper atomic64_t support to 32-bit x86: + */ +static inline void atomic64_counter_set(struct perf_counter *counter, u64 val64) +{ + u32 *val32 = (void *)&val64; + + atomic_set(counter->count32 + 0, *(val32 + 0)); + atomic_set(counter->count32 + 1, *(val32 + 1)); +} + +static inline u64 atomic64_counter_read(struct perf_counter *counter) +{ + return atomic_read(counter->count32 + 0) | + (u64) atomic_read(counter->count32 + 1) << 32; +} +#endif + +static void __hw_perf_save_counter(struct perf_counter *counter, + struct hw_perf_counter *hwc, int idx) +{ + s64 raw = -1; + s64 delta; + int err; + + /* + * Get the raw hw counter value: + */ + err = rdmsrl_safe(hwc->counter_base + idx, &raw); + WARN_ON_ONCE(err); + + /* + * Rebase it to zero (it started counting at -irq_period), + * to see the delta since ->prev_count: + */ + delta = (s64)hwc->irq_period + (s64)(s32)raw; + + atomic64_counter_set(counter, hwc->prev_count + delta); + + /* + * Adjust the ->prev_count offset - if we went beyond + * irq_period of units, then we got an IRQ and the counter + * was set back to -irq_period: + */ + while (delta >= (s64)hwc->irq_period) { + hwc->prev_count += hwc->irq_period; + delta -= (s64)hwc->irq_period; + } + + /* + * Calculate the next raw counter value we'll write into + * the counter at the next sched-in time: + */ + delta -= (s64)hwc->irq_period; + + hwc->next_count = (s32)delta; +} + +void perf_counter_print_debug(void) +{ + u64 ctrl, status, overflow, pmc_ctrl, pmc_count, next_count; + int cpu, err, idx; + + local_irq_disable(); + + cpu = smp_processor_id(); + + err = rdmsrl_safe(MSR_CORE_PERF_GLOBAL_CTRL, &ctrl); + WARN_ON_ONCE(err); + + err = rdmsrl_safe(MSR_CORE_PERF_GLOBAL_STATUS, &status); + WARN_ON_ONCE(err); + + err = rdmsrl_safe(MSR_CORE_PERF_GLOBAL_OVF_CTRL, &overflow); + WARN_ON_ONCE(err); + + printk(KERN_INFO "\n"); + printk(KERN_INFO "CPU#%d: ctrl: %016llx\n", cpu, ctrl); + printk(KERN_INFO "CPU#%d: status: %016llx\n", cpu, status); + printk(KERN_INFO "CPU#%d: overflow: %016llx\n", cpu, overflow); + + for (idx = 0; idx < nr_hw_counters; idx++) { + err = rdmsrl_safe(MSR_ARCH_PERFMON_EVENTSEL0 + idx, &pmc_ctrl); + WARN_ON_ONCE(err); + + err = rdmsrl_safe(MSR_ARCH_PERFMON_PERFCTR0 + idx, &pmc_count); + WARN_ON_ONCE(err); + + next_count = per_cpu(prev_next_count[idx], cpu); + + printk(KERN_INFO "CPU#%d: PMC%d ctrl: %016llx\n", + cpu, idx, pmc_ctrl); + printk(KERN_INFO "CPU#%d: PMC%d count: %016llx\n", + cpu, idx, pmc_count); + printk(KERN_INFO "CPU#%d: PMC%d next: %016llx\n", + cpu, idx, next_count); + } + local_irq_enable(); +} + +void hw_perf_counter_disable(struct perf_counter *counter) +{ + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + struct hw_perf_counter *hwc = &counter->hw; + unsigned int idx = hwc->idx; + + counter->hw.config &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsr(hwc->config_base + idx, hwc->config, 0); + + clear_bit(idx, cpuc->used); + cpuc->counters[idx] = NULL; + __hw_perf_save_counter(counter, hwc, idx); +} + +void hw_perf_counter_read(struct perf_counter *counter) +{ + struct hw_perf_counter *hwc = &counter->hw; + unsigned long addr = hwc->counter_base + hwc->idx; + s64 offs, val = -1LL; + s32 val32; + int err; + + /* Careful: NMI might modify the counter offset */ + do { + offs = hwc->prev_count; + err = rdmsrl_safe(addr, &val); + WARN_ON_ONCE(err); + } while (offs != hwc->prev_count); + + val32 = (s32) val; + val = (s64)hwc->irq_period + (s64)val32; + atomic64_counter_set(counter, hwc->prev_count + val); +} + +static void perf_store_irq_data(struct perf_counter *counter, u64 data) +{ + struct perf_data *irqdata = counter->irqdata; + + if (irqdata->len > PERF_DATA_BUFLEN - sizeof(u64)) { + irqdata->overrun++; + } else { + u64 *p = (u64 *) &irqdata->data[irqdata->len]; + + *p = data; + irqdata->len += sizeof(u64); + } +} + +static void perf_save_and_restart(struct perf_counter *counter) +{ + struct hw_perf_counter *hwc = &counter->hw; + int idx = hwc->idx; + + wrmsr(hwc->config_base + idx, + hwc->config & ~ARCH_PERFMON_EVENTSEL0_ENABLE, 0); + + if (hwc->config & ARCH_PERFMON_EVENTSEL0_ENABLE) { + __hw_perf_save_counter(counter, hwc, idx); + __hw_perf_counter_enable(hwc, idx); + } +} + +static void +perf_handle_group(struct perf_counter *leader, u64 *status, u64 *overflown) +{ + struct perf_counter_context *ctx = leader->ctx; + struct perf_counter *counter; + int bit; + + list_for_each_entry(counter, &ctx->counters, list) { + if (counter->record_type != PERF_RECORD_SIMPLE || + counter == leader) + continue; + + if (counter->active) { + /* + * When counter was not in the overflow mask, we have to + * read it from hardware. We read it as well, when it + * has not been read yet and clear the bit in the + * status mask. + */ + bit = counter->hw.idx; + if (!test_bit(bit, (unsigned long *) overflown) || + test_bit(bit, (unsigned long *) status)) { + clear_bit(bit, (unsigned long *) status); + perf_save_and_restart(counter); + } + } + perf_store_irq_data(leader, counter->hw_event_type); + perf_store_irq_data(leader, atomic64_counter_read(counter)); + } +} + +/* + * This handler is triggered by the local APIC, so the APIC IRQ handling + * rules apply: + */ +static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi) +{ + int bit, cpu = smp_processor_id(); + struct cpu_hw_counters *cpuc; + u64 ack, status; + + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); + if (!status) { + ack_APIC_irq(); + return; + } + + /* Disable counters globally */ + wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0); + ack_APIC_irq(); + + cpuc = &per_cpu(cpu_hw_counters, cpu); + +again: + ack = status; + for_each_bit(bit, (unsigned long *) &status, nr_hw_counters) { + struct perf_counter *counter = cpuc->counters[bit]; + + clear_bit(bit, (unsigned long *) &status); + if (!counter) + continue; + + perf_save_and_restart(counter); + + switch (counter->record_type) { + case PERF_RECORD_SIMPLE: + continue; + case PERF_RECORD_IRQ: + perf_store_irq_data(counter, instruction_pointer(regs)); + break; + case PERF_RECORD_GROUP: + perf_store_irq_data(counter, counter->hw_event_type); + perf_store_irq_data(counter, + atomic64_counter_read(counter)); + perf_handle_group(counter, &status, &ack); + break; + } + /* + * From NMI context we cannot call into the scheduler to + * do a task wakeup - but we mark these counters as + * wakeup_pending and initate a wakeup callback: + */ + if (nmi) { + counter->wakeup_pending = 1; + set_tsk_thread_flag(current, TIF_PERF_COUNTERS); + } else { + wake_up(&counter->waitq); + } + } + + wrmsr(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack, 0); + + /* + * Repeat if there is more work to be done: + */ + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); + if (status) + goto again; + + /* + * Do not reenable when global enable is off: + */ + if (cpuc->enable_all) + __hw_perf_enable_all(); +} + +void smp_perf_counter_interrupt(struct pt_regs *regs) +{ + irq_enter(); +#ifdef CONFIG_X86_64 + add_pda(apic_perf_irqs, 1); +#else + per_cpu(irq_stat, smp_processor_id()).apic_perf_irqs++; +#endif + apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR); + __smp_perf_counter_interrupt(regs, 0); + + irq_exit(); +} + +/* + * This handler is triggered by NMI contexts: + */ +void perf_counter_notify(struct pt_regs *regs) +{ + struct cpu_hw_counters *cpuc; + unsigned long flags; + int bit, cpu; + + local_irq_save(flags); + cpu = smp_processor_id(); + cpuc = &per_cpu(cpu_hw_counters, cpu); + + for_each_bit(bit, cpuc->used, nr_hw_counters) { + struct perf_counter *counter = cpuc->counters[bit]; + + if (!counter) + continue; + + if (counter->wakeup_pending) { + counter->wakeup_pending = 0; + wake_up(&counter->waitq); + } + } + + local_irq_restore(flags); +} + +void __cpuinit perf_counters_lapic_init(int nmi) +{ + u32 apic_val; + + if (!perf_counters_initialized) + return; + /* + * Enable the performance counter vector in the APIC LVT: + */ + apic_val = apic_read(APIC_LVTERR); + + apic_write(APIC_LVTERR, apic_val | APIC_LVT_MASKED); + if (nmi) + apic_write(APIC_LVTPC, APIC_DM_NMI); + else + apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR); + apic_write(APIC_LVTERR, apic_val); +} + +static int __kprobes +perf_counter_nmi_handler(struct notifier_block *self, + unsigned long cmd, void *__args) +{ + struct die_args *args = __args; + struct pt_regs *regs; + + if (likely(cmd != DIE_NMI_IPI)) + return NOTIFY_DONE; + + regs = args->regs; + + apic_write(APIC_LVTPC, APIC_DM_NMI); + __smp_perf_counter_interrupt(regs, 1); + + return NOTIFY_STOP; +} + +static __read_mostly struct notifier_block perf_counter_nmi_notifier = { + .notifier_call = perf_counter_nmi_handler +}; + +void __init init_hw_perf_counters(void) +{ + union cpuid10_eax eax; + unsigned int unused; + unsigned int ebx; + + if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) + return; + + /* + * Check whether the Architectural PerfMon supports + * Branch Misses Retired Event or not. + */ + cpuid(10, &(eax.full), &ebx, &unused, &unused); + if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED) + return; + + printk(KERN_INFO "Intel Performance Monitoring support detected.\n"); + + printk(KERN_INFO "... version: %d\n", eax.split.version_id); + printk(KERN_INFO "... num_counters: %d\n", eax.split.num_counters); + nr_hw_counters = eax.split.num_counters; + if (nr_hw_counters > MAX_HW_COUNTERS) { + nr_hw_counters = MAX_HW_COUNTERS; + WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!", + nr_hw_counters, MAX_HW_COUNTERS); + } + perf_counter_mask = (1 << nr_hw_counters) - 1; + perf_max_counters = nr_hw_counters; + + printk(KERN_INFO "... bit_width: %d\n", eax.split.bit_width); + printk(KERN_INFO "... mask_length: %d\n", eax.split.mask_length); + + perf_counters_lapic_init(0); + register_die_notifier(&perf_counter_nmi_notifier); + + perf_counters_initialized = true; +} diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 3194636..fc013cf 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -984,6 +984,11 @@ apicinterrupt ERROR_APIC_VECTOR \ apicinterrupt SPURIOUS_APIC_VECTOR \ spurious_interrupt smp_spurious_interrupt +#ifdef CONFIG_PERF_COUNTERS +apicinterrupt LOCAL_PERF_VECTOR \ + perf_counter_interrupt smp_perf_counter_interrupt +#endif + /* * Exception entry points. */ diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index d1d4dc5..d92bc71 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -56,6 +56,10 @@ static int show_other_interrupts(struct seq_file *p) for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->apic_timer_irqs); seq_printf(p, " Local timer interrupts\n"); + seq_printf(p, "CNT: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs); + seq_printf(p, " Performance counter interrupts\n"); #endif #ifdef CONFIG_SMP seq_printf(p, "RES: "); @@ -160,6 +164,7 @@ u64 arch_irq_stat_cpu(unsigned int cpu) #ifdef CONFIG_X86_LOCAL_APIC sum += irq_stats(cpu)->apic_timer_irqs; + sum += irq_stats(cpu)->apic_perf_irqs; #endif #ifdef CONFIG_SMP sum += irq_stats(cpu)->irq_resched_count; diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c index 607db63..6a33b5e 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c @@ -160,6 +160,9 @@ void __init native_init_IRQ(void) /* IPI vectors for APIC spurious and error interrupts */ alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); +# ifdef CONFIG_PERF_COUNTERS + alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt); +# endif #endif #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_MCE_P4THERMAL) diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c index 8670b3c..91d785c 100644 --- a/arch/x86/kernel/irqinit_64.c +++ b/arch/x86/kernel/irqinit_64.c @@ -138,6 +138,11 @@ static void __init apic_intr_init(void) /* IPI vectors for APIC spurious and error interrupts */ alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); + + /* Performance monitoring interrupt: */ +#ifdef CONFIG_PERF_COUNTERS + alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt); +#endif } void __init native_init_IRQ(void) diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index b1cc6da..dee553c 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -6,7 +6,7 @@ * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes * 2000-2002 x86-64 support by Andi Kleen */ - +#include #include #include #include @@ -891,6 +891,11 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) tracehook_notify_resume(regs); } + if (thread_info_flags & _TIF_PERF_COUNTERS) { + clear_thread_flag(TIF_PERF_COUNTERS); + perf_counter_notify(regs); + } + #ifdef CONFIG_X86_32 clear_thread_flag(TIF_IRET); #endif /* CONFIG_X86_32 */ diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index d44395f..496726d 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -332,3 +332,4 @@ ENTRY(sys_call_table) .long sys_dup3 /* 330 */ .long sys_pipe2 .long sys_inotify_init1 + .long sys_perf_counter_open -- cgit v1.1 From 87b9cf4623ad4e5fc009e48c020593dffd5d3793 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 8 Dec 2008 14:20:16 +0100 Subject: x86, perfcounters: read out MSR_CORE_PERF_GLOBAL_STATUS with counters disabled Impact: make perfcounter NMI and IRQ sequence more robust Make __smp_perf_counter_interrupt() a bit more conservative: first disable all counters, then read out the status. Most invocations are because there are real events, so there's no performance impact. Code flow gets a bit simpler as well this way. Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 82440cb..615e953 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -383,18 +383,16 @@ static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi) struct cpu_hw_counters *cpuc; u64 ack, status; - rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); - if (!status) { - ack_APIC_irq(); - return; - } - /* Disable counters globally */ wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0); ack_APIC_irq(); cpuc = &per_cpu(cpu_hw_counters, cpu); + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); + if (!status) + goto out; + again: ack = status; for_each_bit(bit, (unsigned long *) &status, nr_hw_counters) { @@ -440,7 +438,7 @@ again: rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); if (status) goto again; - +out: /* * Do not reenable when global enable is off: */ -- cgit v1.1 From 7e2ae34749edf19e76e594b9c4b2cdde1066afc5 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 9 Dec 2008 11:40:46 +0100 Subject: perfcounters, x86: simplify disable/enable of counters Impact: fix spurious missed counter wakeups In the case of NMI events, close a race window that can occur if an NMI hits counter code that temporarily disables+enables a counter, and the NMI leaks into the disabled section. Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 40 ++++++++++++++++++++++++++------------ 1 file changed, 28 insertions(+), 12 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 615e953..7d528ffc 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -136,14 +136,25 @@ void hw_perf_disable_all(void) wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0); } +static inline void +__hw_perf_counter_disable(struct hw_perf_counter *hwc, unsigned int idx) +{ + wrmsr(hwc->config_base + idx, hwc->config, 0); +} + static DEFINE_PER_CPU(u64, prev_next_count[MAX_HW_COUNTERS]); -static void __hw_perf_counter_enable(struct hw_perf_counter *hwc, int idx) +static void __hw_perf_counter_set_period(struct hw_perf_counter *hwc, int idx) { per_cpu(prev_next_count[idx], smp_processor_id()) = hwc->next_count; wrmsr(hwc->counter_base + idx, hwc->next_count, 0); - wrmsr(hwc->config_base + idx, hwc->config, 0); +} + +static void __hw_perf_counter_enable(struct hw_perf_counter *hwc, int idx) +{ + wrmsr(hwc->config_base + idx, + hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE, 0); } void hw_perf_counter_enable(struct perf_counter *counter) @@ -161,11 +172,11 @@ void hw_perf_counter_enable(struct perf_counter *counter) perf_counters_lapic_init(hwc->nmi); - wrmsr(hwc->config_base + idx, - hwc->config & ~ARCH_PERFMON_EVENTSEL0_ENABLE, 0); + __hw_perf_counter_disable(hwc, idx); cpuc->counters[idx] = counter; - counter->hw.config |= ARCH_PERFMON_EVENTSEL0_ENABLE; + + __hw_perf_counter_set_period(hwc, idx); __hw_perf_counter_enable(hwc, idx); } @@ -286,8 +297,7 @@ void hw_perf_counter_disable(struct perf_counter *counter) struct hw_perf_counter *hwc = &counter->hw; unsigned int idx = hwc->idx; - counter->hw.config &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; - wrmsr(hwc->config_base + idx, hwc->config, 0); + __hw_perf_counter_disable(hwc, idx); clear_bit(idx, cpuc->used); cpuc->counters[idx] = NULL; @@ -328,18 +338,24 @@ static void perf_store_irq_data(struct perf_counter *counter, u64 data) } } +/* + * NMI-safe enable method: + */ static void perf_save_and_restart(struct perf_counter *counter) { struct hw_perf_counter *hwc = &counter->hw; int idx = hwc->idx; + u64 pmc_ctrl; + int err; - wrmsr(hwc->config_base + idx, - hwc->config & ~ARCH_PERFMON_EVENTSEL0_ENABLE, 0); + err = rdmsrl_safe(MSR_ARCH_PERFMON_EVENTSEL0 + idx, &pmc_ctrl); + WARN_ON_ONCE(err); - if (hwc->config & ARCH_PERFMON_EVENTSEL0_ENABLE) { - __hw_perf_save_counter(counter, hwc, idx); + __hw_perf_save_counter(counter, hwc, idx); + __hw_perf_counter_set_period(hwc, idx); + + if (pmc_ctrl & ARCH_PERFMON_EVENTSEL0_ENABLE) __hw_perf_counter_enable(hwc, idx); - } } static void -- cgit v1.1 From 1e12567678054bc1d4c944ecfad17624b3e49345 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 9 Dec 2008 12:18:18 +0100 Subject: perfcounters, x86: clean up debug code Impact: cleanup Get rid of unused debug code. Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 35 ++++++++++++----------------------- 1 file changed, 12 insertions(+), 23 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 7d528ffc..919ec46 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -214,13 +214,11 @@ static void __hw_perf_save_counter(struct perf_counter *counter, { s64 raw = -1; s64 delta; - int err; /* * Get the raw hw counter value: */ - err = rdmsrl_safe(hwc->counter_base + idx, &raw); - WARN_ON_ONCE(err); + rdmsrl(hwc->counter_base + idx, raw); /* * Rebase it to zero (it started counting at -irq_period), @@ -252,20 +250,18 @@ static void __hw_perf_save_counter(struct perf_counter *counter, void perf_counter_print_debug(void) { u64 ctrl, status, overflow, pmc_ctrl, pmc_count, next_count; - int cpu, err, idx; + int cpu, idx; + + if (!nr_hw_counters) + return; local_irq_disable(); cpu = smp_processor_id(); - err = rdmsrl_safe(MSR_CORE_PERF_GLOBAL_CTRL, &ctrl); - WARN_ON_ONCE(err); - - err = rdmsrl_safe(MSR_CORE_PERF_GLOBAL_STATUS, &status); - WARN_ON_ONCE(err); - - err = rdmsrl_safe(MSR_CORE_PERF_GLOBAL_OVF_CTRL, &overflow); - WARN_ON_ONCE(err); + rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); + rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow); printk(KERN_INFO "\n"); printk(KERN_INFO "CPU#%d: ctrl: %016llx\n", cpu, ctrl); @@ -273,11 +269,8 @@ void perf_counter_print_debug(void) printk(KERN_INFO "CPU#%d: overflow: %016llx\n", cpu, overflow); for (idx = 0; idx < nr_hw_counters; idx++) { - err = rdmsrl_safe(MSR_ARCH_PERFMON_EVENTSEL0 + idx, &pmc_ctrl); - WARN_ON_ONCE(err); - - err = rdmsrl_safe(MSR_ARCH_PERFMON_PERFCTR0 + idx, &pmc_count); - WARN_ON_ONCE(err); + rdmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, pmc_ctrl); + rdmsrl(MSR_ARCH_PERFMON_PERFCTR0 + idx, pmc_count); next_count = per_cpu(prev_next_count[idx], cpu); @@ -310,13 +303,11 @@ void hw_perf_counter_read(struct perf_counter *counter) unsigned long addr = hwc->counter_base + hwc->idx; s64 offs, val = -1LL; s32 val32; - int err; /* Careful: NMI might modify the counter offset */ do { offs = hwc->prev_count; - err = rdmsrl_safe(addr, &val); - WARN_ON_ONCE(err); + rdmsrl(addr, val); } while (offs != hwc->prev_count); val32 = (s32) val; @@ -346,10 +337,8 @@ static void perf_save_and_restart(struct perf_counter *counter) struct hw_perf_counter *hwc = &counter->hw; int idx = hwc->idx; u64 pmc_ctrl; - int err; - err = rdmsrl_safe(MSR_ARCH_PERFMON_EVENTSEL0 + idx, &pmc_ctrl); - WARN_ON_ONCE(err); + rdmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, pmc_ctrl); __hw_perf_save_counter(counter, hwc, idx); __hw_perf_counter_set_period(hwc, idx); -- cgit v1.1 From 43874d238d5f208854a73c3225ca2a22833eec8b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 9 Dec 2008 12:23:59 +0100 Subject: perfcounters: consolidate global-disable codepaths Impact: cleanup Simplify global disable handling. Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 919ec46..6a93d1f 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -33,7 +33,6 @@ static u32 perf_counter_mask __read_mostly; struct cpu_hw_counters { struct perf_counter *counters[MAX_HW_COUNTERS]; unsigned long used[BITS_TO_LONGS(MAX_HW_COUNTERS)]; - int enable_all; }; /* @@ -115,24 +114,13 @@ int hw_perf_counter_init(struct perf_counter *counter, s32 hw_event_type) return 0; } -static void __hw_perf_enable_all(void) -{ - wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask, 0); -} - void hw_perf_enable_all(void) { - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - - cpuc->enable_all = 1; - __hw_perf_enable_all(); + wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask, 0); } void hw_perf_disable_all(void) { - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - - cpuc->enable_all = 0; wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0); } @@ -385,8 +373,10 @@ perf_handle_group(struct perf_counter *leader, u64 *status, u64 *overflown) static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi) { int bit, cpu = smp_processor_id(); + u64 ack, status, saved_global; struct cpu_hw_counters *cpuc; - u64 ack, status; + + rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, saved_global); /* Disable counters globally */ wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0); @@ -445,10 +435,9 @@ again: goto again; out: /* - * Do not reenable when global enable is off: + * Restore - do not reenable when global enable is off: */ - if (cpuc->enable_all) - __hw_perf_enable_all(); + wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, saved_global, 0); } void smp_perf_counter_interrupt(struct pt_regs *regs) -- cgit v1.1 From 4ac13294e44664bb7edf4daf52edb71e7c6bbe84 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 9 Dec 2008 21:43:39 +0100 Subject: perf counters: protect them against CSTATE transitions Impact: fix rare lost events problem There are CPUs whose performance counters misbehave on CSTATE transitions, so provide a way to just disable/enable them around deep idle methods. (hw_perf_enable_all() is cheap on x86.) Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 6a93d1f..0a7f3be 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -119,10 +120,21 @@ void hw_perf_enable_all(void) wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask, 0); } -void hw_perf_disable_all(void) +void hw_perf_restore_ctrl(u64 ctrl) { + wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, ctrl, 0); +} +EXPORT_SYMBOL_GPL(hw_perf_restore_ctrl); + +u64 hw_perf_disable_all(void) +{ + u64 ctrl; + + rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0); + return ctrl; } +EXPORT_SYMBOL_GPL(hw_perf_disable_all); static inline void __hw_perf_counter_disable(struct hw_perf_counter *hwc, unsigned int idx) -- cgit v1.1 From dfa7c899b401d7dc5d85aca416aee64ac82812f2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 8 Dec 2008 19:35:37 +0100 Subject: perf counters: expand use of counter->event Impact: change syscall, cleanup Make use of the new perf_counters event type. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 0a7f3be..30e7ebf 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -56,9 +56,10 @@ const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map); /* * Setup the hardware configuration for a given hw_event_type */ -int hw_perf_counter_init(struct perf_counter *counter, s32 hw_event_type) +int hw_perf_counter_init(struct perf_counter *counter) { struct hw_perf_counter *hwc = &counter->hw; + u32 hw_event_type = counter->event.hw_event_type; if (unlikely(!perf_counters_initialized)) return -EINVAL; @@ -83,7 +84,7 @@ int hw_perf_counter_init(struct perf_counter *counter, s32 hw_event_type) hwc->config_base = MSR_ARCH_PERFMON_EVENTSEL0; hwc->counter_base = MSR_ARCH_PERFMON_PERFCTR0; - hwc->irq_period = counter->__irq_period; + hwc->irq_period = counter->event.hw_event_period; /* * Intel PMCs cannot be accessed sanely above 32 bit width, * so we install an artificial 1<<31 period regardless of @@ -95,21 +96,19 @@ int hw_perf_counter_init(struct perf_counter *counter, s32 hw_event_type) hwc->next_count = -((s32) hwc->irq_period); /* - * Negative event types mean raw encoded event+umask values: + * Raw event type provide the config in the event structure */ - if (hw_event_type < 0) { - counter->hw_event_type = -hw_event_type; - counter->hw_event_type &= ~PERF_COUNT_NMI; + hw_event_type &= ~PERF_COUNT_NMI; + if (hw_event_type == PERF_COUNT_RAW) { + hwc->config |= counter->event.hw_raw_ctrl; } else { - hw_event_type &= ~PERF_COUNT_NMI; if (hw_event_type >= max_intel_perfmon_events) return -EINVAL; /* * The generic map: */ - counter->hw_event_type = intel_perfmon_event_map[hw_event_type]; + hwc->config |= intel_perfmon_event_map[hw_event_type]; } - hwc->config |= counter->hw_event_type; counter->wakeup_pending = 0; return 0; @@ -373,7 +372,7 @@ perf_handle_group(struct perf_counter *leader, u64 *status, u64 *overflown) perf_save_and_restart(counter); } } - perf_store_irq_data(leader, counter->hw_event_type); + perf_store_irq_data(leader, counter->event.hw_event_type); perf_store_irq_data(leader, atomic64_counter_read(counter)); } } @@ -418,7 +417,8 @@ again: perf_store_irq_data(counter, instruction_pointer(regs)); break; case PERF_RECORD_GROUP: - perf_store_irq_data(counter, counter->hw_event_type); + perf_store_irq_data(counter, + counter->event.hw_event_type); perf_store_irq_data(counter, atomic64_counter_read(counter)); perf_handle_group(counter, &status, &ack); -- cgit v1.1 From 9f66a3810fe0d4100972db84290f3ae4a4d77025 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 10 Dec 2008 12:33:23 +0100 Subject: perf counters: restructure the API Impact: clean up new API Thorough cleanup of the new perf counters API, we now get clean separation of the various concepts: - introduce perf_counter_hw_event to separate out the event source details - move special type flags into separate attributes: PERF_COUNT_NMI, PERF_COUNT_RAW - extend the type to u64 and reserve it fully to the architecture in the raw type case. And make use of all these changes in the core and x86 perfcounters code. Also change the syscall signature to: asmlinkage int sys_perf_counter_open( struct perf_counter_hw_event *hw_event_uptr __user, pid_t pid, int cpu, int group_fd); ( Note that group_fd is unused for now - it's reserved for the counter groups abstraction. ) Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 30e7ebf..ef1936a 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -58,8 +58,8 @@ const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map); */ int hw_perf_counter_init(struct perf_counter *counter) { + struct perf_counter_hw_event *hw_event = &counter->hw_event; struct hw_perf_counter *hwc = &counter->hw; - u32 hw_event_type = counter->event.hw_event_type; if (unlikely(!perf_counters_initialized)) return -EINVAL; @@ -77,14 +77,14 @@ int hw_perf_counter_init(struct perf_counter *counter) hwc->nmi = 0; if (capable(CAP_SYS_ADMIN)) { hwc->config |= ARCH_PERFMON_EVENTSEL_OS; - if (hw_event_type & PERF_COUNT_NMI) + if (hw_event->nmi) hwc->nmi = 1; } - hwc->config_base = MSR_ARCH_PERFMON_EVENTSEL0; - hwc->counter_base = MSR_ARCH_PERFMON_PERFCTR0; + hwc->config_base = MSR_ARCH_PERFMON_EVENTSEL0; + hwc->counter_base = MSR_ARCH_PERFMON_PERFCTR0; - hwc->irq_period = counter->event.hw_event_period; + hwc->irq_period = hw_event->irq_period; /* * Intel PMCs cannot be accessed sanely above 32 bit width, * so we install an artificial 1<<31 period regardless of @@ -93,21 +93,20 @@ int hw_perf_counter_init(struct perf_counter *counter) if (!hwc->irq_period) hwc->irq_period = 0x7FFFFFFF; - hwc->next_count = -((s32) hwc->irq_period); + hwc->next_count = -(s32)hwc->irq_period; /* * Raw event type provide the config in the event structure */ - hw_event_type &= ~PERF_COUNT_NMI; - if (hw_event_type == PERF_COUNT_RAW) { - hwc->config |= counter->event.hw_raw_ctrl; + if (hw_event->raw) { + hwc->config |= hw_event->type; } else { - if (hw_event_type >= max_intel_perfmon_events) + if (hw_event->type >= max_intel_perfmon_events) return -EINVAL; /* * The generic map: */ - hwc->config |= intel_perfmon_event_map[hw_event_type]; + hwc->config |= intel_perfmon_event_map[hw_event->type]; } counter->wakeup_pending = 0; @@ -354,7 +353,7 @@ perf_handle_group(struct perf_counter *leader, u64 *status, u64 *overflown) int bit; list_for_each_entry(counter, &ctx->counters, list) { - if (counter->record_type != PERF_RECORD_SIMPLE || + if (counter->hw_event.record_type != PERF_RECORD_SIMPLE || counter == leader) continue; @@ -372,7 +371,7 @@ perf_handle_group(struct perf_counter *leader, u64 *status, u64 *overflown) perf_save_and_restart(counter); } } - perf_store_irq_data(leader, counter->event.hw_event_type); + perf_store_irq_data(leader, counter->hw_event.type); perf_store_irq_data(leader, atomic64_counter_read(counter)); } } @@ -410,7 +409,7 @@ again: perf_save_and_restart(counter); - switch (counter->record_type) { + switch (counter->hw_event.record_type) { case PERF_RECORD_SIMPLE: continue; case PERF_RECORD_IRQ: @@ -418,7 +417,7 @@ again: break; case PERF_RECORD_GROUP: perf_store_irq_data(counter, - counter->event.hw_event_type); + counter->hw_event.type); perf_store_irq_data(counter, atomic64_counter_read(counter)); perf_handle_group(counter, &status, &ack); -- cgit v1.1 From 04289bb9891882202d7e961c4c04d2376930e9f9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 11 Dec 2008 08:38:42 +0100 Subject: perf counters: add support for group counters Impact: add group counters This patch adds the "counter groups" abstraction. Groups of counters behave much like normal 'single' counters, with a few semantic and behavioral extensions on top of that. A counter group is created by creating a new counter with the open() syscall's group-leader group_fd file descriptor parameter pointing to another, already existing counter. Groups of counters are scheduled in and out in one atomic group, and they are also roundrobin-scheduled atomically. Counters that are member of a group can also record events with an (atomic) extended timestamp that extends to all members of the group, if the record type is set to PERF_RECORD_GROUP. Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index ef1936a..54b4ad0 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -346,18 +346,22 @@ static void perf_save_and_restart(struct perf_counter *counter) } static void -perf_handle_group(struct perf_counter *leader, u64 *status, u64 *overflown) +perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown) { - struct perf_counter_context *ctx = leader->ctx; - struct perf_counter *counter; + struct perf_counter *counter, *group_leader = sibling->group_leader; int bit; - list_for_each_entry(counter, &ctx->counters, list) { - if (counter->hw_event.record_type != PERF_RECORD_SIMPLE || - counter == leader) - continue; + /* + * Store the counter's own timestamp first: + */ + perf_store_irq_data(sibling, sibling->hw_event.type); + perf_store_irq_data(sibling, atomic64_counter_read(sibling)); - if (counter->active) { + /* + * Then store sibling timestamps (if any): + */ + list_for_each_entry(counter, &group_leader->sibling_list, list_entry) { + if (!counter->active) { /* * When counter was not in the overflow mask, we have to * read it from hardware. We read it as well, when it @@ -371,8 +375,8 @@ perf_handle_group(struct perf_counter *leader, u64 *status, u64 *overflown) perf_save_and_restart(counter); } } - perf_store_irq_data(leader, counter->hw_event.type); - perf_store_irq_data(leader, atomic64_counter_read(counter)); + perf_store_irq_data(sibling, counter->hw_event.type); + perf_store_irq_data(sibling, atomic64_counter_read(counter)); } } @@ -416,10 +420,6 @@ again: perf_store_irq_data(counter, instruction_pointer(regs)); break; case PERF_RECORD_GROUP: - perf_store_irq_data(counter, - counter->hw_event.type); - perf_store_irq_data(counter, - atomic64_counter_read(counter)); perf_handle_group(counter, &status, &ack); break; } -- cgit v1.1 From 621a01eac89b5e2f81a4cf576568b31f40a02724 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 11 Dec 2008 12:46:46 +0100 Subject: perf counters: hw driver API Impact: restructure code, introduce hw_ops driver abstraction Introduce this abstraction to handle counter details: struct hw_perf_counter_ops { void (*hw_perf_counter_enable) (struct perf_counter *counter); void (*hw_perf_counter_disable) (struct perf_counter *counter); void (*hw_perf_counter_read) (struct perf_counter *counter); }; This will be useful to support assymetric hw details, and it will also be useful to implement "software counters". (Counters that count kernel managed sw events such as pagefaults, context-switches, wall-clock time or task-local time.) Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 37 +++++++++++++++++++++++++++---------- 1 file changed, 27 insertions(+), 10 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 54b4ad0..718b635 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -56,7 +56,7 @@ const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map); /* * Setup the hardware configuration for a given hw_event_type */ -int hw_perf_counter_init(struct perf_counter *counter) +static int __hw_perf_counter_init(struct perf_counter *counter) { struct perf_counter_hw_event *hw_event = &counter->hw_event; struct hw_perf_counter *hwc = &counter->hw; @@ -135,7 +135,7 @@ u64 hw_perf_disable_all(void) EXPORT_SYMBOL_GPL(hw_perf_disable_all); static inline void -__hw_perf_counter_disable(struct hw_perf_counter *hwc, unsigned int idx) +__x86_perf_counter_disable(struct hw_perf_counter *hwc, unsigned int idx) { wrmsr(hwc->config_base + idx, hwc->config, 0); } @@ -149,13 +149,13 @@ static void __hw_perf_counter_set_period(struct hw_perf_counter *hwc, int idx) wrmsr(hwc->counter_base + idx, hwc->next_count, 0); } -static void __hw_perf_counter_enable(struct hw_perf_counter *hwc, int idx) +static void __x86_perf_counter_enable(struct hw_perf_counter *hwc, int idx) { wrmsr(hwc->config_base + idx, hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE, 0); } -void hw_perf_counter_enable(struct perf_counter *counter) +static void x86_perf_counter_enable(struct perf_counter *counter) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); struct hw_perf_counter *hwc = &counter->hw; @@ -170,12 +170,12 @@ void hw_perf_counter_enable(struct perf_counter *counter) perf_counters_lapic_init(hwc->nmi); - __hw_perf_counter_disable(hwc, idx); + __x86_perf_counter_disable(hwc, idx); cpuc->counters[idx] = counter; __hw_perf_counter_set_period(hwc, idx); - __hw_perf_counter_enable(hwc, idx); + __x86_perf_counter_enable(hwc, idx); } #ifdef CONFIG_X86_64 @@ -282,20 +282,20 @@ void perf_counter_print_debug(void) local_irq_enable(); } -void hw_perf_counter_disable(struct perf_counter *counter) +static void x86_perf_counter_disable(struct perf_counter *counter) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); struct hw_perf_counter *hwc = &counter->hw; unsigned int idx = hwc->idx; - __hw_perf_counter_disable(hwc, idx); + __x86_perf_counter_disable(hwc, idx); clear_bit(idx, cpuc->used); cpuc->counters[idx] = NULL; __hw_perf_save_counter(counter, hwc, idx); } -void hw_perf_counter_read(struct perf_counter *counter) +static void x86_perf_counter_read(struct perf_counter *counter) { struct hw_perf_counter *hwc = &counter->hw; unsigned long addr = hwc->counter_base + hwc->idx; @@ -342,7 +342,7 @@ static void perf_save_and_restart(struct perf_counter *counter) __hw_perf_counter_set_period(hwc, idx); if (pmc_ctrl & ARCH_PERFMON_EVENTSEL0_ENABLE) - __hw_perf_counter_enable(hwc, idx); + __x86_perf_counter_enable(hwc, idx); } static void @@ -572,3 +572,20 @@ void __init init_hw_perf_counters(void) perf_counters_initialized = true; } + +static struct hw_perf_counter_ops x86_perf_counter_ops = { + .hw_perf_counter_enable = x86_perf_counter_enable, + .hw_perf_counter_disable = x86_perf_counter_disable, + .hw_perf_counter_read = x86_perf_counter_read, +}; + +struct hw_perf_counter_ops *hw_perf_counter_init(struct perf_counter *counter) +{ + int err; + + err = __hw_perf_counter_init(counter); + if (err) + return NULL; + + return &x86_perf_counter_ops; +} -- cgit v1.1 From 5c92d12411dfe5f0f3d1b1c1e2f756245e6f7249 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 11 Dec 2008 13:21:10 +0100 Subject: perf counters: implement PERF_COUNT_CPU_CLOCK Impact: add new perf-counter type The 'CPU clock' counter counts the amount of CPU clock time that is elapsing, in nanoseconds. (regardless of how much of it the task is spending on a CPU executing) This counter type is a Linux kernel based abstraction, it is available even if the hardware does not support native hardware performance counters. Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 36 ++++-------------------------------- 1 file changed, 4 insertions(+), 32 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 718b635..43c8e9a 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -178,35 +178,6 @@ static void x86_perf_counter_enable(struct perf_counter *counter) __x86_perf_counter_enable(hwc, idx); } -#ifdef CONFIG_X86_64 -static inline void atomic64_counter_set(struct perf_counter *counter, u64 val) -{ - atomic64_set(&counter->count, val); -} - -static inline u64 atomic64_counter_read(struct perf_counter *counter) -{ - return atomic64_read(&counter->count); -} -#else -/* - * Todo: add proper atomic64_t support to 32-bit x86: - */ -static inline void atomic64_counter_set(struct perf_counter *counter, u64 val64) -{ - u32 *val32 = (void *)&val64; - - atomic_set(counter->count32 + 0, *(val32 + 0)); - atomic_set(counter->count32 + 1, *(val32 + 1)); -} - -static inline u64 atomic64_counter_read(struct perf_counter *counter) -{ - return atomic_read(counter->count32 + 0) | - (u64) atomic_read(counter->count32 + 1) << 32; -} -#endif - static void __hw_perf_save_counter(struct perf_counter *counter, struct hw_perf_counter *hwc, int idx) { @@ -309,7 +280,7 @@ static void x86_perf_counter_read(struct perf_counter *counter) } while (offs != hwc->prev_count); val32 = (s32) val; - val = (s64)hwc->irq_period + (s64)val32; + val = (s64)hwc->irq_period + (s64)val32; atomic64_counter_set(counter, hwc->prev_count + val); } @@ -573,13 +544,14 @@ void __init init_hw_perf_counters(void) perf_counters_initialized = true; } -static struct hw_perf_counter_ops x86_perf_counter_ops = { +static const struct hw_perf_counter_ops x86_perf_counter_ops = { .hw_perf_counter_enable = x86_perf_counter_enable, .hw_perf_counter_disable = x86_perf_counter_disable, .hw_perf_counter_read = x86_perf_counter_read, }; -struct hw_perf_counter_ops *hw_perf_counter_init(struct perf_counter *counter) +const struct hw_perf_counter_ops * +hw_perf_counter_init(struct perf_counter *counter) { int err; -- cgit v1.1 From 01b2838c4298c5e0d30b4993c195ac34dd9df61e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 11 Dec 2008 13:45:51 +0100 Subject: perf counters: consolidate hw_perf save/restore APIs Impact: cleanup Rename them to better match up the usual IRQ disable/enable APIs: hw_perf_disable_all() => hw_perf_save_disable() hw_perf_restore_ctrl() => hw_perf_restore() Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 43c8e9a..3e1dbeb 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -118,13 +118,13 @@ void hw_perf_enable_all(void) wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask, 0); } -void hw_perf_restore_ctrl(u64 ctrl) +void hw_perf_restore(u64 ctrl) { wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, ctrl, 0); } -EXPORT_SYMBOL_GPL(hw_perf_restore_ctrl); +EXPORT_SYMBOL_GPL(hw_perf_restore); -u64 hw_perf_disable_all(void) +u64 hw_perf_save_disable(void) { u64 ctrl; @@ -132,7 +132,7 @@ u64 hw_perf_disable_all(void) wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0); return ctrl; } -EXPORT_SYMBOL_GPL(hw_perf_disable_all); +EXPORT_SYMBOL_GPL(hw_perf_save_disable); static inline void __x86_perf_counter_disable(struct hw_perf_counter *hwc, unsigned int idx) -- cgit v1.1 From 6a930700c8b655a9e25e42fc4adc0b225ebbcefc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 11 Dec 2008 15:17:03 +0100 Subject: perf counters: clean up state transitions Impact: cleanup Introduce a proper enum for the 3 states of a counter: PERF_COUNTER_STATE_OFF = -1 PERF_COUNTER_STATE_INACTIVE = 0 PERF_COUNTER_STATE_ACTIVE = 1 and rename counter->active to counter->state and propagate the changes everywhere. Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 3e1dbeb..4854cca 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -332,7 +332,7 @@ perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown) * Then store sibling timestamps (if any): */ list_for_each_entry(counter, &group_leader->sibling_list, list_entry) { - if (!counter->active) { + if (counter->state != PERF_COUNTER_STATE_ACTIVE) { /* * When counter was not in the overflow mask, we have to * read it from hardware. We read it as well, when it -- cgit v1.1 From ee06094f8279e1312fc0a31591320cc7b6f0ab1e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 13 Dec 2008 09:00:03 +0100 Subject: perfcounters: restructure x86 counter math Impact: restructure code Change counter math from absolute values to clear delta logic. We try to extract elapsed deltas from the raw hw counter - and put that into the generic counter. Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 230 ++++++++++++++++++++----------------- 1 file changed, 124 insertions(+), 106 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index b903f8d..5afae13 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -54,6 +54,48 @@ const int intel_perfmon_event_map[] = const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map); /* + * Propagate counter elapsed time into the generic counter. + * Can only be executed on the CPU where the counter is active. + * Returns the delta events processed. + */ +static void +x86_perf_counter_update(struct perf_counter *counter, + struct hw_perf_counter *hwc, int idx) +{ + u64 prev_raw_count, new_raw_count, delta; + + WARN_ON_ONCE(counter->state != PERF_COUNTER_STATE_ACTIVE); + /* + * Careful: an NMI might modify the previous counter value. + * + * Our tactic to handle this is to first atomically read and + * exchange a new raw count - then add that new-prev delta + * count to the generic counter atomically: + */ +again: + prev_raw_count = atomic64_read(&hwc->prev_count); + rdmsrl(hwc->counter_base + idx, new_raw_count); + + if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count, + new_raw_count) != prev_raw_count) + goto again; + + /* + * Now we have the new raw value and have updated the prev + * timestamp already. We can now calculate the elapsed delta + * (counter-)time and add that to the generic counter. + * + * Careful, not all hw sign-extends above the physical width + * of the count, so we do that by clipping the delta to 32 bits: + */ + delta = (u64)(u32)((s32)new_raw_count - (s32)prev_raw_count); + WARN_ON_ONCE((int)delta < 0); + + atomic64_add(delta, &counter->count); + atomic64_sub(delta, &hwc->period_left); +} + +/* * Setup the hardware configuration for a given hw_event_type */ static int __hw_perf_counter_init(struct perf_counter *counter) @@ -90,10 +132,10 @@ static int __hw_perf_counter_init(struct perf_counter *counter) * so we install an artificial 1<<31 period regardless of * the generic counter period: */ - if (!hwc->irq_period) + if ((s64)hwc->irq_period <= 0 || hwc->irq_period > 0x7FFFFFFF) hwc->irq_period = 0x7FFFFFFF; - hwc->next_count = -(s32)hwc->irq_period; + atomic64_set(&hwc->period_left, hwc->irq_period); /* * Raw event type provide the config in the event structure @@ -118,12 +160,6 @@ void hw_perf_enable_all(void) wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask, 0); } -void hw_perf_restore(u64 ctrl) -{ - wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, ctrl, 0); -} -EXPORT_SYMBOL_GPL(hw_perf_restore); - u64 hw_perf_save_disable(void) { u64 ctrl; @@ -134,27 +170,74 @@ u64 hw_perf_save_disable(void) } EXPORT_SYMBOL_GPL(hw_perf_save_disable); +void hw_perf_restore(u64 ctrl) +{ + wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, ctrl, 0); +} +EXPORT_SYMBOL_GPL(hw_perf_restore); + static inline void -__x86_perf_counter_disable(struct hw_perf_counter *hwc, unsigned int idx) +__x86_perf_counter_disable(struct perf_counter *counter, + struct hw_perf_counter *hwc, unsigned int idx) { - wrmsr(hwc->config_base + idx, hwc->config, 0); + int err; + + err = wrmsr_safe(hwc->config_base + idx, hwc->config, 0); + WARN_ON_ONCE(err); } -static DEFINE_PER_CPU(u64, prev_next_count[MAX_HW_COUNTERS]); +static DEFINE_PER_CPU(u64, prev_left[MAX_HW_COUNTERS]); -static void __hw_perf_counter_set_period(struct hw_perf_counter *hwc, int idx) +/* + * Set the next IRQ period, based on the hwc->period_left value. + * To be called with the counter disabled in hw: + */ +static void +__hw_perf_counter_set_period(struct perf_counter *counter, + struct hw_perf_counter *hwc, int idx) { - per_cpu(prev_next_count[idx], smp_processor_id()) = hwc->next_count; + s32 left = atomic64_read(&hwc->period_left); + s32 period = hwc->irq_period; + + WARN_ON_ONCE(period <= 0); + + /* + * If we are way outside a reasoable range then just skip forward: + */ + if (unlikely(left <= -period)) { + left = period; + atomic64_set(&hwc->period_left, left); + } + + if (unlikely(left <= 0)) { + left += period; + atomic64_set(&hwc->period_left, left); + } - wrmsr(hwc->counter_base + idx, hwc->next_count, 0); + WARN_ON_ONCE(left <= 0); + + per_cpu(prev_left[idx], smp_processor_id()) = left; + + /* + * The hw counter starts counting from this counter offset, + * mark it to be able to extra future deltas: + */ + atomic64_set(&hwc->prev_count, (u64)(s64)-left); + + wrmsr(hwc->counter_base + idx, -left, 0); } -static void __x86_perf_counter_enable(struct hw_perf_counter *hwc, int idx) +static void +__x86_perf_counter_enable(struct perf_counter *counter, + struct hw_perf_counter *hwc, int idx) { wrmsr(hwc->config_base + idx, hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE, 0); } +/* + * Find a PMC slot for the freshly enabled / scheduled in counter: + */ static void x86_perf_counter_enable(struct perf_counter *counter) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); @@ -170,55 +253,17 @@ static void x86_perf_counter_enable(struct perf_counter *counter) perf_counters_lapic_init(hwc->nmi); - __x86_perf_counter_disable(hwc, idx); + __x86_perf_counter_disable(counter, hwc, idx); cpuc->counters[idx] = counter; - __hw_perf_counter_set_period(hwc, idx); - __x86_perf_counter_enable(hwc, idx); -} - -static void __hw_perf_save_counter(struct perf_counter *counter, - struct hw_perf_counter *hwc, int idx) -{ - s64 raw = -1; - s64 delta; - - /* - * Get the raw hw counter value: - */ - rdmsrl(hwc->counter_base + idx, raw); - - /* - * Rebase it to zero (it started counting at -irq_period), - * to see the delta since ->prev_count: - */ - delta = (s64)hwc->irq_period + (s64)(s32)raw; - - atomic64_counter_set(counter, hwc->prev_count + delta); - - /* - * Adjust the ->prev_count offset - if we went beyond - * irq_period of units, then we got an IRQ and the counter - * was set back to -irq_period: - */ - while (delta >= (s64)hwc->irq_period) { - hwc->prev_count += hwc->irq_period; - delta -= (s64)hwc->irq_period; - } - - /* - * Calculate the next raw counter value we'll write into - * the counter at the next sched-in time: - */ - delta -= (s64)hwc->irq_period; - - hwc->next_count = (s32)delta; + __hw_perf_counter_set_period(counter, hwc, idx); + __x86_perf_counter_enable(counter, hwc, idx); } void perf_counter_print_debug(void) { - u64 ctrl, status, overflow, pmc_ctrl, pmc_count, next_count; + u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left; int cpu, idx; if (!nr_hw_counters) @@ -241,14 +286,14 @@ void perf_counter_print_debug(void) rdmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, pmc_ctrl); rdmsrl(MSR_ARCH_PERFMON_PERFCTR0 + idx, pmc_count); - next_count = per_cpu(prev_next_count[idx], cpu); + prev_left = per_cpu(prev_left[idx], cpu); printk(KERN_INFO "CPU#%d: PMC%d ctrl: %016llx\n", cpu, idx, pmc_ctrl); printk(KERN_INFO "CPU#%d: PMC%d count: %016llx\n", cpu, idx, pmc_count); - printk(KERN_INFO "CPU#%d: PMC%d next: %016llx\n", - cpu, idx, next_count); + printk(KERN_INFO "CPU#%d: PMC%d left: %016llx\n", + cpu, idx, prev_left); } local_irq_enable(); } @@ -259,29 +304,16 @@ static void x86_perf_counter_disable(struct perf_counter *counter) struct hw_perf_counter *hwc = &counter->hw; unsigned int idx = hwc->idx; - __x86_perf_counter_disable(hwc, idx); + __x86_perf_counter_disable(counter, hwc, idx); clear_bit(idx, cpuc->used); cpuc->counters[idx] = NULL; - __hw_perf_save_counter(counter, hwc, idx); -} -static void x86_perf_counter_read(struct perf_counter *counter) -{ - struct hw_perf_counter *hwc = &counter->hw; - unsigned long addr = hwc->counter_base + hwc->idx; - s64 offs, val = -1LL; - s32 val32; - - /* Careful: NMI might modify the counter offset */ - do { - offs = hwc->prev_count; - rdmsrl(addr, val); - } while (offs != hwc->prev_count); - - val32 = (s32) val; - val = (s64)hwc->irq_period + (s64)val32; - atomic64_counter_set(counter, hwc->prev_count + val); + /* + * Drain the remaining delta count out of a counter + * that we are disabling: + */ + x86_perf_counter_update(counter, hwc, idx); } static void perf_store_irq_data(struct perf_counter *counter, u64 data) @@ -299,7 +331,8 @@ static void perf_store_irq_data(struct perf_counter *counter, u64 data) } /* - * NMI-safe enable method: + * Save and restart an expired counter. Called by NMI contexts, + * so it has to be careful about preempting normal counter ops: */ static void perf_save_and_restart(struct perf_counter *counter) { @@ -309,45 +342,25 @@ static void perf_save_and_restart(struct perf_counter *counter) rdmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, pmc_ctrl); - __hw_perf_save_counter(counter, hwc, idx); - __hw_perf_counter_set_period(hwc, idx); + x86_perf_counter_update(counter, hwc, idx); + __hw_perf_counter_set_period(counter, hwc, idx); if (pmc_ctrl & ARCH_PERFMON_EVENTSEL0_ENABLE) - __x86_perf_counter_enable(hwc, idx); + __x86_perf_counter_enable(counter, hwc, idx); } static void perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown) { struct perf_counter *counter, *group_leader = sibling->group_leader; - int bit; - - /* - * Store the counter's own timestamp first: - */ - perf_store_irq_data(sibling, sibling->hw_event.type); - perf_store_irq_data(sibling, atomic64_counter_read(sibling)); /* - * Then store sibling timestamps (if any): + * Store sibling timestamps (if any): */ list_for_each_entry(counter, &group_leader->sibling_list, list_entry) { - if (counter->state != PERF_COUNTER_STATE_ACTIVE) { - /* - * When counter was not in the overflow mask, we have to - * read it from hardware. We read it as well, when it - * has not been read yet and clear the bit in the - * status mask. - */ - bit = counter->hw.idx; - if (!test_bit(bit, (unsigned long *) overflown) || - test_bit(bit, (unsigned long *) status)) { - clear_bit(bit, (unsigned long *) status); - perf_save_and_restart(counter); - } - } + x86_perf_counter_update(counter, &counter->hw, counter->hw.idx); perf_store_irq_data(sibling, counter->hw_event.type); - perf_store_irq_data(sibling, atomic64_counter_read(counter)); + perf_store_irq_data(sibling, atomic64_read(&counter->count)); } } @@ -540,6 +553,11 @@ void __init init_hw_perf_counters(void) perf_counters_initialized = true; } +static void x86_perf_counter_read(struct perf_counter *counter) +{ + x86_perf_counter_update(counter, &counter->hw, counter->hw.idx); +} + static const struct hw_perf_counter_ops x86_perf_counter_ops = { .hw_perf_counter_enable = x86_perf_counter_enable, .hw_perf_counter_disable = x86_perf_counter_disable, -- cgit v1.1 From 2b9ff0db19b5e2c77000b7201525f9c3d6e8328d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 14 Dec 2008 18:36:30 +0100 Subject: perfcounters: fix non-intel-perfmon CPUs Do not write MSR_CORE_PERF_GLOBAL_CTRL on CPUs where it does not exist. Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 5afae13..6d30f60 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -157,6 +157,9 @@ static int __hw_perf_counter_init(struct perf_counter *counter) void hw_perf_enable_all(void) { + if (unlikely(!perf_counters_initialized)) + return; + wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask, 0); } @@ -164,14 +167,21 @@ u64 hw_perf_save_disable(void) { u64 ctrl; + if (unlikely(!perf_counters_initialized)) + return 0; + rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0); + return ctrl; } EXPORT_SYMBOL_GPL(hw_perf_save_disable); void hw_perf_restore(u64 ctrl) { + if (unlikely(!perf_counters_initialized)) + return; + wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, ctrl, 0); } EXPORT_SYMBOL_GPL(hw_perf_restore); -- cgit v1.1 From 75f224cf7700ed6006574dc3f2efa29860727570 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 14 Dec 2008 21:58:46 +0100 Subject: perfcounters: fix lapic initialization Fix non-working NMI sampling in certain bootup scenarios. Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 6d30f60..8a154bd 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -557,10 +557,10 @@ void __init init_hw_perf_counters(void) printk(KERN_INFO "... bit_width: %d\n", eax.split.bit_width); printk(KERN_INFO "... mask_length: %d\n", eax.split.mask_length); + perf_counters_initialized = true; + perf_counters_lapic_init(0); register_die_notifier(&perf_counter_nmi_notifier); - - perf_counters_initialized = true; } static void x86_perf_counter_read(struct perf_counter *counter) -- cgit v1.1 From 94c46572a6d9bb497eda0a14099d9f1360d57d5d Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Date: Fri, 19 Dec 2008 22:37:58 +0530 Subject: x86: perf_counter.c intel_perfmon_event_map and max_intel_perfmon_events should be static Impact: cleanup, avoid sparse warnings, reduce kernel size a bit Fixes these sparse warnings: arch/x86/kernel/cpu/perf_counter.c:44:11: warning: symbol 'intel_perfmon_event_map' was not declared. Should it be static? arch/x86/kernel/cpu/perf_counter.c:54:11: warning: symbol 'max_intel_perfmon_events' was not declared. Should it be static? Signed-off-by: Jaswinder Singh Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 8a154bd..bdbdb56 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -41,7 +41,7 @@ struct cpu_hw_counters { */ static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters); -const int intel_perfmon_event_map[] = +static const int intel_perfmon_event_map[] = { [PERF_COUNT_CYCLES] = 0x003c, [PERF_COUNT_INSTRUCTIONS] = 0x00c0, @@ -51,7 +51,7 @@ const int intel_perfmon_event_map[] = [PERF_COUNT_BRANCH_MISSES] = 0x00c5, }; -const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map); +static const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map); /* * Propagate counter elapsed time into the generic counter. -- cgit v1.1 From 8fb9331391af95ca1f4e5c0a0da8120b13cbae01 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 23 Dec 2008 12:04:16 +0100 Subject: perfcounters: remove warnings Impact: remove debug checks Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index bdbdb56..89fad5d 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -64,7 +64,6 @@ x86_perf_counter_update(struct perf_counter *counter, { u64 prev_raw_count, new_raw_count, delta; - WARN_ON_ONCE(counter->state != PERF_COUNTER_STATE_ACTIVE); /* * Careful: an NMI might modify the previous counter value. * @@ -89,7 +88,6 @@ again: * of the count, so we do that by clipping the delta to 32 bits: */ delta = (u64)(u32)((s32)new_raw_count - (s32)prev_raw_count); - WARN_ON_ONCE((int)delta < 0); atomic64_add(delta, &counter->count); atomic64_sub(delta, &hwc->period_left); @@ -193,7 +191,6 @@ __x86_perf_counter_disable(struct perf_counter *counter, int err; err = wrmsr_safe(hwc->config_base + idx, hwc->config, 0); - WARN_ON_ONCE(err); } static DEFINE_PER_CPU(u64, prev_left[MAX_HW_COUNTERS]); @@ -209,8 +206,6 @@ __hw_perf_counter_set_period(struct perf_counter *counter, s32 left = atomic64_read(&hwc->period_left); s32 period = hwc->irq_period; - WARN_ON_ONCE(period <= 0); - /* * If we are way outside a reasoable range then just skip forward: */ @@ -224,8 +219,6 @@ __hw_perf_counter_set_period(struct perf_counter *counter, atomic64_set(&hwc->period_left, left); } - WARN_ON_ONCE(left <= 0); - per_cpu(prev_left[idx], smp_processor_id()) = left; /* -- cgit v1.1 From 5c167b8585c8d91206b395d57011ead7711e322f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 17 Dec 2008 09:02:19 +0100 Subject: x86, perfcounters: rename intel_arch_perfmon.h => perf_counter.h Impact: rename include file We'll be providing an asm/perf_counter.h to the generic perfcounter code, so use the already existing x86 file for this purpose and rename it. Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic.c | 2 +- arch/x86/kernel/cpu/common.c | 2 +- arch/x86/kernel/cpu/perf_counter.c | 2 +- arch/x86/kernel/cpu/perfctr-watchdog.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c index 0579ec1..4f859ac 100644 --- a/arch/x86/kernel/apic.c +++ b/arch/x86/kernel/apic.c @@ -31,7 +31,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 4461011..ad331b4 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 89fad5d..a4a3a09 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -16,7 +16,7 @@ #include #include -#include +#include #include static bool perf_counters_initialized __read_mostly; diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index 9abd48b..d6f5b9f 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -20,7 +20,7 @@ #include #include -#include +#include struct nmi_watchdog_ctlblk { unsigned int cccr_msr; -- cgit v1.1 From eb2b861810d4ff72454c83996b891df4e0aaff9a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 17 Dec 2008 09:09:13 +0100 Subject: x86, perfcounters: prepare for fixed-mode PMCs Impact: refactor the x86 code for fixed-mode PMCs Extend the data structures and rename the existing facilities to allow for a 'generic' versus 'fixed' counter distinction. Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 53 +++++++++++++++++++------------------- 1 file changed, 26 insertions(+), 27 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index a4a3a09..fc3af86 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -27,13 +27,12 @@ static bool perf_counters_initialized __read_mostly; static int nr_hw_counters __read_mostly; static u32 perf_counter_mask __read_mostly; -/* No support for fixed function counters yet */ - -#define MAX_HW_COUNTERS 8 - struct cpu_hw_counters { - struct perf_counter *counters[MAX_HW_COUNTERS]; - unsigned long used[BITS_TO_LONGS(MAX_HW_COUNTERS)]; + struct perf_counter *generic[X86_PMC_MAX_GENERIC]; + unsigned long used[BITS_TO_LONGS(X86_PMC_MAX_GENERIC)]; + + struct perf_counter *fixed[X86_PMC_MAX_FIXED]; + unsigned long used_fixed[BITS_TO_LONGS(X86_PMC_MAX_FIXED)]; }; /* @@ -185,7 +184,7 @@ void hw_perf_restore(u64 ctrl) EXPORT_SYMBOL_GPL(hw_perf_restore); static inline void -__x86_perf_counter_disable(struct perf_counter *counter, +__pmc_generic_disable(struct perf_counter *counter, struct hw_perf_counter *hwc, unsigned int idx) { int err; @@ -193,7 +192,7 @@ __x86_perf_counter_disable(struct perf_counter *counter, err = wrmsr_safe(hwc->config_base + idx, hwc->config, 0); } -static DEFINE_PER_CPU(u64, prev_left[MAX_HW_COUNTERS]); +static DEFINE_PER_CPU(u64, prev_left[X86_PMC_MAX_GENERIC]); /* * Set the next IRQ period, based on the hwc->period_left value. @@ -231,7 +230,7 @@ __hw_perf_counter_set_period(struct perf_counter *counter, } static void -__x86_perf_counter_enable(struct perf_counter *counter, +__pmc_generic_enable(struct perf_counter *counter, struct hw_perf_counter *hwc, int idx) { wrmsr(hwc->config_base + idx, @@ -241,7 +240,7 @@ __x86_perf_counter_enable(struct perf_counter *counter, /* * Find a PMC slot for the freshly enabled / scheduled in counter: */ -static void x86_perf_counter_enable(struct perf_counter *counter) +static void pmc_generic_enable(struct perf_counter *counter) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); struct hw_perf_counter *hwc = &counter->hw; @@ -256,12 +255,12 @@ static void x86_perf_counter_enable(struct perf_counter *counter) perf_counters_lapic_init(hwc->nmi); - __x86_perf_counter_disable(counter, hwc, idx); + __pmc_generic_disable(counter, hwc, idx); - cpuc->counters[idx] = counter; + cpuc->generic[idx] = counter; __hw_perf_counter_set_period(counter, hwc, idx); - __x86_perf_counter_enable(counter, hwc, idx); + __pmc_generic_enable(counter, hwc, idx); } void perf_counter_print_debug(void) @@ -301,16 +300,16 @@ void perf_counter_print_debug(void) local_irq_enable(); } -static void x86_perf_counter_disable(struct perf_counter *counter) +static void pmc_generic_disable(struct perf_counter *counter) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); struct hw_perf_counter *hwc = &counter->hw; unsigned int idx = hwc->idx; - __x86_perf_counter_disable(counter, hwc, idx); + __pmc_generic_disable(counter, hwc, idx); clear_bit(idx, cpuc->used); - cpuc->counters[idx] = NULL; + cpuc->generic[idx] = NULL; /* * Drain the remaining delta count out of a counter @@ -349,7 +348,7 @@ static void perf_save_and_restart(struct perf_counter *counter) __hw_perf_counter_set_period(counter, hwc, idx); if (pmc_ctrl & ARCH_PERFMON_EVENTSEL0_ENABLE) - __x86_perf_counter_enable(counter, hwc, idx); + __pmc_generic_enable(counter, hwc, idx); } static void @@ -392,7 +391,7 @@ static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi) again: ack = status; for_each_bit(bit, (unsigned long *) &status, nr_hw_counters) { - struct perf_counter *counter = cpuc->counters[bit]; + struct perf_counter *counter = cpuc->generic[bit]; clear_bit(bit, (unsigned long *) &status); if (!counter) @@ -412,7 +411,7 @@ again: } /* * From NMI context we cannot call into the scheduler to - * do a task wakeup - but we mark these counters as + * do a task wakeup - but we mark these generic as * wakeup_pending and initate a wakeup callback: */ if (nmi) { @@ -462,7 +461,7 @@ void perf_counter_notify(struct pt_regs *regs) cpuc = &per_cpu(cpu_hw_counters, cpu); for_each_bit(bit, cpuc->used, nr_hw_counters) { - struct perf_counter *counter = cpuc->counters[bit]; + struct perf_counter *counter = cpuc->generic[bit]; if (!counter) continue; @@ -539,10 +538,10 @@ void __init init_hw_perf_counters(void) printk(KERN_INFO "... version: %d\n", eax.split.version_id); printk(KERN_INFO "... num_counters: %d\n", eax.split.num_counters); nr_hw_counters = eax.split.num_counters; - if (nr_hw_counters > MAX_HW_COUNTERS) { - nr_hw_counters = MAX_HW_COUNTERS; + if (nr_hw_counters > X86_PMC_MAX_GENERIC) { + nr_hw_counters = X86_PMC_MAX_GENERIC; WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!", - nr_hw_counters, MAX_HW_COUNTERS); + nr_hw_counters, X86_PMC_MAX_GENERIC); } perf_counter_mask = (1 << nr_hw_counters) - 1; perf_max_counters = nr_hw_counters; @@ -556,15 +555,15 @@ void __init init_hw_perf_counters(void) register_die_notifier(&perf_counter_nmi_notifier); } -static void x86_perf_counter_read(struct perf_counter *counter) +static void pmc_generic_read(struct perf_counter *counter) { x86_perf_counter_update(counter, &counter->hw, counter->hw.idx); } static const struct hw_perf_counter_ops x86_perf_counter_ops = { - .hw_perf_counter_enable = x86_perf_counter_enable, - .hw_perf_counter_disable = x86_perf_counter_disable, - .hw_perf_counter_read = x86_perf_counter_read, + .hw_perf_counter_enable = pmc_generic_enable, + .hw_perf_counter_disable = pmc_generic_disable, + .hw_perf_counter_read = pmc_generic_read, }; const struct hw_perf_counter_ops * -- cgit v1.1 From 703e937c83bbad79075a7846e062e447c2fee6a4 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 17 Dec 2008 10:51:15 +0100 Subject: perfcounters: add fixed-mode PMC enumeration Enumerate fixed-mode PMCs based on CPUID, and feed that into the perfcounter code. Does not use fixed-mode PMCs yet. Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index fc3af86..2fca50c 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -27,6 +27,8 @@ static bool perf_counters_initialized __read_mostly; static int nr_hw_counters __read_mostly; static u32 perf_counter_mask __read_mostly; +static int nr_hw_counters_fixed __read_mostly; + struct cpu_hw_counters { struct perf_counter *generic[X86_PMC_MAX_GENERIC]; unsigned long used[BITS_TO_LONGS(X86_PMC_MAX_GENERIC)]; @@ -519,8 +521,9 @@ static __read_mostly struct notifier_block perf_counter_nmi_notifier = { void __init init_hw_perf_counters(void) { union cpuid10_eax eax; - unsigned int unused; unsigned int ebx; + unsigned int unused; + union cpuid10_edx edx; if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) return; @@ -529,14 +532,14 @@ void __init init_hw_perf_counters(void) * Check whether the Architectural PerfMon supports * Branch Misses Retired Event or not. */ - cpuid(10, &(eax.full), &ebx, &unused, &unused); + cpuid(10, &eax.full, &ebx, &unused, &edx.full); if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED) return; printk(KERN_INFO "Intel Performance Monitoring support detected.\n"); - printk(KERN_INFO "... version: %d\n", eax.split.version_id); - printk(KERN_INFO "... num_counters: %d\n", eax.split.num_counters); + printk(KERN_INFO "... version: %d\n", eax.split.version_id); + printk(KERN_INFO "... num counters: %d\n", eax.split.num_counters); nr_hw_counters = eax.split.num_counters; if (nr_hw_counters > X86_PMC_MAX_GENERIC) { nr_hw_counters = X86_PMC_MAX_GENERIC; @@ -546,8 +549,16 @@ void __init init_hw_perf_counters(void) perf_counter_mask = (1 << nr_hw_counters) - 1; perf_max_counters = nr_hw_counters; - printk(KERN_INFO "... bit_width: %d\n", eax.split.bit_width); - printk(KERN_INFO "... mask_length: %d\n", eax.split.mask_length); + printk(KERN_INFO "... bit width: %d\n", eax.split.bit_width); + printk(KERN_INFO "... mask length: %d\n", eax.split.mask_length); + + nr_hw_counters_fixed = edx.split.num_counters_fixed; + if (nr_hw_counters_fixed > X86_PMC_MAX_FIXED) { + nr_hw_counters_fixed = X86_PMC_MAX_FIXED; + WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!", + nr_hw_counters_fixed, X86_PMC_MAX_FIXED); + } + printk(KERN_INFO "... fixed counters: %d\n", nr_hw_counters_fixed); perf_counters_initialized = true; -- cgit v1.1 From 862a1a5f346fe7e9181ea51eaae48cf2cd70f746 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 17 Dec 2008 13:09:20 +0100 Subject: x86, perfcounters: refactor code for fixed-function PMCs Impact: clean up Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 73 ++++++++++++++++++++------------------ 1 file changed, 39 insertions(+), 34 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 2fca50c..358af52 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -24,17 +24,14 @@ static bool perf_counters_initialized __read_mostly; /* * Number of (generic) HW counters: */ -static int nr_hw_counters __read_mostly; -static u32 perf_counter_mask __read_mostly; +static int nr_counters_generic __read_mostly; +static u64 perf_counter_mask __read_mostly; -static int nr_hw_counters_fixed __read_mostly; +static int nr_counters_fixed __read_mostly; struct cpu_hw_counters { - struct perf_counter *generic[X86_PMC_MAX_GENERIC]; - unsigned long used[BITS_TO_LONGS(X86_PMC_MAX_GENERIC)]; - - struct perf_counter *fixed[X86_PMC_MAX_FIXED]; - unsigned long used_fixed[BITS_TO_LONGS(X86_PMC_MAX_FIXED)]; + struct perf_counter *counters[X86_PMC_IDX_MAX]; + unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; }; /* @@ -159,7 +156,7 @@ void hw_perf_enable_all(void) if (unlikely(!perf_counters_initialized)) return; - wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask, 0); + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask); } u64 hw_perf_save_disable(void) @@ -170,7 +167,7 @@ u64 hw_perf_save_disable(void) return 0; rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); - wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0); + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); return ctrl; } @@ -181,7 +178,7 @@ void hw_perf_restore(u64 ctrl) if (unlikely(!perf_counters_initialized)) return; - wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, ctrl, 0); + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); } EXPORT_SYMBOL_GPL(hw_perf_restore); @@ -239,6 +236,11 @@ __pmc_generic_enable(struct perf_counter *counter, hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE, 0); } +static int fixed_mode_idx(struct hw_perf_counter *hwc) +{ + return -1; +} + /* * Find a PMC slot for the freshly enabled / scheduled in counter: */ @@ -250,7 +252,7 @@ static void pmc_generic_enable(struct perf_counter *counter) /* Try to get the previous counter again */ if (test_and_set_bit(idx, cpuc->used)) { - idx = find_first_zero_bit(cpuc->used, nr_hw_counters); + idx = find_first_zero_bit(cpuc->used, nr_counters_generic); set_bit(idx, cpuc->used); hwc->idx = idx; } @@ -259,7 +261,7 @@ static void pmc_generic_enable(struct perf_counter *counter) __pmc_generic_disable(counter, hwc, idx); - cpuc->generic[idx] = counter; + cpuc->counters[idx] = counter; __hw_perf_counter_set_period(counter, hwc, idx); __pmc_generic_enable(counter, hwc, idx); @@ -270,7 +272,7 @@ void perf_counter_print_debug(void) u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left; int cpu, idx; - if (!nr_hw_counters) + if (!nr_counters_generic) return; local_irq_disable(); @@ -286,7 +288,7 @@ void perf_counter_print_debug(void) printk(KERN_INFO "CPU#%d: status: %016llx\n", cpu, status); printk(KERN_INFO "CPU#%d: overflow: %016llx\n", cpu, overflow); - for (idx = 0; idx < nr_hw_counters; idx++) { + for (idx = 0; idx < nr_counters_generic; idx++) { rdmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, pmc_ctrl); rdmsrl(MSR_ARCH_PERFMON_PERFCTR0 + idx, pmc_count); @@ -311,7 +313,7 @@ static void pmc_generic_disable(struct perf_counter *counter) __pmc_generic_disable(counter, hwc, idx); clear_bit(idx, cpuc->used); - cpuc->generic[idx] = NULL; + cpuc->counters[idx] = NULL; /* * Drain the remaining delta count out of a counter @@ -381,7 +383,7 @@ static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi) rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, saved_global); /* Disable counters globally */ - wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0); + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); ack_APIC_irq(); cpuc = &per_cpu(cpu_hw_counters, cpu); @@ -392,8 +394,8 @@ static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi) again: ack = status; - for_each_bit(bit, (unsigned long *) &status, nr_hw_counters) { - struct perf_counter *counter = cpuc->generic[bit]; + for_each_bit(bit, (unsigned long *) &status, nr_counters_generic) { + struct perf_counter *counter = cpuc->counters[bit]; clear_bit(bit, (unsigned long *) &status); if (!counter) @@ -424,7 +426,7 @@ again: } } - wrmsr(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack, 0); + wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); /* * Repeat if there is more work to be done: @@ -436,7 +438,7 @@ out: /* * Restore - do not reenable when global enable is off: */ - wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, saved_global, 0); + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, saved_global); } void smp_perf_counter_interrupt(struct pt_regs *regs) @@ -462,8 +464,8 @@ void perf_counter_notify(struct pt_regs *regs) cpu = smp_processor_id(); cpuc = &per_cpu(cpu_hw_counters, cpu); - for_each_bit(bit, cpuc->used, nr_hw_counters) { - struct perf_counter *counter = cpuc->generic[bit]; + for_each_bit(bit, cpuc->used, X86_PMC_IDX_MAX) { + struct perf_counter *counter = cpuc->counters[bit]; if (!counter) continue; @@ -540,26 +542,29 @@ void __init init_hw_perf_counters(void) printk(KERN_INFO "... version: %d\n", eax.split.version_id); printk(KERN_INFO "... num counters: %d\n", eax.split.num_counters); - nr_hw_counters = eax.split.num_counters; - if (nr_hw_counters > X86_PMC_MAX_GENERIC) { - nr_hw_counters = X86_PMC_MAX_GENERIC; + nr_counters_generic = eax.split.num_counters; + if (nr_counters_generic > X86_PMC_MAX_GENERIC) { + nr_counters_generic = X86_PMC_MAX_GENERIC; WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!", - nr_hw_counters, X86_PMC_MAX_GENERIC); + nr_counters_generic, X86_PMC_MAX_GENERIC); } - perf_counter_mask = (1 << nr_hw_counters) - 1; - perf_max_counters = nr_hw_counters; + perf_counter_mask = (1 << nr_counters_generic) - 1; + perf_max_counters = nr_counters_generic; printk(KERN_INFO "... bit width: %d\n", eax.split.bit_width); printk(KERN_INFO "... mask length: %d\n", eax.split.mask_length); - nr_hw_counters_fixed = edx.split.num_counters_fixed; - if (nr_hw_counters_fixed > X86_PMC_MAX_FIXED) { - nr_hw_counters_fixed = X86_PMC_MAX_FIXED; + nr_counters_fixed = edx.split.num_counters_fixed; + if (nr_counters_fixed > X86_PMC_MAX_FIXED) { + nr_counters_fixed = X86_PMC_MAX_FIXED; WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!", - nr_hw_counters_fixed, X86_PMC_MAX_FIXED); + nr_counters_fixed, X86_PMC_MAX_FIXED); } - printk(KERN_INFO "... fixed counters: %d\n", nr_hw_counters_fixed); + printk(KERN_INFO "... fixed counters: %d\n", nr_counters_fixed); + + perf_counter_mask |= ((1LL << nr_counters_fixed)-1) << X86_PMC_IDX_FIXED; + printk(KERN_INFO "... counter mask: %016Lx\n", perf_counter_mask); perf_counters_initialized = true; perf_counters_lapic_init(0); -- cgit v1.1 From 7671581f1666ef4b54a1c1e598c51ac44c060a9b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 17 Dec 2008 14:20:28 +0100 Subject: perfcounters: hw ops rename Impact: rename field names Shorten them. Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 358af52..b675571 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -577,9 +577,9 @@ static void pmc_generic_read(struct perf_counter *counter) } static const struct hw_perf_counter_ops x86_perf_counter_ops = { - .hw_perf_counter_enable = pmc_generic_enable, - .hw_perf_counter_disable = pmc_generic_disable, - .hw_perf_counter_read = pmc_generic_read, + .enable = pmc_generic_enable, + .disable = pmc_generic_disable, + .read = pmc_generic_read, }; const struct hw_perf_counter_ops * -- cgit v1.1 From 95cdd2e7851cce79ab839cb0b3cbe68d7911d0f1 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 21 Dec 2008 13:50:42 +0100 Subject: perfcounters: enable lowlevel pmc code to schedule counters Allow lowlevel ->enable() op to return an error if a counter can not be added. This can be used to handle counter constraints. Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index b675571..74090a3 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -244,7 +244,7 @@ static int fixed_mode_idx(struct hw_perf_counter *hwc) /* * Find a PMC slot for the freshly enabled / scheduled in counter: */ -static void pmc_generic_enable(struct perf_counter *counter) +static int pmc_generic_enable(struct perf_counter *counter) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); struct hw_perf_counter *hwc = &counter->hw; @@ -253,6 +253,8 @@ static void pmc_generic_enable(struct perf_counter *counter) /* Try to get the previous counter again */ if (test_and_set_bit(idx, cpuc->used)) { idx = find_first_zero_bit(cpuc->used, nr_counters_generic); + if (idx == nr_counters_generic) + return -EAGAIN; set_bit(idx, cpuc->used); hwc->idx = idx; } @@ -265,6 +267,8 @@ static void pmc_generic_enable(struct perf_counter *counter) __hw_perf_counter_set_period(counter, hwc, idx); __pmc_generic_enable(counter, hwc, idx); + + return 0; } void perf_counter_print_debug(void) -- cgit v1.1 From 0dff86aa7b9ec65a6d07167b7afb050b5fc98ddc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 23 Dec 2008 12:28:12 +0100 Subject: x86, perfcounters: print out the ->used bitmask Impact: extend debug printouts Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 74090a3..f3359c2 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -255,6 +255,7 @@ static int pmc_generic_enable(struct perf_counter *counter) idx = find_first_zero_bit(cpuc->used, nr_counters_generic); if (idx == nr_counters_generic) return -EAGAIN; + set_bit(idx, cpuc->used); hwc->idx = idx; } @@ -274,6 +275,7 @@ static int pmc_generic_enable(struct perf_counter *counter) void perf_counter_print_debug(void) { u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left; + struct cpu_hw_counters *cpuc; int cpu, idx; if (!nr_counters_generic) @@ -282,6 +284,7 @@ void perf_counter_print_debug(void) local_irq_disable(); cpu = smp_processor_id(); + cpuc = &per_cpu(cpu_hw_counters, cpu); rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); @@ -291,6 +294,7 @@ void perf_counter_print_debug(void) printk(KERN_INFO "CPU#%d: ctrl: %016llx\n", cpu, ctrl); printk(KERN_INFO "CPU#%d: status: %016llx\n", cpu, status); printk(KERN_INFO "CPU#%d: overflow: %016llx\n", cpu, overflow); + printk(KERN_INFO "CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used); for (idx = 0; idx < nr_counters_generic; idx++) { rdmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, pmc_ctrl); -- cgit v1.1 From f650a672359819454c3d8d4135ecd1558cde0b24 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 23 Dec 2008 12:17:29 +0100 Subject: perfcounters: add PERF_COUNT_BUS_CYCLES Generalize "bus cycles" hw events - and map them to CPU_CLK_Unhalted.Ref on x86. (which is a good enough approximation) Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index f3359c2..86b2fdd 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -41,12 +41,13 @@ static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters); static const int intel_perfmon_event_map[] = { - [PERF_COUNT_CYCLES] = 0x003c, + [PERF_COUNT_CPU_CYCLES] = 0x003c, [PERF_COUNT_INSTRUCTIONS] = 0x00c0, [PERF_COUNT_CACHE_REFERENCES] = 0x4f2e, [PERF_COUNT_CACHE_MISSES] = 0x412e, [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x00c4, [PERF_COUNT_BRANCH_MISSES] = 0x00c5, + [PERF_COUNT_BUS_CYCLES] = 0x013c, }; static const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map); -- cgit v1.1 From 2f18d1e8d07ae67dd0afce875287756d4bd31a46 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 22 Dec 2008 11:10:42 +0100 Subject: x86, perfcounters: add support for fixed-function pmcs Impact: extend performance counter support on x86 Intel CPUs Modern Intel CPUs have 3 "fixed-function" performance counters, which count these hardware events: Instr_Retired.Any CPU_CLK_Unhalted.Core CPU_CLK_Unhalted.Ref Add support for them to the performance counters subsystem. Their use is transparent to user-space: the counter scheduler is extended to automatically recognize the cases where a fixed-function PMC can be utilized instead of a generic PMC. In such cases the generic PMC is kept available for more counters. The above fixed-function events map to these generic counter hw events: PERF_COUNT_INSTRUCTIONS PERF_COUNT_CPU_CYCLES PERF_COUNT_BUS_CYCLES (The 'bus' cycles are in reality often CPU-ish cycles, just with a fixed frequency.) Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 149 +++++++++++++++++++++++++++++++------ 1 file changed, 125 insertions(+), 24 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 86b2fdd..da46eca 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -26,6 +26,7 @@ static bool perf_counters_initialized __read_mostly; */ static int nr_counters_generic __read_mostly; static u64 perf_counter_mask __read_mostly; +static u64 counter_value_mask __read_mostly; static int nr_counters_fixed __read_mostly; @@ -120,9 +121,6 @@ static int __hw_perf_counter_init(struct perf_counter *counter) hwc->nmi = 1; } - hwc->config_base = MSR_ARCH_PERFMON_EVENTSEL0; - hwc->counter_base = MSR_ARCH_PERFMON_PERFCTR0; - hwc->irq_period = hw_event->irq_period; /* * Intel PMCs cannot be accessed sanely above 32 bit width, @@ -184,15 +182,33 @@ void hw_perf_restore(u64 ctrl) EXPORT_SYMBOL_GPL(hw_perf_restore); static inline void +__pmc_fixed_disable(struct perf_counter *counter, + struct hw_perf_counter *hwc, unsigned int __idx) +{ + int idx = __idx - X86_PMC_IDX_FIXED; + u64 ctrl_val, mask; + int err; + + mask = 0xfULL << (idx * 4); + + rdmsrl(hwc->config_base, ctrl_val); + ctrl_val &= ~mask; + err = checking_wrmsrl(hwc->config_base, ctrl_val); +} + +static inline void __pmc_generic_disable(struct perf_counter *counter, struct hw_perf_counter *hwc, unsigned int idx) { int err; + if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) + return __pmc_fixed_disable(counter, hwc, idx); + err = wrmsr_safe(hwc->config_base + idx, hwc->config, 0); } -static DEFINE_PER_CPU(u64, prev_left[X86_PMC_MAX_GENERIC]); +static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]); /* * Set the next IRQ period, based on the hwc->period_left value. @@ -202,8 +218,9 @@ static void __hw_perf_counter_set_period(struct perf_counter *counter, struct hw_perf_counter *hwc, int idx) { - s32 left = atomic64_read(&hwc->period_left); + s64 left = atomic64_read(&hwc->period_left); s32 period = hwc->irq_period; + int err; /* * If we are way outside a reasoable range then just skip forward: @@ -224,21 +241,64 @@ __hw_perf_counter_set_period(struct perf_counter *counter, * The hw counter starts counting from this counter offset, * mark it to be able to extra future deltas: */ - atomic64_set(&hwc->prev_count, (u64)(s64)-left); + atomic64_set(&hwc->prev_count, (u64)-left); - wrmsr(hwc->counter_base + idx, -left, 0); + err = checking_wrmsrl(hwc->counter_base + idx, + (u64)(-left) & counter_value_mask); +} + +static inline void +__pmc_fixed_enable(struct perf_counter *counter, + struct hw_perf_counter *hwc, unsigned int __idx) +{ + int idx = __idx - X86_PMC_IDX_FIXED; + u64 ctrl_val, bits, mask; + int err; + + /* + * Enable IRQ generation (0x8) and ring-3 counting (0x2), + * and enable ring-0 counting if allowed: + */ + bits = 0x8ULL | 0x2ULL; + if (hwc->config & ARCH_PERFMON_EVENTSEL_OS) + bits |= 0x1; + bits <<= (idx * 4); + mask = 0xfULL << (idx * 4); + + rdmsrl(hwc->config_base, ctrl_val); + ctrl_val &= ~mask; + ctrl_val |= bits; + err = checking_wrmsrl(hwc->config_base, ctrl_val); } static void __pmc_generic_enable(struct perf_counter *counter, struct hw_perf_counter *hwc, int idx) { + if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) + return __pmc_fixed_enable(counter, hwc, idx); + wrmsr(hwc->config_base + idx, hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE, 0); } -static int fixed_mode_idx(struct hw_perf_counter *hwc) +static int +fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc) { + unsigned int event; + + if (unlikely(hwc->nmi)) + return -1; + + event = hwc->config & ARCH_PERFMON_EVENT_MASK; + + if (unlikely(event == intel_perfmon_event_map[PERF_COUNT_INSTRUCTIONS])) + return X86_PMC_IDX_FIXED_INSTRUCTIONS; + if (unlikely(event == intel_perfmon_event_map[PERF_COUNT_CPU_CYCLES])) + return X86_PMC_IDX_FIXED_CPU_CYCLES; + if (unlikely(event == intel_perfmon_event_map[PERF_COUNT_BUS_CYCLES])) + return X86_PMC_IDX_FIXED_BUS_CYCLES; + return -1; } @@ -249,16 +309,39 @@ static int pmc_generic_enable(struct perf_counter *counter) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); struct hw_perf_counter *hwc = &counter->hw; - int idx = hwc->idx; + int idx; - /* Try to get the previous counter again */ - if (test_and_set_bit(idx, cpuc->used)) { - idx = find_first_zero_bit(cpuc->used, nr_counters_generic); - if (idx == nr_counters_generic) - return -EAGAIN; + idx = fixed_mode_idx(counter, hwc); + if (idx >= 0) { + /* + * Try to get the fixed counter, if that is already taken + * then try to get a generic counter: + */ + if (test_and_set_bit(idx, cpuc->used)) + goto try_generic; - set_bit(idx, cpuc->used); + hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; + /* + * We set it so that counter_base + idx in wrmsr/rdmsr maps to + * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2: + */ + hwc->counter_base = + MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED; hwc->idx = idx; + } else { + idx = hwc->idx; + /* Try to get the previous generic counter again */ + if (test_and_set_bit(idx, cpuc->used)) { +try_generic: + idx = find_first_zero_bit(cpuc->used, nr_counters_generic); + if (idx == nr_counters_generic) + return -EAGAIN; + + set_bit(idx, cpuc->used); + hwc->idx = idx; + } + hwc->config_base = MSR_ARCH_PERFMON_EVENTSEL0; + hwc->counter_base = MSR_ARCH_PERFMON_PERFCTR0; } perf_counters_lapic_init(hwc->nmi); @@ -266,6 +349,10 @@ static int pmc_generic_enable(struct perf_counter *counter) __pmc_generic_disable(counter, hwc, idx); cpuc->counters[idx] = counter; + /* + * Make it visible before enabling the hw: + */ + smp_wmb(); __hw_perf_counter_set_period(counter, hwc, idx); __pmc_generic_enable(counter, hwc, idx); @@ -275,7 +362,7 @@ static int pmc_generic_enable(struct perf_counter *counter) void perf_counter_print_debug(void) { - u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left; + u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed; struct cpu_hw_counters *cpuc; int cpu, idx; @@ -290,11 +377,13 @@ void perf_counter_print_debug(void) rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow); + rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed); printk(KERN_INFO "\n"); printk(KERN_INFO "CPU#%d: ctrl: %016llx\n", cpu, ctrl); printk(KERN_INFO "CPU#%d: status: %016llx\n", cpu, status); printk(KERN_INFO "CPU#%d: overflow: %016llx\n", cpu, overflow); + printk(KERN_INFO "CPU#%d: fixed: %016llx\n", cpu, fixed); printk(KERN_INFO "CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used); for (idx = 0; idx < nr_counters_generic; idx++) { @@ -303,13 +392,19 @@ void perf_counter_print_debug(void) prev_left = per_cpu(prev_left[idx], cpu); - printk(KERN_INFO "CPU#%d: PMC%d ctrl: %016llx\n", + printk(KERN_INFO "CPU#%d: gen-PMC%d ctrl: %016llx\n", cpu, idx, pmc_ctrl); - printk(KERN_INFO "CPU#%d: PMC%d count: %016llx\n", + printk(KERN_INFO "CPU#%d: gen-PMC%d count: %016llx\n", cpu, idx, pmc_count); - printk(KERN_INFO "CPU#%d: PMC%d left: %016llx\n", + printk(KERN_INFO "CPU#%d: gen-PMC%d left: %016llx\n", cpu, idx, prev_left); } + for (idx = 0; idx < nr_counters_fixed; idx++) { + rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count); + + printk(KERN_INFO "CPU#%d: fixed-PMC%d count: %016llx\n", + cpu, idx, pmc_count); + } local_irq_enable(); } @@ -323,6 +418,11 @@ static void pmc_generic_disable(struct perf_counter *counter) clear_bit(idx, cpuc->used); cpuc->counters[idx] = NULL; + /* + * Make sure the cleared pointer becomes visible before we + * (potentially) free the counter: + */ + smp_wmb(); /* * Drain the remaining delta count out of a counter @@ -353,14 +453,11 @@ static void perf_save_and_restart(struct perf_counter *counter) { struct hw_perf_counter *hwc = &counter->hw; int idx = hwc->idx; - u64 pmc_ctrl; - - rdmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, pmc_ctrl); x86_perf_counter_update(counter, hwc, idx); __hw_perf_counter_set_period(counter, hwc, idx); - if (pmc_ctrl & ARCH_PERFMON_EVENTSEL0_ENABLE) + if (counter->state == PERF_COUNTER_STATE_ACTIVE) __pmc_generic_enable(counter, hwc, idx); } @@ -373,6 +470,7 @@ perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown) * Store sibling timestamps (if any): */ list_for_each_entry(counter, &group_leader->sibling_list, list_entry) { + x86_perf_counter_update(counter, &counter->hw, counter->hw.idx); perf_store_irq_data(sibling, counter->hw_event.type); perf_store_irq_data(sibling, atomic64_read(&counter->count)); @@ -403,7 +501,7 @@ static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi) again: ack = status; - for_each_bit(bit, (unsigned long *) &status, nr_counters_generic) { + for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { struct perf_counter *counter = cpuc->counters[bit]; clear_bit(bit, (unsigned long *) &status); @@ -561,6 +659,9 @@ void __init init_hw_perf_counters(void) perf_max_counters = nr_counters_generic; printk(KERN_INFO "... bit width: %d\n", eax.split.bit_width); + counter_value_mask = (1ULL << eax.split.bit_width) - 1; + printk(KERN_INFO "... value mask: %016Lx\n", counter_value_mask); + printk(KERN_INFO "... mask length: %d\n", eax.split.mask_length); nr_counters_fixed = edx.split.num_counters_fixed; -- cgit v1.1 From 2b583d8bc8d7105b58d7481a4a0ceb718dac49c6 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Sat, 27 Dec 2008 19:15:43 +0530 Subject: x86: perf_counter remove unwanted hw_perf_enable_all Impact: clean, reduce kernel size a bit, avoid sparse warnings Fixes sparse warnings: arch/x86/kernel/cpu/perf_counter.c:153:6: warning: symbol 'hw_perf_enable_all' was not declared. Should it be static? arch/x86/kernel/cpu/perf_counter.c:279:3: warning: returning void-valued expression arch/x86/kernel/cpu/perf_counter.c:206:3: warning: returning void-valued expression arch/x86/kernel/cpu/perf_counter.c:206:3: warning: returning void-valued expression Signed-off-by: Jaswinder Singh Rajput Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 24 +++++++----------------- 1 file changed, 7 insertions(+), 17 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index da46eca..9376771 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -150,14 +150,6 @@ static int __hw_perf_counter_init(struct perf_counter *counter) return 0; } -void hw_perf_enable_all(void) -{ - if (unlikely(!perf_counters_initialized)) - return; - - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask); -} - u64 hw_perf_save_disable(void) { u64 ctrl; @@ -200,12 +192,10 @@ static inline void __pmc_generic_disable(struct perf_counter *counter, struct hw_perf_counter *hwc, unsigned int idx) { - int err; - if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) - return __pmc_fixed_disable(counter, hwc, idx); - - err = wrmsr_safe(hwc->config_base + idx, hwc->config, 0); + __pmc_fixed_disable(counter, hwc, idx); + else + wrmsr_safe(hwc->config_base + idx, hwc->config, 0); } static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]); @@ -276,10 +266,10 @@ __pmc_generic_enable(struct perf_counter *counter, struct hw_perf_counter *hwc, int idx) { if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) - return __pmc_fixed_enable(counter, hwc, idx); - - wrmsr(hwc->config_base + idx, - hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE, 0); + __pmc_fixed_enable(counter, hwc, idx); + else + wrmsr(hwc->config_base + idx, + hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE, 0); } static int -- cgit v1.1 From 1b023a96d9b44f50f4d8ff28c15f5b80e354760f Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Fri, 23 Jan 2009 10:13:01 +0100 Subject: perfcounters: throttle on too high IRQ rates Starting kerneltop with only -c 100 seems to be a bad idea, it can easily lock the system due to perfcounter IRQ overload. So add throttling: if a new IRQ arrives in a shorter than PERFMON_MIN_PERIOD_NS time, turn off perfcounters and untrottle them from the next timer tick. Signed-off-by: Mike Galbraith Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic.c | 2 ++ arch/x86/kernel/cpu/perf_counter.c | 38 ++++++++++++++++++++++++++++++++------ 2 files changed, 34 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c index 7b434e5..849c230 100644 --- a/arch/x86/kernel/apic.c +++ b/arch/x86/kernel/apic.c @@ -781,6 +781,8 @@ static void local_apic_timer_interrupt(void) inc_irq_stat(apic_timer_irqs); evt->event_handler(evt); + + perf_counter_unthrottle(); } /* diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 9376771..1a040b1 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -33,6 +33,9 @@ static int nr_counters_fixed __read_mostly; struct cpu_hw_counters { struct perf_counter *counters[X86_PMC_IDX_MAX]; unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + u64 last_interrupt; + u64 global_enable; + int throttled; }; /* @@ -474,16 +477,19 @@ perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown) static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi) { int bit, cpu = smp_processor_id(); - u64 ack, status, saved_global; - struct cpu_hw_counters *cpuc; + u64 ack, status, now; + struct cpu_hw_counters *cpuc = &per_cpu(cpu_hw_counters, cpu); - rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, saved_global); + rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable); /* Disable counters globally */ wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); ack_APIC_irq(); - cpuc = &per_cpu(cpu_hw_counters, cpu); + now = sched_clock(); + if (now - cpuc->last_interrupt < PERFMON_MIN_PERIOD_NS) + cpuc->throttled = 1; + cpuc->last_interrupt = now; rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); if (!status) @@ -533,9 +539,29 @@ again: goto again; out: /* - * Restore - do not reenable when global enable is off: + * Restore - do not reenable when global enable is off or throttled: */ - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, saved_global); + if (!cpuc->throttled) + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable); +} + +void perf_counter_unthrottle(void) +{ + struct cpu_hw_counters *cpuc; + + if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) + return; + + if (unlikely(!perf_counters_initialized)) + return; + + cpuc = &per_cpu(cpu_hw_counters, smp_processor_id()); + if (cpuc->throttled) { + if (printk_ratelimit()) + printk(KERN_WARNING "PERFMON: max event frequency exceeded!\n"); + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable); + cpuc->throttled = 0; + } } void smp_perf_counter_interrupt(struct pt_regs *regs) -- cgit v1.1 From 4b39fd96855254a244f71245b41a91cdecb87d63 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Fri, 23 Jan 2009 14:36:16 +0100 Subject: perfcounters: ratelimit performance counter interrupts Ratelimit performance counter interrupts to 100KHz per CPU. This replaces the irq-delta-time based method. Signed-off-by: Mike Galbraith Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 1a040b1..a56d4cf 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -33,9 +33,8 @@ static int nr_counters_fixed __read_mostly; struct cpu_hw_counters { struct perf_counter *counters[X86_PMC_IDX_MAX]; unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; - u64 last_interrupt; + unsigned long interrupts; u64 global_enable; - int throttled; }; /* @@ -471,13 +470,18 @@ perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown) } /* + * Maximum interrupt frequency of 100KHz per CPU + */ +#define PERFMON_MAX_INTERRUPTS 100000/HZ + +/* * This handler is triggered by the local APIC, so the APIC IRQ handling * rules apply: */ static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi) { int bit, cpu = smp_processor_id(); - u64 ack, status, now; + u64 ack, status; struct cpu_hw_counters *cpuc = &per_cpu(cpu_hw_counters, cpu); rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable); @@ -486,11 +490,6 @@ static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi) wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); ack_APIC_irq(); - now = sched_clock(); - if (now - cpuc->last_interrupt < PERFMON_MIN_PERIOD_NS) - cpuc->throttled = 1; - cpuc->last_interrupt = now; - rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); if (!status) goto out; @@ -541,13 +540,14 @@ out: /* * Restore - do not reenable when global enable is off or throttled: */ - if (!cpuc->throttled) + if (++cpuc->interrupts < PERFMON_MAX_INTERRUPTS) wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable); } void perf_counter_unthrottle(void) { struct cpu_hw_counters *cpuc; + u64 global_enable; if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) return; @@ -556,12 +556,15 @@ void perf_counter_unthrottle(void) return; cpuc = &per_cpu(cpu_hw_counters, smp_processor_id()); - if (cpuc->throttled) { + if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) { if (printk_ratelimit()) - printk(KERN_WARNING "PERFMON: max event frequency exceeded!\n"); + printk(KERN_WARNING "PERFMON: max interrupts exceeded!\n"); wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable); - cpuc->throttled = 0; } + rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, global_enable); + if (unlikely(cpuc->global_enable && !global_enable)) + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable); + cpuc->interrupts = 0; } void smp_perf_counter_interrupt(struct pt_regs *regs) -- cgit v1.1 From 3415dd9146c574bffe8f012c096bfc2bc62b9508 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Fri, 23 Jan 2009 14:16:53 +0100 Subject: perfcounters fix section mismatch warning in perf_counter.c::perf_counters_lapic_init() Fix: WARNING: arch/x86/kernel/built-in.o(.text+0xdd0f): Section mismatch in reference from the function pmc_generic_enable() to the function .cpuinit.text:perf_counters_lapic_init() The function pmc_generic_enable() references the function __cpuinit perf_counters_lapic_init(). This is often because pmc_generic_enable lacks a __cpuinit annotation or the annotation of perf_counters_lapic_init is wrong. Signed-off-by: Mike Galbraith Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index a56d4cf..46c436c 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -605,7 +605,7 @@ void perf_counter_notify(struct pt_regs *regs) local_irq_restore(flags); } -void __cpuinit perf_counters_lapic_init(int nmi) +void perf_counters_lapic_init(int nmi) { u32 apic_val; -- cgit v1.1 From bb3f0b59ad005d2d2ecbbe9bd048eab6d1ecbd31 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 25 Jan 2009 02:38:09 -0800 Subject: x86: make irqinit_32.c more like irqinit_64.c, v2 Impact: cleanup 1. add smp_intr_init and apic_intr_init for 32bit, the same as 64bit 2. move the apic_intr_init() call before set gate with interrupt[i] 3. for 64bit, if ia32_emulation is not used, will make per_cpu to use 0x80 vector. [ v2: should use !test_bit() instead of test_bit() with 32bit ] Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/irqinit_32.c | 56 ++++++++++++++++++++++++++------------------ arch/x86/kernel/irqinit_64.c | 7 +++--- arch/x86/kernel/traps.c | 15 +++++------- 3 files changed, 43 insertions(+), 35 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c index c56496f..ddf3eb7 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c @@ -120,28 +120,8 @@ int vector_used_by_percpu_irq(unsigned int vector) return 0; } -/* Overridden in paravirt.c */ -void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); - -void __init native_init_IRQ(void) +static void __init smp_intr_init(void) { - int i; - - /* all the set up before the call gates are initialised */ - pre_intr_init_hook(); - - /* - * Cover the whole vector space, no vector can escape - * us. (some of these will be overridden and become - * 'special' SMP interrupts) - */ - for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { - /* SYSCALL_VECTOR was reserved in trap_init. */ - if (i != SYSCALL_VECTOR) - set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]); - } - - #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_SMP) /* * The reschedule interrupt is a CPU-to-CPU reschedule-helper @@ -170,8 +150,13 @@ void __init native_init_IRQ(void) set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors); #endif +} +static void __init apic_intr_init(void) +{ #ifdef CONFIG_X86_LOCAL_APIC + smp_intr_init(); + /* self generated IPI for local APIC timer */ alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); @@ -181,12 +166,37 @@ void __init native_init_IRQ(void) # ifdef CONFIG_PERF_COUNTERS alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt); # endif -#endif -#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_MCE_P4THERMAL) +# ifdef CONFIG_X86_MCE_P4THERMAL /* thermal monitor LVT interrupt */ alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); +# endif #endif +} + +/* Overridden in paravirt.c */ +void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); + +void __init native_init_IRQ(void) +{ + int i; + + /* all the set up before the call gates are initialised */ + pre_intr_init_hook(); + + apic_intr_init(); + + /* + * Cover the whole vector space, no vector can escape + * us. (some of these will be overridden and become + * 'special' SMP interrupts) + */ + for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { + int vector = FIRST_EXTERNAL_VECTOR + i; + /* SYSCALL_VECTOR was reserved in trap_init. */ + if (!test_bit(vector, used_vectors)) + set_intr_gate(vector, interrupt[i]); + } if (!acpi_ioapic) setup_irq(2, &irq2); diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c index 6a71bfc..16e1fc6 100644 --- a/arch/x86/kernel/irqinit_64.c +++ b/arch/x86/kernel/irqinit_64.c @@ -162,6 +162,9 @@ void __init native_init_IRQ(void) int i; init_ISA_irqs(); + + apic_intr_init(); + /* * Cover the whole vector space, no vector can escape * us. (some of these will be overridden and become @@ -169,12 +172,10 @@ void __init native_init_IRQ(void) */ for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { int vector = FIRST_EXTERNAL_VECTOR + i; - if (vector != IA32_SYSCALL_VECTOR) + if (!test_bit(vector, used_vectors)) set_intr_gate(vector, interrupt[i]); } - apic_intr_init(); - if (!acpi_ioapic) setup_irq(2, &irq2); } diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index ed5aee5..d36a502 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -979,8 +979,13 @@ void __init trap_init(void) #endif set_intr_gate(19, &simd_coprocessor_error); + /* Reserve all the builtin and the syscall vector: */ + for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) + set_bit(i, used_vectors); + #ifdef CONFIG_IA32_EMULATION set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall); + set_bit(IA32_SYSCALL_VECTOR, used_vectors); #endif #ifdef CONFIG_X86_32 @@ -997,17 +1002,9 @@ void __init trap_init(void) } set_system_trap_gate(SYSCALL_VECTOR, &system_call); -#endif - - /* Reserve all the builtin and the syscall vector: */ - for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) - set_bit(i, used_vectors); - -#ifdef CONFIG_X86_64 - set_bit(IA32_SYSCALL_VECTOR, used_vectors); -#else set_bit(SYSCALL_VECTOR, used_vectors); #endif + /* * Should be a barrier for any external CPU state: */ -- cgit v1.1 From 15081c61362618a0c81cc8d04e45e7427bc1ed71 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Sun, 1 Feb 2009 22:07:39 +0530 Subject: x86: irqinit_32.c fix compilation warning Fix: arch/x86/kernel/irqinit_32.c:124: warning: 'smp_intr_init' defined but not used Signed-off-by: Jaswinder Singh Rajput Signed-off-by: Ingo Molnar --- arch/x86/kernel/irqinit_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c index ddf3eb7..520e6c1 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c @@ -154,9 +154,9 @@ static void __init smp_intr_init(void) static void __init apic_intr_init(void) { -#ifdef CONFIG_X86_LOCAL_APIC smp_intr_init(); +#ifdef CONFIG_X86_LOCAL_APIC /* self generated IPI for local APIC timer */ alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); -- cgit v1.1 From 5b75af0a02fcf3b8899f38ff6f22164c5d8e2fdd Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Wed, 4 Feb 2009 17:11:34 +0100 Subject: perfcounters: fix "perf counters kill oprofile" bug With oprofile as a module, and unloaded by profiling script, both oprofile and kerneltop work fine.. unless you leave kerneltop running when you start profiling, then you may see badness. Signed-off-by: Mike Galbraith Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 46c436c..8bb2133 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -643,7 +643,9 @@ perf_counter_nmi_handler(struct notifier_block *self, } static __read_mostly struct notifier_block perf_counter_nmi_notifier = { - .notifier_call = perf_counter_nmi_handler + .notifier_call = perf_counter_nmi_handler, + .next = NULL, + .priority = 1 }; void __init init_hw_perf_counters(void) -- cgit v1.1 From d278c48435625cb6b7edcf6a547620768b175709 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Mon, 9 Feb 2009 07:38:50 +0100 Subject: perf_counters: account NMI interrupts I noticed that kerneltop interrupts were accounted as NMI, but not their perf counter origin. Account NMI performance counter interrupts. Signed-off-by: Mike Galbraith Signed-off-by: Ingo Molnar arch/x86/kernel/cpu/perf_counter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) --- arch/x86/kernel/cpu/perf_counter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 8bb2133..9901e46 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -495,6 +495,7 @@ static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi) goto out; again: + inc_irq_stat(apic_perf_irqs); ack = status; for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { struct perf_counter *counter = cpuc->counters[bit]; @@ -570,7 +571,6 @@ void perf_counter_unthrottle(void) void smp_perf_counter_interrupt(struct pt_regs *regs) { irq_enter(); - inc_irq_stat(apic_perf_irqs); apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR); __smp_perf_counter_interrupt(regs, 0); -- cgit v1.1 From 0475f9ea8e2cc030298908949e0d5da9f2fc2cfe Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 11 Feb 2009 14:35:35 +1100 Subject: perf_counters: allow users to count user, kernel and/or hypervisor events Impact: new perf_counter feature This extends the perf_counter_hw_event struct with bits that specify that events in user, kernel and/or hypervisor mode should not be counted (i.e. should be excluded), and adds code to program the PMU mode selection bits accordingly on x86 and powerpc. For software counters, we don't currently have the infrastructure to distinguish which mode an event occurs in, so we currently fail the counter initialization if the setting of the hw_event.exclude_* bits would require us to distinguish. Context switches and CPU migrations are currently considered to occur in kernel mode. On x86, this changes the previous policy that only root can count kernel events. Now non-root users can count kernel events or exclude them. Non-root users still can't use NMI events, though. On x86 we don't appear to have any way to control whether hypervisor events are counted or not, so hw_event.exclude_hv is ignored. On powerpc, the selection of whether to count events in user, kernel and/or hypervisor mode is PMU-wide, not per-counter, so this adds a check that the hw_event.exclude_* settings are the same as other events on the PMU. Counters being added to a group have to have the same settings as the other hardware counters in the group. Counters and groups can only be enabled in hw_perf_group_sched_in or power_perf_enable if they have the same settings as any other counters already on the PMU. If we are not running on a hypervisor, the exclude_hv setting is ignored (by forcing it to 0) since we can't ever get any hypervisor events. Signed-off-by: Paul Mackerras --- arch/x86/kernel/cpu/perf_counter.c | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 9901e46..383d4c6 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -107,21 +107,25 @@ static int __hw_perf_counter_init(struct perf_counter *counter) return -EINVAL; /* - * Count user events, and generate PMC IRQs: + * Generate PMC IRQs: * (keep 'enabled' bit clear for now) */ - hwc->config = ARCH_PERFMON_EVENTSEL_USR | ARCH_PERFMON_EVENTSEL_INT; + hwc->config = ARCH_PERFMON_EVENTSEL_INT; /* - * If privileged enough, count OS events too, and allow - * NMI events as well: + * Count user and OS events unless requested not to. */ - hwc->nmi = 0; - if (capable(CAP_SYS_ADMIN)) { + if (!hw_event->exclude_user) + hwc->config |= ARCH_PERFMON_EVENTSEL_USR; + if (!hw_event->exclude_kernel) hwc->config |= ARCH_PERFMON_EVENTSEL_OS; - if (hw_event->nmi) - hwc->nmi = 1; - } + + /* + * If privileged enough, allow NMI events: + */ + hwc->nmi = 0; + if (capable(CAP_SYS_ADMIN) && hw_event->nmi) + hwc->nmi = 1; hwc->irq_period = hw_event->irq_period; /* @@ -248,10 +252,13 @@ __pmc_fixed_enable(struct perf_counter *counter, int err; /* - * Enable IRQ generation (0x8) and ring-3 counting (0x2), - * and enable ring-0 counting if allowed: + * Enable IRQ generation (0x8), + * and enable ring-3 counting (0x2) and ring-0 counting (0x1) + * if requested: */ - bits = 0x8ULL | 0x2ULL; + bits = 0x8ULL; + if (hwc->config & ARCH_PERFMON_EVENTSEL_USR) + bits |= 0x2; if (hwc->config & ARCH_PERFMON_EVENTSEL_OS) bits |= 0x1; bits <<= (idx * 4); -- cgit v1.1 From b56a3802dc6df29aa27d2c12edf420258091ad66 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 27 Feb 2009 18:09:09 +0530 Subject: x86: prepare perf_counter to add more cpus Introduced struct pmc_x86_ops to add more cpus. Signed-off-by: Jaswinder Singh Rajput Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 106 +++++++++++++++++++++++++++---------- 1 file changed, 78 insertions(+), 28 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 383d4c6..a3c8852 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -3,6 +3,7 @@ * * Copyright(C) 2008 Thomas Gleixner * Copyright(C) 2008 Red Hat, Inc., Ingo Molnar + * Copyright(C) 2009 Jaswinder Singh Rajput * * For licencing details see kernel-base/COPYING */ @@ -38,10 +39,24 @@ struct cpu_hw_counters { }; /* - * Intel PerfMon v3. Used on Core2 and later. + * struct pmc_x86_ops - performance counter x86 ops */ +struct pmc_x86_ops { + u64 (*save_disable_all) (void); + void (*restore_all) (u64 ctrl); + unsigned eventsel; + unsigned perfctr; + int (*event_map) (int event); + int max_events; +}; + +static struct pmc_x86_ops *pmc_ops; + static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters); +/* + * Intel PerfMon v3. Used on Core2 and later. + */ static const int intel_perfmon_event_map[] = { [PERF_COUNT_CPU_CYCLES] = 0x003c, @@ -53,7 +68,10 @@ static const int intel_perfmon_event_map[] = [PERF_COUNT_BUS_CYCLES] = 0x013c, }; -static const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map); +static int pmc_intel_event_map(int event) +{ + return intel_perfmon_event_map[event]; +} /* * Propagate counter elapsed time into the generic counter. @@ -144,38 +162,48 @@ static int __hw_perf_counter_init(struct perf_counter *counter) if (hw_event->raw) { hwc->config |= hw_event->type; } else { - if (hw_event->type >= max_intel_perfmon_events) + if (hw_event->type >= pmc_ops->max_events) return -EINVAL; /* * The generic map: */ - hwc->config |= intel_perfmon_event_map[hw_event->type]; + hwc->config |= pmc_ops->event_map(hw_event->type); } counter->wakeup_pending = 0; return 0; } -u64 hw_perf_save_disable(void) +static u64 pmc_intel_save_disable_all(void) { u64 ctrl; - if (unlikely(!perf_counters_initialized)) - return 0; - rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); return ctrl; } + +u64 hw_perf_save_disable(void) +{ + if (unlikely(!perf_counters_initialized)) + return 0; + + return pmc_ops->save_disable_all(); +} EXPORT_SYMBOL_GPL(hw_perf_save_disable); +static void pmc_intel_restore_all(u64 ctrl) +{ + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); +} + void hw_perf_restore(u64 ctrl) { if (unlikely(!perf_counters_initialized)) return; - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); + pmc_ops->restore_all(ctrl); } EXPORT_SYMBOL_GPL(hw_perf_restore); @@ -291,11 +319,11 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc) event = hwc->config & ARCH_PERFMON_EVENT_MASK; - if (unlikely(event == intel_perfmon_event_map[PERF_COUNT_INSTRUCTIONS])) + if (unlikely(event == pmc_ops->event_map(PERF_COUNT_INSTRUCTIONS))) return X86_PMC_IDX_FIXED_INSTRUCTIONS; - if (unlikely(event == intel_perfmon_event_map[PERF_COUNT_CPU_CYCLES])) + if (unlikely(event == pmc_ops->event_map(PERF_COUNT_CPU_CYCLES))) return X86_PMC_IDX_FIXED_CPU_CYCLES; - if (unlikely(event == intel_perfmon_event_map[PERF_COUNT_BUS_CYCLES])) + if (unlikely(event == pmc_ops->event_map(PERF_COUNT_BUS_CYCLES))) return X86_PMC_IDX_FIXED_BUS_CYCLES; return -1; @@ -339,8 +367,8 @@ try_generic: set_bit(idx, cpuc->used); hwc->idx = idx; } - hwc->config_base = MSR_ARCH_PERFMON_EVENTSEL0; - hwc->counter_base = MSR_ARCH_PERFMON_PERFCTR0; + hwc->config_base = pmc_ops->eventsel; + hwc->counter_base = pmc_ops->perfctr; } perf_counters_lapic_init(hwc->nmi); @@ -386,8 +414,8 @@ void perf_counter_print_debug(void) printk(KERN_INFO "CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used); for (idx = 0; idx < nr_counters_generic; idx++) { - rdmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, pmc_ctrl); - rdmsrl(MSR_ARCH_PERFMON_PERFCTR0 + idx, pmc_count); + rdmsrl(pmc_ops->eventsel + idx, pmc_ctrl); + rdmsrl(pmc_ops->perfctr + idx, pmc_count); prev_left = per_cpu(prev_left[idx], cpu); @@ -655,29 +683,56 @@ static __read_mostly struct notifier_block perf_counter_nmi_notifier = { .priority = 1 }; -void __init init_hw_perf_counters(void) +static struct pmc_x86_ops pmc_intel_ops = { + .save_disable_all = pmc_intel_save_disable_all, + .restore_all = pmc_intel_restore_all, + .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, + .perfctr = MSR_ARCH_PERFMON_PERFCTR0, + .event_map = pmc_intel_event_map, + .max_events = ARRAY_SIZE(intel_perfmon_event_map), +}; + +static struct pmc_x86_ops *pmc_intel_init(void) { union cpuid10_eax eax; unsigned int ebx; unsigned int unused; union cpuid10_edx edx; - if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) - return; - /* * Check whether the Architectural PerfMon supports * Branch Misses Retired Event or not. */ cpuid(10, &eax.full, &ebx, &unused, &edx.full); if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED) - return; + return NULL; printk(KERN_INFO "Intel Performance Monitoring support detected.\n"); - printk(KERN_INFO "... version: %d\n", eax.split.version_id); - printk(KERN_INFO "... num counters: %d\n", eax.split.num_counters); + printk(KERN_INFO "... bit width: %d\n", eax.split.bit_width); + printk(KERN_INFO "... mask length: %d\n", eax.split.mask_length); + nr_counters_generic = eax.split.num_counters; + nr_counters_fixed = edx.split.num_counters_fixed; + counter_value_mask = (1ULL << eax.split.bit_width) - 1; + + return &pmc_intel_ops; +} + +void __init init_hw_perf_counters(void) +{ + if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) + return; + + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_INTEL: + pmc_ops = pmc_intel_init(); + break; + } + if (!pmc_ops) + return; + + printk(KERN_INFO "... num counters: %d\n", nr_counters_generic); if (nr_counters_generic > X86_PMC_MAX_GENERIC) { nr_counters_generic = X86_PMC_MAX_GENERIC; WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!", @@ -686,13 +741,8 @@ void __init init_hw_perf_counters(void) perf_counter_mask = (1 << nr_counters_generic) - 1; perf_max_counters = nr_counters_generic; - printk(KERN_INFO "... bit width: %d\n", eax.split.bit_width); - counter_value_mask = (1ULL << eax.split.bit_width) - 1; printk(KERN_INFO "... value mask: %016Lx\n", counter_value_mask); - printk(KERN_INFO "... mask length: %d\n", eax.split.mask_length); - - nr_counters_fixed = edx.split.num_counters_fixed; if (nr_counters_fixed > X86_PMC_MAX_FIXED) { nr_counters_fixed = X86_PMC_MAX_FIXED; WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!", -- cgit v1.1 From f87ad35d37fa543925210550f7db20a54c83ed70 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 27 Feb 2009 20:15:14 +0530 Subject: x86: AMD Support for perf_counter Supported basic performance counter for AMD K7 and later: $ perfstat -e 0,1,2,3,4,5,-1,-2,-3,-4,-5 ls > /dev/null Performance counter stats for 'ls': 12.298610 task clock ticks (msecs) 3298477 CPU cycles (events) 1406354 instructions (events) 749035 cache references (events) 16939 cache misses (events) 100589 branches (events) 11159 branch misses (events) 7.627540 cpu clock ticks (msecs) 12.298610 task clock ticks (msecs) 500 pagefaults (events) 6 context switches (events) 3 CPU migrations (events) Wall-clock time elapsed: 8.672290 msecs Signed-off-by: Jaswinder Singh Rajput Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/amd.c | 4 ++ arch/x86/kernel/cpu/perf_counter.c | 83 +++++++++++++++++++++++++++++++++++++- 2 files changed, 85 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 25423a5..edcde52 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -368,6 +368,10 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) if (c->x86 >= 6) set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK); + /* Enable Performance counter for K7 and later */ + if (c->x86 > 6 && c->x86 <= 0x11) + set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON); + if (!c->x86_model_id[0]) { switch (c->x86) { case 0xf: diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index a3c8852..266618a 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -74,6 +74,24 @@ static int pmc_intel_event_map(int event) } /* + * AMD Performance Monitor K7 and later. + */ +static const int amd_perfmon_event_map[] = +{ + [PERF_COUNT_CPU_CYCLES] = 0x0076, + [PERF_COUNT_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_CACHE_REFERENCES] = 0x0080, + [PERF_COUNT_CACHE_MISSES] = 0x0081, + [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x00c4, + [PERF_COUNT_BRANCH_MISSES] = 0x00c5, +}; + +static int pmc_amd_event_map(int event) +{ + return amd_perfmon_event_map[event]; +} + +/* * Propagate counter elapsed time into the generic counter. * Can only be executed on the CPU where the counter is active. * Returns the delta events processed. @@ -151,8 +169,9 @@ static int __hw_perf_counter_init(struct perf_counter *counter) * so we install an artificial 1<<31 period regardless of * the generic counter period: */ - if ((s64)hwc->irq_period <= 0 || hwc->irq_period > 0x7FFFFFFF) - hwc->irq_period = 0x7FFFFFFF; + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) + if ((s64)hwc->irq_period <= 0 || hwc->irq_period > 0x7FFFFFFF) + hwc->irq_period = 0x7FFFFFFF; atomic64_set(&hwc->period_left, hwc->irq_period); @@ -184,6 +203,22 @@ static u64 pmc_intel_save_disable_all(void) return ctrl; } +static u64 pmc_amd_save_disable_all(void) +{ + int idx; + u64 val, ctrl = 0; + + for (idx = 0; idx < nr_counters_generic; idx++) { + rdmsrl(MSR_K7_EVNTSEL0 + idx, val); + if (val & ARCH_PERFMON_EVENTSEL0_ENABLE) + ctrl |= (1 << idx); + val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsrl(MSR_K7_EVNTSEL0 + idx, val); + } + + return ctrl; +} + u64 hw_perf_save_disable(void) { if (unlikely(!perf_counters_initialized)) @@ -198,6 +233,20 @@ static void pmc_intel_restore_all(u64 ctrl) wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); } +static void pmc_amd_restore_all(u64 ctrl) +{ + u64 val; + int idx; + + for (idx = 0; idx < nr_counters_generic; idx++) { + if (ctrl & (1 << idx)) { + rdmsrl(MSR_K7_EVNTSEL0 + idx, val); + val |= ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsrl(MSR_K7_EVNTSEL0 + idx, val); + } + } +} + void hw_perf_restore(u64 ctrl) { if (unlikely(!perf_counters_initialized)) @@ -314,6 +363,9 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc) { unsigned int event; + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) + return -1; + if (unlikely(hwc->nmi)) return -1; @@ -401,6 +453,7 @@ void perf_counter_print_debug(void) cpu = smp_processor_id(); cpuc = &per_cpu(cpu_hw_counters, cpu); + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow); @@ -411,6 +464,7 @@ void perf_counter_print_debug(void) printk(KERN_INFO "CPU#%d: status: %016llx\n", cpu, status); printk(KERN_INFO "CPU#%d: overflow: %016llx\n", cpu, overflow); printk(KERN_INFO "CPU#%d: fixed: %016llx\n", cpu, fixed); + } printk(KERN_INFO "CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used); for (idx = 0; idx < nr_counters_generic; idx++) { @@ -588,6 +642,9 @@ void perf_counter_unthrottle(void) if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) return; + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) + return; + if (unlikely(!perf_counters_initialized)) return; @@ -692,6 +749,15 @@ static struct pmc_x86_ops pmc_intel_ops = { .max_events = ARRAY_SIZE(intel_perfmon_event_map), }; +static struct pmc_x86_ops pmc_amd_ops = { + .save_disable_all = pmc_amd_save_disable_all, + .restore_all = pmc_amd_restore_all, + .eventsel = MSR_K7_EVNTSEL0, + .perfctr = MSR_K7_PERFCTR0, + .event_map = pmc_amd_event_map, + .max_events = ARRAY_SIZE(amd_perfmon_event_map), +}; + static struct pmc_x86_ops *pmc_intel_init(void) { union cpuid10_eax eax; @@ -719,6 +785,16 @@ static struct pmc_x86_ops *pmc_intel_init(void) return &pmc_intel_ops; } +static struct pmc_x86_ops *pmc_amd_init(void) +{ + nr_counters_generic = 4; + nr_counters_fixed = 0; + + printk(KERN_INFO "AMD Performance Monitoring support detected.\n"); + + return &pmc_amd_ops; +} + void __init init_hw_perf_counters(void) { if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) @@ -728,6 +804,9 @@ void __init init_hw_perf_counters(void) case X86_VENDOR_INTEL: pmc_ops = pmc_intel_init(); break; + case X86_VENDOR_AMD: + pmc_ops = pmc_amd_init(); + break; } if (!pmc_ops) return; -- cgit v1.1 From 169e41eb7f5464c077a7e0e129f025759d04cc54 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Sat, 28 Feb 2009 18:37:49 +0530 Subject: x86: decent declarations in perf_counter.c Impact: cleanup making decent declrations for struct pmc_x86_ops and fix checkpatch error: ERROR: Macros with complex values should be enclosed in parenthesis Signed-off-by: Jaswinder Singh Rajput Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 266618a..a1f3646 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -42,12 +42,12 @@ struct cpu_hw_counters { * struct pmc_x86_ops - performance counter x86 ops */ struct pmc_x86_ops { - u64 (*save_disable_all) (void); - void (*restore_all) (u64 ctrl); - unsigned eventsel; - unsigned perfctr; - int (*event_map) (int event); - int max_events; + u64 (*save_disable_all)(void); + void (*restore_all)(u64 ctrl); + unsigned eventsel; + unsigned perfctr; + int (*event_map)(int event); + int max_events; }; static struct pmc_x86_ops *pmc_ops; @@ -561,7 +561,7 @@ perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown) /* * Maximum interrupt frequency of 100KHz per CPU */ -#define PERFMON_MAX_INTERRUPTS 100000/HZ +#define PERFMON_MAX_INTERRUPTS (100000/HZ) /* * This handler is triggered by the local APIC, so the APIC IRQ handling -- cgit v1.1 From a1ef58f442542d8b3e3b963339fbc522c36e827c Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Sat, 28 Feb 2009 18:45:39 +0530 Subject: x86: use pr_info in perf_counter.c Impact: cleanup using pr_info in perf_counter.c fixes various 80 characters warnings and also indenting for conditional statement Signed-off-by: Jaswinder Singh Rajput Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 48 +++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 24 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index a1f3646..3b65f19 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -454,18 +454,18 @@ void perf_counter_print_debug(void) cpuc = &per_cpu(cpu_hw_counters, cpu); if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { - rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); - rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); - rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow); - rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed); - - printk(KERN_INFO "\n"); - printk(KERN_INFO "CPU#%d: ctrl: %016llx\n", cpu, ctrl); - printk(KERN_INFO "CPU#%d: status: %016llx\n", cpu, status); - printk(KERN_INFO "CPU#%d: overflow: %016llx\n", cpu, overflow); - printk(KERN_INFO "CPU#%d: fixed: %016llx\n", cpu, fixed); + rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); + rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow); + rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed); + + pr_info("\n"); + pr_info("CPU#%d: ctrl: %016llx\n", cpu, ctrl); + pr_info("CPU#%d: status: %016llx\n", cpu, status); + pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow); + pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed); } - printk(KERN_INFO "CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used); + pr_info("CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used); for (idx = 0; idx < nr_counters_generic; idx++) { rdmsrl(pmc_ops->eventsel + idx, pmc_ctrl); @@ -473,17 +473,17 @@ void perf_counter_print_debug(void) prev_left = per_cpu(prev_left[idx], cpu); - printk(KERN_INFO "CPU#%d: gen-PMC%d ctrl: %016llx\n", + pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n", cpu, idx, pmc_ctrl); - printk(KERN_INFO "CPU#%d: gen-PMC%d count: %016llx\n", + pr_info("CPU#%d: gen-PMC%d count: %016llx\n", cpu, idx, pmc_count); - printk(KERN_INFO "CPU#%d: gen-PMC%d left: %016llx\n", + pr_info("CPU#%d: gen-PMC%d left: %016llx\n", cpu, idx, prev_left); } for (idx = 0; idx < nr_counters_fixed; idx++) { rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count); - printk(KERN_INFO "CPU#%d: fixed-PMC%d count: %016llx\n", + pr_info("CPU#%d: fixed-PMC%d count: %016llx\n", cpu, idx, pmc_count); } local_irq_enable(); @@ -773,10 +773,10 @@ static struct pmc_x86_ops *pmc_intel_init(void) if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED) return NULL; - printk(KERN_INFO "Intel Performance Monitoring support detected.\n"); - printk(KERN_INFO "... version: %d\n", eax.split.version_id); - printk(KERN_INFO "... bit width: %d\n", eax.split.bit_width); - printk(KERN_INFO "... mask length: %d\n", eax.split.mask_length); + pr_info("Intel Performance Monitoring support detected.\n"); + pr_info("... version: %d\n", eax.split.version_id); + pr_info("... bit width: %d\n", eax.split.bit_width); + pr_info("... mask length: %d\n", eax.split.mask_length); nr_counters_generic = eax.split.num_counters; nr_counters_fixed = edx.split.num_counters_fixed; @@ -790,7 +790,7 @@ static struct pmc_x86_ops *pmc_amd_init(void) nr_counters_generic = 4; nr_counters_fixed = 0; - printk(KERN_INFO "AMD Performance Monitoring support detected.\n"); + pr_info("AMD Performance Monitoring support detected.\n"); return &pmc_amd_ops; } @@ -811,7 +811,7 @@ void __init init_hw_perf_counters(void) if (!pmc_ops) return; - printk(KERN_INFO "... num counters: %d\n", nr_counters_generic); + pr_info("... num counters: %d\n", nr_counters_generic); if (nr_counters_generic > X86_PMC_MAX_GENERIC) { nr_counters_generic = X86_PMC_MAX_GENERIC; WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!", @@ -820,18 +820,18 @@ void __init init_hw_perf_counters(void) perf_counter_mask = (1 << nr_counters_generic) - 1; perf_max_counters = nr_counters_generic; - printk(KERN_INFO "... value mask: %016Lx\n", counter_value_mask); + pr_info("... value mask: %016Lx\n", counter_value_mask); if (nr_counters_fixed > X86_PMC_MAX_FIXED) { nr_counters_fixed = X86_PMC_MAX_FIXED; WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!", nr_counters_fixed, X86_PMC_MAX_FIXED); } - printk(KERN_INFO "... fixed counters: %d\n", nr_counters_fixed); + pr_info("... fixed counters: %d\n", nr_counters_fixed); perf_counter_mask |= ((1LL << nr_counters_fixed)-1) << X86_PMC_IDX_FIXED; - printk(KERN_INFO "... counter mask: %016Lx\n", perf_counter_mask); + pr_info("... counter mask: %016Lx\n", perf_counter_mask); perf_counters_initialized = true; perf_counters_lapic_init(0); -- cgit v1.1 From b0f3f28e0f14eb335f67bfaae33ce8b8d74fd58b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 5 Mar 2009 18:08:27 +0100 Subject: perfcounters: IRQ and NMI support on AMD CPUs The below completes the K7+ performance counter support: - IRQ support - NMI support KernelTop output works now as well. Signed-off-by: Peter Zijlstra Cc: Jaswinder Singh Rajput Cc: Paul Mackerras LKML-Reference: <1236273633.5187.286.camel@laptop> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 272 +++++++++++++++++++++++++++++++------ 1 file changed, 228 insertions(+), 44 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 3b65f19..6ebe9ab 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -28,6 +28,7 @@ static bool perf_counters_initialized __read_mostly; static int nr_counters_generic __read_mostly; static u64 perf_counter_mask __read_mostly; static u64 counter_value_mask __read_mostly; +static int counter_value_bits __read_mostly; static int nr_counters_fixed __read_mostly; @@ -35,7 +36,9 @@ struct cpu_hw_counters { struct perf_counter *counters[X86_PMC_IDX_MAX]; unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; unsigned long interrupts; - u64 global_enable; + u64 throttle_ctrl; + u64 active_mask; + int enabled; }; /* @@ -43,21 +46,28 @@ struct cpu_hw_counters { */ struct pmc_x86_ops { u64 (*save_disable_all)(void); - void (*restore_all)(u64 ctrl); + void (*restore_all)(u64); + u64 (*get_status)(u64); + void (*ack_status)(u64); + void (*enable)(int, u64); + void (*disable)(int, u64); unsigned eventsel; unsigned perfctr; - int (*event_map)(int event); + u64 (*event_map)(int); + u64 (*raw_event)(u64); int max_events; }; static struct pmc_x86_ops *pmc_ops; -static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters); +static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = { + .enabled = 1, +}; /* * Intel PerfMon v3. Used on Core2 and later. */ -static const int intel_perfmon_event_map[] = +static const u64 intel_perfmon_event_map[] = { [PERF_COUNT_CPU_CYCLES] = 0x003c, [PERF_COUNT_INSTRUCTIONS] = 0x00c0, @@ -68,15 +78,29 @@ static const int intel_perfmon_event_map[] = [PERF_COUNT_BUS_CYCLES] = 0x013c, }; -static int pmc_intel_event_map(int event) +static u64 pmc_intel_event_map(int event) { return intel_perfmon_event_map[event]; } +static u64 pmc_intel_raw_event(u64 event) +{ +#define CORE_EVNTSEL_EVENT_MASK 0x000000FF +#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00 +#define CORE_EVNTSEL_COUNTER_MASK 0xFF000000 + +#define CORE_EVNTSEL_MASK \ + (CORE_EVNTSEL_EVENT_MASK | \ + CORE_EVNTSEL_UNIT_MASK | \ + CORE_EVNTSEL_COUNTER_MASK) + + return event & CORE_EVNTSEL_MASK; +} + /* * AMD Performance Monitor K7 and later. */ -static const int amd_perfmon_event_map[] = +static const u64 amd_perfmon_event_map[] = { [PERF_COUNT_CPU_CYCLES] = 0x0076, [PERF_COUNT_INSTRUCTIONS] = 0x00c0, @@ -86,11 +110,25 @@ static const int amd_perfmon_event_map[] = [PERF_COUNT_BRANCH_MISSES] = 0x00c5, }; -static int pmc_amd_event_map(int event) +static u64 pmc_amd_event_map(int event) { return amd_perfmon_event_map[event]; } +static u64 pmc_amd_raw_event(u64 event) +{ +#define K7_EVNTSEL_EVENT_MASK 0x7000000FF +#define K7_EVNTSEL_UNIT_MASK 0x00000FF00 +#define K7_EVNTSEL_COUNTER_MASK 0x0FF000000 + +#define K7_EVNTSEL_MASK \ + (K7_EVNTSEL_EVENT_MASK | \ + K7_EVNTSEL_UNIT_MASK | \ + K7_EVNTSEL_COUNTER_MASK) + + return event & K7_EVNTSEL_MASK; +} + /* * Propagate counter elapsed time into the generic counter. * Can only be executed on the CPU where the counter is active. @@ -179,7 +217,7 @@ static int __hw_perf_counter_init(struct perf_counter *counter) * Raw event type provide the config in the event structure */ if (hw_event->raw) { - hwc->config |= hw_event->type; + hwc->config |= pmc_ops->raw_event(hw_event->type); } else { if (hw_event->type >= pmc_ops->max_events) return -EINVAL; @@ -205,18 +243,24 @@ static u64 pmc_intel_save_disable_all(void) static u64 pmc_amd_save_disable_all(void) { - int idx; - u64 val, ctrl = 0; + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + int enabled, idx; + + enabled = cpuc->enabled; + cpuc->enabled = 0; + barrier(); for (idx = 0; idx < nr_counters_generic; idx++) { + u64 val; + rdmsrl(MSR_K7_EVNTSEL0 + idx, val); - if (val & ARCH_PERFMON_EVENTSEL0_ENABLE) - ctrl |= (1 << idx); - val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; - wrmsrl(MSR_K7_EVNTSEL0 + idx, val); + if (val & ARCH_PERFMON_EVENTSEL0_ENABLE) { + val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsrl(MSR_K7_EVNTSEL0 + idx, val); + } } - return ctrl; + return enabled; } u64 hw_perf_save_disable(void) @@ -226,6 +270,9 @@ u64 hw_perf_save_disable(void) return pmc_ops->save_disable_all(); } +/* + * Exported because of ACPI idle + */ EXPORT_SYMBOL_GPL(hw_perf_save_disable); static void pmc_intel_restore_all(u64 ctrl) @@ -235,11 +282,18 @@ static void pmc_intel_restore_all(u64 ctrl) static void pmc_amd_restore_all(u64 ctrl) { - u64 val; + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); int idx; + cpuc->enabled = ctrl; + barrier(); + if (!ctrl) + return; + for (idx = 0; idx < nr_counters_generic; idx++) { - if (ctrl & (1 << idx)) { + if (test_bit(idx, (unsigned long *)&cpuc->active_mask)) { + u64 val; + rdmsrl(MSR_K7_EVNTSEL0 + idx, val); val |= ARCH_PERFMON_EVENTSEL0_ENABLE; wrmsrl(MSR_K7_EVNTSEL0 + idx, val); @@ -254,8 +308,112 @@ void hw_perf_restore(u64 ctrl) pmc_ops->restore_all(ctrl); } +/* + * Exported because of ACPI idle + */ EXPORT_SYMBOL_GPL(hw_perf_restore); +static u64 pmc_intel_get_status(u64 mask) +{ + u64 status; + + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); + + return status; +} + +static u64 pmc_amd_get_status(u64 mask) +{ + u64 status = 0; + int idx; + + for (idx = 0; idx < nr_counters_generic; idx++) { + s64 val; + + if (!(mask & (1 << idx))) + continue; + + rdmsrl(MSR_K7_PERFCTR0 + idx, val); + val <<= (64 - counter_value_bits); + if (val >= 0) + status |= (1 << idx); + } + + return status; +} + +static u64 hw_perf_get_status(u64 mask) +{ + if (unlikely(!perf_counters_initialized)) + return 0; + + return pmc_ops->get_status(mask); +} + +static void pmc_intel_ack_status(u64 ack) +{ + wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); +} + +static void pmc_amd_ack_status(u64 ack) +{ +} + +static void hw_perf_ack_status(u64 ack) +{ + if (unlikely(!perf_counters_initialized)) + return; + + pmc_ops->ack_status(ack); +} + +static void pmc_intel_enable(int idx, u64 config) +{ + wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, + config | ARCH_PERFMON_EVENTSEL0_ENABLE); +} + +static void pmc_amd_enable(int idx, u64 config) +{ + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + + set_bit(idx, (unsigned long *)&cpuc->active_mask); + if (cpuc->enabled) + config |= ARCH_PERFMON_EVENTSEL0_ENABLE; + + wrmsrl(MSR_K7_EVNTSEL0 + idx, config); +} + +static void hw_perf_enable(int idx, u64 config) +{ + if (unlikely(!perf_counters_initialized)) + return; + + pmc_ops->enable(idx, config); +} + +static void pmc_intel_disable(int idx, u64 config) +{ + wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, config); +} + +static void pmc_amd_disable(int idx, u64 config) +{ + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + + clear_bit(idx, (unsigned long *)&cpuc->active_mask); + wrmsrl(MSR_K7_EVNTSEL0 + idx, config); + +} + +static void hw_perf_disable(int idx, u64 config) +{ + if (unlikely(!perf_counters_initialized)) + return; + + pmc_ops->disable(idx, config); +} + static inline void __pmc_fixed_disable(struct perf_counter *counter, struct hw_perf_counter *hwc, unsigned int __idx) @@ -278,7 +436,7 @@ __pmc_generic_disable(struct perf_counter *counter, if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) __pmc_fixed_disable(counter, hwc, idx); else - wrmsr_safe(hwc->config_base + idx, hwc->config, 0); + hw_perf_disable(idx, hwc->config); } static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]); @@ -354,8 +512,7 @@ __pmc_generic_enable(struct perf_counter *counter, if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) __pmc_fixed_enable(counter, hwc, idx); else - wrmsr(hwc->config_base + idx, - hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE, 0); + hw_perf_enable(idx, hwc->config); } static int @@ -567,22 +724,20 @@ perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown) * This handler is triggered by the local APIC, so the APIC IRQ handling * rules apply: */ -static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi) +static int __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi) { int bit, cpu = smp_processor_id(); u64 ack, status; struct cpu_hw_counters *cpuc = &per_cpu(cpu_hw_counters, cpu); + int ret = 0; - rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable); - - /* Disable counters globally */ - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); - ack_APIC_irq(); + cpuc->throttle_ctrl = hw_perf_save_disable(); - rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); + status = hw_perf_get_status(cpuc->throttle_ctrl); if (!status) goto out; + ret = 1; again: inc_irq_stat(apic_perf_irqs); ack = status; @@ -618,12 +773,12 @@ again: } } - wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); + hw_perf_ack_status(ack); /* * Repeat if there is more work to be done: */ - rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); + status = hw_perf_get_status(cpuc->throttle_ctrl); if (status) goto again; out: @@ -631,32 +786,27 @@ out: * Restore - do not reenable when global enable is off or throttled: */ if (++cpuc->interrupts < PERFMON_MAX_INTERRUPTS) - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable); + hw_perf_restore(cpuc->throttle_ctrl); + + return ret; } void perf_counter_unthrottle(void) { struct cpu_hw_counters *cpuc; - u64 global_enable; if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) return; - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) - return; - if (unlikely(!perf_counters_initialized)) return; - cpuc = &per_cpu(cpu_hw_counters, smp_processor_id()); + cpuc = &__get_cpu_var(cpu_hw_counters); if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) { if (printk_ratelimit()) printk(KERN_WARNING "PERFMON: max interrupts exceeded!\n"); - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable); + hw_perf_restore(cpuc->throttle_ctrl); } - rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, global_enable); - if (unlikely(cpuc->global_enable && !global_enable)) - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable); cpuc->interrupts = 0; } @@ -664,8 +814,8 @@ void smp_perf_counter_interrupt(struct pt_regs *regs) { irq_enter(); apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR); + ack_APIC_irq(); __smp_perf_counter_interrupt(regs, 0); - irq_exit(); } @@ -722,16 +872,23 @@ perf_counter_nmi_handler(struct notifier_block *self, { struct die_args *args = __args; struct pt_regs *regs; + int ret; + + switch (cmd) { + case DIE_NMI: + case DIE_NMI_IPI: + break; - if (likely(cmd != DIE_NMI_IPI)) + default: return NOTIFY_DONE; + } regs = args->regs; apic_write(APIC_LVTPC, APIC_DM_NMI); - __smp_perf_counter_interrupt(regs, 1); + ret = __smp_perf_counter_interrupt(regs, 1); - return NOTIFY_STOP; + return ret ? NOTIFY_STOP : NOTIFY_OK; } static __read_mostly struct notifier_block perf_counter_nmi_notifier = { @@ -743,18 +900,28 @@ static __read_mostly struct notifier_block perf_counter_nmi_notifier = { static struct pmc_x86_ops pmc_intel_ops = { .save_disable_all = pmc_intel_save_disable_all, .restore_all = pmc_intel_restore_all, + .get_status = pmc_intel_get_status, + .ack_status = pmc_intel_ack_status, + .enable = pmc_intel_enable, + .disable = pmc_intel_disable, .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, .perfctr = MSR_ARCH_PERFMON_PERFCTR0, .event_map = pmc_intel_event_map, + .raw_event = pmc_intel_raw_event, .max_events = ARRAY_SIZE(intel_perfmon_event_map), }; static struct pmc_x86_ops pmc_amd_ops = { .save_disable_all = pmc_amd_save_disable_all, .restore_all = pmc_amd_restore_all, + .get_status = pmc_amd_get_status, + .ack_status = pmc_amd_ack_status, + .enable = pmc_amd_enable, + .disable = pmc_amd_disable, .eventsel = MSR_K7_EVNTSEL0, .perfctr = MSR_K7_PERFCTR0, .event_map = pmc_amd_event_map, + .raw_event = pmc_amd_raw_event, .max_events = ARRAY_SIZE(amd_perfmon_event_map), }; @@ -787,8 +954,25 @@ static struct pmc_x86_ops *pmc_intel_init(void) static struct pmc_x86_ops *pmc_amd_init(void) { + u64 old; + int bits; + nr_counters_generic = 4; nr_counters_fixed = 0; + counter_value_mask = ~0ULL; + + rdmsrl(MSR_K7_PERFCTR0, old); + wrmsrl(MSR_K7_PERFCTR0, counter_value_mask); + /* + * read the truncated mask + */ + rdmsrl(MSR_K7_PERFCTR0, counter_value_mask); + wrmsrl(MSR_K7_PERFCTR0, old); + + bits = 32 + fls(counter_value_mask >> 32); + if (bits == 32) + bits = fls((u32)counter_value_mask); + counter_value_bits = bits; pr_info("AMD Performance Monitoring support detected.\n"); -- cgit v1.1 From b5e8acf66ff5db707c7e08df49fdf6b415878442 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 5 Mar 2009 20:34:21 +0100 Subject: perfcounters: IRQ and NMI support on AMD CPUs, fix The BKGD suggests that counter width on AMD CPUs is 48 for all existing models (it certainly is for mine). Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 6ebe9ab..f585371 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -959,20 +959,8 @@ static struct pmc_x86_ops *pmc_amd_init(void) nr_counters_generic = 4; nr_counters_fixed = 0; - counter_value_mask = ~0ULL; - - rdmsrl(MSR_K7_PERFCTR0, old); - wrmsrl(MSR_K7_PERFCTR0, counter_value_mask); - /* - * read the truncated mask - */ - rdmsrl(MSR_K7_PERFCTR0, counter_value_mask); - wrmsrl(MSR_K7_PERFCTR0, old); - - bits = 32 + fls(counter_value_mask >> 32); - if (bits == 32) - bits = fls((u32)counter_value_mask); - counter_value_bits = bits; + counter_value_mask = 0x0000FFFFFFFFFFFFULL; + counter_value_bits = 48; pr_info("AMD Performance Monitoring support detected.\n"); -- cgit v1.1 From 184fe4ab1f2e4dfa45584889bb3820031648386b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sun, 8 Mar 2009 11:34:19 +0100 Subject: x86: perf_counter cleanup Use and actual unsigned long bitmap instead of casting our way around. Signed-off-by: Peter Zijlstra Cc: Jaswinder Singh Rajput LKML-Reference: <1236508459.22914.3645.camel@twins> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index f585371..1df4210 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -37,7 +37,7 @@ struct cpu_hw_counters { unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; unsigned long interrupts; u64 throttle_ctrl; - u64 active_mask; + unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; int enabled; }; @@ -291,7 +291,7 @@ static void pmc_amd_restore_all(u64 ctrl) return; for (idx = 0; idx < nr_counters_generic; idx++) { - if (test_bit(idx, (unsigned long *)&cpuc->active_mask)) { + if (test_bit(idx, cpuc->active_mask)) { u64 val; rdmsrl(MSR_K7_EVNTSEL0 + idx, val); @@ -377,7 +377,7 @@ static void pmc_amd_enable(int idx, u64 config) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - set_bit(idx, (unsigned long *)&cpuc->active_mask); + set_bit(idx, cpuc->active_mask); if (cpuc->enabled) config |= ARCH_PERFMON_EVENTSEL0_ENABLE; @@ -401,7 +401,7 @@ static void pmc_amd_disable(int idx, u64 config) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - clear_bit(idx, (unsigned long *)&cpuc->active_mask); + clear_bit(idx, cpuc->active_mask); wrmsrl(MSR_K7_EVNTSEL0 + idx, config); } -- cgit v1.1 From e255357764f92afcafafbd4879b222b8c752065a Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Sun, 8 Mar 2009 17:09:49 +0530 Subject: x86: perf_counter cleanup Remove unused variables and duplicate header file. Signed-off-by: Jaswinder Singh Rajput Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 1df4210..155bc3c 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -17,7 +17,6 @@ #include #include -#include #include static bool perf_counters_initialized __read_mostly; @@ -954,9 +953,6 @@ static struct pmc_x86_ops *pmc_intel_init(void) static struct pmc_x86_ops *pmc_amd_init(void) { - u64 old; - int bits; - nr_counters_generic = 4; nr_counters_fixed = 0; counter_value_mask = 0x0000FFFFFFFFFFFFULL; -- cgit v1.1 From bc44fb5f7d3e764ed7698c835a1a0f35aba2eb3d Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 13 Mar 2009 10:42:18 +0100 Subject: x86, bts: detect size of DS fields Impact: more robust DS feature enumeration Detect the size of the pointer-type fields in the DS area configuration via the DTES64 features rather than based on the cpuid. Rename a variable to denote that size to reflect that it only covers the pointer-type fields. Add more boot-time diagnostics giving the detected size and the sizes of BTS and PEBS records. Use the size of the BTS/PEBS record to indicate that the respective feature is not available (if the record size is zero). Signed-off-by: Markus Metzger LKML-Reference: <20090313104218.A30096@sedona.ch.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ds.c | 84 +++++++++++++++++++++++++++------------------------- 1 file changed, 44 insertions(+), 40 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index 87b67e3..6e5ec67 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -39,7 +39,7 @@ struct ds_configuration { /* the size of one pointer-typed field in the DS structure and in the BTS and PEBS buffers in bytes; this covers the first 8 DS fields related to buffer management. */ - unsigned char sizeof_field; + unsigned char sizeof_ptr_field; /* the size of a BTS/PEBS record in bytes */ unsigned char sizeof_rec[2]; /* a series of bit-masks to control various features indexed @@ -142,14 +142,14 @@ enum ds_qualifier { static inline unsigned long ds_get(const unsigned char *base, enum ds_qualifier qual, enum ds_field field) { - base += (ds_cfg.sizeof_field * (field + (4 * qual))); + base += (ds_cfg.sizeof_ptr_field * (field + (4 * qual))); return *(unsigned long *)base; } static inline void ds_set(unsigned char *base, enum ds_qualifier qual, enum ds_field field, unsigned long value) { - base += (ds_cfg.sizeof_field * (field + (4 * qual))); + base += (ds_cfg.sizeof_ptr_field * (field + (4 * qual))); (*(unsigned long *)base) = value; } @@ -410,7 +410,7 @@ static int ds_write(struct ds_context *context, enum ds_qualifier qual, * Later architectures use 64bit pointers throughout, whereas earlier * architectures use 32bit pointers in 32bit mode. * - * We compute the base address for the first 8 fields based on: + * We compute the base address for the fields based on: * - the field size stored in the DS configuration * - the relative field position * @@ -441,13 +441,13 @@ enum bts_field { static inline unsigned long bts_get(const char *base, enum bts_field field) { - base += (ds_cfg.sizeof_field * field); + base += (ds_cfg.sizeof_ptr_field * field); return *(unsigned long *)base; } static inline void bts_set(char *base, enum bts_field field, unsigned long val) { - base += (ds_cfg.sizeof_field * field);; + base += (ds_cfg.sizeof_ptr_field * field);; (*(unsigned long *)base) = val; } @@ -593,6 +593,10 @@ static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace, struct ds_context *context; int error; + error = -EOPNOTSUPP; + if (!ds_cfg.sizeof_rec[qual]) + goto out; + error = -EINVAL; if (!base) goto out; @@ -635,10 +639,6 @@ struct bts_tracer *ds_request_bts(struct task_struct *task, unsigned long irq; int error; - error = -EOPNOTSUPP; - if (!ds_cfg.ctl[dsf_bts]) - goto out; - /* buffer overflow notification is not yet implemented */ error = -EOPNOTSUPP; if (ovfl) @@ -848,7 +848,8 @@ const struct pebs_trace *ds_read_pebs(struct pebs_tracer *tracer) ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_pebs); tracer->trace.reset_value = - *(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8)); + *(u64 *)(tracer->ds.context->ds + + (ds_cfg.sizeof_ptr_field * 8)); return &tracer->trace; } @@ -884,7 +885,8 @@ int ds_set_pebs_reset(struct pebs_tracer *tracer, u64 value) if (!tracer) return -EINVAL; - *(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8)) = value; + *(u64 *)(tracer->ds.context->ds + + (ds_cfg.sizeof_ptr_field * 8)) = value; return 0; } @@ -894,52 +896,54 @@ static const struct ds_configuration ds_cfg_netburst = { .ctl[dsf_bts] = (1 << 2) | (1 << 3), .ctl[dsf_bts_kernel] = (1 << 5), .ctl[dsf_bts_user] = (1 << 6), - - .sizeof_field = sizeof(long), - .sizeof_rec[ds_bts] = sizeof(long) * 3, -#ifdef __i386__ - .sizeof_rec[ds_pebs] = sizeof(long) * 10, -#else - .sizeof_rec[ds_pebs] = sizeof(long) * 18, -#endif }; static const struct ds_configuration ds_cfg_pentium_m = { .name = "Pentium M", .ctl[dsf_bts] = (1 << 6) | (1 << 7), - - .sizeof_field = sizeof(long), - .sizeof_rec[ds_bts] = sizeof(long) * 3, -#ifdef __i386__ - .sizeof_rec[ds_pebs] = sizeof(long) * 10, -#else - .sizeof_rec[ds_pebs] = sizeof(long) * 18, -#endif }; static const struct ds_configuration ds_cfg_core2_atom = { .name = "Core 2/Atom", .ctl[dsf_bts] = (1 << 6) | (1 << 7), .ctl[dsf_bts_kernel] = (1 << 9), .ctl[dsf_bts_user] = (1 << 10), - - .sizeof_field = 8, - .sizeof_rec[ds_bts] = 8 * 3, - .sizeof_rec[ds_pebs] = 8 * 18, }; static void -ds_configure(const struct ds_configuration *cfg) +ds_configure(const struct ds_configuration *cfg, + struct cpuinfo_x86 *cpu) { + unsigned long nr_pebs_fields = 0; + + printk(KERN_INFO "[ds] using %s configuration\n", cfg->name); + +#ifdef __i386__ + nr_pebs_fields = 10; +#else + nr_pebs_fields = 18; +#endif + memset(&ds_cfg, 0, sizeof(ds_cfg)); ds_cfg = *cfg; - printk(KERN_INFO "[ds] using %s configuration\n", ds_cfg.name); + ds_cfg.sizeof_ptr_field = + (cpu_has(cpu, X86_FEATURE_DTES64) ? 8 : 4); - if (!cpu_has_bts) { - ds_cfg.ctl[dsf_bts] = 0; + ds_cfg.sizeof_rec[ds_bts] = ds_cfg.sizeof_ptr_field * 3; + ds_cfg.sizeof_rec[ds_pebs] = ds_cfg.sizeof_ptr_field * nr_pebs_fields; + + if (!cpu_has(cpu, X86_FEATURE_BTS)) { + ds_cfg.sizeof_rec[ds_bts] = 0; printk(KERN_INFO "[ds] bts not available\n"); } - if (!cpu_has_pebs) + if (!cpu_has(cpu, X86_FEATURE_PEBS)) { + ds_cfg.sizeof_rec[ds_pebs] = 0; printk(KERN_INFO "[ds] pebs not available\n"); + } + + printk(KERN_INFO "[ds] sizes: address: %u bit, ", + 8 * ds_cfg.sizeof_ptr_field); + printk("bts/pebs record: %u/%u bytes\n", + ds_cfg.sizeof_rec[ds_bts], ds_cfg.sizeof_rec[ds_pebs]); WARN_ON_ONCE(MAX_SIZEOF_DS < (12 * ds_cfg.sizeof_field)); } @@ -951,12 +955,12 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) switch (c->x86_model) { case 0x9: case 0xd: /* Pentium M */ - ds_configure(&ds_cfg_pentium_m); + ds_configure(&ds_cfg_pentium_m, c); break; case 0xf: case 0x17: /* Core2 */ case 0x1c: /* Atom */ - ds_configure(&ds_cfg_core2_atom); + ds_configure(&ds_cfg_core2_atom, c); break; case 0x1a: /* i7 */ default: @@ -969,7 +973,7 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) case 0x0: case 0x1: case 0x2: /* Netburst */ - ds_configure(&ds_cfg_netburst); + ds_configure(&ds_cfg_netburst, c); break; default: /* sorry, don't know about them */ -- cgit v1.1 From 8a327f6d1b05f5ce16572b4413a5df1d0e872283 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 13 Mar 2009 10:45:07 +0100 Subject: x86, bts: add selftest for BTS Perform a selftest of branch trace store when a cpu is initialized. WARN and disable branch trace store support if the selftest fails. Signed-off-by: Markus Metzger LKML-Reference: <20090313104507.A30125@sedona.ch.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/Makefile | 1 + arch/x86/kernel/ds.c | 21 ++++ arch/x86/kernel/ds_selftest.c | 241 ++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/ds_selftest.h | 15 +++ 4 files changed, 278 insertions(+) create mode 100644 arch/x86/kernel/ds_selftest.c create mode 100644 arch/x86/kernel/ds_selftest.h (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 339ce35..a0c9e13 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -44,6 +44,7 @@ obj-y += process.o obj-y += i387.o xsave.o obj-y += ptrace.o obj-$(CONFIG_X86_DS) += ds.o +obj-$(CONFIG_X86_DS_SELFTEST) += ds_selftest.o obj-$(CONFIG_X86_32) += tls.o obj-$(CONFIG_IA32_EMULATION) += tls.o obj-y += step.o diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index 6e5ec67..51c936c 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -29,6 +29,7 @@ #include #include +#include "ds_selftest.h" /* * The configuration for a particular DS hardware implementation. @@ -940,6 +941,26 @@ ds_configure(const struct ds_configuration *cfg, printk(KERN_INFO "[ds] pebs not available\n"); } + if (ds_cfg.sizeof_rec[ds_bts]) { + int error; + + error = ds_selftest_bts(); + if (error) { + WARN(1, "[ds] selftest failed. disabling bts.\n"); + ds_cfg.sizeof_rec[ds_bts] = 0; + } + } + + if (ds_cfg.sizeof_rec[ds_pebs]) { + int error; + + error = ds_selftest_pebs(); + if (error) { + WARN(1, "[ds] selftest failed. disabling pebs.\n"); + ds_cfg.sizeof_rec[ds_pebs] = 0; + } + } + printk(KERN_INFO "[ds] sizes: address: %u bit, ", 8 * ds_cfg.sizeof_ptr_field); printk("bts/pebs record: %u/%u bytes\n", diff --git a/arch/x86/kernel/ds_selftest.c b/arch/x86/kernel/ds_selftest.c new file mode 100644 index 0000000..8c46fbf --- /dev/null +++ b/arch/x86/kernel/ds_selftest.c @@ -0,0 +1,241 @@ +/* + * Debug Store support - selftest + * + * + * Copyright (C) 2009 Intel Corporation. + * Markus Metzger , 2009 + */ + +#include "ds_selftest.h" + +#include +#include + +#include + + +#define DS_SELFTEST_BUFFER_SIZE 1021 /* Intentionally chose an odd size. */ + + +static int ds_selftest_bts_consistency(const struct bts_trace *trace) +{ + int error = 0; + + if (!trace) { + printk(KERN_CONT "failed to access trace..."); + /* Bail out. Other tests are pointless. */ + return -1; + } + + if (!trace->read) { + printk(KERN_CONT "bts read not available..."); + error = -1; + } + + /* Do some sanity checks on the trace configuration. */ + if (!trace->ds.n) { + printk(KERN_CONT "empty bts buffer..."); + error = -1; + } + if (!trace->ds.size) { + printk(KERN_CONT "bad bts trace setup..."); + error = -1; + } + if (trace->ds.end != + (char *)trace->ds.begin + (trace->ds.n * trace->ds.size)) { + printk(KERN_CONT "bad bts buffer setup..."); + error = -1; + } + if ((trace->ds.top < trace->ds.begin) || + (trace->ds.end <= trace->ds.top)) { + printk(KERN_CONT "bts top out of bounds..."); + error = -1; + } + + return error; +} + +static int ds_selftest_bts_read(struct bts_tracer *tracer, + const struct bts_trace *trace, + const void *from, const void *to) +{ + const unsigned char *at; + + /* + * Check a few things which do not belong to this test. + * They should be covered by other tests. + */ + if (!trace) + return -1; + + if (!trace->read) + return -1; + + if (to < from) + return -1; + + if (from < trace->ds.begin) + return -1; + + if (trace->ds.end < to) + return -1; + + if (!trace->ds.size) + return -1; + + /* Now to the test itself. */ + for (at = from; (void *)at < to; at += trace->ds.size) { + struct bts_struct bts; + size_t index; + int error; + + if (((void *)at - trace->ds.begin) % trace->ds.size) { + printk(KERN_CONT + "read from non-integer index..."); + return -1; + } + index = ((void *)at - trace->ds.begin) / trace->ds.size; + + memset(&bts, 0, sizeof(bts)); + error = trace->read(tracer, at, &bts); + if (error < 0) { + printk(KERN_CONT + "error reading bts trace at [%lu] (0x%p)...", + index, at); + return error; + } + + switch (bts.qualifier) { + case BTS_BRANCH: + break; + default: + printk(KERN_CONT + "unexpected bts entry %llu at [%lu] (0x%p)...", + bts.qualifier, index, at); + return -1; + } + } + + return 0; +} + +int ds_selftest_bts(void) +{ + const struct bts_trace *trace; + struct bts_tracer *tracer; + int error = 0; + void *top; + unsigned char buffer[DS_SELFTEST_BUFFER_SIZE]; + + printk(KERN_INFO "[ds] bts selftest..."); + + tracer = ds_request_bts(NULL, buffer, DS_SELFTEST_BUFFER_SIZE, + NULL, (size_t)-1, BTS_KERNEL); + if (IS_ERR(tracer)) { + error = PTR_ERR(tracer); + tracer = NULL; + + printk(KERN_CONT + "initialization failed (err: %d)...", error); + goto out; + } + + /* The return should already give us enough trace. */ + ds_suspend_bts(tracer); + + /* Let's see if we can access the trace. */ + trace = ds_read_bts(tracer); + + error = ds_selftest_bts_consistency(trace); + if (error < 0) + goto out; + + /* If everything went well, we should have a few trace entries. */ + if (trace->ds.top == trace->ds.begin) { + /* + * It is possible but highly unlikely that we got a + * buffer overflow and end up at exactly the same + * position we started from. + * Let's issue a warning, but continue. + */ + printk(KERN_CONT "no trace/overflow..."); + } + + /* Let's try to read the trace we collected. */ + error = ds_selftest_bts_read(tracer, trace, + trace->ds.begin, trace->ds.top); + if (error < 0) + goto out; + + /* + * Let's read the trace again. + * Since we suspended tracing, we should get the same result. + */ + top = trace->ds.top; + + trace = ds_read_bts(tracer); + error = ds_selftest_bts_consistency(trace); + if (error < 0) + goto out; + + if (top != trace->ds.top) { + printk(KERN_CONT "suspend not working..."); + error = -1; + goto out; + } + + /* Let's collect some more trace - see if resume is working. */ + ds_resume_bts(tracer); + ds_suspend_bts(tracer); + + trace = ds_read_bts(tracer); + + error = ds_selftest_bts_consistency(trace); + if (error < 0) + goto out; + + if (trace->ds.top == top) { + /* + * It is possible but highly unlikely that we got a + * buffer overflow and end up at exactly the same + * position we started from. + * Let's issue a warning and check the full trace. + */ + printk(KERN_CONT + "no resume progress/overflow..."); + + error = ds_selftest_bts_read(tracer, trace, + trace->ds.begin, trace->ds.end); + } else if (trace->ds.top < top) { + /* + * We had a buffer overflow - the entire buffer should + * contain trace records. + */ + error = ds_selftest_bts_read(tracer, trace, + trace->ds.begin, trace->ds.end); + } else { + /* + * It is quite likely that the buffer did not overflow. + * Let's just check the delta trace. + */ + error = ds_selftest_bts_read(tracer, trace, + top, trace->ds.top); + } + if (error < 0) + goto out; + + error = 0; + + /* The final test: release the tracer while tracing is suspended. */ + out: + ds_release_bts(tracer); + + printk(KERN_CONT "%s.\n", (error ? "failed" : "passed")); + + return error; +} + +int ds_selftest_pebs(void) +{ + return 0; +} diff --git a/arch/x86/kernel/ds_selftest.h b/arch/x86/kernel/ds_selftest.h new file mode 100644 index 0000000..0e6e19d --- /dev/null +++ b/arch/x86/kernel/ds_selftest.h @@ -0,0 +1,15 @@ +/* + * Debug Store support - selftest + * + * + * Copyright (C) 2009 Intel Corporation. + * Markus Metzger , 2009 + */ + +#ifdef CONFIG_X86_DS_SELFTEST +extern int ds_selftest_bts(void); +extern int ds_selftest_pebs(void); +#else +static inline int ds_selftest_bts(void) { return 0; } +static inline int ds_selftest_pebs(void) { return 0; } +#endif /* CONFIG_X86_DS_SELFTEST */ -- cgit v1.1 From b8e47195451c5d3f62620b2b1b5928669afd56eb Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 13 Mar 2009 10:46:42 +0100 Subject: x86, bts: correct comment style in ds.c Correct the comment style in ds.c. Signed-off-by: Markus Metzger LKML-Reference: <20090313104642.A30149@sedona.ch.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ds.c | 79 ++++++++++++++++++++++++++-------------------------- 1 file changed, 39 insertions(+), 40 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index 51c936c..d9cab71 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -35,25 +35,22 @@ * The configuration for a particular DS hardware implementation. */ struct ds_configuration { - /* the name of the configuration */ + /* The name of the configuration. */ const char *name; - /* the size of one pointer-typed field in the DS structure and - in the BTS and PEBS buffers in bytes; - this covers the first 8 DS fields related to buffer management. */ + /* The size of pointer-typed fields in DS, BTS, and PEBS. */ unsigned char sizeof_ptr_field; - /* the size of a BTS/PEBS record in bytes */ + /* The size of a BTS/PEBS record in bytes. */ unsigned char sizeof_rec[2]; - /* a series of bit-masks to control various features indexed - * by enum ds_feature */ + /* Control bit-masks indexed by enum ds_feature. */ unsigned long ctl[dsf_ctl_max]; }; static DEFINE_PER_CPU(struct ds_configuration, ds_cfg_array); #define ds_cfg per_cpu(ds_cfg_array, smp_processor_id()) -#define MAX_SIZEOF_DS (12 * 8) /* maximal size of a DS configuration */ -#define MAX_SIZEOF_BTS (3 * 8) /* maximal size of a BTS record */ -#define DS_ALIGNMENT (1 << 3) /* BTS and PEBS buffer alignment */ +#define MAX_SIZEOF_DS (12 * 8) /* Maximal size of a DS configuration. */ +#define MAX_SIZEOF_BTS (3 * 8) /* Maximal size of a BTS record. */ +#define DS_ALIGNMENT (1 << 3) /* BTS and PEBS buffer alignment. */ #define BTS_CONTROL \ (ds_cfg.ctl[dsf_bts] | ds_cfg.ctl[dsf_bts_kernel] | ds_cfg.ctl[dsf_bts_user] |\ @@ -67,28 +64,28 @@ static DEFINE_PER_CPU(struct ds_configuration, ds_cfg_array); * to identify tracers. */ struct ds_tracer { - /* the DS context (partially) owned by this tracer */ + /* The DS context (partially) owned by this tracer. */ struct ds_context *context; - /* the buffer provided on ds_request() and its size in bytes */ + /* The buffer provided on ds_request() and its size in bytes. */ void *buffer; size_t size; }; struct bts_tracer { - /* the common DS part */ + /* The common DS part. */ struct ds_tracer ds; - /* the trace including the DS configuration */ + /* The trace including the DS configuration. */ struct bts_trace trace; - /* buffer overflow notification function */ + /* Buffer overflow notification function. */ bts_ovfl_callback_t ovfl; }; struct pebs_tracer { - /* the common DS part */ + /* The common DS part. */ struct ds_tracer ds; - /* the trace including the DS configuration */ + /* The trace including the DS configuration. */ struct pebs_trace trace; - /* buffer overflow notification function */ + /* Buffer overflow notification function. */ pebs_ovfl_callback_t ovfl; }; @@ -214,18 +211,16 @@ static inline int check_tracer(struct task_struct *task) * deallocated when the last user puts the context. */ struct ds_context { - /* pointer to the DS configuration; goes into MSR_IA32_DS_AREA */ + /* The DS configuration; goes into MSR_IA32_DS_AREA. */ unsigned char ds[MAX_SIZEOF_DS]; - /* the owner of the BTS and PEBS configuration, respectively */ + /* The owner of the BTS and PEBS configuration, respectively. */ struct bts_tracer *bts_master; struct pebs_tracer *pebs_master; - /* use count */ + /* Use count. */ unsigned long count; - /* a pointer to the context location inside the thread_struct - * or the per_cpu context array */ + /* Pointer to the context pointer field. */ struct ds_context **this; - /* a pointer to the task owning this context, or NULL, if the - * context is owned by a cpu */ + /* The traced task; NULL for current cpu. */ struct task_struct *task; }; @@ -350,14 +345,14 @@ static int ds_write(struct ds_context *context, enum ds_qualifier qual, unsigned long write_size, adj_write_size; /* - * write as much as possible without producing an + * Write as much as possible without producing an * overflow interrupt. * - * interrupt_threshold must either be + * Interrupt_threshold must either be * - bigger than absolute_maximum or * - point to a record between buffer_base and absolute_maximum * - * index points to a valid record. + * Index points to a valid record. */ base = ds_get(context->ds, qual, ds_buffer_base); index = ds_get(context->ds, qual, ds_index); @@ -366,8 +361,10 @@ static int ds_write(struct ds_context *context, enum ds_qualifier qual, write_end = min(end, int_th); - /* if we are already beyond the interrupt threshold, - * we fill the entire buffer */ + /* + * If we are already beyond the interrupt threshold, + * we fill the entire buffer. + */ if (write_end <= index) write_end = end; @@ -384,7 +381,7 @@ static int ds_write(struct ds_context *context, enum ds_qualifier qual, adj_write_size = write_size / ds_cfg.sizeof_rec[qual]; adj_write_size *= ds_cfg.sizeof_rec[qual]; - /* zero out trailing bytes */ + /* Zero out trailing bytes. */ memset((char *)index + write_size, 0, adj_write_size - write_size); index += adj_write_size; @@ -556,7 +553,8 @@ static void ds_init_ds_trace(struct ds_trace *trace, enum ds_qualifier qual, unsigned int flags) { unsigned long buffer, adj; - /* adjust the buffer address and size to meet alignment + /* + * Adjust the buffer address and size to meet alignment * constraints: * - buffer is double-word aligned * - size is multiple of record size @@ -578,7 +576,8 @@ static void ds_init_ds_trace(struct ds_trace *trace, enum ds_qualifier qual, trace->begin = (void *)buffer; trace->top = trace->begin; trace->end = (void *)(buffer + size); - /* The value for 'no threshold' is -1, which will set the + /* + * The value for 'no threshold' is -1, which will set the * threshold outside of the buffer, just like we want it. */ trace->ith = (void *)(buffer + size - ith); @@ -602,7 +601,7 @@ static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace, if (!base) goto out; - /* we require some space to do alignment adjustments below */ + /* We require some space to do alignment adjustments below. */ error = -EINVAL; if (size < (DS_ALIGNMENT + ds_cfg.sizeof_rec[qual])) goto out; @@ -640,7 +639,7 @@ struct bts_tracer *ds_request_bts(struct task_struct *task, unsigned long irq; int error; - /* buffer overflow notification is not yet implemented */ + /* Buffer overflow notification is not yet implemented. */ error = -EOPNOTSUPP; if (ovfl) goto out; @@ -700,7 +699,7 @@ struct pebs_tracer *ds_request_pebs(struct task_struct *task, unsigned long irq; int error; - /* buffer overflow notification is not yet implemented */ + /* Buffer overflow notification is not yet implemented. */ error = -EOPNOTSUPP; if (ovfl) goto out; @@ -983,9 +982,9 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) case 0x1c: /* Atom */ ds_configure(&ds_cfg_core2_atom, c); break; - case 0x1a: /* i7 */ + case 0x1a: /* Core i7 */ default: - /* sorry, don't know about them */ + /* Sorry, don't know about them. */ break; } break; @@ -997,12 +996,12 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) ds_configure(&ds_cfg_netburst, c); break; default: - /* sorry, don't know about them */ + /* Sorry, don't know about them. */ break; } break; default: - /* sorry, don't know about them */ + /* Sorry, don't know about them. */ break; } } -- cgit v1.1 From e9a22d1fb94050b7d600019c32e6b672d539054b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 13 Mar 2009 11:54:40 +0100 Subject: x86, bts: cleanups Impact: cleanup, no code changed Cc: Markus Metzger LKML-Reference: <20090313104218.A30096@sedona.ch.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ds.c | 142 ++++++++++++++++++++++++------------------ arch/x86/kernel/ds_selftest.h | 2 +- 2 files changed, 81 insertions(+), 63 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index d9cab71..7363e01 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -19,43 +19,52 @@ * Markus Metzger , 2007-2009 */ - -#include - -#include +#include #include -#include +#include #include +#include #include -#include + +#include #include "ds_selftest.h" /* - * The configuration for a particular DS hardware implementation. + * The configuration for a particular DS hardware implementation: */ struct ds_configuration { - /* The name of the configuration. */ - const char *name; - /* The size of pointer-typed fields in DS, BTS, and PEBS. */ - unsigned char sizeof_ptr_field; - /* The size of a BTS/PEBS record in bytes. */ - unsigned char sizeof_rec[2]; - /* Control bit-masks indexed by enum ds_feature. */ - unsigned long ctl[dsf_ctl_max]; + /* The name of the configuration: */ + const char *name; + + /* The size of pointer-typed fields in DS, BTS, and PEBS: */ + unsigned char sizeof_ptr_field; + + /* The size of a BTS/PEBS record in bytes: */ + unsigned char sizeof_rec[2]; + + /* Control bit-masks indexed by enum ds_feature: */ + unsigned long ctl[dsf_ctl_max]; }; static DEFINE_PER_CPU(struct ds_configuration, ds_cfg_array); #define ds_cfg per_cpu(ds_cfg_array, smp_processor_id()) -#define MAX_SIZEOF_DS (12 * 8) /* Maximal size of a DS configuration. */ -#define MAX_SIZEOF_BTS (3 * 8) /* Maximal size of a BTS record. */ -#define DS_ALIGNMENT (1 << 3) /* BTS and PEBS buffer alignment. */ +/* Maximal size of a DS configuration: */ +#define MAX_SIZEOF_DS (12 * 8) -#define BTS_CONTROL \ - (ds_cfg.ctl[dsf_bts] | ds_cfg.ctl[dsf_bts_kernel] | ds_cfg.ctl[dsf_bts_user] |\ - ds_cfg.ctl[dsf_bts_overflow]) +/* Maximal size of a BTS record: */ +#define MAX_SIZEOF_BTS (3 * 8) +/* BTS and PEBS buffer alignment: */ +#define DS_ALIGNMENT (1 << 3) + +/* Mask of control bits in the DS MSR register: */ +#define BTS_CONTROL \ + ( ds_cfg.ctl[dsf_bts] | \ + ds_cfg.ctl[dsf_bts_kernel] | \ + ds_cfg.ctl[dsf_bts_user] | \ + ds_cfg.ctl[dsf_bts_overflow] ) /* * A BTS or PEBS tracer. @@ -65,28 +74,32 @@ static DEFINE_PER_CPU(struct ds_configuration, ds_cfg_array); */ struct ds_tracer { /* The DS context (partially) owned by this tracer. */ - struct ds_context *context; + struct ds_context *context; /* The buffer provided on ds_request() and its size in bytes. */ - void *buffer; - size_t size; + void *buffer; + size_t size; }; struct bts_tracer { - /* The common DS part. */ - struct ds_tracer ds; - /* The trace including the DS configuration. */ - struct bts_trace trace; - /* Buffer overflow notification function. */ - bts_ovfl_callback_t ovfl; + /* The common DS part: */ + struct ds_tracer ds; + + /* The trace including the DS configuration: */ + struct bts_trace trace; + + /* Buffer overflow notification function: */ + bts_ovfl_callback_t ovfl; }; struct pebs_tracer { - /* The common DS part. */ - struct ds_tracer ds; - /* The trace including the DS configuration. */ - struct pebs_trace trace; - /* Buffer overflow notification function. */ - pebs_ovfl_callback_t ovfl; + /* The common DS part: */ + struct ds_tracer ds; + + /* The trace including the DS configuration: */ + struct pebs_trace trace; + + /* Buffer overflow notification function: */ + pebs_ovfl_callback_t ovfl; }; /* @@ -95,6 +108,7 @@ struct pebs_tracer { * * The DS configuration consists of the following fields; different * architetures vary in the size of those fields. + * * - double-word aligned base linear address of the BTS buffer * - write pointer into the BTS buffer * - end linear address of the BTS buffer (one byte beyond the end of @@ -133,19 +147,20 @@ enum ds_field { }; enum ds_qualifier { - ds_bts = 0, + ds_bts = 0, ds_pebs }; -static inline unsigned long ds_get(const unsigned char *base, - enum ds_qualifier qual, enum ds_field field) +static inline unsigned long +ds_get(const unsigned char *base, enum ds_qualifier qual, enum ds_field field) { base += (ds_cfg.sizeof_ptr_field * (field + (4 * qual))); return *(unsigned long *)base; } -static inline void ds_set(unsigned char *base, enum ds_qualifier qual, - enum ds_field field, unsigned long value) +static inline void +ds_set(unsigned char *base, enum ds_qualifier qual, enum ds_field field, + unsigned long value) { base += (ds_cfg.sizeof_ptr_field * (field + (4 * qual))); (*(unsigned long *)base) = value; @@ -157,7 +172,6 @@ static inline void ds_set(unsigned char *base, enum ds_qualifier qual, */ static DEFINE_SPINLOCK(ds_lock); - /* * We either support (system-wide) per-cpu or per-thread allocation. * We distinguish the two based on the task_struct pointer, where a @@ -211,17 +225,21 @@ static inline int check_tracer(struct task_struct *task) * deallocated when the last user puts the context. */ struct ds_context { - /* The DS configuration; goes into MSR_IA32_DS_AREA. */ - unsigned char ds[MAX_SIZEOF_DS]; - /* The owner of the BTS and PEBS configuration, respectively. */ - struct bts_tracer *bts_master; - struct pebs_tracer *pebs_master; - /* Use count. */ + /* The DS configuration; goes into MSR_IA32_DS_AREA: */ + unsigned char ds[MAX_SIZEOF_DS]; + + /* The owner of the BTS and PEBS configuration, respectively: */ + struct bts_tracer *bts_master; + struct pebs_tracer *pebs_master; + + /* Use count: */ unsigned long count; - /* Pointer to the context pointer field. */ - struct ds_context **this; - /* The traced task; NULL for current cpu. */ - struct task_struct *task; + + /* Pointer to the context pointer field: */ + struct ds_context **this; + + /* The traced task; NULL for current cpu: */ + struct task_struct *task; }; static DEFINE_PER_CPU(struct ds_context *, system_context_array); @@ -328,9 +346,9 @@ static void ds_overflow(struct ds_context *context, enum ds_qualifier qual) * The remainder of any partially written record is zeroed out. * * context: the DS context - * qual: the buffer type - * record: the data to write - * size: the size of the data + * qual: the buffer type + * record: the data to write + * size: the size of the data */ static int ds_write(struct ds_context *context, enum ds_qualifier qual, const void *record, size_t size) @@ -429,12 +447,12 @@ enum bts_field { bts_to, bts_flags, - bts_qual = bts_from, - bts_jiffies = bts_to, - bts_pid = bts_flags, + bts_qual = bts_from, + bts_jiffies = bts_to, + bts_pid = bts_flags, - bts_qual_mask = (bts_qual_max - 1), - bts_escape = ((unsigned long)-1 & ~bts_qual_mask) + bts_qual_mask = (bts_qual_max - 1), + bts_escape = ((unsigned long)-1 & ~bts_qual_mask) }; static inline unsigned long bts_get(const char *base, enum bts_field field) @@ -461,8 +479,8 @@ static inline void bts_set(char *base, enum bts_field field, unsigned long val) * * return: bytes read/written on success; -Eerrno, otherwise */ -static int bts_read(struct bts_tracer *tracer, const void *at, - struct bts_struct *out) +static int +bts_read(struct bts_tracer *tracer, const void *at, struct bts_struct *out) { if (!tracer) return -EINVAL; diff --git a/arch/x86/kernel/ds_selftest.h b/arch/x86/kernel/ds_selftest.h index 0e6e19d..2ba8745 100644 --- a/arch/x86/kernel/ds_selftest.h +++ b/arch/x86/kernel/ds_selftest.h @@ -12,4 +12,4 @@ extern int ds_selftest_pebs(void); #else static inline int ds_selftest_bts(void) { return 0; } static inline int ds_selftest_pebs(void) { return 0; } -#endif /* CONFIG_X86_DS_SELFTEST */ +#endif -- cgit v1.1 From 79258a354e0c69be94ae2871809a195bf4a647b1 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 13 Mar 2009 12:02:08 +0100 Subject: x86, bts: detect size of DS fields, fix Impact: build fix One usage site was missed in the sizeof_field -> sizeof_ptr_field rename. Cc: Markus Metzger LKML-Reference: <20090313104218.A30096@sedona.ch.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ds.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index 7363e01..5fd5333 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -983,7 +983,7 @@ ds_configure(const struct ds_configuration *cfg, printk("bts/pebs record: %u/%u bytes\n", ds_cfg.sizeof_rec[ds_bts], ds_cfg.sizeof_rec[ds_pebs]); - WARN_ON_ONCE(MAX_SIZEOF_DS < (12 * ds_cfg.sizeof_field)); + WARN_ON_ONCE(MAX_SIZEOF_DS < (12 * ds_cfg.sizeof_ptr_field)); } void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) -- cgit v1.1 From c78a3956b982418186e40978a51636a2b43221bc Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Wed, 18 Mar 2009 19:27:00 +0100 Subject: x86, bts: use atomic memory allocation Ds_request_bts() needs to allocate memory. It uses GFP_KERNEL. Hw-branch-tracer calls ds_request_bts() within on_each_cpu(). Use atomic memory allocation to allow it to be used in that context. Signed-off-by: Markus Metzger LKML-Reference: <20090318192700.A6038@sedona.ch.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ds.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index 5fd5333..b1d6e1f 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -255,8 +255,13 @@ static inline struct ds_context *ds_get_context(struct task_struct *task) struct ds_context *new_context = NULL; unsigned long irq; - /* Chances are small that we already have a context. */ - new_context = kzalloc(sizeof(*new_context), GFP_KERNEL); + /* + * Chances are small that we already have a context. + * + * Contexts for per-cpu tracing are allocated using + * smp_call_function(). We must not sleep. + */ + new_context = kzalloc(sizeof(*new_context), GFP_ATOMIC); if (!new_context) return NULL; @@ -662,8 +667,12 @@ struct bts_tracer *ds_request_bts(struct task_struct *task, if (ovfl) goto out; + /* + * Per-cpu tracing is typically requested using smp_call_function(). + * We must not sleep. + */ error = -ENOMEM; - tracer = kzalloc(sizeof(*tracer), GFP_KERNEL); + tracer = kzalloc(sizeof(*tracer), GFP_ATOMIC); if (!tracer) goto out; tracer->ovfl = ovfl; @@ -722,8 +731,12 @@ struct pebs_tracer *ds_request_pebs(struct task_struct *task, if (ovfl) goto out; + /* + * Per-cpu tracing is typically requested using smp_call_function(). + * We must not sleep. + */ error = -ENOMEM; - tracer = kzalloc(sizeof(*tracer), GFP_KERNEL); + tracer = kzalloc(sizeof(*tracer), GFP_ATOMIC); if (!tracer) goto out; tracer->ovfl = ovfl; -- cgit v1.1 From b8bcfe997e46150fedcc3f5b26b846400122fdd9 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 17 Feb 2009 23:05:19 -0800 Subject: x86/paravirt: remove lazy mode in interrupts Impact: simplification, robustness Make paravirt_lazy_mode() always return PARAVIRT_LAZY_NONE when in an interrupt. This prevents interrupt code from accidentally inheriting an outer lazy state, and instead does everything synchronously. Outer batched operations are left deferred. Signed-off-by: Jeremy Fitzhardinge Acked-by: Peter Zijlstra Cc: Thomas Gleixner --- arch/x86/kernel/paravirt.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 63dd358..8ab250a 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -282,6 +282,9 @@ void paravirt_leave_lazy_cpu(void) enum paravirt_lazy_mode paravirt_get_lazy_mode(void) { + if (in_interrupt()) + return PARAVIRT_LAZY_NONE; + return __get_cpu_var(paravirt_lazy_mode); } -- cgit v1.1 From 7fd7d83d49914f03aefffba6aee09032fcd54cce Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 17 Feb 2009 23:24:03 -0800 Subject: x86/pvops: replace arch_enter_lazy_cpu_mode with arch_start_context_switch Impact: simplification, prepare for later changes Make lazy cpu mode more specific to context switching, so that it makes sense to do more context-switch specific things in the callbacks. Signed-off-by: Jeremy Fitzhardinge Acked-by: Peter Zijlstra --- arch/x86/kernel/paravirt.c | 13 ------------- arch/x86/kernel/process_32.c | 2 +- arch/x86/kernel/process_64.c | 2 +- 3 files changed, 2 insertions(+), 15 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 8ab250a..5eea954 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -301,19 +301,6 @@ void arch_flush_lazy_mmu_mode(void) preempt_enable(); } -void arch_flush_lazy_cpu_mode(void) -{ - preempt_disable(); - - if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) { - WARN_ON(preempt_count() == 1); - arch_leave_lazy_cpu_mode(); - arch_enter_lazy_cpu_mode(); - } - - preempt_enable(); -} - struct pv_info pv_info = { .name = "bare hardware", .paravirt_enabled = 0, diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 14014d7..57e49a8 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -407,7 +407,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) * done before math_state_restore, so the TS bit is up * to date. */ - arch_leave_lazy_cpu_mode(); + arch_end_context_switch(); /* If the task has used fpu the last 5 timeslices, just do a full * restore of the math state immediately to avoid the trap; the diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index abb7e6a..7115e60 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -428,7 +428,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) * done before math_state_restore, so the TS bit is up * to date. */ - arch_leave_lazy_cpu_mode(); + arch_end_context_switch(); /* * Switch FS and GS. -- cgit v1.1 From b407fc57b815b2016186220baabc76cc8264206e Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 17 Feb 2009 23:46:21 -0800 Subject: x86/paravirt: flush pending mmu updates on context switch Impact: allow preemption during lazy mmu updates If we're in lazy mmu mode when context switching, leave lazy mmu mode, but remember the task's state in TIF_LAZY_MMU_UPDATES. When we resume the task, check this flag and re-enter lazy mmu mode if its set. This sets things up for allowing lazy mmu mode while preemptible, though that won't actually be active until the next change. Signed-off-by: Jeremy Fitzhardinge Acked-by: Peter Zijlstra --- arch/x86/kernel/kvm.c | 2 +- arch/x86/kernel/paravirt.c | 13 ++++++++++--- arch/x86/kernel/vmi_32.c | 14 ++++++++++---- 3 files changed, 21 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 478bca9..5d7f6e7 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -201,7 +201,7 @@ static void kvm_leave_lazy_mmu(void) struct kvm_para_state *state = kvm_para_state(); mmu_queue_flush(state); - paravirt_leave_lazy(paravirt_get_lazy_mode()); + paravirt_leave_lazy_mmu(); state->mode = paravirt_get_lazy_mode(); } diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 5eea954..430a0e3 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -252,7 +252,7 @@ static inline void enter_lazy(enum paravirt_lazy_mode mode) __get_cpu_var(paravirt_lazy_mode) = mode; } -void paravirt_leave_lazy(enum paravirt_lazy_mode mode) +static void leave_lazy(enum paravirt_lazy_mode mode) { BUG_ON(__get_cpu_var(paravirt_lazy_mode) != mode); BUG_ON(preemptible()); @@ -267,17 +267,24 @@ void paravirt_enter_lazy_mmu(void) void paravirt_leave_lazy_mmu(void) { - paravirt_leave_lazy(PARAVIRT_LAZY_MMU); + leave_lazy(PARAVIRT_LAZY_MMU); } void paravirt_enter_lazy_cpu(void) { + if (percpu_read(paravirt_lazy_mode) == PARAVIRT_LAZY_MMU) { + arch_leave_lazy_mmu_mode(); + set_thread_flag(TIF_LAZY_MMU_UPDATES); + } enter_lazy(PARAVIRT_LAZY_CPU); } void paravirt_leave_lazy_cpu(void) { - paravirt_leave_lazy(PARAVIRT_LAZY_CPU); + leave_lazy(PARAVIRT_LAZY_CPU); + + if (test_and_clear_thread_flag(TIF_LAZY_MMU_UPDATES)) + arch_enter_lazy_mmu_mode(); } enum paravirt_lazy_mode paravirt_get_lazy_mode(void) diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index 2cc4a90..950929c 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c @@ -473,16 +473,22 @@ static void vmi_enter_lazy_cpu(void) vmi_ops.set_lazy_mode(2); } +static void vmi_leave_lazy_cpu(void) +{ + vmi_ops.set_lazy_mode(0); + paravirt_leave_lazy_cpu(); +} + static void vmi_enter_lazy_mmu(void) { paravirt_enter_lazy_mmu(); vmi_ops.set_lazy_mode(1); } -static void vmi_leave_lazy(void) +static void vmi_leave_lazy_mmu(void) { - paravirt_leave_lazy(paravirt_get_lazy_mode()); vmi_ops.set_lazy_mode(0); + paravirt_leave_lazy_mmu(); } static inline int __init check_vmi_rom(struct vrom_header *rom) @@ -718,12 +724,12 @@ static inline int __init activate_vmi(void) para_wrap(pv_cpu_ops.lazy_mode.enter, vmi_enter_lazy_cpu, set_lazy_mode, SetLazyMode); - para_wrap(pv_cpu_ops.lazy_mode.leave, vmi_leave_lazy, + para_wrap(pv_cpu_ops.lazy_mode.leave, vmi_leave_lazy_cpu, set_lazy_mode, SetLazyMode); para_wrap(pv_mmu_ops.lazy_mode.enter, vmi_enter_lazy_mmu, set_lazy_mode, SetLazyMode); - para_wrap(pv_mmu_ops.lazy_mode.leave, vmi_leave_lazy, + para_wrap(pv_mmu_ops.lazy_mode.leave, vmi_leave_lazy_mmu, set_lazy_mode, SetLazyMode); /* user and kernel flush are just handled with different flags to FlushTLB */ -- cgit v1.1 From 224101ed69d3fbb486868e0f6e0f9fa37302efb4 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 18 Feb 2009 11:18:57 -0800 Subject: x86/paravirt: finish change from lazy cpu to context switch start/end Impact: fix lazy context switch API Pass the previous and next tasks into the context switch start end calls, so that the called functions can properly access the task state (esp in end_context_switch, in which the next task is not yet completely current). Signed-off-by: Jeremy Fitzhardinge Acked-by: Peter Zijlstra --- arch/x86/kernel/paravirt.c | 14 ++++++-------- arch/x86/kernel/process_32.c | 2 +- arch/x86/kernel/process_64.c | 2 +- arch/x86/kernel/vmi_32.c | 12 ++++++------ 4 files changed, 14 insertions(+), 16 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 430a0e3..cf14375 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -270,20 +270,20 @@ void paravirt_leave_lazy_mmu(void) leave_lazy(PARAVIRT_LAZY_MMU); } -void paravirt_enter_lazy_cpu(void) +void paravirt_start_context_switch(struct task_struct *prev) { if (percpu_read(paravirt_lazy_mode) == PARAVIRT_LAZY_MMU) { arch_leave_lazy_mmu_mode(); - set_thread_flag(TIF_LAZY_MMU_UPDATES); + set_ti_thread_flag(task_thread_info(prev), TIF_LAZY_MMU_UPDATES); } enter_lazy(PARAVIRT_LAZY_CPU); } -void paravirt_leave_lazy_cpu(void) +void paravirt_end_context_switch(struct task_struct *next) { leave_lazy(PARAVIRT_LAZY_CPU); - if (test_and_clear_thread_flag(TIF_LAZY_MMU_UPDATES)) + if (test_and_clear_ti_thread_flag(task_thread_info(next), TIF_LAZY_MMU_UPDATES)) arch_enter_lazy_mmu_mode(); } @@ -399,10 +399,8 @@ struct pv_cpu_ops pv_cpu_ops = { .set_iopl_mask = native_set_iopl_mask, .io_delay = native_io_delay, - .lazy_mode = { - .enter = paravirt_nop, - .leave = paravirt_nop, - }, + .start_context_switch = paravirt_nop, + .end_context_switch = paravirt_nop, }; struct pv_apic_ops pv_apic_ops = { diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 57e49a8..d766c76 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -407,7 +407,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) * done before math_state_restore, so the TS bit is up * to date. */ - arch_end_context_switch(); + arch_end_context_switch(next_p); /* If the task has used fpu the last 5 timeslices, just do a full * restore of the math state immediately to avoid the trap; the diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 7115e60..e8a9aaf 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -428,7 +428,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) * done before math_state_restore, so the TS bit is up * to date. */ - arch_end_context_switch(); + arch_end_context_switch(next_p); /* * Switch FS and GS. diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index 950929c..55a5d69 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c @@ -467,16 +467,16 @@ vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip, } #endif -static void vmi_enter_lazy_cpu(void) +static void vmi_start_context_switch(struct task_struct *prev) { - paravirt_enter_lazy_cpu(); + paravirt_start_context_switch(prev); vmi_ops.set_lazy_mode(2); } -static void vmi_leave_lazy_cpu(void) +static void vmi_end_context_switch(struct task_struct *next) { vmi_ops.set_lazy_mode(0); - paravirt_leave_lazy_cpu(); + paravirt_end_context_switch(next); } static void vmi_enter_lazy_mmu(void) @@ -722,9 +722,9 @@ static inline int __init activate_vmi(void) para_fill(pv_cpu_ops.set_iopl_mask, SetIOPLMask); para_fill(pv_cpu_ops.io_delay, IODelay); - para_wrap(pv_cpu_ops.lazy_mode.enter, vmi_enter_lazy_cpu, + para_wrap(pv_cpu_ops.start_context_switch, vmi_start_context_switch, set_lazy_mode, SetLazyMode); - para_wrap(pv_cpu_ops.lazy_mode.leave, vmi_leave_lazy_cpu, + para_wrap(pv_cpu_ops.end_context_switch, vmi_end_context_switch, set_lazy_mode, SetLazyMode); para_wrap(pv_mmu_ops.lazy_mode.enter, vmi_enter_lazy_mmu, -- cgit v1.1 From 2829b449276aed45f3d649efb21e3418e39dd5d1 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 17 Feb 2009 23:53:19 -0800 Subject: x86/paravirt: allow preemption with lazy mmu mode Impact: remove obsolete checks, simplification Lift restrictions on preemption with lazy mmu mode, as it is now allowed. Signed-off-by: Jeremy Fitzhardinge Acked-by: Peter Zijlstra --- arch/x86/kernel/paravirt.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index cf14375..bf2e86e 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -247,7 +247,6 @@ static DEFINE_PER_CPU(enum paravirt_lazy_mode, paravirt_lazy_mode) = PARAVIRT_LA static inline void enter_lazy(enum paravirt_lazy_mode mode) { BUG_ON(__get_cpu_var(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE); - BUG_ON(preemptible()); __get_cpu_var(paravirt_lazy_mode) = mode; } @@ -255,7 +254,6 @@ static inline void enter_lazy(enum paravirt_lazy_mode mode) static void leave_lazy(enum paravirt_lazy_mode mode) { BUG_ON(__get_cpu_var(paravirt_lazy_mode) != mode); - BUG_ON(preemptible()); __get_cpu_var(paravirt_lazy_mode) = PARAVIRT_LAZY_NONE; } @@ -272,6 +270,8 @@ void paravirt_leave_lazy_mmu(void) void paravirt_start_context_switch(struct task_struct *prev) { + BUG_ON(preemptible()); + if (percpu_read(paravirt_lazy_mode) == PARAVIRT_LAZY_MMU) { arch_leave_lazy_mmu_mode(); set_ti_thread_flag(task_thread_info(prev), TIF_LAZY_MMU_UPDATES); @@ -281,6 +281,8 @@ void paravirt_start_context_switch(struct task_struct *prev) void paravirt_end_context_switch(struct task_struct *next) { + BUG_ON(preemptible()); + leave_lazy(PARAVIRT_LAZY_CPU); if (test_and_clear_ti_thread_flag(task_thread_info(next), TIF_LAZY_MMU_UPDATES)) @@ -300,7 +302,6 @@ void arch_flush_lazy_mmu_mode(void) preempt_disable(); if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) { - WARN_ON(preempt_count() == 1); arch_leave_lazy_mmu_mode(); arch_enter_lazy_mmu_mode(); } -- cgit v1.1 From ab2f75f0b760d2b0c9a875b669a1b51dce02c85a Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 18 Feb 2009 00:18:50 -0800 Subject: x86/paravirt: use percpu_ rather than __get_cpu_var Impact: minor optimisation percpu_read/write is a slightly more direct way of getting to percpu data. Signed-off-by: Jeremy Fitzhardinge --- arch/x86/kernel/paravirt.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index bf2e86e..254e8aa 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -246,16 +246,16 @@ static DEFINE_PER_CPU(enum paravirt_lazy_mode, paravirt_lazy_mode) = PARAVIRT_LA static inline void enter_lazy(enum paravirt_lazy_mode mode) { - BUG_ON(__get_cpu_var(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE); + BUG_ON(percpu_read(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE); - __get_cpu_var(paravirt_lazy_mode) = mode; + percpu_write(paravirt_lazy_mode, mode); } static void leave_lazy(enum paravirt_lazy_mode mode) { - BUG_ON(__get_cpu_var(paravirt_lazy_mode) != mode); + BUG_ON(percpu_read(paravirt_lazy_mode) != mode); - __get_cpu_var(paravirt_lazy_mode) = PARAVIRT_LAZY_NONE; + percpu_write(paravirt_lazy_mode, PARAVIRT_LAZY_NONE); } void paravirt_enter_lazy_mmu(void) @@ -294,7 +294,7 @@ enum paravirt_lazy_mode paravirt_get_lazy_mode(void) if (in_interrupt()) return PARAVIRT_LAZY_NONE; - return __get_cpu_var(paravirt_lazy_mode); + return percpu_read(paravirt_lazy_mode); } void arch_flush_lazy_mmu_mode(void) -- cgit v1.1 From 595258aaeac4cc6e187b98b1bf29bb176febe763 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 13 Mar 2009 12:21:28 +0100 Subject: perf_counter: x86: fix 32-bit irq_period assumption No need to assume the irq_period is 32bit. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 155bc3c..1cedc34 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -449,7 +449,7 @@ __hw_perf_counter_set_period(struct perf_counter *counter, struct hw_perf_counter *hwc, int idx) { s64 left = atomic64_read(&hwc->period_left); - s32 period = hwc->irq_period; + s64 period = hwc->irq_period; int err; /* -- cgit v1.1 From 60b3df9c1e24a18aabb412da9905208c5f04ebea Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 13 Mar 2009 12:21:30 +0100 Subject: perf_counter: add comment to barrier We need to ensure the enabled=0 write happens before we start disabling the actual counters, so that a pcm_amd_enable() will not enable one underneath us. I think the race is impossible anyway, we always balance the ops within any one context and perform enable() with IRQs disabled. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 1cedc34..a2e3b76 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -247,6 +247,10 @@ static u64 pmc_amd_save_disable_all(void) enabled = cpuc->enabled; cpuc->enabled = 0; + /* + * ensure we write the disable before we start disabling the + * counters proper, so that pcm_amd_enable() does the right thing. + */ barrier(); for (idx = 0; idx < nr_counters_generic; idx++) { -- cgit v1.1 From 82bae4f8c2fd64a2bb1e2e72c508853ed2b4a299 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 13 Mar 2009 12:21:31 +0100 Subject: perf_counter: x86: use ULL postfix for 64bit constants Fix a build warning on 32bit machines by explicitly marking the constants as 64-bit. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index a2e3b76..22dab06 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -84,9 +84,9 @@ static u64 pmc_intel_event_map(int event) static u64 pmc_intel_raw_event(u64 event) { -#define CORE_EVNTSEL_EVENT_MASK 0x000000FF -#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00 -#define CORE_EVNTSEL_COUNTER_MASK 0xFF000000 +#define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL +#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL +#define CORE_EVNTSEL_COUNTER_MASK 0xFF000000ULL #define CORE_EVNTSEL_MASK \ (CORE_EVNTSEL_EVENT_MASK | \ @@ -116,9 +116,9 @@ static u64 pmc_amd_event_map(int event) static u64 pmc_amd_raw_event(u64 event) { -#define K7_EVNTSEL_EVENT_MASK 0x7000000FF -#define K7_EVNTSEL_UNIT_MASK 0x00000FF00 -#define K7_EVNTSEL_COUNTER_MASK 0x0FF000000 +#define K7_EVNTSEL_EVENT_MASK 0x7000000FFULL +#define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL +#define K7_EVNTSEL_COUNTER_MASK 0x0FF000000ULL #define K7_EVNTSEL_MASK \ (K7_EVNTSEL_EVENT_MASK | \ -- cgit v1.1 From 7bb497bd885eedd0f56dfe3cc1b5ff20710d33b9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 18 Mar 2009 08:59:21 +0100 Subject: perf_counter: fix crash on perfmon v1 systems Impact: fix boot crash on Intel Perfmon Version 1 systems Intel Perfmon v1 does not support the global MSRs, nor does it offer the generalized MSR ranges. So support v2 and later CPUs only. Also mark pmc_ops as read-mostly - to avoid false cacheline sharing. Cc: Paul Mackerras Cc: Peter Zijlstra Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 22dab06..6cba9d4 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -57,12 +57,14 @@ struct pmc_x86_ops { int max_events; }; -static struct pmc_x86_ops *pmc_ops; +static struct pmc_x86_ops *pmc_ops __read_mostly; static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = { .enabled = 1, }; +static __read_mostly int intel_perfmon_version; + /* * Intel PerfMon v3. Used on Core2 and later. */ @@ -613,7 +615,7 @@ void perf_counter_print_debug(void) cpu = smp_processor_id(); cpuc = &per_cpu(cpu_hw_counters, cpu); - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { + if (intel_perfmon_version >= 2) { rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow); @@ -930,10 +932,10 @@ static struct pmc_x86_ops pmc_amd_ops = { static struct pmc_x86_ops *pmc_intel_init(void) { + union cpuid10_edx edx; union cpuid10_eax eax; - unsigned int ebx; unsigned int unused; - union cpuid10_edx edx; + unsigned int ebx; /* * Check whether the Architectural PerfMon supports @@ -943,8 +945,12 @@ static struct pmc_x86_ops *pmc_intel_init(void) if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED) return NULL; + intel_perfmon_version = eax.split.version_id; + if (intel_perfmon_version < 2) + return NULL; + pr_info("Intel Performance Monitoring support detected.\n"); - pr_info("... version: %d\n", eax.split.version_id); + pr_info("... version: %d\n", intel_perfmon_version); pr_info("... bit width: %d\n", eax.split.bit_width); pr_info("... mask length: %d\n", eax.split.mask_length); -- cgit v1.1 From b8e83514b64577b48bfb794fe85fcde40a9343ca Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Mar 2009 20:26:18 +0100 Subject: perf_counter: revamp syscall input ABI Impact: modify ABI The hardware/software classification in hw_event->type became a little strained due to the addition of tracepoint tracing. Instead split up the field and provide a type field to explicitly specify the counter type, while using the event_id field to specify which event to use. Raw counters still work as before, only the raw config now goes into raw_event. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Steven Rostedt Orig-LKML-Reference: <20090319194233.836807573@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 6cba9d4..d844ae4 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -217,15 +217,15 @@ static int __hw_perf_counter_init(struct perf_counter *counter) /* * Raw event type provide the config in the event structure */ - if (hw_event->raw) { - hwc->config |= pmc_ops->raw_event(hw_event->type); + if (hw_event->raw_type) { + hwc->config |= pmc_ops->raw_event(hw_event->raw_event_id); } else { - if (hw_event->type >= pmc_ops->max_events) + if (hw_event->event_id >= pmc_ops->max_events) return -EINVAL; /* * The generic map: */ - hwc->config |= pmc_ops->event_map(hw_event->type); + hwc->config |= pmc_ops->event_map(hw_event->event_id); } counter->wakeup_pending = 0; @@ -715,7 +715,7 @@ perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown) list_for_each_entry(counter, &group_leader->sibling_list, list_entry) { x86_perf_counter_update(counter, &counter->hw, counter->hw.idx); - perf_store_irq_data(sibling, counter->hw_event.type); + perf_store_irq_data(sibling, counter->hw_event.event_config); perf_store_irq_data(sibling, atomic64_read(&counter->count)); } } -- cgit v1.1 From 0322cd6ec504b0bf08ca7b2c3d7f43bda37d79c9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Mar 2009 20:26:19 +0100 Subject: perf_counter: unify irq output code Impact: cleanup Having 3 slightly different copies of the same code around does nobody any good. First step in revamping the output format. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Steven Rostedt Orig-LKML-Reference: <20090319194233.929962222@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 53 +------------------------------------- 1 file changed, 1 insertion(+), 52 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index d844ae4..902282d 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -674,20 +674,6 @@ static void pmc_generic_disable(struct perf_counter *counter) x86_perf_counter_update(counter, hwc, idx); } -static void perf_store_irq_data(struct perf_counter *counter, u64 data) -{ - struct perf_data *irqdata = counter->irqdata; - - if (irqdata->len > PERF_DATA_BUFLEN - sizeof(u64)) { - irqdata->overrun++; - } else { - u64 *p = (u64 *) &irqdata->data[irqdata->len]; - - *p = data; - irqdata->len += sizeof(u64); - } -} - /* * Save and restart an expired counter. Called by NMI contexts, * so it has to be careful about preempting normal counter ops: @@ -704,22 +690,6 @@ static void perf_save_and_restart(struct perf_counter *counter) __pmc_generic_enable(counter, hwc, idx); } -static void -perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown) -{ - struct perf_counter *counter, *group_leader = sibling->group_leader; - - /* - * Store sibling timestamps (if any): - */ - list_for_each_entry(counter, &group_leader->sibling_list, list_entry) { - - x86_perf_counter_update(counter, &counter->hw, counter->hw.idx); - perf_store_irq_data(sibling, counter->hw_event.event_config); - perf_store_irq_data(sibling, atomic64_read(&counter->count)); - } -} - /* * Maximum interrupt frequency of 100KHz per CPU */ @@ -754,28 +724,7 @@ again: continue; perf_save_and_restart(counter); - - switch (counter->hw_event.record_type) { - case PERF_RECORD_SIMPLE: - continue; - case PERF_RECORD_IRQ: - perf_store_irq_data(counter, instruction_pointer(regs)); - break; - case PERF_RECORD_GROUP: - perf_handle_group(counter, &status, &ack); - break; - } - /* - * From NMI context we cannot call into the scheduler to - * do a task wakeup - but we mark these generic as - * wakeup_pending and initate a wakeup callback: - */ - if (nmi) { - counter->wakeup_pending = 1; - set_tsk_thread_flag(current, TIF_PERF_COUNTERS); - } else { - wake_up(&counter->waitq); - } + perf_counter_output(counter, nmi, regs); } hw_perf_ack_status(ack); -- cgit v1.1 From f4a2deb4860497f4332cf6a1acddab3dd628ddf0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 23 Mar 2009 18:22:06 +0100 Subject: perf_counter: remove the event config bitfields Since the bitfields turned into a bit of a mess, remove them and rely on good old masks. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Orig-LKML-Reference: <20090323172417.059499915@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 902282d..3f95b0c 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -217,15 +217,15 @@ static int __hw_perf_counter_init(struct perf_counter *counter) /* * Raw event type provide the config in the event structure */ - if (hw_event->raw_type) { - hwc->config |= pmc_ops->raw_event(hw_event->raw_event_id); + if (perf_event_raw(hw_event)) { + hwc->config |= pmc_ops->raw_event(perf_event_config(hw_event)); } else { - if (hw_event->event_id >= pmc_ops->max_events) + if (perf_event_id(hw_event) >= pmc_ops->max_events) return -EINVAL; /* * The generic map: */ - hwc->config |= pmc_ops->event_map(hw_event->event_id); + hwc->config |= pmc_ops->event_map(perf_event_id(hw_event)); } counter->wakeup_pending = 0; -- cgit v1.1 From 925d519ab82b6dd7aca9420d809ee83819c08db2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 30 Mar 2009 19:07:02 +0200 Subject: perf_counter: unify and fix delayed counter wakeup While going over the wakeup code I noticed delayed wakeups only work for hardware counters but basically all software counters rely on them. This patch unifies and generalizes the delayed wakeup to fix this issue. Since we're dealing with NMI context bits here, use a cmpxchg() based single link list implementation to track counters that have pending wakeups. [ This should really be generic code for delayed wakeups, but since we cannot use cmpxchg()/xchg() in generic code, I've let it live in the perf_counter code. -- Eric Dumazet could use it to aggregate the network wakeups. ] Furthermore, the x86 method of using TIF flags was flawed in that its quite possible to end up setting the bit on the idle task, loosing the wakeup. The powerpc method uses per-cpu storage and does appear to be sufficient. Signed-off-by: Peter Zijlstra Acked-by: Paul Mackerras Orig-LKML-Reference: <20090330171023.153932974@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 29 ----------------------------- arch/x86/kernel/signal.c | 6 ------ 2 files changed, 35 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 3f95b0c..7aab177 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -227,7 +227,6 @@ static int __hw_perf_counter_init(struct perf_counter *counter) */ hwc->config |= pmc_ops->event_map(perf_event_id(hw_event)); } - counter->wakeup_pending = 0; return 0; } @@ -773,34 +772,6 @@ void smp_perf_counter_interrupt(struct pt_regs *regs) irq_exit(); } -/* - * This handler is triggered by NMI contexts: - */ -void perf_counter_notify(struct pt_regs *regs) -{ - struct cpu_hw_counters *cpuc; - unsigned long flags; - int bit, cpu; - - local_irq_save(flags); - cpu = smp_processor_id(); - cpuc = &per_cpu(cpu_hw_counters, cpu); - - for_each_bit(bit, cpuc->used, X86_PMC_IDX_MAX) { - struct perf_counter *counter = cpuc->counters[bit]; - - if (!counter) - continue; - - if (counter->wakeup_pending) { - counter->wakeup_pending = 0; - wake_up(&counter->waitq); - } - } - - local_irq_restore(flags); -} - void perf_counters_lapic_init(int nmi) { u32 apic_val; diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 611615a..0a813b1 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -6,7 +6,6 @@ * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes * 2000-2002 x86-64 support by Andi Kleen */ -#include #include #include #include @@ -872,11 +871,6 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) tracehook_notify_resume(regs); } - if (thread_info_flags & _TIF_PERF_COUNTERS) { - clear_thread_flag(TIF_PERF_COUNTERS); - perf_counter_notify(regs); - } - #ifdef CONFIG_X86_32 clear_thread_flag(TIF_IRET); #endif /* CONFIG_X86_32 */ -- cgit v1.1 From 9ea98e191255ee642e64a5745014424fc63f83b0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 30 Mar 2009 19:07:09 +0200 Subject: perf_counter: x86: proper error propagation for the x86 hw_perf_counter_init() Now that Paul cleaned up the error propagation paths, pass down the x86 error as well. Signed-off-by: Peter Zijlstra Acked-by: Paul Mackerras Orig-LKML-Reference: <20090330171023.792822360@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 7aab177..b8885cc 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -954,7 +954,7 @@ hw_perf_counter_init(struct perf_counter *counter) err = __hw_perf_counter_init(counter); if (err) - return NULL; + return ERR_PTR(err); return &x86_perf_counter_ops; } -- cgit v1.1 From d7d59fb323833682b117b528d77eeb8ef587036a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 30 Mar 2009 19:07:15 +0200 Subject: perf_counter: x86: callchain support Provide the x86 perf_callchain() implementation. Code based on the ftrace/sysprof code from Soeren Sandmann Pedersen. Signed-off-by: Peter Zijlstra Acked-by: Paul Mackerras Cc: Soeren Sandmann Pedersen Cc: Frederic Weisbecker Cc: Steven Rostedt Orig-LKML-Reference: <20090330171024.341993293@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 154 +++++++++++++++++++++++++++++++++++++ 1 file changed, 154 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index b8885cc..e16dfaf 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -16,8 +16,10 @@ #include #include #include +#include #include +#include static bool perf_counters_initialized __read_mostly; @@ -958,3 +960,155 @@ hw_perf_counter_init(struct perf_counter *counter) return &x86_perf_counter_ops; } + +/* + * callchain support + */ + +static inline +void callchain_store(struct perf_callchain_entry *entry, unsigned long ip) +{ + if (entry->nr < MAX_STACK_DEPTH) + entry->ip[entry->nr++] = ip; +} + +static DEFINE_PER_CPU(struct perf_callchain_entry, irq_entry); +static DEFINE_PER_CPU(struct perf_callchain_entry, nmi_entry); + + +static void +backtrace_warning_symbol(void *data, char *msg, unsigned long symbol) +{ + /* Ignore warnings */ +} + +static void backtrace_warning(void *data, char *msg) +{ + /* Ignore warnings */ +} + +static int backtrace_stack(void *data, char *name) +{ + /* Don't bother with IRQ stacks for now */ + return -1; +} + +static void backtrace_address(void *data, unsigned long addr, int reliable) +{ + struct perf_callchain_entry *entry = data; + + if (reliable) + callchain_store(entry, addr); +} + +static const struct stacktrace_ops backtrace_ops = { + .warning = backtrace_warning, + .warning_symbol = backtrace_warning_symbol, + .stack = backtrace_stack, + .address = backtrace_address, +}; + +static void +perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry) +{ + unsigned long bp; + char *stack; + + callchain_store(entry, instruction_pointer(regs)); + + stack = ((char *)regs + sizeof(struct pt_regs)); +#ifdef CONFIG_FRAME_POINTER + bp = frame_pointer(regs); +#else + bp = 0; +#endif + + dump_trace(NULL, regs, (void *)stack, bp, &backtrace_ops, entry); +} + + +struct stack_frame { + const void __user *next_fp; + unsigned long return_address; +}; + +static int copy_stack_frame(const void __user *fp, struct stack_frame *frame) +{ + int ret; + + if (!access_ok(VERIFY_READ, fp, sizeof(*frame))) + return 0; + + ret = 1; + pagefault_disable(); + if (__copy_from_user_inatomic(frame, fp, sizeof(*frame))) + ret = 0; + pagefault_enable(); + + return ret; +} + +static void +perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry) +{ + struct stack_frame frame; + const void __user *fp; + + regs = (struct pt_regs *)current->thread.sp0 - 1; + fp = (void __user *)regs->bp; + + callchain_store(entry, regs->ip); + + while (entry->nr < MAX_STACK_DEPTH) { + frame.next_fp = NULL; + frame.return_address = 0; + + if (!copy_stack_frame(fp, &frame)) + break; + + if ((unsigned long)fp < user_stack_pointer(regs)) + break; + + callchain_store(entry, frame.return_address); + fp = frame.next_fp; + } +} + +static void +perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry) +{ + int is_user; + + if (!regs) + return; + + is_user = user_mode(regs); + + if (!current || current->pid == 0) + return; + + if (is_user && current->state != TASK_RUNNING) + return; + + if (!is_user) + perf_callchain_kernel(regs, entry); + + if (current->mm) + perf_callchain_user(regs, entry); +} + +struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) +{ + struct perf_callchain_entry *entry; + + if (in_nmi()) + entry = &__get_cpu_var(nmi_entry); + else + entry = &__get_cpu_var(irq_entry); + + entry->nr = 0; + + perf_do_callchain(regs, entry); + + return entry; +} -- cgit v1.1 From 4e935e47177c3b26cf383e79849bae2a464d0160 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 30 Mar 2009 19:07:16 +0200 Subject: perf_counter: pmc arbitration Follow the example set by powerpc and try to play nice with oprofile and the nmi watchdog. Signed-off-by: Peter Zijlstra Acked-by: Paul Mackerras Orig-LKML-Reference: <20090330171024.459968444@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 75 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index e16dfaf..2a946a1 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -20,6 +20,7 @@ #include #include +#include static bool perf_counters_initialized __read_mostly; @@ -172,6 +173,65 @@ again: atomic64_sub(delta, &hwc->period_left); } +static atomic_t num_counters; +static DEFINE_MUTEX(pmc_reserve_mutex); + +static bool reserve_pmc_hardware(void) +{ + int i; + + if (nmi_watchdog == NMI_LOCAL_APIC) + disable_lapic_nmi_watchdog(); + + for (i = 0; i < nr_counters_generic; i++) { + if (!reserve_perfctr_nmi(pmc_ops->perfctr + i)) + goto perfctr_fail; + } + + for (i = 0; i < nr_counters_generic; i++) { + if (!reserve_evntsel_nmi(pmc_ops->eventsel + i)) + goto eventsel_fail; + } + + return true; + +eventsel_fail: + for (i--; i >= 0; i--) + release_evntsel_nmi(pmc_ops->eventsel + i); + + i = nr_counters_generic; + +perfctr_fail: + for (i--; i >= 0; i--) + release_perfctr_nmi(pmc_ops->perfctr + i); + + if (nmi_watchdog == NMI_LOCAL_APIC) + enable_lapic_nmi_watchdog(); + + return false; +} + +static void release_pmc_hardware(void) +{ + int i; + + for (i = 0; i < nr_counters_generic; i++) { + release_perfctr_nmi(pmc_ops->perfctr + i); + release_evntsel_nmi(pmc_ops->eventsel + i); + } + + if (nmi_watchdog == NMI_LOCAL_APIC) + enable_lapic_nmi_watchdog(); +} + +static void hw_perf_counter_destroy(struct perf_counter *counter) +{ + if (atomic_dec_and_mutex_lock(&num_counters, &pmc_reserve_mutex)) { + release_pmc_hardware(); + mutex_unlock(&pmc_reserve_mutex); + } +} + /* * Setup the hardware configuration for a given hw_event_type */ @@ -179,10 +239,23 @@ static int __hw_perf_counter_init(struct perf_counter *counter) { struct perf_counter_hw_event *hw_event = &counter->hw_event; struct hw_perf_counter *hwc = &counter->hw; + int err; if (unlikely(!perf_counters_initialized)) return -EINVAL; + err = 0; + if (atomic_inc_not_zero(&num_counters)) { + mutex_lock(&pmc_reserve_mutex); + if (atomic_read(&num_counters) == 0 && !reserve_pmc_hardware()) + err = -EBUSY; + else + atomic_inc(&num_counters); + mutex_unlock(&pmc_reserve_mutex); + } + if (err) + return err; + /* * Generate PMC IRQs: * (keep 'enabled' bit clear for now) @@ -230,6 +303,8 @@ static int __hw_perf_counter_init(struct perf_counter *counter) hwc->config |= pmc_ops->event_map(perf_event_id(hw_event)); } + counter->destroy = hw_perf_counter_destroy; + return 0; } -- cgit v1.1 From 5872bdb88a35fae7d224bd6b21e5f377e854ccfc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 2 Apr 2009 11:12:03 +0200 Subject: perf_counter: add more context information Put in counts to tell which ips belong to what context. ----- | | hv | -- nr | | kernel | -- | | user ----- Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Orig-LKML-Reference: <20090402091319.493101305@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 2a946a1..c74e20d 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -1088,6 +1088,7 @@ perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry) { unsigned long bp; char *stack; + int nr = entry->nr; callchain_store(entry, instruction_pointer(regs)); @@ -1099,6 +1100,8 @@ perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry) #endif dump_trace(NULL, regs, (void *)stack, bp, &backtrace_ops, entry); + + entry->kernel = entry->nr - nr; } @@ -1128,6 +1131,7 @@ perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry) { struct stack_frame frame; const void __user *fp; + int nr = entry->nr; regs = (struct pt_regs *)current->thread.sp0 - 1; fp = (void __user *)regs->bp; @@ -1147,6 +1151,8 @@ perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry) callchain_store(entry, frame.return_address); fp = frame.next_fp; } + + entry->user = entry->nr - nr; } static void @@ -1182,6 +1188,9 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) entry = &__get_cpu_var(irq_entry); entry->nr = 0; + entry->hv = 0; + entry->kernel = 0; + entry->user = 0; perf_do_callchain(regs, entry); -- cgit v1.1 From b6276f353bf490add62dcf7db0ebd75baa3e1a37 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 6 Apr 2009 11:45:03 +0200 Subject: perf_counter: x86: self-IPI for pending work Implement set_perf_counter_pending() with a self-IPI so that it will run ASAP in a usable context. For now use a second IRQ vector, because the primary vector pokes the apic in funny ways that seem to confuse things. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090406094517.724626696@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 14 ++++++++++++++ arch/x86/kernel/entry_64.S | 2 ++ arch/x86/kernel/irq.c | 5 +++++ arch/x86/kernel/irqinit_32.c | 1 + arch/x86/kernel/irqinit_64.c | 1 + 5 files changed, 23 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index c74e20d..4384158 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -849,6 +849,20 @@ void smp_perf_counter_interrupt(struct pt_regs *regs) irq_exit(); } +void smp_perf_pending_interrupt(struct pt_regs *regs) +{ + irq_enter(); + ack_APIC_irq(); + inc_irq_stat(apic_pending_irqs); + perf_counter_do_pending(); + irq_exit(); +} + +void set_perf_counter_pending(void) +{ + apic->send_IPI_self(LOCAL_PENDING_VECTOR); +} + void perf_counters_lapic_init(int nmi) { u32 apic_val; diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 3f129d9..1d46cba 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1028,6 +1028,8 @@ apicinterrupt SPURIOUS_APIC_VECTOR \ #ifdef CONFIG_PERF_COUNTERS apicinterrupt LOCAL_PERF_VECTOR \ perf_counter_interrupt smp_perf_counter_interrupt +apicinterrupt LOCAL_PENDING_VECTOR \ + perf_pending_interrupt smp_perf_pending_interrupt #endif /* diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 9c27543..d465487 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -67,6 +67,10 @@ static int show_other_interrupts(struct seq_file *p, int prec) for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs); seq_printf(p, " Performance counter interrupts\n"); + seq_printf(p, "PND: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs); + seq_printf(p, " Performance pending work\n"); #endif if (generic_interrupt_extension) { seq_printf(p, "PLT: "); @@ -171,6 +175,7 @@ u64 arch_irq_stat_cpu(unsigned int cpu) sum += irq_stats(cpu)->apic_timer_irqs; sum += irq_stats(cpu)->irq_spurious_count; sum += irq_stats(cpu)->apic_perf_irqs; + sum += irq_stats(cpu)->apic_pending_irqs; #endif if (generic_interrupt_extension) sum += irq_stats(cpu)->generic_irqs; diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c index 925d87c..3190a6b 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c @@ -166,6 +166,7 @@ static void __init apic_intr_init(void) alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); # ifdef CONFIG_PERF_COUNTERS alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt); + alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt); # endif # ifdef CONFIG_X86_MCE_P4THERMAL diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c index 665e2ab..53ceb26 100644 --- a/arch/x86/kernel/irqinit_64.c +++ b/arch/x86/kernel/irqinit_64.c @@ -156,6 +156,7 @@ static void __init apic_intr_init(void) /* Performance monitoring interrupt: */ #ifdef CONFIG_PERF_COUNTERS alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt); + alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt); #endif } -- cgit v1.1 From f6c7d5fe58b4846ee0cb4b98b6042489705eced4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 6 Apr 2009 11:45:04 +0200 Subject: perf_counter: theres more to overflow than writing events Prepare for more generic overflow handling. The new perf_counter_overflow() method will handle the generic bits of the counter overflow, and can return a !0 return value, in which case the counter should be (soft) disabled, so that it won't count until it's properly disabled. XXX: do powerpc and swcounter Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090406094517.812109629@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 4384158..1116a41 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -800,7 +800,8 @@ again: continue; perf_save_and_restart(counter); - perf_counter_output(counter, nmi, regs); + if (perf_counter_overflow(counter, nmi, regs)) + __pmc_generic_disable(counter, &counter->hw, bit); } hw_perf_ack_status(ack); -- cgit v1.1 From cac94f979326212831c0ea44ed9ea1622b4f4e93 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 3 Apr 2009 16:43:33 +0200 Subject: x86, bts: fix race when bts tracer is removed When the bts tracer is removed while the traced task is running, the write to clear the bts tracer pointer races with context switch code. Read the tracer once during a context switch. When a new tracer is installed, the bts tracer is set in the ds context before the tracer is initialized in order to claim the context for that tracer. This may result in write accesses using an uninitialized trace configuration when scheduling timestamps have been requested. Store active tracing flags separately and only set active flags after the tracing configuration has been initialized. Reviewed-by: Oleg Nesterov Signed-off-by: Markus Metzger Cc: roland@redhat.com Cc: eranian@googlemail.com Cc: juan.villacis@intel.com Cc: ak@linux.jf.intel.com LKML-Reference: <20090403144548.881338000@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ds.c | 58 +++++++++++++++++++++++++++++++++------------------- 1 file changed, 37 insertions(+), 21 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index b1d6e1f..c730155 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -89,6 +89,9 @@ struct bts_tracer { /* Buffer overflow notification function: */ bts_ovfl_callback_t ovfl; + + /* Active flags affecting trace collection. */ + unsigned int flags; }; struct pebs_tracer { @@ -799,6 +802,8 @@ void ds_suspend_bts(struct bts_tracer *tracer) if (!tracer) return; + tracer->flags = 0; + task = tracer->ds.context->task; if (!task || (task == current)) @@ -820,6 +825,8 @@ void ds_resume_bts(struct bts_tracer *tracer) if (!tracer) return; + tracer->flags = tracer->trace.ds.flags; + task = tracer->ds.context->task; control = ds_cfg.ctl[dsf_bts]; @@ -1037,43 +1044,52 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) } } +static inline void ds_take_timestamp(struct ds_context *context, + enum bts_qualifier qualifier, + struct task_struct *task) +{ + struct bts_tracer *tracer = context->bts_master; + struct bts_struct ts; + + /* Prevent compilers from reading the tracer pointer twice. */ + barrier(); + + if (!tracer || !(tracer->flags & BTS_TIMESTAMPS)) + return; + + memset(&ts, 0, sizeof(ts)); + ts.qualifier = qualifier; + ts.variant.timestamp.jiffies = jiffies_64; + ts.variant.timestamp.pid = task->pid; + + bts_write(tracer, &ts); +} + /* * Change the DS configuration from tracing prev to tracing next. */ void ds_switch_to(struct task_struct *prev, struct task_struct *next) { - struct ds_context *prev_ctx = prev->thread.ds_ctx; - struct ds_context *next_ctx = next->thread.ds_ctx; + struct ds_context *prev_ctx = prev->thread.ds_ctx; + struct ds_context *next_ctx = next->thread.ds_ctx; + unsigned long debugctlmsr = next->thread.debugctlmsr; + + /* Make sure all data is read before we start. */ + barrier(); if (prev_ctx) { update_debugctlmsr(0); - if (prev_ctx->bts_master && - (prev_ctx->bts_master->trace.ds.flags & BTS_TIMESTAMPS)) { - struct bts_struct ts = { - .qualifier = bts_task_departs, - .variant.timestamp.jiffies = jiffies_64, - .variant.timestamp.pid = prev->pid - }; - bts_write(prev_ctx->bts_master, &ts); - } + ds_take_timestamp(prev_ctx, bts_task_departs, prev); } if (next_ctx) { - if (next_ctx->bts_master && - (next_ctx->bts_master->trace.ds.flags & BTS_TIMESTAMPS)) { - struct bts_struct ts = { - .qualifier = bts_task_arrives, - .variant.timestamp.jiffies = jiffies_64, - .variant.timestamp.pid = next->pid - }; - bts_write(next_ctx->bts_master, &ts); - } + ds_take_timestamp(next_ctx, bts_task_arrives, next); wrmsrl(MSR_IA32_DS_AREA, (unsigned long)next_ctx->ds); } - update_debugctlmsr(next->thread.debugctlmsr); + update_debugctlmsr(debugctlmsr); } void ds_copy_thread(struct task_struct *tsk, struct task_struct *father) -- cgit v1.1 From e2b371f00a6f529f6362654239bdec8dcd510760 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 3 Apr 2009 16:43:35 +0200 Subject: mm, x86, ptrace, bts: defer branch trace stopping When a ptraced task is unlinked, we need to stop branch tracing for that task. Since the unlink is called with interrupts disabled, and we need interrupts enabled to stop branch tracing, we defer the work. Collect all branch tracing related stuff in a branch tracing context. Reviewed-by: Oleg Nesterov Signed-off-by: Markus Metzger Cc: Andrew Morton Cc: Peter Zijlstra Cc: roland@redhat.com Cc: eranian@googlemail.com Cc: juan.villacis@intel.com Cc: ak@linux.jf.intel.com LKML-Reference: <20090403144550.712401000@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ptrace.c | 254 +++++++++++++++++++++++++++++++---------------- 1 file changed, 169 insertions(+), 85 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index fe9345c..7c21d1e 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -577,17 +578,119 @@ static int ioperm_get(struct task_struct *target, } #ifdef CONFIG_X86_PTRACE_BTS +/* + * A branch trace store context. + * + * Contexts may only be installed by ptrace_bts_config() and only for + * ptraced tasks. + * + * Contexts are destroyed when the tracee is detached from the tracer. + * The actual destruction work requires interrupts enabled, so the + * work is deferred and will be scheduled during __ptrace_unlink(). + * + * Contexts hold an additional task_struct reference on the traced + * task, as well as a reference on the tracer's mm. + * + * Ptrace already holds a task_struct for the duration of ptrace operations, + * but since destruction is deferred, it may be executed after both + * tracer and tracee exited. + */ +struct bts_context { + /* The branch trace handle. */ + struct bts_tracer *tracer; + + /* The buffer used to store the branch trace and its size. */ + void *buffer; + unsigned int size; + + /* The mm that paid for the above buffer. */ + struct mm_struct *mm; + + /* The task this context belongs to. */ + struct task_struct *task; + + /* The signal to send on a bts buffer overflow. */ + unsigned int bts_ovfl_signal; + + /* The work struct to destroy a context. */ + struct work_struct work; +}; + +static inline void alloc_bts_buffer(struct bts_context *context, + unsigned int size) +{ + void *buffer; + + buffer = alloc_locked_buffer(size); + if (buffer) { + context->buffer = buffer; + context->size = size; + context->mm = get_task_mm(current); + } +} + +static inline void free_bts_buffer(struct bts_context *context) +{ + if (!context->buffer) + return; + + kfree(context->buffer); + context->buffer = NULL; + + refund_locked_buffer_memory(context->mm, context->size); + context->size = 0; + + mmput(context->mm); + context->mm = NULL; +} + +static void free_bts_context_work(struct work_struct *w) +{ + struct bts_context *context; + + context = container_of(w, struct bts_context, work); + + ds_release_bts(context->tracer); + put_task_struct(context->task); + free_bts_buffer(context); + kfree(context); +} + +static inline void free_bts_context(struct bts_context *context) +{ + INIT_WORK(&context->work, free_bts_context_work); + schedule_work(&context->work); +} + +static inline struct bts_context *alloc_bts_context(struct task_struct *task) +{ + struct bts_context *context = kzalloc(sizeof(*context), GFP_KERNEL); + if (context) { + context->task = task; + task->bts = context; + + get_task_struct(task); + } + + return context; +} + static int ptrace_bts_read_record(struct task_struct *child, size_t index, struct bts_struct __user *out) { + struct bts_context *context; const struct bts_trace *trace; struct bts_struct bts; const unsigned char *at; int error; - trace = ds_read_bts(child->bts); + context = child->bts; + if (!context) + return -ESRCH; + + trace = ds_read_bts(context->tracer); if (!trace) - return -EPERM; + return -ESRCH; at = trace->ds.top - ((index + 1) * trace->ds.size); if ((void *)at < trace->ds.begin) @@ -596,7 +699,7 @@ static int ptrace_bts_read_record(struct task_struct *child, size_t index, if (!trace->read) return -EOPNOTSUPP; - error = trace->read(child->bts, at, &bts); + error = trace->read(context->tracer, at, &bts); if (error < 0) return error; @@ -610,13 +713,18 @@ static int ptrace_bts_drain(struct task_struct *child, long size, struct bts_struct __user *out) { + struct bts_context *context; const struct bts_trace *trace; const unsigned char *at; int error, drained = 0; - trace = ds_read_bts(child->bts); + context = child->bts; + if (!context) + return -ESRCH; + + trace = ds_read_bts(context->tracer); if (!trace) - return -EPERM; + return -ESRCH; if (!trace->read) return -EOPNOTSUPP; @@ -627,9 +735,8 @@ static int ptrace_bts_drain(struct task_struct *child, for (at = trace->ds.begin; (void *)at < trace->ds.top; out++, drained++, at += trace->ds.size) { struct bts_struct bts; - int error; - error = trace->read(child->bts, at, &bts); + error = trace->read(context->tracer, at, &bts); if (error < 0) return error; @@ -639,35 +746,18 @@ static int ptrace_bts_drain(struct task_struct *child, memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size); - error = ds_reset_bts(child->bts); + error = ds_reset_bts(context->tracer); if (error < 0) return error; return drained; } -static int ptrace_bts_allocate_buffer(struct task_struct *child, size_t size) -{ - child->bts_buffer = alloc_locked_buffer(size); - if (!child->bts_buffer) - return -ENOMEM; - - child->bts_size = size; - - return 0; -} - -static void ptrace_bts_free_buffer(struct task_struct *child) -{ - free_locked_buffer(child->bts_buffer, child->bts_size); - child->bts_buffer = NULL; - child->bts_size = 0; -} - static int ptrace_bts_config(struct task_struct *child, long cfg_size, const struct ptrace_bts_config __user *ucfg) { + struct bts_context *context; struct ptrace_bts_config cfg; unsigned int flags = 0; @@ -677,28 +767,31 @@ static int ptrace_bts_config(struct task_struct *child, if (copy_from_user(&cfg, ucfg, sizeof(cfg))) return -EFAULT; - if (child->bts) { - ds_release_bts(child->bts); - child->bts = NULL; - } + context = child->bts; + if (!context) + context = alloc_bts_context(child); + if (!context) + return -ENOMEM; if (cfg.flags & PTRACE_BTS_O_SIGNAL) { if (!cfg.signal) return -EINVAL; - child->thread.bts_ovfl_signal = cfg.signal; return -EOPNOTSUPP; + context->bts_ovfl_signal = cfg.signal; } - if ((cfg.flags & PTRACE_BTS_O_ALLOC) && - (cfg.size != child->bts_size)) { - int error; + ds_release_bts(context->tracer); + context->tracer = NULL; - ptrace_bts_free_buffer(child); + if ((cfg.flags & PTRACE_BTS_O_ALLOC) && (cfg.size != context->size)) { + free_bts_buffer(context); + if (!cfg.size) + return 0; - error = ptrace_bts_allocate_buffer(child, cfg.size); - if (error < 0) - return error; + alloc_bts_buffer(context, cfg.size); + if (!context->buffer) + return -ENOMEM; } if (cfg.flags & PTRACE_BTS_O_TRACE) @@ -707,15 +800,13 @@ static int ptrace_bts_config(struct task_struct *child, if (cfg.flags & PTRACE_BTS_O_SCHED) flags |= BTS_TIMESTAMPS; - child->bts = ds_request_bts(child, child->bts_buffer, child->bts_size, - /* ovfl = */ NULL, /* th = */ (size_t)-1, - flags); - if (IS_ERR(child->bts)) { - int error = PTR_ERR(child->bts); - - ptrace_bts_free_buffer(child); - child->bts = NULL; + context->tracer = ds_request_bts(child, context->buffer, context->size, + NULL, (size_t)-1, flags); + if (unlikely(IS_ERR(context->tracer))) { + int error = PTR_ERR(context->tracer); + free_bts_buffer(context); + context->tracer = NULL; return error; } @@ -726,20 +817,25 @@ static int ptrace_bts_status(struct task_struct *child, long cfg_size, struct ptrace_bts_config __user *ucfg) { + struct bts_context *context; const struct bts_trace *trace; struct ptrace_bts_config cfg; + context = child->bts; + if (!context) + return -ESRCH; + if (cfg_size < sizeof(cfg)) return -EIO; - trace = ds_read_bts(child->bts); + trace = ds_read_bts(context->tracer); if (!trace) - return -EPERM; + return -ESRCH; memset(&cfg, 0, sizeof(cfg)); - cfg.size = trace->ds.end - trace->ds.begin; - cfg.signal = child->thread.bts_ovfl_signal; - cfg.bts_size = sizeof(struct bts_struct); + cfg.size = trace->ds.end - trace->ds.begin; + cfg.signal = context->bts_ovfl_signal; + cfg.bts_size = sizeof(struct bts_struct); if (cfg.signal) cfg.flags |= PTRACE_BTS_O_SIGNAL; @@ -758,67 +854,56 @@ static int ptrace_bts_status(struct task_struct *child, static int ptrace_bts_clear(struct task_struct *child) { + struct bts_context *context; const struct bts_trace *trace; - trace = ds_read_bts(child->bts); + context = child->bts; + if (!context) + return -ESRCH; + + trace = ds_read_bts(context->tracer); if (!trace) - return -EPERM; + return -ESRCH; memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size); - return ds_reset_bts(child->bts); + return ds_reset_bts(context->tracer); } static int ptrace_bts_size(struct task_struct *child) { + struct bts_context *context; const struct bts_trace *trace; - trace = ds_read_bts(child->bts); + context = child->bts; + if (!context) + return -ESRCH; + + trace = ds_read_bts(context->tracer); if (!trace) - return -EPERM; + return -ESRCH; return (trace->ds.top - trace->ds.begin) / trace->ds.size; } -static void ptrace_bts_fork(struct task_struct *tsk) +static inline void ptrace_bts_fork(struct task_struct *tsk) { tsk->bts = NULL; - tsk->bts_buffer = NULL; - tsk->bts_size = 0; - tsk->thread.bts_ovfl_signal = 0; } -static void ptrace_bts_untrace(struct task_struct *child) +/* + * Called from __ptrace_unlink() after the child has been moved back + * to its original parent. + */ +static inline void ptrace_bts_untrace(struct task_struct *child) { if (unlikely(child->bts)) { - ds_release_bts(child->bts); + free_bts_context(child->bts); child->bts = NULL; - - /* We cannot update total_vm and locked_vm since - child's mm is already gone. But we can reclaim the - memory. */ - kfree(child->bts_buffer); - child->bts_buffer = NULL; - child->bts_size = 0; } } - -static void ptrace_bts_detach(struct task_struct *child) -{ - /* - * Ptrace_detach() races with ptrace_untrace() in case - * the child dies and is reaped by another thread. - * - * We only do the memory accounting at this point and - * leave the buffer deallocation and the bts tracer - * release to ptrace_bts_untrace() which will be called - * later on with tasklist_lock held. - */ - release_locked_buffer(child->bts_buffer, child->bts_size); -} #else static inline void ptrace_bts_fork(struct task_struct *tsk) {} -static inline void ptrace_bts_detach(struct task_struct *child) {} static inline void ptrace_bts_untrace(struct task_struct *child) {} #endif /* CONFIG_X86_PTRACE_BTS */ @@ -843,7 +928,6 @@ void ptrace_disable(struct task_struct *child) #ifdef TIF_SYSCALL_EMU clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); #endif - ptrace_bts_detach(child); } #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION -- cgit v1.1 From 8d99b3ac2726e5edd97ad147fa5c1f2acb63a745 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 3 Apr 2009 16:43:36 +0200 Subject: x86, bts: wait until traced task has been scheduled out In order to stop branch tracing for a running task, we need to first clear the branch tracing control bits before we may free the tracing buffer. If the traced task is running, the cpu might still trace that task after the branch trace control bits have cleared. Wait until the traced task has been scheduled out before proceeding. A similar problem affects the task debug store context. We first remove the context, then we need to wait until the task has been scheduled out before we can free the context memory. Reviewed-by: Oleg Nesterov Signed-off-by: Markus Metzger Cc: roland@redhat.com Cc: eranian@googlemail.com Cc: juan.villacis@intel.com Cc: ak@linux.jf.intel.com LKML-Reference: <20090403144551.919636000@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ds.c | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index c730155..5cd137a 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -299,6 +299,7 @@ static inline struct ds_context *ds_get_context(struct task_struct *task) static inline void ds_put_context(struct ds_context *context) { + struct task_struct *task; unsigned long irq; if (!context) @@ -313,14 +314,20 @@ static inline void ds_put_context(struct ds_context *context) *(context->this) = NULL; - if (context->task) - clear_tsk_thread_flag(context->task, TIF_DS_AREA_MSR); + task = context->task; + + if (task) + clear_tsk_thread_flag(task, TIF_DS_AREA_MSR); - if (!context->task || (context->task == current)) + if (!task || (task == current)) wrmsrl(MSR_IA32_DS_AREA, 0); spin_unlock_irqrestore(&ds_lock, irq); + /* The context might still be in use for context switching. */ + if (task && (task != current)) + wait_task_context_switch(task); + kfree(context); } @@ -781,15 +788,23 @@ struct pebs_tracer *ds_request_pebs(struct task_struct *task, void ds_release_bts(struct bts_tracer *tracer) { + struct task_struct *task; + if (!tracer) return; + task = tracer->ds.context->task; + ds_suspend_bts(tracer); WARN_ON_ONCE(tracer->ds.context->bts_master != tracer); tracer->ds.context->bts_master = NULL; - put_tracer(tracer->ds.context->task); + /* Make sure tracing stopped and the tracer is not in use. */ + if (task && (task != current)) + wait_task_context_switch(task); + + put_tracer(task); ds_put_context(tracer->ds.context); kfree(tracer); -- cgit v1.1 From 38f801129ad07b9afa7f9bd3779f61b805416d8c Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 3 Apr 2009 16:43:37 +0200 Subject: x86, bts: fix race between per-task and per-cpu branch tracing Per-task branch tracing installs a debug store context with the traced task. This immediately results in the branch trace control bits to be cleared for the next context switch of that task, if not set before. Either per-cpu or per-task tracing are allowed at the same time. An active per-cpu tracing would be disabled even if the per-task tracing request is rejected and the task debug store context removed. Check the tracing type (per-cpu or per-task) before installing a task debug store context. Signed-off-by: Markus Metzger Cc: roland@redhat.com Cc: eranian@googlemail.com Cc: oleg@redhat.com Cc: juan.villacis@intel.com Cc: ak@linux.jf.intel.com LKML-Reference: <20090403144552.856000000@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ds.c | 72 ++++++++++++++++++++++++++++++---------------------- 1 file changed, 41 insertions(+), 31 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index 5cd137a..f03f117 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -193,12 +193,28 @@ static DEFINE_SPINLOCK(ds_lock); */ static atomic_t tracers = ATOMIC_INIT(0); -static inline void get_tracer(struct task_struct *task) +static inline int get_tracer(struct task_struct *task) { - if (task) + int error; + + spin_lock_irq(&ds_lock); + + if (task) { + error = -EPERM; + if (atomic_read(&tracers) < 0) + goto out; atomic_inc(&tracers); - else + } else { + error = -EPERM; + if (atomic_read(&tracers) > 0) + goto out; atomic_dec(&tracers); + } + + error = 0; +out: + spin_unlock_irq(&ds_lock); + return error; } static inline void put_tracer(struct task_struct *task) @@ -209,14 +225,6 @@ static inline void put_tracer(struct task_struct *task) atomic_inc(&tracers); } -static inline int check_tracer(struct task_struct *task) -{ - return task ? - (atomic_read(&tracers) >= 0) : - (atomic_read(&tracers) <= 0); -} - - /* * The DS context is either attached to a thread or to a cpu: * - in the former case, the thread_struct contains a pointer to the @@ -677,6 +685,10 @@ struct bts_tracer *ds_request_bts(struct task_struct *task, if (ovfl) goto out; + error = get_tracer(task); + if (error < 0) + goto out; + /* * Per-cpu tracing is typically requested using smp_call_function(). * We must not sleep. @@ -684,7 +696,7 @@ struct bts_tracer *ds_request_bts(struct task_struct *task, error = -ENOMEM; tracer = kzalloc(sizeof(*tracer), GFP_ATOMIC); if (!tracer) - goto out; + goto out_put_tracer; tracer->ovfl = ovfl; error = ds_request(&tracer->ds, &tracer->trace.ds, @@ -696,13 +708,8 @@ struct bts_tracer *ds_request_bts(struct task_struct *task, spin_lock_irqsave(&ds_lock, irq); error = -EPERM; - if (!check_tracer(task)) - goto out_unlock; - get_tracer(task); - - error = -EPERM; if (tracer->ds.context->bts_master) - goto out_put_tracer; + goto out_unlock; tracer->ds.context->bts_master = tracer; spin_unlock_irqrestore(&ds_lock, irq); @@ -716,13 +723,13 @@ struct bts_tracer *ds_request_bts(struct task_struct *task, return tracer; - out_put_tracer: - put_tracer(task); out_unlock: spin_unlock_irqrestore(&ds_lock, irq); ds_put_context(tracer->ds.context); out_tracer: kfree(tracer); + out_put_tracer: + put_tracer(task); out: return ERR_PTR(error); } @@ -741,6 +748,10 @@ struct pebs_tracer *ds_request_pebs(struct task_struct *task, if (ovfl) goto out; + error = get_tracer(task); + if (error < 0) + goto out; + /* * Per-cpu tracing is typically requested using smp_call_function(). * We must not sleep. @@ -748,7 +759,7 @@ struct pebs_tracer *ds_request_pebs(struct task_struct *task, error = -ENOMEM; tracer = kzalloc(sizeof(*tracer), GFP_ATOMIC); if (!tracer) - goto out; + goto out_put_tracer; tracer->ovfl = ovfl; error = ds_request(&tracer->ds, &tracer->trace.ds, @@ -759,13 +770,8 @@ struct pebs_tracer *ds_request_pebs(struct task_struct *task, spin_lock_irqsave(&ds_lock, irq); error = -EPERM; - if (!check_tracer(task)) - goto out_unlock; - get_tracer(task); - - error = -EPERM; if (tracer->ds.context->pebs_master) - goto out_put_tracer; + goto out_unlock; tracer->ds.context->pebs_master = tracer; spin_unlock_irqrestore(&ds_lock, irq); @@ -775,13 +781,13 @@ struct pebs_tracer *ds_request_pebs(struct task_struct *task, return tracer; - out_put_tracer: - put_tracer(task); out_unlock: spin_unlock_irqrestore(&ds_lock, irq); ds_put_context(tracer->ds.context); out_tracer: kfree(tracer); + out_put_tracer: + put_tracer(task); out: return ERR_PTR(error); } @@ -804,8 +810,8 @@ void ds_release_bts(struct bts_tracer *tracer) if (task && (task != current)) wait_task_context_switch(task); - put_tracer(task); ds_put_context(tracer->ds.context); + put_tracer(task); kfree(tracer); } @@ -861,16 +867,20 @@ void ds_resume_bts(struct bts_tracer *tracer) void ds_release_pebs(struct pebs_tracer *tracer) { + struct task_struct *task; + if (!tracer) return; + task = tracer->ds.context->task; + ds_suspend_pebs(tracer); WARN_ON_ONCE(tracer->ds.context->pebs_master != tracer); tracer->ds.context->pebs_master = NULL; - put_tracer(tracer->ds.context->task); ds_put_context(tracer->ds.context); + put_tracer(task); kfree(tracer); } -- cgit v1.1 From 15879d042164650b93d83281ad5f87ad323bfbfe Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 3 Apr 2009 16:43:38 +0200 Subject: x86, bts: use trace_clock_global() for timestamps Rename the bts_struct timestamp field to event. Use trace_clock_global() for time measurement. Reported-by: Ingo Molnar Signed-off-by: Markus Metzger Cc: roland@redhat.com Cc: eranian@googlemail.com Cc: oleg@redhat.com Cc: juan.villacis@intel.com Cc: ak@linux.jf.intel.com LKML-Reference: <20090403144553.773216000@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ds.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index f03f117..2071b99 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -25,6 +25,7 @@ #include #include #include +#include #include @@ -471,7 +472,7 @@ enum bts_field { bts_flags, bts_qual = bts_from, - bts_jiffies = bts_to, + bts_clock = bts_to, bts_pid = bts_flags, bts_qual_mask = (bts_qual_max - 1), @@ -517,8 +518,8 @@ bts_read(struct bts_tracer *tracer, const void *at, struct bts_struct *out) memset(out, 0, sizeof(*out)); if ((bts_get(at, bts_qual) & ~bts_qual_mask) == bts_escape) { out->qualifier = (bts_get(at, bts_qual) & bts_qual_mask); - out->variant.timestamp.jiffies = bts_get(at, bts_jiffies); - out->variant.timestamp.pid = bts_get(at, bts_pid); + out->variant.event.clock = bts_get(at, bts_clock); + out->variant.event.pid = bts_get(at, bts_pid); } else { out->qualifier = bts_branch; out->variant.lbr.from = bts_get(at, bts_from); @@ -555,8 +556,8 @@ static int bts_write(struct bts_tracer *tracer, const struct bts_struct *in) case bts_task_arrives: case bts_task_departs: bts_set(raw, bts_qual, (bts_escape | in->qualifier)); - bts_set(raw, bts_jiffies, in->variant.timestamp.jiffies); - bts_set(raw, bts_pid, in->variant.timestamp.pid); + bts_set(raw, bts_clock, in->variant.event.clock); + bts_set(raw, bts_pid, in->variant.event.pid); break; default: return -EINVAL; @@ -1083,9 +1084,9 @@ static inline void ds_take_timestamp(struct ds_context *context, return; memset(&ts, 0, sizeof(ts)); - ts.qualifier = qualifier; - ts.variant.timestamp.jiffies = jiffies_64; - ts.variant.timestamp.pid = task->pid; + ts.qualifier = qualifier; + ts.variant.event.clock = trace_clock_global(); + ts.variant.event.pid = task->pid; bts_write(tracer, &ts); } -- cgit v1.1 From de79f54f5347ad7ec6ff55ccbb6d4ab2a21f6a93 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 3 Apr 2009 16:43:40 +0200 Subject: x86, bts, hw-branch-tracer: add _noirq variants to the debug store interface The hw-branch-tracer uses debug store functions from an on_each_cpu() context, which is simply wrong since the functions may sleep. Add _noirq variants for most functions, which may be called with interrupts disabled. Separate per-cpu and per-task tracing and allow per-cpu tracing to be controlled from any cpu. Make the hw-branch-tracer use the new debug store interface, synchronize with hotplug cpu event using get/put_online_cpus(), and remove the unnecessary spinlock. Make the ptrace bts and the ds selftest code use the new interface. Defer the ds selftest. Signed-off-by: Markus Metzger Cc: roland@redhat.com Cc: eranian@googlemail.com Cc: oleg@redhat.com Cc: juan.villacis@intel.com Cc: ak@linux.jf.intel.com LKML-Reference: <20090403144555.658136000@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ds.c | 474 ++++++++++++++++++++++++++++++++---------- arch/x86/kernel/ds_selftest.c | 9 +- arch/x86/kernel/ptrace.c | 5 +- 3 files changed, 375 insertions(+), 113 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index 2071b99..21a3852 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -245,60 +245,50 @@ struct ds_context { struct pebs_tracer *pebs_master; /* Use count: */ - unsigned long count; + unsigned long count; /* Pointer to the context pointer field: */ struct ds_context **this; - /* The traced task; NULL for current cpu: */ + /* The traced task; NULL for cpu tracing: */ struct task_struct *task; -}; -static DEFINE_PER_CPU(struct ds_context *, system_context_array); + /* The traced cpu; only valid if task is NULL: */ + int cpu; +}; -#define system_context per_cpu(system_context_array, smp_processor_id()) +static DEFINE_PER_CPU(struct ds_context *, cpu_context); -static inline struct ds_context *ds_get_context(struct task_struct *task) +static struct ds_context *ds_get_context(struct task_struct *task, int cpu) { struct ds_context **p_context = - (task ? &task->thread.ds_ctx : &system_context); + (task ? &task->thread.ds_ctx : &per_cpu(cpu_context, cpu)); struct ds_context *context = NULL; struct ds_context *new_context = NULL; - unsigned long irq; - /* - * Chances are small that we already have a context. - * - * Contexts for per-cpu tracing are allocated using - * smp_call_function(). We must not sleep. - */ - new_context = kzalloc(sizeof(*new_context), GFP_ATOMIC); + /* Chances are small that we already have a context. */ + new_context = kzalloc(sizeof(*new_context), GFP_KERNEL); if (!new_context) return NULL; - spin_lock_irqsave(&ds_lock, irq); + spin_lock_irq(&ds_lock); context = *p_context; - if (!context) { + if (likely(!context)) { context = new_context; context->this = p_context; context->task = task; + context->cpu = cpu; context->count = 0; - if (task) - set_tsk_thread_flag(task, TIF_DS_AREA_MSR); - - if (!task || (task == current)) - wrmsrl(MSR_IA32_DS_AREA, (unsigned long)context->ds); - *p_context = context; } context->count++; - spin_unlock_irqrestore(&ds_lock, irq); + spin_unlock_irq(&ds_lock); if (context != new_context) kfree(new_context); @@ -306,7 +296,7 @@ static inline struct ds_context *ds_get_context(struct task_struct *task) return context; } -static inline void ds_put_context(struct ds_context *context) +static void ds_put_context(struct ds_context *context) { struct task_struct *task; unsigned long irq; @@ -328,8 +318,15 @@ static inline void ds_put_context(struct ds_context *context) if (task) clear_tsk_thread_flag(task, TIF_DS_AREA_MSR); - if (!task || (task == current)) - wrmsrl(MSR_IA32_DS_AREA, 0); + /* + * We leave the (now dangling) pointer to the DS configuration in + * the DS_AREA msr. This is as good or as bad as replacing it with + * NULL - the hardware would crash if we enabled tracing. + * + * This saves us some problems with having to write an msr on a + * different cpu while preventing others from doing the same for the + * next context for that same cpu. + */ spin_unlock_irqrestore(&ds_lock, irq); @@ -340,6 +337,31 @@ static inline void ds_put_context(struct ds_context *context) kfree(context); } +static void ds_install_ds_area(struct ds_context *context) +{ + unsigned long ds; + + ds = (unsigned long)context->ds; + + /* + * There is a race between the bts master and the pebs master. + * + * The thread/cpu access is synchronized via get/put_cpu() for + * task tracing and via wrmsr_on_cpu for cpu tracing. + * + * If bts and pebs are collected for the same task or same cpu, + * the same confiuration is written twice. + */ + if (context->task) { + get_cpu(); + if (context->task == current) + wrmsrl(MSR_IA32_DS_AREA, ds); + set_tsk_thread_flag(context->task, TIF_DS_AREA_MSR); + put_cpu(); + } else + wrmsr_on_cpu(context->cpu, MSR_IA32_DS_AREA, + (u32)((u64)ds), (u32)((u64)ds >> 32)); +} /* * Call the tracer's callback on a buffer overflow. @@ -622,6 +644,7 @@ static void ds_init_ds_trace(struct ds_trace *trace, enum ds_qualifier qual, * The value for 'no threshold' is -1, which will set the * threshold outside of the buffer, just like we want it. */ + ith *= ds_cfg.sizeof_rec[qual]; trace->ith = (void *)(buffer + size - ith); trace->flags = flags; @@ -630,7 +653,7 @@ static void ds_init_ds_trace(struct ds_trace *trace, enum ds_qualifier qual, static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace, enum ds_qualifier qual, struct task_struct *task, - void *base, size_t size, size_t th, unsigned int flags) + int cpu, void *base, size_t size, size_t th) { struct ds_context *context; int error; @@ -643,7 +666,7 @@ static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace, if (!base) goto out; - /* We require some space to do alignment adjustments below. */ + /* We need space for alignment adjustments in ds_init_ds_trace(). */ error = -EINVAL; if (size < (DS_ALIGNMENT + ds_cfg.sizeof_rec[qual])) goto out; @@ -660,25 +683,27 @@ static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace, tracer->size = size; error = -ENOMEM; - context = ds_get_context(task); + context = ds_get_context(task, cpu); if (!context) goto out; tracer->context = context; - ds_init_ds_trace(trace, qual, base, size, th, flags); + /* + * Defer any tracer-specific initialization work for the context until + * context ownership has been clarified. + */ error = 0; out: return error; } -struct bts_tracer *ds_request_bts(struct task_struct *task, - void *base, size_t size, - bts_ovfl_callback_t ovfl, size_t th, - unsigned int flags) +static struct bts_tracer *ds_request_bts(struct task_struct *task, int cpu, + void *base, size_t size, + bts_ovfl_callback_t ovfl, size_t th, + unsigned int flags) { struct bts_tracer *tracer; - unsigned long irq; int error; /* Buffer overflow notification is not yet implemented. */ @@ -690,42 +715,46 @@ struct bts_tracer *ds_request_bts(struct task_struct *task, if (error < 0) goto out; - /* - * Per-cpu tracing is typically requested using smp_call_function(). - * We must not sleep. - */ error = -ENOMEM; - tracer = kzalloc(sizeof(*tracer), GFP_ATOMIC); + tracer = kzalloc(sizeof(*tracer), GFP_KERNEL); if (!tracer) goto out_put_tracer; tracer->ovfl = ovfl; + /* Do some more error checking and acquire a tracing context. */ error = ds_request(&tracer->ds, &tracer->trace.ds, - ds_bts, task, base, size, th, flags); + ds_bts, task, cpu, base, size, th); if (error < 0) goto out_tracer; - - spin_lock_irqsave(&ds_lock, irq); + /* Claim the bts part of the tracing context we acquired above. */ + spin_lock_irq(&ds_lock); error = -EPERM; if (tracer->ds.context->bts_master) goto out_unlock; tracer->ds.context->bts_master = tracer; - spin_unlock_irqrestore(&ds_lock, irq); + spin_unlock_irq(&ds_lock); + /* + * Now that we own the bts part of the context, let's complete the + * initialization for that part. + */ + ds_init_ds_trace(&tracer->trace.ds, ds_bts, base, size, th, flags); + ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts); + ds_install_ds_area(tracer->ds.context); tracer->trace.read = bts_read; tracer->trace.write = bts_write; - ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts); + /* Start tracing. */ ds_resume_bts(tracer); return tracer; out_unlock: - spin_unlock_irqrestore(&ds_lock, irq); + spin_unlock_irq(&ds_lock); ds_put_context(tracer->ds.context); out_tracer: kfree(tracer); @@ -735,13 +764,27 @@ struct bts_tracer *ds_request_bts(struct task_struct *task, return ERR_PTR(error); } -struct pebs_tracer *ds_request_pebs(struct task_struct *task, - void *base, size_t size, - pebs_ovfl_callback_t ovfl, size_t th, - unsigned int flags) +struct bts_tracer *ds_request_bts_task(struct task_struct *task, + void *base, size_t size, + bts_ovfl_callback_t ovfl, + size_t th, unsigned int flags) +{ + return ds_request_bts(task, 0, base, size, ovfl, th, flags); +} + +struct bts_tracer *ds_request_bts_cpu(int cpu, void *base, size_t size, + bts_ovfl_callback_t ovfl, + size_t th, unsigned int flags) +{ + return ds_request_bts(NULL, cpu, base, size, ovfl, th, flags); +} + +static struct pebs_tracer *ds_request_pebs(struct task_struct *task, int cpu, + void *base, size_t size, + pebs_ovfl_callback_t ovfl, size_t th, + unsigned int flags) { struct pebs_tracer *tracer; - unsigned long irq; int error; /* Buffer overflow notification is not yet implemented. */ @@ -753,37 +796,43 @@ struct pebs_tracer *ds_request_pebs(struct task_struct *task, if (error < 0) goto out; - /* - * Per-cpu tracing is typically requested using smp_call_function(). - * We must not sleep. - */ error = -ENOMEM; - tracer = kzalloc(sizeof(*tracer), GFP_ATOMIC); + tracer = kzalloc(sizeof(*tracer), GFP_KERNEL); if (!tracer) goto out_put_tracer; tracer->ovfl = ovfl; + /* Do some more error checking and acquire a tracing context. */ error = ds_request(&tracer->ds, &tracer->trace.ds, - ds_pebs, task, base, size, th, flags); + ds_pebs, task, cpu, base, size, th); if (error < 0) goto out_tracer; - spin_lock_irqsave(&ds_lock, irq); + /* Claim the pebs part of the tracing context we acquired above. */ + spin_lock_irq(&ds_lock); error = -EPERM; if (tracer->ds.context->pebs_master) goto out_unlock; tracer->ds.context->pebs_master = tracer; - spin_unlock_irqrestore(&ds_lock, irq); + spin_unlock_irq(&ds_lock); + /* + * Now that we own the pebs part of the context, let's complete the + * initialization for that part. + */ + ds_init_ds_trace(&tracer->trace.ds, ds_pebs, base, size, th, flags); ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_pebs); + ds_install_ds_area(tracer->ds.context); + + /* Start tracing. */ ds_resume_pebs(tracer); return tracer; out_unlock: - spin_unlock_irqrestore(&ds_lock, irq); + spin_unlock_irq(&ds_lock); ds_put_context(tracer->ds.context); out_tracer: kfree(tracer); @@ -793,16 +842,26 @@ struct pebs_tracer *ds_request_pebs(struct task_struct *task, return ERR_PTR(error); } -void ds_release_bts(struct bts_tracer *tracer) +struct pebs_tracer *ds_request_pebs_task(struct task_struct *task, + void *base, size_t size, + pebs_ovfl_callback_t ovfl, + size_t th, unsigned int flags) { - struct task_struct *task; + return ds_request_pebs(task, 0, base, size, ovfl, th, flags); +} - if (!tracer) - return; +struct pebs_tracer *ds_request_pebs_cpu(int cpu, void *base, size_t size, + pebs_ovfl_callback_t ovfl, + size_t th, unsigned int flags) +{ + return ds_request_pebs(NULL, cpu, base, size, ovfl, th, flags); +} - task = tracer->ds.context->task; +static void ds_free_bts(struct bts_tracer *tracer) +{ + struct task_struct *task; - ds_suspend_bts(tracer); + task = tracer->ds.context->task; WARN_ON_ONCE(tracer->ds.context->bts_master != tracer); tracer->ds.context->bts_master = NULL; @@ -817,9 +876,69 @@ void ds_release_bts(struct bts_tracer *tracer) kfree(tracer); } +void ds_release_bts(struct bts_tracer *tracer) +{ + might_sleep(); + + if (!tracer) + return; + + ds_suspend_bts(tracer); + ds_free_bts(tracer); +} + +int ds_release_bts_noirq(struct bts_tracer *tracer) +{ + struct task_struct *task; + unsigned long irq; + int error; + + if (!tracer) + return 0; + + task = tracer->ds.context->task; + + local_irq_save(irq); + + error = -EPERM; + if (!task && + (tracer->ds.context->cpu != smp_processor_id())) + goto out; + + error = -EPERM; + if (task && (task != current)) + goto out; + + ds_suspend_bts_noirq(tracer); + ds_free_bts(tracer); + + error = 0; + out: + local_irq_restore(irq); + return error; +} + +static void update_task_debugctlmsr(struct task_struct *task, + unsigned long debugctlmsr) +{ + task->thread.debugctlmsr = debugctlmsr; + + get_cpu(); + if (task == current) + update_debugctlmsr(debugctlmsr); + + if (task->thread.debugctlmsr) + set_tsk_thread_flag(task, TIF_DEBUGCTLMSR); + else + clear_tsk_thread_flag(task, TIF_DEBUGCTLMSR); + put_cpu(); +} + void ds_suspend_bts(struct bts_tracer *tracer) { struct task_struct *task; + unsigned long debugctlmsr; + int cpu; if (!tracer) return; @@ -827,29 +946,60 @@ void ds_suspend_bts(struct bts_tracer *tracer) tracer->flags = 0; task = tracer->ds.context->task; + cpu = tracer->ds.context->cpu; - if (!task || (task == current)) - update_debugctlmsr(get_debugctlmsr() & ~BTS_CONTROL); + WARN_ON(!task && irqs_disabled()); - if (task) { - task->thread.debugctlmsr &= ~BTS_CONTROL; + debugctlmsr = (task ? + task->thread.debugctlmsr : + get_debugctlmsr_on_cpu(cpu)); + debugctlmsr &= ~BTS_CONTROL; - if (!task->thread.debugctlmsr) - clear_tsk_thread_flag(task, TIF_DEBUGCTLMSR); - } + if (task) + update_task_debugctlmsr(task, debugctlmsr); + else + update_debugctlmsr_on_cpu(cpu, debugctlmsr); } -void ds_resume_bts(struct bts_tracer *tracer) +int ds_suspend_bts_noirq(struct bts_tracer *tracer) { struct task_struct *task; - unsigned long control; + unsigned long debugctlmsr, irq; + int cpu, error = 0; if (!tracer) - return; + return 0; - tracer->flags = tracer->trace.ds.flags; + tracer->flags = 0; task = tracer->ds.context->task; + cpu = tracer->ds.context->cpu; + + local_irq_save(irq); + + error = -EPERM; + if (!task && (cpu != smp_processor_id())) + goto out; + + debugctlmsr = (task ? + task->thread.debugctlmsr : + get_debugctlmsr()); + debugctlmsr &= ~BTS_CONTROL; + + if (task) + update_task_debugctlmsr(task, debugctlmsr); + else + update_debugctlmsr(debugctlmsr); + + error = 0; + out: + local_irq_restore(irq); + return error; +} + +static unsigned long ds_bts_control(struct bts_tracer *tracer) +{ + unsigned long control; control = ds_cfg.ctl[dsf_bts]; if (!(tracer->trace.ds.flags & BTS_KERNEL)) @@ -857,25 +1007,77 @@ void ds_resume_bts(struct bts_tracer *tracer) if (!(tracer->trace.ds.flags & BTS_USER)) control |= ds_cfg.ctl[dsf_bts_user]; - if (task) { - task->thread.debugctlmsr |= control; - set_tsk_thread_flag(task, TIF_DEBUGCTLMSR); - } - - if (!task || (task == current)) - update_debugctlmsr(get_debugctlmsr() | control); + return control; } -void ds_release_pebs(struct pebs_tracer *tracer) +void ds_resume_bts(struct bts_tracer *tracer) { struct task_struct *task; + unsigned long debugctlmsr; + int cpu; if (!tracer) return; + tracer->flags = tracer->trace.ds.flags; + task = tracer->ds.context->task; + cpu = tracer->ds.context->cpu; - ds_suspend_pebs(tracer); + WARN_ON(!task && irqs_disabled()); + + debugctlmsr = (task ? + task->thread.debugctlmsr : + get_debugctlmsr_on_cpu(cpu)); + debugctlmsr |= ds_bts_control(tracer); + + if (task) + update_task_debugctlmsr(task, debugctlmsr); + else + update_debugctlmsr_on_cpu(cpu, debugctlmsr); +} + +int ds_resume_bts_noirq(struct bts_tracer *tracer) +{ + struct task_struct *task; + unsigned long debugctlmsr, irq; + int cpu, error = 0; + + if (!tracer) + return 0; + + tracer->flags = tracer->trace.ds.flags; + + task = tracer->ds.context->task; + cpu = tracer->ds.context->cpu; + + local_irq_save(irq); + + error = -EPERM; + if (!task && (cpu != smp_processor_id())) + goto out; + + debugctlmsr = (task ? + task->thread.debugctlmsr : + get_debugctlmsr()); + debugctlmsr |= ds_bts_control(tracer); + + if (task) + update_task_debugctlmsr(task, debugctlmsr); + else + update_debugctlmsr(debugctlmsr); + + error = 0; + out: + local_irq_restore(irq); + return error; +} + +static void ds_free_pebs(struct pebs_tracer *tracer) +{ + struct task_struct *task; + + task = tracer->ds.context->task; WARN_ON_ONCE(tracer->ds.context->pebs_master != tracer); tracer->ds.context->pebs_master = NULL; @@ -886,16 +1088,68 @@ void ds_release_pebs(struct pebs_tracer *tracer) kfree(tracer); } +void ds_release_pebs(struct pebs_tracer *tracer) +{ + might_sleep(); + + if (!tracer) + return; + + ds_suspend_pebs(tracer); + ds_free_pebs(tracer); +} + +int ds_release_pebs_noirq(struct pebs_tracer *tracer) +{ + struct task_struct *task; + unsigned long irq; + int error; + + if (!tracer) + return 0; + + task = tracer->ds.context->task; + + local_irq_save(irq); + + error = -EPERM; + if (!task && + (tracer->ds.context->cpu != smp_processor_id())) + goto out; + + error = -EPERM; + if (task && (task != current)) + goto out; + + ds_suspend_pebs_noirq(tracer); + ds_free_pebs(tracer); + + error = 0; + out: + local_irq_restore(irq); + return error; +} + void ds_suspend_pebs(struct pebs_tracer *tracer) { } +int ds_suspend_pebs_noirq(struct pebs_tracer *tracer) +{ + return 0; +} + void ds_resume_pebs(struct pebs_tracer *tracer) { } +int ds_resume_pebs_noirq(struct pebs_tracer *tracer) +{ + return 0; +} + const struct bts_trace *ds_read_bts(struct bts_tracer *tracer) { if (!tracer) @@ -1004,26 +1258,6 @@ ds_configure(const struct ds_configuration *cfg, printk(KERN_INFO "[ds] pebs not available\n"); } - if (ds_cfg.sizeof_rec[ds_bts]) { - int error; - - error = ds_selftest_bts(); - if (error) { - WARN(1, "[ds] selftest failed. disabling bts.\n"); - ds_cfg.sizeof_rec[ds_bts] = 0; - } - } - - if (ds_cfg.sizeof_rec[ds_pebs]) { - int error; - - error = ds_selftest_pebs(); - if (error) { - WARN(1, "[ds] selftest failed. disabling pebs.\n"); - ds_cfg.sizeof_rec[ds_pebs] = 0; - } - } - printk(KERN_INFO "[ds] sizes: address: %u bit, ", 8 * ds_cfg.sizeof_ptr_field); printk("bts/pebs record: %u/%u bytes\n", @@ -1127,3 +1361,29 @@ void ds_copy_thread(struct task_struct *tsk, struct task_struct *father) void ds_exit_thread(struct task_struct *tsk) { } + +static __init int ds_selftest(void) +{ + if (ds_cfg.sizeof_rec[ds_bts]) { + int error; + + error = ds_selftest_bts(); + if (error) { + WARN(1, "[ds] selftest failed. disabling bts.\n"); + ds_cfg.sizeof_rec[ds_bts] = 0; + } + } + + if (ds_cfg.sizeof_rec[ds_pebs]) { + int error; + + error = ds_selftest_pebs(); + if (error) { + WARN(1, "[ds] selftest failed. disabling pebs.\n"); + ds_cfg.sizeof_rec[ds_pebs] = 0; + } + } + + return 0; +} +device_initcall(ds_selftest); diff --git a/arch/x86/kernel/ds_selftest.c b/arch/x86/kernel/ds_selftest.c index 8c46fbf..e5a263c 100644 --- a/arch/x86/kernel/ds_selftest.c +++ b/arch/x86/kernel/ds_selftest.c @@ -10,11 +10,12 @@ #include #include +#include #include -#define DS_SELFTEST_BUFFER_SIZE 1021 /* Intentionally chose an odd size. */ +#define BUFFER_SIZE 1021 /* Intentionally chose an odd size. */ static int ds_selftest_bts_consistency(const struct bts_trace *trace) @@ -125,12 +126,12 @@ int ds_selftest_bts(void) struct bts_tracer *tracer; int error = 0; void *top; - unsigned char buffer[DS_SELFTEST_BUFFER_SIZE]; + unsigned char buffer[BUFFER_SIZE]; printk(KERN_INFO "[ds] bts selftest..."); - tracer = ds_request_bts(NULL, buffer, DS_SELFTEST_BUFFER_SIZE, - NULL, (size_t)-1, BTS_KERNEL); + tracer = ds_request_bts_cpu(smp_processor_id(), buffer, BUFFER_SIZE, + NULL, (size_t)-1, BTS_KERNEL); if (IS_ERR(tracer)) { error = PTR_ERR(tracer); tracer = NULL; diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 7c21d1e..adbb243 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -800,8 +800,9 @@ static int ptrace_bts_config(struct task_struct *child, if (cfg.flags & PTRACE_BTS_O_SCHED) flags |= BTS_TIMESTAMPS; - context->tracer = ds_request_bts(child, context->buffer, context->size, - NULL, (size_t)-1, flags); + context->tracer = + ds_request_bts_task(child, context->buffer, context->size, + NULL, (size_t)-1, flags); if (unlikely(IS_ERR(context->tracer))) { int error = PTR_ERR(context->tracer); -- cgit v1.1 From 353afeea24cc51aafc0ff21a72ec740b6f0af50c Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 3 Apr 2009 16:43:42 +0200 Subject: x86, ds: fix compiler warning Size_t is defined differently on i386 and x86_64. Change type to avoid compiler warning. Signed-off-by: Markus Metzger Cc: roland@redhat.com Cc: eranian@googlemail.com Cc: oleg@redhat.com Cc: juan.villacis@intel.com Cc: ak@linux.jf.intel.com LKML-Reference: <20090403144557.523964000@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ds_selftest.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/ds_selftest.c b/arch/x86/kernel/ds_selftest.c index e5a263c..e1ba510 100644 --- a/arch/x86/kernel/ds_selftest.c +++ b/arch/x86/kernel/ds_selftest.c @@ -87,7 +87,7 @@ static int ds_selftest_bts_read(struct bts_tracer *tracer, /* Now to the test itself. */ for (at = from; (void *)at < to; at += trace->ds.size) { struct bts_struct bts; - size_t index; + unsigned long index; int error; if (((void *)at - trace->ds.begin) % trace->ds.size) { -- cgit v1.1 From 84f201139245c30777ff858e71b8d7e134b8c3ed Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 3 Apr 2009 16:43:43 +0200 Subject: x86, ds: fix bounds check in ds selftest Fix a bad bounds check in the debug store selftest. Signed-off-by: Markus Metzger Cc: roland@redhat.com Cc: eranian@googlemail.com Cc: oleg@redhat.com Cc: juan.villacis@intel.com Cc: ak@linux.jf.intel.com LKML-Reference: <20090403144558.450027000@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ds_selftest.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/ds_selftest.c b/arch/x86/kernel/ds_selftest.c index e1ba510..cccc19a 100644 --- a/arch/x86/kernel/ds_selftest.c +++ b/arch/x86/kernel/ds_selftest.c @@ -47,8 +47,13 @@ static int ds_selftest_bts_consistency(const struct bts_trace *trace) printk(KERN_CONT "bad bts buffer setup..."); error = -1; } + /* + * We allow top in [begin; end], since its not clear when the + * overflow adjustment happens: after the increment or before the + * write. + */ if ((trace->ds.top < trace->ds.begin) || - (trace->ds.end <= trace->ds.top)) { + (trace->ds.end < trace->ds.top)) { printk(KERN_CONT "bts top out of bounds..."); error = -1; } -- cgit v1.1 From 01f6569ece6915616f6cae1d7d8b46ab8da9c1bd Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 3 Apr 2009 16:43:44 +0200 Subject: x86, ds: selftest each cpu Perform debug store selftests on each cpu. Cover both the normal and the _noirq variant of the debug store interface. Signed-off-by: Markus Metzger Cc: roland@redhat.com Cc: eranian@googlemail.com Cc: oleg@redhat.com Cc: juan.villacis@intel.com Cc: ak@linux.jf.intel.com LKML-Reference: <20090403144559.394583000@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ds_selftest.c | 182 +++++++++++++++++++++++++++++++----------- 1 file changed, 135 insertions(+), 47 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/ds_selftest.c b/arch/x86/kernel/ds_selftest.c index cccc19a..599a963 100644 --- a/arch/x86/kernel/ds_selftest.c +++ b/arch/x86/kernel/ds_selftest.c @@ -11,13 +11,21 @@ #include #include #include +#include #include -#define BUFFER_SIZE 1021 /* Intentionally chose an odd size. */ +#define BUFFER_SIZE 521 /* Intentionally chose an odd size. */ +struct ds_selftest_bts_conf { + struct bts_tracer *tracer; + int error; + int (*suspend)(struct bts_tracer *); + int (*resume)(struct bts_tracer *); +}; + static int ds_selftest_bts_consistency(const struct bts_trace *trace) { int error = 0; @@ -125,36 +133,32 @@ static int ds_selftest_bts_read(struct bts_tracer *tracer, return 0; } -int ds_selftest_bts(void) +static void ds_selftest_bts_cpu(void *arg) { + struct ds_selftest_bts_conf *conf = arg; const struct bts_trace *trace; - struct bts_tracer *tracer; - int error = 0; void *top; - unsigned char buffer[BUFFER_SIZE]; - printk(KERN_INFO "[ds] bts selftest..."); - - tracer = ds_request_bts_cpu(smp_processor_id(), buffer, BUFFER_SIZE, - NULL, (size_t)-1, BTS_KERNEL); - if (IS_ERR(tracer)) { - error = PTR_ERR(tracer); - tracer = NULL; + if (IS_ERR(conf->tracer)) { + conf->error = PTR_ERR(conf->tracer); + conf->tracer = NULL; printk(KERN_CONT - "initialization failed (err: %d)...", error); - goto out; + "initialization failed (err: %d)...", conf->error); + return; } - /* The return should already give us enough trace. */ - ds_suspend_bts(tracer); + /* We should meanwhile have enough trace. */ + conf->error = conf->suspend(conf->tracer); + if (conf->error < 0) + return; /* Let's see if we can access the trace. */ - trace = ds_read_bts(tracer); + trace = ds_read_bts(conf->tracer); - error = ds_selftest_bts_consistency(trace); - if (error < 0) - goto out; + conf->error = ds_selftest_bts_consistency(trace); + if (conf->error < 0) + return; /* If everything went well, we should have a few trace entries. */ if (trace->ds.top == trace->ds.begin) { @@ -168,10 +172,11 @@ int ds_selftest_bts(void) } /* Let's try to read the trace we collected. */ - error = ds_selftest_bts_read(tracer, trace, + conf->error = + ds_selftest_bts_read(conf->tracer, trace, trace->ds.begin, trace->ds.top); - if (error < 0) - goto out; + if (conf->error < 0) + return; /* * Let's read the trace again. @@ -179,26 +184,31 @@ int ds_selftest_bts(void) */ top = trace->ds.top; - trace = ds_read_bts(tracer); - error = ds_selftest_bts_consistency(trace); - if (error < 0) - goto out; + trace = ds_read_bts(conf->tracer); + conf->error = ds_selftest_bts_consistency(trace); + if (conf->error < 0) + return; if (top != trace->ds.top) { printk(KERN_CONT "suspend not working..."); - error = -1; - goto out; + conf->error = -1; + return; } /* Let's collect some more trace - see if resume is working. */ - ds_resume_bts(tracer); - ds_suspend_bts(tracer); + conf->error = conf->resume(conf->tracer); + if (conf->error < 0) + return; + + conf->error = conf->suspend(conf->tracer); + if (conf->error < 0) + return; - trace = ds_read_bts(tracer); + trace = ds_read_bts(conf->tracer); - error = ds_selftest_bts_consistency(trace); - if (error < 0) - goto out; + conf->error = ds_selftest_bts_consistency(trace); + if (conf->error < 0) + return; if (trace->ds.top == top) { /* @@ -210,35 +220,113 @@ int ds_selftest_bts(void) printk(KERN_CONT "no resume progress/overflow..."); - error = ds_selftest_bts_read(tracer, trace, + conf->error = + ds_selftest_bts_read(conf->tracer, trace, trace->ds.begin, trace->ds.end); } else if (trace->ds.top < top) { /* * We had a buffer overflow - the entire buffer should * contain trace records. */ - error = ds_selftest_bts_read(tracer, trace, + conf->error = + ds_selftest_bts_read(conf->tracer, trace, trace->ds.begin, trace->ds.end); } else { /* * It is quite likely that the buffer did not overflow. * Let's just check the delta trace. */ - error = ds_selftest_bts_read(tracer, trace, - top, trace->ds.top); + conf->error = + ds_selftest_bts_read(conf->tracer, trace, top, + trace->ds.top); } - if (error < 0) - goto out; + if (conf->error < 0) + return; - error = 0; + conf->error = 0; +} - /* The final test: release the tracer while tracing is suspended. */ - out: - ds_release_bts(tracer); +static int ds_suspend_bts_wrap(struct bts_tracer *tracer) +{ + ds_suspend_bts(tracer); + return 0; +} + +static int ds_resume_bts_wrap(struct bts_tracer *tracer) +{ + ds_resume_bts(tracer); + return 0; +} - printk(KERN_CONT "%s.\n", (error ? "failed" : "passed")); +static void ds_release_bts_noirq_wrap(void *tracer) +{ + (void)ds_release_bts_noirq(tracer); +} - return error; +static int ds_selftest_bts_bad_release_noirq(int cpu, + struct bts_tracer *tracer) +{ + int error = -EPERM; + + /* Try to release the tracer on the wrong cpu. */ + get_cpu(); + if (cpu != smp_processor_id()) { + error = ds_release_bts_noirq(tracer); + if (error != -EPERM) + printk(KERN_CONT "release on wrong cpu..."); + } + put_cpu(); + + return error ? 0 : -1; +} + +int ds_selftest_bts(void) +{ + struct ds_selftest_bts_conf conf; + unsigned char buffer[BUFFER_SIZE]; + int cpu; + + printk(KERN_INFO "[ds] bts selftest..."); + conf.error = 0; + + get_online_cpus(); + for_each_online_cpu(cpu) { + conf.suspend = ds_suspend_bts_wrap; + conf.resume = ds_resume_bts_wrap; + conf.tracer = + ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE, + NULL, (size_t)-1, BTS_KERNEL); + ds_selftest_bts_cpu(&conf); + ds_release_bts(conf.tracer); + if (conf.error < 0) + goto out; + + conf.suspend = ds_suspend_bts_noirq; + conf.resume = ds_resume_bts_noirq; + conf.tracer = + ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE, + NULL, (size_t)-1, BTS_KERNEL); + smp_call_function_single(cpu, ds_selftest_bts_cpu, &conf, 1); + if (conf.error >= 0) { + conf.error = + ds_selftest_bts_bad_release_noirq(cpu, + conf.tracer); + /* We must not release the tracer twice. */ + if (conf.error < 0) + conf.tracer = NULL; + } + smp_call_function_single(cpu, ds_release_bts_noirq_wrap, + conf.tracer, 1); + if (conf.error < 0) + goto out; + } + + conf.error = 0; + out: + put_online_cpus(); + printk(KERN_CONT "%s.\n", (conf.error ? "failed" : "passed")); + + return conf.error; } int ds_selftest_pebs(void) -- cgit v1.1 From 3a68eef945b234f286406d96dc690fe17863c203 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 3 Apr 2009 16:43:45 +0200 Subject: x86, ds: add task tracing selftest Add selftests to cover per-task branch tracing. Signed-off-by: Markus Metzger Cc: roland@redhat.com Cc: eranian@googlemail.com Cc: oleg@redhat.com Cc: juan.villacis@intel.com Cc: ak@linux.jf.intel.com LKML-Reference: <20090403144600.329346000@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ds_selftest.c | 71 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/ds_selftest.c b/arch/x86/kernel/ds_selftest.c index 599a963..a40b253 100644 --- a/arch/x86/kernel/ds_selftest.c +++ b/arch/x86/kernel/ds_selftest.c @@ -280,10 +280,51 @@ static int ds_selftest_bts_bad_release_noirq(int cpu, return error ? 0 : -1; } +static int ds_selftest_bts_bad_request_cpu(int cpu, void *buffer) +{ + struct bts_tracer *tracer; + int error; + + /* Try to request cpu tracing while task tracing is active. */ + tracer = ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE, NULL, + (size_t)-1, BTS_KERNEL); + error = PTR_ERR(tracer); + if (!IS_ERR(tracer)) { + ds_release_bts(tracer); + error = 0; + } + + if (error != -EPERM) + printk(KERN_CONT "cpu/task tracing overlap..."); + + return error ? 0 : -1; +} + +static int ds_selftest_bts_bad_request_task(void *buffer) +{ + struct bts_tracer *tracer; + int error; + + /* Try to request cpu tracing while task tracing is active. */ + tracer = ds_request_bts_task(current, buffer, BUFFER_SIZE, NULL, + (size_t)-1, BTS_KERNEL); + error = PTR_ERR(tracer); + if (!IS_ERR(tracer)) { + error = 0; + ds_release_bts(tracer); + } + + if (error != -EPERM) + printk(KERN_CONT "task/cpu tracing overlap..."); + + return error ? 0 : -1; +} + int ds_selftest_bts(void) { struct ds_selftest_bts_conf conf; unsigned char buffer[BUFFER_SIZE]; + unsigned long irq; int cpu; printk(KERN_INFO "[ds] bts selftest..."); @@ -297,6 +338,8 @@ int ds_selftest_bts(void) ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE, NULL, (size_t)-1, BTS_KERNEL); ds_selftest_bts_cpu(&conf); + if (conf.error >= 0) + conf.error = ds_selftest_bts_bad_request_task(buffer); ds_release_bts(conf.tracer); if (conf.error < 0) goto out; @@ -315,12 +358,40 @@ int ds_selftest_bts(void) if (conf.error < 0) conf.tracer = NULL; } + if (conf.error >= 0) + conf.error = ds_selftest_bts_bad_request_task(buffer); smp_call_function_single(cpu, ds_release_bts_noirq_wrap, conf.tracer, 1); if (conf.error < 0) goto out; } + conf.suspend = ds_suspend_bts_wrap; + conf.resume = ds_resume_bts_wrap; + conf.tracer = + ds_request_bts_task(current, buffer, BUFFER_SIZE, + NULL, (size_t)-1, BTS_KERNEL); + ds_selftest_bts_cpu(&conf); + if (conf.error >= 0) + conf.error = ds_selftest_bts_bad_request_cpu(0, buffer); + ds_release_bts(conf.tracer); + if (conf.error < 0) + goto out; + + conf.suspend = ds_suspend_bts_noirq; + conf.resume = ds_resume_bts_noirq; + conf.tracer = + ds_request_bts_task(current, buffer, BUFFER_SIZE, + NULL, (size_t)-1, BTS_KERNEL); + local_irq_save(irq); + ds_selftest_bts_cpu(&conf); + if (conf.error >= 0) + conf.error = ds_selftest_bts_bad_request_cpu(0, buffer); + ds_release_bts_noirq(conf.tracer); + local_irq_restore(irq); + if (conf.error < 0) + goto out; + conf.error = 0; out: put_online_cpus(); -- cgit v1.1 From 2311f0de21c17b2a8b960677a9cccfbfa52beb35 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 3 Apr 2009 16:43:46 +0200 Subject: x86, ds: add leakage warning Add a warning in case a debug store context is not removed before the task it is attached to is freed. Remove the old warning at thread exit. It is too early. Declare the debug store context field in thread_struct unconditionally. Remove ds_copy_thread() and ds_exit_thread() and do the work directly in process*.c. Signed-off-by: Markus Metzger Cc: roland@redhat.com Cc: eranian@googlemail.com Cc: oleg@redhat.com Cc: juan.villacis@intel.com Cc: ak@linux.jf.intel.com LKML-Reference: <20090403144601.254472000@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ds.c | 10 ---------- arch/x86/kernel/process.c | 5 +++-- arch/x86/kernel/process_32.c | 3 ++- arch/x86/kernel/process_64.c | 3 ++- 4 files changed, 7 insertions(+), 14 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index 21a3852..71cab3b 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -1352,16 +1352,6 @@ void ds_switch_to(struct task_struct *prev, struct task_struct *next) update_debugctlmsr(debugctlmsr); } -void ds_copy_thread(struct task_struct *tsk, struct task_struct *father) -{ - clear_tsk_thread_flag(tsk, TIF_DS_AREA_MSR); - tsk->thread.ds_ctx = NULL; -} - -void ds_exit_thread(struct task_struct *tsk) -{ -} - static __init int ds_selftest(void) { if (ds_cfg.sizeof_rec[ds_bts]) { diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index ca98915..fb5dfb8 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -14,6 +14,7 @@ #include #include #include +#include unsigned long idle_halt; EXPORT_SYMBOL(idle_halt); @@ -45,6 +46,8 @@ void free_thread_xstate(struct task_struct *tsk) kmem_cache_free(task_xstate_cachep, tsk->thread.xstate); tsk->thread.xstate = NULL; } + + WARN(tsk->thread.ds_ctx, "leaking DS context\n"); } void free_thread_info(struct thread_info *ti) @@ -83,8 +86,6 @@ void exit_thread(void) put_cpu(); kfree(bp); } - - ds_exit_thread(current); } void flush_thread(void) diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 76f8f84..b5e4bfe 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -290,7 +290,8 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, p->thread.io_bitmap_max = 0; } - ds_copy_thread(p, current); + clear_tsk_thread_flag(p, TIF_DS_AREA_MSR); + p->thread.ds_ctx = NULL; clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR); p->thread.debugctlmsr = 0; diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index b751a41..5a1a1de 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -335,7 +335,8 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, goto out; } - ds_copy_thread(p, me); + clear_tsk_thread_flag(p, TIF_DS_AREA_MSR); + p->thread.ds_ctx = NULL; clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR); p->thread.debugctlmsr = 0; -- cgit v1.1 From ee811517a5604aa63fae803b7c044712699e1303 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 3 Apr 2009 16:43:47 +0200 Subject: x86, ds: use single debug store cpu configuration Use a single configuration for all cpus. Reported-by: Ingo Molnar Signed-off-by: Markus Metzger Cc: roland@redhat.com Cc: eranian@googlemail.com Cc: oleg@redhat.com Cc: juan.villacis@intel.com Cc: ak@linux.jf.intel.com LKML-Reference: <20090403144602.191165000@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ds.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index 71cab3b..443f415 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -47,9 +47,8 @@ struct ds_configuration { /* Control bit-masks indexed by enum ds_feature: */ unsigned long ctl[dsf_ctl_max]; }; -static DEFINE_PER_CPU(struct ds_configuration, ds_cfg_array); +static struct ds_configuration ds_cfg __read_mostly; -#define ds_cfg per_cpu(ds_cfg_array, smp_processor_id()) /* Maximal size of a DS configuration: */ #define MAX_SIZEOF_DS (12 * 8) @@ -1268,6 +1267,10 @@ ds_configure(const struct ds_configuration *cfg, void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) { + /* Only configure the first cpu. Others are identical. */ + if (ds_cfg.name) + return; + switch (c->x86) { case 0x6: switch (c->x86_model) { -- cgit v1.1 From 0f4814065ff8c24ca8bfd75c9b73502be152c287 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 3 Apr 2009 16:43:48 +0200 Subject: x86, ptrace: add bts context unconditionally Add the ptrace bts context field to task_struct unconditionally. Initialize the field directly in copy_process(). Remove all the unneeded functionality used to initialize that field. Signed-off-by: Markus Metzger Cc: roland@redhat.com Cc: eranian@googlemail.com Cc: oleg@redhat.com Cc: juan.villacis@intel.com Cc: ak@linux.jf.intel.com LKML-Reference: <20090403144603.292754000@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ptrace.c | 20 +------------------- 1 file changed, 1 insertion(+), 19 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index adbb243..b32a8ee 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -887,37 +887,19 @@ static int ptrace_bts_size(struct task_struct *child) return (trace->ds.top - trace->ds.begin) / trace->ds.size; } -static inline void ptrace_bts_fork(struct task_struct *tsk) -{ - tsk->bts = NULL; -} - /* * Called from __ptrace_unlink() after the child has been moved back * to its original parent. */ -static inline void ptrace_bts_untrace(struct task_struct *child) +void ptrace_bts_untrace(struct task_struct *child) { if (unlikely(child->bts)) { free_bts_context(child->bts); child->bts = NULL; } } -#else -static inline void ptrace_bts_fork(struct task_struct *tsk) {} -static inline void ptrace_bts_untrace(struct task_struct *child) {} #endif /* CONFIG_X86_PTRACE_BTS */ -void x86_ptrace_fork(struct task_struct *child, unsigned long clone_flags) -{ - ptrace_bts_fork(child); -} - -void x86_ptrace_untrace(struct task_struct *child) -{ - ptrace_bts_untrace(child); -} - /* * Called by kernel/ptrace.c when detaching.. * -- cgit v1.1 From 6047550d3d26fed88b18a208b31f8b90b5ef3e9b Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 3 Apr 2009 16:43:49 +0200 Subject: x86, ds: dont use TIF_DEBUGCTLMSR Debug store already uses TIF_DS_AREA_MSR to trigger debug store context switch handling. No need to use TIF_DEBUGCTLMSR, as well. Signed-off-by: Markus Metzger Cc: roland@redhat.com Cc: eranian@googlemail.com Cc: oleg@redhat.com Cc: juan.villacis@intel.com Cc: ak@linux.jf.intel.com LKML-Reference: <20090403144604.256645000@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ds.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index 443f415..cab2832 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -925,11 +925,6 @@ static void update_task_debugctlmsr(struct task_struct *task, get_cpu(); if (task == current) update_debugctlmsr(debugctlmsr); - - if (task->thread.debugctlmsr) - set_tsk_thread_flag(task, TIF_DEBUGCTLMSR); - else - clear_tsk_thread_flag(task, TIF_DEBUGCTLMSR); put_cpu(); } -- cgit v1.1 From 608780a9048efa3e85fbc4d8649b26805cc588aa Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 3 Apr 2009 16:43:50 +0200 Subject: x86, ds: fix bad ds_reset_pebs() Ds_reset_pebs() passed the wrong qualifier to a shared function resulting in a reset of bts, rather than pebs. Reported-by: Stephane Eranian Signed-off-by: Markus Metzger Cc: roland@redhat.com Cc: eranian@googlemail.com Cc: oleg@redhat.com Cc: juan.villacis@intel.com Cc: ak@linux.jf.intel.com LKML-Reference: <20090403144605.206510000@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ds.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index cab2832..ebfb0fd 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -1186,7 +1186,7 @@ int ds_reset_pebs(struct pebs_tracer *tracer) tracer->trace.ds.top = tracer->trace.ds.begin; - ds_set(tracer->ds.context->ds, ds_bts, ds_index, + ds_set(tracer->ds.context->ds, ds_pebs, ds_index, (unsigned long)tracer->trace.ds.top); return 0; -- cgit v1.1 From 150f5164c1258e05b7dea16f29e592f354c48f34 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 3 Apr 2009 16:43:51 +0200 Subject: x86, ds: allow small debug store buffers Check the buffer size more precisely to allow buffers for exactly one element provided the base address is already properly aligned. Add a debug store selftest. Reported-by: Stephane Eranian Signed-off-by: Markus Metzger Cc: roland@redhat.com Cc: eranian@googlemail.com Cc: oleg@redhat.com Cc: juan.villacis@intel.com Cc: ak@linux.jf.intel.com LKML-Reference: <20090403144606.139137000@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ds.c | 9 +++++++-- arch/x86/kernel/ds_selftest.c | 6 +++--- 2 files changed, 10 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index ebfb0fd..4e05157 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -656,6 +656,7 @@ static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace, { struct ds_context *context; int error; + size_t req_size; error = -EOPNOTSUPP; if (!ds_cfg.sizeof_rec[qual]) @@ -665,9 +666,13 @@ static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace, if (!base) goto out; - /* We need space for alignment adjustments in ds_init_ds_trace(). */ + req_size = ds_cfg.sizeof_rec[qual]; + /* We might need space for alignment adjustments. */ + if (!IS_ALIGNED((unsigned long)base, DS_ALIGNMENT)) + req_size += DS_ALIGNMENT; + error = -EINVAL; - if (size < (DS_ALIGNMENT + ds_cfg.sizeof_rec[qual])) + if (size < req_size) goto out; if (th != (size_t)-1) { diff --git a/arch/x86/kernel/ds_selftest.c b/arch/x86/kernel/ds_selftest.c index a40b253..5f104a0 100644 --- a/arch/x86/kernel/ds_selftest.c +++ b/arch/x86/kernel/ds_selftest.c @@ -16,8 +16,8 @@ #include -#define BUFFER_SIZE 521 /* Intentionally chose an odd size. */ - +#define BUFFER_SIZE 521 /* Intentionally chose an odd size. */ +#define SMALL_BUFFER_SIZE 24 /* A single bts entry. */ struct ds_selftest_bts_conf { struct bts_tracer *tracer; @@ -381,7 +381,7 @@ int ds_selftest_bts(void) conf.suspend = ds_suspend_bts_noirq; conf.resume = ds_resume_bts_noirq; conf.tracer = - ds_request_bts_task(current, buffer, BUFFER_SIZE, + ds_request_bts_task(current, buffer, SMALL_BUFFER_SIZE, NULL, (size_t)-1, BTS_KERNEL); local_irq_save(irq); ds_selftest_bts_cpu(&conf); -- cgit v1.1 From 017bc617657c928cb9a0c45a7a7e9f4e66695347 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 3 Apr 2009 16:43:52 +0200 Subject: x86, ds: support Core i7 Add debug store support for Core i7. Core i7 adds a reset value for each performance counter and a new PEBS record format. Signed-off-by: Markus Metzger Cc: roland@redhat.com Cc: eranian@googlemail.com Cc: oleg@redhat.com Cc: juan.villacis@intel.com Cc: ak@linux.jf.intel.com LKML-Reference: <20090403144607.088997000@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ds.c | 69 ++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 62 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index 4e05157..48bfe13 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -44,6 +44,9 @@ struct ds_configuration { /* The size of a BTS/PEBS record in bytes: */ unsigned char sizeof_rec[2]; + /* The number of pebs counter reset values in the DS structure. */ + unsigned char nr_counter_reset; + /* Control bit-masks indexed by enum ds_feature: */ unsigned long ctl[dsf_ctl_max]; }; @@ -51,7 +54,7 @@ static struct ds_configuration ds_cfg __read_mostly; /* Maximal size of a DS configuration: */ -#define MAX_SIZEOF_DS (12 * 8) +#define MAX_SIZEOF_DS 0x80 /* Maximal size of a BTS record: */ #define MAX_SIZEOF_BTS (3 * 8) @@ -59,6 +62,12 @@ static struct ds_configuration ds_cfg __read_mostly; /* BTS and PEBS buffer alignment: */ #define DS_ALIGNMENT (1 << 3) +/* Number of buffer pointers in DS: */ +#define NUM_DS_PTR_FIELDS 8 + +/* Size of a pebs reset value in DS: */ +#define PEBS_RESET_FIELD_SIZE 8 + /* Mask of control bits in the DS MSR register: */ #define BTS_CONTROL \ ( ds_cfg.ctl[dsf_bts] | \ @@ -1164,9 +1173,12 @@ const struct pebs_trace *ds_read_pebs(struct pebs_tracer *tracer) return NULL; ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_pebs); - tracer->trace.reset_value = - *(u64 *)(tracer->ds.context->ds + - (ds_cfg.sizeof_ptr_field * 8)); + + tracer->trace.counters = ds_cfg.nr_counter_reset; + memcpy(tracer->trace.counter_reset, + tracer->ds.context->ds + + (NUM_DS_PTR_FIELDS * ds_cfg.sizeof_ptr_field), + ds_cfg.nr_counter_reset * PEBS_RESET_FIELD_SIZE); return &tracer->trace; } @@ -1197,13 +1209,18 @@ int ds_reset_pebs(struct pebs_tracer *tracer) return 0; } -int ds_set_pebs_reset(struct pebs_tracer *tracer, u64 value) +int ds_set_pebs_reset(struct pebs_tracer *tracer, + unsigned int counter, u64 value) { if (!tracer) return -EINVAL; + if (ds_cfg.nr_counter_reset < counter) + return -EINVAL; + *(u64 *)(tracer->ds.context->ds + - (ds_cfg.sizeof_ptr_field * 8)) = value; + (NUM_DS_PTR_FIELDS * ds_cfg.sizeof_ptr_field) + + (counter * PEBS_RESET_FIELD_SIZE)) = value; return 0; } @@ -1213,16 +1230,26 @@ static const struct ds_configuration ds_cfg_netburst = { .ctl[dsf_bts] = (1 << 2) | (1 << 3), .ctl[dsf_bts_kernel] = (1 << 5), .ctl[dsf_bts_user] = (1 << 6), + .nr_counter_reset = 1, }; static const struct ds_configuration ds_cfg_pentium_m = { .name = "Pentium M", .ctl[dsf_bts] = (1 << 6) | (1 << 7), + .nr_counter_reset = 1, }; static const struct ds_configuration ds_cfg_core2_atom = { .name = "Core 2/Atom", .ctl[dsf_bts] = (1 << 6) | (1 << 7), .ctl[dsf_bts_kernel] = (1 << 9), .ctl[dsf_bts_user] = (1 << 10), + .nr_counter_reset = 1, +}; +static const struct ds_configuration ds_cfg_core_i7 = { + .name = "Core i7", + .ctl[dsf_bts] = (1 << 6) | (1 << 7), + .ctl[dsf_bts_kernel] = (1 << 9), + .ctl[dsf_bts_user] = (1 << 10), + .nr_counter_reset = 4, }; static void @@ -1239,6 +1266,32 @@ ds_configure(const struct ds_configuration *cfg, nr_pebs_fields = 18; #endif + /* + * Starting with version 2, architectural performance + * monitoring supports a format specifier. + */ + if ((cpuid_eax(0xa) & 0xff) > 1) { + unsigned long perf_capabilities, format; + + rdmsrl(MSR_IA32_PERF_CAPABILITIES, perf_capabilities); + + format = (perf_capabilities >> 8) & 0xf; + + switch (format) { + case 0: + nr_pebs_fields = 18; + break; + case 1: + nr_pebs_fields = 22; + break; + default: + printk(KERN_INFO + "[ds] unknown PEBS format: %lu\n", format); + nr_pebs_fields = 0; + break; + } + } + memset(&ds_cfg, 0, sizeof(ds_cfg)); ds_cfg = *cfg; @@ -1262,7 +1315,7 @@ ds_configure(const struct ds_configuration *cfg, printk("bts/pebs record: %u/%u bytes\n", ds_cfg.sizeof_rec[ds_bts], ds_cfg.sizeof_rec[ds_pebs]); - WARN_ON_ONCE(MAX_SIZEOF_DS < (12 * ds_cfg.sizeof_ptr_field)); + WARN_ON_ONCE(MAX_PEBS_COUNTERS < ds_cfg.nr_counter_reset); } void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) @@ -1284,6 +1337,8 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) ds_configure(&ds_cfg_core2_atom, c); break; case 0x1a: /* Core i7 */ + ds_configure(&ds_cfg_core_i7, c); + break; default: /* Sorry, don't know about them. */ break; -- cgit v1.1 From fcb2ac5bdfa3a7a04fb9749b916f64400f4c35a8 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Wed, 8 Apr 2009 13:31:58 +0200 Subject: x86_32: introduce restore_fpu_checking() Impact: cleanup, prepare FPU code unificaton Like on x86_64, return an error from restore_fpu and kill the task if it fails. Also rename restore_fpu to restore_fpu_checking which allows ifdefs to be removed in math_state_restore(). Signed-off-by: Jiri Slaby LKML-Reference: <1239190320-23952-1-git-send-email-jirislaby@gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index a1d2883..d696145 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -839,9 +839,6 @@ asmlinkage void math_state_restore(void) } clts(); /* Allow maths ops (or we recurse) */ -#ifdef CONFIG_X86_32 - restore_fpu(tsk); -#else /* * Paranoid restore. send a SIGSEGV if we fail to restore the state. */ @@ -850,7 +847,7 @@ asmlinkage void math_state_restore(void) force_sig(SIGSEGV, tsk); return; } -#endif + thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */ tsk->fpu_counter++; } -- cgit v1.1 From a59dacfdc9ba06903652fa4883bf1106278b18ec Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 17 Oct 2008 14:38:08 +0200 Subject: x86 early quirks: eliminate unused function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Impact: cleanup this warning: arch/x86/kernel/early-quirks.c:99: warning: ‘ati_ixp4x0_rev’ defined but not used triggers because ati_ixp4x0_rev() is only used in the ACPI && X86_IO_APIC case. Signed-off-by: Ingo Molnar --- arch/x86/kernel/early-quirks.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index 76b8cd9..ebdb85c 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -97,6 +97,7 @@ static void __init nvidia_bugs(int num, int slot, int func) } #if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC) +#if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC) static u32 __init ati_ixp4x0_rev(int num, int slot, int func) { u32 d; @@ -114,6 +115,7 @@ static u32 __init ati_ixp4x0_rev(int num, int slot, int func) d &= 0xff; return d; } +#endif static void __init ati_bugs(int num, int slot, int func) { -- cgit v1.1 From cdc1cb0d4445f39561a65204d26f89365f917550 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 3 Apr 2009 17:15:14 -0700 Subject: x86: make wakeup_secondary_cpu_via_init static Impact: cleanup Signed-off-by: Yinghai Lu LKML-Reference: <49D6A692.6040400@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 58d24ef..bddf2cc 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -538,7 +538,7 @@ wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip) return (send_status | accept_status); } -int __devinit +static int __devinit wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) { unsigned long send_status, accept_status = 0; -- cgit v1.1 From 02421f98ec55c3ff118f358740ff640f096c7ad6 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 3 Apr 2009 17:15:53 -0700 Subject: x86: consistent about warm_reset_vector for UN_NON_UNIQUE_APIC Impact: cleanup didn't set it for UV_NON_UNIQUE_APIC, so don't restore it Signed-off-by: Yinghai Lu LKML-Reference: <49D6A6B9.6060501@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index bddf2cc..bf8ad63 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -822,10 +822,12 @@ do_rest: /* mark "stuck" area as not stuck */ *((volatile unsigned long *)trampoline_base) = 0; - /* - * Cleanup possible dangling ends... - */ - smpboot_restore_warm_reset_vector(); + if (get_uv_system_type() != UV_NON_UNIQUE_APIC) { + /* + * Cleanup possible dangling ends... + */ + smpboot_restore_warm_reset_vector(); + } return boot_error; } -- cgit v1.1 From 42d7c5e353cef9062129b0de3ec9ddf10567b9ca Mon Sep 17 00:00:00 2001 From: Becky Bruce Date: Wed, 8 Apr 2009 09:09:21 -0500 Subject: swiotlb: change swiotlb_bus_to[phys,virt] prototypes Add a hwdev argument that is needed on some architectures in order to access a per-device offset that is taken into account when producing a physical address (also needed to get from bus address to virtual address because the physical address is an intermediate step). Also make swiotlb_bus_to_virt weak so architectures can override it. Signed-off-by: Becky Bruce Acked-by: FUJITA Tomonori Signed-off-by: Kumar Gala Cc: jeremy@goop.org Cc: ian.campbell@citrix.com LKML-Reference: <1239199761-22886-8-git-send-email-galak@kernel.crashing.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-swiotlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index 34f12e9..887388a 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c @@ -28,7 +28,7 @@ dma_addr_t swiotlb_phys_to_bus(struct device *hwdev, phys_addr_t paddr) return paddr; } -phys_addr_t swiotlb_bus_to_phys(dma_addr_t baddr) +phys_addr_t swiotlb_bus_to_phys(struct device *hwdev, dma_addr_t baddr) { return baddr; } -- cgit v1.1 From 7333a8003cdc0470e8c0ae8b949cbc44f3165ff3 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Wed, 25 Mar 2009 10:50:34 +0900 Subject: x86: smarten /proc/interrupts output for new counters Now /proc/interrupts of tip tree has new counters: CNT: Performance counter interrupts Format change of output, as like that by commit: commit 7a81d9a7da03d2f27840d659f97ef140d032f609 x86: smarten /proc/interrupts output should be applied to these new counters too. Signed-off-by: Hidetoshi Seto Cc: Jan Beulich LKML-Reference: <49C98DEA.8060208@jp.fujitsu.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/irq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index d465487..dccaaa8 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -63,7 +63,7 @@ static int show_other_interrupts(struct seq_file *p, int prec) for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count); seq_printf(p, " Spurious interrupts\n"); - seq_printf(p, "CNT: "); + seq_printf(p, "%*s: ", prec, "CNT"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs); seq_printf(p, " Performance counter interrupts\n"); -- cgit v1.1 From 78f13e9525ba777da25c4ddab89f28e9366a8b7c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 8 Apr 2009 15:01:33 +0200 Subject: perf_counter: allow for data addresses to be recorded Paul suggested we allow for data addresses to be recorded along with the traditional IPs as power can provide these. For now, only the software pagefault events provide data addresses, but in the future power might as well for some events. x86 doesn't seem capable of providing this atm. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090408130409.394816925@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 1116a41..0fcbaab 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -800,7 +800,7 @@ again: continue; perf_save_and_restart(counter); - if (perf_counter_overflow(counter, nmi, regs)) + if (perf_counter_overflow(counter, nmi, regs, 0)) __pmc_generic_disable(counter, &counter->hw, bit); } -- cgit v1.1 From e85abf8f432bb2a13733ab7609fbb8e1500af51d Mon Sep 17 00:00:00 2001 From: Gary Hade Date: Wed, 8 Apr 2009 14:07:25 -0700 Subject: x86: consolidate SMP code in io_apic.c Impact: Cleanup Reorganizes the code in arch/x86/kernel/io_apic.c by combining two '#ifdef CONFIG_SMP' regions. In addition to making the code easier to understand the first '#ifdef CONFIG_SMP' region is moved to a location later in the file which will reduce the need for function forward declarations when the code subsequently revised. The only changes other than relocating code to a different position in the file were the removal of the assign_irq_vector() forward declaration which was no longer needed and some line length reduction formatting changes. Signed-off-by: Gary Hade Cc: lcm@us.ibm.com LKML-Reference: <20090408210725.GC11159@us.ibm.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 223 ++++++++++++++++++++--------------------- 1 file changed, 109 insertions(+), 114 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 767fe7e..7c9d045 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -518,120 +518,6 @@ static void ioapic_mask_entry(int apic, int pin) spin_unlock_irqrestore(&ioapic_lock, flags); } -#ifdef CONFIG_SMP -static void send_cleanup_vector(struct irq_cfg *cfg) -{ - cpumask_var_t cleanup_mask; - - if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) { - unsigned int i; - cfg->move_cleanup_count = 0; - for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) - cfg->move_cleanup_count++; - for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) - apic->send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR); - } else { - cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask); - cfg->move_cleanup_count = cpumask_weight(cleanup_mask); - apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); - free_cpumask_var(cleanup_mask); - } - cfg->move_in_progress = 0; -} - -static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg) -{ - int apic, pin; - struct irq_pin_list *entry; - u8 vector = cfg->vector; - - entry = cfg->irq_2_pin; - for (;;) { - unsigned int reg; - - if (!entry) - break; - - apic = entry->apic; - pin = entry->pin; - /* - * With interrupt-remapping, destination information comes - * from interrupt-remapping table entry. - */ - if (!irq_remapped(irq)) - io_apic_write(apic, 0x11 + pin*2, dest); - reg = io_apic_read(apic, 0x10 + pin*2); - reg &= ~IO_APIC_REDIR_VECTOR_MASK; - reg |= vector; - io_apic_modify(apic, 0x10 + pin*2, reg); - if (!entry->next) - break; - entry = entry->next; - } -} - -static int -assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask); - -/* - * Either sets desc->affinity to a valid value, and returns - * ->cpu_mask_to_apicid of that, or returns BAD_APICID and - * leaves desc->affinity untouched. - */ -static unsigned int -set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask) -{ - struct irq_cfg *cfg; - unsigned int irq; - - if (!cpumask_intersects(mask, cpu_online_mask)) - return BAD_APICID; - - irq = desc->irq; - cfg = desc->chip_data; - if (assign_irq_vector(irq, cfg, mask)) - return BAD_APICID; - - /* check that before desc->addinity get updated */ - set_extra_move_desc(desc, mask); - - cpumask_copy(desc->affinity, mask); - - return apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain); -} - -static void -set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask) -{ - struct irq_cfg *cfg; - unsigned long flags; - unsigned int dest; - unsigned int irq; - - irq = desc->irq; - cfg = desc->chip_data; - - spin_lock_irqsave(&ioapic_lock, flags); - dest = set_desc_affinity(desc, mask); - if (dest != BAD_APICID) { - /* Only the high 8 bits are valid. */ - dest = SET_APIC_LOGICAL_ID(dest); - __target_IO_APIC_irq(irq, dest, cfg); - } - spin_unlock_irqrestore(&ioapic_lock, flags); -} - -static void -set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask) -{ - struct irq_desc *desc; - - desc = irq_to_desc(irq); - - set_ioapic_affinity_irq_desc(desc, mask); -} -#endif /* CONFIG_SMP */ - /* * The common case is 1:1 IRQ<->pin mappings. Sometimes there are * shared ISA-space IRQs, so we have to support them. We are super @@ -2360,6 +2246,115 @@ static int ioapic_retrigger_irq(unsigned int irq) */ #ifdef CONFIG_SMP +static void send_cleanup_vector(struct irq_cfg *cfg) +{ + cpumask_var_t cleanup_mask; + + if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) { + unsigned int i; + cfg->move_cleanup_count = 0; + for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) + cfg->move_cleanup_count++; + for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) + apic->send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR); + } else { + cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask); + cfg->move_cleanup_count = cpumask_weight(cleanup_mask); + apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); + free_cpumask_var(cleanup_mask); + } + cfg->move_in_progress = 0; +} + +static void +__target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg) +{ + int apic, pin; + struct irq_pin_list *entry; + u8 vector = cfg->vector; + + entry = cfg->irq_2_pin; + for (;;) { + unsigned int reg; + + if (!entry) + break; + + apic = entry->apic; + pin = entry->pin; + /* + * With interrupt-remapping, destination information comes + * from interrupt-remapping table entry. + */ + if (!irq_remapped(irq)) + io_apic_write(apic, 0x11 + pin*2, dest); + reg = io_apic_read(apic, 0x10 + pin*2); + reg &= ~IO_APIC_REDIR_VECTOR_MASK; + reg |= vector; + io_apic_modify(apic, 0x10 + pin*2, reg); + if (!entry->next) + break; + entry = entry->next; + } +} + +/* + * Either sets desc->affinity to a valid value, and returns + * ->cpu_mask_to_apicid of that, or returns BAD_APICID and + * leaves desc->affinity untouched. + */ +static unsigned int +set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask) +{ + struct irq_cfg *cfg; + unsigned int irq; + + if (!cpumask_intersects(mask, cpu_online_mask)) + return BAD_APICID; + + irq = desc->irq; + cfg = desc->chip_data; + if (assign_irq_vector(irq, cfg, mask)) + return BAD_APICID; + + /* check that before desc->addinity get updated */ + set_extra_move_desc(desc, mask); + + cpumask_copy(desc->affinity, mask); + + return apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain); +} + +static void +set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask) +{ + struct irq_cfg *cfg; + unsigned long flags; + unsigned int dest; + unsigned int irq; + + irq = desc->irq; + cfg = desc->chip_data; + + spin_lock_irqsave(&ioapic_lock, flags); + dest = set_desc_affinity(desc, mask); + if (dest != BAD_APICID) { + /* Only the high 8 bits are valid. */ + dest = SET_APIC_LOGICAL_ID(dest); + __target_IO_APIC_irq(irq, dest, cfg); + } + spin_unlock_irqrestore(&ioapic_lock, flags); +} + +static void +set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask) +{ + struct irq_desc *desc; + + desc = irq_to_desc(irq); + + set_ioapic_affinity_irq_desc(desc, mask); +} #ifdef CONFIG_INTR_REMAP -- cgit v1.1 From 7a734e7dd93b9aea08ed51036a9a0e2c9dfd8dac Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 1 Apr 2009 18:08:28 -0700 Subject: x86, setup: "glove box" BIOS calls -- infrastructure Impact: new interfaces (not yet used) For all the platforms out there, there is an infinite number of buggy BIOSes. This adds infrastructure to treat BIOS interrupts more like toxic waste and "glove box" them -- we switch out the register set, perform the BIOS interrupt, and then restore the previous state. LKML-Reference: <49DE7F79.4030106@zytor.com> Signed-off-by: H. Peter Anvin Cc: Pavel Machek Cc: Rafael J. Wysocki --- arch/x86/kernel/acpi/realmode/Makefile | 2 +- arch/x86/kernel/acpi/realmode/bioscall.S | 1 + arch/x86/kernel/acpi/realmode/regs.c | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) create mode 100644 arch/x86/kernel/acpi/realmode/bioscall.S create mode 100644 arch/x86/kernel/acpi/realmode/regs.c (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/acpi/realmode/Makefile b/arch/x86/kernel/acpi/realmode/Makefile index 1c31cc0..167bc16 100644 --- a/arch/x86/kernel/acpi/realmode/Makefile +++ b/arch/x86/kernel/acpi/realmode/Makefile @@ -9,7 +9,7 @@ always := wakeup.bin targets := wakeup.elf wakeup.lds -wakeup-y += wakeup.o wakemain.o video-mode.o copy.o +wakeup-y += wakeup.o wakemain.o video-mode.o copy.o bioscall.o regs.o # The link order of the video-*.o modules can matter. In particular, # video-vga.o *must* be listed first, followed by video-vesa.o. diff --git a/arch/x86/kernel/acpi/realmode/bioscall.S b/arch/x86/kernel/acpi/realmode/bioscall.S new file mode 100644 index 0000000..f51eb0b --- /dev/null +++ b/arch/x86/kernel/acpi/realmode/bioscall.S @@ -0,0 +1 @@ +#include "../../../boot/bioscall.S" diff --git a/arch/x86/kernel/acpi/realmode/regs.c b/arch/x86/kernel/acpi/realmode/regs.c new file mode 100644 index 0000000..6206033 --- /dev/null +++ b/arch/x86/kernel/acpi/realmode/regs.c @@ -0,0 +1 @@ +#include "../../../boot/regs.c" -- cgit v1.1 From e71e99c294058a61b7a8b9bb6da2f745ac51aa4f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 25 Mar 2009 14:30:04 -0400 Subject: x86, function-graph: only save return values on x86_64 Impact: speed up The return to handler portion of the function graph tracer should only need to save the return values. The caller already saved off the registers that the callee can modify. The returning function already saved the registers it modified. When we call our own trace function it too will save the registers that the callee must restore. There's no reason to save off anything more that the registers used to return the values. Note, I did a complete kernel build with this modification and the function graph tracer running on x86_64. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index a331ec3..1ac9986 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -147,27 +147,14 @@ END(ftrace_graph_caller) GLOBAL(return_to_handler) subq $80, %rsp + /* Save the return values */ movq %rax, (%rsp) - movq %rcx, 8(%rsp) - movq %rdx, 16(%rsp) - movq %rsi, 24(%rsp) - movq %rdi, 32(%rsp) - movq %r8, 40(%rsp) - movq %r9, 48(%rsp) - movq %r10, 56(%rsp) - movq %r11, 64(%rsp) + movq %rdx, 8(%rsp) call ftrace_return_to_handler movq %rax, 72(%rsp) - movq 64(%rsp), %r11 - movq 56(%rsp), %r10 - movq 48(%rsp), %r9 - movq 40(%rsp), %r8 - movq 32(%rsp), %rdi - movq 24(%rsp), %rsi - movq 16(%rsp), %rdx - movq 8(%rsp), %rcx + movq 8(%rsp), %rdx movq (%rsp), %rax addq $72, %rsp retq -- cgit v1.1 From bda869c614c937c318547c3ee1d65a316b693c21 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Thu, 9 Apr 2009 15:05:10 +0200 Subject: x86: cacheinfo: use L3 cache index disable feature only for CPUs that support it AMD family 0x11 CPU doesn't support the feature. Some AMD family 0x10 CPUs do not support it or have an erratum, see erratum #382 in "Revision Guide for AMD Family 10h Processors, 41322 Rev. 3.40 February 2009". Signed-off-by: Andreas Herrmann CC: Mark Langsdorf Cc: Andrew Morton LKML-Reference: <20090409130510.GG31527@alberich.amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/intel_cacheinfo.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 483eda9..7240126 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -291,6 +291,14 @@ amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf) { if (index < 3) return; + + if (boot_cpu_data.x86 == 0x11) + return; + + /* see erratum #382 */ + if ((boot_cpu_data.x86 == 0x10) && (boot_cpu_data.x86_model < 0x8)) + return; + this_leaf->can_disable = 1; } -- cgit v1.1 From 845d8c761ec763871936c62b837c4a9ea6d0fbdb Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Thu, 9 Apr 2009 15:07:29 +0200 Subject: x86: cacheinfo: correct return value when cache_disable feature is not active Impact: bug fix If user writes to "cache_disable" attribute on a CPU that does not support this feature, the process hangs due to an invalid return value in store_cache_disable(). Signed-off-by: Andreas Herrmann Cc: Andrew Morton Cc: Mark Langsdorf LKML-Reference: <20090409130729.GH31527@alberich.amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/intel_cacheinfo.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 7240126..1ab46e0 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -771,7 +771,7 @@ store_cache_disable(struct _cpuid4_info *this_leaf, const char *buf, unsigned int ret, index, val; if (!this_leaf->can_disable) - return 0; + return -EINVAL; if (strlen(buf) > 15) return -EINVAL; -- cgit v1.1 From afd9fceec55225d33be878927056a548c2eef26c Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Thu, 9 Apr 2009 15:16:17 +0200 Subject: x86: cacheinfo: use cached K8 NB_MISC devices instead of scanning for it Impact: avoid code duplication Signed-off-by: Andreas Herrmann Cc: Andrew Morton Cc: Mark Langsdorf LKML-Reference: <20090409131617.GI31527@alberich.amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/intel_cacheinfo.c | 37 +++-------------------------------- 1 file changed, 3 insertions(+), 34 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 1ab46e0..0cde071 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -17,6 +17,7 @@ #include #include +#include #define LVL_1_INST 1 #define LVL_1_DATA 2 @@ -159,14 +160,6 @@ struct _cpuid4_info_regs { unsigned long can_disable; }; -#if defined(CONFIG_PCI) && defined(CONFIG_SYSFS) -static struct pci_device_id k8_nb_id[] = { - { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1103) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1203) }, - {} -}; -#endif - unsigned short num_cache_leaves; /* AMD doesn't have CPUID4. Emulate it here to report the same @@ -704,30 +697,6 @@ static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf) #define to_object(k) container_of(k, struct _index_kobject, kobj) #define to_attr(a) container_of(a, struct _cache_attr, attr) -#ifdef CONFIG_PCI -static struct pci_dev *get_k8_northbridge(int node) -{ - struct pci_dev *dev = NULL; - int i; - - for (i = 0; i <= node; i++) { - do { - dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev); - if (!dev) - break; - } while (!pci_match_id(&k8_nb_id[0], dev)); - if (!dev) - break; - } - return dev; -} -#else -static struct pci_dev *get_k8_northbridge(int node) -{ - return NULL; -} -#endif - static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf) { const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map); @@ -739,7 +708,7 @@ static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf) if (!this_leaf->can_disable) return sprintf(buf, "Feature not enabled\n"); - dev = get_k8_northbridge(node); + dev = node_to_k8_nb_misc(node); if (!dev) { printk(KERN_ERR "Attempting AMD northbridge operation on a system with no northbridge\n"); return -EINVAL; @@ -783,7 +752,7 @@ store_cache_disable(struct _cpuid4_info *this_leaf, const char *buf, return -EINVAL; val |= 0xc0000000; - dev = get_k8_northbridge(node); + dev = node_to_k8_nb_misc(node); if (!dev) { printk(KERN_ERR "Attempting AMD northbridge operation on a system with no northbridge\n"); return -EINVAL; -- cgit v1.1 From f8b201fc7110c3673437254e8ba02451461ece0b Mon Sep 17 00:00:00 2001 From: Mark Langsdorf Date: Thu, 9 Apr 2009 15:18:49 +0200 Subject: x86: cacheinfo: replace sysfs interface for cache_disable feature Impact: replace sysfs attribute Current interface violates against "one-value-per-sysfs-attribute rule". This patch replaces current attribute with two attributes -- one for each L3 Cache Index Disable register. Signed-off-by: Mark Langsdorf Signed-off-by: Andreas Herrmann Cc: Andrew Morton LKML-Reference: <20090409131849.GJ31527@alberich.amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/intel_cacheinfo.c | 90 +++++++++++++++++------------------ 1 file changed, 45 insertions(+), 45 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 0cde071..fc28291 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -697,73 +697,69 @@ static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf) #define to_object(k) container_of(k, struct _index_kobject, kobj) #define to_attr(a) container_of(a, struct _cache_attr, attr) -static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf) +static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf, + unsigned int index) { - const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map); - int node = cpu_to_node(cpumask_first(mask)); - struct pci_dev *dev = NULL; - ssize_t ret = 0; - int i; + int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); + int node = cpu_to_node(cpu); + struct pci_dev *dev = node_to_k8_nb_misc(node); + unsigned int reg = 0; if (!this_leaf->can_disable) - return sprintf(buf, "Feature not enabled\n"); - - dev = node_to_k8_nb_misc(node); - if (!dev) { - printk(KERN_ERR "Attempting AMD northbridge operation on a system with no northbridge\n"); return -EINVAL; - } - for (i = 0; i < 2; i++) { - unsigned int reg; + if (!dev) + return -EINVAL; - pci_read_config_dword(dev, 0x1BC + i * 4, ®); + pci_read_config_dword(dev, 0x1BC + index * 4, ®); + return sprintf(buf, "%x\n", reg); +} - ret += sprintf(buf, "%sEntry: %d\n", buf, i); - ret += sprintf(buf, "%sReads: %s\tNew Entries: %s\n", - buf, - reg & 0x80000000 ? "Disabled" : "Allowed", - reg & 0x40000000 ? "Disabled" : "Allowed"); - ret += sprintf(buf, "%sSubCache: %x\tIndex: %x\n", - buf, (reg & 0x30000) >> 16, reg & 0xfff); - } - return ret; +#define SHOW_CACHE_DISABLE(index) \ +static ssize_t \ +show_cache_disable_##index(struct _cpuid4_info *this_leaf, char *buf) \ +{ \ + return show_cache_disable(this_leaf, buf, index); \ } +SHOW_CACHE_DISABLE(0) +SHOW_CACHE_DISABLE(1) -static ssize_t -store_cache_disable(struct _cpuid4_info *this_leaf, const char *buf, - size_t count) +static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, + const char *buf, size_t count, unsigned int index) { - const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map); - int node = cpu_to_node(cpumask_first(mask)); - struct pci_dev *dev = NULL; - unsigned int ret, index, val; + int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); + int node = cpu_to_node(cpu); + struct pci_dev *dev = node_to_k8_nb_misc(node); + unsigned long val = 0; if (!this_leaf->can_disable) return -EINVAL; - if (strlen(buf) > 15) - return -EINVAL; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; - ret = sscanf(buf, "%x %x", &index, &val); - if (ret != 2) - return -EINVAL; - if (index > 1) + if (!dev) return -EINVAL; - val |= 0xc0000000; - dev = node_to_k8_nb_misc(node); - if (!dev) { - printk(KERN_ERR "Attempting AMD northbridge operation on a system with no northbridge\n"); + if (strict_strtoul(buf, 10, &val) < 0) return -EINVAL; - } + val |= 0xc0000000; pci_write_config_dword(dev, 0x1BC + index * 4, val & ~0x40000000); wbinvd(); pci_write_config_dword(dev, 0x1BC + index * 4, val); + return count; +} - return 1; +#define STORE_CACHE_DISABLE(index) \ +static ssize_t \ +store_cache_disable_##index(struct _cpuid4_info *this_leaf, \ + const char *buf, size_t count) \ +{ \ + return store_cache_disable(this_leaf, buf, count, index); \ } +STORE_CACHE_DISABLE(0) +STORE_CACHE_DISABLE(1) struct _cache_attr { struct attribute attr; @@ -785,7 +781,10 @@ define_one_ro(size); define_one_ro(shared_cpu_map); define_one_ro(shared_cpu_list); -static struct _cache_attr cache_disable = __ATTR(cache_disable, 0644, show_cache_disable, store_cache_disable); +static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644, + show_cache_disable_0, store_cache_disable_0); +static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644, + show_cache_disable_1, store_cache_disable_1); static struct attribute * default_attrs[] = { &type.attr, @@ -797,7 +796,8 @@ static struct attribute * default_attrs[] = { &size.attr, &shared_cpu_map.attr, &shared_cpu_list.attr, - &cache_disable.attr, + &cache_disable_0.attr, + &cache_disable_1.attr, NULL }; -- cgit v1.1 From ba518bea2db21c72d44a6cbfd825b026ef9cdcb6 Mon Sep 17 00:00:00 2001 From: Mark Langsdorf Date: Thu, 9 Apr 2009 15:24:06 +0200 Subject: x86: cacheinfo: disable L3 ECC scrubbing when L3 cache index is disabled (Use correct mask to zero out bits 24-28 by Andreas) Signed-off-by: Mark Langsdorf Signed-off-by: Andreas Herrmann Cc: Andrew Morton LKML-Reference: <20090409132406.GK31527@alberich.amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/intel_cacheinfo.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index fc28291..d46a849 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -731,6 +731,7 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, int node = cpu_to_node(cpu); struct pci_dev *dev = node_to_k8_nb_misc(node); unsigned long val = 0; + unsigned int scrubber = 0; if (!this_leaf->can_disable) return -EINVAL; @@ -745,6 +746,11 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, return -EINVAL; val |= 0xc0000000; + + pci_read_config_dword(dev, 0x58, &scrubber); + scrubber &= ~0x1f000000; + pci_write_config_dword(dev, 0x58, scrubber); + pci_write_config_dword(dev, 0x1BC + index * 4, val & ~0x40000000); wbinvd(); pci_write_config_dword(dev, 0x1BC + index * 4, val); -- cgit v1.1 From f465145235313c451164bdfa9037ac254bf00c9a Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Thu, 9 Apr 2009 11:52:18 +0300 Subject: x86: move x86_quirk_pre_intr_init() to irqinit_32.c Impact: cleanup In preparation for unifying irqinit_{32,64}.c, make x86_quirk_pre_intr_init() local to irqinit_32.c. Reviewed-by Cyrill Gorcunov Signed-off-by: Pekka Enberg Signed-off-by: Ingo Molnar --- arch/x86/kernel/irqinit_32.c | 20 +++++++++++++++++++- arch/x86/kernel/setup.c | 18 ------------------ 2 files changed, 19 insertions(+), 19 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c index 368b0a8..0c0dedc 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c @@ -53,7 +53,7 @@ static struct irqaction fpu_irq = { .name = "fpu", }; -void __init init_ISA_irqs(void) +static void __init init_ISA_irqs(void) { int i; @@ -121,6 +121,24 @@ int vector_used_by_percpu_irq(unsigned int vector) /* Overridden in paravirt.c */ void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); +/** + * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors + * + * Description: + * Perform any necessary interrupt initialisation prior to setting up + * the "ordinary" interrupt call gates. For legacy reasons, the ISA + * interrupts should be initialised here if the machine emulates a PC + * in any way. + **/ +static void __init x86_quirk_pre_intr_init(void) +{ + if (x86_quirks->arch_pre_intr_init) { + if (x86_quirks->arch_pre_intr_init()) + return; + } + init_ISA_irqs(); +} + void __init native_init_IRQ(void) { int i; diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index b415843..523bb69 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -997,24 +997,6 @@ void __init setup_arch(char **cmdline_p) #ifdef CONFIG_X86_32 /** - * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors - * - * Description: - * Perform any necessary interrupt initialisation prior to setting up - * the "ordinary" interrupt call gates. For legacy reasons, the ISA - * interrupts should be initialised here if the machine emulates a PC - * in any way. - **/ -void __init x86_quirk_pre_intr_init(void) -{ - if (x86_quirks->arch_pre_intr_init) { - if (x86_quirks->arch_pre_intr_init()) - return; - } - init_ISA_irqs(); -} - -/** * x86_quirk_intr_init - post gate setup interrupt initialisation * * Description: -- cgit v1.1 From 7371d9fcb88dc9185be9719f64744a339c537a92 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Thu, 9 Apr 2009 11:52:19 +0300 Subject: x86: move init_ISA_irqs() in irqinit_32.c to match ordering in irqinit_64.c Impact: cleanup Reviewed-by Cyrill Gorcunov Signed-off-by: Pekka Enberg Signed-off-by: Ingo Molnar --- arch/x86/kernel/irqinit_32.c | 48 ++++++++++++++++++++++---------------------- 1 file changed, 24 insertions(+), 24 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c index 0c0dedc..c5cb769 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c @@ -53,30 +53,6 @@ static struct irqaction fpu_irq = { .name = "fpu", }; -static void __init init_ISA_irqs(void) -{ - int i; - -#ifdef CONFIG_X86_LOCAL_APIC - init_bsp_APIC(); -#endif - init_8259A(0); - - /* - * 16 old-style INTA-cycle interrupts: - */ - for (i = 0; i < NR_IRQS_LEGACY; i++) { - struct irq_desc *desc = irq_to_desc(i); - - desc->status = IRQ_DISABLED; - desc->action = NULL; - desc->depth = 1; - - set_irq_chip_and_handler_name(i, &i8259A_chip, - handle_level_irq, "XT"); - } -} - /* * IRQ2 is cascade interrupt to second interrupt controller */ @@ -118,6 +94,30 @@ int vector_used_by_percpu_irq(unsigned int vector) return 0; } +static void __init init_ISA_irqs(void) +{ + int i; + +#ifdef CONFIG_X86_LOCAL_APIC + init_bsp_APIC(); +#endif + init_8259A(0); + + /* + * 16 old-style INTA-cycle interrupts: + */ + for (i = 0; i < NR_IRQS_LEGACY; i++) { + struct irq_desc *desc = irq_to_desc(i); + + desc->status = IRQ_DISABLED; + desc->action = NULL; + desc->depth = 1; + + set_irq_chip_and_handler_name(i, &i8259A_chip, + handle_level_irq, "XT"); + } +} + /* Overridden in paravirt.c */ void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); -- cgit v1.1 From 36290d87f5abf260a543e5b711be4ceed03e6b1a Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Thu, 9 Apr 2009 11:52:20 +0300 Subject: x86: introduce smp_intr_init() in irqinit_32.c Impact: cleanup Reviewed-by Cyrill Gorcunov Signed-off-by: Pekka Enberg Signed-off-by: Ingo Molnar --- arch/x86/kernel/irqinit_32.c | 61 ++++++++++++++++++++++++-------------------- 1 file changed, 33 insertions(+), 28 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c index c5cb769..df0aad5 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c @@ -121,6 +121,38 @@ static void __init init_ISA_irqs(void) /* Overridden in paravirt.c */ void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); +static void __init smp_intr_init(void) +{ +#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_SMP) + /* + * The reschedule interrupt is a CPU-to-CPU reschedule-helper + * IPI, driven by wakeup. + */ + alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); + + /* IPIs for invalidation */ + alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0); + alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1); + alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2); + alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3); + alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4); + alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5); + alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6); + alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7); + + /* IPI for generic function call */ + alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); + + /* IPI for single call function */ + alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, + call_function_single_interrupt); + + /* Low priority IPI to cleanup after moving an irq */ + set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); + set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors); +#endif +} + /** * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors * @@ -158,34 +190,7 @@ void __init native_init_IRQ(void) } -#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_SMP) - /* - * The reschedule interrupt is a CPU-to-CPU reschedule-helper - * IPI, driven by wakeup. - */ - alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); - - /* IPIs for invalidation */ - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7); - - /* IPI for generic function call */ - alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); - - /* IPI for single call function */ - alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, - call_function_single_interrupt); - - /* Low priority IPI to cleanup after moving an irq */ - set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); - set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors); -#endif + smp_intr_init(); #ifdef CONFIG_X86_LOCAL_APIC /* self generated IPI for local APIC timer */ -- cgit v1.1 From 22813c45228160b07244a7c4ed7580388ac0f33d Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Thu, 9 Apr 2009 11:52:21 +0300 Subject: x86: introduce apic_intr_init() in irqinit_32.c Impact: cleanup Reviewed-by Cyrill Gorcunov Signed-off-by: Pekka Enberg Signed-off-by: Ingo Molnar --- arch/x86/kernel/irqinit_32.c | 40 ++++++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 18 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c index df0aad5..9ba68c4 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c @@ -171,25 +171,8 @@ static void __init x86_quirk_pre_intr_init(void) init_ISA_irqs(); } -void __init native_init_IRQ(void) +static void __init apic_intr_init(void) { - int i; - - /* Execute any quirks before the call gates are initialised: */ - x86_quirk_pre_intr_init(); - - /* - * Cover the whole vector space, no vector can escape - * us. (some of these will be overridden and become - * 'special' SMP interrupts) - */ - for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { - /* SYSCALL_VECTOR was reserved in trap_init. */ - if (i != SYSCALL_VECTOR) - set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]); - } - - smp_intr_init(); #ifdef CONFIG_X86_LOCAL_APIC @@ -208,6 +191,27 @@ void __init native_init_IRQ(void) /* thermal monitor LVT interrupt */ alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); #endif +} + +void __init native_init_IRQ(void) +{ + int i; + + /* Execute any quirks before the call gates are initialised: */ + x86_quirk_pre_intr_init(); + + /* + * Cover the whole vector space, no vector can escape + * us. (some of these will be overridden and become + * 'special' SMP interrupts) + */ + for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { + /* SYSCALL_VECTOR was reserved in trap_init. */ + if (i != SYSCALL_VECTOR) + set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]); + } + + apic_intr_init(); if (!acpi_ioapic) setup_irq(2, &irq2); -- cgit v1.1 From d3496c85cae22fb7713af6ed542a6aeae8ee4210 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Thu, 9 Apr 2009 11:52:22 +0300 Subject: x86: use identical loop constructs in 32-bit and 64-bit native_init_IRQ() Impact: cleanup Reviewed-by Cyrill Gorcunov Signed-off-by: Pekka Enberg Signed-off-by: Ingo Molnar --- arch/x86/kernel/irqinit_32.c | 2 +- arch/x86/kernel/irqinit_64.c | 9 +++++---- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c index 9ba68c4..1029a18 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c @@ -205,7 +205,7 @@ void __init native_init_IRQ(void) * us. (some of these will be overridden and become * 'special' SMP interrupts) */ - for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { + for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { /* SYSCALL_VECTOR was reserved in trap_init. */ if (i != SYSCALL_VECTOR) set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]); diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c index 8cd1053..1c8858b 100644 --- a/arch/x86/kernel/irqinit_64.c +++ b/arch/x86/kernel/irqinit_64.c @@ -159,15 +159,16 @@ void __init native_init_IRQ(void) int i; init_ISA_irqs(); + /* * Cover the whole vector space, no vector can escape * us. (some of these will be overridden and become * 'special' SMP interrupts) */ - for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { - int vector = FIRST_EXTERNAL_VECTOR + i; - if (vector != IA32_SYSCALL_VECTOR) - set_intr_gate(vector, interrupt[i]); + for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { + /* IA32_SYSCALL_VECTOR was reserved in trap_init. */ + if (i != IA32_SYSCALL_VECTOR) + set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]); } apic_intr_init(); -- cgit v1.1 From b0096bb0b640d0a7713618b3472fd0f4adf30a96 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Thu, 9 Apr 2009 11:52:23 +0300 Subject: x86: unify smp_intr_init() in irqinit_{32,64}.h Impact: cleanup Reviewed-by Cyrill Gorcunov Signed-off-by: Pekka Enberg Signed-off-by: Ingo Molnar --- arch/x86/kernel/irqinit_32.c | 8 +++++--- arch/x86/kernel/irqinit_64.c | 2 ++ 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c index 1029a18..ef2528d 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c @@ -123,7 +123,8 @@ void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); static void __init smp_intr_init(void) { -#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_SMP) +#ifdef CONFIG_SMP +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC) /* * The reschedule interrupt is a CPU-to-CPU reschedule-helper * IPI, driven by wakeup. @@ -143,14 +144,15 @@ static void __init smp_intr_init(void) /* IPI for generic function call */ alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); - /* IPI for single call function */ + /* IPI for generic single function call */ alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, - call_function_single_interrupt); + call_function_single_interrupt); /* Low priority IPI to cleanup after moving an irq */ set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors); #endif +#endif /* CONFIG_SMP */ } /** diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c index 1c8858b..9e7c57d 100644 --- a/arch/x86/kernel/irqinit_64.c +++ b/arch/x86/kernel/irqinit_64.c @@ -107,6 +107,7 @@ void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); static void __init smp_intr_init(void) { #ifdef CONFIG_SMP +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC) /* * The reschedule interrupt is a CPU-to-CPU reschedule-helper * IPI, driven by wakeup. @@ -134,6 +135,7 @@ static void __init smp_intr_init(void) set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors); #endif +#endif /* CONFIG_SMP */ } static void __init apic_intr_init(void) -- cgit v1.1 From 598c73d250ffb112715aa48fb325d79e255be23b Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Thu, 9 Apr 2009 11:52:24 +0300 Subject: x86: unify init_ISA_irqs() in irqinit_{32,64}.c Impact: cleanup Reviewed-by Cyrill Gorcunov Signed-off-by: Pekka Enberg Signed-off-by: Ingo Molnar --- arch/x86/kernel/irqinit_32.c | 2 +- arch/x86/kernel/irqinit_64.c | 10 ++++++---- 2 files changed, 7 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c index ef2528d..4488b71 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c @@ -98,7 +98,7 @@ static void __init init_ISA_irqs(void) { int i; -#ifdef CONFIG_X86_LOCAL_APIC +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC) init_bsp_APIC(); #endif init_8259A(0); diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c index 9e7c57d..61c9a92 100644 --- a/arch/x86/kernel/irqinit_64.c +++ b/arch/x86/kernel/irqinit_64.c @@ -84,9 +84,14 @@ static void __init init_ISA_irqs(void) { int i; +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC) init_bsp_APIC(); +#endif init_8259A(0); + /* + * 16 old-style INTA-cycle interrupts: + */ for (i = 0; i < NR_IRQS_LEGACY; i++) { struct irq_desc *desc = irq_to_desc(i); @@ -94,11 +99,8 @@ static void __init init_ISA_irqs(void) desc->action = NULL; desc->depth = 1; - /* - * 16 old-style INTA-cycle interrupts: - */ set_irq_chip_and_handler_name(i, &i8259A_chip, - handle_level_irq, "XT"); + handle_level_irq, "XT"); } } -- cgit v1.1 From 320fd99672a44ece6d1cd0d838ba31c8ebbf5979 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Thu, 9 Apr 2009 11:52:25 +0300 Subject: x86: unify native_init_IRQ() in irqinit_{32,64}.c Impact: cleanup Reviewed-by Cyrill Gorcunov Signed-off-by: Pekka Enberg Signed-off-by: Ingo Molnar --- arch/x86/kernel/irqinit_32.c | 53 ++++++++++++++++++---------- arch/x86/kernel/irqinit_64.c | 82 +++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 115 insertions(+), 20 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c index 4488b71..a780de3 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c @@ -22,7 +22,7 @@ #include #include - +#ifdef CONFIG_X86_32 /* * Note that on a 486, we don't want to do a SIGFPE on an irq13 * as the irq is unreliable, and exception 16 works correctly @@ -52,6 +52,7 @@ static struct irqaction fpu_irq = { .handler = math_error_irq, .name = "fpu", }; +#endif /* * IRQ2 is cascade interrupt to second interrupt controller @@ -155,24 +156,6 @@ static void __init smp_intr_init(void) #endif /* CONFIG_SMP */ } -/** - * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors - * - * Description: - * Perform any necessary interrupt initialisation prior to setting up - * the "ordinary" interrupt call gates. For legacy reasons, the ISA - * interrupts should be initialised here if the machine emulates a PC - * in any way. - **/ -static void __init x86_quirk_pre_intr_init(void) -{ - if (x86_quirks->arch_pre_intr_init) { - if (x86_quirks->arch_pre_intr_init()) - return; - } - init_ISA_irqs(); -} - static void __init apic_intr_init(void) { smp_intr_init(); @@ -195,12 +178,36 @@ static void __init apic_intr_init(void) #endif } +#ifdef CONFIG_X86_32 +/** + * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors + * + * Description: + * Perform any necessary interrupt initialisation prior to setting up + * the "ordinary" interrupt call gates. For legacy reasons, the ISA + * interrupts should be initialised here if the machine emulates a PC + * in any way. + **/ +static void __init x86_quirk_pre_intr_init(void) +{ + if (x86_quirks->arch_pre_intr_init) { + if (x86_quirks->arch_pre_intr_init()) + return; + } + init_ISA_irqs(); +} +#endif + void __init native_init_IRQ(void) { int i; +#ifdef CONFIG_X86_32 /* Execute any quirks before the call gates are initialised: */ x86_quirk_pre_intr_init(); +#else + init_ISA_irqs(); +#endif /* * Cover the whole vector space, no vector can escape @@ -208,9 +215,15 @@ void __init native_init_IRQ(void) * 'special' SMP interrupts) */ for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { +#ifdef CONFIG_X86_32 /* SYSCALL_VECTOR was reserved in trap_init. */ if (i != SYSCALL_VECTOR) set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]); +#else + /* IA32_SYSCALL_VECTOR was reserved in trap_init. */ + if (i != IA32_SYSCALL_VECTOR) + set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]); +#endif } apic_intr_init(); @@ -218,6 +231,7 @@ void __init native_init_IRQ(void) if (!acpi_ioapic) setup_irq(2, &irq2); +#ifdef CONFIG_X86_32 /* * Call quirks after call gates are initialised (usually add in * the architecture specific gates): @@ -232,4 +246,5 @@ void __init native_init_IRQ(void) setup_irq(FPU_IRQ, &fpu_irq); irq_ctx_init(smp_processor_id()); +#endif } diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c index 61c9a92..ed50e35 100644 --- a/arch/x86/kernel/irqinit_64.c +++ b/arch/x86/kernel/irqinit_64.c @@ -39,14 +39,46 @@ * (these are usually mapped into the 0x30-0xff vector range) */ +#ifdef CONFIG_X86_32 /* - * IRQ2 is cascade interrupt to second interrupt controller + * Note that on a 486, we don't want to do a SIGFPE on an irq13 + * as the irq is unreliable, and exception 16 works correctly + * (ie as explained in the intel literature). On a 386, you + * can't use exception 16 due to bad IBM design, so we have to + * rely on the less exact irq13. + * + * Careful.. Not only is IRQ13 unreliable, but it is also + * leads to races. IBM designers who came up with it should + * be shot. + */ + +static irqreturn_t math_error_irq(int cpl, void *dev_id) +{ + outb(0, 0xF0); + if (ignore_fpu_irq || !boot_cpu_data.hard_math) + return IRQ_NONE; + math_error((void __user *)get_irq_regs()->ip); + return IRQ_HANDLED; +} + +/* + * New motherboards sometimes make IRQ 13 be a PCI interrupt, + * so allow interrupt sharing. */ +static struct irqaction fpu_irq = { + .handler = math_error_irq, + .name = "fpu", +}; +#endif +/* + * IRQ2 is cascade interrupt to second interrupt controller + */ static struct irqaction irq2 = { .handler = no_action, .name = "cascade", }; + DEFINE_PER_CPU(vector_irq_t, vector_irq) = { [0 ... IRQ0_VECTOR - 1] = -1, [IRQ0_VECTOR] = 0, @@ -158,11 +190,36 @@ static void __init apic_intr_init(void) alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); } +#ifdef CONFIG_X86_32 +/** + * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors + * + * Description: + * Perform any necessary interrupt initialisation prior to setting up + * the "ordinary" interrupt call gates. For legacy reasons, the ISA + * interrupts should be initialised here if the machine emulates a PC + * in any way. + **/ +static void __init x86_quirk_pre_intr_init(void) +{ + if (x86_quirks->arch_pre_intr_init) { + if (x86_quirks->arch_pre_intr_init()) + return; + } + init_ISA_irqs(); +} +#endif + void __init native_init_IRQ(void) { int i; +#ifdef CONFIG_X86_32 + /* Execute any quirks before the call gates are initialised: */ + x86_quirk_pre_intr_init(); +#else init_ISA_irqs(); +#endif /* * Cover the whole vector space, no vector can escape @@ -170,13 +227,36 @@ void __init native_init_IRQ(void) * 'special' SMP interrupts) */ for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { +#ifdef CONFIG_X86_32 + /* SYSCALL_VECTOR was reserved in trap_init. */ + if (i != SYSCALL_VECTOR) + set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]); +#else /* IA32_SYSCALL_VECTOR was reserved in trap_init. */ if (i != IA32_SYSCALL_VECTOR) set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]); +#endif } apic_intr_init(); if (!acpi_ioapic) setup_irq(2, &irq2); + +#ifdef CONFIG_X86_32 + /* + * Call quirks after call gates are initialised (usually add in + * the architecture specific gates): + */ + x86_quirk_intr_init(); + + /* + * External FPU? Set up irq13 if so, for + * original braindamaged IBM FERR coupling. + */ + if (boot_cpu_data.hard_math && !cpu_has_fpu) + setup_irq(FPU_IRQ, &fpu_irq); + + irq_ctx_init(smp_processor_id()); +#endif } -- cgit v1.1 From 778838600eb6973bdb6fd11e7f91b43cea4d6f45 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Thu, 9 Apr 2009 11:52:26 +0300 Subject: x86: unify trivial differences in irqinit_{32,64}.c Impact: cleanup Reviewed-by Cyrill Gorcunov Signed-off-by: Pekka Enberg Signed-off-by: Ingo Molnar --- arch/x86/kernel/irqinit_32.c | 20 ++++++++++++++++++++ arch/x86/kernel/irqinit_64.c | 4 ++++ 2 files changed, 24 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c index a780de3..72ce942 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c @@ -1,20 +1,24 @@ +#include #include #include #include #include #include +#include #include #include #include #include #include #include +#include #include #include #include #include #include +#include #include #include #include @@ -22,6 +26,22 @@ #include #include +/* + * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts: + * (these are usually mapped to vectors 0x30-0x3f) + */ + +/* + * The IO-APIC gives us many more interrupt sources. Most of these + * are unused but an SMP system is supposed to have enough memory ... + * sometimes (mostly wrt. hw bugs) we get corrupted vectors all + * across the spectrum, so we really want to be prepared to get all + * of these. Plus, more powerful systems might have more than 64 + * IO-APIC registers. + * + * (these are usually mapped into the 0x30-0xff vector range) + */ + #ifdef CONFIG_X86_32 /* * Note that on a 486, we don't want to do a SIGFPE on an irq13 diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c index ed50e35..687b6c3 100644 --- a/arch/x86/kernel/irqinit_64.c +++ b/arch/x86/kernel/irqinit_64.c @@ -17,11 +17,14 @@ #include #include +#include #include #include #include #include +#include #include +#include /* * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts: @@ -136,6 +139,7 @@ static void __init init_ISA_irqs(void) } } +/* Overridden in paravirt.c */ void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); static void __init smp_intr_init(void) -- cgit v1.1 From ab19c25abd14db28d7454f00805ea59f22ed6057 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Thu, 9 Apr 2009 11:52:27 +0300 Subject: x86: unify apic_intr_init() in irqinit_{32,64}.c Impact: cleanup Reviewed-by Cyrill Gorcunov Signed-off-by: Pekka Enberg Signed-off-by: Ingo Molnar --- arch/x86/kernel/irqinit_32.c | 9 ++++++++- arch/x86/kernel/irqinit_64.c | 11 +++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c index 72ce942..f3be5e9 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c @@ -180,7 +180,12 @@ static void __init apic_intr_init(void) { smp_intr_init(); -#ifdef CONFIG_X86_LOCAL_APIC +#ifdef CONFIG_X86_64 + alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); + alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); +#endif + +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC) /* self generated IPI for local APIC timer */ alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); @@ -192,10 +197,12 @@ static void __init apic_intr_init(void) alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); #endif +#ifdef CONFIG_X86_32 #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_MCE_P4THERMAL) /* thermal monitor LVT interrupt */ alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); #endif +#endif } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c index 687b6c3..f3be5e9 100644 --- a/arch/x86/kernel/irqinit_64.c +++ b/arch/x86/kernel/irqinit_64.c @@ -180,9 +180,12 @@ static void __init apic_intr_init(void) { smp_intr_init(); +#ifdef CONFIG_X86_64 alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); +#endif +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC) /* self generated IPI for local APIC timer */ alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); @@ -192,6 +195,14 @@ static void __init apic_intr_init(void) /* IPI vectors for APIC spurious and error interrupts */ alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); +#endif + +#ifdef CONFIG_X86_32 +#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_MCE_P4THERMAL) + /* thermal monitor LVT interrupt */ + alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); +#endif +#endif } #ifdef CONFIG_X86_32 -- cgit v1.1 From 31cb45ef2600d47191d51253ec94b5e3f689260d Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Thu, 9 Apr 2009 11:52:28 +0300 Subject: x86: unify irqinit_{32,64}.c into irqinit.c Impact: cleanup Reviewed-by Cyrill Gorcunov Signed-off-by: Pekka Enberg Signed-off-by: Ingo Molnar --- arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/irqinit.c | 277 +++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/irqinit_32.c | 277 ------------------------------------------- arch/x86/kernel/irqinit_64.c | 277 ------------------------------------------- 4 files changed, 278 insertions(+), 555 deletions(-) create mode 100644 arch/x86/kernel/irqinit.c delete mode 100644 arch/x86/kernel/irqinit_32.c delete mode 100644 arch/x86/kernel/irqinit_64.c (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 145cce7..16e3acf 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -28,7 +28,7 @@ CFLAGS_paravirt.o := $(nostackp) obj-y := process_$(BITS).o signal.o entry_$(BITS).o obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o obj-y += time_$(BITS).o ioport.o ldt.o dumpstack.o -obj-y += setup.o i8259.o irqinit_$(BITS).o +obj-y += setup.o i8259.o irqinit.o obj-$(CONFIG_X86_VISWS) += visws_quirks.o obj-$(CONFIG_X86_32) += probe_roms_32.o obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c new file mode 100644 index 0000000..f3be5e9 --- /dev/null +++ b/arch/x86/kernel/irqinit.c @@ -0,0 +1,277 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts: + * (these are usually mapped to vectors 0x30-0x3f) + */ + +/* + * The IO-APIC gives us many more interrupt sources. Most of these + * are unused but an SMP system is supposed to have enough memory ... + * sometimes (mostly wrt. hw bugs) we get corrupted vectors all + * across the spectrum, so we really want to be prepared to get all + * of these. Plus, more powerful systems might have more than 64 + * IO-APIC registers. + * + * (these are usually mapped into the 0x30-0xff vector range) + */ + +#ifdef CONFIG_X86_32 +/* + * Note that on a 486, we don't want to do a SIGFPE on an irq13 + * as the irq is unreliable, and exception 16 works correctly + * (ie as explained in the intel literature). On a 386, you + * can't use exception 16 due to bad IBM design, so we have to + * rely on the less exact irq13. + * + * Careful.. Not only is IRQ13 unreliable, but it is also + * leads to races. IBM designers who came up with it should + * be shot. + */ + +static irqreturn_t math_error_irq(int cpl, void *dev_id) +{ + outb(0, 0xF0); + if (ignore_fpu_irq || !boot_cpu_data.hard_math) + return IRQ_NONE; + math_error((void __user *)get_irq_regs()->ip); + return IRQ_HANDLED; +} + +/* + * New motherboards sometimes make IRQ 13 be a PCI interrupt, + * so allow interrupt sharing. + */ +static struct irqaction fpu_irq = { + .handler = math_error_irq, + .name = "fpu", +}; +#endif + +/* + * IRQ2 is cascade interrupt to second interrupt controller + */ +static struct irqaction irq2 = { + .handler = no_action, + .name = "cascade", +}; + +DEFINE_PER_CPU(vector_irq_t, vector_irq) = { + [0 ... IRQ0_VECTOR - 1] = -1, + [IRQ0_VECTOR] = 0, + [IRQ1_VECTOR] = 1, + [IRQ2_VECTOR] = 2, + [IRQ3_VECTOR] = 3, + [IRQ4_VECTOR] = 4, + [IRQ5_VECTOR] = 5, + [IRQ6_VECTOR] = 6, + [IRQ7_VECTOR] = 7, + [IRQ8_VECTOR] = 8, + [IRQ9_VECTOR] = 9, + [IRQ10_VECTOR] = 10, + [IRQ11_VECTOR] = 11, + [IRQ12_VECTOR] = 12, + [IRQ13_VECTOR] = 13, + [IRQ14_VECTOR] = 14, + [IRQ15_VECTOR] = 15, + [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1 +}; + +int vector_used_by_percpu_irq(unsigned int vector) +{ + int cpu; + + for_each_online_cpu(cpu) { + if (per_cpu(vector_irq, cpu)[vector] != -1) + return 1; + } + + return 0; +} + +static void __init init_ISA_irqs(void) +{ + int i; + +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC) + init_bsp_APIC(); +#endif + init_8259A(0); + + /* + * 16 old-style INTA-cycle interrupts: + */ + for (i = 0; i < NR_IRQS_LEGACY; i++) { + struct irq_desc *desc = irq_to_desc(i); + + desc->status = IRQ_DISABLED; + desc->action = NULL; + desc->depth = 1; + + set_irq_chip_and_handler_name(i, &i8259A_chip, + handle_level_irq, "XT"); + } +} + +/* Overridden in paravirt.c */ +void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); + +static void __init smp_intr_init(void) +{ +#ifdef CONFIG_SMP +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC) + /* + * The reschedule interrupt is a CPU-to-CPU reschedule-helper + * IPI, driven by wakeup. + */ + alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); + + /* IPIs for invalidation */ + alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0); + alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1); + alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2); + alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3); + alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4); + alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5); + alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6); + alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7); + + /* IPI for generic function call */ + alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); + + /* IPI for generic single function call */ + alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, + call_function_single_interrupt); + + /* Low priority IPI to cleanup after moving an irq */ + set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); + set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors); +#endif +#endif /* CONFIG_SMP */ +} + +static void __init apic_intr_init(void) +{ + smp_intr_init(); + +#ifdef CONFIG_X86_64 + alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); + alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); +#endif + +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC) + /* self generated IPI for local APIC timer */ + alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); + + /* generic IPI for platform specific use */ + alloc_intr_gate(GENERIC_INTERRUPT_VECTOR, generic_interrupt); + + /* IPI vectors for APIC spurious and error interrupts */ + alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); + alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); +#endif + +#ifdef CONFIG_X86_32 +#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_MCE_P4THERMAL) + /* thermal monitor LVT interrupt */ + alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); +#endif +#endif +} + +#ifdef CONFIG_X86_32 +/** + * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors + * + * Description: + * Perform any necessary interrupt initialisation prior to setting up + * the "ordinary" interrupt call gates. For legacy reasons, the ISA + * interrupts should be initialised here if the machine emulates a PC + * in any way. + **/ +static void __init x86_quirk_pre_intr_init(void) +{ + if (x86_quirks->arch_pre_intr_init) { + if (x86_quirks->arch_pre_intr_init()) + return; + } + init_ISA_irqs(); +} +#endif + +void __init native_init_IRQ(void) +{ + int i; + +#ifdef CONFIG_X86_32 + /* Execute any quirks before the call gates are initialised: */ + x86_quirk_pre_intr_init(); +#else + init_ISA_irqs(); +#endif + + /* + * Cover the whole vector space, no vector can escape + * us. (some of these will be overridden and become + * 'special' SMP interrupts) + */ + for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { +#ifdef CONFIG_X86_32 + /* SYSCALL_VECTOR was reserved in trap_init. */ + if (i != SYSCALL_VECTOR) + set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]); +#else + /* IA32_SYSCALL_VECTOR was reserved in trap_init. */ + if (i != IA32_SYSCALL_VECTOR) + set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]); +#endif + } + + apic_intr_init(); + + if (!acpi_ioapic) + setup_irq(2, &irq2); + +#ifdef CONFIG_X86_32 + /* + * Call quirks after call gates are initialised (usually add in + * the architecture specific gates): + */ + x86_quirk_intr_init(); + + /* + * External FPU? Set up irq13 if so, for + * original braindamaged IBM FERR coupling. + */ + if (boot_cpu_data.hard_math && !cpu_has_fpu) + setup_irq(FPU_IRQ, &fpu_irq); + + irq_ctx_init(smp_processor_id()); +#endif +} diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c deleted file mode 100644 index f3be5e9..0000000 --- a/arch/x86/kernel/irqinit_32.c +++ /dev/null @@ -1,277 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts: - * (these are usually mapped to vectors 0x30-0x3f) - */ - -/* - * The IO-APIC gives us many more interrupt sources. Most of these - * are unused but an SMP system is supposed to have enough memory ... - * sometimes (mostly wrt. hw bugs) we get corrupted vectors all - * across the spectrum, so we really want to be prepared to get all - * of these. Plus, more powerful systems might have more than 64 - * IO-APIC registers. - * - * (these are usually mapped into the 0x30-0xff vector range) - */ - -#ifdef CONFIG_X86_32 -/* - * Note that on a 486, we don't want to do a SIGFPE on an irq13 - * as the irq is unreliable, and exception 16 works correctly - * (ie as explained in the intel literature). On a 386, you - * can't use exception 16 due to bad IBM design, so we have to - * rely on the less exact irq13. - * - * Careful.. Not only is IRQ13 unreliable, but it is also - * leads to races. IBM designers who came up with it should - * be shot. - */ - -static irqreturn_t math_error_irq(int cpl, void *dev_id) -{ - outb(0, 0xF0); - if (ignore_fpu_irq || !boot_cpu_data.hard_math) - return IRQ_NONE; - math_error((void __user *)get_irq_regs()->ip); - return IRQ_HANDLED; -} - -/* - * New motherboards sometimes make IRQ 13 be a PCI interrupt, - * so allow interrupt sharing. - */ -static struct irqaction fpu_irq = { - .handler = math_error_irq, - .name = "fpu", -}; -#endif - -/* - * IRQ2 is cascade interrupt to second interrupt controller - */ -static struct irqaction irq2 = { - .handler = no_action, - .name = "cascade", -}; - -DEFINE_PER_CPU(vector_irq_t, vector_irq) = { - [0 ... IRQ0_VECTOR - 1] = -1, - [IRQ0_VECTOR] = 0, - [IRQ1_VECTOR] = 1, - [IRQ2_VECTOR] = 2, - [IRQ3_VECTOR] = 3, - [IRQ4_VECTOR] = 4, - [IRQ5_VECTOR] = 5, - [IRQ6_VECTOR] = 6, - [IRQ7_VECTOR] = 7, - [IRQ8_VECTOR] = 8, - [IRQ9_VECTOR] = 9, - [IRQ10_VECTOR] = 10, - [IRQ11_VECTOR] = 11, - [IRQ12_VECTOR] = 12, - [IRQ13_VECTOR] = 13, - [IRQ14_VECTOR] = 14, - [IRQ15_VECTOR] = 15, - [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1 -}; - -int vector_used_by_percpu_irq(unsigned int vector) -{ - int cpu; - - for_each_online_cpu(cpu) { - if (per_cpu(vector_irq, cpu)[vector] != -1) - return 1; - } - - return 0; -} - -static void __init init_ISA_irqs(void) -{ - int i; - -#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC) - init_bsp_APIC(); -#endif - init_8259A(0); - - /* - * 16 old-style INTA-cycle interrupts: - */ - for (i = 0; i < NR_IRQS_LEGACY; i++) { - struct irq_desc *desc = irq_to_desc(i); - - desc->status = IRQ_DISABLED; - desc->action = NULL; - desc->depth = 1; - - set_irq_chip_and_handler_name(i, &i8259A_chip, - handle_level_irq, "XT"); - } -} - -/* Overridden in paravirt.c */ -void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); - -static void __init smp_intr_init(void) -{ -#ifdef CONFIG_SMP -#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC) - /* - * The reschedule interrupt is a CPU-to-CPU reschedule-helper - * IPI, driven by wakeup. - */ - alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); - - /* IPIs for invalidation */ - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7); - - /* IPI for generic function call */ - alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); - - /* IPI for generic single function call */ - alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, - call_function_single_interrupt); - - /* Low priority IPI to cleanup after moving an irq */ - set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); - set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors); -#endif -#endif /* CONFIG_SMP */ -} - -static void __init apic_intr_init(void) -{ - smp_intr_init(); - -#ifdef CONFIG_X86_64 - alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); - alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); -#endif - -#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC) - /* self generated IPI for local APIC timer */ - alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); - - /* generic IPI for platform specific use */ - alloc_intr_gate(GENERIC_INTERRUPT_VECTOR, generic_interrupt); - - /* IPI vectors for APIC spurious and error interrupts */ - alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); - alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); -#endif - -#ifdef CONFIG_X86_32 -#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_MCE_P4THERMAL) - /* thermal monitor LVT interrupt */ - alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); -#endif -#endif -} - -#ifdef CONFIG_X86_32 -/** - * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors - * - * Description: - * Perform any necessary interrupt initialisation prior to setting up - * the "ordinary" interrupt call gates. For legacy reasons, the ISA - * interrupts should be initialised here if the machine emulates a PC - * in any way. - **/ -static void __init x86_quirk_pre_intr_init(void) -{ - if (x86_quirks->arch_pre_intr_init) { - if (x86_quirks->arch_pre_intr_init()) - return; - } - init_ISA_irqs(); -} -#endif - -void __init native_init_IRQ(void) -{ - int i; - -#ifdef CONFIG_X86_32 - /* Execute any quirks before the call gates are initialised: */ - x86_quirk_pre_intr_init(); -#else - init_ISA_irqs(); -#endif - - /* - * Cover the whole vector space, no vector can escape - * us. (some of these will be overridden and become - * 'special' SMP interrupts) - */ - for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { -#ifdef CONFIG_X86_32 - /* SYSCALL_VECTOR was reserved in trap_init. */ - if (i != SYSCALL_VECTOR) - set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]); -#else - /* IA32_SYSCALL_VECTOR was reserved in trap_init. */ - if (i != IA32_SYSCALL_VECTOR) - set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]); -#endif - } - - apic_intr_init(); - - if (!acpi_ioapic) - setup_irq(2, &irq2); - -#ifdef CONFIG_X86_32 - /* - * Call quirks after call gates are initialised (usually add in - * the architecture specific gates): - */ - x86_quirk_intr_init(); - - /* - * External FPU? Set up irq13 if so, for - * original braindamaged IBM FERR coupling. - */ - if (boot_cpu_data.hard_math && !cpu_has_fpu) - setup_irq(FPU_IRQ, &fpu_irq); - - irq_ctx_init(smp_processor_id()); -#endif -} diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c deleted file mode 100644 index f3be5e9..0000000 --- a/arch/x86/kernel/irqinit_64.c +++ /dev/null @@ -1,277 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts: - * (these are usually mapped to vectors 0x30-0x3f) - */ - -/* - * The IO-APIC gives us many more interrupt sources. Most of these - * are unused but an SMP system is supposed to have enough memory ... - * sometimes (mostly wrt. hw bugs) we get corrupted vectors all - * across the spectrum, so we really want to be prepared to get all - * of these. Plus, more powerful systems might have more than 64 - * IO-APIC registers. - * - * (these are usually mapped into the 0x30-0xff vector range) - */ - -#ifdef CONFIG_X86_32 -/* - * Note that on a 486, we don't want to do a SIGFPE on an irq13 - * as the irq is unreliable, and exception 16 works correctly - * (ie as explained in the intel literature). On a 386, you - * can't use exception 16 due to bad IBM design, so we have to - * rely on the less exact irq13. - * - * Careful.. Not only is IRQ13 unreliable, but it is also - * leads to races. IBM designers who came up with it should - * be shot. - */ - -static irqreturn_t math_error_irq(int cpl, void *dev_id) -{ - outb(0, 0xF0); - if (ignore_fpu_irq || !boot_cpu_data.hard_math) - return IRQ_NONE; - math_error((void __user *)get_irq_regs()->ip); - return IRQ_HANDLED; -} - -/* - * New motherboards sometimes make IRQ 13 be a PCI interrupt, - * so allow interrupt sharing. - */ -static struct irqaction fpu_irq = { - .handler = math_error_irq, - .name = "fpu", -}; -#endif - -/* - * IRQ2 is cascade interrupt to second interrupt controller - */ -static struct irqaction irq2 = { - .handler = no_action, - .name = "cascade", -}; - -DEFINE_PER_CPU(vector_irq_t, vector_irq) = { - [0 ... IRQ0_VECTOR - 1] = -1, - [IRQ0_VECTOR] = 0, - [IRQ1_VECTOR] = 1, - [IRQ2_VECTOR] = 2, - [IRQ3_VECTOR] = 3, - [IRQ4_VECTOR] = 4, - [IRQ5_VECTOR] = 5, - [IRQ6_VECTOR] = 6, - [IRQ7_VECTOR] = 7, - [IRQ8_VECTOR] = 8, - [IRQ9_VECTOR] = 9, - [IRQ10_VECTOR] = 10, - [IRQ11_VECTOR] = 11, - [IRQ12_VECTOR] = 12, - [IRQ13_VECTOR] = 13, - [IRQ14_VECTOR] = 14, - [IRQ15_VECTOR] = 15, - [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1 -}; - -int vector_used_by_percpu_irq(unsigned int vector) -{ - int cpu; - - for_each_online_cpu(cpu) { - if (per_cpu(vector_irq, cpu)[vector] != -1) - return 1; - } - - return 0; -} - -static void __init init_ISA_irqs(void) -{ - int i; - -#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC) - init_bsp_APIC(); -#endif - init_8259A(0); - - /* - * 16 old-style INTA-cycle interrupts: - */ - for (i = 0; i < NR_IRQS_LEGACY; i++) { - struct irq_desc *desc = irq_to_desc(i); - - desc->status = IRQ_DISABLED; - desc->action = NULL; - desc->depth = 1; - - set_irq_chip_and_handler_name(i, &i8259A_chip, - handle_level_irq, "XT"); - } -} - -/* Overridden in paravirt.c */ -void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); - -static void __init smp_intr_init(void) -{ -#ifdef CONFIG_SMP -#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC) - /* - * The reschedule interrupt is a CPU-to-CPU reschedule-helper - * IPI, driven by wakeup. - */ - alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); - - /* IPIs for invalidation */ - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7); - - /* IPI for generic function call */ - alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); - - /* IPI for generic single function call */ - alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, - call_function_single_interrupt); - - /* Low priority IPI to cleanup after moving an irq */ - set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); - set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors); -#endif -#endif /* CONFIG_SMP */ -} - -static void __init apic_intr_init(void) -{ - smp_intr_init(); - -#ifdef CONFIG_X86_64 - alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); - alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); -#endif - -#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC) - /* self generated IPI for local APIC timer */ - alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); - - /* generic IPI for platform specific use */ - alloc_intr_gate(GENERIC_INTERRUPT_VECTOR, generic_interrupt); - - /* IPI vectors for APIC spurious and error interrupts */ - alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); - alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); -#endif - -#ifdef CONFIG_X86_32 -#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_MCE_P4THERMAL) - /* thermal monitor LVT interrupt */ - alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); -#endif -#endif -} - -#ifdef CONFIG_X86_32 -/** - * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors - * - * Description: - * Perform any necessary interrupt initialisation prior to setting up - * the "ordinary" interrupt call gates. For legacy reasons, the ISA - * interrupts should be initialised here if the machine emulates a PC - * in any way. - **/ -static void __init x86_quirk_pre_intr_init(void) -{ - if (x86_quirks->arch_pre_intr_init) { - if (x86_quirks->arch_pre_intr_init()) - return; - } - init_ISA_irqs(); -} -#endif - -void __init native_init_IRQ(void) -{ - int i; - -#ifdef CONFIG_X86_32 - /* Execute any quirks before the call gates are initialised: */ - x86_quirk_pre_intr_init(); -#else - init_ISA_irqs(); -#endif - - /* - * Cover the whole vector space, no vector can escape - * us. (some of these will be overridden and become - * 'special' SMP interrupts) - */ - for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { -#ifdef CONFIG_X86_32 - /* SYSCALL_VECTOR was reserved in trap_init. */ - if (i != SYSCALL_VECTOR) - set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]); -#else - /* IA32_SYSCALL_VECTOR was reserved in trap_init. */ - if (i != IA32_SYSCALL_VECTOR) - set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]); -#endif - } - - apic_intr_init(); - - if (!acpi_ioapic) - setup_irq(2, &irq2); - -#ifdef CONFIG_X86_32 - /* - * Call quirks after call gates are initialised (usually add in - * the architecture specific gates): - */ - x86_quirk_intr_init(); - - /* - * External FPU? Set up irq13 if so, for - * original braindamaged IBM FERR coupling. - */ - if (boot_cpu_data.hard_math && !cpu_has_fpu) - setup_irq(FPU_IRQ, &fpu_irq); - - irq_ctx_init(smp_processor_id()); -#endif -} -- cgit v1.1 From ac3048dfd4740becf8d768844cf47ebee363c9f8 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Thu, 9 Apr 2009 11:52:29 +0300 Subject: x86: define IA32_SYSCALL_VECTOR on 32-bit to reduce ifdefs Impact: cleanup We can remove some #ifdefs if we define IA32_SYSCALL_VECTOR on 32-bit. Reviewed-by Cyrill Gorcunov Signed-off-by: Pekka Enberg Signed-off-by: Ingo Molnar --- arch/x86/kernel/irqinit.c | 6 ------ arch/x86/kernel/traps.c | 5 +---- 2 files changed, 1 insertion(+), 10 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index f3be5e9..f2c60a5 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -242,15 +242,9 @@ void __init native_init_IRQ(void) * 'special' SMP interrupts) */ for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { -#ifdef CONFIG_X86_32 - /* SYSCALL_VECTOR was reserved in trap_init. */ - if (i != SYSCALL_VECTOR) - set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]); -#else /* IA32_SYSCALL_VECTOR was reserved in trap_init. */ if (i != IA32_SYSCALL_VECTOR) set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]); -#endif } apic_intr_init(); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index a1d2883..2310700 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -969,11 +969,8 @@ void __init trap_init(void) for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) set_bit(i, used_vectors); -#ifdef CONFIG_X86_64 set_bit(IA32_SYSCALL_VECTOR, used_vectors); -#else - set_bit(SYSCALL_VECTOR, used_vectors); -#endif + /* * Should be a barrier for any external CPU state: */ -- cgit v1.1 From abdb5a5713330e17dfe91ab0d3e29c4744d95162 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Thu, 9 Apr 2009 11:52:30 +0300 Subject: x86: remove some ifdefs from native_init_IRQ() Impact: cleanup Reviewed-by Cyrill Gorcunov Signed-off-by: Pekka Enberg Signed-off-by: Ingo Molnar --- arch/x86/kernel/irqinit.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index f2c60a5..6269772 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -205,7 +205,6 @@ static void __init apic_intr_init(void) #endif } -#ifdef CONFIG_X86_32 /** * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors * @@ -217,24 +216,21 @@ static void __init apic_intr_init(void) **/ static void __init x86_quirk_pre_intr_init(void) { +#ifdef CONFIG_X86_32 if (x86_quirks->arch_pre_intr_init) { if (x86_quirks->arch_pre_intr_init()) return; } +#endif init_ISA_irqs(); } -#endif void __init native_init_IRQ(void) { int i; -#ifdef CONFIG_X86_32 /* Execute any quirks before the call gates are initialised: */ x86_quirk_pre_intr_init(); -#else - init_ISA_irqs(); -#endif /* * Cover the whole vector space, no vector can escape -- cgit v1.1 From 6265ff19ca08df0d96c859ae5e4dc2d9ad07070e Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Thu, 9 Apr 2009 15:47:10 +0200 Subject: x86: cacheinfo: complete L2/L3 Cache and TLB associativity field definitions See "CPUID Specification" (AMD Publication #: 25481, Rev. 2.28, April 2008) Signed-off-by: Andreas Herrmann Cc: Mark Langsdorf LKML-Reference: <20090409134710.GA8026@alberich.amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/intel_cacheinfo.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index d46a849..789efe2 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -200,10 +200,17 @@ union l3_cache { }; static const unsigned short __cpuinitconst assocs[] = { - [1] = 1, [2] = 2, [4] = 4, [6] = 8, - [8] = 16, [0xa] = 32, [0xb] = 48, + [1] = 1, + [2] = 2, + [4] = 4, + [6] = 8, + [8] = 16, + [0xa] = 32, + [0xb] = 48, [0xc] = 64, - [0xf] = 0xffff // ?? + [0xd] = 96, + [0xe] = 128, + [0xf] = 0xffff /* fully associative - no way to show this currently */ }; static const unsigned char __cpuinitconst levels[] = { 1, 1, 2, 3 }; @@ -264,7 +271,8 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, eax->split.type = types[leaf]; eax->split.level = levels[leaf]; if (leaf == 3) - eax->split.num_threads_sharing = current_cpu_data.x86_max_cores - 1; + eax->split.num_threads_sharing = + current_cpu_data.x86_max_cores - 1; else eax->split.num_threads_sharing = 0; eax->split.num_cores_on_die = current_cpu_data.x86_max_cores - 1; -- cgit v1.1 From 47f16ca7631f9c6bad8e6d968cfb1433029b09ec Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 10 Apr 2009 14:58:05 +0200 Subject: x86, irqinit: preempt merge conflicts To make the topic merge life easier for tip:perfcounters/core, include two (inactive in this topic) IRQ vector initializations here. Also fix build bug - missing kprobes.h inclusion. Signed-off-by: Ingo Molnar --- arch/x86/kernel/irqinit.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 6269772..b424c32 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -195,6 +196,13 @@ static void __init apic_intr_init(void) /* IPI vectors for APIC spurious and error interrupts */ alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); + + /* Performance monitoring interrupts: */ +# ifdef CONFIG_PERF_COUNTERS + alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt); + alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt); +# endif + #endif #ifdef CONFIG_X86_32 -- cgit v1.1 From 2de1f33e99cec5fd79542a1d0e26efb9c36a98bb Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Sat, 11 Apr 2009 12:55:26 +0530 Subject: x86: apic/x2apic_cluster.c x86_cpu_to_logical_apicid should be static Impact: reduce kernel size a bit, address sparse warning Addresses the problem pointed out by this sparse warning: arch/x86/kernel/apic/x2apic_cluster.c:13:1: warning: symbol 'per_cpu__x86_cpu_to_logical_apicid' was not declared. Should it be static? Signed-off-by: Jaswinder Singh Rajput Cc: Suresh Siddha LKML-Reference: <1239434726.4418.24.camel@localhost.localdomain> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/x2apic_cluster.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 4a903e2..8e4cbb2 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -10,7 +10,7 @@ #include #include -DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid); +static DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid); static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) { -- cgit v1.1 From 2c1b284e4fa260fd922b9a65c99169e2630c6862 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Sat, 11 Apr 2009 00:03:10 +0530 Subject: x86: clean up declarations and variables Impact: cleanup, no code changed - syscalls.h update declarations due to unifications - irq.c declare smp_generic_interrupt() before it gets used - process.c declare sys_fork() and sys_vfork() before they get used - tsc.c rename tsc_khz shadowed variable - apic/probe_32.c declare apic_default before it gets used - apic/nmi.c prev_nmi_count should be unsigned - apic/io_apic.c declare smp_irq_move_cleanup_interrupt() before it gets used - mm/init.c declare direct_gbpages and free_initrd_mem before they get used Signed-off-by: Jaswinder Singh Rajput Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 1 + arch/x86/kernel/apic/nmi.c | 2 +- arch/x86/kernel/apic/probe_32.c | 1 - arch/x86/kernel/irq.c | 1 + arch/x86/kernel/process.c | 1 + arch/x86/kernel/tsc.c | 8 ++++---- 6 files changed, 8 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 767fe7e..870c92d 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -59,6 +59,7 @@ #include #include #include +#include #include #include diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c index d6bd624..0205631 100644 --- a/arch/x86/kernel/apic/nmi.c +++ b/arch/x86/kernel/apic/nmi.c @@ -104,7 +104,7 @@ static __init void nmi_cpu_busy(void *data) } #endif -static void report_broken_nmi(int cpu, int *prev_nmi_count) +static void report_broken_nmi(int cpu, unsigned int *prev_nmi_count) { printk(KERN_CONT "\n"); diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index 01eda2a..440a8bc 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -160,7 +160,6 @@ extern struct apic apic_summit; extern struct apic apic_bigsmp; extern struct apic apic_es7000; extern struct apic apic_es7000_cluster; -extern struct apic apic_default; struct apic *apic = &apic_default; EXPORT_SYMBOL_GPL(apic); diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 3aaf7b9..2188267 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -12,6 +12,7 @@ #include #include #include +#include atomic_t irq_err_count; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index ca98915..3e21e38 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 7a567eb..a8dc0d0 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -384,13 +384,13 @@ unsigned long native_calibrate_tsc(void) { u64 tsc1, tsc2, delta, ref1, ref2; unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX; - unsigned long flags, latch, ms, fast_calibrate, tsc_khz; + unsigned long flags, latch, ms, fast_calibrate, hv_tsc_khz; int hpet = is_hpet_enabled(), i, loopmin; - tsc_khz = get_hypervisor_tsc_freq(); - if (tsc_khz) { + hv_tsc_khz = get_hypervisor_tsc_freq(); + if (hv_tsc_khz) { printk(KERN_INFO "TSC: Frequency read from the hypervisor\n"); - return tsc_khz; + return hv_tsc_khz; } local_irq_save(flags); -- cgit v1.1 From edea7148a87c099e5d5d4838285cc27e459588b7 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sun, 12 Apr 2009 20:47:39 +0400 Subject: x86: irq.c - tiny cleanup Impact: cleanup, robustization 1) guard ack_bad_irq with printk_ratelimit since there is no guarantee we will not be flooded one day 2) use pr_emerg() helper Signed-off-by: Cyrill Gorcunov LKML-Reference: <20090412165058.277579847@openvz.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/irq.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 3aaf7b9..6603492 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -24,7 +24,8 @@ void (*generic_interrupt_extension)(void) = NULL; */ void ack_bad_irq(unsigned int irq) { - printk(KERN_ERR "unexpected IRQ trap at vector %02x\n", irq); + if (printk_ratelimit()) + pr_err("unexpected IRQ trap at vector %02x\n", irq); #ifdef CONFIG_X86_LOCAL_APIC /* @@ -178,7 +179,7 @@ u64 arch_irq_stat_cpu(unsigned int cpu) sum += irq_stats(cpu)->irq_thermal_count; # ifdef CONFIG_X86_64 sum += irq_stats(cpu)->irq_threshold_count; -#endif +# endif #endif return sum; } @@ -219,8 +220,8 @@ unsigned int __irq_entry do_IRQ(struct pt_regs *regs) #endif if (printk_ratelimit()) - printk(KERN_EMERG "%s: %d.%d No irq handler for vector (irq %d)\n", - __func__, smp_processor_id(), vector, irq); + pr_emerg("%s: %d.%d No irq handler for vector (irq %d)\n", + __func__, smp_processor_id(), vector, irq); } irq_exit(); -- cgit v1.1 From c0eaa4536f08b98fbcfa7fce5b7b0de1bebcb0e1 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sun, 12 Apr 2009 20:47:40 +0400 Subject: x86: apic - introduce imcr_ helpers Impact: cleanup Distinguish port writting magic into helpers with comments. Signed-off-by: Cyrill Gorcunov LKML-Reference: <20090412165058.535921550@openvz.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 098ec84b..c3be10f5 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -98,6 +98,29 @@ early_param("lapic", parse_lapic); /* Local APIC was disabled by the BIOS and enabled by the kernel */ static int enabled_via_apicbase; +/* + * Handle interrupt mode configuration register (IMCR). + * This register controls whether the interrupt signals + * that reach the BSP come from the master PIC or from the + * local APIC. Before entering Symmetric I/O Mode, either + * the BIOS or the operating system must switch out of + * PIC Mode by changing the IMCR. + */ +static inline imcr_pic_to_apic(void) +{ + /* select IMCR register */ + outb(0x70, 0x22); + /* NMI and 8259 INTR go through APIC */ + outb(0x01, 0x23); +} + +static inline imcr_apic_to_pic(void) +{ + /* select IMCR register */ + outb(0x70, 0x22); + /* NMI and 8259 INTR go directly to BSP */ + outb(0x00, 0x23); +} #endif #ifdef CONFIG_X86_64 @@ -1727,8 +1750,7 @@ void __init connect_bsp_APIC(void) */ apic_printk(APIC_VERBOSE, "leaving PIC mode, " "enabling APIC mode.\n"); - outb(0x70, 0x22); - outb(0x01, 0x23); + imcr_pic_to_apic(); } #endif if (apic->enable_apic_mode) @@ -1756,8 +1778,7 @@ void disconnect_bsp_APIC(int virt_wire_setup) */ apic_printk(APIC_VERBOSE, "disabling APIC mode, " "entering PIC mode.\n"); - outb(0x70, 0x22); - outb(0x00, 0x23); + imcr_apic_to_pic(); return; } #endif -- cgit v1.1 From 08306ce61d6848e6fbf74fa4cc693c3fb29e943f Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sun, 12 Apr 2009 20:47:41 +0400 Subject: x86: apic - introduce dummy apic operations Impact: refactor, speed up and robustize code In case if apic was disabled by kernel option or by hardware limits we can use dummy operations in apic->write to simplify the ack_APIC_irq() code. At the lame time the patch fixes the missed EOI in do_IRQ function (which has place if kernel is compiled as X86-32 and interrupt without handler happens where apic was not asked to be disabled via kernel option). Note that native_apic_write_dummy() consists of WARN_ON_ONCE to catch any buggy writes on enabled APICs. Could be removed after some time of testing. Signed-off-by: Cyrill Gorcunov LKML-Reference: <20090412165058.724788431@openvz.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 24 ++++++++++++++++++++++++ arch/x86/kernel/irq.c | 10 ++-------- 2 files changed, 26 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index c3be10f5..9b849d4 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -232,6 +232,24 @@ static int modern_apic(void) return lapic_get_version() >= 0x14; } +/* + * bare function to substitute write operation + * and it's _that_ fast :) + */ +void native_apic_write_dummy(u32 reg, u32 v) +{ + WARN_ON_ONCE((cpu_has_apic || !disable_apic)); +} + +/* + * right after this call apic->write doesn't do anything + * note that there is no restore operation it works one way + */ +void apic_disable(void) +{ + apic->write = native_apic_write_dummy; +} + void native_apic_wait_icr_idle(void) { while (apic_read(APIC_ICR) & APIC_ICR_BUSY) @@ -1582,6 +1600,12 @@ void __init init_apic_mappings(void) */ if (boot_cpu_physical_apicid == -1U) boot_cpu_physical_apicid = read_apic_id(); + + /* lets check if we may to NOP'ify apic operations */ + if (!cpu_has_apic) { + pr_info("APIC: disable apic facility\n"); + apic_disable(); + } } /* diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 6603492..fd57bf3 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -27,7 +27,6 @@ void ack_bad_irq(unsigned int irq) if (printk_ratelimit()) pr_err("unexpected IRQ trap at vector %02x\n", irq); -#ifdef CONFIG_X86_LOCAL_APIC /* * Currently unexpected vectors happen only on SMP and APIC. * We _must_ ack these because every local APIC has only N @@ -37,9 +36,7 @@ void ack_bad_irq(unsigned int irq) * completely. * But only ack when the APIC is enabled -AK */ - if (cpu_has_apic) - ack_APIC_irq(); -#endif + ack_APIC_irq(); } #define irq_stats(x) (&per_cpu(irq_stat, x)) @@ -214,10 +211,7 @@ unsigned int __irq_entry do_IRQ(struct pt_regs *regs) irq = __get_cpu_var(vector_irq)[vector]; if (!handle_irq(irq, regs)) { -#ifdef CONFIG_X86_64 - if (!disable_apic) - ack_APIC_irq(); -#endif + ack_APIC_irq(); if (printk_ratelimit()) pr_emerg("%s: %d.%d No irq handler for vector (irq %d)\n", -- cgit v1.1 From b9b34f24b23ba9e79e07c0980e7fff16af2a67d1 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sun, 12 Apr 2009 20:47:42 +0400 Subject: x86: smp.c - align smp_ops assignments Impact: cleanup It's a bit hard to parse by eyes without them being aligned. Signed-off-by: Cyrill Gorcunov LKML-Reference: <20090412165058.924175574@openvz.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/smp.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 13f33ea..f6db48c 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -193,19 +193,19 @@ void smp_call_function_single_interrupt(struct pt_regs *regs) } struct smp_ops smp_ops = { - .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu, - .smp_prepare_cpus = native_smp_prepare_cpus, - .smp_cpus_done = native_smp_cpus_done, + .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu, + .smp_prepare_cpus = native_smp_prepare_cpus, + .smp_cpus_done = native_smp_cpus_done, - .smp_send_stop = native_smp_send_stop, - .smp_send_reschedule = native_smp_send_reschedule, + .smp_send_stop = native_smp_send_stop, + .smp_send_reschedule = native_smp_send_reschedule, - .cpu_up = native_cpu_up, - .cpu_die = native_cpu_die, - .cpu_disable = native_cpu_disable, - .play_dead = native_play_dead, + .cpu_up = native_cpu_up, + .cpu_die = native_cpu_die, + .cpu_disable = native_cpu_disable, + .play_dead = native_play_dead, - .send_call_func_ipi = native_send_call_func_ipi, + .send_call_func_ipi = native_send_call_func_ipi, .send_call_func_single_ipi = native_send_call_func_single_ipi, }; EXPORT_SYMBOL_GPL(smp_ops); -- cgit v1.1 From 0f3fd87ce43727d6b8573191ce89e874533b1429 Mon Sep 17 00:00:00 2001 From: Luis Henriques Date: Mon, 13 Apr 2009 20:24:50 +0100 Subject: perf_counter: fix alignment in /proc/interrupts Trivial fix on columns alignment in /proc/interrupts file. Signed-off-by: Luis Henriques Cc: Peter Zijlstra LKML-Reference: <20090413192449.GA3920@hades.domain.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/irq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index dccaaa8..849cfab 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -67,7 +67,7 @@ static int show_other_interrupts(struct seq_file *p, int prec) for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs); seq_printf(p, " Performance counter interrupts\n"); - seq_printf(p, "PND: "); + seq_printf(p, "%*s: ", prec, "PND"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs); seq_printf(p, " Performance pending work\n"); -- cgit v1.1 From 5cda395f4a262788d8ed79ac8a26a2b821e5f751 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Mon, 13 Apr 2009 17:39:24 +0200 Subject: x86: fix function definitions after: x86: apic - introduce imcr_ helpers The patch "introduce imcr_ helpers" introduced good comments, but also a few new compile warnings. This fixes the function definitions to have a 'void' return type. Signed-off-by: Alexander van Heukelum Acked-by: Cyrill Gorcunov LKML-Reference: <20090413153924.GA20287@mailshack.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 9b849d4..4b48ff9 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -106,7 +106,7 @@ static int enabled_via_apicbase; * the BIOS or the operating system must switch out of * PIC Mode by changing the IMCR. */ -static inline imcr_pic_to_apic(void) +static inline void imcr_pic_to_apic(void) { /* select IMCR register */ outb(0x70, 0x22); @@ -114,7 +114,7 @@ static inline imcr_pic_to_apic(void) outb(0x01, 0x23); } -static inline imcr_apic_to_pic(void) +static inline void imcr_apic_to_pic(void) { /* select IMCR register */ outb(0x70, 0x22); -- cgit v1.1 From e7d43a74cb07cbc4b8e9b5e4a914816b33fb0719 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Tue, 14 Apr 2009 13:18:28 +0530 Subject: x86: avoid multiple declaration of kstack_depth_to_print Impact: cleanup asm/stacktrace.h is more appropriate so removing other 2 declarations. Signed-off-by: Jaswinder Singh Rajput Cc: Neil Horman LKML-Reference: <1239695308.3033.34.camel@ht.satnam> Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack.h | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/dumpstack.h b/arch/x86/kernel/dumpstack.h index da87590b..81086c2 100644 --- a/arch/x86/kernel/dumpstack.h +++ b/arch/x86/kernel/dumpstack.h @@ -29,7 +29,6 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, unsigned long *sp, unsigned long bp, char *log_lvl); extern unsigned int code_bytes; -extern int kstack_depth_to_print; /* The form of the top of the frame on the stack */ struct stack_frame { -- cgit v1.1 From 7e05575c422d45f393c2d9b5900e97a30bf69bea Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Tue, 14 Apr 2009 12:12:29 +0900 Subject: x86: calgary: remove IOMMU_DEBUG CONFIG_IOMMU_DEBUG has depends on CONFIG_GART_IOMMU: config IOMMU_DEBUG bool "Enable IOMMU debugging" depends on GART_IOMMU && DEBUG_KERNEL depends on X86_64 So it's not useful to have CONFIG_IOMMU_DEBUG in Calgary IOMMU code, which does the extra checking of the bitmap space management. And Calgary uses the iommu helper for the bitmap space management now so it would be better to have the extra checking feature in the iommu helper rather than Calgary code (if necessary). Signed-off-by: FUJITA Tomonori Acked-by: Muli Ben-Yehuda Cc: Joerg Roedel Cc: alexisb@us.ibm.com LKML-Reference: <20090414120827G.fujita.tomonori@lab.ntt.co.jp> Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-calgary_64.c | 54 ++-------------------------------------- 1 file changed, 2 insertions(+), 52 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 755c21e..971a3be 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -186,37 +186,6 @@ static struct cal_chipset_ops calioc2_chip_ops = { static struct calgary_bus_info bus_info[MAX_PHB_BUS_NUM] = { { NULL, 0, 0 }, }; -/* enable this to stress test the chip's TCE cache */ -#ifdef CONFIG_IOMMU_DEBUG -static int debugging = 1; - -static inline unsigned long verify_bit_range(unsigned long* bitmap, - int expected, unsigned long start, unsigned long end) -{ - unsigned long idx = start; - - BUG_ON(start >= end); - - while (idx < end) { - if (!!test_bit(idx, bitmap) != expected) - return idx; - ++idx; - } - - /* all bits have the expected value */ - return ~0UL; -} -#else /* debugging is disabled */ -static int debugging; - -static inline unsigned long verify_bit_range(unsigned long* bitmap, - int expected, unsigned long start, unsigned long end) -{ - return ~0UL; -} - -#endif /* CONFIG_IOMMU_DEBUG */ - static inline int translation_enabled(struct iommu_table *tbl) { /* only PHBs with translation enabled have an IOMMU table */ @@ -228,7 +197,6 @@ static void iommu_range_reserve(struct iommu_table *tbl, { unsigned long index; unsigned long end; - unsigned long badbit; unsigned long flags; index = start_addr >> PAGE_SHIFT; @@ -243,14 +211,6 @@ static void iommu_range_reserve(struct iommu_table *tbl, spin_lock_irqsave(&tbl->it_lock, flags); - badbit = verify_bit_range(tbl->it_map, 0, index, end); - if (badbit != ~0UL) { - if (printk_ratelimit()) - printk(KERN_ERR "Calgary: entry already allocated at " - "0x%lx tbl %p dma 0x%lx npages %u\n", - badbit, tbl, start_addr, npages); - } - iommu_area_reserve(tbl->it_map, index, npages); spin_unlock_irqrestore(&tbl->it_lock, flags); @@ -326,7 +286,6 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, unsigned int npages) { unsigned long entry; - unsigned long badbit; unsigned long badend; unsigned long flags; @@ -346,14 +305,6 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, spin_lock_irqsave(&tbl->it_lock, flags); - badbit = verify_bit_range(tbl->it_map, 1, entry, entry + npages); - if (badbit != ~0UL) { - if (printk_ratelimit()) - printk(KERN_ERR "Calgary: bit is off at 0x%lx " - "tbl %p dma 0x%Lx entry 0x%lx npages %u\n", - badbit, tbl, dma_addr, entry, npages); - } - iommu_area_free(tbl->it_map, entry, npages); spin_unlock_irqrestore(&tbl->it_lock, flags); @@ -1488,9 +1439,8 @@ void __init detect_calgary(void) iommu_detected = 1; calgary_detected = 1; printk(KERN_INFO "PCI-DMA: Calgary IOMMU detected.\n"); - printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d, " - "CONFIG_IOMMU_DEBUG is %s.\n", specified_table_size, - debugging ? "enabled" : "disabled"); + printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d\n", + specified_table_size); /* swiotlb for devices that aren't behind the Calgary. */ if (max_pfn > MAX_DMA32_PFN) -- cgit v1.1 From 19c1a6f5764d787113fa323ffb18be7991208f82 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Tue, 14 Apr 2009 09:43:19 +0900 Subject: x86 gart: reimplement IOMMU_LEAK feature by using DMA_API_DEBUG IOMMU_LEAK, GART's own feature, dumps the used IOMMU entries when IOMMU entries is full, which might be useful to find a bad driver that eats IOMMU entries. DMA_API_DEBUG provides the similar feature, debug_dma_dump_mappings, and it's better than GART's IOMMU_LEAK feature. GART's IOMMU_LEAK feature doesn't say who uses IOMMU entries so it's hard to find a bad driver. This patch reimplements the GART's IOMMU_LEAK feature by using DMA_API_DEBUG. Signed-off-by: FUJITA Tomonori Acked-by: Joerg Roedel Cc: Andrew Morton LKML-Reference: <1239669799-23579-2-git-send-email-fujita.tomonori@lab.ntt.co.jp> Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-gart_64.c | 45 ++++++++----------------------------------- 1 file changed, 8 insertions(+), 37 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index b284b58..1e8920d 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -144,48 +144,21 @@ static void flush_gart(void) } #ifdef CONFIG_IOMMU_LEAK - -#define SET_LEAK(x) \ - do { \ - if (iommu_leak_tab) \ - iommu_leak_tab[x] = __builtin_return_address(0);\ - } while (0) - -#define CLEAR_LEAK(x) \ - do { \ - if (iommu_leak_tab) \ - iommu_leak_tab[x] = NULL; \ - } while (0) - /* Debugging aid for drivers that don't free their IOMMU tables */ -static void **iommu_leak_tab; static int leak_trace; static int iommu_leak_pages = 20; static void dump_leak(void) { - int i; static int dump; - if (dump || !iommu_leak_tab) + if (dump) return; dump = 1; - show_stack(NULL, NULL); - /* Very crude. dump some from the end of the table too */ - printk(KERN_DEBUG "Dumping %d pages from end of IOMMU:\n", - iommu_leak_pages); - for (i = 0; i < iommu_leak_pages; i += 2) { - printk(KERN_DEBUG "%lu: ", iommu_pages-i); - printk_address((unsigned long) iommu_leak_tab[iommu_pages-i], - 0); - printk(KERN_CONT "%c", (i+1)%2 == 0 ? '\n' : ' '); - } - printk(KERN_DEBUG "\n"); + show_stack(NULL, NULL); + debug_dma_dump_mappings(NULL); } -#else -# define SET_LEAK(x) -# define CLEAR_LEAK(x) #endif static void iommu_full(struct device *dev, size_t size, int dir) @@ -248,7 +221,6 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem, for (i = 0; i < npages; i++) { iommu_gatt_base[iommu_page + i] = GPTE_ENCODE(phys_mem); - SET_LEAK(iommu_page + i); phys_mem += PAGE_SIZE; } return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK); @@ -294,7 +266,6 @@ static void gart_unmap_page(struct device *dev, dma_addr_t dma_addr, npages = iommu_num_pages(dma_addr, size, PAGE_SIZE); for (i = 0; i < npages; i++) { iommu_gatt_base[iommu_page + i] = gart_unmapped_entry; - CLEAR_LEAK(iommu_page + i); } free_iommu(iommu_page, npages); } @@ -377,7 +348,6 @@ static int __dma_map_cont(struct device *dev, struct scatterlist *start, pages = iommu_num_pages(s->offset, s->length, PAGE_SIZE); while (pages--) { iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr); - SET_LEAK(iommu_page); addr += PAGE_SIZE; iommu_page++; } @@ -801,11 +771,12 @@ void __init gart_iommu_init(void) #ifdef CONFIG_IOMMU_LEAK if (leak_trace) { - iommu_leak_tab = (void *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, - get_order(iommu_pages*sizeof(void *))); - if (!iommu_leak_tab) + int ret; + + ret = dma_debug_resize_entries(iommu_pages); + if (ret) printk(KERN_DEBUG - "PCI-DMA: Cannot allocate leak trace area\n"); + "PCI-DMA: Cannot trace all the entries\n"); } #endif -- cgit v1.1 From 77857dc07247ed5fa700a197c96ef842d8dbebdf Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 15 Apr 2009 11:57:01 -0700 Subject: x86: use used_vectors in init_IRQ() Impact: fix crash with many devices I found this crash: [ 552.616646] general protection fault: 0403 [#1] SMP [ 552.620013] last sysfs file: /sys/devices/pci0000:00/0000:00:02.0/usb1/1-1/1-1:1.0/host13/target13:0:0/13:0:0:0/block/sr0/size [ 552.620013] CPU 0 [ 552.620013] Modules linked in: [ 552.620013] Pid: 0, comm: swapper Not tainted 2.6.30-rc1-tip-01931-g8fcafd8-dirty #28 Sun Fire X4440 [ 552.620013] RIP: 0010:[] [] default_idle+0x7d/0xda [ 552.620013] RSP: 0018:ffffffff81345e68 EFLAGS: 00010246 [ 552.620013] RAX: 0000000000000000 RBX: ffffffff8133d870 RCX: ffffc20000000000 [ 552.620013] RDX: 00000000001d0620 RSI: ffffffff8023bad8 RDI: ffffffff802a3169 [ 552.620013] RBP: ffffffff81345e98 R08: 0000000000000000 R09: ffffffff812244a0 [ 552.620013] R10: ffffffff81345dc8 R11: 7ebe1b6fa0bcac50 R12: 4ec4ec4ec4ec4ec5 [ 552.620013] R13: ffffffff813a54d0 R14: ffffffff813a7a40 R15: 0000000000000000 [ 552.620013] FS: 00000000006d1880(0000) GS:ffffc20000000000(0000) knlGS:0000000000000000 [ 552.620013] CS: 0010 DS: 0018 ES: 0018 CR0: 000000008005003b [ 552.620013] CR2: 00007fec9d936a50 CR3: 000000007d1a9000 CR4: 00000000000006e0 [ 552.620013] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 552.620013] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 [ 552.620013] Process swapper (pid: 0, threadinfo ffffffff81344000,task ffffffff812244a0) [ 552.620013] Stack: [ 552.620013] 0000000000000000 ffffc20000000000 00000000001d0620 7ebe1b6fa0bcac50 [ 552.620013] ffffffff8133d870 4ec4ec4ec4ec4ec5 ffffffff81345ec8 ffffffff8023bd84 [ 552.620013] 4ec4ec4ec4ec4ec5 ffffffff813a54d0 7ebe1b6fa0bcac50 ffffffff8133d870 [ 552.620013] Call Trace: [ 552.620013] [] c1e_idle+0x109/0x124 [ 552.620013] [] cpu_idle+0xb8/0x101 [ 552.620013] [] rest_init+0x7e/0x94 [ 552.620013] [] start_kernel+0x3dc/0x3fd [ 552.620013] [] x86_64_start_reservations+0xb9/0xd4 [ 552.620013] [] x86_64_start_kernel+0xee/0x109 [ 552.620013] Code: 48 8b 04 25 f8 b4 00 00 83 a0 3c e0 ff ff fb 0f ae f0 65 48 8b 04 25 f8 b4 00 00 f6 80 38 e0 ff ff 08 75 09 e8 71 76 06 00 fb f4 06 e8 68 76 06 00 fb 65 48 8b 04 25 f8 b4 00 00 83 88 3c e0 [ 552.620013] RIP [] default_idle+0x7d/0xda [ 552.620013] RSP [ 552.828646] ---[ end trace 4cbfc5c01382af7f ]--- Joerg Roedel said "The 0403 error code means that there was an external interrupt with vector 0x80. Yinghai, my theory is that the kernel on this machine has no 32bit emulation compiled in, right? In this case the selector points to a zero entry which may cause the #gpf right after the hlt. But I have no idea where the external int 0x80 comes from" it turns out that we could use 0x80 for external device on 64-bit when 32-bit emulation is disabled. But we forgot to set the gate for it. try to set gate for it by checking used_vectors. Also move apic_intr_init() early to avoid setting that gate two times. Signed-off-by: Yinghai Lu Cc: Andrew Morton Cc: Joerg Roedel LKML-Reference: <49E62DFD.6010904@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/irqinit.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index b424c32..2e08b10 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -240,19 +240,19 @@ void __init native_init_IRQ(void) /* Execute any quirks before the call gates are initialised: */ x86_quirk_pre_intr_init(); + apic_intr_init(); + /* * Cover the whole vector space, no vector can escape * us. (some of these will be overridden and become * 'special' SMP interrupts) */ for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { - /* IA32_SYSCALL_VECTOR was reserved in trap_init. */ - if (i != IA32_SYSCALL_VECTOR) + /* IA32_SYSCALL_VECTOR could be used in trap_init already. */ + if (!test_bit(i, used_vectors)) set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]); } - apic_intr_init(); - if (!acpi_ioapic) setup_irq(2, &irq2); -- cgit v1.1 From 9b94b3a19b13e094c10f65f24bc358f6ffe4eacd Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Fri, 17 Apr 2009 12:07:46 +0200 Subject: x86: fixup numa_node information for AMD CPU northbridge functions Currently the numa_node attribute for these PCI devices is 0 (it corresponds to the numa_node for PCI bus 0). This is not a big issue but incorrect. This inconsistency can be fixed by reading the node number from CPU NB function 0. [ Impact: fill in dev->numa_node information, to optimize DMA allocations ] Signed-off-by: Andreas Herrmann Cc: jbarnes@virtuousgeek.org LKML-Reference: <20090417100746.GG16198@alberich.amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/quirks.c | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index e95022e..94ad0c0 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -493,5 +493,42 @@ void force_hpet_resume(void) break; } } +#endif + +#if defined(CONFIG_PCI) && defined(CONFIG_NUMA) +/* Set correct numa_node information for AMD NB functions */ +static void __init quirk_amd_nb_node(struct pci_dev *dev) +{ + struct pci_dev *nb_ht; + unsigned int devfn; + u32 val; + + devfn = PCI_DEVFN(PCI_SLOT(dev->devfn), 0); + nb_ht = pci_get_slot(dev->bus, devfn); + if (!nb_ht) + return; + + pci_read_config_dword(nb_ht, 0x60, &val); + set_dev_node(&dev->dev, val & 7); + pci_dev_put(dev); +} +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_ADDRMAP, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MEMCTL, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_HT, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MAP, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_DRAM, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_LINK, + quirk_amd_nb_node); #endif -- cgit v1.1 From 5d0ae2db6deac4f15dac4f42f23bc56448fc8d4d Mon Sep 17 00:00:00 2001 From: Weidong Han Date: Fri, 17 Apr 2009 16:42:13 +0800 Subject: x86, intr-remap: fix ack for interrupt remapping Shouldn't call ack_apic_edge() in ir_ack_apic_edge(), because ack_apic_edge() does more than just ack: it also does irq migration in the non-interrupt-remapping case. But there is no such need for interrupt-remapping case, as irq migration is done in the process context. Similarly, ir_ack_apic_level() shouldn't call ack_apic_level, and instead should do the local cpu's EOI + directed EOI to the io-apic. ack_x2APIC_irq() is not neccessary, because ack_APIC_irq() will use MSR write for x2apic, and uncached write for non-x2apic. [ Impact: simplify/standardize intr-remap IRQ acking, fix on !x2apic ] Signed-off-by: Suresh Siddha Signed-off-by: Weidong Han Acked-by: David Woodhouse Cc: iommu@lists.linux-foundation.org Cc: allen.m.kay@intel.com Cc: fenghua.yu@intel.com LKML-Reference: <1239957736-6161-3-git-send-email-weidong.han@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 32 +++++--------------------------- 1 file changed, 5 insertions(+), 27 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 8499000..ea22a86 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2552,20 +2552,6 @@ eoi_ioapic_irq(struct irq_desc *desc) spin_unlock_irqrestore(&ioapic_lock, flags); } -#ifdef CONFIG_X86_X2APIC -static void ack_x2apic_level(unsigned int irq) -{ - struct irq_desc *desc = irq_to_desc(irq); - ack_x2APIC_irq(); - eoi_ioapic_irq(desc); -} - -static void ack_x2apic_edge(unsigned int irq) -{ - ack_x2APIC_irq(); -} -#endif - static void ack_apic_edge(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); @@ -2629,9 +2615,6 @@ static void ack_apic_level(unsigned int irq) */ ack_APIC_irq(); - if (irq_remapped(irq)) - eoi_ioapic_irq(desc); - /* Now we can move and renable the irq */ if (unlikely(do_unmask_irq)) { /* Only migrate the irq if the ack has been received. @@ -2680,20 +2663,15 @@ static void ack_apic_level(unsigned int irq) #ifdef CONFIG_INTR_REMAP static void ir_ack_apic_edge(unsigned int irq) { -#ifdef CONFIG_X86_X2APIC - if (x2apic_enabled()) - return ack_x2apic_edge(irq); -#endif - return ack_apic_edge(irq); + ack_APIC_irq(); } static void ir_ack_apic_level(unsigned int irq) { -#ifdef CONFIG_X86_X2APIC - if (x2apic_enabled()) - return ack_x2apic_level(irq); -#endif - return ack_apic_level(irq); + struct irq_desc *desc = irq_to_desc(irq); + + ack_APIC_irq(); + eoi_ioapic_irq(desc); } #endif /* CONFIG_INTR_REMAP */ -- cgit v1.1 From 937582382c71b75b29fbb92615629494e1a05ac0 Mon Sep 17 00:00:00 2001 From: Weidong Han Date: Fri, 17 Apr 2009 16:42:14 +0800 Subject: x86, intr-remap: enable interrupt remapping early Currently, when x2apic is not enabled, interrupt remapping will be enabled in init_dmars(), where it is too late to remap ioapic interrupts, that is, ioapic interrupts are really in compatibility mode, not remappable mode. This patch always enables interrupt remapping before ioapic setup, it guarantees all interrupts will be remapped when interrupt remapping is enabled. Thus it doesn't need to set the compatibility interrupt bit. [ Impact: refactor intr-remap init sequence, enable fuller remap mode ] Signed-off-by: Suresh Siddha Signed-off-by: Weidong Han Acked-by: David Woodhouse Cc: iommu@lists.linux-foundation.org Cc: allen.m.kay@intel.com Cc: fenghua.yu@intel.com LKML-Reference: <1239957736-6161-4-git-send-email-weidong.han@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 76 +++++++++++++++++++++------------------------ 1 file changed, 36 insertions(+), 40 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 83e47fe..0cf1eea 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -141,6 +141,8 @@ static int x2apic_preenabled; static int disable_x2apic; static __init int setup_nox2apic(char *str) { + if (x2apic_enabled()) + panic("Bios already enabled x2apic, can't enforce nox2apic"); disable_x2apic = 1; setup_clear_cpu_cap(X86_FEATURE_X2APIC); return 0; @@ -1345,6 +1347,7 @@ void enable_x2apic(void) wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, 0); } } +#endif /* CONFIG_X86_X2APIC */ void __init enable_IR_x2apic(void) { @@ -1353,32 +1356,21 @@ void __init enable_IR_x2apic(void) unsigned long flags; struct IO_APIC_route_entry **ioapic_entries = NULL; - if (!cpu_has_x2apic) - return; - - if (!x2apic_preenabled && disable_x2apic) { - pr_info("Skipped enabling x2apic and Interrupt-remapping " - "because of nox2apic\n"); - return; + ret = dmar_table_init(); + if (ret) { + pr_debug("dmar_table_init() failed with %d:\n", ret); + goto ir_failed; } - if (x2apic_preenabled && disable_x2apic) - panic("Bios already enabled x2apic, can't enforce nox2apic"); - - if (!x2apic_preenabled && skip_ioapic_setup) { - pr_info("Skipped enabling x2apic and Interrupt-remapping " - "because of skipping io-apic setup\n"); - return; + if (!intr_remapping_supported()) { + pr_debug("intr-remapping not supported\n"); + goto ir_failed; } - ret = dmar_table_init(); - if (ret) { - pr_info("dmar_table_init() failed with %d:\n", ret); - if (x2apic_preenabled) - panic("x2apic enabled by bios. But IR enabling failed"); - else - pr_info("Not enabling x2apic,Intr-remapping\n"); + if (!x2apic_preenabled && skip_ioapic_setup) { + pr_info("Skipped enabling intr-remap because of skipping " + "io-apic setup\n"); return; } @@ -1398,20 +1390,25 @@ void __init enable_IR_x2apic(void) mask_IO_APIC_setup(ioapic_entries); mask_8259A(); - ret = enable_intr_remapping(EIM_32BIT_APIC_ID); - - if (ret && x2apic_preenabled) { - local_irq_restore(flags); - panic("x2apic enabled by bios. But IR enabling failed"); - } +#ifdef CONFIG_X86_X2APIC + if (cpu_has_x2apic) + ret = enable_intr_remapping(EIM_32BIT_APIC_ID); + else +#endif + ret = enable_intr_remapping(EIM_8BIT_APIC_ID); if (ret) goto end_restore; - if (!x2apic) { + pr_info("Enabled Interrupt-remapping\n"); + +#ifdef CONFIG_X86_X2APIC + if (cpu_has_x2apic && !x2apic) { x2apic = 1; enable_x2apic(); + pr_info("Enabled x2apic\n"); } +#endif end_restore: if (ret) @@ -1426,30 +1423,29 @@ end_restore: local_irq_restore(flags); end: - if (!ret) { - if (!x2apic_preenabled) - pr_info("Enabled x2apic and interrupt-remapping\n"); - else - pr_info("Enabled Interrupt-remapping\n"); - } else - pr_err("Failed to enable Interrupt-remapping and x2apic\n"); if (ioapic_entries) free_ioapic_entries(ioapic_entries); + + if (!ret) + return; + +ir_failed: + if (x2apic_preenabled) + panic("x2apic enabled by bios. But IR enabling failed"); + else if (cpu_has_x2apic) + pr_info("Not enabling x2apic,Intr-remapping\n"); #else if (!cpu_has_x2apic) return; if (x2apic_preenabled) panic("x2apic enabled prior OS handover," - " enable CONFIG_INTR_REMAP"); - - pr_info("Enable CONFIG_INTR_REMAP for enabling intr-remapping " - " and x2apic\n"); + " enable CONFIG_X86_X2APIC, CONFIG_INTR_REMAP"); #endif return; } -#endif /* CONFIG_X86_X2APIC */ + #ifdef CONFIG_X86_64 /* -- cgit v1.1 From 9a2755c3569e4db92bd9b1daadeddb4045b0cccd Mon Sep 17 00:00:00 2001 From: Weidong Han Date: Fri, 17 Apr 2009 16:42:16 +0800 Subject: x86, intr-remap: fix x2apic/intr-remap resume Interrupt remapping was decoupled from x2apic. Shouldn't check x2apic before resume interrupt remapping. Otherwise, interrupt remapping won't be resumed when x2apic is not enabled. [ Impact: fix potential intr-remap resume hang on !x2apic ] Signed-off-by: Suresh Siddha Signed-off-by: Weidong Han Acked-by: David Woodhouse Cc: iommu@lists.linux-foundation.org Cc: allen.m.kay@intel.com Cc: fenghua.yu@intel.com LKML-Reference: <1239957736-6161-6-git-send-email-weidong.han@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 0cf1eea..7b41a32 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -2032,7 +2032,7 @@ static int lapic_resume(struct sys_device *dev) return 0; local_irq_save(flags); - if (x2apic) { + if (intr_remapping_enabled) { ioapic_entries = alloc_ioapic_entries(); if (!ioapic_entries) { WARN(1, "Alloc ioapic_entries in lapic resume failed."); @@ -2048,8 +2048,10 @@ static int lapic_resume(struct sys_device *dev) mask_IO_APIC_setup(ioapic_entries); mask_8259A(); - enable_x2apic(); } + + if (x2apic) + enable_x2apic(); #else if (!apic_pm_state.active) return 0; @@ -2097,10 +2099,12 @@ static int lapic_resume(struct sys_device *dev) apic_read(APIC_ESR); #ifdef CONFIG_INTR_REMAP - if (intr_remapping_enabled) - reenable_intr_remapping(EIM_32BIT_APIC_ID); + if (intr_remapping_enabled) { + if (x2apic) + reenable_intr_remapping(EIM_32BIT_APIC_ID); + else + reenable_intr_remapping(EIM_8BIT_APIC_ID); - if (x2apic) { unmask_8259A(); restore_IO_APIC_setup(ioapic_entries); free_ioapic_entries(ioapic_entries); @@ -2109,7 +2113,6 @@ static int lapic_resume(struct sys_device *dev) local_irq_restore(flags); - return 0; } -- cgit v1.1 From cece3155d869a50ba534ed161b5a05e8a29dcad0 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sat, 18 Apr 2009 23:45:28 +0400 Subject: x86: smpboot - wakeup_secondary should be done via __cpuinit section A caller (do_boot_cpu) already has __cpuinit attribute. Since HOTPLUG_CPU depends on SMP && HOTPLUG it doesn't lead to panic at moment. [ Impact: cleanup ] Signed-off-by: Cyrill Gorcunov LKML-Reference: <20090418194528.GD25510@lenovo> Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index bf8ad63..d2e8de9 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -504,7 +504,7 @@ void __inquire_remote_apic(int apicid) * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this * won't ... remember to clear down the APIC, etc later. */ -int __devinit +int __cpuinit wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip) { unsigned long send_status, accept_status = 0; @@ -538,7 +538,7 @@ wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip) return (send_status | accept_status); } -static int __devinit +static int __cpuinit wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) { unsigned long send_status, accept_status = 0; -- cgit v1.1 From 667c5296cc76fefe0abcb79228952b28d9af45e3 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sun, 19 Apr 2009 11:43:11 +0400 Subject: x86: es7000, uv - use __cpuinit for kicking secondary cpus The caller already has __cpuinit attribute. [ Impact: save memory, address section mismatch warning ] Signed-off-by: Cyrill Gorcunov Cc: Yinghai Lu Cc: Pavel Emelyanov LKML-Reference: <20090419074311.GA8670@lenovo> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/es7000_32.c | 2 +- arch/x86/kernel/apic/x2apic_uv_x.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c index 1c11b81..8e07c14 100644 --- a/arch/x86/kernel/apic/es7000_32.c +++ b/arch/x86/kernel/apic/es7000_32.c @@ -145,7 +145,7 @@ es7000_rename_gsi(int ioapic, int gsi) return gsi; } -static int wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip) +static int __cpuinit wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip) { unsigned long vect = 0, psaival = 0; diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index de1a50a..873bf71 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -91,7 +91,7 @@ static void uv_vector_allocation_domain(int cpu, struct cpumask *retmask) cpumask_set_cpu(cpu, retmask); } -static int uv_wakeup_secondary(int phys_apicid, unsigned long start_rip) +static int __cpuinit uv_wakeup_secondary(int phys_apicid, unsigned long start_rip) { #ifdef CONFIG_SMP unsigned long val; -- cgit v1.1 From fc1edaf9e7cc4d4696f83dee495b8f158d01c4eb Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 20 Apr 2009 13:02:27 -0700 Subject: x86: x2apic, IR: Clean up X86_X2APIC and INTR_REMAP config checks Add x2apic_supported() to clean up CONFIG_X86_X2APIC checks. Fix CONFIG_INTR_REMAP checks. [ Impact: cleanup ] Signed-off-by: Suresh Siddha Cc: dwmw2@infradead.org Cc: Suresh Siddha Cc: Weidong Han LKML-Reference: <20090420200450.128993000@linux-os.sc.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 49 ++++++++++------------------------------- arch/x86/kernel/apic/io_apic.c | 2 -- arch/x86/kernel/apic/probe_64.c | 2 +- 3 files changed, 13 insertions(+), 40 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 7b41a32..2b30e52 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -134,8 +134,8 @@ static __init int setup_apicpmtimer(char *s) __setup("apicpmtimer", setup_apicpmtimer); #endif +int x2apic_mode; #ifdef CONFIG_X86_X2APIC -int x2apic; /* x2apic enabled before OS handover */ static int x2apic_preenabled; static int disable_x2apic; @@ -858,7 +858,7 @@ void clear_local_APIC(void) u32 v; /* APIC hasn't been mapped yet */ - if (!x2apic && !apic_phys) + if (!x2apic_mode && !apic_phys) return; maxlvt = lapic_get_maxlvt(); @@ -1330,7 +1330,7 @@ void check_x2apic(void) { if (x2apic_enabled()) { pr_info("x2apic enabled by BIOS, switching to x2apic ops\n"); - x2apic_preenabled = x2apic = 1; + x2apic_preenabled = x2apic_mode = 1; } } @@ -1338,7 +1338,7 @@ void enable_x2apic(void) { int msr, msr2; - if (!x2apic) + if (!x2apic_mode) return; rdmsr(MSR_IA32_APICBASE, msr, msr2); @@ -1390,25 +1390,17 @@ void __init enable_IR_x2apic(void) mask_IO_APIC_setup(ioapic_entries); mask_8259A(); -#ifdef CONFIG_X86_X2APIC - if (cpu_has_x2apic) - ret = enable_intr_remapping(EIM_32BIT_APIC_ID); - else -#endif - ret = enable_intr_remapping(EIM_8BIT_APIC_ID); - + ret = enable_intr_remapping(x2apic_supported()); if (ret) goto end_restore; pr_info("Enabled Interrupt-remapping\n"); -#ifdef CONFIG_X86_X2APIC - if (cpu_has_x2apic && !x2apic) { - x2apic = 1; + if (x2apic_supported() && !x2apic_mode) { + x2apic_mode = 1; enable_x2apic(); pr_info("Enabled x2apic\n"); } -#endif end_restore: if (ret) @@ -1576,7 +1568,7 @@ void __init early_init_lapic_mapping(void) */ void __init init_apic_mappings(void) { - if (x2apic) { + if (x2apic_mode) { boot_cpu_physical_apicid = read_apic_id(); return; } @@ -2010,10 +2002,10 @@ static int lapic_suspend(struct sys_device *dev, pm_message_t state) local_irq_save(flags); disable_local_APIC(); -#ifdef CONFIG_INTR_REMAP + if (intr_remapping_enabled) disable_intr_remapping(); -#endif + local_irq_restore(flags); return 0; } @@ -2023,8 +2015,6 @@ static int lapic_resume(struct sys_device *dev) unsigned int l, h; unsigned long flags; int maxlvt; - -#ifdef CONFIG_INTR_REMAP int ret; struct IO_APIC_route_entry **ioapic_entries = NULL; @@ -2050,17 +2040,8 @@ static int lapic_resume(struct sys_device *dev) mask_8259A(); } - if (x2apic) + if (x2apic_mode) enable_x2apic(); -#else - if (!apic_pm_state.active) - return 0; - - local_irq_save(flags); - if (x2apic) - enable_x2apic(); -#endif - else { /* * Make sure the APICBASE points to the right address @@ -2098,18 +2079,12 @@ static int lapic_resume(struct sys_device *dev) apic_write(APIC_ESR, 0); apic_read(APIC_ESR); -#ifdef CONFIG_INTR_REMAP if (intr_remapping_enabled) { - if (x2apic) - reenable_intr_remapping(EIM_32BIT_APIC_ID); - else - reenable_intr_remapping(EIM_8BIT_APIC_ID); - + reenable_intr_remapping(x2apic_mode); unmask_8259A(); restore_IO_APIC_setup(ioapic_entries); free_ioapic_entries(ioapic_entries); } -#endif local_irq_restore(flags); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index ea22a86..3a45d2e 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -736,7 +736,6 @@ static int __init ioapic_pirq_setup(char *str) __setup("pirq=", ioapic_pirq_setup); #endif /* CONFIG_X86_32 */ -#ifdef CONFIG_INTR_REMAP struct IO_APIC_route_entry **alloc_ioapic_entries(void) { int apic; @@ -857,7 +856,6 @@ void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries) kfree(ioapic_entries); } -#endif /* * Find the IRQ entry number of a certain pin. diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c index 1783652..bc3e880 100644 --- a/arch/x86/kernel/apic/probe_64.c +++ b/arch/x86/kernel/apic/probe_64.c @@ -50,7 +50,7 @@ static struct apic *apic_probe[] __initdata = { void __init default_setup_apic_routing(void) { #ifdef CONFIG_X86_X2APIC - if (x2apic && (apic != &apic_x2apic_phys && + if (x2apic_mode && (apic != &apic_x2apic_phys && #ifdef CONFIG_X86_UV apic != &apic_x2apic_uv_x && #endif -- cgit v1.1 From 25629d810a52176758401184d9b437fbb7f79195 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 20 Apr 2009 13:02:28 -0700 Subject: x86: x2apic, IR: Move eoi_ioapic_irq() into a CONFIG_INTR_REMAP section Address the following complier warning: arch/x86/kernel/apic/io_apic.c:2543: warning: `eoi_ioapic_irq' defined but not used By moving that function (and eoi_ioapic_irq()) into an existing #ifdef CONFIG_INTR_REMAP section of the code. [ Impact: cleanup ] Signed-off-by: Suresh Siddha Cc: dwmw2@infradead.org Cc: Weidong Han LKML-Reference: <20090420200450.271099000@linux-os.sc.intel.com> Signed-off-by: Ingo Molnar Cc: Weidong Han --- arch/x86/kernel/apic/io_apic.c | 66 +++++++++++++++++++++--------------------- 1 file changed, 33 insertions(+), 33 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 3a45d2e..4baa9cb 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2517,39 +2517,6 @@ static void irq_complete_move(struct irq_desc **descp) static inline void irq_complete_move(struct irq_desc **descp) {} #endif -static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg) -{ - int apic, pin; - struct irq_pin_list *entry; - - entry = cfg->irq_2_pin; - for (;;) { - - if (!entry) - break; - - apic = entry->apic; - pin = entry->pin; - io_apic_eoi(apic, pin); - entry = entry->next; - } -} - -static void -eoi_ioapic_irq(struct irq_desc *desc) -{ - struct irq_cfg *cfg; - unsigned long flags; - unsigned int irq; - - irq = desc->irq; - cfg = desc->chip_data; - - spin_lock_irqsave(&ioapic_lock, flags); - __eoi_ioapic_irq(irq, cfg); - spin_unlock_irqrestore(&ioapic_lock, flags); -} - static void ack_apic_edge(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); @@ -2659,6 +2626,39 @@ static void ack_apic_level(unsigned int irq) } #ifdef CONFIG_INTR_REMAP +static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg) +{ + int apic, pin; + struct irq_pin_list *entry; + + entry = cfg->irq_2_pin; + for (;;) { + + if (!entry) + break; + + apic = entry->apic; + pin = entry->pin; + io_apic_eoi(apic, pin); + entry = entry->next; + } +} + +static void +eoi_ioapic_irq(struct irq_desc *desc) +{ + struct irq_cfg *cfg; + unsigned long flags; + unsigned int irq; + + irq = desc->irq; + cfg = desc->chip_data; + + spin_lock_irqsave(&ioapic_lock, flags); + __eoi_ioapic_irq(irq, cfg); + spin_unlock_irqrestore(&ioapic_lock, flags); +} + static void ir_ack_apic_edge(unsigned int irq) { ack_APIC_irq(); -- cgit v1.1 From 39d83a5d684a457046aa2a6dac60f105966e78e9 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 20 Apr 2009 13:02:29 -0700 Subject: x86: x2apic, IR: Clean up panic() with nox2apic boot option Instead of panic() ignore the "nox2apic" boot option when BIOS has already enabled x2apic prior to OS handover. [ Impact: printk warning instead of panic() when BIOS has enabled x2apic already ] Signed-off-by: Suresh Siddha Cc: dwmw2@infradead.org Cc: Weidong Han LKML-Reference: <20090420200450.425091000@linux-os.sc.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 2b30e52..d32f558 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -141,8 +141,12 @@ static int x2apic_preenabled; static int disable_x2apic; static __init int setup_nox2apic(char *str) { - if (x2apic_enabled()) - panic("Bios already enabled x2apic, can't enforce nox2apic"); + if (x2apic_enabled()) { + pr_warning("Bios already enabled x2apic, " + "can't enforce nox2apic"); + return 0; + } + disable_x2apic = 1; setup_clear_cpu_cap(X86_FEATURE_X2APIC); return 0; -- cgit v1.1 From ff166cb57a17124af75714a9c11f448f56f1a4a3 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 20 Apr 2009 13:02:30 -0700 Subject: x86: x2apic, IR: remove reinit_intr_remapped_IO_APIC() When interrupt-remapping is enabled, we are relying on setup_IO_APIC_irqs() to configure remapped entries in the IO-APIC, which comes little bit later after enabling interrupt-remapping. Meanwhile, restoration of old io-apic entries after enabling interrupt-remapping will not make the interrupts through io-apic functional anyway. So remove the unnecessary reinit_intr_remapped_IO_APIC() step. The longer story: When interrupt-remapping is enabled, IO-APIC entries need to be setup in the re-mappable format (pointing to interrupt-remapping table entries setup by the OS). This remapping configuration is happening in the same place where we traditionally configure IO-APIC (i.e., in setup_IO_APIC_irqs()). So when we enable interrupt-remapping successfully, there is no need to restore old io-apic RTE entries before we actually do a complete configuration shortly in setup_IO_APIC_irqs(). Old IO-APIC RTE's may be in traditional format (non re-mappable) or in re-mappable format pointing to interrupt-remapping table entries setup by BIOS. Restoring both of these will not make IO-APIC functional. We have to rely on setup_IO_APIC_irqs() for proper configuration by OS. So I am removing this unnecessary and broken step. [ Impact: remove unnecessary/broken IO-APIC setup step ] Signed-off-by: Suresh Siddha Acked-by: Weidong Han Cc: dwmw2@infradead.org LKML-Reference: <20090420200450.552359000@linux-os.sc.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 2 -- arch/x86/kernel/apic/io_apic.c | 14 -------------- 2 files changed, 16 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index d32f558..1386dbe 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1412,8 +1412,6 @@ end_restore: * IR enabling failed */ restore_IO_APIC_setup(ioapic_entries); - else - reinit_intr_remapped_IO_APIC(x2apic_preenabled, ioapic_entries); unmask_8259A(); local_irq_restore(flags); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 4baa9cb..8aef5f9 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -833,20 +833,6 @@ int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries) return 0; } -void reinit_intr_remapped_IO_APIC(int intr_remapping, - struct IO_APIC_route_entry **ioapic_entries) - -{ - /* - * for now plain restore of previous settings. - * TBD: In the case of OS enabling interrupt-remapping, - * IO-APIC RTE's need to be setup to point to interrupt-remapping - * table entries. for now, do a plain restore, and wait for - * the setup_IO_APIC_irqs() to do proper initialization. - */ - restore_IO_APIC_setup(ioapic_entries); -} - void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries) { int apic; -- cgit v1.1 From 782cc5ae6331d63b4febaa312c9d14493aafa9b8 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 24 Apr 2009 09:43:09 +0200 Subject: x86, ds: fix buffer alignment in debug store selftest The debug store selftest code uses a stack-allocated buffer, which is not necessarily correctly aligned. For tests using a buffer to hold a single entry, the buffer that is passed to ds_request must already be suitably aligned. Pass a suitably aligned portion of the bigger buffer. [ Impact: fix hw-branch-tracer self-test failure ] Signed-off-by: Markus Metzger Cc: markus.t.metzger@gmail.com LKML-Reference: <20090424094309.A30145@sedona.ch.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ds_selftest.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/ds_selftest.c b/arch/x86/kernel/ds_selftest.c index 5f104a0..6bc7c19 100644 --- a/arch/x86/kernel/ds_selftest.c +++ b/arch/x86/kernel/ds_selftest.c @@ -323,13 +323,15 @@ static int ds_selftest_bts_bad_request_task(void *buffer) int ds_selftest_bts(void) { struct ds_selftest_bts_conf conf; - unsigned char buffer[BUFFER_SIZE]; + unsigned char buffer[BUFFER_SIZE], *small_buffer; unsigned long irq; int cpu; printk(KERN_INFO "[ds] bts selftest..."); conf.error = 0; + small_buffer = (unsigned char *)ALIGN((unsigned long)buffer, 8) + 8; + get_online_cpus(); for_each_online_cpu(cpu) { conf.suspend = ds_suspend_bts_wrap; @@ -381,7 +383,7 @@ int ds_selftest_bts(void) conf.suspend = ds_suspend_bts_noirq; conf.resume = ds_resume_bts_noirq; conf.tracer = - ds_request_bts_task(current, buffer, SMALL_BUFFER_SIZE, + ds_request_bts_task(current, small_buffer, SMALL_BUFFER_SIZE, NULL, (size_t)-1, BTS_KERNEL); local_irq_save(irq); ds_selftest_bts_cpu(&conf); -- cgit v1.1 From 1cb81b143fa8f0e4629f10690862e2e52ca792ff Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 24 Apr 2009 09:51:43 +0200 Subject: x86, bts, mm: clean up buffer allocation The current mm interface is asymetric. One function allocates a locked buffer, another function only refunds the memory. Change this to have two functions for accounting and refunding locked memory, respectively; and do the actual buffer allocation in ptrace. [ Impact: refactor BTS buffer allocation code ] Signed-off-by: Markus Metzger Acked-by: Andrew Morton Cc: Peter Zijlstra LKML-Reference: <20090424095143.A30265@sedona.ch.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ptrace.c | 39 ++++++++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 13 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index d5252ae..09ecbde 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -617,17 +617,28 @@ struct bts_context { struct work_struct work; }; -static inline void alloc_bts_buffer(struct bts_context *context, - unsigned int size) +static int alloc_bts_buffer(struct bts_context *context, unsigned int size) { - void *buffer; + void *buffer = NULL; + int err = -ENOMEM; - buffer = alloc_locked_buffer(size); - if (buffer) { - context->buffer = buffer; - context->size = size; - context->mm = get_task_mm(current); - } + err = account_locked_memory(current->mm, current->signal->rlim, size); + if (err < 0) + return err; + + buffer = kzalloc(size, GFP_KERNEL); + if (!buffer) + goto out_refund; + + context->buffer = buffer; + context->size = size; + context->mm = get_task_mm(current); + + return 0; + + out_refund: + refund_locked_memory(current->mm, size); + return err; } static inline void free_bts_buffer(struct bts_context *context) @@ -638,7 +649,7 @@ static inline void free_bts_buffer(struct bts_context *context) kfree(context->buffer); context->buffer = NULL; - refund_locked_buffer_memory(context->mm, context->size); + refund_locked_memory(context->mm, context->size); context->size = 0; mmput(context->mm); @@ -786,13 +797,15 @@ static int ptrace_bts_config(struct task_struct *child, context->tracer = NULL; if ((cfg.flags & PTRACE_BTS_O_ALLOC) && (cfg.size != context->size)) { + int err; + free_bts_buffer(context); if (!cfg.size) return 0; - alloc_bts_buffer(context, cfg.size); - if (!context->buffer) - return -ENOMEM; + err = alloc_bts_buffer(context, cfg.size); + if (err < 0) + return err; } if (cfg.flags & PTRACE_BTS_O_TRACE) -- cgit v1.1 From 0a3ec21fcd311b26ab0f249d62960e127bc20ca8 Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Sun, 26 Apr 2009 23:07:42 +0200 Subject: x86: beautify vmlinux_64.lds.S Beautify vmlinux_64.lds.S: - Use tabs for indent - Located curly braces like in C code - Rearranged a few comments There is no functional changes in this patch The beautification is done to prepare a unification of the _32 and the _64 variants of the linker scripts. [ Impact: cleanup ] Signed-off-by: Sam Ravnborg Cc: Tim Abbott Cc: Linus Torvalds LKML-Reference: <20090426210742.GA3464@uranus.ravnborg.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmlinux_64.lds.S | 448 +++++++++++++++++++++------------------ 1 file changed, 243 insertions(+), 205 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index c874250..6d5a5b0 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -15,69 +15,79 @@ OUTPUT_ARCH(i386:x86-64) ENTRY(phys_startup_64) jiffies_64 = jiffies; PHDRS { - text PT_LOAD FLAGS(5); /* R_E */ - data PT_LOAD FLAGS(7); /* RWE */ - user PT_LOAD FLAGS(7); /* RWE */ + text PT_LOAD FLAGS(5); /* R_E */ + data PT_LOAD FLAGS(7); /* RWE */ + user PT_LOAD FLAGS(7); /* RWE */ data.init PT_LOAD FLAGS(7); /* RWE */ #ifdef CONFIG_SMP percpu PT_LOAD FLAGS(7); /* RWE */ #endif data.init2 PT_LOAD FLAGS(7); /* RWE */ - note PT_NOTE FLAGS(0); /* ___ */ + note PT_NOTE FLAGS(0); /* ___ */ } SECTIONS { - . = __START_KERNEL; - phys_startup_64 = startup_64 - LOAD_OFFSET; - .text : AT(ADDR(.text) - LOAD_OFFSET) { - _text = .; /* Text and read-only data */ - /* First the code that has to be first for bootstrapping */ - *(.text.head) - _stext = .; - /* Then the rest */ - TEXT_TEXT - SCHED_TEXT - LOCK_TEXT - KPROBES_TEXT - IRQENTRY_TEXT - *(.fixup) - *(.gnu.warning) - _etext = .; /* End of text section */ - } :text = 0x9090 - - NOTES :text :note - - . = ALIGN(16); /* Exception table */ - __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { - __start___ex_table = .; - *(__ex_table) - __stop___ex_table = .; - } :text = 0x9090 - - RODATA - - . = ALIGN(PAGE_SIZE); /* Align data segment to page size boundary */ - /* Data */ - .data : AT(ADDR(.data) - LOAD_OFFSET) { - DATA_DATA - CONSTRUCTORS - _edata = .; /* End of data section */ - } :data + . = __START_KERNEL; + phys_startup_64 = startup_64 - LOAD_OFFSET; + + /* Text and read-only data */ + .text : AT(ADDR(.text) - LOAD_OFFSET) { + _text = .; + /* First the code that has to be first for bootstrapping */ + *(.text.head) + _stext = .; + /* Then the rest */ + TEXT_TEXT + SCHED_TEXT + LOCK_TEXT + KPROBES_TEXT + IRQENTRY_TEXT + *(.fixup) + *(.gnu.warning) + /* End of text section */ + _etext = .; + } :text = 0x9090 + + NOTES :text :note + + /* Exception table */ + . = ALIGN(16); + __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { + __start___ex_table = .; + *(__ex_table) + __stop___ex_table = .; + } :text = 0x9090 + RODATA - .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) { + /* Align data segment to page size boundary */ . = ALIGN(PAGE_SIZE); - . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); - *(.data.cacheline_aligned) - } - . = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES); - .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) { - *(.data.read_mostly) - } + /* Data */ + .data : AT(ADDR(.data) - LOAD_OFFSET) { + DATA_DATA + CONSTRUCTORS + /* End of data section */ + _edata = .; + } :data + + + .data.cacheline_aligned : + AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) { + . = ALIGN(PAGE_SIZE); + . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); + *(.data.cacheline_aligned) + } + + . = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES); + .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) { + *(.data.read_mostly) + } #define VSYSCALL_ADDR (-10*1024*1024) -#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.read_mostly) + SIZEOF(.data.read_mostly) + 4095) & ~(4095)) -#define VSYSCALL_VIRT_ADDR ((ADDR(.data.read_mostly) + SIZEOF(.data.read_mostly) + 4095) & ~(4095)) +#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.read_mostly) + \ + SIZEOF(.data.read_mostly) + 4095) & ~(4095)) +#define VSYSCALL_VIRT_ADDR ((ADDR(.data.read_mostly) + \ + SIZEOF(.data.read_mostly) + 4095) & ~(4095)) #define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR) #define VLOAD(x) (ADDR(x) - VLOAD_OFFSET) @@ -85,37 +95,53 @@ SECTIONS #define VVIRT_OFFSET (VSYSCALL_ADDR - VSYSCALL_VIRT_ADDR) #define VVIRT(x) (ADDR(x) - VVIRT_OFFSET) - . = VSYSCALL_ADDR; - .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) { *(.vsyscall_0) } :user - __vsyscall_0 = VSYSCALL_VIRT_ADDR; + . = VSYSCALL_ADDR; + .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) { + *(.vsyscall_0) + } :user + + __vsyscall_0 = VSYSCALL_VIRT_ADDR; + + . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); + .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) { + *(.vsyscall_fn) + } + + . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); + .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) { + *(.vsyscall_gtod_data) + } - . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); - .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) { *(.vsyscall_fn) } - . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); - .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) - { *(.vsyscall_gtod_data) } - vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data); - .vsyscall_clock : AT(VLOAD(.vsyscall_clock)) - { *(.vsyscall_clock) } - vsyscall_clock = VVIRT(.vsyscall_clock); + vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data); + .vsyscall_clock : AT(VLOAD(.vsyscall_clock)) { + *(.vsyscall_clock) + } + vsyscall_clock = VVIRT(.vsyscall_clock); - .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) - { *(.vsyscall_1) } - .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) - { *(.vsyscall_2) } + .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) { + *(.vsyscall_1) + } + .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) { + *(.vsyscall_2) + } - .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) { *(.vgetcpu_mode) } - vgetcpu_mode = VVIRT(.vgetcpu_mode); + .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) { + *(.vgetcpu_mode) + } + vgetcpu_mode = VVIRT(.vgetcpu_mode); - . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); - .jiffies : AT(VLOAD(.jiffies)) { *(.jiffies) } - jiffies = VVIRT(.jiffies); + . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); + .jiffies : AT(VLOAD(.jiffies)) { + *(.jiffies) + } + jiffies = VVIRT(.jiffies); - .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) - { *(.vsyscall_3) } + .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) { + *(.vsyscall_3) + } - . = VSYSCALL_VIRT_ADDR + PAGE_SIZE; + . = VSYSCALL_VIRT_ADDR + PAGE_SIZE; #undef VSYSCALL_ADDR #undef VSYSCALL_PHYS_ADDR @@ -125,156 +151,168 @@ SECTIONS #undef VVIRT_OFFSET #undef VVIRT - .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { - . = ALIGN(THREAD_SIZE); /* init_task */ - *(.data.init_task) - }:data.init + /* init_task */ + .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { + . = ALIGN(THREAD_SIZE); + *(.data.init_task) + } :data.init - .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { - . = ALIGN(PAGE_SIZE); - *(.data.page_aligned) - } + .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { + . = ALIGN(PAGE_SIZE); + *(.data.page_aligned) + } - .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { - /* might get freed after init */ - . = ALIGN(PAGE_SIZE); - __smp_alt_begin = .; - __smp_locks = .; - *(.smp_locks) - __smp_locks_end = .; + .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { + /* might get freed after init */ + . = ALIGN(PAGE_SIZE); + __smp_alt_begin = .; + __smp_locks = .; + *(.smp_locks) + __smp_locks_end = .; + . = ALIGN(PAGE_SIZE); + __smp_alt_end = .; + } + + /* Init code and data */ . = ALIGN(PAGE_SIZE); - __smp_alt_end = .; - } - - . = ALIGN(PAGE_SIZE); /* Init code and data */ - __init_begin = .; /* paired with __init_end */ - .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { - _sinittext = .; - INIT_TEXT - _einittext = .; - } - .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { - __initdata_begin = .; - INIT_DATA - __initdata_end = .; - } - - .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { - . = ALIGN(16); - __setup_start = .; - *(.init.setup) - __setup_end = .; - } - .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) { - __initcall_start = .; - INITCALLS - __initcall_end = .; - } - .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) { - __con_initcall_start = .; - *(.con_initcall.init) - __con_initcall_end = .; - } - .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { - __x86_cpu_dev_start = .; - *(.x86_cpu_dev.init) - __x86_cpu_dev_end = .; - } - SECURITY_INIT - - . = ALIGN(8); - .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { - __parainstructions = .; - *(.parainstructions) - __parainstructions_end = .; - } - - .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { + __init_begin = .; /* paired with __init_end */ + .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { + _sinittext = .; + INIT_TEXT + _einittext = .; + } + + .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { + __initdata_begin = .; + INIT_DATA + __initdata_end = .; + } + + .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { + . = ALIGN(16); + __setup_start = .; + *(.init.setup) + __setup_end = .; + } + + .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) { + __initcall_start = .; + INITCALLS + __initcall_end = .; + } + + .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) { + __con_initcall_start = .; + *(.con_initcall.init) + __con_initcall_end = .; + } + + .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { + __x86_cpu_dev_start = .; + *(.x86_cpu_dev.init) + __x86_cpu_dev_end = .; + } + + SECURITY_INIT + . = ALIGN(8); - __alt_instructions = .; - *(.altinstructions) - __alt_instructions_end = .; - } - .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) { - *(.altinstr_replacement) - } - /* .exit.text is discard at runtime, not link time, to deal with references - from .altinstructions and .eh_frame */ - .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { - EXIT_TEXT - } - .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { - EXIT_DATA - } + .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { + __parainstructions = .; + *(.parainstructions) + __parainstructions_end = .; + } + + .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { + . = ALIGN(8); + __alt_instructions = .; + *(.altinstructions) + __alt_instructions_end = .; + } + + .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) { + *(.altinstr_replacement) + } + + /* + * .exit.text is discard at runtime, not link time, to deal with + * references from .altinstructions and .eh_frame + */ + .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { + EXIT_TEXT + } + + .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { + EXIT_DATA + } #ifdef CONFIG_BLK_DEV_INITRD - . = ALIGN(PAGE_SIZE); - .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { - __initramfs_start = .; - *(.init.ramfs) - __initramfs_end = .; - } + . = ALIGN(PAGE_SIZE); + .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { + __initramfs_start = .; + *(.init.ramfs) + __initramfs_end = .; + } #endif #ifdef CONFIG_SMP - /* - * percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the - * output PHDR, so the next output section - __data_nosave - should - * start another section data.init2. Also, pda should be at the head of - * percpu area. Preallocate it and define the percpu offset symbol - * so that it can be accessed as a percpu variable. - */ - . = ALIGN(PAGE_SIZE); - PERCPU_VADDR(0, :percpu) + /* + * percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the + * output PHDR, so the next output section - __data_nosave - should + * start another section data.init2. Also, pda should be at the head of + * percpu area. Preallocate it and define the percpu offset symbol + * so that it can be accessed as a percpu variable. + */ + . = ALIGN(PAGE_SIZE); + PERCPU_VADDR(0, :percpu) #else - PERCPU(PAGE_SIZE) + PERCPU(PAGE_SIZE) #endif - . = ALIGN(PAGE_SIZE); - __init_end = .; - - .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { - . = ALIGN(PAGE_SIZE); - __nosave_begin = .; - *(.data.nosave) - . = ALIGN(PAGE_SIZE); - __nosave_end = .; - } :data.init2 /* use another section data.init2, see PERCPU_VADDR() above */ - - .bss : AT(ADDR(.bss) - LOAD_OFFSET) { . = ALIGN(PAGE_SIZE); - __bss_start = .; /* BSS */ - *(.bss.page_aligned) - *(.bss) - __bss_stop = .; - } + __init_end = .; + + .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { + . = ALIGN(PAGE_SIZE); + __nosave_begin = .; + *(.data.nosave) + . = ALIGN(PAGE_SIZE); + __nosave_end = .; + } :data.init2 + /* use another section data.init2, see PERCPU_VADDR() above */ + + .bss : AT(ADDR(.bss) - LOAD_OFFSET) { + . = ALIGN(PAGE_SIZE); + __bss_start = .; /* BSS */ + *(.bss.page_aligned) + *(.bss) + __bss_stop = .; + } - .brk : AT(ADDR(.brk) - LOAD_OFFSET) { - . = ALIGN(PAGE_SIZE); - __brk_base = . ; - . += 64 * 1024 ; /* 64k alignment slop space */ - *(.brk_reservation) /* areas brk users have reserved */ - __brk_limit = . ; - } - - _end = . ; - - /* Sections to be discarded */ - /DISCARD/ : { - *(.exitcall.exit) - *(.eh_frame) - *(.discard) + .brk : AT(ADDR(.brk) - LOAD_OFFSET) { + . = ALIGN(PAGE_SIZE); + __brk_base = .; + . += 64 * 1024; /* 64k alignment slop space */ + *(.brk_reservation) /* areas brk users have reserved */ + __brk_limit = .; } - STABS_DEBUG + _end = . ; - DWARF_DEBUG + /* Sections to be discarded */ + /DISCARD/ : { + *(.exitcall.exit) + *(.eh_frame) + *(.discard) + } + + STABS_DEBUG + DWARF_DEBUG } - /* - * Per-cpu symbols which need to be offset from __per_cpu_load - * for the boot processor. - */ +/* + * Per-cpu symbols which need to be offset from __per_cpu_load + * for the boot processor. + */ #define INIT_PER_CPU(x) init_per_cpu__##x = per_cpu__##x + __per_cpu_load INIT_PER_CPU(gdt_page); INIT_PER_CPU(irq_stack_union); -- cgit v1.1 From b2ba83ff4f4405cebc10884121ee71338a1a6c94 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 26 Apr 2009 23:38:08 -0700 Subject: x86: apic: Remove duplicated macros XAPIC_DEST_* is dupliicated to the one in apicdef.h [ Impact: cleanup ] Signed-off-by: Yinghai Lu LKML-Reference: <49F552D0.5050505@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/summit_32.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c index 9cfe1f4..344eee4 100644 --- a/arch/x86/kernel/apic/summit_32.c +++ b/arch/x86/kernel/apic/summit_32.c @@ -173,13 +173,6 @@ static inline int is_WPEG(struct rio_detail *rio){ rio->type == LookOutAWPEG || rio->type == LookOutBWPEG); } - -/* In clustered mode, the high nibble of APIC ID is a cluster number. - * The low nibble is a 4-bit bitmap. */ -#define XAPIC_DEST_CPUS_SHIFT 4 -#define XAPIC_DEST_CPUS_MASK ((1u << XAPIC_DEST_CPUS_SHIFT) - 1) -#define XAPIC_DEST_CLUSTER_MASK (XAPIC_DEST_CPUS_MASK << XAPIC_DEST_CPUS_SHIFT) - #define SUMMIT_APIC_DFR_VALUE (APIC_DFR_CLUSTER) static const struct cpumask *summit_target_cpus(void) -- cgit v1.1 From e0e42142bab96404de535cceb85d6533d5ad7942 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 26 Apr 2009 23:39:38 -0700 Subject: x86: Use dmi check in apic_is_clustered() on 64-bit to mark the TSC unstable We will have systems with 2 and more sockets 8cores/2thread, but we treat them as multi chassis - while they could have a stable TSC domain. Use DMI check instead. [ Impact: do not turn possibly stable TSCs off incorrectly ] Signed-off-by: Yinghai Lu Cc: Ravikiran Thirumalai LKML-Reference: <49F5532A.5000802@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 86 +++++++++++++++++++++++++++++++-------------- 1 file changed, 59 insertions(+), 27 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 1386dbe..28f747d 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -2138,31 +2138,14 @@ static void apic_pm_activate(void) { } #endif /* CONFIG_PM */ #ifdef CONFIG_X86_64 -/* - * apic_is_clustered_box() -- Check if we can expect good TSC - * - * Thus far, the major user of this is IBM's Summit2 series: - * - * Clustered boxes may have unsynced TSC problems if they are - * multi-chassis. Use available data to take a good guess. - * If in doubt, go HPET. - */ -__cpuinit int apic_is_clustered_box(void) + +static int __cpuinit apic_cluster_num(void) { int i, clusters, zeros; unsigned id; u16 *bios_cpu_apicid; DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS); - /* - * there is not this kind of box with AMD CPU yet. - * Some AMD box with quadcore cpu and 8 sockets apicid - * will be [4, 0x23] or [8, 0x27] could be thought to - * vsmp box still need checking... - */ - if ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && !is_vsmp_box()) - return 0; - bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid); bitmap_zero(clustermap, NUM_APIC_CLUSTERS); @@ -2198,18 +2181,67 @@ __cpuinit int apic_is_clustered_box(void) ++zeros; } - /* ScaleMP vSMPowered boxes have one cluster per board and TSCs are - * not guaranteed to be synced between boards - */ - if (is_vsmp_box() && clusters > 1) + return clusters; +} + +static int __cpuinitdata multi_checked; +static int __cpuinitdata multi; + +static int __cpuinit set_multi(const struct dmi_system_id *d) +{ + if (multi) + return 0; + printk(KERN_INFO "APIC: %s detected, Multi Chassis\n", d->ident); + multi = 1; + return 0; +} + +static const __cpuinitconst struct dmi_system_id multi_dmi_table[] = { + { + .callback = set_multi, + .ident = "IBM System Summit2", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "IBM"), + DMI_MATCH(DMI_PRODUCT_NAME, "Summit2"), + }, + }, + {} +}; + +static void __cpuinit dmi_check_multi(void) +{ + if (multi_checked) + return; + + dmi_check_system(multi_dmi_table); + multi_checked = 1; +} + +/* + * apic_is_clustered_box() -- Check if we can expect good TSC + * + * Thus far, the major user of this is IBM's Summit2 series: + * Clustered boxes may have unsynced TSC problems if they are + * multi-chassis. + * Use DMI to check them + */ +__cpuinit int apic_is_clustered_box(void) +{ + dmi_check_multi(); + if (multi) return 1; + if (!is_vsmp_box()) + return 0; + /* - * If clusters > 2, then should be multi-chassis. - * May have to revisit this when multi-core + hyperthreaded CPUs come - * out, but AFAIK this will work even for them. + * ScaleMP vSMPowered boxes have one cluster per board and TSCs are + * not guaranteed to be synced between boards */ - return (clusters > 2); + if (apic_cluster_num() > 1) + return 1; + + return 0; } #endif -- cgit v1.1 From fcef5911c7ea89b80d5bfc727f402f37c9eefd57 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 27 Apr 2009 17:58:23 -0700 Subject: x86/irq: remove leftover code from NUMA_MIGRATE_IRQ_DESC The original feature of migrating irq_desc dynamic was too fragile and was causing problems: it caused crashes on systems with lots of cards with MSI-X when user-space irq-balancer was enabled. We now have new patches that create irq_desc according to device numa node. This patch removes the leftover bits of the dynamic balancer. [ Impact: remove dead code ] Signed-off-by: Yinghai Lu Cc: Andrew Morton Cc: Suresh Siddha Cc: "Eric W. Biederman" Cc: Rusty Russell LKML-Reference: <49F654AF.8000808@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 56 +++--------------------------------------- 1 file changed, 4 insertions(+), 52 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 30da617..9fbf0f7 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -148,9 +148,6 @@ struct irq_cfg { unsigned move_cleanup_count; u8 vector; u8 move_in_progress : 1; -#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC - u8 move_desc_pending : 1; -#endif }; /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ @@ -254,8 +251,7 @@ int arch_init_chip_data(struct irq_desc *desc, int cpu) return 0; } -#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC - +/* for move_irq_desc */ static void init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int cpu) { @@ -356,19 +352,7 @@ void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc) old_desc->chip_data = NULL; } } - -static void -set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask) -{ - struct irq_cfg *cfg = desc->chip_data; - - if (!cfg->move_in_progress) { - /* it means that domain is not changed */ - if (!cpumask_intersects(desc->affinity, mask)) - cfg->move_desc_pending = 1; - } -} -#endif +/* end for move_irq_desc */ #else static struct irq_cfg *irq_cfg(unsigned int irq) @@ -378,13 +362,6 @@ static struct irq_cfg *irq_cfg(unsigned int irq) #endif -#ifndef CONFIG_NUMA_MIGRATE_IRQ_DESC -static inline void -set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask) -{ -} -#endif - struct io_apic { unsigned int index; unsigned int unused[3]; @@ -592,9 +569,6 @@ set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask) if (assign_irq_vector(irq, cfg, mask)) return BAD_APICID; - /* check that before desc->addinity get updated */ - set_extra_move_desc(desc, mask); - cpumask_copy(desc->affinity, mask); return apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain); @@ -2393,8 +2367,6 @@ migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask) if (assign_irq_vector(irq, cfg, mask)) return; - set_extra_move_desc(desc, mask); - dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask); irte.vector = cfg->vector; @@ -2491,34 +2463,14 @@ static void irq_complete_move(struct irq_desc **descp) struct irq_cfg *cfg = desc->chip_data; unsigned vector, me; - if (likely(!cfg->move_in_progress)) { -#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC - if (likely(!cfg->move_desc_pending)) - return; - - /* domain has not changed, but affinity did */ - me = smp_processor_id(); - if (cpumask_test_cpu(me, desc->affinity)) { - *descp = desc = move_irq_desc(desc, me); - /* get the new one */ - cfg = desc->chip_data; - cfg->move_desc_pending = 0; - } -#endif + if (likely(!cfg->move_in_progress)) return; - } vector = ~get_irq_regs()->orig_ax; me = smp_processor_id(); - if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) { -#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC - *descp = desc = move_irq_desc(desc, me); - /* get the new one */ - cfg = desc->chip_data; -#endif + if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) send_cleanup_vector(cfg); - } } #else static inline void irq_complete_move(struct irq_desc **descp) {} -- cgit v1.1 From d5dedd4507d307eb3f35f21b6e16f336fdc0d82a Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 27 Apr 2009 17:59:21 -0700 Subject: irq: change ->set_affinity() to return status according to Ingo, change set_affinity() in irq_chip should return int, because that way we can handle failure cases in a much cleaner way, in the genirq layer. v2: fix two typos [ Impact: extend API ] Signed-off-by: Yinghai Lu Cc: Andrew Morton Cc: Suresh Siddha Cc: "Eric W. Biederman" Cc: Rusty Russell Cc: linux-arch@vger.kernel.org LKML-Reference: <49F654E9.4070809@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 64 +++++++++++++++++++++++++++--------------- 1 file changed, 41 insertions(+), 23 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 9fbf0f7..5c7630b 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -574,13 +574,14 @@ set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask) return apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain); } -static void +static int set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask) { struct irq_cfg *cfg; unsigned long flags; unsigned int dest; unsigned int irq; + int ret = -1; irq = desc->irq; cfg = desc->chip_data; @@ -591,18 +592,21 @@ set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask) /* Only the high 8 bits are valid. */ dest = SET_APIC_LOGICAL_ID(dest); __target_IO_APIC_irq(irq, dest, cfg); + ret = 0; } spin_unlock_irqrestore(&ioapic_lock, flags); + + return ret; } -static void +static int set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask) { struct irq_desc *desc; desc = irq_to_desc(irq); - set_ioapic_affinity_irq_desc(desc, mask); + return set_ioapic_affinity_irq_desc(desc, mask); } #endif /* CONFIG_SMP */ @@ -2348,24 +2352,25 @@ static int ioapic_retrigger_irq(unsigned int irq) * Real vector that is used for interrupting cpu will be coming from * the interrupt-remapping table entry. */ -static void +static int migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask) { struct irq_cfg *cfg; struct irte irte; unsigned int dest; unsigned int irq; + int ret = -1; if (!cpumask_intersects(mask, cpu_online_mask)) - return; + return ret; irq = desc->irq; if (get_irte(irq, &irte)) - return; + return ret; cfg = desc->chip_data; if (assign_irq_vector(irq, cfg, mask)) - return; + return ret; dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask); @@ -2381,27 +2386,30 @@ migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask) send_cleanup_vector(cfg); cpumask_copy(desc->affinity, mask); + + return 0; } /* * Migrates the IRQ destination in the process context. */ -static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, +static int set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask) { - migrate_ioapic_irq_desc(desc, mask); + return migrate_ioapic_irq_desc(desc, mask); } -static void set_ir_ioapic_affinity_irq(unsigned int irq, +static int set_ir_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask) { struct irq_desc *desc = irq_to_desc(irq); - set_ir_ioapic_affinity_irq_desc(desc, mask); + return set_ir_ioapic_affinity_irq_desc(desc, mask); } #else -static inline void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, +static inline int set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask) { + return 0; } #endif @@ -3318,7 +3326,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms } #ifdef CONFIG_SMP -static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) +static int set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) { struct irq_desc *desc = irq_to_desc(irq); struct irq_cfg *cfg; @@ -3327,7 +3335,7 @@ static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) dest = set_desc_affinity(desc, mask); if (dest == BAD_APICID) - return; + return -1; cfg = desc->chip_data; @@ -3339,13 +3347,15 @@ static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) msg.address_lo |= MSI_ADDR_DEST_ID(dest); write_msi_msg_desc(desc, &msg); + + return 0; } #ifdef CONFIG_INTR_REMAP /* * Migrate the MSI irq to another cpumask. This migration is * done in the process context using interrupt-remapping hardware. */ -static void +static int ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) { struct irq_desc *desc = irq_to_desc(irq); @@ -3354,11 +3364,11 @@ ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) struct irte irte; if (get_irte(irq, &irte)) - return; + return -1; dest = set_desc_affinity(desc, mask); if (dest == BAD_APICID) - return; + return -1; irte.vector = cfg->vector; irte.dest_id = IRTE_DEST(dest); @@ -3375,6 +3385,8 @@ ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) */ if (cfg->move_in_progress) send_cleanup_vector(cfg); + + return 0; } #endif @@ -3528,7 +3540,7 @@ void arch_teardown_msi_irq(unsigned int irq) #if defined (CONFIG_DMAR) || defined (CONFIG_INTR_REMAP) #ifdef CONFIG_SMP -static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask) +static int dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask) { struct irq_desc *desc = irq_to_desc(irq); struct irq_cfg *cfg; @@ -3537,7 +3549,7 @@ static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask) dest = set_desc_affinity(desc, mask); if (dest == BAD_APICID) - return; + return -1; cfg = desc->chip_data; @@ -3549,6 +3561,8 @@ static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask) msg.address_lo |= MSI_ADDR_DEST_ID(dest); dmar_msi_write(irq, &msg); + + return 0; } #endif /* CONFIG_SMP */ @@ -3582,7 +3596,7 @@ int arch_setup_dmar_msi(unsigned int irq) #ifdef CONFIG_HPET_TIMER #ifdef CONFIG_SMP -static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask) +static int hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask) { struct irq_desc *desc = irq_to_desc(irq); struct irq_cfg *cfg; @@ -3591,7 +3605,7 @@ static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask) dest = set_desc_affinity(desc, mask); if (dest == BAD_APICID) - return; + return -1; cfg = desc->chip_data; @@ -3603,6 +3617,8 @@ static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask) msg.address_lo |= MSI_ADDR_DEST_ID(dest); hpet_msi_write(irq, &msg); + + return 0; } #endif /* CONFIG_SMP */ @@ -3659,7 +3675,7 @@ static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector) write_ht_irq_msg(irq, &msg); } -static void set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask) +static int set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask) { struct irq_desc *desc = irq_to_desc(irq); struct irq_cfg *cfg; @@ -3667,11 +3683,13 @@ static void set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask) dest = set_desc_affinity(desc, mask); if (dest == BAD_APICID) - return; + return -1; cfg = desc->chip_data; target_ht_irq(irq, dest, cfg->vector); + + return 0; } #endif -- cgit v1.1 From 85ac16d033370caf6f48d743c8dc8103700f5cc5 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 27 Apr 2009 18:00:38 -0700 Subject: x86/irq: change irq_desc_alloc() to take node instead of cpu This simplifies the node awareness of the code. All our allocators only deal with a NUMA node ID locality not with CPU ids anyway - so there's no need to maintain (and transform) a CPU id all across the IRq layer. v2: keep move_irq_desc related [ Impact: cleanup, prepare IRQ code to be NUMA-aware ] Signed-off-by: Yinghai Lu Cc: Andrew Morton Cc: Suresh Siddha Cc: "Eric W. Biederman" Cc: Rusty Russell Cc: Jeremy Fitzhardinge LKML-Reference: <49F65536.2020300@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 58 +++++++++++++++++++----------------------- 1 file changed, 26 insertions(+), 32 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 5c7630b..560b887 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -129,12 +129,9 @@ struct irq_pin_list { struct irq_pin_list *next; }; -static struct irq_pin_list *get_one_free_irq_2_pin(int cpu) +static struct irq_pin_list *get_one_free_irq_2_pin(int node) { struct irq_pin_list *pin; - int node; - - node = cpu_to_node(cpu); pin = kzalloc_node(sizeof(*pin), GFP_ATOMIC, node); @@ -209,12 +206,9 @@ static struct irq_cfg *irq_cfg(unsigned int irq) return cfg; } -static struct irq_cfg *get_one_free_irq_cfg(int cpu) +static struct irq_cfg *get_one_free_irq_cfg(int node) { struct irq_cfg *cfg; - int node; - - node = cpu_to_node(cpu); cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node); if (cfg) { @@ -235,13 +229,13 @@ static struct irq_cfg *get_one_free_irq_cfg(int cpu) return cfg; } -int arch_init_chip_data(struct irq_desc *desc, int cpu) +int arch_init_chip_data(struct irq_desc *desc, int node) { struct irq_cfg *cfg; cfg = desc->chip_data; if (!cfg) { - desc->chip_data = get_one_free_irq_cfg(cpu); + desc->chip_data = get_one_free_irq_cfg(node); if (!desc->chip_data) { printk(KERN_ERR "can not alloc irq_cfg\n"); BUG_ON(1); @@ -253,7 +247,7 @@ int arch_init_chip_data(struct irq_desc *desc, int cpu) /* for move_irq_desc */ static void -init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int cpu) +init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int node) { struct irq_pin_list *old_entry, *head, *tail, *entry; @@ -262,7 +256,7 @@ init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int cpu) if (!old_entry) return; - entry = get_one_free_irq_2_pin(cpu); + entry = get_one_free_irq_2_pin(node); if (!entry) return; @@ -272,7 +266,7 @@ init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int cpu) tail = entry; old_entry = old_entry->next; while (old_entry) { - entry = get_one_free_irq_2_pin(cpu); + entry = get_one_free_irq_2_pin(node); if (!entry) { entry = head; while (entry) { @@ -312,12 +306,12 @@ static void free_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg) } void arch_init_copy_chip_data(struct irq_desc *old_desc, - struct irq_desc *desc, int cpu) + struct irq_desc *desc, int node) { struct irq_cfg *cfg; struct irq_cfg *old_cfg; - cfg = get_one_free_irq_cfg(cpu); + cfg = get_one_free_irq_cfg(node); if (!cfg) return; @@ -328,7 +322,7 @@ void arch_init_copy_chip_data(struct irq_desc *old_desc, memcpy(cfg, old_cfg, sizeof(struct irq_cfg)); - init_copy_irq_2_pin(old_cfg, cfg, cpu); + init_copy_irq_2_pin(old_cfg, cfg, node); } static void free_irq_cfg(struct irq_cfg *old_cfg) @@ -615,13 +609,13 @@ set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask) * shared ISA-space IRQs, so we have to support them. We are super * fast in the common case, and fast for shared ISA-space IRQs. */ -static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin) +static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin) { struct irq_pin_list *entry; entry = cfg->irq_2_pin; if (!entry) { - entry = get_one_free_irq_2_pin(cpu); + entry = get_one_free_irq_2_pin(node); if (!entry) { printk(KERN_ERR "can not alloc irq_2_pin to add %d - %d\n", apic, pin); @@ -641,7 +635,7 @@ static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin) entry = entry->next; } - entry->next = get_one_free_irq_2_pin(cpu); + entry->next = get_one_free_irq_2_pin(node); entry = entry->next; entry->apic = apic; entry->pin = pin; @@ -650,7 +644,7 @@ static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin) /* * Reroute an IRQ to a different pin. */ -static void __init replace_pin_at_irq_cpu(struct irq_cfg *cfg, int cpu, +static void __init replace_pin_at_irq_node(struct irq_cfg *cfg, int node, int oldapic, int oldpin, int newapic, int newpin) { @@ -670,7 +664,7 @@ static void __init replace_pin_at_irq_cpu(struct irq_cfg *cfg, int cpu, /* why? call replace before add? */ if (!replaced) - add_pin_to_irq_cpu(cfg, cpu, newapic, newpin); + add_pin_to_irq_node(cfg, node, newapic, newpin); } static inline void io_apic_modify_irq(struct irq_cfg *cfg, @@ -1612,7 +1606,7 @@ static void __init setup_IO_APIC_irqs(void) int notcon = 0; struct irq_desc *desc; struct irq_cfg *cfg; - int cpu = boot_cpu_id; + int node = cpu_to_node(boot_cpu_id); apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); @@ -1647,13 +1641,13 @@ static void __init setup_IO_APIC_irqs(void) apic->multi_timer_check(apic_id, irq)) continue; - desc = irq_to_desc_alloc_cpu(irq, cpu); + desc = irq_to_desc_alloc_node(irq, node); if (!desc) { printk(KERN_INFO "can not get irq_desc for %d\n", irq); continue; } cfg = desc->chip_data; - add_pin_to_irq_cpu(cfg, cpu, apic_id, pin); + add_pin_to_irq_node(cfg, node, apic_id, pin); setup_IO_APIC_irq(apic_id, pin, irq, desc, irq_trigger(idx), irq_polarity(idx)); @@ -2863,7 +2857,7 @@ static inline void __init check_timer(void) { struct irq_desc *desc = irq_to_desc(0); struct irq_cfg *cfg = desc->chip_data; - int cpu = boot_cpu_id; + int node = cpu_to_node(boot_cpu_id); int apic1, pin1, apic2, pin2; unsigned long flags; int no_pin1 = 0; @@ -2929,7 +2923,7 @@ static inline void __init check_timer(void) * Ok, does IRQ0 through the IOAPIC work? */ if (no_pin1) { - add_pin_to_irq_cpu(cfg, cpu, apic1, pin1); + add_pin_to_irq_node(cfg, node, apic1, pin1); setup_timer_IRQ0_pin(apic1, pin1, cfg->vector); } else { /* for edge trigger, setup_IO_APIC_irq already @@ -2966,7 +2960,7 @@ static inline void __init check_timer(void) /* * legacy devices should be connected to IO APIC #0 */ - replace_pin_at_irq_cpu(cfg, cpu, apic1, pin1, apic2, pin2); + replace_pin_at_irq_node(cfg, node, apic1, pin1, apic2, pin2); setup_timer_IRQ0_pin(apic2, pin2, cfg->vector); enable_8259A_irq(0); if (timer_irq_works()) { @@ -3185,7 +3179,7 @@ unsigned int create_irq_nr(unsigned int irq_want) unsigned int new; unsigned long flags; struct irq_cfg *cfg_new = NULL; - int cpu = boot_cpu_id; + int node = cpu_to_node(boot_cpu_id); struct irq_desc *desc_new = NULL; irq = 0; @@ -3194,7 +3188,7 @@ unsigned int create_irq_nr(unsigned int irq_want) spin_lock_irqsave(&vector_lock, flags); for (new = irq_want; new < nr_irqs; new++) { - desc_new = irq_to_desc_alloc_cpu(new, cpu); + desc_new = irq_to_desc_alloc_node(new, node); if (!desc_new) { printk(KERN_INFO "can not get irq_desc for %d\n", new); continue; @@ -3968,7 +3962,7 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int p { struct irq_desc *desc; struct irq_cfg *cfg; - int cpu = boot_cpu_id; + int node = cpu_to_node(boot_cpu_id); if (!IO_APIC_IRQ(irq)) { apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", @@ -3976,7 +3970,7 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int p return -EINVAL; } - desc = irq_to_desc_alloc_cpu(irq, cpu); + desc = irq_to_desc_alloc_node(irq, node); if (!desc) { printk(KERN_INFO "can not get irq_desc %d\n", irq); return 0; @@ -3987,7 +3981,7 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int p */ if (irq >= NR_IRQS_LEGACY) { cfg = desc->chip_data; - add_pin_to_irq_cpu(cfg, cpu, ioapic, pin); + add_pin_to_irq_node(cfg, node, ioapic, pin); } setup_IO_APIC_irq(ioapic, pin, irq, desc, triggering, polarity); -- cgit v1.1 From a2f809b08ae4dddc1015c7dcd8659e5729e45b3e Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 27 Apr 2009 18:01:20 -0700 Subject: irq: change ACPI GSI APIs to also take a device argument We want to use dev_to_node() later on, to be aware of the 'home node' of the GSI in question. [ Impact: cleanup, prepare the IRQ code to be more NUMA aware ] Signed-off-by: Yinghai Lu Acked-by: Len Brown Cc: Andrew Morton Cc: Suresh Siddha Cc: "Eric W. Biederman" Cc: Rusty Russell Cc: Len Brown Cc: Bjorn Helgaas Cc: Tony Luck Cc: linux-acpi@vger.kernel.org Cc: linux-ia64@vger.kernel.org LKML-Reference: <49F65560.20904@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/acpi/boot.c | 8 ++++---- arch/x86/kernel/apic/io_apic.c | 3 ++- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 723989d..6ee96b5 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -522,7 +522,7 @@ int acpi_gsi_to_irq(u32 gsi, unsigned int *irq) * success: return IRQ number (>=0) * failure: return < 0 */ -int acpi_register_gsi(u32 gsi, int triggering, int polarity) +int acpi_register_gsi(struct device *dev, u32 gsi, int triggering, int polarity) { unsigned int irq; unsigned int plat_gsi = gsi; @@ -539,7 +539,7 @@ int acpi_register_gsi(u32 gsi, int triggering, int polarity) #ifdef CONFIG_X86_IO_APIC if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC) { - plat_gsi = mp_register_gsi(gsi, triggering, polarity); + plat_gsi = mp_register_gsi(dev, gsi, triggering, polarity); } #endif acpi_gsi_to_irq(plat_gsi, &irq); @@ -1158,7 +1158,7 @@ void __init mp_config_acpi_legacy_irqs(void) } } -int mp_register_gsi(u32 gsi, int triggering, int polarity) +int mp_register_gsi(struct device *dev, u32 gsi, int triggering, int polarity) { int ioapic; int ioapic_pin; @@ -1253,7 +1253,7 @@ int mp_register_gsi(u32 gsi, int triggering, int polarity) } } #endif - io_apic_set_pci_routing(ioapic, ioapic_pin, gsi, + io_apic_set_pci_routing(dev, ioapic, ioapic_pin, gsi, triggering == ACPI_EDGE_SENSITIVE ? 0 : 1, polarity == ACPI_ACTIVE_HIGH ? 0 : 1); return gsi; diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 560b887..d934662 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3958,7 +3958,8 @@ int __init io_apic_get_version(int ioapic) } #endif -int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity) +int io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, int irq, + int triggering, int polarity) { struct irq_desc *desc; struct irq_cfg *cfg; -- cgit v1.1 From 024154cfdd802654cb236a18c78b6e37351e2c49 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 27 Apr 2009 18:01:50 -0700 Subject: irq: change io_apic_set_pci_routing() to use device parameter Make actual use of the device parameter passed down to io_apic_set_pci_routing() - to have the IRQ descriptor on the home node of the device. If no device has been passed down, we assume it's a platform device and use the boot node ID for the IRQ descriptor. [ Impact: optimization, make IO-APIC code more NUMA aware ] Signed-off-by: Yinghai Lu Cc: Andrew Morton Cc: Suresh Siddha Cc: "Eric W. Biederman" Cc: Rusty Russell LKML-Reference: <49F6557E.3080101@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index d934662..82376e0 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3963,7 +3963,7 @@ int io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, int irq, { struct irq_desc *desc; struct irq_cfg *cfg; - int node = cpu_to_node(boot_cpu_id); + int node; if (!IO_APIC_IRQ(irq)) { apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", @@ -3971,6 +3971,11 @@ int io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, int irq, return -EINVAL; } + if (dev) + node = dev_to_node(dev); + else + node = cpu_to_node(boot_cpu_id); + desc = irq_to_desc_alloc_node(irq, node); if (!desc) { printk(KERN_INFO "can not get irq_desc %d\n", irq); -- cgit v1.1 From d047f53a2ecce37e3bdf79eac5a326fbaadb3628 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 27 Apr 2009 18:02:23 -0700 Subject: x86/irq: change MSI irq_desc to be more numa aware Try to get irq_desc on the home node in create_irq_nr(). v2: don't check if we can move it when sparse_irq is not used v3: use move_irq_des, if that node is not what we want [ Impact: optimization, make MSI IRQ descriptors more NUMA aware ] Signed-off-by: Yinghai Lu Cc: Andrew Morton Cc: Suresh Siddha Cc: "Eric W. Biederman" Cc: Rusty Russell LKML-Reference: <49F6559F.7070005@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 82376e0..9cd4806 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3172,14 +3172,13 @@ static int nr_irqs_gsi = NR_IRQS_LEGACY; /* * Dynamic irq allocate and deallocation */ -unsigned int create_irq_nr(unsigned int irq_want) +unsigned int create_irq_nr(unsigned int irq_want, int node) { /* Allocate an unused irq */ unsigned int irq; unsigned int new; unsigned long flags; struct irq_cfg *cfg_new = NULL; - int node = cpu_to_node(boot_cpu_id); struct irq_desc *desc_new = NULL; irq = 0; @@ -3197,6 +3196,13 @@ unsigned int create_irq_nr(unsigned int irq_want) if (cfg_new->vector != 0) continue; + +#ifdef CONFIG_NUMA_IRQ_DESC + /* different node ?*/ + if (desc_new->node != node) + desc = move_irq_desc(desc, node); +#endif + if (__assign_irq_vector(new, cfg_new, apic->target_cpus()) == 0) irq = new; break; @@ -3214,11 +3220,12 @@ unsigned int create_irq_nr(unsigned int irq_want) int create_irq(void) { + int node = cpu_to_node(boot_cpu_id); unsigned int irq_want; int irq; irq_want = nr_irqs_gsi; - irq = create_irq_nr(irq_want); + irq = create_irq_nr(irq_want, node); if (irq == 0) irq = -1; @@ -3476,15 +3483,17 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) unsigned int irq_want; struct intel_iommu *iommu = NULL; int index = 0; + int node; /* x86 doesn't support multiple MSI yet */ if (type == PCI_CAP_ID_MSI && nvec > 1) return 1; + node = dev_to_node(&dev->dev); irq_want = nr_irqs_gsi; sub_handle = 0; list_for_each_entry(msidesc, &dev->msi_list, list) { - irq = create_irq_nr(irq_want); + irq = create_irq_nr(irq_want, node); if (irq == 0) return -1; irq_want = irq + 1; -- cgit v1.1 From aee6a166a5401dcfcb17fcdc055e5edf2a4f4042 Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Wed, 29 Apr 2009 09:47:17 +0200 Subject: x86: beautify vmlinux_32.lds.S Beautify vmlinux_32.lds.S: - Use tabs for indent - Located curly braces like in C code - Rearranged a few comments To see actual differences use "git diff -b" which ignore 'whitespace' changes. The beautification is done to prepare a unification of the _32 and _64 variants of the linker scripts. [ Impact: cleanup ] Signed-off-by: Sam Ravnborg Cc: Tim Abbott Cc: Linus Torvalds LKML-Reference: <1240991249-27117-1-git-send-email-sam@ravnborg.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmlinux_32.lds.S | 376 +++++++++++++++++++++------------------ 1 file changed, 200 insertions(+), 176 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S index 62ad500..fffa45a 100644 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ b/arch/x86/kernel/vmlinux_32.lds.S @@ -22,196 +22,220 @@ ENTRY(phys_startup_32) jiffies = jiffies_64; PHDRS { - text PT_LOAD FLAGS(5); /* R_E */ - data PT_LOAD FLAGS(7); /* RWE */ - note PT_NOTE FLAGS(0); /* ___ */ + text PT_LOAD FLAGS(5); /* R_E */ + data PT_LOAD FLAGS(7); /* RWE */ + note PT_NOTE FLAGS(0); /* ___ */ } SECTIONS { - . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR; - phys_startup_32 = startup_32 - LOAD_OFFSET; - - .text.head : AT(ADDR(.text.head) - LOAD_OFFSET) { - _text = .; /* Text and read-only data */ - *(.text.head) - } :text = 0x9090 - - /* read-only */ - .text : AT(ADDR(.text) - LOAD_OFFSET) { - . = ALIGN(PAGE_SIZE); /* not really needed, already page aligned */ - *(.text.page_aligned) - TEXT_TEXT - SCHED_TEXT - LOCK_TEXT - KPROBES_TEXT - IRQENTRY_TEXT - *(.fixup) - *(.gnu.warning) - _etext = .; /* End of text section */ - } :text = 0x9090 - - NOTES :text :note - - . = ALIGN(16); /* Exception table */ - __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { - __start___ex_table = .; - *(__ex_table) - __stop___ex_table = .; - } :text = 0x9090 - - RODATA - - /* writeable */ - . = ALIGN(PAGE_SIZE); - .data : AT(ADDR(.data) - LOAD_OFFSET) { /* Data */ - DATA_DATA - CONSTRUCTORS + . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR; + phys_startup_32 = startup_32 - LOAD_OFFSET; + + /* Text and read-only data */ + .text.head : AT(ADDR(.text.head) - LOAD_OFFSET) { + _text = .; + *(.text.head) + } :text = 0x9090 + + /* read-only */ + .text : AT(ADDR(.text) - LOAD_OFFSET) { + /* not really needed, already page aligned */ + . = ALIGN(PAGE_SIZE); + *(.text.page_aligned) + TEXT_TEXT + SCHED_TEXT + LOCK_TEXT + KPROBES_TEXT + IRQENTRY_TEXT + *(.fixup) + *(.gnu.warning) + /* End of text section */ + _etext = .; + } :text = 0x9090 + + NOTES :text :note + + /* Exception table */ + . = ALIGN(16); + __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { + __start___ex_table = .; + *(__ex_table) + __stop___ex_table = .; + } :text = 0x9090 + + RODATA + + /* writeable */ + . = ALIGN(PAGE_SIZE); + /* Data */ + .data : AT(ADDR(.data) - LOAD_OFFSET) { + DATA_DATA + CONSTRUCTORS } :data - . = ALIGN(PAGE_SIZE); - .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { - __nosave_begin = .; - *(.data.nosave) - . = ALIGN(PAGE_SIZE); - __nosave_end = .; - } - - . = ALIGN(PAGE_SIZE); - .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { - *(.data.page_aligned) - *(.data.idt) - } - - . = ALIGN(32); - .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) { - *(.data.cacheline_aligned) - } - - /* rarely changed data like cpu maps */ - . = ALIGN(32); - .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) { - *(.data.read_mostly) - _edata = .; /* End of data section */ - } - - . = ALIGN(THREAD_SIZE); /* init_task */ - .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { - *(.data.init_task) - } - - /* might get freed after init */ - . = ALIGN(PAGE_SIZE); - .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { - __smp_locks = .; - *(.smp_locks) - __smp_locks_end = .; - } - /* will be freed after init - * Following ALIGN() is required to make sure no other data falls on the - * same page where __smp_alt_end is pointing as that page might be freed - * after boot. Always make sure that ALIGN() directive is present after - * the section which contains __smp_alt_end. - */ - . = ALIGN(PAGE_SIZE); - - /* will be freed after init */ - . = ALIGN(PAGE_SIZE); /* Init code and data */ - .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { - __init_begin = .; - _sinittext = .; - INIT_TEXT - _einittext = .; - } - .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { - INIT_DATA - } - . = ALIGN(16); - .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { - __setup_start = .; - *(.init.setup) - __setup_end = .; - } - .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) { - __initcall_start = .; - INITCALLS - __initcall_end = .; - } - .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) { - __con_initcall_start = .; - *(.con_initcall.init) - __con_initcall_end = .; - } - .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { - __x86_cpu_dev_start = .; - *(.x86_cpu_dev.init) - __x86_cpu_dev_end = .; - } - SECURITY_INIT - . = ALIGN(4); - .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { - __alt_instructions = .; - *(.altinstructions) - __alt_instructions_end = .; - } - .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) { - *(.altinstr_replacement) - } - . = ALIGN(4); - .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { - __parainstructions = .; - *(.parainstructions) - __parainstructions_end = .; - } - /* .exit.text is discard at runtime, not link time, to deal with references - from .altinstructions and .eh_frame */ - .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { - EXIT_TEXT - } - .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { - EXIT_DATA - } + . = ALIGN(PAGE_SIZE); + .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { + __nosave_begin = .; + *(.data.nosave) + . = ALIGN(PAGE_SIZE); + __nosave_end = .; + } + + . = ALIGN(PAGE_SIZE); + .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { + *(.data.page_aligned) + *(.data.idt) + } + + . = ALIGN(32); + .data.cacheline_aligned : + AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) { + *(.data.cacheline_aligned) + } + + /* rarely changed data like cpu maps */ + . = ALIGN(32); + .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) { + *(.data.read_mostly) + + /* End of data section */ + _edata = .; + } + + /* init_task */ + . = ALIGN(THREAD_SIZE); + .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { + *(.data.init_task) + } + + . = ALIGN(PAGE_SIZE); + .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { + /* might get freed after init */ + __smp_locks = .; + *(.smp_locks) + __smp_locks_end = .; + } + /* will be freed after init + * Following ALIGN() is required to make sure no other data falls on the + * same page where __smp_alt_end is pointing as that page might be freed + * after boot. Always make sure that ALIGN() directive is present after + * the section which contains __smp_alt_end. + */ + . = ALIGN(PAGE_SIZE); + + /* Init code and data - will be freed after init */ + . = ALIGN(PAGE_SIZE); + .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { + __init_begin = .; + _sinittext = .; + INIT_TEXT + _einittext = .; + } + + .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { + INIT_DATA + } + + . = ALIGN(16); + .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { + __setup_start = .; + *(.init.setup) + __setup_end = .; + } + .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) { + __initcall_start = .; + INITCALLS + __initcall_end = .; + } + + .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) { + __con_initcall_start = .; + *(.con_initcall.init) + __con_initcall_end = .; + } + + .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { + __x86_cpu_dev_start = .; + *(.x86_cpu_dev.init) + __x86_cpu_dev_end = .; + } + + SECURITY_INIT + + . = ALIGN(4); + .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { + __alt_instructions = .; + *(.altinstructions) + __alt_instructions_end = .; + } + + .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) { + *(.altinstr_replacement) + } + + . = ALIGN(4); + .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { + __parainstructions = .; + *(.parainstructions) + __parainstructions_end = .; + } + + /* + * .exit.text is discard at runtime, not link time, to deal with + * references from .altinstructions and .eh_frame + */ + .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { + EXIT_TEXT + } + + .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { + EXIT_DATA + } + #if defined(CONFIG_BLK_DEV_INITRD) - . = ALIGN(PAGE_SIZE); - .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { - __initramfs_start = .; - *(.init.ramfs) - __initramfs_end = .; - } + . = ALIGN(PAGE_SIZE); + .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { + __initramfs_start = .; + *(.init.ramfs) + __initramfs_end = .; + } #endif - PERCPU(PAGE_SIZE) - . = ALIGN(PAGE_SIZE); - /* freed after init ends here */ - - .bss : AT(ADDR(.bss) - LOAD_OFFSET) { - __init_end = .; - __bss_start = .; /* BSS */ - *(.bss.page_aligned) - *(.bss) - . = ALIGN(4); - __bss_stop = .; - } - .brk : AT(ADDR(.brk) - LOAD_OFFSET) { + PERCPU(PAGE_SIZE) + . = ALIGN(PAGE_SIZE); - __brk_base = . ; - . += 64 * 1024 ; /* 64k alignment slop space */ - *(.brk_reservation) /* areas brk users have reserved */ - __brk_limit = . ; - } + /* freed after init ends here */ + + /* BSS */ + .bss : AT(ADDR(.bss) - LOAD_OFFSET) { + __init_end = .; + __bss_start = .; + *(.bss.page_aligned) + *(.bss) + . = ALIGN(4); + __bss_stop = .; + } - .end : AT(ADDR(.end) - LOAD_OFFSET) { - _end = . ; - } + .brk : AT(ADDR(.brk) - LOAD_OFFSET) { + . = ALIGN(PAGE_SIZE); + __brk_base = .; + . += 64 * 1024; /* 64k alignment slop space */ + *(.brk_reservation) /* areas brk users have reserved */ + __brk_limit = .; + } - /* Sections to be discarded */ - /DISCARD/ : { - *(.exitcall.exit) - *(.discard) + .end : AT(ADDR(.end) - LOAD_OFFSET) { + _end = . ; } - STABS_DEBUG + /* Sections to be discarded */ + /DISCARD/ : { + *(.exitcall.exit) + *(.discard) + } - DWARF_DEBUG + STABS_DEBUG + DWARF_DEBUG } /* -- cgit v1.1 From 17ce265d6a1789eae5eb739a3bb7fcffdb3e87c5 Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Wed, 29 Apr 2009 09:47:18 +0200 Subject: x86, vmlinux.lds: unify header/footer Merge everything except PHDRS and SECTIONS into vmlinux.lds.S. [ Impact: cleanup ] Signed-off-by: Sam Ravnborg Cc: Tim Abbott Cc: Linus Torvalds LKML-Reference: <1240991249-27117-2-git-send-email-sam@ravnborg.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmlinux.lds.S | 77 ++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/vmlinux_32.lds.S | 37 ------------------- arch/x86/kernel/vmlinux_64.lds.S | 42 ---------------------- 3 files changed, 77 insertions(+), 79 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 849ee61..d113642 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -1,5 +1,82 @@ +/* + * ld script for the x86 kernel + * + * Historic 32-bit version written by Martin Mares + * + * Modernisation and unification done by Sam Ravnborg + * + * + * Don't define absolute symbols until and unless you know that symbol + * value is should remain constant even if kernel image is relocated + * at run time. Absolute symbols are not relocated. If symbol value should + * change if kernel is relocated, make the symbol section relative and + * put it inside the section definition. + */ + +#ifdef CONFIG_X86_32 +#define LOAD_OFFSET __PAGE_OFFSET +#else +#define LOAD_OFFSET __START_KERNEL_map +#endif + +#include +#include +#include +#include +#include +#include + +#undef i386 /* in case the preprocessor is a 32bit one */ + +OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT) + +#ifdef CONFIG_X86_32 +OUTPUT_ARCH(i386) +ENTRY(phys_startup_32) +jiffies = jiffies_64; +#else +OUTPUT_ARCH(i386:x86-64) +ENTRY(phys_startup_64) +jiffies_64 = jiffies; +#endif + + #ifdef CONFIG_X86_32 # include "vmlinux_32.lds.S" #else # include "vmlinux_64.lds.S" #endif + + +#ifdef CONFIG_X86_32 +ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE), + "kernel image bigger than KERNEL_IMAGE_SIZE") +#else +/* + * Per-cpu symbols which need to be offset from __per_cpu_load + * for the boot processor. + */ +#define INIT_PER_CPU(x) init_per_cpu__##x = per_cpu__##x + __per_cpu_load +INIT_PER_CPU(gdt_page); +INIT_PER_CPU(irq_stack_union); + +/* + * Build-time check on the image size: + */ +ASSERT((_end - _text <= KERNEL_IMAGE_SIZE), + "kernel image bigger than KERNEL_IMAGE_SIZE") + +#ifdef CONFIG_SMP +ASSERT((per_cpu__irq_stack_union == 0), + "irq_stack_union is not at start of per-cpu area"); +#endif + +#endif /* CONFIG_X86_32 */ + +#ifdef CONFIG_KEXEC +#include + +ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE, + "kexec control code size is too big") +#endif + diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S index fffa45a..4c985fc 100644 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ b/arch/x86/kernel/vmlinux_32.lds.S @@ -1,26 +1,3 @@ -/* ld script to make i386 Linux kernel - * Written by Martin Mares ; - * - * Don't define absolute symbols until and unless you know that symbol - * value is should remain constant even if kernel image is relocated - * at run time. Absolute symbols are not relocated. If symbol value should - * change if kernel is relocated, make the symbol section relative and - * put it inside the section definition. - */ - -#define LOAD_OFFSET __PAGE_OFFSET - -#include -#include -#include -#include -#include - -OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386") -OUTPUT_ARCH(i386) -ENTRY(phys_startup_32) -jiffies = jiffies_64; - PHDRS { text PT_LOAD FLAGS(5); /* R_E */ data PT_LOAD FLAGS(7); /* RWE */ @@ -237,17 +214,3 @@ SECTIONS STABS_DEBUG DWARF_DEBUG } - -/* - * Build-time check on the image size: - */ -ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE), - "kernel image bigger than KERNEL_IMAGE_SIZE") - -#ifdef CONFIG_KEXEC -/* Link time checks */ -#include - -ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE, - "kexec control code size is too big") -#endif diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index 6d5a5b0..7f1cc3d 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -1,19 +1,3 @@ -/* ld script to make x86-64 Linux kernel - * Written by Martin Mares ; - */ - -#define LOAD_OFFSET __START_KERNEL_map - -#include -#include -#include - -#undef i386 /* in case the preprocessor is a 32bit one */ - -OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64") -OUTPUT_ARCH(i386:x86-64) -ENTRY(phys_startup_64) -jiffies_64 = jiffies; PHDRS { text PT_LOAD FLAGS(5); /* R_E */ data PT_LOAD FLAGS(7); /* RWE */ @@ -308,29 +292,3 @@ SECTIONS STABS_DEBUG DWARF_DEBUG } - -/* - * Per-cpu symbols which need to be offset from __per_cpu_load - * for the boot processor. - */ -#define INIT_PER_CPU(x) init_per_cpu__##x = per_cpu__##x + __per_cpu_load -INIT_PER_CPU(gdt_page); -INIT_PER_CPU(irq_stack_union); - -/* - * Build-time check on the image size: - */ -ASSERT((_end - _text <= KERNEL_IMAGE_SIZE), - "kernel image bigger than KERNEL_IMAGE_SIZE") - -#ifdef CONFIG_SMP -ASSERT((per_cpu__irq_stack_union == 0), - "irq_stack_union is not at start of per-cpu area"); -#endif - -#ifdef CONFIG_KEXEC -#include - -ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE, - "kexec control code size is too big") -#endif -- cgit v1.1 From afb8095a7eab32e5760613fa73d2f80a39cc45bf Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Wed, 29 Apr 2009 09:47:19 +0200 Subject: x86, vmlinux.lds: unify PHDRS PHDRS are not equal for the two - so use ifdefs to cover up for that. On the assumption that they may become equal the ifdef is inside the PHDRS definiton. [ Impact: cleanup ] Signed-off-by: Sam Ravnborg Cc: Tim Abbott Cc: Linus Torvalds LKML-Reference: <1240991249-27117-3-git-send-email-sam@ravnborg.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmlinux.lds.S | 13 +++++++++++++ arch/x86/kernel/vmlinux_32.lds.S | 5 ----- arch/x86/kernel/vmlinux_64.lds.S | 11 ----------- 3 files changed, 13 insertions(+), 16 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index d113642..1a1b303 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -40,6 +40,19 @@ ENTRY(phys_startup_64) jiffies_64 = jiffies; #endif +PHDRS { + text PT_LOAD FLAGS(5); /* R_E */ + data PT_LOAD FLAGS(7); /* RWE */ +#ifdef CONFIG_X86_64 + user PT_LOAD FLAGS(7); /* RWE */ + data.init PT_LOAD FLAGS(7); /* RWE */ +#ifdef CONFIG_SMP + percpu PT_LOAD FLAGS(7); /* RWE */ +#endif + data.init2 PT_LOAD FLAGS(7); /* RWE */ +#endif + note PT_NOTE FLAGS(0); /* ___ */ +} #ifdef CONFIG_X86_32 # include "vmlinux_32.lds.S" diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S index 4c985fc..4fd40dc 100644 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ b/arch/x86/kernel/vmlinux_32.lds.S @@ -1,8 +1,3 @@ -PHDRS { - text PT_LOAD FLAGS(5); /* R_E */ - data PT_LOAD FLAGS(7); /* RWE */ - note PT_NOTE FLAGS(0); /* ___ */ -} SECTIONS { . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR; diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index 7f1cc3d..6e7cbee 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -1,14 +1,3 @@ -PHDRS { - text PT_LOAD FLAGS(5); /* R_E */ - data PT_LOAD FLAGS(7); /* RWE */ - user PT_LOAD FLAGS(7); /* RWE */ - data.init PT_LOAD FLAGS(7); /* RWE */ -#ifdef CONFIG_SMP - percpu PT_LOAD FLAGS(7); /* RWE */ -#endif - data.init2 PT_LOAD FLAGS(7); /* RWE */ - note PT_NOTE FLAGS(0); /* ___ */ -} SECTIONS { . = __START_KERNEL; -- cgit v1.1 From 444e0ae4831f99ba25062d9a5ccb7117c62841a0 Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Wed, 29 Apr 2009 09:47:20 +0200 Subject: x86, vmlinux.lds: unify start/end of SECTIONS [ Impact: cleanup ] Signed-off-by: Sam Ravnborg Cc: Tim Abbott Cc: Linus Torvalds LKML-Reference: <1240991249-27117-4-git-send-email-sam@ravnborg.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmlinux.lds.S | 14 ++++++++++++++ arch/x86/kernel/vmlinux_32.lds.S | 9 --------- arch/x86/kernel/vmlinux_64.lds.S | 9 --------- 3 files changed, 14 insertions(+), 18 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 1a1b303..845776f 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -54,12 +54,26 @@ PHDRS { note PT_NOTE FLAGS(0); /* ___ */ } +SECTIONS +{ +#ifdef CONFIG_X86_32 + . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR; + phys_startup_32 = startup_32 - LOAD_OFFSET; +#else + . = __START_KERNEL; + phys_startup_64 = startup_64 - LOAD_OFFSET; +#endif + #ifdef CONFIG_X86_32 # include "vmlinux_32.lds.S" #else # include "vmlinux_64.lds.S" #endif + STABS_DEBUG + DWARF_DEBUG +} + #ifdef CONFIG_X86_32 ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE), diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S index 4fd40dc..3d3d49c 100644 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ b/arch/x86/kernel/vmlinux_32.lds.S @@ -1,8 +1,3 @@ -SECTIONS -{ - . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR; - phys_startup_32 = startup_32 - LOAD_OFFSET; - /* Text and read-only data */ .text.head : AT(ADDR(.text.head) - LOAD_OFFSET) { _text = .; @@ -205,7 +200,3 @@ SECTIONS *(.exitcall.exit) *(.discard) } - - STABS_DEBUG - DWARF_DEBUG -} diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index 6e7cbee..2d7fa20 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -1,8 +1,3 @@ -SECTIONS -{ - . = __START_KERNEL; - phys_startup_64 = startup_64 - LOAD_OFFSET; - /* Text and read-only data */ .text : AT(ADDR(.text) - LOAD_OFFSET) { _text = .; @@ -277,7 +272,3 @@ SECTIONS *(.eh_frame) *(.discard) } - - STABS_DEBUG - DWARF_DEBUG -} -- cgit v1.1 From dfc20895d944cfa81d8ff00809b68ecb8f72cbb0 Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Wed, 29 Apr 2009 09:47:21 +0200 Subject: x86, vmlinux.lds: unify .text output sections 32 bit x86 had a dedicated .text.head output section, whereas 64 bit had it all in a single output section. In the unified version the dedicated .text.head output section was kept to have full control over the head code. 32 bit: - Moved definition of _stext to the linker script. The definition is located _after_ .text.page_aligned as this is what 32 bit did before. The ALIGN(8) was introduced so we hit the exact same address (on the tested config) before and after the move. I assume that it is a bug that _stext did not cover the .text.page_aligned section - if this is true it can be fixed in a follow-up patch (and the ugly ALIGN() can be dropped). [ Impact: 64-bit: cleanup, 32-bit: use the 64-bit linker script ] Signed-off-by: Sam Ravnborg Cc: Tim Abbott Cc: Linus Torvalds LKML-Reference: <1240991249-27117-5-git-send-email-sam@ravnborg.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/head_32.S | 7 ------- arch/x86/kernel/vmlinux.lds.S | 31 +++++++++++++++++++++++++++++++ arch/x86/kernel/vmlinux_32.lds.S | 24 ------------------------ arch/x86/kernel/vmlinux_64.lds.S | 20 -------------------- 4 files changed, 31 insertions(+), 51 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 3068388..dc5ed4b 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -608,13 +608,6 @@ ignore_int: ENTRY(initial_code) .long i386_start_kernel -.section .text -/* - * Real beginning of normal "text" segment - */ -ENTRY(stext) -ENTRY(_stext) - /* * BSS section */ diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 845776f..a7c88bb 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -64,6 +64,37 @@ SECTIONS phys_startup_64 = startup_64 - LOAD_OFFSET; #endif + /* Text and read-only data */ + + /* bootstrapping code */ + .text.head : AT(ADDR(.text.head) - LOAD_OFFSET) { + _text = .; + *(.text.head) + } :text = 0x9090 + + /* The rest of the text */ + .text : AT(ADDR(.text) - LOAD_OFFSET) { +#ifdef CONFIG_X86_32 + /* not really needed, already page aligned */ + . = ALIGN(PAGE_SIZE); + *(.text.page_aligned) +#endif + . = ALIGN(8); + _stext = .; + TEXT_TEXT + SCHED_TEXT + LOCK_TEXT + KPROBES_TEXT + IRQENTRY_TEXT + *(.fixup) + *(.gnu.warning) + /* End of text section */ + _etext = .; + } :text = 0x9090 + + NOTES :text :note + + #ifdef CONFIG_X86_32 # include "vmlinux_32.lds.S" #else diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S index 3d3d49c..8540092 100644 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ b/arch/x86/kernel/vmlinux_32.lds.S @@ -1,27 +1,3 @@ - /* Text and read-only data */ - .text.head : AT(ADDR(.text.head) - LOAD_OFFSET) { - _text = .; - *(.text.head) - } :text = 0x9090 - - /* read-only */ - .text : AT(ADDR(.text) - LOAD_OFFSET) { - /* not really needed, already page aligned */ - . = ALIGN(PAGE_SIZE); - *(.text.page_aligned) - TEXT_TEXT - SCHED_TEXT - LOCK_TEXT - KPROBES_TEXT - IRQENTRY_TEXT - *(.fixup) - *(.gnu.warning) - /* End of text section */ - _etext = .; - } :text = 0x9090 - - NOTES :text :note - /* Exception table */ . = ALIGN(16); __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index 2d7fa20..b5d4367 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -1,23 +1,3 @@ - /* Text and read-only data */ - .text : AT(ADDR(.text) - LOAD_OFFSET) { - _text = .; - /* First the code that has to be first for bootstrapping */ - *(.text.head) - _stext = .; - /* Then the rest */ - TEXT_TEXT - SCHED_TEXT - LOCK_TEXT - KPROBES_TEXT - IRQENTRY_TEXT - *(.fixup) - *(.gnu.warning) - /* End of text section */ - _etext = .; - } :text = 0x9090 - - NOTES :text :note - /* Exception table */ . = ALIGN(16); __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { -- cgit v1.1 From 448bc3ab0d03e77fee8e4264de0d001fc87bc164 Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Wed, 29 Apr 2009 09:47:22 +0200 Subject: x86, vmlinux.lds: unify exception table [ Impact: cleanup ] Signed-off-by: Sam Ravnborg Cc: Tim Abbott Cc: Linus Torvalds LKML-Reference: <1240991249-27117-6-git-send-email-sam@ravnborg.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmlinux.lds.S | 10 ++++++++++ arch/x86/kernel/vmlinux_32.lds.S | 10 ---------- arch/x86/kernel/vmlinux_64.lds.S | 10 ---------- 3 files changed, 10 insertions(+), 20 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index a7c88bb..67164f6 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -94,6 +94,16 @@ SECTIONS NOTES :text :note + /* Exception table */ + . = ALIGN(16); + __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { + __start___ex_table = .; + *(__ex_table) + __stop___ex_table = .; + } :text = 0x9090 + + RODATA + #ifdef CONFIG_X86_32 # include "vmlinux_32.lds.S" diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S index 8540092..920cc69 100644 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ b/arch/x86/kernel/vmlinux_32.lds.S @@ -1,13 +1,3 @@ - /* Exception table */ - . = ALIGN(16); - __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { - __start___ex_table = .; - *(__ex_table) - __stop___ex_table = .; - } :text = 0x9090 - - RODATA - /* writeable */ . = ALIGN(PAGE_SIZE); /* Data */ diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index b5d4367..641f3f9 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -1,13 +1,3 @@ - /* Exception table */ - . = ALIGN(16); - __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { - __start___ex_table = .; - *(__ex_table) - __stop___ex_table = .; - } :text = 0x9090 - - RODATA - /* Align data segment to page size boundary */ . = ALIGN(PAGE_SIZE); /* Data */ -- cgit v1.1 From 1f6397bac55040cd520d9eaf299e155a7aa01d5f Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Wed, 29 Apr 2009 09:47:23 +0200 Subject: x86, vmlinux.lds: unify data output sections For 64 bit the following functional changes are introduced: - .data.page_aligned has moved - .data.cacheline_aligned has moved - .data.read_mostly has moved - ALIGN() moved out of output section for .data.cacheline_aligned - ALIGN() moved out of output section for .data.page_aligned Notice that 32 bit and 64 bit has different location of _edata. .data_nosave is 32 bit only as 64 bit is special due to PERCPU. [ Impact: 32-bit: cleanup, 64-bit: use 32-bit linker script ] Signed-off-by: Sam Ravnborg Cc: Tim Abbott Cc: Linus Torvalds LKML-Reference: <1240991249-27117-7-git-send-email-sam@ravnborg.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmlinux.lds.S | 55 ++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/vmlinux_32.lds.S | 37 --------------------------- arch/x86/kernel/vmlinux_64.lds.S | 28 -------------------- 3 files changed, 55 insertions(+), 65 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 67164f6..067bdb0 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -104,6 +104,61 @@ SECTIONS RODATA + /* Data */ + . = ALIGN(PAGE_SIZE); + .data : AT(ADDR(.data) - LOAD_OFFSET) { + DATA_DATA + CONSTRUCTORS + +#ifdef CONFIG_X86_64 + /* End of data section */ + _edata = .; +#endif + } :data + +#ifdef CONFIG_X86_32 + /* 32 bit has nosave before _edata */ + . = ALIGN(PAGE_SIZE); + .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { + __nosave_begin = .; + *(.data.nosave) + . = ALIGN(PAGE_SIZE); + __nosave_end = .; + } +#endif + + . = ALIGN(PAGE_SIZE); + .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { + *(.data.page_aligned) + *(.data.idt) + } + +#ifdef CONFIG_X86_32 + . = ALIGN(32); +#else + . = ALIGN(PAGE_SIZE); + . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); +#endif + .data.cacheline_aligned : + AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) { + *(.data.cacheline_aligned) + } + + /* rarely changed data like cpu maps */ +#ifdef CONFIG_X86_32 + . = ALIGN(32); +#else + . = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES); +#endif + .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) { + *(.data.read_mostly) + +#ifdef CONFIG_X86_32 + /* End of data section */ + _edata = .; +#endif + } + #ifdef CONFIG_X86_32 # include "vmlinux_32.lds.S" diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S index 920cc69..8ade846 100644 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ b/arch/x86/kernel/vmlinux_32.lds.S @@ -1,40 +1,3 @@ - /* writeable */ - . = ALIGN(PAGE_SIZE); - /* Data */ - .data : AT(ADDR(.data) - LOAD_OFFSET) { - DATA_DATA - CONSTRUCTORS - } :data - - . = ALIGN(PAGE_SIZE); - .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { - __nosave_begin = .; - *(.data.nosave) - . = ALIGN(PAGE_SIZE); - __nosave_end = .; - } - - . = ALIGN(PAGE_SIZE); - .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { - *(.data.page_aligned) - *(.data.idt) - } - - . = ALIGN(32); - .data.cacheline_aligned : - AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) { - *(.data.cacheline_aligned) - } - - /* rarely changed data like cpu maps */ - . = ALIGN(32); - .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) { - *(.data.read_mostly) - - /* End of data section */ - _edata = .; - } - /* init_task */ . = ALIGN(THREAD_SIZE); .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index 641f3f9..8262701 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -1,26 +1,3 @@ - /* Align data segment to page size boundary */ - . = ALIGN(PAGE_SIZE); - /* Data */ - .data : AT(ADDR(.data) - LOAD_OFFSET) { - DATA_DATA - CONSTRUCTORS - /* End of data section */ - _edata = .; - } :data - - - .data.cacheline_aligned : - AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) { - . = ALIGN(PAGE_SIZE); - . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); - *(.data.cacheline_aligned) - } - - . = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES); - .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) { - *(.data.read_mostly) - } - #define VSYSCALL_ADDR (-10*1024*1024) #define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.read_mostly) + \ SIZEOF(.data.read_mostly) + 4095) & ~(4095)) @@ -95,11 +72,6 @@ *(.data.init_task) } :data.init - .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { - . = ALIGN(PAGE_SIZE); - *(.data.page_aligned) - } - .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { /* might get freed after init */ . = ALIGN(PAGE_SIZE); -- cgit v1.1 From ff6f87e1626e10beef675084c9b5384a9477e3d5 Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Wed, 29 Apr 2009 09:47:24 +0200 Subject: x86, vmlinux.lds: move vsyscall output sections [ Impact: cleanup ] Signed-off-by: Sam Ravnborg Cc: Tim Abbott Cc: Linus Torvalds LKML-Reference: <1240991249-27117-8-git-send-email-sam@ravnborg.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmlinux.lds.S | 71 ++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/vmlinux_64.lds.S | 68 -------------------------------------- 2 files changed, 71 insertions(+), 68 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 067bdb0..b3106c2 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -159,6 +159,77 @@ SECTIONS #endif } +#ifdef CONFIG_X86_64 + +#define VSYSCALL_ADDR (-10*1024*1024) +#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.read_mostly) + \ + SIZEOF(.data.read_mostly) + 4095) & ~(4095)) +#define VSYSCALL_VIRT_ADDR ((ADDR(.data.read_mostly) + \ + SIZEOF(.data.read_mostly) + 4095) & ~(4095)) + +#define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR) +#define VLOAD(x) (ADDR(x) - VLOAD_OFFSET) + +#define VVIRT_OFFSET (VSYSCALL_ADDR - VSYSCALL_VIRT_ADDR) +#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET) + + . = VSYSCALL_ADDR; + .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) { + *(.vsyscall_0) + } :user + + __vsyscall_0 = VSYSCALL_VIRT_ADDR; + + . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); + .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) { + *(.vsyscall_fn) + } + + . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); + .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) { + *(.vsyscall_gtod_data) + } + + vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data); + .vsyscall_clock : AT(VLOAD(.vsyscall_clock)) { + *(.vsyscall_clock) + } + vsyscall_clock = VVIRT(.vsyscall_clock); + + + .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) { + *(.vsyscall_1) + } + .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) { + *(.vsyscall_2) + } + + .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) { + *(.vgetcpu_mode) + } + vgetcpu_mode = VVIRT(.vgetcpu_mode); + + . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); + .jiffies : AT(VLOAD(.jiffies)) { + *(.jiffies) + } + jiffies = VVIRT(.jiffies); + + .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) { + *(.vsyscall_3) + } + + . = VSYSCALL_VIRT_ADDR + PAGE_SIZE; + +#undef VSYSCALL_ADDR +#undef VSYSCALL_PHYS_ADDR +#undef VSYSCALL_VIRT_ADDR +#undef VLOAD_OFFSET +#undef VLOAD +#undef VVIRT_OFFSET +#undef VVIRT + +#endif /* CONFIG_X86_64 */ #ifdef CONFIG_X86_32 # include "vmlinux_32.lds.S" diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index 8262701..013aa0e 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -1,71 +1,3 @@ -#define VSYSCALL_ADDR (-10*1024*1024) -#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.read_mostly) + \ - SIZEOF(.data.read_mostly) + 4095) & ~(4095)) -#define VSYSCALL_VIRT_ADDR ((ADDR(.data.read_mostly) + \ - SIZEOF(.data.read_mostly) + 4095) & ~(4095)) - -#define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR) -#define VLOAD(x) (ADDR(x) - VLOAD_OFFSET) - -#define VVIRT_OFFSET (VSYSCALL_ADDR - VSYSCALL_VIRT_ADDR) -#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET) - - . = VSYSCALL_ADDR; - .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) { - *(.vsyscall_0) - } :user - - __vsyscall_0 = VSYSCALL_VIRT_ADDR; - - . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); - .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) { - *(.vsyscall_fn) - } - - . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); - .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) { - *(.vsyscall_gtod_data) - } - - vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data); - .vsyscall_clock : AT(VLOAD(.vsyscall_clock)) { - *(.vsyscall_clock) - } - vsyscall_clock = VVIRT(.vsyscall_clock); - - - .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) { - *(.vsyscall_1) - } - .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) { - *(.vsyscall_2) - } - - .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) { - *(.vgetcpu_mode) - } - vgetcpu_mode = VVIRT(.vgetcpu_mode); - - . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); - .jiffies : AT(VLOAD(.jiffies)) { - *(.jiffies) - } - jiffies = VVIRT(.jiffies); - - .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) { - *(.vsyscall_3) - } - - . = VSYSCALL_VIRT_ADDR + PAGE_SIZE; - -#undef VSYSCALL_ADDR -#undef VSYSCALL_PHYS_ADDR -#undef VSYSCALL_VIRT_ADDR -#undef VLOAD_OFFSET -#undef VLOAD -#undef VVIRT_OFFSET -#undef VVIRT - /* init_task */ .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { . = ALIGN(THREAD_SIZE); -- cgit v1.1 From e58bdaa8f810332e5c1760ce496b01e07d51642c Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Wed, 29 Apr 2009 09:47:25 +0200 Subject: x86, vmlinux.lds: unify first part of initdata 32-bit: - Move definition of __init_begin outside output_section because it covers more than one section - Move ALIGN() for end-of-section inside .smp_locks output section. Same effect but the intent is better documented that we need both start and end aligned. 64-bit: - Move ALIGN() outside output section in .init.setup - Deleted unused __smp_alt_* symbols None of the above should result in any functional change. [ Impact: refactor and unify linker script ] Signed-off-by: Sam Ravnborg Cc: Tim Abbott Cc: Linus Torvalds LKML-Reference: <1240991249-27117-9-git-send-email-sam@ravnborg.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmlinux.lds.S | 61 ++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/vmlinux_32.lds.S | 60 --------------------------------------- arch/x86/kernel/vmlinux_64.lds.S | 59 -------------------------------------- 3 files changed, 61 insertions(+), 119 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index b3106c2..8b203c4 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -231,6 +231,67 @@ SECTIONS #endif /* CONFIG_X86_64 */ + /* init_task */ + . = ALIGN(THREAD_SIZE); + .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { + *(.data.init_task) + } +#ifdef CONFIG_X86_64 + :data.init +#endif + + /* + * smp_locks might be freed after init + * start/end must be page aligned + */ + . = ALIGN(PAGE_SIZE); + .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { + __smp_locks = .; + *(.smp_locks) + __smp_locks_end = .; + . = ALIGN(PAGE_SIZE); + } + + /* Init code and data - will be freed after init */ + . = ALIGN(PAGE_SIZE); + __init_begin = .; /* paired with __init_end */ + .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { + _sinittext = .; + INIT_TEXT + _einittext = .; + } + + .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { + INIT_DATA + } + + . = ALIGN(16); + .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { + __setup_start = .; + *(.init.setup) + __setup_end = .; + } + .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) { + __initcall_start = .; + INITCALLS + __initcall_end = .; + } + + .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) { + __con_initcall_start = .; + *(.con_initcall.init) + __con_initcall_end = .; + } + + .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { + __x86_cpu_dev_start = .; + *(.x86_cpu_dev.init) + __x86_cpu_dev_end = .; + } + + SECURITY_INIT + + #ifdef CONFIG_X86_32 # include "vmlinux_32.lds.S" #else diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S index 8ade846..d8ba539 100644 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ b/arch/x86/kernel/vmlinux_32.lds.S @@ -1,63 +1,3 @@ - /* init_task */ - . = ALIGN(THREAD_SIZE); - .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { - *(.data.init_task) - } - - . = ALIGN(PAGE_SIZE); - .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { - /* might get freed after init */ - __smp_locks = .; - *(.smp_locks) - __smp_locks_end = .; - } - /* will be freed after init - * Following ALIGN() is required to make sure no other data falls on the - * same page where __smp_alt_end is pointing as that page might be freed - * after boot. Always make sure that ALIGN() directive is present after - * the section which contains __smp_alt_end. - */ - . = ALIGN(PAGE_SIZE); - - /* Init code and data - will be freed after init */ - . = ALIGN(PAGE_SIZE); - .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { - __init_begin = .; - _sinittext = .; - INIT_TEXT - _einittext = .; - } - - .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { - INIT_DATA - } - - . = ALIGN(16); - .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { - __setup_start = .; - *(.init.setup) - __setup_end = .; - } - .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) { - __initcall_start = .; - INITCALLS - __initcall_end = .; - } - - .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) { - __con_initcall_start = .; - *(.con_initcall.init) - __con_initcall_end = .; - } - - .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { - __x86_cpu_dev_start = .; - *(.x86_cpu_dev.init) - __x86_cpu_dev_end = .; - } - - SECURITY_INIT - . = ALIGN(4); .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { __alt_instructions = .; diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index 013aa0e..0e8054e 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -1,62 +1,3 @@ - /* init_task */ - .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { - . = ALIGN(THREAD_SIZE); - *(.data.init_task) - } :data.init - - .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { - /* might get freed after init */ - . = ALIGN(PAGE_SIZE); - __smp_alt_begin = .; - __smp_locks = .; - *(.smp_locks) - __smp_locks_end = .; - . = ALIGN(PAGE_SIZE); - __smp_alt_end = .; - } - - /* Init code and data */ - . = ALIGN(PAGE_SIZE); - __init_begin = .; /* paired with __init_end */ - .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { - _sinittext = .; - INIT_TEXT - _einittext = .; - } - - .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { - __initdata_begin = .; - INIT_DATA - __initdata_end = .; - } - - .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { - . = ALIGN(16); - __setup_start = .; - *(.init.setup) - __setup_end = .; - } - - .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) { - __initcall_start = .; - INITCALLS - __initcall_end = .; - } - - .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) { - __con_initcall_start = .; - *(.con_initcall.init) - __con_initcall_end = .; - } - - .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { - __x86_cpu_dev_start = .; - *(.x86_cpu_dev.init) - __x86_cpu_dev_end = .; - } - - SECURITY_INIT - . = ALIGN(8); .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { __parainstructions = .; -- cgit v1.1 From ae61836289a415351caa524d328110aaeae100d4 Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Wed, 29 Apr 2009 09:47:26 +0200 Subject: x86, vmlinux.lds: unify parainstructions 32 bit: - increase alignment from 4 to 8 for .parainstructions - increase alignment from 4 to 8 for .altinstructions 64 bit: - move ALIGN() outside output section for .altinstructions None of the above should result in any functional change. [ Impact: refactor and unify linker script ] Signed-off-by: Sam Ravnborg Cc: Tim Abbott Cc: Linus Torvalds LKML-Reference: <1240991249-27117-10-git-send-email-sam@ravnborg.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmlinux.lds.S | 18 ++++++++++++++++++ arch/x86/kernel/vmlinux_32.lds.S | 18 ------------------ arch/x86/kernel/vmlinux_64.lds.S | 18 ------------------ 3 files changed, 18 insertions(+), 36 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 8b203c4..c8dd71e 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -291,6 +291,24 @@ SECTIONS SECURITY_INIT + . = ALIGN(8); + .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { + __parainstructions = .; + *(.parainstructions) + __parainstructions_end = .; + } + + . = ALIGN(8); + .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { + __alt_instructions = .; + *(.altinstructions) + __alt_instructions_end = .; + } + + .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) { + *(.altinstr_replacement) + } + #ifdef CONFIG_X86_32 # include "vmlinux_32.lds.S" diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S index d8ba539..5df9647 100644 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ b/arch/x86/kernel/vmlinux_32.lds.S @@ -1,21 +1,3 @@ - . = ALIGN(4); - .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { - __alt_instructions = .; - *(.altinstructions) - __alt_instructions_end = .; - } - - .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) { - *(.altinstr_replacement) - } - - . = ALIGN(4); - .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { - __parainstructions = .; - *(.parainstructions) - __parainstructions_end = .; - } - /* * .exit.text is discard at runtime, not link time, to deal with * references from .altinstructions and .eh_frame diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index 0e8054e..9ef7096 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -1,21 +1,3 @@ - . = ALIGN(8); - .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { - __parainstructions = .; - *(.parainstructions) - __parainstructions_end = .; - } - - .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { - . = ALIGN(8); - __alt_instructions = .; - *(.altinstructions) - __alt_instructions_end = .; - } - - .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) { - *(.altinstr_replacement) - } - /* * .exit.text is discard at runtime, not link time, to deal with * references from .altinstructions and .eh_frame -- cgit v1.1 From bf6a57418d5445c98047edbec022c9e54d1526e6 Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Wed, 29 Apr 2009 09:47:27 +0200 Subject: x86, vmlinux.lds: unify .exit.* and .init.ramfs [ Impact: cleanup ] Signed-off-by: Sam Ravnborg Cc: Tim Abbott Cc: Linus Torvalds LKML-Reference: <1240991249-27117-11-git-send-email-sam@ravnborg.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmlinux.lds.S | 20 ++++++++++++++++++++ arch/x86/kernel/vmlinux_32.lds.S | 21 --------------------- arch/x86/kernel/vmlinux_64.lds.S | 21 --------------------- 3 files changed, 20 insertions(+), 42 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index c8dd71e..1ab62a5 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -309,6 +309,26 @@ SECTIONS *(.altinstr_replacement) } + /* + * .exit.text is discard at runtime, not link time, to deal with + * references from .altinstructions and .eh_frame + */ + .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { + EXIT_TEXT + } + + .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { + EXIT_DATA + } + +#ifdef CONFIG_BLK_DEV_INITRD + . = ALIGN(PAGE_SIZE); + .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { + __initramfs_start = .; + *(.init.ramfs) + __initramfs_end = .; + } +#endif #ifdef CONFIG_X86_32 # include "vmlinux_32.lds.S" diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S index 5df9647..36c8fbd 100644 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ b/arch/x86/kernel/vmlinux_32.lds.S @@ -1,24 +1,3 @@ - /* - * .exit.text is discard at runtime, not link time, to deal with - * references from .altinstructions and .eh_frame - */ - .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { - EXIT_TEXT - } - - .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { - EXIT_DATA - } - -#if defined(CONFIG_BLK_DEV_INITRD) - . = ALIGN(PAGE_SIZE); - .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { - __initramfs_start = .; - *(.init.ramfs) - __initramfs_end = .; - } -#endif - PERCPU(PAGE_SIZE) . = ALIGN(PAGE_SIZE); diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index 9ef7096..1aa5362 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -1,24 +1,3 @@ - /* - * .exit.text is discard at runtime, not link time, to deal with - * references from .altinstructions and .eh_frame - */ - .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { - EXIT_TEXT - } - - .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { - EXIT_DATA - } - -#ifdef CONFIG_BLK_DEV_INITRD - . = ALIGN(PAGE_SIZE); - .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { - __initramfs_start = .; - *(.init.ramfs) - __initramfs_end = .; - } -#endif - #ifdef CONFIG_SMP /* * percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the -- cgit v1.1 From 9d16e78318f174fd4b07916a93e41749d5199267 Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Wed, 29 Apr 2009 09:47:28 +0200 Subject: x86, vmlinux.lds: unify percpu 32 bit: - move __init_end outside the .bss output section It really did not belong in there [ Impact: 64-bit: cleanup, 32-bit: refactor linker script ] Signed-off-by: Sam Ravnborg Cc: Tim Abbott Cc: Linus Torvalds LKML-Reference: <1240991249-27117-12-git-send-email-sam@ravnborg.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmlinux.lds.S | 30 ++++++++++++++++++++++++++++++ arch/x86/kernel/vmlinux_32.lds.S | 6 ------ arch/x86/kernel/vmlinux_64.lds.S | 26 -------------------------- 3 files changed, 30 insertions(+), 32 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 1ab62a5..1ea2b85 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -330,6 +330,36 @@ SECTIONS } #endif +#if defined(CONFIG_X86_64) && defined(CONFIG_SMP) + /* + * percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the + * output PHDR, so the next output section - __data_nosave - should + * start another section data.init2. Also, pda should be at the head of + * percpu area. Preallocate it and define the percpu offset symbol + * so that it can be accessed as a percpu variable. + */ + . = ALIGN(PAGE_SIZE); + PERCPU_VADDR(0, :percpu) +#else + PERCPU(PAGE_SIZE) +#endif + + . = ALIGN(PAGE_SIZE); + /* freed after init ends here */ + __init_end = .; + +#ifdef CONFIG_X86_64 + .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { + . = ALIGN(PAGE_SIZE); + __nosave_begin = .; + *(.data.nosave) + . = ALIGN(PAGE_SIZE); + __nosave_end = .; + } :data.init2 + /* use another section data.init2, see PERCPU_VADDR() above */ +#endif + + #ifdef CONFIG_X86_32 # include "vmlinux_32.lds.S" #else diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S index 36c8fbd..d23ee2c 100644 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ b/arch/x86/kernel/vmlinux_32.lds.S @@ -1,11 +1,5 @@ - PERCPU(PAGE_SIZE) - - . = ALIGN(PAGE_SIZE); - /* freed after init ends here */ - /* BSS */ .bss : AT(ADDR(.bss) - LOAD_OFFSET) { - __init_end = .; __bss_start = .; *(.bss.page_aligned) *(.bss) diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index 1aa5362..a539366 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -1,29 +1,3 @@ -#ifdef CONFIG_SMP - /* - * percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the - * output PHDR, so the next output section - __data_nosave - should - * start another section data.init2. Also, pda should be at the head of - * percpu area. Preallocate it and define the percpu offset symbol - * so that it can be accessed as a percpu variable. - */ - . = ALIGN(PAGE_SIZE); - PERCPU_VADDR(0, :percpu) -#else - PERCPU(PAGE_SIZE) -#endif - - . = ALIGN(PAGE_SIZE); - __init_end = .; - - .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { - . = ALIGN(PAGE_SIZE); - __nosave_begin = .; - *(.data.nosave) - . = ALIGN(PAGE_SIZE); - __nosave_end = .; - } :data.init2 - /* use another section data.init2, see PERCPU_VADDR() above */ - .bss : AT(ADDR(.bss) - LOAD_OFFSET) { . = ALIGN(PAGE_SIZE); __bss_start = .; /* BSS */ -- cgit v1.1 From 091e52c3551d3031343df24b573b770b4c6c72b6 Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Wed, 29 Apr 2009 09:47:29 +0200 Subject: x86, vmlinux.lds: unify remaining parts 32 bit: - explicit page align .bss - move ALING() out of .brk output section - discard *(.eh_frame) 64 bit: - move ALIGN() out of .bss output section - move ALIGN() out of .brk output section - use a dedicated section to define _end [ Impact: unify and fix section alignments in linker script ] Signed-off-by: Sam Ravnborg Cc: Tim Abbott Cc: Linus Torvalds LKML-Reference: <1240991249-27117-13-git-send-email-sam@ravnborg.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmlinux.lds.S | 32 +++++++++++++++++++++++++++----- arch/x86/kernel/vmlinux_32.lds.S | 26 -------------------------- arch/x86/kernel/vmlinux_64.lds.S | 24 ------------------------ 3 files changed, 27 insertions(+), 55 deletions(-) delete mode 100644 arch/x86/kernel/vmlinux_32.lds.S delete mode 100644 arch/x86/kernel/vmlinux_64.lds.S (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 1ea2b85..ef3e4f1 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -359,12 +359,34 @@ SECTIONS /* use another section data.init2, see PERCPU_VADDR() above */ #endif + /* BSS */ + . = ALIGN(PAGE_SIZE); + .bss : AT(ADDR(.bss) - LOAD_OFFSET) { + __bss_start = .; + *(.bss.page_aligned) + *(.bss) + . = ALIGN(4); + __bss_stop = .; + } -#ifdef CONFIG_X86_32 -# include "vmlinux_32.lds.S" -#else -# include "vmlinux_64.lds.S" -#endif + . = ALIGN(PAGE_SIZE); + .brk : AT(ADDR(.brk) - LOAD_OFFSET) { + __brk_base = .; + . += 64 * 1024; /* 64k alignment slop space */ + *(.brk_reservation) /* areas brk users have reserved */ + __brk_limit = .; + } + + .end : AT(ADDR(.end) - LOAD_OFFSET) { + _end = .; + } + + /* Sections to be discarded */ + /DISCARD/ : { + *(.exitcall.exit) + *(.eh_frame) + *(.discard) + } STABS_DEBUG DWARF_DEBUG diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S deleted file mode 100644 index d23ee2c..0000000 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ /dev/null @@ -1,26 +0,0 @@ - /* BSS */ - .bss : AT(ADDR(.bss) - LOAD_OFFSET) { - __bss_start = .; - *(.bss.page_aligned) - *(.bss) - . = ALIGN(4); - __bss_stop = .; - } - - .brk : AT(ADDR(.brk) - LOAD_OFFSET) { - . = ALIGN(PAGE_SIZE); - __brk_base = .; - . += 64 * 1024; /* 64k alignment slop space */ - *(.brk_reservation) /* areas brk users have reserved */ - __brk_limit = .; - } - - .end : AT(ADDR(.end) - LOAD_OFFSET) { - _end = . ; - } - - /* Sections to be discarded */ - /DISCARD/ : { - *(.exitcall.exit) - *(.discard) - } diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S deleted file mode 100644 index a539366..0000000 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ /dev/null @@ -1,24 +0,0 @@ - .bss : AT(ADDR(.bss) - LOAD_OFFSET) { - . = ALIGN(PAGE_SIZE); - __bss_start = .; /* BSS */ - *(.bss.page_aligned) - *(.bss) - __bss_stop = .; - } - - .brk : AT(ADDR(.brk) - LOAD_OFFSET) { - . = ALIGN(PAGE_SIZE); - __brk_base = .; - . += 64 * 1024; /* 64k alignment slop space */ - *(.brk_reservation) /* areas brk users have reserved */ - __brk_limit = .; - } - - _end = . ; - - /* Sections to be discarded */ - /DISCARD/ : { - *(.exitcall.exit) - *(.eh_frame) - *(.discard) - } -- cgit v1.1 From 91fd7fe809bdf4d8aa56559d17b9f25a1a6fe732 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 29 Apr 2009 10:58:38 +0200 Subject: x86, vmlinux.lds: add copyright Acked-by: Sam Ravnborg Cc: Tim Abbott Cc: Linus Torvalds LKML-Reference: <1240991249-27117-2-git-send-email-sam@ravnborg.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmlinux.lds.S | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index ef3e4f1..0bdbaa5 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -3,7 +3,8 @@ * * Historic 32-bit version written by Martin Mares * - * Modernisation and unification done by Sam Ravnborg + * Modernisation, unification and other changes and fixes: + * Copyright (C) 2007-2009 Sam Ravnborg * * * Don't define absolute symbols until and unless you know that symbol -- cgit v1.1 From fd0731944333db6e9e91b6954c6ef95f4b71ab04 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 29 Apr 2009 12:56:58 +0200 Subject: x86, vmlinux.lds: fix relocatable symbols __init_begin/_end symbols should be inside sections as well, otherwise the relocatable kernel gets confused when freeing init sections in the wrong place. [ Impact: fix bootup crash ] Cc: Sam Ravnborg Cc: Tim Abbott Cc: Linus Torvalds LKML-Reference: <20090429105056.GA28720@uranus.ravnborg.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmlinux.lds.S | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 0bdbaa5..4c85b2e 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -255,8 +255,8 @@ SECTIONS /* Init code and data - will be freed after init */ . = ALIGN(PAGE_SIZE); - __init_begin = .; /* paired with __init_end */ .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { + __init_begin = .; /* paired with __init_end */ _sinittext = .; INIT_TEXT _einittext = .; @@ -346,8 +346,11 @@ SECTIONS #endif . = ALIGN(PAGE_SIZE); + /* freed after init ends here */ - __init_end = .; + .init.end : AT(ADDR(.init.end) - LOAD_OFFSET) { + __init_end = .; + } #ifdef CONFIG_X86_64 .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { -- cgit v1.1 From da1a776be1ac7f78bb30ececbec4c1383163b079 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:46:58 +0200 Subject: perf_counter, x86: remove X86_FEATURE_ARCH_PERFMON flag for AMD cpus X86_FEATURE_ARCH_PERFMON is an Intel hardware feature that does not work on AMD CPUs. The flag is now only used in Intel specific code (especially initialization). [ Impact: refactor code ] Signed-off-by: Robert Richter Acked-by: Peter Zijlstra Cc: Paul Mackerras LKML-Reference: <1241002046-8832-2-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/amd.c | 4 ---- arch/x86/kernel/cpu/perf_counter.c | 6 +++--- 2 files changed, 3 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index fd69c51..7e4a459 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -420,10 +420,6 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) if (c->x86 >= 6) set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK); - /* Enable Performance counter for K7 and later */ - if (c->x86 > 6 && c->x86 <= 0x11) - set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON); - if (!c->x86_model_id[0]) { switch (c->x86) { case 0xf: diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 0fcbaab..7d0f81d 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -949,6 +949,9 @@ static struct pmc_x86_ops *pmc_intel_init(void) unsigned int unused; unsigned int ebx; + if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) + return NULL; + /* * Check whether the Architectural PerfMon supports * Branch Misses Retired Event or not. @@ -987,9 +990,6 @@ static struct pmc_x86_ops *pmc_amd_init(void) void __init init_hw_perf_counters(void) { - if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) - return; - switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_INTEL: pmc_ops = pmc_intel_init(); -- cgit v1.1 From 4138960a9251a265002b5cf07e671a49f8495381 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:00 +0200 Subject: perf_counter, x86: add default path to cpu detection This quits hw counter initialization immediately if no cpu is detected. [ Impact: cleanup ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-4-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 7d0f81d..d6d6529 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -997,6 +997,8 @@ void __init init_hw_perf_counters(void) case X86_VENDOR_AMD: pmc_ops = pmc_amd_init(); break; + default: + return; } if (!pmc_ops) return; -- cgit v1.1 From 4295ee62660b13ddb87d41539f49b239e6e7d56f Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:01 +0200 Subject: perf_counter, x86: rework pmc_amd_save_disable_all() and pmc_amd_restore_all() MSR reads and writes are expensive. This patch adds checks to avoid its usage where possible. [ Impact: micro-optimization on AMD CPUs ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-5-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index d6d6529..75a0903 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -334,11 +334,13 @@ static u64 pmc_amd_save_disable_all(void) for (idx = 0; idx < nr_counters_generic; idx++) { u64 val; + if (!test_bit(idx, cpuc->active_mask)) + continue; rdmsrl(MSR_K7_EVNTSEL0 + idx, val); - if (val & ARCH_PERFMON_EVENTSEL0_ENABLE) { - val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; - wrmsrl(MSR_K7_EVNTSEL0 + idx, val); - } + if (!(val & ARCH_PERFMON_EVENTSEL0_ENABLE)) + continue; + val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsrl(MSR_K7_EVNTSEL0 + idx, val); } return enabled; @@ -372,13 +374,15 @@ static void pmc_amd_restore_all(u64 ctrl) return; for (idx = 0; idx < nr_counters_generic; idx++) { - if (test_bit(idx, cpuc->active_mask)) { - u64 val; + u64 val; - rdmsrl(MSR_K7_EVNTSEL0 + idx, val); - val |= ARCH_PERFMON_EVENTSEL0_ENABLE; - wrmsrl(MSR_K7_EVNTSEL0 + idx, val); - } + if (!test_bit(idx, cpuc->active_mask)) + continue; + rdmsrl(MSR_K7_EVNTSEL0 + idx, val); + if (val & ARCH_PERFMON_EVENTSEL0_ENABLE) + continue; + val |= ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsrl(MSR_K7_EVNTSEL0 + idx, val); } } -- cgit v1.1 From 527e26af3741a2168986d8b82653ffe173891324 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:02 +0200 Subject: perf_counter, x86: protect per-cpu variables with compile barriers only Per-cpu variables needn't to be protected with cpu barriers (smp_wmb()). Protection is only needed for preemption on the same cpu (rescheduling or the nmi handler). This can be done using a compiler barrier only. [ Impact: micro-optimization ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-6-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 75a0903..ad663d5 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -673,7 +673,7 @@ try_generic: /* * Make it visible before enabling the hw: */ - smp_wmb(); + barrier(); __hw_perf_counter_set_period(counter, hwc, idx); __pmc_generic_enable(counter, hwc, idx); @@ -745,7 +745,7 @@ static void pmc_generic_disable(struct perf_counter *counter) * Make sure the cleared pointer becomes visible before we * (potentially) free the counter: */ - smp_wmb(); + barrier(); /* * Drain the remaining delta count out of a counter -- cgit v1.1 From 4aeb0b4239bb3b67ed402cb9cef3e000c892cadf Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:03 +0200 Subject: perfcounters: rename struct hw_perf_counter_ops into struct pmu This patch renames struct hw_perf_counter_ops into struct pmu. It introduces a structure to describe a cpu specific pmu (performance monitoring unit). It may contain ops and data. The new name of the structure fits better, is shorter, and thus better to handle. Where it was appropriate, names of function and variable have been changed too. [ Impact: cleanup ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-7-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 37 ++++++++++++++++++------------------- 1 file changed, 18 insertions(+), 19 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index ad663d5..95de980 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -515,8 +515,8 @@ __pmc_fixed_disable(struct perf_counter *counter, } static inline void -__pmc_generic_disable(struct perf_counter *counter, - struct hw_perf_counter *hwc, unsigned int idx) +__x86_pmu_disable(struct perf_counter *counter, + struct hw_perf_counter *hwc, unsigned int idx) { if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) __pmc_fixed_disable(counter, hwc, idx); @@ -591,8 +591,8 @@ __pmc_fixed_enable(struct perf_counter *counter, } static void -__pmc_generic_enable(struct perf_counter *counter, - struct hw_perf_counter *hwc, int idx) +__x86_pmu_enable(struct perf_counter *counter, + struct hw_perf_counter *hwc, int idx) { if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) __pmc_fixed_enable(counter, hwc, idx); @@ -626,7 +626,7 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc) /* * Find a PMC slot for the freshly enabled / scheduled in counter: */ -static int pmc_generic_enable(struct perf_counter *counter) +static int x86_pmu_enable(struct perf_counter *counter) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); struct hw_perf_counter *hwc = &counter->hw; @@ -667,7 +667,7 @@ try_generic: perf_counters_lapic_init(hwc->nmi); - __pmc_generic_disable(counter, hwc, idx); + __x86_pmu_disable(counter, hwc, idx); cpuc->counters[idx] = counter; /* @@ -676,7 +676,7 @@ try_generic: barrier(); __hw_perf_counter_set_period(counter, hwc, idx); - __pmc_generic_enable(counter, hwc, idx); + __x86_pmu_enable(counter, hwc, idx); return 0; } @@ -731,13 +731,13 @@ void perf_counter_print_debug(void) local_irq_enable(); } -static void pmc_generic_disable(struct perf_counter *counter) +static void x86_pmu_disable(struct perf_counter *counter) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); struct hw_perf_counter *hwc = &counter->hw; unsigned int idx = hwc->idx; - __pmc_generic_disable(counter, hwc, idx); + __x86_pmu_disable(counter, hwc, idx); clear_bit(idx, cpuc->used); cpuc->counters[idx] = NULL; @@ -767,7 +767,7 @@ static void perf_save_and_restart(struct perf_counter *counter) __hw_perf_counter_set_period(counter, hwc, idx); if (counter->state == PERF_COUNTER_STATE_ACTIVE) - __pmc_generic_enable(counter, hwc, idx); + __x86_pmu_enable(counter, hwc, idx); } /* @@ -805,7 +805,7 @@ again: perf_save_and_restart(counter); if (perf_counter_overflow(counter, nmi, regs, 0)) - __pmc_generic_disable(counter, &counter->hw, bit); + __x86_pmu_disable(counter, &counter->hw, bit); } hw_perf_ack_status(ack); @@ -1034,19 +1034,18 @@ void __init init_hw_perf_counters(void) register_die_notifier(&perf_counter_nmi_notifier); } -static void pmc_generic_read(struct perf_counter *counter) +static void x86_pmu_read(struct perf_counter *counter) { x86_perf_counter_update(counter, &counter->hw, counter->hw.idx); } -static const struct hw_perf_counter_ops x86_perf_counter_ops = { - .enable = pmc_generic_enable, - .disable = pmc_generic_disable, - .read = pmc_generic_read, +static const struct pmu pmu = { + .enable = x86_pmu_enable, + .disable = x86_pmu_disable, + .read = x86_pmu_read, }; -const struct hw_perf_counter_ops * -hw_perf_counter_init(struct perf_counter *counter) +const struct pmu *hw_perf_counter_init(struct perf_counter *counter) { int err; @@ -1054,7 +1053,7 @@ hw_perf_counter_init(struct perf_counter *counter) if (err) return ERR_PTR(err); - return &x86_perf_counter_ops; + return &pmu; } /* -- cgit v1.1 From 5f4ec28ffe77c840354cce1820a3436106e9e0f1 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:04 +0200 Subject: perf_counter, x86: rename struct pmc_x86_ops into struct x86_pmu This patch renames struct pmc_x86_ops into struct x86_pmu. It introduces a structure to describe an x86 model specific pmu (performance monitoring unit). It may contain ops and data. The new name of the structure fits better, is shorter, and thus better to handle. Where it was appropriate, names of function and variable have been changed too. [ Impact: cleanup ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-8-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 135 +++++++++++++++++++------------------ 1 file changed, 68 insertions(+), 67 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 95de980..808a1a1 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -44,9 +44,9 @@ struct cpu_hw_counters { }; /* - * struct pmc_x86_ops - performance counter x86 ops + * struct x86_pmu - generic x86 pmu */ -struct pmc_x86_ops { +struct x86_pmu { u64 (*save_disable_all)(void); void (*restore_all)(u64); u64 (*get_status)(u64); @@ -60,7 +60,7 @@ struct pmc_x86_ops { int max_events; }; -static struct pmc_x86_ops *pmc_ops __read_mostly; +static struct x86_pmu *x86_pmu __read_mostly; static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = { .enabled = 1, @@ -82,12 +82,12 @@ static const u64 intel_perfmon_event_map[] = [PERF_COUNT_BUS_CYCLES] = 0x013c, }; -static u64 pmc_intel_event_map(int event) +static u64 intel_pmu_event_map(int event) { return intel_perfmon_event_map[event]; } -static u64 pmc_intel_raw_event(u64 event) +static u64 intel_pmu_raw_event(u64 event) { #define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL #define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL @@ -114,12 +114,12 @@ static const u64 amd_perfmon_event_map[] = [PERF_COUNT_BRANCH_MISSES] = 0x00c5, }; -static u64 pmc_amd_event_map(int event) +static u64 amd_pmu_event_map(int event) { return amd_perfmon_event_map[event]; } -static u64 pmc_amd_raw_event(u64 event) +static u64 amd_pmu_raw_event(u64 event) { #define K7_EVNTSEL_EVENT_MASK 0x7000000FFULL #define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL @@ -184,12 +184,12 @@ static bool reserve_pmc_hardware(void) disable_lapic_nmi_watchdog(); for (i = 0; i < nr_counters_generic; i++) { - if (!reserve_perfctr_nmi(pmc_ops->perfctr + i)) + if (!reserve_perfctr_nmi(x86_pmu->perfctr + i)) goto perfctr_fail; } for (i = 0; i < nr_counters_generic; i++) { - if (!reserve_evntsel_nmi(pmc_ops->eventsel + i)) + if (!reserve_evntsel_nmi(x86_pmu->eventsel + i)) goto eventsel_fail; } @@ -197,13 +197,13 @@ static bool reserve_pmc_hardware(void) eventsel_fail: for (i--; i >= 0; i--) - release_evntsel_nmi(pmc_ops->eventsel + i); + release_evntsel_nmi(x86_pmu->eventsel + i); i = nr_counters_generic; perfctr_fail: for (i--; i >= 0; i--) - release_perfctr_nmi(pmc_ops->perfctr + i); + release_perfctr_nmi(x86_pmu->perfctr + i); if (nmi_watchdog == NMI_LOCAL_APIC) enable_lapic_nmi_watchdog(); @@ -216,8 +216,8 @@ static void release_pmc_hardware(void) int i; for (i = 0; i < nr_counters_generic; i++) { - release_perfctr_nmi(pmc_ops->perfctr + i); - release_evntsel_nmi(pmc_ops->eventsel + i); + release_perfctr_nmi(x86_pmu->perfctr + i); + release_evntsel_nmi(x86_pmu->eventsel + i); } if (nmi_watchdog == NMI_LOCAL_APIC) @@ -293,14 +293,14 @@ static int __hw_perf_counter_init(struct perf_counter *counter) * Raw event type provide the config in the event structure */ if (perf_event_raw(hw_event)) { - hwc->config |= pmc_ops->raw_event(perf_event_config(hw_event)); + hwc->config |= x86_pmu->raw_event(perf_event_config(hw_event)); } else { - if (perf_event_id(hw_event) >= pmc_ops->max_events) + if (perf_event_id(hw_event) >= x86_pmu->max_events) return -EINVAL; /* * The generic map: */ - hwc->config |= pmc_ops->event_map(perf_event_id(hw_event)); + hwc->config |= x86_pmu->event_map(perf_event_id(hw_event)); } counter->destroy = hw_perf_counter_destroy; @@ -308,7 +308,7 @@ static int __hw_perf_counter_init(struct perf_counter *counter) return 0; } -static u64 pmc_intel_save_disable_all(void) +static u64 intel_pmu_save_disable_all(void) { u64 ctrl; @@ -318,7 +318,7 @@ static u64 pmc_intel_save_disable_all(void) return ctrl; } -static u64 pmc_amd_save_disable_all(void) +static u64 amd_pmu_save_disable_all(void) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); int enabled, idx; @@ -327,7 +327,8 @@ static u64 pmc_amd_save_disable_all(void) cpuc->enabled = 0; /* * ensure we write the disable before we start disabling the - * counters proper, so that pcm_amd_enable() does the right thing. + * counters proper, so that amd_pmu_enable_counter() does the + * right thing. */ barrier(); @@ -351,19 +352,19 @@ u64 hw_perf_save_disable(void) if (unlikely(!perf_counters_initialized)) return 0; - return pmc_ops->save_disable_all(); + return x86_pmu->save_disable_all(); } /* * Exported because of ACPI idle */ EXPORT_SYMBOL_GPL(hw_perf_save_disable); -static void pmc_intel_restore_all(u64 ctrl) +static void intel_pmu_restore_all(u64 ctrl) { wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); } -static void pmc_amd_restore_all(u64 ctrl) +static void amd_pmu_restore_all(u64 ctrl) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); int idx; @@ -391,14 +392,14 @@ void hw_perf_restore(u64 ctrl) if (unlikely(!perf_counters_initialized)) return; - pmc_ops->restore_all(ctrl); + x86_pmu->restore_all(ctrl); } /* * Exported because of ACPI idle */ EXPORT_SYMBOL_GPL(hw_perf_restore); -static u64 pmc_intel_get_status(u64 mask) +static u64 intel_pmu_get_status(u64 mask) { u64 status; @@ -407,7 +408,7 @@ static u64 pmc_intel_get_status(u64 mask) return status; } -static u64 pmc_amd_get_status(u64 mask) +static u64 amd_pmu_get_status(u64 mask) { u64 status = 0; int idx; @@ -432,15 +433,15 @@ static u64 hw_perf_get_status(u64 mask) if (unlikely(!perf_counters_initialized)) return 0; - return pmc_ops->get_status(mask); + return x86_pmu->get_status(mask); } -static void pmc_intel_ack_status(u64 ack) +static void intel_pmu_ack_status(u64 ack) { wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); } -static void pmc_amd_ack_status(u64 ack) +static void amd_pmu_ack_status(u64 ack) { } @@ -449,16 +450,16 @@ static void hw_perf_ack_status(u64 ack) if (unlikely(!perf_counters_initialized)) return; - pmc_ops->ack_status(ack); + x86_pmu->ack_status(ack); } -static void pmc_intel_enable(int idx, u64 config) +static void intel_pmu_enable_counter(int idx, u64 config) { wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, config | ARCH_PERFMON_EVENTSEL0_ENABLE); } -static void pmc_amd_enable(int idx, u64 config) +static void amd_pmu_enable_counter(int idx, u64 config) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); @@ -474,15 +475,15 @@ static void hw_perf_enable(int idx, u64 config) if (unlikely(!perf_counters_initialized)) return; - pmc_ops->enable(idx, config); + x86_pmu->enable(idx, config); } -static void pmc_intel_disable(int idx, u64 config) +static void intel_pmu_disable_counter(int idx, u64 config) { wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, config); } -static void pmc_amd_disable(int idx, u64 config) +static void amd_pmu_disable_counter(int idx, u64 config) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); @@ -496,7 +497,7 @@ static void hw_perf_disable(int idx, u64 config) if (unlikely(!perf_counters_initialized)) return; - pmc_ops->disable(idx, config); + x86_pmu->disable(idx, config); } static inline void @@ -613,11 +614,11 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc) event = hwc->config & ARCH_PERFMON_EVENT_MASK; - if (unlikely(event == pmc_ops->event_map(PERF_COUNT_INSTRUCTIONS))) + if (unlikely(event == x86_pmu->event_map(PERF_COUNT_INSTRUCTIONS))) return X86_PMC_IDX_FIXED_INSTRUCTIONS; - if (unlikely(event == pmc_ops->event_map(PERF_COUNT_CPU_CYCLES))) + if (unlikely(event == x86_pmu->event_map(PERF_COUNT_CPU_CYCLES))) return X86_PMC_IDX_FIXED_CPU_CYCLES; - if (unlikely(event == pmc_ops->event_map(PERF_COUNT_BUS_CYCLES))) + if (unlikely(event == x86_pmu->event_map(PERF_COUNT_BUS_CYCLES))) return X86_PMC_IDX_FIXED_BUS_CYCLES; return -1; @@ -661,8 +662,8 @@ try_generic: set_bit(idx, cpuc->used); hwc->idx = idx; } - hwc->config_base = pmc_ops->eventsel; - hwc->counter_base = pmc_ops->perfctr; + hwc->config_base = x86_pmu->eventsel; + hwc->counter_base = x86_pmu->perfctr; } perf_counters_lapic_init(hwc->nmi); @@ -710,8 +711,8 @@ void perf_counter_print_debug(void) pr_info("CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used); for (idx = 0; idx < nr_counters_generic; idx++) { - rdmsrl(pmc_ops->eventsel + idx, pmc_ctrl); - rdmsrl(pmc_ops->perfctr + idx, pmc_count); + rdmsrl(x86_pmu->eventsel + idx, pmc_ctrl); + rdmsrl(x86_pmu->perfctr + idx, pmc_count); prev_left = per_cpu(prev_left[idx], cpu); @@ -918,35 +919,35 @@ static __read_mostly struct notifier_block perf_counter_nmi_notifier = { .priority = 1 }; -static struct pmc_x86_ops pmc_intel_ops = { - .save_disable_all = pmc_intel_save_disable_all, - .restore_all = pmc_intel_restore_all, - .get_status = pmc_intel_get_status, - .ack_status = pmc_intel_ack_status, - .enable = pmc_intel_enable, - .disable = pmc_intel_disable, +static struct x86_pmu intel_pmu = { + .save_disable_all = intel_pmu_save_disable_all, + .restore_all = intel_pmu_restore_all, + .get_status = intel_pmu_get_status, + .ack_status = intel_pmu_ack_status, + .enable = intel_pmu_enable_counter, + .disable = intel_pmu_disable_counter, .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, .perfctr = MSR_ARCH_PERFMON_PERFCTR0, - .event_map = pmc_intel_event_map, - .raw_event = pmc_intel_raw_event, + .event_map = intel_pmu_event_map, + .raw_event = intel_pmu_raw_event, .max_events = ARRAY_SIZE(intel_perfmon_event_map), }; -static struct pmc_x86_ops pmc_amd_ops = { - .save_disable_all = pmc_amd_save_disable_all, - .restore_all = pmc_amd_restore_all, - .get_status = pmc_amd_get_status, - .ack_status = pmc_amd_ack_status, - .enable = pmc_amd_enable, - .disable = pmc_amd_disable, +static struct x86_pmu amd_pmu = { + .save_disable_all = amd_pmu_save_disable_all, + .restore_all = amd_pmu_restore_all, + .get_status = amd_pmu_get_status, + .ack_status = amd_pmu_ack_status, + .enable = amd_pmu_enable_counter, + .disable = amd_pmu_disable_counter, .eventsel = MSR_K7_EVNTSEL0, .perfctr = MSR_K7_PERFCTR0, - .event_map = pmc_amd_event_map, - .raw_event = pmc_amd_raw_event, + .event_map = amd_pmu_event_map, + .raw_event = amd_pmu_raw_event, .max_events = ARRAY_SIZE(amd_perfmon_event_map), }; -static struct pmc_x86_ops *pmc_intel_init(void) +static struct x86_pmu *intel_pmu_init(void) { union cpuid10_edx edx; union cpuid10_eax eax; @@ -977,10 +978,10 @@ static struct pmc_x86_ops *pmc_intel_init(void) nr_counters_fixed = edx.split.num_counters_fixed; counter_value_mask = (1ULL << eax.split.bit_width) - 1; - return &pmc_intel_ops; + return &intel_pmu; } -static struct pmc_x86_ops *pmc_amd_init(void) +static struct x86_pmu *amd_pmu_init(void) { nr_counters_generic = 4; nr_counters_fixed = 0; @@ -989,22 +990,22 @@ static struct pmc_x86_ops *pmc_amd_init(void) pr_info("AMD Performance Monitoring support detected.\n"); - return &pmc_amd_ops; + return &amd_pmu; } void __init init_hw_perf_counters(void) { switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_INTEL: - pmc_ops = pmc_intel_init(); + x86_pmu = intel_pmu_init(); break; case X86_VENDOR_AMD: - pmc_ops = pmc_amd_init(); + x86_pmu = amd_pmu_init(); break; default: return; } - if (!pmc_ops) + if (!x86_pmu) return; pr_info("... num counters: %d\n", nr_counters_generic); -- cgit v1.1 From 39d81eab2374d71b2d9c82f66258a1a4f57ddd2e Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:05 +0200 Subject: perf_counter, x86: make interrupt handler model specific This separates the perfcounter interrupt handler for AMD and Intel cpus. The AMD interrupt handler implementation is a follow-on patch. [ Impact: refactor and clean up code ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-9-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 808a1a1..9d90de0 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -4,6 +4,7 @@ * Copyright(C) 2008 Thomas Gleixner * Copyright(C) 2008 Red Hat, Inc., Ingo Molnar * Copyright(C) 2009 Jaswinder Singh Rajput + * Copyright(C) 2009 Advanced Micro Devices, Inc., Robert Richter * * For licencing details see kernel-base/COPYING */ @@ -47,6 +48,7 @@ struct cpu_hw_counters { * struct x86_pmu - generic x86 pmu */ struct x86_pmu { + int (*handle_irq)(struct pt_regs *, int); u64 (*save_disable_all)(void); void (*restore_all)(u64); u64 (*get_status)(u64); @@ -241,6 +243,10 @@ static int __hw_perf_counter_init(struct perf_counter *counter) struct hw_perf_counter *hwc = &counter->hw; int err; + /* disable temporarily */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) + return -ENOSYS; + if (unlikely(!perf_counters_initialized)) return -EINVAL; @@ -780,7 +786,7 @@ static void perf_save_and_restart(struct perf_counter *counter) * This handler is triggered by the local APIC, so the APIC IRQ handling * rules apply: */ -static int __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi) +static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi) { int bit, cpu = smp_processor_id(); u64 ack, status; @@ -827,6 +833,8 @@ out: return ret; } +static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) { return 0; } + void perf_counter_unthrottle(void) { struct cpu_hw_counters *cpuc; @@ -851,7 +859,7 @@ void smp_perf_counter_interrupt(struct pt_regs *regs) irq_enter(); apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR); ack_APIC_irq(); - __smp_perf_counter_interrupt(regs, 0); + x86_pmu->handle_irq(regs, 0); irq_exit(); } @@ -908,7 +916,7 @@ perf_counter_nmi_handler(struct notifier_block *self, regs = args->regs; apic_write(APIC_LVTPC, APIC_DM_NMI); - ret = __smp_perf_counter_interrupt(regs, 1); + ret = x86_pmu->handle_irq(regs, 1); return ret ? NOTIFY_STOP : NOTIFY_OK; } @@ -920,6 +928,7 @@ static __read_mostly struct notifier_block perf_counter_nmi_notifier = { }; static struct x86_pmu intel_pmu = { + .handle_irq = intel_pmu_handle_irq, .save_disable_all = intel_pmu_save_disable_all, .restore_all = intel_pmu_restore_all, .get_status = intel_pmu_get_status, @@ -934,6 +943,7 @@ static struct x86_pmu intel_pmu = { }; static struct x86_pmu amd_pmu = { + .handle_irq = amd_pmu_handle_irq, .save_disable_all = amd_pmu_save_disable_all, .restore_all = amd_pmu_restore_all, .get_status = amd_pmu_get_status, -- cgit v1.1 From b7f8859a8ed1937e2139c17b84878f1d413fa659 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:06 +0200 Subject: perf_counter, x86: remove get_status() from struct x86_pmu This function is Intel only and not necessary for AMD cpus. [ Impact: simplify code ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-10-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 39 +++++--------------------------------- 1 file changed, 5 insertions(+), 34 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 9d90de0..d0bb029 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -51,7 +51,6 @@ struct x86_pmu { int (*handle_irq)(struct pt_regs *, int); u64 (*save_disable_all)(void); void (*restore_all)(u64); - u64 (*get_status)(u64); void (*ack_status)(u64); void (*enable)(int, u64); void (*disable)(int, u64); @@ -405,41 +404,15 @@ void hw_perf_restore(u64 ctrl) */ EXPORT_SYMBOL_GPL(hw_perf_restore); -static u64 intel_pmu_get_status(u64 mask) +static inline u64 intel_pmu_get_status(u64 mask) { u64 status; - rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); - - return status; -} - -static u64 amd_pmu_get_status(u64 mask) -{ - u64 status = 0; - int idx; - - for (idx = 0; idx < nr_counters_generic; idx++) { - s64 val; - - if (!(mask & (1 << idx))) - continue; - - rdmsrl(MSR_K7_PERFCTR0 + idx, val); - val <<= (64 - counter_value_bits); - if (val >= 0) - status |= (1 << idx); - } - - return status; -} - -static u64 hw_perf_get_status(u64 mask) -{ if (unlikely(!perf_counters_initialized)) return 0; + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); - return x86_pmu->get_status(mask); + return status; } static void intel_pmu_ack_status(u64 ack) @@ -795,7 +768,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi) cpuc->throttle_ctrl = hw_perf_save_disable(); - status = hw_perf_get_status(cpuc->throttle_ctrl); + status = intel_pmu_get_status(cpuc->throttle_ctrl); if (!status) goto out; @@ -820,7 +793,7 @@ again: /* * Repeat if there is more work to be done: */ - status = hw_perf_get_status(cpuc->throttle_ctrl); + status = intel_pmu_get_status(cpuc->throttle_ctrl); if (status) goto again; out: @@ -931,7 +904,6 @@ static struct x86_pmu intel_pmu = { .handle_irq = intel_pmu_handle_irq, .save_disable_all = intel_pmu_save_disable_all, .restore_all = intel_pmu_restore_all, - .get_status = intel_pmu_get_status, .ack_status = intel_pmu_ack_status, .enable = intel_pmu_enable_counter, .disable = intel_pmu_disable_counter, @@ -946,7 +918,6 @@ static struct x86_pmu amd_pmu = { .handle_irq = amd_pmu_handle_irq, .save_disable_all = amd_pmu_save_disable_all, .restore_all = amd_pmu_restore_all, - .get_status = amd_pmu_get_status, .ack_status = amd_pmu_ack_status, .enable = amd_pmu_enable_counter, .disable = amd_pmu_disable_counter, -- cgit v1.1 From dee5d9067ca78b317538fd67930be4e09a83dbc5 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:07 +0200 Subject: perf_counter, x86: remove ack_status() from struct x86_pmu This function is Intel only and not necessary for AMD cpus. [ Impact: simplify code ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-11-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index d0bb029..6bbdc16 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -51,7 +51,6 @@ struct x86_pmu { int (*handle_irq)(struct pt_regs *, int); u64 (*save_disable_all)(void); void (*restore_all)(u64); - void (*ack_status)(u64); void (*enable)(int, u64); void (*disable)(int, u64); unsigned eventsel; @@ -415,23 +414,11 @@ static inline u64 intel_pmu_get_status(u64 mask) return status; } -static void intel_pmu_ack_status(u64 ack) +static inline void intel_pmu_ack_status(u64 ack) { wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); } -static void amd_pmu_ack_status(u64 ack) -{ -} - -static void hw_perf_ack_status(u64 ack) -{ - if (unlikely(!perf_counters_initialized)) - return; - - x86_pmu->ack_status(ack); -} - static void intel_pmu_enable_counter(int idx, u64 config) { wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, @@ -788,7 +775,7 @@ again: __x86_pmu_disable(counter, &counter->hw, bit); } - hw_perf_ack_status(ack); + intel_pmu_ack_status(ack); /* * Repeat if there is more work to be done: @@ -904,7 +891,6 @@ static struct x86_pmu intel_pmu = { .handle_irq = intel_pmu_handle_irq, .save_disable_all = intel_pmu_save_disable_all, .restore_all = intel_pmu_restore_all, - .ack_status = intel_pmu_ack_status, .enable = intel_pmu_enable_counter, .disable = intel_pmu_disable_counter, .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, @@ -918,7 +904,6 @@ static struct x86_pmu amd_pmu = { .handle_irq = amd_pmu_handle_irq, .save_disable_all = amd_pmu_save_disable_all, .restore_all = amd_pmu_restore_all, - .ack_status = amd_pmu_ack_status, .enable = amd_pmu_enable_counter, .disable = amd_pmu_disable_counter, .eventsel = MSR_K7_EVNTSEL0, -- cgit v1.1 From 26816c287e13eedc67bc4ed0cd40c138314b7c7d Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:08 +0200 Subject: perf_counter, x86: rename __hw_perf_counter_set_period into x86_perf_counter_set_period [ Impact: cleanup ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-12-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 6bbdc16..fa6541d 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -498,7 +498,7 @@ static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]); * To be called with the counter disabled in hw: */ static void -__hw_perf_counter_set_period(struct perf_counter *counter, +x86_perf_counter_set_period(struct perf_counter *counter, struct hw_perf_counter *hwc, int idx) { s64 left = atomic64_read(&hwc->period_left); @@ -642,7 +642,7 @@ try_generic: */ barrier(); - __hw_perf_counter_set_period(counter, hwc, idx); + x86_perf_counter_set_period(counter, hwc, idx); __x86_pmu_enable(counter, hwc, idx); return 0; @@ -731,7 +731,7 @@ static void perf_save_and_restart(struct perf_counter *counter) int idx = hwc->idx; x86_perf_counter_update(counter, hwc, idx); - __hw_perf_counter_set_period(counter, hwc, idx); + x86_perf_counter_set_period(counter, hwc, idx); if (counter->state == PERF_COUNTER_STATE_ACTIVE) __x86_pmu_enable(counter, hwc, idx); -- cgit v1.1 From 55de0f2e57994b525324bf0d04d242d9358a2417 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:09 +0200 Subject: perf_counter, x86: rename intel only functions [ Impact: cleanup ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-13-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index fa6541d..5a52d73 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -725,7 +725,7 @@ static void x86_pmu_disable(struct perf_counter *counter) * Save and restart an expired counter. Called by NMI contexts, * so it has to be careful about preempting normal counter ops: */ -static void perf_save_and_restart(struct perf_counter *counter) +static void intel_pmu_save_and_restart(struct perf_counter *counter) { struct hw_perf_counter *hwc = &counter->hw; int idx = hwc->idx; @@ -753,7 +753,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi) struct cpu_hw_counters *cpuc = &per_cpu(cpu_hw_counters, cpu); int ret = 0; - cpuc->throttle_ctrl = hw_perf_save_disable(); + cpuc->throttle_ctrl = intel_pmu_save_disable_all(); status = intel_pmu_get_status(cpuc->throttle_ctrl); if (!status) @@ -770,7 +770,7 @@ again: if (!counter) continue; - perf_save_and_restart(counter); + intel_pmu_save_and_restart(counter); if (perf_counter_overflow(counter, nmi, regs, 0)) __x86_pmu_disable(counter, &counter->hw, bit); } @@ -788,7 +788,7 @@ out: * Restore - do not reenable when global enable is off or throttled: */ if (++cpuc->interrupts < PERFMON_MAX_INTERRUPTS) - hw_perf_restore(cpuc->throttle_ctrl); + intel_pmu_restore_all(cpuc->throttle_ctrl); return ret; } -- cgit v1.1 From 72eae04d3a3075c26d39e1e685acfc8e8c29db64 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:10 +0200 Subject: perf_counter, x86: modify initialization of struct x86_pmu This patch adds an error handler and changes initialization of struct x86_pmu. No functional changes. Needed for follow-on patches. [ Impact: cleanup ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-14-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 5a52d73..7c72a94 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -913,7 +913,7 @@ static struct x86_pmu amd_pmu = { .max_events = ARRAY_SIZE(amd_perfmon_event_map), }; -static struct x86_pmu *intel_pmu_init(void) +static int intel_pmu_init(void) { union cpuid10_edx edx; union cpuid10_eax eax; @@ -921,7 +921,7 @@ static struct x86_pmu *intel_pmu_init(void) unsigned int ebx; if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) - return NULL; + return -ENODEV; /* * Check whether the Architectural PerfMon supports @@ -929,49 +929,54 @@ static struct x86_pmu *intel_pmu_init(void) */ cpuid(10, &eax.full, &ebx, &unused, &edx.full); if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED) - return NULL; + return -ENODEV; intel_perfmon_version = eax.split.version_id; if (intel_perfmon_version < 2) - return NULL; + return -ENODEV; pr_info("Intel Performance Monitoring support detected.\n"); pr_info("... version: %d\n", intel_perfmon_version); pr_info("... bit width: %d\n", eax.split.bit_width); pr_info("... mask length: %d\n", eax.split.mask_length); + x86_pmu = &intel_pmu; + nr_counters_generic = eax.split.num_counters; nr_counters_fixed = edx.split.num_counters_fixed; counter_value_mask = (1ULL << eax.split.bit_width) - 1; - return &intel_pmu; + return 0; } -static struct x86_pmu *amd_pmu_init(void) +static int amd_pmu_init(void) { + x86_pmu = &amd_pmu; + nr_counters_generic = 4; nr_counters_fixed = 0; counter_value_mask = 0x0000FFFFFFFFFFFFULL; counter_value_bits = 48; pr_info("AMD Performance Monitoring support detected.\n"); - - return &amd_pmu; + return 0; } void __init init_hw_perf_counters(void) { + int err; + switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_INTEL: - x86_pmu = intel_pmu_init(); + err = intel_pmu_init(); break; case X86_VENDOR_AMD: - x86_pmu = amd_pmu_init(); + err = amd_pmu_init(); break; default: return; } - if (!x86_pmu) + if (err != 0) return; pr_info("... num counters: %d\n", nr_counters_generic); -- cgit v1.1 From 4a06bd8508f65ad1dd5cd2046b85694813fa36a2 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:11 +0200 Subject: perf_counter, x86: make x86_pmu data a static struct Instead of using a pointer to reference to the x86 pmu we now have one single data structure that is initialized at the beginning. This saves the pointer access when using this memory. [ Impact: micro-optimization ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-15-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 50 +++++++++++++++++++------------------- 1 file changed, 25 insertions(+), 25 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 7c72a94..68597d7 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -60,7 +60,7 @@ struct x86_pmu { int max_events; }; -static struct x86_pmu *x86_pmu __read_mostly; +static struct x86_pmu x86_pmu __read_mostly; static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = { .enabled = 1, @@ -184,12 +184,12 @@ static bool reserve_pmc_hardware(void) disable_lapic_nmi_watchdog(); for (i = 0; i < nr_counters_generic; i++) { - if (!reserve_perfctr_nmi(x86_pmu->perfctr + i)) + if (!reserve_perfctr_nmi(x86_pmu.perfctr + i)) goto perfctr_fail; } for (i = 0; i < nr_counters_generic; i++) { - if (!reserve_evntsel_nmi(x86_pmu->eventsel + i)) + if (!reserve_evntsel_nmi(x86_pmu.eventsel + i)) goto eventsel_fail; } @@ -197,13 +197,13 @@ static bool reserve_pmc_hardware(void) eventsel_fail: for (i--; i >= 0; i--) - release_evntsel_nmi(x86_pmu->eventsel + i); + release_evntsel_nmi(x86_pmu.eventsel + i); i = nr_counters_generic; perfctr_fail: for (i--; i >= 0; i--) - release_perfctr_nmi(x86_pmu->perfctr + i); + release_perfctr_nmi(x86_pmu.perfctr + i); if (nmi_watchdog == NMI_LOCAL_APIC) enable_lapic_nmi_watchdog(); @@ -216,8 +216,8 @@ static void release_pmc_hardware(void) int i; for (i = 0; i < nr_counters_generic; i++) { - release_perfctr_nmi(x86_pmu->perfctr + i); - release_evntsel_nmi(x86_pmu->eventsel + i); + release_perfctr_nmi(x86_pmu.perfctr + i); + release_evntsel_nmi(x86_pmu.eventsel + i); } if (nmi_watchdog == NMI_LOCAL_APIC) @@ -297,14 +297,14 @@ static int __hw_perf_counter_init(struct perf_counter *counter) * Raw event type provide the config in the event structure */ if (perf_event_raw(hw_event)) { - hwc->config |= x86_pmu->raw_event(perf_event_config(hw_event)); + hwc->config |= x86_pmu.raw_event(perf_event_config(hw_event)); } else { - if (perf_event_id(hw_event) >= x86_pmu->max_events) + if (perf_event_id(hw_event) >= x86_pmu.max_events) return -EINVAL; /* * The generic map: */ - hwc->config |= x86_pmu->event_map(perf_event_id(hw_event)); + hwc->config |= x86_pmu.event_map(perf_event_id(hw_event)); } counter->destroy = hw_perf_counter_destroy; @@ -356,7 +356,7 @@ u64 hw_perf_save_disable(void) if (unlikely(!perf_counters_initialized)) return 0; - return x86_pmu->save_disable_all(); + return x86_pmu.save_disable_all(); } /* * Exported because of ACPI idle @@ -396,7 +396,7 @@ void hw_perf_restore(u64 ctrl) if (unlikely(!perf_counters_initialized)) return; - x86_pmu->restore_all(ctrl); + x86_pmu.restore_all(ctrl); } /* * Exported because of ACPI idle @@ -441,7 +441,7 @@ static void hw_perf_enable(int idx, u64 config) if (unlikely(!perf_counters_initialized)) return; - x86_pmu->enable(idx, config); + x86_pmu.enable(idx, config); } static void intel_pmu_disable_counter(int idx, u64 config) @@ -463,7 +463,7 @@ static void hw_perf_disable(int idx, u64 config) if (unlikely(!perf_counters_initialized)) return; - x86_pmu->disable(idx, config); + x86_pmu.disable(idx, config); } static inline void @@ -580,11 +580,11 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc) event = hwc->config & ARCH_PERFMON_EVENT_MASK; - if (unlikely(event == x86_pmu->event_map(PERF_COUNT_INSTRUCTIONS))) + if (unlikely(event == x86_pmu.event_map(PERF_COUNT_INSTRUCTIONS))) return X86_PMC_IDX_FIXED_INSTRUCTIONS; - if (unlikely(event == x86_pmu->event_map(PERF_COUNT_CPU_CYCLES))) + if (unlikely(event == x86_pmu.event_map(PERF_COUNT_CPU_CYCLES))) return X86_PMC_IDX_FIXED_CPU_CYCLES; - if (unlikely(event == x86_pmu->event_map(PERF_COUNT_BUS_CYCLES))) + if (unlikely(event == x86_pmu.event_map(PERF_COUNT_BUS_CYCLES))) return X86_PMC_IDX_FIXED_BUS_CYCLES; return -1; @@ -628,8 +628,8 @@ try_generic: set_bit(idx, cpuc->used); hwc->idx = idx; } - hwc->config_base = x86_pmu->eventsel; - hwc->counter_base = x86_pmu->perfctr; + hwc->config_base = x86_pmu.eventsel; + hwc->counter_base = x86_pmu.perfctr; } perf_counters_lapic_init(hwc->nmi); @@ -677,8 +677,8 @@ void perf_counter_print_debug(void) pr_info("CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used); for (idx = 0; idx < nr_counters_generic; idx++) { - rdmsrl(x86_pmu->eventsel + idx, pmc_ctrl); - rdmsrl(x86_pmu->perfctr + idx, pmc_count); + rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl); + rdmsrl(x86_pmu.perfctr + idx, pmc_count); prev_left = per_cpu(prev_left[idx], cpu); @@ -819,7 +819,7 @@ void smp_perf_counter_interrupt(struct pt_regs *regs) irq_enter(); apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR); ack_APIC_irq(); - x86_pmu->handle_irq(regs, 0); + x86_pmu.handle_irq(regs, 0); irq_exit(); } @@ -876,7 +876,7 @@ perf_counter_nmi_handler(struct notifier_block *self, regs = args->regs; apic_write(APIC_LVTPC, APIC_DM_NMI); - ret = x86_pmu->handle_irq(regs, 1); + ret = x86_pmu.handle_irq(regs, 1); return ret ? NOTIFY_STOP : NOTIFY_OK; } @@ -940,7 +940,7 @@ static int intel_pmu_init(void) pr_info("... bit width: %d\n", eax.split.bit_width); pr_info("... mask length: %d\n", eax.split.mask_length); - x86_pmu = &intel_pmu; + x86_pmu = intel_pmu; nr_counters_generic = eax.split.num_counters; nr_counters_fixed = edx.split.num_counters_fixed; @@ -951,7 +951,7 @@ static int intel_pmu_init(void) static int amd_pmu_init(void) { - x86_pmu = &amd_pmu; + x86_pmu = amd_pmu; nr_counters_generic = 4; nr_counters_fixed = 0; -- cgit v1.1 From 0933e5c6a680ba8d8d786a6f7fa377b7ec0d1e49 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:12 +0200 Subject: perf_counter, x86: move counter parameters to struct x86_pmu [ Impact: refactor and generalize code ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-16-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 80 ++++++++++++++++++-------------------- 1 file changed, 37 insertions(+), 43 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 68597d7..75dbb1f 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -24,16 +24,7 @@ #include static bool perf_counters_initialized __read_mostly; - -/* - * Number of (generic) HW counters: - */ -static int nr_counters_generic __read_mostly; static u64 perf_counter_mask __read_mostly; -static u64 counter_value_mask __read_mostly; -static int counter_value_bits __read_mostly; - -static int nr_counters_fixed __read_mostly; struct cpu_hw_counters { struct perf_counter *counters[X86_PMC_IDX_MAX]; @@ -58,6 +49,10 @@ struct x86_pmu { u64 (*event_map)(int); u64 (*raw_event)(u64); int max_events; + int num_counters; + int num_counters_fixed; + int counter_bits; + u64 counter_mask; }; static struct x86_pmu x86_pmu __read_mostly; @@ -183,12 +178,12 @@ static bool reserve_pmc_hardware(void) if (nmi_watchdog == NMI_LOCAL_APIC) disable_lapic_nmi_watchdog(); - for (i = 0; i < nr_counters_generic; i++) { + for (i = 0; i < x86_pmu.num_counters; i++) { if (!reserve_perfctr_nmi(x86_pmu.perfctr + i)) goto perfctr_fail; } - for (i = 0; i < nr_counters_generic; i++) { + for (i = 0; i < x86_pmu.num_counters; i++) { if (!reserve_evntsel_nmi(x86_pmu.eventsel + i)) goto eventsel_fail; } @@ -199,7 +194,7 @@ eventsel_fail: for (i--; i >= 0; i--) release_evntsel_nmi(x86_pmu.eventsel + i); - i = nr_counters_generic; + i = x86_pmu.num_counters; perfctr_fail: for (i--; i >= 0; i--) @@ -215,7 +210,7 @@ static void release_pmc_hardware(void) { int i; - for (i = 0; i < nr_counters_generic; i++) { + for (i = 0; i < x86_pmu.num_counters; i++) { release_perfctr_nmi(x86_pmu.perfctr + i); release_evntsel_nmi(x86_pmu.eventsel + i); } @@ -336,7 +331,7 @@ static u64 amd_pmu_save_disable_all(void) */ barrier(); - for (idx = 0; idx < nr_counters_generic; idx++) { + for (idx = 0; idx < x86_pmu.num_counters; idx++) { u64 val; if (!test_bit(idx, cpuc->active_mask)) @@ -378,7 +373,7 @@ static void amd_pmu_restore_all(u64 ctrl) if (!ctrl) return; - for (idx = 0; idx < nr_counters_generic; idx++) { + for (idx = 0; idx < x86_pmu.num_counters; idx++) { u64 val; if (!test_bit(idx, cpuc->active_mask)) @@ -527,7 +522,7 @@ x86_perf_counter_set_period(struct perf_counter *counter, atomic64_set(&hwc->prev_count, (u64)-left); err = checking_wrmsrl(hwc->counter_base + idx, - (u64)(-left) & counter_value_mask); + (u64)(-left) & x86_pmu.counter_mask); } static inline void @@ -621,8 +616,9 @@ static int x86_pmu_enable(struct perf_counter *counter) /* Try to get the previous generic counter again */ if (test_and_set_bit(idx, cpuc->used)) { try_generic: - idx = find_first_zero_bit(cpuc->used, nr_counters_generic); - if (idx == nr_counters_generic) + idx = find_first_zero_bit(cpuc->used, + x86_pmu.num_counters); + if (idx == x86_pmu.num_counters) return -EAGAIN; set_bit(idx, cpuc->used); @@ -654,7 +650,7 @@ void perf_counter_print_debug(void) struct cpu_hw_counters *cpuc; int cpu, idx; - if (!nr_counters_generic) + if (!x86_pmu.num_counters) return; local_irq_disable(); @@ -676,7 +672,7 @@ void perf_counter_print_debug(void) } pr_info("CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used); - for (idx = 0; idx < nr_counters_generic; idx++) { + for (idx = 0; idx < x86_pmu.num_counters; idx++) { rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl); rdmsrl(x86_pmu.perfctr + idx, pmc_count); @@ -689,7 +685,7 @@ void perf_counter_print_debug(void) pr_info("CPU#%d: gen-PMC%d left: %016llx\n", cpu, idx, prev_left); } - for (idx = 0; idx < nr_counters_fixed; idx++) { + for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) { rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count); pr_info("CPU#%d: fixed-PMC%d count: %016llx\n", @@ -911,6 +907,9 @@ static struct x86_pmu amd_pmu = { .event_map = amd_pmu_event_map, .raw_event = amd_pmu_raw_event, .max_events = ARRAY_SIZE(amd_perfmon_event_map), + .num_counters = 4, + .counter_bits = 48, + .counter_mask = (1ULL << 48) - 1, }; static int intel_pmu_init(void) @@ -941,10 +940,10 @@ static int intel_pmu_init(void) pr_info("... mask length: %d\n", eax.split.mask_length); x86_pmu = intel_pmu; - - nr_counters_generic = eax.split.num_counters; - nr_counters_fixed = edx.split.num_counters_fixed; - counter_value_mask = (1ULL << eax.split.bit_width) - 1; + x86_pmu.num_counters = eax.split.num_counters; + x86_pmu.num_counters_fixed = edx.split.num_counters_fixed; + x86_pmu.counter_bits = eax.split.bit_width; + x86_pmu.counter_mask = (1ULL << eax.split.bit_width) - 1; return 0; } @@ -952,12 +951,6 @@ static int intel_pmu_init(void) static int amd_pmu_init(void) { x86_pmu = amd_pmu; - - nr_counters_generic = 4; - nr_counters_fixed = 0; - counter_value_mask = 0x0000FFFFFFFFFFFFULL; - counter_value_bits = 48; - pr_info("AMD Performance Monitoring support detected.\n"); return 0; } @@ -979,25 +972,26 @@ void __init init_hw_perf_counters(void) if (err != 0) return; - pr_info("... num counters: %d\n", nr_counters_generic); - if (nr_counters_generic > X86_PMC_MAX_GENERIC) { - nr_counters_generic = X86_PMC_MAX_GENERIC; + pr_info("... num counters: %d\n", x86_pmu.num_counters); + if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) { + x86_pmu.num_counters = X86_PMC_MAX_GENERIC; WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!", - nr_counters_generic, X86_PMC_MAX_GENERIC); + x86_pmu.num_counters, X86_PMC_MAX_GENERIC); } - perf_counter_mask = (1 << nr_counters_generic) - 1; - perf_max_counters = nr_counters_generic; + perf_counter_mask = (1 << x86_pmu.num_counters) - 1; + perf_max_counters = x86_pmu.num_counters; - pr_info("... value mask: %016Lx\n", counter_value_mask); + pr_info("... value mask: %016Lx\n", x86_pmu.counter_mask); - if (nr_counters_fixed > X86_PMC_MAX_FIXED) { - nr_counters_fixed = X86_PMC_MAX_FIXED; + if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) { + x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED; WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!", - nr_counters_fixed, X86_PMC_MAX_FIXED); + x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED); } - pr_info("... fixed counters: %d\n", nr_counters_fixed); + pr_info("... fixed counters: %d\n", x86_pmu.num_counters_fixed); - perf_counter_mask |= ((1LL << nr_counters_fixed)-1) << X86_PMC_IDX_FIXED; + perf_counter_mask |= + ((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED; pr_info("... counter mask: %016Lx\n", perf_counter_mask); perf_counters_initialized = true; -- cgit v1.1 From faa28ae018ed004a22aa4a7704e04ccdde4a941e Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:13 +0200 Subject: perf_counter, x86: make pmu version generic This makes the use of the version variable generic. Also, some debug messages have been generalized. [ Impact: refactor and generalize code ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-17-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 75dbb1f..15d2c03 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -39,6 +39,8 @@ struct cpu_hw_counters { * struct x86_pmu - generic x86 pmu */ struct x86_pmu { + const char *name; + int version; int (*handle_irq)(struct pt_regs *, int); u64 (*save_disable_all)(void); void (*restore_all)(u64); @@ -61,8 +63,6 @@ static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = { .enabled = 1, }; -static __read_mostly int intel_perfmon_version; - /* * Intel PerfMon v3. Used on Core2 and later. */ @@ -658,7 +658,7 @@ void perf_counter_print_debug(void) cpu = smp_processor_id(); cpuc = &per_cpu(cpu_hw_counters, cpu); - if (intel_perfmon_version >= 2) { + if (x86_pmu.version >= 2) { rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow); @@ -884,6 +884,7 @@ static __read_mostly struct notifier_block perf_counter_nmi_notifier = { }; static struct x86_pmu intel_pmu = { + .name = "Intel", .handle_irq = intel_pmu_handle_irq, .save_disable_all = intel_pmu_save_disable_all, .restore_all = intel_pmu_restore_all, @@ -897,6 +898,7 @@ static struct x86_pmu intel_pmu = { }; static struct x86_pmu amd_pmu = { + .name = "AMD", .handle_irq = amd_pmu_handle_irq, .save_disable_all = amd_pmu_save_disable_all, .restore_all = amd_pmu_restore_all, @@ -918,6 +920,7 @@ static int intel_pmu_init(void) union cpuid10_eax eax; unsigned int unused; unsigned int ebx; + int version; if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) return -ENODEV; @@ -930,16 +933,12 @@ static int intel_pmu_init(void) if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED) return -ENODEV; - intel_perfmon_version = eax.split.version_id; - if (intel_perfmon_version < 2) + version = eax.split.version_id; + if (version < 2) return -ENODEV; - pr_info("Intel Performance Monitoring support detected.\n"); - pr_info("... version: %d\n", intel_perfmon_version); - pr_info("... bit width: %d\n", eax.split.bit_width); - pr_info("... mask length: %d\n", eax.split.mask_length); - x86_pmu = intel_pmu; + x86_pmu.version = version; x86_pmu.num_counters = eax.split.num_counters; x86_pmu.num_counters_fixed = edx.split.num_counters_fixed; x86_pmu.counter_bits = eax.split.bit_width; @@ -951,7 +950,6 @@ static int intel_pmu_init(void) static int amd_pmu_init(void) { x86_pmu = amd_pmu; - pr_info("AMD Performance Monitoring support detected.\n"); return 0; } @@ -972,6 +970,10 @@ void __init init_hw_perf_counters(void) if (err != 0) return; + pr_info("%s Performance Monitoring support detected.\n", x86_pmu.name); + pr_info("... version: %d\n", x86_pmu.version); + pr_info("... bit width: %d\n", x86_pmu.counter_bits); + pr_info("... num counters: %d\n", x86_pmu.num_counters); if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) { x86_pmu.num_counters = X86_PMC_MAX_GENERIC; -- cgit v1.1 From bb775fc2d1dcd1aa6eafde37a8289ba2d80783aa Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:14 +0200 Subject: perf_counter, x86: make x86_pmu_read() static inline [ Impact: micro-optimization ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-18-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 15d2c03..3f3ae47 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -1002,7 +1002,7 @@ void __init init_hw_perf_counters(void) register_die_notifier(&perf_counter_nmi_notifier); } -static void x86_pmu_read(struct perf_counter *counter) +static inline void x86_pmu_read(struct perf_counter *counter) { x86_perf_counter_update(counter, &counter->hw, counter->hw.idx); } -- cgit v1.1 From 93904966934193204ad08e951f806d5631c29eb3 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:15 +0200 Subject: perf_counter, x86: rename cpuc->active_mask This is to have a consistent naming scheme with cpuc->used. [ Impact: cleanup ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-19-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 3f3ae47..9ec51a6 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -29,9 +29,9 @@ static u64 perf_counter_mask __read_mostly; struct cpu_hw_counters { struct perf_counter *counters[X86_PMC_IDX_MAX]; unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + unsigned long active[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; unsigned long interrupts; u64 throttle_ctrl; - unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; int enabled; }; @@ -334,7 +334,7 @@ static u64 amd_pmu_save_disable_all(void) for (idx = 0; idx < x86_pmu.num_counters; idx++) { u64 val; - if (!test_bit(idx, cpuc->active_mask)) + if (!test_bit(idx, cpuc->active)) continue; rdmsrl(MSR_K7_EVNTSEL0 + idx, val); if (!(val & ARCH_PERFMON_EVENTSEL0_ENABLE)) @@ -376,7 +376,7 @@ static void amd_pmu_restore_all(u64 ctrl) for (idx = 0; idx < x86_pmu.num_counters; idx++) { u64 val; - if (!test_bit(idx, cpuc->active_mask)) + if (!test_bit(idx, cpuc->active)) continue; rdmsrl(MSR_K7_EVNTSEL0 + idx, val); if (val & ARCH_PERFMON_EVENTSEL0_ENABLE) @@ -424,7 +424,7 @@ static void amd_pmu_enable_counter(int idx, u64 config) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - set_bit(idx, cpuc->active_mask); + set_bit(idx, cpuc->active); if (cpuc->enabled) config |= ARCH_PERFMON_EVENTSEL0_ENABLE; @@ -448,7 +448,7 @@ static void amd_pmu_disable_counter(int idx, u64 config) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - clear_bit(idx, cpuc->active_mask); + clear_bit(idx, cpuc->active); wrmsrl(MSR_K7_EVNTSEL0 + idx, config); } -- cgit v1.1 From 095342389e2ed8deed07b3076f990260ce3c7c9f Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:16 +0200 Subject: perf_counter, x86: generic use of cpuc->active cpuc->active will now be used to indicate an enabled counter which implies also valid pointers of cpuc->counters[]. In contrast, cpuc->used only locks the counter, but it can be still uninitialized. [ Impact: refactor and generalize code ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-20-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 9ec51a6..f7fd4a3 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -424,7 +424,6 @@ static void amd_pmu_enable_counter(int idx, u64 config) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - set_bit(idx, cpuc->active); if (cpuc->enabled) config |= ARCH_PERFMON_EVENTSEL0_ENABLE; @@ -446,9 +445,6 @@ static void intel_pmu_disable_counter(int idx, u64 config) static void amd_pmu_disable_counter(int idx, u64 config) { - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - - clear_bit(idx, cpuc->active); wrmsrl(MSR_K7_EVNTSEL0 + idx, config); } @@ -633,10 +629,7 @@ try_generic: __x86_pmu_disable(counter, hwc, idx); cpuc->counters[idx] = counter; - /* - * Make it visible before enabling the hw: - */ - barrier(); + set_bit(idx, cpuc->active); x86_perf_counter_set_period(counter, hwc, idx); __x86_pmu_enable(counter, hwc, idx); @@ -700,10 +693,13 @@ static void x86_pmu_disable(struct perf_counter *counter) struct hw_perf_counter *hwc = &counter->hw; unsigned int idx = hwc->idx; + /* + * Must be done before we disable, otherwise the nmi handler + * could reenable again: + */ + clear_bit(idx, cpuc->active); __x86_pmu_disable(counter, hwc, idx); - clear_bit(idx, cpuc->used); - cpuc->counters[idx] = NULL; /* * Make sure the cleared pointer becomes visible before we * (potentially) free the counter: @@ -715,6 +711,8 @@ static void x86_pmu_disable(struct perf_counter *counter) * that we are disabling: */ x86_perf_counter_update(counter, hwc, idx); + cpuc->counters[idx] = NULL; + clear_bit(idx, cpuc->used); } /* @@ -763,7 +761,7 @@ again: struct perf_counter *counter = cpuc->counters[bit]; clear_bit(bit, (unsigned long *) &status); - if (!counter) + if (!test_bit(bit, cpuc->active)) continue; intel_pmu_save_and_restart(counter); -- cgit v1.1 From 6f00cada07bb5da7f751929d3173494dcc5446cc Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:17 +0200 Subject: perf_counter, x86: consistent use of type int for counter index The type of counter index is sometimes implemented as unsigned int. This patch changes this to have a consistent usage of int. [ Impact: cleanup ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-21-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index f7fd4a3..d8beebe 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -459,7 +459,7 @@ static void hw_perf_disable(int idx, u64 config) static inline void __pmc_fixed_disable(struct perf_counter *counter, - struct hw_perf_counter *hwc, unsigned int __idx) + struct hw_perf_counter *hwc, int __idx) { int idx = __idx - X86_PMC_IDX_FIXED; u64 ctrl_val, mask; @@ -474,7 +474,7 @@ __pmc_fixed_disable(struct perf_counter *counter, static inline void __x86_pmu_disable(struct perf_counter *counter, - struct hw_perf_counter *hwc, unsigned int idx) + struct hw_perf_counter *hwc, int idx) { if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) __pmc_fixed_disable(counter, hwc, idx); @@ -523,7 +523,7 @@ x86_perf_counter_set_period(struct perf_counter *counter, static inline void __pmc_fixed_enable(struct perf_counter *counter, - struct hw_perf_counter *hwc, unsigned int __idx) + struct hw_perf_counter *hwc, int __idx) { int idx = __idx - X86_PMC_IDX_FIXED; u64 ctrl_val, bits, mask; @@ -691,7 +691,7 @@ static void x86_pmu_disable(struct perf_counter *counter) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); struct hw_perf_counter *hwc = &counter->hw; - unsigned int idx = hwc->idx; + int idx = hwc->idx; /* * Must be done before we disable, otherwise the nmi handler -- cgit v1.1 From 7c90cc45f89af4dd4617f97d452740ad95b800d5 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:18 +0200 Subject: perf_counter, x86: rework counter enable functions There is vendor specific code in generic x86 code, and there is vendor specific code that could be generic. This patch introduces x86_pmu_enable_counter() for x86 generic code. Fixed counter code for Intel is moved to Intel only functions. In the end, checks and calls via function pointers were reduced to the necessary. Also, the internal function i/f changed. [ Impact: refactor and generalize code ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-22-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 52 ++++++++++++++++++-------------------- 1 file changed, 24 insertions(+), 28 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index d8beebe..ae55933 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -44,7 +44,7 @@ struct x86_pmu { int (*handle_irq)(struct pt_regs *, int); u64 (*save_disable_all)(void); void (*restore_all)(u64); - void (*enable)(int, u64); + void (*enable)(struct hw_perf_counter *, int); void (*disable)(int, u64); unsigned eventsel; unsigned perfctr; @@ -414,28 +414,15 @@ static inline void intel_pmu_ack_status(u64 ack) wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); } -static void intel_pmu_enable_counter(int idx, u64 config) +static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) { - wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, - config | ARCH_PERFMON_EVENTSEL0_ENABLE); -} - -static void amd_pmu_enable_counter(int idx, u64 config) -{ - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - - if (cpuc->enabled) - config |= ARCH_PERFMON_EVENTSEL0_ENABLE; - - wrmsrl(MSR_K7_EVNTSEL0 + idx, config); -} + int err; -static void hw_perf_enable(int idx, u64 config) -{ if (unlikely(!perf_counters_initialized)) return; - x86_pmu.enable(idx, config); + err = checking_wrmsrl(hwc->config_base + idx, + hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE); } static void intel_pmu_disable_counter(int idx, u64 config) @@ -522,8 +509,7 @@ x86_perf_counter_set_period(struct perf_counter *counter, } static inline void -__pmc_fixed_enable(struct perf_counter *counter, - struct hw_perf_counter *hwc, int __idx) +intel_pmu_enable_fixed(struct hw_perf_counter *hwc, int __idx) { int idx = __idx - X86_PMC_IDX_FIXED; u64 ctrl_val, bits, mask; @@ -548,14 +534,24 @@ __pmc_fixed_enable(struct perf_counter *counter, err = checking_wrmsrl(hwc->config_base, ctrl_val); } -static void -__x86_pmu_enable(struct perf_counter *counter, - struct hw_perf_counter *hwc, int idx) +static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) { - if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) - __pmc_fixed_enable(counter, hwc, idx); + if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { + intel_pmu_enable_fixed(hwc, idx); + return; + } + + x86_pmu_enable_counter(hwc, idx); +} + +static void amd_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) +{ + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + + if (cpuc->enabled) + x86_pmu_enable_counter(hwc, idx); else - hw_perf_enable(idx, hwc->config); + amd_pmu_disable_counter(idx, hwc->config); } static int @@ -632,7 +628,7 @@ try_generic: set_bit(idx, cpuc->active); x86_perf_counter_set_period(counter, hwc, idx); - __x86_pmu_enable(counter, hwc, idx); + x86_pmu.enable(hwc, idx); return 0; } @@ -728,7 +724,7 @@ static void intel_pmu_save_and_restart(struct perf_counter *counter) x86_perf_counter_set_period(counter, hwc, idx); if (counter->state == PERF_COUNTER_STATE_ACTIVE) - __x86_pmu_enable(counter, hwc, idx); + intel_pmu_enable_counter(hwc, idx); } /* -- cgit v1.1 From d43698918bd46c71d494555fb92195fbea1fcb6c Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:19 +0200 Subject: perf_counter, x86: rework counter disable functions As for the enable function, this patch reworks the disable functions and introduces x86_pmu_disable_counter(). The internal function i/f in struct x86_pmu changed too. [ Impact: refactor and generalize code ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-23-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 48 ++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 25 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index ae55933..df9012b 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -45,7 +45,7 @@ struct x86_pmu { u64 (*save_disable_all)(void); void (*restore_all)(u64); void (*enable)(struct hw_perf_counter *, int); - void (*disable)(int, u64); + void (*disable)(struct hw_perf_counter *, int); unsigned eventsel; unsigned perfctr; u64 (*event_map)(int); @@ -425,28 +425,19 @@ static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE); } -static void intel_pmu_disable_counter(int idx, u64 config) +static inline void x86_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) { - wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, config); -} - -static void amd_pmu_disable_counter(int idx, u64 config) -{ - wrmsrl(MSR_K7_EVNTSEL0 + idx, config); - -} + int err; -static void hw_perf_disable(int idx, u64 config) -{ if (unlikely(!perf_counters_initialized)) return; - x86_pmu.disable(idx, config); + err = checking_wrmsrl(hwc->config_base + idx, + hwc->config); } static inline void -__pmc_fixed_disable(struct perf_counter *counter, - struct hw_perf_counter *hwc, int __idx) +intel_pmu_disable_fixed(struct hw_perf_counter *hwc, int __idx) { int idx = __idx - X86_PMC_IDX_FIXED; u64 ctrl_val, mask; @@ -460,13 +451,20 @@ __pmc_fixed_disable(struct perf_counter *counter, } static inline void -__x86_pmu_disable(struct perf_counter *counter, - struct hw_perf_counter *hwc, int idx) +intel_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) { - if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) - __pmc_fixed_disable(counter, hwc, idx); - else - hw_perf_disable(idx, hwc->config); + if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { + intel_pmu_disable_fixed(hwc, idx); + return; + } + + x86_pmu_disable_counter(hwc, idx); +} + +static inline void +amd_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) +{ + x86_pmu_disable_counter(hwc, idx); } static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]); @@ -551,7 +549,7 @@ static void amd_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) if (cpuc->enabled) x86_pmu_enable_counter(hwc, idx); else - amd_pmu_disable_counter(idx, hwc->config); + x86_pmu_disable_counter(hwc, idx); } static int @@ -622,7 +620,7 @@ try_generic: perf_counters_lapic_init(hwc->nmi); - __x86_pmu_disable(counter, hwc, idx); + x86_pmu.disable(hwc, idx); cpuc->counters[idx] = counter; set_bit(idx, cpuc->active); @@ -694,7 +692,7 @@ static void x86_pmu_disable(struct perf_counter *counter) * could reenable again: */ clear_bit(idx, cpuc->active); - __x86_pmu_disable(counter, hwc, idx); + x86_pmu.disable(hwc, idx); /* * Make sure the cleared pointer becomes visible before we @@ -762,7 +760,7 @@ again: intel_pmu_save_and_restart(counter); if (perf_counter_overflow(counter, nmi, regs, 0)) - __x86_pmu_disable(counter, &counter->hw, bit); + intel_pmu_disable_counter(&counter->hw, bit); } intel_pmu_ack_status(ack); -- cgit v1.1 From 85cf9dba92152bb4edec118b2f4f0be1ae7fdcab Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:20 +0200 Subject: perf_counter, x86: change and remove pmu initialization checks Some functions are only called if the pmu was proper initialized. That initalization checks can be removed. The way to check initialization changed too. Now, the pointer to the interrupt handler is checked. If it exists the pmu is initialized. This also removes a static variable and uses struct x86_pmu as only data source for the check. [ Impact: simplify code ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-24-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 34 +++++++++++++--------------------- 1 file changed, 13 insertions(+), 21 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index df9012b..2d3681b 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -23,7 +23,6 @@ #include #include -static bool perf_counters_initialized __read_mostly; static u64 perf_counter_mask __read_mostly; struct cpu_hw_counters { @@ -227,6 +226,11 @@ static void hw_perf_counter_destroy(struct perf_counter *counter) } } +static inline int x86_pmu_initialized(void) +{ + return x86_pmu.handle_irq != NULL; +} + /* * Setup the hardware configuration for a given hw_event_type */ @@ -240,8 +244,8 @@ static int __hw_perf_counter_init(struct perf_counter *counter) if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) return -ENOSYS; - if (unlikely(!perf_counters_initialized)) - return -EINVAL; + if (!x86_pmu_initialized()) + return -ENODEV; err = 0; if (atomic_inc_not_zero(&num_counters)) { @@ -348,9 +352,8 @@ static u64 amd_pmu_save_disable_all(void) u64 hw_perf_save_disable(void) { - if (unlikely(!perf_counters_initialized)) + if (!x86_pmu_initialized()) return 0; - return x86_pmu.save_disable_all(); } /* @@ -388,9 +391,8 @@ static void amd_pmu_restore_all(u64 ctrl) void hw_perf_restore(u64 ctrl) { - if (unlikely(!perf_counters_initialized)) + if (!x86_pmu_initialized()) return; - x86_pmu.restore_all(ctrl); } /* @@ -402,8 +404,6 @@ static inline u64 intel_pmu_get_status(u64 mask) { u64 status; - if (unlikely(!perf_counters_initialized)) - return 0; rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); return status; @@ -417,10 +417,6 @@ static inline void intel_pmu_ack_status(u64 ack) static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) { int err; - - if (unlikely(!perf_counters_initialized)) - return; - err = checking_wrmsrl(hwc->config_base + idx, hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE); } @@ -428,10 +424,6 @@ static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) static inline void x86_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) { int err; - - if (unlikely(!perf_counters_initialized)) - return; - err = checking_wrmsrl(hwc->config_base + idx, hwc->config); } @@ -787,10 +779,10 @@ void perf_counter_unthrottle(void) { struct cpu_hw_counters *cpuc; - if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) + if (!x86_pmu_initialized()) return; - if (unlikely(!perf_counters_initialized)) + if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) return; cpuc = &__get_cpu_var(cpu_hw_counters); @@ -829,8 +821,9 @@ void perf_counters_lapic_init(int nmi) { u32 apic_val; - if (!perf_counters_initialized) + if (!x86_pmu_initialized()) return; + /* * Enable the performance counter vector in the APIC LVT: */ @@ -988,7 +981,6 @@ void __init init_hw_perf_counters(void) ((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED; pr_info("... counter mask: %016Lx\n", perf_counter_mask); - perf_counters_initialized = true; perf_counters_lapic_init(0); register_die_notifier(&perf_counter_nmi_notifier); -- cgit v1.1 From a29aa8a7ff93e4196d558036928597e68337dd8d Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:21 +0200 Subject: perf_counter, x86: implement the interrupt handler for AMD cpus This patch implements the interrupt handler for AMD performance counters. In difference to the Intel pmu, there is no single status register and also there are no fixed counters. This makes the handler very different and it is useful to make the handler vendor specific. To check if a counter is overflowed the upper bit of the counter is checked. Only counters where the active bit is set are checked. With this patch throttling is enabled for AMD performance counters. This patch also reenables Linux performance counters on AMD cpus. [ Impact: re-enable perfcounters on AMD CPUs ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-25-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 45 +++++++++++++++++++++++++++++++------- 1 file changed, 37 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 2d3681b..f4d59d4 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -240,10 +240,6 @@ static int __hw_perf_counter_init(struct perf_counter *counter) struct hw_perf_counter *hwc = &counter->hw; int err; - /* disable temporarily */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) - return -ENOSYS; - if (!x86_pmu_initialized()) return -ENODEV; @@ -773,7 +769,43 @@ out: return ret; } -static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) { return 0; } +static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) +{ + int cpu = smp_processor_id(); + struct cpu_hw_counters *cpuc = &per_cpu(cpu_hw_counters, cpu); + u64 val; + int handled = 0; + struct perf_counter *counter; + struct hw_perf_counter *hwc; + int idx; + + ++cpuc->interrupts; + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + if (!test_bit(idx, cpuc->active)) + continue; + counter = cpuc->counters[idx]; + hwc = &counter->hw; + x86_perf_counter_update(counter, hwc, idx); + val = atomic64_read(&hwc->prev_count); + if (val & (1ULL << (x86_pmu.counter_bits - 1))) + continue; + /* counter overflow */ + x86_perf_counter_set_period(counter, hwc, idx); + handled = 1; + inc_irq_stat(apic_perf_irqs); + if (perf_counter_overflow(counter, nmi, regs, 0)) + amd_pmu_disable_counter(hwc, idx); + else if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) + /* + * do not reenable when throttled, but reload + * the register + */ + amd_pmu_disable_counter(hwc, idx); + else if (counter->state == PERF_COUNTER_STATE_ACTIVE) + amd_pmu_enable_counter(hwc, idx); + } + return handled; +} void perf_counter_unthrottle(void) { @@ -782,9 +814,6 @@ void perf_counter_unthrottle(void) if (!x86_pmu_initialized()) return; - if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) - return; - cpuc = &__get_cpu_var(cpu_hw_counters); if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) { if (printk_ratelimit()) -- cgit v1.1 From 4b7bfd0d276da3a006d37e85d3cf900d7a14ae2a Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:22 +0200 Subject: perf_counter, x86: return raw count with x86_perf_counter_update() To check on AMD cpus if a counter overflows, the upper bit of the raw counter value must be checked. This value is already internally available in x86_perf_counter_update(). Now, the value is returned so that it can be used directly to check for overflows. [ Impact: micro-optimization ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-26-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index f4d59d4..a8a53ab 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -132,7 +132,7 @@ static u64 amd_pmu_raw_event(u64 event) * Can only be executed on the CPU where the counter is active. * Returns the delta events processed. */ -static void +static u64 x86_perf_counter_update(struct perf_counter *counter, struct hw_perf_counter *hwc, int idx) { @@ -165,6 +165,8 @@ again: atomic64_add(delta, &counter->count); atomic64_sub(delta, &hwc->period_left); + + return new_raw_count; } static atomic_t num_counters; @@ -785,8 +787,7 @@ static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) continue; counter = cpuc->counters[idx]; hwc = &counter->hw; - x86_perf_counter_update(counter, hwc, idx); - val = atomic64_read(&hwc->prev_count); + val = x86_perf_counter_update(counter, hwc, idx); if (val & (1ULL << (x86_pmu.counter_bits - 1))) continue; /* counter overflow */ -- cgit v1.1 From c619b8ffb1cec6a431687a35695dc6fd292a79e6 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:23 +0200 Subject: perf_counter, x86: introduce max_period variable In x86 pmus the allowed counter period to programm differs. This introduces a max_period value and allows the generic implementation for all models to check the max period. [ Impact: generalize code ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-27-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index a8a53ab..4b8715b 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -54,6 +54,7 @@ struct x86_pmu { int num_counters_fixed; int counter_bits; u64 counter_mask; + u64 max_period; }; static struct x86_pmu x86_pmu __read_mostly; @@ -279,14 +280,8 @@ static int __hw_perf_counter_init(struct perf_counter *counter) hwc->nmi = 1; hwc->irq_period = hw_event->irq_period; - /* - * Intel PMCs cannot be accessed sanely above 32 bit width, - * so we install an artificial 1<<31 period regardless of - * the generic counter period: - */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) - if ((s64)hwc->irq_period <= 0 || hwc->irq_period > 0x7FFFFFFF) - hwc->irq_period = 0x7FFFFFFF; + if ((s64)hwc->irq_period <= 0 || hwc->irq_period > x86_pmu.max_period) + hwc->irq_period = x86_pmu.max_period; atomic64_set(&hwc->period_left, hwc->irq_period); @@ -910,6 +905,12 @@ static struct x86_pmu intel_pmu = { .event_map = intel_pmu_event_map, .raw_event = intel_pmu_raw_event, .max_events = ARRAY_SIZE(intel_perfmon_event_map), + /* + * Intel PMCs cannot be accessed sanely above 32 bit width, + * so we install an artificial 1<<31 period regardless of + * the generic counter period: + */ + .max_period = (1ULL << 31) - 1, }; static struct x86_pmu amd_pmu = { @@ -927,6 +928,8 @@ static struct x86_pmu amd_pmu = { .num_counters = 4, .counter_bits = 48, .counter_mask = (1ULL << 48) - 1, + /* use highest bit to detect overflow */ + .max_period = (1ULL << 47) - 1, }; static int intel_pmu_init(void) @@ -999,6 +1002,7 @@ void __init init_hw_perf_counters(void) perf_max_counters = x86_pmu.num_counters; pr_info("... value mask: %016Lx\n", x86_pmu.counter_mask); + pr_info("... max period: %016Lx\n", x86_pmu.max_period); if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) { x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED; -- cgit v1.1 From ef7b3e09ffdcd5200aea9523f6b56d331d1c4fc0 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:24 +0200 Subject: perf_counter, x86: remove vendor check in fixed_mode_idx() The function fixed_mode_idx() is used generically. Now it checks the num_counters_fixed value instead of the vendor to decide if fixed counters are present. [ Impact: generalize code ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-28-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 4b8715b..d1c8036 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -542,7 +542,7 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc) { unsigned int event; - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) + if (!x86_pmu.num_counters_fixed) return -1; if (unlikely(hwc->nmi)) -- cgit v1.1 From 19d84dab55a383d75c885b5c1a618f5ead96f2f6 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:25 +0200 Subject: perf_counter, x86: remove unused function argument in intel_pmu_get_status() The mask argument is unused and thus can be removed. [ Impact: cleanup ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-29-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index d1c8036..856b0b8 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -393,7 +393,7 @@ void hw_perf_restore(u64 ctrl) */ EXPORT_SYMBOL_GPL(hw_perf_restore); -static inline u64 intel_pmu_get_status(u64 mask) +static inline u64 intel_pmu_get_status(void) { u64 status; @@ -728,7 +728,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi) cpuc->throttle_ctrl = intel_pmu_save_disable_all(); - status = intel_pmu_get_status(cpuc->throttle_ctrl); + status = intel_pmu_get_status(); if (!status) goto out; @@ -753,7 +753,7 @@ again: /* * Repeat if there is more work to be done: */ - status = intel_pmu_get_status(cpuc->throttle_ctrl); + status = intel_pmu_get_status(); if (status) goto again; out: -- cgit v1.1 From 98144511427c192e4249ff66a3f9debc55c59411 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 29 Apr 2009 14:52:50 +0200 Subject: perf_counter: add/update copyrights Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 856b0b8..47e563b 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -1,10 +1,11 @@ /* * Performance counter x86 architecture code * - * Copyright(C) 2008 Thomas Gleixner - * Copyright(C) 2008 Red Hat, Inc., Ingo Molnar - * Copyright(C) 2009 Jaswinder Singh Rajput - * Copyright(C) 2009 Advanced Micro Devices, Inc., Robert Richter + * Copyright (C) 2008 Thomas Gleixner + * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2009 Jaswinder Singh Rajput + * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter + * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra * * For licencing details see kernel-base/COPYING */ -- cgit v1.1 From 43f6201a22dbf1c5abe1cab96b49bd56fa9df8f4 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 16:55:56 +0200 Subject: perf_counter, x86: rename bitmasks to ->used_mask and ->active_mask Standardize on explicitly mentioning '_mask' in fields that are not plain flags but masks. This avoids typos like: if (cpuc->used) (which could easily slip through review unnoticed), while if a typo looks like this: if (cpuc->used_mask) it might get noticed during review. [ Impact: cleanup ] Signed-off-by: Robert Richter Cc: Peter Zijlstra Cc: Paul Mackerras LKML-Reference: <1241016956-24648-1-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 47e563b..fc06f4d 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -28,8 +28,8 @@ static u64 perf_counter_mask __read_mostly; struct cpu_hw_counters { struct perf_counter *counters[X86_PMC_IDX_MAX]; - unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; - unsigned long active[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; unsigned long interrupts; u64 throttle_ctrl; int enabled; @@ -332,7 +332,7 @@ static u64 amd_pmu_save_disable_all(void) for (idx = 0; idx < x86_pmu.num_counters; idx++) { u64 val; - if (!test_bit(idx, cpuc->active)) + if (!test_bit(idx, cpuc->active_mask)) continue; rdmsrl(MSR_K7_EVNTSEL0 + idx, val); if (!(val & ARCH_PERFMON_EVENTSEL0_ENABLE)) @@ -373,7 +373,7 @@ static void amd_pmu_restore_all(u64 ctrl) for (idx = 0; idx < x86_pmu.num_counters; idx++) { u64 val; - if (!test_bit(idx, cpuc->active)) + if (!test_bit(idx, cpuc->active_mask)) continue; rdmsrl(MSR_K7_EVNTSEL0 + idx, val); if (val & ARCH_PERFMON_EVENTSEL0_ENABLE) @@ -576,7 +576,7 @@ static int x86_pmu_enable(struct perf_counter *counter) * Try to get the fixed counter, if that is already taken * then try to get a generic counter: */ - if (test_and_set_bit(idx, cpuc->used)) + if (test_and_set_bit(idx, cpuc->used_mask)) goto try_generic; hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; @@ -590,14 +590,14 @@ static int x86_pmu_enable(struct perf_counter *counter) } else { idx = hwc->idx; /* Try to get the previous generic counter again */ - if (test_and_set_bit(idx, cpuc->used)) { + if (test_and_set_bit(idx, cpuc->used_mask)) { try_generic: - idx = find_first_zero_bit(cpuc->used, + idx = find_first_zero_bit(cpuc->used_mask, x86_pmu.num_counters); if (idx == x86_pmu.num_counters) return -EAGAIN; - set_bit(idx, cpuc->used); + set_bit(idx, cpuc->used_mask); hwc->idx = idx; } hwc->config_base = x86_pmu.eventsel; @@ -609,7 +609,7 @@ try_generic: x86_pmu.disable(hwc, idx); cpuc->counters[idx] = counter; - set_bit(idx, cpuc->active); + set_bit(idx, cpuc->active_mask); x86_perf_counter_set_period(counter, hwc, idx); x86_pmu.enable(hwc, idx); @@ -643,7 +643,7 @@ void perf_counter_print_debug(void) pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow); pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed); } - pr_info("CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used); + pr_info("CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used_mask); for (idx = 0; idx < x86_pmu.num_counters; idx++) { rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl); @@ -677,7 +677,7 @@ static void x86_pmu_disable(struct perf_counter *counter) * Must be done before we disable, otherwise the nmi handler * could reenable again: */ - clear_bit(idx, cpuc->active); + clear_bit(idx, cpuc->active_mask); x86_pmu.disable(hwc, idx); /* @@ -692,7 +692,7 @@ static void x86_pmu_disable(struct perf_counter *counter) */ x86_perf_counter_update(counter, hwc, idx); cpuc->counters[idx] = NULL; - clear_bit(idx, cpuc->used); + clear_bit(idx, cpuc->used_mask); } /* @@ -741,7 +741,7 @@ again: struct perf_counter *counter = cpuc->counters[bit]; clear_bit(bit, (unsigned long *) &status); - if (!test_bit(bit, cpuc->active)) + if (!test_bit(bit, cpuc->active_mask)) continue; intel_pmu_save_and_restart(counter); @@ -779,7 +779,7 @@ static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) ++cpuc->interrupts; for (idx = 0; idx < x86_pmu.num_counters; idx++) { - if (!test_bit(idx, cpuc->active)) + if (!test_bit(idx, cpuc->active_mask)) continue; counter = cpuc->counters[idx]; hwc = &counter->hw; -- cgit v1.1 From 2b72394e4089643f11669d9610907a1442fe044a Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Tue, 28 Apr 2009 16:00:49 +0300 Subject: x86: move max_pfn_mapped and max_low_pfn_mapped to setup.c This patch moves the max_pfn_mapped and max_low_pfn_mapped global variables to kernel/setup.c where they're initialized. [ Impact: cleanup ] Signed-off-by: Pekka Enberg LKML-Reference: <1240923649.1982.21.camel@penberg-laptop> Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index b415843..0d77e56 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -112,6 +112,14 @@ #define ARCH_SETUP #endif +/* + * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. + * The direct mapping extends to max_pfn_mapped, so that we can directly access + * apertures, ACPI and other tables without having to play with fixmaps. + */ +unsigned long max_low_pfn_mapped; +unsigned long max_pfn_mapped; + RESERVE_BRK(dmi_alloc, 65536); unsigned int boot_cpu_id __read_mostly; -- cgit v1.1 From 12d161147f828192b5bcc33166f468a827832767 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 4 Apr 2009 21:01:10 +0000 Subject: x86: hookup sys_rt_tgsigqueueinfo Make the new sys_rt_tgsigqueueinfo available for x86. Signed-off-by: Thomas Gleixner --- arch/x86/kernel/syscall_table_32.S | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index ff5c873..734f92c 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -334,3 +334,4 @@ ENTRY(sys_call_table) .long sys_inotify_init1 .long sys_preadv .long sys_pwritev + .long sys_rt_tgsigqueueinfo /* 335 */ -- cgit v1.1 From 63a809a2dc53b91268dd915bbcbd425063893676 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 1 May 2009 12:23:17 +0200 Subject: perf_counter: fix nmi-watchdog interaction When we don't have any perf-counters active, don't act like we know what the NMI is for. [ Impact: fix hard hang with nmi_watchdog=2 ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090501102533.109867793@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index fc06f4d..d4c0cc9 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -871,6 +871,9 @@ perf_counter_nmi_handler(struct notifier_block *self, struct pt_regs *regs; int ret; + if (!atomic_read(&num_counters)) + return NOTIFY_DONE; + switch (cmd) { case DIE_NMI: case DIE_NMI_IPI: -- cgit v1.1 From 15e957d08dd4a841359cfec59ecb74041e0097aa Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 30 Apr 2009 01:17:50 -0700 Subject: x86/irq: use move_irq_desc() in create_irq_nr() move_irq_desc() will try to move irq_desc to the home node if the allocated one is not correct, in create_irq_nr(). ( This can happen on devices that are on different nodes that are using MSI, when drivers are loaded and unloaded randomly. ) v2: fix non-smp build v3: add NUMA_IRQ_DESC to eliminate #ifdefs [ Impact: improve irq descriptor locality on NUMA systems ] Signed-off-by: Yinghai Lu Cc: Andrew Morton Cc: Suresh Siddha Cc: "Eric W. Biederman" Cc: Rusty Russell LKML-Reference: <49F95EAE.2050903@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 9cd4806..e583291fe 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3197,11 +3197,7 @@ unsigned int create_irq_nr(unsigned int irq_want, int node) if (cfg_new->vector != 0) continue; -#ifdef CONFIG_NUMA_IRQ_DESC - /* different node ?*/ - if (desc_new->node != node) - desc = move_irq_desc(desc, node); -#endif + desc_new = move_irq_desc(desc_new, node); if (__assign_irq_vector(new, cfg_new, apic->target_cpus()) == 0) irq = new; -- cgit v1.1 From 6f0aced639d346e5f54eea9fcb2784b633493d09 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Fri, 1 May 2009 23:54:25 +0400 Subject: x86, apic: use pr_ macro Replace recenly appeared printk with pr_ macro (the file already use a lot of them). [ Impact: cleanup ] Signed-off-by: Cyrill Gorcunov LKML-Reference: <20090501195425.GB4633@lenovo> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 28f747d..e258bed 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -2191,7 +2191,7 @@ static int __cpuinit set_multi(const struct dmi_system_id *d) { if (multi) return 0; - printk(KERN_INFO "APIC: %s detected, Multi Chassis\n", d->ident); + pr_info("APIC: %s detected, Multi Chassis\n", d->ident); multi = 1; return 0; } -- cgit v1.1 From 1cbac972ba28e706fa9ce4d4c81830040bc811ee Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sat, 2 May 2009 13:39:56 +0400 Subject: x86: uv io-apic - use BUILD_BUG_ON instead of BUG_ON The expression is known to be true/false at compilation time so we're allowed to use build-time instead of run-time check. Also align 'entry' items assignment. [ Impact: shrink kernel a bit, cleanup ] Signed-off-by: Cyrill Gorcunov Cc: Jack Steiner LKML-Reference: <20090502093956.GB4791@lenovo> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 8aef5f9..a80335b 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3749,6 +3749,8 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, unsigned long flags; int err; + BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long)); + cfg = irq_cfg(irq); err = assign_irq_vector(irq, cfg, eligible_cpu); @@ -3762,15 +3764,13 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, mmr_value = 0; entry = (struct uv_IO_APIC_route_entry *)&mmr_value; - BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long)); - - entry->vector = cfg->vector; - entry->delivery_mode = apic->irq_delivery_mode; - entry->dest_mode = apic->irq_dest_mode; - entry->polarity = 0; - entry->trigger = 0; - entry->mask = 0; - entry->dest = apic->cpu_mask_to_apicid(eligible_cpu); + entry->vector = cfg->vector; + entry->delivery_mode = apic->irq_delivery_mode; + entry->dest_mode = apic->irq_dest_mode; + entry->polarity = 0; + entry->trigger = 0; + entry->mask = 0; + entry->dest = apic->cpu_mask_to_apicid(eligible_cpu); mmr_pnode = uv_blade_to_pnode(mmr_blade); uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); @@ -3788,10 +3788,10 @@ void arch_disable_uv_irq(int mmr_blade, unsigned long mmr_offset) struct uv_IO_APIC_route_entry *entry; int mmr_pnode; + BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long)); + mmr_value = 0; entry = (struct uv_IO_APIC_route_entry *)&mmr_value; - BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long)); - entry->mask = 1; mmr_pnode = uv_blade_to_pnode(mmr_blade); -- cgit v1.1 From 9a8709d44139748fe2e0ab56d20d8c384c8b65ad Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sat, 2 May 2009 00:25:11 +0400 Subject: x86: uv - prevent NULL dereference in uv_system_init() We may reach NULL dereference oops if kmalloc failed. Prevent it with explicit BUG_ON. [ Impact: more controlled assert in 'impossible' scenario ] Signed-off-by: Cyrill Gorcunov Acked-by: Jack Steiner LKML-Reference: <20090501202511.GE4633@lenovo> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/x2apic_uv_x.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 873bf71..9d9e228 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -569,15 +569,18 @@ void __init uv_system_init(void) bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades(); uv_blade_info = kmalloc(bytes, GFP_KERNEL); + BUG_ON(!uv_blade_info); get_lowmem_redirect(&lowmem_redir_base, &lowmem_redir_size); bytes = sizeof(uv_node_to_blade[0]) * num_possible_nodes(); uv_node_to_blade = kmalloc(bytes, GFP_KERNEL); + BUG_ON(!uv_node_to_blade); memset(uv_node_to_blade, 255, bytes); bytes = sizeof(uv_cpu_to_blade[0]) * num_possible_cpus(); uv_cpu_to_blade = kmalloc(bytes, GFP_KERNEL); + BUG_ON(!uv_cpu_to_blade); memset(uv_cpu_to_blade, 255, bytes); blade = 0; -- cgit v1.1 From ba77813a2a22d631fe5bc0bf1ec0d11350544b70 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 4 May 2009 18:47:44 +0200 Subject: perf_counter: x86: fixup nmi_watchdog vs perf_counter boo-boo Invert the atomic_inc_not_zero() test so that we will indeed detect the first activation. Also rename the global num_counters, since its easy to confuse with x86_pmu.num_counters. [ Impact: fix non-working perfcounters on AMD CPUs, cleanup ] Signed-off-by: Peter Zijlstra LKML-Reference: <1241455664.7620.4938.camel@twins> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index d4c0cc9..196b58f 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -171,7 +171,7 @@ again: return new_raw_count; } -static atomic_t num_counters; +static atomic_t active_counters; static DEFINE_MUTEX(pmc_reserve_mutex); static bool reserve_pmc_hardware(void) @@ -224,7 +224,7 @@ static void release_pmc_hardware(void) static void hw_perf_counter_destroy(struct perf_counter *counter) { - if (atomic_dec_and_mutex_lock(&num_counters, &pmc_reserve_mutex)) { + if (atomic_dec_and_mutex_lock(&active_counters, &pmc_reserve_mutex)) { release_pmc_hardware(); mutex_unlock(&pmc_reserve_mutex); } @@ -248,12 +248,12 @@ static int __hw_perf_counter_init(struct perf_counter *counter) return -ENODEV; err = 0; - if (atomic_inc_not_zero(&num_counters)) { + if (!atomic_inc_not_zero(&active_counters)) { mutex_lock(&pmc_reserve_mutex); - if (atomic_read(&num_counters) == 0 && !reserve_pmc_hardware()) + if (atomic_read(&active_counters) == 0 && !reserve_pmc_hardware()) err = -EBUSY; else - atomic_inc(&num_counters); + atomic_inc(&active_counters); mutex_unlock(&pmc_reserve_mutex); } if (err) @@ -280,7 +280,7 @@ static int __hw_perf_counter_init(struct perf_counter *counter) if (capable(CAP_SYS_ADMIN) && hw_event->nmi) hwc->nmi = 1; - hwc->irq_period = hw_event->irq_period; + hwc->irq_period = hw_event->irq_period; if ((s64)hwc->irq_period <= 0 || hwc->irq_period > x86_pmu.max_period) hwc->irq_period = x86_pmu.max_period; @@ -871,7 +871,7 @@ perf_counter_nmi_handler(struct notifier_block *self, struct pt_regs *regs; int ret; - if (!atomic_read(&num_counters)) + if (!atomic_read(&active_counters)) return NOTIFY_DONE; switch (cmd) { -- cgit v1.1 From 066d7dea32c9bffe6decc0abe465627656cdd84e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 4 May 2009 19:04:09 +0200 Subject: perf_counter: fix fixed-purpose counter support on v2 Intel-PERFMON Fixed-purpose counters stopped working in a simple 'perf stat ls' run: cache references cache misses Due to: ef7b3e0: perf_counter, x86: remove vendor check in fixed_mode_idx() Which made x86_pmu.num_counters_fixed matter: if it's nonzero, the fixed-purpose counters are utilized. But on v2 perfmon this field is not set (despite there being fixed-purpose PMCs). So add a quirk to set the number of fixed-purpose counters to at least three. [ Impact: add quirk for three fixed-purpose counters on certain Intel CPUs ] Cc: Robert Richter Cc: Paul Mackerras Cc: Peter Zijlstra LKML-Reference: <1241002046-8832-28-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 196b58f..a6878b0 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -962,7 +962,13 @@ static int intel_pmu_init(void) x86_pmu = intel_pmu; x86_pmu.version = version; x86_pmu.num_counters = eax.split.num_counters; - x86_pmu.num_counters_fixed = edx.split.num_counters_fixed; + + /* + * Quirk: v2 perfmon does not report fixed-purpose counters, so + * assume at least 3 counters: + */ + x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3); + x86_pmu.counter_bits = eax.split.bit_width; x86_pmu.counter_mask = (1ULL << eax.split.bit_width) - 1; -- cgit v1.1 From 975e5f45500dff6d15c0001bb662e9aac0ce0076 Mon Sep 17 00:00:00 2001 From: Samuel Bronson Date: Wed, 6 May 2009 22:27:55 -0400 Subject: x86: use symbolic name for VM86_SIGNAL when used as vm86 default return This code has apparently used "0" and not VM86_SIGNAL since Linux 1.1.9, when Linus added VM86_SIGNAL to vm86.h. This patch changes the code to use the symbolic name. The magic 0 tripped me up in trying to extend the vm86(2) manpage to actually explain vm86()'s interface -- my greps for VM86_SIGNAL came up fruitless. [ Impact: cleanup; no object code change ] Signed-off-by: Samuel Bronson Signed-off-by: H. Peter Anvin --- arch/x86/kernel/vm86_32.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index d7ac84e..b8035a0 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -318,9 +318,9 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk } /* - * Save old state, set default return value (%ax) to 0 + * Save old state, set default return value (%ax) to 0 (VM86_SIGNAL) */ - info->regs32->ax = 0; + info->regs32->ax = VM86_SIGNAL; tsk->thread.saved_sp0 = tsk->thread.sp0; tsk->thread.saved_fs = info->regs32->fs; tsk->thread.saved_gs = get_user_gs(info->regs32); -- cgit v1.1 From 643bec956544d376b7c2a80a3d5c3d0bf94da8d3 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 7 May 2009 09:12:50 +0200 Subject: x86: clean up arch/x86/kernel/tsc_sync.c a bit - remove unused define - make the lock variable definition stand out some more - convert KERN_* to pr_info() / pr_warning() [ Impact: cleanup ] LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/tsc_sync.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index bf36328..027b5b4 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -34,6 +34,7 @@ static __cpuinitdata atomic_t stop_count; * of a critical section, to be able to prove TSC time-warps: */ static __cpuinitdata raw_spinlock_t sync_lock = __RAW_SPIN_LOCK_UNLOCKED; + static __cpuinitdata cycles_t last_tsc; static __cpuinitdata cycles_t max_warp; static __cpuinitdata int nr_warps; @@ -113,13 +114,12 @@ void __cpuinit check_tsc_sync_source(int cpu) return; if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) { - printk(KERN_INFO - "Skipping synchronization checks as TSC is reliable.\n"); + pr_info("Skipping synchronization checks as TSC is reliable.\n"); return; } - printk(KERN_INFO "checking TSC synchronization [CPU#%d -> CPU#%d]:", - smp_processor_id(), cpu); + pr_info("checking TSC synchronization [CPU#%d -> CPU#%d]:", + smp_processor_id(), cpu); /* * Reset it - in case this is a second bootup: @@ -143,8 +143,8 @@ void __cpuinit check_tsc_sync_source(int cpu) if (nr_warps) { printk("\n"); - printk(KERN_WARNING "Measured %Ld cycles TSC warp between CPUs," - " turning off TSC clock.\n", max_warp); + pr_warning("Measured %Ld cycles TSC warp between CPUs, " + "turning off TSC clock.\n", max_warp); mark_tsc_unstable("check_tsc_sync_source failed"); } else { printk(" passed.\n"); @@ -195,5 +195,3 @@ void __cpuinit check_tsc_sync_target(void) while (atomic_read(&stop_count) != cpus) cpu_relax(); } -#undef NR_LOOPS - -- cgit v1.1 From 6cac5a924668a56c7ccefc345805f1fe0536a90e Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Sun, 29 Mar 2009 19:56:29 -0700 Subject: xen/x86-64: fix breakpoints and hardware watchpoints Native x86-64 uses the IST mechanism to run int3 and debug traps on an alternative stack. Xen does not do this, and so the frames were being misinterpreted by the ptrace code. This change special-cases these two exceptions by using Xen variants which run on the normal kernel stack properly. Impact: avoid crash or bad data when IST trap is invoked under Xen Signed-off-by: Jeremy Fitzhardinge --- arch/x86/kernel/entry_64.S | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 38946c6..bb01ce0 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1379,6 +1379,11 @@ END(xen_failsafe_callback) paranoidzeroentry_ist debug do_debug DEBUG_STACK paranoidzeroentry_ist int3 do_int3 DEBUG_STACK paranoiderrorentry stack_segment do_stack_segment +#ifdef CONFIG_XEN +zeroentry xen_debug do_debug +zeroentry xen_int3 do_int3 +errorentry xen_stack_segment do_stack_segment +#endif errorentry general_protection do_general_protection errorentry page_fault do_page_fault #ifdef CONFIG_X86_MCE -- cgit v1.1 From 778dedae0cb76a441145f3a0c5d59fcb3ba296d5 Mon Sep 17 00:00:00 2001 From: Huang Weiyi Date: Sat, 9 May 2009 12:54:34 +0800 Subject: x86: mce: remove duplicated #include Remove duplicated #include in arch/x86/kernel/cpu/mcheck/mce_intel_64.c. [ Impact: cleanup ] Signed-off-by: Huang Weiyi Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce_intel_64.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c index d6b72df..aedf976 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c @@ -15,7 +15,6 @@ #include #include #include -#include asmlinkage void smp_thermal_interrupt(void) { -- cgit v1.1 From 45fbe3ee01b8e463b28c2751b5dcc0cbdc142d90 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 6 May 2009 08:06:44 -0700 Subject: x86, e820, pci: reserve extra free space near end of RAM The point is to take all RAM resources we have, and _after_ we've added all the resources we've seen in the E820 tree, we then _also_ try to add fake reserved entries for any "round up to X" at the end of the RAM resources. [ Impact: improve PCI mem-resource allocation robustness, protect "stolen RAM" ] Reported-by: Yannick Roehlly Acked-by: Jesse Barnes Signed-off-by: Yinghai Lu Cc: Ivan Kokshaysky Cc: Andrew Morton Cc: yannick.roehlly@free.fr LKML-Reference: <4A01A784.2050407@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/e820.c | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 0062813..a2335d9 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -1371,6 +1371,23 @@ void __init e820_reserve_resources(void) } } +/* How much should we pad RAM ending depending on where it is? */ +static unsigned long ram_alignment(resource_size_t pos) +{ + unsigned long mb = pos >> 20; + + /* To 64kB in the first megabyte */ + if (!mb) + return 64*1024; + + /* To 1MB in the first 16MB */ + if (mb < 16) + return 1024*1024; + + /* To 32MB for anything above that */ + return 32*1024*1024; +} + void __init e820_reserve_resources_late(void) { int i; @@ -1382,6 +1399,24 @@ void __init e820_reserve_resources_late(void) insert_resource_expand_to_fit(&iomem_resource, res); res++; } + + /* + * Try to bump up RAM regions to reasonable boundaries to + * avoid stolen RAM: + */ + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *entry = &e820_saved.map[i]; + resource_size_t start, end; + + if (entry->type != E820_RAM) + continue; + start = entry->addr + entry->size; + end = round_up(start, ram_alignment(start)); + if (start == end) + continue; + reserve_region_with_split(&iomem_resource, start, + end - 1, "RAM buffer"); + } } char *__init default_machine_specific_memory_setup(void) -- cgit v1.1 From 5d423ccd7ba4285f1084e91b26805e1d0ae978ed Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 6 May 2009 08:07:52 -0700 Subject: x86/pci: remove rounding quirk from e820_setup_gap() Now that the e820 code explicitly reserves 'potentially dangerous' free physical memory address space to protect ACPI stolen RAM, there's no need for the rounding quirk in the PCI allocator anymore. Also, this quirk was open-ended iteration that could end up reserving a lot of free space and potentially breaking drivers - such as the one reported by Yannick Roehlly where there's a PCI device with a large memory resource. So remove it. [ Impact: make more of the PCI hole available for assigning pci devices ] Reported-by: Yannick Roehlly Signed-off-by: Yinghai Lu Acked-by: Jesse Barnes Cc: Ivan Kokshaysky Cc: Linus Torvalds Cc: Andrew Morton LKML-Reference: <4A01A7C8.5090701@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/e820.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index a2335d9..7271fa3 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -617,7 +617,7 @@ __init int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize, */ __init void e820_setup_gap(void) { - unsigned long gapstart, gapsize, round; + unsigned long gapstart, gapsize; int found; gapstart = 0x10000000; @@ -635,14 +635,9 @@ __init void e820_setup_gap(void) #endif /* - * See how much we want to round up: start off with - * rounding to the next 1MB area. + * e820_reserve_resources_late protect stolen RAM already */ - round = 0x100000; - while ((gapsize >> 4) > round) - round += round; - /* Fun with two's complement */ - pci_mem_start = (gapstart + round) & -round; + pci_mem_start = gapstart; printk(KERN_INFO "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n", -- cgit v1.1 From b9e0353fc85dab4ef5ebcef2bd09ebc4ce6d5a7b Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 6 May 2009 10:05:32 -0700 Subject: x86/acpi: remove irq-compression trick on 32-bit We already have a per cpu vector on 32-bit via recent changes, and don't need this trick any more (which trick obfuscates the real GSI mappings and which only triggers on larger systems to begin with): On 3 ioapic system (24 per ioapic) before patch I got: ACPI: PCI Interrupt Link [ILSB] enabled at IRQ 71 IOAPIC[2]: Set routing entry (10-23 -> 0xa9 -> IRQ 64 Mode:1 Active:1) pci 0000:80:01.1: PCI INT A -> Link[ILSB] -> GSI 71 (level, low) -> IRQ 64 ACPI: PCI Interrupt Link [LE5B] enabled at IRQ 67 IOAPIC[2]: Set routing entry (10-19 -> 0xb1 -> IRQ 65 Mode:1 Active:1) pci 0000:83:00.0: PCI INT B -> Link[LE5B] -> GSI 67 (level, low) -> IRQ 65 ACPI: PCI Interrupt Link [LE5A] enabled at IRQ 66 IOAPIC[2]: Set routing entry (10-18 -> 0xb9 -> IRQ 66 Mode:1 Active:1) pci 0000:83:00.1: PCI INT A -> Link[LE5A] -> GSI 66 (level, low) -> IRQ 66 ACPI: PCI Interrupt Link [LE5D] enabled at IRQ 65 IOAPIC[2]: Set routing entry (10-17 -> 0xc1 -> IRQ 67 Mode:1 Active:1) pci 0000:84:00.0: PCI INT B -> Link[LE5D] -> GSI 65 (level, low) -> IRQ 67 ACPI: PCI Interrupt Link [LE5C] enabled at IRQ 64 IOAPIC[2]: Set routing entry (10-16 -> 0xc9 -> IRQ 68 Mode:1 Active:1) pci 0000:84:00.1: PCI INT A -> Link[LE5C] -> GSI 64 (level, low) -> IRQ 68 pci 0000:87:00.0: PCI INT B -> Link[LE5A] -> GSI 66 (level, low) -> IRQ 66 pci 0000:87:00.1: PCI INT A -> Link[LE5D] -> GSI 65 (level, low) -> IRQ 67 pci 0000:88:00.0: PCI INT B -> Link[LE5C] -> GSI 64 (level, low) -> IRQ 68 pci 0000:88:00.1: PCI INT A -> Link[LE5B] -> GSI 67 (level, low) -> IRQ 65 pci 0000:8b:00.0: PCI INT B -> Link[LE5A] -> GSI 66 (level, low) -> IRQ 66 pci 0000:8b:00.1: PCI INT A -> Link[LE5D] -> GSI 65 (level, low) -> IRQ 67 pci 0000:8c:00.0: PCI INT B -> Link[LE5C] -> GSI 64 (level, low) -> IRQ 68 pci 0000:8c:00.1: PCI INT A -> Link[LE5B] -> GSI 67 (level, low) -> IRQ 65 after the patch we get: ACPI: PCI Interrupt Link [ILSB] enabled at IRQ 71 IOAPIC[2]: Set routing entry (10-23 -> 0xa9 -> IRQ 71 Mode:1 Active:1) pci 0000:80:01.1: PCI INT A -> Link[ILSB] -> GSI 71 (level, low) -> IRQ 71 ACPI: PCI Interrupt Link [LE5B] enabled at IRQ 67 IOAPIC[2]: Set routing entry (10-19 -> 0xb1 -> IRQ 67 Mode:1 Active:1) pci 0000:83:00.0: PCI INT B -> Link[LE5B] -> GSI 67 (level, low) -> IRQ 67 ACPI: PCI Interrupt Link [LE5A] enabled at IRQ 66 IOAPIC[2]: Set routing entry (10-18 -> 0xb9 -> IRQ 66 Mode:1 Active:1) pci 0000:83:00.1: PCI INT A -> Link[LE5A] -> GSI 66 (level, low) -> IRQ 66 ACPI: PCI Interrupt Link [LE5D] enabled at IRQ 65 IOAPIC[2]: Set routing entry (10-17 -> 0xc1 -> IRQ 65 Mode:1 Active:1) pci 0000:84:00.0: PCI INT B -> Link[LE5D] -> GSI 65 (level, low) -> IRQ 65 ACPI: PCI Interrupt Link [LE5C] enabled at IRQ 64 IOAPIC[2]: Set routing entry (10-16 -> 0xc9 -> IRQ 64 Mode:1 Active:1) pci 0000:84:00.1: PCI INT A -> Link[LE5C] -> GSI 64 (level, low) -> IRQ 64 pci 0000:87:00.0: PCI INT B -> Link[LE5A] -> GSI 66 (level, low) -> IRQ 66 pci 0000:87:00.1: PCI INT A -> Link[LE5D] -> GSI 65 (level, low) -> IRQ 65 pci 0000:88:00.0: PCI INT B -> Link[LE5C] -> GSI 64 (level, low) -> IRQ 64 pci 0000:88:00.1: PCI INT A -> Link[LE5B] -> GSI 67 (level, low) -> IRQ 67 pci 0000:8b:00.0: PCI INT B -> Link[LE5A] -> GSI 66 (level, low) -> IRQ 66 pci 0000:8b:00.1: PCI INT A -> Link[LE5D] -> GSI 65 (level, low) -> IRQ 65 pci 0000:8c:00.0: PCI INT B -> Link[LE5C] -> GSI 64 (level, low) -> IRQ 64 pci 0000:8c:00.1: PCI INT A -> Link[LE5B] -> GSI 67 (level, low) -> IRQ 67 As it can be seen that GSIs now get mapped lineary. [ Impact: simplify irq number mapping on bigger 32-bit systems ] Signed-off-by: Yinghai Lu Cc: Andrew Morton Cc: Jesse Barnes Cc: Len Brown LKML-Reference: <4A01C35C.7060207@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/acpi/boot.c | 65 +++++---------------------------------------- 1 file changed, 7 insertions(+), 58 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 6ee96b5..fb5e882 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -1162,22 +1162,9 @@ int mp_register_gsi(struct device *dev, u32 gsi, int triggering, int polarity) { int ioapic; int ioapic_pin; -#ifdef CONFIG_X86_32 -#define MAX_GSI_NUM 4096 -#define IRQ_COMPRESSION_START 64 - - static int pci_irq = IRQ_COMPRESSION_START; - /* - * Mapping between Global System Interrupts, which - * represent all possible interrupts, and IRQs - * assigned to actual devices. - */ - static int gsi_to_irq[MAX_GSI_NUM]; -#else if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC) return gsi; -#endif /* Don't set up the ACPI SCI because it's already set up */ if (acpi_gbl_FADT.sci_interrupt == gsi) @@ -1196,66 +1183,28 @@ int mp_register_gsi(struct device *dev, u32 gsi, int triggering, int polarity) gsi = ioapic_renumber_irq(ioapic, gsi); #endif - /* - * Avoid pin reprogramming. PRTs typically include entries - * with redundant pin->gsi mappings (but unique PCI devices); - * we only program the IOAPIC on the first. - */ if (ioapic_pin > MP_MAX_IOAPIC_PIN) { printk(KERN_ERR "Invalid reference to IOAPIC pin " "%d-%d\n", mp_ioapic_routing[ioapic].apic_id, ioapic_pin); return gsi; } + + /* + * Avoid pin reprogramming. PRTs typically include entries + * with redundant pin->gsi mappings (but unique PCI devices); + * we only program the IOAPIC on the first. + */ if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) { pr_debug("Pin %d-%d already programmed\n", mp_ioapic_routing[ioapic].apic_id, ioapic_pin); -#ifdef CONFIG_X86_32 - return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]); -#else return gsi; -#endif } - set_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed); -#ifdef CONFIG_X86_32 - /* - * For GSI >= 64, use IRQ compression - */ - if ((gsi >= IRQ_COMPRESSION_START) - && (triggering == ACPI_LEVEL_SENSITIVE)) { - /* - * For PCI devices assign IRQs in order, avoiding gaps - * due to unused I/O APIC pins. - */ - int irq = gsi; - if (gsi < MAX_GSI_NUM) { - /* - * Retain the VIA chipset work-around (gsi > 15), but - * avoid a problem where the 8254 timer (IRQ0) is setup - * via an override (so it's not on pin 0 of the ioapic), - * and at the same time, the pin 0 interrupt is a PCI - * type. The gsi > 15 test could cause these two pins - * to be shared as IRQ0, and they are not shareable. - * So test for this condition, and if necessary, avoid - * the pin collision. - */ - gsi = pci_irq++; - /* - * Don't assign IRQ used by ACPI SCI - */ - if (gsi == acpi_gbl_FADT.sci_interrupt) - gsi = pci_irq++; - gsi_to_irq[irq] = gsi; - } else { - printk(KERN_ERR "GSI %u is too high\n", gsi); - return gsi; - } - } -#endif io_apic_set_pci_routing(dev, ioapic, ioapic_pin, gsi, triggering == ACPI_EDGE_SENSITIVE ? 0 : 1, polarity == ACPI_ACTIVE_HIGH ? 0 : 1); + return gsi; } -- cgit v1.1 From ee214558c2e959781a406e76c5b34364da638e1d Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 6 May 2009 10:07:07 -0700 Subject: x86: fix alloc_mptable() Fix the conditions when we stop updating the mptable due to running out of slots. [ Impact: fix memory corruption / non-working update_mptable boot parameter ] Signed-off-by: Yinghai Lu Cc: Andrew Morton Cc: Jesse Barnes Cc: Len Brown LKML-Reference: <4A01C3BB.1000609@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/mpparse.c | 25 +++++++++---------------- 1 file changed, 9 insertions(+), 16 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 70fd7e4..cd2a41a 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -870,24 +870,17 @@ static inline void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) {} #endif /* CONFIG_X86_IO_APIC */ -static int check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length, - int count) +static int +check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length, int count) { - if (!mpc_new_phys) { - pr_info("No spare slots, try to append...take your risk, " - "new mpc_length %x\n", count); - } else { - if (count <= mpc_new_length) - pr_info("No spare slots, try to append..., " - "new mpc_length %x\n", count); - else { - pr_err("mpc_new_length %lx is too small\n", - mpc_new_length); - return -1; - } + int ret = 0; + + if (!mpc_new_phys || count <= mpc_new_length) { + WARN(1, "update_mptable: No spare slots (length: %x)\n", count); + return -1; } - return 0; + return ret; } static int __init replace_intsrc_all(struct mpc_table *mpc, @@ -946,7 +939,7 @@ static int __init replace_intsrc_all(struct mpc_table *mpc, } else { struct mpc_intsrc *m = (struct mpc_intsrc *)mpt; count += sizeof(struct mpc_intsrc); - if (!check_slot(mpc_new_phys, mpc_new_length, count)) + if (check_slot(mpc_new_phys, mpc_new_length, count) < 0) goto out; assign_to_mpc_intsrc(&mp_irqs[i], m); mpc->length = count; -- cgit v1.1 From a31f82057ce6f7ced578d64c07a72ccbdc7336e4 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 6 May 2009 10:06:15 -0700 Subject: x86/acpi: call mp_config_acpi_gsi() in mp_register_gsi() The patch to call mp_config_acpi_gsi() from the ACPI IRQ registration code never got mainline because there were open discussions about it. This call is needed to properly update the kernel's copy of the mptable, when the update_mptable boot parameter is needed. Now that the dust has settled with the APIC unification, and since there were no objections when the patch was re-submitted, try this again. [ Impact: fix the update_mptable boot parameter ] Signed-off-by: Yinghai Lu Cc: Andrew Morton Cc: Jesse Barnes Cc: Len Brown LKML-Reference: <4A01C387.7090103@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/acpi/boot.c | 66 +++++++++++++++++++++++++++------------------ 1 file changed, 40 insertions(+), 26 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index fb5e882..8019ecf 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include @@ -1158,6 +1159,44 @@ void __init mp_config_acpi_legacy_irqs(void) } } +static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int triggering, + int polarity) +{ +#ifdef CONFIG_X86_MPPARSE + struct mpc_intsrc mp_irq; + struct pci_dev *pdev; + unsigned char number; + unsigned int devfn; + int ioapic; + u8 pin; + + if (!acpi_ioapic) + return 0; + if (!dev) + return 0; + if (dev->bus != &pci_bus_type) + return 0; + + pdev = to_pci_dev(dev); + number = pdev->bus->number; + devfn = pdev->devfn; + pin = pdev->pin; + /* print the entry should happen on mptable identically */ + mp_irq.type = MP_INTSRC; + mp_irq.irqtype = mp_INT; + mp_irq.irqflag = (triggering == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) | + (polarity == ACPI_ACTIVE_HIGH ? 1 : 3); + mp_irq.srcbus = number; + mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3); + ioapic = mp_find_ioapic(gsi); + mp_irq.dstapic = mp_ioapic_routing[ioapic].apic_id; + mp_irq.dstirq = mp_find_ioapic_pin(ioapic, gsi); + + save_mp_irq(&mp_irq); +#endif + return 0; +} + int mp_register_gsi(struct device *dev, u32 gsi, int triggering, int polarity) { int ioapic; @@ -1189,6 +1228,7 @@ int mp_register_gsi(struct device *dev, u32 gsi, int triggering, int polarity) ioapic_pin); return gsi; } + mp_config_acpi_gsi(dev, gsi, triggering, polarity); /* * Avoid pin reprogramming. PRTs typically include entries @@ -1208,32 +1248,6 @@ int mp_register_gsi(struct device *dev, u32 gsi, int triggering, int polarity) return gsi; } -int mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin, - u32 gsi, int triggering, int polarity) -{ -#ifdef CONFIG_X86_MPPARSE - struct mpc_intsrc mp_irq; - int ioapic; - - if (!acpi_ioapic) - return 0; - - /* print the entry should happen on mptable identically */ - mp_irq.type = MP_INTSRC; - mp_irq.irqtype = mp_INT; - mp_irq.irqflag = (triggering == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) | - (polarity == ACPI_ACTIVE_HIGH ? 1 : 3); - mp_irq.srcbus = number; - mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3); - ioapic = mp_find_ioapic(gsi); - mp_irq.dstapic = mp_ioapic_routing[ioapic].apic_id; - mp_irq.dstirq = mp_find_ioapic_pin(ioapic, gsi); - - save_mp_irq(&mp_irq); -#endif - return 0; -} - /* * Parse IOAPIC related entries in MADT * returns 0 on success, < 0 on error -- cgit v1.1 From bdfe8ac153546537ed24de69610ea781a734f785 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 6 May 2009 10:07:41 -0700 Subject: x86/acpi: move pin_programmed bit map to io_apic.c Prepare to call setup_io_apic_routing() in pcibios_irq_enable() also remove not needed member apic_id. [ Impact: clean up, prepare for future change ] Signed-off-by: Yinghai Lu Cc: Andrew Morton Cc: Jesse Barnes Cc: Len Brown LKML-Reference: <4A01C3DD.3050104@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/acpi/boot.c | 18 ++---------------- arch/x86/kernel/apic/io_apic.c | 25 ++++++++++++++++++++++++- 2 files changed, 26 insertions(+), 17 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 8019ecf..dcfbc3a 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -904,10 +904,8 @@ extern int es7000_plat; #endif static struct { - int apic_id; int gsi_base; int gsi_end; - DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1); } mp_ioapic_routing[MAX_IO_APICS]; int mp_find_ioapic(int gsi) @@ -996,7 +994,6 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base) * Build basic GSI lookup table to facilitate gsi->io_apic lookups * and to prevent reprogramming of IOAPIC pins (PCI GSIs). */ - mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].apicid; mp_ioapic_routing[idx].gsi_base = gsi_base; mp_ioapic_routing[idx].gsi_end = gsi_base + io_apic_get_redir_entries(idx); @@ -1189,7 +1186,7 @@ static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int triggering, mp_irq.srcbus = number; mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3); ioapic = mp_find_ioapic(gsi); - mp_irq.dstapic = mp_ioapic_routing[ioapic].apic_id; + mp_irq.dstapic = mp_ioapics[ioapic].apicid; mp_irq.dstirq = mp_find_ioapic_pin(ioapic, gsi); save_mp_irq(&mp_irq); @@ -1224,23 +1221,12 @@ int mp_register_gsi(struct device *dev, u32 gsi, int triggering, int polarity) if (ioapic_pin > MP_MAX_IOAPIC_PIN) { printk(KERN_ERR "Invalid reference to IOAPIC pin " - "%d-%d\n", mp_ioapic_routing[ioapic].apic_id, + "%d-%d\n", mp_ioapics[ioapic].apicid, ioapic_pin); return gsi; } mp_config_acpi_gsi(dev, gsi, triggering, polarity); - /* - * Avoid pin reprogramming. PRTs typically include entries - * with redundant pin->gsi mappings (but unique PCI devices); - * we only program the IOAPIC on the first. - */ - if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) { - pr_debug("Pin %d-%d already programmed\n", - mp_ioapic_routing[ioapic].apic_id, ioapic_pin); - return gsi; - } - set_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed); io_apic_set_pci_routing(dev, ioapic, ioapic_pin, gsi, triggering == ACPI_EDGE_SENSITIVE ? 0 : 1, polarity == ACPI_ACTIVE_HIGH ? 0 : 1); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 21c30e1..e279ae3 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3922,7 +3922,7 @@ int __init io_apic_get_version(int ioapic) } #endif -int io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, int irq, +static int __io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, int irq, int triggering, int polarity) { struct irq_desc *desc; @@ -3959,6 +3959,29 @@ int io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, int irq, return 0; } +static struct { + DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1); +} mp_ioapic_routing[MAX_IO_APICS]; + +int io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, int irq, + int triggering, int polarity) +{ + + /* + * Avoid pin reprogramming. PRTs typically include entries + * with redundant pin->gsi mappings (but unique PCI devices); + * we only program the IOAPIC on the first. + */ + if (test_bit(pin, mp_ioapic_routing[ioapic].pin_programmed)) { + pr_debug("Pin %d-%d already programmed\n", + mp_ioapics[ioapic].apicid, pin); + return 0; + } + set_bit(pin, mp_ioapic_routing[ioapic].pin_programmed); + + return __io_apic_set_pci_routing(dev, ioapic, pin, irq, + triggering, polarity); +} int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity) { -- cgit v1.1 From e20c06fd6950265a899edd96a02dc2e6ae2d1ce5 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 6 May 2009 10:08:22 -0700 Subject: x86/pci: add 4 more return parameters to IO_APIC_get_PCI_irq_vector() To prepare those params for pcibios_irq_enable() to call setup_io_apic_routing(). [ Impact: extend function call API to prepare for new functionality ] Signed-off-by: Yinghai Lu Acked-by: Jesse Barnes Cc: Len Brown Cc: Andrew Morton LKML-Reference: <4A01C406.2040303@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 107 +++++++++++++++++++++++------------------ 1 file changed, 59 insertions(+), 48 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index e279ae3..caf9dbd 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -873,54 +873,6 @@ static int __init find_isa_irq_apic(int irq, int type) return -1; } -/* - * Find a specific PCI IRQ entry. - * Not an __init, possibly needed by modules - */ -static int pin_2_irq(int idx, int apic, int pin); - -int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) -{ - int apic, i, best_guess = -1; - - apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n", - bus, slot, pin); - if (test_bit(bus, mp_bus_not_pci)) { - apic_printk(APIC_VERBOSE, "PCI BIOS passed nonexistent PCI bus %d!\n", bus); - return -1; - } - for (i = 0; i < mp_irq_entries; i++) { - int lbus = mp_irqs[i].srcbus; - - for (apic = 0; apic < nr_ioapics; apic++) - if (mp_ioapics[apic].apicid == mp_irqs[i].dstapic || - mp_irqs[i].dstapic == MP_APIC_ALL) - break; - - if (!test_bit(lbus, mp_bus_not_pci) && - !mp_irqs[i].irqtype && - (bus == lbus) && - (slot == ((mp_irqs[i].srcbusirq >> 2) & 0x1f))) { - int irq = pin_2_irq(i, apic, mp_irqs[i].dstirq); - - if (!(apic || IO_APIC_IRQ(irq))) - continue; - - if (pin == (mp_irqs[i].srcbusirq & 3)) - return irq; - /* - * Use the first all-but-pin matching entry as a - * best-guess fuzzy result for broken mptables. - */ - if (best_guess < 0) - best_guess = irq; - } - } - return best_guess; -} - -EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector); - #if defined(CONFIG_EISA) || defined(CONFIG_MCA) /* * EISA Edge/Level control register, ELCR @@ -1139,6 +1091,65 @@ static int pin_2_irq(int idx, int apic, int pin) return irq; } +/* + * Find a specific PCI IRQ entry. + * Not an __init, possibly needed by modules + */ +int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin, + int *ioapic, int *ioapic_pin, + int *trigger, int *polarity) +{ + int apic, i, best_guess = -1; + + apic_printk(APIC_DEBUG, + "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n", + bus, slot, pin); + if (test_bit(bus, mp_bus_not_pci)) { + apic_printk(APIC_VERBOSE, + "PCI BIOS passed nonexistent PCI bus %d!\n", bus); + return -1; + } + for (i = 0; i < mp_irq_entries; i++) { + int lbus = mp_irqs[i].srcbus; + + for (apic = 0; apic < nr_ioapics; apic++) + if (mp_ioapics[apic].apicid == mp_irqs[i].dstapic || + mp_irqs[i].dstapic == MP_APIC_ALL) + break; + + if (!test_bit(lbus, mp_bus_not_pci) && + !mp_irqs[i].irqtype && + (bus == lbus) && + (slot == ((mp_irqs[i].srcbusirq >> 2) & 0x1f))) { + int irq = pin_2_irq(i, apic, mp_irqs[i].dstirq); + + if (!(apic || IO_APIC_IRQ(irq))) + continue; + + if (pin == (mp_irqs[i].srcbusirq & 3)) { + *ioapic = apic; + *ioapic_pin = mp_irqs[i].dstirq; + *trigger = irq_trigger(i); + *polarity = irq_polarity(i); + return irq; + } + /* + * Use the first all-but-pin matching entry as a + * best-guess fuzzy result for broken mptables. + */ + if (best_guess < 0) { + *ioapic = apic; + *ioapic_pin = mp_irqs[i].dstirq; + *trigger = irq_trigger(i); + *polarity = irq_polarity(i); + best_guess = irq; + } + } + } + return best_guess; +} +EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector); + void lock_vector_lock(void) { /* Used to the online set of cpus does not change -- cgit v1.1 From 5ef2183768bb7d64b85eccbfa1537a61cbefa97c Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 6 May 2009 10:08:50 -0700 Subject: x86/acpi: move setup io apic routing out of CONFIG_ACPI scope So we could set io apic routing when ACPI is not enabled. [ Impact: prepare for new functionality ] Signed-off-by: Yinghai Lu Cc: Andrew Morton Cc: Jesse Barnes Cc: Len Brown LKML-Reference: <4A01C422.5070400@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 122 ++++++++++++++++++++--------------------- 1 file changed, 61 insertions(+), 61 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index caf9dbd..3a68dae 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3839,6 +3839,67 @@ int __init arch_probe_nr_irqs(void) } #endif +static int __io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, int irq, + int triggering, int polarity) +{ + struct irq_desc *desc; + struct irq_cfg *cfg; + int node; + + if (!IO_APIC_IRQ(irq)) { + apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", + ioapic); + return -EINVAL; + } + + if (dev) + node = dev_to_node(dev); + else + node = cpu_to_node(boot_cpu_id); + + desc = irq_to_desc_alloc_node(irq, node); + if (!desc) { + printk(KERN_INFO "can not get irq_desc %d\n", irq); + return 0; + } + + /* + * IRQs < 16 are already in the irq_2_pin[] map + */ + if (irq >= NR_IRQS_LEGACY) { + cfg = desc->chip_data; + add_pin_to_irq_node(cfg, node, ioapic, pin); + } + + setup_IO_APIC_irq(ioapic, pin, irq, desc, triggering, polarity); + + return 0; +} + +static struct { + DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1); +} mp_ioapic_routing[MAX_IO_APICS]; + +int io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, int irq, + int triggering, int polarity) +{ + + /* + * Avoid pin reprogramming. PRTs typically include entries + * with redundant pin->gsi mappings (but unique PCI devices); + * we only program the IOAPIC on the first. + */ + if (test_bit(pin, mp_ioapic_routing[ioapic].pin_programmed)) { + pr_debug("Pin %d-%d already programmed\n", + mp_ioapics[ioapic].apicid, pin); + return 0; + } + set_bit(pin, mp_ioapic_routing[ioapic].pin_programmed); + + return __io_apic_set_pci_routing(dev, ioapic, pin, irq, + triggering, polarity); +} + /* -------------------------------------------------------------------------- ACPI-based IOAPIC Configuration -------------------------------------------------------------------------- */ @@ -3933,67 +3994,6 @@ int __init io_apic_get_version(int ioapic) } #endif -static int __io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, int irq, - int triggering, int polarity) -{ - struct irq_desc *desc; - struct irq_cfg *cfg; - int node; - - if (!IO_APIC_IRQ(irq)) { - apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", - ioapic); - return -EINVAL; - } - - if (dev) - node = dev_to_node(dev); - else - node = cpu_to_node(boot_cpu_id); - - desc = irq_to_desc_alloc_node(irq, node); - if (!desc) { - printk(KERN_INFO "can not get irq_desc %d\n", irq); - return 0; - } - - /* - * IRQs < 16 are already in the irq_2_pin[] map - */ - if (irq >= NR_IRQS_LEGACY) { - cfg = desc->chip_data; - add_pin_to_irq_node(cfg, node, ioapic, pin); - } - - setup_IO_APIC_irq(ioapic, pin, irq, desc, triggering, polarity); - - return 0; -} - -static struct { - DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1); -} mp_ioapic_routing[MAX_IO_APICS]; - -int io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, int irq, - int triggering, int polarity) -{ - - /* - * Avoid pin reprogramming. PRTs typically include entries - * with redundant pin->gsi mappings (but unique PCI devices); - * we only program the IOAPIC on the first. - */ - if (test_bit(pin, mp_ioapic_routing[ioapic].pin_programmed)) { - pr_debug("Pin %d-%d already programmed\n", - mp_ioapics[ioapic].apicid, pin); - return 0; - } - set_bit(pin, mp_ioapic_routing[ioapic].pin_programmed); - - return __io_apic_set_pci_routing(dev, ioapic, pin, irq, - triggering, polarity); -} - int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity) { int i; -- cgit v1.1 From b9c61b70075c87a8612624736faf4a2de5b1ed30 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 6 May 2009 10:10:06 -0700 Subject: x86/pci: update pirq_enable_irq() to setup io apic routing So we can set io apic routing only when enabling the device irq. This is advantageous for IRQ descriptor allocation affinity: if we set up the IO-APIC entry later, we have a chance to allocate the IRQ descriptor later and know which device it is on and can set affinity accordingly. [ Impact: standardize/enhance irq-enabling sequence for mptable irqs ] Signed-off-by: Yinghai Lu Acked-by: Jesse Barnes Cc: Len Brown Cc: Andrew Morton LKML-Reference: <4A01C46E.8000501@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 150 ++++++++++++++++++++--------------------- 1 file changed, 74 insertions(+), 76 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 3a68dae..5d5f412 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1480,9 +1480,13 @@ static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq ioapic_write_entry(apic_id, pin, entry); } +static struct { + DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1); +} mp_ioapic_routing[MAX_IO_APICS]; + static void __init setup_IO_APIC_irqs(void) { - int apic_id, pin, idx, irq; + int apic_id = 0, pin, idx, irq; int notcon = 0; struct irq_desc *desc; struct irq_cfg *cfg; @@ -1490,48 +1494,53 @@ static void __init setup_IO_APIC_irqs(void) apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); - for (apic_id = 0; apic_id < nr_ioapics; apic_id++) { - for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) { - - idx = find_irq_entry(apic_id, pin, mp_INT); - if (idx == -1) { - if (!notcon) { - notcon = 1; - apic_printk(APIC_VERBOSE, - KERN_DEBUG " %d-%d", - mp_ioapics[apic_id].apicid, pin); - } else - apic_printk(APIC_VERBOSE, " %d-%d", - mp_ioapics[apic_id].apicid, pin); - continue; - } - if (notcon) { - apic_printk(APIC_VERBOSE, - " (apicid-pin) not connected\n"); - notcon = 0; - } +#ifdef CONFIG_ACPI + if (!acpi_disabled && acpi_ioapic) { + apic_id = mp_find_ioapic(0); + if (apic_id < 0) + apic_id = 0; + } +#endif - irq = pin_2_irq(idx, apic_id, pin); + for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) { + idx = find_irq_entry(apic_id, pin, mp_INT); + if (idx == -1) { + if (!notcon) { + notcon = 1; + apic_printk(APIC_VERBOSE, + KERN_DEBUG " %d-%d", + mp_ioapics[apic_id].apicid, pin); + } else + apic_printk(APIC_VERBOSE, " %d-%d", + mp_ioapics[apic_id].apicid, pin); + continue; + } + if (notcon) { + apic_printk(APIC_VERBOSE, + " (apicid-pin) not connected\n"); + notcon = 0; + } - /* - * Skip the timer IRQ if there's a quirk handler - * installed and if it returns 1: - */ - if (apic->multi_timer_check && - apic->multi_timer_check(apic_id, irq)) - continue; + irq = pin_2_irq(idx, apic_id, pin); - desc = irq_to_desc_alloc_node(irq, node); - if (!desc) { - printk(KERN_INFO "can not get irq_desc for %d\n", irq); - continue; - } - cfg = desc->chip_data; - add_pin_to_irq_node(cfg, node, apic_id, pin); + /* + * Skip the timer IRQ if there's a quirk handler + * installed and if it returns 1: + */ + if (apic->multi_timer_check && + apic->multi_timer_check(apic_id, irq)) + continue; - setup_IO_APIC_irq(apic_id, pin, irq, desc, - irq_trigger(idx), irq_polarity(idx)); + desc = irq_to_desc_alloc_node(irq, node); + if (!desc) { + printk(KERN_INFO "can not get irq_desc for %d\n", irq); + continue; } + cfg = desc->chip_data; + add_pin_to_irq_node(cfg, node, apic_id, pin); + set_bit(pin, mp_ioapic_routing[apic_id].pin_programmed); + setup_IO_APIC_irq(apic_id, pin, irq, desc, + irq_trigger(idx), irq_polarity(idx)); } if (notcon) @@ -3876,10 +3885,6 @@ static int __io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, in return 0; } -static struct { - DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1); -} mp_ioapic_routing[MAX_IO_APICS]; - int io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, int irq, int triggering, int polarity) { @@ -4023,51 +4028,44 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity) #ifdef CONFIG_SMP void __init setup_ioapic_dest(void) { - int pin, ioapic, irq, irq_entry; + int pin, ioapic = 0, irq, irq_entry; struct irq_desc *desc; - struct irq_cfg *cfg; const struct cpumask *mask; if (skip_ioapic_setup == 1) return; - for (ioapic = 0; ioapic < nr_ioapics; ioapic++) { - for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) { - irq_entry = find_irq_entry(ioapic, pin, mp_INT); - if (irq_entry == -1) - continue; - irq = pin_2_irq(irq_entry, ioapic, pin); - - /* setup_IO_APIC_irqs could fail to get vector for some device - * when you have too many devices, because at that time only boot - * cpu is online. - */ - desc = irq_to_desc(irq); - cfg = desc->chip_data; - if (!cfg->vector) { - setup_IO_APIC_irq(ioapic, pin, irq, desc, - irq_trigger(irq_entry), - irq_polarity(irq_entry)); - continue; +#ifdef CONFIG_ACPI + if (!acpi_disabled && acpi_ioapic) { + ioapic = mp_find_ioapic(0); + if (ioapic < 0) + ioapic = 0; + } +#endif - } + for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) { + irq_entry = find_irq_entry(ioapic, pin, mp_INT); + if (irq_entry == -1) + continue; + irq = pin_2_irq(irq_entry, ioapic, pin); - /* - * Honour affinities which have been set in early boot - */ - if (desc->status & - (IRQ_NO_BALANCING | IRQ_AFFINITY_SET)) - mask = desc->affinity; - else - mask = apic->target_cpus(); + desc = irq_to_desc(irq); - if (intr_remapping_enabled) - set_ir_ioapic_affinity_irq_desc(desc, mask); - else - set_ioapic_affinity_irq_desc(desc, mask); - } + /* + * Honour affinities which have been set in early boot + */ + if (desc->status & + (IRQ_NO_BALANCING | IRQ_AFFINITY_SET)) + mask = desc->affinity; + else + mask = apic->target_cpus(); + if (intr_remapping_enabled) + set_ir_ioapic_affinity_irq_desc(desc, mask); + else + set_ioapic_affinity_irq_desc(desc, mask); } + } #endif -- cgit v1.1 From 61fe91e1319556f32bebfd7ed2c68ef02e2c17f7 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sat, 9 May 2009 23:47:42 -0700 Subject: x86: apic: Check rev 3 fadt correctly for physical_apic bit Impact: fix fadt version checking FADT2_REVISION_ID has value 3 aka rev 3 FADT. So need to use >= instead of >, as other places in the code do. [ Impact: extend scope of APIC boot quirk ] Signed-off-by: Yinghai Lu LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic_flat_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index 306e5e8..744e6d8 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -235,7 +235,7 @@ static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id) * regardless of how many processors are present (x86_64 ES7000 * is an example). */ - if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID && + if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID && (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) { printk(KERN_DEBUG "system APIC only can use physical flat"); return 1; -- cgit v1.1 From 3e0c373749d7eb5b354ac0b043f2b2cdf84eefef Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sat, 9 May 2009 23:47:42 -0700 Subject: x86: clean up and fix setup_clear/force_cpu_cap handling setup_force_cpu_cap() only have one user (Xen guest code), but it should not reuse cleared_cpu_cpus, otherwise it will have problems on SMP. Need to have a separate cpu_cpus_set array too, for forced-on flags, beyond the forced-off flags. Also need to setup handling before all cpus caps are combined. [ Impact: fix the forced-set CPU feature flag logic ] Cc: H. Peter Anvin Cc: Jeremy Fitzhardinge Cc: Rusty Russell Signed-off-by: Yinghai Lu LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index c4f6678..e7fd5c4 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -292,7 +292,8 @@ static const char *__cpuinit table_lookup_model(struct cpuinfo_x86 *c) return NULL; /* Not found */ } -__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata; +__u32 cpu_caps_cleared[NCAPINTS] __cpuinitdata; +__u32 cpu_caps_set[NCAPINTS] __cpuinitdata; void load_percpu_segment(int cpu) { @@ -806,6 +807,16 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) #endif init_hypervisor(c); + + /* + * Clear/Set all flags overriden by options, need do it + * before following smp all cpus cap AND. + */ + for (i = 0; i < NCAPINTS; i++) { + c->x86_capability[i] &= ~cpu_caps_cleared[i]; + c->x86_capability[i] |= cpu_caps_set[i]; + } + /* * On SMP, boot_cpu_data holds the common feature set between * all CPUs; so make sure that we indicate which features are @@ -818,10 +829,6 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; } - /* Clear all flags overriden by options */ - for (i = 0; i < NCAPINTS; i++) - c->x86_capability[i] &= ~cleared_cpu_caps[i]; - #ifdef CONFIG_X86_MCE /* Init Machine Check Exception if available. */ mcheck_init(c); -- cgit v1.1 From 80989ce0643c1034822f3e339ed8d790b649abe1 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sat, 9 May 2009 23:47:42 -0700 Subject: x86: clean up and and print out initial max_pfn_mapped Do this so we can check the range that is mapped before init_memory_mapping(). To be able to print out meaningful info, we first have to fix 64-bit to have max_pfn_mapped assigned before that call. This also unifies the code-path a bit. [ Impact: print more debug info, cleanup ] Signed-off-by: Yinghai Lu LKML-Reference: <49BF0978.40605@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 0d77e56..4031d6c 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -862,12 +862,16 @@ void __init setup_arch(char **cmdline_p) max_low_pfn = max_pfn; high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1; + max_pfn_mapped = KERNEL_IMAGE_SIZE >> PAGE_SHIFT; #endif #ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION setup_bios_corruption_check(); #endif + printk(KERN_DEBUG "initial memory mapped : 0 - %08lx\n", + max_pfn_mapped< Date: Sat, 2 May 2009 10:40:57 -0700 Subject: x86: read apic ID in the !acpi_lapic case Ed found that on 32-bit, boot_cpu_physical_apicid is not read right, when the mptable is broken. Interestingly, actually three paths use/set it: 1. acpi: at that time that is already read from reg 2. mptable: only read from mptable 3. no madt, and no mptable, that use default apic id 0 for 64-bit, -1 for 32-bit so we could read the apic id for the 2/3 path. We trust the hardware register more than we trust a BIOS data structure (the mptable). We can also avoid the double set_fixmap() when acpi_lapic is used, and also need to move cpu_has_apic earlier and call apic_disable(). Also when need to update the apic id, we'd better read and set the apic version as well - so that quirks are applied precisely. v2: make path 3 with 64bit, use -1 as apic id, so could read it later. v3: fix whitespace problem pointed out by Ed Swierk [ Impact: get correct apic id for bsp other than acpi path ] Reported-by: Ed Swierk Signed-off-by: Yinghai Lu Acked-by: Cyrill Gorcunov LKML-Reference: <49FC85A9.2070702@kernel.org> [ v4: sanity-check in the ACPI case too ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index e258bed..1ee966f 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1456,7 +1456,6 @@ static int __init detect_init_APIC(void) } mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; - boot_cpu_physical_apicid = 0; return 0; } #else @@ -1570,6 +1569,8 @@ void __init early_init_lapic_mapping(void) */ void __init init_apic_mappings(void) { + unsigned int new_apicid; + if (x2apic_mode) { boot_cpu_physical_apicid = read_apic_id(); return; @@ -1586,21 +1587,31 @@ void __init init_apic_mappings(void) } else apic_phys = mp_lapic_addr; - set_fixmap_nocache(FIX_APIC_BASE, apic_phys); + /* lets check if we may NOP'ify apic operations */ + if (!cpu_has_apic) { + pr_info("APIC: disable apic facility\n"); + apic_disable(); + return; + } + + /* + * acpi lapic path already maps that address in + * acpi_register_lapic_address() + */ + if (!acpi_lapic) + set_fixmap_nocache(FIX_APIC_BASE, apic_phys); + apic_printk(APIC_VERBOSE, "mapped APIC to %08lx (%08lx)\n", APIC_BASE, apic_phys); - /* * Fetch the APIC ID of the BSP in case we have a * default configuration (or the MP table is broken). */ - if (boot_cpu_physical_apicid == -1U) - boot_cpu_physical_apicid = read_apic_id(); - - /* lets check if we may to NOP'ify apic operations */ - if (!cpu_has_apic) { - pr_info("APIC: disable apic facility\n"); - apic_disable(); + new_apicid = read_apic_id(); + if (boot_cpu_physical_apicid != new_apicid) { + boot_cpu_physical_apicid = new_apicid; + apic_version[new_apicid] = + GET_APIC_VERSION(apic_read(APIC_LVR)); } } -- cgit v1.1 From 8823392360dc4992f87bf4c623834d315f297493 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Sun, 10 May 2009 10:53:05 +0200 Subject: perf_counter, x86: clean up throttling printk s/PERFMON/perfcounters for perfcounter interrupt throttling warning. 'perfmon' is the CPU feature name that is Intel-only, while we do throttling in a generic way. [ Impact: cleanup ] Signed-off-by: Mike Galbraith Cc: Robert Richter Cc: Peter Zijlstra Cc: Paul Mackerras LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index a6878b0..da27419 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -814,7 +814,7 @@ void perf_counter_unthrottle(void) cpuc = &__get_cpu_var(cpu_hw_counters); if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) { if (printk_ratelimit()) - printk(KERN_WARNING "PERFMON: max interrupts exceeded!\n"); + printk(KERN_WARNING "perfcounters: max interrupts exceeded!\n"); hw_perf_restore(cpuc->throttle_ctrl); } cpuc->interrupts = 0; -- cgit v1.1 From 97a52714658cd959a3cfa35c5b6f489859f0204b Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Fri, 8 May 2009 18:23:50 +0200 Subject: x86: display extended apic registers with print_local_APIC and cpu_debug code Both print_local_APIC (used when apic=debug kernel param is set) and cpu_debug code missed support for some extended APIC registers that I'd like to see. This adds support to show: - extended APIC feature register - extended APIC control register - extended LVT registers [ Impact: print more debug info ] Signed-off-by: Andreas Herrmann Cc: Jaswinder Singh Rajput Cc: Cyrill Gorcunov LKML-Reference: <20090508162350.GO29045@alberich.amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 2 +- arch/x86/kernel/apic/io_apic.c | 14 +++++++++++++- arch/x86/kernel/cpu/cpu_debug.c | 14 +++++++++++++- 3 files changed, 27 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 1ee966f..0e6543f 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -395,7 +395,7 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask) { - unsigned long reg = (lvt_off << 4) + APIC_EILVT0; + unsigned long reg = (lvt_off << 4) + APIC_EILVTn(0); unsigned int v = (mask << 16) | (msg_type << 8) | vector; apic_write(reg, v); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 2afe145..65b824c 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1739,7 +1739,7 @@ __apicdebuginit(void) print_APIC_bitfield(int base) __apicdebuginit(void) print_local_APIC(void *dummy) { - unsigned int v, ver, maxlvt; + unsigned int i, v, ver, maxlvt; u64 icr; if (apic_verbosity == APIC_QUIET) @@ -1827,6 +1827,18 @@ __apicdebuginit(void) print_local_APIC(void *dummy) printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v); v = apic_read(APIC_TDCR); printk(KERN_DEBUG "... APIC TDCR: %08x\n", v); + + if (boot_cpu_has(X86_FEATURE_EXTAPIC)) { + v = apic_read(APIC_EFEAT); + maxlvt = (v >> 16) & 0xff; + printk(KERN_DEBUG "... APIC EFEAT: %08x\n", v); + v = apic_read(APIC_ECTRL); + printk(KERN_DEBUG "... APIC ECTRL: %08x\n", v); + for (i = 0; i < maxlvt; i++) { + v = apic_read(APIC_EILVTn(i)); + printk(KERN_DEBUG "... APIC EILVT%d: %08x\n", i, v); + } + } printk("\n"); } diff --git a/arch/x86/kernel/cpu/cpu_debug.c b/arch/x86/kernel/cpu/cpu_debug.c index 46e29ab..2fc4f6b 100644 --- a/arch/x86/kernel/cpu/cpu_debug.c +++ b/arch/x86/kernel/cpu/cpu_debug.c @@ -588,8 +588,20 @@ static void print_apic(void *arg) seq_printf(seq, " TMICT\t\t: %08x\n", apic_read(APIC_TMICT)); seq_printf(seq, " TMCCT\t\t: %08x\n", apic_read(APIC_TMCCT)); seq_printf(seq, " TDCR\t\t: %08x\n", apic_read(APIC_TDCR)); -#endif /* CONFIG_X86_LOCAL_APIC */ + if (boot_cpu_has(X86_FEATURE_EXTAPIC)) { + unsigned int i, v, maxeilvt; + + v = apic_read(APIC_EFEAT); + maxeilvt = (v >> 16) & 0xff; + seq_printf(seq, " EFEAT\t\t: %08x\n", v); + seq_printf(seq, " ECTRL\t\t: %08x\n", apic_read(APIC_ECTRL)); + for (i = 0; i < maxeilvt; i++) { + v = apic_read(APIC_EILVTn(i)); + seq_printf(seq, " EILVT%d\t\t: %08x\n", i, v); + } + } +#endif /* CONFIG_X86_LOCAL_APIC */ seq_printf(seq, "\n MSR\t:\n"); } -- cgit v1.1 From cec6be6d1069d697beb490bbb40a290d5ff554a2 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Mon, 11 May 2009 17:41:40 +0400 Subject: x86: apic: Fixmap apic address even if apic disabled In case if apic were disabled by boot option we still need read_apic operation. So fixmap a fake apic area if needed. [ Impact: fix boot crash ] Signed-off-by: Cyrill Gorcunov Cc: yinghai@kernel.org Cc: eswierk@aristanetworks.com LKML-Reference: <20090511134140.GH4624@lenovo> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 0e6543f..07cffc1 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1587,13 +1587,6 @@ void __init init_apic_mappings(void) } else apic_phys = mp_lapic_addr; - /* lets check if we may NOP'ify apic operations */ - if (!cpu_has_apic) { - pr_info("APIC: disable apic facility\n"); - apic_disable(); - return; - } - /* * acpi lapic path already maps that address in * acpi_register_lapic_address() @@ -1602,7 +1595,15 @@ void __init init_apic_mappings(void) set_fixmap_nocache(FIX_APIC_BASE, apic_phys); apic_printk(APIC_VERBOSE, "mapped APIC to %08lx (%08lx)\n", - APIC_BASE, apic_phys); + APIC_BASE, apic_phys); + + /* lets check if we may NOP'ify apic operations */ + if (!cpu_has_apic) { + pr_info("APIC: disable apic facility\n"); + apic_disable(); + return; + } + /* * Fetch the APIC ID of the BSP in case we have a * default configuration (or the MP table is broken). -- cgit v1.1 From 0c23590f00f85467b318ad0c20c36796a5bd4c60 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Mon, 4 May 2009 03:30:15 +0400 Subject: x86, 64-bit: ifdef out struct thread_struct::ip struct thread_struct::ip isn't used on x86_64, struct pt_regs::ip is used instead. kgdb should be reading 0 always, but I can't check it. [ Impact: (potentially) reduce thread_struct size on 64-bit ] Signed-off-by: Alexey Dobriyan Cc: containers@lists.linux-foundation.org LKML-Reference: <20090503233015.GJ16631@x200.localdomain> Signed-off-by: Ingo Molnar --- arch/x86/kernel/kgdb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index eedfaeb..d07706f 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -141,7 +141,7 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p) gdb_regs32[GDB_PS] = *(unsigned long *)(p->thread.sp + 8); gdb_regs32[GDB_CS] = __KERNEL_CS; gdb_regs32[GDB_SS] = __KERNEL_DS; - gdb_regs[GDB_PC] = p->thread.ip; + gdb_regs[GDB_PC] = 0; gdb_regs[GDB_R8] = 0; gdb_regs[GDB_R9] = 0; gdb_regs[GDB_R10] = 0; -- cgit v1.1 From 37ba7ab5e33cebc25c68fffe33e9f21e7c2014e8 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 11 May 2009 15:56:08 -0700 Subject: x86, boot: make kernel_alignment adjustable; new bzImage fields Make the kernel_alignment field adjustable; this allows us to set it to a large value (intended to be 16 MB to avoid ZONE_DMA contention, memory holes and other weirdness) while a smart bootloader can still force a loading at a lesser alignment if absolutely necessary. Also export pref_address (preferred loading address, corresponding to the link-time address) and init_size, the total amount of linear memory the kernel will require during initialization. [ Impact: allows better kernel placement, gives bootloader more info ] Signed-off-by: H. Peter Anvin --- arch/x86/kernel/asm-offsets_32.c | 1 + arch/x86/kernel/asm-offsets_64.c | 1 + 2 files changed, 2 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index 5a6aa1c..1a830cb 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -146,4 +146,5 @@ void foo(void) OFFSET(BP_loadflags, boot_params, hdr.loadflags); OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch); OFFSET(BP_version, boot_params, hdr.version); + OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment); } diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index e72f062..898ecc4 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -125,6 +125,7 @@ int main(void) OFFSET(BP_loadflags, boot_params, hdr.loadflags); OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch); OFFSET(BP_version, boot_params, hdr.version); + OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment); BLANK(); DEFINE(PAGE_SIZE_asm, PAGE_SIZE); -- cgit v1.1 From 5031296c57024a78ddad4edfc993367dbf4abb98 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Thu, 7 May 2009 16:54:11 -0700 Subject: x86: add extension fields for bootloader type and version A long ago, in days of yore, it all began with a god named Thor. There were vikings and boats and some plans for a Linux kernel header. Unfortunately, a single 8-bit field was used for bootloader type and version. This has generally worked without *too* much pain, but we're getting close to flat running out of ID fields. Add extension fields for both type and version. The type will be extended if it the old field is 0xE; the version is a simple MSB extension. Keep /proc/sys/kernel/bootloader_type containing (type << 4) + (ver & 0xf) for backwards compatiblity, but also add /proc/sys/kernel/bootloader_version which contains the full version number. [ Impact: new feature to support more bootloaders ] Signed-off-by: H. Peter Anvin --- arch/x86/kernel/setup.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index b415843..2b09345 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -214,8 +214,8 @@ unsigned long mmu_cr4_features; unsigned long mmu_cr4_features = X86_CR4_PAE; #endif -/* Boot loader ID as an integer, for the benefit of proc_dointvec */ -int bootloader_type; +/* Boot loader ID and version as integers, for the benefit of proc_dointvec */ +int bootloader_type, bootloader_version; /* * Setup options @@ -706,6 +706,12 @@ void __init setup_arch(char **cmdline_p) #endif saved_video_mode = boot_params.hdr.vid_mode; bootloader_type = boot_params.hdr.type_of_loader; + if ((bootloader_type >> 4) == 0xe) { + bootloader_type &= 0xf; + bootloader_type |= (boot_params.hdr.ext_loader_type+0x10) << 4; + } + bootloader_version = bootloader_type & 0xf; + bootloader_version |= boot_params.hdr.ext_loader_ver << 4; #ifdef CONFIG_BLK_DEV_RAM rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK; -- cgit v1.1 From 871b72dd1e12afc3f024479531d25a9339d2e3f9 Mon Sep 17 00:00:00 2001 From: Dmitry Adamushko Date: Mon, 11 May 2009 23:48:27 +0200 Subject: x86: microcode: use smp_call_function_single instead of set_cpus_allowed, cleanup of synchronization logic * Solve issues described in 6f66cbc63081fd70e3191b4dbb796746780e5ae1 in a way that doesn't resort to set_cpus_allowed(); * in fact, only collect_cpu_info and apply_microcode callbacks must run on a target cpu, others will do just fine on any other. smp_call_function_single() (as suggested by Ingo) is used to run these callbacks on a target cpu. * cleanup of synchronization logic of the 'microcode_core' part The generic 'microcode_core' part guarantees that only a single cpu (be it a full-fledged cpu, one of the cores or HT) is being updated at any particular moment of time. In general, there is no need for any additional sync. mechanism in arch-specific parts (the patch removes existing spinlocks). See also the "Synchronization" section in microcode_core.c. * return -EINVAL instead of -1 (which is translated into -EPERM) in microcode_write(), reload_cpu() and mc_sysdev_add(). Other suggestions for an error code? * use 'enum ucode_state' as return value of request_microcode_{fw, user} to gain more flexibility by distinguishing between real error cases and situations when an appropriate ucode was not found (which is not an error per-se). * some minor cleanups Thanks a lot to Hugh Dickins for review/suggestions/testing! Reference: http://marc.info/?l=linux-kernel&m=124025889012541&w=2 [ Impact: refactor and clean up microcode driver locking code ] Signed-off-by: Dmitry Adamushko Acked-by: Hugh Dickins Cc: Andrew Morton Cc: Rusty Russell Cc: Andreas Herrmann Cc: Peter Oruba Cc: Arjan van de Ven LKML-Reference: <1242078507.5560.9.camel@earth> [ did some more cleanups ] Signed-off-by: Ingo Molnar arch/x86/include/asm/microcode.h | 25 ++ arch/x86/kernel/microcode_amd.c | 58 ++---- arch/x86/kernel/microcode_core.c | 326 +++++++++++++++++++++----------------- arch/x86/kernel/microcode_intel.c | 92 +++------- 4 files changed, 261 insertions(+), 240 deletions(-) (~20 new comment lines) --- arch/x86/kernel/microcode_amd.c | 58 +++---- arch/x86/kernel/microcode_core.c | 329 ++++++++++++++++++++++---------------- arch/x86/kernel/microcode_intel.c | 90 ++++------- 3 files changed, 244 insertions(+), 233 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 453b579..c8be20f 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -13,25 +13,13 @@ * Licensed under the terms of the GNU General Public * License version 2. See file COPYING for details. */ -#include -#include -#include #include -#include -#include #include #include #include #include #include -#include -#include -#include -#include -#include #include -#include -#include #include #include @@ -79,9 +67,6 @@ struct microcode_amd { #define UCODE_CONTAINER_SECTION_HDR 8 #define UCODE_CONTAINER_HEADER_SIZE 12 -/* serialize access to the physical write */ -static DEFINE_SPINLOCK(microcode_update_lock); - static struct equiv_cpu_entry *equiv_cpu_table; static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) @@ -144,9 +129,8 @@ static int get_matching_microcode(int cpu, void *mc, int rev) return 1; } -static void apply_microcode_amd(int cpu) +static int apply_microcode_amd(int cpu) { - unsigned long flags; u32 rev, dummy; int cpu_num = raw_smp_processor_id(); struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; @@ -156,25 +140,25 @@ static void apply_microcode_amd(int cpu) BUG_ON(cpu_num != cpu); if (mc_amd == NULL) - return; + return 0; - spin_lock_irqsave(µcode_update_lock, flags); wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code); /* get patch id after patching */ rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); - spin_unlock_irqrestore(µcode_update_lock, flags); /* check current patch id and patch's id for match */ if (rev != mc_amd->hdr.patch_id) { printk(KERN_ERR "microcode: CPU%d: update failed " "(for patch_level=0x%x)\n", cpu, mc_amd->hdr.patch_id); - return; + return -1; } printk(KERN_INFO "microcode: CPU%d: updated (new patch_level=0x%x)\n", cpu, rev); uci->cpu_sig.rev = rev; + + return 0; } static int get_ucode_data(void *to, const u8 *from, size_t n) @@ -263,7 +247,8 @@ static void free_equiv_cpu_table(void) } } -static int generic_load_microcode(int cpu, const u8 *data, size_t size) +static enum ucode_state +generic_load_microcode(int cpu, const u8 *data, size_t size) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; const u8 *ucode_ptr = data; @@ -272,12 +257,13 @@ static int generic_load_microcode(int cpu, const u8 *data, size_t size) int new_rev = uci->cpu_sig.rev; unsigned int leftover; unsigned long offset; + enum ucode_state state = UCODE_OK; offset = install_equiv_cpu_table(ucode_ptr); if (!offset) { printk(KERN_ERR "microcode: failed to create " "equivalent cpu table\n"); - return -EINVAL; + return UCODE_ERROR; } ucode_ptr += offset; @@ -312,28 +298,27 @@ static int generic_load_microcode(int cpu, const u8 *data, size_t size) pr_debug("microcode: CPU%d found a matching microcode " "update with version 0x%x (current=0x%x)\n", cpu, new_rev, uci->cpu_sig.rev); - } else + } else { vfree(new_mc); - } + state = UCODE_ERROR; + } + } else + state = UCODE_NFOUND; free_equiv_cpu_table(); - return (int)leftover; + return state; } -static int request_microcode_fw(int cpu, struct device *device) +static enum ucode_state request_microcode_fw(int cpu, struct device *device) { const char *fw_name = "amd-ucode/microcode_amd.bin"; const struct firmware *firmware; - int ret; - - /* We should bind the task to the CPU */ - BUG_ON(cpu != raw_smp_processor_id()); + enum ucode_state ret; - ret = request_firmware(&firmware, fw_name, device); - if (ret) { + if (request_firmware(&firmware, fw_name, device)) { printk(KERN_ERR "microcode: failed to load file %s\n", fw_name); - return ret; + return UCODE_NFOUND; } ret = generic_load_microcode(cpu, firmware->data, firmware->size); @@ -343,11 +328,12 @@ static int request_microcode_fw(int cpu, struct device *device) return ret; } -static int request_microcode_user(int cpu, const void __user *buf, size_t size) +static enum ucode_state +request_microcode_user(int cpu, const void __user *buf, size_t size) { printk(KERN_INFO "microcode: AMD microcode update via " "/dev/cpu/microcode not supported\n"); - return -1; + return UCODE_ERROR; } static void microcode_fini_cpu_amd(int cpu) diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index 98c470c..9c44615 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -71,27 +71,18 @@ * Thanks to Stuart Swales for pointing out this bug. */ #include -#include #include -#include +#include #include -#include -#include -#include -#include #include #include #include -#include -#include -#include #include #include #include #include #include -#include MODULE_DESCRIPTION("Microcode Update Driver"); MODULE_AUTHOR("Tigran Aivazian "); @@ -101,36 +92,110 @@ MODULE_LICENSE("GPL"); static struct microcode_ops *microcode_ops; -/* no concurrent ->write()s are allowed on /dev/cpu/microcode */ +/* + * Synchronization. + * + * All non cpu-hotplug-callback call sites use: + * + * - microcode_mutex to synchronize with each other; + * - get/put_online_cpus() to synchronize with + * the cpu-hotplug-callback call sites. + * + * We guarantee that only a single cpu is being + * updated at any particular moment of time. + */ static DEFINE_MUTEX(microcode_mutex); struct ucode_cpu_info ucode_cpu_info[NR_CPUS]; EXPORT_SYMBOL_GPL(ucode_cpu_info); +/* + * Operations that are run on a target cpu: + */ + +struct cpu_info_ctx { + struct cpu_signature *cpu_sig; + int err; +}; + +static void collect_cpu_info_local(void *arg) +{ + struct cpu_info_ctx *ctx = arg; + + ctx->err = microcode_ops->collect_cpu_info(smp_processor_id(), + ctx->cpu_sig); +} + +static int collect_cpu_info_on_target(int cpu, struct cpu_signature *cpu_sig) +{ + struct cpu_info_ctx ctx = { .cpu_sig = cpu_sig, .err = 0 }; + int ret; + + ret = smp_call_function_single(cpu, collect_cpu_info_local, &ctx, 1); + if (!ret) + ret = ctx.err; + + return ret; +} + +static int collect_cpu_info(int cpu) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + int ret; + + memset(uci, 0, sizeof(*uci)); + + ret = collect_cpu_info_on_target(cpu, &uci->cpu_sig); + if (!ret) + uci->valid = 1; + + return ret; +} + +struct apply_microcode_ctx { + int err; +}; + +static void apply_microcode_local(void *arg) +{ + struct apply_microcode_ctx *ctx = arg; + + ctx->err = microcode_ops->apply_microcode(smp_processor_id()); +} + +static int apply_microcode_on_target(int cpu) +{ + struct apply_microcode_ctx ctx = { .err = 0 }; + int ret; + + ret = smp_call_function_single(cpu, apply_microcode_local, &ctx, 1); + if (!ret) + ret = ctx.err; + + return ret; +} + #ifdef CONFIG_MICROCODE_OLD_INTERFACE static int do_microcode_update(const void __user *buf, size_t size) { - cpumask_t old; int error = 0; int cpu; - old = current->cpus_allowed; - for_each_online_cpu(cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + enum ucode_state ustate; if (!uci->valid) continue; - set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); - error = microcode_ops->request_microcode_user(cpu, buf, size); - if (error < 0) - goto out; - if (!error) - microcode_ops->apply_microcode(cpu); + ustate = microcode_ops->request_microcode_user(cpu, buf, size); + if (ustate == UCODE_ERROR) { + error = -1; + break; + } else if (ustate == UCODE_OK) + apply_microcode_on_target(cpu); } -out: - set_cpus_allowed_ptr(current, &old); + return error; } @@ -143,19 +208,17 @@ static int microcode_open(struct inode *unused1, struct file *unused2) static ssize_t microcode_write(struct file *file, const char __user *buf, size_t len, loff_t *ppos) { - ssize_t ret; + ssize_t ret = -EINVAL; if ((len >> PAGE_SHIFT) > num_physpages) { - printk(KERN_ERR "microcode: too much data (max %ld pages)\n", - num_physpages); - return -EINVAL; + pr_err("microcode: too much data (max %ld pages)\n", num_physpages); + return ret; } get_online_cpus(); mutex_lock(µcode_mutex); - ret = do_microcode_update(buf, len); - if (!ret) + if (do_microcode_update(buf, len) == 0) ret = (ssize_t)len; mutex_unlock(µcode_mutex); @@ -165,15 +228,15 @@ static ssize_t microcode_write(struct file *file, const char __user *buf, } static const struct file_operations microcode_fops = { - .owner = THIS_MODULE, - .write = microcode_write, - .open = microcode_open, + .owner = THIS_MODULE, + .write = microcode_write, + .open = microcode_open, }; static struct miscdevice microcode_dev = { - .minor = MICROCODE_MINOR, - .name = "microcode", - .fops = µcode_fops, + .minor = MICROCODE_MINOR, + .name = "microcode", + .fops = µcode_fops, }; static int __init microcode_dev_init(void) @@ -182,9 +245,7 @@ static int __init microcode_dev_init(void) error = misc_register(µcode_dev); if (error) { - printk(KERN_ERR - "microcode: can't misc_register on minor=%d\n", - MICROCODE_MINOR); + pr_err("microcode: can't misc_register on minor=%d\n", MICROCODE_MINOR); return error; } @@ -205,42 +266,51 @@ MODULE_ALIAS_MISCDEV(MICROCODE_MINOR); /* fake device for request_firmware */ static struct platform_device *microcode_pdev; -static long reload_for_cpu(void *unused) +static int reload_for_cpu(int cpu) { - struct ucode_cpu_info *uci = ucode_cpu_info + smp_processor_id(); + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; int err = 0; mutex_lock(µcode_mutex); if (uci->valid) { - err = microcode_ops->request_microcode_fw(smp_processor_id(), - µcode_pdev->dev); - if (!err) - microcode_ops->apply_microcode(smp_processor_id()); + enum ucode_state ustate; + + ustate = microcode_ops->request_microcode_fw(cpu, µcode_pdev->dev); + if (ustate == UCODE_OK) + apply_microcode_on_target(cpu); + else + if (ustate == UCODE_ERROR) + err = -EINVAL; } mutex_unlock(µcode_mutex); + return err; } static ssize_t reload_store(struct sys_device *dev, struct sysdev_attribute *attr, - const char *buf, size_t sz) + const char *buf, size_t size) { - char *end; - unsigned long val = simple_strtoul(buf, &end, 0); - int err = 0; + unsigned long val; int cpu = dev->id; + int ret = 0; + char *end; + val = simple_strtoul(buf, &end, 0); if (end == buf) return -EINVAL; + if (val == 1) { get_online_cpus(); if (cpu_online(cpu)) - err = work_on_cpu(cpu, reload_for_cpu, NULL); + ret = reload_for_cpu(cpu); put_online_cpus(); } - if (err) - return err; - return sz; + + if (!ret) + ret = size; + + return ret; } static ssize_t version_show(struct sys_device *dev, @@ -271,11 +341,11 @@ static struct attribute *mc_default_attrs[] = { }; static struct attribute_group mc_attr_group = { - .attrs = mc_default_attrs, - .name = "microcode", + .attrs = mc_default_attrs, + .name = "microcode", }; -static void __microcode_fini_cpu(int cpu) +static void microcode_fini_cpu(int cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; @@ -283,103 +353,68 @@ static void __microcode_fini_cpu(int cpu) uci->valid = 0; } -static void microcode_fini_cpu(int cpu) -{ - mutex_lock(µcode_mutex); - __microcode_fini_cpu(cpu); - mutex_unlock(µcode_mutex); -} - -static void collect_cpu_info(int cpu) +static enum ucode_state microcode_resume_cpu(int cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - memset(uci, 0, sizeof(*uci)); - if (!microcode_ops->collect_cpu_info(cpu, &uci->cpu_sig)) - uci->valid = 1; + if (!uci->mc) + return UCODE_NFOUND; + + pr_debug("microcode: CPU%d updated upon resume\n", cpu); + apply_microcode_on_target(cpu); + + return UCODE_OK; } -static int microcode_resume_cpu(int cpu) +static enum ucode_state microcode_init_cpu(int cpu) { - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - struct cpu_signature nsig; + enum ucode_state ustate; - pr_debug("microcode: CPU%d resumed\n", cpu); + if (collect_cpu_info(cpu)) + return UCODE_ERROR; - if (!uci->mc) - return 1; + /* --dimm. Trigger a delayed update? */ + if (system_state != SYSTEM_RUNNING) + return UCODE_NFOUND; - /* - * Let's verify that the 'cached' ucode does belong - * to this cpu (a bit of paranoia): - */ - if (microcode_ops->collect_cpu_info(cpu, &nsig)) { - __microcode_fini_cpu(cpu); - printk(KERN_ERR "failed to collect_cpu_info for resuming cpu #%d\n", - cpu); - return -1; - } + ustate = microcode_ops->request_microcode_fw(cpu, µcode_pdev->dev); - if ((nsig.sig != uci->cpu_sig.sig) || (nsig.pf != uci->cpu_sig.pf)) { - __microcode_fini_cpu(cpu); - printk(KERN_ERR "cached ucode doesn't match the resuming cpu #%d\n", - cpu); - /* Should we look for a new ucode here? */ - return 1; + if (ustate == UCODE_OK) { + pr_debug("microcode: CPU%d updated upon init\n", cpu); + apply_microcode_on_target(cpu); } - return 0; + return ustate; } -static long microcode_update_cpu(void *unused) +static enum ucode_state microcode_update_cpu(int cpu) { - struct ucode_cpu_info *uci = ucode_cpu_info + smp_processor_id(); - int err = 0; + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + enum ucode_state ustate; - /* - * Check if the system resume is in progress (uci->valid != NULL), - * otherwise just request a firmware: - */ - if (uci->valid) { - err = microcode_resume_cpu(smp_processor_id()); - } else { - collect_cpu_info(smp_processor_id()); - if (uci->valid && system_state == SYSTEM_RUNNING) - err = microcode_ops->request_microcode_fw( - smp_processor_id(), - µcode_pdev->dev); - } - if (!err) - microcode_ops->apply_microcode(smp_processor_id()); - return err; -} + if (uci->valid) + ustate = microcode_resume_cpu(cpu); + else + ustate = microcode_init_cpu(cpu); -static int microcode_init_cpu(int cpu) -{ - int err; - mutex_lock(µcode_mutex); - err = work_on_cpu(cpu, microcode_update_cpu, NULL); - mutex_unlock(µcode_mutex); - - return err; + return ustate; } static int mc_sysdev_add(struct sys_device *sys_dev) { int err, cpu = sys_dev->id; - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; if (!cpu_online(cpu)) return 0; pr_debug("microcode: CPU%d added\n", cpu); - memset(uci, 0, sizeof(*uci)); err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group); if (err) return err; - err = microcode_init_cpu(cpu); + if (microcode_init_cpu(cpu) == UCODE_ERROR) + err = -EINVAL; return err; } @@ -400,19 +435,30 @@ static int mc_sysdev_remove(struct sys_device *sys_dev) static int mc_sysdev_resume(struct sys_device *dev) { int cpu = dev->id; + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; if (!cpu_online(cpu)) return 0; - /* only CPU 0 will apply ucode here */ - microcode_update_cpu(NULL); + /* + * All non-bootup cpus are still disabled, + * so only CPU 0 will apply ucode here. + * + * Moreover, there can be no concurrent + * updates from any other places at this point. + */ + WARN_ON(cpu != 0); + + if (uci->valid && uci->mc) + microcode_ops->apply_microcode(cpu); + return 0; } static struct sysdev_driver mc_sysdev_driver = { - .add = mc_sysdev_add, - .remove = mc_sysdev_remove, - .resume = mc_sysdev_resume, + .add = mc_sysdev_add, + .remove = mc_sysdev_remove, + .resume = mc_sysdev_resume, }; static __cpuinit int @@ -425,15 +471,12 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu) switch (action) { case CPU_ONLINE: case CPU_ONLINE_FROZEN: - if (microcode_init_cpu(cpu)) - printk(KERN_ERR "microcode: failed to init CPU%d\n", - cpu); + microcode_update_cpu(cpu); case CPU_DOWN_FAILED: case CPU_DOWN_FAILED_FROZEN: pr_debug("microcode: CPU%d added\n", cpu); if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group)) - printk(KERN_ERR "microcode: Failed to create the sysfs " - "group for CPU%d\n", cpu); + pr_err("microcode: Failed to create group for CPU%d\n", cpu); break; case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: @@ -465,13 +508,10 @@ static int __init microcode_init(void) microcode_ops = init_amd_microcode(); if (!microcode_ops) { - printk(KERN_ERR "microcode: no support for this CPU vendor\n"); + pr_err("microcode: no support for this CPU vendor\n"); return -ENODEV; } - error = microcode_dev_init(); - if (error) - return error; microcode_pdev = platform_device_register_simple("microcode", -1, NULL, 0); if (IS_ERR(microcode_pdev)) { @@ -480,23 +520,31 @@ static int __init microcode_init(void) } get_online_cpus(); + mutex_lock(µcode_mutex); + error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver); + + mutex_unlock(µcode_mutex); put_online_cpus(); + if (error) { - microcode_dev_exit(); platform_device_unregister(microcode_pdev); return error; } + error = microcode_dev_init(); + if (error) + return error; + register_hotcpu_notifier(&mc_cpu_notifier); - printk(KERN_INFO - "Microcode Update Driver: v" MICROCODE_VERSION + pr_info("Microcode Update Driver: v" MICROCODE_VERSION " ," " Peter Oruba\n"); return 0; } +module_init(microcode_init); static void __exit microcode_exit(void) { @@ -505,16 +553,17 @@ static void __exit microcode_exit(void) unregister_hotcpu_notifier(&mc_cpu_notifier); get_online_cpus(); + mutex_lock(µcode_mutex); + sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver); + + mutex_unlock(µcode_mutex); put_online_cpus(); platform_device_unregister(microcode_pdev); microcode_ops = NULL; - printk(KERN_INFO - "Microcode Update Driver: v" MICROCODE_VERSION " removed.\n"); + pr_info("Microcode Update Driver: v" MICROCODE_VERSION " removed.\n"); } - -module_init(microcode_init); module_exit(microcode_exit); diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c index 149b9ec..0d334dd 100644 --- a/arch/x86/kernel/microcode_intel.c +++ b/arch/x86/kernel/microcode_intel.c @@ -70,24 +70,11 @@ * Fix sigmatch() macro to handle old CPUs with pf == 0. * Thanks to Stuart Swales for pointing out this bug. */ -#include -#include -#include #include -#include -#include -#include #include -#include #include #include -#include -#include -#include -#include -#include -#include -#include +#include #include #include @@ -150,13 +137,9 @@ struct extended_sigtable { #define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE) -/* serialize access to the physical write to MSR 0x79 */ -static DEFINE_SPINLOCK(microcode_update_lock); - static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) { struct cpuinfo_x86 *c = &cpu_data(cpu_num); - unsigned long flags; unsigned int val[2]; memset(csig, 0, sizeof(*csig)); @@ -176,18 +159,14 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) csig->pf = 1 << ((val[1] >> 18) & 7); } - /* serialize access to the physical write to MSR 0x79 */ - spin_lock_irqsave(µcode_update_lock, flags); - wrmsr(MSR_IA32_UCODE_REV, 0, 0); /* see notes above for revision 1.07. Apparent chip bug */ sync_core(); /* get the current revision from MSR 0x8B */ rdmsr(MSR_IA32_UCODE_REV, val[0], csig->rev); - spin_unlock_irqrestore(µcode_update_lock, flags); - pr_debug("microcode: collect_cpu_info : sig=0x%x, pf=0x%x, rev=0x%x\n", - csig->sig, csig->pf, csig->rev); + printk(KERN_INFO "microcode: CPU%d sig=0x%x, pf=0x%x, revision=0x%x\n", + cpu_num, csig->sig, csig->pf, csig->rev); return 0; } @@ -318,11 +297,10 @@ get_matching_microcode(struct cpu_signature *cpu_sig, void *mc, int rev) return 0; } -static void apply_microcode(int cpu) +static int apply_microcode(int cpu) { struct microcode_intel *mc_intel; struct ucode_cpu_info *uci; - unsigned long flags; unsigned int val[2]; int cpu_num; @@ -334,10 +312,7 @@ static void apply_microcode(int cpu) BUG_ON(cpu_num != cpu); if (mc_intel == NULL) - return; - - /* serialize access to the physical write to MSR 0x79 */ - spin_lock_irqsave(µcode_update_lock, flags); + return 0; /* write microcode via MSR 0x79 */ wrmsr(MSR_IA32_UCODE_WRITE, @@ -351,30 +326,32 @@ static void apply_microcode(int cpu) /* get the current revision from MSR 0x8B */ rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); - spin_unlock_irqrestore(µcode_update_lock, flags); if (val[1] != mc_intel->hdr.rev) { - printk(KERN_ERR "microcode: CPU%d update from revision " - "0x%x to 0x%x failed\n", - cpu_num, uci->cpu_sig.rev, val[1]); - return; + printk(KERN_ERR "microcode: CPU%d update " + "to revision 0x%x failed\n", + cpu_num, mc_intel->hdr.rev); + return -1; } - printk(KERN_INFO "microcode: CPU%d updated from revision " - "0x%x to 0x%x, date = %04x-%02x-%02x \n", - cpu_num, uci->cpu_sig.rev, val[1], + printk(KERN_INFO "microcode: CPU%d updated to revision " + "0x%x, date = %04x-%02x-%02x \n", + cpu_num, val[1], mc_intel->hdr.date & 0xffff, mc_intel->hdr.date >> 24, (mc_intel->hdr.date >> 16) & 0xff); uci->cpu_sig.rev = val[1]; + + return 0; } -static int generic_load_microcode(int cpu, void *data, size_t size, - int (*get_ucode_data)(void *, const void *, size_t)) +static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, + int (*get_ucode_data)(void *, const void *, size_t)) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; u8 *ucode_ptr = data, *new_mc = NULL, *mc; int new_rev = uci->cpu_sig.rev; unsigned int leftover = size; + enum ucode_state state = UCODE_OK; while (leftover) { struct microcode_header_intel mc_header; @@ -412,11 +389,15 @@ static int generic_load_microcode(int cpu, void *data, size_t size, leftover -= mc_size; } - if (!new_mc) + if (leftover) { + if (new_mc) + vfree(new_mc); + state = UCODE_ERROR; goto out; + } - if (leftover) { - vfree(new_mc); + if (!new_mc) { + state = UCODE_NFOUND; goto out; } @@ -427,9 +408,8 @@ static int generic_load_microcode(int cpu, void *data, size_t size, pr_debug("microcode: CPU%d found a matching microcode update with" " version 0x%x (current=0x%x)\n", cpu, new_rev, uci->cpu_sig.rev); - - out: - return (int)leftover; +out: + return state; } static int get_ucode_fw(void *to, const void *from, size_t n) @@ -438,21 +418,19 @@ static int get_ucode_fw(void *to, const void *from, size_t n) return 0; } -static int request_microcode_fw(int cpu, struct device *device) +static enum ucode_state request_microcode_fw(int cpu, struct device *device) { char name[30]; struct cpuinfo_x86 *c = &cpu_data(cpu); const struct firmware *firmware; - int ret; + enum ucode_state ret; - /* We should bind the task to the CPU */ - BUG_ON(cpu != raw_smp_processor_id()); sprintf(name, "intel-ucode/%02x-%02x-%02x", c->x86, c->x86_model, c->x86_mask); - ret = request_firmware(&firmware, name, device); - if (ret) { + + if (request_firmware(&firmware, name, device)) { pr_debug("microcode: data file %s load failed\n", name); - return ret; + return UCODE_NFOUND; } ret = generic_load_microcode(cpu, (void *)firmware->data, @@ -468,11 +446,9 @@ static int get_ucode_user(void *to, const void *from, size_t n) return copy_from_user(to, from, n); } -static int request_microcode_user(int cpu, const void __user *buf, size_t size) +static enum ucode_state +request_microcode_user(int cpu, const void __user *buf, size_t size) { - /* We should bind the task to the CPU */ - BUG_ON(cpu != raw_smp_processor_id()); - return generic_load_microcode(cpu, (void *)buf, size, &get_ucode_user); } -- cgit v1.1 From 9d62dcdfa6f6fc843f7d9b494bcd48f00b94f883 Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Mon, 11 May 2009 22:05:28 -0400 Subject: x86: merge process.c a bit Merge arch_align_stack() and arch_randomize_brk(), since they are the same. Tested on x86_64. [ Impact: cleanup ] Signed-off-by: Amerigo Wang Signed-off-by: Ingo Molnar --- arch/x86/kernel/process.c | 14 ++++++++++++++ arch/x86/kernel/process_32.c | 13 ------------- arch/x86/kernel/process_64.c | 13 ------------- 3 files changed, 14 insertions(+), 26 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index ca98915..2b9a8d0 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -613,3 +614,16 @@ static int __init idle_setup(char *str) } early_param("idle", idle_setup); +unsigned long arch_align_stack(unsigned long sp) +{ + if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) + sp -= get_random_int() % 8192; + return sp & ~0xf; +} + +unsigned long arch_randomize_brk(struct mm_struct *mm) +{ + unsigned long range_end = mm->brk + 0x02000000; + return randomize_range(mm->brk, range_end, 0) ? : mm->brk; +} + diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 76f8f84..a3bb049 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -33,7 +33,6 @@ #include #include #include -#include #include #include #include @@ -497,15 +496,3 @@ unsigned long get_wchan(struct task_struct *p) return 0; } -unsigned long arch_align_stack(unsigned long sp) -{ - if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) - sp -= get_random_int() % 8192; - return sp & ~0xf; -} - -unsigned long arch_randomize_brk(struct mm_struct *mm) -{ - unsigned long range_end = mm->brk + 0x02000000; - return randomize_range(mm->brk, range_end, 0) ? : mm->brk; -} diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index b751a41..34386f7 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -32,7 +32,6 @@ #include #include #include -#include #include #include #include @@ -660,15 +659,3 @@ long sys_arch_prctl(int code, unsigned long addr) return do_arch_prctl(current, code, addr); } -unsigned long arch_align_stack(unsigned long sp) -{ - if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) - sp -= get_random_int() % 8192; - return sp & ~0xf; -} - -unsigned long arch_randomize_brk(struct mm_struct *mm) -{ - unsigned long range_end = mm->brk + 0x02000000; - return randomize_range(mm->brk, range_end, 0) ? : mm->brk; -} -- cgit v1.1 From bf78ad69cd351798b9447a269c6bd41ce1f111f4 Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Mon, 11 May 2009 23:29:09 -0400 Subject: x86: process.c, remove useless headers is not needed by these files, remove them. [ Impact: cleanup ] Signed-off-by: WANG Cong Cc: akpm@linux-foundation.org LKML-Reference: <20090512032956.5040.77055.sendpatchset@localhost.localdomain> Signed-off-by: Ingo Molnar --- arch/x86/kernel/process_32.c | 2 -- arch/x86/kernel/process_64.c | 2 -- 2 files changed, 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index a3bb049..56d50b7 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -9,8 +9,6 @@ * This file handles the architecture-dependent parts of process handling.. */ -#include - #include #include #include diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 34386f7..9d6b20e 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -14,8 +14,6 @@ * This file handles the architecture-dependent parts of process handling.. */ -#include - #include #include #include -- cgit v1.1 From 4797f6b021a3fa399942245d07a1feb30df81bb8 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sat, 2 May 2009 10:40:57 -0700 Subject: x86: read apic ID in the !acpi_lapic case Ed found that on 32-bit, boot_cpu_physical_apicid is not read right, when the mptable is broken. Interestingly, actually three paths use/set it: 1. acpi: at that time that is already read from reg 2. mptable: only read from mptable 3. no madt, and no mptable, that use default apic id 0 for 64-bit, -1 for 32-bit so we could read the apic id for the 2/3 path. We trust the hardware register more than we trust a BIOS data structure (the mptable). We can also avoid the double set_fixmap() when acpi_lapic is used, and also need to move cpu_has_apic earlier and call apic_disable(). Also when need to update the apic id, we'd better read and set the apic version as well - so that quirks are applied precisely. v2: make path 3 with 64bit, use -1 as apic id, so could read it later. v3: fix whitespace problem pointed out by Ed Swierk v5: fix boot crash [ Impact: get correct apic id for bsp other than acpi path ] Reported-by: Ed Swierk Signed-off-by: Yinghai Lu Acked-by: Cyrill Gorcunov LKML-Reference: <49FC85A9.2070702@kernel.org> [ v4: sanity-check in the ACPI case too ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 46 ++++++++++++++++++++---------------------- arch/x86/kernel/apic/io_apic.c | 5 +++++ 2 files changed, 27 insertions(+), 24 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 07cffc1..b0fd264 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -242,17 +242,24 @@ static int modern_apic(void) * bare function to substitute write operation * and it's _that_ fast :) */ -void native_apic_write_dummy(u32 reg, u32 v) +static void native_apic_write_dummy(u32 reg, u32 v) { WARN_ON_ONCE((cpu_has_apic || !disable_apic)); } +static u32 native_apic_read_dummy(u32 reg) +{ + WARN_ON_ONCE((cpu_has_apic || !disable_apic)); + return 0; +} + /* - * right after this call apic->write doesn't do anything + * right after this call apic->write/read doesn't do anything * note that there is no restore operation it works one way */ void apic_disable(void) { + apic->read = native_apic_read_dummy; apic->write = native_apic_write_dummy; } @@ -1576,32 +1583,23 @@ void __init init_apic_mappings(void) return; } - /* - * If no local APIC can be found then set up a fake all - * zeroes page to simulate the local APIC and another - * one for the IO-APIC. - */ + /* If no local APIC can be found return early */ if (!smp_found_config && detect_init_APIC()) { - apic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE); - apic_phys = __pa(apic_phys); - } else + /* lets NOP'ify apic operations */ + pr_info("APIC: disable apic facility\n"); + apic_disable(); + } else { apic_phys = mp_lapic_addr; - /* - * acpi lapic path already maps that address in - * acpi_register_lapic_address() - */ - if (!acpi_lapic) - set_fixmap_nocache(FIX_APIC_BASE, apic_phys); - - apic_printk(APIC_VERBOSE, "mapped APIC to %08lx (%08lx)\n", - APIC_BASE, apic_phys); + /* + * acpi lapic path already maps that address in + * acpi_register_lapic_address() + */ + if (!acpi_lapic) + set_fixmap_nocache(FIX_APIC_BASE, apic_phys); - /* lets check if we may NOP'ify apic operations */ - if (!cpu_has_apic) { - pr_info("APIC: disable apic facility\n"); - apic_disable(); - return; + apic_printk(APIC_VERBOSE, "mapped APIC to %08lx (%08lx)\n", + APIC_BASE, apic_phys); } /* diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 1f3d366..74d2b48 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1878,6 +1878,11 @@ __apicdebuginit(void) print_PIC(void) __apicdebuginit(int) print_all_ICs(void) { print_PIC(); + + /* don't print out if apic is not there */ + if (!cpu_has_apic || disable_apic) + return 0; + print_all_local_APICs(); print_IO_APIC(); -- cgit v1.1 From 5bb9efe33ea4001a17ab98186a40a134a3061d67 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 13 May 2009 08:12:51 +0200 Subject: perf_counter: fix print debug irq disable inconsistent {IN-HARDIRQ-W} -> {HARDIRQ-ON-W} usage. bash/15802 [HC0[0]:SC0[0]:HE1:SE1] takes: (sysrq_key_table_lock){?.....}, Don't unconditionally enable interrupts in the perf_counter_print_debug() path. [ Impact: fix potential deadlock pointed out by lockdep ] LKML-Reference: Reported-by: Ingo Molnar Signed-off-by: Peter Zijlstra --- arch/x86/kernel/cpu/perf_counter.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index da27419..f7772ff 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -621,12 +621,13 @@ void perf_counter_print_debug(void) { u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed; struct cpu_hw_counters *cpuc; + unsigned long flags; int cpu, idx; if (!x86_pmu.num_counters) return; - local_irq_disable(); + local_irq_save(flags); cpu = smp_processor_id(); cpuc = &per_cpu(cpu_hw_counters, cpu); @@ -664,7 +665,7 @@ void perf_counter_print_debug(void) pr_info("CPU#%d: fixed-PMC%d count: %016llx\n", cpu, idx, pmc_count); } - local_irq_enable(); + local_irq_restore(flags); } static void x86_pmu_disable(struct perf_counter *counter) -- cgit v1.1 From 29a679754b1a2581ee456eada6c2de7ce95068bb Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 14 May 2009 23:19:09 -0400 Subject: x86/stacktrace: return 0 instead of -1 for stack ops If we return -1 in the ops->stack for the stacktrace saving, we end up breaking out of the loop if the stack we are tracing is in the exception stack. This causes traces like: -0 [002] 34263.745825: raise_softirq_irqoff <-__blk_complete_request -0 [002] 34263.745826: <= 0 <= 0 <= 0 <= 0 <= 0 <= 0 <= 0 By returning "0" instead, the irq stack is saved as well, and we see: -0 [003] 883.280992: raise_softirq_irqoff <-__hrtimer_star t_range_ns -0 [003] 883.280992: <= hrtimer_start_range_ns <= tick_nohz_restart_sched_tick <= cpu_idle <= start_secondary <= <= 0 <= 0 [ Impact: record stacks from interrupts ] Signed-off-by: Steven Rostedt --- arch/x86/kernel/stacktrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index f7bddc2..4aaf7e4 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -20,7 +20,7 @@ save_stack_warning_symbol(void *data, char *msg, unsigned long symbol) static int save_stack_stack(void *data, char *name) { - return -1; + return 0; } static void save_stack_address(void *data, unsigned long addr, int reliable) -- cgit v1.1 From ec3232bdf8518bea8410f0027f870b24d3aa8753 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 13 May 2009 09:45:19 +0200 Subject: perf_counter: x86: More accurate counter update Take the counter width into account instead of assuming 32 bits. In particular Nehalem has 44 bit wide counters, and all arithmetics should happen on a 44-bit signed integer basis. [ Impact: fix rare event imprecision, warning message on Nehalem ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index f7772ff..3a92a2b 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -138,7 +138,9 @@ static u64 x86_perf_counter_update(struct perf_counter *counter, struct hw_perf_counter *hwc, int idx) { - u64 prev_raw_count, new_raw_count, delta; + int shift = 64 - x86_pmu.counter_bits; + u64 prev_raw_count, new_raw_count; + s64 delta; /* * Careful: an NMI might modify the previous counter value. @@ -161,9 +163,10 @@ again: * (counter-)time and add that to the generic counter. * * Careful, not all hw sign-extends above the physical width - * of the count, so we do that by clipping the delta to 32 bits: + * of the count. */ - delta = (u64)(u32)((s32)new_raw_count - (s32)prev_raw_count); + delta = (new_raw_count << shift) - (prev_raw_count << shift); + delta >>= shift; atomic64_add(delta, &counter->count); atomic64_sub(delta, &hwc->period_left); -- cgit v1.1 From f5a5a2f6e69e88647ae12da39f0ff3a510bcf0a6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 13 May 2009 12:54:01 +0200 Subject: perf_counter: x86: Fix throttling If counters are disabled globally when a perfcounter IRQ/NMI hits, and if we throttle in that case, we'll promote the '0' value to the next lapic IRQ and disable all perfcounters at that point, permanently ... Fix it. [ Impact: fix hung perfcounters under load ] Acked-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 3a92a2b..88ae8ce 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -765,8 +765,13 @@ out: /* * Restore - do not reenable when global enable is off or throttled: */ - if (++cpuc->interrupts < PERFMON_MAX_INTERRUPTS) - intel_pmu_restore_all(cpuc->throttle_ctrl); + if (cpuc->throttle_ctrl) { + if (++cpuc->interrupts < PERFMON_MAX_INTERRUPTS) { + intel_pmu_restore_all(cpuc->throttle_ctrl); + } else { + pr_info("CPU#%d: perfcounters: max interrupt rate exceeded! Throttle on.\n", smp_processor_id()); + } + } return ret; } @@ -817,11 +822,16 @@ void perf_counter_unthrottle(void) cpuc = &__get_cpu_var(cpu_hw_counters); if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) { - if (printk_ratelimit()) - printk(KERN_WARNING "perfcounters: max interrupts exceeded!\n"); + pr_info("CPU#%d: perfcounters: throttle off.\n", smp_processor_id()); + + /* + * Clear them before re-enabling irqs/NMIs again: + */ + cpuc->interrupts = 0; hw_perf_restore(cpuc->throttle_ctrl); + } else { + cpuc->interrupts = 0; } - cpuc->interrupts = 0; } void smp_perf_counter_interrupt(struct pt_regs *regs) -- cgit v1.1 From a026dfecc035f213c1cfa0bf6407ce3155f6a9df Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 13 May 2009 10:02:57 +0200 Subject: perf_counter: x86: Allow unpriviliged use of NMIs Apply sysctl_perf_counter_priv to NMIs. Also, fail the counter creation instead of silently down-grading to regular interrupts. [ Impact: allow wider perf-counter usage ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 88ae8ce..c19e927 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -280,8 +280,11 @@ static int __hw_perf_counter_init(struct perf_counter *counter) * If privileged enough, allow NMI events: */ hwc->nmi = 0; - if (capable(CAP_SYS_ADMIN) && hw_event->nmi) + if (hw_event->nmi) { + if (sysctl_perf_counter_priv && !capable(CAP_SYS_ADMIN)) + return -EACCES; hwc->nmi = 1; + } hwc->irq_period = hw_event->irq_period; if ((s64)hwc->irq_period <= 0 || hwc->irq_period > x86_pmu.max_period) -- cgit v1.1 From 962bf7a66edca4d36a730a38ff8410a67f560e40 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 13 May 2009 13:21:36 +0200 Subject: perf_counter: x86: Fix up the amd NMI/INT throttle perf_counter_unthrottle() restores throttle_ctrl, buts its never set. Also, we fail to disable all counters when throttling. [ Impact: fix rare stuck perf-counters when they are throttled ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 38 ++++++++++++++++++++++++++------------ 1 file changed, 26 insertions(+), 12 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index c19e927..7601c01 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -334,6 +334,8 @@ static u64 amd_pmu_save_disable_all(void) * right thing. */ barrier(); + if (!enabled) + goto out; for (idx = 0; idx < x86_pmu.num_counters; idx++) { u64 val; @@ -347,6 +349,7 @@ static u64 amd_pmu_save_disable_all(void) wrmsrl(MSR_K7_EVNTSEL0 + idx, val); } +out: return enabled; } @@ -787,32 +790,43 @@ static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) int handled = 0; struct perf_counter *counter; struct hw_perf_counter *hwc; - int idx; + int idx, throttle = 0; + + cpuc->throttle_ctrl = cpuc->enabled; + cpuc->enabled = 0; + barrier(); + + if (cpuc->throttle_ctrl) { + if (++cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) + throttle = 1; + } - ++cpuc->interrupts; for (idx = 0; idx < x86_pmu.num_counters; idx++) { + int disable = 0; + if (!test_bit(idx, cpuc->active_mask)) continue; + counter = cpuc->counters[idx]; hwc = &counter->hw; val = x86_perf_counter_update(counter, hwc, idx); if (val & (1ULL << (x86_pmu.counter_bits - 1))) - continue; + goto next; + /* counter overflow */ x86_perf_counter_set_period(counter, hwc, idx); handled = 1; inc_irq_stat(apic_perf_irqs); - if (perf_counter_overflow(counter, nmi, regs, 0)) - amd_pmu_disable_counter(hwc, idx); - else if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) - /* - * do not reenable when throttled, but reload - * the register - */ + disable = perf_counter_overflow(counter, nmi, regs, 0); + +next: + if (disable || throttle) amd_pmu_disable_counter(hwc, idx); - else if (counter->state == PERF_COUNTER_STATE_ACTIVE) - amd_pmu_enable_counter(hwc, idx); } + + if (cpuc->throttle_ctrl && !throttle) + cpuc->enabled = 1; + return handled; } -- cgit v1.1 From 9e35ad388bea89f7d6f375af4c0ae98803688666 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 13 May 2009 16:21:38 +0200 Subject: perf_counter: Rework the perf counter disable/enable The current disable/enable mechanism is: token = hw_perf_save_disable(); ... /* do bits */ ... hw_perf_restore(token); This works well, provided that the use nests properly. Except we don't. x86 NMI/INT throttling has non-nested use of this, breaking things. Therefore provide a reference counter disable/enable interface, where the first disable disables the hardware, and the last enable enables the hardware again. [ Impact: refactor, simplify the PMU disable/enable logic ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 113 ++++++++++++++----------------------- 1 file changed, 42 insertions(+), 71 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 7601c01..313638c 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -31,7 +31,6 @@ struct cpu_hw_counters { unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; unsigned long interrupts; - u64 throttle_ctrl; int enabled; }; @@ -42,8 +41,8 @@ struct x86_pmu { const char *name; int version; int (*handle_irq)(struct pt_regs *, int); - u64 (*save_disable_all)(void); - void (*restore_all)(u64); + void (*disable_all)(void); + void (*enable_all)(void); void (*enable)(struct hw_perf_counter *, int); void (*disable)(struct hw_perf_counter *, int); unsigned eventsel; @@ -56,6 +55,7 @@ struct x86_pmu { int counter_bits; u64 counter_mask; u64 max_period; + u64 intel_ctrl; }; static struct x86_pmu x86_pmu __read_mostly; @@ -311,22 +311,19 @@ static int __hw_perf_counter_init(struct perf_counter *counter) return 0; } -static u64 intel_pmu_save_disable_all(void) +static void intel_pmu_disable_all(void) { - u64 ctrl; - - rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); - - return ctrl; } -static u64 amd_pmu_save_disable_all(void) +static void amd_pmu_disable_all(void) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - int enabled, idx; + int idx; + + if (!cpuc->enabled) + return; - enabled = cpuc->enabled; cpuc->enabled = 0; /* * ensure we write the disable before we start disabling the @@ -334,8 +331,6 @@ static u64 amd_pmu_save_disable_all(void) * right thing. */ barrier(); - if (!enabled) - goto out; for (idx = 0; idx < x86_pmu.num_counters; idx++) { u64 val; @@ -348,37 +343,31 @@ static u64 amd_pmu_save_disable_all(void) val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; wrmsrl(MSR_K7_EVNTSEL0 + idx, val); } - -out: - return enabled; } -u64 hw_perf_save_disable(void) +void hw_perf_disable(void) { if (!x86_pmu_initialized()) - return 0; - return x86_pmu.save_disable_all(); + return; + return x86_pmu.disable_all(); } -/* - * Exported because of ACPI idle - */ -EXPORT_SYMBOL_GPL(hw_perf_save_disable); -static void intel_pmu_restore_all(u64 ctrl) +static void intel_pmu_enable_all(void) { - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); } -static void amd_pmu_restore_all(u64 ctrl) +static void amd_pmu_enable_all(void) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); int idx; - cpuc->enabled = ctrl; - barrier(); - if (!ctrl) + if (cpuc->enabled) return; + cpuc->enabled = 1; + barrier(); + for (idx = 0; idx < x86_pmu.num_counters; idx++) { u64 val; @@ -392,16 +381,12 @@ static void amd_pmu_restore_all(u64 ctrl) } } -void hw_perf_restore(u64 ctrl) +void hw_perf_enable(void) { if (!x86_pmu_initialized()) return; - x86_pmu.restore_all(ctrl); + x86_pmu.enable_all(); } -/* - * Exported because of ACPI idle - */ -EXPORT_SYMBOL_GPL(hw_perf_restore); static inline u64 intel_pmu_get_status(void) { @@ -735,15 +720,14 @@ static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi) int bit, cpu = smp_processor_id(); u64 ack, status; struct cpu_hw_counters *cpuc = &per_cpu(cpu_hw_counters, cpu); - int ret = 0; - - cpuc->throttle_ctrl = intel_pmu_save_disable_all(); + perf_disable(); status = intel_pmu_get_status(); - if (!status) - goto out; + if (!status) { + perf_enable(); + return 0; + } - ret = 1; again: inc_irq_stat(apic_perf_irqs); ack = status; @@ -767,19 +751,11 @@ again: status = intel_pmu_get_status(); if (status) goto again; -out: - /* - * Restore - do not reenable when global enable is off or throttled: - */ - if (cpuc->throttle_ctrl) { - if (++cpuc->interrupts < PERFMON_MAX_INTERRUPTS) { - intel_pmu_restore_all(cpuc->throttle_ctrl); - } else { - pr_info("CPU#%d: perfcounters: max interrupt rate exceeded! Throttle on.\n", smp_processor_id()); - } - } - return ret; + if (++cpuc->interrupts != PERFMON_MAX_INTERRUPTS) + perf_enable(); + + return 1; } static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) @@ -792,13 +768,11 @@ static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) struct hw_perf_counter *hwc; int idx, throttle = 0; - cpuc->throttle_ctrl = cpuc->enabled; - cpuc->enabled = 0; - barrier(); - - if (cpuc->throttle_ctrl) { - if (++cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) - throttle = 1; + if (++cpuc->interrupts == PERFMON_MAX_INTERRUPTS) { + throttle = 1; + __perf_disable(); + cpuc->enabled = 0; + barrier(); } for (idx = 0; idx < x86_pmu.num_counters; idx++) { @@ -824,9 +798,6 @@ next: amd_pmu_disable_counter(hwc, idx); } - if (cpuc->throttle_ctrl && !throttle) - cpuc->enabled = 1; - return handled; } @@ -839,13 +810,11 @@ void perf_counter_unthrottle(void) cpuc = &__get_cpu_var(cpu_hw_counters); if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) { - pr_info("CPU#%d: perfcounters: throttle off.\n", smp_processor_id()); - /* * Clear them before re-enabling irqs/NMIs again: */ cpuc->interrupts = 0; - hw_perf_restore(cpuc->throttle_ctrl); + perf_enable(); } else { cpuc->interrupts = 0; } @@ -931,8 +900,8 @@ static __read_mostly struct notifier_block perf_counter_nmi_notifier = { static struct x86_pmu intel_pmu = { .name = "Intel", .handle_irq = intel_pmu_handle_irq, - .save_disable_all = intel_pmu_save_disable_all, - .restore_all = intel_pmu_restore_all, + .disable_all = intel_pmu_disable_all, + .enable_all = intel_pmu_enable_all, .enable = intel_pmu_enable_counter, .disable = intel_pmu_disable_counter, .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, @@ -951,8 +920,8 @@ static struct x86_pmu intel_pmu = { static struct x86_pmu amd_pmu = { .name = "AMD", .handle_irq = amd_pmu_handle_irq, - .save_disable_all = amd_pmu_save_disable_all, - .restore_all = amd_pmu_restore_all, + .disable_all = amd_pmu_disable_all, + .enable_all = amd_pmu_enable_all, .enable = amd_pmu_enable_counter, .disable = amd_pmu_disable_counter, .eventsel = MSR_K7_EVNTSEL0, @@ -1003,6 +972,8 @@ static int intel_pmu_init(void) x86_pmu.counter_bits = eax.split.bit_width; x86_pmu.counter_mask = (1ULL << eax.split.bit_width) - 1; + rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); + return 0; } -- cgit v1.1 From a4016a79fcbd139e7378944c0d86a39fdbc70ecc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 14 May 2009 14:52:17 +0200 Subject: perf_counter: x86: Robustify interrupt handling Two consecutive NMIs could daze and confuse the machine when the first would handle the overflow of both counters. [ Impact: fix false-positive syslog messages under multi-session profiling ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 313638c..1dcf670 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -783,6 +783,10 @@ static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) counter = cpuc->counters[idx]; hwc = &counter->hw; + + if (counter->hw_event.nmi != nmi) + goto next; + val = x86_perf_counter_update(counter, hwc, idx); if (val & (1ULL << (x86_pmu.counter_bits - 1))) goto next; @@ -869,7 +873,6 @@ perf_counter_nmi_handler(struct notifier_block *self, { struct die_args *args = __args; struct pt_regs *regs; - int ret; if (!atomic_read(&active_counters)) return NOTIFY_DONE; @@ -886,9 +889,16 @@ perf_counter_nmi_handler(struct notifier_block *self, regs = args->regs; apic_write(APIC_LVTPC, APIC_DM_NMI); - ret = x86_pmu.handle_irq(regs, 1); + /* + * Can't rely on the handled return value to say it was our NMI, two + * counters could trigger 'simultaneously' raising two back-to-back NMIs. + * + * If the first NMI handles both, the latter will be empty and daze + * the CPU. + */ + x86_pmu.handle_irq(regs, 1); - return ret ? NOTIFY_STOP : NOTIFY_OK; + return NOTIFY_STOP; } static __read_mostly struct notifier_block perf_counter_nmi_notifier = { -- cgit v1.1 From 1c80f4b598d9b075a2a0be694e28be93a6702bcc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 15 May 2009 08:25:22 +0200 Subject: perf_counter: x86: Disallow interval of 1 On certain CPUs i have observed a stuck PMU if interval was set to 1 and NMIs were used. The PMU had PMC0 set in MSR_CORE_PERF_GLOBAL_STATUS, but it was not possible to ack it via MSR_CORE_PERF_GLOBAL_OVF_CTRL, and the NMI loop got stuck infinitely. [ Impact: fix rare hangs during high perfcounter load ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 1dcf670..46a82d1 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -473,6 +473,11 @@ x86_perf_counter_set_period(struct perf_counter *counter, left += period; atomic64_set(&hwc->period_left, left); } + /* + * Quirk: certain CPUs dont like it if just 1 event is left: + */ + if (unlikely(left < 2)) + left = 2; per_cpu(prev_left[idx], smp_processor_id()) = left; -- cgit v1.1 From 9029a5e3801f1cc7cdaab80169d82427acf928d8 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 15 May 2009 08:26:20 +0200 Subject: perf_counter: x86: Protect against infinite loops in intel_pmu_handle_irq() intel_pmu_handle_irq() can lock up in an infinite loop if the hardware does not allow the acking of irqs. Alas, this happened in testing so make this robust and emit a warning if it happens in the future. Also, clean up the IRQ handlers a bit. [ Impact: improve perfcounter irq/nmi handling robustness ] Acked-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 46a82d1..5a7f718 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -722,9 +722,13 @@ static void intel_pmu_save_and_restart(struct perf_counter *counter) */ static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi) { - int bit, cpu = smp_processor_id(); + struct cpu_hw_counters *cpuc; + struct cpu_hw_counters; + int bit, cpu, loops; u64 ack, status; - struct cpu_hw_counters *cpuc = &per_cpu(cpu_hw_counters, cpu); + + cpu = smp_processor_id(); + cpuc = &per_cpu(cpu_hw_counters, cpu); perf_disable(); status = intel_pmu_get_status(); @@ -733,7 +737,13 @@ static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi) return 0; } + loops = 0; again: + if (++loops > 100) { + WARN_ONCE(1, "perfcounters: irq loop stuck!\n"); + return 1; + } + inc_irq_stat(apic_perf_irqs); ack = status; for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { @@ -765,13 +775,14 @@ again: static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) { - int cpu = smp_processor_id(); - struct cpu_hw_counters *cpuc = &per_cpu(cpu_hw_counters, cpu); - u64 val; - int handled = 0; + int cpu, idx, throttle = 0, handled = 0; + struct cpu_hw_counters *cpuc; struct perf_counter *counter; struct hw_perf_counter *hwc; - int idx, throttle = 0; + u64 val; + + cpu = smp_processor_id(); + cpuc = &per_cpu(cpu_hw_counters, cpu); if (++cpuc->interrupts == PERFMON_MAX_INTERRUPTS) { throttle = 1; -- cgit v1.1 From 60db5e09c13109b13830cc9dcae688003fd39e79 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 15 May 2009 15:19:28 +0200 Subject: perf_counter: frequency based adaptive irq_period Instead of specifying the irq_period for a counter, provide a target interrupt frequency and dynamically adapt the irq_period to match this frequency. [ Impact: new perf-counter attribute/feature ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo LKML-Reference: <20090515132018.646195868@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 5a7f718..886dcf3 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -286,11 +286,8 @@ static int __hw_perf_counter_init(struct perf_counter *counter) hwc->nmi = 1; } - hwc->irq_period = hw_event->irq_period; - if ((s64)hwc->irq_period <= 0 || hwc->irq_period > x86_pmu.max_period) - hwc->irq_period = x86_pmu.max_period; - - atomic64_set(&hwc->period_left, hwc->irq_period); + atomic64_set(&hwc->period_left, + min(x86_pmu.max_period, hwc->irq_period)); /* * Raw event type provide the config in the event structure @@ -458,7 +455,7 @@ x86_perf_counter_set_period(struct perf_counter *counter, struct hw_perf_counter *hwc, int idx) { s64 left = atomic64_read(&hwc->period_left); - s64 period = hwc->irq_period; + s64 period = min(x86_pmu.max_period, hwc->irq_period); int err; /* -- cgit v1.1 From d9bcc01d58d18ed287091707b0b45c6ac888a11a Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Thu, 14 May 2009 12:06:12 +0530 Subject: x86, mtrr: replace MTRRcap_MSR with msr-index's MSR_MTRRcap Use standard msr-index.h's MSR declaration and no need to declare again. [ Impact: cleanup, no object code change ] Signed-off-by: Jaswinder Singh Rajput Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mtrr/generic.c | 4 ++-- arch/x86/kernel/cpu/mtrr/main.c | 2 +- arch/x86/kernel/cpu/mtrr/mtrr.h | 1 - 3 files changed, 3 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 0b776c0..de9c20f 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -306,7 +306,7 @@ void __init get_mtrr_state(void) vrs = mtrr_state.var_ranges; - rdmsr(MTRRcap_MSR, lo, dummy); + rdmsr(MSR_MTRRcap, lo, dummy); mtrr_state.have_fixed = (lo >> 8) & 1; for (i = 0; i < num_var_ranges; i++) @@ -703,7 +703,7 @@ int generic_validate_add_page(unsigned long base, unsigned long size, unsigned i static int generic_have_wrcomb(void) { unsigned long config, dummy; - rdmsr(MTRRcap_MSR, config, dummy); + rdmsr(MSR_MTRRcap, config, dummy); return (config & (1 << 10)); } diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 03cda01..8fc248b 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -104,7 +104,7 @@ static void __init set_num_var_ranges(void) unsigned long config = 0, dummy; if (use_intel()) { - rdmsr(MTRRcap_MSR, config, dummy); + rdmsr(MSR_MTRRcap, config, dummy); } else if (is_cpu(AMD)) config = 2; else if (is_cpu(CYRIX) || is_cpu(CENTAUR)) diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h index 77f67f7..5d37fb1 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.h +++ b/arch/x86/kernel/cpu/mtrr/mtrr.h @@ -5,7 +5,6 @@ #include #include -#define MTRRcap_MSR 0x0fe #define MTRRdefType_MSR 0x2ff #define MTRRfix64K_00000_MSR 0x250 -- cgit v1.1 From a036c7a358cc9d7ed28a188480b9a4d709e09b24 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Thu, 14 May 2009 12:10:43 +0530 Subject: x86, mtrr: replace MTRRfix64K_00000_MSR with msr-index's MSR_MTRRfix64K_00000 Use standard msr-index.h's MSR declaration and no need to declare again. [ Impact: cleanup, no object code change ] Signed-off-by: Jaswinder Singh Rajput Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mtrr/generic.c | 4 ++-- arch/x86/kernel/cpu/mtrr/mtrr.h | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index de9c20f..8b115c0 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -20,7 +20,7 @@ struct fixed_range_block { }; static struct fixed_range_block fixed_range_blocks[] = { - { MTRRfix64K_00000_MSR, 1 }, /* one 64k MTRR */ + { MSR_MTRRfix64K_00000, 1 }, /* one 64k MTRR */ { MTRRfix16K_80000_MSR, 2 }, /* two 16k MTRRs */ { MTRRfix4K_C0000_MSR, 8 }, /* eight 4k MTRRs */ {} @@ -194,7 +194,7 @@ get_fixed_ranges(mtrr_type * frs) k8_check_syscfg_dram_mod_en(); - rdmsr(MTRRfix64K_00000_MSR, p[0], p[1]); + rdmsr(MSR_MTRRfix64K_00000, p[0], p[1]); for (i = 0; i < 2; i++) rdmsr(MTRRfix16K_80000_MSR + i, p[2 + i * 2], p[3 + i * 2]); diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h index 5d37fb1..7f23cae 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.h +++ b/arch/x86/kernel/cpu/mtrr/mtrr.h @@ -7,7 +7,6 @@ #define MTRRdefType_MSR 0x2ff -#define MTRRfix64K_00000_MSR 0x250 #define MTRRfix16K_80000_MSR 0x258 #define MTRRfix16K_A0000_MSR 0x259 #define MTRRfix4K_C0000_MSR 0x268 -- cgit v1.1 From 7d9d55e449089df8463bca2045d702ae6cda64a2 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Thu, 14 May 2009 12:15:32 +0530 Subject: x86, mtrr: replace MTRRfix16K_80000_MSR with msr-index's MSR_MTRRfix16K_80000 Use standard msr-index.h's MSR declaration and no need to declare again [ Impact: cleanup, no object code change ] Signed-off-by: Jaswinder Singh Rajput Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mtrr/generic.c | 4 ++-- arch/x86/kernel/cpu/mtrr/mtrr.h | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 8b115c0..00437c2 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -21,7 +21,7 @@ struct fixed_range_block { static struct fixed_range_block fixed_range_blocks[] = { { MSR_MTRRfix64K_00000, 1 }, /* one 64k MTRR */ - { MTRRfix16K_80000_MSR, 2 }, /* two 16k MTRRs */ + { MSR_MTRRfix16K_80000, 2 }, /* two 16k MTRRs */ { MTRRfix4K_C0000_MSR, 8 }, /* eight 4k MTRRs */ {} }; @@ -197,7 +197,7 @@ get_fixed_ranges(mtrr_type * frs) rdmsr(MSR_MTRRfix64K_00000, p[0], p[1]); for (i = 0; i < 2; i++) - rdmsr(MTRRfix16K_80000_MSR + i, p[2 + i * 2], p[3 + i * 2]); + rdmsr(MSR_MTRRfix16K_80000 + i, p[2 + i * 2], p[3 + i * 2]); for (i = 0; i < 8; i++) rdmsr(MTRRfix4K_C0000_MSR + i, p[6 + i * 2], p[7 + i * 2]); } diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h index 7f23cae..712b605 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.h +++ b/arch/x86/kernel/cpu/mtrr/mtrr.h @@ -7,7 +7,6 @@ #define MTRRdefType_MSR 0x2ff -#define MTRRfix16K_80000_MSR 0x258 #define MTRRfix16K_A0000_MSR 0x259 #define MTRRfix4K_C0000_MSR 0x268 #define MTRRfix4K_C8000_MSR 0x269 -- cgit v1.1 From 654ac05801ae806661c8fdeb3b5c420a31cbc5b1 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Thu, 14 May 2009 12:21:54 +0530 Subject: x86, mtrr: remove mtrr MSRs double declaration Removed MTRR MSR from mtrr/mtrr.h as these are already declared in msr-index.h and nobody is using them: MTRRfix16K_A0000_MSR MTRRfix4K_C8000_MSR MTRRfix4K_D0000_MSR MTRRfix4K_D8000_MSR MTRRfix4K_E0000_MSR MTRRfix4K_E8000_MSR MTRRfix4K_F0000_MSR MTRRfix4K_F8000_MSR Use standard msr-index.h's MSR declaration and no need to declare again [ Impact: cleanup, no object code change ] Signed-off-by: Jaswinder Singh Rajput Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mtrr/mtrr.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h index 712b605..5053793 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.h +++ b/arch/x86/kernel/cpu/mtrr/mtrr.h @@ -7,15 +7,7 @@ #define MTRRdefType_MSR 0x2ff -#define MTRRfix16K_A0000_MSR 0x259 #define MTRRfix4K_C0000_MSR 0x268 -#define MTRRfix4K_C8000_MSR 0x269 -#define MTRRfix4K_D0000_MSR 0x26a -#define MTRRfix4K_D8000_MSR 0x26b -#define MTRRfix4K_E0000_MSR 0x26c -#define MTRRfix4K_E8000_MSR 0x26d -#define MTRRfix4K_F0000_MSR 0x26e -#define MTRRfix4K_F8000_MSR 0x26f #define MTRR_CHANGE_MASK_FIXED 0x01 #define MTRR_CHANGE_MASK_VARIABLE 0x02 -- cgit v1.1 From ba5673ff1ff5f428256db4cedd4b05b7be008bb6 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Thu, 14 May 2009 12:29:25 +0530 Subject: x86, mtrr: replace MTRRfix4K_C0000_MSR with msr-index's MSR_MTRRfix4K_C0000 Use standard msr-index.h's MSR declaration and no need to declare again. [ Impact: cleanup, no object code change ] Signed-off-by: Jaswinder Singh Rajput Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mtrr/generic.c | 4 ++-- arch/x86/kernel/cpu/mtrr/mtrr.h | 2 -- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 00437c2..3cf58e2 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -22,7 +22,7 @@ struct fixed_range_block { static struct fixed_range_block fixed_range_blocks[] = { { MSR_MTRRfix64K_00000, 1 }, /* one 64k MTRR */ { MSR_MTRRfix16K_80000, 2 }, /* two 16k MTRRs */ - { MTRRfix4K_C0000_MSR, 8 }, /* eight 4k MTRRs */ + { MSR_MTRRfix4K_C0000, 8 }, /* eight 4k MTRRs */ {} }; @@ -199,7 +199,7 @@ get_fixed_ranges(mtrr_type * frs) for (i = 0; i < 2; i++) rdmsr(MSR_MTRRfix16K_80000 + i, p[2 + i * 2], p[3 + i * 2]); for (i = 0; i < 8; i++) - rdmsr(MTRRfix4K_C0000_MSR + i, p[6 + i * 2], p[7 + i * 2]); + rdmsr(MSR_MTRRfix4K_C0000 + i, p[6 + i * 2], p[7 + i * 2]); } void mtrr_save_fixed_ranges(void *info) diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h index 5053793..e5ee686 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.h +++ b/arch/x86/kernel/cpu/mtrr/mtrr.h @@ -7,8 +7,6 @@ #define MTRRdefType_MSR 0x2ff -#define MTRRfix4K_C0000_MSR 0x268 - #define MTRR_CHANGE_MASK_FIXED 0x01 #define MTRR_CHANGE_MASK_VARIABLE 0x02 #define MTRR_CHANGE_MASK_DEFTYPE 0x04 -- cgit v1.1 From 52650257ea06bb15c2e2bbe854cbdf463726141a Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Thu, 14 May 2009 12:35:46 +0530 Subject: x86, mtrr: replace MTRRdefType_MSR with msr-index's MSR_MTRRdefType Use standard msr-index.h's MSR declaration and no need to declare again. [ Impact: cleanup, no object code change ] Signed-off-by: Jaswinder Singh Rajput Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mtrr/cleanup.c | 4 ++-- arch/x86/kernel/cpu/mtrr/generic.c | 8 ++++---- arch/x86/kernel/cpu/mtrr/mtrr.h | 2 -- arch/x86/kernel/cpu/mtrr/state.c | 6 +++--- 4 files changed, 9 insertions(+), 11 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index ce0fe4b..1d584a1 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -808,7 +808,7 @@ int __init mtrr_cleanup(unsigned address_bits) if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1) return 0; - rdmsr(MTRRdefType_MSR, def, dummy); + rdmsr(MSR_MTRRdefType, def, dummy); def &= 0xff; if (def != MTRR_TYPE_UNCACHABLE) return 0; @@ -1003,7 +1003,7 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) */ if (!is_cpu(INTEL) || disable_mtrr_trim) return 0; - rdmsr(MTRRdefType_MSR, def, dummy); + rdmsr(MSR_MTRRdefType, def, dummy); def &= 0xff; if (def != MTRR_TYPE_UNCACHABLE) return 0; diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 3cf58e2..e930a31 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -314,7 +314,7 @@ void __init get_mtrr_state(void) if (mtrr_state.have_fixed) get_fixed_ranges(mtrr_state.fixed_ranges); - rdmsr(MTRRdefType_MSR, lo, dummy); + rdmsr(MSR_MTRRdefType, lo, dummy); mtrr_state.def_type = (lo & 0xff); mtrr_state.enabled = (lo & 0xc00) >> 10; @@ -579,10 +579,10 @@ static void prepare_set(void) __acquires(set_atomicity_lock) __flush_tlb(); /* Save MTRR state */ - rdmsr(MTRRdefType_MSR, deftype_lo, deftype_hi); + rdmsr(MSR_MTRRdefType, deftype_lo, deftype_hi); /* Disable MTRRs, and set the default type to uncached */ - mtrr_wrmsr(MTRRdefType_MSR, deftype_lo & ~0xcff, deftype_hi); + mtrr_wrmsr(MSR_MTRRdefType, deftype_lo & ~0xcff, deftype_hi); } static void post_set(void) __releases(set_atomicity_lock) @@ -591,7 +591,7 @@ static void post_set(void) __releases(set_atomicity_lock) __flush_tlb(); /* Intel (P6) standard MTRRs */ - mtrr_wrmsr(MTRRdefType_MSR, deftype_lo, deftype_hi); + mtrr_wrmsr(MSR_MTRRdefType, deftype_lo, deftype_hi); /* Enable caches */ write_cr0(read_cr0() & 0xbfffffff); diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h index e5ee686..7538b76 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.h +++ b/arch/x86/kernel/cpu/mtrr/mtrr.h @@ -5,8 +5,6 @@ #include #include -#define MTRRdefType_MSR 0x2ff - #define MTRR_CHANGE_MASK_FIXED 0x01 #define MTRR_CHANGE_MASK_VARIABLE 0x02 #define MTRR_CHANGE_MASK_DEFTYPE 0x04 diff --git a/arch/x86/kernel/cpu/mtrr/state.c b/arch/x86/kernel/cpu/mtrr/state.c index 7f7e275..1f5fb15 100644 --- a/arch/x86/kernel/cpu/mtrr/state.c +++ b/arch/x86/kernel/cpu/mtrr/state.c @@ -35,7 +35,7 @@ void set_mtrr_prepare_save(struct set_mtrr_context *ctxt) if (use_intel()) /* Save MTRR state */ - rdmsr(MTRRdefType_MSR, ctxt->deftype_lo, ctxt->deftype_hi); + rdmsr(MSR_MTRRdefType, ctxt->deftype_lo, ctxt->deftype_hi); else /* Cyrix ARRs - everything else were excluded at the top */ ctxt->ccr3 = getCx86(CX86_CCR3); @@ -46,7 +46,7 @@ void set_mtrr_cache_disable(struct set_mtrr_context *ctxt) { if (use_intel()) /* Disable MTRRs, and set the default type to uncached */ - mtrr_wrmsr(MTRRdefType_MSR, ctxt->deftype_lo & 0xf300UL, + mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo & 0xf300UL, ctxt->deftype_hi); else if (is_cpu(CYRIX)) /* Cyrix ARRs - everything else were excluded at the top */ @@ -64,7 +64,7 @@ void set_mtrr_done(struct set_mtrr_context *ctxt) /* Restore MTRRdefType */ if (use_intel()) /* Intel (P6) standard MTRRs */ - mtrr_wrmsr(MTRRdefType_MSR, ctxt->deftype_lo, ctxt->deftype_hi); + mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo, ctxt->deftype_hi); else /* Cyrix ARRs - everything else was excluded at the top */ setCx86(CX86_CCR3, ctxt->ccr3); -- cgit v1.1 From d2517a49d55536b38c7a87e5289550cfedaa4dcc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 17 May 2009 10:04:45 +0200 Subject: perf_counter, x86: fix zero irq_period counters The quirk to irq_period unearthed an unrobustness we had in the hw_counter initialization sequence: we left irq_period at 0, which was then quirked up to 2 ... which then generated a _lot_ of interrupts during 'perf stat' runs, slowed them down and skewed the counter results in general. Initialize irq_period to the maximum instead. [ Impact: fix perf stat results ] Cc: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: Marcelo Tosatti Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 886dcf3..5bfd30a 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -286,6 +286,9 @@ static int __hw_perf_counter_init(struct perf_counter *counter) hwc->nmi = 1; } + if (!hwc->irq_period) + hwc->irq_period = x86_pmu.max_period; + atomic64_set(&hwc->period_left, min(x86_pmu.max_period, hwc->irq_period)); -- cgit v1.1 From e5198075c67a22ec9a09565b1ce88d3d3f5ba855 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 15 May 2009 13:05:16 -0700 Subject: x86, apic: introduce io_apic_irq_attr according to Ingo, io_apic irq-setup related functions have too many parameters with a repetitive signature. So reduce related funcs to get less params by passing a pointer to a newly defined io_apic_irq_attr structure. v2: io_apic_irq ==> irq_attr triggering ==> trigger v3: add set_io_apic_irq_attr [ Impact: cleanup ] Signed-off-by: Yinghai Lu Cc: Andrew Morton Cc: Jesse Barnes Cc: Len Brown LKML-Reference: <4A08ACD3.2070401@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/acpi/boot.c | 22 +++++++++++---------- arch/x86/kernel/apic/io_apic.c | 43 ++++++++++++++++++++++++------------------ 2 files changed, 37 insertions(+), 28 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index dcfbc3a..4af63df 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -523,7 +523,7 @@ int acpi_gsi_to_irq(u32 gsi, unsigned int *irq) * success: return IRQ number (>=0) * failure: return < 0 */ -int acpi_register_gsi(struct device *dev, u32 gsi, int triggering, int polarity) +int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity) { unsigned int irq; unsigned int plat_gsi = gsi; @@ -533,14 +533,14 @@ int acpi_register_gsi(struct device *dev, u32 gsi, int triggering, int polarity) * Make sure all (legacy) PCI IRQs are set as level-triggered. */ if (acpi_irq_model == ACPI_IRQ_MODEL_PIC) { - if (triggering == ACPI_LEVEL_SENSITIVE) + if (trigger == ACPI_LEVEL_SENSITIVE) eisa_set_level_irq(gsi); } #endif #ifdef CONFIG_X86_IO_APIC if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC) { - plat_gsi = mp_register_gsi(dev, gsi, triggering, polarity); + plat_gsi = mp_register_gsi(dev, gsi, trigger, polarity); } #endif acpi_gsi_to_irq(plat_gsi, &irq); @@ -1156,7 +1156,7 @@ void __init mp_config_acpi_legacy_irqs(void) } } -static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int triggering, +static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger, int polarity) { #ifdef CONFIG_X86_MPPARSE @@ -1181,7 +1181,7 @@ static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int triggering, /* print the entry should happen on mptable identically */ mp_irq.type = MP_INTSRC; mp_irq.irqtype = mp_INT; - mp_irq.irqflag = (triggering == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) | + mp_irq.irqflag = (trigger == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) | (polarity == ACPI_ACTIVE_HIGH ? 1 : 3); mp_irq.srcbus = number; mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3); @@ -1194,10 +1194,11 @@ static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int triggering, return 0; } -int mp_register_gsi(struct device *dev, u32 gsi, int triggering, int polarity) +int mp_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity) { int ioapic; int ioapic_pin; + struct io_apic_irq_attr irq_attr; if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC) return gsi; @@ -1225,11 +1226,12 @@ int mp_register_gsi(struct device *dev, u32 gsi, int triggering, int polarity) ioapic_pin); return gsi; } - mp_config_acpi_gsi(dev, gsi, triggering, polarity); + mp_config_acpi_gsi(dev, gsi, trigger, polarity); - io_apic_set_pci_routing(dev, ioapic, ioapic_pin, gsi, - triggering == ACPI_EDGE_SENSITIVE ? 0 : 1, - polarity == ACPI_ACTIVE_HIGH ? 0 : 1); + set_io_apic_irq_attr(&irq_attr, ioapic, ioapic_pin, + trigger == ACPI_EDGE_SENSITIVE ? 0 : 1, + polarity == ACPI_ACTIVE_HIGH ? 0 : 1); + io_apic_set_pci_routing(dev, gsi, &irq_attr); return gsi; } diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 74d2b48..ce1ac74 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1096,8 +1096,7 @@ static int pin_2_irq(int idx, int apic, int pin) * Not an __init, possibly needed by modules */ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin, - int *ioapic, int *ioapic_pin, - int *trigger, int *polarity) + struct io_apic_irq_attr *irq_attr) { int apic, i, best_guess = -1; @@ -1127,10 +1126,10 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin, continue; if (pin == (mp_irqs[i].srcbusirq & 3)) { - *ioapic = apic; - *ioapic_pin = mp_irqs[i].dstirq; - *trigger = irq_trigger(i); - *polarity = irq_polarity(i); + set_io_apic_irq_attr(irq_attr, apic, + mp_irqs[i].dstirq, + irq_trigger(i), + irq_polarity(i)); return irq; } /* @@ -1138,10 +1137,10 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin, * best-guess fuzzy result for broken mptables. */ if (best_guess < 0) { - *ioapic = apic; - *ioapic_pin = mp_irqs[i].dstirq; - *trigger = irq_trigger(i); - *polarity = irq_polarity(i); + set_io_apic_irq_attr(irq_attr, apic, + mp_irqs[i].dstirq, + irq_trigger(i), + irq_polarity(i)); best_guess = irq; } } @@ -3865,13 +3864,16 @@ int __init arch_probe_nr_irqs(void) } #endif -static int __io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, int irq, - int triggering, int polarity) +static int __io_apic_set_pci_routing(struct device *dev, int irq, + struct io_apic_irq_attr *irq_attr) { struct irq_desc *desc; struct irq_cfg *cfg; int node; + int ioapic, pin; + int trigger, polarity; + ioapic = irq_attr->ioapic; if (!IO_APIC_IRQ(irq)) { apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", ioapic); @@ -3889,6 +3891,10 @@ static int __io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, in return 0; } + pin = irq_attr->ioapic_pin; + trigger = irq_attr->trigger; + polarity = irq_attr->polarity; + /* * IRQs < 16 are already in the irq_2_pin[] map */ @@ -3897,20 +3903,22 @@ static int __io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, in add_pin_to_irq_node(cfg, node, ioapic, pin); } - setup_IO_APIC_irq(ioapic, pin, irq, desc, triggering, polarity); + setup_IO_APIC_irq(ioapic, pin, irq, desc, trigger, polarity); return 0; } -int io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, int irq, - int triggering, int polarity) +int io_apic_set_pci_routing(struct device *dev, int irq, + struct io_apic_irq_attr *irq_attr) { - + int ioapic, pin; /* * Avoid pin reprogramming. PRTs typically include entries * with redundant pin->gsi mappings (but unique PCI devices); * we only program the IOAPIC on the first. */ + ioapic = irq_attr->ioapic; + pin = irq_attr->ioapic_pin; if (test_bit(pin, mp_ioapic_routing[ioapic].pin_programmed)) { pr_debug("Pin %d-%d already programmed\n", mp_ioapics[ioapic].apicid, pin); @@ -3918,8 +3926,7 @@ int io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, int irq, } set_bit(pin, mp_ioapic_routing[ioapic].pin_programmed); - return __io_apic_set_pci_routing(dev, ioapic, pin, irq, - triggering, polarity); + return __io_apic_set_pci_routing(dev, irq, irq_attr); } /* -------------------------------------------------------------------------- -- cgit v1.1 From 2759c3287de27266e06f1f4e82cbd2d65f6a044c Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 15 May 2009 13:05:16 -0700 Subject: x86: don't call read_apic_id if !cpu_has_apic should not call that if apic is disabled. [ Impact: fix crash on certain UP configs ] Signed-off-by: Yinghai Lu Cc: Cyrill Gorcunov LKML-Reference: <4A09CCBB.2000306@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic_flat_64.c | 2 +- arch/x86/kernel/cpu/amd.c | 2 +- arch/x86/kernel/cpu/common.c | 6 ++++++ arch/x86/kernel/cpu/intel.c | 6 +++--- 4 files changed, 11 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index 744e6d8..d0c99ab 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -161,7 +161,7 @@ static int flat_apic_id_registered(void) static int flat_phys_pkg_id(int initial_apic_id, int index_msb) { - return hard_smp_processor_id() >> index_msb; + return initial_apic_id >> index_msb; } struct apic apic_flat = { diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 7e4a459..728b375 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -272,7 +272,7 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) #if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) int cpu = smp_processor_id(); int node; - unsigned apicid = hard_smp_processor_id(); + unsigned apicid = cpu_has_apic ? hard_smp_processor_id() : c->apicid; node = c->phys_proc_id; if (apicid_to_node[apicid] != NUMA_NO_NODE) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index c1caefc..017c600 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -761,6 +761,12 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) if (this_cpu->c_identify) this_cpu->c_identify(c); + /* Clear/Set all flags overriden by options, after probe */ + for (i = 0; i < NCAPINTS; i++) { + c->x86_capability[i] &= ~cpu_caps_cleared[i]; + c->x86_capability[i] |= cpu_caps_set[i]; + } + #ifdef CONFIG_X86_64 c->apicid = apic->phys_pkg_id(c->initial_apicid, 0); #endif diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 7437fa1..daed39b 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -229,12 +229,12 @@ static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c) } #endif -static void __cpuinit srat_detect_node(void) +static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) { #if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) unsigned node; int cpu = smp_processor_id(); - int apicid = hard_smp_processor_id(); + int apicid = cpu_has_apic ? hard_smp_processor_id() : c->apicid; /* Don't do the funky fallback heuristics the AMD version employs for now. */ @@ -400,7 +400,7 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) } /* Work around errata */ - srat_detect_node(); + srat_detect_node(c); if (cpu_has(c, X86_FEATURE_VMX)) detect_vmx_virtcap(c); -- cgit v1.1 From 35d5a9a61490bf39d2e48d7f499c8c801a39ebe9 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 15 May 2009 13:59:37 -0700 Subject: x86: fix system without memory on node0 Jack found a boot crash on a system which doesn't have memory on node0. It turns out with recent per_cpu changes, node_number for BSP will always be 0, and it is not consistent to cpu_to_node() that might set it to a different (nearer) node already. aka when numa_set_node() for node0 is called early before per_cpu area is setup: two places touched that per_cpu(node_number,): 1. in cpu/common.c::cpu_init() and it is not for BP | #ifdef CONFIG_NUMA | if (cpu != 0 && percpu_read(node_number) == 0 && | cpu_to_node(cpu) != NUMA_NO_NODE) | percpu_write(node_number, cpu_to_node(cpu)); | #endif for BP: traps_init ==> cpu_init for AP: start_secondary ==> cpu_init 2. cpu/intel.c or amd.c::srat_detect_node via numa_set_node() for BP: check_bugs ==> identify_boot_cpu ==> identify_cpu() that is rather later before numa_node_id() is used for BP... for AP: start_secondary => smp_callin => smp_store_cpu_info() => => identify_secondary_cpu => identify_cpu() so try to set that for BP earlier in setup_per_cpu_areas(), and don't bother to set that for APs there (it will be updated later and will be used later) (and don't mess the 0 before the copying BP per_cpu data to APs) [ Impact: fix boot crash on memoryless node-0 ] Reported-and-tested-by: Jack Steiner Cc: Tejun Heo Signed-off-by: Yinghai Lu LKML-Reference: <4A0C4A02.7050401@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup_percpu.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 3a97a4c..3b5f327 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -423,6 +423,14 @@ void __init setup_per_cpu_areas(void) early_per_cpu_ptr(x86_cpu_to_node_map) = NULL; #endif +#if defined(CONFIG_X86_64) && defined(CONFIG_NUMA) + /* + * make sure boot cpu node_number is right, when boot cpu is on the + * node that doesn't have mem installed + */ + per_cpu(node_number, boot_cpu_id) = cpu_to_node(boot_cpu_id); +#endif + /* Setup node to cpumask map */ setup_node_to_cpumask_map(); -- cgit v1.1 From 629e15d245f46bef9d26199b450f882f9437a8fe Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 15 May 2009 13:05:16 -0700 Subject: x86, irq: update_mptable needs pci_routeirq To get all device irq routing and to save them. This is basically an implicit pci=routeirq enablement if (and on if) the update_mptable boot option (which is off by default) has been specified. [ Impact: extend the update_mptable boot opion's scope ] Signed-off-by: Yinghai Lu Cc: Jesse Barnes LKML-Reference: <4A0DB7B4.4060702@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/mpparse.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index cd2a41a..e6bf9d0 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -961,6 +962,9 @@ static int __initdata enable_update_mptable; static int __init update_mptable_setup(char *str) { enable_update_mptable = 1; +#ifdef CONFIG_PCI + pci_routeirq = 1; +#endif return 0; } early_param("update_mptable", update_mptable_setup); @@ -973,6 +977,9 @@ static int __initdata alloc_mptable; static int __init parse_alloc_mptable_opt(char *p) { enable_update_mptable = 1; +#ifdef CONFIG_PCI + pci_routeirq = 1; +#endif alloc_mptable = 1; if (!p) return 0; -- cgit v1.1 From f1bdb523880c7f6990e9e8e50b0fc972ca475e84 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 15 May 2009 13:05:16 -0700 Subject: x86, irq: don't call mp_config_acpi_gsi() if update_mptable is not enabled Len expressed concern that the update_mptable feature has side-effects on the ACPI code. Make it sure explicitly that the code only ever gets called if the (default disabled) update_mptable boot quirk option is disabled. [ Impact: isolate the update_mptable feature from ACPI code more ] Signed-off-by: Yinghai Lu Cc: Len Brown LKML-Reference: <4A0DC832.5090200@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/acpi/boot.c | 4 +++- arch/x86/kernel/mpparse.c | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 4af63df..844e5e2 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -1226,7 +1226,9 @@ int mp_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity) ioapic_pin); return gsi; } - mp_config_acpi_gsi(dev, gsi, trigger, polarity); + + if (enable_update_mptable) + mp_config_acpi_gsi(dev, gsi, trigger, polarity); set_io_apic_irq_attr(&irq_attr, ioapic, ioapic_pin, trigger == ACPI_EDGE_SENSITIVE ? 0 : 1, diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index e6bf9d0..651c93b 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -957,7 +957,7 @@ out: return 0; } -static int __initdata enable_update_mptable; +int enable_update_mptable; static int __init update_mptable_setup(char *str) { -- cgit v1.1 From b68f1d2e7aa21029d73c7d453a8046e95d351740 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 17 May 2009 19:37:25 +0200 Subject: perf_counter, x86: speed up the scheduling fast-path We have to set up the LVT entry only at counter init time, not at every switch-in time. There's friction between NMI and non-NMI use here - we'll probably remove the per counter configurability of it - but until then, dont slow down things ... [ Impact: micro-optimization ] Cc: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: Marcelo Tosatti LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 5bfd30a..c109819 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -285,6 +285,7 @@ static int __hw_perf_counter_init(struct perf_counter *counter) return -EACCES; hwc->nmi = 1; } + perf_counters_lapic_init(hwc->nmi); if (!hwc->irq_period) hwc->irq_period = x86_pmu.max_period; @@ -603,8 +604,6 @@ try_generic: hwc->counter_base = x86_pmu.perfctr; } - perf_counters_lapic_init(hwc->nmi); - x86_pmu.disable(hwc, idx); cpuc->counters[idx] = counter; @@ -1054,7 +1053,7 @@ void __init init_hw_perf_counters(void) pr_info("... counter mask: %016Lx\n", perf_counter_mask); - perf_counters_lapic_init(0); + perf_counters_lapic_init(1); register_die_notifier(&perf_counter_nmi_notifier); } -- cgit v1.1 From 4c6f18fc81565967da20f2d4a3922cdba33f8e2b Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 18 May 2009 10:23:28 -0700 Subject: x86, io-apic: Don't mark pin_programmed early Peter bisected that: | commit b9c61b70075c87a8612624736faf4a2de5b1ed30 | Date: Wed May 6 10:10:06 2009 -0700 | | x86/pci: update pirq_enable_irq() to setup io apic routing | | So we can set io apic routing only when enabling the device irq. wrecked his opteron box, ata1 interrupts fail to get through. ata1 is using irq 11: [ 1.451839] sata_svw 0000:01:0e.0: version 2.3 [ 1.456333] sata_svw 0000:01:0e.0: PCI INT A -> GSI 11 (level, low) -> IRQ 11 [ 1.463639] scsi0 : sata_svw [ 1.466949] scsi1 : sata_svw [ 1.470022] scsi2 : sata_svw [ 1.473090] scsi3 : sata_svw [ 1.476112] ata1: SATA max UDMA/133 mmio m8192@0xff3fe000 port 0xff3fe000 irq 11 [ 1.483490] ata2: SATA max UDMA/133 mmio m8192@0xff3fe000 port 0xff3fe100 irq 11 [ 1.490870] ata3: SATA max UDMA/133 mmio m8192@0xff3fe000 port 0xff3fe200 irq 11 [ 1.498247] ata4: SATA max UDMA/133 mmio m8192@0xff3fe000 port 0xff3fe300 irq 11 that pin is overlapped with pin with legacy ones. We should not set bits in pin_programmed here, so that those bit could be set later via io_apic_set_pci_routing(). [ Impact: fix boot hang on certain systems ] Reported-by: Peter Zijlstra Signed-off-by: Yinghai Lu Tested-by: Peter Zijlstra Cc: Jack Steiner LKML-Reference: <4A119990.9020606@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index ce1ac74..ac7f3b6 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1537,7 +1537,10 @@ static void __init setup_IO_APIC_irqs(void) } cfg = desc->chip_data; add_pin_to_irq_node(cfg, node, apic_id, pin); - set_bit(pin, mp_ioapic_routing[apic_id].pin_programmed); + /* + * don't mark it in pin_programmed, so later acpi could + * set it correctly when irq < 16 + */ setup_IO_APIC_irq(apic_id, pin, irq, desc, irq_trigger(idx), irq_polarity(idx)); } -- cgit v1.1 From 34adc8062227f41b04ade0ff3fbd1dbe3002669e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 20 May 2009 20:13:28 +0200 Subject: perf_counter: Fix context removal deadlock Disable the PMU globally before removing a counter from a context. This fixes the following lockup: [22081.741922] ------------[ cut here ]------------ [22081.746668] WARNING: at arch/x86/kernel/cpu/perf_counter.c:803 intel_pmu_handle_irq+0x9b/0x24e() [22081.755624] Hardware name: X8DTN [22081.758903] perfcounters: irq loop stuck! [22081.762985] Modules linked in: [22081.766136] Pid: 11082, comm: perf Not tainted 2.6.30-rc6-tip #226 [22081.772432] Call Trace: [22081.774940] [] ? intel_pmu_handle_irq+0x9b/0x24e [22081.781993] [] ? intel_pmu_handle_irq+0x9b/0x24e [22081.788368] [] ? warn_slowpath_common+0x77/0xa3 [22081.794649] [] ? warn_slowpath_fmt+0x40/0x45 [22081.800696] [] ? intel_pmu_handle_irq+0x9b/0x24e [22081.807080] [] ? perf_counter_nmi_handler+0x3f/0x4a [22081.813751] [] ? notifier_call_chain+0x58/0x86 [22081.819951] [] ? notify_die+0x2d/0x32 [22081.825392] [] ? do_nmi+0x8e/0x242 [22081.830538] [] ? nmi+0x1a/0x20 [22081.835342] [] ? selinux_file_free_security+0x0/0x1a [22081.842105] [] ? x86_pmu_disable_counter+0x15/0x41 [22081.848673] <> [] ? x86_pmu_disable+0x86/0x103 [22081.855512] [] ? __perf_counter_remove_from_context+0x0/0xfe [22081.862926] [] ? counter_sched_out+0x30/0xce [22081.868909] [] ? __perf_counter_remove_from_context+0x59/0xfe [22081.876382] [] ? smp_call_function_single+0x6c/0xe6 [22081.882955] [] ? perf_release+0x86/0x14c [22081.888600] [] ? __fput+0xe7/0x195 [22081.893718] [] ? filp_close+0x5b/0x62 [22081.899107] [] ? put_files_struct+0x64/0xc2 [22081.905031] [] ? do_exit+0x1e2/0x6ef [22081.910360] [] ? _spin_lock_irqsave+0x9/0xe [22081.916292] [] ? do_group_exit+0x67/0x93 [22081.921953] [] ? sys_exit_group+0x12/0x16 [22081.927759] [] ? system_call_fastpath+0x16/0x1b [22081.934076] ---[ end trace 3a3936ce3e1b4505 ]--- And could potentially also fix the lockup reported by Marcelo Tosatti. Also, print more debug info in case of a detected lockup. [ Impact: fix lockup ] Reported-by: Marcelo Tosatti Acked-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Thomas Gleixner LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index c109819..6cc1660 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -740,6 +740,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi) again: if (++loops > 100) { WARN_ONCE(1, "perfcounters: irq loop stuck!\n"); + perf_counter_print_debug(); return 1; } -- cgit v1.1 From a63eaf34ae60bdb067a354cc8def2e8f4a01f5f4 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 22 May 2009 14:17:31 +1000 Subject: perf_counter: Dynamically allocate tasks' perf_counter_context struct This replaces the struct perf_counter_context in the task_struct with a pointer to a dynamically allocated perf_counter_context struct. The main reason for doing is this is to allow us to transfer a perf_counter_context from one task to another when we do lazy PMU switching in a later patch. This has a few side-benefits: the task_struct becomes a little smaller, we save some memory because only tasks that have perf_counters attached get a perf_counter_context allocated for them, and we can remove the inclusion of in sched.h, meaning that we don't end up recompiling nearly everything whenever perf_counter.h changes. The perf_counter_context structures are reference-counted and freed when the last reference is dropped. A context can have references from its task and the counters on its task. Counters can outlive the task so it is possible that a context will be freed well after its task has exited. Contexts are allocated on fork if the parent had a context, or otherwise the first time that a per-task counter is created on a task. In the latter case, we set the context pointer in the task struct locklessly using an atomic compare-and-exchange operation in case we raced with some other task in creating a context for the subject task. This also removes the task pointer from the perf_counter struct. The task pointer was not used anywhere and would make it harder to move a context from one task to another. Anything that needed to know which task a counter was attached to was already using counter->ctx->task. The __perf_counter_init_context function moves up in perf_counter.c so that it can be called from find_get_context, and now initializes the refcount, but is otherwise unchanged. We were potentially calling list_del_counter twice: once from __perf_counter_exit_task when the task exits and once from __perf_counter_remove_from_context when the counter's fd gets closed. This adds a check in list_del_counter so it doesn't do anything if the counter has already been removed from the lists. Since perf_counter_task_sched_in doesn't do anything if the task doesn't have a context, and leaves cpuctx->task_ctx = NULL, this adds code to __perf_install_in_context to set cpuctx->task_ctx if necessary, i.e. in the case where the current task adds the first counter to itself and thus creates a context for itself. This also adds similar code to __perf_counter_enable to handle a similar situation which can arise when the counters have been disabled using prctl; that also leaves cpuctx->task_ctx = NULL. [ Impact: refactor counter context management to prepare for new feature ] Signed-off-by: Paul Mackerras Acked-by: Peter Zijlstra Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo LKML-Reference: <18966.10075.781053.231153@cargo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index e9021a9..b4f6440 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -14,6 +14,7 @@ * Mikael Pettersson : PM converted to driver model. */ +#include #include #include #include -- cgit v1.1 From ff99be573e02e9f7edc23b472c7f9a5ddba12795 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 25 May 2009 17:39:03 +0200 Subject: perf_counter: x86: Expose INV and EDGE bits Expose the INV and EDGE bits of the PMU to raw configs. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <20090525153931.494709027@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 6cc1660..c14437f 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -87,11 +87,15 @@ static u64 intel_pmu_raw_event(u64 event) { #define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL #define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL +#define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL +#define CORE_EVNTSEL_INV_MASK 0x00800000ULL #define CORE_EVNTSEL_COUNTER_MASK 0xFF000000ULL #define CORE_EVNTSEL_MASK \ (CORE_EVNTSEL_EVENT_MASK | \ CORE_EVNTSEL_UNIT_MASK | \ + CORE_EVNTSEL_EDGE_MASK | \ + CORE_EVNTSEL_INV_MASK | \ CORE_EVNTSEL_COUNTER_MASK) return event & CORE_EVNTSEL_MASK; @@ -119,11 +123,15 @@ static u64 amd_pmu_raw_event(u64 event) { #define K7_EVNTSEL_EVENT_MASK 0x7000000FFULL #define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL +#define K7_EVNTSEL_EDGE_MASK 0x000040000ULL +#define K7_EVNTSEL_INV_MASK 0x000800000ULL #define K7_EVNTSEL_COUNTER_MASK 0x0FF000000ULL #define K7_EVNTSEL_MASK \ (K7_EVNTSEL_EVENT_MASK | \ K7_EVNTSEL_UNIT_MASK | \ + K7_EVNTSEL_EDGE_MASK | \ + K7_EVNTSEL_INV_MASK | \ K7_EVNTSEL_COUNTER_MASK) return event & K7_EVNTSEL_MASK; -- cgit v1.1 From 48e22d56ecdeddd1ffb42a02fccba5c6ef42b133 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 25 May 2009 17:39:04 +0200 Subject: perf_counter: x86: Remove interrupt throttle remove the x86 specific interrupt throttle Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <20090525153931.616671838@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 2 -- arch/x86/kernel/cpu/perf_counter.c | 47 ++++---------------------------------- 2 files changed, 5 insertions(+), 44 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index b4f6440..89b63b5 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -763,8 +763,6 @@ static void local_apic_timer_interrupt(void) inc_irq_stat(apic_timer_irqs); evt->event_handler(evt); - - perf_counter_unthrottle(); } /* diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index c14437f..8c8177f 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -719,11 +719,6 @@ static void intel_pmu_save_and_restart(struct perf_counter *counter) } /* - * Maximum interrupt frequency of 100KHz per CPU - */ -#define PERFMON_MAX_INTERRUPTS (100000/HZ) - -/* * This handler is triggered by the local APIC, so the APIC IRQ handling * rules apply: */ @@ -775,15 +770,14 @@ again: if (status) goto again; - if (++cpuc->interrupts != PERFMON_MAX_INTERRUPTS) - perf_enable(); + perf_enable(); return 1; } static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) { - int cpu, idx, throttle = 0, handled = 0; + int cpu, idx, handled = 0; struct cpu_hw_counters *cpuc; struct perf_counter *counter; struct hw_perf_counter *hwc; @@ -792,16 +786,7 @@ static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) cpu = smp_processor_id(); cpuc = &per_cpu(cpu_hw_counters, cpu); - if (++cpuc->interrupts == PERFMON_MAX_INTERRUPTS) { - throttle = 1; - __perf_disable(); - cpuc->enabled = 0; - barrier(); - } - for (idx = 0; idx < x86_pmu.num_counters; idx++) { - int disable = 0; - if (!test_bit(idx, cpuc->active_mask)) continue; @@ -809,45 +794,23 @@ static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) hwc = &counter->hw; if (counter->hw_event.nmi != nmi) - goto next; + continue; val = x86_perf_counter_update(counter, hwc, idx); if (val & (1ULL << (x86_pmu.counter_bits - 1))) - goto next; + continue; /* counter overflow */ x86_perf_counter_set_period(counter, hwc, idx); handled = 1; inc_irq_stat(apic_perf_irqs); - disable = perf_counter_overflow(counter, nmi, regs, 0); - -next: - if (disable || throttle) + if (perf_counter_overflow(counter, nmi, regs, 0)) amd_pmu_disable_counter(hwc, idx); } return handled; } -void perf_counter_unthrottle(void) -{ - struct cpu_hw_counters *cpuc; - - if (!x86_pmu_initialized()) - return; - - cpuc = &__get_cpu_var(cpu_hw_counters); - if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) { - /* - * Clear them before re-enabling irqs/NMIs again: - */ - cpuc->interrupts = 0; - perf_enable(); - } else { - cpuc->interrupts = 0; - } -} - void smp_perf_counter_interrupt(struct pt_regs *regs) { irq_enter(); -- cgit v1.1 From a78ac3258782f3e64cb40beb5990808e1febcc0c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 25 May 2009 17:39:05 +0200 Subject: perf_counter: Generic per counter interrupt throttle Introduce a generic per counter interrupt throttle. This uses the perf_counter_overflow() quick disable to throttle a specific counter when its going too fast when a pmu->unthrottle() method is provided which can undo the quick disable. Power needs to implement both the quick disable and the unthrottle method. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <20090525153931.703093461@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 8c8177f..c4b543d 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -623,6 +623,18 @@ try_generic: return 0; } +static void x86_pmu_unthrottle(struct perf_counter *counter) +{ + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + struct hw_perf_counter *hwc = &counter->hw; + + if (WARN_ON_ONCE(hwc->idx >= X86_PMC_IDX_MAX || + cpuc->counters[hwc->idx] != counter)) + return; + + x86_pmu.enable(hwc, hwc->idx); +} + void perf_counter_print_debug(void) { u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed; @@ -1038,6 +1050,7 @@ static const struct pmu pmu = { .enable = x86_pmu_enable, .disable = x86_pmu_disable, .read = x86_pmu_read, + .unthrottle = x86_pmu_unthrottle, }; const struct pmu *hw_perf_counter_init(struct perf_counter *counter) -- cgit v1.1 From 53b441a565bf4036ab49c8ea04c5ad06ace7dd6b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 25 May 2009 21:41:28 +0200 Subject: Revert "perf_counter, x86: speed up the scheduling fast-path" This reverts commit b68f1d2e7aa21029d73c7d453a8046e95d351740. It is causing problems (stuck/stuttering profiling) - when mixed NMI and non-NMI counters are used. Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <20090525153931.703093461@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index c4b543d..189bf9d 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -293,7 +293,6 @@ static int __hw_perf_counter_init(struct perf_counter *counter) return -EACCES; hwc->nmi = 1; } - perf_counters_lapic_init(hwc->nmi); if (!hwc->irq_period) hwc->irq_period = x86_pmu.max_period; @@ -612,6 +611,8 @@ try_generic: hwc->counter_base = x86_pmu.perfctr; } + perf_counters_lapic_init(hwc->nmi); + x86_pmu.disable(hwc, idx); cpuc->counters[idx] = counter; @@ -1037,7 +1038,7 @@ void __init init_hw_perf_counters(void) pr_info("... counter mask: %016Lx\n", perf_counter_mask); - perf_counters_lapic_init(1); + perf_counters_lapic_init(0); register_die_notifier(&perf_counter_nmi_notifier); } -- cgit v1.1 From 79202ba9ff8cf570a75596f42e011167734d1c4b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 26 May 2009 08:10:00 +0200 Subject: perf_counter, x86: Fix APIC NMI programming My Nehalem box locks up in certain situations (with an always-asserted NMI causing a lockup) if the PMU LVT entry is programmed between NMI and IRQ mode with a high frequency. Standardize exlusively on NMIs instead. [ Impact: fix lockup ] Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner Cc: John Kacur LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 189bf9d..ece3813 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -285,14 +285,10 @@ static int __hw_perf_counter_init(struct perf_counter *counter) hwc->config |= ARCH_PERFMON_EVENTSEL_OS; /* - * If privileged enough, allow NMI events: + * Use NMI events all the time: */ - hwc->nmi = 0; - if (hw_event->nmi) { - if (sysctl_perf_counter_priv && !capable(CAP_SYS_ADMIN)) - return -EACCES; - hwc->nmi = 1; - } + hwc->nmi = 1; + hw_event->nmi = 1; if (!hwc->irq_period) hwc->irq_period = x86_pmu.max_period; @@ -553,9 +549,6 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc) if (!x86_pmu.num_counters_fixed) return -1; - if (unlikely(hwc->nmi)) - return -1; - event = hwc->config & ARCH_PERFMON_EVENT_MASK; if (unlikely(event == x86_pmu.event_map(PERF_COUNT_INSTRUCTIONS))) @@ -806,9 +799,6 @@ static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) counter = cpuc->counters[idx]; hwc = &counter->hw; - if (counter->hw_event.nmi != nmi) - continue; - val = x86_perf_counter_update(counter, hwc, idx); if (val & (1ULL << (x86_pmu.counter_bits - 1))) continue; -- cgit v1.1 From aaba98018b8295dfa2119345d17f833d74448cd0 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 26 May 2009 08:10:00 +0200 Subject: perf_counter, x86: Make NMI lockups more robust We have a debug check that detects stuck NMIs and returns with the PMU disabled in the global ctrl MSR - but i managed to trigger a situation where this was not enough to deassert the NMI. So clear/reset the full PMU and keep the disable count balanced when exiting from here. This way the box produces a debug warning but stays up and is more debuggable. [ Impact: in case of PMU related bugs, recover more gracefully ] Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner Cc: John Kacur LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index ece3813..2eeaa99 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -724,6 +724,30 @@ static void intel_pmu_save_and_restart(struct perf_counter *counter) intel_pmu_enable_counter(hwc, idx); } +static void intel_pmu_reset(void) +{ + unsigned long flags; + int idx; + + if (!x86_pmu.num_counters) + return; + + local_irq_save(flags); + + printk("clearing PMU state on CPU#%d\n", smp_processor_id()); + + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + checking_wrmsrl(x86_pmu.eventsel + idx, 0ull); + checking_wrmsrl(x86_pmu.perfctr + idx, 0ull); + } + for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) { + checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); + } + + local_irq_restore(flags); +} + + /* * This handler is triggered by the local APIC, so the APIC IRQ handling * rules apply: @@ -750,6 +774,8 @@ again: if (++loops > 100) { WARN_ONCE(1, "perfcounters: irq loop stuck!\n"); perf_counter_print_debug(); + intel_pmu_reset(); + perf_enable(); return 1; } -- cgit v1.1 From 7d96fd41cadc55f4e00231c8c71b8e25c779f122 Mon Sep 17 00:00:00 2001 From: Petr Tesarik Date: Mon, 25 May 2009 11:02:02 +0200 Subject: x86: move rdtsc_barrier() into the TSC vread method The *fence instructions were moved to vsyscall_64.c by commit cb9e35dce94a1b9c59d46224e8a94377d673e204. But this breaks the vDSO, because vread methods are also called from there. Besides, the synchronization might be unnecessary for other time sources than TSC. [ Impact: fix potential time warp in VDSO ] Signed-off-by: Petr Tesarik LKML-Reference: <9d0ea9ea0f866bdc1f4d76831221ae117f11ea67.1243241859.git.ptesarik@suse.cz> Signed-off-by: Thomas Gleixner Cc: --- arch/x86/kernel/tsc.c | 11 ++++++++++- arch/x86/kernel/vsyscall_64.c | 8 -------- 2 files changed, 10 insertions(+), 9 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index d57de05..cf8611d 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -710,7 +710,16 @@ static cycle_t read_tsc(struct clocksource *cs) #ifdef CONFIG_X86_64 static cycle_t __vsyscall_fn vread_tsc(void) { - cycle_t ret = (cycle_t)vget_cycles(); + cycle_t ret; + + /* + * Surround the RDTSC by barriers, to make sure it's not + * speculated to outside the seqlock critical section and + * does not cause time warps: + */ + rdtsc_barrier(); + ret = (cycle_t)vget_cycles(); + rdtsc_barrier(); return ret >= __vsyscall_gtod_data.clock.cycle_last ? ret : __vsyscall_gtod_data.clock.cycle_last; diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 44153af..25ee06a 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -132,15 +132,7 @@ static __always_inline void do_vgettimeofday(struct timeval * tv) return; } - /* - * Surround the RDTSC by barriers, to make sure it's not - * speculated to outside the seqlock critical section and - * does not cause time warps: - */ - rdtsc_barrier(); now = vread(); - rdtsc_barrier(); - base = __vsyscall_gtod_data.clock.cycle_last; mask = __vsyscall_gtod_data.clock.mask; mult = __vsyscall_gtod_data.clock.mult; -- cgit v1.1 From fefda117ddb324b872312f1f061230e627c9f5ee Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 20 May 2009 12:21:42 +0200 Subject: amd-iommu: add amd_iommu_dump parameter This kernel parameter will be useful to get some AMD IOMMU related information in dmesg that is not necessary for the default user but may be helpful in debug situations. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu_init.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 8c0be09..57fb7a7 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -115,6 +115,8 @@ struct ivmd_header { u64 range_length; } __attribute__((packed)); +bool amd_iommu_dump; + static int __initdata amd_iommu_detected; u16 amd_iommu_last_bdf; /* largest PCI device id we have @@ -1211,6 +1213,13 @@ void __init amd_iommu_detect(void) * ****************************************************************************/ +static int __init parse_amd_iommu_dump(char *str) +{ + amd_iommu_dump = true; + + return 1; +} + static int __init parse_amd_iommu_options(char *str) { for (; *str; ++str) { @@ -1235,5 +1244,6 @@ static int __init parse_amd_iommu_size_options(char *str) return 1; } +__setup("amd_iommu_dump", parse_amd_iommu_dump); __setup("amd_iommu=", parse_amd_iommu_options); __setup("amd_iommu_size=", parse_amd_iommu_size_options); -- cgit v1.1 From 9c72041f719e2864d4208a89341c36b316dbf893 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 20 May 2009 13:53:57 +0200 Subject: amd-iommu: add dump for iommus described in ivrs table Add information about IOMMU devices described in the IVRS ACPI table to the kernel log if amd_iommu_dump was specified on the kernel command line. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu_init.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 57fb7a7..2816590 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -748,6 +748,15 @@ static int __init init_iommu_all(struct acpi_table_header *table) h = (struct ivhd_header *)p; switch (*p) { case ACPI_IVHD_TYPE: + + DUMP_printk("IOMMU: device: %02x:%02x.%01x cap: %04x " + "seg: %d flags: %01x info %04x\n", + PCI_BUS(h->devid), PCI_SLOT(h->devid), + PCI_FUNC(h->devid), h->cap_ptr, + h->pci_seg, h->flags, h->info); + DUMP_printk(" mmio-addr: %016llx\n", + h->mmio_phys); + iommu = kzalloc(sizeof(struct amd_iommu), GFP_KERNEL); if (iommu == NULL) return -ENOMEM; -- cgit v1.1 From 42a698f40a0946f5517308411b9e003ae031414d Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 20 May 2009 15:41:28 +0200 Subject: amd-iommu: print ivhd information to dmesg when requested Add information about devices belonging to an IOMMU as described in the IVRS ACPI table to the kernel log if amd_iommu_dump was specified on the kernel command line. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu_init.c | 73 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 2816590..fe3e645 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -598,32 +598,83 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu, p += sizeof(struct ivhd_header); end += h->length; + while (p < end) { e = (struct ivhd_entry *)p; switch (e->type) { case IVHD_DEV_ALL: + + DUMP_printk(" DEV_ALL\t\t\t first devid: %02x:%02x.%x" + " last device %02x:%02x.%x flags: %02x\n", + PCI_BUS(iommu->first_device), + PCI_SLOT(iommu->first_device), + PCI_FUNC(iommu->first_device), + PCI_BUS(iommu->last_device), + PCI_SLOT(iommu->last_device), + PCI_FUNC(iommu->last_device), + e->flags); + for (dev_i = iommu->first_device; dev_i <= iommu->last_device; ++dev_i) set_dev_entry_from_acpi(iommu, dev_i, e->flags, 0); break; case IVHD_DEV_SELECT: + + DUMP_printk(" DEV_SELECT\t\t\t devid: %02x:%02x.%x " + "flags: %02x\n", + PCI_BUS(e->devid), + PCI_SLOT(e->devid), + PCI_FUNC(e->devid), + e->flags); + devid = e->devid; set_dev_entry_from_acpi(iommu, devid, e->flags, 0); break; case IVHD_DEV_SELECT_RANGE_START: + + DUMP_printk(" DEV_SELECT_RANGE_START\t " + "devid: %02x:%02x.%x flags: %02x\n", + PCI_BUS(e->devid), + PCI_SLOT(e->devid), + PCI_FUNC(e->devid), + e->flags); + devid_start = e->devid; flags = e->flags; ext_flags = 0; alias = false; break; case IVHD_DEV_ALIAS: + + DUMP_printk(" DEV_ALIAS\t\t\t devid: %02x:%02x.%x " + "flags: %02x devid_to: %02x:%02x.%x\n", + PCI_BUS(e->devid), + PCI_SLOT(e->devid), + PCI_FUNC(e->devid), + e->flags, + PCI_BUS(e->ext >> 8), + PCI_SLOT(e->ext >> 8), + PCI_FUNC(e->ext >> 8)); + devid = e->devid; devid_to = e->ext >> 8; set_dev_entry_from_acpi(iommu, devid, e->flags, 0); amd_iommu_alias_table[devid] = devid_to; break; case IVHD_DEV_ALIAS_RANGE: + + DUMP_printk(" DEV_ALIAS_RANGE\t\t " + "devid: %02x:%02x.%x flags: %02x " + "devid_to: %02x:%02x.%x\n", + PCI_BUS(e->devid), + PCI_SLOT(e->devid), + PCI_FUNC(e->devid), + e->flags, + PCI_BUS(e->ext >> 8), + PCI_SLOT(e->ext >> 8), + PCI_FUNC(e->ext >> 8)); + devid_start = e->devid; flags = e->flags; devid_to = e->ext >> 8; @@ -631,17 +682,39 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu, alias = true; break; case IVHD_DEV_EXT_SELECT: + + DUMP_printk(" DEV_EXT_SELECT\t\t devid: %02x:%02x.%x " + "flags: %02x ext: %08x\n", + PCI_BUS(e->devid), + PCI_SLOT(e->devid), + PCI_FUNC(e->devid), + e->flags, e->ext); + devid = e->devid; set_dev_entry_from_acpi(iommu, devid, e->flags, e->ext); break; case IVHD_DEV_EXT_SELECT_RANGE: + + DUMP_printk(" DEV_EXT_SELECT_RANGE\t devid: " + "%02x:%02x.%x flags: %02x ext: %08x\n", + PCI_BUS(e->devid), + PCI_SLOT(e->devid), + PCI_FUNC(e->devid), + e->flags, e->ext); + devid_start = e->devid; flags = e->flags; ext_flags = e->ext; alias = false; break; case IVHD_DEV_RANGE_END: + + DUMP_printk(" DEV_RANGE_END\t\t devid: %02x:%02x.%x\n", + PCI_BUS(e->devid), + PCI_SLOT(e->devid), + PCI_FUNC(e->devid)); + devid = e->devid; for (dev_i = devid_start; dev_i <= devid; ++dev_i) { if (alias) -- cgit v1.1 From 02acc43a294098c2a4cd22cf24e9c988644f9f7f Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 20 May 2009 16:24:21 +0200 Subject: amd-iommu: print ivmd information to dmesg when requested Add information about device memory mapping requirements for the IOMMU as described in the IVRS ACPI table to the kernel log if amd_iommu_dump was specified on the kernel command line. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu_init.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index fe3e645..b90a78c 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -983,6 +983,7 @@ static int __init init_exclusion_range(struct ivmd_header *m) static int __init init_unity_map_range(struct ivmd_header *m) { struct unity_map_entry *e = 0; + char *s; e = kzalloc(sizeof(*e), GFP_KERNEL); if (e == NULL) @@ -991,13 +992,16 @@ static int __init init_unity_map_range(struct ivmd_header *m) switch (m->type) { default: case ACPI_IVMD_TYPE: + s = "IVMD_TYPEi\t\t\t"; e->devid_start = e->devid_end = m->devid; break; case ACPI_IVMD_TYPE_ALL: + s = "IVMD_TYPE_ALL\t\t"; e->devid_start = 0; e->devid_end = amd_iommu_last_bdf; break; case ACPI_IVMD_TYPE_RANGE: + s = "IVMD_TYPE_RANGE\t\t"; e->devid_start = m->devid; e->devid_end = m->aux; break; @@ -1006,6 +1010,13 @@ static int __init init_unity_map_range(struct ivmd_header *m) e->address_end = e->address_start + PAGE_ALIGN(m->range_length); e->prot = m->flags >> 1; + DUMP_printk("%s devid_start: %02x:%02x.%x devid_end: %02x:%02x.%x" + " range_start: %016llx range_end: %016llx flags: %x\n", s, + PCI_BUS(e->devid_start), PCI_SLOT(e->devid_start), + PCI_FUNC(e->devid_start), PCI_BUS(e->devid_end), + PCI_SLOT(e->devid_end), PCI_FUNC(e->devid_end), + e->address_start, e->address_end, m->flags); + list_add_tail(&e->list, &amd_iommu_unity_map); return 0; -- cgit v1.1 From b3b99ef8b4f80f3f093a72110e7697c2281ae45d Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 22 May 2009 12:02:48 +0200 Subject: amd-iommu: move protection domain printk to dump code This information is only helpful for debugging. Don't print it anymore unless explicitly requested. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index a97db99..3356599 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -1009,8 +1009,9 @@ static int device_change_notifier(struct notifier_block *nb, if (!dma_domain) dma_domain = iommu->default_dom; attach_device(iommu, &dma_domain->domain, devid); - printk(KERN_INFO "AMD IOMMU: Using protection domain %d for " - "device %s\n", dma_domain->domain.id, dev_name(dev)); + DUMP_printk(KERN_INFO "AMD IOMMU: Using protection domain " + "%d for device %s\n", + dma_domain->domain.id, dev_name(dev)); break; case BUS_NOTIFY_UNBIND_DRIVER: if (!domain) @@ -1133,8 +1134,9 @@ static int get_device_resources(struct device *dev, dma_dom = (*iommu)->default_dom; *domain = &dma_dom->domain; attach_device(*iommu, *domain, *bdf); - printk(KERN_INFO "AMD IOMMU: Using protection domain %d for " - "device %s\n", (*domain)->id, dev_name(dev)); + DUMP_printk(KERN_INFO "AMD IOMMU: Using protection domain " + "%d for device %s\n", + (*domain)->id, dev_name(dev)); } if (domain_for_device(_bdf) == NULL) -- cgit v1.1 From 2e8b569614b89c9b1b85cba37db36daeeeff744e Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 22 May 2009 12:44:03 +0200 Subject: amd-iommu: disable device isolation with CONFIG_IOMMU_STRESS With device isolation disabled we can test better for race conditions in dma_ops related code. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu_init.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index b90a78c..6694112 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -124,8 +124,14 @@ u16 amd_iommu_last_bdf; /* largest PCI device id we have LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings we find in ACPI */ unsigned amd_iommu_aperture_order = 26; /* size of aperture in power of 2 */ + +#ifdef CONFIG_IOMMU_STRESS +bool amd_iommu_isolate = false; +#else bool amd_iommu_isolate = true; /* if true, device isolation is enabled */ +#endif + bool amd_iommu_unmap_flush; /* if true, flush on every unmap */ LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the -- cgit v1.1 From 421f909c803d1c397f6c66b75653f238696c39ee Mon Sep 17 00:00:00 2001 From: Neil Turton Date: Thu, 14 May 2009 14:00:35 +0100 Subject: amd-iommu: fix an off-by-one error in the AMD IOMMU driver. The variable amd_iommu_last_bdf holds the maximum bdf of any device controlled by an IOMMU, so the number of device entries needed is amd_iommu_last_bdf+1. The function tbl_size used amd_iommu_last_bdf instead. This would be a problem if the last device were a large enough power of 2. [ Impact: fix amd_iommu_last_bdf off-by-one error ] Signed-off-by: Neil Turton Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu_init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 8c0be09..35fc965 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -175,7 +175,7 @@ static inline void update_last_devid(u16 devid) static inline unsigned long tbl_size(int entry_size) { unsigned shift = PAGE_SHIFT + - get_order(amd_iommu_last_bdf * entry_size); + get_order(((int)amd_iommu_last_bdf + 1) * entry_size); return 1UL << shift; } -- cgit v1.1 From 7455aab1f95f6464c5af3fbdee28744e73f38564 Mon Sep 17 00:00:00 2001 From: Neil Turton Date: Thu, 14 May 2009 14:08:11 +0100 Subject: amd-iommu: fix the handling of device aliases in the AMD IOMMU driver. The devid parameter to set_dev_entry_from_acpi is the requester ID rather than the device ID since it is used to index the IOMMU device table. The handling of IVHD_DEV_ALIAS used to pass the device ID. This patch fixes it to pass the requester ID. [ Impact: fix setting the wrong req-id in acpi-table parsing ] Signed-off-by: Neil Turton Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu_init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 35fc965..53f93db 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -618,7 +618,7 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu, case IVHD_DEV_ALIAS: devid = e->devid; devid_to = e->ext >> 8; - set_dev_entry_from_acpi(iommu, devid, e->flags, 0); + set_dev_entry_from_acpi(iommu, devid_to, e->flags, 0); amd_iommu_alias_table[devid] = devid_to; break; case IVHD_DEV_ALIAS_RANGE: -- cgit v1.1 From 0bc252f430d6a3ac7836d40f00d0ae020593b11b Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 22 May 2009 12:48:05 +0200 Subject: amd-iommu: make sure only ivmd entries are parsed The bug never triggered. But it should be fixed to protect against broken ACPI tables in the future. [ Impact: protect against broken ivrs acpi table ] Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu_init.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 53f93db..a3a2b98 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -906,6 +906,8 @@ static int __init init_unity_map_range(struct ivmd_header *m) switch (m->type) { default: + kfree(e); + return 0; case ACPI_IVMD_TYPE: e->devid_start = e->devid_end = m->devid; break; -- cgit v1.1 From c1eee67b2d8464781f5868a34168df61e40e85a6 Mon Sep 17 00:00:00 2001 From: Chris Wright Date: Thu, 21 May 2009 00:56:58 -0700 Subject: amd iommu: properly detach from protection domain on ->remove Some drivers may use the dma api during ->remove which will cause a protection domain to get reattached to a device. Delay the detach until after the driver is completely unbound. [ joro: added a little merge helper ] [ Impact: fix too early device<->domain removal ] Signed-off-by: Chris Wright Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index a97db99..d689883 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -57,6 +57,10 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, static struct dma_ops_domain *find_protection_domain(u16 devid); +#ifndef BUS_NOTIFY_UNBOUND_DRIVER +#define BUS_NOTIFY_UNBOUND_DRIVER 0x0005 +#endif + #ifdef CONFIG_AMD_IOMMU_STATS /* @@ -1012,7 +1016,7 @@ static int device_change_notifier(struct notifier_block *nb, printk(KERN_INFO "AMD IOMMU: Using protection domain %d for " "device %s\n", dma_domain->domain.id, dev_name(dev)); break; - case BUS_NOTIFY_UNBIND_DRIVER: + case BUS_NOTIFY_UNBOUND_DRIVER: if (!domain) goto out; detach_device(domain, devid); -- cgit v1.1 From 3bd221724adb9d642270df0e78b0105fb61e4a1c Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 4 May 2009 15:06:20 +0200 Subject: amd-iommu: introduce for_each_iommu* macros This patch introduces the for_each_iommu and for_each_iommu_safe macros to simplify the developers life when having to iterate over all AMD IOMMUs in the system. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 8 ++++---- arch/x86/kernel/amd_iommu_init.c | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index a97db99..d9e9dc1 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -213,7 +213,7 @@ irqreturn_t amd_iommu_int_handler(int irq, void *data) { struct amd_iommu *iommu; - list_for_each_entry(iommu, &amd_iommu_list, list) + for_each_iommu(iommu) iommu_poll_events(iommu); return IRQ_HANDLED; @@ -440,7 +440,7 @@ static void iommu_flush_domain(u16 domid) __iommu_build_inv_iommu_pages(&cmd, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, domid, 1, 1); - list_for_each_entry(iommu, &amd_iommu_list, list) { + for_each_iommu(iommu) { spin_lock_irqsave(&iommu->lock, flags); __iommu_queue_command(iommu, &cmd); __iommu_completion_wait(iommu); @@ -1672,7 +1672,7 @@ int __init amd_iommu_init_dma_ops(void) * found in the system. Devices not assigned to any other * protection domain will be assigned to the default one. */ - list_for_each_entry(iommu, &amd_iommu_list, list) { + for_each_iommu(iommu) { iommu->default_dom = dma_ops_domain_alloc(iommu, order); if (iommu->default_dom == NULL) return -ENOMEM; @@ -1710,7 +1710,7 @@ int __init amd_iommu_init_dma_ops(void) free_domains: - list_for_each_entry(iommu, &amd_iommu_list, list) { + for_each_iommu(iommu) { if (iommu->default_dom) dma_ops_domain_free(iommu->default_dom); } diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 8c0be09..675a4b6 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -679,7 +679,7 @@ static void __init free_iommu_all(void) { struct amd_iommu *iommu, *next; - list_for_each_entry_safe(iommu, next, &amd_iommu_list, list) { + for_each_iommu_safe(iommu, next) { list_del(&iommu->list); free_iommu_one(iommu); kfree(iommu); @@ -779,7 +779,7 @@ static int __init iommu_setup_msix(struct amd_iommu *iommu) struct msix_entry entries[32]; /* only 32 supported by AMD IOMMU */ int nvec = 0, i; - list_for_each_entry(curr, &amd_iommu_list, list) { + for_each_iommu(curr) { if (curr->dev == iommu->dev) { entries[nvec].entry = curr->evt_msi_num; entries[nvec].vector = 0; @@ -818,7 +818,7 @@ static int __init iommu_setup_msi(struct amd_iommu *iommu) int r; struct amd_iommu *curr; - list_for_each_entry(curr, &amd_iommu_list, list) { + for_each_iommu(curr) { if (curr->dev == iommu->dev) curr->int_enabled = true; } @@ -971,7 +971,7 @@ static void __init enable_iommus(void) { struct amd_iommu *iommu; - list_for_each_entry(iommu, &amd_iommu_list, list) { + for_each_iommu(iommu) { iommu_set_exclusion_range(iommu); iommu_init_msi(iommu); iommu_enable_event_logging(iommu); -- cgit v1.1 From 58492e128892e3b55f1a6ef0cf3c3ab4ce7cc214 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 4 May 2009 18:41:16 +0200 Subject: amd-iommu: consolidate hardware initialization to one function This patch restructures the AMD IOMMU initialization code to initialize all hardware registers with one single function call. This is helpful for suspend/resume support. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu_init.c | 50 +++++++++++++++++++++++++--------------- 1 file changed, 32 insertions(+), 18 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 675a4b6..74f4f1f 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -252,13 +252,6 @@ static void __init iommu_enable(struct amd_iommu *iommu) iommu_feature_enable(iommu, CONTROL_IOMMU_EN); } -/* Function to enable IOMMU event logging and event interrupts */ -static void __init iommu_enable_event_logging(struct amd_iommu *iommu) -{ - iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN); - iommu_feature_enable(iommu, CONTROL_EVT_INT_EN); -} - /* * mapping and unmapping functions for the IOMMU MMIO space. Each AMD IOMMU in * the system has one. @@ -413,25 +406,36 @@ static u8 * __init alloc_command_buffer(struct amd_iommu *iommu) { u8 *cmd_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(CMD_BUFFER_SIZE)); - u64 entry; if (cmd_buf == NULL) return NULL; iommu->cmd_buf_size = CMD_BUFFER_SIZE; - entry = (u64)virt_to_phys(cmd_buf); + return cmd_buf; +} + +/* + * This function writes the command buffer address to the hardware and + * enables it. + */ +static void iommu_enable_command_buffer(struct amd_iommu *iommu) +{ + u64 entry; + + BUG_ON(iommu->cmd_buf == NULL); + + entry = (u64)virt_to_phys(iommu->cmd_buf); entry |= MMIO_CMD_SIZE_512; + memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET, - &entry, sizeof(entry)); + &entry, sizeof(entry)); /* set head and tail to zero manually */ writel(0x00, iommu->mmio_base + MMIO_CMD_HEAD_OFFSET); writel(0x00, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); iommu_feature_enable(iommu, CONTROL_CMDBUF_EN); - - return cmd_buf; } static void __init free_command_buffer(struct amd_iommu *iommu) @@ -443,20 +447,27 @@ static void __init free_command_buffer(struct amd_iommu *iommu) /* allocates the memory where the IOMMU will log its events to */ static u8 * __init alloc_event_buffer(struct amd_iommu *iommu) { - u64 entry; iommu->evt_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(EVT_BUFFER_SIZE)); if (iommu->evt_buf == NULL) return NULL; + return iommu->evt_buf; +} + +static void iommu_enable_event_buffer(struct amd_iommu *iommu) +{ + u64 entry; + + BUG_ON(iommu->evt_buf == NULL); + entry = (u64)virt_to_phys(iommu->evt_buf) | EVT_LEN_MASK; + memcpy_toio(iommu->mmio_base + MMIO_EVT_BUF_OFFSET, &entry, sizeof(entry)); - iommu->evt_buf_size = EVT_BUFFER_SIZE; - - return iommu->evt_buf; + iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN); } static void __init free_event_buffer(struct amd_iommu *iommu) @@ -710,7 +721,6 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h) if (!iommu->mmio_base) return -ENOMEM; - iommu_set_device_table(iommu); iommu->cmd_buf = alloc_command_buffer(iommu); if (!iommu->cmd_buf) return -ENOMEM; @@ -837,6 +847,8 @@ static int __init iommu_setup_msi(struct amd_iommu *iommu) return 1; } + iommu_feature_enable(iommu, CONTROL_EVT_INT_EN); + return 0; } @@ -972,9 +984,11 @@ static void __init enable_iommus(void) struct amd_iommu *iommu; for_each_iommu(iommu) { + iommu_set_device_table(iommu); + iommu_enable_command_buffer(iommu); + iommu_enable_event_buffer(iommu); iommu_set_exclusion_range(iommu); iommu_init_msi(iommu); - iommu_enable_event_logging(iommu); iommu_enable(iommu); } } -- cgit v1.1 From fab6afa30954a0684ef8ac1d9a606e74a6215ab6 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 4 May 2009 18:46:34 +0200 Subject: amd-iommu: drop pointless iommu-loop in msi setup code It is not necessary to loop again over all IOMMUs in this code. So drop the loop. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu_init.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 74f4f1f..cc99f60 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -826,13 +826,6 @@ out_free: static int __init iommu_setup_msi(struct amd_iommu *iommu) { int r; - struct amd_iommu *curr; - - for_each_iommu(curr) { - if (curr->dev == iommu->dev) - curr->int_enabled = true; - } - if (pci_enable_msi(iommu->dev)) return 1; @@ -847,6 +840,7 @@ static int __init iommu_setup_msi(struct amd_iommu *iommu) return 1; } + iommu->int_enabled = true; iommu_feature_enable(iommu, CONTROL_EVT_INT_EN); return 0; -- cgit v1.1 From d91cecdd796c27df46339e80ed436a980c56fcad Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 4 May 2009 18:51:00 +0200 Subject: amd-iommu: remove support for msi-x Current hardware uses msi instead of msi-x so this code it not necessary and can not be tested. The best thing is to drop this code. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu_init.c | 44 +--------------------------------------- 1 file changed, 1 insertion(+), 43 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index cc99f60..feee475 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -783,46 +783,6 @@ static int __init init_iommu_all(struct acpi_table_header *table) * ****************************************************************************/ -static int __init iommu_setup_msix(struct amd_iommu *iommu) -{ - struct amd_iommu *curr; - struct msix_entry entries[32]; /* only 32 supported by AMD IOMMU */ - int nvec = 0, i; - - for_each_iommu(curr) { - if (curr->dev == iommu->dev) { - entries[nvec].entry = curr->evt_msi_num; - entries[nvec].vector = 0; - curr->int_enabled = true; - nvec++; - } - } - - if (pci_enable_msix(iommu->dev, entries, nvec)) { - pci_disable_msix(iommu->dev); - return 1; - } - - for (i = 0; i < nvec; ++i) { - int r = request_irq(entries->vector, amd_iommu_int_handler, - IRQF_SAMPLE_RANDOM, - "AMD IOMMU", - NULL); - if (r) - goto out_free; - } - - return 0; - -out_free: - for (i -= 1; i >= 0; --i) - free_irq(entries->vector, NULL); - - pci_disable_msix(iommu->dev); - - return 1; -} - static int __init iommu_setup_msi(struct amd_iommu *iommu) { int r; @@ -851,9 +811,7 @@ static int __init iommu_init_msi(struct amd_iommu *iommu) if (iommu->int_enabled) return 0; - if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSIX)) - return iommu_setup_msix(iommu); - else if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSI)) + if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSI)) return iommu_setup_msi(iommu); return 1; -- cgit v1.1 From 92ac4320af6ed4294c2c221dd4ccbfd9026a3aa7 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 19 May 2009 19:06:27 +0200 Subject: amd-iommu: add function to disable all iommus This function is required for suspend/resume support with AMD IOMMU enabled. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu_init.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index feee475..ed10c0f 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -252,6 +252,11 @@ static void __init iommu_enable(struct amd_iommu *iommu) iommu_feature_enable(iommu, CONTROL_IOMMU_EN); } +static void iommu_disable(struct amd_iommu *iommu) +{ + iommu_feature_disable(iommu, CONTROL_IOMMU_EN); +} + /* * mapping and unmapping functions for the IOMMU MMIO space. Each AMD IOMMU in * the system has one. @@ -945,6 +950,14 @@ static void __init enable_iommus(void) } } +static void disable_iommus(void) +{ + struct amd_iommu *iommu; + + for_each_iommu(iommu) + iommu_disable(iommu); +} + /* * Suspend/Resume support * disable suspend until real resume implemented -- cgit v1.1 From bfd1be1857e5a3385bf146e02e6dc3dd4241bec1 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 5 May 2009 15:33:57 +0200 Subject: amd-iommu: add function to flush tlb for all domains This function is required for suspend/resume support with AMD IOMMU enabled. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index d9e9dc1..826ad079 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -449,6 +449,17 @@ static void iommu_flush_domain(u16 domid) } } +void amd_iommu_flush_all_domains(void) +{ + int i; + + for (i = 1; i < MAX_DOMAIN_ID; ++i) { + if (!test_bit(i, amd_iommu_pd_alloc_bitmap)) + continue; + iommu_flush_domain(i); + } +} + /**************************************************************************** * * The functions below are used the create the page table mappings for -- cgit v1.1 From 7d7a110c6127b7fc683dc6d764555f2dbd22b054 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 5 May 2009 15:48:10 +0200 Subject: amd-iommu: add function to flush tlb for all devices This function is required for suspend/resume support with AMD IOMMU enabled. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 826ad079..92b0e18 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -460,6 +460,24 @@ void amd_iommu_flush_all_domains(void) } } +void amd_iommu_flush_all_devices(void) +{ + struct amd_iommu *iommu; + int i; + + for (i = 0; i <= amd_iommu_last_bdf; ++i) { + if (amd_iommu_pd_table[i] == NULL) + continue; + + iommu = amd_iommu_rlookup_table[i]; + if (!iommu) + continue; + + iommu_queue_inv_dev_entry(iommu, i); + iommu_completion_wait(iommu); + } +} + /**************************************************************************** * * The functions below are used the create the page table mappings for -- cgit v1.1 From 05f92db9f47f852ff48bbed1b063b8ab8ad00285 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 12 May 2009 09:52:46 +0200 Subject: amd_iommu: un __init functions required for suspend/resume This patch makes sure that no function required for suspend/resume of AMD IOMMU driver is thrown away after boot. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu_init.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index ed10c0f..330896b 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -193,7 +193,7 @@ static inline unsigned long tbl_size(int entry_size) * This function set the exclusion range in the IOMMU. DMA accesses to the * exclusion range are passed through untranslated */ -static void __init iommu_set_exclusion_range(struct amd_iommu *iommu) +static void iommu_set_exclusion_range(struct amd_iommu *iommu) { u64 start = iommu->exclusion_start & PAGE_MASK; u64 limit = (start + iommu->exclusion_length) & PAGE_MASK; @@ -225,7 +225,7 @@ static void __init iommu_set_device_table(struct amd_iommu *iommu) } /* Generic functions to enable/disable certain features of the IOMMU. */ -static void __init iommu_feature_enable(struct amd_iommu *iommu, u8 bit) +static void iommu_feature_enable(struct amd_iommu *iommu, u8 bit) { u32 ctrl; @@ -244,7 +244,7 @@ static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit) } /* Function to enable the hardware */ -static void __init iommu_enable(struct amd_iommu *iommu) +static void iommu_enable(struct amd_iommu *iommu) { printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at %s cap 0x%hx\n", dev_name(&iommu->dev->dev), iommu->cap_ptr); @@ -811,7 +811,7 @@ static int __init iommu_setup_msi(struct amd_iommu *iommu) return 0; } -static int __init iommu_init_msi(struct amd_iommu *iommu) +static int iommu_init_msi(struct amd_iommu *iommu) { if (iommu->int_enabled) return 0; @@ -936,7 +936,7 @@ static void init_device_table(void) * This function finally enables all IOMMUs found in the system after * they have been initialized */ -static void __init enable_iommus(void) +static void enable_iommus(void) { struct amd_iommu *iommu; -- cgit v1.1 From 736501ee000757082a4f0832826ae1eda7ea106e Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 12 May 2009 09:56:12 +0200 Subject: amd-iommu: implement suspend/resume This patch puts everything together and enables suspend/resume support in the AMD IOMMU driver. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu_init.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 330896b..4ca8fbf 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -965,12 +965,31 @@ static void disable_iommus(void) static int amd_iommu_resume(struct sys_device *dev) { + /* + * Disable IOMMUs before reprogramming the hardware registers. + * IOMMU is still enabled from the resume kernel. + */ + disable_iommus(); + + /* re-load the hardware */ + enable_iommus(); + + /* + * we have to flush after the IOMMUs are enabled because a + * disabled IOMMU will never execute the commands we send + */ + amd_iommu_flush_all_domains(); + amd_iommu_flush_all_devices(); + return 0; } static int amd_iommu_suspend(struct sys_device *dev, pm_message_t state) { - return -EINVAL; + /* disable IOMMUs to go out of the way for BIOS */ + disable_iommus(); + + return 0; } static struct sysdev_class amd_iommu_sysdev_class = { -- cgit v1.1 From c3239567a20e90e3026ac5453d5267506ef7b030 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 12 May 2009 10:56:44 +0200 Subject: amd-iommu: introduce aperture_range structure This is a preperation for extended address allocator. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 46 +++++++++++++++++++++------------------------ 1 file changed, 21 insertions(+), 25 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index a97db99..62acd09 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -595,7 +595,8 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, * as allocated in the aperture */ if (addr < dma_dom->aperture_size) - __set_bit(addr >> PAGE_SHIFT, dma_dom->bitmap); + __set_bit(addr >> PAGE_SHIFT, + dma_dom->aperture.bitmap); } return 0; @@ -656,11 +657,12 @@ static unsigned long dma_ops_alloc_addresses(struct device *dev, dom->need_flush = true; } - address = iommu_area_alloc(dom->bitmap, limit, dom->next_bit, pages, - 0 , boundary_size, align_mask); + address = iommu_area_alloc(dom->aperture.bitmap, limit, dom->next_bit, + pages, 0 , boundary_size, align_mask); if (address == -1) { - address = iommu_area_alloc(dom->bitmap, limit, 0, pages, - 0, boundary_size, align_mask); + address = iommu_area_alloc(dom->aperture.bitmap, limit, 0, + pages, 0, boundary_size, + align_mask); dom->need_flush = true; } @@ -685,7 +687,7 @@ static void dma_ops_free_addresses(struct dma_ops_domain *dom, unsigned int pages) { address >>= PAGE_SHIFT; - iommu_area_free(dom->bitmap, address, pages); + iommu_area_free(dom->aperture.bitmap, address, pages); if (address >= dom->next_bit) dom->need_flush = true; @@ -741,7 +743,7 @@ static void dma_ops_reserve_addresses(struct dma_ops_domain *dom, if (start_page + pages > last_page) pages = last_page - start_page; - iommu_area_reserve(dom->bitmap, start_page, pages); + iommu_area_reserve(dom->aperture.bitmap, start_page, pages); } static void free_pagetable(struct protection_domain *domain) @@ -785,9 +787,7 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom) free_pagetable(&dom->domain); - kfree(dom->pte_pages); - - kfree(dom->bitmap); + free_page((unsigned long)dom->aperture.bitmap); kfree(dom); } @@ -826,16 +826,15 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, dma_dom->domain.priv = dma_dom; if (!dma_dom->domain.pt_root) goto free_dma_dom; - dma_dom->aperture_size = (1ULL << order); - dma_dom->bitmap = kzalloc(dma_dom->aperture_size / (PAGE_SIZE * 8), - GFP_KERNEL); - if (!dma_dom->bitmap) + dma_dom->aperture_size = APERTURE_RANGE_SIZE; + dma_dom->aperture.bitmap = (void *)get_zeroed_page(GFP_KERNEL); + if (!dma_dom->aperture.bitmap) goto free_dma_dom; /* * mark the first page as allocated so we never return 0 as * a valid dma-address. So we can use 0 as error value */ - dma_dom->bitmap[0] = 1; + dma_dom->aperture.bitmap[0] = 1; dma_dom->next_bit = 0; dma_dom->need_flush = false; @@ -854,13 +853,9 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, /* * At the last step, build the page tables so we don't need to * allocate page table pages in the dma_ops mapping/unmapping - * path. + * path for the first 128MB of dma address space. */ num_pte_pages = dma_dom->aperture_size / (PAGE_SIZE * 512); - dma_dom->pte_pages = kzalloc(num_pte_pages * sizeof(void *), - GFP_KERNEL); - if (!dma_dom->pte_pages) - goto free_dma_dom; l2_pde = (u64 *)get_zeroed_page(GFP_KERNEL); if (l2_pde == NULL) @@ -869,10 +864,11 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, dma_dom->domain.pt_root[0] = IOMMU_L2_PDE(virt_to_phys(l2_pde)); for (i = 0; i < num_pte_pages; ++i) { - dma_dom->pte_pages[i] = (u64 *)get_zeroed_page(GFP_KERNEL); - if (!dma_dom->pte_pages[i]) + u64 **pte_page = &dma_dom->aperture.pte_pages[i]; + *pte_page = (u64 *)get_zeroed_page(GFP_KERNEL); + if (!*pte_page) goto free_dma_dom; - address = virt_to_phys(dma_dom->pte_pages[i]); + address = virt_to_phys(*pte_page); l2_pde[i] = IOMMU_L1_PDE(address); } @@ -1159,7 +1155,7 @@ static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu, paddr &= PAGE_MASK; - pte = dom->pte_pages[IOMMU_PTE_L1_INDEX(address)]; + pte = dom->aperture.pte_pages[IOMMU_PTE_L1_INDEX(address)]; pte += IOMMU_PTE_L0_INDEX(address); __pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC; @@ -1192,7 +1188,7 @@ static void dma_ops_domain_unmap(struct amd_iommu *iommu, WARN_ON(address & ~PAGE_MASK || address >= dom->aperture_size); - pte = dom->pte_pages[IOMMU_PTE_L1_INDEX(address)]; + pte = dom->aperture.pte_pages[IOMMU_PTE_L1_INDEX(address)]; pte += IOMMU_PTE_L0_INDEX(address); WARN_ON(!*pte); -- cgit v1.1 From 8bda3092bcfa68f786d94549ae026e8db1eff041 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 12 May 2009 12:02:46 +0200 Subject: amd-iommu: move page table allocation code to seperate function This patch makes page table allocation usable for dma_ops code. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 86 ++++++++++++++++++++++++++++++++------------- 1 file changed, 61 insertions(+), 25 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 62acd09..ded79f7 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -55,7 +55,9 @@ struct iommu_cmd { static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, struct unity_map_entry *e); static struct dma_ops_domain *find_protection_domain(u16 devid); - +static u64* alloc_pte(struct protection_domain *dom, + unsigned long address, u64 + **pte_page, gfp_t gfp); #ifdef CONFIG_AMD_IOMMU_STATS @@ -468,7 +470,7 @@ static int iommu_map_page(struct protection_domain *dom, unsigned long phys_addr, int prot) { - u64 __pte, *pte, *page; + u64 __pte, *pte; bus_addr = PAGE_ALIGN(bus_addr); phys_addr = PAGE_ALIGN(phys_addr); @@ -477,27 +479,7 @@ static int iommu_map_page(struct protection_domain *dom, if (bus_addr > IOMMU_MAP_SIZE_L3 || !(prot & IOMMU_PROT_MASK)) return -EINVAL; - pte = &dom->pt_root[IOMMU_PTE_L2_INDEX(bus_addr)]; - - if (!IOMMU_PTE_PRESENT(*pte)) { - page = (u64 *)get_zeroed_page(GFP_KERNEL); - if (!page) - return -ENOMEM; - *pte = IOMMU_L2_PDE(virt_to_phys(page)); - } - - pte = IOMMU_PTE_PAGE(*pte); - pte = &pte[IOMMU_PTE_L1_INDEX(bus_addr)]; - - if (!IOMMU_PTE_PRESENT(*pte)) { - page = (u64 *)get_zeroed_page(GFP_KERNEL); - if (!page) - return -ENOMEM; - *pte = IOMMU_L1_PDE(virt_to_phys(page)); - } - - pte = IOMMU_PTE_PAGE(*pte); - pte = &pte[IOMMU_PTE_L0_INDEX(bus_addr)]; + pte = alloc_pte(dom, bus_addr, NULL, GFP_KERNEL); if (IOMMU_PTE_PRESENT(*pte)) return -EBUSY; @@ -1140,6 +1122,61 @@ static int get_device_resources(struct device *dev, } /* + * If the pte_page is not yet allocated this function is called + */ +static u64* alloc_pte(struct protection_domain *dom, + unsigned long address, u64 **pte_page, gfp_t gfp) +{ + u64 *pte, *page; + + pte = &dom->pt_root[IOMMU_PTE_L2_INDEX(address)]; + + if (!IOMMU_PTE_PRESENT(*pte)) { + page = (u64 *)get_zeroed_page(gfp); + if (!page) + return NULL; + *pte = IOMMU_L2_PDE(virt_to_phys(page)); + } + + pte = IOMMU_PTE_PAGE(*pte); + pte = &pte[IOMMU_PTE_L1_INDEX(address)]; + + if (!IOMMU_PTE_PRESENT(*pte)) { + page = (u64 *)get_zeroed_page(gfp); + if (!page) + return NULL; + *pte = IOMMU_L1_PDE(virt_to_phys(page)); + } + + pte = IOMMU_PTE_PAGE(*pte); + + if (pte_page) + *pte_page = pte; + + pte = &pte[IOMMU_PTE_L0_INDEX(address)]; + + return pte; +} + +/* + * This function fetches the PTE for a given address in the aperture + */ +static u64* dma_ops_get_pte(struct dma_ops_domain *dom, + unsigned long address) +{ + struct aperture_range *aperture = &dom->aperture; + u64 *pte, *pte_page; + + pte = aperture->pte_pages[IOMMU_PTE_L1_INDEX(address)]; + if (!pte) { + pte = alloc_pte(&dom->domain, address, &pte_page, GFP_ATOMIC); + aperture->pte_pages[IOMMU_PTE_L1_INDEX(address)] = pte_page; + } + + return pte; +} + +/* * This is the generic map function. It maps one 4kb page at paddr to * the given address in the DMA address space for the domain. */ @@ -1155,8 +1192,7 @@ static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu, paddr &= PAGE_MASK; - pte = dom->aperture.pte_pages[IOMMU_PTE_L1_INDEX(address)]; - pte += IOMMU_PTE_L0_INDEX(address); + pte = dma_ops_get_pte(dom, address); __pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC; -- cgit v1.1 From 53812c115cda1f660b286c939669154a56976f6b Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 12 May 2009 12:17:38 +0200 Subject: amd-iommu: handle page table allocation failures in dma_ops code The code will be required when the aperture size increases dynamically in the extended address allocator. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index ded79f7..a467add 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -1193,6 +1193,8 @@ static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu, paddr &= PAGE_MASK; pte = dma_ops_get_pte(dom, address); + if (!pte) + return bad_dma_address; __pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC; @@ -1248,7 +1250,7 @@ static dma_addr_t __map_single(struct device *dev, u64 dma_mask) { dma_addr_t offset = paddr & ~PAGE_MASK; - dma_addr_t address, start; + dma_addr_t address, start, ret; unsigned int pages; unsigned long align_mask = 0; int i; @@ -1271,7 +1273,10 @@ static dma_addr_t __map_single(struct device *dev, start = address; for (i = 0; i < pages; ++i) { - dma_ops_domain_map(iommu, dma_dom, start, paddr, dir); + ret = dma_ops_domain_map(iommu, dma_dom, start, paddr, dir); + if (ret == bad_dma_address) + goto out_unmap; + paddr += PAGE_SIZE; start += PAGE_SIZE; } @@ -1287,6 +1292,17 @@ static dma_addr_t __map_single(struct device *dev, out: return address; + +out_unmap: + + for (--i; i >= 0; --i) { + start -= PAGE_SIZE; + dma_ops_domain_unmap(iommu, dma_dom, start); + } + + dma_ops_free_addresses(dma_dom, address, pages); + + return bad_dma_address; } /* -- cgit v1.1 From 384de72910a7bf96a02a6d8023fe9e16d872beb2 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 15 May 2009 12:30:05 +0200 Subject: amd-iommu: make address allocator aware of multiple aperture ranges This patch changes the AMD IOMMU address allocator to allow up to 32 aperture ranges per dma_ops domain. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 138 ++++++++++++++++++++++++++++++++------------ 1 file changed, 101 insertions(+), 37 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index a467add..794163a 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -578,7 +578,7 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, */ if (addr < dma_dom->aperture_size) __set_bit(addr >> PAGE_SHIFT, - dma_dom->aperture.bitmap); + dma_dom->aperture[0]->bitmap); } return 0; @@ -615,43 +615,74 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom, ****************************************************************************/ /* - * The address allocator core function. + * The address allocator core functions. * * called with domain->lock held */ + +static unsigned long dma_ops_area_alloc(struct device *dev, + struct dma_ops_domain *dom, + unsigned int pages, + unsigned long align_mask, + u64 dma_mask, + unsigned long start) +{ + unsigned long next_bit = dom->next_bit % APERTURE_RANGE_PAGES; + int max_index = dom->aperture_size >> APERTURE_RANGE_SHIFT; + int i = start >> APERTURE_RANGE_SHIFT; + unsigned long boundary_size; + unsigned long address = -1; + unsigned long limit; + + boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1, + PAGE_SIZE) >> PAGE_SHIFT; + + for (;i < max_index; ++i) { + unsigned long offset = dom->aperture[i]->offset >> PAGE_SHIFT; + + if (dom->aperture[i]->offset >= dma_mask) + break; + + limit = iommu_device_max_index(APERTURE_RANGE_PAGES, offset, + dma_mask >> PAGE_SHIFT); + + address = iommu_area_alloc(dom->aperture[i]->bitmap, + limit, next_bit, pages, 0, + boundary_size, align_mask); + if (address != -1) { + address = dom->aperture[i]->offset + + (address << PAGE_SHIFT); + dom->next_bit = (address >> PAGE_SHIFT) + pages; + break; + } + + next_bit = 0; + } + + return address; +} + static unsigned long dma_ops_alloc_addresses(struct device *dev, struct dma_ops_domain *dom, unsigned int pages, unsigned long align_mask, u64 dma_mask) { - unsigned long limit; unsigned long address; - unsigned long boundary_size; + unsigned long start = dom->next_bit << PAGE_SHIFT; - boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1, - PAGE_SIZE) >> PAGE_SHIFT; - limit = iommu_device_max_index(dom->aperture_size >> PAGE_SHIFT, 0, - dma_mask >> PAGE_SHIFT); - if (dom->next_bit >= limit) { - dom->next_bit = 0; - dom->need_flush = true; - } + address = dma_ops_area_alloc(dev, dom, pages, align_mask, + dma_mask, start); - address = iommu_area_alloc(dom->aperture.bitmap, limit, dom->next_bit, - pages, 0 , boundary_size, align_mask); if (address == -1) { - address = iommu_area_alloc(dom->aperture.bitmap, limit, 0, - pages, 0, boundary_size, - align_mask); + dom->next_bit = 0; + address = dma_ops_area_alloc(dev, dom, pages, align_mask, + dma_mask, 0); dom->need_flush = true; } - if (likely(address != -1)) { - dom->next_bit = address + pages; - address <<= PAGE_SHIFT; - } else + if (unlikely(address == -1)) address = bad_dma_address; WARN_ON((address + (PAGE_SIZE*pages)) > dom->aperture_size); @@ -668,11 +699,17 @@ static void dma_ops_free_addresses(struct dma_ops_domain *dom, unsigned long address, unsigned int pages) { - address >>= PAGE_SHIFT; - iommu_area_free(dom->aperture.bitmap, address, pages); + unsigned i = address >> APERTURE_RANGE_SHIFT; + struct aperture_range *range = dom->aperture[i]; + + BUG_ON(i >= APERTURE_MAX_RANGES || range == NULL); - if (address >= dom->next_bit) + if ((address >> PAGE_SHIFT) >= dom->next_bit) dom->need_flush = true; + + address = (address % APERTURE_RANGE_SIZE) >> PAGE_SHIFT; + iommu_area_free(range->bitmap, address, pages); + } /**************************************************************************** @@ -720,12 +757,16 @@ static void dma_ops_reserve_addresses(struct dma_ops_domain *dom, unsigned long start_page, unsigned int pages) { - unsigned int last_page = dom->aperture_size >> PAGE_SHIFT; + unsigned int i, last_page = dom->aperture_size >> PAGE_SHIFT; if (start_page + pages > last_page) pages = last_page - start_page; - iommu_area_reserve(dom->aperture.bitmap, start_page, pages); + for (i = start_page; i < start_page + pages; ++i) { + int index = i / APERTURE_RANGE_PAGES; + int page = i % APERTURE_RANGE_PAGES; + __set_bit(page, dom->aperture[index]->bitmap); + } } static void free_pagetable(struct protection_domain *domain) @@ -764,12 +805,19 @@ static void free_pagetable(struct protection_domain *domain) */ static void dma_ops_domain_free(struct dma_ops_domain *dom) { + int i; + if (!dom) return; free_pagetable(&dom->domain); - free_page((unsigned long)dom->aperture.bitmap); + for (i = 0; i < APERTURE_MAX_RANGES; ++i) { + if (!dom->aperture[i]) + continue; + free_page((unsigned long)dom->aperture[i]->bitmap); + kfree(dom->aperture[i]); + } kfree(dom); } @@ -797,6 +845,11 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, if (!dma_dom) return NULL; + dma_dom->aperture[0] = kzalloc(sizeof(struct aperture_range), + GFP_KERNEL); + if (!dma_dom->aperture[0]) + goto free_dma_dom; + spin_lock_init(&dma_dom->domain.lock); dma_dom->domain.id = domain_id_alloc(); @@ -809,14 +862,14 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, if (!dma_dom->domain.pt_root) goto free_dma_dom; dma_dom->aperture_size = APERTURE_RANGE_SIZE; - dma_dom->aperture.bitmap = (void *)get_zeroed_page(GFP_KERNEL); - if (!dma_dom->aperture.bitmap) + dma_dom->aperture[0]->bitmap = (void *)get_zeroed_page(GFP_KERNEL); + if (!dma_dom->aperture[0]->bitmap) goto free_dma_dom; /* * mark the first page as allocated so we never return 0 as * a valid dma-address. So we can use 0 as error value */ - dma_dom->aperture.bitmap[0] = 1; + dma_dom->aperture[0]->bitmap[0] = 1; dma_dom->next_bit = 0; dma_dom->need_flush = false; @@ -846,7 +899,7 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, dma_dom->domain.pt_root[0] = IOMMU_L2_PDE(virt_to_phys(l2_pde)); for (i = 0; i < num_pte_pages; ++i) { - u64 **pte_page = &dma_dom->aperture.pte_pages[i]; + u64 **pte_page = &dma_dom->aperture[0]->pte_pages[i]; *pte_page = (u64 *)get_zeroed_page(GFP_KERNEL); if (!*pte_page) goto free_dma_dom; @@ -1164,14 +1217,19 @@ static u64* alloc_pte(struct protection_domain *dom, static u64* dma_ops_get_pte(struct dma_ops_domain *dom, unsigned long address) { - struct aperture_range *aperture = &dom->aperture; + struct aperture_range *aperture; u64 *pte, *pte_page; - pte = aperture->pte_pages[IOMMU_PTE_L1_INDEX(address)]; + aperture = dom->aperture[APERTURE_RANGE_INDEX(address)]; + if (!aperture) + return NULL; + + pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)]; if (!pte) { pte = alloc_pte(&dom->domain, address, &pte_page, GFP_ATOMIC); - aperture->pte_pages[IOMMU_PTE_L1_INDEX(address)] = pte_page; - } + aperture->pte_pages[APERTURE_PAGE_INDEX(address)] = pte_page; + } else + pte += IOMMU_PTE_L0_INDEX(address); return pte; } @@ -1219,14 +1277,20 @@ static void dma_ops_domain_unmap(struct amd_iommu *iommu, struct dma_ops_domain *dom, unsigned long address) { + struct aperture_range *aperture; u64 *pte; if (address >= dom->aperture_size) return; - WARN_ON(address & ~PAGE_MASK || address >= dom->aperture_size); + aperture = dom->aperture[APERTURE_RANGE_INDEX(address)]; + if (!aperture) + return; + + pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)]; + if (!pte) + return; - pte = dom->aperture.pte_pages[IOMMU_PTE_L1_INDEX(address)]; pte += IOMMU_PTE_L0_INDEX(address); WARN_ON(!*pte); -- cgit v1.1 From 803b8cb4d9a93b90c67aba2aab7f2c54d595b5b9 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 18 May 2009 15:32:48 +0200 Subject: amd-iommu: change dma_dom->next_bit to dma_dom->next_address Simplify the code a little bit by using the same unit for all address space related state in the dma_ops domain structure. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 794163a..c1a08b9 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -627,13 +627,15 @@ static unsigned long dma_ops_area_alloc(struct device *dev, u64 dma_mask, unsigned long start) { - unsigned long next_bit = dom->next_bit % APERTURE_RANGE_PAGES; + unsigned long next_bit = dom->next_address % APERTURE_RANGE_SIZE; int max_index = dom->aperture_size >> APERTURE_RANGE_SHIFT; int i = start >> APERTURE_RANGE_SHIFT; unsigned long boundary_size; unsigned long address = -1; unsigned long limit; + next_bit >>= PAGE_SHIFT; + boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1, PAGE_SIZE) >> PAGE_SHIFT; @@ -652,7 +654,7 @@ static unsigned long dma_ops_area_alloc(struct device *dev, if (address != -1) { address = dom->aperture[i]->offset + (address << PAGE_SHIFT); - dom->next_bit = (address >> PAGE_SHIFT) + pages; + dom->next_address = address + (pages << PAGE_SHIFT); break; } @@ -669,14 +671,12 @@ static unsigned long dma_ops_alloc_addresses(struct device *dev, u64 dma_mask) { unsigned long address; - unsigned long start = dom->next_bit << PAGE_SHIFT; - address = dma_ops_area_alloc(dev, dom, pages, align_mask, - dma_mask, start); + dma_mask, dom->next_address); if (address == -1) { - dom->next_bit = 0; + dom->next_address = 0; address = dma_ops_area_alloc(dev, dom, pages, align_mask, dma_mask, 0); dom->need_flush = true; @@ -704,10 +704,11 @@ static void dma_ops_free_addresses(struct dma_ops_domain *dom, BUG_ON(i >= APERTURE_MAX_RANGES || range == NULL); - if ((address >> PAGE_SHIFT) >= dom->next_bit) + if (address >= dom->next_address) dom->need_flush = true; address = (address % APERTURE_RANGE_SIZE) >> PAGE_SHIFT; + iommu_area_free(range->bitmap, address, pages); } @@ -870,7 +871,7 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, * a valid dma-address. So we can use 0 as error value */ dma_dom->aperture[0]->bitmap[0] = 1; - dma_dom->next_bit = 0; + dma_dom->next_address = 0; dma_dom->need_flush = false; dma_dom->target_dev = 0xffff; -- cgit v1.1 From 9cabe89b99773e682538a8809abc7d4000c77083 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 18 May 2009 16:38:55 +0200 Subject: amd-iommu: move aperture_range allocation code to seperate function This patch prepares the dynamic increasement of dma_ops domain apertures. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 95 ++++++++++++++++++++++++++++----------------- 1 file changed, 59 insertions(+), 36 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index c1a08b9..8ff02ee 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -620,6 +620,59 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom, * called with domain->lock held */ +/* + * This function is used to add a new aperture range to an existing + * aperture in case of dma_ops domain allocation or address allocation + * failure. + */ +static int alloc_new_range(struct dma_ops_domain *dma_dom, + bool populate, gfp_t gfp) +{ + int index = dma_dom->aperture_size >> APERTURE_RANGE_SHIFT; + + if (index >= APERTURE_MAX_RANGES) + return -ENOMEM; + + dma_dom->aperture[index] = kzalloc(sizeof(struct aperture_range), gfp); + if (!dma_dom->aperture[index]) + return -ENOMEM; + + dma_dom->aperture[index]->bitmap = (void *)get_zeroed_page(gfp); + if (!dma_dom->aperture[index]->bitmap) + goto out_free; + + dma_dom->aperture[index]->offset = dma_dom->aperture_size; + + if (populate) { + unsigned long address = dma_dom->aperture_size; + int i, num_ptes = APERTURE_RANGE_PAGES / 512; + u64 *pte, *pte_page; + + for (i = 0; i < num_ptes; ++i) { + pte = alloc_pte(&dma_dom->domain, address, + &pte_page, gfp); + if (!pte) + goto out_free; + + dma_dom->aperture[index]->pte_pages[i] = pte_page; + + address += APERTURE_RANGE_SIZE / 64; + } + } + + dma_dom->aperture_size += APERTURE_RANGE_SIZE; + + return 0; + +out_free: + free_page((unsigned long)dma_dom->aperture[index]->bitmap); + + kfree(dma_dom->aperture[index]); + dma_dom->aperture[index] = NULL; + + return -ENOMEM; +} + static unsigned long dma_ops_area_alloc(struct device *dev, struct dma_ops_domain *dom, unsigned int pages, @@ -832,9 +885,6 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, unsigned order) { struct dma_ops_domain *dma_dom; - unsigned i, num_pte_pages; - u64 *l2_pde; - u64 address; /* * Currently the DMA aperture must be between 32 MB and 1GB in size @@ -846,11 +896,6 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, if (!dma_dom) return NULL; - dma_dom->aperture[0] = kzalloc(sizeof(struct aperture_range), - GFP_KERNEL); - if (!dma_dom->aperture[0]) - goto free_dma_dom; - spin_lock_init(&dma_dom->domain.lock); dma_dom->domain.id = domain_id_alloc(); @@ -862,10 +907,13 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, dma_dom->domain.priv = dma_dom; if (!dma_dom->domain.pt_root) goto free_dma_dom; - dma_dom->aperture_size = APERTURE_RANGE_SIZE; - dma_dom->aperture[0]->bitmap = (void *)get_zeroed_page(GFP_KERNEL); - if (!dma_dom->aperture[0]->bitmap) + + dma_dom->need_flush = false; + dma_dom->target_dev = 0xffff; + + if (alloc_new_range(dma_dom, true, GFP_KERNEL)) goto free_dma_dom; + /* * mark the first page as allocated so we never return 0 as * a valid dma-address. So we can use 0 as error value @@ -873,9 +921,6 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, dma_dom->aperture[0]->bitmap[0] = 1; dma_dom->next_address = 0; - dma_dom->need_flush = false; - dma_dom->target_dev = 0xffff; - /* Intialize the exclusion range if necessary */ if (iommu->exclusion_start && iommu->exclusion_start < dma_dom->aperture_size) { @@ -886,28 +931,6 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, dma_ops_reserve_addresses(dma_dom, startpage, pages); } - /* - * At the last step, build the page tables so we don't need to - * allocate page table pages in the dma_ops mapping/unmapping - * path for the first 128MB of dma address space. - */ - num_pte_pages = dma_dom->aperture_size / (PAGE_SIZE * 512); - - l2_pde = (u64 *)get_zeroed_page(GFP_KERNEL); - if (l2_pde == NULL) - goto free_dma_dom; - - dma_dom->domain.pt_root[0] = IOMMU_L2_PDE(virt_to_phys(l2_pde)); - - for (i = 0; i < num_pte_pages; ++i) { - u64 **pte_page = &dma_dom->aperture[0]->pte_pages[i]; - *pte_page = (u64 *)get_zeroed_page(GFP_KERNEL); - if (!*pte_page) - goto free_dma_dom; - address = virt_to_phys(*pte_page); - l2_pde[i] = IOMMU_L1_PDE(address); - } - return dma_dom; free_dma_dom: -- cgit v1.1 From 00cd122ae5e5e7c60cce2af3c35b190d4c3f2d0d Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 19 May 2009 09:52:40 +0200 Subject: amd-iommu: handle exlusion ranges and unity mappings in alloc_new_range This patch makes sure no reserved addresses are allocated in an dma_ops domain when the aperture is increased dynamically. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 71 ++++++++++++++++++++++++++++++++++++++------- 1 file changed, 60 insertions(+), 11 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 8ff02ee..59ee1b9 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -58,6 +58,9 @@ static struct dma_ops_domain *find_protection_domain(u16 devid); static u64* alloc_pte(struct protection_domain *dom, unsigned long address, u64 **pte_page, gfp_t gfp); +static void dma_ops_reserve_addresses(struct dma_ops_domain *dom, + unsigned long start_page, + unsigned int pages); #ifdef CONFIG_AMD_IOMMU_STATS @@ -621,14 +624,42 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom, */ /* + * This function checks if there is a PTE for a given dma address. If + * there is one, it returns the pointer to it. + */ +static u64* fetch_pte(struct protection_domain *domain, + unsigned long address) +{ + u64 *pte; + + pte = &domain->pt_root[IOMMU_PTE_L2_INDEX(address)]; + + if (!IOMMU_PTE_PRESENT(*pte)) + return NULL; + + pte = IOMMU_PTE_PAGE(*pte); + pte = &pte[IOMMU_PTE_L1_INDEX(address)]; + + if (!IOMMU_PTE_PRESENT(*pte)) + return NULL; + + pte = IOMMU_PTE_PAGE(*pte); + pte = &pte[IOMMU_PTE_L0_INDEX(address)]; + + return pte; +} + +/* * This function is used to add a new aperture range to an existing * aperture in case of dma_ops domain allocation or address allocation * failure. */ -static int alloc_new_range(struct dma_ops_domain *dma_dom, +static int alloc_new_range(struct amd_iommu *iommu, + struct dma_ops_domain *dma_dom, bool populate, gfp_t gfp) { int index = dma_dom->aperture_size >> APERTURE_RANGE_SHIFT; + int i; if (index >= APERTURE_MAX_RANGES) return -ENOMEM; @@ -662,6 +693,33 @@ static int alloc_new_range(struct dma_ops_domain *dma_dom, dma_dom->aperture_size += APERTURE_RANGE_SIZE; + /* Intialize the exclusion range if necessary */ + if (iommu->exclusion_start && + iommu->exclusion_start >= dma_dom->aperture[index]->offset && + iommu->exclusion_start < dma_dom->aperture_size) { + unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT; + int pages = iommu_num_pages(iommu->exclusion_start, + iommu->exclusion_length, + PAGE_SIZE); + dma_ops_reserve_addresses(dma_dom, startpage, pages); + } + + /* + * Check for areas already mapped as present in the new aperture + * range and mark those pages as reserved in the allocator. Such + * mappings may already exist as a result of requested unity + * mappings for devices. + */ + for (i = dma_dom->aperture[index]->offset; + i < dma_dom->aperture_size; + i += PAGE_SIZE) { + u64 *pte = fetch_pte(&dma_dom->domain, i); + if (!pte || !IOMMU_PTE_PRESENT(*pte)) + continue; + + dma_ops_reserve_addresses(dma_dom, i << PAGE_SHIFT, 1); + } + return 0; out_free: @@ -911,7 +969,7 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, dma_dom->need_flush = false; dma_dom->target_dev = 0xffff; - if (alloc_new_range(dma_dom, true, GFP_KERNEL)) + if (alloc_new_range(iommu, dma_dom, true, GFP_KERNEL)) goto free_dma_dom; /* @@ -921,15 +979,6 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, dma_dom->aperture[0]->bitmap[0] = 1; dma_dom->next_address = 0; - /* Intialize the exclusion range if necessary */ - if (iommu->exclusion_start && - iommu->exclusion_start < dma_dom->aperture_size) { - unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT; - int pages = iommu_num_pages(iommu->exclusion_start, - iommu->exclusion_length, - PAGE_SIZE); - dma_ops_reserve_addresses(dma_dom, startpage, pages); - } return dma_dom; -- cgit v1.1 From 11b83888ae729457b5cfb936dbd498481f6408df Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 19 May 2009 10:23:15 +0200 Subject: amd-iommu: enlarge the aperture dynamically By dynamically increasing the aperture the extended allocator is now ready for use. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 59ee1b9..d129d8f 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -1403,10 +1403,26 @@ static dma_addr_t __map_single(struct device *dev, if (align) align_mask = (1UL << get_order(size)) - 1; +retry: address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask, dma_mask); - if (unlikely(address == bad_dma_address)) - goto out; + if (unlikely(address == bad_dma_address)) { + /* + * setting next_address here will let the address + * allocator only scan the new allocated range in the + * first run. This is a small optimization. + */ + dma_dom->next_address = dma_dom->aperture_size; + + if (alloc_new_range(iommu, dma_dom, false, GFP_ATOMIC)) + goto out; + + /* + * aperture was sucessfully enlarged by 128 MB, try + * allocation again + */ + goto retry; + } start = address; for (i = 0; i < pages; ++i) { -- cgit v1.1 From d9cfed925448f097ec7faab80d903eb7e5f99712 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 19 May 2009 12:16:29 +0200 Subject: amd-iommu: remove amd_iommu_size kernel parameter This parameter is not longer necessary when aperture increases dynamically. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 18 ++++-------------- arch/x86/kernel/amd_iommu_init.c | 15 --------------- 2 files changed, 4 insertions(+), 29 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index d129d8f..31d56c3 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -939,17 +939,10 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom) * It also intializes the page table and the address allocator data * structures required for the dma_ops interface */ -static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, - unsigned order) +static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu) { struct dma_ops_domain *dma_dom; - /* - * Currently the DMA aperture must be between 32 MB and 1GB in size - */ - if ((order < 25) || (order > 30)) - return NULL; - dma_dom = kzalloc(sizeof(struct dma_ops_domain), GFP_KERNEL); if (!dma_dom) return NULL; @@ -1087,7 +1080,6 @@ static int device_change_notifier(struct notifier_block *nb, struct protection_domain *domain; struct dma_ops_domain *dma_domain; struct amd_iommu *iommu; - int order = amd_iommu_aperture_order; unsigned long flags; if (devid > amd_iommu_last_bdf) @@ -1126,7 +1118,7 @@ static int device_change_notifier(struct notifier_block *nb, dma_domain = find_protection_domain(devid); if (dma_domain) goto out; - dma_domain = dma_ops_domain_alloc(iommu, order); + dma_domain = dma_ops_domain_alloc(iommu); if (!dma_domain) goto out; dma_domain->target_dev = devid; @@ -1826,7 +1818,6 @@ static void prealloc_protection_domains(void) struct pci_dev *dev = NULL; struct dma_ops_domain *dma_dom; struct amd_iommu *iommu; - int order = amd_iommu_aperture_order; u16 devid; while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { @@ -1839,7 +1830,7 @@ static void prealloc_protection_domains(void) iommu = amd_iommu_rlookup_table[devid]; if (!iommu) continue; - dma_dom = dma_ops_domain_alloc(iommu, order); + dma_dom = dma_ops_domain_alloc(iommu); if (!dma_dom) continue; init_unity_mappings_for_device(dma_dom, devid); @@ -1865,7 +1856,6 @@ static struct dma_map_ops amd_iommu_dma_ops = { int __init amd_iommu_init_dma_ops(void) { struct amd_iommu *iommu; - int order = amd_iommu_aperture_order; int ret; /* @@ -1874,7 +1864,7 @@ int __init amd_iommu_init_dma_ops(void) * protection domain will be assigned to the default one. */ list_for_each_entry(iommu, &amd_iommu_list, list) { - iommu->default_dom = dma_ops_domain_alloc(iommu, order); + iommu->default_dom = dma_ops_domain_alloc(iommu); if (iommu->default_dom == NULL) return -ENOMEM; iommu->default_dom->domain.flags |= PD_DEFAULT_MASK; diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 8c0be09..762a4ee 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -121,7 +121,6 @@ u16 amd_iommu_last_bdf; /* largest PCI device id we have to handle */ LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings we find in ACPI */ -unsigned amd_iommu_aperture_order = 26; /* size of aperture in power of 2 */ bool amd_iommu_isolate = true; /* if true, device isolation is enabled */ bool amd_iommu_unmap_flush; /* if true, flush on every unmap */ @@ -1137,9 +1136,6 @@ int __init amd_iommu_init(void) enable_iommus(); - printk(KERN_INFO "AMD IOMMU: aperture size is %d MB\n", - (1 << (amd_iommu_aperture_order-20))); - printk(KERN_INFO "AMD IOMMU: device isolation "); if (amd_iommu_isolate) printk("enabled\n"); @@ -1225,15 +1221,4 @@ static int __init parse_amd_iommu_options(char *str) return 1; } -static int __init parse_amd_iommu_size_options(char *str) -{ - unsigned order = PAGE_SHIFT + get_order(memparse(str, &str)); - - if ((order > 24) && (order < 31)) - amd_iommu_aperture_order = order; - - return 1; -} - __setup("amd_iommu=", parse_amd_iommu_options); -__setup("amd_iommu_size=", parse_amd_iommu_size_options); -- cgit v1.1 From fe16f088a88fb73161bba8784375c829f7e87b54 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 22 May 2009 12:27:53 +0200 Subject: amd-iommu: disable round-robin allocator for CONFIG_IOMMU_STRESS Disabling the round-robin allocator results in reusing the same dma-addresses again very fast. This is a good test if the iotlb flushing is working correctly. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 31d56c3..543822b 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -783,6 +783,11 @@ static unsigned long dma_ops_alloc_addresses(struct device *dev, { unsigned long address; +#ifdef CONFIG_IOMMU_STRESS + dom->next_address = 0; + dom->need_flush = true; +#endif + address = dma_ops_area_alloc(dev, dom, pages, align_mask, dma_mask, dom->next_address); -- cgit v1.1 From f5e9705c6429d24dee832b2edd7f4848d432ea03 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 22 May 2009 12:31:53 +0200 Subject: amd-iommu: don't preallocate page tables with CONFIG_IOMMU_STRESS This forces testing of on-demand page table allocation code. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 543822b..33434c4 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -661,6 +661,10 @@ static int alloc_new_range(struct amd_iommu *iommu, int index = dma_dom->aperture_size >> APERTURE_RANGE_SHIFT; int i; +#ifdef CONFIG_IOMMU_STRESS + populate = false; +#endif + if (index >= APERTURE_MAX_RANGES) return -ENOMEM; -- cgit v1.1 From 47bccd6bb2b866449d3ecf2ba350ac1c7473b2b8 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 22 May 2009 12:40:54 +0200 Subject: amd-iommu: don't free dma adresses below 512MB with CONFIG_IOMMU_STRESS This will test the automatic aperture enlargement code. This is important because only very few devices will ever trigger this code path. So force it under CONFIG_IOMMU_STRESS. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 33434c4..04ff5ec 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -824,6 +824,11 @@ static void dma_ops_free_addresses(struct dma_ops_domain *dom, BUG_ON(i >= APERTURE_MAX_RANGES || range == NULL); +#ifdef CONFIG_IOMMU_STRESS + if (i < 4) + return; +#endif + if (address >= dom->next_address) dom->need_flush = true; -- cgit v1.1 From c323d95fa4dbe0b6bf6d59e24a0b7db067dd08a7 Mon Sep 17 00:00:00 2001 From: Yong Wang Date: Fri, 29 May 2009 13:28:35 +0800 Subject: perf_counter/x86: Always use NMI for performance-monitoring interrupt Always use NMI for performance-monitoring interrupt as there could be racy situations if we switch between irq and nmi mode frequently. Signed-off-by: Yong Wang LKML-Reference: <20090529052835.GA13657@ywang-moblin2.bj.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 2 +- arch/x86/kernel/cpu/perf_counter.c | 19 +++++-------------- 2 files changed, 6 insertions(+), 15 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 89b63b5..60df2ef 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1135,7 +1135,7 @@ void __cpuinit setup_local_APIC(void) apic_write(APIC_ESR, 0); } #endif - perf_counters_lapic_init(0); + perf_counters_lapic_init(); preempt_disable(); diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 2eeaa99..316b0c9 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -604,7 +604,7 @@ try_generic: hwc->counter_base = x86_pmu.perfctr; } - perf_counters_lapic_init(hwc->nmi); + perf_counters_lapic_init(); x86_pmu.disable(hwc, idx); @@ -863,24 +863,15 @@ void set_perf_counter_pending(void) apic->send_IPI_self(LOCAL_PENDING_VECTOR); } -void perf_counters_lapic_init(int nmi) +void perf_counters_lapic_init(void) { - u32 apic_val; - if (!x86_pmu_initialized()) return; /* - * Enable the performance counter vector in the APIC LVT: + * Always use NMI for PMU */ - apic_val = apic_read(APIC_LVTERR); - - apic_write(APIC_LVTERR, apic_val | APIC_LVT_MASKED); - if (nmi) - apic_write(APIC_LVTPC, APIC_DM_NMI); - else - apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR); - apic_write(APIC_LVTERR, apic_val); + apic_write(APIC_LVTPC, APIC_DM_NMI); } static int __kprobes @@ -1054,7 +1045,7 @@ void __init init_hw_perf_counters(void) pr_info("... counter mask: %016Lx\n", perf_counter_mask); - perf_counters_lapic_init(0); + perf_counters_lapic_init(); register_die_notifier(&perf_counter_nmi_notifier); } -- cgit v1.1 From 58f892e022e88438183c48661dcdc6a2997dab99 Mon Sep 17 00:00:00 2001 From: Naga Chumbalkar Date: Tue, 26 May 2009 21:48:07 +0000 Subject: x86: Print real IOAPIC version for x86-64 Fix the fact that the IOAPIC version number in the x86_64 code path always gets assigned to 0, instead of the correct value. Before the patch: (from "dmesg" output): ACPI: IOAPIC (id[0x08] address[0xfec00000] gsi_base[0]) IOAPIC[0]: apic_id 8, version 0, address 0xfec00000, GSI 0-23 <--- After the patch: ACPI: IOAPIC (id[0x08] address[0xfec00000] gsi_base[0]) IOAPIC[0]: apic_id 8, version 32, address 0xfec00000, GSI 0-23 <--- History: io_apic_get_version() was compiled out of the x86_64 code path in the commit f2c2cca3acef8b253a36381d9b469ad4fb08563a: Author: Andi Kleen Date: Tue Sep 26 10:52:37 2006 +0200 [PATCH] Remove APIC version/cpu capability mpparse checking/printing ACPI went to great trouble to get the APIC version and CPU capabilities of different CPUs before passing them to the mpparser. But all that data was used was to print it out. Actually it even faked some data based on the boot cpu, not on the actual CPU being booted. Remove all this code because it's not needed. Cc: len.brown@intel.com At the time, the IOAPIC version number was deliberately not printed in the x86_64 code path. However, after the x86 and x86_64 files were merged, the net result is that the IOAPIC version is printed incorrectly in the x86_64 code path. The patch below provides a fix. I have tested it with acpi, and with acpi=off, and did not see any problems. Signed-off-by: Naga Chumbalkar Acked-by: Yinghai Lu LKML-Reference: <20090416014230.4885.94926.sendpatchset@localhost.localdomain> Signed-off-by: Ingo Molnar ************************* --- arch/x86/kernel/acpi/boot.c | 5 +---- arch/x86/kernel/apic/io_apic.c | 2 +- 2 files changed, 2 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 844e5e2..6310861 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -985,11 +985,8 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base) set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); mp_ioapics[idx].apicid = uniq_ioapic_id(id); -#ifdef CONFIG_X86_32 mp_ioapics[idx].apicver = io_apic_get_version(idx); -#else - mp_ioapics[idx].apicver = 0; -#endif + /* * Build basic GSI lookup table to facilitate gsi->io_apic lookups * and to prevent reprogramming of IOAPIC pins (PCI GSIs). diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index ac7f3b6..f712f8f 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -4012,6 +4012,7 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id) return apic_id; } +#endif int __init io_apic_get_version(int ioapic) { @@ -4024,7 +4025,6 @@ int __init io_apic_get_version(int ioapic) return reg_01.bits.version; } -#endif int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity) { -- cgit v1.1 From 3d58829b0510244596079c1d2f1762c53aef2e97 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Thu, 28 May 2009 09:54:47 +0200 Subject: x86, apic: Restore irqs on fail paths lapic_resume forgets to restore interrupts on fail paths. Fix that. Signed-off-by: Jiri Slaby Acked-by: Cyrill Gorcunov LKML-Reference: <1243497289-18591-1-git-send-email-jirislaby@gmail.com> Signed-off-by: Ingo Molnar Cc: H. Peter Anvin --- arch/x86/kernel/apic/apic.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index b0fd264..e82488d 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -2027,7 +2027,7 @@ static int lapic_resume(struct sys_device *dev) unsigned int l, h; unsigned long flags; int maxlvt; - int ret; + int ret = 0; struct IO_APIC_route_entry **ioapic_entries = NULL; if (!apic_pm_state.active) @@ -2038,14 +2038,15 @@ static int lapic_resume(struct sys_device *dev) ioapic_entries = alloc_ioapic_entries(); if (!ioapic_entries) { WARN(1, "Alloc ioapic_entries in lapic resume failed."); - return -ENOMEM; + ret = -ENOMEM; + goto restore; } ret = save_IO_APIC_setup(ioapic_entries); if (ret) { WARN(1, "Saving IO-APIC state failed: %d\n", ret); free_ioapic_entries(ioapic_entries); - return ret; + goto restore; } mask_IO_APIC_setup(ioapic_entries); @@ -2097,10 +2098,10 @@ static int lapic_resume(struct sys_device *dev) restore_IO_APIC_setup(ioapic_entries); free_ioapic_entries(ioapic_entries); } - +restore: local_irq_restore(flags); - return 0; + return ret; } /* -- cgit v1.1 From b23f3325ed465f1bd914384884269af0d106778c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 2 Jun 2009 15:13:03 +0200 Subject: perf_counter: Rename various fields A few renames: s/irq_period/sample_period/ s/irq_freq/sample_freq/ s/PERF_RECORD_/PERF_SAMPLE_/ s/record_type/sample_type/ And change both the new sample_type and read_format to u64. Reported-by: Stephane Eranian Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner Cc: John Kacur LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 316b0c9..ec06aa5 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -290,11 +290,11 @@ static int __hw_perf_counter_init(struct perf_counter *counter) hwc->nmi = 1; hw_event->nmi = 1; - if (!hwc->irq_period) - hwc->irq_period = x86_pmu.max_period; + if (!hwc->sample_period) + hwc->sample_period = x86_pmu.max_period; atomic64_set(&hwc->period_left, - min(x86_pmu.max_period, hwc->irq_period)); + min(x86_pmu.max_period, hwc->sample_period)); /* * Raw event type provide the config in the event structure @@ -462,7 +462,7 @@ x86_perf_counter_set_period(struct perf_counter *counter, struct hw_perf_counter *hwc, int idx) { s64 left = atomic64_read(&hwc->period_left); - s64 period = min(x86_pmu.max_period, hwc->irq_period); + s64 period = min(x86_pmu.max_period, hwc->sample_period); int err; /* -- cgit v1.1 From 8a016db386195b193e2a8aeddff9fe937dcb7a40 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 2 Jun 2009 15:27:45 +0200 Subject: perf_counter: Remove the last nmi/irq bits IRQ (non-NMI) sampling is not used anymore - remove the last few bits. Signed-off-by: Peter Zijlstra Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner Cc: John Kacur LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index ec06aa5..9e144fb 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -284,12 +284,6 @@ static int __hw_perf_counter_init(struct perf_counter *counter) if (!hw_event->exclude_kernel) hwc->config |= ARCH_PERFMON_EVENTSEL_OS; - /* - * Use NMI events all the time: - */ - hwc->nmi = 1; - hw_event->nmi = 1; - if (!hwc->sample_period) hwc->sample_period = x86_pmu.max_period; -- cgit v1.1 From e4abb5d4f7ddabc1fc7c392cf0a10d8e5868c9ca Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 2 Jun 2009 16:08:20 +0200 Subject: perf_counter: x86: Emulate longer sample periods Do as Power already does, emulate sample periods up to 2^63-1 by composing them of smaller values limited by hardware capabilities. Only once we wrap the software period do we generate an overflow event. Just 10 lines of new code. Reported-by: Stephane Eranian Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner Cc: John Kacur LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 9e144fb..904571b 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -287,8 +287,7 @@ static int __hw_perf_counter_init(struct perf_counter *counter) if (!hwc->sample_period) hwc->sample_period = x86_pmu.max_period; - atomic64_set(&hwc->period_left, - min(x86_pmu.max_period, hwc->sample_period)); + atomic64_set(&hwc->period_left, hwc->sample_period); /* * Raw event type provide the config in the event structure @@ -451,13 +450,13 @@ static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]); * Set the next IRQ period, based on the hwc->period_left value. * To be called with the counter disabled in hw: */ -static void +static int x86_perf_counter_set_period(struct perf_counter *counter, struct hw_perf_counter *hwc, int idx) { s64 left = atomic64_read(&hwc->period_left); - s64 period = min(x86_pmu.max_period, hwc->sample_period); - int err; + s64 period = hwc->sample_period; + int err, ret = 0; /* * If we are way outside a reasoable range then just skip forward: @@ -465,11 +464,13 @@ x86_perf_counter_set_period(struct perf_counter *counter, if (unlikely(left <= -period)) { left = period; atomic64_set(&hwc->period_left, left); + ret = 1; } if (unlikely(left <= 0)) { left += period; atomic64_set(&hwc->period_left, left); + ret = 1; } /* * Quirk: certain CPUs dont like it if just 1 event is left: @@ -477,6 +478,9 @@ x86_perf_counter_set_period(struct perf_counter *counter, if (unlikely(left < 2)) left = 2; + if (left > x86_pmu.max_period) + left = x86_pmu.max_period; + per_cpu(prev_left[idx], smp_processor_id()) = left; /* @@ -487,6 +491,8 @@ x86_perf_counter_set_period(struct perf_counter *counter, err = checking_wrmsrl(hwc->counter_base + idx, (u64)(-left) & x86_pmu.counter_mask); + + return ret; } static inline void @@ -706,16 +712,19 @@ static void x86_pmu_disable(struct perf_counter *counter) * Save and restart an expired counter. Called by NMI contexts, * so it has to be careful about preempting normal counter ops: */ -static void intel_pmu_save_and_restart(struct perf_counter *counter) +static int intel_pmu_save_and_restart(struct perf_counter *counter) { struct hw_perf_counter *hwc = &counter->hw; int idx = hwc->idx; + int ret; x86_perf_counter_update(counter, hwc, idx); - x86_perf_counter_set_period(counter, hwc, idx); + ret = x86_perf_counter_set_period(counter, hwc, idx); if (counter->state == PERF_COUNTER_STATE_ACTIVE) intel_pmu_enable_counter(hwc, idx); + + return ret; } static void intel_pmu_reset(void) @@ -782,7 +791,9 @@ again: if (!test_bit(bit, cpuc->active_mask)) continue; - intel_pmu_save_and_restart(counter); + if (!intel_pmu_save_and_restart(counter)) + continue; + if (perf_counter_overflow(counter, nmi, regs, 0)) intel_pmu_disable_counter(&counter->hw, bit); } @@ -824,9 +835,11 @@ static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) continue; /* counter overflow */ - x86_perf_counter_set_period(counter, hwc, idx); handled = 1; inc_irq_stat(apic_perf_irqs); + if (!x86_perf_counter_set_period(counter, hwc, idx)) + continue; + if (perf_counter_overflow(counter, nmi, regs, 0)) amd_pmu_disable_counter(hwc, idx); } -- cgit v1.1 From 0d48696f87e3618b0d35bd3e4e9d7c188d51e7de Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 2 Jun 2009 19:22:16 +0200 Subject: perf_counter: Rename perf_counter_hw_event => perf_counter_attr The structure isn't hw only and when I read event, I think about those things that fall out the other end. Rename the thing. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner Cc: John Kacur Cc: Stephane Eranian LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 904571b..e16e8c1 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -247,11 +247,11 @@ static inline int x86_pmu_initialized(void) } /* - * Setup the hardware configuration for a given hw_event_type + * Setup the hardware configuration for a given attr_type */ static int __hw_perf_counter_init(struct perf_counter *counter) { - struct perf_counter_hw_event *hw_event = &counter->hw_event; + struct perf_counter_attr *attr = &counter->attr; struct hw_perf_counter *hwc = &counter->hw; int err; @@ -279,9 +279,9 @@ static int __hw_perf_counter_init(struct perf_counter *counter) /* * Count user and OS events unless requested not to. */ - if (!hw_event->exclude_user) + if (!attr->exclude_user) hwc->config |= ARCH_PERFMON_EVENTSEL_USR; - if (!hw_event->exclude_kernel) + if (!attr->exclude_kernel) hwc->config |= ARCH_PERFMON_EVENTSEL_OS; if (!hwc->sample_period) @@ -292,15 +292,15 @@ static int __hw_perf_counter_init(struct perf_counter *counter) /* * Raw event type provide the config in the event structure */ - if (perf_event_raw(hw_event)) { - hwc->config |= x86_pmu.raw_event(perf_event_config(hw_event)); + if (perf_event_raw(attr)) { + hwc->config |= x86_pmu.raw_event(perf_event_config(attr)); } else { - if (perf_event_id(hw_event) >= x86_pmu.max_events) + if (perf_event_id(attr) >= x86_pmu.max_events) return -EINVAL; /* * The generic map: */ - hwc->config |= x86_pmu.event_map(perf_event_id(hw_event)); + hwc->config |= x86_pmu.event_map(perf_event_id(attr)); } counter->destroy = hw_perf_counter_destroy; -- cgit v1.1 From a32881066e58346f2901afe0ebdfbf0c562877e5 Mon Sep 17 00:00:00 2001 From: Yong Wang Date: Wed, 3 Jun 2009 13:12:55 +0800 Subject: perf_counter/x86: Remove the IRQ (non-NMI) handling bits Remove the IRQ (non-NMI) handling bits as NMI will be used always. Signed-off-by: Yong Wang Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <20090603051255.GA2791@ywang-moblin2.bj.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 21 ++++++--------------- arch/x86/kernel/entry_64.S | 2 -- arch/x86/kernel/irqinit_32.c | 1 - arch/x86/kernel/irqinit_64.c | 1 - 4 files changed, 6 insertions(+), 19 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index e16e8c1..12cc05e 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -40,7 +40,7 @@ struct cpu_hw_counters { struct x86_pmu { const char *name; int version; - int (*handle_irq)(struct pt_regs *, int); + int (*handle_irq)(struct pt_regs *); void (*disable_all)(void); void (*enable_all)(void); void (*enable)(struct hw_perf_counter *, int); @@ -755,7 +755,7 @@ static void intel_pmu_reset(void) * This handler is triggered by the local APIC, so the APIC IRQ handling * rules apply: */ -static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi) +static int intel_pmu_handle_irq(struct pt_regs *regs) { struct cpu_hw_counters *cpuc; struct cpu_hw_counters; @@ -794,7 +794,7 @@ again: if (!intel_pmu_save_and_restart(counter)) continue; - if (perf_counter_overflow(counter, nmi, regs, 0)) + if (perf_counter_overflow(counter, 1, regs, 0)) intel_pmu_disable_counter(&counter->hw, bit); } @@ -812,7 +812,7 @@ again: return 1; } -static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) +static int amd_pmu_handle_irq(struct pt_regs *regs) { int cpu, idx, handled = 0; struct cpu_hw_counters *cpuc; @@ -840,22 +840,13 @@ static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) if (!x86_perf_counter_set_period(counter, hwc, idx)) continue; - if (perf_counter_overflow(counter, nmi, regs, 0)) + if (perf_counter_overflow(counter, 1, regs, 0)) amd_pmu_disable_counter(hwc, idx); } return handled; } -void smp_perf_counter_interrupt(struct pt_regs *regs) -{ - irq_enter(); - apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR); - ack_APIC_irq(); - x86_pmu.handle_irq(regs, 0); - irq_exit(); -} - void smp_perf_pending_interrupt(struct pt_regs *regs) { irq_enter(); @@ -910,7 +901,7 @@ perf_counter_nmi_handler(struct notifier_block *self, * If the first NMI handles both, the latter will be empty and daze * the CPU. */ - x86_pmu.handle_irq(regs, 1); + x86_pmu.handle_irq(regs); return NOTIFY_STOP; } diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 8910046..7985c01 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1026,8 +1026,6 @@ apicinterrupt SPURIOUS_APIC_VECTOR \ spurious_interrupt smp_spurious_interrupt #ifdef CONFIG_PERF_COUNTERS -apicinterrupt LOCAL_PERF_VECTOR \ - perf_counter_interrupt smp_perf_counter_interrupt apicinterrupt LOCAL_PENDING_VECTOR \ perf_pending_interrupt smp_perf_pending_interrupt #endif diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c index 3190a6b..205bdd8 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c @@ -165,7 +165,6 @@ static void __init apic_intr_init(void) alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); # ifdef CONFIG_PERF_COUNTERS - alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt); alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt); # endif diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c index 53ceb26..fa6ef69 100644 --- a/arch/x86/kernel/irqinit_64.c +++ b/arch/x86/kernel/irqinit_64.c @@ -155,7 +155,6 @@ static void __init apic_intr_init(void) /* Performance monitoring interrupt: */ #ifdef CONFIG_PERF_COUNTERS - alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt); alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt); #endif } -- cgit v1.1 From 367d04c4ec02dad34d80452e32e3370db7fb6fee Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Thu, 28 May 2009 09:54:48 +0200 Subject: amd_iommu: fix lock imbalance In alloc_coherent there is an omitted unlock on the path where mapping fails. Add the unlock. [ Impact: fix lock imbalance in alloc_coherent ] Signed-off-by: Jiri Slaby Cc: Joerg Roedel Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index d689883..9f89bb6 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -1541,8 +1541,10 @@ static void *alloc_coherent(struct device *dev, size_t size, *dma_addr = __map_single(dev, iommu, domain->priv, paddr, size, DMA_BIDIRECTIONAL, true, dma_mask); - if (*dma_addr == bad_dma_address) + if (*dma_addr == bad_dma_address) { + spin_unlock_irqrestore(&domain->lock, flags); goto out_free; + } iommu_completion_wait(iommu); -- cgit v1.1 From 0e2595cdfd7df9f1128f7185152601ae5417483b Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Wed, 20 May 2009 08:10:57 -0500 Subject: x86: Fix UV BAU activation descriptor init The UV tlb shootdown code has a serious initialization error. An array of structures [32*8] is initialized as if it were [32]. The array is indexed by (cpu number on the blade)*8, so the short initialization works for up to 4 cpus on a blade. But above that, we provide an invalid opcode to the hub's broadcast assist unit. This patch changes the allocation of the array to use its symbolic dimensions for better clarity. And initializes all 32*8 entries. Shortened 'UV_ACTIVATION_DESCRIPTOR_SIZE' to 'UV_ADP_SIZE' per Ingo's recommendation. Tested on the UV simulator. Signed-off-by: Cliff Wickman Cc: LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/tlb_uv.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index ed0c337..16f0fd4 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -715,7 +715,12 @@ uv_activation_descriptor_init(int node, int pnode) struct bau_desc *adp; struct bau_desc *ad2; - adp = (struct bau_desc *)kmalloc_node(16384, GFP_KERNEL, node); + /* + * each bau_desc is 64 bytes; there are 8 (UV_ITEMS_PER_DESCRIPTOR) + * per cpu; and up to 32 (UV_ADP_SIZE) cpu's per blade + */ + adp = (struct bau_desc *)kmalloc_node(sizeof(struct bau_desc)* + UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR, GFP_KERNEL, node); BUG_ON(!adp); pa = uv_gpa(adp); /* need the real nasid*/ @@ -729,7 +734,13 @@ uv_activation_descriptor_init(int node, int pnode) (n << UV_DESC_BASE_PNODE_SHIFT | m)); } - for (i = 0, ad2 = adp; i < UV_ACTIVATION_DESCRIPTOR_SIZE; i++, ad2++) { + /* + * initializing all 8 (UV_ITEMS_PER_DESCRIPTOR) descriptors for each + * cpu even though we only use the first one; one descriptor can + * describe a broadcast to 256 nodes. + */ + for (i = 0, ad2 = adp; i < (UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR); + i++, ad2++) { memset(ad2, 0, sizeof(struct bau_desc)); ad2->header.sw_ack_flag = 1; /* -- cgit v1.1 From 128f048f0f0d2a477ad2555e7acd2ad15a1b6061 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 3 Jun 2009 22:19:36 +0200 Subject: perf_counter: Fix throttling lock-up Throttling logic is broken and we can lock up with too small hw sampling intervals. Make the throttling code more robust: disable counters even if we already disabled them. ( Also clean up whitespace damage i noticed while reading various pieces of code related to throttling. ) Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 12cc05e..8f53f3a 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -91,7 +91,7 @@ static u64 intel_pmu_raw_event(u64 event) #define CORE_EVNTSEL_INV_MASK 0x00800000ULL #define CORE_EVNTSEL_COUNTER_MASK 0xFF000000ULL -#define CORE_EVNTSEL_MASK \ +#define CORE_EVNTSEL_MASK \ (CORE_EVNTSEL_EVENT_MASK | \ CORE_EVNTSEL_UNIT_MASK | \ CORE_EVNTSEL_EDGE_MASK | \ -- cgit v1.1 From fe2245c905631a3a353504fc04388ce3dfaf9d9e Mon Sep 17 00:00:00 2001 From: Mark Langsdorf Date: Sun, 5 Jul 2009 15:50:52 -0500 Subject: x86: enable GART-IOMMU only after setting up protection methods The current code to set up the GART as an IOMMU enables GART translations before it removes the aperture from the kernel memory map, sets the GART PTEs to UC, sets up the guard and scratch pages, or does a wbinvd(). This leaves the possibility of cache aliasing open and can cause system crashes. Re-order the code so as to enable the GART translations only after all safeguards are in place and the tlb has been flushed. AMD has tested this patch on both Istanbul systems and 1st generation Opteron systems with APG enabled and seen no adverse effects. Istanbul systems with HT Assist enabled sometimes see MCE errors due to cache artifacts with the unmodified code. Signed-off-by: Mark Langsdorf Cc: Cc: Joerg Roedel Cc: akpm@linux-foundation.org Cc: jbarnes@virtuousgeek.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-gart_64.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index 1e8920d..cfd9f90 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -658,8 +658,6 @@ static __init int init_k8_gatt(struct agp_kern_info *info) agp_gatt_table = gatt; - enable_gart_translations(); - error = sysdev_class_register(&gart_sysdev_class); if (!error) error = sysdev_register(&device_gart); @@ -816,6 +814,14 @@ void __init gart_iommu_init(void) * the pages as Not-Present: */ wbinvd(); + + /* + * Now all caches are flushed and we can safely enable + * GART hardware. Doing it early leaves the possibility + * of stale cache entries that can lead to GART PTE + * errors. + */ + enable_gart_translations(); /* * Try to workaround a bug (thanks to BenH): -- cgit v1.1 From a21ca2cac582886a3e95c8bb84ff7c52d4d15e54 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 6 Jun 2009 09:58:57 +0200 Subject: perf_counter: Separate out attr->type from attr->config Counter type is a frequently used value and we do a lot of bit juggling by encoding and decoding it from attr->config. Clean this up by creating a separate attr->type field. Also clean up the various similarly complex user-space bits all around counter attribute management. The net improvement is significant, and it will be easier to add a new major type (which is what triggered this cleanup). (This changes the ABI, all tools are adapted.) (PowerPC build-tested.) Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 8f53f3a..430e048 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -292,15 +292,15 @@ static int __hw_perf_counter_init(struct perf_counter *counter) /* * Raw event type provide the config in the event structure */ - if (perf_event_raw(attr)) { - hwc->config |= x86_pmu.raw_event(perf_event_config(attr)); + if (attr->type == PERF_TYPE_RAW) { + hwc->config |= x86_pmu.raw_event(attr->config); } else { - if (perf_event_id(attr) >= x86_pmu.max_events) + if (attr->config >= x86_pmu.max_events) return -EINVAL; /* * The generic map: */ - hwc->config |= x86_pmu.event_map(perf_event_id(attr)); + hwc->config |= x86_pmu.event_map(attr->config); } counter->destroy = hw_perf_counter_destroy; -- cgit v1.1 From 8326f44da090d6d304d29b9fdc7fb3e20889e329 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 5 Jun 2009 20:22:46 +0200 Subject: perf_counter: Implement generalized cache event types Extend generic event enumeration with the PERF_TYPE_HW_CACHE method. This is a 3-dimensional space: { L1-D, L1-I, L2, ITLB, DTLB, BPU } x { load, store, prefetch } x { accesses, misses } User-space passes in the 3 coordinates and the kernel provides a counter. (if the hardware supports that type and if the combination makes sense.) Combinations that make no sense produce a -EINVAL. Combinations that are not supported by the hardware produce -ENOTSUP. Extend the tools to deal with this, and rewrite the event symbol parsing code with various popular aliases for the units and access methods above. So 'l1-cache-miss' and 'l1d-read-ops' are both valid aliases. ( x86 is supported for now, with the Nehalem event table filled in, and with Core2 and Atom having placeholder tables. ) Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 201 +++++++++++++++++++++++++++++++++++-- 1 file changed, 193 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 430e048..e86679f 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -83,6 +83,128 @@ static u64 intel_pmu_event_map(int event) return intel_perfmon_event_map[event]; } +/* + * Generalized hw caching related event table, filled + * in on a per model basis. A value of 0 means + * 'not supported', -1 means 'event makes no sense on + * this CPU', any other value means the raw event + * ID. + */ + +#define C(x) PERF_COUNT_HW_CACHE_##x + +static u64 __read_mostly hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX]; + +static const u64 nehalem_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */ + [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */ + [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */ + [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */ + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0480, /* L1I.READS */ + [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(L2 ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */ + [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */ + [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0xc024, /* L2_RQSTS.PREFETCHES */ + [ C(RESULT_MISS) ] = 0x8024, /* L2_RQSTS.PREFETCH_MISS */ + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */ + [ C(RESULT_MISS) ] = 0x0185, /* ITLB_MISS_RETIRED */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */ + [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; + +static const u64 core2_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + /* To be filled in */ +}; + +static const u64 atom_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + /* To be filled in */ +}; + static u64 intel_pmu_raw_event(u64 event) { #define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL @@ -246,6 +368,39 @@ static inline int x86_pmu_initialized(void) return x86_pmu.handle_irq != NULL; } +static inline int +set_ext_hw_attr(struct hw_perf_counter *hwc, struct perf_counter_attr *attr) +{ + unsigned int cache_type, cache_op, cache_result; + u64 config, val; + + config = attr->config; + + cache_type = (config >> 0) & 0xff; + if (cache_type >= PERF_COUNT_HW_CACHE_MAX) + return -EINVAL; + + cache_op = (config >> 8) & 0xff; + if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX) + return -EINVAL; + + cache_result = (config >> 16) & 0xff; + if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX) + return -EINVAL; + + val = hw_cache_event_ids[cache_type][cache_op][cache_result]; + + if (val == 0) + return -ENOENT; + + if (val == -1) + return -EINVAL; + + hwc->config |= val; + + return 0; +} + /* * Setup the hardware configuration for a given attr_type */ @@ -288,22 +443,25 @@ static int __hw_perf_counter_init(struct perf_counter *counter) hwc->sample_period = x86_pmu.max_period; atomic64_set(&hwc->period_left, hwc->sample_period); + counter->destroy = hw_perf_counter_destroy; /* * Raw event type provide the config in the event structure */ if (attr->type == PERF_TYPE_RAW) { hwc->config |= x86_pmu.raw_event(attr->config); - } else { - if (attr->config >= x86_pmu.max_events) - return -EINVAL; - /* - * The generic map: - */ - hwc->config |= x86_pmu.event_map(attr->config); + return 0; } - counter->destroy = hw_perf_counter_destroy; + if (attr->type == PERF_TYPE_HW_CACHE) + return set_ext_hw_attr(hwc, attr); + + if (attr->config >= x86_pmu.max_events) + return -EINVAL; + /* + * The generic map: + */ + hwc->config |= x86_pmu.event_map(attr->config); return 0; } @@ -989,6 +1147,33 @@ static int intel_pmu_init(void) rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); + /* + * Nehalem: + */ + switch (boot_cpu_data.x86_model) { + case 17: + memcpy(hw_cache_event_ids, core2_hw_cache_event_ids, + sizeof(u64)*PERF_COUNT_HW_CACHE_MAX* + PERF_COUNT_HW_CACHE_OP_MAX*PERF_COUNT_HW_CACHE_RESULT_MAX); + + pr_info("... installed Core2 event tables\n"); + break; + default: + case 26: + memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids, + sizeof(u64)*PERF_COUNT_HW_CACHE_MAX* + PERF_COUNT_HW_CACHE_OP_MAX*PERF_COUNT_HW_CACHE_RESULT_MAX); + + pr_info("... installed Nehalem/Corei7 event tables\n"); + break; + case 28: + memcpy(hw_cache_event_ids, atom_hw_cache_event_ids, + sizeof(u64)*PERF_COUNT_HW_CACHE_MAX* + PERF_COUNT_HW_CACHE_OP_MAX*PERF_COUNT_HW_CACHE_RESULT_MAX); + + pr_info("... installed Atom event tables\n"); + break; + } return 0; } -- cgit v1.1 From 5095f59bda6793a7b8f0856096d6893fe98e0e51 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 5 Jun 2009 23:27:17 +0530 Subject: x86: cpu_debug: Remove model information to reduce encoding-decoding Remove model information, encoding/decoding and reduce bookkeeping. This, besides removing a lot of code and cleaning up the code, also enables these features on many more CPUs that were enumerated before. Reported-by: Ingo Molnar Signed-off-by: Jaswinder Singh Rajput Cc: Alan Cox LKML-Reference: <1244224637.8212.6.camel@ht.satnam> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/cpu_debug.c | 417 +++++++++------------------------------- 1 file changed, 96 insertions(+), 321 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/cpu_debug.c b/arch/x86/kernel/cpu/cpu_debug.c index 46e29ab..86afe13 100644 --- a/arch/x86/kernel/cpu/cpu_debug.c +++ b/arch/x86/kernel/cpu/cpu_debug.c @@ -32,9 +32,7 @@ static DEFINE_PER_CPU(struct cpu_cpuX_base, cpu_arr[CPU_REG_ALL_BIT]); static DEFINE_PER_CPU(struct cpu_private *, priv_arr[MAX_CPU_FILES]); -static DEFINE_PER_CPU(unsigned, cpu_modelflag); static DEFINE_PER_CPU(int, cpu_priv_count); -static DEFINE_PER_CPU(unsigned, cpu_model); static DEFINE_MUTEX(cpu_debug_lock); @@ -80,302 +78,102 @@ static struct cpu_file_base cpu_file[] = { { "value", CPU_REG_ALL, 1 }, }; -/* Intel Registers Range */ -static struct cpu_debug_range cpu_intel_range[] = { - { 0x00000000, 0x00000001, CPU_MC, CPU_INTEL_ALL }, - { 0x00000006, 0x00000007, CPU_MONITOR, CPU_CX_AT_XE }, - { 0x00000010, 0x00000010, CPU_TIME, CPU_INTEL_ALL }, - { 0x00000011, 0x00000013, CPU_PMC, CPU_INTEL_PENTIUM }, - { 0x00000017, 0x00000017, CPU_PLATFORM, CPU_PX_CX_AT_XE }, - { 0x0000001B, 0x0000001B, CPU_APIC, CPU_P6_CX_AT_XE }, - - { 0x0000002A, 0x0000002A, CPU_POWERON, CPU_PX_CX_AT_XE }, - { 0x0000002B, 0x0000002B, CPU_POWERON, CPU_INTEL_XEON }, - { 0x0000002C, 0x0000002C, CPU_FREQ, CPU_INTEL_XEON }, - { 0x0000003A, 0x0000003A, CPU_CONTROL, CPU_CX_AT_XE }, - - { 0x00000040, 0x00000043, CPU_LBRANCH, CPU_PM_CX_AT_XE }, - { 0x00000044, 0x00000047, CPU_LBRANCH, CPU_PM_CO_AT }, - { 0x00000060, 0x00000063, CPU_LBRANCH, CPU_C2_AT }, - { 0x00000064, 0x00000067, CPU_LBRANCH, CPU_INTEL_ATOM }, - - { 0x00000079, 0x00000079, CPU_BIOS, CPU_P6_CX_AT_XE }, - { 0x00000088, 0x0000008A, CPU_CACHE, CPU_INTEL_P6 }, - { 0x0000008B, 0x0000008B, CPU_BIOS, CPU_P6_CX_AT_XE }, - { 0x0000009B, 0x0000009B, CPU_MONITOR, CPU_INTEL_XEON }, - - { 0x000000C1, 0x000000C2, CPU_PMC, CPU_P6_CX_AT }, - { 0x000000CD, 0x000000CD, CPU_FREQ, CPU_CX_AT }, - { 0x000000E7, 0x000000E8, CPU_PERF, CPU_CX_AT }, - { 0x000000FE, 0x000000FE, CPU_MTRR, CPU_P6_CX_XE }, - - { 0x00000116, 0x00000116, CPU_CACHE, CPU_INTEL_P6 }, - { 0x00000118, 0x00000118, CPU_CACHE, CPU_INTEL_P6 }, - { 0x00000119, 0x00000119, CPU_CACHE, CPU_INTEL_PX }, - { 0x0000011A, 0x0000011B, CPU_CACHE, CPU_INTEL_P6 }, - { 0x0000011E, 0x0000011E, CPU_CACHE, CPU_PX_CX_AT }, - - { 0x00000174, 0x00000176, CPU_SYSENTER, CPU_P6_CX_AT_XE }, - { 0x00000179, 0x0000017A, CPU_MC, CPU_PX_CX_AT_XE }, - { 0x0000017B, 0x0000017B, CPU_MC, CPU_P6_XE }, - { 0x00000186, 0x00000187, CPU_PMC, CPU_P6_CX_AT }, - { 0x00000198, 0x00000199, CPU_PERF, CPU_PM_CX_AT_XE }, - { 0x0000019A, 0x0000019A, CPU_TIME, CPU_PM_CX_AT_XE }, - { 0x0000019B, 0x0000019D, CPU_THERM, CPU_PM_CX_AT_XE }, - { 0x000001A0, 0x000001A0, CPU_MISC, CPU_PM_CX_AT_XE }, - - { 0x000001C9, 0x000001C9, CPU_LBRANCH, CPU_PM_CX_AT }, - { 0x000001D7, 0x000001D8, CPU_LBRANCH, CPU_INTEL_XEON }, - { 0x000001D9, 0x000001D9, CPU_DEBUG, CPU_CX_AT_XE }, - { 0x000001DA, 0x000001DA, CPU_LBRANCH, CPU_INTEL_XEON }, - { 0x000001DB, 0x000001DB, CPU_LBRANCH, CPU_P6_XE }, - { 0x000001DC, 0x000001DC, CPU_LBRANCH, CPU_INTEL_P6 }, - { 0x000001DD, 0x000001DE, CPU_LBRANCH, CPU_PX_CX_AT_XE }, - { 0x000001E0, 0x000001E0, CPU_LBRANCH, CPU_INTEL_P6 }, - - { 0x00000200, 0x0000020F, CPU_MTRR, CPU_P6_CX_XE }, - { 0x00000250, 0x00000250, CPU_MTRR, CPU_P6_CX_XE }, - { 0x00000258, 0x00000259, CPU_MTRR, CPU_P6_CX_XE }, - { 0x00000268, 0x0000026F, CPU_MTRR, CPU_P6_CX_XE }, - { 0x00000277, 0x00000277, CPU_PAT, CPU_C2_AT_XE }, - { 0x000002FF, 0x000002FF, CPU_MTRR, CPU_P6_CX_XE }, - - { 0x00000300, 0x00000308, CPU_PMC, CPU_INTEL_XEON }, - { 0x00000309, 0x0000030B, CPU_PMC, CPU_C2_AT_XE }, - { 0x0000030C, 0x00000311, CPU_PMC, CPU_INTEL_XEON }, - { 0x00000345, 0x00000345, CPU_PMC, CPU_C2_AT }, - { 0x00000360, 0x00000371, CPU_PMC, CPU_INTEL_XEON }, - { 0x0000038D, 0x00000390, CPU_PMC, CPU_C2_AT }, - { 0x000003A0, 0x000003BE, CPU_PMC, CPU_INTEL_XEON }, - { 0x000003C0, 0x000003CD, CPU_PMC, CPU_INTEL_XEON }, - { 0x000003E0, 0x000003E1, CPU_PMC, CPU_INTEL_XEON }, - { 0x000003F0, 0x000003F0, CPU_PMC, CPU_INTEL_XEON }, - { 0x000003F1, 0x000003F1, CPU_PMC, CPU_C2_AT_XE }, - { 0x000003F2, 0x000003F2, CPU_PMC, CPU_INTEL_XEON }, - - { 0x00000400, 0x00000402, CPU_MC, CPU_PM_CX_AT_XE }, - { 0x00000403, 0x00000403, CPU_MC, CPU_INTEL_XEON }, - { 0x00000404, 0x00000406, CPU_MC, CPU_PM_CX_AT_XE }, - { 0x00000407, 0x00000407, CPU_MC, CPU_INTEL_XEON }, - { 0x00000408, 0x0000040A, CPU_MC, CPU_PM_CX_AT_XE }, - { 0x0000040B, 0x0000040B, CPU_MC, CPU_INTEL_XEON }, - { 0x0000040C, 0x0000040E, CPU_MC, CPU_PM_CX_XE }, - { 0x0000040F, 0x0000040F, CPU_MC, CPU_INTEL_XEON }, - { 0x00000410, 0x00000412, CPU_MC, CPU_PM_CX_AT_XE }, - { 0x00000413, 0x00000417, CPU_MC, CPU_CX_AT_XE }, - { 0x00000480, 0x0000048B, CPU_VMX, CPU_CX_AT_XE }, - - { 0x00000600, 0x00000600, CPU_DEBUG, CPU_PM_CX_AT_XE }, - { 0x00000680, 0x0000068F, CPU_LBRANCH, CPU_INTEL_XEON }, - { 0x000006C0, 0x000006CF, CPU_LBRANCH, CPU_INTEL_XEON }, - - { 0x000107CC, 0x000107D3, CPU_PMC, CPU_INTEL_XEON_MP }, - - { 0xC0000080, 0xC0000080, CPU_FEATURES, CPU_INTEL_XEON }, - { 0xC0000081, 0xC0000082, CPU_CALL, CPU_INTEL_XEON }, - { 0xC0000084, 0xC0000084, CPU_CALL, CPU_INTEL_XEON }, - { 0xC0000100, 0xC0000102, CPU_BASE, CPU_INTEL_XEON }, +/* CPU Registers Range */ +static struct cpu_debug_range cpu_reg_range[] = { + { 0x00000000, 0x00000001, CPU_MC, }, + { 0x00000006, 0x00000007, CPU_MONITOR, }, + { 0x00000010, 0x00000010, CPU_TIME, }, + { 0x00000011, 0x00000013, CPU_PMC, }, + { 0x00000017, 0x00000017, CPU_PLATFORM, }, + { 0x0000001B, 0x0000001B, CPU_APIC, }, + { 0x0000002A, 0x0000002B, CPU_POWERON, }, + { 0x0000002C, 0x0000002C, CPU_FREQ, }, + { 0x0000003A, 0x0000003A, CPU_CONTROL, }, + { 0x00000040, 0x00000047, CPU_LBRANCH, }, + { 0x00000060, 0x00000067, CPU_LBRANCH, }, + { 0x00000079, 0x00000079, CPU_BIOS, }, + { 0x00000088, 0x0000008A, CPU_CACHE, }, + { 0x0000008B, 0x0000008B, CPU_BIOS, }, + { 0x0000009B, 0x0000009B, CPU_MONITOR, }, + { 0x000000C1, 0x000000C4, CPU_PMC, }, + { 0x000000CD, 0x000000CD, CPU_FREQ, }, + { 0x000000E7, 0x000000E8, CPU_PERF, }, + { 0x000000FE, 0x000000FE, CPU_MTRR, }, + + { 0x00000116, 0x0000011E, CPU_CACHE, }, + { 0x00000174, 0x00000176, CPU_SYSENTER, }, + { 0x00000179, 0x0000017B, CPU_MC, }, + { 0x00000186, 0x00000189, CPU_PMC, }, + { 0x00000198, 0x00000199, CPU_PERF, }, + { 0x0000019A, 0x0000019A, CPU_TIME, }, + { 0x0000019B, 0x0000019D, CPU_THERM, }, + { 0x000001A0, 0x000001A0, CPU_MISC, }, + { 0x000001C9, 0x000001C9, CPU_LBRANCH, }, + { 0x000001D7, 0x000001D8, CPU_LBRANCH, }, + { 0x000001D9, 0x000001D9, CPU_DEBUG, }, + { 0x000001DA, 0x000001E0, CPU_LBRANCH, }, + + { 0x00000200, 0x0000020F, CPU_MTRR, }, + { 0x00000250, 0x00000250, CPU_MTRR, }, + { 0x00000258, 0x00000259, CPU_MTRR, }, + { 0x00000268, 0x0000026F, CPU_MTRR, }, + { 0x00000277, 0x00000277, CPU_PAT, }, + { 0x000002FF, 0x000002FF, CPU_MTRR, }, + + { 0x00000300, 0x00000311, CPU_PMC, }, + { 0x00000345, 0x00000345, CPU_PMC, }, + { 0x00000360, 0x00000371, CPU_PMC, }, + { 0x0000038D, 0x00000390, CPU_PMC, }, + { 0x000003A0, 0x000003BE, CPU_PMC, }, + { 0x000003C0, 0x000003CD, CPU_PMC, }, + { 0x000003E0, 0x000003E1, CPU_PMC, }, + { 0x000003F0, 0x000003F2, CPU_PMC, }, + + { 0x00000400, 0x00000417, CPU_MC, }, + { 0x00000480, 0x0000048B, CPU_VMX, }, + + { 0x00000600, 0x00000600, CPU_DEBUG, }, + { 0x00000680, 0x0000068F, CPU_LBRANCH, }, + { 0x000006C0, 0x000006CF, CPU_LBRANCH, }, + + { 0x000107CC, 0x000107D3, CPU_PMC, }, + + { 0xC0000080, 0xC0000080, CPU_FEATURES, }, + { 0xC0000081, 0xC0000084, CPU_CALL, }, + { 0xC0000100, 0xC0000102, CPU_BASE, }, + { 0xC0000103, 0xC0000103, CPU_TIME, }, + + { 0xC0010000, 0xC0010007, CPU_PMC, }, + { 0xC0010010, 0xC0010010, CPU_CONF, }, + { 0xC0010015, 0xC0010015, CPU_CONF, }, + { 0xC0010016, 0xC001001A, CPU_MTRR, }, + { 0xC001001D, 0xC001001D, CPU_MTRR, }, + { 0xC001001F, 0xC001001F, CPU_CONF, }, + { 0xC0010030, 0xC0010035, CPU_BIOS, }, + { 0xC0010044, 0xC0010048, CPU_MC, }, + { 0xC0010050, 0xC0010056, CPU_SMM, }, + { 0xC0010058, 0xC0010058, CPU_CONF, }, + { 0xC0010060, 0xC0010060, CPU_CACHE, }, + { 0xC0010061, 0xC0010068, CPU_SMM, }, + { 0xC0010069, 0xC001006B, CPU_SMM, }, + { 0xC0010070, 0xC0010071, CPU_SMM, }, + { 0xC0010111, 0xC0010113, CPU_SMM, }, + { 0xC0010114, 0xC0010118, CPU_SVM, }, + { 0xC0010140, 0xC0010141, CPU_OSVM, }, + { 0xC0011022, 0xC0011023, CPU_CONF, }, }; -/* AMD Registers Range */ -static struct cpu_debug_range cpu_amd_range[] = { - { 0x00000000, 0x00000001, CPU_MC, CPU_K10_PLUS, }, - { 0x00000010, 0x00000010, CPU_TIME, CPU_K8_PLUS, }, - { 0x0000001B, 0x0000001B, CPU_APIC, CPU_K8_PLUS, }, - { 0x0000002A, 0x0000002A, CPU_POWERON, CPU_K7_PLUS }, - { 0x0000008B, 0x0000008B, CPU_VER, CPU_K8_PLUS }, - { 0x000000FE, 0x000000FE, CPU_MTRR, CPU_K8_PLUS, }, - - { 0x00000174, 0x00000176, CPU_SYSENTER, CPU_K8_PLUS, }, - { 0x00000179, 0x0000017B, CPU_MC, CPU_K8_PLUS, }, - { 0x000001D9, 0x000001D9, CPU_DEBUG, CPU_K8_PLUS, }, - { 0x000001DB, 0x000001DE, CPU_LBRANCH, CPU_K8_PLUS, }, - - { 0x00000200, 0x0000020F, CPU_MTRR, CPU_K8_PLUS, }, - { 0x00000250, 0x00000250, CPU_MTRR, CPU_K8_PLUS, }, - { 0x00000258, 0x00000259, CPU_MTRR, CPU_K8_PLUS, }, - { 0x00000268, 0x0000026F, CPU_MTRR, CPU_K8_PLUS, }, - { 0x00000277, 0x00000277, CPU_PAT, CPU_K8_PLUS, }, - { 0x000002FF, 0x000002FF, CPU_MTRR, CPU_K8_PLUS, }, - - { 0x00000400, 0x00000413, CPU_MC, CPU_K8_PLUS, }, - - { 0xC0000080, 0xC0000080, CPU_FEATURES, CPU_AMD_ALL, }, - { 0xC0000081, 0xC0000084, CPU_CALL, CPU_K8_PLUS, }, - { 0xC0000100, 0xC0000102, CPU_BASE, CPU_K8_PLUS, }, - { 0xC0000103, 0xC0000103, CPU_TIME, CPU_K10_PLUS, }, - - { 0xC0010000, 0xC0010007, CPU_PMC, CPU_K8_PLUS, }, - { 0xC0010010, 0xC0010010, CPU_CONF, CPU_K7_PLUS, }, - { 0xC0010015, 0xC0010015, CPU_CONF, CPU_K7_PLUS, }, - { 0xC0010016, 0xC001001A, CPU_MTRR, CPU_K8_PLUS, }, - { 0xC001001D, 0xC001001D, CPU_MTRR, CPU_K8_PLUS, }, - { 0xC001001F, 0xC001001F, CPU_CONF, CPU_K8_PLUS, }, - { 0xC0010030, 0xC0010035, CPU_BIOS, CPU_K8_PLUS, }, - { 0xC0010044, 0xC0010048, CPU_MC, CPU_K8_PLUS, }, - { 0xC0010050, 0xC0010056, CPU_SMM, CPU_K0F_PLUS, }, - { 0xC0010058, 0xC0010058, CPU_CONF, CPU_K10_PLUS, }, - { 0xC0010060, 0xC0010060, CPU_CACHE, CPU_AMD_11, }, - { 0xC0010061, 0xC0010068, CPU_SMM, CPU_K10_PLUS, }, - { 0xC0010069, 0xC001006B, CPU_SMM, CPU_AMD_11, }, - { 0xC0010070, 0xC0010071, CPU_SMM, CPU_K10_PLUS, }, - { 0xC0010111, 0xC0010113, CPU_SMM, CPU_K8_PLUS, }, - { 0xC0010114, 0xC0010118, CPU_SVM, CPU_K10_PLUS, }, - { 0xC0010140, 0xC0010141, CPU_OSVM, CPU_K10_PLUS, }, - { 0xC0011022, 0xC0011023, CPU_CONF, CPU_K10_PLUS, }, -}; - - -/* Intel */ -static int get_intel_modelflag(unsigned model) -{ - int flag; - - switch (model) { - case 0x0501: - case 0x0502: - case 0x0504: - flag = CPU_INTEL_PENTIUM; - break; - case 0x0601: - case 0x0603: - case 0x0605: - case 0x0607: - case 0x0608: - case 0x060A: - case 0x060B: - flag = CPU_INTEL_P6; - break; - case 0x0609: - case 0x060D: - flag = CPU_INTEL_PENTIUM_M; - break; - case 0x060E: - flag = CPU_INTEL_CORE; - break; - case 0x060F: - case 0x0617: - flag = CPU_INTEL_CORE2; - break; - case 0x061C: - flag = CPU_INTEL_ATOM; - break; - case 0x0F00: - case 0x0F01: - case 0x0F02: - case 0x0F03: - case 0x0F04: - flag = CPU_INTEL_XEON_P4; - break; - case 0x0F06: - flag = CPU_INTEL_XEON_MP; - break; - default: - flag = CPU_NONE; - break; - } - - return flag; -} - -/* AMD */ -static int get_amd_modelflag(unsigned model) -{ - int flag; - - switch (model >> 8) { - case 0x6: - flag = CPU_AMD_K6; - break; - case 0x7: - flag = CPU_AMD_K7; - break; - case 0x8: - flag = CPU_AMD_K8; - break; - case 0xf: - flag = CPU_AMD_0F; - break; - case 0x10: - flag = CPU_AMD_10; - break; - case 0x11: - flag = CPU_AMD_11; - break; - default: - flag = CPU_NONE; - break; - } - - return flag; -} - -static int get_cpu_modelflag(unsigned cpu) -{ - int flag; - - flag = per_cpu(cpu_model, cpu); - - switch (flag >> 16) { - case X86_VENDOR_INTEL: - flag = get_intel_modelflag(flag); - break; - case X86_VENDOR_AMD: - flag = get_amd_modelflag(flag & 0xffff); - break; - default: - flag = CPU_NONE; - break; - } - - return flag; -} - -static int get_cpu_range_count(unsigned cpu) -{ - int index; - - switch (per_cpu(cpu_model, cpu) >> 16) { - case X86_VENDOR_INTEL: - index = ARRAY_SIZE(cpu_intel_range); - break; - case X86_VENDOR_AMD: - index = ARRAY_SIZE(cpu_amd_range); - break; - default: - index = 0; - break; - } - - return index; -} - static int is_typeflag_valid(unsigned cpu, unsigned flag) { - unsigned vendor, modelflag; - int i, index; + int i; /* Standard Registers should be always valid */ if (flag >= CPU_TSS) return 1; - modelflag = per_cpu(cpu_modelflag, cpu); - vendor = per_cpu(cpu_model, cpu) >> 16; - index = get_cpu_range_count(cpu); - - for (i = 0; i < index; i++) { - switch (vendor) { - case X86_VENDOR_INTEL: - if ((cpu_intel_range[i].model & modelflag) && - (cpu_intel_range[i].flag & flag)) - return 1; - break; - case X86_VENDOR_AMD: - if ((cpu_amd_range[i].model & modelflag) && - (cpu_amd_range[i].flag & flag)) - return 1; - break; - } + for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) { + if (cpu_reg_range[i].flag == flag) + return 1; } /* Invalid */ @@ -385,26 +183,11 @@ static int is_typeflag_valid(unsigned cpu, unsigned flag) static unsigned get_cpu_range(unsigned cpu, unsigned *min, unsigned *max, int index, unsigned flag) { - unsigned modelflag; - - modelflag = per_cpu(cpu_modelflag, cpu); - *max = 0; - switch (per_cpu(cpu_model, cpu) >> 16) { - case X86_VENDOR_INTEL: - if ((cpu_intel_range[index].model & modelflag) && - (cpu_intel_range[index].flag & flag)) { - *min = cpu_intel_range[index].min; - *max = cpu_intel_range[index].max; - } - break; - case X86_VENDOR_AMD: - if ((cpu_amd_range[index].model & modelflag) && - (cpu_amd_range[index].flag & flag)) { - *min = cpu_amd_range[index].min; - *max = cpu_amd_range[index].max; - } - break; - } + if (cpu_reg_range[index].flag == flag) { + *min = cpu_reg_range[index].min; + *max = cpu_reg_range[index].max; + } else + *max = 0; return *max; } @@ -434,7 +217,7 @@ static void print_msr(struct seq_file *seq, unsigned cpu, unsigned flag) unsigned msr, msr_min, msr_max; struct cpu_private *priv; u32 low, high; - int i, range; + int i; if (seq) { priv = seq->private; @@ -446,9 +229,7 @@ static void print_msr(struct seq_file *seq, unsigned cpu, unsigned flag) } } - range = get_cpu_range_count(cpu); - - for (i = 0; i < range; i++) { + for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) { if (!get_cpu_range(cpu, &msr_min, &msr_max, i, flag)) continue; @@ -788,13 +569,11 @@ static int cpu_init_msr(unsigned cpu, unsigned type, struct dentry *dentry) { struct dentry *cpu_dentry = NULL; unsigned reg, reg_min, reg_max; - int i, range, err = 0; + int i, err = 0; char reg_dir[12]; u32 low, high; - range = get_cpu_range_count(cpu); - - for (i = 0; i < range; i++) { + for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) { if (!get_cpu_range(cpu, ®_min, ®_max, i, cpu_base[type].flag)) continue; @@ -850,10 +629,6 @@ static int cpu_init_cpu(void) cpui = &cpu_data(cpu); if (!cpu_has(cpui, X86_FEATURE_MSR)) continue; - per_cpu(cpu_model, cpu) = ((cpui->x86_vendor << 16) | - (cpui->x86 << 8) | - (cpui->x86_model)); - per_cpu(cpu_modelflag, cpu) = get_cpu_modelflag(cpu); sprintf(cpu_dir, "cpu%d", cpu); cpu_dentry = debugfs_create_dir(cpu_dir, cpu_debugfs_dir); -- cgit v1.1 From 4a4aca641bc4598e77b866804f47c651ec4a764d Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Fri, 5 Jun 2009 12:02:38 +0200 Subject: x86: Add quirk for reboot stalls on a Dell Optiplex 360 The Dell Optiplex 360 hangs on reboot, just like the Optiplex 330, so the same quirk is needed. Signed-off-by: Jean Delvare Cc: Steve Conklin Cc: Leann Ogasawara Cc: LKML-Reference: <200906051202.38311.jdelvare@suse.de> Signed-off-by: Ingo Molnar --- arch/x86/kernel/reboot.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 667188e..d2d1ce8 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -192,6 +192,15 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_BOARD_NAME, "0KP561"), }, }, + { /* Handle problems with rebooting on Dell Optiplex 360 with 0T656F */ + .callback = set_bios_reboot, + .ident = "Dell OptiPlex 360", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 360"), + DMI_MATCH(DMI_BOARD_NAME, "0T656F"), + }, + }, { /* Handle problems with rebooting on Dell 2400's */ .callback = set_bios_reboot, .ident = "Dell PowerEdge 2400", -- cgit v1.1 From 103428e57be323c3c5545db8ad12667099bc6005 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sun, 7 Jun 2009 16:48:40 +0400 Subject: x86, apic: Fix dummy apic read operation together with broken MP handling Ingo Molnar reported that read_apic is buggy novadays: [ 0.000000] Using APIC driver default [ 0.000000] SMP: Allowing 1 CPUs, 0 hotplug CPUs [ 0.000000] Local APIC disabled by BIOS -- you can enable it with "lapic" [ 0.000000] APIC: disable apic facility [ 0.000000] ------------[ cut here ]------------ [ 0.000000] WARNING: at arch/x86/kernel/apic/apic.c:254 native_apic_read_dummy+0x2d/0x3b() [ 0.000000] Hardware name: HP OmniBook PC Indeed we still rely on apic->read operation for SMP compiled kernel. And instead of disfigure the SMP code with #ifdef we allow to call apic->read. To capture any unexpected results we check for apic->read being called for sane reason via WARN_ON_ONCE but(!) instead of OR we should use AND logical operation (thanks Yinghai for spotting the root of the problem). Along with that we could be have bad MP table and we are to fix it that way no SMP started and no complains about BIOS bug if apic was just disabled via command line. Signed-off-by: Cyrill Gorcunov Cc: Yinghai Lu LKML-Reference: <20090607124840.GD4547@lenovo> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 9 ++++++++- arch/x86/kernel/smpboot.c | 8 +++++--- 2 files changed, 13 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index e82488d..a4c9cf0 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -249,7 +249,7 @@ static void native_apic_write_dummy(u32 reg, u32 v) static u32 native_apic_read_dummy(u32 reg) { - WARN_ON_ONCE((cpu_has_apic || !disable_apic)); + WARN_ON_ONCE((cpu_has_apic && !disable_apic)); return 0; } @@ -1609,6 +1609,13 @@ void __init init_apic_mappings(void) new_apicid = read_apic_id(); if (boot_cpu_physical_apicid != new_apicid) { boot_cpu_physical_apicid = new_apicid; + /* + * yeah -- we lie about apic_version + * in case if apic was disabled via boot option + * but it's not a problem for SMP compiled kernel + * since smp_sanity_check is prepared for such a case + * and disable smp mode + */ apic_version[new_apicid] = GET_APIC_VERSION(apic_read(APIC_LVR)); } diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index d2e8de9..7c80007 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -992,10 +992,12 @@ static int __init smp_sanity_check(unsigned max_cpus) */ if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid]) && !cpu_has_apic) { - printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n", - boot_cpu_physical_apicid); - printk(KERN_ERR "... forcing use of dummy APIC emulation." + if (!disable_apic) { + pr_err("BIOS bug, local APIC #%d not detected!...\n", + boot_cpu_physical_apicid); + pr_err("... forcing use of dummy APIC emulation." "(tell your hw vendor)\n"); + } smpboot_clear_io_apic(); arch_disable_smp_support(); return -1; -- cgit v1.1 From 3aa6b186f86c5d06d6d92d14311ffed51f091f40 Mon Sep 17 00:00:00 2001 From: Lubomir Rintel Date: Sun, 7 Jun 2009 16:23:48 +0200 Subject: x86: Fix non-lazy GS handling in sys_vm86() This fixes a stack corruption panic or null dereference oops due to a bad GS in resume_userspace() when returning from sys_vm86() and calling lockdep_sys_exit(). Only a problem when CONFIG_LOCKDEP and CONFIG_CC_STACKPROTECTOR enabled. Signed-off-by: Lubomir Rintel Cc: H. Peter Anvin LKML-Reference: <1244384628.2323.4.camel@bimbo> Signed-off-by: Ingo Molnar --- arch/x86/kernel/vm86_32.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index d7ac84e..6a17769 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -287,10 +287,9 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk info->regs.pt.ds = 0; info->regs.pt.es = 0; info->regs.pt.fs = 0; - -/* we are clearing gs later just before "jmp resume_userspace", - * because it is not saved/restored. - */ +#ifndef CONFIG_X86_32_LAZY_GS + info->regs.pt.gs = 0; +#endif /* * The flags register is also special: we cannot trust that the user @@ -343,7 +342,9 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk __asm__ __volatile__( "movl %0,%%esp\n\t" "movl %1,%%ebp\n\t" +#ifdef CONFIG_X86_32_LAZY_GS "mov %2, %%gs\n\t" +#endif "jmp resume_userspace" : /* no outputs */ :"r" (&info->regs), "r" (task_thread_info(tsk)), "r" (0)); -- cgit v1.1 From aeef50bc0483fa70ce0bddb686ec84a274b7f3d4 Mon Sep 17 00:00:00 2001 From: "Figo.zhang" Date: Sun, 7 Jun 2009 22:30:36 +0800 Subject: x86, microcode: Simplify vfree() use vfree() does its own 'NULL' check, so no need for check before calling it. In v2, remove the stray newline. [ Impact: cleanup ] Signed-off-by: Figo.zhang Cc: Dmitry Adamushko LKML-Reference: <1244385036.3402.11.camel@myhost> Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode_amd.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index c8be20f..366baa1 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -241,10 +241,8 @@ static int install_equiv_cpu_table(const u8 *buf) static void free_equiv_cpu_table(void) { - if (equiv_cpu_table) { - vfree(equiv_cpu_table); - equiv_cpu_table = NULL; - } + vfree(equiv_cpu_table); + equiv_cpu_table = NULL; } static enum ucode_state @@ -279,8 +277,7 @@ generic_load_microcode(int cpu, const u8 *data, size_t size) mc_header = (struct microcode_header_amd *)mc; if (get_matching_microcode(cpu, mc, new_rev)) { - if (new_mc) - vfree(new_mc); + vfree(new_mc); new_rev = mc_header->patch_id; new_mc = mc; } else @@ -292,8 +289,7 @@ generic_load_microcode(int cpu, const u8 *data, size_t size) if (new_mc) { if (!leftover) { - if (uci->mc) - vfree(uci->mc); + vfree(uci->mc); uci->mc = new_mc; pr_debug("microcode: CPU%d found a matching microcode " "update with version 0x%x (current=0x%x)\n", -- cgit v1.1 From 0312af84164215a452f2a94957ebd9bce86e0204 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 8 Jun 2009 07:42:04 +0200 Subject: perf_counter, x86: Implement generalized cache event types, add Core2 support Fill in core2_hw_cache_event_id[] with the Core2 model specific events. The events can be used in all the tools via the -e (--event) parameter, for example "-e l1-misses" or -"-e l2-accesses" or "-e l2-write-misses". Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 85 +++++++++++++++++++++++++++++++++++++- 1 file changed, 84 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index e86679f..b1f71ff 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -194,7 +194,90 @@ static const u64 core2_hw_cache_event_ids [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX] = { - /* To be filled in */ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */ + [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */ + [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x104e, /* L1D_PREFETCH.REQUESTS */ + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS */ + [ C(RESULT_MISS) ] = 0x0081, /* L1I.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(L2 ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */ + [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */ + [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x0208, /* DTLB_MISSES.MISS_LD */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x0808, /* DTLB_MISSES.MISS_ST */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */ + [ C(RESULT_MISS) ] = 0x1282, /* ITLBMISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */ + [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, }; static const u64 atom_hw_cache_event_ids -- cgit v1.1 From ad689220614b6c7c0b13b70d742f358e9310e71e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 8 Jun 2009 09:30:41 +0200 Subject: perf_counter, x86: Implement generalized cache event types, add Atom support Fill in core2_hw_cache_event_id[] with the Atom model specific events. The events can be used in all the tools via the -e (--event) parameter, for example "-e l1-misses" or -"-e l2-accesses" or "-e l2-write-misses". ( Note: these are straight from the Intel manuals - not tested yet.) Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 85 +++++++++++++++++++++++++++++++++++++- 1 file changed, 84 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index b1f71ff..71590e0 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -285,7 +285,90 @@ static const u64 atom_hw_cache_event_ids [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX] = { - /* To be filled in */ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE.LD */ + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x2241, /* L1D_CACHE.ST */ + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS */ + [ C(RESULT_MISS) ] = 0x0081, /* L1I.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(L2 ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */ + [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */ + [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x0508, /* DTLB_MISSES.MISS_LD */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x0608, /* DTLB_MISSES.MISS_ST */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */ + [ C(RESULT_MISS) ] = 0x0282, /* ITLB.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */ + [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, }; static u64 intel_pmu_raw_event(u64 event) -- cgit v1.1 From 1123e3ad73697d64ad99f0104bbe49f8b52d7d65 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 29 May 2009 11:25:09 +0200 Subject: perf_counter: Clean up x86 boot messages Standardize and tidy up all the messages we print during perfcounter initialization. Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 46 ++++++++++++++++++++------------------ 1 file changed, 24 insertions(+), 22 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 71590e0..0339d19 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -1298,23 +1298,22 @@ static int intel_pmu_init(void) if (version < 2) return -ENODEV; - x86_pmu = intel_pmu; - x86_pmu.version = version; - x86_pmu.num_counters = eax.split.num_counters; + x86_pmu = intel_pmu; + x86_pmu.version = version; + x86_pmu.num_counters = eax.split.num_counters; + x86_pmu.counter_bits = eax.split.bit_width; + x86_pmu.counter_mask = (1ULL << eax.split.bit_width) - 1; /* * Quirk: v2 perfmon does not report fixed-purpose counters, so * assume at least 3 counters: */ - x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3); - - x86_pmu.counter_bits = eax.split.bit_width; - x86_pmu.counter_mask = (1ULL << eax.split.bit_width) - 1; + x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3); rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); /* - * Nehalem: + * Install the hw-cache-events table: */ switch (boot_cpu_data.x86_model) { case 17: @@ -1322,7 +1321,7 @@ static int intel_pmu_init(void) sizeof(u64)*PERF_COUNT_HW_CACHE_MAX* PERF_COUNT_HW_CACHE_OP_MAX*PERF_COUNT_HW_CACHE_RESULT_MAX); - pr_info("... installed Core2 event tables\n"); + pr_cont("Core2 events, "); break; default: case 26: @@ -1330,14 +1329,14 @@ static int intel_pmu_init(void) sizeof(u64)*PERF_COUNT_HW_CACHE_MAX* PERF_COUNT_HW_CACHE_OP_MAX*PERF_COUNT_HW_CACHE_RESULT_MAX); - pr_info("... installed Nehalem/Corei7 event tables\n"); + pr_cont("Nehalem/Corei7 events, "); break; case 28: memcpy(hw_cache_event_ids, atom_hw_cache_event_ids, sizeof(u64)*PERF_COUNT_HW_CACHE_MAX* PERF_COUNT_HW_CACHE_OP_MAX*PERF_COUNT_HW_CACHE_RESULT_MAX); - pr_info("... installed Atom event tables\n"); + pr_cont("Atom events, "); break; } return 0; @@ -1353,6 +1352,8 @@ void __init init_hw_perf_counters(void) { int err; + pr_info("Performance Counters: "); + switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_INTEL: err = intel_pmu_init(); @@ -1363,14 +1364,13 @@ void __init init_hw_perf_counters(void) default: return; } - if (err != 0) + if (err != 0) { + pr_cont("no PMU driver, software counters only.\n"); return; + } - pr_info("%s Performance Monitoring support detected.\n", x86_pmu.name); - pr_info("... version: %d\n", x86_pmu.version); - pr_info("... bit width: %d\n", x86_pmu.counter_bits); + pr_cont("%s PMU driver.\n", x86_pmu.name); - pr_info("... num counters: %d\n", x86_pmu.num_counters); if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) { x86_pmu.num_counters = X86_PMC_MAX_GENERIC; WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!", @@ -1379,23 +1379,25 @@ void __init init_hw_perf_counters(void) perf_counter_mask = (1 << x86_pmu.num_counters) - 1; perf_max_counters = x86_pmu.num_counters; - pr_info("... value mask: %016Lx\n", x86_pmu.counter_mask); - pr_info("... max period: %016Lx\n", x86_pmu.max_period); - if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) { x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED; WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!", x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED); } - pr_info("... fixed counters: %d\n", x86_pmu.num_counters_fixed); perf_counter_mask |= ((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED; - pr_info("... counter mask: %016Lx\n", perf_counter_mask); - perf_counters_lapic_init(); register_die_notifier(&perf_counter_nmi_notifier); + + pr_info("... version: %d\n", x86_pmu.version); + pr_info("... bit width: %d\n", x86_pmu.counter_bits); + pr_info("... generic counters: %d\n", x86_pmu.num_counters); + pr_info("... value mask: %016Lx\n", x86_pmu.counter_mask); + pr_info("... max period: %016Lx\n", x86_pmu.max_period); + pr_info("... fixed-purpose counters: %d\n", x86_pmu.num_counters_fixed); + pr_info("... counter mask: %016Lx\n", perf_counter_mask); } static inline void x86_pmu_read(struct perf_counter *counter) -- cgit v1.1 From c4ed3f04ba9defe22aa729d1646f970f791c03d7 Mon Sep 17 00:00:00 2001 From: Jack Steiner Date: Mon, 8 Jun 2009 10:44:05 -0500 Subject: x86, UV: Fix macros for multiple coherency domains Fix bug in the SGI UV macros that support systems with multiple coherency domains. The macros used for referencing global MMR (chipset registers) are failing to correctly "or" the NASID (node identifier) bits that reside above M+N. These high bits are supplied automatically by the chipset for memory accesses coming from the processor socket. However, the bits must be present for references to the special global MMR space used to map chipset registers. (See uv_hub.h for more details ...) The bug results in references to invalid/incorrect nodes. Signed-off-by: Jack Steiner Cc: LKML-Reference: <20090608154405.GA16395@sgi.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/x2apic_uv_x.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 2bda693..39f2af4 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -562,7 +562,7 @@ void __init uv_system_init(void) union uvh_node_id_u node_id; unsigned long gnode_upper, lowmem_redir_base, lowmem_redir_size; int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val; - int max_pnode = 0; + int gnode_extra, max_pnode = 0; unsigned long mmr_base, present, paddr; unsigned short pnode_mask; @@ -574,6 +574,13 @@ void __init uv_system_init(void) mmr_base = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) & ~UV_MMR_ENABLE; + pnode_mask = (1 << n_val) - 1; + node_id.v = uv_read_local_mmr(UVH_NODE_ID); + gnode_extra = (node_id.s.node_id & ~((1 << n_val) - 1)) >> 1; + gnode_upper = ((unsigned long)gnode_extra << m_val); + printk(KERN_DEBUG "UV: N %d, M %d, gnode_upper 0x%lx, gnode_extra 0x%x\n", + n_val, m_val, gnode_upper, gnode_extra); + printk(KERN_DEBUG "UV: global MMR base 0x%lx\n", mmr_base); for(i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++) @@ -607,11 +614,6 @@ void __init uv_system_init(void) } } - pnode_mask = (1 << n_val) - 1; - node_id.v = uv_read_local_mmr(UVH_NODE_ID); - gnode_upper = (((unsigned long)node_id.s.node_id) & - ~((1 << n_val) - 1)) << m_val; - uv_bios_init(); uv_bios_get_sn_info(0, &uv_type, &sn_partition_id, &sn_coherency_id, &sn_region_size); @@ -634,6 +636,7 @@ void __init uv_system_init(void) uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask; uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1; uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper; + uv_cpu_hub_info(cpu)->gnode_extra = gnode_extra; uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base; uv_cpu_hub_info(cpu)->coherency_domain_number = sn_coherency_id; uv_cpu_hub_info(cpu)->scir.offset = SCIR_LOCAL_MMR_BASE + lcpu; -- cgit v1.1 From f86748e91a14bd6cc49477560f33ed5d59896e89 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 8 Jun 2009 22:33:10 +0200 Subject: perf_counter, x86: Implement generalized cache event types, add AMD support Fill in amd_hw_cache_event_id[] with the AMD CPU specific events, for family 0x0f, 0x10 and 0x11. There's apparently no distinction between load and store events, so we only fill in the load events. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 102 +++++++++++++++++++++++++++++++++++++ 1 file changed, 102 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 0339d19..93af821 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -389,6 +389,97 @@ static u64 intel_pmu_raw_event(u64 event) return event & CORE_EVNTSEL_MASK; } +static const u64 amd_0f_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction cache fetches */ + [ C(RESULT_MISS) ] = 0x0081, /* Instruction cache misses */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(L2 ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fecthes */ + [ C(RESULT_MISS) ] = 0x0085, /* Instr. fetch ITLB misses */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c2, /* Retired Branch Instr. */ + [ C(RESULT_MISS) ] = 0x00c3, /* Retired Mispredicted BI */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; + /* * AMD Performance Monitor K7 and later. */ @@ -1345,6 +1436,17 @@ static int intel_pmu_init(void) static int amd_pmu_init(void) { x86_pmu = amd_pmu; + + switch (boot_cpu_data.x86) { + case 0x0f: + case 0x10: + case 0x11: + memcpy(hw_cache_event_ids, amd_0f_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + + pr_cont("AMD Family 0f/10/11 events, "); + break; + } return 0; } -- cgit v1.1 From 820a644211bc1ac7715333abdb0f0b9ea4fbb549 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 8 Jun 2009 19:10:25 +0200 Subject: perf_counter, x86: Clean up hw_cache_event ids copies Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 93af821..56001fe 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -1409,23 +1409,20 @@ static int intel_pmu_init(void) switch (boot_cpu_data.x86_model) { case 17: memcpy(hw_cache_event_ids, core2_hw_cache_event_ids, - sizeof(u64)*PERF_COUNT_HW_CACHE_MAX* - PERF_COUNT_HW_CACHE_OP_MAX*PERF_COUNT_HW_CACHE_RESULT_MAX); + sizeof(hw_cache_event_ids)); pr_cont("Core2 events, "); break; default: case 26: memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids, - sizeof(u64)*PERF_COUNT_HW_CACHE_MAX* - PERF_COUNT_HW_CACHE_OP_MAX*PERF_COUNT_HW_CACHE_RESULT_MAX); + sizeof(hw_cache_event_ids)); pr_cont("Nehalem/Corei7 events, "); break; case 28: memcpy(hw_cache_event_ids, atom_hw_cache_event_ids, - sizeof(u64)*PERF_COUNT_HW_CACHE_MAX* - PERF_COUNT_HW_CACHE_OP_MAX*PERF_COUNT_HW_CACHE_RESULT_MAX); + sizeof(hw_cache_event_ids)); pr_cont("Atom events, "); break; -- cgit v1.1 From 29150078d7a1758df8c7a6cd2ec066ac65e1fab9 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 9 Jun 2009 10:54:18 +0200 Subject: amd-iommu: remove BUS_NOTIFY_BOUND_DRIVER handling Handling this event causes device assignment in KVM to fail because the device gets re-attached as soon as the pci-stub registers as the driver for the device. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 8510e90..8187260 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -1145,17 +1145,6 @@ static int device_change_notifier(struct notifier_block *nb, "to a non-dma-ops domain\n", dev_name(dev)); switch (action) { - case BUS_NOTIFY_BOUND_DRIVER: - if (domain) - goto out; - dma_domain = find_protection_domain(devid); - if (!dma_domain) - dma_domain = iommu->default_dom; - attach_device(iommu, &dma_domain->domain, devid); - DUMP_printk(KERN_INFO "AMD IOMMU: Using protection domain " - "%d for device %s\n", - dma_domain->domain.id, dev_name(dev)); - break; case BUS_NOTIFY_UNBOUND_DRIVER: if (!domain) goto out; -- cgit v1.1 From 71ff3bca2f70264effe8cbbdd5bc10cf6be5f2f0 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 8 Jun 2009 13:47:33 -0700 Subject: amd-iommu: detach device explicitly before attaching it to a new domain This fixes a bug with a device that could not be assigned to a KVM guest because it is still assigned to a dma_ops protection domain. [chrisw: simply remove WARN_ON(), will always fire since dev->driver will be pci-sub] Signed-off-by: Chris Wright Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 8187260..772e910 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -2073,7 +2073,7 @@ static int amd_iommu_attach_device(struct iommu_domain *dom, old_domain = domain_for_device(devid); if (old_domain) - return -EBUSY; + detach_device(old_domain, devid); attach_device(iommu, domain, devid); -- cgit v1.1 From e9a22a13c71986851e931bdfa054f68839ff8576 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 9 Jun 2009 12:00:37 +0200 Subject: amd-iommu: remove unnecessary "AMD IOMMU: " prefix That prefix is already included in the DUMP_printk macro. So there is no need to repeat it in the format string. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 772e910..1c60554 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -1266,9 +1266,8 @@ static int get_device_resources(struct device *dev, dma_dom = (*iommu)->default_dom; *domain = &dma_dom->domain; attach_device(*iommu, *domain, *bdf); - DUMP_printk(KERN_INFO "AMD IOMMU: Using protection domain " - "%d for device %s\n", - (*domain)->id, dev_name(dev)); + DUMP_printk("Using protection domain %d for device %s\n", + (*domain)->id, dev_name(dev)); } if (domain_for_device(_bdf) == NULL) -- cgit v1.1 From 42937e81a82b6bbc51a309c83da140b3a7ca5945 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Mon, 8 Jun 2009 15:55:09 +0200 Subject: x86: Detect use of extended APIC ID for AMD CPUs Booting a 32-bit kernel on Magny-Cours results in the following panic: ... Using APIC driver default ... Overriding APIC driver with bigsmp ... Getting VERSION: 80050010 Getting VERSION: 80050010 Getting ID: 10000000 Getting ID: ef000000 Getting LVT0: 700 Getting LVT1: 10000 Kernel panic - not syncing: Boot APIC ID in local APIC unexpected (16 vs 0) Pid: 1, comm: swapper Not tainted 2.6.30-rcX #2 Call Trace: [] ? panic+0x38/0xd3 [] ? native_smp_prepare_cpus+0x259/0x31f [] ? kernel_init+0x3e/0x141 [] ? kernel_init+0x0/0x141 [] ? kernel_thread_helper+0x7/0x10 The reason is that default_get_apic_id handled extension of local APIC ID field just in case of XAPIC. Thus for this AMD CPU, default_get_apic_id() returns 0 and bigsmp_get_apic_id() returns 16 which leads to the respective kernel panic. This patch introduces a Linux specific feature flag to indicate support for extended APIC id (8 bits instead of 4 bits width) and sets the flag on AMD CPUs if applicable. Signed-off-by: Andreas Herrmann Cc: LKML-Reference: <20090608135509.GA12431@alberich.amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/amd.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 7e4a459..0802e15 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -6,6 +6,7 @@ #include #include #include +#include #ifdef CONFIG_X86_64 # include @@ -351,6 +352,15 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) (c->x86_model == 8 && c->x86_mask >= 8)) set_cpu_cap(c, X86_FEATURE_K6_MTRR); #endif +#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_PCI) + /* check CPU config space for extended APIC ID */ + if (c->x86 >= 0xf) { + unsigned int val; + val = read_pci_config(0, 24, 0, 0x68); + if ((val & ((1 << 17) | (1 << 18))) == ((1 << 17) | (1 << 18))) + set_cpu_cap(c, X86_FEATURE_EXTD_APICID); + } +#endif } static void __cpuinit init_amd(struct cpuinfo_x86 *c) -- cgit v1.1 From fecc8ac8496fce96069724f54daba8e7078b0082 Mon Sep 17 00:00:00 2001 From: Yong Wang Date: Tue, 9 Jun 2009 21:15:53 +0800 Subject: perf_counter, x86: Correct some event and umask values for Intel processors Correct some event and UMASK values according to Intel SDM, in the Nehalem and Atom tables. Signed-off-by: Yong Wang Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: <20090609131553.GA12489@ywang-moblin2.bj.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 56001fe..40978aa 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -119,7 +119,7 @@ static const u64 nehalem_hw_cache_event_ids }, [ C(L1I ) ] = { [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0480, /* L1I.READS */ + [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */ [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */ }, [ C(OP_WRITE) ] = { @@ -162,7 +162,7 @@ static const u64 nehalem_hw_cache_event_ids [ C(ITLB) ] = { [ C(OP_READ) ] = { [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */ - [ C(RESULT_MISS) ] = 0x0185, /* ITLB_MISS_RETIRED */ + [ C(RESULT_MISS) ] = 0x20c8, /* ITLB_MISS_RETIRED */ }, [ C(OP_WRITE) ] = { [ C(RESULT_ACCESS) ] = -1, @@ -291,7 +291,7 @@ static const u64 atom_hw_cache_event_ids [ C(RESULT_MISS) ] = 0, }, [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x2241, /* L1D_CACHE.ST */ + [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE.ST */ [ C(RESULT_MISS) ] = 0, }, [ C(OP_PREFETCH) ] = { @@ -301,8 +301,8 @@ static const u64 atom_hw_cache_event_ids }, [ C(L1I ) ] = { [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS */ - [ C(RESULT_MISS) ] = 0x0081, /* L1I.MISSES */ + [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */ + [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */ }, [ C(OP_WRITE) ] = { [ C(RESULT_ACCESS) ] = -1, @@ -329,11 +329,11 @@ static const u64 atom_hw_cache_event_ids }, [ C(DTLB) ] = { [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */ + [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE_LD.MESI (alias) */ [ C(RESULT_MISS) ] = 0x0508, /* DTLB_MISSES.MISS_LD */ }, [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */ + [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE_ST.MESI (alias) */ [ C(RESULT_MISS) ] = 0x0608, /* DTLB_MISSES.MISS_ST */ }, [ C(OP_PREFETCH) ] = { -- cgit v1.1 From a90ede7b17d122acd58e6e1ff911be9dcf5263cc Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Wed, 11 Feb 2009 22:45:42 -0200 Subject: KVM: x86: paravirt skip pit-through-ioapic boot check Skip the test which checks if the PIT is properly routed when using the IOAPIC, aimed at buggy hardware. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kernel/kvm.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 33019dd..057173d 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -27,6 +27,7 @@ #include #include #include +#include #define MMU_QUEUE_SIZE 1024 @@ -230,6 +231,9 @@ static void paravirt_ops_setup(void) pv_mmu_ops.lazy_mode.enter = kvm_enter_lazy_mmu; pv_mmu_ops.lazy_mode.leave = kvm_leave_lazy_mmu; } +#ifdef CONFIG_X86_IO_APIC + no_timer_check = 1; +#endif } void __init kvm_guest_init(void) -- cgit v1.1 From 32f8840064d88cc3f6e85203aec7b6b57bebcb97 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Thu, 7 May 2009 17:55:12 -0300 Subject: KVM: use smp_send_reschedule in kvm_vcpu_kick KVM uses a function call IPI to cause the exit of a guest running on a physical cpu. For virtual interrupt notification there is no need to wait on IPI receival, or to execute any function. This is exactly what the reschedule IPI does, without the overhead of function IPI. So use it instead of smp_call_function_single in kvm_vcpu_kick. Also change the "guest_mode" variable to a bit in vcpu->requests, and use that to collapse multiple IPI's that would be issued between the first one and zeroing of guest mode. This allows kvm_vcpu_kick to called with interrupts disabled. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kernel/smp.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 13f33ea..3b2e55e 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -172,6 +172,9 @@ void smp_reschedule_interrupt(struct pt_regs *regs) { ack_APIC_irq(); inc_irq_stat(irq_resched_count); + /* + * KVM uses this interrupt to force a cpu out of guest mode + */ } void smp_call_function_interrupt(struct pt_regs *regs) -- cgit v1.1 From a0861c02a981c943573478ea13b29b1fb958ee5b Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 8 Jun 2009 17:37:09 +0800 Subject: KVM: Add VT-x machine check support VT-x needs an explicit MC vector intercept to handle machine checks in the hyper visor. It also has a special option to catch machine checks that happen during VT entry. Do these interceptions and forward them to the Linux machine check handler. Make it always look like user space is interrupted because the machine check handler treats kernel/user space differently. Thanks to Jiang Yunhong for help and testing. Cc: stable@kernel.org Signed-off-by: Andi Kleen Signed-off-by: Huang Ying Signed-off-by: Avi Kivity --- arch/x86/kernel/cpu/mcheck/mce_64.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index 09dd1d4..289cc48 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -420,6 +420,7 @@ void do_machine_check(struct pt_regs * regs, long error_code) out2: atomic_dec(&mce_entry); } +EXPORT_SYMBOL_GPL(do_machine_check); #ifdef CONFIG_X86_MCE_INTEL /*** -- cgit v1.1 From dc81081b2d9a6a9d64dad1bef1e5fc9fb660e53e Mon Sep 17 00:00:00 2001 From: Yong Wang Date: Wed, 10 Jun 2009 17:06:12 +0800 Subject: perf_counter/x86: Fix the model number of Intel Core2 processors Fix the model number of Intel Core2 processors according to the documentation: Intel Processor Identification with the CPUID Instruction: http://www.intel.com/support/processors/sb/cs-009861.htm Signed-off-by: Yong Wang Also-Reported-by: Arnd Bergmann Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: <20090610090612.GA26580@ywang-moblin2.bj.intel.com> [ Added two more model numbers suggested by Arnd Bergmann ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 40978aa..49f2585 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -1407,7 +1407,10 @@ static int intel_pmu_init(void) * Install the hw-cache-events table: */ switch (boot_cpu_data.x86_model) { - case 17: + case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */ + case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */ + case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */ + case 29: /* six-core 45 nm xeon "Dunnington" */ memcpy(hw_cache_event_ids, core2_hw_cache_event_ids, sizeof(hw_cache_event_ids)); -- cgit v1.1 From bd2b5b12849a3446abad0b25e920f86f5480b309 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 10 Jun 2009 13:40:57 +0200 Subject: perf_counter: More aggressive frequency adjustment Also employ the overflow handler to adjust the frequency, this results in a stable frequency in about 40~50 samples, instead of that many ticks. This also means we can start sampling at a sample period of 1 without running head-first into the throttle. It relies on sched_clock() to accurately measure the time difference between the overflow NMIs. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 49f2585..240ca56 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -696,10 +696,11 @@ static int __hw_perf_counter_init(struct perf_counter *counter) if (!attr->exclude_kernel) hwc->config |= ARCH_PERFMON_EVENTSEL_OS; - if (!hwc->sample_period) + if (!hwc->sample_period) { hwc->sample_period = x86_pmu.max_period; + atomic64_set(&hwc->period_left, hwc->sample_period); + } - atomic64_set(&hwc->period_left, hwc->sample_period); counter->destroy = hw_perf_counter_destroy; /* -- cgit v1.1 From 0de51088e6a82bc8413d3ca9e28bbca2788b5b53 Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Mon, 8 Jun 2009 18:27:54 +0800 Subject: CPUFREQ: Enable acpi-cpufreq driver for VIA/Centaur CPUs The VIA/Centaur C7, C7-M and Nano CPU's all support ACPI based cpu p-states using a MSR interface. The Linux driver just never made use of it, since in addition to the check for the EST flag it also checked if the vendor is Intel. Signed-off-by: Harald Welte [ Removed the vendor checks entirely - Linus ] Signed-off-by: Linus Torvalds --- arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index 752e8c6..ae9b5032 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -90,11 +90,7 @@ static int check_est_cpu(unsigned int cpuid) { struct cpuinfo_x86 *cpu = &cpu_data(cpuid); - if (cpu->x86_vendor != X86_VENDOR_INTEL || - !cpu_has(cpu, X86_FEATURE_EST)) - return 0; - - return 1; + return cpu_has(cpu, X86_FEATURE_EST); } static unsigned extract_io(u32 value, struct acpi_cpufreq_data *data) -- cgit v1.1 From 0fea615e526b4b7eff0363ee02d5753e5f924089 Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Mon, 8 Jun 2009 18:29:36 +0800 Subject: CPUFREQ: Mark e_powersaver driver as EXPERIMENTAL and DANGEROUS The e_powersaver driver for VIA's C7 CPU's needs to be marked as DANGEROUS as it configures the CPU to power states that are out of specification. According to Centaur, all systems with C7 and Nano CPU's support the ACPI p-state method. Thus, the acpi-cpufreq driver should be used instead. Signed-off-by: Harald Welte Signed-off-by: Linus Torvalds --- arch/x86/kernel/cpu/cpufreq/Kconfig | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/cpufreq/Kconfig b/arch/x86/kernel/cpu/cpufreq/Kconfig index 52c8398..f138c6c 100644 --- a/arch/x86/kernel/cpu/cpufreq/Kconfig +++ b/arch/x86/kernel/cpu/cpufreq/Kconfig @@ -220,11 +220,14 @@ config X86_LONGHAUL If in doubt, say N. config X86_E_POWERSAVER - tristate "VIA C7 Enhanced PowerSaver" + tristate "VIA C7 Enhanced PowerSaver (DANGEROUS)" select CPU_FREQ_TABLE - depends on X86_32 + depends on X86_32 && EXPERIMENTAL help - This adds the CPUFreq driver for VIA C7 processors. + This adds the CPUFreq driver for VIA C7 processors. However, this driver + does not have any safeguards to prevent operating the CPU out of spec + and is thus considered dangerous. Please use the regular ACPI cpufreq + driver, enabled by CONFIG_X86_ACPI_CPUFREQ. If in doubt, say N. -- cgit v1.1 From df1a132bf3d3508f863336c80a27806a2ac947e0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 10 Jun 2009 21:02:22 +0200 Subject: perf_counter: Introduce struct for sample data For easy extension of the sample data, put it in a structure. Signed-off-by: Peter Zijlstra Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 240ca56..82a23d4 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -1173,11 +1173,14 @@ static void intel_pmu_reset(void) */ static int intel_pmu_handle_irq(struct pt_regs *regs) { + struct perf_sample_data data; struct cpu_hw_counters *cpuc; - struct cpu_hw_counters; int bit, cpu, loops; u64 ack, status; + data.regs = regs; + data.addr = 0; + cpu = smp_processor_id(); cpuc = &per_cpu(cpu_hw_counters, cpu); @@ -1210,7 +1213,7 @@ again: if (!intel_pmu_save_and_restart(counter)) continue; - if (perf_counter_overflow(counter, 1, regs, 0)) + if (perf_counter_overflow(counter, 1, &data)) intel_pmu_disable_counter(&counter->hw, bit); } @@ -1230,12 +1233,16 @@ again: static int amd_pmu_handle_irq(struct pt_regs *regs) { - int cpu, idx, handled = 0; + struct perf_sample_data data; struct cpu_hw_counters *cpuc; struct perf_counter *counter; struct hw_perf_counter *hwc; + int cpu, idx, handled = 0; u64 val; + data.regs = regs; + data.addr = 0; + cpu = smp_processor_id(); cpuc = &per_cpu(cpu_hw_counters, cpu); @@ -1256,7 +1263,7 @@ static int amd_pmu_handle_irq(struct pt_regs *regs) if (!x86_perf_counter_set_period(counter, hwc, idx)) continue; - if (perf_counter_overflow(counter, 1, regs, 0)) + if (perf_counter_overflow(counter, 1, &data)) amd_pmu_disable_counter(hwc, idx); } -- cgit v1.1 From 9e350de37ac9607012fcf9c5314a28fbddf8f43c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 10 Jun 2009 21:34:59 +0200 Subject: perf_counter: Accurate period data We currently log hw.sample_period for PERF_SAMPLE_PERIOD, however this is incorrect. When we adjust the period, it will only take effect the next cycle but report it for the current cycle. So when we adjust the period for every cycle, we're always wrong. Solve this by keeping track of the last_period. Signed-off-by: Peter Zijlstra Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 82a23d4..57ae1be 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -698,6 +698,7 @@ static int __hw_perf_counter_init(struct perf_counter *counter) if (!hwc->sample_period) { hwc->sample_period = x86_pmu.max_period; + hwc->last_period = hwc->sample_period; atomic64_set(&hwc->period_left, hwc->sample_period); } @@ -880,12 +881,14 @@ x86_perf_counter_set_period(struct perf_counter *counter, if (unlikely(left <= -period)) { left = period; atomic64_set(&hwc->period_left, left); + hwc->last_period = period; ret = 1; } if (unlikely(left <= 0)) { left += period; atomic64_set(&hwc->period_left, left); + hwc->last_period = period; ret = 1; } /* @@ -1257,9 +1260,12 @@ static int amd_pmu_handle_irq(struct pt_regs *regs) if (val & (1ULL << (x86_pmu.counter_bits - 1))) continue; - /* counter overflow */ - handled = 1; - inc_irq_stat(apic_perf_irqs); + /* + * counter overflow + */ + handled = 1; + data.period = counter->hw.last_period; + if (!x86_perf_counter_set_period(counter, hwc, idx)) continue; @@ -1267,6 +1273,9 @@ static int amd_pmu_handle_irq(struct pt_regs *regs) amd_pmu_disable_counter(hwc, idx); } + if (handled) + inc_irq_stat(apic_perf_irqs); + return handled; } -- cgit v1.1 From f4dbfa8f3131a84257223393905f7efad0ca5996 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 11 Jun 2009 14:06:28 +0200 Subject: perf_counter: Standardize event names Pure renames only, to PERF_COUNT_HW_* and PERF_COUNT_SW_*. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 57ae1be..572fb43 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -69,13 +69,13 @@ static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = { */ static const u64 intel_perfmon_event_map[] = { - [PERF_COUNT_CPU_CYCLES] = 0x003c, - [PERF_COUNT_INSTRUCTIONS] = 0x00c0, - [PERF_COUNT_CACHE_REFERENCES] = 0x4f2e, - [PERF_COUNT_CACHE_MISSES] = 0x412e, - [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x00c4, - [PERF_COUNT_BRANCH_MISSES] = 0x00c5, - [PERF_COUNT_BUS_CYCLES] = 0x013c, + [PERF_COUNT_HW_CPU_CYCLES] = 0x003c, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4f2e, + [PERF_COUNT_HW_CACHE_MISSES] = 0x412e, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, + [PERF_COUNT_HW_BUS_CYCLES] = 0x013c, }; static u64 intel_pmu_event_map(int event) @@ -485,12 +485,12 @@ static const u64 amd_0f_hw_cache_event_ids */ static const u64 amd_perfmon_event_map[] = { - [PERF_COUNT_CPU_CYCLES] = 0x0076, - [PERF_COUNT_INSTRUCTIONS] = 0x00c0, - [PERF_COUNT_CACHE_REFERENCES] = 0x0080, - [PERF_COUNT_CACHE_MISSES] = 0x0081, - [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x00c4, - [PERF_COUNT_BRANCH_MISSES] = 0x00c5, + [PERF_COUNT_HW_CPU_CYCLES] = 0x0076, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080, + [PERF_COUNT_HW_CACHE_MISSES] = 0x0081, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, }; static u64 amd_pmu_event_map(int event) @@ -970,11 +970,11 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc) event = hwc->config & ARCH_PERFMON_EVENT_MASK; - if (unlikely(event == x86_pmu.event_map(PERF_COUNT_INSTRUCTIONS))) + if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS))) return X86_PMC_IDX_FIXED_INSTRUCTIONS; - if (unlikely(event == x86_pmu.event_map(PERF_COUNT_CPU_CYCLES))) + if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES))) return X86_PMC_IDX_FIXED_CPU_CYCLES; - if (unlikely(event == x86_pmu.event_map(PERF_COUNT_BUS_CYCLES))) + if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_BUS_CYCLES))) return X86_PMC_IDX_FIXED_BUS_CYCLES; return -1; -- cgit v1.1 From 8be6e8f3c3a13900169f1141870562d0c723b010 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 11 Jun 2009 14:19:11 +0200 Subject: perf_counter: Rename L2 to LL cache The top (fastest) and last level (biggest) caches are the most interesting ones, performance wise. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: [ Fixed the Nehalem LL table to LLC Reference/Miss events ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 572fb43..895c82e 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -131,7 +131,7 @@ static const u64 nehalem_hw_cache_event_ids [ C(RESULT_MISS) ] = 0x0, }, }, - [ C(L2 ) ] = { + [ C(LL ) ] = { [ C(OP_READ) ] = { [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */ [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */ @@ -141,8 +141,8 @@ static const u64 nehalem_hw_cache_event_ids [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */ }, [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0xc024, /* L2_RQSTS.PREFETCHES */ - [ C(RESULT_MISS) ] = 0x8024, /* L2_RQSTS.PREFETCH_MISS */ + [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */ + [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */ }, }, [ C(DTLB) ] = { @@ -222,7 +222,7 @@ static const u64 core2_hw_cache_event_ids [ C(RESULT_MISS) ] = 0, }, }, - [ C(L2 ) ] = { + [ C(LL ) ] = { [ C(OP_READ) ] = { [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */ [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */ @@ -313,7 +313,7 @@ static const u64 atom_hw_cache_event_ids [ C(RESULT_MISS) ] = 0, }, }, - [ C(L2 ) ] = { + [ C(LL ) ] = { [ C(OP_READ) ] = { [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */ [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */ @@ -422,7 +422,7 @@ static const u64 amd_0f_hw_cache_event_ids [ C(RESULT_MISS) ] = 0, }, }, - [ C(L2 ) ] = { + [ C(LL ) ] = { [ C(OP_READ) ] = { [ C(RESULT_ACCESS) ] = 0, [ C(RESULT_MISS) ] = 0, -- cgit v1.1 From 38c7fed2f5ffee17e1fa3e0f78b0e1bf43d52d13 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 25 May 2009 15:10:58 +0300 Subject: x86: remove some alloc_bootmem_cpumask_var calling Now that we set up the slab allocator earlier, we can get rid of some alloc_bootmem_cpumask_var() calls in boot code. Cc: Ingo Molnar Cc: Johannes Weiner Cc: Linus Torvalds Signed-off-by: Yinghai Lu Signed-off-by: Pekka Enberg --- arch/x86/kernel/apic/io_apic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 1946fac..139201a 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -185,8 +185,8 @@ int __init arch_early_irq_init(void) for (i = 0; i < count; i++) { desc = irq_to_desc(i); desc->chip_data = &cfg[i]; - alloc_bootmem_cpumask_var(&cfg[i].domain); - alloc_bootmem_cpumask_var(&cfg[i].old_domain); + alloc_cpumask_var(&cfg[i].domain, GFP_NOWAIT); + alloc_cpumask_var(&cfg[i].old_domain, GFP_NOWAIT); if (i < NR_IRQS_LEGACY) cpumask_setall(cfg[i].domain); } -- cgit v1.1 From dad213aeb59718623fc59defeff95fe8c3feb8a0 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 28 May 2009 18:14:40 -0700 Subject: irq/cpumask: make memoryless node zero happy Don't hardcode to node zero for early boot IRQ setup memory allocations. [ penberg@cs.helsinki.fi: minor cleanups ] Cc: Ingo Molnar Cc: Johannes Weiner Cc: Linus Torvalds Signed-off-by: Yinghai Lu Signed-off-by: Pekka Enberg --- arch/x86/kernel/apic/io_apic.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 139201a..94605e7 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -177,16 +177,18 @@ int __init arch_early_irq_init(void) struct irq_cfg *cfg; struct irq_desc *desc; int count; + int node; int i; cfg = irq_cfgx; count = ARRAY_SIZE(irq_cfgx); + node= cpu_to_node(boot_cpu_id); for (i = 0; i < count; i++) { desc = irq_to_desc(i); desc->chip_data = &cfg[i]; - alloc_cpumask_var(&cfg[i].domain, GFP_NOWAIT); - alloc_cpumask_var(&cfg[i].old_domain, GFP_NOWAIT); + alloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node); + alloc_cpumask_var_node(&cfg[i].old_domain, GFP_NOWAIT, node); if (i < NR_IRQS_LEGACY) cpumask_setall(cfg[i].domain); } -- cgit v1.1 From 12274e96b42884f043dfaa87eb1e5e10281bfac3 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 11 Jun 2009 15:07:48 -0700 Subject: x86: use zalloc_cpumask_var in arch_early_irq_init So we make sure MAXSMP gets a cleared cpumask Signed-off-by: Yinghai Lu Signed-off-by: Linus Torvalds --- arch/x86/kernel/apic/io_apic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 94605e7..ef8d929 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -187,8 +187,8 @@ int __init arch_early_irq_init(void) for (i = 0; i < count; i++) { desc = irq_to_desc(i); desc->chip_data = &cfg[i]; - alloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node); - alloc_cpumask_var_node(&cfg[i].old_domain, GFP_NOWAIT, node); + zalloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node); + zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_NOWAIT, node); if (i < NR_IRQS_LEGACY) cpumask_setall(cfg[i].domain); } -- cgit v1.1