From b76834bc1b6db0a0923eed85c81b1113021b0612 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 6 Dec 2010 11:16:25 -0600 Subject: kprobes: Use this_cpu_ops Use this_cpu ops in various places to optimize per cpu data access. Cc: Jason Baron Cc: Namhyung Kim Acked-by: H. Peter Anvin Signed-off-by: Christoph Lameter Signed-off-by: Tejun Heo --- arch/x86/kernel/kprobes.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index 1cbd54c..572ecc8 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -403,7 +403,7 @@ static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb) static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb) { - __get_cpu_var(current_kprobe) = kcb->prev_kprobe.kp; + __this_cpu_write(current_kprobe, kcb->prev_kprobe.kp); kcb->kprobe_status = kcb->prev_kprobe.status; kcb->kprobe_old_flags = kcb->prev_kprobe.old_flags; kcb->kprobe_saved_flags = kcb->prev_kprobe.saved_flags; @@ -412,7 +412,7 @@ static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb) static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb) { - __get_cpu_var(current_kprobe) = p; + __this_cpu_write(current_kprobe, p); kcb->kprobe_saved_flags = kcb->kprobe_old_flags = (regs->flags & (X86_EFLAGS_TF | X86_EFLAGS_IF)); if (is_IF_modifier(p->ainsn.insn)) @@ -586,7 +586,7 @@ static int __kprobes kprobe_handler(struct pt_regs *regs) preempt_enable_no_resched(); return 1; } else if (kprobe_running()) { - p = __get_cpu_var(current_kprobe); + p = __this_cpu_read(current_kprobe); if (p->break_handler && p->break_handler(p, regs)) { setup_singlestep(p, regs, kcb, 0); return 1; @@ -759,11 +759,11 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs) orig_ret_address = (unsigned long)ri->ret_addr; if (ri->rp && ri->rp->handler) { - __get_cpu_var(current_kprobe) = &ri->rp->kp; + __this_cpu_write(current_kprobe, &ri->rp->kp); get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE; ri->ret_addr = correct_ret_addr; ri->rp->handler(ri, regs); - __get_cpu_var(current_kprobe) = NULL; + __this_cpu_write(current_kprobe, NULL); } recycle_rp_inst(ri, &empty_rp); @@ -1198,10 +1198,10 @@ static void __kprobes optimized_callback(struct optimized_kprobe *op, regs->ip = (unsigned long)op->kp.addr + INT3_SIZE; regs->orig_ax = ~0UL; - __get_cpu_var(current_kprobe) = &op->kp; + __this_cpu_write(current_kprobe, &op->kp); kcb->kprobe_status = KPROBE_HIT_ACTIVE; opt_pre_handler(&op->kp, regs); - __get_cpu_var(current_kprobe) = NULL; + __this_cpu_write(current_kprobe, NULL); } preempt_enable_no_resched(); } -- cgit v1.1 From 780f36d8b3fa9572f731d4fb85067b2e45e6f993 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 6 Dec 2010 11:16:29 -0600 Subject: xen: Use this_cpu_ops Use this_cpu_ops to reduce code size and simplify things in various places. V3->V4: Move instance of this_cpu_inc_return to a later patchset so that this patch can be applied without infrastructure changes. Cc: Jeremy Fitzhardinge Acked-by: H. Peter Anvin Signed-off-by: Christoph Lameter Signed-off-by: Tejun Heo --- arch/x86/xen/enlighten.c | 4 ++-- arch/x86/xen/multicalls.h | 2 +- arch/x86/xen/spinlock.c | 8 ++++---- arch/x86/xen/time.c | 8 ++++---- 4 files changed, 11 insertions(+), 11 deletions(-) (limited to 'arch') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 44dcad4..aa8c89a 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -574,8 +574,8 @@ static void xen_write_idt_entry(gate_desc *dt, int entrynum, const gate_desc *g) preempt_disable(); - start = __get_cpu_var(idt_desc).address; - end = start + __get_cpu_var(idt_desc).size + 1; + start = __this_cpu_read(idt_desc.address); + end = start + __this_cpu_read(idt_desc.size) + 1; xen_mc_flush(); diff --git a/arch/x86/xen/multicalls.h b/arch/x86/xen/multicalls.h index 9e565da..4ec8035 100644 --- a/arch/x86/xen/multicalls.h +++ b/arch/x86/xen/multicalls.h @@ -22,7 +22,7 @@ static inline void xen_mc_batch(void) unsigned long flags; /* need to disable interrupts until this entry is complete */ local_irq_save(flags); - __get_cpu_var(xen_mc_irq_flags) = flags; + __this_cpu_write(xen_mc_irq_flags, flags); } static inline struct multicall_space xen_mc_entry(size_t args) diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index 23e061b..cc9b1e1 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -159,8 +159,8 @@ static inline struct xen_spinlock *spinning_lock(struct xen_spinlock *xl) { struct xen_spinlock *prev; - prev = __get_cpu_var(lock_spinners); - __get_cpu_var(lock_spinners) = xl; + prev = __this_cpu_read(lock_spinners); + __this_cpu_write(lock_spinners, xl); wmb(); /* set lock of interest before count */ @@ -179,14 +179,14 @@ static inline void unspinning_lock(struct xen_spinlock *xl, struct xen_spinlock asm(LOCK_PREFIX " decw %0" : "+m" (xl->spinners) : : "memory"); wmb(); /* decrement count before restoring lock */ - __get_cpu_var(lock_spinners) = prev; + __this_cpu_write(lock_spinners, prev); } static noinline int xen_spin_lock_slow(struct arch_spinlock *lock, bool irq_enable) { struct xen_spinlock *xl = (struct xen_spinlock *)lock; struct xen_spinlock *prev; - int irq = __get_cpu_var(lock_kicker_irq); + int irq = __this_cpu_read(lock_kicker_irq); int ret; u64 start; diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index b2bb5aa3..ef8930f 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -135,24 +135,24 @@ static void do_stolen_accounting(void) /* Add the appropriate number of ticks of stolen time, including any left-overs from last time. */ - stolen = runnable + offline + __get_cpu_var(xen_residual_stolen); + stolen = runnable + offline + __this_cpu_read(xen_residual_stolen); if (stolen < 0) stolen = 0; ticks = iter_div_u64_rem(stolen, NS_PER_TICK, &stolen); - __get_cpu_var(xen_residual_stolen) = stolen; + __this_cpu_write(xen_residual_stolen, stolen); account_steal_ticks(ticks); /* Add the appropriate number of ticks of blocked time, including any left-overs from last time. */ - blocked += __get_cpu_var(xen_residual_blocked); + blocked += __this_cpu_read(xen_residual_blocked); if (blocked < 0) blocked = 0; ticks = iter_div_u64_rem(blocked, NS_PER_TICK, &blocked); - __get_cpu_var(xen_residual_blocked) = blocked; + __this_cpu_write(xen_residual_blocked, blocked); account_idle_ticks(ticks); } -- cgit v1.1 From 8f1d97c79eb65de1d05799d6b81d79cd94169114 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 6 Dec 2010 11:40:00 -0600 Subject: x86: Support for this_cpu_add, sub, dec, inc_return Supply an implementation for x86 in order to generate more efficient code. V2->V3: - Cleanup - Remove strange type checking from percpu_add_return_op. tj: - Dropped unused typedef from percpu_add_return_op(). - Renamed ret__ to paro_ret__ in percpu_add_return_op(). - Minor indentation adjustments. Acked-by: H. Peter Anvin Signed-off-by: Christoph Lameter Signed-off-by: Tejun Heo --- arch/x86/include/asm/percpu.h | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) (limited to 'arch') diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index f899e01..38f9e96 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -177,6 +177,39 @@ do { \ } \ } while (0) +/* + * Add return operation + */ +#define percpu_add_return_op(var, val) \ +({ \ + typeof(var) paro_ret__ = val; \ + switch (sizeof(var)) { \ + case 1: \ + asm("xaddb %0, "__percpu_arg(1) \ + : "+q" (paro_ret__), "+m" (var) \ + : : "memory"); \ + break; \ + case 2: \ + asm("xaddw %0, "__percpu_arg(1) \ + : "+r" (paro_ret__), "+m" (var) \ + : : "memory"); \ + break; \ + case 4: \ + asm("xaddl %0, "__percpu_arg(1) \ + : "+r" (paro_ret__), "+m" (var) \ + : : "memory"); \ + break; \ + case 8: \ + asm("xaddq %0, "__percpu_arg(1) \ + : "+re" (paro_ret__), "+m" (var) \ + : : "memory"); \ + break; \ + default: __bad_percpu_size(); \ + } \ + paro_ret__ += val; \ + paro_ret__; \ +}) + #define percpu_from_op(op, var, constraint) \ ({ \ typeof(var) pfo_ret__; \ @@ -300,6 +333,14 @@ do { \ #define irqsafe_cpu_xor_2(pcp, val) percpu_to_op("xor", (pcp), val) #define irqsafe_cpu_xor_4(pcp, val) percpu_to_op("xor", (pcp), val) +#ifndef CONFIG_M386 +#define __this_cpu_add_return_1(pcp, val) percpu_add_return_op(pcp, val) +#define __this_cpu_add_return_2(pcp, val) percpu_add_return_op(pcp, val) +#define __this_cpu_add_return_4(pcp, val) percpu_add_return_op(pcp, val) +#define this_cpu_add_return_1(pcp, val) percpu_add_return_op(pcp, val) +#define this_cpu_add_return_2(pcp, val) percpu_add_return_op(pcp, val) +#define this_cpu_add_return_4(pcp, val) percpu_add_return_op(pcp, val) +#endif /* * Per cpu atomic 64 bit operations are only available under 64 bit. * 32 bit must fall back to generic operations. @@ -324,6 +365,8 @@ do { \ #define irqsafe_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val) #define irqsafe_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val) +#define __this_cpu_add_return_8(pcp, val) percpu_add_return_op(pcp, val) +#define this_cpu_add_return_8(pcp, val) percpu_add_return_op(pcp, val) #endif /* This is not atomic against other CPUs -- CPU preemption needs to be off */ -- cgit v1.1 From 403047754cf690b012369b8fb563b738b88086e6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 17 Dec 2010 15:47:04 +0100 Subject: percpu,x86: relocate this_cpu_add_return() and friends - include/linux/percpu.h: this_cpu_add_return() and friends were located next to __this_cpu_add_return(). However, the overall organization is to first group by preemption safeness. Relocate this_cpu_add_return() and friends to preemption-safe area. - arch/x86/include/asm/percpu.h: Relocate percpu_add_return_op() after other more basic operations. Relocate [__]this_cpu_add_return_8() so that they're first grouped by preemption safeness. Signed-off-by: Tejun Heo Cc: Christoph Lameter --- arch/x86/include/asm/percpu.h | 71 +++++++++++++++++++++---------------------- 1 file changed, 35 insertions(+), 36 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index 38f9e96..dd0cd4b 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -177,39 +177,6 @@ do { \ } \ } while (0) -/* - * Add return operation - */ -#define percpu_add_return_op(var, val) \ -({ \ - typeof(var) paro_ret__ = val; \ - switch (sizeof(var)) { \ - case 1: \ - asm("xaddb %0, "__percpu_arg(1) \ - : "+q" (paro_ret__), "+m" (var) \ - : : "memory"); \ - break; \ - case 2: \ - asm("xaddw %0, "__percpu_arg(1) \ - : "+r" (paro_ret__), "+m" (var) \ - : : "memory"); \ - break; \ - case 4: \ - asm("xaddl %0, "__percpu_arg(1) \ - : "+r" (paro_ret__), "+m" (var) \ - : : "memory"); \ - break; \ - case 8: \ - asm("xaddq %0, "__percpu_arg(1) \ - : "+re" (paro_ret__), "+m" (var) \ - : : "memory"); \ - break; \ - default: __bad_percpu_size(); \ - } \ - paro_ret__ += val; \ - paro_ret__; \ -}) - #define percpu_from_op(op, var, constraint) \ ({ \ typeof(var) pfo_ret__; \ @@ -263,6 +230,39 @@ do { \ }) /* + * Add return operation + */ +#define percpu_add_return_op(var, val) \ +({ \ + typeof(var) paro_ret__ = val; \ + switch (sizeof(var)) { \ + case 1: \ + asm("xaddb %0, "__percpu_arg(1) \ + : "+q" (paro_ret__), "+m" (var) \ + : : "memory"); \ + break; \ + case 2: \ + asm("xaddw %0, "__percpu_arg(1) \ + : "+r" (paro_ret__), "+m" (var) \ + : : "memory"); \ + break; \ + case 4: \ + asm("xaddl %0, "__percpu_arg(1) \ + : "+r" (paro_ret__), "+m" (var) \ + : : "memory"); \ + break; \ + case 8: \ + asm("xaddq %0, "__percpu_arg(1) \ + : "+re" (paro_ret__), "+m" (var) \ + : : "memory"); \ + break; \ + default: __bad_percpu_size(); \ + } \ + paro_ret__ += val; \ + paro_ret__; \ +}) + +/* * percpu_read() makes gcc load the percpu variable every time it is * accessed while percpu_read_stable() allows the value to be cached. * percpu_read_stable() is more efficient and can be used if its value @@ -352,6 +352,7 @@ do { \ #define __this_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val) #define __this_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val) #define __this_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val) +#define __this_cpu_add_return_8(pcp, val) percpu_add_return_op(pcp, val) #define this_cpu_read_8(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) #define this_cpu_write_8(pcp, val) percpu_to_op("mov", (pcp), val) @@ -359,14 +360,12 @@ do { \ #define this_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val) #define this_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val) #define this_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val) +#define this_cpu_add_return_8(pcp, val) percpu_add_return_op(pcp, val) #define irqsafe_cpu_add_8(pcp, val) percpu_add_op((pcp), val) #define irqsafe_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val) #define irqsafe_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val) #define irqsafe_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val) - -#define __this_cpu_add_return_8(pcp, val) percpu_add_return_op(pcp, val) -#define this_cpu_add_return_8(pcp, val) percpu_add_return_op(pcp, val) #endif /* This is not atomic against other CPUs -- CPU preemption needs to be off */ -- cgit v1.1 From 7296e08abac0a22a2534a4f6e493c764f2c77583 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Tue, 14 Dec 2010 10:28:44 -0600 Subject: x86: this_cpu_cmpxchg and this_cpu_xchg operations Provide support as far as the hardware capabilities of the x86 cpus allow. Define CONFIG_CMPXCHG_LOCAL in Kconfig.cpu to allow core code to test for fast cpuops implementations. V1->V2: - Take out the definition for this_cpu_cmpxchg_8 and move it into a separate patch. tj: - Reordered ops to better follow this_cpu_* organization. - Renamed macro temp variables similar to their existing neighbours. Signed-off-by: Christoph Lameter Signed-off-by: Tejun Heo --- arch/x86/Kconfig.cpu | 3 ++ arch/x86/include/asm/percpu.h | 107 +++++++++++++++++++++++++++++++++++++++++- 2 files changed, 109 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index 2ac9069..15588a0 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -310,6 +310,9 @@ config X86_INTERNODE_CACHE_SHIFT config X86_CMPXCHG def_bool X86_64 || (X86_32 && !M386) +config CMPXCHG_LOCAL + def_bool X86_64 || (X86_32 && !M386) + config X86_L1_CACHE_SHIFT int default "7" if MPENTIUM4 || MPSC diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index dd0cd4b..b85ade5 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -263,6 +263,83 @@ do { \ }) /* + * Beware: xchg on x86 has an implied lock prefix. There will be the cost of + * full lock semantics even though they are not needed. + */ +#define percpu_xchg_op(var, nval) \ +({ \ + typeof(var) pxo_ret__; \ + typeof(var) pxo_new__ = (nval); \ + switch (sizeof(var)) { \ + case 1: \ + asm("xchgb %2, "__percpu_arg(1) \ + : "=a" (pxo_ret__), "+m" (var) \ + : "q" (pxo_new__) \ + : "memory"); \ + break; \ + case 2: \ + asm("xchgw %2, "__percpu_arg(1) \ + : "=a" (pxo_ret__), "+m" (var) \ + : "r" (pxo_new__) \ + : "memory"); \ + break; \ + case 4: \ + asm("xchgl %2, "__percpu_arg(1) \ + : "=a" (pxo_ret__), "+m" (var) \ + : "r" (pxo_new__) \ + : "memory"); \ + break; \ + case 8: \ + asm("xchgq %2, "__percpu_arg(1) \ + : "=a" (pxo_ret__), "+m" (var) \ + : "r" (pxo_new__) \ + : "memory"); \ + break; \ + default: __bad_percpu_size(); \ + } \ + pxo_ret__; \ +}) + +/* + * cmpxchg has no such implied lock semantics as a result it is much + * more efficient for cpu local operations. + */ +#define percpu_cmpxchg_op(var, oval, nval) \ +({ \ + typeof(var) pco_ret__; \ + typeof(var) pco_old__ = (oval); \ + typeof(var) pco_new__ = (nval); \ + switch (sizeof(var)) { \ + case 1: \ + asm("cmpxchgb %2, "__percpu_arg(1) \ + : "=a" (pco_ret__), "+m" (var) \ + : "q" (pco_new__), "0" (pco_old__) \ + : "memory"); \ + break; \ + case 2: \ + asm("cmpxchgw %2, "__percpu_arg(1) \ + : "=a" (pco_ret__), "+m" (var) \ + : "r" (pco_new__), "0" (pco_old__) \ + : "memory"); \ + break; \ + case 4: \ + asm("cmpxchgl %2, "__percpu_arg(1) \ + : "=a" (pco_ret__), "+m" (var) \ + : "r" (pco_new__), "0" (pco_old__) \ + : "memory"); \ + break; \ + case 8: \ + asm("cmpxchgq %2, "__percpu_arg(1) \ + : "=a" (pco_ret__), "+m" (var) \ + : "r" (pco_new__), "0" (pco_old__) \ + : "memory"); \ + break; \ + default: __bad_percpu_size(); \ + } \ + pco_ret__; \ +}) + +/* * percpu_read() makes gcc load the percpu variable every time it is * accessed while percpu_read_stable() allows the value to be cached. * percpu_read_stable() is more efficient and can be used if its value @@ -300,6 +377,12 @@ do { \ #define __this_cpu_xor_1(pcp, val) percpu_to_op("xor", (pcp), val) #define __this_cpu_xor_2(pcp, val) percpu_to_op("xor", (pcp), val) #define __this_cpu_xor_4(pcp, val) percpu_to_op("xor", (pcp), val) +/* + * Generic fallback operations for __this_cpu_xchg_[1-4] are okay and much + * faster than an xchg with forced lock semantics. + */ +#define __this_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval) +#define __this_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) #define this_cpu_read_1(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) #define this_cpu_read_2(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) @@ -319,6 +402,11 @@ do { \ #define this_cpu_xor_1(pcp, val) percpu_to_op("xor", (pcp), val) #define this_cpu_xor_2(pcp, val) percpu_to_op("xor", (pcp), val) #define this_cpu_xor_4(pcp, val) percpu_to_op("xor", (pcp), val) +#define this_cpu_xchg_1(pcp, nval) percpu_xchg_op(pcp, nval) +#define this_cpu_xchg_2(pcp, nval) percpu_xchg_op(pcp, nval) +#define this_cpu_xchg_4(pcp, nval) percpu_xchg_op(pcp, nval) +#define this_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval) +#define this_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) #define irqsafe_cpu_add_1(pcp, val) percpu_add_op((pcp), val) #define irqsafe_cpu_add_2(pcp, val) percpu_add_op((pcp), val) @@ -332,15 +420,32 @@ do { \ #define irqsafe_cpu_xor_1(pcp, val) percpu_to_op("xor", (pcp), val) #define irqsafe_cpu_xor_2(pcp, val) percpu_to_op("xor", (pcp), val) #define irqsafe_cpu_xor_4(pcp, val) percpu_to_op("xor", (pcp), val) +#define irqsafe_cpu_xchg_1(pcp, nval) percpu_xchg_op(pcp, nval) +#define irqsafe_cpu_xchg_2(pcp, nval) percpu_xchg_op(pcp, nval) +#define irqsafe_cpu_xchg_4(pcp, nval) percpu_xchg_op(pcp, nval) +#define irqsafe_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval) +#define irqsafe_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) #ifndef CONFIG_M386 #define __this_cpu_add_return_1(pcp, val) percpu_add_return_op(pcp, val) #define __this_cpu_add_return_2(pcp, val) percpu_add_return_op(pcp, val) #define __this_cpu_add_return_4(pcp, val) percpu_add_return_op(pcp, val) +#define __this_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) +#define __this_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) +#define __this_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) + #define this_cpu_add_return_1(pcp, val) percpu_add_return_op(pcp, val) #define this_cpu_add_return_2(pcp, val) percpu_add_return_op(pcp, val) #define this_cpu_add_return_4(pcp, val) percpu_add_return_op(pcp, val) -#endif +#define this_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) +#define this_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) +#define this_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) + +#define irqsafe_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) +#define irqsafe_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) +#define irqsafe_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) +#endif /* !CONFIG_M386 */ + /* * Per cpu atomic 64 bit operations are only available under 64 bit. * 32 bit must fall back to generic operations. -- cgit v1.1 From 8270137a0d50507a5b40f880db636527045b8466 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Tue, 14 Dec 2010 10:28:47 -0600 Subject: cpuops: Use cmpxchg for xchg to avoid lock semantics Use cmpxchg instead of xchg to realize this_cpu_xchg. xchg will cause LOCK overhead since LOCK is always implied but cmpxchg will not. Baselines: xchg() = 18 cycles (no segment prefix, LOCK semantics) __this_cpu_xchg = 1 cycle (simulated using this_cpu_read/write, two prefixes. Looks like the cpu can use loop optimization to get rid of most of the overhead) Cycles before: this_cpu_xchg = 37 cycles (segment prefix and LOCK (implied by xchg)) After: this_cpu_xchg = 11 cycle (using cmpxchg without lock semantics) Signed-off-by: Christoph Lameter Signed-off-by: Tejun Heo --- arch/x86/include/asm/percpu.h | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index b85ade5..8ee4516 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -263,8 +263,9 @@ do { \ }) /* - * Beware: xchg on x86 has an implied lock prefix. There will be the cost of - * full lock semantics even though they are not needed. + * xchg is implemented using cmpxchg without a lock prefix. xchg is + * expensive due to the implied lock prefix. The processor cannot prefetch + * cachelines if xchg is used. */ #define percpu_xchg_op(var, nval) \ ({ \ @@ -272,25 +273,33 @@ do { \ typeof(var) pxo_new__ = (nval); \ switch (sizeof(var)) { \ case 1: \ - asm("xchgb %2, "__percpu_arg(1) \ + asm("\n1:mov "__percpu_arg(1)",%%al" \ + "\n\tcmpxchgb %2, "__percpu_arg(1) \ + "\n\tjnz 1b" \ : "=a" (pxo_ret__), "+m" (var) \ : "q" (pxo_new__) \ : "memory"); \ break; \ case 2: \ - asm("xchgw %2, "__percpu_arg(1) \ + asm("\n1:mov "__percpu_arg(1)",%%ax" \ + "\n\tcmpxchgw %2, "__percpu_arg(1) \ + "\n\tjnz 1b" \ : "=a" (pxo_ret__), "+m" (var) \ : "r" (pxo_new__) \ : "memory"); \ break; \ case 4: \ - asm("xchgl %2, "__percpu_arg(1) \ + asm("\n1:mov "__percpu_arg(1)",%%eax" \ + "\n\tcmpxchgl %2, "__percpu_arg(1) \ + "\n\tjnz 1b" \ : "=a" (pxo_ret__), "+m" (var) \ : "r" (pxo_new__) \ : "memory"); \ break; \ case 8: \ - asm("xchgq %2, "__percpu_arg(1) \ + asm("\n1:mov "__percpu_arg(1)",%%rax" \ + "\n\tcmpxchgq %2, "__percpu_arg(1) \ + "\n\tjnz 1b" \ : "=a" (pxo_ret__), "+m" (var) \ : "r" (pxo_new__) \ : "memory"); \ -- cgit v1.1 From 0a3aee0da4402aa19b66e458038533c896fb80c6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 18 Dec 2010 16:28:55 +0100 Subject: x86: Use this_cpu_ops to optimize code Go through x86 code and replace __get_cpu_var and get_cpu_var instances that refer to a scalar and are not used for address determinations. Cc: Yinghai Lu Cc: Ingo Molnar Acked-by: Tejun Heo Acked-by: "H. Peter Anvin" Signed-off-by: Christoph Lameter Signed-off-by: Tejun Heo --- arch/x86/include/asm/debugreg.h | 2 +- arch/x86/kernel/apic/io_apic.c | 4 ++-- arch/x86/kernel/apic/nmi.c | 24 ++++++++++++------------ arch/x86/kernel/apic/x2apic_uv_x.c | 8 ++++---- arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 2 +- arch/x86/kernel/cpu/mcheck/mce.c | 6 +++--- arch/x86/kernel/cpu/perf_event.c | 27 +++++++++++---------------- arch/x86/kernel/cpu/perf_event_intel.c | 4 ++-- arch/x86/kernel/ftrace.c | 6 +++--- arch/x86/kernel/hw_breakpoint.c | 12 ++++++------ arch/x86/kernel/irq.c | 6 +++--- arch/x86/kernel/irq_32.c | 4 ++-- arch/x86/kernel/smpboot.c | 2 +- arch/x86/kernel/tsc.c | 2 +- arch/x86/kvm/x86.c | 8 ++++---- arch/x86/oprofile/nmi_int.c | 2 +- 16 files changed, 57 insertions(+), 62 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h index b81002f..078ad0c 100644 --- a/arch/x86/include/asm/debugreg.h +++ b/arch/x86/include/asm/debugreg.h @@ -94,7 +94,7 @@ static inline void hw_breakpoint_disable(void) static inline int hw_breakpoint_active(void) { - return __get_cpu_var(cpu_dr7) & DR_GLOBAL_ENABLE_MASK; + return __this_cpu_read(cpu_dr7) & DR_GLOBAL_ENABLE_MASK; } extern void aout_dump_debugregs(struct user *dump); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 7cc0a72..8d50922 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2302,7 +2302,7 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void) unsigned int irr; struct irq_desc *desc; struct irq_cfg *cfg; - irq = __get_cpu_var(vector_irq)[vector]; + irq = __this_cpu_read(vector_irq[vector]); if (irq == -1) continue; @@ -2336,7 +2336,7 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void) apic->send_IPI_self(IRQ_MOVE_CLEANUP_VECTOR); goto unlock; } - __get_cpu_var(vector_irq)[vector] = -1; + __this_cpu_write(vector_irq[vector], -1); unlock: raw_spin_unlock(&desc->lock); } diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c index c90041c..b387dce 100644 --- a/arch/x86/kernel/apic/nmi.c +++ b/arch/x86/kernel/apic/nmi.c @@ -306,12 +306,12 @@ void acpi_nmi_disable(void) */ void cpu_nmi_set_wd_enabled(void) { - __get_cpu_var(wd_enabled) = 1; + __this_cpu_write(wd_enabled, 1); } void setup_apic_nmi_watchdog(void *unused) { - if (__get_cpu_var(wd_enabled)) + if (__this_cpu_read(wd_enabled)) return; /* cheap hack to support suspend/resume */ @@ -322,12 +322,12 @@ void setup_apic_nmi_watchdog(void *unused) switch (nmi_watchdog) { case NMI_LOCAL_APIC: if (lapic_watchdog_init(nmi_hz) < 0) { - __get_cpu_var(wd_enabled) = 0; + __this_cpu_write(wd_enabled, 0); return; } /* FALL THROUGH */ case NMI_IO_APIC: - __get_cpu_var(wd_enabled) = 1; + __this_cpu_write(wd_enabled, 1); atomic_inc(&nmi_active); } } @@ -337,13 +337,13 @@ void stop_apic_nmi_watchdog(void *unused) /* only support LOCAL and IO APICs for now */ if (!nmi_watchdog_active()) return; - if (__get_cpu_var(wd_enabled) == 0) + if (__this_cpu_read(wd_enabled) == 0) return; if (nmi_watchdog == NMI_LOCAL_APIC) lapic_watchdog_stop(); else __acpi_nmi_disable(NULL); - __get_cpu_var(wd_enabled) = 0; + __this_cpu_write(wd_enabled, 0); atomic_dec(&nmi_active); } @@ -403,8 +403,8 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) sum = get_timer_irqs(cpu); - if (__get_cpu_var(nmi_touch)) { - __get_cpu_var(nmi_touch) = 0; + if (__this_cpu_read(nmi_touch)) { + __this_cpu_write(nmi_touch, 0); touched = 1; } @@ -427,7 +427,7 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) touched = 1; /* if the none of the timers isn't firing, this cpu isn't doing much */ - if (!touched && __get_cpu_var(last_irq_sum) == sum) { + if (!touched && __this_cpu_read(last_irq_sum) == sum) { /* * Ayiee, looks like this CPU is stuck ... * wait a few IRQs (5 seconds) before doing the oops ... @@ -440,12 +440,12 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) die_nmi("BUG: NMI Watchdog detected LOCKUP", regs, panic_on_timeout); } else { - __get_cpu_var(last_irq_sum) = sum; + __this_cpu_write(last_irq_sum, sum); __this_cpu_write(alert_counter, 0); } /* see if the nmi watchdog went off */ - if (!__get_cpu_var(wd_enabled)) + if (!__this_cpu_read(wd_enabled)) return rc; switch (nmi_watchdog) { case NMI_LOCAL_APIC: @@ -467,7 +467,7 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) static void enable_ioapic_nmi_watchdog_single(void *unused) { - __get_cpu_var(wd_enabled) = 1; + __this_cpu_write(wd_enabled, 1); atomic_inc(&nmi_active); __acpi_nmi_enable(NULL); } diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index c1c52c3..26ec9a7 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -118,8 +118,8 @@ static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) else if (!strcmp(oem_table_id, "UVX")) uv_system_type = UV_X2APIC; else if (!strcmp(oem_table_id, "UVH")) { - __get_cpu_var(x2apic_extra_bits) = - nodeid << (uvh_apicid.s.pnode_shift - 1); + __this_cpu_write(x2apic_extra_bits, + nodeid << (uvh_apicid.s.pnode_shift - 1)); uv_system_type = UV_NON_UNIQUE_APIC; uv_set_apicid_hibit(); return 1; @@ -284,7 +284,7 @@ static unsigned int x2apic_get_apic_id(unsigned long x) unsigned int id; WARN_ON(preemptible() && num_online_cpus() > 1); - id = x | __get_cpu_var(x2apic_extra_bits); + id = x | __this_cpu_read(x2apic_extra_bits); return id; } @@ -376,7 +376,7 @@ struct apic __refdata apic_x2apic_uv_x = { static __cpuinit void set_x2apic_extra_bits(int pnode) { - __get_cpu_var(x2apic_extra_bits) = (pnode << 6); + __this_cpu_write(x2apic_extra_bits, (pnode << 6)); } /* diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index 491977b..42a3604 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -1377,7 +1377,7 @@ static int __devexit powernowk8_cpu_exit(struct cpufreq_policy *pol) static void query_values_on_cpu(void *_err) { int *err = _err; - struct powernow_k8_data *data = __get_cpu_var(powernow_data); + struct powernow_k8_data *data = __this_cpu_read(powernow_data); *err = query_current_values_with_pending_wait(data); } diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 7a35b72..0c746af 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -326,7 +326,7 @@ static void mce_panic(char *msg, struct mce *final, char *exp) static int msr_to_offset(u32 msr) { - unsigned bank = __get_cpu_var(injectm.bank); + unsigned bank = __this_cpu_read(injectm.bank); if (msr == rip_msr) return offsetof(struct mce, ip); @@ -346,7 +346,7 @@ static u64 mce_rdmsrl(u32 msr) { u64 v; - if (__get_cpu_var(injectm).finished) { + if (__this_cpu_read(injectm.finished)) { int offset = msr_to_offset(msr); if (offset < 0) @@ -369,7 +369,7 @@ static u64 mce_rdmsrl(u32 msr) static void mce_wrmsrl(u32 msr, u64 v) { - if (__get_cpu_var(injectm).finished) { + if (__this_cpu_read(injectm.finished)) { int offset = msr_to_offset(msr); if (offset >= 0) diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 6d75b91..ba85814 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -968,8 +968,7 @@ x86_perf_event_set_period(struct perf_event *event) static void x86_pmu_enable_event(struct perf_event *event) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - if (cpuc->enabled) + if (__this_cpu_read(cpu_hw_events.enabled)) __x86_pmu_enable_event(&event->hw, ARCH_PERFMON_EVENTSEL_ENABLE); } @@ -1243,7 +1242,7 @@ perf_event_nmi_handler(struct notifier_block *self, break; case DIE_NMIUNKNOWN: this_nmi = percpu_read(irq_stat.__nmi_count); - if (this_nmi != __get_cpu_var(pmu_nmi).marked) + if (this_nmi != __this_cpu_read(pmu_nmi.marked)) /* let the kernel handle the unknown nmi */ return NOTIFY_DONE; /* @@ -1267,8 +1266,8 @@ perf_event_nmi_handler(struct notifier_block *self, this_nmi = percpu_read(irq_stat.__nmi_count); if ((handled > 1) || /* the next nmi could be a back-to-back nmi */ - ((__get_cpu_var(pmu_nmi).marked == this_nmi) && - (__get_cpu_var(pmu_nmi).handled > 1))) { + ((__this_cpu_read(pmu_nmi.marked) == this_nmi) && + (__this_cpu_read(pmu_nmi.handled) > 1))) { /* * We could have two subsequent back-to-back nmis: The * first handles more than one counter, the 2nd @@ -1279,8 +1278,8 @@ perf_event_nmi_handler(struct notifier_block *self, * handling more than one counter. We will mark the * next (3rd) and then drop it if unhandled. */ - __get_cpu_var(pmu_nmi).marked = this_nmi + 1; - __get_cpu_var(pmu_nmi).handled = handled; + __this_cpu_write(pmu_nmi.marked, this_nmi + 1); + __this_cpu_write(pmu_nmi.handled, handled); } return NOTIFY_STOP; @@ -1454,11 +1453,9 @@ static inline void x86_pmu_read(struct perf_event *event) */ static void x86_pmu_start_txn(struct pmu *pmu) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - perf_pmu_disable(pmu); - cpuc->group_flag |= PERF_EVENT_TXN; - cpuc->n_txn = 0; + __this_cpu_or(cpu_hw_events.group_flag, PERF_EVENT_TXN); + __this_cpu_write(cpu_hw_events.n_txn, 0); } /* @@ -1468,14 +1465,12 @@ static void x86_pmu_start_txn(struct pmu *pmu) */ static void x86_pmu_cancel_txn(struct pmu *pmu) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - - cpuc->group_flag &= ~PERF_EVENT_TXN; + __this_cpu_and(cpu_hw_events.group_flag, ~PERF_EVENT_TXN); /* * Truncate the collected events. */ - cpuc->n_added -= cpuc->n_txn; - cpuc->n_events -= cpuc->n_txn; + __this_cpu_sub(cpu_hw_events.n_added, __this_cpu_read(cpu_hw_events.n_txn)); + __this_cpu_sub(cpu_hw_events.n_events, __this_cpu_read(cpu_hw_events.n_txn)); perf_pmu_enable(pmu); } diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index c8f5c08..4ee59bc 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -649,7 +649,7 @@ static void intel_pmu_enable_event(struct perf_event *event) struct hw_perf_event *hwc = &event->hw; if (unlikely(hwc->idx == X86_PMC_IDX_FIXED_BTS)) { - if (!__get_cpu_var(cpu_hw_events).enabled) + if (!__this_cpu_read(cpu_hw_events.enabled)) return; intel_pmu_enable_bts(hwc->config); @@ -679,7 +679,7 @@ static int intel_pmu_save_and_restart(struct perf_event *event) static void intel_pmu_reset(void) { - struct debug_store *ds = __get_cpu_var(cpu_hw_events).ds; + struct debug_store *ds = __this_cpu_read(cpu_hw_events.ds); unsigned long flags; int idx; diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 3afb33f..b45246f 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -167,9 +167,9 @@ static void ftrace_mod_code(void) void ftrace_nmi_enter(void) { - __get_cpu_var(save_modifying_code) = modifying_code; + __this_cpu_write(save_modifying_code, modifying_code); - if (!__get_cpu_var(save_modifying_code)) + if (!__this_cpu_read(save_modifying_code)) return; if (atomic_inc_return(&nmi_running) & MOD_CODE_WRITE_FLAG) { @@ -183,7 +183,7 @@ void ftrace_nmi_enter(void) void ftrace_nmi_exit(void) { - if (!__get_cpu_var(save_modifying_code)) + if (!__this_cpu_read(save_modifying_code)) return; /* Finish all executions before clearing nmi_running */ diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index 42c5942..02f0763 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -122,7 +122,7 @@ int arch_install_hw_breakpoint(struct perf_event *bp) return -EBUSY; set_debugreg(info->address, i); - __get_cpu_var(cpu_debugreg[i]) = info->address; + __this_cpu_write(cpu_debugreg[i], info->address); dr7 = &__get_cpu_var(cpu_dr7); *dr7 |= encode_dr7(i, info->len, info->type); @@ -397,12 +397,12 @@ void flush_ptrace_hw_breakpoint(struct task_struct *tsk) void hw_breakpoint_restore(void) { - set_debugreg(__get_cpu_var(cpu_debugreg[0]), 0); - set_debugreg(__get_cpu_var(cpu_debugreg[1]), 1); - set_debugreg(__get_cpu_var(cpu_debugreg[2]), 2); - set_debugreg(__get_cpu_var(cpu_debugreg[3]), 3); + set_debugreg(__this_cpu_read(cpu_debugreg[0]), 0); + set_debugreg(__this_cpu_read(cpu_debugreg[1]), 1); + set_debugreg(__this_cpu_read(cpu_debugreg[2]), 2); + set_debugreg(__this_cpu_read(cpu_debugreg[3]), 3); set_debugreg(current->thread.debugreg6, 6); - set_debugreg(__get_cpu_var(cpu_dr7), 7); + set_debugreg(__this_cpu_read(cpu_dr7), 7); } EXPORT_SYMBOL_GPL(hw_breakpoint_restore); diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 83ec017..3a43caa 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -234,7 +234,7 @@ unsigned int __irq_entry do_IRQ(struct pt_regs *regs) exit_idle(); irq_enter(); - irq = __get_cpu_var(vector_irq)[vector]; + irq = __this_cpu_read(vector_irq[vector]); if (!handle_irq(irq, regs)) { ack_APIC_irq(); @@ -350,12 +350,12 @@ void fixup_irqs(void) for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { unsigned int irr; - if (__get_cpu_var(vector_irq)[vector] < 0) + if (__this_cpu_read(vector_irq[vector]) < 0) continue; irr = apic_read(APIC_IRR + (vector / 32 * 0x10)); if (irr & (1 << (vector % 32))) { - irq = __get_cpu_var(vector_irq)[vector]; + irq = __this_cpu_read(vector_irq[vector]); data = irq_get_irq_data(irq); raw_spin_lock(&desc->lock); diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 96656f2..48ff6dc 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -79,7 +79,7 @@ execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq) u32 *isp, arg1, arg2; curctx = (union irq_ctx *) current_thread_info(); - irqctx = __get_cpu_var(hardirq_ctx); + irqctx = __this_cpu_read(hardirq_ctx); /* * this is where we switch to the IRQ stack. However, if we are @@ -166,7 +166,7 @@ asmlinkage void do_softirq(void) if (local_softirq_pending()) { curctx = current_thread_info(); - irqctx = __get_cpu_var(softirq_ctx); + irqctx = __this_cpu_read(softirq_ctx); irqctx->tinfo.task = curctx->task; irqctx->tinfo.previous_esp = current_stack_pointer; diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 083e99d..ff4e5a1 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1377,7 +1377,7 @@ void play_dead_common(void) mb(); /* Ack it */ - __get_cpu_var(cpu_state) = CPU_DEAD; + __this_cpu_write(cpu_state, CPU_DEAD); /* * With physical CPU hotplug, we should halt the cpu diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 0c40d8b..acb08dd 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -659,7 +659,7 @@ void restore_sched_clock_state(void) local_irq_save(flags); - __get_cpu_var(cyc2ns_offset) = 0; + __this_cpu_write(cyc2ns_offset, 0); offset = cyc2ns_suspend - sched_clock(); for_each_possible_cpu(cpu) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index cdac9e5..79d9606 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -981,7 +981,7 @@ static inline u64 nsec_to_cycles(u64 nsec) if (kvm_tsc_changes_freq()) printk_once(KERN_WARNING "kvm: unreliable cycle conversion on adjustable rate TSC\n"); - ret = nsec * __get_cpu_var(cpu_tsc_khz); + ret = nsec * __this_cpu_read(cpu_tsc_khz); do_div(ret, USEC_PER_SEC); return ret; } @@ -1066,7 +1066,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) local_irq_save(flags); kvm_get_msr(v, MSR_IA32_TSC, &tsc_timestamp); kernel_ns = get_kernel_ns(); - this_tsc_khz = __get_cpu_var(cpu_tsc_khz); + this_tsc_khz = __this_cpu_read(cpu_tsc_khz); if (unlikely(this_tsc_khz == 0)) { local_irq_restore(flags); @@ -4432,7 +4432,7 @@ EXPORT_SYMBOL_GPL(kvm_fast_pio_out); static void tsc_bad(void *info) { - __get_cpu_var(cpu_tsc_khz) = 0; + __this_cpu_write(cpu_tsc_khz, 0); } static void tsc_khz_changed(void *data) @@ -4446,7 +4446,7 @@ static void tsc_khz_changed(void *data) khz = cpufreq_quick_get(raw_smp_processor_id()); if (!khz) khz = tsc_khz; - __get_cpu_var(cpu_tsc_khz) = khz; + __this_cpu_write(cpu_tsc_khz, khz); } static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val, diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 4e8baad..a0cae67 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -143,7 +143,7 @@ static inline int has_mux(void) inline int op_x86_phys_to_virt(int phys) { - return __get_cpu_var(switch_index) + phys; + return __this_cpu_read(switch_index) + phys; } inline int op_x86_virt_to_phys(int virt) -- cgit v1.1 From 7b543a5334ff4ea2e3ad3b777fc23cdb8072a988 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 18 Dec 2010 16:30:05 +0100 Subject: x86: Replace uses of current_cpu_data with this_cpu ops Replace all uses of current_cpu_data with this_cpu operations on the per cpu structure cpu_info. The scala accesses are replaced with the matching this_cpu ops which results in smaller and more efficient code. In the long run, it might be a good idea to remove cpu_data() macro too and use per_cpu macro directly. tj: updated description Cc: Yinghai Lu Cc: Ingo Molnar Acked-by: H. Peter Anvin Acked-by: Tejun Heo Signed-off-by: Christoph Lameter Signed-off-by: Tejun Heo --- arch/x86/include/asm/processor.h | 3 +-- arch/x86/kernel/apic/apic.c | 2 +- arch/x86/kernel/cpu/amd.c | 2 +- arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 2 +- arch/x86/kernel/cpu/intel_cacheinfo.c | 4 ++-- arch/x86/kernel/cpu/mcheck/mce.c | 14 +++++++------- arch/x86/kernel/cpu/mcheck/mce_intel.c | 2 +- arch/x86/kernel/process.c | 4 ++-- arch/x86/kernel/smpboot.c | 12 ++++++------ arch/x86/oprofile/op_model_ppro.c | 8 ++++---- 10 files changed, 26 insertions(+), 27 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index cae9c3c..c6efecf 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -141,10 +141,9 @@ extern __u32 cpu_caps_set[NCAPINTS]; #ifdef CONFIG_SMP DECLARE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info); #define cpu_data(cpu) per_cpu(cpu_info, cpu) -#define current_cpu_data __get_cpu_var(cpu_info) #else +#define cpu_info boot_cpu_data #define cpu_data(cpu) boot_cpu_data -#define current_cpu_data boot_cpu_data #endif extern const struct seq_operations cpuinfo_op; diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 3f838d5..8accfe3 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -516,7 +516,7 @@ static void __cpuinit setup_APIC_timer(void) { struct clock_event_device *levt = &__get_cpu_var(lapic_events); - if (cpu_has(¤t_cpu_data, X86_FEATURE_ARAT)) { + if (cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_ARAT)) { lapic_clockevent.features &= ~CLOCK_EVT_FEAT_C3STOP; /* Make LAPIC timer preferrable over percpu HPET */ lapic_clockevent.rating = 150; diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 9e093f8..7c7bedb 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -668,7 +668,7 @@ EXPORT_SYMBOL_GPL(amd_erratum_383); bool cpu_has_amd_erratum(const int *erratum) { - struct cpuinfo_x86 *cpu = ¤t_cpu_data; + struct cpuinfo_x86 *cpu = __this_cpu_ptr(&cpu_info); int osvw_id = *erratum++; u32 range; u32 ms; diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index 42a3604..35c7e65 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -521,7 +521,7 @@ static void check_supported_cpu(void *_rc) *rc = -ENODEV; - if (current_cpu_data.x86_vendor != X86_VENDOR_AMD) + if (__this_cpu_read(cpu_info.x86_vendor) != X86_VENDOR_AMD) return; eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE); diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 17ad033..453c616 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -266,7 +266,7 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, line_size = l2.line_size; lines_per_tag = l2.lines_per_tag; /* cpu_data has errata corrections for K7 applied */ - size_in_kb = current_cpu_data.x86_cache_size; + size_in_kb = __this_cpu_read(cpu_info.x86_cache_size); break; case 3: if (!l3.val) @@ -288,7 +288,7 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, eax->split.type = types[leaf]; eax->split.level = levels[leaf]; eax->split.num_threads_sharing = 0; - eax->split.num_cores_on_die = current_cpu_data.x86_max_cores - 1; + eax->split.num_cores_on_die = __this_cpu_read(cpu_info.x86_max_cores) - 1; if (assoc == 0xffff) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 0c746af..d916183 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1159,7 +1159,7 @@ static void mce_start_timer(unsigned long data) WARN_ON(smp_processor_id() != data); - if (mce_available(¤t_cpu_data)) { + if (mce_available(__this_cpu_ptr(&cpu_info))) { machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_poll_banks)); } @@ -1767,7 +1767,7 @@ static int mce_shutdown(struct sys_device *dev) static int mce_resume(struct sys_device *dev) { __mcheck_cpu_init_generic(); - __mcheck_cpu_init_vendor(¤t_cpu_data); + __mcheck_cpu_init_vendor(__this_cpu_ptr(&cpu_info)); return 0; } @@ -1775,7 +1775,7 @@ static int mce_resume(struct sys_device *dev) static void mce_cpu_restart(void *data) { del_timer_sync(&__get_cpu_var(mce_timer)); - if (!mce_available(¤t_cpu_data)) + if (!mce_available(__this_cpu_ptr(&cpu_info))) return; __mcheck_cpu_init_generic(); __mcheck_cpu_init_timer(); @@ -1790,7 +1790,7 @@ static void mce_restart(void) /* Toggle features for corrected errors */ static void mce_disable_ce(void *all) { - if (!mce_available(¤t_cpu_data)) + if (!mce_available(__this_cpu_ptr(&cpu_info))) return; if (all) del_timer_sync(&__get_cpu_var(mce_timer)); @@ -1799,7 +1799,7 @@ static void mce_disable_ce(void *all) static void mce_enable_ce(void *all) { - if (!mce_available(¤t_cpu_data)) + if (!mce_available(__this_cpu_ptr(&cpu_info))) return; cmci_reenable(); cmci_recheck(); @@ -2022,7 +2022,7 @@ static void __cpuinit mce_disable_cpu(void *h) unsigned long action = *(unsigned long *)h; int i; - if (!mce_available(¤t_cpu_data)) + if (!mce_available(__this_cpu_ptr(&cpu_info))) return; if (!(action & CPU_TASKS_FROZEN)) @@ -2040,7 +2040,7 @@ static void __cpuinit mce_reenable_cpu(void *h) unsigned long action = *(unsigned long *)h; int i; - if (!mce_available(¤t_cpu_data)) + if (!mce_available(__this_cpu_ptr(&cpu_info))) return; if (!(action & CPU_TASKS_FROZEN)) diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index 6fcd093..8694ef56 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -130,7 +130,7 @@ void cmci_recheck(void) unsigned long flags; int banks; - if (!mce_available(¤t_cpu_data) || !cmci_supported(&banks)) + if (!mce_available(__this_cpu_ptr(&cpu_info)) || !cmci_supported(&banks)) return; local_irq_save(flags); machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned)); diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 57d1868..dae1c07 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -445,7 +445,7 @@ void mwait_idle_with_hints(unsigned long ax, unsigned long cx) { trace_power_start(POWER_CSTATE, (ax>>4)+1, smp_processor_id()); if (!need_resched()) { - if (cpu_has(¤t_cpu_data, X86_FEATURE_CLFLUSH_MONITOR)) + if (cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_CLFLUSH_MONITOR)) clflush((void *)¤t_thread_info()->flags); __monitor((void *)¤t_thread_info()->flags, 0, 0); @@ -460,7 +460,7 @@ static void mwait_idle(void) { if (!need_resched()) { trace_power_start(POWER_CSTATE, 1, smp_processor_id()); - if (cpu_has(¤t_cpu_data, X86_FEATURE_CLFLUSH_MONITOR)) + if (cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_CLFLUSH_MONITOR)) clflush((void *)¤t_thread_info()->flags); __monitor((void *)¤t_thread_info()->flags, 0, 0); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index ff4e5a1..0720071 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -430,7 +430,7 @@ void __cpuinit set_cpu_sibling_map(int cpu) cpumask_set_cpu(cpu, c->llc_shared_map); - if (current_cpu_data.x86_max_cores == 1) { + if (__this_cpu_read(cpu_info.x86_max_cores) == 1) { cpumask_copy(cpu_core_mask(cpu), cpu_sibling_mask(cpu)); c->booted_cores = 1; return; @@ -1094,7 +1094,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) preempt_disable(); smp_cpu_index_default(); - current_cpu_data = boot_cpu_data; + memcpy(__this_cpu_ptr(&cpu_info), &boot_cpu_data, sizeof(cpu_info)); cpumask_copy(cpu_callin_mask, cpumask_of(0)); mb(); /* @@ -1397,11 +1397,11 @@ static inline void mwait_play_dead(void) int i; void *mwait_ptr; - if (!cpu_has(¤t_cpu_data, X86_FEATURE_MWAIT)) + if (!cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_MWAIT)) return; - if (!cpu_has(¤t_cpu_data, X86_FEATURE_CLFLSH)) + if (!cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_CLFLSH)) return; - if (current_cpu_data.cpuid_level < CPUID_MWAIT_LEAF) + if (__this_cpu_read(cpu_info.cpuid_level) < CPUID_MWAIT_LEAF) return; eax = CPUID_MWAIT_LEAF; @@ -1452,7 +1452,7 @@ static inline void mwait_play_dead(void) static inline void hlt_play_dead(void) { - if (current_cpu_data.x86 >= 4) + if (__this_cpu_read(cpu_info.x86) >= 4) wbinvd(); while (1) { diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c index d769cda..94b7450 100644 --- a/arch/x86/oprofile/op_model_ppro.c +++ b/arch/x86/oprofile/op_model_ppro.c @@ -95,8 +95,8 @@ static void ppro_setup_ctrs(struct op_x86_model_spec const *model, * counter width: */ if (!(eax.split.version_id == 0 && - current_cpu_data.x86 == 6 && - current_cpu_data.x86_model == 15)) { + __this_cpu_read(cpu_info.x86) == 6 && + __this_cpu_read(cpu_info.x86_model) == 15)) { if (counter_width < eax.split.bit_width) counter_width = eax.split.bit_width; @@ -235,8 +235,8 @@ static void arch_perfmon_setup_counters(void) eax.full = cpuid_eax(0xa); /* Workaround for BIOS bugs in 6/15. Taken from perfmon2 */ - if (eax.split.version_id == 0 && current_cpu_data.x86 == 6 && - current_cpu_data.x86_model == 15) { + if (eax.split.version_id == 0 && __this_cpu_read(cpu_info.x86) == 6 && + __this_cpu_read(cpu_info.x86_model) == 15) { eax.split.version_id = 2; eax.split.num_counters = 2; eax.split.bit_width = 40; -- cgit v1.1 From c1955b5f3a95717ce1f5235f6e9968da068e3183 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 18 Dec 2010 16:30:48 +0100 Subject: x86: Use this_cpu_inc_return for nmi counter this_cpu_inc_return() saves us a memory access there. Reviewed-by: Pekka Enberg Reviewed-by: Mathieu Desnoyers Acked-by: H. Peter Anvin Acked-by: Tejun Heo Signed-off-by: Christoph Lameter Signed-off-by: Tejun Heo --- arch/x86/kernel/apic/nmi.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c index b387dce..37769cc 100644 --- a/arch/x86/kernel/apic/nmi.c +++ b/arch/x86/kernel/apic/nmi.c @@ -432,8 +432,7 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) * Ayiee, looks like this CPU is stuck ... * wait a few IRQs (5 seconds) before doing the oops ... */ - __this_cpu_inc(alert_counter); - if (__this_cpu_read(alert_counter) == 5 * nmi_hz) + if (__this_cpu_inc_return(alert_counter) == 5 * nmi_hz) /* * die_nmi will return ONLY if NOTIFY_STOP happens.. */ -- cgit v1.1 From 357089fca91f639dd005ae0721f5f932b4f276ab Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Thu, 16 Dec 2010 12:14:43 -0600 Subject: x86: udelay: Use this_cpu_read to avoid address calculation The code will use a segment prefix instead of doing the lookup and calculation. Signed-off-by: Christoph Lameter Acked-by: "H. Peter Anvin" Signed-off-by: Tejun Heo --- arch/x86/lib/delay.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c index ff485d3..fc45ba8 100644 --- a/arch/x86/lib/delay.c +++ b/arch/x86/lib/delay.c @@ -121,7 +121,7 @@ inline void __const_udelay(unsigned long xloops) asm("mull %%edx" :"=d" (xloops), "=&a" (d0) :"1" (xloops), "0" - (cpu_data(raw_smp_processor_id()).loops_per_jiffy * (HZ/4))); + (this_cpu_read(cpu_info.loops_per_jiffy) * (HZ/4))); __delay(++xloops); } -- cgit v1.1