From fc2219d49ef1606e7fd2c88af2b423b01ff3d319 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 23 Sep 2009 09:50:41 -0700 Subject: rcu: Clean up code based on review feedback from Josh Triplett These issues identified during an old-fashioned face-to-face code review extending over many hours. o Bury various forms of the "rsp->completed == rsp->gpnum" comparison into an rcu_gp_in_progress() function, which has the beneficial side-effect of forcing consistent use of ACCESS_ONCE(). o Replace hand-coded arithmetic with DIV_ROUND_UP(). o Bury several "!list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x01])" instances into an rcu_preempted_readers() function, as this expression indicates that there are readers blocked within RCU read-side critical sections blocking the current grace period. (Though there might well be similar readers blocking the next grace period.) o Remove a dangling rcu_restart_cpu() declaration that has been dangling for almost 20 minor releases of the kernel. Signed-off-by: Paul E. McKenney Acked-by: Peter Zijlstra Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: akpm@linux-foundation.org Cc: mathieu.desnoyers@polymtl.ca Cc: josh@joshtriplett.org Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: rostedt@goodmis.org Cc: Valdis.Kletnieks@vt.edu Cc: dhowells@redhat.com LKML-Reference: <12537246442687-git-send-email-> Signed-off-by: Ingo Molnar --- kernel/rcutree.c | 29 +++++++++++++++++----------- kernel/rcutree.h | 6 +++--- kernel/rcutree_plugin.h | 50 ++++++++++++++++++++++++------------------------- 3 files changed, 46 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 52b06f6..f85b684 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -101,6 +101,16 @@ static void __cpuinit rcu_init_percpu_data(int cpu, struct rcu_state *rsp, #include "rcutree_plugin.h" /* + * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s + * permit this function to be invoked without holding the root rcu_node + * structure's ->lock, but of course results can be subject to change. + */ +static int rcu_gp_in_progress(struct rcu_state *rsp) +{ + return ACCESS_ONCE(rsp->completed) != ACCESS_ONCE(rsp->gpnum); +} + +/* * Note a quiescent state. Because we do not need to know * how many quiescent states passed, just if there was at least * one since the start of the grace period, this just sets a flag. @@ -173,9 +183,7 @@ cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp) static int cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) { - /* ACCESS_ONCE() because we are accessing outside of lock. */ - return *rdp->nxttail[RCU_DONE_TAIL] && - ACCESS_ONCE(rsp->completed) == ACCESS_ONCE(rsp->gpnum); + return *rdp->nxttail[RCU_DONE_TAIL] && !rcu_gp_in_progress(rsp); } /* @@ -482,7 +490,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp) spin_lock_irqsave(&rnp->lock, flags); delta = jiffies - rsp->jiffies_stall; - if (delta < RCU_STALL_RAT_DELAY || rsp->gpnum == rsp->completed) { + if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) { spin_unlock_irqrestore(&rnp->lock, flags); return; } @@ -537,8 +545,7 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) /* We haven't checked in, so go dump stack. */ print_cpu_stall(rsp); - } else if (rsp->gpnum != rsp->completed && - delta >= RCU_STALL_RAT_DELAY) { + } else if (rcu_gp_in_progress(rsp) && delta >= RCU_STALL_RAT_DELAY) { /* They had two time units to dump stack, so complain. 
*/ print_other_cpu_stall(rsp); @@ -703,9 +710,9 @@ rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp) * hold rnp->lock, as required by rcu_start_gp(), which will release it. */ static void cpu_quiet_msk_finish(struct rcu_state *rsp, unsigned long flags) - __releases(rnp->lock) + __releases(rcu_get_root(rsp)->lock) { - WARN_ON_ONCE(rsp->completed == rsp->gpnum); + WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); rsp->completed = rsp->gpnum; rcu_process_gp_end(rsp, rsp->rda[smp_processor_id()]); rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */ @@ -1092,7 +1099,7 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed) struct rcu_node *rnp = rcu_get_root(rsp); u8 signaled; - if (ACCESS_ONCE(rsp->completed) == ACCESS_ONCE(rsp->gpnum)) + if (!rcu_gp_in_progress(rsp)) return; /* No grace period in progress, nothing to force. */ if (!spin_trylock_irqsave(&rsp->fqslock, flags)) { rsp->n_force_qs_lh++; /* Inexact, can lose counts. Tough! */ @@ -1251,7 +1258,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), rdp->nxttail[RCU_NEXT_TAIL] = &head->next; /* Start a new grace period if one not already started. */ - if (ACCESS_ONCE(rsp->completed) == ACCESS_ONCE(rsp->gpnum)) { + if (!rcu_gp_in_progress(rsp)) { unsigned long nestflag; struct rcu_node *rnp_root = rcu_get_root(rsp); @@ -1331,7 +1338,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) } /* Has an RCU GP gone long enough to send resched IPIs &c? */ - if (ACCESS_ONCE(rsp->completed) != ACCESS_ONCE(rsp->gpnum) && + if (rcu_gp_in_progress(rsp) && ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)) { rdp->n_rp_need_fqs++; return 1; diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 8e8287a..9aa8c8a 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -48,14 +48,14 @@ #elif NR_CPUS <= RCU_FANOUT_SQ # define NUM_RCU_LVLS 2 # define NUM_RCU_LVL_0 1 -# define NUM_RCU_LVL_1 (((NR_CPUS) + RCU_FANOUT - 1) / RCU_FANOUT) +# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT) # define NUM_RCU_LVL_2 (NR_CPUS) # define NUM_RCU_LVL_3 0 #elif NR_CPUS <= RCU_FANOUT_CUBE # define NUM_RCU_LVLS 3 # define NUM_RCU_LVL_0 1 -# define NUM_RCU_LVL_1 (((NR_CPUS) + RCU_FANOUT_SQ - 1) / RCU_FANOUT_SQ) -# define NUM_RCU_LVL_2 (((NR_CPUS) + (RCU_FANOUT) - 1) / (RCU_FANOUT)) +# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_SQ) +# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT) # define NUM_RCU_LVL_3 NR_CPUS #else # error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 1cee04f..8ff1ba7 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -150,6 +150,16 @@ void __rcu_read_lock(void) } EXPORT_SYMBOL_GPL(__rcu_read_lock); +/* + * Check for preempted RCU readers blocking the current grace period + * for the specified rcu_node structure. If the caller needs a reliable + * answer, it must hold the rcu_node's ->lock. + */ +static int rcu_preempted_readers(struct rcu_node *rnp) +{ + return !list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1]); +} + static void rcu_read_unlock_special(struct task_struct *t) { int empty; @@ -196,7 +206,7 @@ static void rcu_read_unlock_special(struct task_struct *t) break; spin_unlock(&rnp->lock); /* irqs remain disabled. 
*/ } - empty = list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1]); + empty = !rcu_preempted_readers(rnp); list_del_init(&t->rcu_node_entry); t->rcu_blocked_node = NULL; @@ -207,7 +217,7 @@ static void rcu_read_unlock_special(struct task_struct *t) * drop rnp->lock and restore irq. */ if (!empty && rnp->qsmask == 0 && - list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1])) { + !rcu_preempted_readers(rnp)) { struct rcu_node *rnp_p; if (rnp->parent == NULL) { @@ -257,12 +267,12 @@ static void rcu_print_task_stall(struct rcu_node *rnp) { unsigned long flags; struct list_head *lp; - int phase = rnp->gpnum & 0x1; + int phase; struct task_struct *t; - if (!list_empty(&rnp->blocked_tasks[phase])) { + if (rcu_preempted_readers(rnp)) { spin_lock_irqsave(&rnp->lock, flags); - phase = rnp->gpnum & 0x1; /* re-read under lock. */ + phase = rnp->gpnum & 0x1; lp = &rnp->blocked_tasks[phase]; list_for_each_entry(t, lp, rcu_node_entry) printk(" P%d", t->pid); @@ -281,20 +291,10 @@ static void rcu_print_task_stall(struct rcu_node *rnp) */ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) { - WARN_ON_ONCE(!list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1])); + WARN_ON_ONCE(rcu_preempted_readers(rnp)); WARN_ON_ONCE(rnp->qsmask); } -/* - * Check for preempted RCU readers for the specified rcu_node structure. - * If the caller needs a reliable answer, it must hold the rcu_node's - * >lock. - */ -static int rcu_preempted_readers(struct rcu_node *rnp) -{ - return !list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1]); -} - #ifdef CONFIG_HOTPLUG_CPU /* @@ -461,6 +461,15 @@ static void rcu_preempt_note_context_switch(int cpu) { } +/* + * Because preemptable RCU does not exist, there are never any preempted + * RCU readers. + */ +static int rcu_preempted_readers(struct rcu_node *rnp) +{ + return 0; +} + #ifdef CONFIG_RCU_CPU_STALL_DETECTOR /* @@ -483,15 +492,6 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) WARN_ON_ONCE(rnp->qsmask); } -/* - * Because preemptable RCU does not exist, there are never any preempted - * RCU readers. - */ -static int rcu_preempted_readers(struct rcu_node *rnp) -{ - return 0; -} - #ifdef CONFIG_HOTPLUG_CPU /* -- cgit v1.1 From 1eba8f84380bede3c602bd7758dea96925cead01 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 23 Sep 2009 09:50:42 -0700 Subject: rcu: Clean up code based on review feedback from Josh Triplett, part 2 These issues identified during an old-fashioned face-to-face code review extending over many hours. o Add comments for tricky parts of code, and correct comments that have passed their sell-by date. o Get rid of the vestiges of rcu_init_sched(), which is no longer needed now that PREEMPT_RCU is gone. o Move the #include of rcutree_plugin.h to the end of rcutree.c, which means that, rather than having a random collection of forward declarations, the new set of forward declarations document the set of plugins. The new home for this #include also allows __rcu_init_preempt() to move into rcutree_plugin.h. o Fix rcu_preempt_check_callbacks() to be static. Suggested-by: Josh Triplett Signed-off-by: Paul E. 
McKenney Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: akpm@linux-foundation.org Cc: mathieu.desnoyers@polymtl.ca Cc: josh@joshtriplett.org Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org Cc: Valdis.Kletnieks@vt.edu Cc: dhowells@redhat.com LKML-Reference: <12537246443924-git-send-email-> Signed-off-by: Ingo Molnar Peter Zijlstra --- kernel/rcutree.c | 72 +++++++++++++++++++++++-------------------------- kernel/rcutree.h | 31 +++++++++++++++------ kernel/rcutree_plugin.h | 23 ++++++++++++++-- 3 files changed, 77 insertions(+), 49 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index f85b684..53a5ef0 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -81,24 +81,29 @@ DEFINE_PER_CPU(struct rcu_data, rcu_sched_data); struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); -extern long rcu_batches_completed_sched(void); -static struct rcu_node *rcu_get_root(struct rcu_state *rsp); -static void cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, - struct rcu_node *rnp, unsigned long flags); -static void cpu_quiet_msk_finish(struct rcu_state *rsp, unsigned long flags); +/* Forward declarations for rcutree_plugin.h */ +static inline void rcu_bootup_announce(void); +long rcu_batches_completed(void); +static void rcu_preempt_note_context_switch(int cpu); +static int rcu_preempted_readers(struct rcu_node *rnp); +#ifdef CONFIG_RCU_CPU_STALL_DETECTOR +static void rcu_print_task_stall(struct rcu_node *rnp); +#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ +static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); #ifdef CONFIG_HOTPLUG_CPU -static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp); +static void rcu_preempt_offline_tasks(struct rcu_state *rsp, + struct rcu_node *rnp, + struct rcu_data *rdp); +static void rcu_preempt_offline_cpu(int cpu); #endif /* #ifdef CONFIG_HOTPLUG_CPU */ -static void __rcu_process_callbacks(struct rcu_state *rsp, - struct rcu_data *rdp); -static void __call_rcu(struct rcu_head *head, - void (*func)(struct rcu_head *rcu), - struct rcu_state *rsp); -static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp); -static void __cpuinit rcu_init_percpu_data(int cpu, struct rcu_state *rsp, - int preemptable); +static void rcu_preempt_check_callbacks(int cpu); +static void rcu_preempt_process_callbacks(void); +void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); +static int rcu_preempt_pending(int cpu); +static int rcu_preempt_needs_cpu(int cpu); +static void __cpuinit rcu_preempt_init_percpu_data(int cpu); +static void __init __rcu_init_preempt(void); -#include "rcutree_plugin.h" /* * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s @@ -377,7 +382,7 @@ static long dyntick_recall_completed(struct rcu_state *rsp) /* * Snapshot the specified CPU's dynticks counter so that we can later * credit them with an implicit quiescent state. Return 1 if this CPU - * is already in a quiescent state courtesy of dynticks idle mode. + * is in dynticks idle mode, which is an extended quiescent state. */ static int dyntick_save_progress_counter(struct rcu_data *rdp) { @@ -624,9 +629,15 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) note_new_gpnum(rsp, rdp); /* - * Because we are first, we know that all our callbacks will - * be covered by this upcoming grace period, even the ones - * that were registered arbitrarily recently. 
+ * Because this CPU just now started the new grace period, we know + * that all of its callbacks will be covered by this upcoming grace + * period, even the ones that were registered arbitrarily recently. + * Therefore, advance all outstanding callbacks to RCU_WAIT_TAIL. + * + * Other CPUs cannot be sure exactly when the grace period started. + * Therefore, their recently registered callbacks must pass through + * an additional RCU_NEXT_READY stage, so that they will be handled + * by the next RCU grace period. */ rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; @@ -886,7 +897,7 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) /* * Move callbacks from the outgoing CPU to the running CPU. - * Note that the outgoing CPU is now quiscent, so it is now + * Note that the outgoing CPU is now quiescent, so it is now * (uncharacteristically) safe to access its rcu_data structure. * Note also that we must carefully retain the order of the * outgoing CPU's callbacks in order for rcu_barrier() to work @@ -1577,25 +1588,6 @@ do { \ } \ } while (0) -#ifdef CONFIG_TREE_PREEMPT_RCU - -void __init __rcu_init_preempt(void) -{ - int i; /* All used by RCU_INIT_FLAVOR(). */ - int j; - struct rcu_node *rnp; - - RCU_INIT_FLAVOR(&rcu_preempt_state, rcu_preempt_data); -} - -#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ - -void __init __rcu_init_preempt(void) -{ -} - -#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ - void __init __rcu_init(void) { int i; /* All used by RCU_INIT_FLAVOR(). */ @@ -1612,6 +1604,8 @@ void __init __rcu_init(void) open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); } +#include "rcutree_plugin.h" + module_param(blimit, int, 0); module_param(qhimark, int, 0); module_param(qlowmark, int, 0); diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 9aa8c8a..a48d11f 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -79,15 +79,21 @@ struct rcu_dynticks { * Definition for node within the RCU grace-period-detection hierarchy. */ struct rcu_node { - spinlock_t lock; + spinlock_t lock; /* Root rcu_node's lock protects some */ + /* rcu_state fields as well as following. */ long gpnum; /* Current grace period for this node. */ /* This will either be equal to or one */ /* behind the root rcu_node's gpnum. */ unsigned long qsmask; /* CPUs or groups that need to switch in */ /* order for current grace period to proceed.*/ + /* In leaf rcu_node, each bit corresponds to */ + /* an rcu_data structure, otherwise, each */ + /* bit corresponds to a child rcu_node */ + /* structure. */ unsigned long qsmaskinit; /* Per-GP initialization for qsmask. */ unsigned long grpmask; /* Mask to apply to parent qsmask. */ + /* Only one bit will be set in this mask. */ int grplo; /* lowest-numbered CPU or group here. */ int grphi; /* highest-numbered CPU or group here. */ u8 grpnum; /* CPU/group number for next level up. */ @@ -95,6 +101,9 @@ struct rcu_node { struct rcu_node *parent; struct list_head blocked_tasks[2]; /* Tasks blocked in RCU read-side critsect. */ + /* Grace period number (->gpnum) x blocked */ + /* by tasks on the (x & 0x1) element of the */ + /* blocked_tasks[] array. */ } ____cacheline_internodealigned_in_smp; /* Index values for nxttail array in struct rcu_data. */ @@ -126,19 +135,22 @@ struct rcu_data { * Any of the partitions might be empty, in which case the * pointer to that partition will be equal to the pointer for * the following partition. 
When the list is empty, all of - * the nxttail elements point to nxtlist, which is NULL. + * the nxttail elements point to the ->nxtlist pointer itself, + * which in that case is NULL. * - * [*nxttail[RCU_NEXT_READY_TAIL], NULL = *nxttail[RCU_NEXT_TAIL]): - * Entries that might have arrived after current GP ended - * [*nxttail[RCU_WAIT_TAIL], *nxttail[RCU_NEXT_READY_TAIL]): - * Entries known to have arrived before current GP ended - * [*nxttail[RCU_DONE_TAIL], *nxttail[RCU_WAIT_TAIL]): - * Entries that batch # <= ->completed - 1: waiting for current GP * [nxtlist, *nxttail[RCU_DONE_TAIL]): * Entries that batch # <= ->completed * The grace period for these entries has completed, and * the other grace-period-completed entries may be moved * here temporarily in rcu_process_callbacks(). + * [*nxttail[RCU_DONE_TAIL], *nxttail[RCU_WAIT_TAIL]): + * Entries that batch # <= ->completed - 1: waiting for current GP + * [*nxttail[RCU_WAIT_TAIL], *nxttail[RCU_NEXT_READY_TAIL]): + * Entries known to have arrived before current GP ended + * [*nxttail[RCU_NEXT_READY_TAIL], *nxttail[RCU_NEXT_TAIL]): + * Entries that might have arrived after current GP ended + * Note that the value of *nxttail[RCU_NEXT_TAIL] will + * always be NULL, as this is the end of the list. */ struct rcu_head *nxtlist; struct rcu_head **nxttail[RCU_NEXT_SIZE]; @@ -216,6 +228,9 @@ struct rcu_state { /* Force QS state. */ long gpnum; /* Current gp number. */ long completed; /* # of last completed gp. */ + + /* End of fields guarded by root rcu_node's lock. */ + spinlock_t onofflock; /* exclude on/offline and */ /* starting new GP. */ spinlock_t fqslock; /* Only one task forcing */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 8ff1ba7..6525021 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -419,6 +419,18 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu) } /* + * Initialize preemptable RCU's state structures. + */ +static void __init __rcu_init_preempt(void) +{ + int i; /* All used by RCU_INIT_FLAVOR(). */ + int j; + struct rcu_node *rnp; + + RCU_INIT_FLAVOR(&rcu_preempt_state, rcu_preempt_data); +} + +/* * Check for a task exiting while in a preemptable-RCU read-side * critical section, clean up if so. No need to issue warnings, * as debug_check_no_locks_held() already does this if lockdep @@ -518,7 +530,7 @@ static void rcu_preempt_offline_cpu(int cpu) * Because preemptable RCU does not exist, it never has any callbacks * to check. */ -void rcu_preempt_check_callbacks(int cpu) +static void rcu_preempt_check_callbacks(int cpu) { } @@ -526,7 +538,7 @@ void rcu_preempt_check_callbacks(int cpu) * Because preemptable RCU does not exist, it never has any callbacks * to process. */ -void rcu_preempt_process_callbacks(void) +static void rcu_preempt_process_callbacks(void) { } @@ -563,4 +575,11 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu) { } +/* + * Because preemptable RCU does not exist, it need not be initialized. + */ +static void __init __rcu_init_preempt(void) +{ +} + #endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ -- cgit v1.1 From 9b2619aff0332e95ea5eb7a0d75b0208818d871c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 23 Sep 2009 09:50:43 -0700 Subject: rcu: Clean up code to address Ingo's checkpatch feedback Move declarations and update storage classes to make checkpatch happy. Signed-off-by: Paul E. 
McKenney Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: akpm@linux-foundation.org Cc: mathieu.desnoyers@polymtl.ca Cc: josh@joshtriplett.org Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org Cc: Valdis.Kletnieks@vt.edu Cc: dhowells@redhat.com LKML-Reference: <12537246441701-git-send-email-> Signed-off-by: Ingo Molnar --- kernel/rcupdate.c | 3 --- kernel/rcutorture.c | 4 +--- kernel/rcutree.c | 23 ----------------------- kernel/rcutree.h | 26 +++++++++++++++++++++++++- kernel/rcutree_trace.c | 12 ++++++------ 5 files changed, 32 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 37ac454..8e79513 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -259,9 +259,6 @@ static void rcu_migrate_callback(struct rcu_head *notused) wake_up(&rcu_migrate_wq); } -extern int rcu_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu); - static int __cpuinit rcu_barrier_cpu_hotplug(struct notifier_block *self, unsigned long action, void *hcpu) { diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 233768f..697c0a0 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -606,8 +606,6 @@ static struct rcu_torture_ops sched_ops_sync = { .name = "sched_sync" }; -extern int rcu_expedited_torture_stats(char *page); - static struct rcu_torture_ops sched_expedited_ops = { .init = rcu_sync_torture_init, .cleanup = NULL, @@ -650,7 +648,7 @@ rcu_torture_writer(void *arg) old_rp = rcu_torture_current; rp->rtort_mbtest = 1; rcu_assign_pointer(rcu_torture_current, rp); - smp_wmb(); + smp_wmb(); /* Mods to old_rp must follow rcu_assign_pointer() */ if (old_rp) { i = old_rp->rtort_pipe_count; if (i > RCU_TORTURE_PIPE_LEN) diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 53a5ef0..8e52cde 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -81,29 +81,6 @@ DEFINE_PER_CPU(struct rcu_data, rcu_sched_data); struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); -/* Forward declarations for rcutree_plugin.h */ -static inline void rcu_bootup_announce(void); -long rcu_batches_completed(void); -static void rcu_preempt_note_context_switch(int cpu); -static int rcu_preempted_readers(struct rcu_node *rnp); -#ifdef CONFIG_RCU_CPU_STALL_DETECTOR -static void rcu_print_task_stall(struct rcu_node *rnp); -#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ -static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); -#ifdef CONFIG_HOTPLUG_CPU -static void rcu_preempt_offline_tasks(struct rcu_state *rsp, - struct rcu_node *rnp, - struct rcu_data *rdp); -static void rcu_preempt_offline_cpu(int cpu); -#endif /* #ifdef CONFIG_HOTPLUG_CPU */ -static void rcu_preempt_check_callbacks(int cpu); -static void rcu_preempt_process_callbacks(void); -void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); -static int rcu_preempt_pending(int cpu); -static int rcu_preempt_needs_cpu(int cpu); -static void __cpuinit rcu_preempt_init_percpu_data(int cpu); -static void __init __rcu_init_preempt(void); - /* * Return true if an RCU grace period is in progress. 
The ACCESS_ONCE()s diff --git a/kernel/rcutree.h b/kernel/rcutree.h index a48d11f..e6ab31c 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -270,5 +270,29 @@ extern struct rcu_state rcu_preempt_state; DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data); #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ -#endif /* #ifdef RCU_TREE_NONCORE */ +#else /* #ifdef RCU_TREE_NONCORE */ +/* Forward declarations for rcutree_plugin.h */ +static inline void rcu_bootup_announce(void); +long rcu_batches_completed(void); +static void rcu_preempt_note_context_switch(int cpu); +static int rcu_preempted_readers(struct rcu_node *rnp); +#ifdef CONFIG_RCU_CPU_STALL_DETECTOR +static void rcu_print_task_stall(struct rcu_node *rnp); +#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ +static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); +#ifdef CONFIG_HOTPLUG_CPU +static void rcu_preempt_offline_tasks(struct rcu_state *rsp, + struct rcu_node *rnp, + struct rcu_data *rdp); +static void rcu_preempt_offline_cpu(int cpu); +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ +static void rcu_preempt_check_callbacks(int cpu); +static void rcu_preempt_process_callbacks(void); +void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); +static int rcu_preempt_pending(int cpu); +static int rcu_preempt_needs_cpu(int cpu); +static void __cpuinit rcu_preempt_init_percpu_data(int cpu); +static void __init __rcu_init_preempt(void); + +#endif /* #else #ifdef RCU_TREE_NONCORE */ diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index c89f5e9..f09af28 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -93,7 +93,7 @@ static int rcudata_open(struct inode *inode, struct file *file) return single_open(file, show_rcudata, NULL); } -static struct file_operations rcudata_fops = { +static const struct file_operations rcudata_fops = { .owner = THIS_MODULE, .open = rcudata_open, .read = seq_read, @@ -145,7 +145,7 @@ static int rcudata_csv_open(struct inode *inode, struct file *file) return single_open(file, show_rcudata_csv, NULL); } -static struct file_operations rcudata_csv_fops = { +static const struct file_operations rcudata_csv_fops = { .owner = THIS_MODULE, .open = rcudata_csv_open, .read = seq_read, @@ -159,7 +159,7 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) struct rcu_node *rnp; seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x " - "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n", + "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n", rsp->completed, rsp->gpnum, rsp->signaled, (long)(rsp->jiffies_force_qs - jiffies), (int)(jiffies & 0xffff), @@ -196,7 +196,7 @@ static int rcuhier_open(struct inode *inode, struct file *file) return single_open(file, show_rcuhier, NULL); } -static struct file_operations rcuhier_fops = { +static const struct file_operations rcuhier_fops = { .owner = THIS_MODULE, .open = rcuhier_open, .read = seq_read, @@ -222,7 +222,7 @@ static int rcugp_open(struct inode *inode, struct file *file) return single_open(file, show_rcugp, NULL); } -static struct file_operations rcugp_fops = { +static const struct file_operations rcugp_fops = { .owner = THIS_MODULE, .open = rcugp_open, .read = seq_read, @@ -276,7 +276,7 @@ static int rcu_pending_open(struct inode *inode, struct file *file) return single_open(file, show_rcu_pending, NULL); } -static struct file_operations rcu_pending_fops = { +static const struct file_operations rcu_pending_fops = { .owner = THIS_MODULE, .open = rcu_pending_open, .read = seq_read, -- cgit v1.1 From d3f6302e7e51b41af86c6496ffb2f95e8f2179df Mon 
Sep 17 00:00:00 2001 From: Roland Dreier Date: Thu, 24 Sep 2009 14:07:55 -0700 Subject: hrtimer: Remove overly verbose "switch to high res mode" message On big systems, printing copies of Switched to high resolution mode on CPU nnn clutters up the kernel log for minimal gain. Just get rid of them. Signed-off-by: Roland Dreier LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/hrtimer.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index e5d98ce..4267279 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -720,8 +720,6 @@ static int hrtimer_switch_to_hres(void) /* "Retrigger" the interrupt to get things going */ retrigger_next_event(NULL); local_irq_restore(flags); - printk(KERN_DEBUG "Switched to high resolution mode on CPU %d\n", - smp_processor_id()); return 1; } -- cgit v1.1 From 33974093c024f08caadd2fc71a83bd811ed1831d Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Mon, 28 Sep 2009 16:43:01 +0100 Subject: tracing: Fix infinite recursion in ftrace_update_pid_func() When CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST is enabled __ftrace_trace_function contains the current trace function, not ftrace_trace_function. In ftrace_update_pid_func() we currently incorrectly assign the value of ftrace_trace_function to __ftrace_trace_funcion before returning. Without this patch it is possible to execute an infinite recursion whereby ftrace_test_stop_func() calls __ftrace_trace_function, which was assigned ftrace_test_stop_func() in ftrace_update_pid_func(). Signed-off-by: Matt Fleming Acked-by: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <1254152581-18347-1-git-send-email-matt@console-pimps.org> Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 46592fe..3724756 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -225,7 +225,11 @@ static void ftrace_update_pid_func(void) if (ftrace_trace_function == ftrace_stub) return; +#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST func = ftrace_trace_function; +#else + func = __ftrace_trace_function; +#endif if (ftrace_pid_trace) { set_ftrace_pid_function(func); -- cgit v1.1 From 8c9ed8e14c342ec5e7f27e7e498f62409a10eb29 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 25 Sep 2009 13:51:17 +0800 Subject: perf_event: Fix event group handling in __perf_event_sched_*() Paul Mackerras says: "Actually, looking at this more closely, it has to be a group leader anyway since it's at the top level of ctx->group_list. In fact I see four places where we do: list_for_each_entry(event, &ctx->group_list, group_entry) { if (event == event->group_leader) ... or the equivalent, three of which appear to have been introduced by afedadf2 ("perf_counter: Optimize sched in/out of counters") back in May by Peter Z. As far as I can see the if () is superfluous in each case (a singleton event will be a group of 1 and will have its group_leader pointing to itself)." [ See: http://marc.info/?l=linux-kernel&m=125361238901442&w=2 ] And Peter Zijlstra points out this is a bugfix: "The intent was to call event_sched_{in,out}() for single event groups because that's cheaper than group_sched_{in,out}(), however.. - as you noticed, I got the condition wrong, it should have read: list_empty(&event->sibling_list) - it failed to call group_can_go_on() which deals with ->exclusive. - it also doesn't call hw_perf_group_sched_in() which might break power." 
[ See: http://marc.info/?l=linux-kernel&m=125369523318583&w=2 ] Changelog v1->v2: - Fix the title name according to Peter Zijlstra's suggestion - Remove the comments and WARN_ON_ONCE() as Peter Zijlstra's suggestion Signed-off-by: Xiao Guangrong Acked-by: Peter Zijlstra Cc: Paul Mackerras LKML-Reference: <4ABC5A55.7000208@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 30 ++++++++---------------------- 1 file changed, 8 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 0f86feb..e50543d 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -1030,14 +1030,10 @@ void __perf_event_sched_out(struct perf_event_context *ctx, update_context_time(ctx); perf_disable(); - if (ctx->nr_active) { - list_for_each_entry(event, &ctx->group_list, group_entry) { - if (event != event->group_leader) - event_sched_out(event, cpuctx, ctx); - else - group_sched_out(event, cpuctx, ctx); - } - } + if (ctx->nr_active) + list_for_each_entry(event, &ctx->group_list, group_entry) + group_sched_out(event, cpuctx, ctx); + perf_enable(); out: spin_unlock(&ctx->lock); @@ -1258,12 +1254,8 @@ __perf_event_sched_in(struct perf_event_context *ctx, if (event->cpu != -1 && event->cpu != cpu) continue; - if (event != event->group_leader) - event_sched_in(event, cpuctx, ctx, cpu); - else { - if (group_can_go_on(event, cpuctx, 1)) - group_sched_in(event, cpuctx, ctx, cpu); - } + if (group_can_go_on(event, cpuctx, 1)) + group_sched_in(event, cpuctx, ctx, cpu); /* * If this pinned group hasn't been scheduled, @@ -1291,15 +1283,9 @@ __perf_event_sched_in(struct perf_event_context *ctx, if (event->cpu != -1 && event->cpu != cpu) continue; - if (event != event->group_leader) { - if (event_sched_in(event, cpuctx, ctx, cpu)) + if (group_can_go_on(event, cpuctx, can_add_hw)) + if (group_sched_in(event, cpuctx, ctx, cpu)) can_add_hw = 0; - } else { - if (group_can_go_on(event, cpuctx, can_add_hw)) { - if (group_sched_in(event, cpuctx, ctx, cpu)) - can_add_hw = 0; - } - } } perf_enable(); out: -- cgit v1.1 From 27f9994c50e95f3a5a81fe4c7491a9f9cffe6ec0 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 25 Sep 2009 13:54:01 +0800 Subject: perf_event: Clean up perf_event_init_task() While at it: we can traverse ctx->group_list to get all group leader, it should be safe since we hold ctx->mutex. 
Changlog v1->v2: - remove WARN_ON_ONCE() according to Peter Zijlstra's suggestion Signed-off-by: Xiao Guangrong Acked-by: Peter Zijlstra Cc: Paul Mackerras LKML-Reference: <4ABC5AF9.6060808@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index e50543d..e491fb0 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -4767,9 +4767,7 @@ int perf_event_init_task(struct task_struct *child) * We dont have to disable NMIs - we are only looking at * the list, not manipulating it: */ - list_for_each_entry_rcu(event, &parent_ctx->event_list, event_entry) { - if (event != event->group_leader) - continue; + list_for_each_entry(event, &parent_ctx->group_list, group_entry) { if (!event->attr.inherit) { inherited_all = 0; -- cgit v1.1 From f9ac5a69edee0ee7e06a05727226e3f275306c8d Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Mon, 28 Sep 2009 16:55:40 +0900 Subject: kmemtrace: Fix up tracer registration Commit ddc1637af217dbd8bc51f30e6d24e84476a869a6 ("kmemtrace: Print binary output only if 'bin' option is set") ended up inverting the error detection logic. register_tracer() returns 0 on success, which this change caused to treat as an error, resulting in: [ 0.132000] Warning: could not register the kmem tracer as well as bailing out of the initcall with an error value. This restores the old logic. Signed-off-by: Paul Mundt Acked-by: Pekka Enberg Acked-by: Frederic Weisbecker Cc: Eduard - Gabriel Munteanu Cc: Steven Rostedt Cc: Li Zefan LKML-Reference: <20090928075540.GD6668@linux-sh.org> Signed-off-by: Ingo Molnar --- kernel/trace/kmemtrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c index 81b1645..a91da69 100644 --- a/kernel/trace/kmemtrace.c +++ b/kernel/trace/kmemtrace.c @@ -501,7 +501,7 @@ static int __init init_kmem_tracer(void) return 1; } - if (!register_tracer(&kmem_tracer)) { + if (register_tracer(&kmem_tracer) != 0) { pr_warning("Warning: could not register the kmem tracer\n"); return 1; } -- cgit v1.1 From 1087e9b4ff708976499b4de541d9e1d57d49b60a Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sun, 4 Oct 2009 02:20:11 +0200 Subject: HWPOISON: Clean up PR_MCE_KILL interface While writing the manpage I noticed some shortcomings in the current interface. - Define symbolic names for all the different values - Boundary check the kill mode values - For symmetry add a get interface too. This allows library code to get/set the current state. 
- For consistency define a PR_MCE_KILL_DEFAULT value Signed-off-by: Andi Kleen --- kernel/sys.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 255475d..f6afe07 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1546,24 +1546,37 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, if (arg4 | arg5) return -EINVAL; switch (arg2) { - case 0: + case PR_MCE_KILL_CLEAR: if (arg3 != 0) return -EINVAL; current->flags &= ~PF_MCE_PROCESS; break; - case 1: + case PR_MCE_KILL_SET: current->flags |= PF_MCE_PROCESS; - if (arg3 != 0) + if (arg3 == PR_MCE_KILL_EARLY) current->flags |= PF_MCE_EARLY; - else + else if (arg3 == PR_MCE_KILL_LATE) current->flags &= ~PF_MCE_EARLY; + else if (arg3 == PR_MCE_KILL_DEFAULT) + current->flags &= + ~(PF_MCE_EARLY|PF_MCE_PROCESS); + else + return -EINVAL; break; default: return -EINVAL; } error = 0; break; - + case PR_MCE_KILL_GET: + if (arg2 | arg3 | arg4 | arg5) + return -EINVAL; + if (current->flags & PF_MCE_PROCESS) + error = (current->flags & PF_MCE_EARLY) ? + PR_MCE_KILL_EARLY : PR_MCE_KILL_LATE; + else + error = PR_MCE_KILL_DEFAULT; + break; default: error = -EINVAL; break; -- cgit v1.1 From fe8e5b5a60f8427940d33b205e127aecfb0bca10 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 3 Oct 2009 14:55:18 +0200 Subject: tracing: Check total refcount before releasing bufs in profile_enable failure When we call the profile_enable() callback of an event, we release the shared perf event tracing buffers unconditionnaly in the failure path. This is wrong because there may be other users of these. Then check the total refcount before doing this. Reported-by: Paul Mackerras Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Li Zefan --- kernel/trace/trace_event_profile.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c index dd44b87..e52784b 100644 --- a/kernel/trace/trace_event_profile.c +++ b/kernel/trace/trace_event_profile.c @@ -31,7 +31,7 @@ static int ftrace_profile_enable_event(struct ftrace_event_call *event) if (atomic_inc_return(&event->profile_count)) return 0; - if (!total_profile_count++) { + if (!total_profile_count) { buf = (char *)alloc_percpu(profile_buf_t); if (!buf) goto fail_buf; @@ -46,14 +46,19 @@ static int ftrace_profile_enable_event(struct ftrace_event_call *event) } ret = event->profile_enable(); - if (!ret) + if (!ret) { + total_profile_count++; return 0; + } - kfree(trace_profile_buf_nmi); fail_buf_nmi: - kfree(trace_profile_buf); + if (!total_profile_count) { + kfree(trace_profile_buf_nmi); + kfree(trace_profile_buf); + trace_profile_buf_nmi = NULL; + trace_profile_buf = NULL; + } fail_buf: - total_profile_count--; atomic_dec(&event->profile_count); return ret; -- cgit v1.1 From 75fb4090b39a3d7bf9ac77a28665c991ec5eaadc Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 3 Oct 2009 15:08:54 +0200 Subject: tracing: Use free_percpu instead of kfree In the event->profile_enable() failure path, we release the per cpu buffers using kfree which is wrong because they are per cpu pointers. Although free_percpu only wraps kfree for now, that may change in the future so lets use the correct way. 
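[ Editor's note: a minimal sketch of the allocation/release pairing this fix restores. Only the profile_buf_t type and the trace_profile_buf name come from the surrounding hunks; the control flow here is illustrative, not the patch's code.

	buf = (char *)alloc_percpu(profile_buf_t);
	if (!buf)
		goto fail_buf;
	trace_profile_buf = buf;
	/* ... on any later failure ... */
	free_percpu(trace_profile_buf);	/* NOT kfree(): a per-cpu pointer
					 * is an offset token handed out by
					 * the per-cpu allocator, not a
					 * kmalloc() cookie. */
	trace_profile_buf = NULL;
]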
Reported-by: Paul Mackerras Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Li Zefan --- kernel/trace/trace_event_profile.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c index e52784b..8d5c171 100644 --- a/kernel/trace/trace_event_profile.c +++ b/kernel/trace/trace_event_profile.c @@ -53,8 +53,8 @@ static int ftrace_profile_enable_event(struct ftrace_event_call *event) fail_buf_nmi: if (!total_profile_count) { - kfree(trace_profile_buf_nmi); - kfree(trace_profile_buf); + free_percpu(trace_profile_buf_nmi); + free_percpu(trace_profile_buf); trace_profile_buf_nmi = NULL; trace_profile_buf = NULL; } -- cgit v1.1 From f83f9ac2632732bd1678150b5a03d152f912fe72 Mon Sep 17 00:00:00 2001 From: Peter Williams Date: Thu, 24 Sep 2009 06:47:10 +0000 Subject: sched: Set correct normal_prio and prio values in sched_fork() normal_prio should be updated if policy changes from RT to SCHED_MORMAL or if static_prio/nice is changed. Some paths through sched_fork() ignore this requirement and may result in normal_prio having an invalid value. Fixing this issue allows the call to effective_prio() in wake_up_new_task() to be removed. Signed-off-by: Peter Williams Acked-by: Peter Zijlstra Cc: Mike Galbraith LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 1535f38..76c0e96 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2515,22 +2515,17 @@ void sched_fork(struct task_struct *p, int clone_flags) __sched_fork(p); /* - * Make sure we do not leak PI boosting priority to the child. - */ - p->prio = current->normal_prio; - - /* * Revert to default priority/policy on fork if requested. */ if (unlikely(p->sched_reset_on_fork)) { - if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) + if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) { p->policy = SCHED_NORMAL; - - if (p->normal_prio < DEFAULT_PRIO) - p->prio = DEFAULT_PRIO; + p->normal_prio = p->static_prio; + } if (PRIO_TO_NICE(p->static_prio) < 0) { p->static_prio = NICE_TO_PRIO(0); + p->normal_prio = p->static_prio; set_load_weight(p); } @@ -2541,6 +2536,11 @@ void sched_fork(struct task_struct *p, int clone_flags) p->sched_reset_on_fork = 0; } + /* + * Make sure we do not leak PI boosting priority to the child. + */ + p->prio = current->normal_prio; + if (!rt_prio(p->prio)) p->sched_class = &fair_sched_class; @@ -2581,8 +2581,6 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) BUG_ON(p->state != TASK_RUNNING); update_rq_clock(rq); - p->prio = effective_prio(p); - if (!p->sched_class->task_new || !current->se.on_rq) { activate_task(rq, p, 0); } else { -- cgit v1.1 From 162cc2794df37662beb7f97ddd1dd5bffaf85e9a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 23 Sep 2009 16:18:13 -0700 Subject: rcu: Fix rcu_lock_map build failure on CONFIG_PROVE_LOCKING=y Move the rcu_lock_map definition from rcutree.c to rcupdate.c so that TINY_RCU can use lockdep. Reported-by: Ingo Molnar Signed-off-by: Paul E. 
McKenney Signed-off-by: Ingo Molnar --- kernel/rcupdate.c | 7 +++++++ kernel/rcutree.c | 7 ------- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 8e79513..4a189ea 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -46,6 +46,13 @@ #include #include +#ifdef CONFIG_DEBUG_LOCK_ALLOC +static struct lock_class_key rcu_lock_key; +struct lockdep_map rcu_lock_map = + STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key); +EXPORT_SYMBOL_GPL(rcu_lock_map); +#endif + enum rcu_barrier { RCU_BARRIER_STD, RCU_BARRIER_BH, diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 8e52cde..81af59b 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -49,13 +49,6 @@ #include "rcutree.h" -#ifdef CONFIG_DEBUG_LOCK_ALLOC -static struct lock_class_key rcu_lock_key; -struct lockdep_map rcu_lock_map = - STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key); -EXPORT_SYMBOL_GPL(rcu_lock_map); -#endif - /* Data structures. */ #define RCU_STATE_INITIALIZER(name) { \ -- cgit v1.1 From 3d76c082907e8f83c5d5c4572f38d53ad8f00c4b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 28 Sep 2009 07:46:32 -0700 Subject: rcu: Clean up code based on review feedback from Josh Triplett, part 3 Whitespace fixes, updated comments, and trivial code movement. o Fix whitespace error in RCU_HEAD_INIT() o Move "So where is rcu_write_lock()" comment so that it does not come between the rcu_read_unlock() header comment and the rcu_read_unlock() definition. o Move the module_param statements for blimit, qhimark, and qlowmark to immediately follow the corresponding definitions. o In __rcu_offline_cpu(), move the assignment to rdp_me inside the "if" statement, given that rdp_me is not used outside of that "if" statement. Signed-off-by: Paul E. McKenney Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: akpm@linux-foundation.org Cc: mathieu.desnoyers@polymtl.ca Cc: josh@joshtriplett.org Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org Cc: Valdis.Kletnieks@vt.edu Cc: dhowells@redhat.com LKML-Reference: <12541491931164-git-send-email-> Signed-off-by: Ingo Molnar --- kernel/rcutree.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 81af59b..d5597830 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -122,6 +122,10 @@ static int blimit = 10; /* Maximum callbacks per softirq. */ static int qhimark = 10000; /* If this many pending, ignore blimit. */ static int qlowmark = 100; /* Once only this many pending, use blimit. */ +module_param(blimit, int, 0); +module_param(qhimark, int, 0); +module_param(qlowmark, int, 0); + static void force_quiescent_state(struct rcu_state *rsp, int relaxed); static int rcu_pending(int cpu); @@ -878,8 +882,8 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) * indefinitely delay callbacks, you have far worse things to * be worrying about. */ - rdp_me = rsp->rda[smp_processor_id()]; if (rdp->nxtlist != NULL) { + rdp_me = rsp->rda[smp_processor_id()]; *rdp_me->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist; rdp_me->nxttail[RCU_NEXT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; rdp->nxtlist = NULL; @@ -1575,7 +1579,3 @@ void __init __rcu_init(void) } #include "rcutree_plugin.h" - -module_param(blimit, int, 0); -module_param(qhimark, int, 0); -module_param(qlowmark, int, 0); -- cgit v1.1 From a0b6c9a78c41dc36732d6e1e90f0f2f57b29816f Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Mon, 28 Sep 2009 07:46:33 -0700 Subject: rcu: Clean up code based on review feedback from Josh Triplett, part 4 These issues identified during an old-fashioned face-to-face code review extending over many hours. This group improves an existing abstraction and introduces two new ones. It also fixes an RCU stall-warning bug found while making the other changes. o Make RCU_INIT_FLAVOR() declare its own variables, removing the need to declare them at each call site. o Create an rcu_for_each_leaf() macro that scans the leaf nodes of the rcu_node tree. o Create an rcu_for_each_node_breadth_first() macro that does a breadth-first traversal of the rcu_node tree, AKA stepping through the array in index-number order. o If all CPUs corresponding to a given leaf rcu_node structure go offline, then any tasks queued on that leaf will be moved to the root rcu_node structure. Therefore, the stall-warning code must dump out tasks queued on the root rcu_node structure as well as those queued on the leaf rcu_node structures. Signed-off-by: Paul E. McKenney Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: akpm@linux-foundation.org Cc: mathieu.desnoyers@polymtl.ca Cc: josh@joshtriplett.org Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org Cc: Valdis.Kletnieks@vt.edu Cc: dhowells@redhat.com LKML-Reference: <12541491934126-git-send-email-> Signed-off-by: Ingo Molnar --- kernel/rcutree.c | 53 ++++++++++++++++++++++++++----------------------- kernel/rcutree.h | 12 +++++++++++ kernel/rcutree_plugin.h | 4 ---- 3 files changed, 40 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index d5597830..e2e272b 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -462,8 +462,6 @@ static void print_other_cpu_stall(struct rcu_state *rsp) long delta; unsigned long flags; struct rcu_node *rnp = rcu_get_root(rsp); - struct rcu_node *rnp_cur = rsp->level[NUM_RCU_LVLS - 1]; - struct rcu_node *rnp_end = &rsp->node[NUM_RCU_NODES]; /* Only let one CPU complain about others per time interval. */ @@ -474,18 +472,24 @@ static void print_other_cpu_stall(struct rcu_state *rsp) return; } rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK; + + /* + * Now rat on any tasks that got kicked up to the root rcu_node + * due to CPU offlining. + */ + rcu_print_task_stall(rnp); spin_unlock_irqrestore(&rnp->lock, flags); /* OK, time to rat on our buddy... */ printk(KERN_ERR "INFO: RCU detected CPU stalls:"); - for (; rnp_cur < rnp_end; rnp_cur++) { + rcu_for_each_leaf_node(rsp, rnp) { rcu_print_task_stall(rnp); - if (rnp_cur->qsmask == 0) + if (rnp->qsmask == 0) continue; - for (cpu = 0; cpu <= rnp_cur->grphi - rnp_cur->grplo; cpu++) - if (rnp_cur->qsmask & (1UL << cpu)) - printk(" %d", rnp_cur->grplo + cpu); + for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) + if (rnp->qsmask & (1UL << cpu)) + printk(" %d", rnp->grplo + cpu); } printk(" (detected by %d, t=%ld jiffies)\n", smp_processor_id(), (long)(jiffies - rsp->gp_start)); @@ -649,7 +653,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) * one corresponding to this CPU, due to the fact that we have * irqs disabled. */ - for (rnp = &rsp->node[0]; rnp < &rsp->node[NUM_RCU_NODES]; rnp++) { + rcu_for_each_node_breadth_first(rsp, rnp) { spin_lock(&rnp->lock); /* irqs already disabled. 
*/ rcu_preempt_check_blocked_tasks(rnp); rnp->qsmask = rnp->qsmaskinit; @@ -1042,33 +1046,32 @@ static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp, int cpu; unsigned long flags; unsigned long mask; - struct rcu_node *rnp_cur = rsp->level[NUM_RCU_LVLS - 1]; - struct rcu_node *rnp_end = &rsp->node[NUM_RCU_NODES]; + struct rcu_node *rnp; - for (; rnp_cur < rnp_end; rnp_cur++) { + rcu_for_each_leaf_node(rsp, rnp) { mask = 0; - spin_lock_irqsave(&rnp_cur->lock, flags); + spin_lock_irqsave(&rnp->lock, flags); if (rsp->completed != lastcomp) { - spin_unlock_irqrestore(&rnp_cur->lock, flags); + spin_unlock_irqrestore(&rnp->lock, flags); return 1; } - if (rnp_cur->qsmask == 0) { - spin_unlock_irqrestore(&rnp_cur->lock, flags); + if (rnp->qsmask == 0) { + spin_unlock_irqrestore(&rnp->lock, flags); continue; } - cpu = rnp_cur->grplo; + cpu = rnp->grplo; bit = 1; - for (; cpu <= rnp_cur->grphi; cpu++, bit <<= 1) { - if ((rnp_cur->qsmask & bit) != 0 && f(rsp->rda[cpu])) + for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { + if ((rnp->qsmask & bit) != 0 && f(rsp->rda[cpu])) mask |= bit; } if (mask != 0 && rsp->completed == lastcomp) { - /* cpu_quiet_msk() releases rnp_cur->lock. */ - cpu_quiet_msk(mask, rsp, rnp_cur, flags); + /* cpu_quiet_msk() releases rnp->lock. */ + cpu_quiet_msk(mask, rsp, rnp, flags); continue; } - spin_unlock_irqrestore(&rnp_cur->lock, flags); + spin_unlock_irqrestore(&rnp->lock, flags); } return 0; } @@ -1550,6 +1553,10 @@ static void __init rcu_init_one(struct rcu_state *rsp) */ #define RCU_INIT_FLAVOR(rsp, rcu_data) \ do { \ + int i; \ + int j; \ + struct rcu_node *rnp; \ + \ rcu_init_one(rsp); \ rnp = (rsp)->level[NUM_RCU_LVLS - 1]; \ j = 0; \ @@ -1564,10 +1571,6 @@ do { \ void __init __rcu_init(void) { - int i; /* All used by RCU_INIT_FLAVOR(). */ - int j; - struct rcu_node *rnp; - rcu_bootup_announce(); #ifdef CONFIG_RCU_CPU_STALL_DETECTOR printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n"); diff --git a/kernel/rcutree.h b/kernel/rcutree.h index e6ab31c..676eecd 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -106,6 +106,18 @@ struct rcu_node { /* blocked_tasks[] array. */ } ____cacheline_internodealigned_in_smp; +/* + * Do a full breadth-first scan of the rcu_node structures for the + * specified rcu_state structure. + */ +#define rcu_for_each_node_breadth_first(rsp, rnp) \ + for ((rnp) = &(rsp)->node[0]; \ + (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++) + +#define rcu_for_each_leaf_node(rsp, rnp) \ + for ((rnp) = (rsp)->level[NUM_RCU_LVLS - 1]; \ + (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++) + /* Index values for nxttail array in struct rcu_data. */ #define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */ #define RCU_WAIT_TAIL 1 /* Also RCU_NEXT_READY head. */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 6525021..57200fe 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -423,10 +423,6 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu) */ static void __init __rcu_init_preempt(void) { - int i; /* All used by RCU_INIT_FLAVOR(). */ - int j; - struct rcu_node *rnp; - RCU_INIT_FLAVOR(&rcu_preempt_state, rcu_preempt_data); } -- cgit v1.1 From 135c8aea557cf53abe6c8847e286d01442124193 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Mon, 28 Sep 2009 21:50:21 -0700 Subject: rcu: Replace the rcu_barrier enum with pointer to call_rcu*() function The rcu_barrier enum causes several problems: (1) you have to define the enum somewhere, and there is no convenient place, (2) the difference between TREE_RCU and TREE_PREEMPT_RCU causes problems when you need to map from rcu_barrier enum to struct rcu_state, (3) the switch statement are large, and (4) TINY_RCU really needs a different rcu_barrier() than do the treercu implementations. So replace it with a functionally equivalent but cleaner function pointer abstraction. Signed-off-by: Paul E. McKenney Acked-by: Mathieu Desnoyers Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: akpm@linux-foundation.org Cc: josh@joshtriplett.org Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org Cc: Valdis.Kletnieks@vt.edu Cc: dhowells@redhat.com LKML-Reference: <12541998232366-git-send-email-> Signed-off-by: Ingo Molnar --- kernel/rcupdate.c | 32 ++++++++++---------------------- 1 file changed, 10 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 4a189ea..e432422 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -53,12 +53,6 @@ struct lockdep_map rcu_lock_map = EXPORT_SYMBOL_GPL(rcu_lock_map); #endif -enum rcu_barrier { - RCU_BARRIER_STD, - RCU_BARRIER_BH, - RCU_BARRIER_SCHED, -}; - static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; static atomic_t rcu_barrier_cpu_count; static DEFINE_MUTEX(rcu_barrier_mutex); @@ -184,19 +178,12 @@ static void rcu_barrier_func(void *type) { int cpu = smp_processor_id(); struct rcu_head *head = &per_cpu(rcu_barrier_head, cpu); + void (*call_rcu_func)(struct rcu_head *head, + void (*func)(struct rcu_head *head)); atomic_inc(&rcu_barrier_cpu_count); - switch ((enum rcu_barrier)type) { - case RCU_BARRIER_STD: - call_rcu(head, rcu_barrier_callback); - break; - case RCU_BARRIER_BH: - call_rcu_bh(head, rcu_barrier_callback); - break; - case RCU_BARRIER_SCHED: - call_rcu_sched(head, rcu_barrier_callback); - break; - } + call_rcu_func = type; + call_rcu_func(head, rcu_barrier_callback); } static inline void wait_migrated_callbacks(void) @@ -209,7 +196,8 @@ static inline void wait_migrated_callbacks(void) * Orchestrate the specified type of RCU barrier, waiting for all * RCU callbacks of the specified type to complete. */ -static void _rcu_barrier(enum rcu_barrier type) +static void _rcu_barrier(void (*call_rcu_func)(struct rcu_head *head, + void (*func)(struct rcu_head *head))) { BUG_ON(in_interrupt()); /* Take cpucontrol mutex to protect against CPU hotplug */ @@ -225,7 +213,7 @@ static void _rcu_barrier(enum rcu_barrier type) * early. 
*/ atomic_set(&rcu_barrier_cpu_count, 1); - on_each_cpu(rcu_barrier_func, (void *)type, 1); + on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1); if (atomic_dec_and_test(&rcu_barrier_cpu_count)) complete(&rcu_barrier_completion); wait_for_completion(&rcu_barrier_completion); @@ -238,7 +226,7 @@ static void _rcu_barrier(enum rcu_barrier type) */ void rcu_barrier(void) { - _rcu_barrier(RCU_BARRIER_STD); + _rcu_barrier(call_rcu); } EXPORT_SYMBOL_GPL(rcu_barrier); @@ -247,7 +235,7 @@ EXPORT_SYMBOL_GPL(rcu_barrier); */ void rcu_barrier_bh(void) { - _rcu_barrier(RCU_BARRIER_BH); + _rcu_barrier(call_rcu_bh); } EXPORT_SYMBOL_GPL(rcu_barrier_bh); @@ -256,7 +244,7 @@ EXPORT_SYMBOL_GPL(rcu_barrier_bh); */ void rcu_barrier_sched(void) { - _rcu_barrier(RCU_BARRIER_SCHED); + _rcu_barrier(call_rcu_sched); } EXPORT_SYMBOL_GPL(rcu_barrier_sched); -- cgit v1.1 From d014e8894dfc523dd9d2f2a17b6dcb94facea810 Mon Sep 17 00:00:00 2001 From: Aaro Koskinen Date: Fri, 2 Oct 2009 14:41:20 +0300 Subject: panic: Fix panic message visibility by calling bust_spinlocks(0) before dying Commit ffd71da4e3f ("panic: decrease oops_in_progress only after having done the panic") moved bust_spinlocks(0) to the end of the function, which in practice is never reached. As a result console_unblank() is not called, and on some systems the user may not see the panic message. Move it back up to before the unblanking. Signed-off-by: Aaro Koskinen Reviewed-by: Frederic Weisbecker LKML-Reference: <1254483680-25578-1-git-send-email-aaro.koskinen@nokia.com> Signed-off-by: Ingo Molnar --- kernel/panic.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 512ab73..bc4dcb6 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -90,6 +90,8 @@ NORET_TYPE void panic(const char * fmt, ...) atomic_notifier_call_chain(&panic_notifier_list, 0, buf); + bust_spinlocks(0); + if (!panic_blink) panic_blink = no_blink; @@ -136,7 +138,6 @@ NORET_TYPE void panic(const char * fmt, ...) mdelay(1); i++; } - bust_spinlocks(0); } EXPORT_SYMBOL(panic); -- cgit v1.1 From eaaea8036d0261d87d7072c5bc88c7ea730c18ac Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 4 Oct 2009 09:34:17 +0200 Subject: futex: Fix locking imbalance Rich reported a lock imbalance in the futex code: http://bugzilla.kernel.org/show_bug.cgi?id=14288 It's caused by the displacement of the retry_private label in futex_wake_op(). The code unlocks the hash bucket locks in the error handling path and retries without locking them again which makes the next unlock fail. Move retry_private so we lock the hash bucket locks when we retry. 
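[ Editor's note: the shape of the fix, reduced to a sketch. double_lock_hb(), futex_atomic_op_inuser() and the retry_private label are from the patch below; the unlock and fault handling are paraphrased context.

retry:
	/* ... resolve both futex keys, hash them to hb1/hb2 ... */
retry_private:
	double_lock_hb(hb1, hb2);		/* now below the label */
	op_ret = futex_atomic_op_inuser(op, uaddr2);
	if (unlikely(op_ret < 0)) {
		double_unlock_hb(hb1, hb2);	/* error path drops locks */
		/* ... fault handling ... */
		goto retry_private;		/* re-locks on entry, so
						 * lock/unlock stays balanced */
	}
]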
Reported-by: Rich Ercolany Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Darren Hart Cc: stable-2.6.31 LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/futex.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 463af2e..1e176f3 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -916,8 +916,8 @@ retry: hb1 = hash_futex(&key1); hb2 = hash_futex(&key2); - double_lock_hb(hb1, hb2); retry_private: + double_lock_hb(hb1, hb2); op_ret = futex_atomic_op_inuser(op, uaddr2); if (unlikely(op_ret < 0)) { -- cgit v1.1 From ee949a86b3aef15845ea677aa60231008de62672 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Tue, 6 Oct 2009 01:00:49 -0500 Subject: tracing/syscalls: Use long for syscall ret format and field definitions The syscall event definitions use long for the syscall exit ret value, but unsigned long for the same thing in the format and field definitions. Change them all to long. Signed-off-by: Tom Zanussi Acked-by: Frederic Weisbecker Cc: rostedt@goodmis.org Cc: lizf@cn.fujitsu.com Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: <1254808849-7829-4-git-send-email-tzanussi@gmail.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_syscalls.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 9fbce6c..527e17e 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -166,7 +166,7 @@ int syscall_exit_format(struct ftrace_event_call *call, struct trace_seq *s) "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n", SYSCALL_FIELD(int, nr), - SYSCALL_FIELD(unsigned long, ret)); + SYSCALL_FIELD(long, ret)); if (!ret) return 0; @@ -212,7 +212,7 @@ int syscall_exit_define_fields(struct ftrace_event_call *call) if (ret) return ret; - ret = trace_define_field(call, SYSCALL_FIELD(unsigned long, ret), 0, + ret = trace_define_field(call, SYSCALL_FIELD(long, ret), 0, FILTER_OTHER); return ret; -- cgit v1.1 From 906010b2134e14a2e377decbadd357b3d0ab9c6a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 21 Sep 2009 16:08:49 +0200 Subject: perf_event: Provide vmalloc() based mmap() backing Some architectures such as Sparc, ARM and MIPS (basically everything with flush_dcache_page()) need to deal with dcache aliases by carefully placing pages in both kernel and user maps. These architectures typically have to use vmalloc_user() for this. However, on other architectures, vmalloc() is not needed and has the downsides of being more restricted and slower than regular allocations. 
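To make the design concrete: the point of the patch is to hide the backing strategy behind a single page-lookup helper, so the fault path never cares how the buffer was allocated. A rough userspace analogue of that structure, with calloc'd blocks standing in for alloc_page() and vmalloc_user(), and all names hypothetical:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define PAGE_SIZE 4096
#define NR_PAGES  4

struct mmap_data {
	void *user_page;		/* control page */
	void *data_pages[NR_PAGES];	/* data pages */
};

#ifdef USE_CONTIG
/* One contiguous block, as with vmalloc_user(): page i is just an offset. */
static struct mmap_data *data_alloc(void)
{
	struct mmap_data *d = calloc(1, sizeof(*d));
	char *buf = calloc(NR_PAGES + 1, PAGE_SIZE);
	int i;

	if (!d || !buf)
		exit(1);
	d->user_page = buf;
	for (i = 0; i < NR_PAGES; i++)
		d->data_pages[i] = buf + (i + 1) * PAGE_SIZE;
	return d;
}
#else
/* Discrete per-page allocations, as with GFP_KERNEL-backed buffers. */
static struct mmap_data *data_alloc(void)
{
	struct mmap_data *d = calloc(1, sizeof(*d));
	int i;

	if (!d)
		exit(1);
	d->user_page = calloc(1, PAGE_SIZE);
	for (i = 0; i < NR_PAGES; i++)
		d->data_pages[i] = calloc(1, PAGE_SIZE);
	return d;
}
#endif

/* The one lookup the "fault handler" uses; the strategy is invisible here. */
static void *to_page(struct mmap_data *d, unsigned long pgoff)
{
	if (pgoff > NR_PAGES)
		return NULL;
	return pgoff ? d->data_pages[pgoff - 1] : d->user_page;
}

int main(void)
{
	struct mmap_data *d = data_alloc();

	strcpy(to_page(d, 2), "hello");
	printf("%s\n", (char *)to_page(d, 2));
	return 0;
}

Compiled with or without -DUSE_CONTIG, callers of to_page() behave identically, which is the property that lets the kernel patch keep one perf_mmap_fault() for both backings.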
Signed-off-by: Peter Zijlstra Acked-by: David Miller Cc: Andrew Morton Cc: Jens Axboe Cc: Paul Mackerras LKML-Reference: <1254830228.21044.272.camel@laptop> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 248 +++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 186 insertions(+), 62 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index e491fb0..9d0b5c6 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -2091,49 +2092,31 @@ unlock: rcu_read_unlock(); } -static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static unsigned long perf_data_size(struct perf_mmap_data *data) { - struct perf_event *event = vma->vm_file->private_data; - struct perf_mmap_data *data; - int ret = VM_FAULT_SIGBUS; - - if (vmf->flags & FAULT_FLAG_MKWRITE) { - if (vmf->pgoff == 0) - ret = 0; - return ret; - } - - rcu_read_lock(); - data = rcu_dereference(event->data); - if (!data) - goto unlock; - - if (vmf->pgoff == 0) { - vmf->page = virt_to_page(data->user_page); - } else { - int nr = vmf->pgoff - 1; - - if ((unsigned)nr > data->nr_pages) - goto unlock; + return data->nr_pages << (PAGE_SHIFT + data->data_order); +} - if (vmf->flags & FAULT_FLAG_WRITE) - goto unlock; +#ifndef CONFIG_PERF_USE_VMALLOC - vmf->page = virt_to_page(data->data_pages[nr]); - } +/* + * Back perf_mmap() with regular GFP_KERNEL-0 pages. + */ - get_page(vmf->page); - vmf->page->mapping = vma->vm_file->f_mapping; - vmf->page->index = vmf->pgoff; +static struct page * +perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff) +{ + if (pgoff > data->nr_pages) + return NULL; - ret = 0; -unlock: - rcu_read_unlock(); + if (pgoff == 0) + return virt_to_page(data->user_page); - return ret; + return virt_to_page(data->data_pages[pgoff - 1]); } -static int perf_mmap_data_alloc(struct perf_event *event, int nr_pages) +static struct perf_mmap_data * +perf_mmap_data_alloc(struct perf_event *event, int nr_pages) { struct perf_mmap_data *data; unsigned long size; @@ -2158,19 +2141,10 @@ static int perf_mmap_data_alloc(struct perf_event *event, int nr_pages) goto fail_data_pages; } + data->data_order = 0; data->nr_pages = nr_pages; - atomic_set(&data->lock, -1); - - if (event->attr.watermark) { - data->watermark = min_t(long, PAGE_SIZE * nr_pages, - event->attr.wakeup_watermark); - } - if (!data->watermark) - data->watermark = max(PAGE_SIZE, PAGE_SIZE * nr_pages / 4); - rcu_assign_pointer(event->data, data); - - return 0; + return data; fail_data_pages: for (i--; i >= 0; i--) @@ -2182,7 +2156,7 @@ fail_user_page: kfree(data); fail: - return -ENOMEM; + return NULL; } static void perf_mmap_free_page(unsigned long addr) @@ -2193,28 +2167,169 @@ static void perf_mmap_free_page(unsigned long addr) __free_page(page); } -static void __perf_mmap_data_free(struct rcu_head *rcu_head) +static void perf_mmap_data_free(struct perf_mmap_data *data) { - struct perf_mmap_data *data; int i; - data = container_of(rcu_head, struct perf_mmap_data, rcu_head); - perf_mmap_free_page((unsigned long)data->user_page); for (i = 0; i < data->nr_pages; i++) perf_mmap_free_page((unsigned long)data->data_pages[i]); +} + +#else + +/* + * Back perf_mmap() with vmalloc memory. + * + * Required for architectures that have d-cache aliasing issues. 
+ */ + +static struct page * +perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff) +{ + if (pgoff > (1UL << data->data_order)) + return NULL; + + return vmalloc_to_page((void *)data->user_page + pgoff * PAGE_SIZE); +} + +static void perf_mmap_unmark_page(void *addr) +{ + struct page *page = vmalloc_to_page(addr); + + page->mapping = NULL; +} + +static void perf_mmap_data_free_work(struct work_struct *work) +{ + struct perf_mmap_data *data; + void *base; + int i, nr; + + data = container_of(work, struct perf_mmap_data, work); + nr = 1 << data->data_order; + + base = data->user_page; + for (i = 0; i < nr + 1; i++) + perf_mmap_unmark_page(base + (i * PAGE_SIZE)); + + vfree(base); +} + +static void perf_mmap_data_free(struct perf_mmap_data *data) +{ + schedule_work(&data->work); +} + +static struct perf_mmap_data * +perf_mmap_data_alloc(struct perf_event *event, int nr_pages) +{ + struct perf_mmap_data *data; + unsigned long size; + void *all_buf; + WARN_ON(atomic_read(&event->mmap_count)); + + size = sizeof(struct perf_mmap_data); + size += sizeof(void *); + + data = kzalloc(size, GFP_KERNEL); + if (!data) + goto fail; + + INIT_WORK(&data->work, perf_mmap_data_free_work); + + all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE); + if (!all_buf) + goto fail_all_buf; + + data->user_page = all_buf; + data->data_pages[0] = all_buf + PAGE_SIZE; + data->data_order = ilog2(nr_pages); + data->nr_pages = 1; + + return data; + +fail_all_buf: + kfree(data); + +fail: + return NULL; +} + +#endif + +static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct perf_event *event = vma->vm_file->private_data; + struct perf_mmap_data *data; + int ret = VM_FAULT_SIGBUS; + + if (vmf->flags & FAULT_FLAG_MKWRITE) { + if (vmf->pgoff == 0) + ret = 0; + return ret; + } + + rcu_read_lock(); + data = rcu_dereference(event->data); + if (!data) + goto unlock; + + if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE)) + goto unlock; + + vmf->page = perf_mmap_to_page(data, vmf->pgoff); + if (!vmf->page) + goto unlock; + + get_page(vmf->page); + vmf->page->mapping = vma->vm_file->f_mapping; + vmf->page->index = vmf->pgoff; + + ret = 0; +unlock: + rcu_read_unlock(); + + return ret; +} + +static void +perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data) +{ + long max_size = perf_data_size(data); + + atomic_set(&data->lock, -1); + + if (event->attr.watermark) { + data->watermark = min_t(long, max_size, + event->attr.wakeup_watermark); + } + + if (!data->watermark) + data->watermark = max_t(long, PAGE_SIZE, max_size / 2); + + + rcu_assign_pointer(event->data, data); +} + +static void perf_mmap_data_free_rcu(struct rcu_head *rcu_head) +{ + struct perf_mmap_data *data; + + data = container_of(rcu_head, struct perf_mmap_data, rcu_head); + perf_mmap_data_free(data); kfree(data); } -static void perf_mmap_data_free(struct perf_event *event) +static void perf_mmap_data_release(struct perf_event *event) { struct perf_mmap_data *data = event->data; WARN_ON(atomic_read(&event->mmap_count)); rcu_assign_pointer(event->data, NULL); - call_rcu(&data->rcu_head, __perf_mmap_data_free); + call_rcu(&data->rcu_head, perf_mmap_data_free_rcu); } static void perf_mmap_open(struct vm_area_struct *vma) @@ -2230,11 +2345,12 @@ static void perf_mmap_close(struct vm_area_struct *vma) WARN_ON_ONCE(event->ctx->parent_ctx); if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) { + unsigned long size = perf_data_size(event->data); struct user_struct *user = current_user(); - 
atomic_long_sub(event->data->nr_pages + 1, &user->locked_vm); + atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm); vma->vm_mm->locked_vm -= event->data->nr_locked; - perf_mmap_data_free(event); + perf_mmap_data_release(event); mutex_unlock(&event->mmap_mutex); } } @@ -2252,6 +2368,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) unsigned long user_locked, user_lock_limit; struct user_struct *user = current_user(); unsigned long locked, lock_limit; + struct perf_mmap_data *data; unsigned long vma_size; unsigned long nr_pages; long user_extra, extra; @@ -2314,10 +2431,15 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) } WARN_ON(event->data); - ret = perf_mmap_data_alloc(event, nr_pages); - if (ret) + + data = perf_mmap_data_alloc(event, nr_pages); + ret = -ENOMEM; + if (!data) goto unlock; + ret = 0; + perf_mmap_data_init(event, data); + atomic_set(&event->mmap_count, 1); atomic_long_add(user_extra, &user->locked_vm); vma->vm_mm->locked_vm += extra; @@ -2505,7 +2627,7 @@ static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail, if (!data->writable) return true; - mask = (data->nr_pages << PAGE_SHIFT) - 1; + mask = perf_data_size(data) - 1; offset = (offset - tail) & mask; head = (head - tail) & mask; @@ -2610,7 +2732,7 @@ void perf_output_copy(struct perf_output_handle *handle, const void *buf, unsigned int len) { unsigned int pages_mask; - unsigned int offset; + unsigned long offset; unsigned int size; void **pages; @@ -2619,12 +2741,14 @@ void perf_output_copy(struct perf_output_handle *handle, pages = handle->data->data_pages; do { - unsigned int page_offset; + unsigned long page_offset; + unsigned long page_size; int nr; nr = (offset >> PAGE_SHIFT) & pages_mask; - page_offset = offset & (PAGE_SIZE - 1); - size = min_t(unsigned int, PAGE_SIZE - page_offset, len); + page_size = 1UL << (handle->data->data_order + PAGE_SHIFT); + page_offset = offset & (page_size - 1); + size = min_t(unsigned int, page_size - page_offset, len); memcpy(pages[nr] + page_offset, buf, size); -- cgit v1.1 From b0f56f1a63b7b968e6feeeefeace24bc8e0a4a65 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Thu, 1 Oct 2009 13:33:28 +0900 Subject: trace: Fix missing assignment in trace_ctxwake_* The state char variable S should be reassigned, if S == 0. We are missing the state of the task that is going to sleep for the context switch events (in the raw mode). Fortunately the problem arises with the sched_switch/wake_up tracers, not the sched trace events. The formers are legacy now. But still, that was buggy. 
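The bug class here is a function called for its return value with the result then discarded. A minimal sketch of the broken and fixed forms, with hypothetical names:

#include <stdio.h>

static char state_char(int state)
{
	return state ? 'S' : 'R';
}

int main(void)
{
	char S = 0;
	int prev_state = 1;

	if (!S)
		state_char(prev_state);		/* BUG: result discarded, S stays 0 */
	printf("buggy: %c\n", S ? S : '?');

	S = 0;
	if (!S)
		S = state_char(prev_state);	/* fixed: reassign S */
	printf("fixed: %c\n", S);
	return 0;
}

Modern compilers can flag the first call with warn_unused_result-style diagnostics, but a plain function call is a legal statement, which is why this slipped through.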
Signed-off-by: Hiroshi Shimamoto Cc: Steven Rostedt Acked-by: Frederic Weisbecker LKML-Reference: <4AC43118.6050409@ct.jp.nec.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_output.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index f572f44..cda766f 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -883,7 +883,7 @@ static int trace_ctxwake_raw(struct trace_iterator *iter, char S) trace_assign_type(field, iter->ent); if (!S) - task_state_char(field->prev_state); + S = task_state_char(field->prev_state); T = task_state_char(field->next_state); if (!trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n", field->prev_pid, @@ -918,7 +918,7 @@ static int trace_ctxwake_hex(struct trace_iterator *iter, char S) trace_assign_type(field, iter->ent); if (!S) - task_state_char(field->prev_state); + S = task_state_char(field->prev_state); T = task_state_char(field->next_state); SEQ_PUT_HEX_FIELD_RET(s, field->prev_pid); -- cgit v1.1 From fc6b177dee33365ccb29fe6d2092223cf8d679f9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 5 Oct 2009 18:17:32 +0200 Subject: futex: Nullify robust lists after cleanup The robust list pointers of user space held futexes are kept intact over an exec() call. When the exec'ed task exits exit_robust_list() is called with the stale pointer. The risk of corruption is minimal, but still it is incorrect to keep the pointers valid. Actually glibc should uninstall the robust list before calling exec() but we have to deal with it anyway. Nullify the pointers after [compat_]exit_robust_list() has been called. Reported-by: Anirban Sinha Signed-off-by: Peter Zijlstra Signed-off-by: Thomas Gleixner LKML-Reference: Cc: stable@kernel.org --- kernel/fork.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index bfee931..88ef51c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -543,11 +543,15 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm) /* Get rid of any futexes when releasing the mm */ #ifdef CONFIG_FUTEX - if (unlikely(tsk->robust_list)) + if (unlikely(tsk->robust_list)) { exit_robust_list(tsk); + tsk->robust_list = NULL; + } #ifdef CONFIG_COMPAT - if (unlikely(tsk->compat_robust_list)) + if (unlikely(tsk->compat_robust_list)) { compat_exit_robust_list(tsk); + tsk->compat_robust_list = NULL; + } #endif #endif -- cgit v1.1 From 322a2c100a8998158445599ea437fb556aa95b11 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 5 Oct 2009 18:18:03 +0200 Subject: futex: Move exit_pi_state() call to release_mm() exit_pi_state() is called from do_exit() but not from do_execve(). Move it to release_mm() so it gets called from do_execve() as well. 
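Both futex fixes follow the same pattern: hang the cleanup off the release path that exit and exec share, and leave no stale pointers behind. A userspace sketch of that structure, with all names hypothetical:

#include <stdio.h>

struct task {
	void *robust_list;	/* user-installed list head */
	int   pi_state_count;	/* stand-in for tsk->pi_state_list */
};

static void exit_robust_list_sketch(struct task *t)
{
	printf("walking robust list %p\n", t->robust_list);
}

static void exit_pi_state_list_sketch(struct task *t)
{
	printf("releasing %d pi states\n", t->pi_state_count);
	t->pi_state_count = 0;
}

/* Shared by the exit path and the exec path, like mm_release(). */
static void mm_release_sketch(struct task *t)
{
	if (t->robust_list) {
		exit_robust_list_sketch(t);
		t->robust_list = NULL;	/* no stale pointer survives exec */
	}
	if (t->pi_state_count)
		exit_pi_state_list_sketch(t);	/* now runs on exec as well */
}

int main(void)
{
	struct task t = { .robust_list = &t, .pi_state_count = 2 };

	mm_release_sketch(&t);	/* exec */
	mm_release_sketch(&t);	/* later exit: nothing stale left to walk */
	return 0;
}

The second call is a no-op precisely because the first one nullified and drained everything, which is the invariant these two patches establish.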
Signed-off-by: Thomas Gleixner LKML-Reference: Cc: stable@kernel.org Cc: Anirban Sinha Cc: Peter Zijlstra --- kernel/exit.c | 2 -- kernel/fork.c | 2 ++ 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index ae5d866..bc2b1fd 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -989,8 +989,6 @@ NORET_TYPE void do_exit(long code) tsk->mempolicy = NULL; #endif #ifdef CONFIG_FUTEX - if (unlikely(!list_empty(&tsk->pi_state_list))) - exit_pi_state_list(tsk); if (unlikely(current->pi_state_cache)) kfree(current->pi_state_cache); #endif diff --git a/kernel/fork.c b/kernel/fork.c index 88ef51c..341965b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -553,6 +553,8 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm) tsk->compat_robust_list = NULL; } #endif + if (unlikely(!list_empty(&tsk->pi_state_list))) + exit_pi_state_list(tsk); #endif /* Get rid of any cached register state */ -- cgit v1.1 From d0ec774cb2599c858be9d923bb873cf6697520d8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 6 Oct 2009 21:48:16 -0700 Subject: rcu: Move rcu_barrier() to rcutree Move the existing rcu_barrier() implementation to rcutree.c, consistent with the fact that the rcu_barrier() implementation is tied quite tightly to the RCU implementation. This opens the way to simplify and fix rcutree.c's rcu_barrier() implementation in a later patch. Signed-off-by: Paul E. McKenney Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: akpm@linux-foundation.org Cc: mathieu.desnoyers@polymtl.ca Cc: josh@joshtriplett.org Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org Cc: Valdis.Kletnieks@vt.edu Cc: dhowells@redhat.com LKML-Reference: <12548908982563-git-send-email-> Signed-off-by: Ingo Molnar --- kernel/rcupdate.c | 120 +----------------------------------------------------- kernel/rcutree.c | 119 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 120 insertions(+), 119 deletions(-) (limited to 'kernel') diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index e432422..4001833 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -53,16 +53,8 @@ struct lockdep_map rcu_lock_map = EXPORT_SYMBOL_GPL(rcu_lock_map); #endif -static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; -static atomic_t rcu_barrier_cpu_count; -static DEFINE_MUTEX(rcu_barrier_mutex); -static struct completion rcu_barrier_completion; int rcu_scheduler_active __read_mostly; -static atomic_t rcu_migrate_type_count = ATOMIC_INIT(0); -static struct rcu_head rcu_migrate_head[3]; -static DECLARE_WAIT_QUEUE_HEAD(rcu_migrate_wq); - /* * Awaken the corresponding synchronize_rcu() instance now that a * grace period has elapsed. @@ -165,120 +157,10 @@ void synchronize_rcu_bh(void) } EXPORT_SYMBOL_GPL(synchronize_rcu_bh); -static void rcu_barrier_callback(struct rcu_head *notused) -{ - if (atomic_dec_and_test(&rcu_barrier_cpu_count)) - complete(&rcu_barrier_completion); -} - -/* - * Called with preemption disabled, and from cross-cpu IRQ context. 
- */ -static void rcu_barrier_func(void *type) -{ - int cpu = smp_processor_id(); - struct rcu_head *head = &per_cpu(rcu_barrier_head, cpu); - void (*call_rcu_func)(struct rcu_head *head, - void (*func)(struct rcu_head *head)); - - atomic_inc(&rcu_barrier_cpu_count); - call_rcu_func = type; - call_rcu_func(head, rcu_barrier_callback); -} - -static inline void wait_migrated_callbacks(void) -{ - wait_event(rcu_migrate_wq, !atomic_read(&rcu_migrate_type_count)); - smp_mb(); /* In case we didn't sleep. */ -} - -/* - * Orchestrate the specified type of RCU barrier, waiting for all - * RCU callbacks of the specified type to complete. - */ -static void _rcu_barrier(void (*call_rcu_func)(struct rcu_head *head, - void (*func)(struct rcu_head *head))) -{ - BUG_ON(in_interrupt()); - /* Take cpucontrol mutex to protect against CPU hotplug */ - mutex_lock(&rcu_barrier_mutex); - init_completion(&rcu_barrier_completion); - /* - * Initialize rcu_barrier_cpu_count to 1, then invoke - * rcu_barrier_func() on each CPU, so that each CPU also has - * incremented rcu_barrier_cpu_count. Only then is it safe to - * decrement rcu_barrier_cpu_count -- otherwise the first CPU - * might complete its grace period before all of the other CPUs - * did their increment, causing this function to return too - * early. - */ - atomic_set(&rcu_barrier_cpu_count, 1); - on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1); - if (atomic_dec_and_test(&rcu_barrier_cpu_count)) - complete(&rcu_barrier_completion); - wait_for_completion(&rcu_barrier_completion); - mutex_unlock(&rcu_barrier_mutex); - wait_migrated_callbacks(); -} - -/** - * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete. - */ -void rcu_barrier(void) -{ - _rcu_barrier(call_rcu); -} -EXPORT_SYMBOL_GPL(rcu_barrier); - -/** - * rcu_barrier_bh - Wait until all in-flight call_rcu_bh() callbacks complete. - */ -void rcu_barrier_bh(void) -{ - _rcu_barrier(call_rcu_bh); -} -EXPORT_SYMBOL_GPL(rcu_barrier_bh); - -/** - * rcu_barrier_sched - Wait for in-flight call_rcu_sched() callbacks. - */ -void rcu_barrier_sched(void) -{ - _rcu_barrier(call_rcu_sched); -} -EXPORT_SYMBOL_GPL(rcu_barrier_sched); - -static void rcu_migrate_callback(struct rcu_head *notused) -{ - if (atomic_dec_and_test(&rcu_migrate_type_count)) - wake_up(&rcu_migrate_wq); -} - static int __cpuinit rcu_barrier_cpu_hotplug(struct notifier_block *self, unsigned long action, void *hcpu) { - rcu_cpu_notify(self, action, hcpu); - if (action == CPU_DYING) { - /* - * preempt_disable() in on_each_cpu() prevents stop_machine(), - * so when "on_each_cpu(rcu_barrier_func, (void *)type, 1);" - * returns, all online cpus have queued rcu_barrier_func(), - * and the dead cpu(if it exist) queues rcu_migrate_callback()s. - * - * These callbacks ensure _rcu_barrier() waits for all - * RCU callbacks of the specified type to complete. - */ - atomic_set(&rcu_migrate_type_count, 3); - call_rcu_bh(rcu_migrate_head, rcu_migrate_callback); - call_rcu_sched(rcu_migrate_head + 1, rcu_migrate_callback); - call_rcu(rcu_migrate_head + 2, rcu_migrate_callback); - } else if (action == CPU_DOWN_PREPARE) { - /* Don't need to wait until next removal operation. 
*/ - /* rcu_migrate_head is protected by cpu_add_remove_lock */ - wait_migrated_callbacks(); - } - - return NOTIFY_OK; + return rcu_cpu_notify(self, action, hcpu); } void __init rcu_init(void) diff --git a/kernel/rcutree.c b/kernel/rcutree.c index e2e272b..0108570 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1363,6 +1363,103 @@ int rcu_needs_cpu(int cpu) rcu_preempt_needs_cpu(cpu); } +static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; +static atomic_t rcu_barrier_cpu_count; +static DEFINE_MUTEX(rcu_barrier_mutex); +static struct completion rcu_barrier_completion; +static atomic_t rcu_migrate_type_count = ATOMIC_INIT(0); +static struct rcu_head rcu_migrate_head[3]; +static DECLARE_WAIT_QUEUE_HEAD(rcu_migrate_wq); + +static void rcu_barrier_callback(struct rcu_head *notused) +{ + if (atomic_dec_and_test(&rcu_barrier_cpu_count)) + complete(&rcu_barrier_completion); +} + +/* + * Called with preemption disabled, and from cross-cpu IRQ context. + */ +static void rcu_barrier_func(void *type) +{ + int cpu = smp_processor_id(); + struct rcu_head *head = &per_cpu(rcu_barrier_head, cpu); + void (*call_rcu_func)(struct rcu_head *head, + void (*func)(struct rcu_head *head)); + + atomic_inc(&rcu_barrier_cpu_count); + call_rcu_func = type; + call_rcu_func(head, rcu_barrier_callback); +} + +static inline void wait_migrated_callbacks(void) +{ + wait_event(rcu_migrate_wq, !atomic_read(&rcu_migrate_type_count)); + smp_mb(); /* In case we didn't sleep. */ +} + +/* + * Orchestrate the specified type of RCU barrier, waiting for all + * RCU callbacks of the specified type to complete. + */ +static void _rcu_barrier(void (*call_rcu_func)(struct rcu_head *head, + void (*func)(struct rcu_head *head))) +{ + BUG_ON(in_interrupt()); + /* Take cpucontrol mutex to protect against CPU hotplug */ + mutex_lock(&rcu_barrier_mutex); + init_completion(&rcu_barrier_completion); + /* + * Initialize rcu_barrier_cpu_count to 1, then invoke + * rcu_barrier_func() on each CPU, so that each CPU also has + * incremented rcu_barrier_cpu_count. Only then is it safe to + * decrement rcu_barrier_cpu_count -- otherwise the first CPU + * might complete its grace period before all of the other CPUs + * did their increment, causing this function to return too + * early. + */ + atomic_set(&rcu_barrier_cpu_count, 1); + on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1); + if (atomic_dec_and_test(&rcu_barrier_cpu_count)) + complete(&rcu_barrier_completion); + wait_for_completion(&rcu_barrier_completion); + mutex_unlock(&rcu_barrier_mutex); + wait_migrated_callbacks(); +} + +/** + * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete. + */ +void rcu_barrier(void) +{ + _rcu_barrier(call_rcu); +} +EXPORT_SYMBOL_GPL(rcu_barrier); + +/** + * rcu_barrier_bh - Wait until all in-flight call_rcu_bh() callbacks complete. + */ +void rcu_barrier_bh(void) +{ + _rcu_barrier(call_rcu_bh); +} +EXPORT_SYMBOL_GPL(rcu_barrier_bh); + +/** + * rcu_barrier_sched - Wait for in-flight call_rcu_sched() callbacks. + */ +void rcu_barrier_sched(void) +{ + _rcu_barrier(call_rcu_sched); +} +EXPORT_SYMBOL_GPL(rcu_barrier_sched); + +static void rcu_migrate_callback(struct rcu_head *notused) +{ + if (atomic_dec_and_test(&rcu_migrate_type_count)) + wake_up(&rcu_migrate_wq); +} + /* * Do boot-time initialization of a CPU's per-CPU RCU data. 
@@ -1459,6 +1556,28 @@ int __cpuinit rcu_cpu_notify(struct notifier_block *self, case CPU_UP_PREPARE_FROZEN: rcu_online_cpu(cpu); break; + case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: + /* Don't need to wait until next removal operation. */ + /* rcu_migrate_head is protected by cpu_add_remove_lock */ + wait_migrated_callbacks(); + break; + case CPU_DYING: + case CPU_DYING_FROZEN: + /* + * preempt_disable() in on_each_cpu() prevents stop_machine(), + * so when "on_each_cpu(rcu_barrier_func, (void *)type, 1);" + * returns, all online cpus have queued rcu_barrier_func(), + * and the dead cpu(if it exist) queues rcu_migrate_callback()s. + * + * These callbacks ensure _rcu_barrier() waits for all + * RCU callbacks of the specified type to complete. + */ + atomic_set(&rcu_migrate_type_count, 3); + call_rcu_bh(rcu_migrate_head, rcu_migrate_callback); + call_rcu_sched(rcu_migrate_head + 1, rcu_migrate_callback); + call_rcu(rcu_migrate_head + 2, rcu_migrate_callback); + break; case CPU_DEAD: case CPU_DEAD_FROZEN: case CPU_UP_CANCELED: -- cgit v1.1 From e74f4c4564455c91a3b4075bb1721993c2a95dda Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 6 Oct 2009 21:48:17 -0700 Subject: rcu: Make hot-unplugged CPU relinquish its own RCU callbacks The current interaction between RCU and CPU hotplug requires that RCU block in CPU notifiers waiting for callbacks to drain. This can be greatly simplified by having each CPU relinquish its own callbacks, and for both _rcu_barrier() and CPU_DEAD notifiers to adopt all callbacks that were previously relinquished. This change also eliminates the possibility of certain types of hangs due to the previous practice of waiting for callbacks to be invoked from within CPU notifiers. If you never wait, you cannot hang. Signed-off-by: Paul E. McKenney Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: akpm@linux-foundation.org Cc: mathieu.desnoyers@polymtl.ca Cc: josh@joshtriplett.org Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org Cc: Valdis.Kletnieks@vt.edu Cc: dhowells@redhat.com LKML-Reference: <1254890898456-git-send-email-> Signed-off-by: Ingo Molnar --- kernel/rcutree.c | 151 +++++++++++++++++++++++++-------------------------- kernel/rcutree.h | 11 +++- kernel/rcutree_plugin.h | 34 +++++++++++ kernel/rcutree_trace.c | 4 +- 4 files changed, 125 insertions(+), 75 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 0108570..d8d9865 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -63,6 +63,9 @@ .gpnum = -300, \ .completed = -300, \ .onofflock = __SPIN_LOCK_UNLOCKED(&name.onofflock), \ + .orphan_cbs_list = NULL, \ + .orphan_cbs_tail = &name.orphan_cbs_list, \ + .orphan_qlen = 0, \ .fqslock = __SPIN_LOCK_UNLOCKED(&name.fqslock), \ .n_force_qs = 0, \ .n_force_qs_ngp = 0, \ @@ -838,17 +841,63 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) #ifdef CONFIG_HOTPLUG_CPU /* + * Move a dying CPU's RCU callbacks to the ->orphan_cbs_list for the + * specified flavor of RCU. The callbacks will be adopted by the next + * _rcu_barrier() invocation or by the CPU_DEAD notifier, whichever + * comes first. Because this is invoked from the CPU_DYING notifier, + * irqs are already disabled. + */ +static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp) +{ + int i; + struct rcu_data *rdp = rsp->rda[smp_processor_id()]; + + if (rdp->nxtlist == NULL) + return; /* irqs disabled, so comparison is stable.
*/ + spin_lock(&rsp->onofflock); /* irqs already disabled. */ + *rsp->orphan_cbs_tail = rdp->nxtlist; + rsp->orphan_cbs_tail = rdp->nxttail[RCU_NEXT_TAIL]; + rdp->nxtlist = NULL; + for (i = 0; i < RCU_NEXT_SIZE; i++) + rdp->nxttail[i] = &rdp->nxtlist; + rsp->orphan_qlen += rdp->qlen; + rdp->qlen = 0; + spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ +} + +/* + * Adopt previously orphaned RCU callbacks. + */ +static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) +{ + unsigned long flags; + struct rcu_data *rdp; + + spin_lock_irqsave(&rsp->onofflock, flags); + rdp = rsp->rda[smp_processor_id()]; + if (rsp->orphan_cbs_list == NULL) { + spin_unlock_irqrestore(&rsp->onofflock, flags); + return; + } + *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list; + rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_tail; + rdp->qlen += rsp->orphan_qlen; + rsp->orphan_cbs_list = NULL; + rsp->orphan_cbs_tail = &rsp->orphan_cbs_list; + rsp->orphan_qlen = 0; + spin_unlock_irqrestore(&rsp->onofflock, flags); +} + +/* * Remove the outgoing CPU from the bitmasks in the rcu_node hierarchy * and move all callbacks from the outgoing CPU to the current one. */ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) { - int i; unsigned long flags; long lastcomp; unsigned long mask; struct rcu_data *rdp = rsp->rda[cpu]; - struct rcu_data *rdp_me; struct rcu_node *rnp; /* Exclude any attempts to start a new grace period. */ @@ -871,32 +920,9 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) } while (rnp != NULL); lastcomp = rsp->completed; - spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ + spin_unlock_irqrestore(&rsp->onofflock, flags); - /* - * Move callbacks from the outgoing CPU to the running CPU. - * Note that the outgoing CPU is now quiescent, so it is now - * (uncharacteristically) safe to access its rcu_data structure. - * Note also that we must carefully retain the order of the - * outgoing CPU's callbacks in order for rcu_barrier() to work - * correctly. Finally, note that we start all the callbacks - * afresh, even those that have passed through a grace period - * and are therefore ready to invoke. The theory is that hotplug - * events are rare, and that if they are frequent enough to - * indefinitely delay callbacks, you have far worse things to - * be worrying about. 
- */ - if (rdp->nxtlist != NULL) { - rdp_me = rsp->rda[smp_processor_id()]; - *rdp_me->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist; - rdp_me->nxttail[RCU_NEXT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; - rdp->nxtlist = NULL; - for (i = 0; i < RCU_NEXT_SIZE; i++) - rdp->nxttail[i] = &rdp->nxtlist; - rdp_me->qlen += rdp->qlen; - rdp->qlen = 0; - } - local_irq_restore(flags); + rcu_adopt_orphan_cbs(rsp); } /* @@ -914,6 +940,14 @@ static void rcu_offline_cpu(int cpu) #else /* #ifdef CONFIG_HOTPLUG_CPU */ +static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp) +{ +} + +static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) +{ +} + static void rcu_offline_cpu(int cpu) { } @@ -1367,9 +1401,6 @@ static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; static atomic_t rcu_barrier_cpu_count; static DEFINE_MUTEX(rcu_barrier_mutex); static struct completion rcu_barrier_completion; -static atomic_t rcu_migrate_type_count = ATOMIC_INIT(0); -static struct rcu_head rcu_migrate_head[3]; -static DECLARE_WAIT_QUEUE_HEAD(rcu_migrate_wq); static void rcu_barrier_callback(struct rcu_head *notused) { @@ -1392,21 +1423,16 @@ static void rcu_barrier_func(void *type) call_rcu_func(head, rcu_barrier_callback); } -static inline void wait_migrated_callbacks(void) -{ - wait_event(rcu_migrate_wq, !atomic_read(&rcu_migrate_type_count)); - smp_mb(); /* In case we didn't sleep. */ -} - /* * Orchestrate the specified type of RCU barrier, waiting for all * RCU callbacks of the specified type to complete. */ -static void _rcu_barrier(void (*call_rcu_func)(struct rcu_head *head, +static void _rcu_barrier(struct rcu_state *rsp, + void (*call_rcu_func)(struct rcu_head *head, void (*func)(struct rcu_head *head))) { BUG_ON(in_interrupt()); - /* Take cpucontrol mutex to protect against CPU hotplug */ + /* Take mutex to serialize concurrent rcu_barrier() requests. */ mutex_lock(&rcu_barrier_mutex); init_completion(&rcu_barrier_completion); /* @@ -1419,29 +1445,22 @@ static void _rcu_barrier(void (*call_rcu_func)(struct rcu_head *head, * early. */ atomic_set(&rcu_barrier_cpu_count, 1); + preempt_disable(); /* stop CPU_DYING from filling orphan_cbs_list */ + rcu_adopt_orphan_cbs(rsp); on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1); + preempt_enable(); /* CPU_DYING can again fill orphan_cbs_list */ if (atomic_dec_and_test(&rcu_barrier_cpu_count)) complete(&rcu_barrier_completion); wait_for_completion(&rcu_barrier_completion); mutex_unlock(&rcu_barrier_mutex); - wait_migrated_callbacks(); -} - -/** - * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete. - */ -void rcu_barrier(void) -{ - _rcu_barrier(call_rcu); } -EXPORT_SYMBOL_GPL(rcu_barrier); /** * rcu_barrier_bh - Wait until all in-flight call_rcu_bh() callbacks complete. */ void rcu_barrier_bh(void) { - _rcu_barrier(call_rcu_bh); + _rcu_barrier(&rcu_bh_state, call_rcu_bh); } EXPORT_SYMBOL_GPL(rcu_barrier_bh); @@ -1450,16 +1469,10 @@ EXPORT_SYMBOL_GPL(rcu_barrier_bh); */ void rcu_barrier_sched(void) { - _rcu_barrier(call_rcu_sched); + _rcu_barrier(&rcu_sched_state, call_rcu_sched); } EXPORT_SYMBOL_GPL(rcu_barrier_sched); -static void rcu_migrate_callback(struct rcu_head *notused) -{ - if (atomic_dec_and_test(&rcu_migrate_type_count)) - wake_up(&rcu_migrate_wq); -} - /* * Do boot-time initialization of a CPU's per-CPU RCU data. 
*/ @@ -1556,27 +1569,21 @@ int __cpuinit rcu_cpu_notify(struct notifier_block *self, case CPU_UP_PREPARE_FROZEN: rcu_online_cpu(cpu); break; - case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: - /* Don't need to wait until next removal operation. */ - /* rcu_migrate_head is protected by cpu_add_remove_lock */ - wait_migrated_callbacks(); - break; case CPU_DYING: case CPU_DYING_FROZEN: /* - * preempt_disable() in on_each_cpu() prevents stop_machine(), + * preempt_disable() in _rcu_barrier() prevents stop_machine(), * so when "on_each_cpu(rcu_barrier_func, (void *)type, 1);" - * returns, all online cpus have queued rcu_barrier_func(), - * and the dead cpu(if it exist) queues rcu_migrate_callback()s. - * - * These callbacks ensure _rcu_barrier() waits for all - * RCU callbacks of the specified type to complete. + * returns, all online cpus have queued rcu_barrier_func(). + * The dying CPU clears its cpu_online_mask bit and + * moves all of its RCU callbacks to ->orphan_cbs_list + * in the context of stop_machine(), so subsequent calls + * to _rcu_barrier() will adopt these callbacks and only + * then queue rcu_barrier_func() on all remaining CPUs. */ - atomic_set(&rcu_migrate_type_count, 3); - call_rcu_bh(rcu_migrate_head, rcu_migrate_callback); - call_rcu_sched(rcu_migrate_head + 1, rcu_migrate_callback); - call_rcu(rcu_migrate_head + 2, rcu_migrate_callback); + rcu_send_cbs_to_orphanage(&rcu_bh_state); + rcu_send_cbs_to_orphanage(&rcu_sched_state); + rcu_preempt_send_cbs_to_orphanage(); break; case CPU_DEAD: case CPU_DEAD_FROZEN: diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 676eecd..b40ac57 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -244,7 +244,15 @@ struct rcu_state { /* End of fields guarded by root rcu_node's lock. */ spinlock_t onofflock; /* exclude on/offline and */ - /* starting new GP. */ + /* starting new GP. Also */ + /* protects the following */ + /* orphan_cbs fields. */ + struct rcu_head *orphan_cbs_list; /* list of rcu_head structs */ + /* orphaned by all CPUs in */ + /* a given leaf rcu_node */ + /* going offline. */ + struct rcu_head **orphan_cbs_tail; /* And tail pointer. */ + long orphan_qlen; /* Number of orphaned cbs. */ spinlock_t fqslock; /* Only one task forcing */ /* quiescent states. */ unsigned long jiffies_force_qs; /* Time at which to invoke */ @@ -305,6 +313,7 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); static int rcu_preempt_pending(int cpu); static int rcu_preempt_needs_cpu(int cpu); static void __cpuinit rcu_preempt_init_percpu_data(int cpu); +static void rcu_preempt_send_cbs_to_orphanage(void); static void __init __rcu_init_preempt(void); #endif /* #else #ifdef RCU_TREE_NONCORE */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 57200fe..c0cb783 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -410,6 +410,15 @@ static int rcu_preempt_needs_cpu(int cpu) return !!per_cpu(rcu_preempt_data, cpu).nxtlist; } +/** + * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete. + */ +void rcu_barrier(void) +{ + _rcu_barrier(&rcu_preempt_state, call_rcu); +} +EXPORT_SYMBOL_GPL(rcu_barrier); + /* * Initialize preemptable RCU's per-CPU data. */ @@ -419,6 +428,14 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu) } /* + * Move preemptable RCU's callbacks to ->orphan_cbs_list. + */ +static void rcu_preempt_send_cbs_to_orphanage(void) +{ + rcu_send_cbs_to_orphanage(&rcu_preempt_state); +} + +/* * Initialize preemptable RCU's state structures. 
*/ static void __init __rcu_init_preempt(void) @@ -564,6 +581,16 @@ static int rcu_preempt_needs_cpu(int cpu) } /* + * Because preemptable RCU does not exist, rcu_barrier() is just + * another name for rcu_barrier_sched(). + */ +void rcu_barrier(void) +{ + rcu_barrier_sched(); +} +EXPORT_SYMBOL_GPL(rcu_barrier); + +/* * Because preemptable RCU does not exist, there is no per-CPU * data to initialize. */ @@ -572,6 +599,13 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu) } /* + * Because there is no preemptable RCU, there are no callbacks to move. + */ +static void rcu_preempt_send_cbs_to_orphanage(void) +{ +} + +/* * Because preemptable RCU does not exist, it need not be initialized. */ static void __init __rcu_init_preempt(void) diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index f09af28..4b31c77 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -159,13 +159,13 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) struct rcu_node *rnp; seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x " - "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n", + "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld\n", rsp->completed, rsp->gpnum, rsp->signaled, (long)(rsp->jiffies_force_qs - jiffies), (int)(jiffies & 0xffff), rsp->n_force_qs, rsp->n_force_qs_ngp, rsp->n_force_qs - rsp->n_force_qs_ngp, - rsp->n_force_qs_lh); + rsp->n_force_qs_lh, rsp->orphan_qlen); for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) { if (rnp->level != level) { seq_puts(m, "\n"); -- cgit v1.1 From 978c0b88146a7f9b364b71b5b83c5b12e7b413d7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 6 Oct 2009 21:48:18 -0700 Subject: rcu: Place root rcu_node structure in separate lockdep class Before this patch, all of the rcu_node structures were in the same lockdep class, so that lockdep would complain when rcu_preempt_offline_tasks() acquired the root rcu_node structure's lock while holding one of the leaf rcu_nodes' locks. This patch changes rcu_init_one() to use a separate spin_lock_init() for the root rcu_node structure's lock than is used for that of all of the rest of the rcu_node structures, which puts the root rcu_node structure's lock in its own lockdep class. Suggested-by: Peter Zijlstra Signed-off-by: Paul E. 
McKenney Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: akpm@linux-foundation.org Cc: mathieu.desnoyers@polymtl.ca Cc: josh@joshtriplett.org Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: rostedt@goodmis.org Cc: Valdis.Kletnieks@vt.edu Cc: dhowells@redhat.com LKML-Reference: <12548908983277-git-send-email-> Signed-off-by: Ingo Molnar --- kernel/rcutree.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index d8d9865..705f02a 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1647,7 +1647,8 @@ static void __init rcu_init_one(struct rcu_state *rsp) cpustride *= rsp->levelspread[i]; rnp = rsp->level[i]; for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) { - spin_lock_init(&rnp->lock); + if (rnp != rcu_get_root(rsp)) + spin_lock_init(&rnp->lock); rnp->gpnum = 0; rnp->qsmask = 0; rnp->qsmaskinit = 0; @@ -1670,6 +1671,7 @@ static void __init rcu_init_one(struct rcu_state *rsp) INIT_LIST_HEAD(&rnp->blocked_tasks[1]); } } + spin_lock_init(&rcu_get_root(rsp)->lock); } /* -- cgit v1.1 From fdc6f192e7e1ae80565af23cc33dc88e3dcdf184 Mon Sep 17 00:00:00 2001 From: Eero Nurkkala Date: Wed, 7 Oct 2009 11:54:26 +0300 Subject: NOHZ: update idle state also when NOHZ is inactive Commit f2e21c9610991e95621a81407cdbab881226419b had unfortunate side effects with cpufreq governors on some systems. If the system did not switch into NOHZ mode ts->inidle is not set when tick_nohz_stop_sched_tick() is called from the idle routine. Therefor all subsequent calls from irq_exit() to tick_nohz_stop_sched_tick() fail to call tick_nohz_start_idle(). This results in bogus idle accounting information which is passed to cpufreq governors. Set the inidle flag unconditionally of the NOHZ active state to keep the idle time accounting correct in any case. [ tglx: Added comment and tweaked the changelog ] Reported-by: Steven Noonan Signed-off-by: Eero Nurkkala Cc: Rik van Riel Cc: Venkatesh Pallipadi Cc: Greg KH Cc: Steven Noonan Cc: stable@kernel.org LKML-Reference: <1254907901.30157.93.camel@eenurkka-desktop> Signed-off-by: Thomas Gleixner --- kernel/time/tick-sched.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index e0f59a2..89aed59 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -231,6 +231,13 @@ void tick_nohz_stop_sched_tick(int inidle) if (!inidle && !ts->inidle) goto end; + /* + * Set ts->inidle unconditionally. Even if the system did not + * switch to NOHZ mode the cpu frequency governers rely on the + * update of the idle time accounting in tick_nohz_start_idle(). + */ + ts->inidle = 1; + now = tick_nohz_start_idle(ts); /* @@ -248,8 +255,6 @@ void tick_nohz_stop_sched_tick(int inidle) if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) goto end; - ts->inidle = 1; - if (need_resched()) goto end; -- cgit v1.1 From 829b876dfc94ea8be3a47e200d06f1f217bb104f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sun, 27 Sep 2009 07:02:07 -0400 Subject: tracing: fix transposed numbers of lock_depth and preempt_count The lock_depth and preempt_count numbers in the latency format is transposed. 
Signed-off-by: Steven Rostedt --- kernel/trace/trace_output.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index cda766f..ed17565 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -486,16 +486,18 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) hardirq ? 'h' : softirq ? 's' : '.')) return 0; - if (entry->lock_depth < 0) - ret = trace_seq_putc(s, '.'); + if (entry->preempt_count) + ret = trace_seq_printf(s, "%x", entry->preempt_count); else - ret = trace_seq_printf(s, "%d", entry->lock_depth); + ret = trace_seq_putc(s, '.'); + if (!ret) return 0; - if (entry->preempt_count) - return trace_seq_printf(s, "%x", entry->preempt_count); - return trace_seq_putc(s, '.'); + if (entry->lock_depth < 0) + return trace_seq_putc(s, '.'); + + return trace_seq_printf(s, "%d", entry->lock_depth); } static int -- cgit v1.1 From da085681014fb43d67d9bf6d14bc068e9254bd49 Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Wed, 7 Oct 2009 11:46:54 -0700 Subject: futex: fix requeue_pi key imbalance If futex_wait_requeue_pi() wakes prior to requeue, we drop the reference to the source futex_key twice, once in handle_early_requeue_pi_wakeup() and once on our way out. Remove the drop from handle_early_requeue_pi_wakeup() and keep the get/drops together in futex_wait_requeue_pi(). Reported-by: Helge Bahmann Signed-off-by: Darren Hart Cc: Helge Bahmann Cc: Peter Zijlstra Cc: Eric Dumazet Cc: Dinakar Guniguntala Cc: John Stultz Cc: stable-2.6.31 LKML-Reference: <4ACCE21E.5030805@us.ibm.com> Signed-off-by: Thomas Gleixner --- kernel/futex.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 1e176f3..c3bb2fc 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2111,7 +2111,6 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, * Unqueue the futex_q and determine which it was. */ plist_del(&q->list, &q->list.plist); - drop_futex_key_refs(&q->key); if (timeout && !timeout->task) ret = -ETIMEDOUT; -- cgit v1.1 From e7247a15ff3bbdab0a8b402dffa1171e5c05a8e0 Mon Sep 17 00:00:00 2001 From: "jolsa@redhat.com" Date: Wed, 7 Oct 2009 19:00:35 +0200 Subject: tracing: correct module boundaries for ftrace_release When a module is about to unload, we release its call records. The ftrace_release function was given wrong values representing the module core boundaries, and thus did not release its call records. Also make the ftrace_release function module specific.
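The underlying improvement is replacing caller-supplied start/end arithmetic with one predicate that owns the boundary logic, as the kernel's within_module_core() does. A userspace sketch of such a predicate, with a hypothetical struct and values:

#include <stdint.h>
#include <stdio.h>

struct module_sketch {
	uintptr_t core_base;
	size_t    core_size;
};

/* One place owns the range check, so callers cannot pass the bounds wrong. */
static int within_core(uintptr_t addr, const struct module_sketch *mod)
{
	return addr >= mod->core_base &&
	       addr <  mod->core_base + mod->core_size;
}

int main(void)
{
	struct module_sketch mod = { .core_base = 0x1000, .core_size = 0x800 };

	printf("%d %d %d\n",
	       within_core(0x1000, &mod),	/* 1: first byte of core */
	       within_core(0x17ff, &mod),	/* 1: last byte of core */
	       within_core(0x1800, &mod));	/* 0: one past the end */
	return 0;
}

With the old interface, a caller passing a wrong end pointer silently excluded every record from the range, so nothing was freed; the predicate form removes that failure mode.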
Signed-off-by: Jiri Olsa LKML-Reference: <1254934835-363-3-git-send-email-jolsa@redhat.com> Cc: stable@kernel.org Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 46592fe..c701476 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2658,19 +2658,17 @@ static int ftrace_convert_nops(struct module *mod, } #ifdef CONFIG_MODULES -void ftrace_release(void *start, void *end) +void ftrace_release_mod(struct module *mod) { struct dyn_ftrace *rec; struct ftrace_page *pg; - unsigned long s = (unsigned long)start; - unsigned long e = (unsigned long)end; - if (ftrace_disabled || !start || start == end) + if (ftrace_disabled) return; mutex_lock(&ftrace_lock); do_for_each_ftrace_rec(pg, rec) { - if ((rec->ip >= s) && (rec->ip < e)) { + if (within_module_core(rec->ip, mod)) { /* * rec->ip is changed in ftrace_free_rec() * It should not be between s and e if the record was freed. @@ -2702,9 +2700,7 @@ static int ftrace_module_notify(struct notifier_block *self, mod->num_ftrace_callsites); break; case MODULE_STATE_GOING: - ftrace_release(mod->ftrace_callsites, - mod->ftrace_callsites + - mod->num_ftrace_callsites); + ftrace_release_mod(mod); break; } -- cgit v1.1 From 3279ba37db5d65c4ab0dcdee3b211ccb85bb563f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 7 Oct 2009 16:57:56 -0400 Subject: ftrace: check for failure for all conversions Due to legacy code from back when the dynamic tracer used a daemon, only core kernel code was checking for failures. This is no longer the case. We must check for failures any time we perform text modifications. Cc: stable@kernel.org Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index c701476..f136fe5 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1074,14 +1074,9 @@ static void ftrace_replace_code(int enable) failed = __ftrace_replace_code(rec, enable); if (failed) { rec->flags |= FTRACE_FL_FAILED; - if ((system_state == SYSTEM_BOOTING) || - !core_kernel_text(rec->ip)) { - ftrace_free_rec(rec); - } else { - ftrace_bug(failed, rec->ip); - /* Stop processing */ - return; - } + ftrace_bug(failed, rec->ip); + /* Stop processing */ + return; } } while_for_each_ftrace_rec(); } -- cgit v1.1 From c8647b28726b09b087155417bb698e7b3789f8a0 Mon Sep 17 00:00:00 2001 From: Zhenwen Xu Date: Thu, 8 Oct 2009 09:21:46 +0800 Subject: tracing: fix warning on kernel/trace/trace_branch.c and trace_hw_branches.c Fix warnings caused by the API change of trace_buffer_lock_reserve() in the following files: kernel/trace/trace_hw_branch.c kernel/trace/trace_branch.c Signed-off-by: Zhenwen Xu LKML-Reference: <20091008012146.GA4170@helight> Signed-off-by: Steven Rostedt --- kernel/trace/trace_branch.c | 2 +- kernel/trace/trace_hw_branches.c | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 7a7a9fd..216e2dd 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -54,7 +54,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) goto out; pc = preempt_count(); - event = trace_buffer_lock_reserve(tr, TRACE_BRANCH, + event = trace_buffer_lock_reserve(tr->buffer, TRACE_BRANCH, sizeof(*entry), flags, pc); if (!event) goto
out; diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c index 23b6385..69543a9 100644 --- a/kernel/trace/trace_hw_branches.c +++ b/kernel/trace/trace_hw_branches.c @@ -165,6 +165,7 @@ void trace_hw_branch(u64 from, u64 to) struct ftrace_event_call *call = &event_hw_branch; struct trace_array *tr = hw_branch_trace; struct ring_buffer_event *event; + struct ring_buffer *buf; struct hw_branch_entry *entry; unsigned long irq1; int cpu; @@ -180,7 +181,8 @@ void trace_hw_branch(u64 from, u64 to) if (atomic_inc_return(&tr->data[cpu]->disabled) != 1) goto out; - event = trace_buffer_lock_reserve(tr, TRACE_HW_BRANCHES, + buf = tr->buffer; + event = trace_buffer_lock_reserve(buf, TRACE_HW_BRANCHES, sizeof(*entry), 0, 0); if (!event) goto out; @@ -189,8 +191,8 @@ void trace_hw_branch(u64 from, u64 to) entry->ent.type = TRACE_HW_BRANCHES; entry->from = from; entry->to = to; - if (!filter_check_discard(call, entry, tr->buffer, event)) - trace_buffer_unlock_commit(tr, event, 0, 0); + if (!filter_check_discard(call, entry, buf, event)) + trace_buffer_unlock_commit(buf, event, 0, 0); out: atomic_dec(&tr->data[cpu]->disabled); -- cgit v1.1 From 8f6e8a314ab37cadd72da5ace9027f2d04aba854 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 7 Oct 2009 21:53:41 -0400 Subject: tracing: use local buffer variable for trace branch tracer Just passing tr->buffer to the trace_buffer_lock_reserve() API is not good enough. This is because tr->buffer may change, and we do not want to commit with a different buffer than the one we reserved from. This patch uses a local variable to hold the buffer that was used to reserve and commit with. Signed-off-by: Steven Rostedt --- kernel/trace/trace_branch.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 216e2dd..4a194f0 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -34,6 +34,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) struct trace_array *tr = branch_tracer; struct ring_buffer_event *event; struct trace_branch *entry; + struct ring_buffer *buffer; unsigned long flags; int cpu, pc; const char *p; @@ -54,7 +55,8 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) goto out; pc = preempt_count(); - event = trace_buffer_lock_reserve(tr->buffer, TRACE_BRANCH, + buffer = tr->buffer; + event = trace_buffer_lock_reserve(buffer, TRACE_BRANCH, sizeof(*entry), flags, pc); if (!event) goto out; @@ -74,8 +76,8 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) entry->line = f->line; entry->correct = val == expect; - if (!filter_check_discard(call, entry, tr->buffer, event)) - ring_buffer_unlock_commit(tr->buffer, event); + if (!filter_check_discard(call, entry, buffer, event)) + ring_buffer_unlock_commit(buffer, event); out: atomic_dec(&tr->data[cpu]->disabled); -- cgit v1.1 From a813a159766ee9d36aa1fc717c60d63325a6d077 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 9 Oct 2009 01:41:35 -0400 Subject: tracing: fix trace_vprintk call The addition of trace_array_{v}printk used the wrong function for trace_vprintk to call. This broke trace_marker and trace_vprintk itself. trace_printk may not have been affected, though, since its callers end up in trace_vbprintk.
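The rule behind the fix: once variadic arguments have been captured in a va_list, they must be forwarded to a v*-style function; handing the va_list to a "..." function passes it as a single opaque argument. A self-contained sketch of the correct forwarding chain, with hypothetical names:

#include <stdarg.h>
#include <stdio.h>

/* The v-variant does the real work and takes a va_list. */
static int trace_array_vprintk_sketch(const char *fmt, va_list args)
{
	return vprintf(fmt, args);
}

/* The variadic wrapper captures the args and forwards the va_list. */
static int trace_vprintk_sketch(const char *fmt, ...)
{
	va_list args;
	int ret;

	va_start(args, fmt);
	/*
	 * Correct: forward to the v-variant. Calling a variadic
	 * function with 'args' here would pass the va_list as one
	 * opaque value, which is the class of bug the patch fixes.
	 */
	ret = trace_array_vprintk_sketch(fmt, args);
	va_end(args);
	return ret;
}

int main(void)
{
	trace_vprintk_sketch("value=%d name=%s\n", 42, "marker");
	return 0;
}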
Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4506826..c820b03 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1393,7 +1393,7 @@ int trace_array_vprintk(struct trace_array *tr, int trace_vprintk(unsigned long ip, const char *fmt, va_list args) { - return trace_array_printk(&global_trace, ip, fmt, args); + return trace_array_vprintk(&global_trace, ip, fmt, args); } EXPORT_SYMBOL_GPL(trace_vprintk); -- cgit v1.1 From d25105e8911bff1dbd68e387f12901c5b1a15fe8 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Fri, 9 Oct 2009 12:40:42 +0200 Subject: writeback: account IO throttling wait as iowait It makes sense to do IOWAIT when someone is blocked due to IO throttle, as suggested by Kame and Peter. There is an old comment for not doing IOWAIT on throttle, however it has been mismatching the code for a long time. If we stop accounting IOWAIT for 2.6.32, it could be an undesirable behavior change. So restore the io_schedule. CC: KAMEZAWA Hiroyuki CC: Peter Zijlstra Signed-off-by: Wu Fengguang Signed-off-by: Jens Axboe --- kernel/sched.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 1535f38..074f753 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6720,9 +6720,6 @@ EXPORT_SYMBOL(yield); /* * This task is about to go to sleep on IO. Increment rq->nr_iowait so * that process accounting knows that this is a task in IO wait state. - * - * But don't do that if it is a deliberate, throttling IO wait (this task - * has set its backing_dev_info: the queue against which it should throttle) */ void __sched io_schedule(void) { -- cgit v1.1 From 3365e7798760dc6c190a9bbb0945a38f02625438 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 9 Oct 2009 10:12:41 +0200 Subject: lockdep: Use cpu_clock() for lockstat Some tracepoint magic (TRACE_EVENT(lock_acquired)) relies on the fact that lock hold times are positive and uses div64 on that. That triggered a build warning on MIPS, and probably causes bad output in certain circumstances as well. Make it truly positive. 
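The signedness point generalizes: duration math on timestamps should stay unsigned, because a signed delta that dips below zero turns into an enormous value inside unsigned 64-bit division helpers. A small demonstration in plain C, with a stand-in for cpu_clock():

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* div64-style helper: divides an *unsigned* 64-bit dividend, as in the kernel. */
static uint64_t div_u64_sketch(uint64_t dividend, uint32_t divisor)
{
	return dividend / divisor;
}

/* Stand-in for a single monotonic clock domain that never runs backwards. */
static uint64_t fake_cpu_clock(void)
{
	static uint64_t t;
	return t += 100;	/* strictly increasing */
}

int main(void)
{
	/* Skewed raw timestamps can produce a "negative" duration... */
	int64_t bad = (int64_t)990 - 1000;	/* -10 */
	printf("skewed s64 fed to div64: %" PRIu64 "\n",
	       div_u64_sketch((uint64_t)bad, 2));	/* ~2^63: garbage */

	/* ...while two reads of one monotonic clock cannot. */
	uint64_t t0 = fake_cpu_clock();
	uint64_t t1 = fake_cpu_clock();
	printf("monotonic u64 delta / 2 = %" PRIu64 "\n",
	       div_u64_sketch(t1 - t0, 2));		/* 50 */
	return 0;
}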
Reported-by: Andrew Morton Signed-off-by: Peter Zijlstra LKML-Reference: <1254818502.21044.112.camel@laptop> Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 3815ac1d..9af5672 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -142,6 +142,11 @@ static inline struct lock_class *hlock_class(struct held_lock *hlock) #ifdef CONFIG_LOCK_STAT static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], lock_stats); +static inline u64 lockstat_clock(void) +{ + return cpu_clock(smp_processor_id()); +} + static int lock_point(unsigned long points[], unsigned long ip) { int i; @@ -158,7 +163,7 @@ static int lock_point(unsigned long points[], unsigned long ip) return i; } -static void lock_time_inc(struct lock_time *lt, s64 time) +static void lock_time_inc(struct lock_time *lt, u64 time) { if (time > lt->max) lt->max = time; @@ -234,12 +239,12 @@ static void put_lock_stats(struct lock_class_stats *stats) static void lock_release_holdtime(struct held_lock *hlock) { struct lock_class_stats *stats; - s64 holdtime; + u64 holdtime; if (!lock_stat) return; - holdtime = sched_clock() - hlock->holdtime_stamp; + holdtime = lockstat_clock() - hlock->holdtime_stamp; stats = get_lock_stats(hlock_class(hlock)); if (hlock->read) @@ -2792,7 +2797,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, hlock->references = references; #ifdef CONFIG_LOCK_STAT hlock->waittime_stamp = 0; - hlock->holdtime_stamp = sched_clock(); + hlock->holdtime_stamp = lockstat_clock(); #endif if (check == 2 && !mark_irqflags(curr, hlock)) @@ -3322,7 +3327,7 @@ found_it: if (hlock->instance != lock) return; - hlock->waittime_stamp = sched_clock(); + hlock->waittime_stamp = lockstat_clock(); contention_point = lock_point(hlock_class(hlock)->contention_point, ip); contending_point = lock_point(hlock_class(hlock)->contending_point, @@ -3345,8 +3350,7 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip) struct held_lock *hlock, *prev_hlock; struct lock_class_stats *stats; unsigned int depth; - u64 now; - s64 waittime = 0; + u64 now, waittime = 0; int i, cpu; depth = curr->lockdep_depth; @@ -3374,7 +3378,7 @@ found_it: cpu = smp_processor_id(); if (hlock->waittime_stamp) { - now = sched_clock(); + now = lockstat_clock(); waittime = now - hlock->waittime_stamp; hlock->holdtime_stamp = now; } -- cgit v1.1 From f5dc37530ba8a35aae0f7f4f13781d1904f71e94 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Fri, 9 Oct 2009 08:35:03 +0200 Subject: sched: Update the clock of runqueue select_task_rq() selected In try_to_wake_up(), we update the runqueue clock, but select_task_rq() may select a different runqueue than the one we updated, leaving the new runqueue's clock stale for a bit. This patch cures occasional huge latencies reported by latencytop when coming out of idle on a mostly idle NO_HZ box. 
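The shape of the fix: when code may re-select the object it operates on, any per-object state refreshed for the first choice must be refreshed for the final one as well. A sketch with two mock runqueues; the types are hypothetical, not the scheduler's real structures:

#include <stdio.h>

struct rq_sketch {
	const char *name;
	unsigned long clock;	/* per-runqueue clock, updated lazily */
};

static unsigned long now = 1000;

static void update_rq_clock_sketch(struct rq_sketch *rq)
{
	rq->clock = now;
}

static void wake_up_sketch(struct rq_sketch *rq, struct rq_sketch *selected)
{
	struct rq_sketch *orig_rq = rq;

	update_rq_clock_sketch(rq);

	/* select_task_rq() may pick a different runqueue... */
	rq = selected;

	/* ...whose clock is then stale unless we refresh it too. */
	if (rq != orig_rq)
		update_rq_clock_sketch(rq);

	printf("woken on %s, clock=%lu\n", rq->name, rq->clock);
}

int main(void)
{
	struct rq_sketch a = { "cpu0", 0 }, b = { "cpu1", 0 };

	wake_up_sketch(&a, &b);
	return 0;
}

Without the re-check, the wakeup would be accounted against cpu1's stale clock, which is precisely how the huge bogus latencies showed up in latencytop.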
Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra LKML-Reference: <1255070103.7639.30.camel@marge.simson.net> Signed-off-by: Ingo Molnar --- kernel/sched.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 76c0e96..e3f8f4f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2311,7 +2311,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, { int cpu, orig_cpu, this_cpu, success = 0; unsigned long flags; - struct rq *rq; + struct rq *rq, *orig_rq; if (!sched_feat(SYNC_WAKEUPS)) wake_flags &= ~WF_SYNC; @@ -2319,7 +2319,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, this_cpu = get_cpu(); smp_wmb(); - rq = task_rq_lock(p, &flags); + rq = orig_rq = task_rq_lock(p, &flags); update_rq_clock(rq); if (!(p->state & state)) goto out; @@ -2350,6 +2350,10 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, set_task_cpu(p, cpu); rq = task_rq_lock(p, &flags); + + if (rq != orig_rq) + update_rq_clock(rq); + WARN_ON(p->state != TASK_WAKING); cpu = task_cpu(p); -- cgit v1.1 From d43c36dc6b357fa1806800f18aa30123c747a6d1 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 7 Oct 2009 17:09:06 +0400 Subject: headers: remove sched.h from interrupt.h Now that m68k's task_thread_info() no longer refers to current, it's possible to remove sched.h from interrupt.h without breaking m68k! Many thanks to Heiko Carstens for allowing this. Signed-off-by: Alexey Dobriyan --- kernel/irq/handle.c | 1 + kernel/mutex-debug.c | 1 + kernel/time/timekeeping.c | 1 + 3 files changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index a81cf80..17c71bb 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -11,6 +11,7 @@ */ #include +#include <linux/sched.h> #include #include #include diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c index 50d022e..ec815a9 100644 --- a/kernel/mutex-debug.c +++ b/kernel/mutex-debug.c @@ -16,6 +16,7 @@ #include #include #include +#include <linux/sched.h> #include #include #include diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index fb0f46f..c3a4e29 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -13,6 +13,7 @@ #include #include #include +#include <linux/sched.h> #include #include #include -- cgit v1.1 From e17b38bf9e70d74f3739a600db75240078ac1407 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 11 Oct 2009 19:12:00 -0700 Subject: sched: Fix missing kernel-doc notation The following htmldocs warnings: Warning(kernel/sched.c:685): No description found for parameter 'cpu' Warning(kernel/sched.c:3676): No description found for parameter 'sd' Trigger because new parameters were added to update_rq_clock() and update_group_power() without updating the kernel-doc notation. Signed-off-by: Jaswinder Singh Rajput Signed-off-by: Randy Dunlap Cc: Andrew Morton Cc: Steven Rostedt Cc: Peter Zijlstra LKML-Reference: <4AD29070.7070002@oracle.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index e3f8f4f..789001d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -676,6 +676,7 @@ inline void update_rq_clock(struct rq *rq) /** * runqueue_is_locked + * @cpu: the processor in question. * * Returns true if the current cpu runqueue is locked.
* This interface allows printk to be called with the runqueue lock @@ -3660,6 +3661,7 @@ static void update_group_power(struct sched_domain *sd, int cpu) /** * update_sg_lb_stats - Update sched_group's statistics for load balancing. + * @sd: The sched_domain whose statistics are to be updated. * @group: sched_group whose statistics are to be updated. * @this_cpu: Cpu for which load balance is currently performed. * @idle: Idle status of this_cpu -- cgit v1.1 From 8ad807318fcd62aba0e18c7c7fbfcc1af3fcdbab Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 13 Oct 2009 09:28:57 +0800 Subject: tracing/filters: Fix memory leak when setting a filter Every time we set a filter, we leak memory allocated by postfix_append_operand() and postfix_append_op(). Signed-off-by: Li Zefan Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Tom Zanussi Cc: # for v2.6.31.x LKML-Reference: <4AD3D7D9.4070400@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_events_filter.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 2324578..98a6cc5 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -933,8 +933,9 @@ static void postfix_clear(struct filter_parse_state *ps) while (!list_empty(&ps->postfix)) { elt = list_first_entry(&ps->postfix, struct postfix_elt, list); - kfree(elt->operand); list_del(&elt->list); + kfree(elt->operand); + kfree(elt); } } -- cgit v1.1 From d58e6576b0deec6f0b9ff8450fe282da18c50883 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 13 Oct 2009 20:40:43 +0200 Subject: futex: Handle spurious wake up The futex code does not handle spurious wake up in futex_wait and futex_wait_requeue_pi. The code assumes that any wake up which was not caused by futex_wake / requeue or by a timeout was caused by a signal wake up and returns one of the syscall restart error codes. In case of a spurious wake up the signal delivery code which deals with the restart error codes is not invoked and we return that error code to user space. That causes applications which actually check the return codes to fail. Blaise reported that on preempt-rt a python test program ran into an exception trap. -rt exposed that due to a built-in spurious wake up accelerator :) Solve this by checking signal_pending(current) in the wake up path and handle the spurious wake up case without returning to user space. Reported-by: Blaise Gassend Debugged-by: Darren Hart Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: stable@kernel.org LKML-Reference: --- kernel/futex.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 4949d33..5c88839 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1791,6 +1791,7 @@ static int futex_wait(u32 __user *uaddr, int fshared, current->timer_slack_ns); } +retry: /* Prepare to wait on uaddr. */ ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); if (ret) @@ -1808,9 +1809,14 @@ static int futex_wait(u32 __user *uaddr, int fshared, goto out_put_key; /* - * We expect signal_pending(current), but another thread may - * have handled it for us already. + * We expect signal_pending(current), but we might be the + * victim of a spurious wakeup as well.
*/ + if (!signal_pending(current)) { + put_futex_key(fshared, &q.key); + goto retry; + } + ret = -ERESTARTSYS; if (!abs_time) goto out_put_key; @@ -2118,9 +2124,11 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, */ plist_del(&q->list, &q->list.plist); + /* Handle spurious wakeups gracefully */ + ret = -EAGAIN; if (timeout && !timeout->task) ret = -ETIMEDOUT; - else + else if (signal_pending(current)) ret = -ERESTARTNOINTR; } return ret; @@ -2198,6 +2206,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, debug_rt_mutex_init_waiter(&rt_waiter); rt_waiter.task = NULL; +retry: key2 = FUTEX_KEY_INIT; ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE); if (unlikely(ret != 0)) @@ -2292,6 +2301,9 @@ out_put_keys: out_key2: put_futex_key(fshared, &key2); + /* Spurious wakeup ? */ + if (ret == -EAGAIN) + goto retry; out: if (to) { hrtimer_cancel(&to->timer); -- cgit v1.1 From 03541f8b69c058162e4cf9675ec9181e6a204d55 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 14 Oct 2009 16:58:03 +1100 Subject: perf_event: Adjust frequency and unthrottle for non-group-leader events The loop in perf_ctx_adjust_freq checks the frequency of sampling event counters, adjusts the event interval, unthrottles the event if required, and resets the event's interrupt count. However, at present it only looks at group leaders. This means that a sampling event that is not a group leader will eventually get throttled, once its interrupt count reaches sysctl_perf_event_sample_rate/HZ --- and that is guaranteed to happen, if the event is active for long enough, since the interrupt count never gets reset. Once it is throttled it never gets unthrottled, so it basically just stops working at that point. This fixes it by making perf_ctx_adjust_freq use ctx->event_list rather than ctx->group_list. The existing spin_lock/spin_unlock around the loop makes it unnecessary to put rcu_read_lock/ rcu_read_unlock around the list_for_each_entry_rcu(). Reported-by: Mark W. Krentel Signed-off-by: Paul Mackerras Cc: Corey Ashford Cc: Peter Zijlstra LKML-Reference: <19157.26731.855609.165622@cargo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 9d0b5c6..afb7ef3d 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -1355,7 +1355,7 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx) u64 interrupts, freq; spin_lock(&ctx->lock); - list_for_each_entry(event, &ctx->group_list, group_entry) { + list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { if (event->state != PERF_EVENT_STATE_ACTIVE) continue; -- cgit v1.1 From 92f6a5e37a2e2d3342dafb2b39c2f8bc340bbf84 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 9 Oct 2009 12:43:07 +0200 Subject: sched: Do less aggressive buddy clearing Yanmin reported a hackbench regression due to: > commit de69a80be32445b0a71e8e3b757e584d7beb90f7 > Author: Peter Zijlstra > Date: Thu Sep 17 09:01:20 2009 +0200 > > sched: Stop buddies from hogging the system I really liked de69a80b, and it affecting hackbench shows I wasn't crazy ;-) So hackbench is a multi-cast, with one sender spraying multiple receivers, who in their turn don't spray back. This would be exactly the scenario that patch 'cures'.
Previously we would not clear the last buddy after running the next task, allowing the sender to get back to work sooner than it otherwise ought to have, increasing latencies for other tasks. Now, since those receivers don't poke back, they don't enforce the buddy relation, which means there's nothing to re-elect the sender. Cure this by less aggressively clearing the buddy stats. Only clear buddies when they were not chosen. It should still avoid a buddy sticking around long after it has served its time. Reported-by: "Zhang, Yanmin" Signed-off-by: Peter Zijlstra CC: Mike Galbraith LKML-Reference: <1255084986.8802.46.camel@laptop> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 4e777b4..c32c3e6 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -861,12 +861,21 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) { struct sched_entity *se = __pick_next_entity(cfs_rq); + struct sched_entity *buddy; - if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, se) < 1) - return cfs_rq->next; + if (cfs_rq->next) { + buddy = cfs_rq->next; + cfs_rq->next = NULL; + if (wakeup_preempt_entity(buddy, se) < 1) + return buddy; + } - if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, se) < 1) - return cfs_rq->last; + if (cfs_rq->last) { + buddy = cfs_rq->last; + cfs_rq->last = NULL; + if (wakeup_preempt_entity(buddy, se) < 1) + return buddy; + } return se; } @@ -1654,16 +1663,6 @@ static struct task_struct *pick_next_task_fair(struct rq *rq) do { se = pick_next_entity(cfs_rq); - /* - * If se was a buddy, clear it so that it will have to earn - * the favour again. - * - * If se was not a buddy, clear the buddies because neither - * was elegible to run, let them earn it again. - * - * IOW. unconditionally clear buddies. - */ - __clear_buddies(cfs_rq, NULL); set_next_entity(cfs_rq, se); cfs_rq = group_cfs_rq(se); } while (cfs_rq); -- cgit v1.1 From 43046b606673c9c991919ff75b980b72541e9ede Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 14 Oct 2009 09:16:42 -0700 Subject: workqueue: add 'flush_delayed_work()' to run and wait for delayed work It basically turns a delayed work into an immediate work, and then waits for it to finish. --- kernel/workqueue.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index addfe2d..ccefe57 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -640,6 +640,24 @@ int schedule_delayed_work(struct delayed_work *dwork, EXPORT_SYMBOL(schedule_delayed_work); /** + * flush_delayed_work - block until a dwork_struct's callback has terminated + * @dwork: the delayed work which is to be flushed + * + * Any timeout is cancelled, and any pending work is run immediately.
+ */ +void flush_delayed_work(struct delayed_work *dwork) +{ + if (del_timer(&dwork->timer)) { + struct cpu_workqueue_struct *cwq; + cwq = wq_per_cpu(keventd_wq, get_cpu()); + __queue_work(cwq, &dwork->work); + put_cpu(); + } + flush_work(&dwork->work); +} +EXPORT_SYMBOL(flush_delayed_work); + +/** * schedule_delayed_work_on - queue work in global workqueue on CPU after delay * @cpu: cpu to use * @dwork: job to be done -- cgit v1.1 From 2bc872036e1c5948b5b02942810bbdd8dbdb9812 Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Wed, 14 Oct 2009 10:12:39 -0700 Subject: futex: Check for NULL keys in match_futex If userspace tries to perform a requeue_pi on a non-requeue_pi waiter, it will find the futex_q->requeue_pi_key to be NULL and OOPS. Check for NULL in match_futex() instead of doing explicit NULL pointer checks on all call sites. While match_futex(NULL, NULL) returning false is a little odd, it's still correct as we expect valid key references. Signed-off-by: Darren Hart Cc: Peter Zijlstra Cc: Ingo Molnar CC: Eric Dumazet CC: Dinakar Guniguntala CC: John Stultz Cc: stable@kernel.org LKML-Reference: <4AD60687.10306@us.ibm.com> Signed-off-by: Thomas Gleixner --- kernel/futex.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 5c88839..06938e5 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -150,7 +150,8 @@ static struct futex_hash_bucket *hash_futex(union futex_key *key) */ static inline int match_futex(union futex_key *key1, union futex_key *key2) { - return (key1->both.word == key2->both.word + return (key1 && key2 + && key1->both.word == key2->both.word && key1->both.ptr == key2->both.ptr && key1->both.offset == key2->both.offset); } -- cgit v1.1 From 8c53e46314562fe814b0afef6cfcbd2f562b017c Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 14 Oct 2009 09:16:42 -0700 Subject: workqueue: add 'flush_delayed_work()' to run and wait for delayed work It basically turns a delayed work into an immediate work, and then waits for it to finish, thus allowing you to force (and wait for) an immediate flush of a delayed work. We'll want to use this in the tty layer to clean up tty_flush_to_ldisc(). Acked-by: Oleg Nesterov [ Fixed to use 'del_timer_sync()' as noted by Oleg ] Signed-off-by: Linus Torvalds --- kernel/workqueue.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index addfe2d..47cdd7e 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -640,6 +640,24 @@ int schedule_delayed_work(struct delayed_work *dwork, EXPORT_SYMBOL(schedule_delayed_work); /** + * flush_delayed_work - block until a dwork_struct's callback has terminated + * @dwork: the delayed work which is to be flushed + * + * Any timeout is cancelled, and any pending work is run immediately. + */ +void flush_delayed_work(struct delayed_work *dwork) +{ + if (del_timer_sync(&dwork->timer)) { + struct cpu_workqueue_struct *cwq; + cwq = wq_per_cpu(keventd_wq, get_cpu()); + __queue_work(cwq, &dwork->work); + put_cpu(); + } + flush_work(&dwork->work); +} +EXPORT_SYMBOL(flush_delayed_work); + +/** * schedule_delayed_work_on - queue work in global workqueue on CPU after delay * @cpu: cpu to use * @dwork: job to be done -- cgit v1.1 From 37c72e56f6b234ea7387ba530434a80abf2658d8 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Wed, 14 Oct 2009 10:15:55 -0700 Subject: rcu: Prevent RCU IPI storms in presence of high call_rcu() load As the number of callbacks on a given CPU rises, invoke force_quiescent_state() only every blimit number of callbacks (defaults to 10,000), and even then only if no other CPU has invoked force_quiescent_state() in the meantime. This should fix the performance regression reported by Nick. Reported-by: Nick Piggin Signed-off-by: Paul E. McKenney Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: mathieu.desnoyers@polymtl.ca Cc: josh@joshtriplett.org Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org Cc: Valdis.Kletnieks@vt.edu Cc: dhowells@redhat.com Cc: jens.axboe@oracle.com LKML-Reference: <12555405592133-git-send-email-> Signed-off-by: Ingo Molnar --- kernel/rcutree.c | 29 ++++++++++++++++++++++++----- kernel/rcutree.h | 4 ++++ 2 files changed, 28 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 705f02a..ddbf111 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -958,7 +958,7 @@ static void rcu_offline_cpu(int cpu) * Invoke any RCU callbacks that have made it to the end of their grace * period. Thottle as specified by rdp->blimit. */ -static void rcu_do_batch(struct rcu_data *rdp) +static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) { unsigned long flags; struct rcu_head *next, *list, **tail; @@ -1011,6 +1011,13 @@ static void rcu_do_batch(struct rcu_data *rdp) if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark) rdp->blimit = blimit; + /* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. */ + if (rdp->qlen == 0 && rdp->qlen_last_fqs_check != 0) { + rdp->qlen_last_fqs_check = 0; + rdp->n_force_qs_snap = rsp->n_force_qs; + } else if (rdp->qlen < rdp->qlen_last_fqs_check - qhimark) + rdp->qlen_last_fqs_check = rdp->qlen; + local_irq_restore(flags); /* Re-raise the RCU softirq if there are callbacks remaining. */ @@ -1224,7 +1231,7 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) } /* If there are callbacks ready, invoke them. */ - rcu_do_batch(rdp); + rcu_do_batch(rsp, rdp); } /* @@ -1288,10 +1295,20 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), rcu_start_gp(rsp, nestflag); /* releases rnp_root->lock. */ } - /* Force the grace period if too many callbacks or too long waiting. */ - if (unlikely(++rdp->qlen > qhimark)) { + /* + * Force the grace period if too many callbacks or too long waiting. + * Enforce hysteresis, and don't invoke force_quiescent_state() + * if some other CPU has recently done so. Also, don't bother + * invoking force_quiescent_state() if the newly enqueued callback + * is the only one waiting for a grace period to complete. + */ + if (unlikely(++rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) { rdp->blimit = LONG_MAX; - force_quiescent_state(rsp, 0); + if (rsp->n_force_qs == rdp->n_force_qs_snap && + *rdp->nxttail[RCU_DONE_TAIL] != head) + force_quiescent_state(rsp, 0); + rdp->n_force_qs_snap = rsp->n_force_qs; + rdp->qlen_last_fqs_check = rdp->qlen; } else if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0) force_quiescent_state(rsp, 1); local_irq_restore(flags); @@ -1523,6 +1540,8 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable) rdp->beenonline = 1; /* We have now been online. 
*/ rdp->preemptable = preemptable; rdp->passed_quiesc_completed = lastcomp - 1; + rdp->qlen_last_fqs_check = 0; + rdp->n_force_qs_snap = rsp->n_force_qs; rdp->blimit = blimit; spin_unlock(&rnp->lock); /* irqs remain disabled. */ diff --git a/kernel/rcutree.h b/kernel/rcutree.h index b40ac57..599161f 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -167,6 +167,10 @@ struct rcu_data { struct rcu_head *nxtlist; struct rcu_head **nxttail[RCU_NEXT_SIZE]; long qlen; /* # of queued callbacks */ + long qlen_last_fqs_check; + /* qlen at last check for QS forcing */ + unsigned long n_force_qs_snap; + /* did other CPU force QS recently? */ long blimit; /* Upper limit on a processed batch */ #ifdef CONFIG_NO_HZ -- cgit v1.1 From 019129d595caaa5bd0b41d128308da1be6a91869 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 14 Oct 2009 10:15:56 -0700 Subject: rcu: Stopgap fix for synchronize_rcu_expedited() for TREE_PREEMPT_RCU For the short term, map synchronize_rcu_expedited() to synchronize_rcu() for TREE_PREEMPT_RCU and to synchronize_sched_expedited() for TREE_RCU. Longer term, there needs to be a real expedited grace period for TREE_PREEMPT_RCU, but candidate patches to date are considerably more complex and intrusive. Signed-off-by: Paul E. McKenney Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: mathieu.desnoyers@polymtl.ca Cc: josh@joshtriplett.org Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org Cc: Valdis.Kletnieks@vt.edu Cc: dhowells@redhat.com Cc: npiggin@suse.de Cc: jens.axboe@oracle.com LKML-Reference: <12555405592331-git-send-email-> Signed-off-by: Ingo Molnar --- kernel/rcutree_plugin.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'kernel') diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index c0cb783..ebd20ee 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -393,6 +393,17 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) EXPORT_SYMBOL_GPL(call_rcu); /* + * Wait for an rcu-preempt grace period. We are supposed to expedite the + * grace period, but this is the crude slow compatability hack, so just + * invoke synchronize_rcu(). + */ +void synchronize_rcu_expedited(void) +{ + synchronize_rcu(); +} +EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); + +/* * Check to see if there is any immediate preemptable-RCU-related work * to be done. */ @@ -565,6 +576,16 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) EXPORT_SYMBOL_GPL(call_rcu); /* + * Wait for an rcu-preempt grace period, but make it happen quickly. + * But because preemptable RCU does not exist, map to rcu-sched. + */ +void synchronize_rcu_expedited(void) +{ + synchronize_sched_expedited(); +} +EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); + +/* * Because preemptable RCU does not exist, it never has any work to do. */ static int rcu_preempt_pending(int cpu) -- cgit v1.1 From 237c80c5c8fb7ec128cf2a756b550dc41ad7eac7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 15 Oct 2009 09:26:14 -0700 Subject: rcu: Fix TREE_PREEMPT_RCU CPU_HOTPLUG bad-luck hang If the following sequence of events occurs, then TREE_PREEMPT_RCU will hang waiting for a grace period to complete, eventually OOMing the system: o A TREE_PREEMPT_RCU build of the kernel is booted on a system with more than 64 physical CPUs present (32 on a 32-bit system). 
Alternatively, a TREE_PREEMPT_RCU build of the kernel is booted with RCU_FANOUT set to a sufficiently small value that the physical CPUs populate two or more leaf rcu_node structures. o A task is preempted in an RCU read-side critical section while running on a CPU corresponding to a given leaf rcu_node structure. o All CPUs corresponding to this same leaf rcu_node structure record quiescent states for the current grace period. o All of these same CPUs go offline (hence the need for enough physical CPUs to populate more than one leaf rcu_node structure). This causes the preempted task to be moved to the root rcu_node structure. At this point, there is nothing left to cause the quiescent state to be propagated up the rcu_node tree, so the current grace period never completes. The simplest fix, especially after considering the deadlock possibilities, is to detect this situation when the last CPU is offlined, and to set that CPU's ->qsmask bit in its leaf rcu_node structure. This will cause the next invocation of force_quiescent_state() to end the grace period. Without this fix, this hang can be triggered in an hour or so on some machines with rcutorture and random CPU onlining/offlining. With this fix, these same machines pass a full 10 hours of this sort of abuse. Signed-off-by: Paul E. McKenney Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: mathieu.desnoyers@polymtl.ca Cc: josh@joshtriplett.org Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org Cc: Valdis.Kletnieks@vt.edu Cc: dhowells@redhat.com LKML-Reference: <20091015162614.GA19131@linux.vnet.ibm.com> Signed-off-by: Ingo Molnar --- kernel/rcutree.c | 15 ++++++++++++++- kernel/rcutree.h | 6 +++--- kernel/rcutree_plugin.h | 25 +++++++++++++++++-------- 3 files changed, 34 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index ddbf111..0536125 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -913,7 +913,20 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) spin_unlock(&rnp->lock); /* irqs remain disabled. */ break; } - rcu_preempt_offline_tasks(rsp, rnp, rdp); + + /* + * If there was a task blocking the current grace period, + * and if all CPUs have checked in, we need to propagate + * the quiescent state up the rcu_node hierarchy. But that + * is inconvenient at the moment due to deadlock issues if + * this should end the current grace period. So set the + * offlined CPU's bit in ->qsmask in order to force the + * next force_quiescent_state() invocation to clean up this + * mess in a deadlock-free manner. + */ + if (rcu_preempt_offline_tasks(rsp, rnp, rdp) && !rnp->qsmask) + rnp->qsmask |= mask; + mask = rnp->grpmask; spin_unlock(&rnp->lock); /* irqs remain disabled. 
*/ rnp = rnp->parent; diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 599161f..1823c6e 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -306,9 +306,9 @@ static void rcu_print_task_stall(struct rcu_node *rnp); #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); #ifdef CONFIG_HOTPLUG_CPU -static void rcu_preempt_offline_tasks(struct rcu_state *rsp, - struct rcu_node *rnp, - struct rcu_data *rdp); +static int rcu_preempt_offline_tasks(struct rcu_state *rsp, + struct rcu_node *rnp, + struct rcu_data *rdp); static void rcu_preempt_offline_cpu(int cpu); #endif /* #ifdef CONFIG_HOTPLUG_CPU */ static void rcu_preempt_check_callbacks(int cpu); diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index ebd20ee..ef2a58c 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -304,21 +304,25 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) * parent is to remove the need for rcu_read_unlock_special() to * make more than two attempts to acquire the target rcu_node's lock. * + * Returns 1 if there was previously a task blocking the current grace + * period on the specified rcu_node structure. + * * The caller must hold rnp->lock with irqs disabled. */ -static void rcu_preempt_offline_tasks(struct rcu_state *rsp, - struct rcu_node *rnp, - struct rcu_data *rdp) +static int rcu_preempt_offline_tasks(struct rcu_state *rsp, + struct rcu_node *rnp, + struct rcu_data *rdp) { int i; struct list_head *lp; struct list_head *lp_root; + int retval = rcu_preempted_readers(rnp); struct rcu_node *rnp_root = rcu_get_root(rsp); struct task_struct *tp; if (rnp == rnp_root) { WARN_ONCE(1, "Last CPU thought to be offlined?"); - return; /* Shouldn't happen: at least one CPU online. */ + return 0; /* Shouldn't happen: at least one CPU online. */ } WARN_ON_ONCE(rnp != rdp->mynode && (!list_empty(&rnp->blocked_tasks[0]) || @@ -342,6 +346,8 @@ static void rcu_preempt_offline_tasks(struct rcu_state *rsp, spin_unlock(&rnp_root->lock); /* irqs remain disabled */ } } + + return retval; } /* @@ -532,12 +538,15 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) /* * Because preemptable RCU does not exist, it never needs to migrate - * tasks that were blocked within RCU read-side critical sections. + * tasks that were blocked within RCU read-side critical sections, and + * such non-existent tasks cannot possibly have been blocking the current + * grace period. */ -static void rcu_preempt_offline_tasks(struct rcu_state *rsp, - struct rcu_node *rnp, - struct rcu_data *rdp) +static int rcu_preempt_offline_tasks(struct rcu_state *rsp, + struct rcu_node *rnp, + struct rcu_data *rdp) { + return 0; } /* -- cgit v1.1 From 89061d3d58e1f0742139605dc6a7950aa1ecc019 Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Thu, 15 Oct 2009 15:30:48 -0700 Subject: futex: Move drop_futex_key_refs out of spinlock'ed region When requeuing tasks from one futex to another, the reference held by the requeued task to the original futex location needs to be dropped eventually. Dropping the reference may ultimately lead to a call to "iput_final" and subsequently call into filesystem- specific code - which may be non-atomic. It is therefore safer to defer this drop operation until after the futex_hash_bucket spinlock has been dropped. 
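This detach-then-release shape recurs throughout lock-based code. Below is a self-contained userspace sketch assuming pthreads; object_put() is an invented stand-in for drop_futex_key_refs(), and the mutex models the hash-bucket spinlock (a mutex tolerates blocking while held, a spinlock does not, which is exactly why the kernel defers the put):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct obj { int refs; };

/* models drop_futex_key_refs(): the final put may call into
 * filesystem code, so it must not run under a spinlock */
static void object_put(struct obj *o)
{
    if (--o->refs == 0) {
        puts("final put (may block)");
        free(o);
    }
}

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

static void requeue(struct obj **slot, struct obj *replacement)
{
    struct obj *old;

    pthread_mutex_lock(&lock);
    old = *slot;            /* detach under the lock ... */
    *slot = replacement;
    pthread_mutex_unlock(&lock);

    object_put(old);        /* ... release after dropping it */
}

int main(void)
{
    struct obj *a = calloc(1, sizeof(*a));
    struct obj *b = calloc(1, sizeof(*b));
    struct obj *slot = a;

    if (!a || !b)
        return 1;
    a->refs = b->refs = 1;
    requeue(&slot, b);      /* frees a outside the critical section */
    object_put(slot);       /* drops b */
    return 0;
}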
Originally-From: Helge Bahmann Signed-off-by: Darren Hart Cc: Cc: Peter Zijlstra Cc: Eric Dumazet Cc: Dinakar Guniguntala Cc: John Stultz Cc: Sven-Thorsten Dietrich Cc: John Kacur LKML-Reference: <4AD7A298.5040802@us.ibm.com> Signed-off-by: Ingo Molnar --- kernel/futex.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 06938e5..642f3bb 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1029,7 +1029,6 @@ static inline void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, struct futex_hash_bucket *hb) { - drop_futex_key_refs(&q->key); get_futex_key_refs(key); q->key = *key; @@ -1227,6 +1226,7 @@ retry_private: */ if (ret == 1) { WARN_ON(pi_state); + drop_count++; task_count++; ret = get_futex_value_locked(&curval2, uaddr2); if (!ret) @@ -1305,6 +1305,7 @@ retry_private: if (ret == 1) { /* We got the lock. */ requeue_pi_wake_futex(this, &key2, hb2); + drop_count++; continue; } else if (ret) { /* -EDEADLK */ -- cgit v1.1 From 65a64464349883891e21e74af16c05d6e1eeb4e9 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 14 Oct 2009 06:22:47 +0200 Subject: HWPOISON: Allow schedule_on_each_cpu() from keventd Right now when calling schedule_on_each_cpu() from keventd there is a deadlock because it tries to schedule a work item on the current CPU too. This happens via lru_add_drain_all() in hwpoison. Just call the function for the current CPU in this case. This is actually faster too. Debugging with Fengguang Wu & Max Asbock Signed-off-by: Andi Kleen --- kernel/workqueue.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index addfe2d..f61a2fe 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -667,21 +667,38 @@ EXPORT_SYMBOL(schedule_delayed_work_on); int schedule_on_each_cpu(work_func_t func) { int cpu; + int orig = -1; struct work_struct *works; works = alloc_percpu(struct work_struct); if (!works) return -ENOMEM; + /* + * when running in keventd don't schedule a work item on itself. + * Can just call directly because the work queue is already bound. + * This also is faster. + * Make this a generic parameter for other workqueues? + */ + if (current_is_keventd()) { + orig = raw_smp_processor_id(); + INIT_WORK(per_cpu_ptr(works, orig), func); + func(per_cpu_ptr(works, orig)); + } + get_online_cpus(); for_each_online_cpu(cpu) { struct work_struct *work = per_cpu_ptr(works, cpu); + if (cpu == orig) + continue; INIT_WORK(work, func); schedule_work_on(cpu, work); } - for_each_online_cpu(cpu) - flush_work(per_cpu_ptr(works, cpu)); + for_each_online_cpu(cpu) { + if (cpu != orig) + flush_work(per_cpu_ptr(works, cpu)); + } put_online_cpus(); free_percpu(works); return 0; -- cgit v1.1 From 04bf7539c08d64184736cdc5e4ad617eda77eb0f Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 20 Oct 2009 06:45:02 +0200 Subject: PM: Make warning in suspend_test_finish() less likely to happen Increase TEST_SUSPEND_SECONDS to 10 so the warning in suspend_test_finish() doesn't annoy the users of slower systems so much. Also, make the warning print the suspend-resume cycle time, so that we know why the warning actually triggered. Patch prepared during the hacking session at the Kernel Summit in Tokyo. Signed-off-by: Rafael J. 
Wysocki Signed-off-by: Linus Torvalds --- kernel/power/suspend_test.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c index 17d8bb1a..25596e4 100644 --- a/kernel/power/suspend_test.c +++ b/kernel/power/suspend_test.c @@ -19,7 +19,7 @@ * The time it takes is system-specific though, so when we test this * during system bootup we allow a LOT of time. */ -#define TEST_SUSPEND_SECONDS 5 +#define TEST_SUSPEND_SECONDS 10 static unsigned long suspend_test_start_time; @@ -49,7 +49,8 @@ void suspend_test_finish(const char *label) * has some performance issues. The stack dump of a WARN_ON * is more likely to get the right attention than a printk... */ - WARN(msec > (TEST_SUSPEND_SECONDS * 1000), "Component: %s\n", label); + WARN(msec > (TEST_SUSPEND_SECONDS * 1000), + "Component: %s, time: %u\n", label, msec); } /* -- cgit v1.1 From 721a669b7225edeeb0ca8e2bf71b83882326a71b Mon Sep 17 00:00:00 2001 From: Soeren Sandmann Date: Tue, 15 Sep 2009 14:33:08 +0200 Subject: perf events: Fix swevent hrtimer sampling by keeping track of remaining time when enabling/disabling swevent hrtimers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make the hrtimer based events work for sysprof. Whenever a swevent is scheduled out, the hrtimer is canceled. When it is scheduled back in, the timer is restarted. This happens every scheduler tick, which means the timer never expired because it was getting repeatedly restarted over and over with the same period. To fix that, save the remaining time when disabling; when reenabling, use that saved time as the period instead of the user-specified sampling period. Also, move the starting and stopping of the hrtimers to helper functions instead of duplicating the code. 
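A minimal userspace model of that save/restore bookkeeping, with the hrtimer itself abstracted away; struct swevent and its helpers are illustrative stand-ins whose fields loosely mirror the patch (sample_period, remaining), and the 10000 ns floor copies the patch's minimum period:

#include <stdint.h>
#include <stdio.h>

struct swevent {
    uint64_t sample_period;  /* user-requested period (ns) */
    int64_t remaining;       /* time left when last disabled */
    uint64_t expires_in;     /* stand-in for the armed hrtimer */
};

static void swevent_start(struct swevent *e)
{
    uint64_t period;

    if (e->remaining) {
        /* resume the interrupted period instead of restarting it */
        period = e->remaining < 0 ? 10000 : (uint64_t)e->remaining;
        e->remaining = 0;
    } else {
        period = e->sample_period > 10000 ? e->sample_period : 10000;
    }
    e->expires_in = period;
}

static void swevent_stop(struct swevent *e, uint64_t ran_for)
{
    /* save what was left so the next start does not reset to a
     * full period on every scheduler tick */
    e->remaining = (int64_t)(e->expires_in - ran_for);
}

int main(void)
{
    struct swevent e = { .sample_period = 1000000 };

    swevent_start(&e);          /* armed for 1000000 ns */
    swevent_stop(&e, 400000);   /* scheduled out early */
    swevent_start(&e);          /* re-armed for the remainder */
    printf("re-armed for %llu ns\n", (unsigned long long)e.expires_in);
    return 0;
}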
Signed-off-by: Søren Sandmann Pedersen LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 61 +++++++++++++++++++++++++++++++++++------------------ 1 file changed, 41 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index afb7ef3d..33ff019 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -3969,6 +3969,42 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) return ret; } +static void perf_swevent_start_hrtimer(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hwc->hrtimer.function = perf_swevent_hrtimer; + if (hwc->sample_period) { + u64 period; + + if (hwc->remaining) { + if (hwc->remaining < 0) + period = 10000; + else + period = hwc->remaining; + hwc->remaining = 0; + } else { + period = max_t(u64, 10000, hwc->sample_period); + } + __hrtimer_start_range_ns(&hwc->hrtimer, + ns_to_ktime(period), 0, + HRTIMER_MODE_REL, 0); + } +} + +static void perf_swevent_cancel_hrtimer(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (hwc->sample_period) { + ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer); + hwc->remaining = ktime_to_ns(remaining); + + hrtimer_cancel(&hwc->hrtimer); + } +} + /* * Software event: cpu wall time clock */ @@ -3991,22 +4027,14 @@ static int cpu_clock_perf_event_enable(struct perf_event *event) int cpu = raw_smp_processor_id(); atomic64_set(&hwc->prev_count, cpu_clock(cpu)); - hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - hwc->hrtimer.function = perf_swevent_hrtimer; - if (hwc->sample_period) { - u64 period = max_t(u64, 10000, hwc->sample_period); - __hrtimer_start_range_ns(&hwc->hrtimer, - ns_to_ktime(period), 0, - HRTIMER_MODE_REL, 0); - } + perf_swevent_start_hrtimer(event); return 0; } static void cpu_clock_perf_event_disable(struct perf_event *event) { - if (event->hw.sample_period) - hrtimer_cancel(&event->hw.hrtimer); + perf_swevent_cancel_hrtimer(event); cpu_clock_perf_event_update(event); } @@ -4043,22 +4071,15 @@ static int task_clock_perf_event_enable(struct perf_event *event) now = event->ctx->time; atomic64_set(&hwc->prev_count, now); - hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - hwc->hrtimer.function = perf_swevent_hrtimer; - if (hwc->sample_period) { - u64 period = max_t(u64, 10000, hwc->sample_period); - __hrtimer_start_range_ns(&hwc->hrtimer, - ns_to_ktime(period), 0, - HRTIMER_MODE_REL, 0); - } + + perf_swevent_start_hrtimer(event); return 0; } static void task_clock_perf_event_disable(struct perf_event *event) { - if (event->hw.sample_period) - hrtimer_cancel(&event->hw.hrtimer); + perf_swevent_cancel_hrtimer(event); task_clock_perf_event_update(event, event->ctx->time); } -- cgit v1.1 From 54f4407608c59712a8f5ec1e10dfac40bef5a2e7 Mon Sep 17 00:00:00 2001 From: Soeren Sandmann Date: Thu, 22 Oct 2009 18:34:08 +0200 Subject: perf events: Don't generate events for the idle task when exclude_idle is set MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Getting samples for the idle task is often not interesting, so don't generate them when exclude_idle is set for the event in question. 
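The guard is small enough to model directly. In the kernel the idle task is identified by current->pid == 0; everything else in this sketch (struct attrs, should_sample()) is an invented stand-in:

#include <stdio.h>

struct attrs {
    unsigned int exclude_idle : 1;
};

static int should_sample(const struct attrs *attr, int current_pid)
{
    if (attr->exclude_idle && current_pid == 0)
        return 0;   /* suppress samples for the idle task */
    return 1;       /* emit the sample */
}

int main(void)
{
    struct attrs a = { .exclude_idle = 1 };

    printf("pid 0: %d\n", should_sample(&a, 0));        /* 0: skipped */
    printf("pid 4242: %d\n", should_sample(&a, 4242));  /* 1: sampled */
    return 0;
}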
Signed-off-by: Søren Sandmann Pedersen Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: Steven Rostedt LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 33ff019..7f29643 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -3959,8 +3959,9 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) regs = task_pt_regs(current); if (regs) { - if (perf_event_overflow(event, 0, &data, regs)) - ret = HRTIMER_NORESTART; + if (!(event->attr.exclude_idle && current->pid == 0)) + if (perf_event_overflow(event, 0, &data, regs)) + ret = HRTIMER_NORESTART; } period = max_t(u64, 10000, event->hw.sample_period); -- cgit v1.1 From f685ceacab07d3f6c236f04803e2f2f0dbcc5afb Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Fri, 23 Oct 2009 23:09:22 +0200 Subject: sched: Strengthen buddies and mitigate buddy induced latencies This patch restores the effectiveness of LAST_BUDDY in preventing pgsql+oltp from collapsing due to wakeup preemption. It also switches LAST_BUDDY to exclusively do what it does best, namely mitigate the effects of aggressive wakeup preemption, which improves vmark throughput markedly, and restores mysql+oltp scalability. Since buddies are about scalability, enable them beginning at the point where we begin expanding sched_latency, namely sched_nr_latency. Previously, buddies were cleared aggressively, which seriously reduced their effectiveness. Not clearing aggressively however, produces a small drop in mysql+oltp throughput immediately after peak, indicating that LAST_BUDDY is actually doing some harm. This is right at the point where X on the desktop in competition with another load wants low latency service. Ergo, do not enable until we need to scale. To mitigate latency induced by buddies, or by a task just missing wakeup preemption, check latency at tick time. Last hunk prevents buddies from stymieing BALANCE_NEWIDLE via CACHE_HOT_BUDDY. Supporting performance tests: tip = v2.6.32-rc5-1497-ga525b32 tipx = NO_GENTLE_FAIR_SLEEPERS NEXT_BUDDY granularity knobs = 31 knobs + 31 buddies tip+x = NO_GENTLE_FAIR_SLEEPERS granularity knobs = 31 knobs (Three run averages except where noted.) vmark: ------ tip 108466 messages per second tip+ 125307 messages per second tip+x 125335 messages per second tipx 117781 messages per second 2.6.31.3 122729 messages per second mysql+oltp: ----------- clients 1 2 4 8 16 32 64 128 256 .......................................................................................... tip 9949.89 18690.20 34801.24 34460.04 32682.88 30765.97 28305.27 25059.64 19548.08 tip+ 10013.90 18526.84 34900.38 34420.14 33069.83 32083.40 30578.30 28010.71 25605.47 tipx 9698.71 18002.70 34477.56 33420.01 32634.30 31657.27 29932.67 26827.52 21487.18 2.6.31.3 8243.11 18784.20 34404.83 33148.38 31900.32 31161.90 29663.81 25995.94 18058.86 pgsql+oltp: ----------- clients 1 2 4 8 16 32 64 128 256 .......................................................................................... 
tip 13686.37 26609.25 51934.28 51347.81 49479.51 45312.65 36691.91 26851.57 24145.35 tip+ (1x) 13907.85 27135.87 52951.98 52514.04 51742.52 50705.43 49947.97 48374.19 46227.94 tip+x 13906.78 27065.81 52951.19 52542.59 52176.11 51815.94 50838.90 49439.46 46891.00 tipx 13742.46 26769.81 52351.99 51891.73 51320.79 50938.98 50248.65 48908.70 46553.84 2.6.31.3 13815.35 26906.46 52683.34 52061.31 51937.10 51376.80 50474.28 49394.47 47003.25 Signed-off-by: Mike Galbraith Cc: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- kernel/sched_fair.c | 73 ++++++++++++++++++++++++++++++++++------------------- 2 files changed, 48 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 789001d..cae6700 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2008,7 +2008,7 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) /* * Buddy candidates are cache hot: */ - if (sched_feat(CACHE_HOT_BUDDY) && + if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running && (&p->se == cfs_rq_of(&p->se)->next || &p->se == cfs_rq_of(&p->se)->last)) return 1; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index c32c3e6..37087a7 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -822,6 +822,26 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) * re-elected due to buddy favours. */ clear_buddies(cfs_rq, curr); + return; + } + + /* + * Ensure that a task that missed wakeup preemption by a + * narrow margin doesn't have to wait for a full slice. + * This also mitigates buddy induced latencies under load. + */ + if (!sched_feat(WAKEUP_PREEMPT)) + return; + + if (delta_exec < sysctl_sched_min_granularity) + return; + + if (cfs_rq->nr_running > 1) { + struct sched_entity *se = __pick_next_entity(cfs_rq); + s64 delta = curr->vruntime - se->vruntime; + + if (delta > ideal_runtime) + resched_task(rq_of(cfs_rq)->curr); } } @@ -861,21 +881,18 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) { struct sched_entity *se = __pick_next_entity(cfs_rq); - struct sched_entity *buddy; + struct sched_entity *left = se; - if (cfs_rq->next) { - buddy = cfs_rq->next; - cfs_rq->next = NULL; - if (wakeup_preempt_entity(buddy, se) < 1) - return buddy; - } + if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) + se = cfs_rq->next; - if (cfs_rq->last) { - buddy = cfs_rq->last; - cfs_rq->last = NULL; - if (wakeup_preempt_entity(buddy, se) < 1) - return buddy; - } + /* + * Prefer last buddy, try to return the CPU to a preempted task. + */ + if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) + se = cfs_rq->last; + + clear_buddies(cfs_rq, se); return se; } @@ -1577,6 +1594,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ struct sched_entity *se = &curr->se, *pse = &p->se; struct cfs_rq *cfs_rq = task_cfs_rq(curr); int sync = wake_flags & WF_SYNC; + int scale = cfs_rq->nr_running >= sched_nr_latency; update_curr(cfs_rq); @@ -1591,18 +1609,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ if (unlikely(se == pse)) return; - /* - * Only set the backward buddy when the current task is still on the - * rq. This can happen when a wakeup gets interleaved with schedule on - * the ->pre_schedule() or idle_balance() point, either of which can - * drop the rq lock. 
- * - * Also, during early boot the idle thread is in the fair class, for - * obvious reasons its a bad idea to schedule back to the idle thread. - */ - if (sched_feat(LAST_BUDDY) && likely(se->on_rq && curr != rq->idle)) - set_last_buddy(se); - if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK)) + if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) set_next_buddy(pse); /* @@ -1648,8 +1655,22 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ BUG_ON(!pse); - if (wakeup_preempt_entity(se, pse) == 1) + if (wakeup_preempt_entity(se, pse) == 1) { resched_task(curr); + /* + * Only set the backward buddy when the current task is still + * on the rq. This can happen when a wakeup gets interleaved + * with schedule on the ->pre_schedule() or idle_balance() + * point, either of which can * drop the rq lock. + * + * Also, during early boot the idle thread is in the fair class, + * for obvious reasons its a bad idea to schedule back to it. + */ + if (unlikely(!se->on_rq || curr == rq->idle)) + return; + if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se)) + set_last_buddy(se); + } } static struct task_struct *pick_next_task_fair(struct rq *rq) -- cgit v1.1 From cf8517cf905b5cd31d5790250b9ac39f7cb8aa53 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 23 Oct 2009 19:36:16 -0400 Subject: tracing: Update *ppos instead of filp->f_pos Instead of directly updating filp->f_pos we should update the *ppos argument. The filp->f_pos gets updated within the file_pos_write() function called from sys_write(). Signed-off-by: Jiri Olsa Signed-off-by: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <20091023233646.399670810@goodmis.org> Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 2 +- kernel/trace/trace.c | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 37ba67e..9c451a1 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -740,7 +740,7 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf, out: mutex_unlock(&ftrace_profile_lock); - filp->f_pos += cnt; + *ppos += cnt; return cnt; } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c820b03..b20d3ec 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2440,7 +2440,7 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf, return ret; } - filp->f_pos += cnt; + *ppos += cnt; return cnt; } @@ -2582,7 +2582,7 @@ tracing_ctrl_write(struct file *filp, const char __user *ubuf, } mutex_unlock(&trace_types_lock); - filp->f_pos += cnt; + *ppos += cnt; return cnt; } @@ -2764,7 +2764,7 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf, if (err) return err; - filp->f_pos += ret; + *ppos += ret; return ret; } @@ -3299,7 +3299,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, } } - filp->f_pos += cnt; + *ppos += cnt; /* If check pages failed, return ENOMEM */ if (tracing_disabled) -- cgit v1.1 From 3e69533b51930a7169235db2caf703884e6e3bbb Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 23 Oct 2009 19:36:17 -0400 Subject: tracing: Fix trace_seq_printf() return value trace_seq_printf() return value is a little ambiguous. It currently returns the length of the space available in the buffer. printf usually returns the amount written. This is not adequate here, because: trace_seq_printf(s, ""); is perfectly legal, and returning 0 would indicate that it failed. 
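To see the ambiguity concretely, and the semantics the patch settles on (spelled out just below), here is a toy userspace model; seq_printf() is an invented miniature of trace_seq_printf() over a fixed-size buffer, returning 0 when the formatted text does not fit and 1 when it was stored in full:

#include <stdarg.h>
#include <stdio.h>

struct seq { char buf[32]; int len; };

static int seq_printf(struct seq *s, const char *fmt, ...)
{
    int avail = (int)sizeof(s->buf) - s->len;
    va_list ap;
    int ret;

    va_start(ap, fmt);
    ret = vsnprintf(s->buf + s->len, (size_t)avail, fmt, ap);
    va_end(ap);

    if (ret >= avail)   /* truncated: report failure ... */
        return 0;
    s->len += ret;      /* ... without committing the length */
    return 1;
}

int main(void)
{
    struct seq s = { .len = 0 };

    printf("%d\n", seq_printf(&s, "fits"));       /* 1 */
    printf("%d\n", seq_printf(&s, "%0100d", 1));  /* 0 */
    return 0;
}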
We can always see the amount written by looking at the before and after values of s->len. This is not quite the same use as printf. We only care if the string was successfully written to the buffer or not. Make trace_seq_printf() return 0 if the trace oversizes the buffer's free space, 1 otherwise. Signed-off-by: Jiri Olsa Signed-off-by: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <20091023233646.631787612@goodmis.org> Signed-off-by: Ingo Molnar --- kernel/trace/trace_output.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index ed17565..b6c12c6 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -69,6 +69,9 @@ enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter) * @s: trace sequence descriptor * @fmt: printf format string * + * It returns 0 if the trace oversizes the buffer's free + * space, 1 otherwise. + * * The tracer may use either sequence operations or its own * copy to user routines. To simplify formating of a trace * trace_seq_printf is used to store strings into a special @@ -95,7 +98,7 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...) s->len += ret; - return len; + return 1; } EXPORT_SYMBOL_GPL(trace_seq_printf); -- cgit v1.1 From 67b394f7f26d84edb7294cc6528ab7ca6daa2ad1 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 23 Oct 2009 19:36:18 -0400 Subject: tracing: Fix comment typo and documentation example Trivial patch to fix a documentation example and to fix a comment. Signed-off-by: Jiri Olsa Signed-off-by: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <20091023233646.871719877@goodmis.org> Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index d4ff019..217f699 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2681,7 +2681,7 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer) EXPORT_SYMBOL_GPL(ring_buffer_entries); /** - * ring_buffer_overrun_cpu - get the number of overruns in buffer + * ring_buffer_overruns - get the number of overruns in buffer * @buffer: The ring buffer * * Returns the total number of overruns in the ring buffer -- cgit v1.1 From 6d3f1e12f46a2f9a1bb7e7aa433df8dd31ce5647 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 23 Oct 2009 19:36:19 -0400 Subject: tracing: Remove cpu arg from the rb_time_stamp() function The cpu argument is not used inside the rb_time_stamp() function. Plus fix a typo. 
Signed-off-by: Jiri Olsa Signed-off-by: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <20091023233647.118547500@goodmis.org> Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 217f699..3ffa502 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -483,7 +483,7 @@ struct ring_buffer_iter { /* Up this if you want to test the TIME_EXTENTS and normalization */ #define DEBUG_SHIFT 0 -static inline u64 rb_time_stamp(struct ring_buffer *buffer, int cpu) +static inline u64 rb_time_stamp(struct ring_buffer *buffer) { /* shift to debug/test normalization and TIME_EXTENTS */ return buffer->clock() << DEBUG_SHIFT; @@ -494,7 +494,7 @@ u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu) u64 time; preempt_disable_notrace(); - time = rb_time_stamp(buffer, cpu); + time = rb_time_stamp(buffer); preempt_enable_no_resched_notrace(); return time; @@ -599,7 +599,7 @@ static struct list_head *rb_list_head(struct list_head *list) } /* - * rb_is_head_page - test if the give page is the head page + * rb_is_head_page - test if the given page is the head page * * Because the reader may move the head_page pointer, we can * not trust what the head page is (it may be pointing to @@ -1868,7 +1868,7 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, * Nested commits always have zero deltas, so * just reread the time stamp */ - *ts = rb_time_stamp(buffer, cpu_buffer->cpu); + *ts = rb_time_stamp(buffer); next_page->page->time_stamp = *ts; } @@ -2111,7 +2111,7 @@ rb_reserve_next_event(struct ring_buffer *buffer, if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000)) goto out_fail; - ts = rb_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu); + ts = rb_time_stamp(cpu_buffer->buffer); /* * Only the first commit can update the timestamp. -- cgit v1.1 From 4a6cc4bd32e580722882115d4c8b964d732c11e4 Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Thu, 29 Oct 2009 00:26:00 +0900 Subject: sched: move rq_weight data array out of .percpu Commit 34d76c41 introduced the percpu array update_shares_data, whose size is proportional to NR_CPUS. Unfortunately this blows up ia64 for large NR_CPUS configurations, as ia64 allows only 64k for the .percpu section. Fix this by allocating the array dynamically and keeping only a per-cpu pointer to it. The per-cpu handling doesn't impose a significant performance penalty on the potentially contended path in tg_shares_up(). ... ffffffff8104337c: 65 48 8b 14 25 20 cd mov %gs:0xcd20,%rdx ffffffff81043383: 00 00 ffffffff81043385: 48 c7 c0 00 e1 00 00 mov $0xe100,%rax ffffffff8104338c: 48 c7 45 a0 00 00 00 movq $0x0,-0x60(%rbp) ffffffff81043393: 00 ffffffff81043394: 48 c7 45 a8 00 00 00 movq $0x0,-0x58(%rbp) ffffffff8104339b: 00 ffffffff8104339c: 48 01 d0 add %rdx,%rax ffffffff8104339f: 49 8d 94 24 08 01 00 lea 0x108(%r12),%rdx ffffffff810433a6: 00 ffffffff810433a7: b9 ff ff ff ff mov $0xffffffff,%ecx ffffffff810433ac: 48 89 45 b0 mov %rax,-0x50(%rbp) ffffffff810433b0: bb 00 04 00 00 mov $0x400,%ebx ffffffff810433b5: 48 89 55 c0 mov %rdx,-0x40(%rbp) ... After: ...
ffffffff8104337c: 65 8b 04 25 28 cd 00 mov %gs:0xcd28,%eax ffffffff81043383: 00 ffffffff81043384: 48 98 cltq ffffffff81043386: 49 8d bc 24 08 01 00 lea 0x108(%r12),%rdi ffffffff8104338d: 00 ffffffff8104338e: 48 8b 15 d3 7f 76 00 mov 0x767fd3(%rip),%rdx # ffffffff817ab368 ffffffff81043395: 48 8b 34 c5 00 ee 6d mov -0x7e921200(,%rax,8),%rsi ffffffff8104339c: 81 ffffffff8104339d: 48 c7 45 a0 00 00 00 movq $0x0,-0x60(%rbp) ffffffff810433a4: 00 ffffffff810433a5: b9 ff ff ff ff mov $0xffffffff,%ecx ffffffff810433aa: 48 89 7d c0 mov %rdi,-0x40(%rbp) ffffffff810433ae: 48 c7 45 a8 00 00 00 movq $0x0,-0x58(%rbp) ffffffff810433b5: 00 ffffffff810433b6: bb 00 04 00 00 mov $0x400,%ebx ffffffff810433bb: 48 01 f2 add %rsi,%rdx ffffffff810433be: 48 89 55 b0 mov %rdx,-0x50(%rbp) ... Signed-off-by: Jiri Kosina Acked-by: Ingo Molnar Signed-off-by: Tejun Heo --- kernel/sched.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index ee61f45..526d237 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1563,11 +1563,7 @@ static unsigned long cpu_avg_load_per_task(int cpu) #ifdef CONFIG_FAIR_GROUP_SCHED -struct update_shares_data { - unsigned long rq_weight[NR_CPUS]; -}; - -static DEFINE_PER_CPU(struct update_shares_data, update_shares_data); +static __read_mostly unsigned long *update_shares_data; static void __set_se_shares(struct sched_entity *se, unsigned long shares); @@ -1577,12 +1573,12 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares); static void update_group_shares_cpu(struct task_group *tg, int cpu, unsigned long sd_shares, unsigned long sd_rq_weight, - struct update_shares_data *usd) + unsigned long *usd_rq_weight) { unsigned long shares, rq_weight; int boost = 0; - rq_weight = usd->rq_weight[cpu]; + rq_weight = usd_rq_weight[cpu]; if (!rq_weight) { boost = 1; rq_weight = NICE_0_LOAD; @@ -1617,7 +1613,7 @@ static void update_group_shares_cpu(struct task_group *tg, int cpu, static int tg_shares_up(struct task_group *tg, void *data) { unsigned long weight, rq_weight = 0, shares = 0; - struct update_shares_data *usd; + unsigned long *usd_rq_weight; struct sched_domain *sd = data; unsigned long flags; int i; @@ -1626,11 +1622,11 @@ static int tg_shares_up(struct task_group *tg, void *data) return 0; local_irq_save(flags); - usd = &__get_cpu_var(update_shares_data); + usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id()); for_each_cpu(i, sched_domain_span(sd)) { weight = tg->cfs_rq[i]->load.weight; - usd->rq_weight[i] = weight; + usd_rq_weight[i] = weight; /* * If there are currently no tasks on the cpu pretend there @@ -1651,7 +1647,7 @@ static int tg_shares_up(struct task_group *tg, void *data) shares = tg->shares; for_each_cpu(i, sched_domain_span(sd)) - update_group_shares_cpu(tg, i, shares, rq_weight, usd); + update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight); local_irq_restore(flags); @@ -9406,6 +9402,10 @@ void __init sched_init(void) #endif /* CONFIG_USER_SCHED */ #endif /* CONFIG_GROUP_SCHED */ +#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP + update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long), + __alignof__(unsigned long)); +#endif for_each_possible_cpu(i) { struct rq *rq; -- cgit v1.1 From 11df6dddcbc38affb7473aad3d962baf8414a947 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 28 Oct 2009 20:26:48 +0100 Subject: futex: Fix spurious wakeup for requeue_pi really The requeue_pi path doesn't use unqueue_me() 
(and the racy lock_ptr == NULL test) nor does it use the wake_list of futex_wake(), which were the reason for commit 41890f2 (futex: Handle spurious wake up) See the debugging discussion on LKML, Message-ID: <4AD4080C.20703@us.ibm.com> The changes in this fix to the wait_requeue_pi path were considered to be a likely unnecessary, but harmless safety net. But it turns out that due to the fact that for unknown $@#!*( reasons EWOULDBLOCK is defined as EAGAIN we built an endless loop into the code path which correctly returns EWOULDBLOCK. Spurious wakeups in the wait_requeue_pi code path are unlikely, so we do the easy solution and return EWOULDBLOCK^WEAGAIN to user space and let it deal with the spurious wakeup. Cc: Darren Hart Cc: Peter Zijlstra Cc: Eric Dumazet Cc: John Stultz Cc: Dinakar Guniguntala LKML-Reference: <4AE23C74.1090502@us.ibm.com> Cc: stable@kernel.org Signed-off-by: Thomas Gleixner --- kernel/futex.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 642f3bb..fb65e82 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2127,7 +2127,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, plist_del(&q->list, &q->list.plist); /* Handle spurious wakeups gracefully */ - ret = -EAGAIN; + ret = -EWOULDBLOCK; if (timeout && !timeout->task) ret = -ETIMEDOUT; else if (signal_pending(current)) @@ -2208,7 +2208,6 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, debug_rt_mutex_init_waiter(&rt_waiter); rt_waiter.task = NULL; -retry: key2 = FUTEX_KEY_INIT; ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE); if (unlikely(ret != 0)) @@ -2303,9 +2302,6 @@ out_put_keys: out_key2: put_futex_key(fshared, &key2); - /* Spurious wakeup ? */ - if (ret == -EAGAIN) - goto retry; out: if (to) { hrtimer_cancel(&to->timer); -- cgit v1.1 From 65afac7d80ab3bc9f81e75eafb71eeb92a3ebdef Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 29 Oct 2009 08:56:16 -0600 Subject: param: fix lots of bugs with writing charp params from sysfs, by leaking mem. e180a6b7759a "param: fix charp parameters set via sysfs" fixed the case where charp parameters written via sysfs were freed, leaving drivers accessing random memory. Unfortunately, storing a flag in the kparam struct was a bad idea: it's rodata so setting it causes an oops on some archs. But that's not all: 1) module_param_array() on charp doesn't work reliably, since we use an uninitialized temporary struct kernel_param. 2) there's a fundamental race if a module uses this parameter and then it's changed: it will still access the old, freed, memory. The simplest fix (ie. for 2.6.32) is to never free the memory. This prevents all these problems, at the cost of a memory leak. In practice, there are only 18 places where a charp is writable via sysfs, and all are root-only writable. Reported-by: Takashi Iwai Cc: Sitsofe Wheeler Cc: Frederic Weisbecker Cc: Christof Schmitt Signed-off-by: Rusty Russell Cc: stable@kernel.org --- kernel/params.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/params.c b/kernel/params.c index 9da58ea..95ef27c 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -218,13 +218,9 @@ int param_set_charp(const char *val, struct kernel_param *kp) return -ENOSPC; } - if (kp->flags & KPARAM_KMALLOCED) - kfree(*(char **)kp->arg); - /* This is a hack. We can't need to strdup in early boot, and we * don't need to; this mangled commandline is preserved.
*/ if (slab_is_available()) { - kp->flags |= KPARAM_KMALLOCED; *(char **)kp->arg = kstrdup(val, GFP_KERNEL); if (!kp->arg) return -ENOMEM; @@ -605,11 +601,7 @@ void module_param_sysfs_remove(struct module *mod) void destroy_params(const struct kernel_param *params, unsigned num) { - unsigned int i; - - for (i = 0; i < num; i++) - if (params[i].flags & KPARAM_KMALLOCED) - kfree(*(char **)params[i].arg); + /* FIXME: This should free kmalloced charp parameters. It doesn't. */ } static void __init kernel_add_sysfs_param(const char *name, -- cgit v1.1 From d553ad864e3b3dde3f1038d491e207021b2d6293 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 29 Oct 2009 08:56:17 -0600 Subject: param: fix NULL comparison on oom kp->arg is always true: it's the contents of that pointer we care about. Reported-by: Takashi Iwai Signed-off-by: Rusty Russell Cc: stable@kernel.org --- kernel/params.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/params.c b/kernel/params.c index 95ef27c..00520c4 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -222,7 +222,7 @@ int param_set_charp(const char *val, struct kernel_param *kp) * don't need to; this mangled commandline is preserved. */ if (slab_is_available()) { *(char **)kp->arg = kstrdup(val, GFP_KERNEL); - if (!kp->arg) + if (!*(char **)kp->arg) return -ENOMEM; } else *(const char **)kp->arg = val; -- cgit v1.1 From 3c7d76e371ac1a3802ae1673f5c63554af59325c Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 29 Oct 2009 08:56:19 -0600 Subject: param: fix setting arrays of bool We create a dummy struct kernel_param on the stack for parsing each array element, but we didn't initialize the flags word. This matters for arrays of type "bool", where the flag indicates if it really is an array of bools or unsigned int (old-style). Reported-by: Takashi Iwai Signed-off-by: Rusty Russell Cc: stable@kernel.org --- kernel/params.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/params.c b/kernel/params.c index 00520c4..d656c27 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -300,6 +300,7 @@ static int param_array(const char *name, unsigned int min, unsigned int max, void *elem, int elemsize, int (*set)(const char *, struct kernel_param *kp), + u16 flags, unsigned int *num) { int ret; @@ -309,6 +310,7 @@ static int param_array(const char *name, /* Get the name right for errors. */ kp.name = name; kp.arg = elem; + kp.flags = flags; /* No equals sign? */ if (!val) { @@ -354,7 +356,8 @@ int param_array_set(const char *val, struct kernel_param *kp) unsigned int temp_num; return param_array(kp->name, val, 1, arr->max, arr->elem, - arr->elemsize, arr->set, arr->num ?: &temp_num); + arr->elemsize, arr->set, kp->flags, + arr->num ?: &temp_num); } int param_array_get(char *buffer, struct kernel_param *kp) -- cgit v1.1 From 0d0df599f1f11f12d589318bacb59a50fb5c0310 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Mon, 26 Oct 2009 16:49:34 -0700 Subject: connector: fix regression introduced by sid connector Since commit 02b51df1b07b4e9ca823c89284e704cadb323cd1 (proc connector: add event for process becoming session leader) we have the following warning: Badness at kernel/softirq.c:143 [...] Krnl PSW : 0404c00180000000 00000000001481d4 (local_bh_enable+0xb0/0xe0) [...] 
Call Trace: ([<000000013fe04100>] 0x13fe04100) [<000000000048a946>] sk_filter+0x9a/0xd0 [<000000000049d938>] netlink_broadcast+0x2c0/0x53c [<00000000003ba9ae>] cn_netlink_send+0x272/0x2b0 [<00000000003baef0>] proc_sid_connector+0xc4/0xd4 [<0000000000142604>] __set_special_pids+0x58/0x90 [<0000000000159938>] sys_setsid+0xb4/0xd8 [<00000000001187fe>] sysc_noemu+0x10/0x16 [<00000041616cb266>] 0x41616cb266 The warning is ---> WARN_ON_ONCE(in_irq() || irqs_disabled()); The network code must not be called with disabled interrupts, but sys_setsid holds the tasklist_lock with spin_lock_irq() while calling the connector. After a discussion we agreed that we can move proc_sid_connector from __set_special_pids to sys_setsid. We also agreed that it is sufficient to change the check from task_session(curr) != pid into err > 0, since if we don't change the session, this means we were already the leader and return -EPERM. One last thing: There is also daemonize(), and some people might want to get a notification in that case. Since daemonize() is only needed if user space does kernel_thread(), this does not look important (and there seems to be no consensus on whether this connector should be called in daemonize). If we really want this, we can add proc_sid_connector to daemonize() in an additional patch (Scott?) Signed-off-by: Christian Borntraeger Cc: Scott James Remnant Cc: Matt Helsley Cc: David S. Miller Acked-by: Oleg Nesterov Acked-by: Evgeniy Polyakov Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 4 +--- kernel/sys.c | 2 ++ 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index e61891f..f7864ac 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -359,10 +359,8 @@ void __set_special_pids(struct pid *pid) { struct task_struct *curr = current->group_leader; - if (task_session(curr) != pid) { + if (task_session(curr) != pid) change_pid(curr, PIDTYPE_SID, pid); - proc_sid_connector(curr); - } if (task_pgrp(curr) != pid) change_pid(curr, PIDTYPE_PGID, pid); diff --git a/kernel/sys.c b/kernel/sys.c index 255475d..1828f8d 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1110,6 +1110,8 @@ SYSCALL_DEFINE0(setsid) err = session; out: write_unlock_irq(&tasklist_lock); + if (err > 0) + proc_sid_connector(group_leader); return err; } -- cgit v1.1 From 478988d3b28e98a31e0cfe24e011e28ba0f3cf0d Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Mon, 26 Oct 2009 16:49:36 -0700 Subject: cgroup: fix strstrip() misuse cgroup_write_X64() and cgroup_write_string() ignore the return value of strstrip(). This leads to subtly inconsistent behavior. example: ========================= # cd /mnt/cgroup/hoge # cat memory.swappiness 60 # echo "59 " > memory.swappiness # cat memory.swappiness 59 # echo " 58" > memory.swappiness bash: echo: write error: Invalid argument This patch fixes it.
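For illustration, a minimal sketch of the failure mode, assuming the 2.6.32-era semantics of these helpers: strstrip() trims trailing whitespace in place but only *returns* a pointer past the leading whitespace, and simple_strtoull() does not itself skip leading spaces:

	char buf[8] = " 58";	/* what the second echo above writes */
	char *end;
	u64 val;

	strstrip(buf);				/* return value ignored: buf still holds " 58" */
	val = simple_strtoull(buf, &end, 0);	/* val == 0, *end == ' ' */
	/* *end != '\0', so cgroup_write_X64() returns -EINVAL */

	val = simple_strtoull(strstrip(buf), &end, 0);
	/* parses from the returned pointer: val == 58, *end == '\0', accepted */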
Cc: Li Zefan Acked-by: Paul Menage Signed-off-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index ca83b73..0249f4b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1710,14 +1710,13 @@ static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft, return -EFAULT; buffer[nbytes] = 0; /* nul-terminate */ - strstrip(buffer); if (cft->write_u64) { - u64 val = simple_strtoull(buffer, &end, 0); + u64 val = simple_strtoull(strstrip(buffer), &end, 0); if (*end) return -EINVAL; retval = cft->write_u64(cgrp, cft, val); } else { - s64 val = simple_strtoll(buffer, &end, 0); + s64 val = simple_strtoll(strstrip(buffer), &end, 0); if (*end) return -EINVAL; retval = cft->write_s64(cgrp, cft, val); @@ -1753,8 +1752,7 @@ static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft, } buffer[nbytes] = 0; /* nul-terminate */ - strstrip(buffer); - retval = cft->write_string(cgrp, cft, buffer); + retval = cft->write_string(cgrp, cft, strstrip(buffer)); if (!retval) retval = nbytes; out: -- cgit v1.1 From 8c85dd8730bfb696e691145335f884c7baef8277 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Mon, 26 Oct 2009 16:50:07 -0700 Subject: sysctl: fix false positives when PROC_SYSCTL=n Having ->procname but not ->proc_handler is valid when PROC_SYSCTL=n; people use such a combination to reduce ifdefs with non-standard handlers. Addresses http://bugzilla.kernel.org/show_bug.cgi?id=14408 Signed-off-by: Alexey Dobriyan Reported-by: Peter Teoh Cc: "Eric W. Biederman" Cc: "Rafael J. Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl_check.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c index b38423c..b6e7aaea 100644 --- a/kernel/sysctl_check.c +++ b/kernel/sysctl_check.c @@ -1521,7 +1521,7 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table) if (!table->ctl_name && table->strategy) set_fail(&fail, table, "Strategy without ctl_name"); #endif -#ifdef CONFIG_PROC_FS +#ifdef CONFIG_PROC_SYSCTL if (table->procname && !table->proc_handler) set_fail(&fail, table, "No proc_handler"); #endif -- cgit v1.1 From 49557e620339cb134127b5bfbcfecc06b77d0232 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 2 Nov 2009 20:37:20 +1030 Subject: sched: Fix boot crash by zalloc()ing most of the cpu masks I got a boot crash when forcing cpumasks offstack on 32 bit, because find_new_ilb() returned 3 on my UP system (nohz.cpu_mask wasn't zeroed). AFAICT the others need to be zeroed too: only nohz.ilb_grp_nohz_mask is initialized before use.
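A minimal sketch of why the zeroing matters, with names as in the patch and under the assumption CONFIG_CPUMASK_OFFSTACK=y (where cpumask_var_t is a pointer to kmalloc()ed storage rather than a zero-initialized static object):

	cpumask_var_t mask;

	if (!zalloc_cpumask_var(&mask, GFP_NOWAIT))
		return;		/* allocation failed */
	/* mask is now guaranteed empty.  With plain alloc_cpumask_var() and
	 * CONFIG_CPUMASK_OFFSTACK=y it would contain whatever stale bits the
	 * kmalloc()ed storage happened to hold, which is how find_new_ilb()
	 * came to report a bogus idle-load-balancer CPU. */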
Signed-off-by: Rusty Russell Cc: Peter Zijlstra LKML-Reference: <200911022037.21282.rusty@rustcorp.com.au> Signed-off-by: Ingo Molnar --- kernel/sched.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index cae6700..bf21adb 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -9535,13 +9535,13 @@ void __init sched_init(void) current->sched_class = &fair_sched_class; /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ - alloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); + zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); #ifdef CONFIG_SMP #ifdef CONFIG_NO_HZ - alloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT); + zalloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT); alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT); #endif - alloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); + zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); #endif /* SMP */ perf_event_init(); -- cgit v1.1 From b00bc0b237055b4c45816325ee14f0bd83e6f590 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 2 Nov 2009 13:01:56 +0100 Subject: uids: Prevent tear down race Ingo triggered the following warning: WARNING: at lib/debugobjects.c:255 debug_print_object+0x42/0x50() Hardware name: System Product Name ODEBUG: init active object type: timer_list Modules linked in: Pid: 2619, comm: dmesg Tainted: G W 2.6.32-rc5-tip+ #5298 Call Trace: [<81035443>] warn_slowpath_common+0x6a/0x81 [<8120e483>] ? debug_print_object+0x42/0x50 [<81035498>] warn_slowpath_fmt+0x29/0x2c [<8120e483>] debug_print_object+0x42/0x50 [<8120ec2a>] __debug_object_init+0x279/0x2d7 [<8120ecb3>] debug_object_init+0x13/0x18 [<810409d2>] init_timer_key+0x17/0x6f [<81041526>] free_uid+0x50/0x6c [<8104ed2d>] put_cred_rcu+0x61/0x72 [<81067fac>] rcu_do_batch+0x70/0x121 debugobjects warns about an enqueued timer being initialized. If CONFIG_USER_SCHED=y the user management code uses delayed work to remove the user from the hash table and tear down the sysfs objects. free_uid is called from RCU and initializes/schedules delayed work if the usage count of the user_struct is 0. The init/schedule happens outside of the uidhash_lock protected region which allows a concurrent caller of find_user() to reference the about to be destroyed user_struct w/o preventing the work from being scheduled. If the next free_uid call happens before the work timer expired then the active timer is initialized and the work scheduled again. The race was introduced in commit 5cb350ba (sched: group scheduling, sysfs tunables) and made more prominent by commit 3959214f (sched: delayed cleanup of user_struct) Move the init/schedule_delayed_work inside of the uidhash_lock protected region to prevent the race. Signed-off-by: Thomas Gleixner Acked-by: Dhaval Giani Cc: Paul E. McKenney Cc: Kay Sievers Cc: stable@kernel.org --- kernel/user.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/user.c b/kernel/user.c index 2c000e7..46d0165 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -330,9 +330,9 @@ done: */ static void free_user(struct user_struct *up, unsigned long flags) { - spin_unlock_irqrestore(&uidhash_lock, flags); INIT_DELAYED_WORK(&up->work, cleanup_user_struct); schedule_delayed_work(&up->work, msecs_to_jiffies(1000)); + spin_unlock_irqrestore(&uidhash_lock, flags); } #else /* CONFIG_USER_SCHED && CONFIG_SYSFS */ -- cgit v1.1 From 83f5b01ffbbaea6f97c9a79d21e240dbfb69f2f1 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Wed, 28 Oct 2009 08:14:49 -0700 Subject: rcu: Fix long-grace-period race between forcing and initialization Very long RCU read-side critical sections (50 milliseconds or so) can cause a race between force_quiescent_state() and rcu_start_gp() as follows on kernel builds with multi-level rcu_node hierarchies: 1. CPU 0 calls force_quiescent_state(), sees that there is a grace period in progress, and acquires ->fsqlock. 2. CPU 1 detects the end of the grace period, and so cpu_quiet_msk_finish() sets rsp->completed to rsp->gpnum. This operation is carried out under the root rnp->lock, but CPU 0 has not yet acquired that lock. Note that rsp->signaled is still RCU_SAVE_DYNTICK from the last grace period. 3. CPU 1 calls rcu_start_gp(), but no one wants a new grace period, so it drops the root rnp->lock and returns. 4. CPU 0 acquires the root rnp->lock and picks up rsp->completed and rsp->signaled, then drops rnp->lock. It then enters the RCU_SAVE_DYNTICK leg of the switch statement. 5. CPU 2 invokes call_rcu(), and now needs a new grace period. It calls rcu_start_gp(), which acquires the root rnp->lock, sets rsp->signaled to RCU_GP_INIT (too bad that CPU 0 is already in the RCU_SAVE_DYNTICK leg of the switch statement!) and starts initializing the rcu_node hierarchy. If there are multiple levels to the hierarchy, it will drop the root rnp->lock and initialize the lower levels of the hierarchy. 6. CPU 0 notes that rsp->completed has not changed, which permits both CPU 2 and CPU 0 to try updating it concurrently. If CPU 0's update prevails, later calls to force_quiescent_state() can count old quiescent states against the new grace period, which can in turn result in premature ending of grace periods. Not good. This patch adds an RCU_GP_IDLE state for rsp->signaled that is set initially at boot time and any time a grace period ends. This prevents CPU 0 from getting into the workings of force_quiescent_state() in step 4. Additional locking and checks prevent the concurrent update of rsp->signaled in step 6. Signed-off-by: Paul E. McKenney Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: mathieu.desnoyers@polymtl.ca Cc: josh@joshtriplett.org Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org Cc: Valdis.Kletnieks@vt.edu Cc: dhowells@redhat.com LKML-Reference: <1256742889199-git-send-email-> Signed-off-by: Ingo Molnar --- kernel/rcutree.c | 16 +++++++++++----- kernel/rcutree.h | 7 ++++--- 2 files changed, 15 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 0536125..f3077c0 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -59,7 +59,7 @@ NUM_RCU_LVL_2, \ NUM_RCU_LVL_3, /* == MAX_RCU_LVLS */ \ }, \ - .signaled = RCU_SIGNAL_INIT, \ + .signaled = RCU_GP_IDLE, \ .gpnum = -300, \ .completed = -300, \ .onofflock = __SPIN_LOCK_UNLOCKED(&name.onofflock), \ @@ -657,14 +657,17 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) * irqs disabled. */ rcu_for_each_node_breadth_first(rsp, rnp) { - spin_lock(&rnp->lock); /* irqs already disabled. */ + spin_lock(&rnp->lock); /* irqs already disabled. */ rcu_preempt_check_blocked_tasks(rnp); rnp->qsmask = rnp->qsmaskinit; rnp->gpnum = rsp->gpnum; - spin_unlock(&rnp->lock); /* irqs already disabled. */ + spin_unlock(&rnp->lock); /* irqs remain disabled. */ } + rnp = rcu_get_root(rsp); + spin_lock(&rnp->lock); /* irqs already disabled. */ rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. 
*/ + spin_unlock(&rnp->lock); /* irqs remain disabled. */ spin_unlock_irqrestore(&rsp->onofflock, flags); } @@ -706,6 +709,7 @@ static void cpu_quiet_msk_finish(struct rcu_state *rsp, unsigned long flags) { WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); rsp->completed = rsp->gpnum; + rsp->signaled = RCU_GP_IDLE; rcu_process_gp_end(rsp, rsp->rda[smp_processor_id()]); rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */ } @@ -1162,9 +1166,10 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed) } spin_unlock(&rnp->lock); switch (signaled) { + case RCU_GP_IDLE: case RCU_GP_INIT: - break; /* grace period still initializing, ignore. */ + break; /* grace period idle or initializing, ignore. */ case RCU_SAVE_DYNTICK: @@ -1178,7 +1183,8 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed) /* Update state, record completion counter. */ spin_lock(&rnp->lock); - if (lastcomp == rsp->completed) { + if (lastcomp == rsp->completed && + rsp->signaled == RCU_SAVE_DYNTICK) { rsp->signaled = RCU_FORCE_QS; dyntick_record_completed(rsp, lastcomp); } diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 1823c6e..1899023 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -201,9 +201,10 @@ struct rcu_data { }; /* Values for signaled field in struct rcu_state. */ -#define RCU_GP_INIT 0 /* Grace period being initialized. */ -#define RCU_SAVE_DYNTICK 1 /* Need to scan dyntick state. */ -#define RCU_FORCE_QS 2 /* Need to force quiescent state. */ +#define RCU_GP_IDLE 0 /* No grace period in progress. */ +#define RCU_GP_INIT 1 /* Grace period being initialized. */ +#define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */ +#define RCU_FORCE_QS 3 /* Need to force quiescent state. */ #ifdef CONFIG_NO_HZ #define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK #else /* #ifdef CONFIG_NO_HZ */ -- cgit v1.1 From b84ff7d6f1b7f8a43414e74d972ec4c8f3361db4 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Thu, 29 Oct 2009 11:48:30 +0100 Subject: sched: Fix kthread_bind() by moving the body of kthread_bind() to sched.c Eric Paris reported that commit f685ceacab07d3f6c236f04803e2f2f0dbcc5afb causes boot time PREEMPT_DEBUG complaints. [ 4.590699] BUG: using smp_processor_id() in preemptible [00000000] code: rmmod/1314 [ 4.593043] caller is task_hot+0x86/0xd0 Since kthread_bind() messes with scheduler internals, move the body to sched.c, and lock the runqueue. Reported-by: Eric Paris Signed-off-by: Mike Galbraith Tested-by: Eric Paris Cc: Peter Zijlstra LKML-Reference: <1256813310.7574.3.camel@marge.simson.net> [ v2: fix !SMP build and clean up ] Signed-off-by: Ingo Molnar --- kernel/kthread.c | 23 ----------------------- kernel/sched.c | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index 5fe7099..ab7ae57 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -150,29 +150,6 @@ struct task_struct *kthread_create(int (*threadfn)(void *data), EXPORT_SYMBOL(kthread_create); /** - * kthread_bind - bind a just-created kthread to a cpu. - * @k: thread created by kthread_create(). - * @cpu: cpu (might not be online, must be possible) for @k to run on. - * - * Description: This function is equivalent to set_cpus_allowed(), - * except that @cpu doesn't need to be online, and the thread must be - * stopped (i.e., just returned from kthread_create()). 
- */ -void kthread_bind(struct task_struct *k, unsigned int cpu) -{ - /* Must have done schedule() in kthread() before we set_task_cpu */ - if (!wait_task_inactive(k, TASK_UNINTERRUPTIBLE)) { - WARN_ON(1); - return; - } - set_task_cpu(k, cpu); - k->cpus_allowed = cpumask_of_cpu(cpu); - k->rt.nr_cpus_allowed = 1; - k->flags |= PF_THREAD_BOUND; -} -EXPORT_SYMBOL(kthread_bind); - -/** * kthread_stop - stop a thread created by kthread_create(). * @k: thread created by kthread_create(). * diff --git a/kernel/sched.c b/kernel/sched.c index bf21adb..5cb7d63 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1996,6 +1996,38 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, p->sched_class->prio_changed(rq, p, oldprio, running); } +/** + * kthread_bind - bind a just-created kthread to a cpu. + * @k: thread created by kthread_create(). + * @cpu: cpu (might not be online, must be possible) for @k to run on. + * + * Description: This function is equivalent to set_cpus_allowed(), + * except that @cpu doesn't need to be online, and the thread must be + * stopped (i.e., just returned from kthread_create()). + * + * Function lives here instead of kthread.c because it messes with + * scheduler internals which require locking. + */ +void kthread_bind(struct task_struct *p, unsigned int cpu) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + /* Must have done schedule() in kthread() before we set_task_cpu */ + if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE)) { + WARN_ON(1); + return; + } + + spin_lock_irqsave(&rq->lock, flags); + set_task_cpu(p, cpu); + p->cpus_allowed = cpumask_of_cpu(cpu); + p->rt.nr_cpus_allowed = 1; + p->flags |= PF_THREAD_BOUND; + spin_unlock_irqrestore(&rq->lock, flags); +} +EXPORT_SYMBOL(kthread_bind); + #ifdef CONFIG_SMP /* * Is this task likely cache-hot: -- cgit v1.1 From 76b57e613f6006ff525a17876c89326d127cadc9 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Wed, 7 Oct 2009 22:37:35 +0200 Subject: PM / Hibernate: Fix blkdev refleaks While cruising through the swsusp code I found a few blkdev reference leaks of resume_bdev. swsusp_read: remove blkdev_put altogether. Some fail paths do not do that. swsusp_check: make sure we always put a reference on fail paths software_resume: all fail paths between swsusp_check and swsusp_read omit swsusp_close. Add it in those cases. And since swsusp_read doesn't drop the reference anymore, do it here unconditionally. [rjw: Fixed a small coding style issue.] Signed-off-by: Jiri Slaby Signed-off-by: Rafael J.
Wysocki --- kernel/power/hibernate.c | 11 ++++++++--- kernel/power/swap.c | 8 ++++---- 2 files changed, 12 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 04b3a83..04a9e90 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -693,21 +693,22 @@ static int software_resume(void) /* The snapshot device should not be opened while we're running */ if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { error = -EBUSY; + swsusp_close(FMODE_READ); goto Unlock; } pm_prepare_console(); error = pm_notifier_call_chain(PM_RESTORE_PREPARE); if (error) - goto Finish; + goto close_finish; error = usermodehelper_disable(); if (error) - goto Finish; + goto close_finish; error = create_basic_memory_bitmaps(); if (error) - goto Finish; + goto close_finish; pr_debug("PM: Preparing processes for restore.\n"); error = prepare_processes(); @@ -719,6 +720,7 @@ static int software_resume(void) pr_debug("PM: Reading hibernation image.\n"); error = swsusp_read(&flags); + swsusp_close(FMODE_READ); if (!error) hibernation_restore(flags & SF_PLATFORM_MODE); @@ -737,6 +739,9 @@ static int software_resume(void) mutex_unlock(&pm_mutex); pr_debug("PM: Resume from disk failed.\n"); return error; +close_finish: + swsusp_close(FMODE_READ); + goto Finish; } late_initcall(software_resume); diff --git a/kernel/power/swap.c b/kernel/power/swap.c index b101cdc..a438862 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -572,8 +572,6 @@ int swsusp_read(unsigned int *flags_p) error = load_image(&handle, &snapshot, header->pages - 1); release_swap_reader(&handle); - blkdev_put(resume_bdev, FMODE_READ); - if (!error) pr_debug("PM: Image successfully loaded\n"); else @@ -596,7 +594,7 @@ int swsusp_check(void) error = bio_read_page(swsusp_resume_block, swsusp_header, NULL); if (error) - return error; + goto put; if (!memcmp(SWSUSP_SIG, swsusp_header->sig, 10)) { memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10); @@ -604,8 +602,10 @@ int swsusp_check(void) error = bio_write_page(swsusp_resume_block, swsusp_header, NULL); } else { - return -EINVAL; + error = -EINVAL; } + +put: if (error) blkdev_put(resume_bdev, FMODE_READ); else -- cgit v1.1 From 4ff277f9e42fa16314045bd124a61519286094c0 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Wed, 28 Oct 2009 22:55:33 +0100 Subject: PM / Hibernate: Fix error handling in save_image() There are too many retval variables in save_image(). Thus the error return value from snapshot_read_next() may be ignored and only part of the snapshot (successfully) written. Remove the 'error' variable, invert the condition in the do-while loop and convert the loop to use only the 'ret' variable. Switch the rest of the function to consider only 'ret'. Also make sure we end the printed line with a \n if an error occurs. Signed-off-by: Jiri Slaby Acked-by: Pavel Machek Signed-off-by: Rafael J.
Wysocki --- kernel/power/swap.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/power/swap.c b/kernel/power/swap.c index a438862..afa052b 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -314,7 +314,6 @@ static int save_image(struct swap_map_handle *handle, { unsigned int m; int ret; - int error = 0; int nr_pages; int err2; struct bio *bio; @@ -329,26 +328,27 @@ static int save_image(struct swap_map_handle *handle, nr_pages = 0; bio = NULL; do_gettimeofday(&start); - do { + while (1) { ret = snapshot_read_next(snapshot, PAGE_SIZE); - if (ret > 0) { - error = swap_write_page(handle, data_of(*snapshot), - &bio); - if (error) - break; - if (!(nr_pages % m)) - printk("\b\b\b\b%3d%%", nr_pages / m); - nr_pages++; - } - } while (ret > 0); + if (ret <= 0) + break; + ret = swap_write_page(handle, data_of(*snapshot), &bio); + if (ret) + break; + if (!(nr_pages % m)) + printk("\b\b\b\b%3d%%", nr_pages / m); + nr_pages++; + } err2 = wait_on_bio_chain(&bio); do_gettimeofday(&stop); - if (!error) - error = err2; - if (!error) + if (!ret) + ret = err2; + if (!ret) printk("\b\b\b\bdone\n"); + else + printk("\n"); swsusp_show_speed(&start, &stop, nr_to_write, "Wrote"); - return error; + return ret; } /** -- cgit v1.1 From bf9fd67a0328d56eff6022f80d4eb88ba6614119 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Wed, 28 Oct 2009 22:55:42 +0100 Subject: PM / Hibernate: Add newline to load_image() fail path Finish the line with a \n when load_image fails in the middle of loading. Signed-off-by: Jiri Slaby Acked-by: Pavel Machek Signed-off-by: Rafael J. Wysocki --- kernel/power/swap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/swap.c b/kernel/power/swap.c index afa052b..890f6b1 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -536,7 +536,8 @@ static int load_image(struct swap_map_handle *handle, snapshot_write_finalize(snapshot); if (!snapshot_image_loaded(snapshot)) error = -ENODATA; - } + } else + printk("\n"); swsusp_show_speed(&start, &stop, nr_to_read, "Read"); return error; } -- cgit v1.1 From 1d510750941a53a1d3049c1d33c75d6dfcd78618 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Tue, 3 Nov 2009 10:11:14 +0000 Subject: Correct nr_processes() when CPUs have been unplugged nr_processes() returns the sum of the per cpu counter process_counts for all online CPUs. This counter is incremented for the current CPU on fork() and decremented for the current CPU on exit(). Since a process does not necessarily fork and exit on the same CPU, the process_count for an individual CPU can be either positive or negative and effectively has no meaning in isolation. Therefore calculating the sum of process_counts over only the online CPUs omits the processes which were started or stopped on any CPU which has since been unplugged. Only the sum of process_counts across all possible CPUs has meaning. The only caller of nr_processes() is proc_root_getattr() which calculates the number of links to /proc as stat->nlink = proc_root.nlink + nr_processes(); You don't have to be all that unlucky for nr_processes() to return a negative value leading to a negative number of links (or rather, an apparently enormous number of links). If this happens then you can get failures where things like "ls /proc" start to fail because they got an -EOVERFLOW from some stat() call.
Example with some debugging inserted to show what goes on: # ps haux|wc -l nr_processes: CPU0: 90 nr_processes: CPU1: 1030 nr_processes: CPU2: -900 nr_processes: CPU3: -136 nr_processes: TOTAL: 84 proc_root_getattr. nlink 12 + nr_processes() 84 = 96 84 # echo 0 >/sys/devices/system/cpu/cpu1/online # ps haux|wc -l nr_processes: CPU0: 85 nr_processes: CPU2: -901 nr_processes: CPU3: -137 nr_processes: TOTAL: -953 proc_root_getattr. nlink 12 + nr_processes() -953 = -941 75 # stat /proc/ nr_processes: CPU0: 84 nr_processes: CPU2: -901 nr_processes: CPU3: -137 nr_processes: TOTAL: -954 proc_root_getattr. nlink 12 + nr_processes() -954 = -942 File: `/proc/' Size: 0 Blocks: 0 IO Block: 1024 directory Device: 3h/3d Inode: 1 Links: 4294966354 Access: (0555/dr-xr-xr-x) Uid: ( 0/ root) Gid: ( 0/ root) Access: 2009-11-03 09:06:55.000000000 +0000 Modify: 2009-11-03 09:06:55.000000000 +0000 Change: 2009-11-03 09:06:55.000000000 +0000 I'm not 100% convinced that the per_cpu regions remain valid for offline CPUs, although my testing suggests that they do. If not then I think the correct solution would be to aggregate the process_count for a given CPU into a global base value in cpu_down(). This bug appears to pre-date the transition to git, and it may even have been present in linux-2.6.0-test7-bk3, since the code Rusty patched in http://lwn.net/Articles/64773/ looks to have been wrong already. Signed-off-by: Ian Campbell Cc: Andrew Morton Cc: Rusty Russell Signed-off-by: Linus Torvalds --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 4c20fff..166b8c4 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -91,7 +91,7 @@ int nr_processes(void) int cpu; int total = 0; - for_each_online_cpu(cpu) + for_each_possible_cpu(cpu) total += per_cpu(process_counts, cpu); return total; -- cgit v1.1 From f7112949f6a4cd6883d66c882d568c2197321de6 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 3 Nov 2009 19:42:45 +0800 Subject: ring-buffer: Synchronize resizing buffer with reader lock We got a sudden panic when we reduced the size of the ring buffer. We can reproduce the panic with the following steps: echo 1 > events/sched/enable cat trace_pipe > /dev/null & while ((1)) do echo 12000 > buffer_size_kb echo 512 > buffer_size_kb done (not more than 5 seconds, panic ...)
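A sketch of the locking rule the fix below enforces, with names taken from the patch context (not a full implementation): the trace_pipe reader walks the per-cpu page list under ->reader_lock, so the resize path must take the same lock around any page unlinking:

	/* resize side (rb_remove_pages), after the fix: */
	atomic_inc(&cpu_buffer->record_disabled);
	synchronize_sched();			/* no writers remain in the buffer */
	spin_lock_irq(&cpu_buffer->reader_lock);
	rb_head_page_deactivate(cpu_buffer);
	/* ... unlink nr_pages pages from cpu_buffer->pages ... */
	rb_reset_cpu(cpu_buffer);
	spin_unlock_irq(&cpu_buffer->reader_lock);	/* reader may walk pages again */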
Reported-by: KOSAKI Motohiro Signed-off-by: Lai Jiangshan LKML-Reference: <4AF01735.9060409@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 3ffa502..5dd017f 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1193,6 +1193,7 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) atomic_inc(&cpu_buffer->record_disabled); synchronize_sched(); + spin_lock_irq(&cpu_buffer->reader_lock); rb_head_page_deactivate(cpu_buffer); for (i = 0; i < nr_pages; i++) { @@ -1207,6 +1208,7 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) return; rb_reset_cpu(cpu_buffer); + spin_unlock_irq(&cpu_buffer->reader_lock); rb_check_pages(cpu_buffer); -- cgit v1.1 From ed146b25942b428f8e8056587b7638ce76573c2f Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 3 Nov 2009 08:55:38 +0800 Subject: ftrace: Fix unmatched locking in ftrace_regex_write() When a command is passed to set_ftrace_filter, the ftrace_regex_lock is still held when going back to user space. # echo 'do_open : foo' > set_ftrace_filter (still holding ftrace_regex_lock when returning to user space!) Signed-off-by: Li Zefan LKML-Reference: <4AEF7F8A.3080300@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 9c451a1..6dc4e5e 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2222,15 +2222,15 @@ ftrace_regex_write(struct file *file, const char __user *ubuf, ret = ftrace_process_regex(parser->buffer, parser->idx, enable); if (ret) - goto out; + goto out_unlock; trace_parser_clear(parser); } ret = read; - +out_unlock: mutex_unlock(&ftrace_regex_lock); -out: + return ret; } -- cgit v1.1 From e7e7e0c084ef862d5754701108d4a038514d6314 Mon Sep 17 00:00:00 2001 From: Yong Zhang Date: Sat, 7 Nov 2009 11:16:13 +0800 Subject: genirq: try_one_irq() must be called with irq disabled Prarit reported: ================================= [ INFO: inconsistent lock state ] 2.6.32-rc5 #1 --------------------------------- inconsistent {IN-HARDIRQ-W} -> {HARDIRQ-ON-W} usage.
swapper/0 [HC0[0]:SC1[1]:HE1:SE0] takes: (&irq_desc_lock_class){?.-...}, at: [] try_one_irq+0x32/0x138 {IN-HARDIRQ-W} state was registered at: [] __lock_acquire+0x2fc/0xd5d [] lock_acquire+0xf3/0x12d [] _spin_lock+0x40/0x89 [] handle_level_irq+0x30/0x105 [] handle_irq+0x95/0xb7 [] do_IRQ+0x6a/0xe0 [] ret_from_intr+0x0/0x16 irq event stamp: 195096 hardirqs last enabled at (195096): [] _spin_unlock_irq+0x3a/0x5c hardirqs last disabled at (195095): [] _spin_lock_irq+0x29/0x95 softirqs last enabled at (195088): [] __do_softirq+0x1c1/0x1ef softirqs last disabled at (195093): [] call_softirq+0x1c/0x30 other info that might help us debug this: 1 lock held by swapper/0: #0: (kernel/irq/spurious.c:21){+.-...}, at: [] run_timer_softirq+0x1a9/0x315 stack backtrace: Pid: 0, comm: swapper Not tainted 2.6.32-rc5 #1 Call Trace: [] valid_state+0x187/0x1ae [] mark_lock+0x129/0x253 [] __lock_acquire+0x370/0xd5d [] lock_acquire+0xf3/0x12d [] _spin_lock+0x40/0x89 [] try_one_irq+0x32/0x138 [] poll_all_shared_irqs+0x41/0x6d [] poll_spurious_irqs+0x1c/0x49 [] run_timer_softirq+0x239/0x315 [] __do_softirq+0x102/0x1ef [] call_softirq+0x1c/0x30 [] do_softirq+0x59/0xca [] irq_exit+0x58/0xae [] smp_apic_timer_interrupt+0x94/0xba [] apic_timer_interrupt+0x13/0x20 The reason is that try_one_irq() is called from hardirq context with interrupts disabled and from softirq context (poll_all_shared_irqs()) with interrupts enabled. Disable interrupts before calling it from poll_all_shared_irqs(). Reported-and-tested-by: Prarit Bhargava Signed-off-by: Yong Zhang LKML-Reference: <1257563773-4620-1-git-send-email-yong.zhang0@gmail.com> Signed-off-by: Thomas Gleixner --- kernel/irq/spurious.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 114e704..bd7273e 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -121,7 +121,9 @@ static void poll_all_shared_irqs(void) if (!(status & IRQ_SPURIOUS_DISABLED)) continue; + local_irq_disable(); try_one_irq(i, desc); + local_irq_enable(); } } -- cgit v1.1 From 968c86458a5975efa7a95f832a4ec9fb21471137 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 6 Nov 2009 15:31:08 -0800 Subject: sched: Fix kernel-doc function parameter name Fix variable name in sched.c kernel-doc notation. Fixes this DocBook warning: Warning(kernel/sched.c:2008): No description found for parameter 'p' Warning(kernel/sched.c:2008): Excess function parameter 'k' description in 'kthread_bind' Signed-off-by: Randy Dunlap LKML-Reference: <4AF4B1BC.8020604@oracle.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 28dd4f4..7d7d5fc 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1994,7 +1994,7 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, /** * kthread_bind - bind a just-created kthread to a cpu. - * @k: thread created by kthread_create(). + * @p: thread created by kthread_create(). * @cpu: cpu (might not be online, must be possible) for @k to run on. 
* * Description: This function is equivalent to set_cpus_allowed(), -- cgit v1.1 From e9036b36eed4d3cdb33fa9cbcdd9888ae516889f Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Mon, 26 Oct 2009 22:24:14 +0300 Subject: sched: Use root_task_group_empty only with FAIR_GROUP_SCHED root_task_group_empty is used only with FAIR_GROUP_SCHED, so if we use other scheduler options we get: kernel/sched.c:314: warning: 'root_task_group_empty' defined but not used So move CONFIG_FAIR_GROUP_SCHED up so that it covers root_task_group_empty(). Signed-off-by: Cyrill Gorcunov Cc: Peter Zijlstra LKML-Reference: <20091026192414.GB5321@lenovo> Signed-off-by: Ingo Molnar --- kernel/sched.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 7d7d5fc..3c11ae0 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -309,6 +309,8 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq); */ static DEFINE_SPINLOCK(task_group_lock); +#ifdef CONFIG_FAIR_GROUP_SCHED + #ifdef CONFIG_SMP static int root_task_group_empty(void) { @@ -316,7 +318,6 @@ static int root_task_group_empty(void) } #endif -#ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_USER_SCHED # define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) #else /* !CONFIG_USER_SCHED */ -- cgit v1.1 From 9398180097e359646d46083c3e079a54e20bee82 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 17 Nov 2009 14:06:20 -0800 Subject: workqueue: fix race condition in schedule_on_each_cpu() Commit 65a64464349883891e21e74af16c05d6e1eeb4e9 ("HWPOISON: Allow schedule_on_each_cpu() from keventd") which allows schedule_on_each_cpu() to be called from keventd added a race condition. schedule_on_each_cpu() may race with cpu hotplug and end up executing the function twice on a cpu. Fix it by moving direct execution into the section protected with get/put_online_cpus(). While at it, update the code such that direct execution is done after works have been scheduled for all other cpus, and drop the unnecessary cpu != orig test from the flush loop. Signed-off-by: Tejun Heo Cc: Andi Kleen Acked-by: Oleg Nesterov Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/workqueue.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 1232814..67e526b 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -692,31 +692,29 @@ int schedule_on_each_cpu(work_func_t func) if (!works) return -ENOMEM; + get_online_cpus(); + /* - * when running in keventd don't schedule a work item on itself. - * Can just call directly because the work queue is already bound. - * This also is faster. - * Make this a generic parameter for other workqueues? + * When running in keventd don't schedule a work item on + * itself. Can just call directly because the work queue is + * already bound. This also is faster.
*/ - if (current_is_keventd()) { + if (current_is_keventd()) orig = raw_smp_processor_id(); - INIT_WORK(per_cpu_ptr(works, orig), func); - func(per_cpu_ptr(works, orig)); - } - get_online_cpus(); for_each_online_cpu(cpu) { struct work_struct *work = per_cpu_ptr(works, cpu); - if (cpu == orig) - continue; INIT_WORK(work, func); - schedule_work_on(cpu, work); - } - for_each_online_cpu(cpu) { if (cpu != orig) - flush_work(per_cpu_ptr(works, cpu)); + schedule_work_on(cpu, work); } + if (orig >= 0) + func(per_cpu_ptr(works, orig)); + + for_each_online_cpu(cpu) + flush_work(per_cpu_ptr(works, cpu)); + put_online_cpus(); free_percpu(works); return 0; -- cgit v1.1
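For context, a hypothetical caller (the helper body is made up for illustration) showing the semantics the fix preserves: schedule_on_each_cpu() runs the function exactly once on every online CPU and waits for completion, even when invoked from keventd while CPUs are being hotplugged:

	static void drain_local_counters(struct work_struct *work)
	{
		/* runs on one specific CPU; keventd workers are CPU-bound */
		pr_info("draining counters on cpu %d\n", smp_processor_id());
	}

	/* in process context (possibly keventd itself): */
	if (schedule_on_each_cpu(drain_local_counters))
		pr_err("could not allocate per-cpu work items\n");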