From e159489baa717dbae70f9903770a6a4990865887 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 9 Jan 2011 23:32:15 +0100 Subject: workqueue: relax lockdep annotation on flush_work() Currently, the lockdep annotation in flush_work() requires exclusive access on the workqueue the target work is queued on and triggers warning if a work is trying to flush another work on the same workqueue; however, this is no longer true as workqueues can now execute multiple works concurrently. This patch adds lock_map_acquire_read() and make process_one_work() hold read access to the workqueue while executing a work and start_flush_work() check for write access if concurrnecy level is one or the workqueue has a rescuer (as only one execution resource - the rescuer - is guaranteed to be available under memory pressure), and read access if higher. This better represents what's going on and removes spurious lockdep warnings which are triggered by fake dependency chain created through flush_work(). * Peter pointed out that flushing another work from a WQ_MEM_RECLAIM wq breaks forward progress guarantee under memory pressure. Condition check accordingly updated. Signed-off-by: Tejun Heo Reported-by: "Rafael J. Wysocki" Tested-by: "Rafael J. Wysocki" Cc: Peter Zijlstra Cc: stable@kernel.org --- kernel/workqueue.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 8ee6ec8..930c239 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1840,7 +1840,7 @@ __acquires(&gcwq->lock) spin_unlock_irq(&gcwq->lock); work_clear_pending(work); - lock_map_acquire(&cwq->wq->lockdep_map); + lock_map_acquire_read(&cwq->wq->lockdep_map); lock_map_acquire(&lockdep_map); trace_workqueue_execute_start(work); f(work); @@ -2384,8 +2384,18 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, insert_wq_barrier(cwq, barr, work, worker); spin_unlock_irq(&gcwq->lock); - lock_map_acquire(&cwq->wq->lockdep_map); + /* + * If @max_active is 1 or rescuer is in use, flushing another work + * item on the same workqueue may lead to deadlock. Make sure the + * flusher is not running on the same workqueue by verifying write + * access. + */ + if (cwq->wq->saved_max_active == 1 || cwq->wq->flags & WQ_RESCUER) + lock_map_acquire(&cwq->wq->lockdep_map); + else + lock_map_acquire_read(&cwq->wq->lockdep_map); lock_map_release(&cwq->wq->lockdep_map); + return true; already_gone: spin_unlock_irq(&gcwq->lock); -- cgit v1.1 From 42c025f3de9042d9c9abd9a6f6205d1a0f4bcadf Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 11 Jan 2011 15:58:49 +0100 Subject: workqueue: note the nested NOT_RUNNING test in worker_clr_flags() isn't a noop The nested NOT_RUNNING test in worker_clr_flags() is slightly misleading in that if NOT_RUNNING were a single flag the nested test would be always %true and thus noop. Add a comment noting that the test isn't a noop. Signed-off-by: Tejun Heo Cc: Hillf Danton Cc: Andrew Morton --- kernel/workqueue.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 930c239..11869fa 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -768,7 +768,11 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags) worker->flags &= ~flags; - /* if transitioning out of NOT_RUNNING, increment nr_running */ + /* + * If transitioning out of NOT_RUNNING, increment nr_running. Note + * that the nested NOT_RUNNING is not a noop. NOT_RUNNING is mask + * of multiple flags, not a single flag. + */ if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING)) if (!(worker->flags & WORKER_NOT_RUNNING)) atomic_inc(get_gcwq_nr_running(gcwq->cpu)); -- cgit v1.1 From 977dda7c9b540f48b228174346d8b31542c1e99f Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Fri, 14 Jan 2011 17:57:50 -0800 Subject: sched: Update effective_load() to use global share weights Previously effective_load would approximate the global load weight present on a group taking advantage of: entity_weight = tg->shares ( lw / global_lw ), where entity_weight was provided by tg_shares_up. This worked (approximately) for an 'empty' (at tg level) cpu since we would place boost load representative of what a newly woken task would receive. However, now that load is instantaneously updated this assumption is no longer true and the load calculation is rather incorrect in this case. Fix this (and improve the general case) by re-writing effective_load to take advantage of the new shares distribution code. Signed-off-by: Paul Turner Signed-off-by: Peter Zijlstra LKML-Reference: <20110115015817.069769529@google.com> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index c62ebae..414145c 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1362,27 +1362,27 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) return wl; for_each_sched_entity(se) { - long S, rw, s, a, b; + long lw, w; - S = se->my_q->tg->shares; - s = se->load.weight; - rw = se->my_q->load.weight; + tg = se->my_q->tg; + w = se->my_q->load.weight; - a = S*(rw + wl); - b = S*rw + s*wg; + /* use this cpu's instantaneous contribution */ + lw = atomic_read(&tg->load_weight); + lw -= se->my_q->load_contribution; + lw += w + wg; - wl = s*(a-b); + wl += w; - if (likely(b)) - wl /= b; + if (lw > 0 && wl < lw) + wl = (wl * tg->shares) / lw; + else + wl = tg->shares; - /* - * Assume the group is already running and will - * thus already be accounted for in the weight. - * - * That is, moving shares between CPUs, does not - * alter the group weight. - */ + /* zero point is MIN_SHARES */ + if (wl < MIN_SHARES) + wl = MIN_SHARES; + wl -= se->load.weight; wg = 0; } -- cgit v1.1 From efe25c2c7b3a5d17b0c70987a758d8fe7af8e3d1 Mon Sep 17 00:00:00 2001 From: Bharata B Rao Date: Tue, 11 Jan 2011 15:41:54 +0530 Subject: sched: Reinstate group names in /proc/sched_debug Displaying of group names in /proc/sched_debug was dropped in autogroup patches. Add group names while displaying cfs_rq and tasks information. Signed-off-by: Bharata B Rao Signed-off-by: Peter Zijlstra LKML-Reference: <20110111101153.GE4772@in.ibm.com> Signed-off-by: Ingo Molnar --- kernel/sched_debug.c | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) (limited to 'kernel') diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 1dfae3d..4d36f37 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -16,6 +16,8 @@ #include #include +static DEFINE_SPINLOCK(sched_debug_lock); + /* * This allows printing both to /proc/sched_debug and * to the console @@ -86,6 +88,23 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group } #endif +#ifdef CONFIG_CGROUP_SCHED +static char group_path[PATH_MAX]; + +static char *task_group_path(struct task_group *tg) +{ + /* + * May be NULL if the underlying cgroup isn't fully-created yet + */ + if (!tg->css.cgroup) { + group_path[0] = '\0'; + return group_path; + } + cgroup_path(tg->css.cgroup, group_path, PATH_MAX); + return group_path; +} +#endif + static void print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) { @@ -108,6 +127,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld", 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); #endif +#ifdef CONFIG_CGROUP_SCHED + SEQ_printf(m, " %s", task_group_path(task_group(p))); +#endif SEQ_printf(m, "\n"); } @@ -144,7 +166,11 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) struct sched_entity *last; unsigned long flags; +#ifdef CONFIG_FAIR_GROUP_SCHED + SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, task_group_path(cfs_rq->tg)); +#else SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu); +#endif SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", SPLIT_NS(cfs_rq->exec_clock)); @@ -191,7 +217,11 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) { +#ifdef CONFIG_RT_GROUP_SCHED + SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, task_group_path(rt_rq->tg)); +#else SEQ_printf(m, "\nrt_rq[%d]:\n", cpu); +#endif #define P(x) \ SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x)) @@ -212,6 +242,7 @@ extern __read_mostly int sched_clock_running; static void print_cpu(struct seq_file *m, int cpu) { struct rq *rq = cpu_rq(cpu); + unsigned long flags; #ifdef CONFIG_X86 { @@ -266,10 +297,14 @@ static void print_cpu(struct seq_file *m, int cpu) #undef P #endif + spin_lock_irqsave(&sched_debug_lock, flags); print_cfs_stats(m, cpu); print_rt_stats(m, cpu); + rcu_read_lock(); print_rq(m, rq, cpu); + rcu_read_unlock(); + spin_unlock_irqrestore(&sched_debug_lock, flags); } static const char *sched_tunable_scaling_names[] = { -- cgit v1.1 From 8ecedd7a06d27a31dbb36fab88e2ba6e6edd43ca Mon Sep 17 00:00:00 2001 From: Bharata B Rao Date: Tue, 11 Jan 2011 15:42:57 +0530 Subject: sched: Display autogroup names in /proc/sched_debug Add autogroup name to cfs_rq and tasks information to /proc/sched_debug. Signed-off-by: Bharata B Rao Signed-off-by: Peter Zijlstra LKML-Reference: <20110111101257.GF4772@in.ibm.com> Signed-off-by: Ingo Molnar --- kernel/sched_autogroup.c | 5 +++++ kernel/sched_debug.c | 3 +++ 2 files changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/sched_autogroup.c b/kernel/sched_autogroup.c index 32a723b..938d52f 100644 --- a/kernel/sched_autogroup.c +++ b/kernel/sched_autogroup.c @@ -231,6 +231,11 @@ void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m) #ifdef CONFIG_SCHED_DEBUG static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) { + int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled); + + if (!enabled || !tg->autogroup) + return 0; + return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id); } #endif /* CONFIG_SCHED_DEBUG */ diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 4d36f37..e4d3725 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -93,6 +93,9 @@ static char group_path[PATH_MAX]; static char *task_group_path(struct task_group *tg) { + if (autogroup_path(tg, group_path, PATH_MAX)) + return group_path; + /* * May be NULL if the underlying cgroup isn't fully-created yet */ -- cgit v1.1 From f44937718ce3b8360f72f6c68c9481712517a867 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Thu, 13 Jan 2011 04:54:50 +0100 Subject: sched, autogroup: Fix CONFIG_RT_GROUP_SCHED sched_setscheduler() failure If CONFIG_RT_GROUP_SCHED is set, __sched_setscheduler() fails due to autogroup not allocating rt_runtime. Free unused/unusable rt_se and rt_rq, redirect RT tasks to the root task group, and tell __sched_setscheduler() that it's ok. Reported-and-tested-by: Bharata B Rao Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra LKML-Reference: <1294890890.8089.39.camel@marge.simson.net> Signed-off-by: Ingo Molnar --- kernel/sched.c | 3 ++- kernel/sched_autogroup.c | 27 +++++++++++++++++++++++++++ kernel/sched_autogroup.h | 4 ++++ 3 files changed, 33 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index a0eb094..6cbff6b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4871,7 +4871,8 @@ recheck: * assigned. */ if (rt_bandwidth_enabled() && rt_policy(policy) && - task_group(p)->rt_bandwidth.rt_runtime == 0) { + task_group(p)->rt_bandwidth.rt_runtime == 0 && + !task_group_is_autogroup(task_group(p))) { __task_rq_unlock(rq); raw_spin_unlock_irqrestore(&p->pi_lock, flags); return -EPERM; diff --git a/kernel/sched_autogroup.c b/kernel/sched_autogroup.c index 938d52f..9fb6562 100644 --- a/kernel/sched_autogroup.c +++ b/kernel/sched_autogroup.c @@ -27,6 +27,11 @@ static inline void autogroup_destroy(struct kref *kref) { struct autogroup *ag = container_of(kref, struct autogroup, kref); +#ifdef CONFIG_RT_GROUP_SCHED + /* We've redirected RT tasks to the root task group... */ + ag->tg->rt_se = NULL; + ag->tg->rt_rq = NULL; +#endif sched_destroy_group(ag->tg); } @@ -55,6 +60,10 @@ static inline struct autogroup *autogroup_task_get(struct task_struct *p) return ag; } +#ifdef CONFIG_RT_GROUP_SCHED +static void free_rt_sched_group(struct task_group *tg); +#endif + static inline struct autogroup *autogroup_create(void) { struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL); @@ -72,6 +81,19 @@ static inline struct autogroup *autogroup_create(void) init_rwsem(&ag->lock); ag->id = atomic_inc_return(&autogroup_seq_nr); ag->tg = tg; +#ifdef CONFIG_RT_GROUP_SCHED + /* + * Autogroup RT tasks are redirected to the root task group + * so we don't have to move tasks around upon policy change, + * or flail around trying to allocate bandwidth on the fly. + * A bandwidth exception in __sched_setscheduler() allows + * the policy change to proceed. Thereafter, task_group() + * returns &root_task_group, so zero bandwidth is required. + */ + free_rt_sched_group(tg); + tg->rt_se = root_task_group.rt_se; + tg->rt_rq = root_task_group.rt_rq; +#endif tg->autogroup = ag; return ag; @@ -106,6 +128,11 @@ task_wants_autogroup(struct task_struct *p, struct task_group *tg) return true; } +static inline bool task_group_is_autogroup(struct task_group *tg) +{ + return tg != &root_task_group && tg->autogroup; +} + static inline struct task_group * autogroup_task_group(struct task_struct *p, struct task_group *tg) { diff --git a/kernel/sched_autogroup.h b/kernel/sched_autogroup.h index 5358e24..7b859ff 100644 --- a/kernel/sched_autogroup.h +++ b/kernel/sched_autogroup.h @@ -15,6 +15,10 @@ autogroup_task_group(struct task_struct *p, struct task_group *tg); static inline void autogroup_init(struct task_struct *init_task) { } static inline void autogroup_free(struct task_group *tg) { } +static inline bool task_group_is_autogroup(struct task_group *tg) +{ + return 0; +} static inline struct task_group * autogroup_task_group(struct task_struct *p, struct task_group *tg) -- cgit v1.1 From fce2097983d914ea8c2338efc6f6e9bb737f6ae4 Mon Sep 17 00:00:00 2001 From: Yong Zhang Date: Fri, 14 Jan 2011 15:57:39 +0800 Subject: sched: Replace rq->bkl_count with rq->rq_sched_info.bkl_count Now rq->rq_sched_info.bkl_count is not used for rq, scroll rq->bkl_count into it. Thus we can save some space for rq. Signed-off-by: Yong Zhang Signed-off-by: Peter Zijlstra LKML-Reference: <1294991859-13246-1-git-send-email-yong.zhang0@gmail.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 5 +---- kernel/sched_debug.c | 4 +++- 2 files changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 6cbff6b..0a169a8 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -553,9 +553,6 @@ struct rq { /* try_to_wake_up() stats */ unsigned int ttwu_count; unsigned int ttwu_local; - - /* BKL stats */ - unsigned int bkl_count; #endif }; @@ -3887,7 +3884,7 @@ static inline void schedule_debug(struct task_struct *prev) schedstat_inc(this_rq(), sched_count); #ifdef CONFIG_SCHEDSTATS if (unlikely(prev->lock_depth >= 0)) { - schedstat_inc(this_rq(), bkl_count); + schedstat_inc(this_rq(), rq_sched_info.bkl_count); schedstat_inc(prev, sched_info.bkl_count); } #endif diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index e4d3725..eb6cb8e 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -296,9 +296,11 @@ static void print_cpu(struct seq_file *m, int cpu) P(ttwu_count); P(ttwu_local); - P(bkl_count); + SEQ_printf(m, " .%-30s: %d\n", "bkl_count", + rq->rq_sched_info.bkl_count); #undef P +#undef P64 #endif spin_lock_irqsave(&sched_debug_lock, flags); print_cfs_stats(m, cpu); -- cgit v1.1 From d7d8294415f0ce4254827d4a2a5ee88b00be52a8 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Wed, 5 Jan 2011 05:41:17 +0100 Subject: sched: Fix signed unsigned comparison in check_preempt_tick() Signed unsigned comparison may lead to superfluous resched if leftmost is right of the current task, wasting a few cycles, and inadvertently _lengthening_ the current task's slice. Reported-by: Venkatesh Pallipadi Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra LKML-Reference: <1294202477.9384.5.camel@marge.simson.net> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 414145c..77e9166 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1062,6 +1062,9 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) struct sched_entity *se = __pick_next_entity(cfs_rq); s64 delta = curr->vruntime - se->vruntime; + if (delta < 0) + return; + if (delta > ideal_runtime) resched_task(rq_of(cfs_rq)->curr); } -- cgit v1.1 From 068c5cc5ac7414a8e9eb7856b4bf3cc4d4744267 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 19 Jan 2011 12:26:11 +0100 Subject: sched, cgroup: Use exit hook to avoid use-after-free crash By not notifying the controller of the on-exit move back to init_css_set, we fail to move the task out of the previous cgroup's cfs_rq. This leads to an opportunity for a cgroup-destroy to come in and free the cgroup (there are no active tasks left in it after all) to which the not-quite dead task is still enqueued. Reported-by: Miklos Vajna Fixed-by: Mike Galbraith Signed-off-by: Peter Zijlstra Cc: Cc: Mike Galbraith Signed-off-by: Ingo Molnar LKML-Reference: <1293206353.29444.205.camel@laptop> --- kernel/sched.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 0a169a8..fa5272a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -606,6 +606,9 @@ static inline struct task_group *task_group(struct task_struct *p) struct task_group *tg; struct cgroup_subsys_state *css; + if (p->flags & PF_EXITING) + return &root_task_group; + css = task_subsys_state_check(p, cpu_cgroup_subsys_id, lockdep_is_held(&task_rq(p)->lock)); tg = container_of(css, struct task_group, css); @@ -8880,6 +8883,20 @@ cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, } } +static void +cpu_cgroup_exit(struct cgroup_subsys *ss, struct task_struct *task) +{ + /* + * cgroup_exit() is called in the copy_process() failure path. + * Ignore this case since the task hasn't ran yet, this avoids + * trying to poke a half freed task state from generic code. + */ + if (!(task->flags & PF_EXITING)) + return; + + sched_move_task(task); +} + #ifdef CONFIG_FAIR_GROUP_SCHED static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, u64 shareval) @@ -8952,6 +8969,7 @@ struct cgroup_subsys cpu_cgroup_subsys = { .destroy = cpu_cgroup_destroy, .can_attach = cpu_cgroup_can_attach, .attach = cpu_cgroup_attach, + .exit = cpu_cgroup_exit, .populate = cpu_cgroup_populate, .subsys_id = cpu_cgroup_subsys_id, .early_init = 1, -- cgit v1.1 From 2ce802f62ba32a7d95748ac92bf351f76affb6ff Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 20 Jan 2011 12:06:35 +0100 Subject: lockdep: Move early boot local IRQ enable/disable status to init/main.c During early boot, local IRQ is disabled until IRQ subsystem is properly initialized. During this time, no one should enable local IRQ and some operations which usually are not allowed with IRQ disabled, e.g. operations which might sleep or require communications with other processors, are allowed. lockdep tracked this with early_boot_irqs_off/on() callbacks. As other subsystems need this information too, move it to init/main.c and make it generally available. While at it, toggle the boolean to early_boot_irqs_disabled instead of enabled so that it can be initialized with %false and %true indicates the exceptional condition. Signed-off-by: Tejun Heo Acked-by: Peter Zijlstra Acked-by: Pekka Enberg Cc: Linus Torvalds LKML-Reference: <20110120110635.GB6036@htj.dyndns.org> Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 18 +----------------- kernel/trace/trace_irqsoff.c | 8 -------- 2 files changed, 1 insertion(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 42ba65d..0d2058d 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -2292,22 +2292,6 @@ mark_held_locks(struct task_struct *curr, enum mark_type mark) } /* - * Debugging helper: via this flag we know that we are in - * 'early bootup code', and will warn about any invalid irqs-on event: - */ -static int early_boot_irqs_enabled; - -void early_boot_irqs_off(void) -{ - early_boot_irqs_enabled = 0; -} - -void early_boot_irqs_on(void) -{ - early_boot_irqs_enabled = 1; -} - -/* * Hardirqs will be enabled: */ void trace_hardirqs_on_caller(unsigned long ip) @@ -2319,7 +2303,7 @@ void trace_hardirqs_on_caller(unsigned long ip) if (unlikely(!debug_locks || current->lockdep_recursion)) return; - if (DEBUG_LOCKS_WARN_ON(unlikely(!early_boot_irqs_enabled))) + if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled))) return; if (unlikely(curr->hardirqs_enabled)) { diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 5cf8c60..92b6e1e 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -453,14 +453,6 @@ void time_hardirqs_off(unsigned long a0, unsigned long a1) * Stubs: */ -void early_boot_irqs_off(void) -{ -} - -void early_boot_irqs_on(void) -{ -} - void trace_softirqs_on(unsigned long ip) { } -- cgit v1.1 From bd924e8cbd4b73ffb7d707a774c04f7e2cae88ed Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 20 Jan 2011 12:07:13 +0100 Subject: smp: Allow on_each_cpu() to be called while early_boot_irqs_disabled status to init/main.c percpu may end up calling vfree() during early boot which in turn may call on_each_cpu() for TLB flushes. The function of on_each_cpu() can be done safely while IRQ is disabled during early boot but it assumed that the function is always called with local IRQ enabled which ended up enabling local IRQ prematurely during boot and triggering a couple of warnings. This patch updates on_each_cpu() and smp_call_function_many() such on_each_cpu() can be used safely while early_boot_irqs_disabled is set. Signed-off-by: Tejun Heo Acked-by: Peter Zijlstra Acked-by: Pekka Enberg Cc: Linus Torvalds LKML-Reference: <20110120110713.GC6036@htj.dyndns.org> Signed-off-by: Ingo Molnar Reported-by: Ingo Molnar --- kernel/smp.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index 4ec30e0..4b83cd6 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -430,7 +430,7 @@ void smp_call_function_many(const struct cpumask *mask, * can't happen. */ WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled() - && !oops_in_progress); + && !oops_in_progress && !early_boot_irqs_disabled); /* So, what's a CPU they want? Ignoring this one. */ cpu = cpumask_first_and(mask, cpu_online_mask); @@ -533,17 +533,20 @@ void ipi_call_unlock_irq(void) #endif /* USE_GENERIC_SMP_HELPERS */ /* - * Call a function on all processors + * Call a function on all processors. May be used during early boot while + * early_boot_irqs_disabled is set. Use local_irq_save/restore() instead + * of local_irq_disable/enable(). */ int on_each_cpu(void (*func) (void *info), void *info, int wait) { + unsigned long flags; int ret = 0; preempt_disable(); ret = smp_call_function(func, info, wait); - local_irq_disable(); + local_irq_save(flags); func(info); - local_irq_enable(); + local_irq_restore(flags); preempt_enable(); return ret; } -- cgit v1.1 From 6dc19899958e420a931274b94019e267e2396d3e Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Thu, 20 Jan 2011 14:44:33 -0800 Subject: kernel/smp.c: fix smp_call_function_many() SMP race I noticed a failure where we hit the following WARN_ON in generic_smp_call_function_interrupt: if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) continue; data->csd.func(data->csd.info); refs = atomic_dec_return(&data->refs); WARN_ON(refs < 0); <------------------------- We atomically tested and cleared our bit in the cpumask, and yet the number of cpus left (ie refs) was 0. How can this be? It turns out commit 54fdade1c3332391948ec43530c02c4794a38172 ("generic-ipi: make struct call_function_data lockless") is at fault. It removes locking from smp_call_function_many and in doing so creates a rather complicated race. The problem comes about because: - The smp_call_function_many interrupt handler walks call_function.queue without any locking. - We reuse a percpu data structure in smp_call_function_many. - We do not wait for any RCU grace period before starting the next smp_call_function_many. Imagine a scenario where CPU A does two smp_call_functions back to back, and CPU B does an smp_call_function in between. We concentrate on how CPU C handles the calls: CPU A CPU B CPU C CPU D smp_call_function smp_call_function_interrupt walks call_function.queue sees data from CPU A on list smp_call_function smp_call_function_interrupt walks call_function.queue sees (stale) CPU A on list smp_call_function int clears last ref on A list_del_rcu, unlock smp_call_function reuses percpu *data A data->cpumask sees and clears bit in cpumask might be using old or new fn! decrements refs below 0 set data->refs (too late!) The important thing to note is since the interrupt handler walks a potentially stale call_function.queue without any locking, then another cpu can view the percpu *data structure at any time, even when the owner is in the process of initialising it. The following test case hits the WARN_ON 100% of the time on my PowerPC box (having 128 threads does help :) #include #include #define ITERATIONS 100 static void do_nothing_ipi(void *dummy) { } static void do_ipis(struct work_struct *dummy) { int i; for (i = 0; i < ITERATIONS; i++) smp_call_function(do_nothing_ipi, NULL, 1); printk(KERN_DEBUG "cpu %d finished\n", smp_processor_id()); } static struct work_struct work[NR_CPUS]; static int __init testcase_init(void) { int cpu; for_each_online_cpu(cpu) { INIT_WORK(&work[cpu], do_ipis); schedule_work_on(cpu, &work[cpu]); } return 0; } static void __exit testcase_exit(void) { } module_init(testcase_init) module_exit(testcase_exit) MODULE_LICENSE("GPL"); MODULE_AUTHOR("Anton Blanchard"); I tried to fix it by ordering the read and the write of ->cpumask and ->refs. In doing so I missed a critical case but Paul McKenney was able to spot my bug thankfully :) To ensure we arent viewing previous iterations the interrupt handler needs to read ->refs then ->cpumask then ->refs _again_. Thanks to Milton Miller and Paul McKenney for helping to debug this issue. [miltonm@bga.com: add WARN_ON and BUG_ON, remove extra read of refs before initial read of mask that doesn't help (also noted by Peter Zijlstra), adjust comments, hopefully clarify scenario ] [miltonm@bga.com: remove excess tests] Signed-off-by: Anton Blanchard Signed-off-by: Milton Miller Cc: Ingo Molnar Cc: "Paul E. McKenney" Cc: [2.6.32+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/smp.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index 4ec30e0..17c6e58 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -195,6 +195,24 @@ void generic_smp_call_function_interrupt(void) list_for_each_entry_rcu(data, &call_function.queue, csd.list) { int refs; + /* + * Since we walk the list without any locks, we might + * see an entry that was completed, removed from the + * list and is in the process of being reused. + * + * We must check that the cpu is in the cpumask before + * checking the refs, and both must be set before + * executing the callback on this cpu. + */ + + if (!cpumask_test_cpu(cpu, data->cpumask)) + continue; + + smp_rmb(); + + if (atomic_read(&data->refs) == 0) + continue; + if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) continue; @@ -203,6 +221,8 @@ void generic_smp_call_function_interrupt(void) refs = atomic_dec_return(&data->refs); WARN_ON(refs < 0); if (!refs) { + WARN_ON(!cpumask_empty(data->cpumask)); + raw_spin_lock(&call_function.lock); list_del_rcu(&data->csd.list); raw_spin_unlock(&call_function.lock); @@ -454,11 +474,21 @@ void smp_call_function_many(const struct cpumask *mask, data = &__get_cpu_var(cfd_data); csd_lock(&data->csd); + BUG_ON(atomic_read(&data->refs) || !cpumask_empty(data->cpumask)); data->csd.func = func; data->csd.info = info; cpumask_and(data->cpumask, mask, cpu_online_mask); cpumask_clear_cpu(this_cpu, data->cpumask); + + /* + * To ensure the interrupt handler gets an complete view + * we order the cpumask and refs writes and order the read + * of them in the interrupt handler. In addition we may + * only clear our own cpu bit from the mask. + */ + smp_wmb(); + atomic_set(&data->refs, cpumask_weight(data->cpumask)); raw_spin_lock_irqsave(&call_function.lock, flags); -- cgit v1.1 From 225c8e010f2d17a62aef131e24c6e7c111f36f9b Mon Sep 17 00:00:00 2001 From: Milton Miller Date: Thu, 20 Jan 2011 14:44:34 -0800 Subject: kernel/smp.c: consolidate writes in smp_call_function_interrupt() We have to test the cpu mask in the interrupt handler before checking the refs, otherwise we can start to follow an entry before its deleted and find it partially initailzed for the next trip. Presently we also clear the cpumask bit before executing the called function, which implies getting write access to the line. After the function is called we then decrement refs, and if they go to zero we then unlock the structure. However, this implies getting write access to the call function data before and after another the function is called. If we can assert that no smp_call_function execution function is allowed to enable interrupts, then we can move both writes to after the function is called, hopfully allowing both writes with one cache line bounce. On a 256 thread system with a kernel compiled for 1024 threads, the time to execute testcase in the "smp_call_function_many race" changelog was reduced by about 30-40ms out of about 545 ms. I decided to keep this as WARN because its now a buggy function, even though the stack trace is of no value -- a simple printk would give us the information needed. Raw data: Without patch: ipi_test startup took 1219366ns complete 539819014ns total 541038380ns ipi_test startup took 1695754ns complete 543439872ns total 545135626ns ipi_test startup took 7513568ns complete 539606362ns total 547119930ns ipi_test startup took 13304064ns complete 533898562ns total 547202626ns ipi_test startup took 8668192ns complete 544264074ns total 552932266ns ipi_test startup took 4977626ns complete 548862684ns total 553840310ns ipi_test startup took 2144486ns complete 541292318ns total 543436804ns ipi_test startup took 21245824ns complete 530280180ns total 551526004ns With patch: ipi_test startup took 5961748ns complete 500859628ns total 506821376ns ipi_test startup took 8975996ns complete 495098924ns total 504074920ns ipi_test startup took 19797750ns complete 492204740ns total 512002490ns ipi_test startup took 14824796ns complete 487495878ns total 502320674ns ipi_test startup took 11514882ns complete 494439372ns total 505954254ns ipi_test startup took 8288084ns complete 502570774ns total 510858858ns ipi_test startup took 6789954ns complete 493388112ns total 500178066ns #include #include #include /* sched clock */ #define ITERATIONS 100 static void do_nothing_ipi(void *dummy) { } static void do_ipis(struct work_struct *dummy) { int i; for (i = 0; i < ITERATIONS; i++) smp_call_function(do_nothing_ipi, NULL, 1); printk(KERN_DEBUG "cpu %d finished\n", smp_processor_id()); } static struct work_struct work[NR_CPUS]; static int __init testcase_init(void) { int cpu; u64 start, started, done; start = local_clock(); for_each_online_cpu(cpu) { INIT_WORK(&work[cpu], do_ipis); schedule_work_on(cpu, &work[cpu]); } started = local_clock(); for_each_online_cpu(cpu) flush_work(&work[cpu]); done = local_clock(); pr_info("ipi_test startup took %lldns complete %lldns total %lldns\n", started-start, done-started, done-start); return 0; } static void __exit testcase_exit(void) { } module_init(testcase_init) module_exit(testcase_exit) MODULE_LICENSE("GPL"); MODULE_AUTHOR("Anton Blanchard"); Signed-off-by: Milton Miller Cc: Anton Blanchard Cc: Ingo Molnar Cc: "Paul E. McKenney" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/smp.c | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index 17c6e58..2fe66f7 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -194,6 +194,7 @@ void generic_smp_call_function_interrupt(void) */ list_for_each_entry_rcu(data, &call_function.queue, csd.list) { int refs; + void (*func) (void *info); /* * Since we walk the list without any locks, we might @@ -213,24 +214,32 @@ void generic_smp_call_function_interrupt(void) if (atomic_read(&data->refs) == 0) continue; - if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) - continue; - + func = data->csd.func; /* for later warn */ data->csd.func(data->csd.info); + /* + * If the cpu mask is not still set then it enabled interrupts, + * we took another smp interrupt, and executed the function + * twice on this cpu. In theory that copy decremented refs. + */ + if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) { + WARN(1, "%pS enabled interrupts and double executed\n", + func); + continue; + } + refs = atomic_dec_return(&data->refs); WARN_ON(refs < 0); - if (!refs) { - WARN_ON(!cpumask_empty(data->cpumask)); - - raw_spin_lock(&call_function.lock); - list_del_rcu(&data->csd.list); - raw_spin_unlock(&call_function.lock); - } if (refs) continue; + WARN_ON(!cpumask_empty(data->cpumask)); + + raw_spin_lock(&call_function.lock); + list_del_rcu(&data->csd.list); + raw_spin_unlock(&call_function.lock); + csd_unlock(&data->csd); } -- cgit v1.1 From 1c77ff22f539ceaa64ea43d6a26d867e84602cb7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 19 Jan 2011 19:41:35 +0100 Subject: genirq: Remove __do_IRQ All architectures are finally converted. Remove the cruft. Signed-off-by: Thomas Gleixner Cc: Richard Henderson Cc: Mike Frysinger Cc: David Howells Cc: Tony Luck Cc: Greg Ungerer Cc: Michal Simek Acked-by: David Howells Cc: Kyle McMartin Acked-by: Benjamin Herrenschmidt Cc: Chen Liqin Cc: "David S. Miller" Cc: Chris Metcalf Cc: Jeff Dike --- kernel/irq/Kconfig | 3 -- kernel/irq/handle.c | 111 ---------------------------------------------------- 2 files changed, 114 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 31d766b..8e42fec 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -9,9 +9,6 @@ menu "IRQ subsystem" config GENERIC_HARDIRQS def_bool y -config GENERIC_HARDIRQS_NO__DO_IRQ - def_bool y - # Select this to disable the deprecated stuff config GENERIC_HARDIRQS_NO_DEPRECATED def_bool n diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index e2347eb..3540a71 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -118,114 +118,3 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) return retval; } - -#ifndef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ - -#ifdef CONFIG_ENABLE_WARN_DEPRECATED -# warning __do_IRQ is deprecated. Please convert to proper flow handlers -#endif - -/** - * __do_IRQ - original all in one highlevel IRQ handler - * @irq: the interrupt number - * - * __do_IRQ handles all normal device IRQ's (the special - * SMP cross-CPU interrupts have their own specific - * handlers). - * - * This is the original x86 implementation which is used for every - * interrupt type. - */ -unsigned int __do_IRQ(unsigned int irq) -{ - struct irq_desc *desc = irq_to_desc(irq); - struct irqaction *action; - unsigned int status; - - kstat_incr_irqs_this_cpu(irq, desc); - - if (CHECK_IRQ_PER_CPU(desc->status)) { - irqreturn_t action_ret; - - /* - * No locking required for CPU-local interrupts: - */ - if (desc->irq_data.chip->ack) - desc->irq_data.chip->ack(irq); - if (likely(!(desc->status & IRQ_DISABLED))) { - action_ret = handle_IRQ_event(irq, desc->action); - if (!noirqdebug) - note_interrupt(irq, desc, action_ret); - } - desc->irq_data.chip->end(irq); - return 1; - } - - raw_spin_lock(&desc->lock); - if (desc->irq_data.chip->ack) - desc->irq_data.chip->ack(irq); - /* - * REPLAY is when Linux resends an IRQ that was dropped earlier - * WAITING is used by probe to mark irqs that are being tested - */ - status = desc->status & ~(IRQ_REPLAY | IRQ_WAITING); - status |= IRQ_PENDING; /* we _want_ to handle it */ - - /* - * If the IRQ is disabled for whatever reason, we cannot - * use the action we have. - */ - action = NULL; - if (likely(!(status & (IRQ_DISABLED | IRQ_INPROGRESS)))) { - action = desc->action; - status &= ~IRQ_PENDING; /* we commit to handling */ - status |= IRQ_INPROGRESS; /* we are handling it */ - } - desc->status = status; - - /* - * If there is no IRQ handler or it was disabled, exit early. - * Since we set PENDING, if another processor is handling - * a different instance of this same irq, the other processor - * will take care of it. - */ - if (unlikely(!action)) - goto out; - - /* - * Edge triggered interrupts need to remember - * pending events. - * This applies to any hw interrupts that allow a second - * instance of the same irq to arrive while we are in do_IRQ - * or in the handler. But the code here only handles the _second_ - * instance of the irq, not the third or fourth. So it is mostly - * useful for irq hardware that does not mask cleanly in an - * SMP environment. - */ - for (;;) { - irqreturn_t action_ret; - - raw_spin_unlock(&desc->lock); - - action_ret = handle_IRQ_event(irq, action); - if (!noirqdebug) - note_interrupt(irq, desc, action_ret); - - raw_spin_lock(&desc->lock); - if (likely(!(desc->status & IRQ_PENDING))) - break; - desc->status &= ~IRQ_PENDING; - } - desc->status &= ~IRQ_INPROGRESS; - -out: - /* - * The ->end() handler has to deal with interrupts which got - * disabled while the handler was running. - */ - desc->irq_data.chip->end(irq); - raw_spin_unlock(&desc->lock); - - return 1; -} -#endif -- cgit v1.1