From e159489baa717dbae70f9903770a6a4990865887 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 9 Jan 2011 23:32:15 +0100
Subject: workqueue: relax lockdep annotation on flush_work()

Currently, the lockdep annotation in flush_work() requires exclusive
access on the workqueue the target work is queued on and triggers
warning if a work is trying to flush another work on the same
workqueue; however, this is no longer true as workqueues can now
execute multiple works concurrently.

This patch adds lock_map_acquire_read() and make process_one_work()
hold read access to the workqueue while executing a work and
start_flush_work() check for write access if concurrnecy level is one
or the workqueue has a rescuer (as only one execution resource - the
rescuer - is guaranteed to be available under memory pressure), and
read access if higher.

This better represents what's going on and removes spurious lockdep
warnings which are triggered by fake dependency chain created through
flush_work().

* Peter pointed out that flushing another work from a WQ_MEM_RECLAIM
  wq breaks forward progress guarantee under memory pressure.
  Condition check accordingly updated.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: "Rafael J. Wysocki" <rjw@sisk.pl>
Tested-by: "Rafael J. Wysocki" <rjw@sisk.pl>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: stable@kernel.org
---
 kernel/workqueue.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 8ee6ec8..930c239 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1840,7 +1840,7 @@ __acquires(&gcwq->lock)
 	spin_unlock_irq(&gcwq->lock);
 
 	work_clear_pending(work);
-	lock_map_acquire(&cwq->wq->lockdep_map);
+	lock_map_acquire_read(&cwq->wq->lockdep_map);
 	lock_map_acquire(&lockdep_map);
 	trace_workqueue_execute_start(work);
 	f(work);
@@ -2384,8 +2384,18 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
 	insert_wq_barrier(cwq, barr, work, worker);
 	spin_unlock_irq(&gcwq->lock);
 
-	lock_map_acquire(&cwq->wq->lockdep_map);
+	/*
+	 * If @max_active is 1 or rescuer is in use, flushing another work
+	 * item on the same workqueue may lead to deadlock.  Make sure the
+	 * flusher is not running on the same workqueue by verifying write
+	 * access.
+	 */
+	if (cwq->wq->saved_max_active == 1 || cwq->wq->flags & WQ_RESCUER)
+		lock_map_acquire(&cwq->wq->lockdep_map);
+	else
+		lock_map_acquire_read(&cwq->wq->lockdep_map);
 	lock_map_release(&cwq->wq->lockdep_map);
+
 	return true;
 already_gone:
 	spin_unlock_irq(&gcwq->lock);
-- 
cgit v1.1


From 42c025f3de9042d9c9abd9a6f6205d1a0f4bcadf Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 11 Jan 2011 15:58:49 +0100
Subject: workqueue: note the nested NOT_RUNNING test in worker_clr_flags()
 isn't a noop

The nested NOT_RUNNING test in worker_clr_flags() is slightly
misleading in that if NOT_RUNNING were a single flag the nested test
would be always %true and thus noop.  Add a comment noting that the
test isn't a noop.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Hillf Danton <dhillf@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/workqueue.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 930c239..11869fa 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -768,7 +768,11 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
 
 	worker->flags &= ~flags;
 
-	/* if transitioning out of NOT_RUNNING, increment nr_running */
+	/*
+	 * If transitioning out of NOT_RUNNING, increment nr_running.  Note
+	 * that the nested NOT_RUNNING is not a noop.  NOT_RUNNING is mask
+	 * of multiple flags, not a single flag.
+	 */
 	if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING))
 		if (!(worker->flags & WORKER_NOT_RUNNING))
 			atomic_inc(get_gcwq_nr_running(gcwq->cpu));
-- 
cgit v1.1


From 977dda7c9b540f48b228174346d8b31542c1e99f Mon Sep 17 00:00:00 2001
From: Paul Turner <pjt@google.com>
Date: Fri, 14 Jan 2011 17:57:50 -0800
Subject: sched: Update effective_load() to use global share weights

Previously effective_load would approximate the global load weight present on
a group taking advantage of:

entity_weight = tg->shares ( lw / global_lw ), where entity_weight was provided
by tg_shares_up.

This worked (approximately) for an 'empty' (at tg level) cpu since we would
place boost load representative of what a newly woken task would receive.

However, now that load is instantaneously updated this assumption is no longer
true and the load calculation is rather incorrect in this case.

Fix this (and improve the general case) by re-writing effective_load to take
advantage of the new shares distribution code.

Signed-off-by: Paul Turner <pjt@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20110115015817.069769529@google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index c62ebae..414145c 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1362,27 +1362,27 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
 		return wl;
 
 	for_each_sched_entity(se) {
-		long S, rw, s, a, b;
+		long lw, w;
 
-		S = se->my_q->tg->shares;
-		s = se->load.weight;
-		rw = se->my_q->load.weight;
+		tg = se->my_q->tg;
+		w = se->my_q->load.weight;
 
-		a = S*(rw + wl);
-		b = S*rw + s*wg;
+		/* use this cpu's instantaneous contribution */
+		lw = atomic_read(&tg->load_weight);
+		lw -= se->my_q->load_contribution;
+		lw += w + wg;
 
-		wl = s*(a-b);
+		wl += w;
 
-		if (likely(b))
-			wl /= b;
+		if (lw > 0 && wl < lw)
+			wl = (wl * tg->shares) / lw;
+		else
+			wl = tg->shares;
 
-		/*
-		 * Assume the group is already running and will
-		 * thus already be accounted for in the weight.
-		 *
-		 * That is, moving shares between CPUs, does not
-		 * alter the group weight.
-		 */
+		/* zero point is MIN_SHARES */
+		if (wl < MIN_SHARES)
+			wl = MIN_SHARES;
+		wl -= se->load.weight;
 		wg = 0;
 	}
 
-- 
cgit v1.1


From efe25c2c7b3a5d17b0c70987a758d8fe7af8e3d1 Mon Sep 17 00:00:00 2001
From: Bharata B Rao <bharata@linux.vnet.ibm.com>
Date: Tue, 11 Jan 2011 15:41:54 +0530
Subject: sched: Reinstate group names in /proc/sched_debug

Displaying of group names in /proc/sched_debug was dropped in autogroup
patches. Add group names while displaying cfs_rq and tasks information.

Signed-off-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20110111101153.GE4772@in.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_debug.c | 35 +++++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 1dfae3d..4d36f37 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -16,6 +16,8 @@
 #include <linux/kallsyms.h>
 #include <linux/utsname.h>
 
+static DEFINE_SPINLOCK(sched_debug_lock);
+
 /*
  * This allows printing both to /proc/sched_debug and
  * to the console
@@ -86,6 +88,23 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
 }
 #endif
 
+#ifdef CONFIG_CGROUP_SCHED
+static char group_path[PATH_MAX];
+
+static char *task_group_path(struct task_group *tg)
+{
+	/*
+	 * May be NULL if the underlying cgroup isn't fully-created yet
+	 */
+	if (!tg->css.cgroup) {
+		group_path[0] = '\0';
+		return group_path;
+	}
+	cgroup_path(tg->css.cgroup, group_path, PATH_MAX);
+	return group_path;
+}
+#endif
+
 static void
 print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
 {
@@ -108,6 +127,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
 	SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld",
 		0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
 #endif
+#ifdef CONFIG_CGROUP_SCHED
+	SEQ_printf(m, " %s", task_group_path(task_group(p)));
+#endif
 
 	SEQ_printf(m, "\n");
 }
@@ -144,7 +166,11 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 	struct sched_entity *last;
 	unsigned long flags;
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, task_group_path(cfs_rq->tg));
+#else
 	SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu);
+#endif
 	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "exec_clock",
 			SPLIT_NS(cfs_rq->exec_clock));
 
@@ -191,7 +217,11 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 
 void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
 {
+#ifdef CONFIG_RT_GROUP_SCHED
+	SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, task_group_path(rt_rq->tg));
+#else
 	SEQ_printf(m, "\nrt_rq[%d]:\n", cpu);
+#endif
 
 #define P(x) \
 	SEQ_printf(m, "  .%-30s: %Ld\n", #x, (long long)(rt_rq->x))
@@ -212,6 +242,7 @@ extern __read_mostly int sched_clock_running;
 static void print_cpu(struct seq_file *m, int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
+	unsigned long flags;
 
 #ifdef CONFIG_X86
 	{
@@ -266,10 +297,14 @@ static void print_cpu(struct seq_file *m, int cpu)
 
 #undef P
 #endif
+	spin_lock_irqsave(&sched_debug_lock, flags);
 	print_cfs_stats(m, cpu);
 	print_rt_stats(m, cpu);
 
+	rcu_read_lock();
 	print_rq(m, rq, cpu);
+	rcu_read_unlock();
+	spin_unlock_irqrestore(&sched_debug_lock, flags);
 }
 
 static const char *sched_tunable_scaling_names[] = {
-- 
cgit v1.1


From 8ecedd7a06d27a31dbb36fab88e2ba6e6edd43ca Mon Sep 17 00:00:00 2001
From: Bharata B Rao <bharata@linux.vnet.ibm.com>
Date: Tue, 11 Jan 2011 15:42:57 +0530
Subject: sched: Display autogroup names in /proc/sched_debug

Add autogroup name to cfs_rq and tasks information to /proc/sched_debug.

Signed-off-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20110111101257.GF4772@in.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_autogroup.c | 5 +++++
 kernel/sched_debug.c     | 3 +++
 2 files changed, 8 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched_autogroup.c b/kernel/sched_autogroup.c
index 32a723b..938d52f 100644
--- a/kernel/sched_autogroup.c
+++ b/kernel/sched_autogroup.c
@@ -231,6 +231,11 @@ void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m)
 #ifdef CONFIG_SCHED_DEBUG
 static inline int autogroup_path(struct task_group *tg, char *buf, int buflen)
 {
+	int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled);
+
+	if (!enabled || !tg->autogroup)
+		return 0;
+
 	return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id);
 }
 #endif /* CONFIG_SCHED_DEBUG */
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 4d36f37..e4d3725 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -93,6 +93,9 @@ static char group_path[PATH_MAX];
 
 static char *task_group_path(struct task_group *tg)
 {
+	if (autogroup_path(tg, group_path, PATH_MAX))
+		return group_path;
+
 	/*
 	 * May be NULL if the underlying cgroup isn't fully-created yet
 	 */
-- 
cgit v1.1


From f44937718ce3b8360f72f6c68c9481712517a867 Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Thu, 13 Jan 2011 04:54:50 +0100
Subject: sched, autogroup: Fix CONFIG_RT_GROUP_SCHED sched_setscheduler()
 failure

If CONFIG_RT_GROUP_SCHED is set, __sched_setscheduler() fails due to autogroup
not allocating rt_runtime.  Free unused/unusable rt_se and rt_rq, redirect RT
tasks to the root task group, and tell __sched_setscheduler() that it's ok.

Reported-and-tested-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
Signed-off-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1294890890.8089.39.camel@marge.simson.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c           |  3 ++-
 kernel/sched_autogroup.c | 27 +++++++++++++++++++++++++++
 kernel/sched_autogroup.h |  4 ++++
 3 files changed, 33 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index a0eb094..6cbff6b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4871,7 +4871,8 @@ recheck:
 		 * assigned.
 		 */
 		if (rt_bandwidth_enabled() && rt_policy(policy) &&
-				task_group(p)->rt_bandwidth.rt_runtime == 0) {
+				task_group(p)->rt_bandwidth.rt_runtime == 0 &&
+				!task_group_is_autogroup(task_group(p))) {
 			__task_rq_unlock(rq);
 			raw_spin_unlock_irqrestore(&p->pi_lock, flags);
 			return -EPERM;
diff --git a/kernel/sched_autogroup.c b/kernel/sched_autogroup.c
index 938d52f..9fb6562 100644
--- a/kernel/sched_autogroup.c
+++ b/kernel/sched_autogroup.c
@@ -27,6 +27,11 @@ static inline void autogroup_destroy(struct kref *kref)
 {
 	struct autogroup *ag = container_of(kref, struct autogroup, kref);
 
+#ifdef CONFIG_RT_GROUP_SCHED
+	/* We've redirected RT tasks to the root task group... */
+	ag->tg->rt_se = NULL;
+	ag->tg->rt_rq = NULL;
+#endif
 	sched_destroy_group(ag->tg);
 }
 
@@ -55,6 +60,10 @@ static inline struct autogroup *autogroup_task_get(struct task_struct *p)
 	return ag;
 }
 
+#ifdef CONFIG_RT_GROUP_SCHED
+static void free_rt_sched_group(struct task_group *tg);
+#endif
+
 static inline struct autogroup *autogroup_create(void)
 {
 	struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL);
@@ -72,6 +81,19 @@ static inline struct autogroup *autogroup_create(void)
 	init_rwsem(&ag->lock);
 	ag->id = atomic_inc_return(&autogroup_seq_nr);
 	ag->tg = tg;
+#ifdef CONFIG_RT_GROUP_SCHED
+	/*
+	 * Autogroup RT tasks are redirected to the root task group
+	 * so we don't have to move tasks around upon policy change,
+	 * or flail around trying to allocate bandwidth on the fly.
+	 * A bandwidth exception in __sched_setscheduler() allows
+	 * the policy change to proceed.  Thereafter, task_group()
+	 * returns &root_task_group, so zero bandwidth is required.
+	 */
+	free_rt_sched_group(tg);
+	tg->rt_se = root_task_group.rt_se;
+	tg->rt_rq = root_task_group.rt_rq;
+#endif
 	tg->autogroup = ag;
 
 	return ag;
@@ -106,6 +128,11 @@ task_wants_autogroup(struct task_struct *p, struct task_group *tg)
 	return true;
 }
 
+static inline bool task_group_is_autogroup(struct task_group *tg)
+{
+	return tg != &root_task_group && tg->autogroup;
+}
+
 static inline struct task_group *
 autogroup_task_group(struct task_struct *p, struct task_group *tg)
 {
diff --git a/kernel/sched_autogroup.h b/kernel/sched_autogroup.h
index 5358e24..7b859ff 100644
--- a/kernel/sched_autogroup.h
+++ b/kernel/sched_autogroup.h
@@ -15,6 +15,10 @@ autogroup_task_group(struct task_struct *p, struct task_group *tg);
 
 static inline void autogroup_init(struct task_struct *init_task) {  }
 static inline void autogroup_free(struct task_group *tg) { }
+static inline bool task_group_is_autogroup(struct task_group *tg)
+{
+	return 0;
+}
 
 static inline struct task_group *
 autogroup_task_group(struct task_struct *p, struct task_group *tg)
-- 
cgit v1.1


From fce2097983d914ea8c2338efc6f6e9bb737f6ae4 Mon Sep 17 00:00:00 2001
From: Yong Zhang <yong.zhang0@gmail.com>
Date: Fri, 14 Jan 2011 15:57:39 +0800
Subject: sched: Replace rq->bkl_count with rq->rq_sched_info.bkl_count

Now rq->rq_sched_info.bkl_count is not used for rq, scroll
rq->bkl_count into it. Thus we can save some space for rq.

Signed-off-by: Yong Zhang <yong.zhang0@gmail.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1294991859-13246-1-git-send-email-yong.zhang0@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c       | 5 +----
 kernel/sched_debug.c | 4 +++-
 2 files changed, 4 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 6cbff6b..0a169a8 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -553,9 +553,6 @@ struct rq {
 	/* try_to_wake_up() stats */
 	unsigned int ttwu_count;
 	unsigned int ttwu_local;
-
-	/* BKL stats */
-	unsigned int bkl_count;
 #endif
 };
 
@@ -3887,7 +3884,7 @@ static inline void schedule_debug(struct task_struct *prev)
 	schedstat_inc(this_rq(), sched_count);
 #ifdef CONFIG_SCHEDSTATS
 	if (unlikely(prev->lock_depth >= 0)) {
-		schedstat_inc(this_rq(), bkl_count);
+		schedstat_inc(this_rq(), rq_sched_info.bkl_count);
 		schedstat_inc(prev, sched_info.bkl_count);
 	}
 #endif
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index e4d3725..eb6cb8e 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -296,9 +296,11 @@ static void print_cpu(struct seq_file *m, int cpu)
 	P(ttwu_count);
 	P(ttwu_local);
 
-	P(bkl_count);
+	SEQ_printf(m, "  .%-30s: %d\n", "bkl_count",
+				rq->rq_sched_info.bkl_count);
 
 #undef P
+#undef P64
 #endif
 	spin_lock_irqsave(&sched_debug_lock, flags);
 	print_cfs_stats(m, cpu);
-- 
cgit v1.1


From d7d8294415f0ce4254827d4a2a5ee88b00be52a8 Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Wed, 5 Jan 2011 05:41:17 +0100
Subject: sched: Fix signed unsigned comparison in check_preempt_tick()

Signed unsigned comparison may lead to superfluous resched if leftmost
is right of the current task, wasting a few cycles, and inadvertently
_lengthening_ the current task's slice.

Reported-by: Venkatesh Pallipadi <venki@google.com>
Signed-off-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1294202477.9384.5.camel@marge.simson.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 414145c..77e9166 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1062,6 +1062,9 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
 		struct sched_entity *se = __pick_next_entity(cfs_rq);
 		s64 delta = curr->vruntime - se->vruntime;
 
+		if (delta < 0)
+			return;
+
 		if (delta > ideal_runtime)
 			resched_task(rq_of(cfs_rq)->curr);
 	}
-- 
cgit v1.1


From 068c5cc5ac7414a8e9eb7856b4bf3cc4d4744267 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 19 Jan 2011 12:26:11 +0100
Subject: sched, cgroup: Use exit hook to avoid use-after-free crash

By not notifying the controller of the on-exit move back to
init_css_set, we fail to move the task out of the previous
cgroup's cfs_rq. This leads to an opportunity for a
cgroup-destroy to come in and free the cgroup (there are no
active tasks left in it after all) to which the not-quite dead
task is still enqueued.

Reported-by: Miklos Vajna <vmiklos@frugalware.org>
Fixed-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: <stable@kernel.org>
Cc: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
LKML-Reference: <1293206353.29444.205.camel@laptop>
---
 kernel/sched.c | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 0a169a8..fa5272a 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -606,6 +606,9 @@ static inline struct task_group *task_group(struct task_struct *p)
 	struct task_group *tg;
 	struct cgroup_subsys_state *css;
 
+	if (p->flags & PF_EXITING)
+		return &root_task_group;
+
 	css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
 			lockdep_is_held(&task_rq(p)->lock));
 	tg = container_of(css, struct task_group, css);
@@ -8880,6 +8883,20 @@ cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
 	}
 }
 
+static void
+cpu_cgroup_exit(struct cgroup_subsys *ss, struct task_struct *task)
+{
+	/*
+	 * cgroup_exit() is called in the copy_process() failure path.
+	 * Ignore this case since the task hasn't ran yet, this avoids
+	 * trying to poke a half freed task state from generic code.
+	 */
+	if (!(task->flags & PF_EXITING))
+		return;
+
+	sched_move_task(task);
+}
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
 static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
 				u64 shareval)
@@ -8952,6 +8969,7 @@ struct cgroup_subsys cpu_cgroup_subsys = {
 	.destroy	= cpu_cgroup_destroy,
 	.can_attach	= cpu_cgroup_can_attach,
 	.attach		= cpu_cgroup_attach,
+	.exit		= cpu_cgroup_exit,
 	.populate	= cpu_cgroup_populate,
 	.subsys_id	= cpu_cgroup_subsys_id,
 	.early_init	= 1,
-- 
cgit v1.1


From 2ce802f62ba32a7d95748ac92bf351f76affb6ff Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 20 Jan 2011 12:06:35 +0100
Subject: lockdep: Move early boot local IRQ enable/disable status to
 init/main.c

During early boot, local IRQ is disabled until IRQ subsystem is
properly initialized.  During this time, no one should enable
local IRQ and some operations which usually are not allowed with
IRQ disabled, e.g. operations which might sleep or require
communications with other processors, are allowed.

lockdep tracked this with early_boot_irqs_off/on() callbacks.
As other subsystems need this information too, move it to
init/main.c and make it generally available.  While at it,
toggle the boolean to early_boot_irqs_disabled instead of
enabled so that it can be initialized with %false and %true
indicates the exceptional condition.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Pekka Enberg <penberg@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
LKML-Reference: <20110120110635.GB6036@htj.dyndns.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep.c             | 18 +-----------------
 kernel/trace/trace_irqsoff.c |  8 --------
 2 files changed, 1 insertion(+), 25 deletions(-)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 42ba65d..0d2058d 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -2292,22 +2292,6 @@ mark_held_locks(struct task_struct *curr, enum mark_type mark)
 }
 
 /*
- * Debugging helper: via this flag we know that we are in
- * 'early bootup code', and will warn about any invalid irqs-on event:
- */
-static int early_boot_irqs_enabled;
-
-void early_boot_irqs_off(void)
-{
-	early_boot_irqs_enabled = 0;
-}
-
-void early_boot_irqs_on(void)
-{
-	early_boot_irqs_enabled = 1;
-}
-
-/*
  * Hardirqs will be enabled:
  */
 void trace_hardirqs_on_caller(unsigned long ip)
@@ -2319,7 +2303,7 @@ void trace_hardirqs_on_caller(unsigned long ip)
 	if (unlikely(!debug_locks || current->lockdep_recursion))
 		return;
 
-	if (DEBUG_LOCKS_WARN_ON(unlikely(!early_boot_irqs_enabled)))
+	if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled)))
 		return;
 
 	if (unlikely(curr->hardirqs_enabled)) {
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 5cf8c60..92b6e1e 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -453,14 +453,6 @@ void time_hardirqs_off(unsigned long a0, unsigned long a1)
  * Stubs:
  */
 
-void early_boot_irqs_off(void)
-{
-}
-
-void early_boot_irqs_on(void)
-{
-}
-
 void trace_softirqs_on(unsigned long ip)
 {
 }
-- 
cgit v1.1


From bd924e8cbd4b73ffb7d707a774c04f7e2cae88ed Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 20 Jan 2011 12:07:13 +0100
Subject: smp: Allow on_each_cpu() to be called while early_boot_irqs_disabled
 status to init/main.c

percpu may end up calling vfree() during early boot which in
turn may call on_each_cpu() for TLB flushes.  The function of
on_each_cpu() can be done safely while IRQ is disabled during
early boot but it assumed that the function is always called
with local IRQ enabled which ended up enabling local IRQ
prematurely during boot and triggering a couple of warnings.

This patch updates on_each_cpu() and smp_call_function_many()
such on_each_cpu() can be used safely while
early_boot_irqs_disabled is set.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Acked-by: Pekka Enberg <penberg@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
LKML-Reference: <20110120110713.GC6036@htj.dyndns.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Reported-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/smp.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/smp.c b/kernel/smp.c
index 4ec30e0..4b83cd6 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -430,7 +430,7 @@ void smp_call_function_many(const struct cpumask *mask,
 	 * can't happen.
 	 */
 	WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled()
-		     && !oops_in_progress);
+		     && !oops_in_progress && !early_boot_irqs_disabled);
 
 	/* So, what's a CPU they want? Ignoring this one. */
 	cpu = cpumask_first_and(mask, cpu_online_mask);
@@ -533,17 +533,20 @@ void ipi_call_unlock_irq(void)
 #endif /* USE_GENERIC_SMP_HELPERS */
 
 /*
- * Call a function on all processors
+ * Call a function on all processors.  May be used during early boot while
+ * early_boot_irqs_disabled is set.  Use local_irq_save/restore() instead
+ * of local_irq_disable/enable().
  */
 int on_each_cpu(void (*func) (void *info), void *info, int wait)
 {
+	unsigned long flags;
 	int ret = 0;
 
 	preempt_disable();
 	ret = smp_call_function(func, info, wait);
-	local_irq_disable();
+	local_irq_save(flags);
 	func(info);
-	local_irq_enable();
+	local_irq_restore(flags);
 	preempt_enable();
 	return ret;
 }
-- 
cgit v1.1


From 6dc19899958e420a931274b94019e267e2396d3e Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Thu, 20 Jan 2011 14:44:33 -0800
Subject: kernel/smp.c: fix smp_call_function_many() SMP race

I noticed a failure where we hit the following WARN_ON in
generic_smp_call_function_interrupt:

                if (!cpumask_test_and_clear_cpu(cpu, data->cpumask))
                        continue;

                data->csd.func(data->csd.info);

                refs = atomic_dec_return(&data->refs);
                WARN_ON(refs < 0);      <-------------------------

We atomically tested and cleared our bit in the cpumask, and yet the
number of cpus left (ie refs) was 0.  How can this be?

It turns out commit 54fdade1c3332391948ec43530c02c4794a38172
("generic-ipi: make struct call_function_data lockless") is at fault.  It
removes locking from smp_call_function_many and in doing so creates a
rather complicated race.

The problem comes about because:

 - The smp_call_function_many interrupt handler walks call_function.queue
   without any locking.
 - We reuse a percpu data structure in smp_call_function_many.
 - We do not wait for any RCU grace period before starting the next
   smp_call_function_many.

Imagine a scenario where CPU A does two smp_call_functions back to back,
and CPU B does an smp_call_function in between.  We concentrate on how CPU
C handles the calls:

CPU A            CPU B                  CPU C              CPU D

smp_call_function
                                        smp_call_function_interrupt
                                            walks
					call_function.queue sees
					data from CPU A on list

                 smp_call_function

                                        smp_call_function_interrupt
                                            walks

                                        call_function.queue sees
                                          (stale) CPU A on list
							   smp_call_function int
							   clears last ref on A
							   list_del_rcu, unlock
smp_call_function reuses
percpu *data A
                                         data->cpumask sees and
                                         clears bit in cpumask
                                         might be using old or new fn!
                                         decrements refs below 0

set data->refs (too late!)

The important thing to note is since the interrupt handler walks a
potentially stale call_function.queue without any locking, then another
cpu can view the percpu *data structure at any time, even when the owner
is in the process of initialising it.

The following test case hits the WARN_ON 100% of the time on my PowerPC
box (having 128 threads does help :)

#include <linux/module.h>
#include <linux/init.h>

#define ITERATIONS 100

static void do_nothing_ipi(void *dummy)
{
}

static void do_ipis(struct work_struct *dummy)
{
	int i;

	for (i = 0; i < ITERATIONS; i++)
		smp_call_function(do_nothing_ipi, NULL, 1);

	printk(KERN_DEBUG "cpu %d finished\n", smp_processor_id());
}

static struct work_struct work[NR_CPUS];

static int __init testcase_init(void)
{
	int cpu;

	for_each_online_cpu(cpu) {
		INIT_WORK(&work[cpu], do_ipis);
		schedule_work_on(cpu, &work[cpu]);
	}

	return 0;
}

static void __exit testcase_exit(void)
{
}

module_init(testcase_init)
module_exit(testcase_exit)
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Anton Blanchard");

I tried to fix it by ordering the read and the write of ->cpumask and
->refs.  In doing so I missed a critical case but Paul McKenney was able
to spot my bug thankfully :) To ensure we arent viewing previous
iterations the interrupt handler needs to read ->refs then ->cpumask then
->refs _again_.

Thanks to Milton Miller and Paul McKenney for helping to debug this issue.

[miltonm@bga.com: add WARN_ON and BUG_ON, remove extra read of refs before initial read of mask that doesn't help (also noted by Peter Zijlstra), adjust comments, hopefully clarify scenario ]
[miltonm@bga.com: remove excess tests]
Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Milton Miller <miltonm@bga.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: <stable@kernel.org> [2.6.32+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/smp.c | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

(limited to 'kernel')

diff --git a/kernel/smp.c b/kernel/smp.c
index 4ec30e0..17c6e58 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -195,6 +195,24 @@ void generic_smp_call_function_interrupt(void)
 	list_for_each_entry_rcu(data, &call_function.queue, csd.list) {
 		int refs;
 
+		/*
+		 * Since we walk the list without any locks, we might
+		 * see an entry that was completed, removed from the
+		 * list and is in the process of being reused.
+		 *
+		 * We must check that the cpu is in the cpumask before
+		 * checking the refs, and both must be set before
+		 * executing the callback on this cpu.
+		 */
+
+		if (!cpumask_test_cpu(cpu, data->cpumask))
+			continue;
+
+		smp_rmb();
+
+		if (atomic_read(&data->refs) == 0)
+			continue;
+
 		if (!cpumask_test_and_clear_cpu(cpu, data->cpumask))
 			continue;
 
@@ -203,6 +221,8 @@ void generic_smp_call_function_interrupt(void)
 		refs = atomic_dec_return(&data->refs);
 		WARN_ON(refs < 0);
 		if (!refs) {
+			WARN_ON(!cpumask_empty(data->cpumask));
+
 			raw_spin_lock(&call_function.lock);
 			list_del_rcu(&data->csd.list);
 			raw_spin_unlock(&call_function.lock);
@@ -454,11 +474,21 @@ void smp_call_function_many(const struct cpumask *mask,
 
 	data = &__get_cpu_var(cfd_data);
 	csd_lock(&data->csd);
+	BUG_ON(atomic_read(&data->refs) || !cpumask_empty(data->cpumask));
 
 	data->csd.func = func;
 	data->csd.info = info;
 	cpumask_and(data->cpumask, mask, cpu_online_mask);
 	cpumask_clear_cpu(this_cpu, data->cpumask);
+
+	/*
+	 * To ensure the interrupt handler gets an complete view
+	 * we order the cpumask and refs writes and order the read
+	 * of them in the interrupt handler.  In addition we may
+	 * only clear our own cpu bit from the mask.
+	 */
+	smp_wmb();
+
 	atomic_set(&data->refs, cpumask_weight(data->cpumask));
 
 	raw_spin_lock_irqsave(&call_function.lock, flags);
-- 
cgit v1.1


From 225c8e010f2d17a62aef131e24c6e7c111f36f9b Mon Sep 17 00:00:00 2001
From: Milton Miller <miltonm@bga.com>
Date: Thu, 20 Jan 2011 14:44:34 -0800
Subject: kernel/smp.c: consolidate writes in smp_call_function_interrupt()

We have to test the cpu mask in the interrupt handler before checking the
refs, otherwise we can start to follow an entry before its deleted and
find it partially initailzed for the next trip.  Presently we also clear
the cpumask bit before executing the called function, which implies
getting write access to the line.  After the function is called we then
decrement refs, and if they go to zero we then unlock the structure.

However, this implies getting write access to the call function data
before and after another the function is called.  If we can assert that no
smp_call_function execution function is allowed to enable interrupts, then
we can move both writes to after the function is called, hopfully allowing
both writes with one cache line bounce.

On a 256 thread system with a kernel compiled for 1024 threads, the time
to execute testcase in the "smp_call_function_many race" changelog was
reduced by about 30-40ms out of about 545 ms.

I decided to keep this as WARN because its now a buggy function, even
though the stack trace is of no value -- a simple printk would give us the
information needed.

Raw data:

Without patch:
  ipi_test startup took 1219366ns complete 539819014ns total 541038380ns
  ipi_test startup took 1695754ns complete 543439872ns total 545135626ns
  ipi_test startup took 7513568ns complete 539606362ns total 547119930ns
  ipi_test startup took 13304064ns complete 533898562ns total 547202626ns
  ipi_test startup took 8668192ns complete 544264074ns total 552932266ns
  ipi_test startup took 4977626ns complete 548862684ns total 553840310ns
  ipi_test startup took 2144486ns complete 541292318ns total 543436804ns
  ipi_test startup took 21245824ns complete 530280180ns total 551526004ns

With patch:
  ipi_test startup took 5961748ns complete 500859628ns total 506821376ns
  ipi_test startup took 8975996ns complete 495098924ns total 504074920ns
  ipi_test startup took 19797750ns complete 492204740ns total 512002490ns
  ipi_test startup took 14824796ns complete 487495878ns total 502320674ns
  ipi_test startup took 11514882ns complete 494439372ns total 505954254ns
  ipi_test startup took 8288084ns complete 502570774ns total 510858858ns
  ipi_test startup took 6789954ns complete 493388112ns total 500178066ns

	#include <linux/module.h>
	#include <linux/init.h>
	#include <linux/sched.h> /* sched clock */

	#define ITERATIONS 100

	static void do_nothing_ipi(void *dummy)
	{
	}

	static void do_ipis(struct work_struct *dummy)
	{
		int i;

		for (i = 0; i < ITERATIONS; i++)
			smp_call_function(do_nothing_ipi, NULL, 1);

		printk(KERN_DEBUG "cpu %d finished\n", smp_processor_id());
	}

	static struct work_struct work[NR_CPUS];

	static int __init testcase_init(void)
	{
		int cpu;
		u64 start, started, done;

		start = local_clock();
		for_each_online_cpu(cpu) {
			INIT_WORK(&work[cpu], do_ipis);
			schedule_work_on(cpu, &work[cpu]);
		}
		started = local_clock();
		for_each_online_cpu(cpu)
			flush_work(&work[cpu]);
		done = local_clock();
		pr_info("ipi_test startup took %lldns complete %lldns total %lldns\n",
			started-start, done-started, done-start);

		return 0;
	}

	static void __exit testcase_exit(void)
	{
	}

	module_init(testcase_init)
	module_exit(testcase_exit)
	MODULE_LICENSE("GPL");
	MODULE_AUTHOR("Anton Blanchard");

Signed-off-by: Milton Miller <miltonm@bga.com>
Cc: Anton Blanchard <anton@samba.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/smp.c | 29 +++++++++++++++++++----------
 1 file changed, 19 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/smp.c b/kernel/smp.c
index 17c6e58..2fe66f7 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -194,6 +194,7 @@ void generic_smp_call_function_interrupt(void)
 	 */
 	list_for_each_entry_rcu(data, &call_function.queue, csd.list) {
 		int refs;
+		void (*func) (void *info);
 
 		/*
 		 * Since we walk the list without any locks, we might
@@ -213,24 +214,32 @@ void generic_smp_call_function_interrupt(void)
 		if (atomic_read(&data->refs) == 0)
 			continue;
 
-		if (!cpumask_test_and_clear_cpu(cpu, data->cpumask))
-			continue;
-
+		func = data->csd.func;			/* for later warn */
 		data->csd.func(data->csd.info);
 
+		/*
+		 * If the cpu mask is not still set then it enabled interrupts,
+		 * we took another smp interrupt, and executed the function
+		 * twice on this cpu.  In theory that copy decremented refs.
+		 */
+		if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) {
+			WARN(1, "%pS enabled interrupts and double executed\n",
+			     func);
+			continue;
+		}
+
 		refs = atomic_dec_return(&data->refs);
 		WARN_ON(refs < 0);
-		if (!refs) {
-			WARN_ON(!cpumask_empty(data->cpumask));
-
-			raw_spin_lock(&call_function.lock);
-			list_del_rcu(&data->csd.list);
-			raw_spin_unlock(&call_function.lock);
-		}
 
 		if (refs)
 			continue;
 
+		WARN_ON(!cpumask_empty(data->cpumask));
+
+		raw_spin_lock(&call_function.lock);
+		list_del_rcu(&data->csd.list);
+		raw_spin_unlock(&call_function.lock);
+
 		csd_unlock(&data->csd);
 	}
 
-- 
cgit v1.1


From 1c77ff22f539ceaa64ea43d6a26d867e84602cb7 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 19 Jan 2011 19:41:35 +0100
Subject: genirq: Remove __do_IRQ

All architectures are finally converted. Remove the cruft.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Richard Henderson <rth@twiddle.net>
Cc: Mike Frysinger <vapier@gentoo.org>
Cc: David Howells <dhowells@redhat.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Greg Ungerer <gerg@uclinux.org>
Cc: Michal Simek <monstr@monstr.eu>
Acked-by: David Howells <dhowells@redhat.com>
Cc: Kyle McMartin <kyle@mcmartin.ca>
Acked-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Chen Liqin <liqin.chen@sunplusct.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Chris Metcalf <cmetcalf@tilera.com>
Cc: Jeff Dike <jdike@addtoit.com>
---
 kernel/irq/Kconfig  |   3 --
 kernel/irq/handle.c | 111 ----------------------------------------------------
 2 files changed, 114 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 31d766b..8e42fec 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -9,9 +9,6 @@ menu "IRQ subsystem"
 config GENERIC_HARDIRQS
        def_bool y
 
-config GENERIC_HARDIRQS_NO__DO_IRQ
-       def_bool y
-
 # Select this to disable the deprecated stuff
 config GENERIC_HARDIRQS_NO_DEPRECATED
        def_bool n
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index e2347eb..3540a71 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -118,114 +118,3 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
 
 	return retval;
 }
-
-#ifndef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ
-
-#ifdef CONFIG_ENABLE_WARN_DEPRECATED
-# warning __do_IRQ is deprecated. Please convert to proper flow handlers
-#endif
-
-/**
- * __do_IRQ - original all in one highlevel IRQ handler
- * @irq:	the interrupt number
- *
- * __do_IRQ handles all normal device IRQ's (the special
- * SMP cross-CPU interrupts have their own specific
- * handlers).
- *
- * This is the original x86 implementation which is used for every
- * interrupt type.
- */
-unsigned int __do_IRQ(unsigned int irq)
-{
-	struct irq_desc *desc = irq_to_desc(irq);
-	struct irqaction *action;
-	unsigned int status;
-
-	kstat_incr_irqs_this_cpu(irq, desc);
-
-	if (CHECK_IRQ_PER_CPU(desc->status)) {
-		irqreturn_t action_ret;
-
-		/*
-		 * No locking required for CPU-local interrupts:
-		 */
-		if (desc->irq_data.chip->ack)
-			desc->irq_data.chip->ack(irq);
-		if (likely(!(desc->status & IRQ_DISABLED))) {
-			action_ret = handle_IRQ_event(irq, desc->action);
-			if (!noirqdebug)
-				note_interrupt(irq, desc, action_ret);
-		}
-		desc->irq_data.chip->end(irq);
-		return 1;
-	}
-
-	raw_spin_lock(&desc->lock);
-	if (desc->irq_data.chip->ack)
-		desc->irq_data.chip->ack(irq);
-	/*
-	 * REPLAY is when Linux resends an IRQ that was dropped earlier
-	 * WAITING is used by probe to mark irqs that are being tested
-	 */
-	status = desc->status & ~(IRQ_REPLAY | IRQ_WAITING);
-	status |= IRQ_PENDING; /* we _want_ to handle it */
-
-	/*
-	 * If the IRQ is disabled for whatever reason, we cannot
-	 * use the action we have.
-	 */
-	action = NULL;
-	if (likely(!(status & (IRQ_DISABLED | IRQ_INPROGRESS)))) {
-		action = desc->action;
-		status &= ~IRQ_PENDING; /* we commit to handling */
-		status |= IRQ_INPROGRESS; /* we are handling it */
-	}
-	desc->status = status;
-
-	/*
-	 * If there is no IRQ handler or it was disabled, exit early.
-	 * Since we set PENDING, if another processor is handling
-	 * a different instance of this same irq, the other processor
-	 * will take care of it.
-	 */
-	if (unlikely(!action))
-		goto out;
-
-	/*
-	 * Edge triggered interrupts need to remember
-	 * pending events.
-	 * This applies to any hw interrupts that allow a second
-	 * instance of the same irq to arrive while we are in do_IRQ
-	 * or in the handler. But the code here only handles the _second_
-	 * instance of the irq, not the third or fourth. So it is mostly
-	 * useful for irq hardware that does not mask cleanly in an
-	 * SMP environment.
-	 */
-	for (;;) {
-		irqreturn_t action_ret;
-
-		raw_spin_unlock(&desc->lock);
-
-		action_ret = handle_IRQ_event(irq, action);
-		if (!noirqdebug)
-			note_interrupt(irq, desc, action_ret);
-
-		raw_spin_lock(&desc->lock);
-		if (likely(!(desc->status & IRQ_PENDING)))
-			break;
-		desc->status &= ~IRQ_PENDING;
-	}
-	desc->status &= ~IRQ_INPROGRESS;
-
-out:
-	/*
-	 * The ->end() handler has to deal with interrupts which got
-	 * disabled while the handler was running.
-	 */
-	desc->irq_data.chip->end(irq);
-	raw_spin_unlock(&desc->lock);
-
-	return 1;
-}
-#endif
-- 
cgit v1.1