From 48e6484d49020dba3578ad117b461e8a391e8f0f Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 26 Jun 2006 00:25:48 -0700
Subject: [PATCH] proc: Rewrite the proc dentry flush on exit optimization

To keep the dcache from filling up with dead /proc entries we flush them on
process exit.  However, over the years that code has gotten hairy, with a
dentry pointer and a lock in task_struct, and has been misdocumented as a
correctness feature.

I have rewritten this code to look and see if we have a corresponding entry in
the dcache and if so flush it on process exit.  This removes the extra fields
in the task_struct and allows me to trivially handle the case of a
/proc/<tgid>/task/<pid> entry as well as the current /proc/<pid> entries.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/exec.c | 10 ----------
 1 file changed, 10 deletions(-)

(limited to 'fs/exec.c')
diff --git a/fs/exec.c b/fs/exec.c
index 0b88bf6..8c51960 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -666,8 +666,6 @@ static int de_thread(struct task_struct *tsk)
 	 * and to assume its PID:
 	 */
 	if (!thread_group_leader(current)) {
-		struct dentry *proc_dentry1, *proc_dentry2;
-
 		/*
 		 * Wait for the thread group leader to be a zombie.
 		 * It should already be zombie at this point, most
@@ -689,10 +687,6 @@ static int de_thread(struct task_struct *tsk)
 		 */
 		current->start_time = leader->start_time;
 
-		spin_lock(&leader->proc_lock);
-		spin_lock(&current->proc_lock);
-		proc_dentry1 = proc_pid_unhash(current);
-		proc_dentry2 = proc_pid_unhash(leader);
 		write_lock_irq(&tasklist_lock);
 
 		BUG_ON(leader->tgid != current->tgid);
@@ -729,10 +723,6 @@ static int de_thread(struct task_struct *tsk)
 		leader->exit_state = EXIT_DEAD;
 
 		write_unlock_irq(&tasklist_lock);
-		spin_unlock(&leader->proc_lock);
-		spin_unlock(&current->proc_lock);
-		proc_pid_flush(proc_dentry1);
-		proc_pid_flush(proc_dentry2);
         }
 
 	/*
-- 
cgit v1.1


From 2ceb8693ef63ae3d154ce1a05d275f2bb20a5e4c Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Mon, 26 Jun 2006 00:26:04 -0700
Subject: [PATCH] de_thread: fix lockless do_each_thread

We should keep the value of old_leader->tasks.next in de_thread, otherwise
we can't do for_each_process/do_each_thread without tasklist_lock held.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/exec.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs/exec.c')

diff --git a/fs/exec.c b/fs/exec.c
index 8c51960..fffea1e 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -707,7 +707,7 @@ static int de_thread(struct task_struct *tsk)
 		attach_pid(current, PIDTYPE_PID,  current->pid);
 		attach_pid(current, PIDTYPE_PGID, current->signal->pgrp);
 		attach_pid(current, PIDTYPE_SID,  current->signal->session);
-		list_add_tail_rcu(&current->tasks, &init_task.tasks);
+		list_replace_rcu(&leader->tasks, &current->tasks);
 
 		current->group_leader = current;
 		leader->group_leader = current;
@@ -715,7 +715,6 @@ static int de_thread(struct task_struct *tsk)
 		/* Reduce leader to a thread */
 		detach_pid(leader, PIDTYPE_PGID);
 		detach_pid(leader, PIDTYPE_SID);
-		list_del_init(&leader->tasks);
 
 		current->exit_signal = SIGCHLD;
 
-- 
cgit v1.1


From aceecc041217b35df753d1ed6e25bd17c0c558d8 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Mon, 26 Jun 2006 00:26:05 -0700
Subject: [PATCH] coredump: optimize ->mm users traversal

zap_threads() iterates over all threads to find the ones which share
current->mm.  All threads in a thread group share the same ->mm, so we can
skip an entire thread group if it has another ->mm.

This patch shifts the killing of a thread group into the newly added
zap_process() function.  This looks like an unnecessary complication, but it
is used in further patches.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Acked-by: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/exec.c | 36 ++++++++++++++++++++++++++----------
 1 file changed, 26 insertions(+), 10 deletions(-)

(limited to 'fs/exec.c')

diff --git a/fs/exec.c b/fs/exec.c
index fffea1e..80fe7bc 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1368,6 +1368,22 @@ static void format_corename(char *corename, const char *pattern, long signr)
 	*out_ptr = 0;
 }
 
+static void zap_process(struct task_struct *start, int *ptraced)
+{
+	struct task_struct *t;
+
+	t = start;
+	do {
+		if (t != current && t->mm) {
+			t->mm->core_waiters++;
+			force_sig_specific(SIGKILL, t);
+			if (unlikely(t->ptrace) &&
+			    unlikely(t->parent->mm == t->mm))
+				*ptraced = 1;
+		}
+	} while ((t = next_thread(t)) != start);
+}
+
 static void zap_threads (struct mm_struct *mm)
 {
 	struct task_struct *g, *p;
@@ -1385,16 +1401,16 @@ static void zap_threads (struct mm_struct *mm)
 	}
 
 	read_lock(&tasklist_lock);
-	do_each_thread(g,p)
-		if (mm == p->mm && p != tsk) {
-			force_sig_specific(SIGKILL, p);
-			mm->core_waiters++;
-			if (unlikely(p->ptrace) &&
-			    unlikely(p->parent->mm == mm))
-				traced = 1;
-		}
-	while_each_thread(g,p);
-
+	for_each_process(g) {
+		p = g;
+		do {
+			if (p->mm) {
+				if (p->mm == mm)
+					zap_process(p, &traced);
+				break;
+			}
+		} while ((p = next_thread(p)) != g);
+	}
 	read_unlock(&tasklist_lock);
 
 	if (unlikely(traced)) {
-- 
cgit v1.1


From 281de339ceb822ca6c04d4373ecb9a45c1890ce4 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Mon, 26 Jun 2006 00:26:06 -0700
Subject: [PATCH] coredump: speedup SIGKILL sending

With this patch a thread group is killed atomically under ->siglock.  This is
faster because we can use sigaddset() instead of force_sig_info() and this is
used in further patches.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Acked-by: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/exec.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'fs/exec.c')

diff --git a/fs/exec.c b/fs/exec.c
index 80fe7bc..a5c5164 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1371,17 +1371,24 @@ static void format_corename(char *corename, const char *pattern, long signr)
 static void zap_process(struct task_struct *start, int *ptraced)
 {
 	struct task_struct *t;
+	unsigned long flags;
+
+	spin_lock_irqsave(&start->sighand->siglock, flags);
 
 	t = start;
 	do {
 		if (t != current && t->mm) {
 			t->mm->core_waiters++;
-			force_sig_specific(SIGKILL, t);
+			sigaddset(&t->pending.signal, SIGKILL);
+			signal_wake_up(t, 1);
+
 			if (unlikely(t->ptrace) &&
 			    unlikely(t->parent->mm == t->mm))
 				*ptraced = 1;
 		}
 	} while ((t = next_thread(t)) != start);
+
+	spin_unlock_irqrestore(&start->sighand->siglock, flags);
 }
 
 static void zap_threads (struct mm_struct *mm)
-- 
cgit v1.1


From d5f70c00ad24cd1158d3678b44ff969b4c971d49 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Mon, 26 Jun 2006 00:26:07 -0700
Subject: [PATCH] coredump: kill ptrace related stuff

With this patch zap_process() sets SIGNAL_GROUP_EXIT while sending SIGKILL to
the thread group.  This means that a TASK_TRACED task

	1. Will be awakened by signal_wake_up(1)

	2. Can't sleep again via ptrace_notify()

	3. Can't go to do_signal_stop() after return
	   from ptrace_stop() in get_signal_to_deliver()

So we can remove all ptrace-related stuff from the coredump path.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/exec.c | 30 +++++-------------------------
 1 file changed, 5 insertions(+), 25 deletions(-)

(limited to 'fs/exec.c')

diff --git a/fs/exec.c b/fs/exec.c
index a5c5164..b58ba7d 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1368,12 +1368,14 @@ static void format_corename(char *corename, const char *pattern, long signr)
 	*out_ptr = 0;
 }
 
-static void zap_process(struct task_struct *start, int *ptraced)
+static void zap_process(struct task_struct *start)
 {
 	struct task_struct *t;
 	unsigned long flags;
 
 	spin_lock_irqsave(&start->sighand->siglock, flags);
+	start->signal->flags = SIGNAL_GROUP_EXIT;
+	start->signal->group_stop_count = 0;
 
 	t = start;
 	do {
@@ -1381,22 +1383,17 @@ static void zap_process(struct task_struct *start, int *ptraced)
 			t->mm->core_waiters++;
 			sigaddset(&t->pending.signal, SIGKILL);
 			signal_wake_up(t, 1);
-
-			if (unlikely(t->ptrace) &&
-			    unlikely(t->parent->mm == t->mm))
-				*ptraced = 1;
 		}
 	} while ((t = next_thread(t)) != start);
 
 	spin_unlock_irqrestore(&start->sighand->siglock, flags);
 }
 
-static void zap_threads (struct mm_struct *mm)
+static void zap_threads(struct mm_struct *mm)
 {
 	struct task_struct *g, *p;
 	struct task_struct *tsk = current;
 	struct completion *vfork_done = tsk->vfork_done;
-	int traced = 0;
 
 	/*
 	 * Make sure nobody is waiting for us to release the VM,
@@ -1413,29 +1410,12 @@ static void zap_threads (struct mm_struct *mm)
 		do {
 			if (p->mm) {
 				if (p->mm == mm)
-					zap_process(p, &traced);
+					zap_process(p);
 				break;
 			}
 		} while ((p = next_thread(p)) != g);
 	}
 	read_unlock(&tasklist_lock);
-
-	if (unlikely(traced)) {
-		/*
-		 * We are zapping a thread and the thread it ptraces.
-		 * If the tracee went into a ptrace stop for exit tracing,
-		 * we could deadlock since the tracer is waiting for this
-		 * coredump to finish.  Detach them so they can both die.
-		 */
-		write_lock_irq(&tasklist_lock);
-		do_each_thread(g,p) {
-			if (mm == p->mm && p != tsk &&
-			    p->ptrace && p->parent->mm == mm) {
-				__ptrace_detach(p, 0);
-			}
-		} while_each_thread(g,p);
-		write_unlock_irq(&tasklist_lock);
-	}
 }
 
 static void coredump_wait(struct mm_struct *mm)
-- 
cgit v1.1


From 7b1c6154fa8bb937e0b1b4f2adbb315d70270f10 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Mon, 26 Jun 2006 00:26:08 -0700
Subject: [PATCH] coredump: don't take tasklist_lock

This patch removes tasklist_lock from zap_threads().
This is safe wrt:

	do_exit:
		The caller holds mm->mmap_sem. This means that a task which
		shares the same ->mm can't pass exit_mm(), so it can't be
		unhashed from the init_task.tasks or ->thread_group lists.

	fork:
		None of the sub-threads can fork after zap_process(leader).
		All processes which were created before this point should be
		visible to zap_threads() because copy_process() adds the new
		process to the tail of the init_task.tasks list, and ->siglock
		lock/unlock provides a memory barrier.

	de_thread:
		It does list_replace_rcu(&leader->tasks, &current->tasks).
		So zap_threads() will see either old or new leader, it does
		not matter. However, it can change p->sighand, so we should
		use lock_task_sighand() in zap_process().

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/exec.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

(limited to 'fs/exec.c')

diff --git a/fs/exec.c b/fs/exec.c
index b58ba7d..49fa012 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1373,7 +1373,11 @@ static void zap_process(struct task_struct *start)
 	struct task_struct *t;
 	unsigned long flags;
 
-	spin_lock_irqsave(&start->sighand->siglock, flags);
+	/*
+	 * start->sighand can't disappear, but may be
+	 * changed by de_thread()
+	 */
+	lock_task_sighand(start, &flags);
 	start->signal->flags = SIGNAL_GROUP_EXIT;
 	start->signal->group_stop_count = 0;
 
@@ -1386,7 +1390,7 @@ static void zap_process(struct task_struct *start)
 		}
 	} while ((t = next_thread(t)) != start);
 
-	spin_unlock_irqrestore(&start->sighand->siglock, flags);
+	unlock_task_sighand(start, &flags);
 }
 
 static void zap_threads(struct mm_struct *mm)
@@ -1404,7 +1408,7 @@ static void zap_threads(struct mm_struct *mm)
 		complete(vfork_done);
 	}
 
-	read_lock(&tasklist_lock);
+	rcu_read_lock();
 	for_each_process(g) {
 		p = g;
 		do {
@@ -1415,7 +1419,7 @@ static void zap_threads(struct mm_struct *mm)
 			}
 		} while ((p = next_thread(p)) != g);
 	}
-	read_unlock(&tasklist_lock);
+	rcu_read_unlock();
 }
 
 static void coredump_wait(struct mm_struct *mm)
-- 
cgit v1.1


From dcf560c59330945a231d5e52f95dfedde4e32c9d Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Mon, 26 Jun 2006 00:26:08 -0700
Subject: [PATCH] coredump: some code relocations

This is a preparation for the next patch.  No functional changes.
Basically, this patch moves the '->flags & SIGNAL_GROUP_EXIT' check into
zap_threads(), and 'complete(vfork_done)' into coredump_wait(), outside of
the ->mmap_sem protected area.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/exec.c | 70 ++++++++++++++++++++++++++++++++++++---------------------------
 1 file changed, 40 insertions(+), 30 deletions(-)

(limited to 'fs/exec.c')

diff --git a/fs/exec.c b/fs/exec.c
index 49fa012..8c8f289 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1393,20 +1393,22 @@ static void zap_process(struct task_struct *start)
 	unlock_task_sighand(start, &flags);
 }
 
-static void zap_threads(struct mm_struct *mm)
+static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm,
+				int exit_code)
 {
 	struct task_struct *g, *p;
-	struct task_struct *tsk = current;
-	struct completion *vfork_done = tsk->vfork_done;
-
-	/*
-	 * Make sure nobody is waiting for us to release the VM,
-	 * otherwise we can deadlock when we wait on each other
-	 */
-	if (vfork_done) {
-		tsk->vfork_done = NULL;
-		complete(vfork_done);
+	int err = -EAGAIN;
+
+	spin_lock_irq(&tsk->sighand->siglock);
+	if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT)) {
+		tsk->signal->flags = SIGNAL_GROUP_EXIT;
+		tsk->signal->group_exit_code = exit_code;
+		tsk->signal->group_stop_count = 0;
+		err = 0;
 	}
+	spin_unlock_irq(&tsk->sighand->siglock);
+	if (err)
+		return err;
 
 	rcu_read_lock();
 	for_each_process(g) {
@@ -1420,22 +1422,43 @@ static void zap_threads(struct mm_struct *mm)
 		} while ((p = next_thread(p)) != g);
 	}
 	rcu_read_unlock();
+
+	return mm->core_waiters;
 }
 
-static void coredump_wait(struct mm_struct *mm)
+static int coredump_wait(int exit_code)
 {
-	DECLARE_COMPLETION(startup_done);
+	struct task_struct *tsk = current;
+	struct mm_struct *mm = tsk->mm;
+	struct completion startup_done;
+	struct completion *vfork_done;
 	int core_waiters;
 
+	init_completion(&mm->core_done);
+	init_completion(&startup_done);
 	mm->core_startup_done = &startup_done;
 
-	zap_threads(mm);
-	core_waiters = mm->core_waiters;
+	core_waiters = zap_threads(tsk, mm, exit_code);
 	up_write(&mm->mmap_sem);
 
+	if (unlikely(core_waiters < 0))
+		goto fail;
+
+	/*
+	 * Make sure nobody is waiting for us to release the VM,
+	 * otherwise we can deadlock when we wait on each other
+	 */
+	vfork_done = tsk->vfork_done;
+	if (vfork_done) {
+		tsk->vfork_done = NULL;
+		complete(vfork_done);
+	}
+
 	if (core_waiters)
 		wait_for_completion(&startup_done);
+fail:
 	BUG_ON(mm->core_waiters);
+	return core_waiters;
 }
 
 int do_coredump(long signr, int exit_code, struct pt_regs * regs)
@@ -1469,22 +1492,9 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs)
 	}
 	mm->dumpable = 0;
 
-	retval = -EAGAIN;
-	spin_lock_irq(&current->sighand->siglock);
-	if (!(current->signal->flags & SIGNAL_GROUP_EXIT)) {
-		current->signal->flags = SIGNAL_GROUP_EXIT;
-		current->signal->group_exit_code = exit_code;
-		current->signal->group_stop_count = 0;
-		retval = 0;
-	}
-	spin_unlock_irq(&current->sighand->siglock);
-	if (retval) {
-		up_write(&mm->mmap_sem);
+	retval = coredump_wait(exit_code);
+	if (retval < 0)
 		goto fail;
-	}
-
-	init_completion(&mm->core_done);
-	coredump_wait(mm);
 
 	/*
 	 * Clear any false indication of pending signals that might
-- 
cgit v1.1


From 5debfa6da5b06954bc79fe8deed0d1062c58dcec Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Mon, 26 Jun 2006 00:26:09 -0700
Subject: [PATCH] coredump: shutdown current process first

This patch optimizes zap_threads() for the case when there are no ->mm
users except current's thread group.  In that case we can avoid the
'for_each_process()' loop.

It also adds a useful invariant: SIGNAL_GROUP_EXIT (if checked under
->siglock) always implies that all threads (except maybe current) have
pending SIGKILL.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/exec.c | 29 +++++++++++++++++------------
 1 file changed, 17 insertions(+), 12 deletions(-)

(limited to 'fs/exec.c')

diff --git a/fs/exec.c b/fs/exec.c
index 8c8f289..c8494f5 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1371,13 +1371,7 @@ static void format_corename(char *corename, const char *pattern, long signr)
 static void zap_process(struct task_struct *start)
 {
 	struct task_struct *t;
-	unsigned long flags;
 
-	/*
-	 * start->sighand can't disappear, but may be
-	 * changed by de_thread()
-	 */
-	lock_task_sighand(start, &flags);
 	start->signal->flags = SIGNAL_GROUP_EXIT;
 	start->signal->group_stop_count = 0;
 
@@ -1389,40 +1383,51 @@ static void zap_process(struct task_struct *start)
 			signal_wake_up(t, 1);
 		}
 	} while ((t = next_thread(t)) != start);
-
-	unlock_task_sighand(start, &flags);
 }
 
 static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm,
 				int exit_code)
 {
 	struct task_struct *g, *p;
+	unsigned long flags;
 	int err = -EAGAIN;
 
 	spin_lock_irq(&tsk->sighand->siglock);
 	if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT)) {
-		tsk->signal->flags = SIGNAL_GROUP_EXIT;
 		tsk->signal->group_exit_code = exit_code;
-		tsk->signal->group_stop_count = 0;
+		zap_process(tsk);
 		err = 0;
 	}
 	spin_unlock_irq(&tsk->sighand->siglock);
 	if (err)
 		return err;
 
+	if (atomic_read(&mm->mm_users) == mm->core_waiters + 1)
+		goto done;
+
 	rcu_read_lock();
 	for_each_process(g) {
+		if (g == tsk->group_leader)
+			continue;
+
 		p = g;
 		do {
 			if (p->mm) {
-				if (p->mm == mm)
+				if (p->mm == mm) {
+					/*
+					 * p->sighand can't disappear, but
+					 * may be changed by de_thread()
+					 */
+					lock_task_sighand(p, &flags);
 					zap_process(p);
+					unlock_task_sighand(p, &flags);
+				}
 				break;
 			}
 		} while ((p = next_thread(p)) != g);
 	}
 	rcu_read_unlock();
-
+done:
 	return mm->core_waiters;
 }
 
-- 
cgit v1.1