From 48e6484d49020dba3578ad117b461e8a391e8f0f Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 26 Jun 2006 00:25:48 -0700 Subject: [PATCH] proc: Rewrite the proc dentry flush on exit optimization To keep the dcache from filling up with dead /proc entries we flush them on process exit. However over the years that code has gotten hairy with a dentry_pointer and a lock in task_struct and misdocumented as a correctness feature. I have rewritten this code to look and see if we have a corresponding entry in the dcache and if so flush it on process exit. This removes the extra fields in the task_struct and allows me to trivially handle the case of a /proc//task/ entry as well as the current /proc/ entries. Signed-off-by: Eric W. Biederman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'fs/exec.c') diff --git a/fs/exec.c b/fs/exec.c index 0b88bf6..8c51960 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -666,8 +666,6 @@ static int de_thread(struct task_struct *tsk) * and to assume its PID: */ if (!thread_group_leader(current)) { - struct dentry *proc_dentry1, *proc_dentry2; - /* * Wait for the thread group leader to be a zombie. * It should already be zombie at this point, most @@ -689,10 +687,6 @@ static int de_thread(struct task_struct *tsk) */ current->start_time = leader->start_time; - spin_lock(&leader->proc_lock); - spin_lock(¤t->proc_lock); - proc_dentry1 = proc_pid_unhash(current); - proc_dentry2 = proc_pid_unhash(leader); write_lock_irq(&tasklist_lock); BUG_ON(leader->tgid != current->tgid); @@ -729,10 +723,6 @@ static int de_thread(struct task_struct *tsk) leader->exit_state = EXIT_DEAD; write_unlock_irq(&tasklist_lock); - spin_unlock(&leader->proc_lock); - spin_unlock(¤t->proc_lock); - proc_pid_flush(proc_dentry1); - proc_pid_flush(proc_dentry2); } /* -- cgit v1.1 From 2ceb8693ef63ae3d154ce1a05d275f2bb20a5e4c Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 26 Jun 2006 00:26:04 -0700 Subject: [PATCH] de_thread: fix lockless do_each_thread We should keep the value of old_leader->tasks.next in de_thread, otherwise we can't do for_each_process/do_each_thread without tasklist_lock held. Signed-off-by: Oleg Nesterov Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs/exec.c') diff --git a/fs/exec.c b/fs/exec.c index 8c51960..fffea1e 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -707,7 +707,7 @@ static int de_thread(struct task_struct *tsk) attach_pid(current, PIDTYPE_PID, current->pid); attach_pid(current, PIDTYPE_PGID, current->signal->pgrp); attach_pid(current, PIDTYPE_SID, current->signal->session); - list_add_tail_rcu(¤t->tasks, &init_task.tasks); + list_replace_rcu(&leader->tasks, ¤t->tasks); current->group_leader = current; leader->group_leader = current; @@ -715,7 +715,6 @@ static int de_thread(struct task_struct *tsk) /* Reduce leader to a thread */ detach_pid(leader, PIDTYPE_PGID); detach_pid(leader, PIDTYPE_SID); - list_del_init(&leader->tasks); current->exit_signal = SIGCHLD; -- cgit v1.1 From aceecc041217b35df753d1ed6e25bd17c0c558d8 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 26 Jun 2006 00:26:05 -0700 Subject: [PATCH] coredump: optimize ->mm users traversal zap_threads() iterates over all threads to find those ones which share current->mm. All threads in the thread group share the same ->mm, so we can skip entire thread group if it has another ->mm. This patch shifts the killing of thread group into the newly added zap_process() function. This looks as unnecessary complication, but it is used in further patches. Signed-off-by: Oleg Nesterov Cc: "Eric W. Biederman" Acked-by: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 36 ++++++++++++++++++++++++++---------- 1 file changed, 26 insertions(+), 10 deletions(-) (limited to 'fs/exec.c') diff --git a/fs/exec.c b/fs/exec.c index fffea1e..80fe7bc 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1368,6 +1368,22 @@ static void format_corename(char *corename, const char *pattern, long signr) *out_ptr = 0; } +static void zap_process(struct task_struct *start, int *ptraced) +{ + struct task_struct *t; + + t = start; + do { + if (t != current && t->mm) { + t->mm->core_waiters++; + force_sig_specific(SIGKILL, t); + if (unlikely(t->ptrace) && + unlikely(t->parent->mm == t->mm)) + *ptraced = 1; + } + } while ((t = next_thread(t)) != start); +} + static void zap_threads (struct mm_struct *mm) { struct task_struct *g, *p; @@ -1385,16 +1401,16 @@ static void zap_threads (struct mm_struct *mm) } read_lock(&tasklist_lock); - do_each_thread(g,p) - if (mm == p->mm && p != tsk) { - force_sig_specific(SIGKILL, p); - mm->core_waiters++; - if (unlikely(p->ptrace) && - unlikely(p->parent->mm == mm)) - traced = 1; - } - while_each_thread(g,p); - + for_each_process(g) { + p = g; + do { + if (p->mm) { + if (p->mm == mm) + zap_process(p, &traced); + break; + } + } while ((p = next_thread(p)) != g); + } read_unlock(&tasklist_lock); if (unlikely(traced)) { -- cgit v1.1 From 281de339ceb822ca6c04d4373ecb9a45c1890ce4 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 26 Jun 2006 00:26:06 -0700 Subject: [PATCH] coredump: speedup SIGKILL sending With this patch a thread group is killed atomically under ->siglock. This is faster because we can use sigaddset() instead of force_sig_info() and this is used in further patches. Signed-off-by: Oleg Nesterov Cc: "Eric W. Biederman" Acked-by: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'fs/exec.c') diff --git a/fs/exec.c b/fs/exec.c index 80fe7bc..a5c5164 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1371,17 +1371,24 @@ static void format_corename(char *corename, const char *pattern, long signr) static void zap_process(struct task_struct *start, int *ptraced) { struct task_struct *t; + unsigned long flags; + + spin_lock_irqsave(&start->sighand->siglock, flags); t = start; do { if (t != current && t->mm) { t->mm->core_waiters++; - force_sig_specific(SIGKILL, t); + sigaddset(&t->pending.signal, SIGKILL); + signal_wake_up(t, 1); + if (unlikely(t->ptrace) && unlikely(t->parent->mm == t->mm)) *ptraced = 1; } } while ((t = next_thread(t)) != start); + + spin_unlock_irqrestore(&start->sighand->siglock, flags); } static void zap_threads (struct mm_struct *mm) -- cgit v1.1 From d5f70c00ad24cd1158d3678b44ff969b4c971d49 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 26 Jun 2006 00:26:07 -0700 Subject: [PATCH] coredump: kill ptrace related stuff With this patch zap_process() sets SIGNAL_GROUP_EXIT while sending SIGKILL to the thread group. This means that a TASK_TRACED task 1. Will be awakened by signal_wake_up(1) 2. Can't sleep again via ptrace_notify() 3. Can't go to do_signal_stop() after return from ptrace_stop() in get_signal_to_deliver() So we can remove all ptrace related stuff from coredump path. Signed-off-by: Oleg Nesterov Cc: "Eric W. Biederman" Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 30 +++++------------------------- 1 file changed, 5 insertions(+), 25 deletions(-) (limited to 'fs/exec.c') diff --git a/fs/exec.c b/fs/exec.c index a5c5164..b58ba7d 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1368,12 +1368,14 @@ static void format_corename(char *corename, const char *pattern, long signr) *out_ptr = 0; } -static void zap_process(struct task_struct *start, int *ptraced) +static void zap_process(struct task_struct *start) { struct task_struct *t; unsigned long flags; spin_lock_irqsave(&start->sighand->siglock, flags); + start->signal->flags = SIGNAL_GROUP_EXIT; + start->signal->group_stop_count = 0; t = start; do { @@ -1381,22 +1383,17 @@ static void zap_process(struct task_struct *start, int *ptraced) t->mm->core_waiters++; sigaddset(&t->pending.signal, SIGKILL); signal_wake_up(t, 1); - - if (unlikely(t->ptrace) && - unlikely(t->parent->mm == t->mm)) - *ptraced = 1; } } while ((t = next_thread(t)) != start); spin_unlock_irqrestore(&start->sighand->siglock, flags); } -static void zap_threads (struct mm_struct *mm) +static void zap_threads(struct mm_struct *mm) { struct task_struct *g, *p; struct task_struct *tsk = current; struct completion *vfork_done = tsk->vfork_done; - int traced = 0; /* * Make sure nobody is waiting for us to release the VM, @@ -1413,29 +1410,12 @@ static void zap_threads (struct mm_struct *mm) do { if (p->mm) { if (p->mm == mm) - zap_process(p, &traced); + zap_process(p); break; } } while ((p = next_thread(p)) != g); } read_unlock(&tasklist_lock); - - if (unlikely(traced)) { - /* - * We are zapping a thread and the thread it ptraces. - * If the tracee went into a ptrace stop for exit tracing, - * we could deadlock since the tracer is waiting for this - * coredump to finish. Detach them so they can both die. - */ - write_lock_irq(&tasklist_lock); - do_each_thread(g,p) { - if (mm == p->mm && p != tsk && - p->ptrace && p->parent->mm == mm) { - __ptrace_detach(p, 0); - } - } while_each_thread(g,p); - write_unlock_irq(&tasklist_lock); - } } static void coredump_wait(struct mm_struct *mm) -- cgit v1.1 From 7b1c6154fa8bb937e0b1b4f2adbb315d70270f10 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 26 Jun 2006 00:26:08 -0700 Subject: [PATCH] coredump: don't take tasklist_lock This patch removes tasklist_lock from zap_threads(). This is safe wrt: do_exit: The caller holds mm->mmap_sem. This means that task which shares the same ->mm can't pass exit_mm(), so it can't be unhashed from init_task.tasks or ->thread_group lists. fork: None of sub-threads can fork after zap_process(leader). All processes which were created before this point should be visible to zap_threads() because copy_process() adds the new process to the tail of init_task.tasks list, and ->siglock lock/unlock provides a memory barrier. de_thread: It does list_replace_rcu(&leader->tasks, ¤t->tasks). So zap_threads() will see either old or new leader, it does not matter. However, it can change p->sighand, so we should use lock_task_sighand() in zap_process(). Signed-off-by: Oleg Nesterov Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'fs/exec.c') diff --git a/fs/exec.c b/fs/exec.c index b58ba7d..49fa012 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1373,7 +1373,11 @@ static void zap_process(struct task_struct *start) struct task_struct *t; unsigned long flags; - spin_lock_irqsave(&start->sighand->siglock, flags); + /* + * start->sighand can't disappear, but may be + * changed by de_thread() + */ + lock_task_sighand(start, &flags); start->signal->flags = SIGNAL_GROUP_EXIT; start->signal->group_stop_count = 0; @@ -1386,7 +1390,7 @@ static void zap_process(struct task_struct *start) } } while ((t = next_thread(t)) != start); - spin_unlock_irqrestore(&start->sighand->siglock, flags); + unlock_task_sighand(start, &flags); } static void zap_threads(struct mm_struct *mm) @@ -1404,7 +1408,7 @@ static void zap_threads(struct mm_struct *mm) complete(vfork_done); } - read_lock(&tasklist_lock); + rcu_read_lock(); for_each_process(g) { p = g; do { @@ -1415,7 +1419,7 @@ static void zap_threads(struct mm_struct *mm) } } while ((p = next_thread(p)) != g); } - read_unlock(&tasklist_lock); + rcu_read_unlock(); } static void coredump_wait(struct mm_struct *mm) -- cgit v1.1 From dcf560c59330945a231d5e52f95dfedde4e32c9d Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 26 Jun 2006 00:26:08 -0700 Subject: [PATCH] coredump: some code relocations This is a preparation for the next patch. No functional changes. Basically, this patch moves '->flags & SIGNAL_GROUP_EXIT' check into zap_threads(), and 'complete(vfork_done)' into coredump_wait outside of ->mmap_sem protected area. Signed-off-by: Oleg Nesterov Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 70 ++++++++++++++++++++++++++++++++++++--------------------------- 1 file changed, 40 insertions(+), 30 deletions(-) (limited to 'fs/exec.c') diff --git a/fs/exec.c b/fs/exec.c index 49fa012..8c8f289 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1393,20 +1393,22 @@ static void zap_process(struct task_struct *start) unlock_task_sighand(start, &flags); } -static void zap_threads(struct mm_struct *mm) +static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm, + int exit_code) { struct task_struct *g, *p; - struct task_struct *tsk = current; - struct completion *vfork_done = tsk->vfork_done; - - /* - * Make sure nobody is waiting for us to release the VM, - * otherwise we can deadlock when we wait on each other - */ - if (vfork_done) { - tsk->vfork_done = NULL; - complete(vfork_done); + int err = -EAGAIN; + + spin_lock_irq(&tsk->sighand->siglock); + if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT)) { + tsk->signal->flags = SIGNAL_GROUP_EXIT; + tsk->signal->group_exit_code = exit_code; + tsk->signal->group_stop_count = 0; + err = 0; } + spin_unlock_irq(&tsk->sighand->siglock); + if (err) + return err; rcu_read_lock(); for_each_process(g) { @@ -1420,22 +1422,43 @@ static void zap_threads(struct mm_struct *mm) } while ((p = next_thread(p)) != g); } rcu_read_unlock(); + + return mm->core_waiters; } -static void coredump_wait(struct mm_struct *mm) +static int coredump_wait(int exit_code) { - DECLARE_COMPLETION(startup_done); + struct task_struct *tsk = current; + struct mm_struct *mm = tsk->mm; + struct completion startup_done; + struct completion *vfork_done; int core_waiters; + init_completion(&mm->core_done); + init_completion(&startup_done); mm->core_startup_done = &startup_done; - zap_threads(mm); - core_waiters = mm->core_waiters; + core_waiters = zap_threads(tsk, mm, exit_code); up_write(&mm->mmap_sem); + if (unlikely(core_waiters < 0)) + goto fail; + + /* + * Make sure nobody is waiting for us to release the VM, + * otherwise we can deadlock when we wait on each other + */ + vfork_done = tsk->vfork_done; + if (vfork_done) { + tsk->vfork_done = NULL; + complete(vfork_done); + } + if (core_waiters) wait_for_completion(&startup_done); +fail: BUG_ON(mm->core_waiters); + return core_waiters; } int do_coredump(long signr, int exit_code, struct pt_regs * regs) @@ -1469,22 +1492,9 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs) } mm->dumpable = 0; - retval = -EAGAIN; - spin_lock_irq(¤t->sighand->siglock); - if (!(current->signal->flags & SIGNAL_GROUP_EXIT)) { - current->signal->flags = SIGNAL_GROUP_EXIT; - current->signal->group_exit_code = exit_code; - current->signal->group_stop_count = 0; - retval = 0; - } - spin_unlock_irq(¤t->sighand->siglock); - if (retval) { - up_write(&mm->mmap_sem); + retval = coredump_wait(exit_code); + if (retval < 0) goto fail; - } - - init_completion(&mm->core_done); - coredump_wait(mm); /* * Clear any false indication of pending signals that might -- cgit v1.1 From 5debfa6da5b06954bc79fe8deed0d1062c58dcec Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 26 Jun 2006 00:26:09 -0700 Subject: [PATCH] coredump: shutdown current process first This patch optimizes zap_threads() for the case when there are no ->mm users except the current's thread group. In that case we can avoid 'for_each_process()' loop. It also adds a useful invariant: SIGNAL_GROUP_EXIT (if checked under ->siglock) always implies that all threads (except may be current) have pending SIGKILL. Signed-off-by: Oleg Nesterov Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) (limited to 'fs/exec.c') diff --git a/fs/exec.c b/fs/exec.c index 8c8f289..c8494f5 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1371,13 +1371,7 @@ static void format_corename(char *corename, const char *pattern, long signr) static void zap_process(struct task_struct *start) { struct task_struct *t; - unsigned long flags; - /* - * start->sighand can't disappear, but may be - * changed by de_thread() - */ - lock_task_sighand(start, &flags); start->signal->flags = SIGNAL_GROUP_EXIT; start->signal->group_stop_count = 0; @@ -1389,40 +1383,51 @@ static void zap_process(struct task_struct *start) signal_wake_up(t, 1); } } while ((t = next_thread(t)) != start); - - unlock_task_sighand(start, &flags); } static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm, int exit_code) { struct task_struct *g, *p; + unsigned long flags; int err = -EAGAIN; spin_lock_irq(&tsk->sighand->siglock); if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT)) { - tsk->signal->flags = SIGNAL_GROUP_EXIT; tsk->signal->group_exit_code = exit_code; - tsk->signal->group_stop_count = 0; + zap_process(tsk); err = 0; } spin_unlock_irq(&tsk->sighand->siglock); if (err) return err; + if (atomic_read(&mm->mm_users) == mm->core_waiters + 1) + goto done; + rcu_read_lock(); for_each_process(g) { + if (g == tsk->group_leader) + continue; + p = g; do { if (p->mm) { - if (p->mm == mm) + if (p->mm == mm) { + /* + * p->sighand can't disappear, but + * may be changed by de_thread() + */ + lock_task_sighand(p, &flags); zap_process(p); + unlock_task_sighand(p, &flags); + } break; } } while ((p = next_thread(p)) != g); } rcu_read_unlock(); - +done: return mm->core_waiters; } -- cgit v1.1