From d68b46fe16ad59b3a5f51ec73daaa5dc06753798 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 5 Mar 2012 14:59:13 -0800 Subject: vfork: make it killable Make vfork() killable. Change do_fork(CLONE_VFORK) to do wait_for_completion_killable(). If it fails we do not return to the user-mode and never touch the memory shared with our child. However, in this case we should clear child->vfork_done before return, we use task_lock() in do_fork()->wait_for_vfork_done() and complete_vfork_done() to serialize with each other. Note: now that we use task_lock() we don't really need completion, we could turn task->vfork_done into "task_struct *wake_up_me" but this needs some complications. NOTE: this and the next patches do not affect in-kernel users of CLONE_VFORK, kernel threads run with all signals ignored including SIGKILL/SIGSTOP. However this is obviously the user-visible change. Not only a fatal signal can kill the vforking parent, a sub-thread can do execve or exit_group() and kill the thread sleeping in vfork(). Signed-off-by: Oleg Nesterov Acked-by: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 2 +- kernel/fork.c | 40 ++++++++++++++++++++++++++++++++-------- 2 files changed, 33 insertions(+), 9 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 1b25a37..b646771 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2372,7 +2372,7 @@ static inline int thread_group_empty(struct task_struct *p) * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring * subscriptions and synchronises with wait4(). Also used in procfs. Also * pins the final release of task.io_context. Also protects ->cpuset and - * ->cgroup.subsys[]. + * ->cgroup.subsys[]. And ->vfork_done. * * Nests both inside and outside of read_lock(&tasklist_lock). * It must not be nested with write_lock_irq(&tasklist_lock), diff --git a/kernel/fork.c b/kernel/fork.c index cf3d963..892c534 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -670,10 +670,34 @@ struct mm_struct *mm_access(struct task_struct *task, unsigned int mode) void complete_vfork_done(struct task_struct *tsk) { - struct completion *vfork_done = tsk->vfork_done; + struct completion *vfork; - tsk->vfork_done = NULL; - complete(vfork_done); + task_lock(tsk); + vfork = tsk->vfork_done; + if (likely(vfork)) { + tsk->vfork_done = NULL; + complete(vfork); + } + task_unlock(tsk); +} + +static int wait_for_vfork_done(struct task_struct *child, + struct completion *vfork) +{ + int killed; + + freezer_do_not_count(); + killed = wait_for_completion_killable(vfork); + freezer_count(); + + if (killed) { + task_lock(child); + child->vfork_done = NULL; + task_unlock(child); + } + + put_task_struct(child); + return killed; } /* Please note the differences between mmput and mm_release. @@ -717,7 +741,8 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm) * If we're exiting normally, clear a user-space tid field if * requested. We leave this alone when dying by signal, to leave * the value intact in a core dump, and to save the unnecessary - * trouble otherwise. Userland only wants this done for a sys_exit. + * trouble, say, a killed vfork parent shouldn't touch this mm. + * Userland only wants this done for a sys_exit. */ if (tsk->clear_child_tid) { if (!(tsk->flags & PF_SIGNALED) && @@ -1551,6 +1576,7 @@ long do_fork(unsigned long clone_flags, if (clone_flags & CLONE_VFORK) { p->vfork_done = &vfork; init_completion(&vfork); + get_task_struct(p); } /* @@ -1568,10 +1594,8 @@ long do_fork(unsigned long clone_flags, ptrace_event(trace, nr); if (clone_flags & CLONE_VFORK) { - freezer_do_not_count(); - wait_for_completion(&vfork); - freezer_count(); - ptrace_event(PTRACE_EVENT_VFORK_DONE, nr); + if (!wait_for_vfork_done(p, &vfork)) + ptrace_event(PTRACE_EVENT_VFORK_DONE, nr); } } else { nr = PTR_ERR(p); -- cgit v1.1