From 8f3ff20862cfcb85500a2bb55ee64622bd59fd0c Mon Sep 17 00:00:00 2001 From: Paul Menage Date: Wed, 23 Sep 2009 15:56:25 -0700 Subject: cgroups: revert "cgroups: fix pid namespace bug" The following series adds a "cgroup.procs" file to each cgroup that reports unique tgids rather than pids, and allows all threads in a threadgroup to be atomically moved to a new cgroup. The subsystem "attach" interface is modified to support attaching whole threadgroups at a time, which could introduce potential problems if any subsystem were to need to access the old cgroup of every thread being moved. The attach interface may need to be revised if this becomes the case. Also added is functionality for read/write locking all CLONE_THREAD fork()ing within a threadgroup, by means of an rwsem that lives in the sighand_struct, for per-threadgroup-ness and also for sharing a cacheline with the sighand's atomic count. This scheme should introduce no extra overhead in the fork path when there's no contention. The final patch reveals potential for a race when forking before a subsystem's attach function is called - one potential solution in case any subsystem has this problem is to hang on to the group's fork mutex through the attach() calls, though no subsystem yet demonstrates need for an extended critical section. This patch: Revert commit 096b7fe012d66ed55e98bc8022405ede0cc80e96 Author: Li Zefan AuthorDate: Wed Jul 29 15:04:04 2009 -0700 Commit: Linus Torvalds CommitDate: Wed Jul 29 19:10:35 2009 -0700 cgroups: fix pid namespace bug This is in preparation for some clashing cgroups changes that subsume the original commit's functionaliy. The original commit fixed a pid namespace bug which Ben Blum fixed independently (in the same way, but with different code) as part of a series of patches. I played around with trying to reconcile Ben's patch series with Li's patch, but concluded that it was simpler to just revert Li's, given that Ben's patch series contained essentially the same fix. Signed-off-by: Paul Menage Cc: Li Zefan Cc: Matt Helsley Cc: "Eric W. Biederman" Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cgroup.h | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'include/linux/cgroup.h') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 90bba9e..c833d6f 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -179,11 +179,14 @@ struct cgroup { */ struct list_head release_list; - /* pids_mutex protects pids_list and cached pid arrays. */ + /* pids_mutex protects the fields below */ struct rw_semaphore pids_mutex; - - /* Linked list of struct cgroup_pids */ - struct list_head pids_list; + /* Array of process ids in the cgroup */ + pid_t *tasks_pids; + /* How many files are using the current tasks_pids array */ + int pids_use_count; + /* Length of the current tasks_pids array */ + int pids_length; /* For RCU-protected deletion */ struct rcu_head rcu_head; -- cgit v1.1 From 102a775e3647628727ae83a9a6abf0564c3ca7cb Mon Sep 17 00:00:00 2001 From: Ben Blum Date: Wed, 23 Sep 2009 15:56:26 -0700 Subject: cgroups: add a read-only "procs" file similar to "tasks" that shows only unique tgids struct cgroup used to have a bunch of fields for keeping track of the pidlist for the tasks file. Those are now separated into a new struct cgroup_pidlist, of which two are had, one for procs and one for tasks. The way the seq_file operations are set up is changed so that just the pidlist struct gets passed around as the private data. Interface example: Suppose a multithreaded process has pid 1000 and other threads with ids 1001, 1002, 1003: $ cat tasks 1000 1001 1002 1003 $ cat cgroup.procs 1000 $ Signed-off-by: Ben Blum Signed-off-by: Paul Menage Acked-by: Li Zefan Cc: Matt Helsley Cc: "Eric W. Biederman" Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cgroup.h | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) (limited to 'include/linux/cgroup.h') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index c833d6f..2357733 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -141,6 +141,17 @@ enum { CGRP_WAIT_ON_RMDIR, }; +struct cgroup_pidlist { + /* protects the other fields */ + struct rw_semaphore mutex; + /* array of xids */ + pid_t *list; + /* how many elements the above list has */ + int length; + /* how many files are using the current array */ + int use_count; +}; + struct cgroup { unsigned long flags; /* "unsigned long" so bitops work */ @@ -179,14 +190,9 @@ struct cgroup { */ struct list_head release_list; - /* pids_mutex protects the fields below */ - struct rw_semaphore pids_mutex; - /* Array of process ids in the cgroup */ - pid_t *tasks_pids; - /* How many files are using the current tasks_pids array */ - int pids_use_count; - /* Length of the current tasks_pids array */ - int pids_length; + /* we will have two separate pidlists, one for pids (the tasks file) + * and one for tgids (the procs file). */ + struct cgroup_pidlist tasks, procs; /* For RCU-protected deletion */ struct rcu_head rcu_head; -- cgit v1.1 From 72a8cb30d10d4041c455a7054607a7d519167c87 Mon Sep 17 00:00:00 2001 From: Ben Blum Date: Wed, 23 Sep 2009 15:56:27 -0700 Subject: cgroups: ensure correct concurrent opening/reading of pidlists across pid namespaces Previously there was the problem in which two processes from different pid namespaces reading the tasks or procs file could result in one process seeing results from the other's namespace. Rather than one pidlist for each file in a cgroup, we now keep a list of pidlists keyed by namespace and file type (tasks versus procs) in which entries are placed on demand. Each pidlist has its own lock, and that the pidlists themselves are passed around in the seq_file's private pointer means we don't have to touch the cgroup or its master list except when creating and destroying entries. Signed-off-by: Ben Blum Signed-off-by: Paul Menage Cc: Li Zefan Cc: Matt Helsley Cc: "Eric W. Biederman" Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cgroup.h | 34 +++++++++++++++++++++++++++++----- 1 file changed, 29 insertions(+), 5 deletions(-) (limited to 'include/linux/cgroup.h') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 2357733..88e8634 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -141,15 +141,36 @@ enum { CGRP_WAIT_ON_RMDIR, }; +/* which pidlist file are we talking about? */ +enum cgroup_filetype { + CGROUP_FILE_PROCS, + CGROUP_FILE_TASKS, +}; + +/* + * A pidlist is a list of pids that virtually represents the contents of one + * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists, + * a pair (one each for procs, tasks) for each pid namespace that's relevant + * to the cgroup. + */ struct cgroup_pidlist { - /* protects the other fields */ - struct rw_semaphore mutex; + /* + * used to find which pidlist is wanted. doesn't change as long as + * this particular list stays in the list. + */ + struct { enum cgroup_filetype type; struct pid_namespace *ns; } key; /* array of xids */ pid_t *list; /* how many elements the above list has */ int length; /* how many files are using the current array */ int use_count; + /* each of these stored in a list by its cgroup */ + struct list_head links; + /* pointer to the cgroup we belong to, for list removal purposes */ + struct cgroup *owner; + /* protects the other fields */ + struct rw_semaphore mutex; }; struct cgroup { @@ -190,9 +211,12 @@ struct cgroup { */ struct list_head release_list; - /* we will have two separate pidlists, one for pids (the tasks file) - * and one for tgids (the procs file). */ - struct cgroup_pidlist tasks, procs; + /* + * list of pidlists, up to two for each namespace (one for procs, one + * for tasks); created on demand. + */ + struct list_head pidlists; + struct mutex pidlist_mutex; /* For RCU-protected deletion */ struct rcu_head rcu_head; -- cgit v1.1 From c378369d8b4fa516ff2b1e79c3eded4e0e955ebb Mon Sep 17 00:00:00 2001 From: Ben Blum Date: Wed, 23 Sep 2009 15:56:29 -0700 Subject: cgroups: change css_set freeing mechanism to be under RCU Changes css_set freeing mechanism to be under RCU This is a prepatch for making the procs file writable. In order to free the old css_sets for each task to be moved as they're being moved, the freeing mechanism must be RCU-protected, or else we would have to have a call to synchronize_rcu() for each task before freeing its old css_set. Signed-off-by: Ben Blum Signed-off-by: Paul Menage Cc: "Paul E. McKenney" Acked-by: Li Zefan Cc: Matt Helsley Cc: "Eric W. Biederman" Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cgroup.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux/cgroup.h') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 88e8634..3ac78a2 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -260,6 +260,9 @@ struct css_set { * during subsystem registration (at boot time). */ struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT]; + + /* For RCU-protected deletion */ + struct rcu_head rcu_head; }; /* -- cgit v1.1 From be367d09927023d081f9199665c8500f69f14d22 Mon Sep 17 00:00:00 2001 From: Ben Blum Date: Wed, 23 Sep 2009 15:56:31 -0700 Subject: cgroups: let ss->can_attach and ss->attach do whole threadgroups at a time Alter the ss->can_attach and ss->attach functions to be able to deal with a whole threadgroup at a time, for use in cgroup_attach_proc. (This is a pre-patch to cgroup-procs-writable.patch.) Currently, new mode of the attach function can only tell the subsystem about the old cgroup of the threadgroup leader. No subsystem currently needs that information for each thread that's being moved, but if one were to be added (for example, one that counts tasks within a group) this bit would need to be reworked a bit to tell the subsystem the right information. [hidave.darkstar@gmail.com: fix build] Signed-off-by: Ben Blum Signed-off-by: Paul Menage Acked-by: Li Zefan Reviewed-by: Matt Helsley Cc: "Eric W. Biederman" Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Dave Young Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cgroup.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux/cgroup.h') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 3ac78a2..b62bb92 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -425,10 +425,11 @@ struct cgroup_subsys { struct cgroup *cgrp); int (*pre_destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp); void (*destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp); - int (*can_attach)(struct cgroup_subsys *ss, - struct cgroup *cgrp, struct task_struct *tsk); + int (*can_attach)(struct cgroup_subsys *ss, struct cgroup *cgrp, + struct task_struct *tsk, bool threadgroup); void (*attach)(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup *old_cgrp, struct task_struct *tsk); + struct cgroup *old_cgrp, struct task_struct *tsk, + bool threadgroup); void (*fork)(struct cgroup_subsys *ss, struct task_struct *task); void (*exit)(struct cgroup_subsys *ss, struct task_struct *task); int (*populate)(struct cgroup_subsys *ss, -- cgit v1.1