diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2015-06-26 19:50:04 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2015-06-26 19:50:04 -0700 |
commit | bbe179f88d39274630823a0dc07d2714fd19a103 (patch) | |
tree | f70181a660e0f859f230233643faded7d44360e5 /include | |
parent | 4b703b1d4c46ca4a00109ca1a391943ec21991b3 (diff) | |
parent | 8a0792ef8e01f03cb43806c6a87738bde34df713 (diff) | |
download | op-kernel-dev-bbe179f88d39274630823a0dc07d2714fd19a103.zip op-kernel-dev-bbe179f88d39274630823a0dc07d2714fd19a103.tar.gz |
Merge branch 'for-4.2' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup
Pull cgroup updates from Tejun Heo:
- threadgroup_lock got reorganized so that its users can pick the
actual locking mechanism to use. Its only user - cgroups - is
updated to use a percpu_rwsem instead of per-process rwsem.
This makes things a bit lighter on hot paths and allows cgroups to
perform and fail multi-task (a process) migrations atomically.
Multi-task migrations are used in several places including the
unified hierarchy.
- Delegation rule and documentation added to unified hierarchy. This
will likely be the last interface update from the cgroup core side
for unified hierarchy before lifting the devel mask.
- Some groundwork for the pids controller which is scheduled to be
merged in the coming devel cycle.
* 'for-4.2' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup:
cgroup: add delegation section to unified hierarchy documentation
cgroup: require write perm on common ancestor when moving processes on the default hierarchy
cgroup: separate out cgroup_procs_write_permission() from __cgroup_procs_write()
kernfs: make kernfs_get_inode() public
MAINTAINERS: add a cgroup core co-maintainer
cgroup: fix uninitialised iterator in for_each_subsys_which
cgroup: replace explicit ss_mask checking with for_each_subsys_which
cgroup: use bitmask to filter for_each_subsys
cgroup: add seq_file forward declaration for struct cftype
cgroup: simplify threadgroup locking
sched, cgroup: replace signal_struct->group_rwsem with a global percpu_rwsem
sched, cgroup: reorganize threadgroup locking
cgroup: switch to unsigned long for bitmasks
cgroup: reorganize include/linux/cgroup.h
cgroup: separate out include/linux/cgroup-defs.h
cgroup: fix some comment typos
Diffstat (limited to 'include')
-rw-r--r-- | include/linux/cgroup-defs.h | 501 | ||||
-rw-r--r-- | include/linux/cgroup.h | 1007 | ||||
-rw-r--r-- | include/linux/init_task.h | 8 | ||||
-rw-r--r-- | include/linux/kernfs.h | 5 | ||||
-rw-r--r-- | include/linux/sched.h | 65 |
5 files changed, 796 insertions, 790 deletions
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h new file mode 100644 index 0000000..93755a6 --- /dev/null +++ b/include/linux/cgroup-defs.h @@ -0,0 +1,501 @@ +/* + * linux/cgroup-defs.h - basic definitions for cgroup + * + * This file provides basic type and interface. Include this file directly + * only if necessary to avoid cyclic dependencies. + */ +#ifndef _LINUX_CGROUP_DEFS_H +#define _LINUX_CGROUP_DEFS_H + +#include <linux/limits.h> +#include <linux/list.h> +#include <linux/idr.h> +#include <linux/wait.h> +#include <linux/mutex.h> +#include <linux/rcupdate.h> +#include <linux/percpu-refcount.h> +#include <linux/percpu-rwsem.h> +#include <linux/workqueue.h> + +#ifdef CONFIG_CGROUPS + +struct cgroup; +struct cgroup_root; +struct cgroup_subsys; +struct cgroup_taskset; +struct kernfs_node; +struct kernfs_ops; +struct kernfs_open_file; +struct seq_file; + +#define MAX_CGROUP_TYPE_NAMELEN 32 +#define MAX_CGROUP_ROOT_NAMELEN 64 +#define MAX_CFTYPE_NAME 64 + +/* define the enumeration of all cgroup subsystems */ +#define SUBSYS(_x) _x ## _cgrp_id, +enum cgroup_subsys_id { +#include <linux/cgroup_subsys.h> + CGROUP_SUBSYS_COUNT, +}; +#undef SUBSYS + +/* bits in struct cgroup_subsys_state flags field */ +enum { + CSS_NO_REF = (1 << 0), /* no reference counting for this css */ + CSS_ONLINE = (1 << 1), /* between ->css_online() and ->css_offline() */ + CSS_RELEASED = (1 << 2), /* refcnt reached zero, released */ +}; + +/* bits in struct cgroup flags field */ +enum { + /* Control Group requires release notifications to userspace */ + CGRP_NOTIFY_ON_RELEASE, + /* + * Clone the parent's configuration when creating a new child + * cpuset cgroup. For historical reasons, this option can be + * specified at mount time and thus is implemented here. + */ + CGRP_CPUSET_CLONE_CHILDREN, +}; + +/* cgroup_root->flags */ +enum { + CGRP_ROOT_SANE_BEHAVIOR = (1 << 0), /* __DEVEL__sane_behavior specified */ + CGRP_ROOT_NOPREFIX = (1 << 1), /* mounted subsystems have no named prefix */ + CGRP_ROOT_XATTR = (1 << 2), /* supports extended attributes */ +}; + +/* cftype->flags */ +enum { + CFTYPE_ONLY_ON_ROOT = (1 << 0), /* only create on root cgrp */ + CFTYPE_NOT_ON_ROOT = (1 << 1), /* don't create on root cgrp */ + CFTYPE_NO_PREFIX = (1 << 3), /* (DON'T USE FOR NEW FILES) no subsys prefix */ + + /* internal flags, do not use outside cgroup core proper */ + __CFTYPE_ONLY_ON_DFL = (1 << 16), /* only on default hierarchy */ + __CFTYPE_NOT_ON_DFL = (1 << 17), /* not on default hierarchy */ +}; + +/* + * Per-subsystem/per-cgroup state maintained by the system. This is the + * fundamental structural building block that controllers deal with. + * + * Fields marked with "PI:" are public and immutable and may be accessed + * directly without synchronization. + */ +struct cgroup_subsys_state { + /* PI: the cgroup that this css is attached to */ + struct cgroup *cgroup; + + /* PI: the cgroup subsystem that this css is attached to */ + struct cgroup_subsys *ss; + + /* reference count - access via css_[try]get() and css_put() */ + struct percpu_ref refcnt; + + /* PI: the parent css */ + struct cgroup_subsys_state *parent; + + /* siblings list anchored at the parent's ->children */ + struct list_head sibling; + struct list_head children; + + /* + * PI: Subsys-unique ID. 0 is unused and root is always 1. The + * matching css can be looked up using css_from_id(). + */ + int id; + + unsigned int flags; + + /* + * Monotonically increasing unique serial number which defines a + * uniform order among all csses. It's guaranteed that all + * ->children lists are in the ascending order of ->serial_nr and + * used to allow interrupting and resuming iterations. + */ + u64 serial_nr; + + /* percpu_ref killing and RCU release */ + struct rcu_head rcu_head; + struct work_struct destroy_work; +}; + +/* + * A css_set is a structure holding pointers to a set of + * cgroup_subsys_state objects. This saves space in the task struct + * object and speeds up fork()/exit(), since a single inc/dec and a + * list_add()/del() can bump the reference count on the entire cgroup + * set for a task. + */ +struct css_set { + /* Reference count */ + atomic_t refcount; + + /* + * List running through all cgroup groups in the same hash + * slot. Protected by css_set_lock + */ + struct hlist_node hlist; + + /* + * Lists running through all tasks using this cgroup group. + * mg_tasks lists tasks which belong to this cset but are in the + * process of being migrated out or in. Protected by + * css_set_rwsem, but, during migration, once tasks are moved to + * mg_tasks, it can be read safely while holding cgroup_mutex. + */ + struct list_head tasks; + struct list_head mg_tasks; + + /* + * List of cgrp_cset_links pointing at cgroups referenced from this + * css_set. Protected by css_set_lock. + */ + struct list_head cgrp_links; + + /* the default cgroup associated with this css_set */ + struct cgroup *dfl_cgrp; + + /* + * Set of subsystem states, one for each subsystem. This array is + * immutable after creation apart from the init_css_set during + * subsystem registration (at boot time). + */ + struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT]; + + /* + * List of csets participating in the on-going migration either as + * source or destination. Protected by cgroup_mutex. + */ + struct list_head mg_preload_node; + struct list_head mg_node; + + /* + * If this cset is acting as the source of migration the following + * two fields are set. mg_src_cgrp is the source cgroup of the + * on-going migration and mg_dst_cset is the destination cset the + * target tasks on this cset should be migrated to. Protected by + * cgroup_mutex. + */ + struct cgroup *mg_src_cgrp; + struct css_set *mg_dst_cset; + + /* + * On the default hierarhcy, ->subsys[ssid] may point to a css + * attached to an ancestor instead of the cgroup this css_set is + * associated with. The following node is anchored at + * ->subsys[ssid]->cgroup->e_csets[ssid] and provides a way to + * iterate through all css's attached to a given cgroup. + */ + struct list_head e_cset_node[CGROUP_SUBSYS_COUNT]; + + /* For RCU-protected deletion */ + struct rcu_head rcu_head; +}; + +struct cgroup { + /* self css with NULL ->ss, points back to this cgroup */ + struct cgroup_subsys_state self; + + unsigned long flags; /* "unsigned long" so bitops work */ + + /* + * idr allocated in-hierarchy ID. + * + * ID 0 is not used, the ID of the root cgroup is always 1, and a + * new cgroup will be assigned with a smallest available ID. + * + * Allocating/Removing ID must be protected by cgroup_mutex. + */ + int id; + + /* + * If this cgroup contains any tasks, it contributes one to + * populated_cnt. All children with non-zero popuplated_cnt of + * their own contribute one. The count is zero iff there's no task + * in this cgroup or its subtree. + */ + int populated_cnt; + + struct kernfs_node *kn; /* cgroup kernfs entry */ + struct kernfs_node *procs_kn; /* kn for "cgroup.procs" */ + struct kernfs_node *populated_kn; /* kn for "cgroup.subtree_populated" */ + + /* + * The bitmask of subsystems enabled on the child cgroups. + * ->subtree_control is the one configured through + * "cgroup.subtree_control" while ->child_subsys_mask is the + * effective one which may have more subsystems enabled. + * Controller knobs are made available iff it's enabled in + * ->subtree_control. + */ + unsigned int subtree_control; + unsigned int child_subsys_mask; + + /* Private pointers for each registered subsystem */ + struct cgroup_subsys_state __rcu *subsys[CGROUP_SUBSYS_COUNT]; + + struct cgroup_root *root; + + /* + * List of cgrp_cset_links pointing at css_sets with tasks in this + * cgroup. Protected by css_set_lock. + */ + struct list_head cset_links; + + /* + * On the default hierarchy, a css_set for a cgroup with some + * susbsys disabled will point to css's which are associated with + * the closest ancestor which has the subsys enabled. The + * following lists all css_sets which point to this cgroup's css + * for the given subsystem. + */ + struct list_head e_csets[CGROUP_SUBSYS_COUNT]; + + /* + * list of pidlists, up to two for each namespace (one for procs, one + * for tasks); created on demand. + */ + struct list_head pidlists; + struct mutex pidlist_mutex; + + /* used to wait for offlining of csses */ + wait_queue_head_t offline_waitq; + + /* used to schedule release agent */ + struct work_struct release_agent_work; +}; + +/* + * A cgroup_root represents the root of a cgroup hierarchy, and may be + * associated with a kernfs_root to form an active hierarchy. This is + * internal to cgroup core. Don't access directly from controllers. + */ +struct cgroup_root { + struct kernfs_root *kf_root; + + /* The bitmask of subsystems attached to this hierarchy */ + unsigned int subsys_mask; + + /* Unique id for this hierarchy. */ + int hierarchy_id; + + /* The root cgroup. Root is destroyed on its release. */ + struct cgroup cgrp; + + /* Number of cgroups in the hierarchy, used only for /proc/cgroups */ + atomic_t nr_cgrps; + + /* A list running through the active hierarchies */ + struct list_head root_list; + + /* Hierarchy-specific flags */ + unsigned int flags; + + /* IDs for cgroups in this hierarchy */ + struct idr cgroup_idr; + + /* The path to use for release notifications. */ + char release_agent_path[PATH_MAX]; + + /* The name for this hierarchy - may be empty */ + char name[MAX_CGROUP_ROOT_NAMELEN]; +}; + +/* + * struct cftype: handler definitions for cgroup control files + * + * When reading/writing to a file: + * - the cgroup to use is file->f_path.dentry->d_parent->d_fsdata + * - the 'cftype' of the file is file->f_path.dentry->d_fsdata + */ +struct cftype { + /* + * By convention, the name should begin with the name of the + * subsystem, followed by a period. Zero length string indicates + * end of cftype array. + */ + char name[MAX_CFTYPE_NAME]; + int private; + /* + * If not 0, file mode is set to this value, otherwise it will + * be figured out automatically + */ + umode_t mode; + + /* + * The maximum length of string, excluding trailing nul, that can + * be passed to write. If < PAGE_SIZE-1, PAGE_SIZE-1 is assumed. + */ + size_t max_write_len; + + /* CFTYPE_* flags */ + unsigned int flags; + + /* + * Fields used for internal bookkeeping. Initialized automatically + * during registration. + */ + struct cgroup_subsys *ss; /* NULL for cgroup core files */ + struct list_head node; /* anchored at ss->cfts */ + struct kernfs_ops *kf_ops; + + /* + * read_u64() is a shortcut for the common case of returning a + * single integer. Use it in place of read() + */ + u64 (*read_u64)(struct cgroup_subsys_state *css, struct cftype *cft); + /* + * read_s64() is a signed version of read_u64() + */ + s64 (*read_s64)(struct cgroup_subsys_state *css, struct cftype *cft); + + /* generic seq_file read interface */ + int (*seq_show)(struct seq_file *sf, void *v); + + /* optional ops, implement all or none */ + void *(*seq_start)(struct seq_file *sf, loff_t *ppos); + void *(*seq_next)(struct seq_file *sf, void *v, loff_t *ppos); + void (*seq_stop)(struct seq_file *sf, void *v); + + /* + * write_u64() is a shortcut for the common case of accepting + * a single integer (as parsed by simple_strtoull) from + * userspace. Use in place of write(); return 0 or error. + */ + int (*write_u64)(struct cgroup_subsys_state *css, struct cftype *cft, + u64 val); + /* + * write_s64() is a signed version of write_u64() + */ + int (*write_s64)(struct cgroup_subsys_state *css, struct cftype *cft, + s64 val); + + /* + * write() is the generic write callback which maps directly to + * kernfs write operation and overrides all other operations. + * Maximum write size is determined by ->max_write_len. Use + * of_css/cft() to access the associated css and cft. + */ + ssize_t (*write)(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off); + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lock_class_key lockdep_key; +#endif +}; + +/* + * Control Group subsystem type. + * See Documentation/cgroups/cgroups.txt for details + */ +struct cgroup_subsys { + struct cgroup_subsys_state *(*css_alloc)(struct cgroup_subsys_state *parent_css); + int (*css_online)(struct cgroup_subsys_state *css); + void (*css_offline)(struct cgroup_subsys_state *css); + void (*css_released)(struct cgroup_subsys_state *css); + void (*css_free)(struct cgroup_subsys_state *css); + void (*css_reset)(struct cgroup_subsys_state *css); + void (*css_e_css_changed)(struct cgroup_subsys_state *css); + + int (*can_attach)(struct cgroup_subsys_state *css, + struct cgroup_taskset *tset); + void (*cancel_attach)(struct cgroup_subsys_state *css, + struct cgroup_taskset *tset); + void (*attach)(struct cgroup_subsys_state *css, + struct cgroup_taskset *tset); + void (*fork)(struct task_struct *task); + void (*exit)(struct cgroup_subsys_state *css, + struct cgroup_subsys_state *old_css, + struct task_struct *task); + void (*bind)(struct cgroup_subsys_state *root_css); + + int disabled; + int early_init; + + /* + * If %false, this subsystem is properly hierarchical - + * configuration, resource accounting and restriction on a parent + * cgroup cover those of its children. If %true, hierarchy support + * is broken in some ways - some subsystems ignore hierarchy + * completely while others are only implemented half-way. + * + * It's now disallowed to create nested cgroups if the subsystem is + * broken and cgroup core will emit a warning message on such + * cases. Eventually, all subsystems will be made properly + * hierarchical and this will go away. + */ + bool broken_hierarchy; + bool warned_broken_hierarchy; + + /* the following two fields are initialized automtically during boot */ + int id; + const char *name; + + /* link to parent, protected by cgroup_lock() */ + struct cgroup_root *root; + + /* idr for css->id */ + struct idr css_idr; + + /* + * List of cftypes. Each entry is the first entry of an array + * terminated by zero length name. + */ + struct list_head cfts; + + /* + * Base cftypes which are automatically registered. The two can + * point to the same array. + */ + struct cftype *dfl_cftypes; /* for the default hierarchy */ + struct cftype *legacy_cftypes; /* for the legacy hierarchies */ + + /* + * A subsystem may depend on other subsystems. When such subsystem + * is enabled on a cgroup, the depended-upon subsystems are enabled + * together if available. Subsystems enabled due to dependency are + * not visible to userland until explicitly enabled. The following + * specifies the mask of subsystems that this one depends on. + */ + unsigned int depends_on; +}; + +extern struct percpu_rw_semaphore cgroup_threadgroup_rwsem; + +/** + * cgroup_threadgroup_change_begin - threadgroup exclusion for cgroups + * @tsk: target task + * + * Called from threadgroup_change_begin() and allows cgroup operations to + * synchronize against threadgroup changes using a percpu_rw_semaphore. + */ +static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk) +{ + percpu_down_read(&cgroup_threadgroup_rwsem); +} + +/** + * cgroup_threadgroup_change_end - threadgroup exclusion for cgroups + * @tsk: target task + * + * Called from threadgroup_change_end(). Counterpart of + * cgroup_threadcgroup_change_begin(). + */ +static inline void cgroup_threadgroup_change_end(struct task_struct *tsk) +{ + percpu_up_read(&cgroup_threadgroup_rwsem); +} + +#else /* CONFIG_CGROUPS */ + +#define CGROUP_SUBSYS_COUNT 0 + +static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk) {} +static inline void cgroup_threadgroup_change_end(struct task_struct *tsk) {} + +#endif /* CONFIG_CGROUPS */ + +#endif /* _LINUX_CGROUP_DEFS_H */ diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index e7da0aa..a593e29 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -11,94 +11,200 @@ #include <linux/sched.h> #include <linux/cpumask.h> #include <linux/nodemask.h> -#include <linux/rcupdate.h> #include <linux/rculist.h> #include <linux/cgroupstats.h> #include <linux/rwsem.h> -#include <linux/idr.h> -#include <linux/workqueue.h> #include <linux/fs.h> -#include <linux/percpu-refcount.h> #include <linux/seq_file.h> #include <linux/kernfs.h> -#include <linux/wait.h> + +#include <linux/cgroup-defs.h> #ifdef CONFIG_CGROUPS -struct cgroup_root; -struct cgroup_subsys; -struct cgroup; +/* a css_task_iter should be treated as an opaque object */ +struct css_task_iter { + struct cgroup_subsys *ss; -extern int cgroup_init_early(void); -extern int cgroup_init(void); -extern void cgroup_fork(struct task_struct *p); -extern void cgroup_post_fork(struct task_struct *p); -extern void cgroup_exit(struct task_struct *p); -extern int cgroupstats_build(struct cgroupstats *stats, - struct dentry *dentry); + struct list_head *cset_pos; + struct list_head *cset_head; + + struct list_head *task_pos; + struct list_head *tasks_head; + struct list_head *mg_tasks_head; +}; -extern int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, - struct pid *pid, struct task_struct *tsk); +extern struct cgroup_root cgrp_dfl_root; +extern struct css_set init_css_set; -/* define the enumeration of all cgroup subsystems */ -#define SUBSYS(_x) _x ## _cgrp_id, -enum cgroup_subsys_id { +#define SUBSYS(_x) extern struct cgroup_subsys _x ## _cgrp_subsys; #include <linux/cgroup_subsys.h> - CGROUP_SUBSYS_COUNT, -}; #undef SUBSYS +bool css_has_online_children(struct cgroup_subsys_state *css); +struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss); +struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgroup, + struct cgroup_subsys *ss); +struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, + struct cgroup_subsys *ss); + +bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor); +int cgroup_attach_task_all(struct task_struct *from, struct task_struct *); +int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from); + +int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); +int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); +int cgroup_rm_cftypes(struct cftype *cfts); + +char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen); +int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry); +int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *tsk); + +void cgroup_fork(struct task_struct *p); +void cgroup_post_fork(struct task_struct *p); +void cgroup_exit(struct task_struct *p); + +int cgroup_init_early(void); +int cgroup_init(void); + /* - * Per-subsystem/per-cgroup state maintained by the system. This is the - * fundamental structural building block that controllers deal with. + * Iteration helpers and macros. + */ + +struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos, + struct cgroup_subsys_state *parent); +struct cgroup_subsys_state *css_next_descendant_pre(struct cgroup_subsys_state *pos, + struct cgroup_subsys_state *css); +struct cgroup_subsys_state *css_rightmost_descendant(struct cgroup_subsys_state *pos); +struct cgroup_subsys_state *css_next_descendant_post(struct cgroup_subsys_state *pos, + struct cgroup_subsys_state *css); + +struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset); +struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset); + +void css_task_iter_start(struct cgroup_subsys_state *css, + struct css_task_iter *it); +struct task_struct *css_task_iter_next(struct css_task_iter *it); +void css_task_iter_end(struct css_task_iter *it); + +/** + * css_for_each_child - iterate through children of a css + * @pos: the css * to use as the loop cursor + * @parent: css whose children to walk * - * Fields marked with "PI:" are public and immutable and may be accessed - * directly without synchronization. + * Walk @parent's children. Must be called under rcu_read_lock(). + * + * If a subsystem synchronizes ->css_online() and the start of iteration, a + * css which finished ->css_online() is guaranteed to be visible in the + * future iterations and will stay visible until the last reference is put. + * A css which hasn't finished ->css_online() or already finished + * ->css_offline() may show up during traversal. It's each subsystem's + * responsibility to synchronize against on/offlining. + * + * It is allowed to temporarily drop RCU read lock during iteration. The + * caller is responsible for ensuring that @pos remains accessible until + * the start of the next iteration by, for example, bumping the css refcnt. */ -struct cgroup_subsys_state { - /* PI: the cgroup that this css is attached to */ - struct cgroup *cgroup; - - /* PI: the cgroup subsystem that this css is attached to */ - struct cgroup_subsys *ss; - - /* reference count - access via css_[try]get() and css_put() */ - struct percpu_ref refcnt; - - /* PI: the parent css */ - struct cgroup_subsys_state *parent; - - /* siblings list anchored at the parent's ->children */ - struct list_head sibling; - struct list_head children; - - /* - * PI: Subsys-unique ID. 0 is unused and root is always 1. The - * matching css can be looked up using css_from_id(). - */ - int id; - - unsigned int flags; - - /* - * Monotonically increasing unique serial number which defines a - * uniform order among all csses. It's guaranteed that all - * ->children lists are in the ascending order of ->serial_nr and - * used to allow interrupting and resuming iterations. - */ - u64 serial_nr; - - /* percpu_ref killing and RCU release */ - struct rcu_head rcu_head; - struct work_struct destroy_work; -}; +#define css_for_each_child(pos, parent) \ + for ((pos) = css_next_child(NULL, (parent)); (pos); \ + (pos) = css_next_child((pos), (parent))) -/* bits in struct cgroup_subsys_state flags field */ -enum { - CSS_NO_REF = (1 << 0), /* no reference counting for this css */ - CSS_ONLINE = (1 << 1), /* between ->css_online() and ->css_offline() */ - CSS_RELEASED = (1 << 2), /* refcnt reached zero, released */ -}; +/** + * css_for_each_descendant_pre - pre-order walk of a css's descendants + * @pos: the css * to use as the loop cursor + * @root: css whose descendants to walk + * + * Walk @root's descendants. @root is included in the iteration and the + * first node to be visited. Must be called under rcu_read_lock(). + * + * If a subsystem synchronizes ->css_online() and the start of iteration, a + * css which finished ->css_online() is guaranteed to be visible in the + * future iterations and will stay visible until the last reference is put. + * A css which hasn't finished ->css_online() or already finished + * ->css_offline() may show up during traversal. It's each subsystem's + * responsibility to synchronize against on/offlining. + * + * For example, the following guarantees that a descendant can't escape + * state updates of its ancestors. + * + * my_online(@css) + * { + * Lock @css's parent and @css; + * Inherit state from the parent; + * Unlock both. + * } + * + * my_update_state(@css) + * { + * css_for_each_descendant_pre(@pos, @css) { + * Lock @pos; + * if (@pos == @css) + * Update @css's state; + * else + * Verify @pos is alive and inherit state from its parent; + * Unlock @pos; + * } + * } + * + * As long as the inheriting step, including checking the parent state, is + * enclosed inside @pos locking, double-locking the parent isn't necessary + * while inheriting. The state update to the parent is guaranteed to be + * visible by walking order and, as long as inheriting operations to the + * same @pos are atomic to each other, multiple updates racing each other + * still result in the correct state. It's guaranateed that at least one + * inheritance happens for any css after the latest update to its parent. + * + * If checking parent's state requires locking the parent, each inheriting + * iteration should lock and unlock both @pos->parent and @pos. + * + * Alternatively, a subsystem may choose to use a single global lock to + * synchronize ->css_online() and ->css_offline() against tree-walking + * operations. + * + * It is allowed to temporarily drop RCU read lock during iteration. The + * caller is responsible for ensuring that @pos remains accessible until + * the start of the next iteration by, for example, bumping the css refcnt. + */ +#define css_for_each_descendant_pre(pos, css) \ + for ((pos) = css_next_descendant_pre(NULL, (css)); (pos); \ + (pos) = css_next_descendant_pre((pos), (css))) + +/** + * css_for_each_descendant_post - post-order walk of a css's descendants + * @pos: the css * to use as the loop cursor + * @css: css whose descendants to walk + * + * Similar to css_for_each_descendant_pre() but performs post-order + * traversal instead. @root is included in the iteration and the last + * node to be visited. + * + * If a subsystem synchronizes ->css_online() and the start of iteration, a + * css which finished ->css_online() is guaranteed to be visible in the + * future iterations and will stay visible until the last reference is put. + * A css which hasn't finished ->css_online() or already finished + * ->css_offline() may show up during traversal. It's each subsystem's + * responsibility to synchronize against on/offlining. + * + * Note that the walk visibility guarantee example described in pre-order + * walk doesn't apply the same to post-order walks. + */ +#define css_for_each_descendant_post(pos, css) \ + for ((pos) = css_next_descendant_post(NULL, (css)); (pos); \ + (pos) = css_next_descendant_post((pos), (css))) + +/** + * cgroup_taskset_for_each - iterate cgroup_taskset + * @task: the loop cursor + * @tset: taskset to iterate + */ +#define cgroup_taskset_for_each(task, tset) \ + for ((task) = cgroup_taskset_first((tset)); (task); \ + (task) = cgroup_taskset_next((tset))) + +/* + * Inline functions. + */ /** * css_get - obtain a reference on the specified css @@ -185,532 +291,6 @@ static inline void css_put_many(struct cgroup_subsys_state *css, unsigned int n) percpu_ref_put_many(&css->refcnt, n); } -/* bits in struct cgroup flags field */ -enum { - /* Control Group requires release notifications to userspace */ - CGRP_NOTIFY_ON_RELEASE, - /* - * Clone the parent's configuration when creating a new child - * cpuset cgroup. For historical reasons, this option can be - * specified at mount time and thus is implemented here. - */ - CGRP_CPUSET_CLONE_CHILDREN, -}; - -struct cgroup { - /* self css with NULL ->ss, points back to this cgroup */ - struct cgroup_subsys_state self; - - unsigned long flags; /* "unsigned long" so bitops work */ - - /* - * idr allocated in-hierarchy ID. - * - * ID 0 is not used, the ID of the root cgroup is always 1, and a - * new cgroup will be assigned with a smallest available ID. - * - * Allocating/Removing ID must be protected by cgroup_mutex. - */ - int id; - - /* - * If this cgroup contains any tasks, it contributes one to - * populated_cnt. All children with non-zero popuplated_cnt of - * their own contribute one. The count is zero iff there's no task - * in this cgroup or its subtree. - */ - int populated_cnt; - - struct kernfs_node *kn; /* cgroup kernfs entry */ - struct kernfs_node *populated_kn; /* kn for "cgroup.subtree_populated" */ - - /* - * The bitmask of subsystems enabled on the child cgroups. - * ->subtree_control is the one configured through - * "cgroup.subtree_control" while ->child_subsys_mask is the - * effective one which may have more subsystems enabled. - * Controller knobs are made available iff it's enabled in - * ->subtree_control. - */ - unsigned int subtree_control; - unsigned int child_subsys_mask; - - /* Private pointers for each registered subsystem */ - struct cgroup_subsys_state __rcu *subsys[CGROUP_SUBSYS_COUNT]; - - struct cgroup_root *root; - - /* - * List of cgrp_cset_links pointing at css_sets with tasks in this - * cgroup. Protected by css_set_lock. - */ - struct list_head cset_links; - - /* - * On the default hierarchy, a css_set for a cgroup with some - * susbsys disabled will point to css's which are associated with - * the closest ancestor which has the subsys enabled. The - * following lists all css_sets which point to this cgroup's css - * for the given subsystem. - */ - struct list_head e_csets[CGROUP_SUBSYS_COUNT]; - - /* - * list of pidlists, up to two for each namespace (one for procs, one - * for tasks); created on demand. - */ - struct list_head pidlists; - struct mutex pidlist_mutex; - - /* used to wait for offlining of csses */ - wait_queue_head_t offline_waitq; - - /* used to schedule release agent */ - struct work_struct release_agent_work; -}; - -#define MAX_CGROUP_ROOT_NAMELEN 64 - -/* cgroup_root->flags */ -enum { - CGRP_ROOT_SANE_BEHAVIOR = (1 << 0), /* __DEVEL__sane_behavior specified */ - CGRP_ROOT_NOPREFIX = (1 << 1), /* mounted subsystems have no named prefix */ - CGRP_ROOT_XATTR = (1 << 2), /* supports extended attributes */ -}; - -/* - * A cgroup_root represents the root of a cgroup hierarchy, and may be - * associated with a kernfs_root to form an active hierarchy. This is - * internal to cgroup core. Don't access directly from controllers. - */ -struct cgroup_root { - struct kernfs_root *kf_root; - - /* The bitmask of subsystems attached to this hierarchy */ - unsigned int subsys_mask; - - /* Unique id for this hierarchy. */ - int hierarchy_id; - - /* The root cgroup. Root is destroyed on its release. */ - struct cgroup cgrp; - - /* Number of cgroups in the hierarchy, used only for /proc/cgroups */ - atomic_t nr_cgrps; - - /* A list running through the active hierarchies */ - struct list_head root_list; - - /* Hierarchy-specific flags */ - unsigned int flags; - - /* IDs for cgroups in this hierarchy */ - struct idr cgroup_idr; - - /* The path to use for release notifications. */ - char release_agent_path[PATH_MAX]; - - /* The name for this hierarchy - may be empty */ - char name[MAX_CGROUP_ROOT_NAMELEN]; -}; - -/* - * A css_set is a structure holding pointers to a set of - * cgroup_subsys_state objects. This saves space in the task struct - * object and speeds up fork()/exit(), since a single inc/dec and a - * list_add()/del() can bump the reference count on the entire cgroup - * set for a task. - */ - -struct css_set { - - /* Reference count */ - atomic_t refcount; - - /* - * List running through all cgroup groups in the same hash - * slot. Protected by css_set_lock - */ - struct hlist_node hlist; - - /* - * Lists running through all tasks using this cgroup group. - * mg_tasks lists tasks which belong to this cset but are in the - * process of being migrated out or in. Protected by - * css_set_rwsem, but, during migration, once tasks are moved to - * mg_tasks, it can be read safely while holding cgroup_mutex. - */ - struct list_head tasks; - struct list_head mg_tasks; - - /* - * List of cgrp_cset_links pointing at cgroups referenced from this - * css_set. Protected by css_set_lock. - */ - struct list_head cgrp_links; - - /* the default cgroup associated with this css_set */ - struct cgroup *dfl_cgrp; - - /* - * Set of subsystem states, one for each subsystem. This array is - * immutable after creation apart from the init_css_set during - * subsystem registration (at boot time). - */ - struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT]; - - /* - * List of csets participating in the on-going migration either as - * source or destination. Protected by cgroup_mutex. - */ - struct list_head mg_preload_node; - struct list_head mg_node; - - /* - * If this cset is acting as the source of migration the following - * two fields are set. mg_src_cgrp is the source cgroup of the - * on-going migration and mg_dst_cset is the destination cset the - * target tasks on this cset should be migrated to. Protected by - * cgroup_mutex. - */ - struct cgroup *mg_src_cgrp; - struct css_set *mg_dst_cset; - - /* - * On the default hierarhcy, ->subsys[ssid] may point to a css - * attached to an ancestor instead of the cgroup this css_set is - * associated with. The following node is anchored at - * ->subsys[ssid]->cgroup->e_csets[ssid] and provides a way to - * iterate through all css's attached to a given cgroup. - */ - struct list_head e_cset_node[CGROUP_SUBSYS_COUNT]; - - /* For RCU-protected deletion */ - struct rcu_head rcu_head; -}; - -/* - * struct cftype: handler definitions for cgroup control files - * - * When reading/writing to a file: - * - the cgroup to use is file->f_path.dentry->d_parent->d_fsdata - * - the 'cftype' of the file is file->f_path.dentry->d_fsdata - */ - -/* cftype->flags */ -enum { - CFTYPE_ONLY_ON_ROOT = (1 << 0), /* only create on root cgrp */ - CFTYPE_NOT_ON_ROOT = (1 << 1), /* don't create on root cgrp */ - CFTYPE_NO_PREFIX = (1 << 3), /* (DON'T USE FOR NEW FILES) no subsys prefix */ - - /* internal flags, do not use outside cgroup core proper */ - __CFTYPE_ONLY_ON_DFL = (1 << 16), /* only on default hierarchy */ - __CFTYPE_NOT_ON_DFL = (1 << 17), /* not on default hierarchy */ -}; - -#define MAX_CFTYPE_NAME 64 - -struct cftype { - /* - * By convention, the name should begin with the name of the - * subsystem, followed by a period. Zero length string indicates - * end of cftype array. - */ - char name[MAX_CFTYPE_NAME]; - int private; - /* - * If not 0, file mode is set to this value, otherwise it will - * be figured out automatically - */ - umode_t mode; - - /* - * The maximum length of string, excluding trailing nul, that can - * be passed to write. If < PAGE_SIZE-1, PAGE_SIZE-1 is assumed. - */ - size_t max_write_len; - - /* CFTYPE_* flags */ - unsigned int flags; - - /* - * Fields used for internal bookkeeping. Initialized automatically - * during registration. - */ - struct cgroup_subsys *ss; /* NULL for cgroup core files */ - struct list_head node; /* anchored at ss->cfts */ - struct kernfs_ops *kf_ops; - - /* - * read_u64() is a shortcut for the common case of returning a - * single integer. Use it in place of read() - */ - u64 (*read_u64)(struct cgroup_subsys_state *css, struct cftype *cft); - /* - * read_s64() is a signed version of read_u64() - */ - s64 (*read_s64)(struct cgroup_subsys_state *css, struct cftype *cft); - - /* generic seq_file read interface */ - int (*seq_show)(struct seq_file *sf, void *v); - - /* optional ops, implement all or none */ - void *(*seq_start)(struct seq_file *sf, loff_t *ppos); - void *(*seq_next)(struct seq_file *sf, void *v, loff_t *ppos); - void (*seq_stop)(struct seq_file *sf, void *v); - - /* - * write_u64() is a shortcut for the common case of accepting - * a single integer (as parsed by simple_strtoull) from - * userspace. Use in place of write(); return 0 or error. - */ - int (*write_u64)(struct cgroup_subsys_state *css, struct cftype *cft, - u64 val); - /* - * write_s64() is a signed version of write_u64() - */ - int (*write_s64)(struct cgroup_subsys_state *css, struct cftype *cft, - s64 val); - - /* - * write() is the generic write callback which maps directly to - * kernfs write operation and overrides all other operations. - * Maximum write size is determined by ->max_write_len. Use - * of_css/cft() to access the associated css and cft. - */ - ssize_t (*write)(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off); - -#ifdef CONFIG_DEBUG_LOCK_ALLOC - struct lock_class_key lockdep_key; -#endif -}; - -extern struct cgroup_root cgrp_dfl_root; -extern struct css_set init_css_set; - -/** - * cgroup_on_dfl - test whether a cgroup is on the default hierarchy - * @cgrp: the cgroup of interest - * - * The default hierarchy is the v2 interface of cgroup and this function - * can be used to test whether a cgroup is on the default hierarchy for - * cases where a subsystem should behave differnetly depending on the - * interface version. - * - * The set of behaviors which change on the default hierarchy are still - * being determined and the mount option is prefixed with __DEVEL__. - * - * List of changed behaviors: - * - * - Mount options "noprefix", "xattr", "clone_children", "release_agent" - * and "name" are disallowed. - * - * - When mounting an existing superblock, mount options should match. - * - * - Remount is disallowed. - * - * - rename(2) is disallowed. - * - * - "tasks" is removed. Everything should be at process granularity. Use - * "cgroup.procs" instead. - * - * - "cgroup.procs" is not sorted. pids will be unique unless they got - * recycled inbetween reads. - * - * - "release_agent" and "notify_on_release" are removed. Replacement - * notification mechanism will be implemented. - * - * - "cgroup.clone_children" is removed. - * - * - "cgroup.subtree_populated" is available. Its value is 0 if the cgroup - * and its descendants contain no task; otherwise, 1. The file also - * generates kernfs notification which can be monitored through poll and - * [di]notify when the value of the file changes. - * - * - cpuset: tasks will be kept in empty cpusets when hotplug happens and - * take masks of ancestors with non-empty cpus/mems, instead of being - * moved to an ancestor. - * - * - cpuset: a task can be moved into an empty cpuset, and again it takes - * masks of ancestors. - * - * - memcg: use_hierarchy is on by default and the cgroup file for the flag - * is not created. - * - * - blkcg: blk-throttle becomes properly hierarchical. - * - * - debug: disallowed on the default hierarchy. - */ -static inline bool cgroup_on_dfl(const struct cgroup *cgrp) -{ - return cgrp->root == &cgrp_dfl_root; -} - -/* no synchronization, the result can only be used as a hint */ -static inline bool cgroup_has_tasks(struct cgroup *cgrp) -{ - return !list_empty(&cgrp->cset_links); -} - -/* returns ino associated with a cgroup */ -static inline ino_t cgroup_ino(struct cgroup *cgrp) -{ - return cgrp->kn->ino; -} - -/* cft/css accessors for cftype->write() operation */ -static inline struct cftype *of_cft(struct kernfs_open_file *of) -{ - return of->kn->priv; -} - -struct cgroup_subsys_state *of_css(struct kernfs_open_file *of); - -/* cft/css accessors for cftype->seq_*() operations */ -static inline struct cftype *seq_cft(struct seq_file *seq) -{ - return of_cft(seq->private); -} - -static inline struct cgroup_subsys_state *seq_css(struct seq_file *seq) -{ - return of_css(seq->private); -} - -/* - * Name / path handling functions. All are thin wrappers around the kernfs - * counterparts and can be called under any context. - */ - -static inline int cgroup_name(struct cgroup *cgrp, char *buf, size_t buflen) -{ - return kernfs_name(cgrp->kn, buf, buflen); -} - -static inline char * __must_check cgroup_path(struct cgroup *cgrp, char *buf, - size_t buflen) -{ - return kernfs_path(cgrp->kn, buf, buflen); -} - -static inline void pr_cont_cgroup_name(struct cgroup *cgrp) -{ - pr_cont_kernfs_name(cgrp->kn); -} - -static inline void pr_cont_cgroup_path(struct cgroup *cgrp) -{ - pr_cont_kernfs_path(cgrp->kn); -} - -char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen); - -int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); -int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); -int cgroup_rm_cftypes(struct cftype *cfts); - -bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor); - -/* - * Control Group taskset, used to pass around set of tasks to cgroup_subsys - * methods. - */ -struct cgroup_taskset; -struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset); -struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset); - -/** - * cgroup_taskset_for_each - iterate cgroup_taskset - * @task: the loop cursor - * @tset: taskset to iterate - */ -#define cgroup_taskset_for_each(task, tset) \ - for ((task) = cgroup_taskset_first((tset)); (task); \ - (task) = cgroup_taskset_next((tset))) - -/* - * Control Group subsystem type. - * See Documentation/cgroups/cgroups.txt for details - */ - -struct cgroup_subsys { - struct cgroup_subsys_state *(*css_alloc)(struct cgroup_subsys_state *parent_css); - int (*css_online)(struct cgroup_subsys_state *css); - void (*css_offline)(struct cgroup_subsys_state *css); - void (*css_released)(struct cgroup_subsys_state *css); - void (*css_free)(struct cgroup_subsys_state *css); - void (*css_reset)(struct cgroup_subsys_state *css); - void (*css_e_css_changed)(struct cgroup_subsys_state *css); - - int (*can_attach)(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset); - void (*cancel_attach)(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset); - void (*attach)(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset); - void (*fork)(struct task_struct *task); - void (*exit)(struct cgroup_subsys_state *css, - struct cgroup_subsys_state *old_css, - struct task_struct *task); - void (*bind)(struct cgroup_subsys_state *root_css); - - int disabled; - int early_init; - - /* - * If %false, this subsystem is properly hierarchical - - * configuration, resource accounting and restriction on a parent - * cgroup cover those of its children. If %true, hierarchy support - * is broken in some ways - some subsystems ignore hierarchy - * completely while others are only implemented half-way. - * - * It's now disallowed to create nested cgroups if the subsystem is - * broken and cgroup core will emit a warning message on such - * cases. Eventually, all subsystems will be made properly - * hierarchical and this will go away. - */ - bool broken_hierarchy; - bool warned_broken_hierarchy; - - /* the following two fields are initialized automtically during boot */ - int id; -#define MAX_CGROUP_TYPE_NAMELEN 32 - const char *name; - - /* link to parent, protected by cgroup_lock() */ - struct cgroup_root *root; - - /* idr for css->id */ - struct idr css_idr; - - /* - * List of cftypes. Each entry is the first entry of an array - * terminated by zero length name. - */ - struct list_head cfts; - - /* - * Base cftypes which are automatically registered. The two can - * point to the same array. - */ - struct cftype *dfl_cftypes; /* for the default hierarchy */ - struct cftype *legacy_cftypes; /* for the legacy hierarchies */ - - /* - * A subsystem may depend on other subsystems. When such subsystem - * is enabled on a cgroup, the depended-upon subsystems are enabled - * together if available. Subsystems enabled due to dependency are - * not visible to userland until explicitly enabled. The following - * specifies the mask of subsystems that this one depends on. - */ - unsigned int depends_on; -}; - -#define SUBSYS(_x) extern struct cgroup_subsys _x ## _cgrp_subsys; -#include <linux/cgroup_subsys.h> -#undef SUBSYS - /** * task_css_set_check - obtain a task's css_set with extra access conditions * @task: the task to obtain css_set for @@ -818,178 +398,137 @@ static inline struct cgroup *task_cgroup(struct task_struct *task, return task_css(task, subsys_id)->cgroup; } -struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos, - struct cgroup_subsys_state *parent); - -struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss); - /** - * css_for_each_child - iterate through children of a css - * @pos: the css * to use as the loop cursor - * @parent: css whose children to walk + * cgroup_on_dfl - test whether a cgroup is on the default hierarchy + * @cgrp: the cgroup of interest * - * Walk @parent's children. Must be called under rcu_read_lock(). + * The default hierarchy is the v2 interface of cgroup and this function + * can be used to test whether a cgroup is on the default hierarchy for + * cases where a subsystem should behave differnetly depending on the + * interface version. * - * If a subsystem synchronizes ->css_online() and the start of iteration, a - * css which finished ->css_online() is guaranteed to be visible in the - * future iterations and will stay visible until the last reference is put. - * A css which hasn't finished ->css_online() or already finished - * ->css_offline() may show up during traversal. It's each subsystem's - * responsibility to synchronize against on/offlining. + * The set of behaviors which change on the default hierarchy are still + * being determined and the mount option is prefixed with __DEVEL__. * - * It is allowed to temporarily drop RCU read lock during iteration. The - * caller is responsible for ensuring that @pos remains accessible until - * the start of the next iteration by, for example, bumping the css refcnt. - */ -#define css_for_each_child(pos, parent) \ - for ((pos) = css_next_child(NULL, (parent)); (pos); \ - (pos) = css_next_child((pos), (parent))) - -struct cgroup_subsys_state * -css_next_descendant_pre(struct cgroup_subsys_state *pos, - struct cgroup_subsys_state *css); - -struct cgroup_subsys_state * -css_rightmost_descendant(struct cgroup_subsys_state *pos); - -/** - * css_for_each_descendant_pre - pre-order walk of a css's descendants - * @pos: the css * to use as the loop cursor - * @root: css whose descendants to walk + * List of changed behaviors: * - * Walk @root's descendants. @root is included in the iteration and the - * first node to be visited. Must be called under rcu_read_lock(). + * - Mount options "noprefix", "xattr", "clone_children", "release_agent" + * and "name" are disallowed. * - * If a subsystem synchronizes ->css_online() and the start of iteration, a - * css which finished ->css_online() is guaranteed to be visible in the - * future iterations and will stay visible until the last reference is put. - * A css which hasn't finished ->css_online() or already finished - * ->css_offline() may show up during traversal. It's each subsystem's - * responsibility to synchronize against on/offlining. + * - When mounting an existing superblock, mount options should match. * - * For example, the following guarantees that a descendant can't escape - * state updates of its ancestors. + * - Remount is disallowed. * - * my_online(@css) - * { - * Lock @css's parent and @css; - * Inherit state from the parent; - * Unlock both. - * } + * - rename(2) is disallowed. * - * my_update_state(@css) - * { - * css_for_each_descendant_pre(@pos, @css) { - * Lock @pos; - * if (@pos == @css) - * Update @css's state; - * else - * Verify @pos is alive and inherit state from its parent; - * Unlock @pos; - * } - * } + * - "tasks" is removed. Everything should be at process granularity. Use + * "cgroup.procs" instead. * - * As long as the inheriting step, including checking the parent state, is - * enclosed inside @pos locking, double-locking the parent isn't necessary - * while inheriting. The state update to the parent is guaranteed to be - * visible by walking order and, as long as inheriting operations to the - * same @pos are atomic to each other, multiple updates racing each other - * still result in the correct state. It's guaranateed that at least one - * inheritance happens for any css after the latest update to its parent. + * - "cgroup.procs" is not sorted. pids will be unique unless they got + * recycled inbetween reads. * - * If checking parent's state requires locking the parent, each inheriting - * iteration should lock and unlock both @pos->parent and @pos. + * - "release_agent" and "notify_on_release" are removed. Replacement + * notification mechanism will be implemented. * - * Alternatively, a subsystem may choose to use a single global lock to - * synchronize ->css_online() and ->css_offline() against tree-walking - * operations. + * - "cgroup.clone_children" is removed. * - * It is allowed to temporarily drop RCU read lock during iteration. The - * caller is responsible for ensuring that @pos remains accessible until - * the start of the next iteration by, for example, bumping the css refcnt. - */ -#define css_for_each_descendant_pre(pos, css) \ - for ((pos) = css_next_descendant_pre(NULL, (css)); (pos); \ - (pos) = css_next_descendant_pre((pos), (css))) - -struct cgroup_subsys_state * -css_next_descendant_post(struct cgroup_subsys_state *pos, - struct cgroup_subsys_state *css); - -/** - * css_for_each_descendant_post - post-order walk of a css's descendants - * @pos: the css * to use as the loop cursor - * @css: css whose descendants to walk + * - "cgroup.subtree_populated" is available. Its value is 0 if the cgroup + * and its descendants contain no task; otherwise, 1. The file also + * generates kernfs notification which can be monitored through poll and + * [di]notify when the value of the file changes. * - * Similar to css_for_each_descendant_pre() but performs post-order - * traversal instead. @root is included in the iteration and the last - * node to be visited. + * - cpuset: tasks will be kept in empty cpusets when hotplug happens and + * take masks of ancestors with non-empty cpus/mems, instead of being + * moved to an ancestor. * - * If a subsystem synchronizes ->css_online() and the start of iteration, a - * css which finished ->css_online() is guaranteed to be visible in the - * future iterations and will stay visible until the last reference is put. - * A css which hasn't finished ->css_online() or already finished - * ->css_offline() may show up during traversal. It's each subsystem's - * responsibility to synchronize against on/offlining. + * - cpuset: a task can be moved into an empty cpuset, and again it takes + * masks of ancestors. * - * Note that the walk visibility guarantee example described in pre-order - * walk doesn't apply the same to post-order walks. + * - memcg: use_hierarchy is on by default and the cgroup file for the flag + * is not created. + * + * - blkcg: blk-throttle becomes properly hierarchical. + * + * - debug: disallowed on the default hierarchy. */ -#define css_for_each_descendant_post(pos, css) \ - for ((pos) = css_next_descendant_post(NULL, (css)); (pos); \ - (pos) = css_next_descendant_post((pos), (css))) +static inline bool cgroup_on_dfl(const struct cgroup *cgrp) +{ + return cgrp->root == &cgrp_dfl_root; +} -bool css_has_online_children(struct cgroup_subsys_state *css); +/* no synchronization, the result can only be used as a hint */ +static inline bool cgroup_has_tasks(struct cgroup *cgrp) +{ + return !list_empty(&cgrp->cset_links); +} -/* A css_task_iter should be treated as an opaque object */ -struct css_task_iter { - struct cgroup_subsys *ss; +/* returns ino associated with a cgroup */ +static inline ino_t cgroup_ino(struct cgroup *cgrp) +{ + return cgrp->kn->ino; +} - struct list_head *cset_pos; - struct list_head *cset_head; +/* cft/css accessors for cftype->write() operation */ +static inline struct cftype *of_cft(struct kernfs_open_file *of) +{ + return of->kn->priv; +} - struct list_head *task_pos; - struct list_head *tasks_head; - struct list_head *mg_tasks_head; -}; +struct cgroup_subsys_state *of_css(struct kernfs_open_file *of); -void css_task_iter_start(struct cgroup_subsys_state *css, - struct css_task_iter *it); -struct task_struct *css_task_iter_next(struct css_task_iter *it); -void css_task_iter_end(struct css_task_iter *it); +/* cft/css accessors for cftype->seq_*() operations */ +static inline struct cftype *seq_cft(struct seq_file *seq) +{ + return of_cft(seq->private); +} -int cgroup_attach_task_all(struct task_struct *from, struct task_struct *); -int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from); +static inline struct cgroup_subsys_state *seq_css(struct seq_file *seq) +{ + return of_css(seq->private); +} -struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgroup, - struct cgroup_subsys *ss); -struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, - struct cgroup_subsys *ss); +/* + * Name / path handling functions. All are thin wrappers around the kernfs + * counterparts and can be called under any context. + */ -#else /* !CONFIG_CGROUPS */ +static inline int cgroup_name(struct cgroup *cgrp, char *buf, size_t buflen) +{ + return kernfs_name(cgrp->kn, buf, buflen); +} -struct cgroup_subsys_state; +static inline char * __must_check cgroup_path(struct cgroup *cgrp, char *buf, + size_t buflen) +{ + return kernfs_path(cgrp->kn, buf, buflen); +} -static inline int cgroup_init_early(void) { return 0; } -static inline int cgroup_init(void) { return 0; } -static inline void cgroup_fork(struct task_struct *p) {} -static inline void cgroup_post_fork(struct task_struct *p) {} -static inline void cgroup_exit(struct task_struct *p) {} +static inline void pr_cont_cgroup_name(struct cgroup *cgrp) +{ + pr_cont_kernfs_name(cgrp->kn); +} -static inline int cgroupstats_build(struct cgroupstats *stats, - struct dentry *dentry) +static inline void pr_cont_cgroup_path(struct cgroup *cgrp) { - return -EINVAL; + pr_cont_kernfs_path(cgrp->kn); } -static inline void css_put(struct cgroup_subsys_state *css) {} +#else /* !CONFIG_CGROUPS */ -/* No cgroups - nothing to do */ +struct cgroup_subsys_state; + +static inline void css_put(struct cgroup_subsys_state *css) {} static inline int cgroup_attach_task_all(struct task_struct *from, - struct task_struct *t) -{ - return 0; -} + struct task_struct *t) { return 0; } +static inline int cgroupstats_build(struct cgroupstats *stats, + struct dentry *dentry) { return -EINVAL; } + +static inline void cgroup_fork(struct task_struct *p) {} +static inline void cgroup_post_fork(struct task_struct *p) {} +static inline void cgroup_exit(struct task_struct *p) {} + +static inline int cgroup_init_early(void) { return 0; } +static inline int cgroup_init(void) { return 0; } #endif /* !CONFIG_CGROUPS */ diff --git a/include/linux/init_task.h b/include/linux/init_task.h index bb9b075..e8493fe 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -25,13 +25,6 @@ extern struct files_struct init_files; extern struct fs_struct init_fs; -#ifdef CONFIG_CGROUPS -#define INIT_GROUP_RWSEM(sig) \ - .group_rwsem = __RWSEM_INITIALIZER(sig.group_rwsem), -#else -#define INIT_GROUP_RWSEM(sig) -#endif - #ifdef CONFIG_CPUSETS #define INIT_CPUSET_SEQ(tsk) \ .mems_allowed_seq = SEQCNT_ZERO(tsk.mems_allowed_seq), @@ -55,7 +48,6 @@ extern struct fs_struct init_fs; }, \ .cred_guard_mutex = \ __MUTEX_INITIALIZER(sig.cred_guard_mutex), \ - INIT_GROUP_RWSEM(sig) \ } extern struct nsproxy init_nsproxy; diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 71ecdab..e6b2f7d 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -277,6 +277,7 @@ void kernfs_put(struct kernfs_node *kn); struct kernfs_node *kernfs_node_from_dentry(struct dentry *dentry); struct kernfs_root *kernfs_root_from_sb(struct super_block *sb); +struct inode *kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn); struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops, unsigned int flags, void *priv); @@ -352,6 +353,10 @@ static inline struct kernfs_node *kernfs_node_from_dentry(struct dentry *dentry) static inline struct kernfs_root *kernfs_root_from_sb(struct super_block *sb) { return NULL; } +static inline struct inode * +kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn) +{ return NULL; } + static inline struct kernfs_root * kernfs_create_root(struct kernfs_syscall_ops *scops, unsigned int flags, void *priv) diff --git a/include/linux/sched.h b/include/linux/sched.h index 93ed0b6..a09ece3 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -58,6 +58,7 @@ struct sched_param { #include <linux/uidgid.h> #include <linux/gfp.h> #include <linux/magic.h> +#include <linux/cgroup-defs.h> #include <asm/processor.h> @@ -755,18 +756,6 @@ struct signal_struct { unsigned audit_tty_log_passwd; struct tty_audit_buf *tty_audit_buf; #endif -#ifdef CONFIG_CGROUPS - /* - * group_rwsem prevents new tasks from entering the threadgroup and - * member tasks from exiting,a more specifically, setting of - * PF_EXITING. fork and exit paths are protected with this rwsem - * using threadgroup_change_begin/end(). Users which require - * threadgroup to remain stable should use threadgroup_[un]lock() - * which also takes care of exec path. Currently, cgroup is the - * only user. - */ - struct rw_semaphore group_rwsem; -#endif oom_flags_t oom_flags; short oom_score_adj; /* OOM kill score adjustment */ @@ -2725,53 +2714,33 @@ static inline void unlock_task_sighand(struct task_struct *tsk, spin_unlock_irqrestore(&tsk->sighand->siglock, *flags); } -#ifdef CONFIG_CGROUPS -static inline void threadgroup_change_begin(struct task_struct *tsk) -{ - down_read(&tsk->signal->group_rwsem); -} -static inline void threadgroup_change_end(struct task_struct *tsk) -{ - up_read(&tsk->signal->group_rwsem); -} - /** - * threadgroup_lock - lock threadgroup - * @tsk: member task of the threadgroup to lock - * - * Lock the threadgroup @tsk belongs to. No new task is allowed to enter - * and member tasks aren't allowed to exit (as indicated by PF_EXITING) or - * change ->group_leader/pid. This is useful for cases where the threadgroup - * needs to stay stable across blockable operations. - * - * fork and exit paths explicitly call threadgroup_change_{begin|end}() for - * synchronization. While held, no new task will be added to threadgroup - * and no existing live task will have its PF_EXITING set. + * threadgroup_change_begin - mark the beginning of changes to a threadgroup + * @tsk: task causing the changes * - * de_thread() does threadgroup_change_{begin|end}() when a non-leader - * sub-thread becomes a new leader. + * All operations which modify a threadgroup - a new thread joining the + * group, death of a member thread (the assertion of PF_EXITING) and + * exec(2) dethreading the process and replacing the leader - are wrapped + * by threadgroup_change_{begin|end}(). This is to provide a place which + * subsystems needing threadgroup stability can hook into for + * synchronization. */ -static inline void threadgroup_lock(struct task_struct *tsk) +static inline void threadgroup_change_begin(struct task_struct *tsk) { - down_write(&tsk->signal->group_rwsem); + might_sleep(); + cgroup_threadgroup_change_begin(tsk); } /** - * threadgroup_unlock - unlock threadgroup - * @tsk: member task of the threadgroup to unlock + * threadgroup_change_end - mark the end of changes to a threadgroup + * @tsk: task causing the changes * - * Reverse threadgroup_lock(). + * See threadgroup_change_begin(). */ -static inline void threadgroup_unlock(struct task_struct *tsk) +static inline void threadgroup_change_end(struct task_struct *tsk) { - up_write(&tsk->signal->group_rwsem); + cgroup_threadgroup_change_end(tsk); } -#else -static inline void threadgroup_change_begin(struct task_struct *tsk) {} -static inline void threadgroup_change_end(struct task_struct *tsk) {} -static inline void threadgroup_lock(struct task_struct *tsk) {} -static inline void threadgroup_unlock(struct task_struct *tsk) {} -#endif #ifndef __HAVE_THREAD_FUNCTIONS |