diff options
author | Ingo Molnar <mingo@kernel.org> | 2015-12-14 09:31:23 +0100 |
---|---|---|
committer | Ingo Molnar <mingo@kernel.org> | 2015-12-14 09:31:23 +0100 |
commit | 057032e457f702e2f4af18cfa99c3afab6841d24 (patch) | |
tree | d46c8325756a3a2be28c286b1408e2e314e3dafb /kernel | |
parent | 54c9238cfd495d234b0e02fa7dcce5b1671a42dd (diff) | |
parent | 9f9499ae8e6415cefc4fe0a96ad0e27864353c89 (diff) | |
download | op-kernel-dev-057032e457f702e2f4af18cfa99c3afab6841d24.zip op-kernel-dev-057032e457f702e2f4af18cfa99c3afab6841d24.tar.gz |
Merge tag 'v4.4-rc5' into perf/core, to pick up fixes
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/bpf/arraymap.c | 10 | ||||
-rw-r--r-- | kernel/bpf/hashtab.c | 34 | ||||
-rw-r--r-- | kernel/bpf/inode.c | 6 | ||||
-rw-r--r-- | kernel/bpf/syscall.c | 40 | ||||
-rw-r--r-- | kernel/bpf/verifier.c | 3 | ||||
-rw-r--r-- | kernel/cgroup.c | 99 | ||||
-rw-r--r-- | kernel/cgroup_freezer.c | 23 | ||||
-rw-r--r-- | kernel/cgroup_pids.c | 77 | ||||
-rw-r--r-- | kernel/cpuset.c | 33 | ||||
-rw-r--r-- | kernel/events/core.c | 6 | ||||
-rw-r--r-- | kernel/fork.c | 9 | ||||
-rw-r--r-- | kernel/pid.c | 4 | ||||
-rw-r--r-- | kernel/sched/core.c | 48 | ||||
-rw-r--r-- | kernel/sched/cputime.c | 3 | ||||
-rw-r--r-- | kernel/sched/rt.c | 2 | ||||
-rw-r--r-- | kernel/sched/sched.h | 3 | ||||
-rw-r--r-- | kernel/sched/wait.c | 28 | ||||
-rw-r--r-- | kernel/stop_machine.c | 4 | ||||
-rw-r--r-- | kernel/trace/ring_buffer.c | 17 | ||||
-rw-r--r-- | kernel/trace/trace_events.c | 16 |
20 files changed, 285 insertions, 180 deletions
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 3f4c99e..b0799bc 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -28,11 +28,17 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) attr->value_size == 0) return ERR_PTR(-EINVAL); + if (attr->value_size >= 1 << (KMALLOC_SHIFT_MAX - 1)) + /* if value_size is bigger, the user space won't be able to + * access the elements. + */ + return ERR_PTR(-E2BIG); + elem_size = round_up(attr->value_size, 8); /* check round_up into zero and u32 overflow */ if (elem_size == 0 || - attr->max_entries > (U32_MAX - sizeof(*array)) / elem_size) + attr->max_entries > (U32_MAX - PAGE_SIZE - sizeof(*array)) / elem_size) return ERR_PTR(-ENOMEM); array_size = sizeof(*array) + attr->max_entries * elem_size; @@ -105,7 +111,7 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value, /* all elements already exist */ return -EEXIST; - memcpy(array->value + array->elem_size * index, value, array->elem_size); + memcpy(array->value + array->elem_size * index, value, map->value_size); return 0; } diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 19909b2..34777b3 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -64,12 +64,35 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) */ goto free_htab; - err = -ENOMEM; + if (htab->map.value_size >= (1 << (KMALLOC_SHIFT_MAX - 1)) - + MAX_BPF_STACK - sizeof(struct htab_elem)) + /* if value_size is bigger, the user space won't be able to + * access the elements via bpf syscall. This check also makes + * sure that the elem_size doesn't overflow and it's + * kmalloc-able later in htab_map_update_elem() + */ + goto free_htab; + + htab->elem_size = sizeof(struct htab_elem) + + round_up(htab->map.key_size, 8) + + htab->map.value_size; + /* prevent zero size kmalloc and check for u32 overflow */ if (htab->n_buckets == 0 || htab->n_buckets > U32_MAX / sizeof(struct hlist_head)) goto free_htab; + if ((u64) htab->n_buckets * sizeof(struct hlist_head) + + (u64) htab->elem_size * htab->map.max_entries >= + U32_MAX - PAGE_SIZE) + /* make sure page count doesn't overflow */ + goto free_htab; + + htab->map.pages = round_up(htab->n_buckets * sizeof(struct hlist_head) + + htab->elem_size * htab->map.max_entries, + PAGE_SIZE) >> PAGE_SHIFT; + + err = -ENOMEM; htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct hlist_head), GFP_USER | __GFP_NOWARN); @@ -85,13 +108,6 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) raw_spin_lock_init(&htab->lock); htab->count = 0; - htab->elem_size = sizeof(struct htab_elem) + - round_up(htab->map.key_size, 8) + - htab->map.value_size; - - htab->map.pages = round_up(htab->n_buckets * sizeof(struct hlist_head) + - htab->elem_size * htab->map.max_entries, - PAGE_SIZE) >> PAGE_SHIFT; return &htab->map; free_htab: @@ -222,7 +238,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, WARN_ON_ONCE(!rcu_read_lock_held()); /* allocate new element outside of lock */ - l_new = kmalloc(htab->elem_size, GFP_ATOMIC); + l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN); if (!l_new) return -ENOMEM; diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index be6d726..5a8a797 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -34,7 +34,7 @@ static void *bpf_any_get(void *raw, enum bpf_type type) atomic_inc(&((struct bpf_prog *)raw)->aux->refcnt); break; case BPF_TYPE_MAP: - atomic_inc(&((struct bpf_map *)raw)->refcnt); + bpf_map_inc(raw, true); break; default: WARN_ON_ONCE(1); @@ -51,7 +51,7 @@ static void bpf_any_put(void *raw, enum bpf_type type) bpf_prog_put(raw); break; case BPF_TYPE_MAP: - bpf_map_put(raw); + bpf_map_put_with_uref(raw); break; default: WARN_ON_ONCE(1); @@ -64,7 +64,7 @@ static void *bpf_fd_probe_obj(u32 ufd, enum bpf_type *type) void *raw; *type = BPF_TYPE_MAP; - raw = bpf_map_get(ufd); + raw = bpf_map_get_with_uref(ufd); if (IS_ERR(raw)) { *type = BPF_TYPE_PROG; raw = bpf_prog_get(ufd); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 0d3313d..3b39550 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -82,6 +82,14 @@ static void bpf_map_free_deferred(struct work_struct *work) map->ops->map_free(map); } +static void bpf_map_put_uref(struct bpf_map *map) +{ + if (atomic_dec_and_test(&map->usercnt)) { + if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY) + bpf_fd_array_map_clear(map); + } +} + /* decrement map refcnt and schedule it for freeing via workqueue * (unrelying map implementation ops->map_free() might sleep) */ @@ -93,17 +101,15 @@ void bpf_map_put(struct bpf_map *map) } } -static int bpf_map_release(struct inode *inode, struct file *filp) +void bpf_map_put_with_uref(struct bpf_map *map) { - struct bpf_map *map = filp->private_data; - - if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY) - /* prog_array stores refcnt-ed bpf_prog pointers - * release them all when user space closes prog_array_fd - */ - bpf_fd_array_map_clear(map); - + bpf_map_put_uref(map); bpf_map_put(map); +} + +static int bpf_map_release(struct inode *inode, struct file *filp) +{ + bpf_map_put_with_uref(filp->private_data); return 0; } @@ -142,6 +148,7 @@ static int map_create(union bpf_attr *attr) return PTR_ERR(map); atomic_set(&map->refcnt, 1); + atomic_set(&map->usercnt, 1); err = bpf_map_charge_memlock(map); if (err) @@ -174,7 +181,14 @@ struct bpf_map *__bpf_map_get(struct fd f) return f.file->private_data; } -struct bpf_map *bpf_map_get(u32 ufd) +void bpf_map_inc(struct bpf_map *map, bool uref) +{ + atomic_inc(&map->refcnt); + if (uref) + atomic_inc(&map->usercnt); +} + +struct bpf_map *bpf_map_get_with_uref(u32 ufd) { struct fd f = fdget(ufd); struct bpf_map *map; @@ -183,7 +197,7 @@ struct bpf_map *bpf_map_get(u32 ufd) if (IS_ERR(map)) return map; - atomic_inc(&map->refcnt); + bpf_map_inc(map, true); fdput(f); return map; @@ -226,7 +240,7 @@ static int map_lookup_elem(union bpf_attr *attr) goto free_key; err = -ENOMEM; - value = kmalloc(map->value_size, GFP_USER); + value = kmalloc(map->value_size, GFP_USER | __GFP_NOWARN); if (!value) goto free_key; @@ -285,7 +299,7 @@ static int map_update_elem(union bpf_attr *attr) goto free_key; err = -ENOMEM; - value = kmalloc(map->value_size, GFP_USER); + value = kmalloc(map->value_size, GFP_USER | __GFP_NOWARN); if (!value) goto free_key; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c607305..a7945d1 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2021,8 +2021,7 @@ static int replace_map_fd_with_map_ptr(struct verifier_env *env) * will be used by the valid program until it's unloaded * and all maps are released in free_bpf_prog_info() */ - atomic_inc(&map->refcnt); - + bpf_map_inc(map, false); fdput(f); next_insn: insn++; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index f1603c1..470f653 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -98,6 +98,12 @@ static DEFINE_SPINLOCK(css_set_lock); static DEFINE_SPINLOCK(cgroup_idr_lock); /* + * Protects cgroup_file->kn for !self csses. It synchronizes notifications + * against file removal/re-creation across css hiding. + */ +static DEFINE_SPINLOCK(cgroup_file_kn_lock); + +/* * Protects cgroup_subsys->release_agent_path. Modifying it also requires * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock. */ @@ -754,9 +760,11 @@ static void put_css_set_locked(struct css_set *cset) if (!atomic_dec_and_test(&cset->refcount)) return; - /* This css_set is dead. unlink it and release cgroup refcounts */ - for_each_subsys(ss, ssid) + /* This css_set is dead. unlink it and release cgroup and css refs */ + for_each_subsys(ss, ssid) { list_del(&cset->e_cset_node[ssid]); + css_put(cset->subsys[ssid]); + } hash_del(&cset->hlist); css_set_count--; @@ -1056,9 +1064,13 @@ static struct css_set *find_css_set(struct css_set *old_cset, key = css_set_hash(cset->subsys); hash_add(css_set_table, &cset->hlist, key); - for_each_subsys(ss, ssid) + for_each_subsys(ss, ssid) { + struct cgroup_subsys_state *css = cset->subsys[ssid]; + list_add_tail(&cset->e_cset_node[ssid], - &cset->subsys[ssid]->cgroup->e_csets[ssid]); + &css->cgroup->e_csets[ssid]); + css_get(css); + } spin_unlock_bh(&css_set_lock); @@ -1393,6 +1405,16 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) char name[CGROUP_FILE_NAME_MAX]; lockdep_assert_held(&cgroup_mutex); + + if (cft->file_offset) { + struct cgroup_subsys_state *css = cgroup_css(cgrp, cft->ss); + struct cgroup_file *cfile = (void *)css + cft->file_offset; + + spin_lock_irq(&cgroup_file_kn_lock); + cfile->kn = NULL; + spin_unlock_irq(&cgroup_file_kn_lock); + } + kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name)); } @@ -1856,7 +1878,6 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) INIT_LIST_HEAD(&cgrp->self.sibling); INIT_LIST_HEAD(&cgrp->self.children); - INIT_LIST_HEAD(&cgrp->self.files); INIT_LIST_HEAD(&cgrp->cset_links); INIT_LIST_HEAD(&cgrp->pidlists); mutex_init(&cgrp->pidlist_mutex); @@ -2216,6 +2237,9 @@ struct cgroup_taskset { struct list_head src_csets; struct list_head dst_csets; + /* the subsys currently being processed */ + int ssid; + /* * Fields for cgroup_taskset_*() iteration. * @@ -2278,25 +2302,29 @@ static void cgroup_taskset_add(struct task_struct *task, /** * cgroup_taskset_first - reset taskset and return the first task * @tset: taskset of interest + * @dst_cssp: output variable for the destination css * * @tset iteration is initialized and the first task is returned. */ -struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset) +struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset, + struct cgroup_subsys_state **dst_cssp) { tset->cur_cset = list_first_entry(tset->csets, struct css_set, mg_node); tset->cur_task = NULL; - return cgroup_taskset_next(tset); + return cgroup_taskset_next(tset, dst_cssp); } /** * cgroup_taskset_next - iterate to the next task in taskset * @tset: taskset of interest + * @dst_cssp: output variable for the destination css * * Return the next task in @tset. Iteration must have been initialized * with cgroup_taskset_first(). */ -struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset) +struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset, + struct cgroup_subsys_state **dst_cssp) { struct css_set *cset = tset->cur_cset; struct task_struct *task = tset->cur_task; @@ -2311,6 +2339,18 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset) if (&task->cg_list != &cset->mg_tasks) { tset->cur_cset = cset; tset->cur_task = task; + + /* + * This function may be called both before and + * after cgroup_taskset_migrate(). The two cases + * can be distinguished by looking at whether @cset + * has its ->mg_dst_cset set. + */ + if (cset->mg_dst_cset) + *dst_cssp = cset->mg_dst_cset->subsys[tset->ssid]; + else + *dst_cssp = cset->subsys[tset->ssid]; + return task; } @@ -2346,7 +2386,8 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset, /* check that we can legitimately attach to the cgroup */ for_each_e_css(css, i, dst_cgrp) { if (css->ss->can_attach) { - ret = css->ss->can_attach(css, tset); + tset->ssid = i; + ret = css->ss->can_attach(tset); if (ret) { failed_css = css; goto out_cancel_attach; @@ -2379,9 +2420,12 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset, */ tset->csets = &tset->dst_csets; - for_each_e_css(css, i, dst_cgrp) - if (css->ss->attach) - css->ss->attach(css, tset); + for_each_e_css(css, i, dst_cgrp) { + if (css->ss->attach) { + tset->ssid = i; + css->ss->attach(tset); + } + } ret = 0; goto out_release_tset; @@ -2390,8 +2434,10 @@ out_cancel_attach: for_each_e_css(css, i, dst_cgrp) { if (css == failed_css) break; - if (css->ss->cancel_attach) - css->ss->cancel_attach(css, tset); + if (css->ss->cancel_attach) { + tset->ssid = i; + css->ss->cancel_attach(tset); + } } out_release_tset: spin_lock_bh(&css_set_lock); @@ -3313,9 +3359,9 @@ static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp, if (cft->file_offset) { struct cgroup_file *cfile = (void *)css + cft->file_offset; - kernfs_get(kn); + spin_lock_irq(&cgroup_file_kn_lock); cfile->kn = kn; - list_add(&cfile->node, &css->files); + spin_unlock_irq(&cgroup_file_kn_lock); } return 0; @@ -3553,6 +3599,22 @@ int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) } /** + * cgroup_file_notify - generate a file modified event for a cgroup_file + * @cfile: target cgroup_file + * + * @cfile must have been obtained by setting cftype->file_offset. + */ +void cgroup_file_notify(struct cgroup_file *cfile) +{ + unsigned long flags; + + spin_lock_irqsave(&cgroup_file_kn_lock, flags); + if (cfile->kn) + kernfs_notify(cfile->kn); + spin_unlock_irqrestore(&cgroup_file_kn_lock, flags); +} + +/** * cgroup_task_count - count the number of tasks in a cgroup. * @cgrp: the cgroup in question * @@ -4613,13 +4675,9 @@ static void css_free_work_fn(struct work_struct *work) container_of(work, struct cgroup_subsys_state, destroy_work); struct cgroup_subsys *ss = css->ss; struct cgroup *cgrp = css->cgroup; - struct cgroup_file *cfile; percpu_ref_exit(&css->refcnt); - list_for_each_entry(cfile, &css->files, node) - kernfs_put(cfile->kn); - if (ss) { /* css free path */ int id = css->id; @@ -4724,7 +4782,6 @@ static void init_and_link_css(struct cgroup_subsys_state *css, css->ss = ss; INIT_LIST_HEAD(&css->sibling); INIT_LIST_HEAD(&css->children); - INIT_LIST_HEAD(&css->files); css->serial_nr = css_serial_nr_next++; if (cgroup_parent(cgrp)) { diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index f1b30ad..2d3df82 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -155,12 +155,10 @@ static void freezer_css_free(struct cgroup_subsys_state *css) * @freezer->lock. freezer_attach() makes the new tasks conform to the * current state and all following state changes can see the new tasks. */ -static void freezer_attach(struct cgroup_subsys_state *new_css, - struct cgroup_taskset *tset) +static void freezer_attach(struct cgroup_taskset *tset) { - struct freezer *freezer = css_freezer(new_css); struct task_struct *task; - bool clear_frozen = false; + struct cgroup_subsys_state *new_css; mutex_lock(&freezer_mutex); @@ -174,22 +172,21 @@ static void freezer_attach(struct cgroup_subsys_state *new_css, * current state before executing the following - !frozen tasks may * be visible in a FROZEN cgroup and frozen tasks in a THAWED one. */ - cgroup_taskset_for_each(task, tset) { + cgroup_taskset_for_each(task, new_css, tset) { + struct freezer *freezer = css_freezer(new_css); + if (!(freezer->state & CGROUP_FREEZING)) { __thaw_task(task); } else { freeze_task(task); - freezer->state &= ~CGROUP_FROZEN; - clear_frozen = true; + /* clear FROZEN and propagate upwards */ + while (freezer && (freezer->state & CGROUP_FROZEN)) { + freezer->state &= ~CGROUP_FROZEN; + freezer = parent_freezer(freezer); + } } } - /* propagate FROZEN clearing upwards */ - while (clear_frozen && (freezer = parent_freezer(freezer))) { - freezer->state &= ~CGROUP_FROZEN; - clear_frozen = freezer->state & CGROUP_FREEZING; - } - mutex_unlock(&freezer_mutex); } diff --git a/kernel/cgroup_pids.c b/kernel/cgroup_pids.c index cdd8df4..b50d5a1 100644 --- a/kernel/cgroup_pids.c +++ b/kernel/cgroup_pids.c @@ -106,7 +106,7 @@ static void pids_uncharge(struct pids_cgroup *pids, int num) { struct pids_cgroup *p; - for (p = pids; p; p = parent_pids(p)) + for (p = pids; parent_pids(p); p = parent_pids(p)) pids_cancel(p, num); } @@ -123,7 +123,7 @@ static void pids_charge(struct pids_cgroup *pids, int num) { struct pids_cgroup *p; - for (p = pids; p; p = parent_pids(p)) + for (p = pids; parent_pids(p); p = parent_pids(p)) atomic64_add(num, &p->counter); } @@ -140,7 +140,7 @@ static int pids_try_charge(struct pids_cgroup *pids, int num) { struct pids_cgroup *p, *q; - for (p = pids; p; p = parent_pids(p)) { + for (p = pids; parent_pids(p); p = parent_pids(p)) { int64_t new = atomic64_add_return(num, &p->counter); /* @@ -162,13 +162,13 @@ revert: return -EAGAIN; } -static int pids_can_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static int pids_can_attach(struct cgroup_taskset *tset) { - struct pids_cgroup *pids = css_pids(css); struct task_struct *task; + struct cgroup_subsys_state *dst_css; - cgroup_taskset_for_each(task, tset) { + cgroup_taskset_for_each(task, dst_css, tset) { + struct pids_cgroup *pids = css_pids(dst_css); struct cgroup_subsys_state *old_css; struct pids_cgroup *old_pids; @@ -187,13 +187,13 @@ static int pids_can_attach(struct cgroup_subsys_state *css, return 0; } -static void pids_cancel_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static void pids_cancel_attach(struct cgroup_taskset *tset) { - struct pids_cgroup *pids = css_pids(css); struct task_struct *task; + struct cgroup_subsys_state *dst_css; - cgroup_taskset_for_each(task, tset) { + cgroup_taskset_for_each(task, dst_css, tset) { + struct pids_cgroup *pids = css_pids(dst_css); struct cgroup_subsys_state *old_css; struct pids_cgroup *old_pids; @@ -205,65 +205,28 @@ static void pids_cancel_attach(struct cgroup_subsys_state *css, } } +/* + * task_css_check(true) in pids_can_fork() and pids_cancel_fork() relies + * on threadgroup_change_begin() held by the copy_process(). + */ static int pids_can_fork(struct task_struct *task, void **priv_p) { struct cgroup_subsys_state *css; struct pids_cgroup *pids; - int err; - /* - * Use the "current" task_css for the pids subsystem as the tentative - * css. It is possible we will charge the wrong hierarchy, in which - * case we will forcefully revert/reapply the charge on the right - * hierarchy after it is committed to the task proper. - */ - css = task_get_css(current, pids_cgrp_id); + css = task_css_check(current, pids_cgrp_id, true); pids = css_pids(css); - - err = pids_try_charge(pids, 1); - if (err) - goto err_css_put; - - *priv_p = css; - return 0; - -err_css_put: - css_put(css); - return err; + return pids_try_charge(pids, 1); } static void pids_cancel_fork(struct task_struct *task, void *priv) { - struct cgroup_subsys_state *css = priv; - struct pids_cgroup *pids = css_pids(css); - - pids_uncharge(pids, 1); - css_put(css); -} - -static void pids_fork(struct task_struct *task, void *priv) -{ struct cgroup_subsys_state *css; - struct cgroup_subsys_state *old_css = priv; struct pids_cgroup *pids; - struct pids_cgroup *old_pids = css_pids(old_css); - css = task_get_css(task, pids_cgrp_id); + css = task_css_check(current, pids_cgrp_id, true); pids = css_pids(css); - - /* - * If the association has changed, we have to revert and reapply the - * charge/uncharge on the wrong hierarchy to the current one. Since - * the association can only change due to an organisation event, its - * okay for us to ignore the limit in this case. - */ - if (pids != old_pids) { - pids_uncharge(old_pids, 1); - pids_charge(pids, 1); - } - - css_put(css); - css_put(old_css); + pids_uncharge(pids, 1); } static void pids_free(struct task_struct *task) @@ -335,6 +298,7 @@ static struct cftype pids_files[] = { { .name = "current", .read_s64 = pids_current_read, + .flags = CFTYPE_NOT_ON_ROOT, }, { } /* terminate */ }; @@ -346,7 +310,6 @@ struct cgroup_subsys pids_cgrp_subsys = { .cancel_attach = pids_cancel_attach, .can_fork = pids_can_fork, .cancel_fork = pids_cancel_fork, - .fork = pids_fork, .free = pids_free, .legacy_cftypes = pids_files, .dfl_cftypes = pids_files, diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 10ae736..02a8ea5 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1429,15 +1429,16 @@ static int fmeter_getrate(struct fmeter *fmp) static struct cpuset *cpuset_attach_old_cs; /* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */ -static int cpuset_can_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static int cpuset_can_attach(struct cgroup_taskset *tset) { - struct cpuset *cs = css_cs(css); + struct cgroup_subsys_state *css; + struct cpuset *cs; struct task_struct *task; int ret; /* used later by cpuset_attach() */ - cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset)); + cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css)); + cs = css_cs(css); mutex_lock(&cpuset_mutex); @@ -1447,7 +1448,7 @@ static int cpuset_can_attach(struct cgroup_subsys_state *css, (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))) goto out_unlock; - cgroup_taskset_for_each(task, tset) { + cgroup_taskset_for_each(task, css, tset) { ret = task_can_attach(task, cs->cpus_allowed); if (ret) goto out_unlock; @@ -1467,9 +1468,14 @@ out_unlock: return ret; } -static void cpuset_cancel_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static void cpuset_cancel_attach(struct cgroup_taskset *tset) { + struct cgroup_subsys_state *css; + struct cpuset *cs; + + cgroup_taskset_first(tset, &css); + cs = css_cs(css); + mutex_lock(&cpuset_mutex); css_cs(css)->attach_in_progress--; mutex_unlock(&cpuset_mutex); @@ -1482,16 +1488,19 @@ static void cpuset_cancel_attach(struct cgroup_subsys_state *css, */ static cpumask_var_t cpus_attach; -static void cpuset_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static void cpuset_attach(struct cgroup_taskset *tset) { /* static buf protected by cpuset_mutex */ static nodemask_t cpuset_attach_nodemask_to; struct task_struct *task; struct task_struct *leader; - struct cpuset *cs = css_cs(css); + struct cgroup_subsys_state *css; + struct cpuset *cs; struct cpuset *oldcs = cpuset_attach_old_cs; + cgroup_taskset_first(tset, &css); + cs = css_cs(css); + mutex_lock(&cpuset_mutex); /* prepare for attach */ @@ -1502,7 +1511,7 @@ static void cpuset_attach(struct cgroup_subsys_state *css, guarantee_online_mems(cs, &cpuset_attach_nodemask_to); - cgroup_taskset_for_each(task, tset) { + cgroup_taskset_for_each(task, css, tset) { /* * can_attach beforehand should guarantee that this doesn't * fail. TODO: have a better way to handle failure here @@ -1518,7 +1527,7 @@ static void cpuset_attach(struct cgroup_subsys_state *css, * sleep and should be moved outside migration path proper. */ cpuset_attach_nodemask_to = cs->effective_mems; - cgroup_taskset_for_each_leader(leader, tset) { + cgroup_taskset_for_each_leader(leader, css, tset) { struct mm_struct *mm = get_task_mm(leader); if (mm) { diff --git a/kernel/events/core.c b/kernel/events/core.c index c3d61b9..a627f36 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -9466,12 +9466,12 @@ static int __perf_cgroup_move(void *info) return 0; } -static void perf_cgroup_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static void perf_cgroup_attach(struct cgroup_taskset *tset) { struct task_struct *task; + struct cgroup_subsys_state *css; - cgroup_taskset_for_each(task, tset) + cgroup_taskset_for_each(task, css, tset) task_function_call(task, __perf_cgroup_move, task); } diff --git a/kernel/fork.c b/kernel/fork.c index f97f2c4..fce002e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1368,8 +1368,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->real_start_time = ktime_get_boot_ns(); p->io_context = NULL; p->audit_context = NULL; - if (clone_flags & CLONE_THREAD) - threadgroup_change_begin(current); + threadgroup_change_begin(current); cgroup_fork(p); #ifdef CONFIG_NUMA p->mempolicy = mpol_dup(p->mempolicy); @@ -1610,8 +1609,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, proc_fork_connector(p); cgroup_post_fork(p, cgrp_ss_priv); - if (clone_flags & CLONE_THREAD) - threadgroup_change_end(current); + threadgroup_change_end(current); perf_event_fork(p); trace_task_newtask(p, clone_flags); @@ -1652,8 +1650,7 @@ bad_fork_cleanup_policy: mpol_put(p->mempolicy); bad_fork_cleanup_threadgroup_lock: #endif - if (clone_flags & CLONE_THREAD) - threadgroup_change_end(current); + threadgroup_change_end(current); delayacct_tsk_free(p); bad_fork_cleanup_count: atomic_dec(&p->cred->user->processes); diff --git a/kernel/pid.c b/kernel/pid.c index ca36879..78b3d9f 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -467,7 +467,7 @@ struct pid *get_task_pid(struct task_struct *task, enum pid_type type) rcu_read_lock(); if (type != PIDTYPE_PID) task = task->group_leader; - pid = get_pid(task->pids[type].pid); + pid = get_pid(rcu_dereference(task->pids[type].pid)); rcu_read_unlock(); return pid; } @@ -528,7 +528,7 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, if (likely(pid_alive(task))) { if (type != PIDTYPE_PID) task = task->group_leader; - nr = pid_nr_ns(task->pids[type].pid, ns); + nr = pid_nr_ns(rcu_dereference(task->pids[type].pid), ns); } rcu_read_unlock(); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4d568ac9..732e993 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1947,13 +1947,38 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) #ifdef CONFIG_SMP /* + * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be + * possible to, falsely, observe p->on_cpu == 0. + * + * One must be running (->on_cpu == 1) in order to remove oneself + * from the runqueue. + * + * [S] ->on_cpu = 1; [L] ->on_rq + * UNLOCK rq->lock + * RMB + * LOCK rq->lock + * [S] ->on_rq = 0; [L] ->on_cpu + * + * Pairs with the full barrier implied in the UNLOCK+LOCK on rq->lock + * from the consecutive calls to schedule(); the first switching to our + * task, the second putting it to sleep. + */ + smp_rmb(); + + /* * If the owning (remote) cpu is still in the middle of schedule() with * this task as prev, wait until its done referencing the task. */ while (p->on_cpu) cpu_relax(); /* - * Pairs with the smp_wmb() in finish_lock_switch(). + * Combined with the control dependency above, we have an effective + * smp_load_acquire() without the need for full barriers. + * + * Pairs with the smp_store_release() in finish_lock_switch(). + * + * This ensures that tasks getting woken will be fully ordered against + * their previous state and preserve Program Order. */ smp_rmb(); @@ -2039,7 +2064,6 @@ out: */ int wake_up_process(struct task_struct *p) { - WARN_ON(task_is_stopped_or_traced(p)); return try_to_wake_up(p, TASK_NORMAL, 0); } EXPORT_SYMBOL(wake_up_process); @@ -5847,13 +5871,13 @@ static int init_rootdomain(struct root_domain *rd) { memset(rd, 0, sizeof(*rd)); - if (!alloc_cpumask_var(&rd->span, GFP_KERNEL)) + if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL)) goto out; - if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) + if (!zalloc_cpumask_var(&rd->online, GFP_KERNEL)) goto free_span; - if (!alloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL)) + if (!zalloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL)) goto free_online; - if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) + if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) goto free_dlo_mask; init_dl_bw(&rd->dl_bw); @@ -8217,12 +8241,12 @@ static void cpu_cgroup_fork(struct task_struct *task, void *private) sched_move_task(task); } -static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) { struct task_struct *task; + struct cgroup_subsys_state *css; - cgroup_taskset_for_each(task, tset) { + cgroup_taskset_for_each(task, css, tset) { #ifdef CONFIG_RT_GROUP_SCHED if (!sched_rt_can_attach(css_tg(css), task)) return -EINVAL; @@ -8235,12 +8259,12 @@ static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css, return 0; } -static void cpu_cgroup_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static void cpu_cgroup_attach(struct cgroup_taskset *tset) { struct task_struct *task; + struct cgroup_subsys_state *css; - cgroup_taskset_for_each(task, tset) + cgroup_taskset_for_each(task, css, tset) sched_move_task(task); } diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 26a5446..05de80b 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -788,6 +788,9 @@ cputime_t task_gtime(struct task_struct *t) unsigned int seq; cputime_t gtime; + if (!context_tracking_is_enabled()) + return t->gtime; + do { seq = read_seqbegin(&t->vtime_seqlock); diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index e3cc163..8ec86ab 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -64,7 +64,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) raw_spin_unlock(&rt_b->rt_runtime_lock); } -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) && defined(HAVE_RT_PUSH_IPI) static void push_irq_work_func(struct irq_work *work); #endif diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index efd3bfc..b242775 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1073,6 +1073,9 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) * We must ensure this doesn't happen until the switch is completely * finished. * + * In particular, the load of prev->state in finish_task_switch() must + * happen before this. + * * Pairs with the control dependency and rmb in try_to_wake_up(). */ smp_store_release(&prev->on_cpu, 0); diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 052e026..f15d6b6 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -392,7 +392,7 @@ __wait_on_bit(wait_queue_head_t *wq, struct wait_bit_queue *q, do { prepare_to_wait(wq, &q->wait, mode); if (test_bit(q->key.bit_nr, q->key.flags)) - ret = (*action)(&q->key); + ret = (*action)(&q->key, mode); } while (test_bit(q->key.bit_nr, q->key.flags) && !ret); finish_wait(wq, &q->wait); return ret; @@ -431,7 +431,7 @@ __wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, prepare_to_wait_exclusive(wq, &q->wait, mode); if (!test_bit(q->key.bit_nr, q->key.flags)) continue; - ret = action(&q->key); + ret = action(&q->key, mode); if (!ret) continue; abort_exclusive_wait(wq, &q->wait, mode, &q->key); @@ -581,44 +581,44 @@ void wake_up_atomic_t(atomic_t *p) } EXPORT_SYMBOL(wake_up_atomic_t); -__sched int bit_wait(struct wait_bit_key *word) +__sched int bit_wait(struct wait_bit_key *word, int mode) { - if (signal_pending_state(current->state, current)) - return 1; schedule(); + if (signal_pending_state(mode, current)) + return -EINTR; return 0; } EXPORT_SYMBOL(bit_wait); -__sched int bit_wait_io(struct wait_bit_key *word) +__sched int bit_wait_io(struct wait_bit_key *word, int mode) { - if (signal_pending_state(current->state, current)) - return 1; io_schedule(); + if (signal_pending_state(mode, current)) + return -EINTR; return 0; } EXPORT_SYMBOL(bit_wait_io); -__sched int bit_wait_timeout(struct wait_bit_key *word) +__sched int bit_wait_timeout(struct wait_bit_key *word, int mode) { unsigned long now = READ_ONCE(jiffies); - if (signal_pending_state(current->state, current)) - return 1; if (time_after_eq(now, word->timeout)) return -EAGAIN; schedule_timeout(word->timeout - now); + if (signal_pending_state(mode, current)) + return -EINTR; return 0; } EXPORT_SYMBOL_GPL(bit_wait_timeout); -__sched int bit_wait_io_timeout(struct wait_bit_key *word) +__sched int bit_wait_io_timeout(struct wait_bit_key *word, int mode) { unsigned long now = READ_ONCE(jiffies); - if (signal_pending_state(current->state, current)) - return 1; if (time_after_eq(now, word->timeout)) return -EAGAIN; io_schedule_timeout(word->timeout - now); + if (signal_pending_state(mode, current)) + return -EINTR; return 0; } EXPORT_SYMBOL_GPL(bit_wait_io_timeout); diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 867bc20..a3bbaee 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -531,7 +531,7 @@ static int __init cpu_stop_init(void) } early_initcall(cpu_stop_init); -#ifdef CONFIG_STOP_MACHINE +#if defined(CONFIG_SMP) || defined(CONFIG_HOTPLUG_CPU) static int __stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus) { @@ -631,4 +631,4 @@ int stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data, return ret ?: done.ret; } -#endif /* CONFIG_STOP_MACHINE */ +#endif /* CONFIG_SMP || CONFIG_HOTPLUG_CPU */ diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 75f1d05..9c6045a 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1887,12 +1887,6 @@ rb_event_index(struct ring_buffer_event *event) return (addr & ~PAGE_MASK) - BUF_PAGE_HDR_SIZE; } -static void rb_reset_reader_page(struct ring_buffer_per_cpu *cpu_buffer) -{ - cpu_buffer->read_stamp = cpu_buffer->reader_page->page->time_stamp; - cpu_buffer->reader_page->read = 0; -} - static void rb_inc_iter(struct ring_buffer_iter *iter) { struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; @@ -2803,8 +2797,11 @@ rb_reserve_next_event(struct ring_buffer *buffer, event = __rb_reserve_next(cpu_buffer, &info); - if (unlikely(PTR_ERR(event) == -EAGAIN)) + if (unlikely(PTR_ERR(event) == -EAGAIN)) { + if (info.add_timestamp) + info.length -= RB_LEN_TIME_EXTEND; goto again; + } if (!event) goto out_fail; @@ -3626,7 +3623,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) /* Finally update the reader page to the new head */ cpu_buffer->reader_page = reader; - rb_reset_reader_page(cpu_buffer); + cpu_buffer->reader_page->read = 0; if (overwrite != cpu_buffer->last_overrun) { cpu_buffer->lost_events = overwrite - cpu_buffer->last_overrun; @@ -3636,6 +3633,10 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) goto again; out: + /* Update the read_stamp on the first event */ + if (reader && reader->read == 0) + cpu_buffer->read_stamp = reader->page->time_stamp; + arch_spin_unlock(&cpu_buffer->lock); local_irq_restore(flags); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 6bbc5f6..4f6ef69 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -582,6 +582,12 @@ static void __ftrace_clear_event_pids(struct trace_array *tr) unregister_trace_sched_wakeup(event_filter_pid_sched_wakeup_probe_pre, tr); unregister_trace_sched_wakeup(event_filter_pid_sched_wakeup_probe_post, tr); + unregister_trace_sched_wakeup_new(event_filter_pid_sched_wakeup_probe_pre, tr); + unregister_trace_sched_wakeup_new(event_filter_pid_sched_wakeup_probe_post, tr); + + unregister_trace_sched_waking(event_filter_pid_sched_wakeup_probe_pre, tr); + unregister_trace_sched_waking(event_filter_pid_sched_wakeup_probe_post, tr); + list_for_each_entry(file, &tr->events, list) { clear_bit(EVENT_FILE_FL_PID_FILTER_BIT, &file->flags); } @@ -1729,6 +1735,16 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf, tr, INT_MAX); register_trace_prio_sched_wakeup(event_filter_pid_sched_wakeup_probe_post, tr, 0); + + register_trace_prio_sched_wakeup_new(event_filter_pid_sched_wakeup_probe_pre, + tr, INT_MAX); + register_trace_prio_sched_wakeup_new(event_filter_pid_sched_wakeup_probe_post, + tr, 0); + + register_trace_prio_sched_waking(event_filter_pid_sched_wakeup_probe_pre, + tr, INT_MAX); + register_trace_prio_sched_waking(event_filter_pid_sched_wakeup_probe_post, + tr, 0); } /* |