Diffstat (limited to 'kernel')
-rw-r--r--  kernel/audit.c                 |    2
-rw-r--r--  kernel/audit_tree.c            |   16
-rw-r--r--  kernel/bpf/Makefile            |    2
-rw-r--r--  kernel/bpf/arraymap.c          |  156
-rw-r--r--  kernel/bpf/hashtab.c           |  367
-rw-r--r--  kernel/bpf/helpers.c           |   89
-rw-r--r--  kernel/bpf/syscall.c           |    6
-rw-r--r--  kernel/bpf/test_stub.c         |   56
-rw-r--r--  kernel/bpf/verifier.c          |  171
-rw-r--r--  kernel/cgroup.c                |  175
-rw-r--r--  kernel/cpuset.c                |  162
-rw-r--r--  kernel/events/uprobes.c        |    8
-rw-r--r--  kernel/exit.c                  |   21
-rw-r--r--  kernel/fork.c                  |    4
-rw-r--r--  kernel/gcov/Kconfig            |    5
-rw-r--r--  kernel/groups.c                |   11
-rw-r--r--  kernel/irq_work.c              |    4
-rw-r--r--  kernel/kexec.c                 |    2
-rw-r--r--  kernel/kprobes.c               |   18
-rw-r--r--  kernel/nsproxy.c               |   10
-rw-r--r--  kernel/pid.c                   |    5
-rw-r--r--  kernel/pid_namespace.c         |   29
-rw-r--r--  kernel/printk/printk.c         |   10
-rw-r--r--  kernel/stacktrace.c            |   32
-rw-r--r--  kernel/sys_ni.c                |    5
-rw-r--r--  kernel/sysctl.c                |    7
-rw-r--r--  kernel/time/clocksource.c      |    2
-rw-r--r--  kernel/time/tick-sched.c       |    2
-rw-r--r--  kernel/time/time.c             |    1
-rw-r--r--  kernel/trace/blktrace.c        |    3
-rw-r--r--  kernel/trace/trace.c           |   27
-rw-r--r--  kernel/trace/trace.h           |   14
-rw-r--r--  kernel/trace/trace_events.c    |   42
-rw-r--r--  kernel/trace/trace_syscalls.c  |    7
-rw-r--r--  kernel/uid16.c                 |    2
-rw-r--r--  kernel/user.c                  |    6
-rw-r--r--  kernel/user_namespace.c        |  153
-rw-r--r--  kernel/utsname.c               |   31
-rw-r--r--  kernel/workqueue.c             |   30
39 files changed, 1301 insertions(+), 392 deletions(-)
diff --git a/kernel/audit.c b/kernel/audit.c
index 1f37f151..f8f203e 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -833,7 +833,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
s.backlog_limit = audit_backlog_limit;
s.lost = atomic_read(&audit_lost);
s.backlog = skb_queue_len(&audit_skb_queue);
- s.version = AUDIT_VERSION_LATEST;
+ s.feature_bitmap = AUDIT_FEATURE_BITMAP_ALL;
s.backlog_wait_time = audit_backlog_wait_time;
audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &s, sizeof(s));
break;
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 80f29e0..2e0c974 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -174,9 +174,9 @@ static void insert_hash(struct audit_chunk *chunk)
struct fsnotify_mark *entry = &chunk->mark;
struct list_head *list;
- if (!entry->i.inode)
+ if (!entry->inode)
return;
- list = chunk_hash(entry->i.inode);
+ list = chunk_hash(entry->inode);
list_add_rcu(&chunk->hash, list);
}
@@ -188,7 +188,7 @@ struct audit_chunk *audit_tree_lookup(const struct inode *inode)
list_for_each_entry_rcu(p, list, hash) {
/* mark.inode may have gone NULL, but who cares? */
- if (p->mark.i.inode == inode) {
+ if (p->mark.inode == inode) {
atomic_long_inc(&p->refs);
return p;
}
@@ -231,7 +231,7 @@ static void untag_chunk(struct node *p)
new = alloc_chunk(size);
spin_lock(&entry->lock);
- if (chunk->dead || !entry->i.inode) {
+ if (chunk->dead || !entry->inode) {
spin_unlock(&entry->lock);
if (new)
free_chunk(new);
@@ -258,7 +258,7 @@ static void untag_chunk(struct node *p)
goto Fallback;
fsnotify_duplicate_mark(&new->mark, entry);
- if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.i.inode, NULL, 1)) {
+ if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.inode, NULL, 1)) {
fsnotify_put_mark(&new->mark);
goto Fallback;
}
@@ -386,7 +386,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
chunk_entry = &chunk->mark;
spin_lock(&old_entry->lock);
- if (!old_entry->i.inode) {
+ if (!old_entry->inode) {
/* old_entry is being shot, lets just lie */
spin_unlock(&old_entry->lock);
fsnotify_put_mark(old_entry);
@@ -395,7 +395,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
}
fsnotify_duplicate_mark(chunk_entry, old_entry);
- if (fsnotify_add_mark(chunk_entry, chunk_entry->group, chunk_entry->i.inode, NULL, 1)) {
+ if (fsnotify_add_mark(chunk_entry, chunk_entry->group, chunk_entry->inode, NULL, 1)) {
spin_unlock(&old_entry->lock);
fsnotify_put_mark(chunk_entry);
fsnotify_put_mark(old_entry);
@@ -611,7 +611,7 @@ void audit_trim_trees(void)
list_for_each_entry(node, &tree->chunks, list) {
struct audit_chunk *chunk = find_chunk(node);
/* this could be NULL if the watch is dying else where... */
- struct inode *inode = chunk->mark.i.inode;
+ struct inode *inode = chunk->mark.inode;
node->index |= 1U<<31;
if (iterate_mounts(compare_root, inode, root_mnt))
node->index &= ~(1U<<31);
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 0daf7f6..a5ae60f 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -1,5 +1,5 @@
obj-y := core.o
-obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o
+obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o hashtab.o arraymap.o helpers.o
ifdef CONFIG_TEST_BPF
obj-$(CONFIG_BPF_SYSCALL) += test_stub.o
endif
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
new file mode 100644
index 0000000..9eb4d8a
--- /dev/null
+++ b/kernel/bpf/arraymap.c
@@ -0,0 +1,156 @@
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/bpf.h>
+#include <linux/err.h>
+#include <linux/vmalloc.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+
+struct bpf_array {
+ struct bpf_map map;
+ u32 elem_size;
+ char value[0] __aligned(8);
+};
+
+/* Called from syscall */
+static struct bpf_map *array_map_alloc(union bpf_attr *attr)
+{
+ struct bpf_array *array;
+ u32 elem_size, array_size;
+
+ /* check sanity of attributes */
+ if (attr->max_entries == 0 || attr->key_size != 4 ||
+ attr->value_size == 0)
+ return ERR_PTR(-EINVAL);
+
+ elem_size = round_up(attr->value_size, 8);
+
+ /* check round_up into zero and u32 overflow */
+ if (elem_size == 0 ||
+ attr->max_entries > (U32_MAX - sizeof(*array)) / elem_size)
+ return ERR_PTR(-ENOMEM);
+
+ array_size = sizeof(*array) + attr->max_entries * elem_size;
+
+ /* allocate all map elements and zero-initialize them */
+ array = kzalloc(array_size, GFP_USER | __GFP_NOWARN);
+ if (!array) {
+ array = vzalloc(array_size);
+ if (!array)
+ return ERR_PTR(-ENOMEM);
+ }
+
+ /* copy mandatory map attributes */
+ array->map.key_size = attr->key_size;
+ array->map.value_size = attr->value_size;
+ array->map.max_entries = attr->max_entries;
+
+ array->elem_size = elem_size;
+
+ return &array->map;
+}
+
+/* Called from syscall or from eBPF program */
+static void *array_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ u32 index = *(u32 *)key;
+
+ if (index >= array->map.max_entries)
+ return NULL;
+
+ return array->value + array->elem_size * index;
+}
+
+/* Called from syscall */
+static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ u32 index = *(u32 *)key;
+ u32 *next = (u32 *)next_key;
+
+ if (index >= array->map.max_entries) {
+ *next = 0;
+ return 0;
+ }
+
+ if (index == array->map.max_entries - 1)
+ return -ENOENT;
+
+ *next = index + 1;
+ return 0;
+}
+
+/* Called from syscall or from eBPF program */
+static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ u32 index = *(u32 *)key;
+
+ if (map_flags > BPF_EXIST)
+ /* unknown flags */
+ return -EINVAL;
+
+ if (index >= array->map.max_entries)
+ /* all elements were pre-allocated, cannot insert a new one */
+ return -E2BIG;
+
+ if (map_flags == BPF_NOEXIST)
+ /* all elements already exist */
+ return -EEXIST;
+
+ memcpy(array->value + array->elem_size * index, value, array->elem_size);
+ return 0;
+}
+
+/* Called from syscall or from eBPF program */
+static int array_map_delete_elem(struct bpf_map *map, void *key)
+{
+ return -EINVAL;
+}
+
+/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
+static void array_map_free(struct bpf_map *map)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+
+ /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
+ * so the programs (there may be more than one that used this map) were
+ * disconnected from events. Wait for outstanding programs to complete
+ * and free the array
+ */
+ synchronize_rcu();
+
+ kvfree(array);
+}
+
+static struct bpf_map_ops array_ops = {
+ .map_alloc = array_map_alloc,
+ .map_free = array_map_free,
+ .map_get_next_key = array_map_get_next_key,
+ .map_lookup_elem = array_map_lookup_elem,
+ .map_update_elem = array_map_update_elem,
+ .map_delete_elem = array_map_delete_elem,
+};
+
+static struct bpf_map_type_list tl = {
+ .ops = &array_ops,
+ .type = BPF_MAP_TYPE_ARRAY,
+};
+
+static int __init register_array_map(void)
+{
+ bpf_register_map_type(&tl);
+ return 0;
+}
+late_initcall(register_array_map);
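For orientation, here is a minimal userspace sketch of driving the array map above through the bpf(2) syscall: create a 256-slot map, write one slot, and read it back. The sys_bpf() wrapper, the fixed slot count, and the elided error handling are assumptions for brevity; at the time of this patch there was no libc wrapper for the syscall.

#include <linux/bpf.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <string.h>
#include <stdio.h>

static long sys_bpf(enum bpf_cmd cmd, union bpf_attr *attr, unsigned int size)
{
	return syscall(__NR_bpf, cmd, attr, size);
}

int main(void)
{
	union bpf_attr attr;
	__u32 key = 5;
	__u64 val = 42, out = 0;
	int map_fd;

	memset(&attr, 0, sizeof(attr));
	attr.map_type    = BPF_MAP_TYPE_ARRAY;
	attr.key_size    = 4;		/* array maps require a 4-byte index key */
	attr.value_size  = sizeof(val);
	attr.max_entries = 256;
	map_fd = sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
	if (map_fd < 0)
		return 1;

	memset(&attr, 0, sizeof(attr));
	attr.map_fd = map_fd;
	attr.key    = (__u64)(unsigned long)&key;
	attr.value  = (__u64)(unsigned long)&val;
	attr.flags  = BPF_ANY;	/* BPF_NOEXIST always fails: all slots pre-exist */
	sys_bpf(BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr));

	memset(&attr, 0, sizeof(attr));
	attr.map_fd = map_fd;
	attr.key    = (__u64)(unsigned long)&key;
	attr.value  = (__u64)(unsigned long)&out;
	sys_bpf(BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr));

	printf("slot %u = %llu\n", key, (unsigned long long)out);
	return 0;
}

Note that lookup and update never return -ENOENT here: every index below max_entries exists from the moment the map is created, which is also why array_map_delete_elem() above simply returns -EINVAL.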
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
new file mode 100644
index 0000000..b3ba436
--- /dev/null
+++ b/kernel/bpf/hashtab.c
@@ -0,0 +1,367 @@
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/bpf.h>
+#include <linux/jhash.h>
+#include <linux/filter.h>
+#include <linux/vmalloc.h>
+
+struct bpf_htab {
+ struct bpf_map map;
+ struct hlist_head *buckets;
+ spinlock_t lock;
+ u32 count; /* number of elements in this hashtable */
+ u32 n_buckets; /* number of hash buckets */
+ u32 elem_size; /* size of each element in bytes */
+};
+
+/* each htab element is struct htab_elem + key + value */
+struct htab_elem {
+ struct hlist_node hash_node;
+ struct rcu_head rcu;
+ u32 hash;
+ char key[0] __aligned(8);
+};
+
+/* Called from syscall */
+static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
+{
+ struct bpf_htab *htab;
+ int err, i;
+
+ htab = kzalloc(sizeof(*htab), GFP_USER);
+ if (!htab)
+ return ERR_PTR(-ENOMEM);
+
+ /* mandatory map attributes */
+ htab->map.key_size = attr->key_size;
+ htab->map.value_size = attr->value_size;
+ htab->map.max_entries = attr->max_entries;
+
+ /* check sanity of attributes.
+ * value_size == 0 may be allowed in the future to use map as a set
+ */
+ err = -EINVAL;
+ if (htab->map.max_entries == 0 || htab->map.key_size == 0 ||
+ htab->map.value_size == 0)
+ goto free_htab;
+
+ /* hash table size must be power of 2 */
+ htab->n_buckets = roundup_pow_of_two(htab->map.max_entries);
+
+ err = -E2BIG;
+ if (htab->map.key_size > MAX_BPF_STACK)
+ /* eBPF programs initialize keys on stack, so they cannot be
+ * larger than max stack size
+ */
+ goto free_htab;
+
+ err = -ENOMEM;
+ /* prevent zero size kmalloc and check for u32 overflow */
+ if (htab->n_buckets == 0 ||
+ htab->n_buckets > U32_MAX / sizeof(struct hlist_head))
+ goto free_htab;
+
+ htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct hlist_head),
+ GFP_USER | __GFP_NOWARN);
+
+ if (!htab->buckets) {
+ htab->buckets = vmalloc(htab->n_buckets * sizeof(struct hlist_head));
+ if (!htab->buckets)
+ goto free_htab;
+ }
+
+ for (i = 0; i < htab->n_buckets; i++)
+ INIT_HLIST_HEAD(&htab->buckets[i]);
+
+ spin_lock_init(&htab->lock);
+ htab->count = 0;
+
+ htab->elem_size = sizeof(struct htab_elem) +
+ round_up(htab->map.key_size, 8) +
+ htab->map.value_size;
+ return &htab->map;
+
+free_htab:
+ kfree(htab);
+ return ERR_PTR(err);
+}
+
+static inline u32 htab_map_hash(const void *key, u32 key_len)
+{
+ return jhash(key, key_len, 0);
+}
+
+static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
+{
+ return &htab->buckets[hash & (htab->n_buckets - 1)];
+}
+
+static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash,
+ void *key, u32 key_size)
+{
+ struct htab_elem *l;
+
+ hlist_for_each_entry_rcu(l, head, hash_node)
+ if (l->hash == hash && !memcmp(&l->key, key, key_size))
+ return l;
+
+ return NULL;
+}
+
+/* Called from syscall or from eBPF program */
+static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ struct hlist_head *head;
+ struct htab_elem *l;
+ u32 hash, key_size;
+
+ /* Must be called with rcu_read_lock. */
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ key_size = map->key_size;
+
+ hash = htab_map_hash(key, key_size);
+
+ head = select_bucket(htab, hash);
+
+ l = lookup_elem_raw(head, hash, key, key_size);
+
+ if (l)
+ return l->key + round_up(map->key_size, 8);
+
+ return NULL;
+}
+
+/* Called from syscall */
+static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ struct hlist_head *head;
+ struct htab_elem *l, *next_l;
+ u32 hash, key_size;
+ int i;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ key_size = map->key_size;
+
+ hash = htab_map_hash(key, key_size);
+
+ head = select_bucket(htab, hash);
+
+ /* lookup the key */
+ l = lookup_elem_raw(head, hash, key, key_size);
+
+ if (!l) {
+ i = 0;
+ goto find_first_elem;
+ }
+
+ /* key was found, get next key in the same bucket */
+ next_l = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&l->hash_node)),
+ struct htab_elem, hash_node);
+
+ if (next_l) {
+ /* if next elem in this hash list is non-NULL, just return it */
+ memcpy(next_key, next_l->key, key_size);
+ return 0;
+ }
+
+ /* no more elements in this hash list, go to the next bucket */
+ i = hash & (htab->n_buckets - 1);
+ i++;
+
+find_first_elem:
+ /* iterate over buckets */
+ for (; i < htab->n_buckets; i++) {
+ head = select_bucket(htab, i);
+
+ /* pick first element in the bucket */
+ next_l = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),
+ struct htab_elem, hash_node);
+ if (next_l) {
+ /* if it's not empty, just return it */
+ memcpy(next_key, next_l->key, key_size);
+ return 0;
+ }
+ }
+
+ /* iterated over all buckets and all elements */
+ return -ENOENT;
+}
+
+/* Called from syscall or from eBPF program */
+static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ struct htab_elem *l_new, *l_old;
+ struct hlist_head *head;
+ unsigned long flags;
+ u32 key_size;
+ int ret;
+
+ if (map_flags > BPF_EXIST)
+ /* unknown flags */
+ return -EINVAL;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ /* allocate new element outside of lock */
+ l_new = kmalloc(htab->elem_size, GFP_ATOMIC);
+ if (!l_new)
+ return -ENOMEM;
+
+ key_size = map->key_size;
+
+ memcpy(l_new->key, key, key_size);
+ memcpy(l_new->key + round_up(key_size, 8), value, map->value_size);
+
+ l_new->hash = htab_map_hash(l_new->key, key_size);
+
+ /* bpf_map_update_elem() can be called in_irq() */
+ spin_lock_irqsave(&htab->lock, flags);
+
+ head = select_bucket(htab, l_new->hash);
+
+ l_old = lookup_elem_raw(head, l_new->hash, key, key_size);
+
+ if (!l_old && unlikely(htab->count >= map->max_entries)) {
+ /* if elem with this 'key' doesn't exist and we've reached
+ * max_entries limit, fail insertion of new elem
+ */
+ ret = -E2BIG;
+ goto err;
+ }
+
+ if (l_old && map_flags == BPF_NOEXIST) {
+ /* elem already exists */
+ ret = -EEXIST;
+ goto err;
+ }
+
+ if (!l_old && map_flags == BPF_EXIST) {
+ /* elem doesn't exist, cannot update it */
+ ret = -ENOENT;
+ goto err;
+ }
+
+ /* add new element to the head of the list, so that concurrent
+ * search will find it before old elem
+ */
+ hlist_add_head_rcu(&l_new->hash_node, head);
+ if (l_old) {
+ hlist_del_rcu(&l_old->hash_node);
+ kfree_rcu(l_old, rcu);
+ } else {
+ htab->count++;
+ }
+ spin_unlock_irqrestore(&htab->lock, flags);
+
+ return 0;
+err:
+ spin_unlock_irqrestore(&htab->lock, flags);
+ kfree(l_new);
+ return ret;
+}
+
+/* Called from syscall or from eBPF program */
+static int htab_map_delete_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ struct hlist_head *head;
+ struct htab_elem *l;
+ unsigned long flags;
+ u32 hash, key_size;
+ int ret = -ENOENT;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ key_size = map->key_size;
+
+ hash = htab_map_hash(key, key_size);
+
+ spin_lock_irqsave(&htab->lock, flags);
+
+ head = select_bucket(htab, hash);
+
+ l = lookup_elem_raw(head, hash, key, key_size);
+
+ if (l) {
+ hlist_del_rcu(&l->hash_node);
+ htab->count--;
+ kfree_rcu(l, rcu);
+ ret = 0;
+ }
+
+ spin_unlock_irqrestore(&htab->lock, flags);
+ return ret;
+}
+
+static void delete_all_elements(struct bpf_htab *htab)
+{
+ int i;
+
+ for (i = 0; i < htab->n_buckets; i++) {
+ struct hlist_head *head = select_bucket(htab, i);
+ struct hlist_node *n;
+ struct htab_elem *l;
+
+ hlist_for_each_entry_safe(l, n, head, hash_node) {
+ hlist_del_rcu(&l->hash_node);
+ htab->count--;
+ kfree(l);
+ }
+ }
+}
+
+/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
+static void htab_map_free(struct bpf_map *map)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+
+ /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
+ * so the programs (there may be more than one that used this map) were
+ * disconnected from events. Wait for outstanding critical sections in
+ * these programs to complete
+ */
+ synchronize_rcu();
+
+ /* some of the kfree_rcu() callbacks for elements of this map may not have
+ * executed. It's ok. Proceed to free residual elements and map itself
+ */
+ delete_all_elements(htab);
+ kvfree(htab->buckets);
+ kfree(htab);
+}
+
+static struct bpf_map_ops htab_ops = {
+ .map_alloc = htab_map_alloc,
+ .map_free = htab_map_free,
+ .map_get_next_key = htab_map_get_next_key,
+ .map_lookup_elem = htab_map_lookup_elem,
+ .map_update_elem = htab_map_update_elem,
+ .map_delete_elem = htab_map_delete_elem,
+};
+
+static struct bpf_map_type_list tl = {
+ .ops = &htab_ops,
+ .type = BPF_MAP_TYPE_HASH,
+};
+
+static int __init register_htab_map(void)
+{
+ bpf_register_map_type(&tl);
+ return 0;
+}
+late_initcall(register_htab_map);
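The htab_map_get_next_key() contract above is what makes full-table dumps possible from userspace: a nonexistent start key yields the first element, and -ENOENT signals the end. A hedged sketch, reusing the assumed sys_bpf() wrapper from the array-map example, with a hypothetical visit() callback:

static int dump_hash_keys(int map_fd, __u32 key_size,
			  void (*visit)(const void *key))
{
	char key[64], next[64];		/* assumes key_size <= 64 */
	union bpf_attr attr;

	memset(key, 0xff, key_size);	/* improbable key: walk from the start */
	for (;;) {
		memset(&attr, 0, sizeof(attr));
		attr.map_fd   = map_fd;
		attr.key      = (__u64)(unsigned long)key;
		attr.next_key = (__u64)(unsigned long)next;
		if (sys_bpf(BPF_MAP_GET_NEXT_KEY, &attr, sizeof(attr)) < 0)
			return 0;	/* -ENOENT: every bucket visited */
		visit(next);
		memcpy(key, next, key_size);
	}
}

Because eBPF programs may insert or delete elements concurrently, such a walk is a best-effort snapshot, not an atomic dump.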
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
new file mode 100644
index 0000000..9e3414d
--- /dev/null
+++ b/kernel/bpf/helpers.c
@@ -0,0 +1,89 @@
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/bpf.h>
+#include <linux/rcupdate.h>
+
+/* If kernel subsystem is allowing eBPF programs to call this function,
+ * inside its own verifier_ops->get_func_proto() callback it should return
+ * bpf_map_lookup_elem_proto, so that verifier can properly check the arguments
+ *
+ * Different map implementations will rely on rcu in map methods
+ * lookup/update/delete, therefore eBPF programs must run under rcu lock
+ * if program is allowed to access maps, so check rcu_read_lock_held in
+ * all three functions.
+ */
+static u64 bpf_map_lookup_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+ /* verifier checked that R1 contains a valid pointer to bpf_map
+ * and R2 points to a program stack and map->key_size bytes were
+ * initialized
+ */
+ struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
+ void *key = (void *) (unsigned long) r2;
+ void *value;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ value = map->ops->map_lookup_elem(map, key);
+
+ /* lookup() returns either pointer to element value or NULL
+ * which is the meaning of PTR_TO_MAP_VALUE_OR_NULL type
+ */
+ return (unsigned long) value;
+}
+
+struct bpf_func_proto bpf_map_lookup_elem_proto = {
+ .func = bpf_map_lookup_elem,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_MAP_KEY,
+};
+
+static u64 bpf_map_update_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+ struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
+ void *key = (void *) (unsigned long) r2;
+ void *value = (void *) (unsigned long) r3;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ return map->ops->map_update_elem(map, key, value, r4);
+}
+
+struct bpf_func_proto bpf_map_update_elem_proto = {
+ .func = bpf_map_update_elem,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_MAP_KEY,
+ .arg3_type = ARG_PTR_TO_MAP_VALUE,
+ .arg4_type = ARG_ANYTHING,
+};
+
+static u64 bpf_map_delete_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+ struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
+ void *key = (void *) (unsigned long) r2;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ return map->ops->map_delete_elem(map, key);
+}
+
+struct bpf_func_proto bpf_map_delete_elem_proto = {
+ .func = bpf_map_delete_elem,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_MAP_KEY,
+};
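On the program side, these protos are what the verifier matches a call instruction against. Below is a hypothetical instruction-level fragment that would satisfy them, built with the BPF_* macros from the kernel's filter.h: place a 4-byte key on the stack (ARG_PTR_TO_MAP_KEY must point to initialized stack), load the map pointer into R1, and test the RET_PTR_TO_MAP_VALUE_OR_NULL result before dereferencing. Rewriting the map fd into a map pointer via BPF_PSEUDO_MAP_FD is assumed to be done by the program loader, and the map is assumed to have key_size == 4 and value_size >= 8.

struct bpf_insn lookup_frag[] = {
	BPF_ST_MEM(BPF_W, BPF_REG_10, -4, 1),	/* *(u32 *)(fp - 4) = 1 */
	BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),	/* r2 = fp */
	BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4),	/* r2 = fp - 4, the key pointer */
	BPF_LD_IMM64_RAW(BPF_REG_1, BPF_PSEUDO_MAP_FD, 0 /* map fd */),
	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
	BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),	/* NULL? skip the access */
	BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, 0),	/* safe after the check */
	BPF_MOV64_IMM(BPF_REG_0, 0),
	BPF_EXIT_INSN(),
};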
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index ba61c8c..088ac0b 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -169,7 +169,7 @@ static int map_lookup_elem(union bpf_attr *attr)
if (copy_from_user(key, ukey, map->key_size) != 0)
goto free_key;
- err = -ESRCH;
+ err = -ENOENT;
rcu_read_lock();
value = map->ops->map_lookup_elem(map, key);
if (!value)
@@ -190,7 +190,7 @@ err_put:
return err;
}
-#define BPF_MAP_UPDATE_ELEM_LAST_FIELD value
+#define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags
static int map_update_elem(union bpf_attr *attr)
{
@@ -231,7 +231,7 @@ static int map_update_elem(union bpf_attr *attr)
* therefore all map accessors rely on this fact, so do the same here
*/
rcu_read_lock();
- err = map->ops->map_update_elem(map, key, value);
+ err = map->ops->map_update_elem(map, key, value, attr->flags);
rcu_read_unlock();
free_value:
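The flags field threaded through here gives userspace atomic insert-versus-update semantics in a single syscall. A small assumed wrapper, in the style of the earlier sketches, that inserts a key only if it is not already present:

static int map_insert_new(int map_fd, const void *key, const void *value)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.map_fd = map_fd;
	attr.key    = (__u64)(unsigned long)key;
	attr.value  = (__u64)(unsigned long)value;
	attr.flags  = BPF_NOEXIST;	/* -EEXIST instead of silent overwrite */
	return sys_bpf(BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr));
}

BPF_EXIST is the mirror image (-ENOENT if the key is absent) and BPF_ANY keeps the old create-or-update behavior.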
diff --git a/kernel/bpf/test_stub.c b/kernel/bpf/test_stub.c
index fcaddff..0ceae1e 100644
--- a/kernel/bpf/test_stub.c
+++ b/kernel/bpf/test_stub.c
@@ -18,26 +18,18 @@ struct bpf_context {
u64 arg2;
};
-static u64 test_func(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
-{
- return 0;
-}
-
-static struct bpf_func_proto test_funcs[] = {
- [BPF_FUNC_unspec] = {
- .func = test_func,
- .gpl_only = true,
- .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
- .arg1_type = ARG_CONST_MAP_PTR,
- .arg2_type = ARG_PTR_TO_MAP_KEY,
- },
-};
-
static const struct bpf_func_proto *test_func_proto(enum bpf_func_id func_id)
{
- if (func_id < 0 || func_id >= ARRAY_SIZE(test_funcs))
+ switch (func_id) {
+ case BPF_FUNC_map_lookup_elem:
+ return &bpf_map_lookup_elem_proto;
+ case BPF_FUNC_map_update_elem:
+ return &bpf_map_update_elem_proto;
+ case BPF_FUNC_map_delete_elem:
+ return &bpf_map_delete_elem_proto;
+ default:
return NULL;
- return &test_funcs[func_id];
+ }
}
static const struct bpf_context_access {
@@ -78,38 +70,8 @@ static struct bpf_prog_type_list tl_prog = {
.type = BPF_PROG_TYPE_UNSPEC,
};
-static struct bpf_map *test_map_alloc(union bpf_attr *attr)
-{
- struct bpf_map *map;
-
- map = kzalloc(sizeof(*map), GFP_USER);
- if (!map)
- return ERR_PTR(-ENOMEM);
-
- map->key_size = attr->key_size;
- map->value_size = attr->value_size;
- map->max_entries = attr->max_entries;
- return map;
-}
-
-static void test_map_free(struct bpf_map *map)
-{
- kfree(map);
-}
-
-static struct bpf_map_ops test_map_ops = {
- .map_alloc = test_map_alloc,
- .map_free = test_map_free,
-};
-
-static struct bpf_map_type_list tl_map = {
- .ops = &test_map_ops,
- .type = BPF_MAP_TYPE_UNSPEC,
-};
-
static int __init register_test_ops(void)
{
- bpf_register_map_type(&tl_map);
bpf_register_prog_type(&tl_prog);
return 0;
}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 9f81818..a28e09c 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -153,22 +153,19 @@ struct reg_state {
enum bpf_stack_slot_type {
STACK_INVALID, /* nothing was stored in this stack slot */
- STACK_SPILL, /* 1st byte of register spilled into stack */
- STACK_SPILL_PART, /* other 7 bytes of register spill */
+ STACK_SPILL, /* register spilled into stack */
STACK_MISC /* BPF program wrote some data into this slot */
};
-struct bpf_stack_slot {
- enum bpf_stack_slot_type stype;
- struct reg_state reg_st;
-};
+#define BPF_REG_SIZE 8 /* size of eBPF register in bytes */
/* state of the program:
* type of all registers and stack info
*/
struct verifier_state {
struct reg_state regs[MAX_BPF_REG];
- struct bpf_stack_slot stack[MAX_BPF_STACK];
+ u8 stack_slot_type[MAX_BPF_STACK];
+ struct reg_state spilled_regs[MAX_BPF_STACK / BPF_REG_SIZE];
};
/* linked list of verifier states used to prune search */
@@ -259,10 +256,10 @@ static void print_verifier_state(struct verifier_env *env)
env->cur_state.regs[i].map_ptr->key_size,
env->cur_state.regs[i].map_ptr->value_size);
}
- for (i = 0; i < MAX_BPF_STACK; i++) {
- if (env->cur_state.stack[i].stype == STACK_SPILL)
+ for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
+ if (env->cur_state.stack_slot_type[i] == STACK_SPILL)
verbose(" fp%d=%s", -MAX_BPF_STACK + i,
- reg_type_str[env->cur_state.stack[i].reg_st.type]);
+ reg_type_str[env->cur_state.spilled_regs[i / BPF_REG_SIZE].type]);
}
verbose("\n");
}
@@ -539,8 +536,10 @@ static int bpf_size_to_bytes(int bpf_size)
static int check_stack_write(struct verifier_state *state, int off, int size,
int value_regno)
{
- struct bpf_stack_slot *slot;
int i;
+ /* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0,
+ * so it's aligned access and [off, off + size) are within stack limits
+ */
if (value_regno >= 0 &&
(state->regs[value_regno].type == PTR_TO_MAP_VALUE ||
@@ -548,30 +547,24 @@ static int check_stack_write(struct verifier_state *state, int off, int size,
state->regs[value_regno].type == PTR_TO_CTX)) {
/* register containing pointer is being spilled into stack */
- if (size != 8) {
+ if (size != BPF_REG_SIZE) {
verbose("invalid size of register spill\n");
return -EACCES;
}
- slot = &state->stack[MAX_BPF_STACK + off];
- slot->stype = STACK_SPILL;
/* save register state */
- slot->reg_st = state->regs[value_regno];
- for (i = 1; i < 8; i++) {
- slot = &state->stack[MAX_BPF_STACK + off + i];
- slot->stype = STACK_SPILL_PART;
- slot->reg_st.type = UNKNOWN_VALUE;
- slot->reg_st.map_ptr = NULL;
- }
- } else {
+ state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE] =
+ state->regs[value_regno];
+ for (i = 0; i < BPF_REG_SIZE; i++)
+ state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_SPILL;
+ } else {
/* regular write of data into stack */
- for (i = 0; i < size; i++) {
- slot = &state->stack[MAX_BPF_STACK + off + i];
- slot->stype = STACK_MISC;
- slot->reg_st.type = UNKNOWN_VALUE;
- slot->reg_st.map_ptr = NULL;
- }
+ state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE] =
+ (struct reg_state) {};
+
+ for (i = 0; i < size; i++)
+ state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_MISC;
}
return 0;
}
@@ -579,19 +572,18 @@ static int check_stack_write(struct verifier_state *state, int off, int size,
static int check_stack_read(struct verifier_state *state, int off, int size,
int value_regno)
{
+ u8 *slot_type;
int i;
- struct bpf_stack_slot *slot;
- slot = &state->stack[MAX_BPF_STACK + off];
+ slot_type = &state->stack_slot_type[MAX_BPF_STACK + off];
- if (slot->stype == STACK_SPILL) {
- if (size != 8) {
+ if (slot_type[0] == STACK_SPILL) {
+ if (size != BPF_REG_SIZE) {
verbose("invalid size of register spill\n");
return -EACCES;
}
- for (i = 1; i < 8; i++) {
- if (state->stack[MAX_BPF_STACK + off + i].stype !=
- STACK_SPILL_PART) {
+ for (i = 1; i < BPF_REG_SIZE; i++) {
+ if (slot_type[i] != STACK_SPILL) {
verbose("corrupted spill memory\n");
return -EACCES;
}
@@ -599,12 +591,12 @@ static int check_stack_read(struct verifier_state *state, int off, int size,
if (value_regno >= 0)
/* restore register state from stack */
- state->regs[value_regno] = slot->reg_st;
+ state->regs[value_regno] =
+ state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE];
return 0;
} else {
for (i = 0; i < size; i++) {
- if (state->stack[MAX_BPF_STACK + off + i].stype !=
- STACK_MISC) {
+ if (slot_type[i] != STACK_MISC) {
verbose("invalid read from stack off %d+%d size %d\n",
off, i, size);
return -EACCES;
@@ -747,7 +739,7 @@ static int check_stack_boundary(struct verifier_env *env,
}
for (i = 0; i < access_size; i++) {
- if (state->stack[MAX_BPF_STACK + off + i].stype != STACK_MISC) {
+ if (state->stack_slot_type[MAX_BPF_STACK + off + i] != STACK_MISC) {
verbose("invalid indirect read from stack off %d+%d size %d\n",
off, i, access_size);
return -EACCES;
@@ -1180,6 +1172,70 @@ static int check_ld_imm(struct verifier_env *env, struct bpf_insn *insn)
return 0;
}
+/* verify safety of LD_ABS|LD_IND instructions:
+ * - they can only appear in programs where ctx == skb
+ * - since they are wrappers of function calls, they scratch R1-R5 registers,
+ * preserve R6-R9, and store return value into R0
+ *
+ * Implicit input:
+ * ctx == skb == R6 == CTX
+ *
+ * Explicit input:
+ * SRC == any register
+ * IMM == 32-bit immediate
+ *
+ * Output:
+ * R0 - 8/16/32-bit skb data converted to cpu endianness
+ */
+static int check_ld_abs(struct verifier_env *env, struct bpf_insn *insn)
+{
+ struct reg_state *regs = env->cur_state.regs;
+ u8 mode = BPF_MODE(insn->code);
+ struct reg_state *reg;
+ int i, err;
+
+ if (env->prog->aux->prog_type != BPF_PROG_TYPE_SOCKET_FILTER) {
+ verbose("BPF_LD_ABS|IND instructions are only allowed in socket filters\n");
+ return -EINVAL;
+ }
+
+ if (insn->dst_reg != BPF_REG_0 || insn->off != 0 ||
+ (mode == BPF_ABS && insn->src_reg != BPF_REG_0)) {
+ verbose("BPF_LD_ABS uses reserved fields\n");
+ return -EINVAL;
+ }
+
+ /* check whether implicit source operand (register R6) is readable */
+ err = check_reg_arg(regs, BPF_REG_6, SRC_OP);
+ if (err)
+ return err;
+
+ if (regs[BPF_REG_6].type != PTR_TO_CTX) {
+ verbose("at the time of BPF_LD_ABS|IND R6 != pointer to skb\n");
+ return -EINVAL;
+ }
+
+ if (mode == BPF_IND) {
+ /* check explicit source operand */
+ err = check_reg_arg(regs, insn->src_reg, SRC_OP);
+ if (err)
+ return err;
+ }
+
+ /* reset caller saved regs to unreadable */
+ for (i = 0; i < CALLER_SAVED_REGS; i++) {
+ reg = regs + caller_saved[i];
+ reg->type = NOT_INIT;
+ reg->imm = 0;
+ }
+
+ /* mark destination R0 register as readable, since it contains
+ * the value fetched from the packet
+ */
+ regs[BPF_REG_0].type = UNKNOWN_VALUE;
+ return 0;
+}
+
/* non-recursive DFS pseudo code
* 1 procedure DFS-iterative(G,v):
* 2 label v as discovered
@@ -1417,12 +1473,33 @@ static bool states_equal(struct verifier_state *old, struct verifier_state *cur)
}
for (i = 0; i < MAX_BPF_STACK; i++) {
- if (memcmp(&old->stack[i], &cur->stack[i],
- sizeof(old->stack[0])) != 0) {
- if (old->stack[i].stype == STACK_INVALID)
- continue;
+ if (old->stack_slot_type[i] == STACK_INVALID)
+ continue;
+ if (old->stack_slot_type[i] != cur->stack_slot_type[i])
+ /* Ex: old explored (safe) state has STACK_SPILL in
+ * this stack slot, but current has STACK_MISC ->
+ * these verifier states are not equivalent,
+ * return false to continue verification of this path
+ */
return false;
- }
+ if (i % BPF_REG_SIZE)
+ continue;
+ if (memcmp(&old->spilled_regs[i / BPF_REG_SIZE],
+ &cur->spilled_regs[i / BPF_REG_SIZE],
+ sizeof(old->spilled_regs[0])))
+ /* when explored and current stack slot types are
+ * the same, check that the stored pointer types
+ * are the same as well.
+ * Ex: explored safe path could have stored
+ * (struct reg_state) {.type = PTR_TO_STACK, .imm = -8}
+ * but current path has stored:
+ * (struct reg_state) {.type = PTR_TO_STACK, .imm = -16}
+ * such verifier states are not equivalent.
+ * return false to continue verification of this path
+ */
+ return false;
+ else
+ continue;
}
return true;
}
@@ -1664,8 +1741,10 @@ process_bpf_exit:
u8 mode = BPF_MODE(insn->code);
if (mode == BPF_ABS || mode == BPF_IND) {
- verbose("LD_ABS is not supported yet\n");
- return -EINVAL;
+ err = check_ld_abs(env, insn);
+ if (err)
+ return err;
+
} else if (mode == BPF_IMM) {
err = check_ld_imm(env, insn);
if (err)
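With check_ld_abs() in place, classic packet loads become usable from eBPF socket filters. A hypothetical filter fragment that satisfies its constraints, ctx in R6 and the fetched byte in R0, keeping only TCP packets; using ETH_HLEN + 9 as the offset of the IP protocol byte assumes an untagged IPv4-over-Ethernet frame:

struct bpf_insn tcp_only[] = {
	BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),	/* LD_ABS requires ctx (skb) in R6 */
	BPF_LD_ABS(BPF_B, ETH_HLEN + 9),	/* R0 = ip->protocol */
	BPF_JMP_IMM(BPF_JNE, BPF_REG_0, IPPROTO_TCP, 2),
	BPF_MOV64_IMM(BPF_REG_0, -1),		/* TCP: accept the packet */
	BPF_EXIT_INSN(),
	BPF_MOV64_IMM(BPF_REG_0, 0),		/* anything else: drop */
	BPF_EXIT_INSN(),
};

Loading this as any type other than BPF_PROG_TYPE_SOCKET_FILTER is rejected with the "only allowed in socket filters" error above, and since LD_ABS scratches R1-R5, any state needed across the load must live in R6-R9 or on the stack.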
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 136ecead..bb263d0 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -277,6 +277,10 @@ static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
if (!(cgrp->root->subsys_mask & (1 << ss->id)))
return NULL;
+ /*
+ * This function is used while updating css associations and thus
+ * can't test the csses directly. Use ->child_subsys_mask.
+ */
while (cgroup_parent(cgrp) &&
!(cgroup_parent(cgrp)->child_subsys_mask & (1 << ss->id)))
cgrp = cgroup_parent(cgrp);
@@ -284,6 +288,39 @@ static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
return cgroup_css(cgrp, ss);
}
+/**
+ * cgroup_get_e_css - get a cgroup's effective css for the specified subsystem
+ * @cgrp: the cgroup of interest
+ * @ss: the subsystem of interest
+ *
+ * Find and get the effective css of @cgrp for @ss. The effective css is
+ * defined as the matching css of the nearest ancestor including self which
+ * has @ss enabled. If @ss is not mounted on the hierarchy @cgrp is on,
+ * the root css is returned, so this function always returns a valid css.
+ * The returned css must be put using css_put().
+ */
+struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgrp,
+ struct cgroup_subsys *ss)
+{
+ struct cgroup_subsys_state *css;
+
+ rcu_read_lock();
+
+ do {
+ css = cgroup_css(cgrp, ss);
+
+ if (css && css_tryget_online(css))
+ goto out_unlock;
+ cgrp = cgroup_parent(cgrp);
+ } while (cgrp);
+
+ css = init_css_set.subsys[ss->id];
+ css_get(css);
+out_unlock:
+ rcu_read_unlock();
+ return css;
+}
+
/* convenient tests for these bits */
static inline bool cgroup_is_dead(const struct cgroup *cgrp)
{
@@ -1019,31 +1056,30 @@ static void cgroup_put(struct cgroup *cgrp)
}
/**
- * cgroup_refresh_child_subsys_mask - update child_subsys_mask
+ * cgroup_calc_child_subsys_mask - calculate child_subsys_mask
* @cgrp: the target cgroup
+ * @subtree_control: the new subtree_control mask to consider
*
* On the default hierarchy, a subsystem may request other subsystems to be
* enabled together through its ->depends_on mask. In such cases, more
* subsystems than specified in "cgroup.subtree_control" may be enabled.
*
- * This function determines which subsystems need to be enabled given the
- * current @cgrp->subtree_control and records it in
- * @cgrp->child_subsys_mask. The resulting mask is always a superset of
- * @cgrp->subtree_control and follows the usual hierarchy rules.
+ * This function calculates which subsystems need to be enabled if
+ * @subtree_control is to be applied to @cgrp. The returned mask is always
+ * a superset of @subtree_control and follows the usual hierarchy rules.
*/
-static void cgroup_refresh_child_subsys_mask(struct cgroup *cgrp)
+static unsigned int cgroup_calc_child_subsys_mask(struct cgroup *cgrp,
+ unsigned int subtree_control)
{
struct cgroup *parent = cgroup_parent(cgrp);
- unsigned int cur_ss_mask = cgrp->subtree_control;
+ unsigned int cur_ss_mask = subtree_control;
struct cgroup_subsys *ss;
int ssid;
lockdep_assert_held(&cgroup_mutex);
- if (!cgroup_on_dfl(cgrp)) {
- cgrp->child_subsys_mask = cur_ss_mask;
- return;
- }
+ if (!cgroup_on_dfl(cgrp))
+ return cur_ss_mask;
while (true) {
unsigned int new_ss_mask = cur_ss_mask;
@@ -1067,7 +1103,20 @@ static void cgroup_refresh_child_subsys_mask(struct cgroup *cgrp)
cur_ss_mask = new_ss_mask;
}
- cgrp->child_subsys_mask = cur_ss_mask;
+ return cur_ss_mask;
+}
+
+/**
+ * cgroup_refresh_child_subsys_mask - update child_subsys_mask
+ * @cgrp: the target cgroup
+ *
+ * Update @cgrp->child_subsys_mask according to the current
+ * @cgrp->subtree_control using cgroup_calc_child_subsys_mask().
+ */
+static void cgroup_refresh_child_subsys_mask(struct cgroup *cgrp)
+{
+ cgrp->child_subsys_mask =
+ cgroup_calc_child_subsys_mask(cgrp, cgrp->subtree_control);
}
/**
@@ -2641,7 +2690,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
loff_t off)
{
unsigned int enable = 0, disable = 0;
- unsigned int css_enable, css_disable, old_ctrl, new_ctrl;
+ unsigned int css_enable, css_disable, old_sc, new_sc, old_ss, new_ss;
struct cgroup *cgrp, *child;
struct cgroup_subsys *ss;
char *tok;
@@ -2693,36 +2742,6 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
ret = -ENOENT;
goto out_unlock;
}
-
- /*
- * @ss is already enabled through dependency and
- * we'll just make it visible. Skip draining.
- */
- if (cgrp->child_subsys_mask & (1 << ssid))
- continue;
-
- /*
- * Because css offlining is asynchronous, userland
- * might try to re-enable the same controller while
- * the previous instance is still around. In such
- * cases, wait till it's gone using offline_waitq.
- */
- cgroup_for_each_live_child(child, cgrp) {
- DEFINE_WAIT(wait);
-
- if (!cgroup_css(child, ss))
- continue;
-
- cgroup_get(child);
- prepare_to_wait(&child->offline_waitq, &wait,
- TASK_UNINTERRUPTIBLE);
- cgroup_kn_unlock(of->kn);
- schedule();
- finish_wait(&child->offline_waitq, &wait);
- cgroup_put(child);
-
- return restart_syscall();
- }
} else if (disable & (1 << ssid)) {
if (!(cgrp->subtree_control & (1 << ssid))) {
disable &= ~(1 << ssid);
@@ -2758,19 +2777,48 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
* subsystems than specified may need to be enabled or disabled
* depending on subsystem dependencies.
*/
- cgrp->subtree_control |= enable;
- cgrp->subtree_control &= ~disable;
+ old_sc = cgrp->subtree_control;
+ old_ss = cgrp->child_subsys_mask;
+ new_sc = (old_sc | enable) & ~disable;
+ new_ss = cgroup_calc_child_subsys_mask(cgrp, new_sc);
- old_ctrl = cgrp->child_subsys_mask;
- cgroup_refresh_child_subsys_mask(cgrp);
- new_ctrl = cgrp->child_subsys_mask;
-
- css_enable = ~old_ctrl & new_ctrl;
- css_disable = old_ctrl & ~new_ctrl;
+ css_enable = ~old_ss & new_ss;
+ css_disable = old_ss & ~new_ss;
enable |= css_enable;
disable |= css_disable;
/*
+ * Because css offlining is asynchronous, userland might try to
+ * re-enable the same controller while the previous instance is
+ * still around. In such cases, wait till it's gone using
+ * offline_waitq.
+ */
+ for_each_subsys(ss, ssid) {
+ if (!(css_enable & (1 << ssid)))
+ continue;
+
+ cgroup_for_each_live_child(child, cgrp) {
+ DEFINE_WAIT(wait);
+
+ if (!cgroup_css(child, ss))
+ continue;
+
+ cgroup_get(child);
+ prepare_to_wait(&child->offline_waitq, &wait,
+ TASK_UNINTERRUPTIBLE);
+ cgroup_kn_unlock(of->kn);
+ schedule();
+ finish_wait(&child->offline_waitq, &wait);
+ cgroup_put(child);
+
+ return restart_syscall();
+ }
+ }
+
+ cgrp->subtree_control = new_sc;
+ cgrp->child_subsys_mask = new_ss;
+
+ /*
* Create new csses or make the existing ones visible. A css is
* created invisible if it's being implicitly enabled through
* dependency. An invisible css is made visible when the userland
@@ -2825,6 +2873,24 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
}
}
+ /*
+ * The effective csses of all the descendants (excluding @cgrp) may
+ * have changed. Subsystems can optionally subscribe to this event
+ * by implementing ->css_e_css_changed() which is invoked if any of
+ * the effective csses seen from the css's cgroup may have changed.
+ */
+ for_each_subsys(ss, ssid) {
+ struct cgroup_subsys_state *this_css = cgroup_css(cgrp, ss);
+ struct cgroup_subsys_state *css;
+
+ if (!ss->css_e_css_changed || !this_css)
+ continue;
+
+ css_for_each_descendant_pre(css, this_css)
+ if (css != this_css)
+ ss->css_e_css_changed(css);
+ }
+
kernfs_activate(cgrp->kn);
ret = 0;
out_unlock:
@@ -2832,9 +2898,8 @@ out_unlock:
return ret ?: nbytes;
err_undo_css:
- cgrp->subtree_control &= ~enable;
- cgrp->subtree_control |= disable;
- cgroup_refresh_child_subsys_mask(cgrp);
+ cgrp->subtree_control = old_sc;
+ cgrp->child_subsys_mask = old_ss;
for_each_subsys(ss, ssid) {
if (!(enable & (1 << ssid)))
@@ -4370,6 +4435,8 @@ static void css_release_work_fn(struct work_struct *work)
if (ss) {
/* css release path */
cgroup_idr_remove(&ss->css_idr, css->id);
+ if (ss->css_released)
+ ss->css_released(css);
} else {
/* cgroup release path */
cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
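A hypothetical sketch of how a controller might consume the new cgroup_get_e_css() interface; per its comment the function always returns a valid, referenced css, so the caller unconditionally pairs it with css_put(). my_cgrp_subsys and my_css_weight() are placeholders, not real kernel symbols:

static u64 my_effective_weight(struct cgroup *cgrp)
{
	struct cgroup_subsys_state *css;
	u64 weight;

	/* resolves the nearest ancestor (including cgrp itself) with the
	 * subsystem enabled, falling back to the root css, and takes a ref
	 */
	css = cgroup_get_e_css(cgrp, &my_cgrp_subsys);
	weight = my_css_weight(css);
	css_put(css);
	return weight;
}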
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 723cfc9..64b257f 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -248,34 +248,34 @@ static struct cpuset top_cpuset = {
if (is_cpuset_online(((des_cs) = css_cs((pos_css)))))
/*
- * There are two global mutexes guarding cpuset structures - cpuset_mutex
- * and callback_mutex. The latter may nest inside the former. We also
- * require taking task_lock() when dereferencing a task's cpuset pointer.
- * See "The task_lock() exception", at the end of this comment.
+ * There are two global locks guarding cpuset structures - cpuset_mutex and
+ * callback_lock. We also require taking task_lock() when dereferencing a
+ * task's cpuset pointer. See "The task_lock() exception", at the end of this
+ * comment.
*
- * A task must hold both mutexes to modify cpusets. If a task holds
+ * A task must hold both locks to modify cpusets. If a task holds
* cpuset_mutex, then it blocks others wanting that mutex, ensuring that it
- * is the only task able to also acquire callback_mutex and be able to
+ * is the only task able to also acquire callback_lock and be able to
* modify cpusets. It can perform various checks on the cpuset structure
* first, knowing nothing will change. It can also allocate memory while
* just holding cpuset_mutex. While it is performing these checks, various
- * callback routines can briefly acquire callback_mutex to query cpusets.
- * Once it is ready to make the changes, it takes callback_mutex, blocking
+ * callback routines can briefly acquire callback_lock to query cpusets.
+ * Once it is ready to make the changes, it takes callback_lock, blocking
* everyone else.
*
* Calls to the kernel memory allocator can not be made while holding
- * callback_mutex, as that would risk double tripping on callback_mutex
+ * callback_lock, as that would risk double tripping on callback_lock
* from one of the callbacks into the cpuset code from within
* __alloc_pages().
*
- * If a task is only holding callback_mutex, then it has read-only
+ * If a task is only holding callback_lock, then it has read-only
* access to cpusets.
*
* Now, the task_struct fields mems_allowed and mempolicy may be changed
* by other task, we use alloc_lock in the task_struct fields to protect
* them.
*
- * The cpuset_common_file_read() handlers only hold callback_mutex across
+ * The cpuset_common_file_read() handlers only hold callback_lock across
* small pieces of code, such as when reading out possibly multi-word
* cpumasks and nodemasks.
*
@@ -284,7 +284,7 @@ static struct cpuset top_cpuset = {
*/
static DEFINE_MUTEX(cpuset_mutex);
-static DEFINE_MUTEX(callback_mutex);
+static DEFINE_SPINLOCK(callback_lock);
/*
* CPU / memory hotplug is handled asynchronously.
@@ -329,7 +329,7 @@ static struct file_system_type cpuset_fs_type = {
* One way or another, we guarantee to return some non-empty subset
* of cpu_online_mask.
*
- * Call with callback_mutex held.
+ * Call with callback_lock or cpuset_mutex held.
*/
static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
{
@@ -347,7 +347,7 @@ static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
* One way or another, we guarantee to return some non-empty subset
* of node_states[N_MEMORY].
*
- * Call with callback_mutex held.
+ * Call with callback_lock or cpuset_mutex held.
*/
static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
{
@@ -359,7 +359,7 @@ static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
/*
* update task's spread flag if cpuset's page/slab spread flag is set
*
- * Called with callback_mutex/cpuset_mutex held
+ * Call with callback_lock or cpuset_mutex held.
*/
static void cpuset_update_task_spread_flag(struct cpuset *cs,
struct task_struct *tsk)
@@ -886,9 +886,9 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
continue;
rcu_read_unlock();
- mutex_lock(&callback_mutex);
+ spin_lock_irq(&callback_lock);
cpumask_copy(cp->effective_cpus, new_cpus);
- mutex_unlock(&callback_mutex);
+ spin_unlock_irq(&callback_lock);
WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
!cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
@@ -953,9 +953,9 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
if (retval < 0)
return retval;
- mutex_lock(&callback_mutex);
+ spin_lock_irq(&callback_lock);
cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
- mutex_unlock(&callback_mutex);
+ spin_unlock_irq(&callback_lock);
/* use trialcs->cpus_allowed as a temp variable */
update_cpumasks_hier(cs, trialcs->cpus_allowed);
@@ -1142,9 +1142,9 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
continue;
rcu_read_unlock();
- mutex_lock(&callback_mutex);
+ spin_lock_irq(&callback_lock);
cp->effective_mems = *new_mems;
- mutex_unlock(&callback_mutex);
+ spin_unlock_irq(&callback_lock);
WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
!nodes_equal(cp->mems_allowed, cp->effective_mems));
@@ -1165,7 +1165,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
* mempolicies and if the cpuset is marked 'memory_migrate',
* migrate the tasks pages to the new memory.
*
- * Call with cpuset_mutex held. May take callback_mutex during call.
+ * Call with cpuset_mutex held. May take callback_lock during call.
* Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
* lock each such tasks mm->mmap_sem, scan its vma's and rebind
* their mempolicies to the cpusets new mems_allowed.
@@ -1212,9 +1212,9 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
if (retval < 0)
goto done;
- mutex_lock(&callback_mutex);
+ spin_lock_irq(&callback_lock);
cs->mems_allowed = trialcs->mems_allowed;
- mutex_unlock(&callback_mutex);
+ spin_unlock_irq(&callback_lock);
/* use trialcs->mems_allowed as a temp variable */
update_nodemasks_hier(cs, &cs->mems_allowed);
@@ -1305,9 +1305,9 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
|| (is_spread_page(cs) != is_spread_page(trialcs)));
- mutex_lock(&callback_mutex);
+ spin_lock_irq(&callback_lock);
cs->flags = trialcs->flags;
- mutex_unlock(&callback_mutex);
+ spin_unlock_irq(&callback_lock);
if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
rebuild_sched_domains_locked();
@@ -1714,7 +1714,7 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
count = seq_get_buf(sf, &buf);
s = buf;
- mutex_lock(&callback_mutex);
+ spin_lock_irq(&callback_lock);
switch (type) {
case FILE_CPULIST:
@@ -1741,7 +1741,7 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
seq_commit(sf, -1);
}
out_unlock:
- mutex_unlock(&callback_mutex);
+ spin_unlock_irq(&callback_lock);
return ret;
}
@@ -1958,12 +1958,12 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
cpuset_inc();
- mutex_lock(&callback_mutex);
+ spin_lock_irq(&callback_lock);
if (cgroup_on_dfl(cs->css.cgroup)) {
cpumask_copy(cs->effective_cpus, parent->effective_cpus);
cs->effective_mems = parent->effective_mems;
}
- mutex_unlock(&callback_mutex);
+ spin_unlock_irq(&callback_lock);
if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
goto out_unlock;
@@ -1990,10 +1990,10 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
}
rcu_read_unlock();
- mutex_lock(&callback_mutex);
+ spin_lock_irq(&callback_lock);
cs->mems_allowed = parent->mems_allowed;
cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
- mutex_unlock(&callback_mutex);
+ spin_unlock_irq(&callback_lock);
out_unlock:
mutex_unlock(&cpuset_mutex);
return 0;
@@ -2032,7 +2032,7 @@ static void cpuset_css_free(struct cgroup_subsys_state *css)
static void cpuset_bind(struct cgroup_subsys_state *root_css)
{
mutex_lock(&cpuset_mutex);
- mutex_lock(&callback_mutex);
+ spin_lock_irq(&callback_lock);
if (cgroup_on_dfl(root_css->cgroup)) {
cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
@@ -2043,7 +2043,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css)
top_cpuset.mems_allowed = top_cpuset.effective_mems;
}
- mutex_unlock(&callback_mutex);
+ spin_unlock_irq(&callback_lock);
mutex_unlock(&cpuset_mutex);
}
@@ -2128,12 +2128,12 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
{
bool is_empty;
- mutex_lock(&callback_mutex);
+ spin_lock_irq(&callback_lock);
cpumask_copy(cs->cpus_allowed, new_cpus);
cpumask_copy(cs->effective_cpus, new_cpus);
cs->mems_allowed = *new_mems;
cs->effective_mems = *new_mems;
- mutex_unlock(&callback_mutex);
+ spin_unlock_irq(&callback_lock);
/*
* Don't call update_tasks_cpumask() if the cpuset becomes empty,
@@ -2170,10 +2170,10 @@ hotplug_update_tasks(struct cpuset *cs,
if (nodes_empty(*new_mems))
*new_mems = parent_cs(cs)->effective_mems;
- mutex_lock(&callback_mutex);
+ spin_lock_irq(&callback_lock);
cpumask_copy(cs->effective_cpus, new_cpus);
cs->effective_mems = *new_mems;
- mutex_unlock(&callback_mutex);
+ spin_unlock_irq(&callback_lock);
if (cpus_updated)
update_tasks_cpumask(cs);
@@ -2259,21 +2259,21 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
/* synchronize cpus_allowed to cpu_active_mask */
if (cpus_updated) {
- mutex_lock(&callback_mutex);
+ spin_lock_irq(&callback_lock);
if (!on_dfl)
cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
- mutex_unlock(&callback_mutex);
+ spin_unlock_irq(&callback_lock);
/* we don't mess with cpumasks of tasks in top_cpuset */
}
/* synchronize mems_allowed to N_MEMORY */
if (mems_updated) {
- mutex_lock(&callback_mutex);
+ spin_lock_irq(&callback_lock);
if (!on_dfl)
top_cpuset.mems_allowed = new_mems;
top_cpuset.effective_mems = new_mems;
- mutex_unlock(&callback_mutex);
+ spin_unlock_irq(&callback_lock);
update_tasks_nodemask(&top_cpuset);
}
@@ -2366,11 +2366,13 @@ void __init cpuset_init_smp(void)
void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
{
- mutex_lock(&callback_mutex);
+ unsigned long flags;
+
+ spin_lock_irqsave(&callback_lock, flags);
rcu_read_lock();
guarantee_online_cpus(task_cs(tsk), pmask);
rcu_read_unlock();
- mutex_unlock(&callback_mutex);
+ spin_unlock_irqrestore(&callback_lock, flags);
}
void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
@@ -2416,12 +2418,13 @@ void cpuset_init_current_mems_allowed(void)
nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
{
nodemask_t mask;
+ unsigned long flags;
- mutex_lock(&callback_mutex);
+ spin_lock_irqsave(&callback_lock, flags);
rcu_read_lock();
guarantee_online_mems(task_cs(tsk), &mask);
rcu_read_unlock();
- mutex_unlock(&callback_mutex);
+ spin_unlock_irqrestore(&callback_lock, flags);
return mask;
}
@@ -2440,7 +2443,7 @@ int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
/*
* nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or
* mem_hardwall ancestor to the specified cpuset. Call holding
- * callback_mutex. If no ancestor is mem_exclusive or mem_hardwall
+ * callback_lock. If no ancestor is mem_exclusive or mem_hardwall
* (an unusual configuration), then returns the root cpuset.
*/
static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
@@ -2451,7 +2454,7 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
}
/**
- * cpuset_node_allowed_softwall - Can we allocate on a memory node?
+ * cpuset_node_allowed - Can we allocate on a memory node?
* @node: is this an allowed node?
* @gfp_mask: memory allocation flags
*
@@ -2463,13 +2466,6 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
* flag, yes.
* Otherwise, no.
*
- * If __GFP_HARDWALL is set, cpuset_node_allowed_softwall() reduces to
- * cpuset_node_allowed_hardwall(). Otherwise, cpuset_node_allowed_softwall()
- * might sleep, and might allow a node from an enclosing cpuset.
- *
- * cpuset_node_allowed_hardwall() only handles the simpler case of hardwall
- * cpusets, and never sleeps.
- *
* The __GFP_THISNODE placement logic is really handled elsewhere,
* by forcibly using a zonelist starting at a specified node, and by
* (in get_page_from_freelist()) refusing to consider the zones for
@@ -2482,13 +2478,12 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
* GFP_KERNEL allocations are not so marked, so can escape to the
* nearest enclosing hardwalled ancestor cpuset.
*
- * Scanning up parent cpusets requires callback_mutex. The
+ * Scanning up parent cpusets requires callback_lock. The
* __alloc_pages() routine only calls here with __GFP_HARDWALL bit
* _not_ set if it's a GFP_KERNEL allocation, and all nodes in the
* current tasks mems_allowed came up empty on the first pass over
* the zonelist. So only GFP_KERNEL allocations, if all nodes in the
- * cpuset are short of memory, might require taking the callback_mutex
- * mutex.
+ * cpuset are short of memory, might require taking the callback_lock.
*
* The first call here from mm/page_alloc:get_page_from_freelist()
* has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets,
@@ -2505,20 +2500,15 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
* TIF_MEMDIE - any node ok
* GFP_KERNEL - any node in enclosing hardwalled cpuset ok
* GFP_USER - only nodes in current tasks mems allowed ok.
- *
- * Rule:
- * Don't call cpuset_node_allowed_softwall if you can't sleep, unless you
- * pass in the __GFP_HARDWALL flag set in gfp_flag, which disables
- * the code that might scan up ancestor cpusets and sleep.
*/
-int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
+int __cpuset_node_allowed(int node, gfp_t gfp_mask)
{
struct cpuset *cs; /* current cpuset ancestors */
int allowed; /* is allocation in zone z allowed? */
+ unsigned long flags;
if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
return 1;
- might_sleep_if(!(gfp_mask & __GFP_HARDWALL));
if (node_isset(node, current->mems_allowed))
return 1;
/*
@@ -2534,55 +2524,17 @@ int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
return 1;
/* Not hardwall and node outside mems_allowed: scan up cpusets */
- mutex_lock(&callback_mutex);
+ spin_lock_irqsave(&callback_lock, flags);
rcu_read_lock();
cs = nearest_hardwall_ancestor(task_cs(current));
allowed = node_isset(node, cs->mems_allowed);
rcu_read_unlock();
- mutex_unlock(&callback_mutex);
+ spin_unlock_irqrestore(&callback_lock, flags);
return allowed;
}
-/*
- * cpuset_node_allowed_hardwall - Can we allocate on a memory node?
- * @node: is this an allowed node?
- * @gfp_mask: memory allocation flags
- *
- * If we're in interrupt, yes, we can always allocate. If __GFP_THISNODE is
- * set, yes, we can always allocate. If node is in our task's mems_allowed,
- * yes. If the task has been OOM killed and has access to memory reserves as
- * specified by the TIF_MEMDIE flag, yes.
- * Otherwise, no.
- *
- * The __GFP_THISNODE placement logic is really handled elsewhere,
- * by forcibly using a zonelist starting at a specified node, and by
- * (in get_page_from_freelist()) refusing to consider the zones for
- * any node on the zonelist except the first. By the time any such
- * calls get to this routine, we should just shut up and say 'yes'.
- *
- * Unlike the cpuset_node_allowed_softwall() variant, above,
- * this variant requires that the node be in the current task's
- * mems_allowed or that we're in interrupt. It does not scan up the
- * cpuset hierarchy for the nearest enclosing mem_exclusive cpuset.
- * It never sleeps.
- */
-int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
-{
- if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
- return 1;
- if (node_isset(node, current->mems_allowed))
- return 1;
- /*
- * Allow tasks that have access to memory reserves because they have
- * been OOM killed to get memory anywhere.
- */
- if (unlikely(test_thread_flag(TIF_MEMDIE)))
- return 1;
- return 0;
-}
-
/**
* cpuset_mem_spread_node() - On which node to begin search for a file page
* cpuset_slab_spread_node() - On which node to begin search for a slab page
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index ed8f2cd..cb346f2 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -193,7 +193,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
}
flush_cache_page(vma, addr, pte_pfn(*ptep));
- ptep_clear_flush(vma, addr, ptep);
+ ptep_clear_flush_notify(vma, addr, ptep);
set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));
page_remove_rmap(page);
@@ -724,14 +724,14 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
int more = 0;
again:
- mutex_lock(&mapping->i_mmap_mutex);
+ i_mmap_lock_read(mapping);
vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
if (!valid_vma(vma, is_register))
continue;
if (!prev && !more) {
/*
- * Needs GFP_NOWAIT to avoid i_mmap_mutex recursion through
+ * Needs GFP_NOWAIT to avoid i_mmap_rwsem recursion through
* reclaim. This is optimistic, no harm done if it fails.
*/
prev = kmalloc(sizeof(struct map_info),
@@ -755,7 +755,7 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
info->mm = vma->vm_mm;
info->vaddr = offset_to_vaddr(vma, offset);
}
- mutex_unlock(&mapping->i_mmap_mutex);
+ i_mmap_unlock_read(mapping);
if (!more)
goto out;
diff --git a/kernel/exit.c b/kernel/exit.c
index 8714e5d..1ea4369 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -212,27 +212,6 @@ repeat:
}
/*
- * This checks not only the pgrp, but falls back on the pid if no
- * satisfactory pgrp is found. I dunno - gdb doesn't work correctly
- * without this...
- *
- * The caller must hold rcu lock or the tasklist lock.
- */
-struct pid *session_of_pgrp(struct pid *pgrp)
-{
- struct task_struct *p;
- struct pid *sid = NULL;
-
- p = pid_task(pgrp, PIDTYPE_PGID);
- if (p == NULL)
- p = pid_task(pgrp, PIDTYPE_PID);
- if (p != NULL)
- sid = task_session(p);
-
- return sid;
-}
-
-/*
* Determine if a process group is "orphaned", according to the POSIX
* definition in 2.2.2.52. Orphaned process groups are not to be affected
* by terminal-generated stop signals. Newly orphaned process groups are
diff --git a/kernel/fork.c b/kernel/fork.c
index 9ca8418..4dc2dda 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -433,7 +433,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
get_file(file);
if (tmp->vm_flags & VM_DENYWRITE)
atomic_dec(&inode->i_writecount);
- mutex_lock(&mapping->i_mmap_mutex);
+ i_mmap_lock_write(mapping);
if (tmp->vm_flags & VM_SHARED)
atomic_inc(&mapping->i_mmap_writable);
flush_dcache_mmap_lock(mapping);
@@ -445,7 +445,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
vma_interval_tree_insert_after(tmp, mpnt,
&mapping->i_mmap);
flush_dcache_mmap_unlock(mapping);
- mutex_unlock(&mapping->i_mmap_mutex);
+ i_mmap_unlock_write(mapping);
}
/*
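Both hunks above belong to the i_mmap_mutex to i_mmap_rwsem conversion: uprobes' build_map_info() only walks the interval tree, so it can take the lock for read, while dup_mmap() inserts into the tree and must take it for write. A sketch of the read/write split; the locking calls are the real i_mmap API, the surrounding helpers are hypothetical:

static void walk_file_vmas(struct address_space *mapping, pgoff_t pgoff)
{
	struct vm_area_struct *vma;

	i_mmap_lock_read(mapping);		/* concurrent readers allowed */
	vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
		/* read-only inspection of vma */
	}
	i_mmap_unlock_read(mapping);
}

static void link_vma(struct address_space *mapping, struct vm_area_struct *vma)
{
	i_mmap_lock_write(mapping);		/* excludes readers and writers */
	vma_interval_tree_insert(vma, &mapping->i_mmap);
	i_mmap_unlock_write(mapping);
}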
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
index 3b74087..c92e448 100644
--- a/kernel/gcov/Kconfig
+++ b/kernel/gcov/Kconfig
@@ -32,10 +32,13 @@ config GCOV_KERNEL
Note that the debugfs filesystem has to be mounted to access
profiling data.
+config ARCH_HAS_GCOV_PROFILE_ALL
+ def_bool n
+
config GCOV_PROFILE_ALL
bool "Profile entire Kernel"
depends on GCOV_KERNEL
- depends on SUPERH || S390 || X86 || PPC || MICROBLAZE || ARM || ARM64
+ depends on ARCH_HAS_GCOV_PROFILE_ALL
default n
---help---
This options activates profiling for the entire kernel.
diff --git a/kernel/groups.c b/kernel/groups.c
index 451698f..664411f 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -6,6 +6,7 @@
#include <linux/slab.h>
#include <linux/security.h>
#include <linux/syscalls.h>
+#include <linux/user_namespace.h>
#include <asm/uaccess.h>
/* init to 2 - one for init_task, one to ensure it is never freed */
@@ -213,6 +214,14 @@ out:
return i;
}
+bool may_setgroups(void)
+{
+ struct user_namespace *user_ns = current_user_ns();
+
+ return ns_capable(user_ns, CAP_SETGID) &&
+ userns_may_setgroups(user_ns);
+}
+
/*
* SMP: Our groups are copy-on-write. We can set them safely
* without another task interfering.
@@ -223,7 +232,7 @@ SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist)
struct group_info *group_info;
int retval;
- if (!ns_capable(current_user_ns(), CAP_SETGID))
+ if (!may_setgroups())
return -EPERM;
if ((unsigned)gidsetsize > NGROUPS_MAX)
return -EINVAL;
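The user-visible effect: in a user namespace where setgroups has been disabled (via the /proc/<pid>/setgroups file added in kernel/user_namespace.c below), setgroups(2) now fails with EPERM even when the caller holds CAP_SETGID. A userspace sketch of the expected failure mode:

#include <grp.h>
#include <stdio.h>

/* Illustrative: after "deny" has been written to /proc/self/setgroups
 * in this namespace, this call is expected to fail with EPERM.
 */
static int try_setgroups(void)
{
	gid_t groups[1] = { 0 };

	if (setgroups(1, groups) == -1) {
		perror("setgroups");
		return -1;
	}
	return 0;
}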
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index 3ab9048..cbf9fb8 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -175,11 +175,11 @@ EXPORT_SYMBOL_GPL(irq_work_run);
void irq_work_tick(void)
{
- struct llist_head *raised = &__get_cpu_var(raised_list);
+ struct llist_head *raised = this_cpu_ptr(&raised_list);
if (!llist_empty(raised) && !arch_irq_work_has_interrupt())
irq_work_run_list(raised);
- irq_work_run_list(&__get_cpu_var(lazy_list));
+ irq_work_run_list(this_cpu_ptr(&lazy_list));
}
/*
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 2abf9f6..9a8a01a 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -600,7 +600,7 @@ kimage_file_alloc_init(struct kimage **rimage, int kernel_fd,
if (!kexec_on_panic) {
image->swap_page = kimage_alloc_control_pages(image, 0);
if (!image->swap_page) {
- pr_err(KERN_ERR "Could not allocate swap buffer\n");
+ pr_err("Could not allocate swap buffer\n");
goto out_free_control_pages;
}
}
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 831978c..06f5830 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1410,16 +1410,10 @@ static inline int check_kprobe_rereg(struct kprobe *p)
return ret;
}
-static int check_kprobe_address_safe(struct kprobe *p,
- struct module **probed_mod)
+int __weak arch_check_ftrace_location(struct kprobe *p)
{
- int ret = 0;
unsigned long ftrace_addr;
- /*
- * If the address is located on a ftrace nop, set the
- * breakpoint to the following instruction.
- */
ftrace_addr = ftrace_location((unsigned long)p->addr);
if (ftrace_addr) {
#ifdef CONFIG_KPROBES_ON_FTRACE
@@ -1431,7 +1425,17 @@ static int check_kprobe_address_safe(struct kprobe *p,
return -EINVAL;
#endif
}
+ return 0;
+}
+static int check_kprobe_address_safe(struct kprobe *p,
+ struct module **probed_mod)
+{
+ int ret;
+
+ ret = arch_check_ftrace_location(p);
+ if (ret)
+ return ret;
jump_label_lock();
preempt_disable();
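arch_check_ftrace_location() is deliberately __weak so that an architecture can substitute its own policy for probes landing on ftrace-managed addresses, while the generic version preserves the old behaviour. A skeleton of a strong per-arch override, illustrative only:

/* Hypothetical architecture override replacing the __weak default. */
int arch_check_ftrace_location(struct kprobe *p)
{
	if (ftrace_location((unsigned long)p->addr)) {
		/* e.g. this arch cannot combine kprobes with ftrace */
		return -EINVAL;
	}
	return 0;
}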
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index ef42d0a..49746c8 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -220,11 +220,10 @@ void exit_task_namespaces(struct task_struct *p)
SYSCALL_DEFINE2(setns, int, fd, int, nstype)
{
- const struct proc_ns_operations *ops;
struct task_struct *tsk = current;
struct nsproxy *new_nsproxy;
- struct proc_ns *ei;
struct file *file;
+ struct ns_common *ns;
int err;
file = proc_ns_fget(fd);
@@ -232,9 +231,8 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype)
return PTR_ERR(file);
err = -EINVAL;
- ei = get_proc_ns(file_inode(file));
- ops = ei->ns_ops;
- if (nstype && (ops->type != nstype))
+ ns = get_proc_ns(file_inode(file));
+ if (nstype && (ns->ops->type != nstype))
goto out;
new_nsproxy = create_new_namespaces(0, tsk, current_user_ns(), tsk->fs);
@@ -243,7 +241,7 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype)
goto out;
}
- err = ops->install(new_nsproxy, ei->ns);
+ err = ns->ops->install(new_nsproxy, ns);
if (err) {
free_nsproxy(new_nsproxy);
goto out;
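The setns(2) ABI is unchanged by this rework; only the in-kernel plumbing moves from the proc_ns wrapper to struct ns_common. For reference, a minimal userspace caller:

#define _GNU_SOURCE
#include <fcntl.h>
#include <sched.h>
#include <stdio.h>
#include <unistd.h>

/* Join the namespace behind a /proc/<pid>/ns/<type> file; nstype may
 * be 0 to accept any namespace type, as the syscall allows.
 */
static int join_ns(const char *path, int nstype)
{
	int fd = open(path, O_RDONLY);

	if (fd < 0) {
		perror("open");
		return -1;
	}
	if (setns(fd, nstype) < 0) {
		perror("setns");
		close(fd);
		return -1;
	}
	close(fd);
	return 0;
}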
diff --git a/kernel/pid.c b/kernel/pid.c
index 82430c8..cd36a5e 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -79,7 +79,10 @@ struct pid_namespace init_pid_ns = {
.level = 0,
.child_reaper = &init_task,
.user_ns = &init_user_ns,
- .proc_inum = PROC_PID_INIT_INO,
+ .ns.inum = PROC_PID_INIT_INO,
+#ifdef CONFIG_PID_NS
+ .ns.ops = &pidns_operations,
+#endif
};
EXPORT_SYMBOL_GPL(init_pid_ns);
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index bc6d6a8..a65ba13 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -105,9 +105,10 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
if (ns->pid_cachep == NULL)
goto out_free_map;
- err = proc_alloc_inum(&ns->proc_inum);
+ err = ns_alloc_inum(&ns->ns);
if (err)
goto out_free_map;
+ ns->ns.ops = &pidns_operations;
kref_init(&ns->kref);
ns->level = level;
@@ -142,7 +143,7 @@ static void destroy_pid_namespace(struct pid_namespace *ns)
{
int i;
- proc_free_inum(ns->proc_inum);
+ ns_free_inum(&ns->ns);
for (i = 0; i < PIDMAP_ENTRIES; i++)
kfree(ns->pidmap[i].page);
put_user_ns(ns->user_ns);
@@ -333,7 +334,12 @@ int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
return 0;
}
-static void *pidns_get(struct task_struct *task)
+static inline struct pid_namespace *to_pid_ns(struct ns_common *ns)
+{
+ return container_of(ns, struct pid_namespace, ns);
+}
+
+static struct ns_common *pidns_get(struct task_struct *task)
{
struct pid_namespace *ns;
@@ -343,18 +349,18 @@ static void *pidns_get(struct task_struct *task)
get_pid_ns(ns);
rcu_read_unlock();
- return ns;
+ return ns ? &ns->ns : NULL;
}
-static void pidns_put(void *ns)
+static void pidns_put(struct ns_common *ns)
{
- put_pid_ns(ns);
+ put_pid_ns(to_pid_ns(ns));
}
-static int pidns_install(struct nsproxy *nsproxy, void *ns)
+static int pidns_install(struct nsproxy *nsproxy, struct ns_common *ns)
{
struct pid_namespace *active = task_active_pid_ns(current);
- struct pid_namespace *ancestor, *new = ns;
+ struct pid_namespace *ancestor, *new = to_pid_ns(ns);
if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) ||
!ns_capable(current_user_ns(), CAP_SYS_ADMIN))
@@ -382,19 +388,12 @@ static int pidns_install(struct nsproxy *nsproxy, void *ns)
return 0;
}
-static unsigned int pidns_inum(void *ns)
-{
- struct pid_namespace *pid_ns = ns;
- return pid_ns->proc_inum;
-}
-
const struct proc_ns_operations pidns_operations = {
.name = "pid",
.type = CLONE_NEWPID,
.get = pidns_get,
.put = pidns_put,
.install = pidns_install,
- .inum = pidns_inum,
};
static __init int pid_namespaces_init(void)
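This is the common shape of the ns_common conversion repeated for the other namespaces below (user, uts): the type-specific structure embeds a struct ns_common, the proc_ns_operations callbacks traffic in the embedded member, and container_of() recovers the outer type, which also lets the per-type inum callback disappear. Sketched with a made-up namespace type:

/* Illustrative pattern only; "foo" is not a real namespace type. */
struct foo_namespace {
	/* ... type-specific state ... */
	struct ns_common ns;	/* carries ops and inum */
};

static inline struct foo_namespace *to_foo_ns(struct ns_common *ns)
{
	return container_of(ns, struct foo_namespace, ns);
}

static struct ns_common *foons_get(struct task_struct *task)
{
	struct foo_namespace *foo_ns = NULL;	/* looked up from task */

	return foo_ns ? &foo_ns->ns : NULL;
}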
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index f900dc9..02d6b6d 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -1857,10 +1857,16 @@ asmlinkage __visible int printk(const char *fmt, ...)
int r;
va_start(args, fmt);
- preempt_disable();
+
+ /*
+ * If a caller overrides the per_cpu printk_func, then it needs
+ * to disable preemption when calling printk(). Otherwise
+ * the printk_func should be set to the default. No need to
+ * disable preemption here.
+ */
vprintk_func = this_cpu_read(printk_func);
r = vprintk_func(fmt, args);
- preempt_enable();
+
va_end(args);
return r;
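printk() no longer disables preemption itself; per the new comment, that duty moves to whoever installs a non-default per-CPU printk_func. A sketch of such a caller, assuming the printk_func_t typedef and vprintk_default from this series; my_vprintk is hypothetical:

static int my_vprintk(const char *fmt, va_list args);

static void printk_redirected(void)
{
	preempt_disable();
	this_cpu_write(printk_func, my_vprintk);
	printk("routed through my_vprintk\n");
	this_cpu_write(printk_func, vprintk_default);
	preempt_enable();
}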
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
index 00fe55c..b6e4c16 100644
--- a/kernel/stacktrace.c
+++ b/kernel/stacktrace.c
@@ -25,6 +25,38 @@ void print_stack_trace(struct stack_trace *trace, int spaces)
}
EXPORT_SYMBOL_GPL(print_stack_trace);
+int snprint_stack_trace(char *buf, size_t size,
+ struct stack_trace *trace, int spaces)
+{
+ int i;
+ unsigned long ip;
+ int generated;
+ int total = 0;
+
+ if (WARN_ON(!trace->entries))
+ return 0;
+
+ for (i = 0; i < trace->nr_entries; i++) {
+ ip = trace->entries[i];
+ generated = snprintf(buf, size, "%*c[<%p>] %pS\n",
+ 1 + spaces, ' ', (void *) ip, (void *) ip);
+
+ total += generated;
+
+ /* Assume that generated isn't a negative number */
+ if (generated >= size) {
+ buf += size;
+ size = 0;
+ } else {
+ buf += generated;
+ size -= generated;
+ }
+ }
+
+ return total;
+}
+EXPORT_SYMBOL_GPL(snprint_stack_trace);
+
/*
* Architectures that do not implement save_stack_trace_tsk or
* save_stack_trace_regs get this weak alias and a once-per-bootup warning
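snprint_stack_trace() is the buffer-filling counterpart of print_stack_trace(); like snprintf(), it returns the total length that would have been generated, so callers can detect truncation. A possible in-kernel use, assuming the pre-5.2 struct stack_trace API; buffer sizes are arbitrary:

static void stack_to_buf(char *buf, size_t len)
{
	unsigned long entries[16];
	struct stack_trace trace = {
		.entries	= entries,
		.max_entries	= ARRAY_SIZE(entries),
	};
	int n;

	save_stack_trace(&trace);
	n = snprint_stack_trace(buf, len, &trace, 1);
	if (n >= (int)len)
		pr_debug("stack trace truncated\n");
}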
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 02aa418..5adcb0a 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -169,6 +169,8 @@ cond_syscall(ppc_rtas);
cond_syscall(sys_spu_run);
cond_syscall(sys_spu_create);
cond_syscall(sys_subpage_prot);
+cond_syscall(sys_s390_pci_mmio_read);
+cond_syscall(sys_s390_pci_mmio_write);
/* mmu depending weak syscall entries */
cond_syscall(sys_mprotect);
@@ -224,3 +226,6 @@ cond_syscall(sys_seccomp);
/* access BPF programs and maps */
cond_syscall(sys_bpf);
+
+/* execveat */
+cond_syscall(sys_execveat);
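cond_syscall() gives a syscall entry a default binding to sys_ni_syscall() (which returns -ENOSYS) that a real definition overrides when the corresponding code is built in. The real macro emits assembler, but conceptually it behaves like a weak alias:

/* Conceptual model only, not the actual cond_syscall() expansion. */
long sys_execveat(void)
	__attribute__((weak, alias("sys_ni_syscall")));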
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 7c54ff7..137c7f6 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -623,6 +623,13 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
+ {
+ .procname = "tracepoint_printk",
+ .data = &tracepoint_printk,
+ .maxlen = sizeof(tracepoint_printk),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
#endif
#ifdef CONFIG_KEXEC
{
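The new entry exposes the tracepoint_printk flag (introduced in kernel/trace/trace.c below, and also settable at boot with the tp_printk parameter) as /proc/sys/kernel/tracepoint_printk. Note that in this series output_printk() quietly does nothing unless tracepoint_print_iter was allocated during trace_init(), i.e. unless tp_printk was also given at boot. Toggling the knob from userspace is a plain procfs write; an illustrative sketch:

#include <fcntl.h>
#include <unistd.h>

static int enable_tracepoint_printk(void)
{
	int fd = open("/proc/sys/kernel/tracepoint_printk", O_WRONLY);

	if (fd < 0)
		return -1;
	write(fd, "1\n", 2);
	close(fd);
	return 0;
}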
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 2e949cc..b79f39b 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -792,7 +792,7 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
/* Initialize mult/shift and max_idle_ns */
__clocksource_updatefreq_scale(cs, scale, freq);
- /* Add clocksource to the clcoksource list */
+ /* Add clocksource to the clocksource list */
mutex_lock(&clocksource_mutex);
clocksource_enqueue(cs);
clocksource_enqueue_watchdog(cs);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 1f43560..4d54b75 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -235,7 +235,7 @@ void tick_nohz_full_kick(void)
if (!tick_nohz_full_cpu(smp_processor_id()))
return;
- irq_work_queue(&__get_cpu_var(nohz_full_kick_work));
+ irq_work_queue(this_cpu_ptr(&nohz_full_kick_work));
}
/*
diff --git a/kernel/time/time.c b/kernel/time/time.c
index 65015ff..6390517 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -741,6 +741,7 @@ u64 nsecs_to_jiffies64(u64 n)
return div_u64(n * 9, (9ull * NSEC_PER_SEC + HZ / 2) / HZ);
#endif
}
+EXPORT_SYMBOL(nsecs_to_jiffies64);
/**
* nsecs_to_jiffies - Convert nsecs in u64 to jiffies
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 11b9cb3..483cecf 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -1477,9 +1477,6 @@ static int blk_trace_remove_queue(struct request_queue *q)
if (atomic_dec_and_test(&blk_probes_ref))
blk_unregister_tracepoints();
- spin_lock_irq(&running_trace_lock);
- list_del(&bt->running_list);
- spin_unlock_irq(&running_trace_lock);
blk_trace_free(bt);
return 0;
}
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 1af4f8f..2e76797 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -63,6 +63,10 @@ static bool __read_mostly tracing_selftest_running;
*/
bool __read_mostly tracing_selftest_disabled;
+/* Pipe tracepoints to printk */
+struct trace_iterator *tracepoint_print_iter;
+int tracepoint_printk;
+
/* For tracers that don't implement custom flags */
static struct tracer_opt dummy_tracer_opt[] = {
{ }
@@ -193,6 +197,13 @@ static int __init set_trace_boot_clock(char *str)
}
__setup("trace_clock=", set_trace_boot_clock);
+static int __init set_tracepoint_printk(char *str)
+{
+ if ((strcmp(str, "=0") != 0 && strcmp(str, "=off") != 0))
+ tracepoint_printk = 1;
+ return 1;
+}
+__setup("tp_printk", set_tracepoint_printk);
unsigned long long ns2usecs(cycle_t nsec)
{
@@ -2031,7 +2042,7 @@ void trace_printk_init_buffers(void)
pr_warning("** trace_printk() being used. Allocating extra memory. **\n");
pr_warning("** **\n");
pr_warning("** This means that this is a DEBUG kernel and it is **\n");
- pr_warning("** unsafe for produciton use. **\n");
+ pr_warning("** unsafe for production use. **\n");
pr_warning("** **\n");
pr_warning("** If you see this message and you are not debugging **\n");
pr_warning("** the kernel, report this immediately to your vendor! **\n");
@@ -6898,6 +6909,19 @@ out:
return ret;
}
+void __init trace_init(void)
+{
+ if (tracepoint_printk) {
+ tracepoint_print_iter =
+ kmalloc(sizeof(*tracepoint_print_iter), GFP_KERNEL);
+ if (WARN_ON(!tracepoint_print_iter))
+ tracepoint_printk = 0;
+ }
+ tracer_alloc_buffers();
+ init_ftrace_syscalls();
+ trace_event_init();
+}
+
__init static int clear_boot_tracer(void)
{
/*
@@ -6917,6 +6941,5 @@ __init static int clear_boot_tracer(void)
return 0;
}
-early_initcall(tracer_alloc_buffers);
fs_initcall(tracer_init_debugfs);
late_initcall(clear_boot_tracer);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 3255dfb..8de48ba 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -1301,4 +1301,18 @@ int perf_ftrace_event_register(struct ftrace_event_call *call,
#define perf_ftrace_event_register NULL
#endif
+#ifdef CONFIG_FTRACE_SYSCALLS
+void init_ftrace_syscalls(void);
+#else
+static inline void init_ftrace_syscalls(void) { }
+#endif
+
+#ifdef CONFIG_EVENT_TRACING
+void trace_event_init(void);
+#else
+static inline void __init trace_event_init(void) { }
+#endif
+
+extern struct trace_iterator *tracepoint_print_iter;
+
#endif /* _LINUX_KERNEL_TRACE_H */
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index d0e4f92..366a78a 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -212,8 +212,40 @@ void *ftrace_event_buffer_reserve(struct ftrace_event_buffer *fbuffer,
}
EXPORT_SYMBOL_GPL(ftrace_event_buffer_reserve);
+static DEFINE_SPINLOCK(tracepoint_iter_lock);
+
+static void output_printk(struct ftrace_event_buffer *fbuffer)
+{
+ struct ftrace_event_call *event_call;
+ struct trace_event *event;
+ unsigned long flags;
+ struct trace_iterator *iter = tracepoint_print_iter;
+
+ if (!iter)
+ return;
+
+ event_call = fbuffer->ftrace_file->event_call;
+ if (!event_call || !event_call->event.funcs ||
+ !event_call->event.funcs->trace)
+ return;
+
+ event = &fbuffer->ftrace_file->event_call->event;
+
+ spin_lock_irqsave(&tracepoint_iter_lock, flags);
+ trace_seq_init(&iter->seq);
+ iter->ent = fbuffer->entry;
+ event_call->event.funcs->trace(iter, 0, event);
+ trace_seq_putc(&iter->seq, 0);
+ printk("%s", iter->seq.buffer);
+
+ spin_unlock_irqrestore(&tracepoint_iter_lock, flags);
+}
+
void ftrace_event_buffer_commit(struct ftrace_event_buffer *fbuffer)
{
+ if (tracepoint_printk)
+ output_printk(fbuffer);
+
event_trigger_unlock_commit(fbuffer->ftrace_file, fbuffer->buffer,
fbuffer->event, fbuffer->entry,
fbuffer->flags, fbuffer->pc);
@@ -2480,8 +2512,14 @@ static __init int event_trace_init(void)
#endif
return 0;
}
-early_initcall(event_trace_memsetup);
-core_initcall(event_trace_enable);
+
+void __init trace_event_init(void)
+{
+ event_trace_memsetup();
+ init_ftrace_syscalls();
+ event_trace_enable();
+}
+
fs_initcall(event_trace_init);
#ifdef CONFIG_FTRACE_STARTUP_TEST
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index dfe00a4..c6ee36f 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -514,7 +514,7 @@ unsigned long __init __weak arch_syscall_addr(int nr)
return (unsigned long)sys_call_table[nr];
}
-static int __init init_ftrace_syscalls(void)
+void __init init_ftrace_syscalls(void)
{
struct syscall_metadata *meta;
unsigned long addr;
@@ -524,7 +524,7 @@ static int __init init_ftrace_syscalls(void)
GFP_KERNEL);
if (!syscalls_metadata) {
WARN_ON(1);
- return -ENOMEM;
+ return;
}
for (i = 0; i < NR_syscalls; i++) {
@@ -536,10 +536,7 @@ static int __init init_ftrace_syscalls(void)
meta->syscall_nr = i;
syscalls_metadata[i] = meta;
}
-
- return 0;
}
-early_initcall(init_ftrace_syscalls);
#ifdef CONFIG_PERF_EVENTS
diff --git a/kernel/uid16.c b/kernel/uid16.c
index 602e5bb..d58cc4d 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -176,7 +176,7 @@ SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist)
struct group_info *group_info;
int retval;
- if (!ns_capable(current_user_ns(), CAP_SETGID))
+ if (!may_setgroups())
return -EPERM;
if ((unsigned)gidsetsize > NGROUPS_MAX)
return -EINVAL;
diff --git a/kernel/user.c b/kernel/user.c
index 4efa393..b069ccb 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -50,7 +50,11 @@ struct user_namespace init_user_ns = {
.count = ATOMIC_INIT(3),
.owner = GLOBAL_ROOT_UID,
.group = GLOBAL_ROOT_GID,
- .proc_inum = PROC_USER_INIT_INO,
+ .ns.inum = PROC_USER_INIT_INO,
+#ifdef CONFIG_USER_NS
+ .ns.ops = &userns_operations,
+#endif
+ .flags = USERNS_INIT_FLAGS,
#ifdef CONFIG_PERSISTENT_KEYRINGS
.persistent_keyring_register_sem =
__RWSEM_INITIALIZER(init_user_ns.persistent_keyring_register_sem),
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index aa312b0..4109f83 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -24,6 +24,7 @@
#include <linux/fs_struct.h>
static struct kmem_cache *user_ns_cachep __read_mostly;
+static DEFINE_MUTEX(userns_state_mutex);
static bool new_idmap_permitted(const struct file *file,
struct user_namespace *ns, int cap_setid,
@@ -86,11 +87,12 @@ int create_user_ns(struct cred *new)
if (!ns)
return -ENOMEM;
- ret = proc_alloc_inum(&ns->proc_inum);
+ ret = ns_alloc_inum(&ns->ns);
if (ret) {
kmem_cache_free(user_ns_cachep, ns);
return ret;
}
+ ns->ns.ops = &userns_operations;
atomic_set(&ns->count, 1);
/* Leave the new->user_ns reference with the new user namespace. */
@@ -99,6 +101,11 @@ int create_user_ns(struct cred *new)
ns->owner = owner;
ns->group = group;
+ /* Inherit USERNS_SETGROUPS_ALLOWED from our parent */
+ mutex_lock(&userns_state_mutex);
+ ns->flags = parent_ns->flags;
+ mutex_unlock(&userns_state_mutex);
+
set_cred_user_ns(new, ns);
#ifdef CONFIG_PERSISTENT_KEYRINGS
@@ -136,7 +143,7 @@ void free_user_ns(struct user_namespace *ns)
#ifdef CONFIG_PERSISTENT_KEYRINGS
key_put(ns->persistent_keyring_register);
#endif
- proc_free_inum(ns->proc_inum);
+ ns_free_inum(&ns->ns);
kmem_cache_free(user_ns_cachep, ns);
ns = parent;
} while (atomic_dec_and_test(&parent->count));
@@ -583,9 +590,6 @@ static bool mappings_overlap(struct uid_gid_map *new_map,
return false;
}
-
-static DEFINE_MUTEX(id_map_mutex);
-
static ssize_t map_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos,
int cap_setid,
@@ -602,7 +606,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
ssize_t ret = -EINVAL;
/*
- * The id_map_mutex serializes all writes to any given map.
+ * The userns_state_mutex serializes all writes to any given map.
*
* Any map is only ever written once.
*
@@ -620,7 +624,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
* order and smp_rmb() is guaranteed that we don't have crazy
* architectures returning stale data.
*/
- mutex_lock(&id_map_mutex);
+ mutex_lock(&userns_state_mutex);
ret = -EPERM;
/* Only allow one successful write to the map */
@@ -640,7 +644,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
if (!page)
goto out;
- /* Only allow <= page size writes at the beginning of the file */
+ /* Only allow < page size writes at the beginning of the file */
ret = -EINVAL;
if ((*ppos != 0) || (count >= PAGE_SIZE))
goto out;
@@ -750,7 +754,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
*ppos = count;
ret = count;
out:
- mutex_unlock(&id_map_mutex);
+ mutex_unlock(&userns_state_mutex);
if (page)
free_page(page);
return ret;
@@ -812,16 +816,21 @@ static bool new_idmap_permitted(const struct file *file,
struct user_namespace *ns, int cap_setid,
struct uid_gid_map *new_map)
{
- /* Allow mapping to your own filesystem ids */
- if ((new_map->nr_extents == 1) && (new_map->extent[0].count == 1)) {
+ const struct cred *cred = file->f_cred;
+ /* Don't allow mappings that would allow anything that wouldn't
+ * be allowed without the establishment of unprivileged mappings.
+ */
+ if ((new_map->nr_extents == 1) && (new_map->extent[0].count == 1) &&
+ uid_eq(ns->owner, cred->euid)) {
u32 id = new_map->extent[0].lower_first;
if (cap_setid == CAP_SETUID) {
kuid_t uid = make_kuid(ns->parent, id);
- if (uid_eq(uid, file->f_cred->fsuid))
+ if (uid_eq(uid, cred->euid))
return true;
} else if (cap_setid == CAP_SETGID) {
kgid_t gid = make_kgid(ns->parent, id);
- if (gid_eq(gid, file->f_cred->fsgid))
+ if (!(ns->flags & USERNS_SETGROUPS_ALLOWED) &&
+ gid_eq(gid, cred->egid))
return true;
}
}
@@ -841,7 +850,106 @@ static bool new_idmap_permitted(const struct file *file,
return false;
}
-static void *userns_get(struct task_struct *task)
+int proc_setgroups_show(struct seq_file *seq, void *v)
+{
+ struct user_namespace *ns = seq->private;
+ unsigned long userns_flags = ACCESS_ONCE(ns->flags);
+
+ seq_printf(seq, "%s\n",
+ (userns_flags & USERNS_SETGROUPS_ALLOWED) ?
+ "allow" : "deny");
+ return 0;
+}
+
+ssize_t proc_setgroups_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct seq_file *seq = file->private_data;
+ struct user_namespace *ns = seq->private;
+ char kbuf[8], *pos;
+ bool setgroups_allowed;
+ ssize_t ret;
+
+ /* Only allow a very narrow range of strings to be written */
+ ret = -EINVAL;
+ if ((*ppos != 0) || (count >= sizeof(kbuf)))
+ goto out;
+
+ /* What was written? */
+ ret = -EFAULT;
+ if (copy_from_user(kbuf, buf, count))
+ goto out;
+ kbuf[count] = '\0';
+ pos = kbuf;
+
+ /* What is being requested? */
+ ret = -EINVAL;
+ if (strncmp(pos, "allow", 5) == 0) {
+ pos += 5;
+ setgroups_allowed = true;
+ }
+ else if (strncmp(pos, "deny", 4) == 0) {
+ pos += 4;
+ setgroups_allowed = false;
+ }
+ else
+ goto out;
+
+ /* Verify there is not trailing junk on the line */
+ pos = skip_spaces(pos);
+ if (*pos != '\0')
+ goto out;
+
+ ret = -EPERM;
+ mutex_lock(&userns_state_mutex);
+ if (setgroups_allowed) {
+ /* Enabling setgroups after setgroups has been disabled
+ * is not allowed.
+ */
+ if (!(ns->flags & USERNS_SETGROUPS_ALLOWED))
+ goto out_unlock;
+ } else {
+ /* Permanently disabling setgroups after setgroups has
+ * been enabled by writing the gid_map is not allowed.
+ */
+ if (ns->gid_map.nr_extents != 0)
+ goto out_unlock;
+ ns->flags &= ~USERNS_SETGROUPS_ALLOWED;
+ }
+ mutex_unlock(&userns_state_mutex);
+
+ /* Report a successful write */
+ *ppos = count;
+ ret = count;
+out:
+ return ret;
+out_unlock:
+ mutex_unlock(&userns_state_mutex);
+ goto out;
+}
+
+bool userns_may_setgroups(const struct user_namespace *ns)
+{
+ bool allowed;
+
+ mutex_lock(&userns_state_mutex);
+ /* It is not safe to use setgroups until a gid mapping in
+ * the user namespace has been established.
+ */
+ allowed = ns->gid_map.nr_extents != 0;
+ /* Is setgroups allowed? */
+ allowed = allowed && (ns->flags & USERNS_SETGROUPS_ALLOWED);
+ mutex_unlock(&userns_state_mutex);
+
+ return allowed;
+}
+
+static inline struct user_namespace *to_user_ns(struct ns_common *ns)
+{
+ return container_of(ns, struct user_namespace, ns);
+}
+
+static struct ns_common *userns_get(struct task_struct *task)
{
struct user_namespace *user_ns;
@@ -849,17 +957,17 @@ static void *userns_get(struct task_struct *task)
user_ns = get_user_ns(__task_cred(task)->user_ns);
rcu_read_unlock();
- return user_ns;
+ return user_ns ? &user_ns->ns : NULL;
}
-static void userns_put(void *ns)
+static void userns_put(struct ns_common *ns)
{
- put_user_ns(ns);
+ put_user_ns(to_user_ns(ns));
}
-static int userns_install(struct nsproxy *nsproxy, void *ns)
+static int userns_install(struct nsproxy *nsproxy, struct ns_common *ns)
{
- struct user_namespace *user_ns = ns;
+ struct user_namespace *user_ns = to_user_ns(ns);
struct cred *cred;
/* Don't allow gaining capabilities by reentering
@@ -888,19 +996,12 @@ static int userns_install(struct nsproxy *nsproxy, void *ns)
return commit_creds(cred);
}
-static unsigned int userns_inum(void *ns)
-{
- struct user_namespace *user_ns = ns;
- return user_ns->proc_inum;
-}
-
const struct proc_ns_operations userns_operations = {
.name = "user",
.type = CLONE_NEWUSER,
.get = userns_get,
.put = userns_put,
.install = userns_install,
- .inum = userns_inum,
};
static __init int user_namespaces_init(void)
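Taken together, the groups.c, uid16.c and user_namespace.c changes close the setgroups() hole in unprivileged user namespaces: an unprivileged process may only write gid_map after setgroups has been permanently disabled for the namespace. The resulting userspace sequence, also described in user_namespaces(7), looks like the following sketch, which assumes the parent uid and gid are both 1000:

#define _GNU_SOURCE
#include <fcntl.h>
#include <sched.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static void write_file(const char *path, const char *buf)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0 || write(fd, buf, strlen(buf)) < 0)
		perror(path);
	if (fd >= 0)
		close(fd);
}

int main(void)
{
	if (unshare(CLONE_NEWUSER) < 0) {
		perror("unshare");
		return 1;
	}
	/* setgroups must be denied before an unprivileged gid_map write */
	write_file("/proc/self/setgroups", "deny");
	write_file("/proc/self/uid_map", "0 1000 1\n");
	write_file("/proc/self/gid_map", "0 1000 1\n");
	return 0;
}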
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 883aaaa..831ea71 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -42,12 +42,14 @@ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns,
if (!ns)
return ERR_PTR(-ENOMEM);
- err = proc_alloc_inum(&ns->proc_inum);
+ err = ns_alloc_inum(&ns->ns);
if (err) {
kfree(ns);
return ERR_PTR(err);
}
+ ns->ns.ops = &utsns_operations;
+
down_read(&uts_sem);
memcpy(&ns->name, &old_ns->name, sizeof(ns->name));
ns->user_ns = get_user_ns(user_ns);
@@ -84,11 +86,16 @@ void free_uts_ns(struct kref *kref)
ns = container_of(kref, struct uts_namespace, kref);
put_user_ns(ns->user_ns);
- proc_free_inum(ns->proc_inum);
+ ns_free_inum(&ns->ns);
kfree(ns);
}
-static void *utsns_get(struct task_struct *task)
+static inline struct uts_namespace *to_uts_ns(struct ns_common *ns)
+{
+ return container_of(ns, struct uts_namespace, ns);
+}
+
+static struct ns_common *utsns_get(struct task_struct *task)
{
struct uts_namespace *ns = NULL;
struct nsproxy *nsproxy;
@@ -101,17 +108,17 @@ static void *utsns_get(struct task_struct *task)
}
task_unlock(task);
- return ns;
+ return ns ? &ns->ns : NULL;
}
-static void utsns_put(void *ns)
+static void utsns_put(struct ns_common *ns)
{
- put_uts_ns(ns);
+ put_uts_ns(to_uts_ns(ns));
}
-static int utsns_install(struct nsproxy *nsproxy, void *new)
+static int utsns_install(struct nsproxy *nsproxy, struct ns_common *new)
{
- struct uts_namespace *ns = new;
+ struct uts_namespace *ns = to_uts_ns(new);
if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) ||
!ns_capable(current_user_ns(), CAP_SYS_ADMIN))
@@ -123,18 +130,10 @@ static int utsns_install(struct nsproxy *nsproxy, void *new)
return 0;
}
-static unsigned int utsns_inum(void *vp)
-{
- struct uts_namespace *ns = vp;
-
- return ns->proc_inum;
-}
-
const struct proc_ns_operations utsns_operations = {
.name = "uts",
.type = CLONE_NEWUTS,
.get = utsns_get,
.put = utsns_put,
.install = utsns_install,
- .inum = utsns_inum,
};
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 09b685d..6202b08 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1804,8 +1804,8 @@ static void pool_mayday_timeout(unsigned long __pool)
struct worker_pool *pool = (void *)__pool;
struct work_struct *work;
- spin_lock_irq(&wq_mayday_lock); /* for wq->maydays */
- spin_lock(&pool->lock);
+ spin_lock_irq(&pool->lock);
+ spin_lock(&wq_mayday_lock); /* for wq->maydays */
if (need_to_create_worker(pool)) {
/*
@@ -1818,8 +1818,8 @@ static void pool_mayday_timeout(unsigned long __pool)
send_mayday(work);
}
- spin_unlock(&pool->lock);
- spin_unlock_irq(&wq_mayday_lock);
+ spin_unlock(&wq_mayday_lock);
+ spin_unlock_irq(&pool->lock);
mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL);
}
@@ -2248,12 +2248,30 @@ repeat:
* Slurp in all works issued via this workqueue and
* process'em.
*/
- WARN_ON_ONCE(!list_empty(&rescuer->scheduled));
+ WARN_ON_ONCE(!list_empty(scheduled));
list_for_each_entry_safe(work, n, &pool->worklist, entry)
if (get_work_pwq(work) == pwq)
move_linked_works(work, scheduled, &n);
- process_scheduled_works(rescuer);
+ if (!list_empty(scheduled)) {
+ process_scheduled_works(rescuer);
+
+ /*
+ * The above execution of rescued work items could
+ * have created more to rescue through
+ * pwq_activate_first_delayed() or chained
+ * queueing. Let's put @pwq back on mayday list so
+ * that such back-to-back work items, which may be
+ * being used to relieve memory pressure, don't
+ * incur MAYDAY_INTERVAL delay inbetween.
+ */
+ if (need_to_create_worker(pool)) {
+ spin_lock(&wq_mayday_lock);
+ get_pwq(pwq);
+ list_move_tail(&pwq->mayday_node, &wq->maydays);
+ spin_unlock(&wq_mayday_lock);
+ }
+ }
/*
* Put the reference grabbed by send_mayday(). @pool won't
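The mayday-timeout hunk above inverts the nesting so that pool->lock is the outer lock and wq_mayday_lock the inner one. That matches the rescuer path, which now re-queues a pwq on wq->maydays while already holding pool->lock; taking the two locks in one consistent order is what keeps the new requeue deadlock-free. The rule in miniature, as an illustrative fragment:

/* Consistent nesting per this patch: pool->lock outside,
 * wq_mayday_lock inside.
 */
spin_lock_irq(&pool->lock);
spin_lock(&wq_mayday_lock);
/* ... inspect or move pwq->mayday_node on wq->maydays ... */
spin_unlock(&wq_mayday_lock);
spin_unlock_irq(&pool->lock);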