summaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile1
-rw-r--r--kernel/audit_watch.c14
-rw-r--r--kernel/bpf/Makefile8
-rw-r--r--kernel/bpf/arraymap.c33
-rw-r--r--kernel/bpf/bpf_lru_list.h3
-rw-r--r--kernel/bpf/core.c61
-rw-r--r--kernel/bpf/devmap.c409
-rw-r--r--kernel/bpf/hashtab.c90
-rw-r--r--kernel/bpf/lpm_trie.c9
-rw-r--r--kernel/bpf/sockmap.c873
-rw-r--r--kernel/bpf/stackmap.c8
-rw-r--r--kernel/bpf/syscall.c133
-rw-r--r--kernel/bpf/tnum.c180
-rw-r--r--kernel/bpf/verifier.c2439
-rw-r--r--kernel/cgroup/cgroup-internal.h15
-rw-r--r--kernel/cgroup/cgroup-v1.c75
-rw-r--r--kernel/cgroup/cgroup.c1031
-rw-r--r--kernel/cgroup/cpuset.c63
-rw-r--r--kernel/cgroup/debug.c53
-rw-r--r--kernel/cgroup/freezer.c6
-rw-r--r--kernel/cgroup/pids.c1
-rw-r--r--kernel/configs/android-base.config1
-rw-r--r--kernel/cpu.c13
-rw-r--r--kernel/cpu_pm.c50
-rw-r--r--kernel/events/core.c211
-rw-r--r--kernel/events/internal.h6
-rw-r--r--kernel/events/ring_buffer.c31
-rw-r--r--kernel/events/uprobes.c2
-rw-r--r--kernel/exit.c11
-rw-r--r--kernel/fork.c45
-rw-r--r--kernel/futex.c66
-rw-r--r--kernel/irq/Kconfig9
-rw-r--r--kernel/irq/Makefile1
-rw-r--r--kernel/irq/chip.c119
-rw-r--r--kernel/irq/cpuhotplug.c9
-rw-r--r--kernel/irq/debugfs.c50
-rw-r--r--kernel/irq/internals.h2
-rw-r--r--kernel/irq/ipi.c4
-rw-r--r--kernel/irq/irq_sim.c164
-rw-r--r--kernel/irq/irqdomain.c241
-rw-r--r--kernel/irq/manage.c14
-rw-r--r--kernel/irq/proc.c8
-rw-r--r--kernel/jump_label.c104
-rw-r--r--kernel/kexec_core.c12
-rw-r--r--kernel/kmod.c25
-rw-r--r--kernel/kthread.c1
-rw-r--r--kernel/locking/lockdep.c1008
-rw-r--r--kernel/locking/lockdep_internals.h2
-rw-r--r--kernel/locking/lockdep_proc.c4
-rw-r--r--kernel/locking/lockdep_states.h1
-rw-r--r--kernel/locking/osq_lock.c13
-rw-r--r--kernel/locking/qspinlock.c117
-rw-r--r--kernel/locking/qspinlock_paravirt.h24
-rw-r--r--kernel/locking/rtmutex_common.h29
-rw-r--r--kernel/locking/rwsem-spinlock.c37
-rw-r--r--kernel/locking/rwsem-xadd.c33
-rw-r--r--kernel/membarrier.c70
-rw-r--r--kernel/memremap.c72
-rw-r--r--kernel/panic.c12
-rw-r--r--kernel/pid.c14
-rw-r--r--kernel/power/hibernate.c29
-rw-r--r--kernel/power/main.c64
-rw-r--r--kernel/power/power.h5
-rw-r--r--kernel/power/snapshot.c2
-rw-r--r--kernel/power/suspend.c184
-rw-r--r--kernel/power/suspend_test.c4
-rw-r--r--kernel/power/swap.c5
-rw-r--r--kernel/rcu/Kconfig3
-rw-r--r--kernel/rcu/rcu.h128
-rw-r--r--kernel/rcu/rcu_segcblist.c108
-rw-r--r--kernel/rcu/rcu_segcblist.h28
-rw-r--r--kernel/rcu/rcuperf.c17
-rw-r--r--kernel/rcu/rcutorture.c83
-rw-r--r--kernel/rcu/srcutiny.c8
-rw-r--r--kernel/rcu/srcutree.c50
-rw-r--r--kernel/rcu/tiny.c2
-rw-r--r--kernel/rcu/tiny_plugin.h47
-rw-r--r--kernel/rcu/tree.c213
-rw-r--r--kernel/rcu/tree.h15
-rw-r--r--kernel/rcu/tree_exp.h2
-rw-r--r--kernel/rcu/tree_plugin.h238
-rw-r--r--kernel/rcu/update.c18
-rw-r--r--kernel/sched/Makefile1
-rw-r--r--kernel/sched/autogroup.c3
-rw-r--r--kernel/sched/completion.c30
-rw-r--r--kernel/sched/core.c66
-rw-r--r--kernel/sched/cpudeadline.c27
-rw-r--r--kernel/sched/cpufreq_schedutil.c98
-rw-r--r--kernel/sched/cpupri.c2
-rw-r--r--kernel/sched/deadline.c35
-rw-r--r--kernel/sched/debug.c83
-rw-r--r--kernel/sched/fair.c463
-rw-r--r--kernel/sched/idle.c8
-rw-r--r--kernel/sched/membarrier.c152
-rw-r--r--kernel/sched/rt.c2
-rw-r--r--kernel/sched/sched.h16
-rw-r--r--kernel/sched/swait.c6
-rw-r--r--kernel/sched/topology.c39
-rw-r--r--kernel/sched/wait.c7
-rw-r--r--kernel/signal.c17
-rw-r--r--kernel/smp.c32
-rw-r--r--kernel/task_work.c8
-rw-r--r--kernel/time/alarmtimer.c17
-rw-r--r--kernel/time/posix-cpu-timers.c14
-rw-r--r--kernel/time/timekeeping.c6
-rw-r--r--kernel/time/timekeeping_debug.c5
-rw-r--r--kernel/time/timer.c52
-rw-r--r--kernel/torture.c2
-rw-r--r--kernel/trace/blktrace.c261
-rw-r--r--kernel/trace/bpf_trace.c34
-rw-r--r--kernel/trace/ftrace.c4
-rw-r--r--kernel/trace/ring_buffer.c14
-rw-r--r--kernel/trace/ring_buffer_benchmark.c2
-rw-r--r--kernel/trace/trace.c19
-rw-r--r--kernel/trace/trace_event_perf.c4
-rw-r--r--kernel/trace/trace_events_filter.c4
-rw-r--r--kernel/trace/trace_kprobe.c4
-rw-r--r--kernel/trace/trace_syscalls.c57
-rw-r--r--kernel/trace/trace_uprobe.c2
-rw-r--r--kernel/trace/tracing_map.c11
-rw-r--r--kernel/up.c2
-rw-r--r--kernel/watchdog.c1
-rw-r--r--kernel/watchdog_hld.c59
-rw-r--r--kernel/workqueue.c89
124 files changed, 7987 insertions, 3164 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 4cb8e8b..9c323a6 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -108,7 +108,6 @@ obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
obj-$(CONFIG_JUMP_LABEL) += jump_label.o
obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o
obj-$(CONFIG_TORTURE_TEST) += torture.o
-obj-$(CONFIG_MEMBARRIER) += membarrier.o
obj-$(CONFIG_HAS_IOMEM) += memremap.o
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 62d686d..9eb8b35 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -66,7 +66,7 @@ static struct fsnotify_group *audit_watch_group;
/* fsnotify events we care about. */
#define AUDIT_FS_WATCH (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\
- FS_MOVE_SELF | FS_EVENT_ON_CHILD)
+ FS_MOVE_SELF | FS_EVENT_ON_CHILD | FS_UNMOUNT)
static void audit_free_parent(struct audit_parent *parent)
{
@@ -457,13 +457,15 @@ void audit_remove_watch_rule(struct audit_krule *krule)
list_del(&krule->rlist);
if (list_empty(&watch->rules)) {
+ /*
+ * audit_remove_watch() drops our reference to 'parent' which
+ * can get freed. Grab our own reference to be safe.
+ */
+ audit_get_parent(parent);
audit_remove_watch(watch);
-
- if (list_empty(&parent->watches)) {
- audit_get_parent(parent);
+ if (list_empty(&parent->watches))
fsnotify_destroy_mark(&parent->mark, audit_watch_group);
- audit_put_parent(parent);
- }
+ audit_put_parent(parent);
}
}
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index e1e5e65..897daa0 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -1,7 +1,13 @@
obj-y := core.o
-obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o
+obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o
obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
+ifeq ($(CONFIG_NET),y)
+obj-$(CONFIG_BPF_SYSCALL) += devmap.o
+ifeq ($(CONFIG_STREAM_PARSER),y)
+obj-$(CONFIG_BPF_SYSCALL) += sockmap.o
+endif
+endif
ifeq ($(CONFIG_PERF_EVENTS),y)
obj-$(CONFIG_BPF_SYSCALL) += stackmap.o
endif
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index d771a38..98c0f00 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -49,13 +49,15 @@ static int bpf_array_alloc_percpu(struct bpf_array *array)
static struct bpf_map *array_map_alloc(union bpf_attr *attr)
{
bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
+ int numa_node = bpf_map_attr_numa_node(attr);
struct bpf_array *array;
u64 array_size;
u32 elem_size;
/* check sanity of attributes */
if (attr->max_entries == 0 || attr->key_size != 4 ||
- attr->value_size == 0 || attr->map_flags)
+ attr->value_size == 0 || attr->map_flags & ~BPF_F_NUMA_NODE ||
+ (percpu && numa_node != NUMA_NO_NODE))
return ERR_PTR(-EINVAL);
if (attr->value_size > KMALLOC_MAX_SIZE)
@@ -77,7 +79,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
return ERR_PTR(-ENOMEM);
/* allocate all map elements and zero-initialize them */
- array = bpf_map_area_alloc(array_size);
+ array = bpf_map_area_alloc(array_size, numa_node);
if (!array)
return ERR_PTR(-ENOMEM);
@@ -87,6 +89,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
array->map.value_size = attr->value_size;
array->map.max_entries = attr->max_entries;
array->map.map_flags = attr->map_flags;
+ array->map.numa_node = numa_node;
array->elem_size = elem_size;
if (!percpu)
@@ -603,6 +606,31 @@ static void *array_of_map_lookup_elem(struct bpf_map *map, void *key)
return READ_ONCE(*inner_map);
}
+static u32 array_of_map_gen_lookup(struct bpf_map *map,
+ struct bpf_insn *insn_buf)
+{
+ u32 elem_size = round_up(map->value_size, 8);
+ struct bpf_insn *insn = insn_buf;
+ const int ret = BPF_REG_0;
+ const int map_ptr = BPF_REG_1;
+ const int index = BPF_REG_2;
+
+ *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value));
+ *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0);
+ *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5);
+ if (is_power_of_2(elem_size))
+ *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size));
+ else
+ *insn++ = BPF_ALU64_IMM(BPF_MUL, ret, elem_size);
+ *insn++ = BPF_ALU64_REG(BPF_ADD, ret, map_ptr);
+ *insn++ = BPF_LDX_MEM(BPF_DW, ret, ret, 0);
+ *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 1);
+ *insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
+ *insn++ = BPF_MOV64_IMM(ret, 0);
+
+ return insn - insn_buf;
+}
+
const struct bpf_map_ops array_of_maps_map_ops = {
.map_alloc = array_of_map_alloc,
.map_free = array_of_map_free,
@@ -612,4 +640,5 @@ const struct bpf_map_ops array_of_maps_map_ops = {
.map_fd_get_ptr = bpf_map_fd_get_ptr,
.map_fd_put_ptr = bpf_map_fd_put_ptr,
.map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem,
+ .map_gen_lookup = array_of_map_gen_lookup,
};
diff --git a/kernel/bpf/bpf_lru_list.h b/kernel/bpf/bpf_lru_list.h
index 5c35a98..7d4f89b 100644
--- a/kernel/bpf/bpf_lru_list.h
+++ b/kernel/bpf/bpf_lru_list.h
@@ -69,7 +69,8 @@ static inline void bpf_lru_node_set_ref(struct bpf_lru_node *node)
/* ref is an approximation on access frequency. It does not
* have to be very accurate. Hence, no protection is used.
*/
- node->ref = 1;
+ if (!node->ref)
+ node->ref = 1;
}
int bpf_lru_init(struct bpf_lru *lru, bool percpu, u32 hash_offset,
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index ad5f559..917cc04 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -595,9 +595,13 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from,
case BPF_JMP | BPF_JEQ | BPF_K:
case BPF_JMP | BPF_JNE | BPF_K:
case BPF_JMP | BPF_JGT | BPF_K:
+ case BPF_JMP | BPF_JLT | BPF_K:
case BPF_JMP | BPF_JGE | BPF_K:
+ case BPF_JMP | BPF_JLE | BPF_K:
case BPF_JMP | BPF_JSGT | BPF_K:
+ case BPF_JMP | BPF_JSLT | BPF_K:
case BPF_JMP | BPF_JSGE | BPF_K:
+ case BPF_JMP | BPF_JSLE | BPF_K:
case BPF_JMP | BPF_JSET | BPF_K:
/* Accommodate for extra offset in case of a backjump. */
off = from->off;
@@ -833,12 +837,20 @@ static unsigned int ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn,
[BPF_JMP | BPF_JNE | BPF_K] = &&JMP_JNE_K,
[BPF_JMP | BPF_JGT | BPF_X] = &&JMP_JGT_X,
[BPF_JMP | BPF_JGT | BPF_K] = &&JMP_JGT_K,
+ [BPF_JMP | BPF_JLT | BPF_X] = &&JMP_JLT_X,
+ [BPF_JMP | BPF_JLT | BPF_K] = &&JMP_JLT_K,
[BPF_JMP | BPF_JGE | BPF_X] = &&JMP_JGE_X,
[BPF_JMP | BPF_JGE | BPF_K] = &&JMP_JGE_K,
+ [BPF_JMP | BPF_JLE | BPF_X] = &&JMP_JLE_X,
+ [BPF_JMP | BPF_JLE | BPF_K] = &&JMP_JLE_K,
[BPF_JMP | BPF_JSGT | BPF_X] = &&JMP_JSGT_X,
[BPF_JMP | BPF_JSGT | BPF_K] = &&JMP_JSGT_K,
+ [BPF_JMP | BPF_JSLT | BPF_X] = &&JMP_JSLT_X,
+ [BPF_JMP | BPF_JSLT | BPF_K] = &&JMP_JSLT_K,
[BPF_JMP | BPF_JSGE | BPF_X] = &&JMP_JSGE_X,
[BPF_JMP | BPF_JSGE | BPF_K] = &&JMP_JSGE_K,
+ [BPF_JMP | BPF_JSLE | BPF_X] = &&JMP_JSLE_X,
+ [BPF_JMP | BPF_JSLE | BPF_K] = &&JMP_JSLE_K,
[BPF_JMP | BPF_JSET | BPF_X] = &&JMP_JSET_X,
[BPF_JMP | BPF_JSET | BPF_K] = &&JMP_JSET_K,
/* Program return */
@@ -1073,6 +1085,18 @@ out:
CONT_JMP;
}
CONT;
+ JMP_JLT_X:
+ if (DST < SRC) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JLT_K:
+ if (DST < IMM) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
JMP_JGE_X:
if (DST >= SRC) {
insn += insn->off;
@@ -1085,6 +1109,18 @@ out:
CONT_JMP;
}
CONT;
+ JMP_JLE_X:
+ if (DST <= SRC) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JLE_K:
+ if (DST <= IMM) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
JMP_JSGT_X:
if (((s64) DST) > ((s64) SRC)) {
insn += insn->off;
@@ -1097,6 +1133,18 @@ out:
CONT_JMP;
}
CONT;
+ JMP_JSLT_X:
+ if (((s64) DST) < ((s64) SRC)) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JSLT_K:
+ if (((s64) DST) < ((s64) IMM)) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
JMP_JSGE_X:
if (((s64) DST) >= ((s64) SRC)) {
insn += insn->off;
@@ -1109,6 +1157,18 @@ out:
CONT_JMP;
}
CONT;
+ JMP_JSLE_X:
+ if (((s64) DST) <= ((s64) SRC)) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JSLE_K:
+ if (((s64) DST) <= ((s64) IMM)) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
JMP_JSET_X:
if (DST & SRC) {
insn += insn->off;
@@ -1378,6 +1438,7 @@ const struct bpf_func_proto bpf_ktime_get_ns_proto __weak;
const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak;
const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak;
const struct bpf_func_proto bpf_get_current_comm_proto __weak;
+const struct bpf_func_proto bpf_sock_map_update_proto __weak;
const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void)
{
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
new file mode 100644
index 0000000..ecf9f99
--- /dev/null
+++ b/kernel/bpf/devmap.c
@@ -0,0 +1,409 @@
+/* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+/* Devmaps primary use is as a backend map for XDP BPF helper call
+ * bpf_redirect_map(). Because XDP is mostly concerned with performance we
+ * spent some effort to ensure the datapath with redirect maps does not use
+ * any locking. This is a quick note on the details.
+ *
+ * We have three possible paths to get into the devmap control plane bpf
+ * syscalls, bpf programs, and driver side xmit/flush operations. A bpf syscall
+ * will invoke an update, delete, or lookup operation. To ensure updates and
+ * deletes appear atomic from the datapath side xchg() is used to modify the
+ * netdev_map array. Then because the datapath does a lookup into the netdev_map
+ * array (read-only) from an RCU critical section we use call_rcu() to wait for
+ * an rcu grace period before free'ing the old data structures. This ensures the
+ * datapath always has a valid copy. However, the datapath does a "flush"
+ * operation that pushes any pending packets in the driver outside the RCU
+ * critical section. Each bpf_dtab_netdev tracks these pending operations using
+ * an atomic per-cpu bitmap. The bpf_dtab_netdev object will not be destroyed
+ * until all bits are cleared indicating outstanding flush operations have
+ * completed.
+ *
+ * BPF syscalls may race with BPF program calls on any of the update, delete
+ * or lookup operations. As noted above the xchg() operation also keep the
+ * netdev_map consistent in this case. From the devmap side BPF programs
+ * calling into these operations are the same as multiple user space threads
+ * making system calls.
+ *
+ * Finally, any of the above may race with a netdev_unregister notifier. The
+ * unregister notifier must search for net devices in the map structure that
+ * contain a reference to the net device and remove them. This is a two step
+ * process (a) dereference the bpf_dtab_netdev object in netdev_map and (b)
+ * check to see if the ifindex is the same as the net_device being removed.
+ * When removing the dev a cmpxchg() is used to ensure the correct dev is
+ * removed, in the case of a concurrent update or delete operation it is
+ * possible that the initially referenced dev is no longer in the map. As the
+ * notifier hook walks the map we know that new dev references can not be
+ * added by the user because core infrastructure ensures dev_get_by_index()
+ * calls will fail at this point.
+ */
+#include <linux/bpf.h>
+#include <linux/filter.h>
+
+struct bpf_dtab_netdev {
+ struct net_device *dev;
+ struct bpf_dtab *dtab;
+ unsigned int bit;
+ struct rcu_head rcu;
+};
+
+struct bpf_dtab {
+ struct bpf_map map;
+ struct bpf_dtab_netdev **netdev_map;
+ unsigned long __percpu *flush_needed;
+ struct list_head list;
+};
+
+static DEFINE_SPINLOCK(dev_map_lock);
+static LIST_HEAD(dev_map_list);
+
+static u64 dev_map_bitmap_size(const union bpf_attr *attr)
+{
+ return BITS_TO_LONGS(attr->max_entries) * sizeof(unsigned long);
+}
+
+static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
+{
+ struct bpf_dtab *dtab;
+ u64 cost;
+ int err;
+
+ /* check sanity of attributes */
+ if (attr->max_entries == 0 || attr->key_size != 4 ||
+ attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE)
+ return ERR_PTR(-EINVAL);
+
+ dtab = kzalloc(sizeof(*dtab), GFP_USER);
+ if (!dtab)
+ return ERR_PTR(-ENOMEM);
+
+ /* mandatory map attributes */
+ dtab->map.map_type = attr->map_type;
+ dtab->map.key_size = attr->key_size;
+ dtab->map.value_size = attr->value_size;
+ dtab->map.max_entries = attr->max_entries;
+ dtab->map.map_flags = attr->map_flags;
+ dtab->map.numa_node = bpf_map_attr_numa_node(attr);
+
+ /* make sure page count doesn't overflow */
+ cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *);
+ cost += dev_map_bitmap_size(attr) * num_possible_cpus();
+ if (cost >= U32_MAX - PAGE_SIZE)
+ goto free_dtab;
+
+ dtab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
+
+ /* if map size is larger than memlock limit, reject it early */
+ err = bpf_map_precharge_memlock(dtab->map.pages);
+ if (err)
+ goto free_dtab;
+
+ /* A per cpu bitfield with a bit per possible net device */
+ dtab->flush_needed = __alloc_percpu(dev_map_bitmap_size(attr),
+ __alignof__(unsigned long));
+ if (!dtab->flush_needed)
+ goto free_dtab;
+
+ dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries *
+ sizeof(struct bpf_dtab_netdev *),
+ dtab->map.numa_node);
+ if (!dtab->netdev_map)
+ goto free_dtab;
+
+ spin_lock(&dev_map_lock);
+ list_add_tail_rcu(&dtab->list, &dev_map_list);
+ spin_unlock(&dev_map_lock);
+
+ return &dtab->map;
+free_dtab:
+ free_percpu(dtab->flush_needed);
+ kfree(dtab);
+ return ERR_PTR(-ENOMEM);
+}
+
+static void dev_map_free(struct bpf_map *map)
+{
+ struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+ int i, cpu;
+
+ /* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
+ * so the programs (can be more than one that used this map) were
+ * disconnected from events. Wait for outstanding critical sections in
+ * these programs to complete. The rcu critical section only guarantees
+ * no further reads against netdev_map. It does __not__ ensure pending
+ * flush operations (if any) are complete.
+ */
+
+ spin_lock(&dev_map_lock);
+ list_del_rcu(&dtab->list);
+ spin_unlock(&dev_map_lock);
+
+ synchronize_rcu();
+
+ /* To ensure all pending flush operations have completed wait for flush
+ * bitmap to indicate all flush_needed bits to be zero on _all_ cpus.
+ * Because the above synchronize_rcu() ensures the map is disconnected
+ * from the program we can assume no new bits will be set.
+ */
+ for_each_online_cpu(cpu) {
+ unsigned long *bitmap = per_cpu_ptr(dtab->flush_needed, cpu);
+
+ while (!bitmap_empty(bitmap, dtab->map.max_entries))
+ cpu_relax();
+ }
+
+ for (i = 0; i < dtab->map.max_entries; i++) {
+ struct bpf_dtab_netdev *dev;
+
+ dev = dtab->netdev_map[i];
+ if (!dev)
+ continue;
+
+ dev_put(dev->dev);
+ kfree(dev);
+ }
+
+ free_percpu(dtab->flush_needed);
+ bpf_map_area_free(dtab->netdev_map);
+ kfree(dtab);
+}
+
+static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+ struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+ u32 index = key ? *(u32 *)key : U32_MAX;
+ u32 *next = next_key;
+
+ if (index >= dtab->map.max_entries) {
+ *next = 0;
+ return 0;
+ }
+
+ if (index == dtab->map.max_entries - 1)
+ return -ENOENT;
+ *next = index + 1;
+ return 0;
+}
+
+void __dev_map_insert_ctx(struct bpf_map *map, u32 bit)
+{
+ struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+ unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed);
+
+ __set_bit(bit, bitmap);
+}
+
+/* __dev_map_flush is called from xdp_do_flush_map() which _must_ be signaled
+ * from the driver before returning from its napi->poll() routine. The poll()
+ * routine is called either from busy_poll context or net_rx_action signaled
+ * from NET_RX_SOFTIRQ. Either way the poll routine must complete before the
+ * net device can be torn down. On devmap tear down we ensure the ctx bitmap
+ * is zeroed before completing to ensure all flush operations have completed.
+ */
+void __dev_map_flush(struct bpf_map *map)
+{
+ struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+ unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed);
+ u32 bit;
+
+ for_each_set_bit(bit, bitmap, map->max_entries) {
+ struct bpf_dtab_netdev *dev = READ_ONCE(dtab->netdev_map[bit]);
+ struct net_device *netdev;
+
+ /* This is possible if the dev entry is removed by user space
+ * between xdp redirect and flush op.
+ */
+ if (unlikely(!dev))
+ continue;
+
+ __clear_bit(bit, bitmap);
+ netdev = dev->dev;
+ if (likely(netdev->netdev_ops->ndo_xdp_flush))
+ netdev->netdev_ops->ndo_xdp_flush(netdev);
+ }
+}
+
+/* rcu_read_lock (from syscall and BPF contexts) ensures that if a delete and/or
+ * update happens in parallel here a dev_put wont happen until after reading the
+ * ifindex.
+ */
+struct net_device *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
+{
+ struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+ struct bpf_dtab_netdev *dev;
+
+ if (key >= map->max_entries)
+ return NULL;
+
+ dev = READ_ONCE(dtab->netdev_map[key]);
+ return dev ? dev->dev : NULL;
+}
+
+static void *dev_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ struct net_device *dev = __dev_map_lookup_elem(map, *(u32 *)key);
+
+ return dev ? &dev->ifindex : NULL;
+}
+
+static void dev_map_flush_old(struct bpf_dtab_netdev *dev)
+{
+ if (dev->dev->netdev_ops->ndo_xdp_flush) {
+ struct net_device *fl = dev->dev;
+ unsigned long *bitmap;
+ int cpu;
+
+ for_each_online_cpu(cpu) {
+ bitmap = per_cpu_ptr(dev->dtab->flush_needed, cpu);
+ __clear_bit(dev->bit, bitmap);
+
+ fl->netdev_ops->ndo_xdp_flush(dev->dev);
+ }
+ }
+}
+
+static void __dev_map_entry_free(struct rcu_head *rcu)
+{
+ struct bpf_dtab_netdev *dev;
+
+ dev = container_of(rcu, struct bpf_dtab_netdev, rcu);
+ dev_map_flush_old(dev);
+ dev_put(dev->dev);
+ kfree(dev);
+}
+
+static int dev_map_delete_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+ struct bpf_dtab_netdev *old_dev;
+ int k = *(u32 *)key;
+
+ if (k >= map->max_entries)
+ return -EINVAL;
+
+ /* Use call_rcu() here to ensure any rcu critical sections have
+ * completed, but this does not guarantee a flush has happened
+ * yet. Because driver side rcu_read_lock/unlock only protects the
+ * running XDP program. However, for pending flush operations the
+ * dev and ctx are stored in another per cpu map. And additionally,
+ * the driver tear down ensures all soft irqs are complete before
+ * removing the net device in the case of dev_put equals zero.
+ */
+ old_dev = xchg(&dtab->netdev_map[k], NULL);
+ if (old_dev)
+ call_rcu(&old_dev->rcu, __dev_map_entry_free);
+ return 0;
+}
+
+static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
+{
+ struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+ struct net *net = current->nsproxy->net_ns;
+ struct bpf_dtab_netdev *dev, *old_dev;
+ u32 i = *(u32 *)key;
+ u32 ifindex = *(u32 *)value;
+
+ if (unlikely(map_flags > BPF_EXIST))
+ return -EINVAL;
+ if (unlikely(i >= dtab->map.max_entries))
+ return -E2BIG;
+ if (unlikely(map_flags == BPF_NOEXIST))
+ return -EEXIST;
+
+ if (!ifindex) {
+ dev = NULL;
+ } else {
+ dev = kmalloc_node(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN,
+ map->numa_node);
+ if (!dev)
+ return -ENOMEM;
+
+ dev->dev = dev_get_by_index(net, ifindex);
+ if (!dev->dev) {
+ kfree(dev);
+ return -EINVAL;
+ }
+
+ dev->bit = i;
+ dev->dtab = dtab;
+ }
+
+ /* Use call_rcu() here to ensure rcu critical sections have completed
+ * Remembering the driver side flush operation will happen before the
+ * net device is removed.
+ */
+ old_dev = xchg(&dtab->netdev_map[i], dev);
+ if (old_dev)
+ call_rcu(&old_dev->rcu, __dev_map_entry_free);
+
+ return 0;
+}
+
+const struct bpf_map_ops dev_map_ops = {
+ .map_alloc = dev_map_alloc,
+ .map_free = dev_map_free,
+ .map_get_next_key = dev_map_get_next_key,
+ .map_lookup_elem = dev_map_lookup_elem,
+ .map_update_elem = dev_map_update_elem,
+ .map_delete_elem = dev_map_delete_elem,
+};
+
+static int dev_map_notification(struct notifier_block *notifier,
+ ulong event, void *ptr)
+{
+ struct net_device *netdev = netdev_notifier_info_to_dev(ptr);
+ struct bpf_dtab *dtab;
+ int i;
+
+ switch (event) {
+ case NETDEV_UNREGISTER:
+ /* This rcu_read_lock/unlock pair is needed because
+ * dev_map_list is an RCU list AND to ensure a delete
+ * operation does not free a netdev_map entry while we
+ * are comparing it against the netdev being unregistered.
+ */
+ rcu_read_lock();
+ list_for_each_entry_rcu(dtab, &dev_map_list, list) {
+ for (i = 0; i < dtab->map.max_entries; i++) {
+ struct bpf_dtab_netdev *dev, *odev;
+
+ dev = READ_ONCE(dtab->netdev_map[i]);
+ if (!dev ||
+ dev->dev->ifindex != netdev->ifindex)
+ continue;
+ odev = cmpxchg(&dtab->netdev_map[i], dev, NULL);
+ if (dev == odev)
+ call_rcu(&dev->rcu,
+ __dev_map_entry_free);
+ }
+ }
+ rcu_read_unlock();
+ break;
+ default:
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block dev_map_notifier = {
+ .notifier_call = dev_map_notification,
+};
+
+static int __init dev_map_init(void)
+{
+ register_netdevice_notifier(&dev_map_notifier);
+ return 0;
+}
+
+subsys_initcall(dev_map_init);
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 4fb4631..431126f 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -18,6 +18,9 @@
#include "bpf_lru_list.h"
#include "map_in_map.h"
+#define HTAB_CREATE_FLAG_MASK \
+ (BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE)
+
struct bucket {
struct hlist_nulls_head head;
raw_spinlock_t lock;
@@ -138,7 +141,8 @@ static int prealloc_init(struct bpf_htab *htab)
if (!htab_is_percpu(htab) && !htab_is_lru(htab))
num_entries += num_possible_cpus();
- htab->elems = bpf_map_area_alloc(htab->elem_size * num_entries);
+ htab->elems = bpf_map_area_alloc(htab->elem_size * num_entries,
+ htab->map.numa_node);
if (!htab->elems)
return -ENOMEM;
@@ -233,6 +237,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
*/
bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
+ int numa_node = bpf_map_attr_numa_node(attr);
struct bpf_htab *htab;
int err, i;
u64 cost;
@@ -248,7 +253,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
*/
return ERR_PTR(-EPERM);
- if (attr->map_flags & ~(BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU))
+ if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK)
/* reserved bits should not be used */
return ERR_PTR(-EINVAL);
@@ -258,6 +263,9 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
if (lru && !prealloc)
return ERR_PTR(-ENOTSUPP);
+ if (numa_node != NUMA_NO_NODE && (percpu || percpu_lru))
+ return ERR_PTR(-EINVAL);
+
htab = kzalloc(sizeof(*htab), GFP_USER);
if (!htab)
return ERR_PTR(-ENOMEM);
@@ -268,6 +276,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
htab->map.value_size = attr->value_size;
htab->map.max_entries = attr->max_entries;
htab->map.map_flags = attr->map_flags;
+ htab->map.numa_node = numa_node;
/* check sanity of attributes.
* value_size == 0 may be allowed in the future to use map as a set
@@ -346,7 +355,8 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
err = -ENOMEM;
htab->buckets = bpf_map_area_alloc(htab->n_buckets *
- sizeof(struct bucket));
+ sizeof(struct bucket),
+ htab->map.numa_node);
if (!htab->buckets)
goto free_htab;
@@ -504,6 +514,29 @@ static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key)
return NULL;
}
+static u32 htab_lru_map_gen_lookup(struct bpf_map *map,
+ struct bpf_insn *insn_buf)
+{
+ struct bpf_insn *insn = insn_buf;
+ const int ret = BPF_REG_0;
+ const int ref_reg = BPF_REG_1;
+
+ *insn++ = BPF_EMIT_CALL((u64 (*)(u64, u64, u64, u64, u64))__htab_map_lookup_elem);
+ *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 4);
+ *insn++ = BPF_LDX_MEM(BPF_B, ref_reg, ret,
+ offsetof(struct htab_elem, lru_node) +
+ offsetof(struct bpf_lru_node, ref));
+ *insn++ = BPF_JMP_IMM(BPF_JNE, ref_reg, 0, 1);
+ *insn++ = BPF_ST_MEM(BPF_B, ret,
+ offsetof(struct htab_elem, lru_node) +
+ offsetof(struct bpf_lru_node, ref),
+ 1);
+ *insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
+ offsetof(struct htab_elem, key) +
+ round_up(map->key_size, 8));
+ return insn - insn_buf;
+}
+
/* It is called from the bpf_lru_list when the LRU needs to delete
* older elements from the htab.
*/
@@ -652,12 +685,27 @@ static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr,
}
}
+static bool fd_htab_map_needs_adjust(const struct bpf_htab *htab)
+{
+ return htab->map.map_type == BPF_MAP_TYPE_HASH_OF_MAPS &&
+ BITS_PER_LONG == 64;
+}
+
+static u32 htab_size_value(const struct bpf_htab *htab, bool percpu)
+{
+ u32 size = htab->map.value_size;
+
+ if (percpu || fd_htab_map_needs_adjust(htab))
+ size = round_up(size, 8);
+ return size;
+}
+
static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
void *value, u32 key_size, u32 hash,
bool percpu, bool onallcpus,
struct htab_elem *old_elem)
{
- u32 size = htab->map.value_size;
+ u32 size = htab_size_value(htab, percpu);
bool prealloc = htab_is_prealloc(htab);
struct htab_elem *l_new, **pl_new;
void __percpu *pptr;
@@ -689,16 +737,14 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
atomic_dec(&htab->count);
return ERR_PTR(-E2BIG);
}
- l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN);
+ l_new = kmalloc_node(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN,
+ htab->map.numa_node);
if (!l_new)
return ERR_PTR(-ENOMEM);
}
memcpy(l_new->key, key, key_size);
if (percpu) {
- /* round up value_size to 8 bytes */
- size = round_up(size, 8);
-
if (prealloc) {
pptr = htab_elem_get_ptr(l_new, key_size);
} else {
@@ -1114,6 +1160,7 @@ const struct bpf_map_ops htab_lru_map_ops = {
.map_lookup_elem = htab_lru_map_lookup_elem,
.map_update_elem = htab_lru_map_update_elem,
.map_delete_elem = htab_lru_map_delete_elem,
+ .map_gen_lookup = htab_lru_map_gen_lookup,
};
/* Called from eBPF program */
@@ -1209,17 +1256,9 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = {
static struct bpf_map *fd_htab_map_alloc(union bpf_attr *attr)
{
- struct bpf_map *map;
-
if (attr->value_size != sizeof(u32))
return ERR_PTR(-EINVAL);
-
- /* pointer is stored internally */
- attr->value_size = sizeof(void *);
- map = htab_map_alloc(attr);
- attr->value_size = sizeof(u32);
-
- return map;
+ return htab_map_alloc(attr);
}
static void fd_htab_map_free(struct bpf_map *map)
@@ -1311,6 +1350,22 @@ static void *htab_of_map_lookup_elem(struct bpf_map *map, void *key)
return READ_ONCE(*inner_map);
}
+static u32 htab_of_map_gen_lookup(struct bpf_map *map,
+ struct bpf_insn *insn_buf)
+{
+ struct bpf_insn *insn = insn_buf;
+ const int ret = BPF_REG_0;
+
+ *insn++ = BPF_EMIT_CALL((u64 (*)(u64, u64, u64, u64, u64))__htab_map_lookup_elem);
+ *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 2);
+ *insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
+ offsetof(struct htab_elem, key) +
+ round_up(map->key_size, 8));
+ *insn++ = BPF_LDX_MEM(BPF_DW, ret, ret, 0);
+
+ return insn - insn_buf;
+}
+
static void htab_of_map_free(struct bpf_map *map)
{
bpf_map_meta_free(map->inner_map_meta);
@@ -1326,4 +1381,5 @@ const struct bpf_map_ops htab_of_maps_map_ops = {
.map_fd_get_ptr = bpf_map_fd_get_ptr,
.map_fd_put_ptr = bpf_map_fd_put_ptr,
.map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem,
+ .map_gen_lookup = htab_of_map_gen_lookup,
};
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index b09185f..1b76784 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -244,7 +244,8 @@ static struct lpm_trie_node *lpm_trie_node_alloc(const struct lpm_trie *trie,
if (value)
size += trie->map.value_size;
- node = kmalloc(size, GFP_ATOMIC | __GFP_NOWARN);
+ node = kmalloc_node(size, GFP_ATOMIC | __GFP_NOWARN,
+ trie->map.numa_node);
if (!node)
return NULL;
@@ -405,6 +406,8 @@ static int trie_delete_elem(struct bpf_map *map, void *key)
#define LPM_KEY_SIZE_MAX LPM_KEY_SIZE(LPM_DATA_SIZE_MAX)
#define LPM_KEY_SIZE_MIN LPM_KEY_SIZE(LPM_DATA_SIZE_MIN)
+#define LPM_CREATE_FLAG_MASK (BPF_F_NO_PREALLOC | BPF_F_NUMA_NODE)
+
static struct bpf_map *trie_alloc(union bpf_attr *attr)
{
struct lpm_trie *trie;
@@ -416,7 +419,8 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr)
/* check sanity of attributes */
if (attr->max_entries == 0 ||
- attr->map_flags != BPF_F_NO_PREALLOC ||
+ !(attr->map_flags & BPF_F_NO_PREALLOC) ||
+ attr->map_flags & ~LPM_CREATE_FLAG_MASK ||
attr->key_size < LPM_KEY_SIZE_MIN ||
attr->key_size > LPM_KEY_SIZE_MAX ||
attr->value_size < LPM_VAL_SIZE_MIN ||
@@ -433,6 +437,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr)
trie->map.value_size = attr->value_size;
trie->map.max_entries = attr->max_entries;
trie->map.map_flags = attr->map_flags;
+ trie->map.numa_node = bpf_map_attr_numa_node(attr);
trie->data_size = attr->key_size -
offsetof(struct bpf_lpm_trie_key, data);
trie->max_prefixlen = trie->data_size * 8;
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
new file mode 100644
index 0000000..f6ffde9
--- /dev/null
+++ b/kernel/bpf/sockmap.c
@@ -0,0 +1,873 @@
+/* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+/* A BPF sock_map is used to store sock objects. This is primarly used
+ * for doing socket redirect with BPF helper routines.
+ *
+ * A sock map may have BPF programs attached to it, currently a program
+ * used to parse packets and a program to provide a verdict and redirect
+ * decision on the packet are supported. Any programs attached to a sock
+ * map are inherited by sock objects when they are added to the map. If
+ * no BPF programs are attached the sock object may only be used for sock
+ * redirect.
+ *
+ * A sock object may be in multiple maps, but can only inherit a single
+ * parse or verdict program. If adding a sock object to a map would result
+ * in having multiple parsing programs the update will return an EBUSY error.
+ *
+ * For reference this program is similar to devmap used in XDP context
+ * reviewing these together may be useful. For an example please review
+ * ./samples/bpf/sockmap/.
+ */
+#include <linux/bpf.h>
+#include <net/sock.h>
+#include <linux/filter.h>
+#include <linux/errno.h>
+#include <linux/file.h>
+#include <linux/kernel.h>
+#include <linux/net.h>
+#include <linux/skbuff.h>
+#include <linux/workqueue.h>
+#include <linux/list.h>
+#include <net/strparser.h>
+
+struct bpf_stab {
+ struct bpf_map map;
+ struct sock **sock_map;
+ struct bpf_prog *bpf_parse;
+ struct bpf_prog *bpf_verdict;
+};
+
+enum smap_psock_state {
+ SMAP_TX_RUNNING,
+};
+
+struct smap_psock_map_entry {
+ struct list_head list;
+ struct sock **entry;
+};
+
+struct smap_psock {
+ struct rcu_head rcu;
+ /* refcnt is used inside sk_callback_lock */
+ u32 refcnt;
+
+ /* datapath variables */
+ struct sk_buff_head rxqueue;
+ bool strp_enabled;
+
+ /* datapath error path cache across tx work invocations */
+ int save_rem;
+ int save_off;
+ struct sk_buff *save_skb;
+
+ struct strparser strp;
+ struct bpf_prog *bpf_parse;
+ struct bpf_prog *bpf_verdict;
+ struct list_head maps;
+
+ /* Back reference used when sock callback trigger sockmap operations */
+ struct sock *sock;
+ unsigned long state;
+
+ struct work_struct tx_work;
+ struct work_struct gc_work;
+
+ void (*save_data_ready)(struct sock *sk);
+ void (*save_write_space)(struct sock *sk);
+ void (*save_state_change)(struct sock *sk);
+};
+
+static inline struct smap_psock *smap_psock_sk(const struct sock *sk)
+{
+ return rcu_dereference_sk_user_data(sk);
+}
+
+static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb)
+{
+ struct bpf_prog *prog = READ_ONCE(psock->bpf_verdict);
+ int rc;
+
+ if (unlikely(!prog))
+ return SK_DROP;
+
+ skb_orphan(skb);
+ skb->sk = psock->sock;
+ bpf_compute_data_end(skb);
+ rc = (*prog->bpf_func)(skb, prog->insnsi);
+ skb->sk = NULL;
+
+ return rc;
+}
+
+static void smap_do_verdict(struct smap_psock *psock, struct sk_buff *skb)
+{
+ struct sock *sk;
+ int rc;
+
+ /* Because we use per cpu values to feed input from sock redirect
+ * in BPF program to do_sk_redirect_map() call we need to ensure we
+ * are not preempted. RCU read lock is not sufficient in this case
+ * with CONFIG_PREEMPT_RCU enabled so we must be explicit here.
+ */
+ preempt_disable();
+ rc = smap_verdict_func(psock, skb);
+ switch (rc) {
+ case SK_REDIRECT:
+ sk = do_sk_redirect_map();
+ preempt_enable();
+ if (likely(sk)) {
+ struct smap_psock *peer = smap_psock_sk(sk);
+
+ if (likely(peer &&
+ test_bit(SMAP_TX_RUNNING, &peer->state) &&
+ !sock_flag(sk, SOCK_DEAD) &&
+ sock_writeable(sk))) {
+ skb_set_owner_w(skb, sk);
+ skb_queue_tail(&peer->rxqueue, skb);
+ schedule_work(&peer->tx_work);
+ break;
+ }
+ }
+ /* Fall through and free skb otherwise */
+ case SK_DROP:
+ default:
+ if (rc != SK_REDIRECT)
+ preempt_enable();
+ kfree_skb(skb);
+ }
+}
+
+static void smap_report_sk_error(struct smap_psock *psock, int err)
+{
+ struct sock *sk = psock->sock;
+
+ sk->sk_err = err;
+ sk->sk_error_report(sk);
+}
+
+static void smap_release_sock(struct smap_psock *psock, struct sock *sock);
+
+/* Called with lock_sock(sk) held */
+static void smap_state_change(struct sock *sk)
+{
+ struct smap_psock_map_entry *e, *tmp;
+ struct smap_psock *psock;
+ struct socket_wq *wq;
+ struct sock *osk;
+
+ rcu_read_lock();
+
+ /* Allowing transitions into an established syn_recv states allows
+ * for early binding sockets to a smap object before the connection
+ * is established.
+ */
+ switch (sk->sk_state) {
+ case TCP_SYN_SENT:
+ case TCP_SYN_RECV:
+ case TCP_ESTABLISHED:
+ break;
+ case TCP_CLOSE_WAIT:
+ case TCP_CLOSING:
+ case TCP_LAST_ACK:
+ case TCP_FIN_WAIT1:
+ case TCP_FIN_WAIT2:
+ case TCP_LISTEN:
+ break;
+ case TCP_CLOSE:
+ /* Only release if the map entry is in fact the sock in
+ * question. There is a case where the operator deletes
+ * the sock from the map, but the TCP sock is closed before
+ * the psock is detached. Use cmpxchg to verify correct
+ * sock is removed.
+ */
+ psock = smap_psock_sk(sk);
+ if (unlikely(!psock))
+ break;
+ write_lock_bh(&sk->sk_callback_lock);
+ list_for_each_entry_safe(e, tmp, &psock->maps, list) {
+ osk = cmpxchg(e->entry, sk, NULL);
+ if (osk == sk) {
+ list_del(&e->list);
+ smap_release_sock(psock, sk);
+ }
+ }
+ write_unlock_bh(&sk->sk_callback_lock);
+ break;
+ default:
+ psock = smap_psock_sk(sk);
+ if (unlikely(!psock))
+ break;
+ smap_report_sk_error(psock, EPIPE);
+ break;
+ }
+
+ wq = rcu_dereference(sk->sk_wq);
+ if (skwq_has_sleeper(wq))
+ wake_up_interruptible_all(&wq->wait);
+ rcu_read_unlock();
+}
+
+static void smap_read_sock_strparser(struct strparser *strp,
+ struct sk_buff *skb)
+{
+ struct smap_psock *psock;
+
+ rcu_read_lock();
+ psock = container_of(strp, struct smap_psock, strp);
+ smap_do_verdict(psock, skb);
+ rcu_read_unlock();
+}
+
+/* Called with lock held on socket */
+static void smap_data_ready(struct sock *sk)
+{
+ struct smap_psock *psock;
+
+ rcu_read_lock();
+ psock = smap_psock_sk(sk);
+ if (likely(psock)) {
+ write_lock_bh(&sk->sk_callback_lock);
+ strp_data_ready(&psock->strp);
+ write_unlock_bh(&sk->sk_callback_lock);
+ }
+ rcu_read_unlock();
+}
+
+static void smap_tx_work(struct work_struct *w)
+{
+ struct smap_psock *psock;
+ struct sk_buff *skb;
+ int rem, off, n;
+
+ psock = container_of(w, struct smap_psock, tx_work);
+
+ /* lock sock to avoid losing sk_socket at some point during loop */
+ lock_sock(psock->sock);
+ if (psock->save_skb) {
+ skb = psock->save_skb;
+ rem = psock->save_rem;
+ off = psock->save_off;
+ psock->save_skb = NULL;
+ goto start;
+ }
+
+ while ((skb = skb_dequeue(&psock->rxqueue))) {
+ rem = skb->len;
+ off = 0;
+start:
+ do {
+ if (likely(psock->sock->sk_socket))
+ n = skb_send_sock_locked(psock->sock,
+ skb, off, rem);
+ else
+ n = -EINVAL;
+ if (n <= 0) {
+ if (n == -EAGAIN) {
+ /* Retry when space is available */
+ psock->save_skb = skb;
+ psock->save_rem = rem;
+ psock->save_off = off;
+ goto out;
+ }
+ /* Hard errors break pipe and stop xmit */
+ smap_report_sk_error(psock, n ? -n : EPIPE);
+ clear_bit(SMAP_TX_RUNNING, &psock->state);
+ kfree_skb(skb);
+ goto out;
+ }
+ rem -= n;
+ off += n;
+ } while (rem);
+ kfree_skb(skb);
+ }
+out:
+ release_sock(psock->sock);
+}
+
+static void smap_write_space(struct sock *sk)
+{
+ struct smap_psock *psock;
+
+ rcu_read_lock();
+ psock = smap_psock_sk(sk);
+ if (likely(psock && test_bit(SMAP_TX_RUNNING, &psock->state)))
+ schedule_work(&psock->tx_work);
+ rcu_read_unlock();
+}
+
+static void smap_stop_sock(struct smap_psock *psock, struct sock *sk)
+{
+ if (!psock->strp_enabled)
+ return;
+ sk->sk_data_ready = psock->save_data_ready;
+ sk->sk_write_space = psock->save_write_space;
+ sk->sk_state_change = psock->save_state_change;
+ psock->save_data_ready = NULL;
+ psock->save_write_space = NULL;
+ psock->save_state_change = NULL;
+ strp_stop(&psock->strp);
+ psock->strp_enabled = false;
+}
+
+static void smap_destroy_psock(struct rcu_head *rcu)
+{
+ struct smap_psock *psock = container_of(rcu,
+ struct smap_psock, rcu);
+
+ /* Now that a grace period has passed there is no longer
+ * any reference to this sock in the sockmap so we can
+ * destroy the psock, strparser, and bpf programs. But,
+ * because we use workqueue sync operations we can not
+ * do it in rcu context
+ */
+ schedule_work(&psock->gc_work);
+}
+
+static void smap_release_sock(struct smap_psock *psock, struct sock *sock)
+{
+ psock->refcnt--;
+ if (psock->refcnt)
+ return;
+
+ smap_stop_sock(psock, sock);
+ clear_bit(SMAP_TX_RUNNING, &psock->state);
+ rcu_assign_sk_user_data(sock, NULL);
+ call_rcu_sched(&psock->rcu, smap_destroy_psock);
+}
+
+static int smap_parse_func_strparser(struct strparser *strp,
+ struct sk_buff *skb)
+{
+ struct smap_psock *psock;
+ struct bpf_prog *prog;
+ int rc;
+
+ rcu_read_lock();
+ psock = container_of(strp, struct smap_psock, strp);
+ prog = READ_ONCE(psock->bpf_parse);
+
+ if (unlikely(!prog)) {
+ rcu_read_unlock();
+ return skb->len;
+ }
+
+ /* Attach socket for bpf program to use if needed we can do this
+ * because strparser clones the skb before handing it to a upper
+ * layer, meaning skb_orphan has been called. We NULL sk on the
+ * way out to ensure we don't trigger a BUG_ON in skb/sk operations
+ * later and because we are not charging the memory of this skb to
+ * any socket yet.
+ */
+ skb->sk = psock->sock;
+ bpf_compute_data_end(skb);
+ rc = (*prog->bpf_func)(skb, prog->insnsi);
+ skb->sk = NULL;
+ rcu_read_unlock();
+ return rc;
+}
+
+
+static int smap_read_sock_done(struct strparser *strp, int err)
+{
+ return err;
+}
+
+static int smap_init_sock(struct smap_psock *psock,
+ struct sock *sk)
+{
+ static const struct strp_callbacks cb = {
+ .rcv_msg = smap_read_sock_strparser,
+ .parse_msg = smap_parse_func_strparser,
+ .read_sock_done = smap_read_sock_done,
+ };
+
+ return strp_init(&psock->strp, sk, &cb);
+}
+
+static void smap_init_progs(struct smap_psock *psock,
+ struct bpf_stab *stab,
+ struct bpf_prog *verdict,
+ struct bpf_prog *parse)
+{
+ struct bpf_prog *orig_parse, *orig_verdict;
+
+ orig_parse = xchg(&psock->bpf_parse, parse);
+ orig_verdict = xchg(&psock->bpf_verdict, verdict);
+
+ if (orig_verdict)
+ bpf_prog_put(orig_verdict);
+ if (orig_parse)
+ bpf_prog_put(orig_parse);
+}
+
+static void smap_start_sock(struct smap_psock *psock, struct sock *sk)
+{
+ if (sk->sk_data_ready == smap_data_ready)
+ return;
+ psock->save_data_ready = sk->sk_data_ready;
+ psock->save_write_space = sk->sk_write_space;
+ psock->save_state_change = sk->sk_state_change;
+ sk->sk_data_ready = smap_data_ready;
+ sk->sk_write_space = smap_write_space;
+ sk->sk_state_change = smap_state_change;
+ psock->strp_enabled = true;
+}
+
+static void sock_map_remove_complete(struct bpf_stab *stab)
+{
+ bpf_map_area_free(stab->sock_map);
+ kfree(stab);
+}
+
+static void smap_gc_work(struct work_struct *w)
+{
+ struct smap_psock_map_entry *e, *tmp;
+ struct smap_psock *psock;
+
+ psock = container_of(w, struct smap_psock, gc_work);
+
+ /* no callback lock needed because we already detached sockmap ops */
+ if (psock->strp_enabled)
+ strp_done(&psock->strp);
+
+ cancel_work_sync(&psock->tx_work);
+ __skb_queue_purge(&psock->rxqueue);
+
+ /* At this point all strparser and xmit work must be complete */
+ if (psock->bpf_parse)
+ bpf_prog_put(psock->bpf_parse);
+ if (psock->bpf_verdict)
+ bpf_prog_put(psock->bpf_verdict);
+
+ list_for_each_entry_safe(e, tmp, &psock->maps, list) {
+ list_del(&e->list);
+ kfree(e);
+ }
+
+ sock_put(psock->sock);
+ kfree(psock);
+}
+
+static struct smap_psock *smap_init_psock(struct sock *sock,
+ struct bpf_stab *stab)
+{
+ struct smap_psock *psock;
+
+ psock = kzalloc_node(sizeof(struct smap_psock),
+ GFP_ATOMIC | __GFP_NOWARN,
+ stab->map.numa_node);
+ if (!psock)
+ return ERR_PTR(-ENOMEM);
+
+ psock->sock = sock;
+ skb_queue_head_init(&psock->rxqueue);
+ INIT_WORK(&psock->tx_work, smap_tx_work);
+ INIT_WORK(&psock->gc_work, smap_gc_work);
+ INIT_LIST_HEAD(&psock->maps);
+ psock->refcnt = 1;
+
+ rcu_assign_sk_user_data(sock, psock);
+ sock_hold(sock);
+ return psock;
+}
+
+static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
+{
+ struct bpf_stab *stab;
+ int err = -EINVAL;
+ u64 cost;
+
+ /* check sanity of attributes */
+ if (attr->max_entries == 0 || attr->key_size != 4 ||
+ attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE)
+ return ERR_PTR(-EINVAL);
+
+ if (attr->value_size > KMALLOC_MAX_SIZE)
+ return ERR_PTR(-E2BIG);
+
+ stab = kzalloc(sizeof(*stab), GFP_USER);
+ if (!stab)
+ return ERR_PTR(-ENOMEM);
+
+ /* mandatory map attributes */
+ stab->map.map_type = attr->map_type;
+ stab->map.key_size = attr->key_size;
+ stab->map.value_size = attr->value_size;
+ stab->map.max_entries = attr->max_entries;
+ stab->map.map_flags = attr->map_flags;
+ stab->map.numa_node = bpf_map_attr_numa_node(attr);
+
+ /* make sure page count doesn't overflow */
+ cost = (u64) stab->map.max_entries * sizeof(struct sock *);
+ if (cost >= U32_MAX - PAGE_SIZE)
+ goto free_stab;
+
+ stab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
+
+ /* if map size is larger than memlock limit, reject it early */
+ err = bpf_map_precharge_memlock(stab->map.pages);
+ if (err)
+ goto free_stab;
+
+ err = -ENOMEM;
+ stab->sock_map = bpf_map_area_alloc(stab->map.max_entries *
+ sizeof(struct sock *),
+ stab->map.numa_node);
+ if (!stab->sock_map)
+ goto free_stab;
+
+ return &stab->map;
+free_stab:
+ kfree(stab);
+ return ERR_PTR(err);
+}
+
+static void smap_list_remove(struct smap_psock *psock, struct sock **entry)
+{
+ struct smap_psock_map_entry *e, *tmp;
+
+ list_for_each_entry_safe(e, tmp, &psock->maps, list) {
+ if (e->entry == entry) {
+ list_del(&e->list);
+ break;
+ }
+ }
+}
+
+static void sock_map_free(struct bpf_map *map)
+{
+ struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+ int i;
+
+ synchronize_rcu();
+
+ /* At this point no update, lookup or delete operations can happen.
+ * However, be aware we can still get a socket state event updates,
+ * and data ready callabacks that reference the psock from sk_user_data
+ * Also psock worker threads are still in-flight. So smap_release_sock
+ * will only free the psock after cancel_sync on the worker threads
+ * and a grace period expire to ensure psock is really safe to remove.
+ */
+ rcu_read_lock();
+ for (i = 0; i < stab->map.max_entries; i++) {
+ struct smap_psock *psock;
+ struct sock *sock;
+
+ sock = xchg(&stab->sock_map[i], NULL);
+ if (!sock)
+ continue;
+
+ write_lock_bh(&sock->sk_callback_lock);
+ psock = smap_psock_sk(sock);
+ smap_list_remove(psock, &stab->sock_map[i]);
+ smap_release_sock(psock, sock);
+ write_unlock_bh(&sock->sk_callback_lock);
+ }
+ rcu_read_unlock();
+
+ if (stab->bpf_verdict)
+ bpf_prog_put(stab->bpf_verdict);
+ if (stab->bpf_parse)
+ bpf_prog_put(stab->bpf_parse);
+
+ sock_map_remove_complete(stab);
+}
+
+static int sock_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+ struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+ u32 i = key ? *(u32 *)key : U32_MAX;
+ u32 *next = (u32 *)next_key;
+
+ if (i >= stab->map.max_entries) {
+ *next = 0;
+ return 0;
+ }
+
+ if (i == stab->map.max_entries - 1)
+ return -ENOENT;
+
+ *next = i + 1;
+ return 0;
+}
+
+struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key)
+{
+ struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+
+ if (key >= map->max_entries)
+ return NULL;
+
+ return READ_ONCE(stab->sock_map[key]);
+}
+
+static int sock_map_delete_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+ struct smap_psock *psock;
+ int k = *(u32 *)key;
+ struct sock *sock;
+
+ if (k >= map->max_entries)
+ return -EINVAL;
+
+ sock = xchg(&stab->sock_map[k], NULL);
+ if (!sock)
+ return -EINVAL;
+
+ write_lock_bh(&sock->sk_callback_lock);
+ psock = smap_psock_sk(sock);
+ if (!psock)
+ goto out;
+
+ if (psock->bpf_parse)
+ smap_stop_sock(psock, sock);
+ smap_list_remove(psock, &stab->sock_map[k]);
+ smap_release_sock(psock, sock);
+out:
+ write_unlock_bh(&sock->sk_callback_lock);
+ return 0;
+}
+
+/* Locking notes: Concurrent updates, deletes, and lookups are allowed and are
+ * done inside rcu critical sections. This ensures on updates that the psock
+ * will not be released via smap_release_sock() until concurrent updates/deletes
+ * complete. All operations operate on sock_map using cmpxchg and xchg
+ * operations to ensure we do not get stale references. Any reads into the
+ * map must be done with READ_ONCE() because of this.
+ *
+ * A psock is destroyed via call_rcu and after any worker threads are cancelled
+ * and syncd so we are certain all references from the update/lookup/delete
+ * operations as well as references in the data path are no longer in use.
+ *
+ * Psocks may exist in multiple maps, but only a single set of parse/verdict
+ * programs may be inherited from the maps it belongs to. A reference count
+ * is kept with the total number of references to the psock from all maps. The
+ * psock will not be released until this reaches zero. The psock and sock
+ * user data data use the sk_callback_lock to protect critical data structures
+ * from concurrent access. This allows us to avoid two updates from modifying
+ * the user data in sock and the lock is required anyways for modifying
+ * callbacks, we simply increase its scope slightly.
+ *
+ * Rules to follow,
+ * - psock must always be read inside RCU critical section
+ * - sk_user_data must only be modified inside sk_callback_lock and read
+ * inside RCU critical section.
+ * - psock->maps list must only be read & modified inside sk_callback_lock
+ * - sock_map must use READ_ONCE and (cmp)xchg operations
+ * - BPF verdict/parse programs must use READ_ONCE and xchg operations
+ */
+static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
+ struct bpf_map *map,
+ void *key, u64 flags)
+{
+ struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+ struct smap_psock_map_entry *e = NULL;
+ struct bpf_prog *verdict, *parse;
+ struct sock *osock, *sock;
+ struct smap_psock *psock;
+ u32 i = *(u32 *)key;
+ int err;
+
+ if (unlikely(flags > BPF_EXIST))
+ return -EINVAL;
+
+ if (unlikely(i >= stab->map.max_entries))
+ return -E2BIG;
+
+ sock = READ_ONCE(stab->sock_map[i]);
+ if (flags == BPF_EXIST && !sock)
+ return -ENOENT;
+ else if (flags == BPF_NOEXIST && sock)
+ return -EEXIST;
+
+ sock = skops->sk;
+
+ /* 1. If sock map has BPF programs those will be inherited by the
+ * sock being added. If the sock is already attached to BPF programs
+ * this results in an error.
+ */
+ verdict = READ_ONCE(stab->bpf_verdict);
+ parse = READ_ONCE(stab->bpf_parse);
+
+ if (parse && verdict) {
+ /* bpf prog refcnt may be zero if a concurrent attach operation
+ * removes the program after the above READ_ONCE() but before
+ * we increment the refcnt. If this is the case abort with an
+ * error.
+ */
+ verdict = bpf_prog_inc_not_zero(stab->bpf_verdict);
+ if (IS_ERR(verdict))
+ return PTR_ERR(verdict);
+
+ parse = bpf_prog_inc_not_zero(stab->bpf_parse);
+ if (IS_ERR(parse)) {
+ bpf_prog_put(verdict);
+ return PTR_ERR(parse);
+ }
+ }
+
+ write_lock_bh(&sock->sk_callback_lock);
+ psock = smap_psock_sk(sock);
+
+ /* 2. Do not allow inheriting programs if psock exists and has
+ * already inherited programs. This would create confusion on
+ * which parser/verdict program is running. If no psock exists
+ * create one. Inside sk_callback_lock to ensure concurrent create
+ * doesn't update user data.
+ */
+ if (psock) {
+ if (READ_ONCE(psock->bpf_parse) && parse) {
+ err = -EBUSY;
+ goto out_progs;
+ }
+ psock->refcnt++;
+ } else {
+ psock = smap_init_psock(sock, stab);
+ if (IS_ERR(psock)) {
+ err = PTR_ERR(psock);
+ goto out_progs;
+ }
+
+ set_bit(SMAP_TX_RUNNING, &psock->state);
+ }
+
+ e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN);
+ if (!e) {
+ err = -ENOMEM;
+ goto out_progs;
+ }
+ e->entry = &stab->sock_map[i];
+
+ /* 3. At this point we have a reference to a valid psock that is
+ * running. Attach any BPF programs needed.
+ */
+ if (parse && verdict && !psock->strp_enabled) {
+ err = smap_init_sock(psock, sock);
+ if (err)
+ goto out_free;
+ smap_init_progs(psock, stab, verdict, parse);
+ smap_start_sock(psock, sock);
+ }
+
+ /* 4. Place psock in sockmap for use and stop any programs on
+ * the old sock assuming its not the same sock we are replacing
+ * it with. Because we can only have a single set of programs if
+ * old_sock has a strp we can stop it.
+ */
+ list_add_tail(&e->list, &psock->maps);
+ write_unlock_bh(&sock->sk_callback_lock);
+
+ osock = xchg(&stab->sock_map[i], sock);
+ if (osock) {
+ struct smap_psock *opsock = smap_psock_sk(osock);
+
+ write_lock_bh(&osock->sk_callback_lock);
+ if (osock != sock && parse)
+ smap_stop_sock(opsock, osock);
+ smap_list_remove(opsock, &stab->sock_map[i]);
+ smap_release_sock(opsock, osock);
+ write_unlock_bh(&osock->sk_callback_lock);
+ }
+ return 0;
+out_free:
+ smap_release_sock(psock, sock);
+out_progs:
+ if (verdict)
+ bpf_prog_put(verdict);
+ if (parse)
+ bpf_prog_put(parse);
+ write_unlock_bh(&sock->sk_callback_lock);
+ kfree(e);
+ return err;
+}
+
+int sock_map_attach_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type)
+{
+ struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+ struct bpf_prog *orig;
+
+ if (unlikely(map->map_type != BPF_MAP_TYPE_SOCKMAP))
+ return -EINVAL;
+
+ switch (type) {
+ case BPF_SK_SKB_STREAM_PARSER:
+ orig = xchg(&stab->bpf_parse, prog);
+ break;
+ case BPF_SK_SKB_STREAM_VERDICT:
+ orig = xchg(&stab->bpf_verdict, prog);
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ if (orig)
+ bpf_prog_put(orig);
+
+ return 0;
+}
+
+static void *sock_map_lookup(struct bpf_map *map, void *key)
+{
+ return NULL;
+}
+
+static int sock_map_update_elem(struct bpf_map *map,
+ void *key, void *value, u64 flags)
+{
+ struct bpf_sock_ops_kern skops;
+ u32 fd = *(u32 *)value;
+ struct socket *socket;
+ int err;
+
+ socket = sockfd_lookup(fd, &err);
+ if (!socket)
+ return err;
+
+ skops.sk = socket->sk;
+ if (!skops.sk) {
+ fput(socket->file);
+ return -EINVAL;
+ }
+
+ err = sock_map_ctx_update_elem(&skops, map, key, flags);
+ fput(socket->file);
+ return err;
+}
+
+const struct bpf_map_ops sock_map_ops = {
+ .map_alloc = sock_map_alloc,
+ .map_free = sock_map_free,
+ .map_lookup_elem = sock_map_lookup,
+ .map_get_next_key = sock_map_get_next_key,
+ .map_update_elem = sock_map_update_elem,
+ .map_delete_elem = sock_map_delete_elem,
+};
+
+BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock,
+ struct bpf_map *, map, void *, key, u64, flags)
+{
+ WARN_ON_ONCE(!rcu_read_lock_held());
+ return sock_map_ctx_update_elem(bpf_sock, map, key, flags);
+}
+
+const struct bpf_func_proto bpf_sock_map_update_proto = {
+ .func = bpf_sock_map_update,
+ .gpl_only = false,
+ .pkt_access = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_PTR_TO_MAP_KEY,
+ .arg4_type = ARG_ANYTHING,
+};
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 31147d7..135be43 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -31,7 +31,8 @@ static int prealloc_elems_and_freelist(struct bpf_stack_map *smap)
u32 elem_size = sizeof(struct stack_map_bucket) + smap->map.value_size;
int err;
- smap->elems = bpf_map_area_alloc(elem_size * smap->map.max_entries);
+ smap->elems = bpf_map_area_alloc(elem_size * smap->map.max_entries,
+ smap->map.numa_node);
if (!smap->elems)
return -ENOMEM;
@@ -59,7 +60,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
if (!capable(CAP_SYS_ADMIN))
return ERR_PTR(-EPERM);
- if (attr->map_flags)
+ if (attr->map_flags & ~BPF_F_NUMA_NODE)
return ERR_PTR(-EINVAL);
/* check sanity of attributes */
@@ -75,7 +76,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
if (cost >= U32_MAX - PAGE_SIZE)
return ERR_PTR(-E2BIG);
- smap = bpf_map_area_alloc(cost);
+ smap = bpf_map_area_alloc(cost, bpf_map_attr_numa_node(attr));
if (!smap)
return ERR_PTR(-ENOMEM);
@@ -91,6 +92,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
smap->map.map_flags = attr->map_flags;
smap->n_buckets = n_buckets;
smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
+ smap->map.numa_node = bpf_map_attr_numa_node(attr);
err = bpf_map_precharge_memlock(smap->map.pages);
if (err)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 045646d..70ad8e2 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -48,6 +48,47 @@ static const struct bpf_map_ops * const bpf_map_types[] = {
#undef BPF_MAP_TYPE
};
+/*
+ * If we're handed a bigger struct than we know of, ensure all the unknown bits
+ * are 0 - i.e. new user-space does not rely on any kernel feature extensions
+ * we don't know about yet.
+ *
+ * There is a ToCToU between this function call and the following
+ * copy_from_user() call. However, this is not a concern since this function is
+ * meant to be a future-proofing of bits.
+ */
+static int check_uarg_tail_zero(void __user *uaddr,
+ size_t expected_size,
+ size_t actual_size)
+{
+ unsigned char __user *addr;
+ unsigned char __user *end;
+ unsigned char val;
+ int err;
+
+ if (unlikely(actual_size > PAGE_SIZE)) /* silly large */
+ return -E2BIG;
+
+ if (unlikely(!access_ok(VERIFY_READ, uaddr, actual_size)))
+ return -EFAULT;
+
+ if (actual_size <= expected_size)
+ return 0;
+
+ addr = uaddr + expected_size;
+ end = uaddr + actual_size;
+
+ for (; addr < end; addr++) {
+ err = get_user(val, addr);
+ if (err)
+ return err;
+ if (val)
+ return -E2BIG;
+ }
+
+ return 0;
+}
+
static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
{
struct bpf_map *map;
@@ -64,7 +105,7 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
return map;
}
-void *bpf_map_area_alloc(size_t size)
+void *bpf_map_area_alloc(size_t size, int numa_node)
{
/* We definitely need __GFP_NORETRY, so OOM killer doesn't
* trigger under memory pressure as we really just want to
@@ -74,12 +115,13 @@ void *bpf_map_area_alloc(size_t size)
void *area;
if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
- area = kmalloc(size, GFP_USER | flags);
+ area = kmalloc_node(size, GFP_USER | flags, numa_node);
if (area != NULL)
return area;
}
- return __vmalloc(size, GFP_KERNEL | flags, PAGE_KERNEL);
+ return __vmalloc_node_flags_caller(size, numa_node, GFP_KERNEL | flags,
+ __builtin_return_address(0));
}
void bpf_map_area_free(void *area)
@@ -268,10 +310,11 @@ int bpf_map_new_fd(struct bpf_map *map)
offsetof(union bpf_attr, CMD##_LAST_FIELD) - \
sizeof(attr->CMD##_LAST_FIELD)) != NULL
-#define BPF_MAP_CREATE_LAST_FIELD inner_map_fd
+#define BPF_MAP_CREATE_LAST_FIELD numa_node
/* called via syscall */
static int map_create(union bpf_attr *attr)
{
+ int numa_node = bpf_map_attr_numa_node(attr);
struct bpf_map *map;
int err;
@@ -279,6 +322,11 @@ static int map_create(union bpf_attr *attr)
if (err)
return -EINVAL;
+ if (numa_node != NUMA_NO_NODE &&
+ ((unsigned int)numa_node >= nr_node_ids ||
+ !node_online(numa_node)))
+ return -EINVAL;
+
/* find map type and init map: hashtable vs rbtree vs bloom vs ... */
map = find_and_alloc_map(attr);
if (IS_ERR(map))
@@ -870,7 +918,7 @@ struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog)
EXPORT_SYMBOL_GPL(bpf_prog_inc);
/* prog_idr_lock should have been held */
-static struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog)
+struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog)
{
int refold;
@@ -886,6 +934,7 @@ static struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog)
return prog;
}
+EXPORT_SYMBOL_GPL(bpf_prog_inc_not_zero);
static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *type)
{
@@ -1047,6 +1096,36 @@ static int bpf_obj_get(const union bpf_attr *attr)
#define BPF_PROG_ATTACH_LAST_FIELD attach_flags
+static int sockmap_get_from_fd(const union bpf_attr *attr)
+{
+ int ufd = attr->target_fd;
+ struct bpf_prog *prog;
+ struct bpf_map *map;
+ struct fd f;
+ int err;
+
+ f = fdget(ufd);
+ map = __bpf_map_get(f);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+
+ prog = bpf_prog_get_type(attr->attach_bpf_fd, BPF_PROG_TYPE_SK_SKB);
+ if (IS_ERR(prog)) {
+ fdput(f);
+ return PTR_ERR(prog);
+ }
+
+ err = sock_map_attach_prog(map, prog, attr->attach_type);
+ if (err) {
+ fdput(f);
+ bpf_prog_put(prog);
+ return err;
+ }
+
+ fdput(f);
+ return 0;
+}
+
static int bpf_prog_attach(const union bpf_attr *attr)
{
enum bpf_prog_type ptype;
@@ -1074,6 +1153,9 @@ static int bpf_prog_attach(const union bpf_attr *attr)
case BPF_CGROUP_SOCK_OPS:
ptype = BPF_PROG_TYPE_SOCK_OPS;
break;
+ case BPF_SK_SKB_STREAM_PARSER:
+ case BPF_SK_SKB_STREAM_VERDICT:
+ return sockmap_get_from_fd(attr);
default:
return -EINVAL;
}
@@ -1246,32 +1328,6 @@ static int bpf_map_get_fd_by_id(const union bpf_attr *attr)
return fd;
}
-static int check_uarg_tail_zero(void __user *uaddr,
- size_t expected_size,
- size_t actual_size)
-{
- unsigned char __user *addr;
- unsigned char __user *end;
- unsigned char val;
- int err;
-
- if (actual_size <= expected_size)
- return 0;
-
- addr = uaddr + expected_size;
- end = uaddr + actual_size;
-
- for (; addr < end; addr++) {
- err = get_user(val, addr);
- if (err)
- return err;
- if (val)
- return -E2BIG;
- }
-
- return 0;
-}
-
static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
const union bpf_attr *attr,
union bpf_attr __user *uattr)
@@ -1289,7 +1345,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
info_len = min_t(u32, sizeof(info), info_len);
if (copy_from_user(&info, uinfo, info_len))
- return err;
+ return -EFAULT;
info.type = prog->type;
info.id = prog->aux->id;
@@ -1312,7 +1368,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
}
ulen = info.xlated_prog_len;
- info.xlated_prog_len = bpf_prog_size(prog->len);
+ info.xlated_prog_len = bpf_prog_insn_size(prog);
if (info.xlated_prog_len && ulen) {
uinsns = u64_to_user_ptr(info.xlated_prog_insns);
ulen = min_t(u32, info.xlated_prog_len, ulen);
@@ -1393,17 +1449,6 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
if (!capable(CAP_SYS_ADMIN) && sysctl_unprivileged_bpf_disabled)
return -EPERM;
- if (!access_ok(VERIFY_READ, uattr, 1))
- return -EFAULT;
-
- if (size > PAGE_SIZE) /* silly large */
- return -E2BIG;
-
- /* If we're handed a bigger struct than we know of,
- * ensure all the unknown bits are 0 - i.e. new
- * user-space does not rely on any kernel feature
- * extensions we dont know about yet.
- */
err = check_uarg_tail_zero(uattr, sizeof(attr), size);
if (err)
return err;
diff --git a/kernel/bpf/tnum.c b/kernel/bpf/tnum.c
new file mode 100644
index 0000000..1f4bf68
--- /dev/null
+++ b/kernel/bpf/tnum.c
@@ -0,0 +1,180 @@
+/* tnum: tracked (or tristate) numbers
+ *
+ * A tnum tracks knowledge about the bits of a value. Each bit can be either
+ * known (0 or 1), or unknown (x). Arithmetic operations on tnums will
+ * propagate the unknown bits such that the tnum result represents all the
+ * possible results for possible values of the operands.
+ */
+#include <linux/kernel.h>
+#include <linux/tnum.h>
+
+#define TNUM(_v, _m) (struct tnum){.value = _v, .mask = _m}
+/* A completely unknown value */
+const struct tnum tnum_unknown = { .value = 0, .mask = -1 };
+
+struct tnum tnum_const(u64 value)
+{
+ return TNUM(value, 0);
+}
+
+struct tnum tnum_range(u64 min, u64 max)
+{
+ u64 chi = min ^ max, delta;
+ u8 bits = fls64(chi);
+
+ /* special case, needed because 1ULL << 64 is undefined */
+ if (bits > 63)
+ return tnum_unknown;
+ /* e.g. if chi = 4, bits = 3, delta = (1<<3) - 1 = 7.
+ * if chi = 0, bits = 0, delta = (1<<0) - 1 = 0, so we return
+ * constant min (since min == max).
+ */
+ delta = (1ULL << bits) - 1;
+ return TNUM(min & ~delta, delta);
+}
+
+struct tnum tnum_lshift(struct tnum a, u8 shift)
+{
+ return TNUM(a.value << shift, a.mask << shift);
+}
+
+struct tnum tnum_rshift(struct tnum a, u8 shift)
+{
+ return TNUM(a.value >> shift, a.mask >> shift);
+}
+
+struct tnum tnum_add(struct tnum a, struct tnum b)
+{
+ u64 sm, sv, sigma, chi, mu;
+
+ sm = a.mask + b.mask;
+ sv = a.value + b.value;
+ sigma = sm + sv;
+ chi = sigma ^ sv;
+ mu = chi | a.mask | b.mask;
+ return TNUM(sv & ~mu, mu);
+}
+
+struct tnum tnum_sub(struct tnum a, struct tnum b)
+{
+ u64 dv, alpha, beta, chi, mu;
+
+ dv = a.value - b.value;
+ alpha = dv + a.mask;
+ beta = dv - b.mask;
+ chi = alpha ^ beta;
+ mu = chi | a.mask | b.mask;
+ return TNUM(dv & ~mu, mu);
+}
+
+struct tnum tnum_and(struct tnum a, struct tnum b)
+{
+ u64 alpha, beta, v;
+
+ alpha = a.value | a.mask;
+ beta = b.value | b.mask;
+ v = a.value & b.value;
+ return TNUM(v, alpha & beta & ~v);
+}
+
+struct tnum tnum_or(struct tnum a, struct tnum b)
+{
+ u64 v, mu;
+
+ v = a.value | b.value;
+ mu = a.mask | b.mask;
+ return TNUM(v, mu & ~v);
+}
+
+struct tnum tnum_xor(struct tnum a, struct tnum b)
+{
+ u64 v, mu;
+
+ v = a.value ^ b.value;
+ mu = a.mask | b.mask;
+ return TNUM(v & ~mu, mu);
+}
+
+/* half-multiply add: acc += (unknown * mask * value).
+ * An intermediate step in the multiply algorithm.
+ */
+static struct tnum hma(struct tnum acc, u64 value, u64 mask)
+{
+ while (mask) {
+ if (mask & 1)
+ acc = tnum_add(acc, TNUM(0, value));
+ mask >>= 1;
+ value <<= 1;
+ }
+ return acc;
+}
+
+struct tnum tnum_mul(struct tnum a, struct tnum b)
+{
+ struct tnum acc;
+ u64 pi;
+
+ pi = a.value * b.value;
+ acc = hma(TNUM(pi, 0), a.mask, b.mask | b.value);
+ return hma(acc, b.mask, a.value);
+}
+
+/* Note that if a and b disagree - i.e. one has a 'known 1' where the other has
+ * a 'known 0' - this will return a 'known 1' for that bit.
+ */
+struct tnum tnum_intersect(struct tnum a, struct tnum b)
+{
+ u64 v, mu;
+
+ v = a.value | b.value;
+ mu = a.mask & b.mask;
+ return TNUM(v & ~mu, mu);
+}
+
+struct tnum tnum_cast(struct tnum a, u8 size)
+{
+ a.value &= (1ULL << (size * 8)) - 1;
+ a.mask &= (1ULL << (size * 8)) - 1;
+ return a;
+}
+
+bool tnum_is_aligned(struct tnum a, u64 size)
+{
+ if (!size)
+ return true;
+ return !((a.value | a.mask) & (size - 1));
+}
+
+bool tnum_in(struct tnum a, struct tnum b)
+{
+ if (b.mask & ~a.mask)
+ return false;
+ b.value &= ~a.mask;
+ return a.value == b.value;
+}
+
+int tnum_strn(char *str, size_t size, struct tnum a)
+{
+ return snprintf(str, size, "(%#llx; %#llx)", a.value, a.mask);
+}
+EXPORT_SYMBOL_GPL(tnum_strn);
+
+int tnum_sbin(char *str, size_t size, struct tnum a)
+{
+ size_t n;
+
+ for (n = 64; n; n--) {
+ if (n < size) {
+ if (a.mask & 1)
+ str[n - 1] = 'x';
+ else if (a.value & 1)
+ str[n - 1] = '1';
+ else
+ str[n - 1] = '0';
+ }
+ a.mask >>= 1;
+ a.value >>= 1;
+ }
+ str[min(size - 1, (size_t)64)] = 0;
+ return 64;
+}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index af9e84a..d690c7d 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -61,12 +61,12 @@
* (and -20 constant is saved for further stack bounds checking).
* Meaning that this reg is a pointer to stack plus known immediate constant.
*
- * Most of the time the registers have UNKNOWN_VALUE type, which
+ * Most of the time the registers have SCALAR_VALUE type, which
* means the register has some value, but it's not a valid pointer.
- * (like pointer plus pointer becomes UNKNOWN_VALUE type)
+ * (like pointer plus pointer becomes SCALAR_VALUE type)
*
* When verifier sees load or store instructions the type of base register
- * can be: PTR_TO_MAP_VALUE, PTR_TO_CTX, FRAME_PTR. These are three pointer
+ * can be: PTR_TO_MAP_VALUE, PTR_TO_CTX, PTR_TO_STACK. These are three pointer
* types recognized by check_mem_access() function.
*
* PTR_TO_MAP_VALUE means that this register is pointing to 'map element value'
@@ -140,7 +140,7 @@ struct bpf_verifier_stack_elem {
struct bpf_verifier_stack_elem *next;
};
-#define BPF_COMPLEXITY_LIMIT_INSNS 98304
+#define BPF_COMPLEXITY_LIMIT_INSNS 131072
#define BPF_COMPLEXITY_LIMIT_STACK 1024
#define BPF_MAP_PTR_POISON ((void *)0xeB9F + POISON_POINTER_DELTA)
@@ -180,15 +180,12 @@ static __printf(1, 2) void verbose(const char *fmt, ...)
/* string representation of 'enum bpf_reg_type' */
static const char * const reg_type_str[] = {
[NOT_INIT] = "?",
- [UNKNOWN_VALUE] = "inv",
+ [SCALAR_VALUE] = "inv",
[PTR_TO_CTX] = "ctx",
[CONST_PTR_TO_MAP] = "map_ptr",
[PTR_TO_MAP_VALUE] = "map_value",
[PTR_TO_MAP_VALUE_OR_NULL] = "map_value_or_null",
- [PTR_TO_MAP_VALUE_ADJ] = "map_value_adj",
- [FRAME_PTR] = "fp",
[PTR_TO_STACK] = "fp",
- [CONST_IMM] = "imm",
[PTR_TO_PACKET] = "pkt",
[PTR_TO_PACKET_END] = "pkt_end",
};
@@ -221,32 +218,52 @@ static void print_verifier_state(struct bpf_verifier_state *state)
if (t == NOT_INIT)
continue;
verbose(" R%d=%s", i, reg_type_str[t]);
- if (t == CONST_IMM || t == PTR_TO_STACK)
- verbose("%lld", reg->imm);
- else if (t == PTR_TO_PACKET)
- verbose("(id=%d,off=%d,r=%d)",
- reg->id, reg->off, reg->range);
- else if (t == UNKNOWN_VALUE && reg->imm)
- verbose("%lld", reg->imm);
- else if (t == CONST_PTR_TO_MAP || t == PTR_TO_MAP_VALUE ||
- t == PTR_TO_MAP_VALUE_OR_NULL ||
- t == PTR_TO_MAP_VALUE_ADJ)
- verbose("(ks=%d,vs=%d,id=%u)",
- reg->map_ptr->key_size,
- reg->map_ptr->value_size,
- reg->id);
- if (reg->min_value != BPF_REGISTER_MIN_RANGE)
- verbose(",min_value=%lld",
- (long long)reg->min_value);
- if (reg->max_value != BPF_REGISTER_MAX_RANGE)
- verbose(",max_value=%llu",
- (unsigned long long)reg->max_value);
- if (reg->min_align)
- verbose(",min_align=%u", reg->min_align);
- if (reg->aux_off)
- verbose(",aux_off=%u", reg->aux_off);
- if (reg->aux_off_align)
- verbose(",aux_off_align=%u", reg->aux_off_align);
+ if ((t == SCALAR_VALUE || t == PTR_TO_STACK) &&
+ tnum_is_const(reg->var_off)) {
+ /* reg->off should be 0 for SCALAR_VALUE */
+ verbose("%lld", reg->var_off.value + reg->off);
+ } else {
+ verbose("(id=%d", reg->id);
+ if (t != SCALAR_VALUE)
+ verbose(",off=%d", reg->off);
+ if (t == PTR_TO_PACKET)
+ verbose(",r=%d", reg->range);
+ else if (t == CONST_PTR_TO_MAP ||
+ t == PTR_TO_MAP_VALUE ||
+ t == PTR_TO_MAP_VALUE_OR_NULL)
+ verbose(",ks=%d,vs=%d",
+ reg->map_ptr->key_size,
+ reg->map_ptr->value_size);
+ if (tnum_is_const(reg->var_off)) {
+ /* Typically an immediate SCALAR_VALUE, but
+ * could be a pointer whose offset is too big
+ * for reg->off
+ */
+ verbose(",imm=%llx", reg->var_off.value);
+ } else {
+ if (reg->smin_value != reg->umin_value &&
+ reg->smin_value != S64_MIN)
+ verbose(",smin_value=%lld",
+ (long long)reg->smin_value);
+ if (reg->smax_value != reg->umax_value &&
+ reg->smax_value != S64_MAX)
+ verbose(",smax_value=%lld",
+ (long long)reg->smax_value);
+ if (reg->umin_value != 0)
+ verbose(",umin_value=%llu",
+ (unsigned long long)reg->umin_value);
+ if (reg->umax_value != U64_MAX)
+ verbose(",umax_value=%llu",
+ (unsigned long long)reg->umax_value);
+ if (!tnum_is_unknown(reg->var_off)) {
+ char tn_buf[48];
+
+ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+ verbose(",var_off=%s", tn_buf);
+ }
+ }
+ verbose(")");
+ }
}
for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
if (state->stack_slot_type[i] == STACK_SPILL)
@@ -295,11 +312,15 @@ static const char *const bpf_jmp_string[16] = {
[BPF_JA >> 4] = "jmp",
[BPF_JEQ >> 4] = "==",
[BPF_JGT >> 4] = ">",
+ [BPF_JLT >> 4] = "<",
[BPF_JGE >> 4] = ">=",
+ [BPF_JLE >> 4] = "<=",
[BPF_JSET >> 4] = "&",
[BPF_JNE >> 4] = "!=",
[BPF_JSGT >> 4] = "s>",
+ [BPF_JSLT >> 4] = "s<",
[BPF_JSGE >> 4] = "s>=",
+ [BPF_JSLE >> 4] = "s<=",
[BPF_CALL >> 4] = "call",
[BPF_EXIT >> 4] = "exit",
};
@@ -463,56 +484,163 @@ static const int caller_saved[CALLER_SAVED_REGS] = {
BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5
};
-static void mark_reg_not_init(struct bpf_reg_state *regs, u32 regno)
+static void __mark_reg_not_init(struct bpf_reg_state *reg);
+
+/* Mark the unknown part of a register (variable offset or scalar value) as
+ * known to have the value @imm.
+ */
+static void __mark_reg_known(struct bpf_reg_state *reg, u64 imm)
{
- BUG_ON(regno >= MAX_BPF_REG);
+ reg->id = 0;
+ reg->var_off = tnum_const(imm);
+ reg->smin_value = (s64)imm;
+ reg->smax_value = (s64)imm;
+ reg->umin_value = imm;
+ reg->umax_value = imm;
+}
- memset(&regs[regno], 0, sizeof(regs[regno]));
- regs[regno].type = NOT_INIT;
- regs[regno].min_value = BPF_REGISTER_MIN_RANGE;
- regs[regno].max_value = BPF_REGISTER_MAX_RANGE;
+/* Mark the 'variable offset' part of a register as zero. This should be
+ * used only on registers holding a pointer type.
+ */
+static void __mark_reg_known_zero(struct bpf_reg_state *reg)
+{
+ __mark_reg_known(reg, 0);
}
-static void init_reg_state(struct bpf_reg_state *regs)
+static void mark_reg_known_zero(struct bpf_reg_state *regs, u32 regno)
{
- int i;
+ if (WARN_ON(regno >= MAX_BPF_REG)) {
+ verbose("mark_reg_known_zero(regs, %u)\n", regno);
+ /* Something bad happened, let's kill all regs */
+ for (regno = 0; regno < MAX_BPF_REG; regno++)
+ __mark_reg_not_init(regs + regno);
+ return;
+ }
+ __mark_reg_known_zero(regs + regno);
+}
- for (i = 0; i < MAX_BPF_REG; i++)
- mark_reg_not_init(regs, i);
+/* Attempts to improve min/max values based on var_off information */
+static void __update_reg_bounds(struct bpf_reg_state *reg)
+{
+ /* min signed is max(sign bit) | min(other bits) */
+ reg->smin_value = max_t(s64, reg->smin_value,
+ reg->var_off.value | (reg->var_off.mask & S64_MIN));
+ /* max signed is min(sign bit) | max(other bits) */
+ reg->smax_value = min_t(s64, reg->smax_value,
+ reg->var_off.value | (reg->var_off.mask & S64_MAX));
+ reg->umin_value = max(reg->umin_value, reg->var_off.value);
+ reg->umax_value = min(reg->umax_value,
+ reg->var_off.value | reg->var_off.mask);
+}
- /* frame pointer */
- regs[BPF_REG_FP].type = FRAME_PTR;
+/* Uses signed min/max values to inform unsigned, and vice-versa */
+static void __reg_deduce_bounds(struct bpf_reg_state *reg)
+{
+ /* Learn sign from signed bounds.
+ * If we cannot cross the sign boundary, then signed and unsigned bounds
+ * are the same, so combine. This works even in the negative case, e.g.
+ * -3 s<= x s<= -1 implies 0xf...fd u<= x u<= 0xf...ff.
+ */
+ if (reg->smin_value >= 0 || reg->smax_value < 0) {
+ reg->smin_value = reg->umin_value = max_t(u64, reg->smin_value,
+ reg->umin_value);
+ reg->smax_value = reg->umax_value = min_t(u64, reg->smax_value,
+ reg->umax_value);
+ return;
+ }
+ /* Learn sign from unsigned bounds. Signed bounds cross the sign
+ * boundary, so we must be careful.
+ */
+ if ((s64)reg->umax_value >= 0) {
+ /* Positive. We can't learn anything from the smin, but smax
+ * is positive, hence safe.
+ */
+ reg->smin_value = reg->umin_value;
+ reg->smax_value = reg->umax_value = min_t(u64, reg->smax_value,
+ reg->umax_value);
+ } else if ((s64)reg->umin_value < 0) {
+ /* Negative. We can't learn anything from the smax, but smin
+ * is negative, hence safe.
+ */
+ reg->smin_value = reg->umin_value = max_t(u64, reg->smin_value,
+ reg->umin_value);
+ reg->smax_value = reg->umax_value;
+ }
+}
- /* 1st arg to a function */
- regs[BPF_REG_1].type = PTR_TO_CTX;
+/* Attempts to improve var_off based on unsigned min/max information */
+static void __reg_bound_offset(struct bpf_reg_state *reg)
+{
+ reg->var_off = tnum_intersect(reg->var_off,
+ tnum_range(reg->umin_value,
+ reg->umax_value));
}
-static void __mark_reg_unknown_value(struct bpf_reg_state *regs, u32 regno)
+/* Reset the min/max bounds of a register */
+static void __mark_reg_unbounded(struct bpf_reg_state *reg)
{
- regs[regno].type = UNKNOWN_VALUE;
- regs[regno].id = 0;
- regs[regno].imm = 0;
+ reg->smin_value = S64_MIN;
+ reg->smax_value = S64_MAX;
+ reg->umin_value = 0;
+ reg->umax_value = U64_MAX;
}
-static void mark_reg_unknown_value(struct bpf_reg_state *regs, u32 regno)
+/* Mark a register as having a completely unknown (scalar) value. */
+static void __mark_reg_unknown(struct bpf_reg_state *reg)
{
- BUG_ON(regno >= MAX_BPF_REG);
- __mark_reg_unknown_value(regs, regno);
+ reg->type = SCALAR_VALUE;
+ reg->id = 0;
+ reg->off = 0;
+ reg->var_off = tnum_unknown;
+ __mark_reg_unbounded(reg);
}
-static void reset_reg_range_values(struct bpf_reg_state *regs, u32 regno)
+static void mark_reg_unknown(struct bpf_reg_state *regs, u32 regno)
{
- regs[regno].min_value = BPF_REGISTER_MIN_RANGE;
- regs[regno].max_value = BPF_REGISTER_MAX_RANGE;
- regs[regno].value_from_signed = false;
- regs[regno].min_align = 0;
+ if (WARN_ON(regno >= MAX_BPF_REG)) {
+ verbose("mark_reg_unknown(regs, %u)\n", regno);
+ /* Something bad happened, let's kill all regs */
+ for (regno = 0; regno < MAX_BPF_REG; regno++)
+ __mark_reg_not_init(regs + regno);
+ return;
+ }
+ __mark_reg_unknown(regs + regno);
}
-static void mark_reg_unknown_value_and_range(struct bpf_reg_state *regs,
- u32 regno)
+static void __mark_reg_not_init(struct bpf_reg_state *reg)
+{
+ __mark_reg_unknown(reg);
+ reg->type = NOT_INIT;
+}
+
+static void mark_reg_not_init(struct bpf_reg_state *regs, u32 regno)
{
- mark_reg_unknown_value(regs, regno);
- reset_reg_range_values(regs, regno);
+ if (WARN_ON(regno >= MAX_BPF_REG)) {
+ verbose("mark_reg_not_init(regs, %u)\n", regno);
+ /* Something bad happened, let's kill all regs */
+ for (regno = 0; regno < MAX_BPF_REG; regno++)
+ __mark_reg_not_init(regs + regno);
+ return;
+ }
+ __mark_reg_not_init(regs + regno);
+}
+
+static void init_reg_state(struct bpf_reg_state *regs)
+{
+ int i;
+
+ for (i = 0; i < MAX_BPF_REG; i++) {
+ mark_reg_not_init(regs, i);
+ regs[i].live = REG_LIVE_NONE;
+ }
+
+ /* frame pointer */
+ regs[BPF_REG_FP].type = PTR_TO_STACK;
+ mark_reg_known_zero(regs, BPF_REG_FP);
+
+ /* 1st arg to a function */
+ regs[BPF_REG_1].type = PTR_TO_CTX;
+ mark_reg_known_zero(regs, BPF_REG_1);
}
enum reg_arg_type {
@@ -521,9 +649,26 @@ enum reg_arg_type {
DST_OP_NO_MARK /* same as above, check only, don't mark */
};
-static int check_reg_arg(struct bpf_reg_state *regs, u32 regno,
+static void mark_reg_read(const struct bpf_verifier_state *state, u32 regno)
+{
+ struct bpf_verifier_state *parent = state->parent;
+
+ while (parent) {
+ /* if read wasn't screened by an earlier write ... */
+ if (state->regs[regno].live & REG_LIVE_WRITTEN)
+ break;
+ /* ... then we depend on parent's value */
+ parent->regs[regno].live |= REG_LIVE_READ;
+ state = parent;
+ parent = state->parent;
+ }
+}
+
+static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
enum reg_arg_type t)
{
+ struct bpf_reg_state *regs = env->cur_state.regs;
+
if (regno >= MAX_BPF_REG) {
verbose("R%d is invalid\n", regno);
return -EINVAL;
@@ -535,14 +680,16 @@ static int check_reg_arg(struct bpf_reg_state *regs, u32 regno,
verbose("R%d !read_ok\n", regno);
return -EACCES;
}
+ mark_reg_read(&env->cur_state, regno);
} else {
/* check whether register used as dest operand can be written to */
if (regno == BPF_REG_FP) {
verbose("frame pointer is read only\n");
return -EACCES;
}
+ regs[regno].live |= REG_LIVE_WRITTEN;
if (t == DST_OP)
- mark_reg_unknown_value(regs, regno);
+ mark_reg_unknown(regs, regno);
}
return 0;
}
@@ -552,12 +699,10 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
switch (type) {
case PTR_TO_MAP_VALUE:
case PTR_TO_MAP_VALUE_OR_NULL:
- case PTR_TO_MAP_VALUE_ADJ:
case PTR_TO_STACK:
case PTR_TO_CTX:
case PTR_TO_PACKET:
case PTR_TO_PACKET_END:
- case FRAME_PTR:
case CONST_PTR_TO_MAP:
return true;
default:
@@ -571,7 +716,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
static int check_stack_write(struct bpf_verifier_state *state, int off,
int size, int value_regno)
{
- int i;
+ int i, spi = (MAX_BPF_STACK + off) / BPF_REG_SIZE;
/* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0,
* so it's aligned access and [off, off + size) are within stack limits
*/
@@ -586,15 +731,14 @@ static int check_stack_write(struct bpf_verifier_state *state, int off,
}
/* save register state */
- state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE] =
- state->regs[value_regno];
+ state->spilled_regs[spi] = state->regs[value_regno];
+ state->spilled_regs[spi].live |= REG_LIVE_WRITTEN;
for (i = 0; i < BPF_REG_SIZE; i++)
state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_SPILL;
} else {
/* regular write of data into stack */
- state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE] =
- (struct bpf_reg_state) {};
+ state->spilled_regs[spi] = (struct bpf_reg_state) {};
for (i = 0; i < size; i++)
state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_MISC;
@@ -602,11 +746,26 @@ static int check_stack_write(struct bpf_verifier_state *state, int off,
return 0;
}
+static void mark_stack_slot_read(const struct bpf_verifier_state *state, int slot)
+{
+ struct bpf_verifier_state *parent = state->parent;
+
+ while (parent) {
+ /* if read wasn't screened by an earlier write ... */
+ if (state->spilled_regs[slot].live & REG_LIVE_WRITTEN)
+ break;
+ /* ... then we depend on parent's value */
+ parent->spilled_regs[slot].live |= REG_LIVE_READ;
+ state = parent;
+ parent = state->parent;
+ }
+}
+
static int check_stack_read(struct bpf_verifier_state *state, int off, int size,
int value_regno)
{
u8 *slot_type;
- int i;
+ int i, spi;
slot_type = &state->stack_slot_type[MAX_BPF_STACK + off];
@@ -622,10 +781,13 @@ static int check_stack_read(struct bpf_verifier_state *state, int off, int size,
}
}
- if (value_regno >= 0)
+ spi = (MAX_BPF_STACK + off) / BPF_REG_SIZE;
+
+ if (value_regno >= 0) {
/* restore register state from stack */
- state->regs[value_regno] =
- state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE];
+ state->regs[value_regno] = state->spilled_regs[spi];
+ mark_stack_slot_read(state, spi);
+ }
return 0;
} else {
for (i = 0; i < size; i++) {
@@ -637,14 +799,13 @@ static int check_stack_read(struct bpf_verifier_state *state, int off, int size,
}
if (value_regno >= 0)
/* have read misc data from the stack */
- mark_reg_unknown_value_and_range(state->regs,
- value_regno);
+ mark_reg_unknown(state->regs, value_regno);
return 0;
}
}
/* check read/write into map element returned by bpf_map_lookup_elem() */
-static int check_map_access(struct bpf_verifier_env *env, u32 regno, int off,
+static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off,
int size)
{
struct bpf_map *map = env->cur_state.regs[regno].map_ptr;
@@ -657,49 +818,50 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, int off,
return 0;
}
-/* check read/write into an adjusted map element */
-static int check_map_access_adj(struct bpf_verifier_env *env, u32 regno,
+/* check read/write into a map element with possible variable offset */
+static int check_map_access(struct bpf_verifier_env *env, u32 regno,
int off, int size)
{
struct bpf_verifier_state *state = &env->cur_state;
struct bpf_reg_state *reg = &state->regs[regno];
int err;
- /* We adjusted the register to this map value, so we
- * need to change off and size to min_value and max_value
- * respectively to make sure our theoretical access will be
- * safe.
+ /* We may have adjusted the register to this map value, so we
+ * need to try adding each of min_value and max_value to off
+ * to make sure our theoretical access will be safe.
*/
if (log_level)
print_verifier_state(state);
- env->varlen_map_value_access = true;
/* The minimum value is only important with signed
* comparisons where we can't assume the floor of a
* value is 0. If we are using signed variables for our
* index'es we need to make sure that whatever we use
* will have a set floor within our range.
*/
- if (reg->min_value < 0) {
+ if (reg->smin_value < 0) {
verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
regno);
return -EACCES;
}
- err = check_map_access(env, regno, reg->min_value + off, size);
+ err = __check_map_access(env, regno, reg->smin_value + off, size);
if (err) {
- verbose("R%d min value is outside of the array range\n",
- regno);
+ verbose("R%d min value is outside of the array range\n", regno);
return err;
}
- /* If we haven't set a max value then we need to bail
- * since we can't be sure we won't do bad things.
+ /* If we haven't set a max value then we need to bail since we can't be
+ * sure we won't do bad things.
+ * If reg->umax_value + off could overflow, treat that as unbounded too.
*/
- if (reg->max_value == BPF_REGISTER_MAX_RANGE) {
+ if (reg->umax_value >= BPF_MAX_VAR_OFF) {
verbose("R%d unbounded memory access, make sure to bounds check any array access into a map\n",
regno);
return -EACCES;
}
- return check_map_access(env, regno, reg->max_value + off, size);
+ err = __check_map_access(env, regno, reg->umax_value + off, size);
+ if (err)
+ verbose("R%d max value is outside of the array range\n", regno);
+ return err;
}
#define MAX_PACKET_OFF 0xffff
@@ -719,6 +881,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
case BPF_PROG_TYPE_SCHED_ACT:
case BPF_PROG_TYPE_XDP:
case BPF_PROG_TYPE_LWT_XMIT:
+ case BPF_PROG_TYPE_SK_SKB:
if (meta)
return meta->pkt_access;
@@ -729,14 +892,13 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
}
}
-static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
- int size)
+static int __check_packet_access(struct bpf_verifier_env *env, u32 regno,
+ int off, int size)
{
struct bpf_reg_state *regs = env->cur_state.regs;
struct bpf_reg_state *reg = &regs[regno];
- off += reg->off;
- if (off < 0 || size <= 0 || off + size > reg->range) {
+ if (off < 0 || size <= 0 || (u64)off + size > reg->range) {
verbose("invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n",
off, size, regno, reg->id, reg->off, reg->range);
return -EACCES;
@@ -744,7 +906,35 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
return 0;
}
-/* check access to 'struct bpf_context' fields */
+static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
+ int size)
+{
+ struct bpf_reg_state *regs = env->cur_state.regs;
+ struct bpf_reg_state *reg = &regs[regno];
+ int err;
+
+ /* We may have added a variable offset to the packet pointer; but any
+ * reg->range we have comes after that. We are only checking the fixed
+ * offset.
+ */
+
+ /* We don't allow negative numbers, because we aren't tracking enough
+ * detail to prove they're safe.
+ */
+ if (reg->smin_value < 0) {
+ verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
+ regno);
+ return -EACCES;
+ }
+ err = __check_packet_access(env, regno, off, size);
+ if (err) {
+ verbose("R%d offset is outside of the packet\n", regno);
+ return err;
+ }
+ return err;
+}
+
+/* check access to 'struct bpf_context' fields. Supports fixed offsets only */
static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, int size,
enum bpf_access_type t, enum bpf_reg_type *reg_type)
{
@@ -784,13 +974,7 @@ static bool __is_pointer_value(bool allow_ptr_leaks,
if (allow_ptr_leaks)
return false;
- switch (reg->type) {
- case UNKNOWN_VALUE:
- case CONST_IMM:
- return false;
- default:
- return true;
- }
+ return reg->type != SCALAR_VALUE;
}
static bool is_pointer_value(struct bpf_verifier_env *env, int regno)
@@ -801,23 +985,13 @@ static bool is_pointer_value(struct bpf_verifier_env *env, int regno)
static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg,
int off, int size, bool strict)
{
+ struct tnum reg_off;
int ip_align;
- int reg_off;
/* Byte size accesses are always allowed. */
if (!strict || size == 1)
return 0;
- reg_off = reg->off;
- if (reg->id) {
- if (reg->aux_off_align % size) {
- verbose("Packet access is only %u byte aligned, %d byte access not allowed\n",
- reg->aux_off_align, size);
- return -EACCES;
- }
- reg_off += reg->aux_off;
- }
-
/* For platforms that do not have a Kconfig enabling
* CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS the value of
* NET_IP_ALIGN is universally set to '2'. And on platforms
@@ -827,20 +1001,37 @@ static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg,
* unconditional IP align value of '2'.
*/
ip_align = 2;
- if ((ip_align + reg_off + off) % size != 0) {
- verbose("misaligned packet access off %d+%d+%d size %d\n",
- ip_align, reg_off, off, size);
+
+ reg_off = tnum_add(reg->var_off, tnum_const(ip_align + reg->off + off));
+ if (!tnum_is_aligned(reg_off, size)) {
+ char tn_buf[48];
+
+ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+ verbose("misaligned packet access off %d+%s+%d+%d size %d\n",
+ ip_align, tn_buf, reg->off, off, size);
return -EACCES;
}
return 0;
}
-static int check_val_ptr_alignment(const struct bpf_reg_state *reg,
- int size, bool strict)
+static int check_generic_ptr_alignment(const struct bpf_reg_state *reg,
+ const char *pointer_desc,
+ int off, int size, bool strict)
{
- if (strict && size != 1) {
- verbose("Unknown alignment. Only byte-sized access allowed in value access.\n");
+ struct tnum reg_off;
+
+ /* Byte size accesses are always allowed. */
+ if (!strict || size == 1)
+ return 0;
+
+ reg_off = tnum_add(reg->var_off, tnum_const(reg->off + off));
+ if (!tnum_is_aligned(reg_off, size)) {
+ char tn_buf[48];
+
+ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+ verbose("misaligned %saccess off %s+%d+%d size %d\n",
+ pointer_desc, tn_buf, reg->off, off, size);
return -EACCES;
}
@@ -852,21 +1043,25 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
int off, int size)
{
bool strict = env->strict_alignment;
+ const char *pointer_desc = "";
switch (reg->type) {
case PTR_TO_PACKET:
+ /* special case, because of NET_IP_ALIGN */
return check_pkt_ptr_alignment(reg, off, size, strict);
- case PTR_TO_MAP_VALUE_ADJ:
- return check_val_ptr_alignment(reg, size, strict);
+ case PTR_TO_MAP_VALUE:
+ pointer_desc = "value ";
+ break;
+ case PTR_TO_CTX:
+ pointer_desc = "context ";
+ break;
+ case PTR_TO_STACK:
+ pointer_desc = "stack ";
+ break;
default:
- if (off % size != 0) {
- verbose("misaligned access off %d size %d\n",
- off, size);
- return -EACCES;
- }
-
- return 0;
+ break;
}
+ return check_generic_ptr_alignment(reg, pointer_desc, off, size, strict);
}
/* check whether memory at (regno + off) is accessible for t = (read | write)
@@ -883,52 +1078,79 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
struct bpf_reg_state *reg = &state->regs[regno];
int size, err = 0;
- if (reg->type == PTR_TO_STACK)
- off += reg->imm;
-
size = bpf_size_to_bytes(bpf_size);
if (size < 0)
return size;
+ /* alignment checks will add in reg->off themselves */
err = check_ptr_alignment(env, reg, off, size);
if (err)
return err;
- if (reg->type == PTR_TO_MAP_VALUE ||
- reg->type == PTR_TO_MAP_VALUE_ADJ) {
+ /* for access checks, reg->off is just part of off */
+ off += reg->off;
+
+ if (reg->type == PTR_TO_MAP_VALUE) {
if (t == BPF_WRITE && value_regno >= 0 &&
is_pointer_value(env, value_regno)) {
verbose("R%d leaks addr into map\n", value_regno);
return -EACCES;
}
- if (reg->type == PTR_TO_MAP_VALUE_ADJ)
- err = check_map_access_adj(env, regno, off, size);
- else
- err = check_map_access(env, regno, off, size);
+ err = check_map_access(env, regno, off, size);
if (!err && t == BPF_READ && value_regno >= 0)
- mark_reg_unknown_value_and_range(state->regs,
- value_regno);
+ mark_reg_unknown(state->regs, value_regno);
} else if (reg->type == PTR_TO_CTX) {
- enum bpf_reg_type reg_type = UNKNOWN_VALUE;
+ enum bpf_reg_type reg_type = SCALAR_VALUE;
if (t == BPF_WRITE && value_regno >= 0 &&
is_pointer_value(env, value_regno)) {
verbose("R%d leaks addr into ctx\n", value_regno);
return -EACCES;
}
+ /* ctx accesses must be at a fixed offset, so that we can
+ * determine what type of data were returned.
+ */
+ if (!tnum_is_const(reg->var_off)) {
+ char tn_buf[48];
+
+ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+ verbose("variable ctx access var_off=%s off=%d size=%d",
+ tn_buf, off, size);
+ return -EACCES;
+ }
+ off += reg->var_off.value;
err = check_ctx_access(env, insn_idx, off, size, t, &reg_type);
if (!err && t == BPF_READ && value_regno >= 0) {
- mark_reg_unknown_value_and_range(state->regs,
- value_regno);
- /* note that reg.[id|off|range] == 0 */
+ /* ctx access returns either a scalar, or a
+ * PTR_TO_PACKET[_END]. In the latter case, we know
+ * the offset is zero.
+ */
+ if (reg_type == SCALAR_VALUE)
+ mark_reg_unknown(state->regs, value_regno);
+ else
+ mark_reg_known_zero(state->regs, value_regno);
+ state->regs[value_regno].id = 0;
+ state->regs[value_regno].off = 0;
+ state->regs[value_regno].range = 0;
state->regs[value_regno].type = reg_type;
- state->regs[value_regno].aux_off = 0;
- state->regs[value_regno].aux_off_align = 0;
}
- } else if (reg->type == FRAME_PTR || reg->type == PTR_TO_STACK) {
+ } else if (reg->type == PTR_TO_STACK) {
+ /* stack accesses must be at a fixed offset, so that we can
+ * determine what type of data were returned.
+ * See check_stack_read().
+ */
+ if (!tnum_is_const(reg->var_off)) {
+ char tn_buf[48];
+
+ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+ verbose("variable stack access var_off=%s off=%d size=%d",
+ tn_buf, off, size);
+ return -EACCES;
+ }
+ off += reg->var_off.value;
if (off >= 0 || off < -MAX_BPF_STACK) {
verbose("invalid stack off=%d size=%d\n", off, size);
return -EACCES;
@@ -948,7 +1170,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
} else {
err = check_stack_read(state, off, size, value_regno);
}
- } else if (state->regs[regno].type == PTR_TO_PACKET) {
+ } else if (reg->type == PTR_TO_PACKET) {
if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) {
verbose("cannot write into packet\n");
return -EACCES;
@@ -960,28 +1182,25 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
}
err = check_packet_access(env, regno, off, size);
if (!err && t == BPF_READ && value_regno >= 0)
- mark_reg_unknown_value_and_range(state->regs,
- value_regno);
+ mark_reg_unknown(state->regs, value_regno);
} else {
verbose("R%d invalid mem access '%s'\n",
regno, reg_type_str[reg->type]);
return -EACCES;
}
- if (!err && size <= 2 && value_regno >= 0 && env->allow_ptr_leaks &&
- state->regs[value_regno].type == UNKNOWN_VALUE) {
- /* 1 or 2 byte load zero-extends, determine the number of
- * zero upper bits. Not doing it fo 4 byte load, since
- * such values cannot be added to ptr_to_packet anyway.
- */
- state->regs[value_regno].imm = 64 - size * 8;
+ if (!err && size < BPF_REG_SIZE && value_regno >= 0 && t == BPF_READ &&
+ state->regs[value_regno].type == SCALAR_VALUE) {
+ /* b/h/w load zero-extends, mark upper bits as known 0 */
+ state->regs[value_regno].var_off = tnum_cast(
+ state->regs[value_regno].var_off, size);
+ __update_reg_bounds(&state->regs[value_regno]);
}
return err;
}
static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_insn *insn)
{
- struct bpf_reg_state *regs = env->cur_state.regs;
int err;
if ((BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) ||
@@ -991,12 +1210,12 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins
}
/* check src1 operand */
- err = check_reg_arg(regs, insn->src_reg, SRC_OP);
+ err = check_reg_arg(env, insn->src_reg, SRC_OP);
if (err)
return err;
/* check src2 operand */
- err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
+ err = check_reg_arg(env, insn->dst_reg, SRC_OP);
if (err)
return err;
@@ -1016,9 +1235,17 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins
BPF_SIZE(insn->code), BPF_WRITE, -1);
}
+/* Does this register contain a constant zero? */
+static bool register_is_null(struct bpf_reg_state reg)
+{
+ return reg.type == SCALAR_VALUE && tnum_equals_const(reg.var_off, 0);
+}
+
/* when register 'regno' is passed into function that will read 'access_size'
* bytes from that pointer, make sure that it's within stack boundary
- * and all elements of stack are initialized
+ * and all elements of stack are initialized.
+ * Unlike most pointer bounds-checking functions, this one doesn't take an
+ * 'off' argument, so it has to add in reg->off itself.
*/
static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
int access_size, bool zero_size_allowed,
@@ -1029,9 +1256,9 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
int off, i;
if (regs[regno].type != PTR_TO_STACK) {
+ /* Allow zero-byte read from NULL, regardless of pointer type */
if (zero_size_allowed && access_size == 0 &&
- regs[regno].type == CONST_IMM &&
- regs[regno].imm == 0)
+ register_is_null(regs[regno]))
return 0;
verbose("R%d type=%s expected=%s\n", regno,
@@ -1040,7 +1267,15 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
return -EACCES;
}
- off = regs[regno].imm;
+ /* Only allow fixed-offset stack reads */
+ if (!tnum_is_const(regs[regno].var_off)) {
+ char tn_buf[48];
+
+ tnum_strn(tn_buf, sizeof(tn_buf), regs[regno].var_off);
+ verbose("invalid variable stack read R%d var_off=%s\n",
+ regno, tn_buf);
+ }
+ off = regs[regno].off + regs[regno].var_off.value;
if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 ||
access_size <= 0) {
verbose("invalid stack type R%d off=%d access_size=%d\n",
@@ -1071,16 +1306,14 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
int access_size, bool zero_size_allowed,
struct bpf_call_arg_meta *meta)
{
- struct bpf_reg_state *regs = env->cur_state.regs;
+ struct bpf_reg_state *regs = env->cur_state.regs, *reg = &regs[regno];
- switch (regs[regno].type) {
+ switch (reg->type) {
case PTR_TO_PACKET:
- return check_packet_access(env, regno, 0, access_size);
+ return check_packet_access(env, regno, reg->off, access_size);
case PTR_TO_MAP_VALUE:
- return check_map_access(env, regno, 0, access_size);
- case PTR_TO_MAP_VALUE_ADJ:
- return check_map_access_adj(env, regno, 0, access_size);
- default: /* const_imm|ptr_to_stack or invalid ptr */
+ return check_map_access(env, regno, reg->off, access_size);
+ default: /* scalar_value|ptr_to_stack or invalid ptr */
return check_stack_boundary(env, regno, access_size,
zero_size_allowed, meta);
}
@@ -1097,10 +1330,9 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
if (arg_type == ARG_DONTCARE)
return 0;
- if (type == NOT_INIT) {
- verbose("R%d !read_ok\n", regno);
- return -EACCES;
- }
+ err = check_reg_arg(env, regno, SRC_OP);
+ if (err)
+ return err;
if (arg_type == ARG_ANYTHING) {
if (is_pointer_value(env, regno)) {
@@ -1123,11 +1355,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
goto err_type;
} else if (arg_type == ARG_CONST_SIZE ||
arg_type == ARG_CONST_SIZE_OR_ZERO) {
- expected_type = CONST_IMM;
- /* One exception. Allow UNKNOWN_VALUE registers when the
- * boundaries are known and don't cause unsafe memory accesses
- */
- if (type != UNKNOWN_VALUE && type != expected_type)
+ expected_type = SCALAR_VALUE;
+ if (type != expected_type)
goto err_type;
} else if (arg_type == ARG_CONST_MAP_PTR) {
expected_type = CONST_PTR_TO_MAP;
@@ -1141,13 +1370,13 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
arg_type == ARG_PTR_TO_UNINIT_MEM) {
expected_type = PTR_TO_STACK;
/* One exception here. In case function allows for NULL to be
- * passed in as argument, it's a CONST_IMM type. Final test
+ * passed in as argument, it's a SCALAR_VALUE type. Final test
* happens during stack boundary checking.
*/
- if (type == CONST_IMM && reg->imm == 0)
+ if (register_is_null(*reg))
/* final test in check_stack_boundary() */;
else if (type != PTR_TO_PACKET && type != PTR_TO_MAP_VALUE &&
- type != PTR_TO_MAP_VALUE_ADJ && type != expected_type)
+ type != expected_type)
goto err_type;
meta->raw_mode = arg_type == ARG_PTR_TO_UNINIT_MEM;
} else {
@@ -1173,7 +1402,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
return -EACCES;
}
if (type == PTR_TO_PACKET)
- err = check_packet_access(env, regno, 0,
+ err = check_packet_access(env, regno, reg->off,
meta->map_ptr->key_size);
else
err = check_stack_boundary(env, regno,
@@ -1189,7 +1418,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
return -EACCES;
}
if (type == PTR_TO_PACKET)
- err = check_packet_access(env, regno, 0,
+ err = check_packet_access(env, regno, reg->off,
meta->map_ptr->value_size);
else
err = check_stack_boundary(env, regno,
@@ -1209,10 +1438,11 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
return -EACCES;
}
- /* If the register is UNKNOWN_VALUE, the access check happens
- * using its boundaries. Otherwise, just use its imm
+ /* The register is SCALAR_VALUE; the access check
+ * happens using its boundaries.
*/
- if (type == UNKNOWN_VALUE) {
+
+ if (!tnum_is_const(reg->var_off))
/* For unprivileged variable accesses, disable raw
* mode so that the program is required to
* initialize all the memory that the helper could
@@ -1220,35 +1450,28 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
*/
meta = NULL;
- if (reg->min_value < 0) {
- verbose("R%d min value is negative, either use unsigned or 'var &= const'\n",
- regno);
- return -EACCES;
- }
-
- if (reg->min_value == 0) {
- err = check_helper_mem_access(env, regno - 1, 0,
- zero_size_allowed,
- meta);
- if (err)
- return err;
- }
+ if (reg->smin_value < 0) {
+ verbose("R%d min value is negative, either use unsigned or 'var &= const'\n",
+ regno);
+ return -EACCES;
+ }
- if (reg->max_value == BPF_REGISTER_MAX_RANGE) {
- verbose("R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n",
- regno);
- return -EACCES;
- }
- err = check_helper_mem_access(env, regno - 1,
- reg->max_value,
- zero_size_allowed, meta);
+ if (reg->umin_value == 0) {
+ err = check_helper_mem_access(env, regno - 1, 0,
+ zero_size_allowed,
+ meta);
if (err)
return err;
- } else {
- /* register is CONST_IMM */
- err = check_helper_mem_access(env, regno - 1, reg->imm,
- zero_size_allowed, meta);
}
+
+ if (reg->umax_value >= BPF_MAX_VAR_SIZ) {
+ verbose("R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n",
+ regno);
+ return -EACCES;
+ }
+ err = check_helper_mem_access(env, regno - 1,
+ reg->umax_value,
+ zero_size_allowed, meta);
}
return err;
@@ -1283,10 +1506,25 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
func_id != BPF_FUNC_current_task_under_cgroup)
goto error;
break;
+ /* devmap returns a pointer to a live net_device ifindex that we cannot
+ * allow to be modified from bpf side. So do not allow lookup elements
+ * for now.
+ */
+ case BPF_MAP_TYPE_DEVMAP:
+ if (func_id != BPF_FUNC_redirect_map)
+ goto error;
+ break;
case BPF_MAP_TYPE_ARRAY_OF_MAPS:
case BPF_MAP_TYPE_HASH_OF_MAPS:
if (func_id != BPF_FUNC_map_lookup_elem)
goto error;
+ break;
+ case BPF_MAP_TYPE_SOCKMAP:
+ if (func_id != BPF_FUNC_sk_redirect_map &&
+ func_id != BPF_FUNC_sock_map_update &&
+ func_id != BPF_FUNC_map_delete_elem)
+ goto error;
+ break;
default:
break;
}
@@ -1311,6 +1549,18 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
if (map->map_type != BPF_MAP_TYPE_CGROUP_ARRAY)
goto error;
break;
+ case BPF_FUNC_redirect_map:
+ if (map->map_type != BPF_MAP_TYPE_DEVMAP)
+ goto error;
+ break;
+ case BPF_FUNC_sk_redirect_map:
+ if (map->map_type != BPF_MAP_TYPE_SOCKMAP)
+ goto error;
+ break;
+ case BPF_FUNC_sock_map_update:
+ if (map->map_type != BPF_MAP_TYPE_SOCKMAP)
+ goto error;
+ break;
default:
break;
}
@@ -1340,6 +1590,9 @@ static int check_raw_mode(const struct bpf_func_proto *fn)
return count > 1 ? -EINVAL : 0;
}
+/* Packet data might have moved, any old PTR_TO_PACKET[_END] are now invalid,
+ * so turn them into unknown SCALAR_VALUE.
+ */
static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
{
struct bpf_verifier_state *state = &env->cur_state;
@@ -1349,7 +1602,7 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
for (i = 0; i < MAX_BPF_REG; i++)
if (regs[i].type == PTR_TO_PACKET ||
regs[i].type == PTR_TO_PACKET_END)
- mark_reg_unknown_value(regs, i);
+ mark_reg_unknown(regs, i);
for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
if (state->stack_slot_type[i] != STACK_SPILL)
@@ -1358,8 +1611,7 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
if (reg->type != PTR_TO_PACKET &&
reg->type != PTR_TO_PACKET_END)
continue;
- __mark_reg_unknown_value(state->spilled_regs,
- i / BPF_REG_SIZE);
+ __mark_reg_unknown(reg);
}
}
@@ -1434,19 +1686,24 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
}
/* reset caller saved regs */
- for (i = 0; i < CALLER_SAVED_REGS; i++)
+ for (i = 0; i < CALLER_SAVED_REGS; i++) {
mark_reg_not_init(regs, caller_saved[i]);
+ check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
+ }
- /* update return register */
+ /* update return register (already marked as written above) */
if (fn->ret_type == RET_INTEGER) {
- regs[BPF_REG_0].type = UNKNOWN_VALUE;
+ /* sets type to SCALAR_VALUE */
+ mark_reg_unknown(regs, BPF_REG_0);
} else if (fn->ret_type == RET_VOID) {
regs[BPF_REG_0].type = NOT_INIT;
} else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) {
struct bpf_insn_aux_data *insn_aux;
regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL;
- regs[BPF_REG_0].max_value = regs[BPF_REG_0].min_value = 0;
+ /* There is no offset yet applied, variable or fixed */
+ mark_reg_known_zero(regs, BPF_REG_0);
+ regs[BPF_REG_0].off = 0;
/* remember map_ptr, so that check_map_access()
* can check 'value_size' boundary of memory access
* to map element returned from bpf_map_lookup_elem()
@@ -1477,485 +1734,551 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
return 0;
}
-static int check_packet_ptr_add(struct bpf_verifier_env *env,
- struct bpf_insn *insn)
+static void coerce_reg_to_32(struct bpf_reg_state *reg)
{
- struct bpf_reg_state *regs = env->cur_state.regs;
- struct bpf_reg_state *dst_reg = &regs[insn->dst_reg];
- struct bpf_reg_state *src_reg = &regs[insn->src_reg];
- struct bpf_reg_state tmp_reg;
- s32 imm;
-
- if (BPF_SRC(insn->code) == BPF_K) {
- /* pkt_ptr += imm */
- imm = insn->imm;
-
-add_imm:
- if (imm < 0) {
- verbose("addition of negative constant to packet pointer is not allowed\n");
- return -EACCES;
- }
- if (imm >= MAX_PACKET_OFF ||
- imm + dst_reg->off >= MAX_PACKET_OFF) {
- verbose("constant %d is too large to add to packet pointer\n",
- imm);
- return -EACCES;
- }
- /* a constant was added to pkt_ptr.
- * Remember it while keeping the same 'id'
- */
- dst_reg->off += imm;
- } else {
- bool had_id;
-
- if (src_reg->type == PTR_TO_PACKET) {
- /* R6=pkt(id=0,off=0,r=62) R7=imm22; r7 += r6 */
- tmp_reg = *dst_reg; /* save r7 state */
- *dst_reg = *src_reg; /* copy pkt_ptr state r6 into r7 */
- src_reg = &tmp_reg; /* pretend it's src_reg state */
- /* if the checks below reject it, the copy won't matter,
- * since we're rejecting the whole program. If all ok,
- * then imm22 state will be added to r7
- * and r7 will be pkt(id=0,off=22,r=62) while
- * r6 will stay as pkt(id=0,off=0,r=62)
- */
- }
+ /* clear high 32 bits */
+ reg->var_off = tnum_cast(reg->var_off, 4);
+ /* Update bounds */
+ __update_reg_bounds(reg);
+}
- if (src_reg->type == CONST_IMM) {
- /* pkt_ptr += reg where reg is known constant */
- imm = src_reg->imm;
- goto add_imm;
- }
- /* disallow pkt_ptr += reg
- * if reg is not uknown_value with guaranteed zero upper bits
- * otherwise pkt_ptr may overflow and addition will become
- * subtraction which is not allowed
- */
- if (src_reg->type != UNKNOWN_VALUE) {
- verbose("cannot add '%s' to ptr_to_packet\n",
- reg_type_str[src_reg->type]);
- return -EACCES;
- }
- if (src_reg->imm < 48) {
- verbose("cannot add integer value with %lld upper zero bits to ptr_to_packet\n",
- src_reg->imm);
- return -EACCES;
- }
+static bool signed_add_overflows(s64 a, s64 b)
+{
+ /* Do the add in u64, where overflow is well-defined */
+ s64 res = (s64)((u64)a + (u64)b);
- had_id = (dst_reg->id != 0);
+ if (b < 0)
+ return res > a;
+ return res < a;
+}
- /* dst_reg stays as pkt_ptr type and since some positive
- * integer value was added to the pointer, increment its 'id'
- */
- dst_reg->id = ++env->id_gen;
-
- /* something was added to pkt_ptr, set range to zero */
- dst_reg->aux_off += dst_reg->off;
- dst_reg->off = 0;
- dst_reg->range = 0;
- if (had_id)
- dst_reg->aux_off_align = min(dst_reg->aux_off_align,
- src_reg->min_align);
- else
- dst_reg->aux_off_align = src_reg->min_align;
- }
- return 0;
+static bool signed_sub_overflows(s64 a, s64 b)
+{
+ /* Do the sub in u64, where overflow is well-defined */
+ s64 res = (s64)((u64)a - (u64)b);
+
+ if (b < 0)
+ return res < a;
+ return res > a;
}
-static int evaluate_reg_alu(struct bpf_verifier_env *env, struct bpf_insn *insn)
+/* Handles arithmetic on a pointer and a scalar: computes new min/max and var_off.
+ * Caller should also handle BPF_MOV case separately.
+ * If we return -EACCES, caller may want to try again treating pointer as a
+ * scalar. So we only emit a diagnostic if !env->allow_ptr_leaks.
+ */
+static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
+ struct bpf_insn *insn,
+ const struct bpf_reg_state *ptr_reg,
+ const struct bpf_reg_state *off_reg)
{
- struct bpf_reg_state *regs = env->cur_state.regs;
- struct bpf_reg_state *dst_reg = &regs[insn->dst_reg];
+ struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg;
+ bool known = tnum_is_const(off_reg->var_off);
+ s64 smin_val = off_reg->smin_value, smax_val = off_reg->smax_value,
+ smin_ptr = ptr_reg->smin_value, smax_ptr = ptr_reg->smax_value;
+ u64 umin_val = off_reg->umin_value, umax_val = off_reg->umax_value,
+ umin_ptr = ptr_reg->umin_value, umax_ptr = ptr_reg->umax_value;
u8 opcode = BPF_OP(insn->code);
- s64 imm_log2;
+ u32 dst = insn->dst_reg;
- /* for type == UNKNOWN_VALUE:
- * imm > 0 -> number of zero upper bits
- * imm == 0 -> don't track which is the same as all bits can be non-zero
- */
+ dst_reg = &regs[dst];
- if (BPF_SRC(insn->code) == BPF_X) {
- struct bpf_reg_state *src_reg = &regs[insn->src_reg];
-
- if (src_reg->type == UNKNOWN_VALUE && src_reg->imm > 0 &&
- dst_reg->imm && opcode == BPF_ADD) {
- /* dreg += sreg
- * where both have zero upper bits. Adding them
- * can only result making one more bit non-zero
- * in the larger value.
- * Ex. 0xffff (imm=48) + 1 (imm=63) = 0x10000 (imm=47)
- * 0xffff (imm=48) + 0xffff = 0x1fffe (imm=47)
- */
- dst_reg->imm = min(dst_reg->imm, src_reg->imm);
- dst_reg->imm--;
- return 0;
- }
- if (src_reg->type == CONST_IMM && src_reg->imm > 0 &&
- dst_reg->imm && opcode == BPF_ADD) {
- /* dreg += sreg
- * where dreg has zero upper bits and sreg is const.
- * Adding them can only result making one more bit
- * non-zero in the larger value.
- */
- imm_log2 = __ilog2_u64((long long)src_reg->imm);
- dst_reg->imm = min(dst_reg->imm, 63 - imm_log2);
- dst_reg->imm--;
- return 0;
- }
- /* all other cases non supported yet, just mark dst_reg */
- dst_reg->imm = 0;
- return 0;
+ if (WARN_ON_ONCE(known && (smin_val != smax_val))) {
+ print_verifier_state(&env->cur_state);
+ verbose("verifier internal error: known but bad sbounds\n");
+ return -EINVAL;
+ }
+ if (WARN_ON_ONCE(known && (umin_val != umax_val))) {
+ print_verifier_state(&env->cur_state);
+ verbose("verifier internal error: known but bad ubounds\n");
+ return -EINVAL;
}
- /* sign extend 32-bit imm into 64-bit to make sure that
- * negative values occupy bit 63. Note ilog2() would have
- * been incorrect, since sizeof(insn->imm) == 4
- */
- imm_log2 = __ilog2_u64((long long)insn->imm);
-
- if (dst_reg->imm && opcode == BPF_LSH) {
- /* reg <<= imm
- * if reg was a result of 2 byte load, then its imm == 48
- * which means that upper 48 bits are zero and shifting this reg
- * left by 4 would mean that upper 44 bits are still zero
- */
- dst_reg->imm -= insn->imm;
- } else if (dst_reg->imm && opcode == BPF_MUL) {
- /* reg *= imm
- * if multiplying by 14 subtract 4
- * This is conservative calculation of upper zero bits.
- * It's not trying to special case insn->imm == 1 or 0 cases
- */
- dst_reg->imm -= imm_log2 + 1;
- } else if (opcode == BPF_AND) {
- /* reg &= imm */
- dst_reg->imm = 63 - imm_log2;
- } else if (dst_reg->imm && opcode == BPF_ADD) {
- /* reg += imm */
- dst_reg->imm = min(dst_reg->imm, 63 - imm_log2);
- dst_reg->imm--;
- } else if (opcode == BPF_RSH) {
- /* reg >>= imm
- * which means that after right shift, upper bits will be zero
- * note that verifier already checked that
- * 0 <= imm < 64 for shift insn
- */
- dst_reg->imm += insn->imm;
- if (unlikely(dst_reg->imm > 64))
- /* some dumb code did:
- * r2 = *(u32 *)mem;
- * r2 >>= 32;
- * and all bits are zero now */
- dst_reg->imm = 64;
- } else {
- /* all other alu ops, means that we don't know what will
- * happen to the value, mark it with unknown number of zero bits
- */
- dst_reg->imm = 0;
+ if (BPF_CLASS(insn->code) != BPF_ALU64) {
+ /* 32-bit ALU ops on pointers produce (meaningless) scalars */
+ if (!env->allow_ptr_leaks)
+ verbose("R%d 32-bit pointer arithmetic prohibited\n",
+ dst);
+ return -EACCES;
}
- if (dst_reg->imm < 0) {
- /* all 64 bits of the register can contain non-zero bits
- * and such value cannot be added to ptr_to_packet, since it
- * may overflow, mark it as unknown to avoid further eval
- */
- dst_reg->imm = 0;
+ if (ptr_reg->type == PTR_TO_MAP_VALUE_OR_NULL) {
+ if (!env->allow_ptr_leaks)
+ verbose("R%d pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL prohibited, null-check it first\n",
+ dst);
+ return -EACCES;
+ }
+ if (ptr_reg->type == CONST_PTR_TO_MAP) {
+ if (!env->allow_ptr_leaks)
+ verbose("R%d pointer arithmetic on CONST_PTR_TO_MAP prohibited\n",
+ dst);
+ return -EACCES;
+ }
+ if (ptr_reg->type == PTR_TO_PACKET_END) {
+ if (!env->allow_ptr_leaks)
+ verbose("R%d pointer arithmetic on PTR_TO_PACKET_END prohibited\n",
+ dst);
+ return -EACCES;
}
- return 0;
-}
-static int evaluate_reg_imm_alu_unknown(struct bpf_verifier_env *env,
- struct bpf_insn *insn)
-{
- struct bpf_reg_state *regs = env->cur_state.regs;
- struct bpf_reg_state *dst_reg = &regs[insn->dst_reg];
- struct bpf_reg_state *src_reg = &regs[insn->src_reg];
- u8 opcode = BPF_OP(insn->code);
- s64 imm_log2 = __ilog2_u64((long long)dst_reg->imm);
-
- /* BPF_X code with src_reg->type UNKNOWN_VALUE here. */
- if (src_reg->imm > 0 && dst_reg->imm) {
- switch (opcode) {
- case BPF_ADD:
- /* dreg += sreg
- * where both have zero upper bits. Adding them
- * can only result making one more bit non-zero
- * in the larger value.
- * Ex. 0xffff (imm=48) + 1 (imm=63) = 0x10000 (imm=47)
- * 0xffff (imm=48) + 0xffff = 0x1fffe (imm=47)
- */
- dst_reg->imm = min(src_reg->imm, 63 - imm_log2);
- dst_reg->imm--;
- break;
- case BPF_AND:
- /* dreg &= sreg
- * AND can not extend zero bits only shrink
- * Ex. 0x00..00ffffff
- * & 0x0f..ffffffff
- * ----------------
- * 0x00..00ffffff
- */
- dst_reg->imm = max(src_reg->imm, 63 - imm_log2);
+ /* In case of 'scalar += pointer', dst_reg inherits pointer type and id.
+ * The id may be overwritten later if we create a new variable offset.
+ */
+ dst_reg->type = ptr_reg->type;
+ dst_reg->id = ptr_reg->id;
+
+ switch (opcode) {
+ case BPF_ADD:
+ /* We can take a fixed offset as long as it doesn't overflow
+ * the s32 'off' field
+ */
+ if (known && (ptr_reg->off + smin_val ==
+ (s64)(s32)(ptr_reg->off + smin_val))) {
+ /* pointer += K. Accumulate it into fixed offset */
+ dst_reg->smin_value = smin_ptr;
+ dst_reg->smax_value = smax_ptr;
+ dst_reg->umin_value = umin_ptr;
+ dst_reg->umax_value = umax_ptr;
+ dst_reg->var_off = ptr_reg->var_off;
+ dst_reg->off = ptr_reg->off + smin_val;
+ dst_reg->range = ptr_reg->range;
break;
- case BPF_OR:
- /* dreg |= sreg
- * OR can only extend zero bits
- * Ex. 0x00..00ffffff
- * | 0x0f..ffffffff
- * ----------------
- * 0x0f..00ffffff
- */
- dst_reg->imm = min(src_reg->imm, 63 - imm_log2);
+ }
+ /* A new variable offset is created. Note that off_reg->off
+ * == 0, since it's a scalar.
+ * dst_reg gets the pointer type and since some positive
+ * integer value was added to the pointer, give it a new 'id'
+ * if it's a PTR_TO_PACKET.
+ * this creates a new 'base' pointer, off_reg (variable) gets
+ * added into the variable offset, and we copy the fixed offset
+ * from ptr_reg.
+ */
+ if (signed_add_overflows(smin_ptr, smin_val) ||
+ signed_add_overflows(smax_ptr, smax_val)) {
+ dst_reg->smin_value = S64_MIN;
+ dst_reg->smax_value = S64_MAX;
+ } else {
+ dst_reg->smin_value = smin_ptr + smin_val;
+ dst_reg->smax_value = smax_ptr + smax_val;
+ }
+ if (umin_ptr + umin_val < umin_ptr ||
+ umax_ptr + umax_val < umax_ptr) {
+ dst_reg->umin_value = 0;
+ dst_reg->umax_value = U64_MAX;
+ } else {
+ dst_reg->umin_value = umin_ptr + umin_val;
+ dst_reg->umax_value = umax_ptr + umax_val;
+ }
+ dst_reg->var_off = tnum_add(ptr_reg->var_off, off_reg->var_off);
+ dst_reg->off = ptr_reg->off;
+ if (ptr_reg->type == PTR_TO_PACKET) {
+ dst_reg->id = ++env->id_gen;
+ /* something was added to pkt_ptr, set range to zero */
+ dst_reg->range = 0;
+ }
+ break;
+ case BPF_SUB:
+ if (dst_reg == off_reg) {
+ /* scalar -= pointer. Creates an unknown scalar */
+ if (!env->allow_ptr_leaks)
+ verbose("R%d tried to subtract pointer from scalar\n",
+ dst);
+ return -EACCES;
+ }
+ /* We don't allow subtraction from FP, because (according to
+ * test_verifier.c test "invalid fp arithmetic", JITs might not
+ * be able to deal with it.
+ */
+ if (ptr_reg->type == PTR_TO_STACK) {
+ if (!env->allow_ptr_leaks)
+ verbose("R%d subtraction from stack pointer prohibited\n",
+ dst);
+ return -EACCES;
+ }
+ if (known && (ptr_reg->off - smin_val ==
+ (s64)(s32)(ptr_reg->off - smin_val))) {
+ /* pointer -= K. Subtract it from fixed offset */
+ dst_reg->smin_value = smin_ptr;
+ dst_reg->smax_value = smax_ptr;
+ dst_reg->umin_value = umin_ptr;
+ dst_reg->umax_value = umax_ptr;
+ dst_reg->var_off = ptr_reg->var_off;
+ dst_reg->id = ptr_reg->id;
+ dst_reg->off = ptr_reg->off - smin_val;
+ dst_reg->range = ptr_reg->range;
break;
- case BPF_SUB:
- case BPF_MUL:
- case BPF_RSH:
- case BPF_LSH:
- /* These may be flushed out later */
- default:
- mark_reg_unknown_value(regs, insn->dst_reg);
}
- } else {
- mark_reg_unknown_value(regs, insn->dst_reg);
+ /* A new variable offset is created. If the subtrahend is known
+ * nonnegative, then any reg->range we had before is still good.
+ */
+ if (signed_sub_overflows(smin_ptr, smax_val) ||
+ signed_sub_overflows(smax_ptr, smin_val)) {
+ /* Overflow possible, we know nothing */
+ dst_reg->smin_value = S64_MIN;
+ dst_reg->smax_value = S64_MAX;
+ } else {
+ dst_reg->smin_value = smin_ptr - smax_val;
+ dst_reg->smax_value = smax_ptr - smin_val;
+ }
+ if (umin_ptr < umax_val) {
+ /* Overflow possible, we know nothing */
+ dst_reg->umin_value = 0;
+ dst_reg->umax_value = U64_MAX;
+ } else {
+ /* Cannot overflow (as long as bounds are consistent) */
+ dst_reg->umin_value = umin_ptr - umax_val;
+ dst_reg->umax_value = umax_ptr - umin_val;
+ }
+ dst_reg->var_off = tnum_sub(ptr_reg->var_off, off_reg->var_off);
+ dst_reg->off = ptr_reg->off;
+ if (ptr_reg->type == PTR_TO_PACKET) {
+ dst_reg->id = ++env->id_gen;
+ /* something was added to pkt_ptr, set range to zero */
+ if (smin_val < 0)
+ dst_reg->range = 0;
+ }
+ break;
+ case BPF_AND:
+ case BPF_OR:
+ case BPF_XOR:
+ /* bitwise ops on pointers are troublesome, prohibit for now.
+ * (However, in principle we could allow some cases, e.g.
+ * ptr &= ~3 which would reduce min_value by 3.)
+ */
+ if (!env->allow_ptr_leaks)
+ verbose("R%d bitwise operator %s on pointer prohibited\n",
+ dst, bpf_alu_string[opcode >> 4]);
+ return -EACCES;
+ default:
+ /* other operators (e.g. MUL,LSH) produce non-pointer results */
+ if (!env->allow_ptr_leaks)
+ verbose("R%d pointer arithmetic with %s operator prohibited\n",
+ dst, bpf_alu_string[opcode >> 4]);
+ return -EACCES;
}
- dst_reg->type = UNKNOWN_VALUE;
+ __update_reg_bounds(dst_reg);
+ __reg_deduce_bounds(dst_reg);
+ __reg_bound_offset(dst_reg);
return 0;
}
-static int evaluate_reg_imm_alu(struct bpf_verifier_env *env,
- struct bpf_insn *insn)
+static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
+ struct bpf_insn *insn,
+ struct bpf_reg_state *dst_reg,
+ struct bpf_reg_state src_reg)
{
struct bpf_reg_state *regs = env->cur_state.regs;
- struct bpf_reg_state *dst_reg = &regs[insn->dst_reg];
- struct bpf_reg_state *src_reg = &regs[insn->src_reg];
u8 opcode = BPF_OP(insn->code);
- u64 dst_imm = dst_reg->imm;
-
- if (BPF_SRC(insn->code) == BPF_X && src_reg->type == UNKNOWN_VALUE)
- return evaluate_reg_imm_alu_unknown(env, insn);
-
- /* dst_reg->type == CONST_IMM here. Simulate execution of insns
- * containing ALU ops. Don't care about overflow or negative
- * values, just add/sub/... them; registers are in u64.
- */
- if (opcode == BPF_ADD && BPF_SRC(insn->code) == BPF_K) {
- dst_imm += insn->imm;
- } else if (opcode == BPF_ADD && BPF_SRC(insn->code) == BPF_X &&
- src_reg->type == CONST_IMM) {
- dst_imm += src_reg->imm;
- } else if (opcode == BPF_SUB && BPF_SRC(insn->code) == BPF_K) {
- dst_imm -= insn->imm;
- } else if (opcode == BPF_SUB && BPF_SRC(insn->code) == BPF_X &&
- src_reg->type == CONST_IMM) {
- dst_imm -= src_reg->imm;
- } else if (opcode == BPF_MUL && BPF_SRC(insn->code) == BPF_K) {
- dst_imm *= insn->imm;
- } else if (opcode == BPF_MUL && BPF_SRC(insn->code) == BPF_X &&
- src_reg->type == CONST_IMM) {
- dst_imm *= src_reg->imm;
- } else if (opcode == BPF_OR && BPF_SRC(insn->code) == BPF_K) {
- dst_imm |= insn->imm;
- } else if (opcode == BPF_OR && BPF_SRC(insn->code) == BPF_X &&
- src_reg->type == CONST_IMM) {
- dst_imm |= src_reg->imm;
- } else if (opcode == BPF_AND && BPF_SRC(insn->code) == BPF_K) {
- dst_imm &= insn->imm;
- } else if (opcode == BPF_AND && BPF_SRC(insn->code) == BPF_X &&
- src_reg->type == CONST_IMM) {
- dst_imm &= src_reg->imm;
- } else if (opcode == BPF_RSH && BPF_SRC(insn->code) == BPF_K) {
- dst_imm >>= insn->imm;
- } else if (opcode == BPF_RSH && BPF_SRC(insn->code) == BPF_X &&
- src_reg->type == CONST_IMM) {
- dst_imm >>= src_reg->imm;
- } else if (opcode == BPF_LSH && BPF_SRC(insn->code) == BPF_K) {
- dst_imm <<= insn->imm;
- } else if (opcode == BPF_LSH && BPF_SRC(insn->code) == BPF_X &&
- src_reg->type == CONST_IMM) {
- dst_imm <<= src_reg->imm;
- } else {
- mark_reg_unknown_value(regs, insn->dst_reg);
- goto out;
+ bool src_known, dst_known;
+ s64 smin_val, smax_val;
+ u64 umin_val, umax_val;
+
+ if (BPF_CLASS(insn->code) != BPF_ALU64) {
+ /* 32-bit ALU ops are (32,32)->64 */
+ coerce_reg_to_32(dst_reg);
+ coerce_reg_to_32(&src_reg);
}
-
- dst_reg->imm = dst_imm;
-out:
- return 0;
-}
-
-static void check_reg_overflow(struct bpf_reg_state *reg)
-{
- if (reg->max_value > BPF_REGISTER_MAX_RANGE)
- reg->max_value = BPF_REGISTER_MAX_RANGE;
- if (reg->min_value < BPF_REGISTER_MIN_RANGE ||
- reg->min_value > BPF_REGISTER_MAX_RANGE)
- reg->min_value = BPF_REGISTER_MIN_RANGE;
-}
-
-static u32 calc_align(u32 imm)
-{
- if (!imm)
- return 1U << 31;
- return imm - ((imm - 1) & imm);
-}
-
-static void adjust_reg_min_max_vals(struct bpf_verifier_env *env,
- struct bpf_insn *insn)
-{
- struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg;
- s64 min_val = BPF_REGISTER_MIN_RANGE;
- u64 max_val = BPF_REGISTER_MAX_RANGE;
- u8 opcode = BPF_OP(insn->code);
- u32 dst_align, src_align;
-
- dst_reg = &regs[insn->dst_reg];
- src_align = 0;
- if (BPF_SRC(insn->code) == BPF_X) {
- check_reg_overflow(&regs[insn->src_reg]);
- min_val = regs[insn->src_reg].min_value;
- max_val = regs[insn->src_reg].max_value;
-
- /* If the source register is a random pointer then the
- * min_value/max_value values represent the range of the known
- * accesses into that value, not the actual min/max value of the
- * register itself. In this case we have to reset the reg range
- * values so we know it is not safe to look at.
- */
- if (regs[insn->src_reg].type != CONST_IMM &&
- regs[insn->src_reg].type != UNKNOWN_VALUE) {
- min_val = BPF_REGISTER_MIN_RANGE;
- max_val = BPF_REGISTER_MAX_RANGE;
- src_align = 0;
- } else {
- src_align = regs[insn->src_reg].min_align;
- }
- } else if (insn->imm < BPF_REGISTER_MAX_RANGE &&
- (s64)insn->imm > BPF_REGISTER_MIN_RANGE) {
- min_val = max_val = insn->imm;
- src_align = calc_align(insn->imm);
- }
-
- dst_align = dst_reg->min_align;
-
- /* We don't know anything about what was done to this register, mark it
- * as unknown. Also, if both derived bounds came from signed/unsigned
- * mixed compares and one side is unbounded, we cannot really do anything
- * with them as boundaries cannot be trusted. Thus, arithmetic of two
- * regs of such kind will get invalidated bounds on the dst side.
- */
- if ((min_val == BPF_REGISTER_MIN_RANGE &&
- max_val == BPF_REGISTER_MAX_RANGE) ||
- (BPF_SRC(insn->code) == BPF_X &&
- ((min_val != BPF_REGISTER_MIN_RANGE &&
- max_val == BPF_REGISTER_MAX_RANGE) ||
- (min_val == BPF_REGISTER_MIN_RANGE &&
- max_val != BPF_REGISTER_MAX_RANGE) ||
- (dst_reg->min_value != BPF_REGISTER_MIN_RANGE &&
- dst_reg->max_value == BPF_REGISTER_MAX_RANGE) ||
- (dst_reg->min_value == BPF_REGISTER_MIN_RANGE &&
- dst_reg->max_value != BPF_REGISTER_MAX_RANGE)) &&
- regs[insn->dst_reg].value_from_signed !=
- regs[insn->src_reg].value_from_signed)) {
- reset_reg_range_values(regs, insn->dst_reg);
- return;
- }
-
- /* If one of our values was at the end of our ranges then we can't just
- * do our normal operations to the register, we need to set the values
- * to the min/max since they are undefined.
- */
- if (min_val == BPF_REGISTER_MIN_RANGE)
- dst_reg->min_value = BPF_REGISTER_MIN_RANGE;
- if (max_val == BPF_REGISTER_MAX_RANGE)
- dst_reg->max_value = BPF_REGISTER_MAX_RANGE;
+ smin_val = src_reg.smin_value;
+ smax_val = src_reg.smax_value;
+ umin_val = src_reg.umin_value;
+ umax_val = src_reg.umax_value;
+ src_known = tnum_is_const(src_reg.var_off);
+ dst_known = tnum_is_const(dst_reg->var_off);
switch (opcode) {
case BPF_ADD:
- if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE)
- dst_reg->min_value += min_val;
- if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE)
- dst_reg->max_value += max_val;
- dst_reg->min_align = min(src_align, dst_align);
+ if (signed_add_overflows(dst_reg->smin_value, smin_val) ||
+ signed_add_overflows(dst_reg->smax_value, smax_val)) {
+ dst_reg->smin_value = S64_MIN;
+ dst_reg->smax_value = S64_MAX;
+ } else {
+ dst_reg->smin_value += smin_val;
+ dst_reg->smax_value += smax_val;
+ }
+ if (dst_reg->umin_value + umin_val < umin_val ||
+ dst_reg->umax_value + umax_val < umax_val) {
+ dst_reg->umin_value = 0;
+ dst_reg->umax_value = U64_MAX;
+ } else {
+ dst_reg->umin_value += umin_val;
+ dst_reg->umax_value += umax_val;
+ }
+ dst_reg->var_off = tnum_add(dst_reg->var_off, src_reg.var_off);
break;
case BPF_SUB:
- if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE)
- dst_reg->min_value -= min_val;
- if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE)
- dst_reg->max_value -= max_val;
- dst_reg->min_align = min(src_align, dst_align);
+ if (signed_sub_overflows(dst_reg->smin_value, smax_val) ||
+ signed_sub_overflows(dst_reg->smax_value, smin_val)) {
+ /* Overflow possible, we know nothing */
+ dst_reg->smin_value = S64_MIN;
+ dst_reg->smax_value = S64_MAX;
+ } else {
+ dst_reg->smin_value -= smax_val;
+ dst_reg->smax_value -= smin_val;
+ }
+ if (dst_reg->umin_value < umax_val) {
+ /* Overflow possible, we know nothing */
+ dst_reg->umin_value = 0;
+ dst_reg->umax_value = U64_MAX;
+ } else {
+ /* Cannot overflow (as long as bounds are consistent) */
+ dst_reg->umin_value -= umax_val;
+ dst_reg->umax_value -= umin_val;
+ }
+ dst_reg->var_off = tnum_sub(dst_reg->var_off, src_reg.var_off);
break;
case BPF_MUL:
- if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE)
- dst_reg->min_value *= min_val;
- if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE)
- dst_reg->max_value *= max_val;
- dst_reg->min_align = max(src_align, dst_align);
+ dst_reg->var_off = tnum_mul(dst_reg->var_off, src_reg.var_off);
+ if (smin_val < 0 || dst_reg->smin_value < 0) {
+ /* Ain't nobody got time to multiply that sign */
+ __mark_reg_unbounded(dst_reg);
+ __update_reg_bounds(dst_reg);
+ break;
+ }
+ /* Both values are positive, so we can work with unsigned and
+ * copy the result to signed (unless it exceeds S64_MAX).
+ */
+ if (umax_val > U32_MAX || dst_reg->umax_value > U32_MAX) {
+ /* Potential overflow, we know nothing */
+ __mark_reg_unbounded(dst_reg);
+ /* (except what we can learn from the var_off) */
+ __update_reg_bounds(dst_reg);
+ break;
+ }
+ dst_reg->umin_value *= umin_val;
+ dst_reg->umax_value *= umax_val;
+ if (dst_reg->umax_value > S64_MAX) {
+ /* Overflow possible, we know nothing */
+ dst_reg->smin_value = S64_MIN;
+ dst_reg->smax_value = S64_MAX;
+ } else {
+ dst_reg->smin_value = dst_reg->umin_value;
+ dst_reg->smax_value = dst_reg->umax_value;
+ }
break;
case BPF_AND:
- /* Disallow AND'ing of negative numbers, ain't nobody got time
- * for that. Otherwise the minimum is 0 and the max is the max
- * value we could AND against.
+ if (src_known && dst_known) {
+ __mark_reg_known(dst_reg, dst_reg->var_off.value &
+ src_reg.var_off.value);
+ break;
+ }
+ /* We get our minimum from the var_off, since that's inherently
+ * bitwise. Our maximum is the minimum of the operands' maxima.
*/
- if (min_val < 0)
- dst_reg->min_value = BPF_REGISTER_MIN_RANGE;
- else
- dst_reg->min_value = 0;
- dst_reg->max_value = max_val;
- dst_reg->min_align = max(src_align, dst_align);
+ dst_reg->var_off = tnum_and(dst_reg->var_off, src_reg.var_off);
+ dst_reg->umin_value = dst_reg->var_off.value;
+ dst_reg->umax_value = min(dst_reg->umax_value, umax_val);
+ if (dst_reg->smin_value < 0 || smin_val < 0) {
+ /* Lose signed bounds when ANDing negative numbers,
+ * ain't nobody got time for that.
+ */
+ dst_reg->smin_value = S64_MIN;
+ dst_reg->smax_value = S64_MAX;
+ } else {
+ /* ANDing two positives gives a positive, so safe to
+ * cast result into s64.
+ */
+ dst_reg->smin_value = dst_reg->umin_value;
+ dst_reg->smax_value = dst_reg->umax_value;
+ }
+ /* We may learn something more from the var_off */
+ __update_reg_bounds(dst_reg);
break;
- case BPF_LSH:
- /* Gotta have special overflow logic here, if we're shifting
- * more than MAX_RANGE then just assume we have an invalid
- * range.
+ case BPF_OR:
+ if (src_known && dst_known) {
+ __mark_reg_known(dst_reg, dst_reg->var_off.value |
+ src_reg.var_off.value);
+ break;
+ }
+ /* We get our maximum from the var_off, and our minimum is the
+ * maximum of the operands' minima
*/
- if (min_val > ilog2(BPF_REGISTER_MAX_RANGE)) {
- dst_reg->min_value = BPF_REGISTER_MIN_RANGE;
- dst_reg->min_align = 1;
+ dst_reg->var_off = tnum_or(dst_reg->var_off, src_reg.var_off);
+ dst_reg->umin_value = max(dst_reg->umin_value, umin_val);
+ dst_reg->umax_value = dst_reg->var_off.value |
+ dst_reg->var_off.mask;
+ if (dst_reg->smin_value < 0 || smin_val < 0) {
+ /* Lose signed bounds when ORing negative numbers,
+ * ain't nobody got time for that.
+ */
+ dst_reg->smin_value = S64_MIN;
+ dst_reg->smax_value = S64_MAX;
} else {
- if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE)
- dst_reg->min_value <<= min_val;
- if (!dst_reg->min_align)
- dst_reg->min_align = 1;
- dst_reg->min_align <<= min_val;
- }
- if (max_val > ilog2(BPF_REGISTER_MAX_RANGE))
- dst_reg->max_value = BPF_REGISTER_MAX_RANGE;
- else if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE)
- dst_reg->max_value <<= max_val;
+ /* ORing two positives gives a positive, so safe to
+ * cast result into s64.
+ */
+ dst_reg->smin_value = dst_reg->umin_value;
+ dst_reg->smax_value = dst_reg->umax_value;
+ }
+ /* We may learn something more from the var_off */
+ __update_reg_bounds(dst_reg);
break;
- case BPF_RSH:
- /* RSH by a negative number is undefined, and the BPF_RSH is an
- * unsigned shift, so make the appropriate casts.
+ case BPF_LSH:
+ if (umax_val > 63) {
+ /* Shifts greater than 63 are undefined. This includes
+ * shifts by a negative number.
+ */
+ mark_reg_unknown(regs, insn->dst_reg);
+ break;
+ }
+ /* We lose all sign bit information (except what we can pick
+ * up from var_off)
*/
- if (min_val < 0 || dst_reg->min_value < 0) {
- dst_reg->min_value = BPF_REGISTER_MIN_RANGE;
+ dst_reg->smin_value = S64_MIN;
+ dst_reg->smax_value = S64_MAX;
+ /* If we might shift our top bit out, then we know nothing */
+ if (dst_reg->umax_value > 1ULL << (63 - umax_val)) {
+ dst_reg->umin_value = 0;
+ dst_reg->umax_value = U64_MAX;
} else {
- dst_reg->min_value =
- (u64)(dst_reg->min_value) >> min_val;
+ dst_reg->umin_value <<= umin_val;
+ dst_reg->umax_value <<= umax_val;
+ }
+ if (src_known)
+ dst_reg->var_off = tnum_lshift(dst_reg->var_off, umin_val);
+ else
+ dst_reg->var_off = tnum_lshift(tnum_unknown, umin_val);
+ /* We may learn something more from the var_off */
+ __update_reg_bounds(dst_reg);
+ break;
+ case BPF_RSH:
+ if (umax_val > 63) {
+ /* Shifts greater than 63 are undefined. This includes
+ * shifts by a negative number.
+ */
+ mark_reg_unknown(regs, insn->dst_reg);
+ break;
}
- if (min_val < 0) {
- dst_reg->min_align = 1;
+ /* BPF_RSH is an unsigned shift, so make the appropriate casts */
+ if (dst_reg->smin_value < 0) {
+ if (umin_val) {
+ /* Sign bit will be cleared */
+ dst_reg->smin_value = 0;
+ } else {
+ /* Lost sign bit information */
+ dst_reg->smin_value = S64_MIN;
+ dst_reg->smax_value = S64_MAX;
+ }
} else {
- dst_reg->min_align >>= (u64) min_val;
- if (!dst_reg->min_align)
- dst_reg->min_align = 1;
+ dst_reg->smin_value =
+ (u64)(dst_reg->smin_value) >> umax_val;
}
- if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE)
- dst_reg->max_value >>= max_val;
+ if (src_known)
+ dst_reg->var_off = tnum_rshift(dst_reg->var_off,
+ umin_val);
+ else
+ dst_reg->var_off = tnum_rshift(tnum_unknown, umin_val);
+ dst_reg->umin_value >>= umax_val;
+ dst_reg->umax_value >>= umin_val;
+ /* We may learn something more from the var_off */
+ __update_reg_bounds(dst_reg);
break;
default:
- reset_reg_range_values(regs, insn->dst_reg);
+ mark_reg_unknown(regs, insn->dst_reg);
break;
}
- check_reg_overflow(dst_reg);
+ __reg_deduce_bounds(dst_reg);
+ __reg_bound_offset(dst_reg);
+ return 0;
+}
+
+/* Handles ALU ops other than BPF_END, BPF_NEG and BPF_MOV: computes new min/max
+ * and var_off.
+ */
+static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
+ struct bpf_insn *insn)
+{
+ struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg, *src_reg;
+ struct bpf_reg_state *ptr_reg = NULL, off_reg = {0};
+ u8 opcode = BPF_OP(insn->code);
+ int rc;
+
+ dst_reg = &regs[insn->dst_reg];
+ src_reg = NULL;
+ if (dst_reg->type != SCALAR_VALUE)
+ ptr_reg = dst_reg;
+ if (BPF_SRC(insn->code) == BPF_X) {
+ src_reg = &regs[insn->src_reg];
+ if (src_reg->type != SCALAR_VALUE) {
+ if (dst_reg->type != SCALAR_VALUE) {
+ /* Combining two pointers by any ALU op yields
+ * an arbitrary scalar.
+ */
+ if (!env->allow_ptr_leaks) {
+ verbose("R%d pointer %s pointer prohibited\n",
+ insn->dst_reg,
+ bpf_alu_string[opcode >> 4]);
+ return -EACCES;
+ }
+ mark_reg_unknown(regs, insn->dst_reg);
+ return 0;
+ } else {
+ /* scalar += pointer
+ * This is legal, but we have to reverse our
+ * src/dest handling in computing the range
+ */
+ rc = adjust_ptr_min_max_vals(env, insn,
+ src_reg, dst_reg);
+ if (rc == -EACCES && env->allow_ptr_leaks) {
+ /* scalar += unknown scalar */
+ __mark_reg_unknown(&off_reg);
+ return adjust_scalar_min_max_vals(
+ env, insn,
+ dst_reg, off_reg);
+ }
+ return rc;
+ }
+ } else if (ptr_reg) {
+ /* pointer += scalar */
+ rc = adjust_ptr_min_max_vals(env, insn,
+ dst_reg, src_reg);
+ if (rc == -EACCES && env->allow_ptr_leaks) {
+ /* unknown scalar += scalar */
+ __mark_reg_unknown(dst_reg);
+ return adjust_scalar_min_max_vals(
+ env, insn, dst_reg, *src_reg);
+ }
+ return rc;
+ }
+ } else {
+ /* Pretend the src is a reg with a known value, since we only
+ * need to be able to read from this state.
+ */
+ off_reg.type = SCALAR_VALUE;
+ __mark_reg_known(&off_reg, insn->imm);
+ src_reg = &off_reg;
+ if (ptr_reg) { /* pointer += K */
+ rc = adjust_ptr_min_max_vals(env, insn,
+ ptr_reg, src_reg);
+ if (rc == -EACCES && env->allow_ptr_leaks) {
+ /* unknown scalar += K */
+ __mark_reg_unknown(dst_reg);
+ return adjust_scalar_min_max_vals(
+ env, insn, dst_reg, off_reg);
+ }
+ return rc;
+ }
+ }
+
+ /* Got here implies adding two SCALAR_VALUEs */
+ if (WARN_ON_ONCE(ptr_reg)) {
+ print_verifier_state(&env->cur_state);
+ verbose("verifier internal error: unexpected ptr_reg\n");
+ return -EINVAL;
+ }
+ if (WARN_ON(!src_reg)) {
+ print_verifier_state(&env->cur_state);
+ verbose("verifier internal error: no src_reg\n");
+ return -EINVAL;
+ }
+ return adjust_scalar_min_max_vals(env, insn, dst_reg, *src_reg);
}
/* check validity of 32-bit and 64-bit arithmetic operations */
static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
{
- struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg;
+ struct bpf_reg_state *regs = env->cur_state.regs;
u8 opcode = BPF_OP(insn->code);
int err;
@@ -1976,7 +2299,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
}
/* check src operand */
- err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
+ err = check_reg_arg(env, insn->dst_reg, SRC_OP);
if (err)
return err;
@@ -1987,7 +2310,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
}
/* check dest operand */
- err = check_reg_arg(regs, insn->dst_reg, DST_OP);
+ err = check_reg_arg(env, insn->dst_reg, DST_OP);
if (err)
return err;
@@ -2000,7 +2323,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
}
/* check src operand */
- err = check_reg_arg(regs, insn->src_reg, SRC_OP);
+ err = check_reg_arg(env, insn->src_reg, SRC_OP);
if (err)
return err;
} else {
@@ -2011,15 +2334,10 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
}
/* check dest operand */
- err = check_reg_arg(regs, insn->dst_reg, DST_OP);
+ err = check_reg_arg(env, insn->dst_reg, DST_OP);
if (err)
return err;
- /* we are setting our register to something new, we need to
- * reset its range values.
- */
- reset_reg_range_values(regs, insn->dst_reg);
-
if (BPF_SRC(insn->code) == BPF_X) {
if (BPF_CLASS(insn->code) == BPF_ALU64) {
/* case: R1 = R2
@@ -2027,24 +2345,24 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
*/
regs[insn->dst_reg] = regs[insn->src_reg];
} else {
+ /* R1 = (u32) R2 */
if (is_pointer_value(env, insn->src_reg)) {
verbose("R%d partial copy of pointer\n",
insn->src_reg);
return -EACCES;
}
- mark_reg_unknown_value(regs, insn->dst_reg);
+ mark_reg_unknown(regs, insn->dst_reg);
+ /* high 32 bits are known zero. */
+ regs[insn->dst_reg].var_off = tnum_cast(
+ regs[insn->dst_reg].var_off, 4);
+ __update_reg_bounds(&regs[insn->dst_reg]);
}
} else {
/* case: R = imm
* remember the value we stored into this reg
*/
- regs[insn->dst_reg].type = CONST_IMM;
- regs[insn->dst_reg].imm = insn->imm;
- regs[insn->dst_reg].id = 0;
- regs[insn->dst_reg].max_value = insn->imm;
- regs[insn->dst_reg].min_value = insn->imm;
- regs[insn->dst_reg].min_align = calc_align(insn->imm);
- regs[insn->dst_reg].value_from_signed = false;
+ regs[insn->dst_reg].type = SCALAR_VALUE;
+ __mark_reg_known(regs + insn->dst_reg, insn->imm);
}
} else if (opcode > BPF_END) {
@@ -2059,7 +2377,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
return -EINVAL;
}
/* check src1 operand */
- err = check_reg_arg(regs, insn->src_reg, SRC_OP);
+ err = check_reg_arg(env, insn->src_reg, SRC_OP);
if (err)
return err;
} else {
@@ -2070,7 +2388,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
}
/* check src2 operand */
- err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
+ err = check_reg_arg(env, insn->dst_reg, SRC_OP);
if (err)
return err;
@@ -2091,72 +2409,11 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
}
/* check dest operand */
- err = check_reg_arg(regs, insn->dst_reg, DST_OP_NO_MARK);
+ err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK);
if (err)
return err;
- dst_reg = &regs[insn->dst_reg];
-
- /* first we want to adjust our ranges. */
- adjust_reg_min_max_vals(env, insn);
-
- /* pattern match 'bpf_add Rx, imm' instruction */
- if (opcode == BPF_ADD && BPF_CLASS(insn->code) == BPF_ALU64 &&
- dst_reg->type == FRAME_PTR && BPF_SRC(insn->code) == BPF_K) {
- dst_reg->type = PTR_TO_STACK;
- dst_reg->imm = insn->imm;
- return 0;
- } else if (opcode == BPF_ADD &&
- BPF_CLASS(insn->code) == BPF_ALU64 &&
- dst_reg->type == PTR_TO_STACK &&
- ((BPF_SRC(insn->code) == BPF_X &&
- regs[insn->src_reg].type == CONST_IMM) ||
- BPF_SRC(insn->code) == BPF_K)) {
- if (BPF_SRC(insn->code) == BPF_X)
- dst_reg->imm += regs[insn->src_reg].imm;
- else
- dst_reg->imm += insn->imm;
- return 0;
- } else if (opcode == BPF_ADD &&
- BPF_CLASS(insn->code) == BPF_ALU64 &&
- (dst_reg->type == PTR_TO_PACKET ||
- (BPF_SRC(insn->code) == BPF_X &&
- regs[insn->src_reg].type == PTR_TO_PACKET))) {
- /* ptr_to_packet += K|X */
- return check_packet_ptr_add(env, insn);
- } else if (BPF_CLASS(insn->code) == BPF_ALU64 &&
- dst_reg->type == UNKNOWN_VALUE &&
- env->allow_ptr_leaks) {
- /* unknown += K|X */
- return evaluate_reg_alu(env, insn);
- } else if (BPF_CLASS(insn->code) == BPF_ALU64 &&
- dst_reg->type == CONST_IMM &&
- env->allow_ptr_leaks) {
- /* reg_imm += K|X */
- return evaluate_reg_imm_alu(env, insn);
- } else if (is_pointer_value(env, insn->dst_reg)) {
- verbose("R%d pointer arithmetic prohibited\n",
- insn->dst_reg);
- return -EACCES;
- } else if (BPF_SRC(insn->code) == BPF_X &&
- is_pointer_value(env, insn->src_reg)) {
- verbose("R%d pointer arithmetic prohibited\n",
- insn->src_reg);
- return -EACCES;
- }
-
- /* If we did pointer math on a map value then just set it to our
- * PTR_TO_MAP_VALUE_ADJ type so we can deal with any stores or
- * loads to this register appropriately, otherwise just mark the
- * register as unknown.
- */
- if (env->allow_ptr_leaks &&
- BPF_CLASS(insn->code) == BPF_ALU64 && opcode == BPF_ADD &&
- (dst_reg->type == PTR_TO_MAP_VALUE ||
- dst_reg->type == PTR_TO_MAP_VALUE_ADJ))
- dst_reg->type = PTR_TO_MAP_VALUE_ADJ;
- else
- mark_reg_unknown_value(regs, insn->dst_reg);
+ return adjust_reg_min_max_vals(env, insn);
}
return 0;
@@ -2168,27 +2425,48 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state,
struct bpf_reg_state *regs = state->regs, *reg;
int i;
- /* LLVM can generate two kind of checks:
+ if (dst_reg->off < 0)
+ /* This doesn't give us any range */
+ return;
+
+ if (dst_reg->umax_value > MAX_PACKET_OFF ||
+ dst_reg->umax_value + dst_reg->off > MAX_PACKET_OFF)
+ /* Risk of overflow. For instance, ptr + (1<<63) may be less
+ * than pkt_end, but that's because it's also less than pkt.
+ */
+ return;
+
+ /* LLVM can generate four kind of checks:
*
- * Type 1:
+ * Type 1/2:
*
* r2 = r3;
* r2 += 8;
* if (r2 > pkt_end) goto <handle exception>
* <access okay>
*
+ * r2 = r3;
+ * r2 += 8;
+ * if (r2 < pkt_end) goto <access okay>
+ * <handle exception>
+ *
* Where:
* r2 == dst_reg, pkt_end == src_reg
* r2=pkt(id=n,off=8,r=0)
* r3=pkt(id=n,off=0,r=0)
*
- * Type 2:
+ * Type 3/4:
*
* r2 = r3;
* r2 += 8;
* if (pkt_end >= r2) goto <access okay>
* <handle exception>
*
+ * r2 = r3;
+ * r2 += 8;
+ * if (pkt_end <= r2) goto <handle exception>
+ * <access okay>
+ *
* Where:
* pkt_end == dst_reg, r2 == src_reg
* r2=pkt(id=n,off=8,r=0)
@@ -2198,193 +2476,247 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state,
* so that range of bytes [r3, r3 + 8) is safe to access.
*/
+ /* If our ids match, then we must have the same max_value. And we
+ * don't care about the other reg's fixed offset, since if it's too big
+ * the range won't allow anything.
+ * dst_reg->off is known < MAX_PACKET_OFF, therefore it fits in a u16.
+ */
for (i = 0; i < MAX_BPF_REG; i++)
if (regs[i].type == PTR_TO_PACKET && regs[i].id == dst_reg->id)
/* keep the maximum range already checked */
- regs[i].range = max(regs[i].range, dst_reg->off);
+ regs[i].range = max_t(u16, regs[i].range, dst_reg->off);
for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
if (state->stack_slot_type[i] != STACK_SPILL)
continue;
reg = &state->spilled_regs[i / BPF_REG_SIZE];
if (reg->type == PTR_TO_PACKET && reg->id == dst_reg->id)
- reg->range = max(reg->range, dst_reg->off);
+ reg->range = max_t(u16, reg->range, dst_reg->off);
}
}
/* Adjusts the register min/max values in the case that the dst_reg is the
* variable register that we are working on, and src_reg is a constant or we're
* simply doing a BPF_K check.
+ * In JEQ/JNE cases we also adjust the var_off values.
*/
static void reg_set_min_max(struct bpf_reg_state *true_reg,
struct bpf_reg_state *false_reg, u64 val,
u8 opcode)
{
- bool value_from_signed = true;
- bool is_range = true;
+ /* If the dst_reg is a pointer, we can't learn anything about its
+ * variable offset from the compare (unless src_reg were a pointer into
+ * the same object, but we don't bother with that.
+ * Since false_reg and true_reg have the same type by construction, we
+ * only need to check one of them for pointerness.
+ */
+ if (__is_pointer_value(false, false_reg))
+ return;
switch (opcode) {
case BPF_JEQ:
/* If this is false then we know nothing Jon Snow, but if it is
* true then we know for sure.
*/
- true_reg->max_value = true_reg->min_value = val;
- is_range = false;
+ __mark_reg_known(true_reg, val);
break;
case BPF_JNE:
/* If this is true we know nothing Jon Snow, but if it is false
* we know the value for sure;
*/
- false_reg->max_value = false_reg->min_value = val;
- is_range = false;
+ __mark_reg_known(false_reg, val);
break;
case BPF_JGT:
- value_from_signed = false;
- /* fallthrough */
+ false_reg->umax_value = min(false_reg->umax_value, val);
+ true_reg->umin_value = max(true_reg->umin_value, val + 1);
+ break;
case BPF_JSGT:
- if (true_reg->value_from_signed != value_from_signed)
- reset_reg_range_values(true_reg, 0);
- if (false_reg->value_from_signed != value_from_signed)
- reset_reg_range_values(false_reg, 0);
- if (opcode == BPF_JGT) {
- /* Unsigned comparison, the minimum value is 0. */
- false_reg->min_value = 0;
- }
- /* If this is false then we know the maximum val is val,
- * otherwise we know the min val is val+1.
- */
- false_reg->max_value = val;
- false_reg->value_from_signed = value_from_signed;
- true_reg->min_value = val + 1;
- true_reg->value_from_signed = value_from_signed;
+ false_reg->smax_value = min_t(s64, false_reg->smax_value, val);
+ true_reg->smin_value = max_t(s64, true_reg->smin_value, val + 1);
+ break;
+ case BPF_JLT:
+ false_reg->umin_value = max(false_reg->umin_value, val);
+ true_reg->umax_value = min(true_reg->umax_value, val - 1);
+ break;
+ case BPF_JSLT:
+ false_reg->smin_value = max_t(s64, false_reg->smin_value, val);
+ true_reg->smax_value = min_t(s64, true_reg->smax_value, val - 1);
break;
case BPF_JGE:
- value_from_signed = false;
- /* fallthrough */
+ false_reg->umax_value = min(false_reg->umax_value, val - 1);
+ true_reg->umin_value = max(true_reg->umin_value, val);
+ break;
case BPF_JSGE:
- if (true_reg->value_from_signed != value_from_signed)
- reset_reg_range_values(true_reg, 0);
- if (false_reg->value_from_signed != value_from_signed)
- reset_reg_range_values(false_reg, 0);
- if (opcode == BPF_JGE) {
- /* Unsigned comparison, the minimum value is 0. */
- false_reg->min_value = 0;
- }
- /* If this is false then we know the maximum value is val - 1,
- * otherwise we know the mimimum value is val.
- */
- false_reg->max_value = val - 1;
- false_reg->value_from_signed = value_from_signed;
- true_reg->min_value = val;
- true_reg->value_from_signed = value_from_signed;
+ false_reg->smax_value = min_t(s64, false_reg->smax_value, val - 1);
+ true_reg->smin_value = max_t(s64, true_reg->smin_value, val);
+ break;
+ case BPF_JLE:
+ false_reg->umin_value = max(false_reg->umin_value, val + 1);
+ true_reg->umax_value = min(true_reg->umax_value, val);
+ break;
+ case BPF_JSLE:
+ false_reg->smin_value = max_t(s64, false_reg->smin_value, val + 1);
+ true_reg->smax_value = min_t(s64, true_reg->smax_value, val);
break;
default:
break;
}
- check_reg_overflow(false_reg);
- check_reg_overflow(true_reg);
- if (is_range) {
- if (__is_pointer_value(false, false_reg))
- reset_reg_range_values(false_reg, 0);
- if (__is_pointer_value(false, true_reg))
- reset_reg_range_values(true_reg, 0);
- }
+ __reg_deduce_bounds(false_reg);
+ __reg_deduce_bounds(true_reg);
+ /* We might have learned some bits from the bounds. */
+ __reg_bound_offset(false_reg);
+ __reg_bound_offset(true_reg);
+ /* Intersecting with the old var_off might have improved our bounds
+ * slightly. e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc),
+ * then new var_off is (0; 0x7f...fc) which improves our umax.
+ */
+ __update_reg_bounds(false_reg);
+ __update_reg_bounds(true_reg);
}
-/* Same as above, but for the case that dst_reg is a CONST_IMM reg and src_reg
- * is the variable reg.
+/* Same as above, but for the case that dst_reg holds a constant and src_reg is
+ * the variable reg.
*/
static void reg_set_min_max_inv(struct bpf_reg_state *true_reg,
struct bpf_reg_state *false_reg, u64 val,
u8 opcode)
{
- bool value_from_signed = true;
- bool is_range = true;
+ if (__is_pointer_value(false, false_reg))
+ return;
switch (opcode) {
case BPF_JEQ:
/* If this is false then we know nothing Jon Snow, but if it is
* true then we know for sure.
*/
- true_reg->max_value = true_reg->min_value = val;
- is_range = false;
+ __mark_reg_known(true_reg, val);
break;
case BPF_JNE:
/* If this is true we know nothing Jon Snow, but if it is false
* we know the value for sure;
*/
- false_reg->max_value = false_reg->min_value = val;
- is_range = false;
+ __mark_reg_known(false_reg, val);
break;
case BPF_JGT:
- value_from_signed = false;
- /* fallthrough */
+ true_reg->umax_value = min(true_reg->umax_value, val - 1);
+ false_reg->umin_value = max(false_reg->umin_value, val);
+ break;
case BPF_JSGT:
- if (true_reg->value_from_signed != value_from_signed)
- reset_reg_range_values(true_reg, 0);
- if (false_reg->value_from_signed != value_from_signed)
- reset_reg_range_values(false_reg, 0);
- if (opcode == BPF_JGT) {
- /* Unsigned comparison, the minimum value is 0. */
- true_reg->min_value = 0;
- }
- /*
- * If this is false, then the val is <= the register, if it is
- * true the register <= to the val.
- */
- false_reg->min_value = val;
- false_reg->value_from_signed = value_from_signed;
- true_reg->max_value = val - 1;
- true_reg->value_from_signed = value_from_signed;
+ true_reg->smax_value = min_t(s64, true_reg->smax_value, val - 1);
+ false_reg->smin_value = max_t(s64, false_reg->smin_value, val);
+ break;
+ case BPF_JLT:
+ true_reg->umin_value = max(true_reg->umin_value, val + 1);
+ false_reg->umax_value = min(false_reg->umax_value, val);
+ break;
+ case BPF_JSLT:
+ true_reg->smin_value = max_t(s64, true_reg->smin_value, val + 1);
+ false_reg->smax_value = min_t(s64, false_reg->smax_value, val);
break;
case BPF_JGE:
- value_from_signed = false;
- /* fallthrough */
+ true_reg->umax_value = min(true_reg->umax_value, val);
+ false_reg->umin_value = max(false_reg->umin_value, val + 1);
+ break;
case BPF_JSGE:
- if (true_reg->value_from_signed != value_from_signed)
- reset_reg_range_values(true_reg, 0);
- if (false_reg->value_from_signed != value_from_signed)
- reset_reg_range_values(false_reg, 0);
- if (opcode == BPF_JGE) {
- /* Unsigned comparison, the minimum value is 0. */
- true_reg->min_value = 0;
- }
- /* If this is false then constant < register, if it is true then
- * the register < constant.
- */
- false_reg->min_value = val + 1;
- false_reg->value_from_signed = value_from_signed;
- true_reg->max_value = val;
- true_reg->value_from_signed = value_from_signed;
+ true_reg->smax_value = min_t(s64, true_reg->smax_value, val);
+ false_reg->smin_value = max_t(s64, false_reg->smin_value, val + 1);
+ break;
+ case BPF_JLE:
+ true_reg->umin_value = max(true_reg->umin_value, val);
+ false_reg->umax_value = min(false_reg->umax_value, val - 1);
+ break;
+ case BPF_JSLE:
+ true_reg->smin_value = max_t(s64, true_reg->smin_value, val);
+ false_reg->smax_value = min_t(s64, false_reg->smax_value, val - 1);
break;
default:
break;
}
- check_reg_overflow(false_reg);
- check_reg_overflow(true_reg);
- if (is_range) {
- if (__is_pointer_value(false, false_reg))
- reset_reg_range_values(false_reg, 0);
- if (__is_pointer_value(false, true_reg))
- reset_reg_range_values(true_reg, 0);
+ __reg_deduce_bounds(false_reg);
+ __reg_deduce_bounds(true_reg);
+ /* We might have learned some bits from the bounds. */
+ __reg_bound_offset(false_reg);
+ __reg_bound_offset(true_reg);
+ /* Intersecting with the old var_off might have improved our bounds
+ * slightly. e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc),
+ * then new var_off is (0; 0x7f...fc) which improves our umax.
+ */
+ __update_reg_bounds(false_reg);
+ __update_reg_bounds(true_reg);
+}
+
+/* Regs are known to be equal, so intersect their min/max/var_off */
+static void __reg_combine_min_max(struct bpf_reg_state *src_reg,
+ struct bpf_reg_state *dst_reg)
+{
+ src_reg->umin_value = dst_reg->umin_value = max(src_reg->umin_value,
+ dst_reg->umin_value);
+ src_reg->umax_value = dst_reg->umax_value = min(src_reg->umax_value,
+ dst_reg->umax_value);
+ src_reg->smin_value = dst_reg->smin_value = max(src_reg->smin_value,
+ dst_reg->smin_value);
+ src_reg->smax_value = dst_reg->smax_value = min(src_reg->smax_value,
+ dst_reg->smax_value);
+ src_reg->var_off = dst_reg->var_off = tnum_intersect(src_reg->var_off,
+ dst_reg->var_off);
+ /* We might have learned new bounds from the var_off. */
+ __update_reg_bounds(src_reg);
+ __update_reg_bounds(dst_reg);
+ /* We might have learned something about the sign bit. */
+ __reg_deduce_bounds(src_reg);
+ __reg_deduce_bounds(dst_reg);
+ /* We might have learned some bits from the bounds. */
+ __reg_bound_offset(src_reg);
+ __reg_bound_offset(dst_reg);
+ /* Intersecting with the old var_off might have improved our bounds
+ * slightly. e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc),
+ * then new var_off is (0; 0x7f...fc) which improves our umax.
+ */
+ __update_reg_bounds(src_reg);
+ __update_reg_bounds(dst_reg);
+}
+
+static void reg_combine_min_max(struct bpf_reg_state *true_src,
+ struct bpf_reg_state *true_dst,
+ struct bpf_reg_state *false_src,
+ struct bpf_reg_state *false_dst,
+ u8 opcode)
+{
+ switch (opcode) {
+ case BPF_JEQ:
+ __reg_combine_min_max(true_src, true_dst);
+ break;
+ case BPF_JNE:
+ __reg_combine_min_max(false_src, false_dst);
+ break;
}
}
static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id,
- enum bpf_reg_type type)
+ bool is_null)
{
struct bpf_reg_state *reg = &regs[regno];
if (reg->type == PTR_TO_MAP_VALUE_OR_NULL && reg->id == id) {
- if (type == UNKNOWN_VALUE) {
- __mark_reg_unknown_value(regs, regno);
+ /* Old offset (both fixed and variable parts) should
+ * have been known-zero, because we don't allow pointer
+ * arithmetic on pointers that might be NULL.
+ */
+ if (WARN_ON_ONCE(reg->smin_value || reg->smax_value ||
+ !tnum_equals_const(reg->var_off, 0) ||
+ reg->off)) {
+ __mark_reg_known_zero(reg);
+ reg->off = 0;
+ }
+ if (is_null) {
+ reg->type = SCALAR_VALUE;
} else if (reg->map_ptr->inner_map_meta) {
reg->type = CONST_PTR_TO_MAP;
reg->map_ptr = reg->map_ptr->inner_map_meta;
} else {
- reg->type = type;
+ reg->type = PTR_TO_MAP_VALUE;
}
/* We don't need id from this point onwards anymore, thus we
* should better reset it, so that state pruning has chances
@@ -2398,19 +2730,19 @@ static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id,
* be folded together at some point.
*/
static void mark_map_regs(struct bpf_verifier_state *state, u32 regno,
- enum bpf_reg_type type)
+ bool is_null)
{
struct bpf_reg_state *regs = state->regs;
u32 id = regs[regno].id;
int i;
for (i = 0; i < MAX_BPF_REG; i++)
- mark_map_reg(regs, i, id, type);
+ mark_map_reg(regs, i, id, is_null);
for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
if (state->stack_slot_type[i] != STACK_SPILL)
continue;
- mark_map_reg(state->spilled_regs, i / BPF_REG_SIZE, id, type);
+ mark_map_reg(state->spilled_regs, i / BPF_REG_SIZE, id, is_null);
}
}
@@ -2422,7 +2754,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
u8 opcode = BPF_OP(insn->code);
int err;
- if (opcode > BPF_EXIT) {
+ if (opcode > BPF_JSLE) {
verbose("invalid BPF_JMP opcode %x\n", opcode);
return -EINVAL;
}
@@ -2434,7 +2766,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
}
/* check src1 operand */
- err = check_reg_arg(regs, insn->src_reg, SRC_OP);
+ err = check_reg_arg(env, insn->src_reg, SRC_OP);
if (err)
return err;
@@ -2451,7 +2783,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
}
/* check src2 operand */
- err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
+ err = check_reg_arg(env, insn->dst_reg, SRC_OP);
if (err)
return err;
@@ -2460,7 +2792,8 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
/* detect if R == 0 where R was initialized to zero earlier */
if (BPF_SRC(insn->code) == BPF_K &&
(opcode == BPF_JEQ || opcode == BPF_JNE) &&
- dst_reg->type == CONST_IMM && dst_reg->imm == insn->imm) {
+ dst_reg->type == SCALAR_VALUE &&
+ tnum_equals_const(dst_reg->var_off, insn->imm)) {
if (opcode == BPF_JEQ) {
/* if (imm == imm) goto pc+off;
* only follow the goto, ignore fall-through
@@ -2482,17 +2815,30 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
/* detect if we are comparing against a constant value so we can adjust
* our min/max values for our dst register.
+ * this is only legit if both are scalars (or pointers to the same
+ * object, I suppose, but we don't support that right now), because
+ * otherwise the different base pointers mean the offsets aren't
+ * comparable.
*/
if (BPF_SRC(insn->code) == BPF_X) {
- if (regs[insn->src_reg].type == CONST_IMM)
- reg_set_min_max(&other_branch->regs[insn->dst_reg],
- dst_reg, regs[insn->src_reg].imm,
- opcode);
- else if (dst_reg->type == CONST_IMM)
- reg_set_min_max_inv(&other_branch->regs[insn->src_reg],
- &regs[insn->src_reg], dst_reg->imm,
- opcode);
- } else {
+ if (dst_reg->type == SCALAR_VALUE &&
+ regs[insn->src_reg].type == SCALAR_VALUE) {
+ if (tnum_is_const(regs[insn->src_reg].var_off))
+ reg_set_min_max(&other_branch->regs[insn->dst_reg],
+ dst_reg, regs[insn->src_reg].var_off.value,
+ opcode);
+ else if (tnum_is_const(dst_reg->var_off))
+ reg_set_min_max_inv(&other_branch->regs[insn->src_reg],
+ &regs[insn->src_reg],
+ dst_reg->var_off.value, opcode);
+ else if (opcode == BPF_JEQ || opcode == BPF_JNE)
+ /* Comparing for equality, we can combine knowledge */
+ reg_combine_min_max(&other_branch->regs[insn->src_reg],
+ &other_branch->regs[insn->dst_reg],
+ &regs[insn->src_reg],
+ &regs[insn->dst_reg], opcode);
+ }
+ } else if (dst_reg->type == SCALAR_VALUE) {
reg_set_min_max(&other_branch->regs[insn->dst_reg],
dst_reg, insn->imm, opcode);
}
@@ -2504,18 +2850,24 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
/* Mark all identical map registers in each branch as either
* safe or unknown depending R == 0 or R != 0 conditional.
*/
- mark_map_regs(this_branch, insn->dst_reg,
- opcode == BPF_JEQ ? PTR_TO_MAP_VALUE : UNKNOWN_VALUE);
- mark_map_regs(other_branch, insn->dst_reg,
- opcode == BPF_JEQ ? UNKNOWN_VALUE : PTR_TO_MAP_VALUE);
+ mark_map_regs(this_branch, insn->dst_reg, opcode == BPF_JNE);
+ mark_map_regs(other_branch, insn->dst_reg, opcode == BPF_JEQ);
} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT &&
dst_reg->type == PTR_TO_PACKET &&
regs[insn->src_reg].type == PTR_TO_PACKET_END) {
find_good_pkt_pointers(this_branch, dst_reg);
+ } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLT &&
+ dst_reg->type == PTR_TO_PACKET &&
+ regs[insn->src_reg].type == PTR_TO_PACKET_END) {
+ find_good_pkt_pointers(other_branch, dst_reg);
} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE &&
dst_reg->type == PTR_TO_PACKET_END &&
regs[insn->src_reg].type == PTR_TO_PACKET) {
find_good_pkt_pointers(other_branch, &regs[insn->src_reg]);
+ } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLE &&
+ dst_reg->type == PTR_TO_PACKET_END &&
+ regs[insn->src_reg].type == PTR_TO_PACKET) {
+ find_good_pkt_pointers(this_branch, &regs[insn->src_reg]);
} else if (is_pointer_value(env, insn->dst_reg)) {
verbose("R%d pointer comparison prohibited\n", insn->dst_reg);
return -EACCES;
@@ -2548,16 +2900,15 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
return -EINVAL;
}
- err = check_reg_arg(regs, insn->dst_reg, DST_OP);
+ err = check_reg_arg(env, insn->dst_reg, DST_OP);
if (err)
return err;
if (insn->src_reg == 0) {
u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm;
- regs[insn->dst_reg].type = CONST_IMM;
- regs[insn->dst_reg].imm = imm;
- regs[insn->dst_reg].id = 0;
+ regs[insn->dst_reg].type = SCALAR_VALUE;
+ __mark_reg_known(&regs[insn->dst_reg], imm);
return 0;
}
@@ -2615,7 +2966,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
}
/* check whether implicit source operand (register R6) is readable */
- err = check_reg_arg(regs, BPF_REG_6, SRC_OP);
+ err = check_reg_arg(env, BPF_REG_6, SRC_OP);
if (err)
return err;
@@ -2626,19 +2977,22 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
if (mode == BPF_IND) {
/* check explicit source operand */
- err = check_reg_arg(regs, insn->src_reg, SRC_OP);
+ err = check_reg_arg(env, insn->src_reg, SRC_OP);
if (err)
return err;
}
/* reset caller saved regs to unreadable */
- for (i = 0; i < CALLER_SAVED_REGS; i++)
+ for (i = 0; i < CALLER_SAVED_REGS; i++) {
mark_reg_not_init(regs, caller_saved[i]);
+ check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
+ }
/* mark destination R0 register as readable, since it contains
- * the value fetched from the packet
+ * the value fetched from the packet.
+ * Already marked as written above.
*/
- regs[BPF_REG_0].type = UNKNOWN_VALUE;
+ mark_reg_unknown(regs, BPF_REG_0);
return 0;
}
@@ -2841,57 +3195,144 @@ err_free:
return ret;
}
-/* the following conditions reduce the number of explored insns
- * from ~140k to ~80k for ultra large programs that use a lot of ptr_to_packet
+/* check %cur's range satisfies %old's */
+static bool range_within(struct bpf_reg_state *old,
+ struct bpf_reg_state *cur)
+{
+ return old->umin_value <= cur->umin_value &&
+ old->umax_value >= cur->umax_value &&
+ old->smin_value <= cur->smin_value &&
+ old->smax_value >= cur->smax_value;
+}
+
+/* Maximum number of register states that can exist at once */
+#define ID_MAP_SIZE (MAX_BPF_REG + MAX_BPF_STACK / BPF_REG_SIZE)
+struct idpair {
+ u32 old;
+ u32 cur;
+};
+
+/* If in the old state two registers had the same id, then they need to have
+ * the same id in the new state as well. But that id could be different from
+ * the old state, so we need to track the mapping from old to new ids.
+ * Once we have seen that, say, a reg with old id 5 had new id 9, any subsequent
+ * regs with old id 5 must also have new id 9 for the new state to be safe. But
+ * regs with a different old id could still have new id 9, we don't care about
+ * that.
+ * So we look through our idmap to see if this old id has been seen before. If
+ * so, we require the new id to match; otherwise, we add the id pair to the map.
*/
-static bool compare_ptrs_to_packet(struct bpf_verifier_env *env,
- struct bpf_reg_state *old,
- struct bpf_reg_state *cur)
+static bool check_ids(u32 old_id, u32 cur_id, struct idpair *idmap)
{
- if (old->id != cur->id)
- return false;
+ unsigned int i;
+
+ for (i = 0; i < ID_MAP_SIZE; i++) {
+ if (!idmap[i].old) {
+ /* Reached an empty slot; haven't seen this id before */
+ idmap[i].old = old_id;
+ idmap[i].cur = cur_id;
+ return true;
+ }
+ if (idmap[i].old == old_id)
+ return idmap[i].cur == cur_id;
+ }
+ /* We ran out of idmap slots, which should be impossible */
+ WARN_ON_ONCE(1);
+ return false;
+}
- /* old ptr_to_packet is more conservative, since it allows smaller
- * range. Ex:
- * old(off=0,r=10) is equal to cur(off=0,r=20), because
- * old(off=0,r=10) means that with range=10 the verifier proceeded
- * further and found no issues with the program. Now we're in the same
- * spot with cur(off=0,r=20), so we're safe too, since anything further
- * will only be looking at most 10 bytes after this pointer.
- */
- if (old->off == cur->off && old->range < cur->range)
+/* Returns true if (rold safe implies rcur safe) */
+static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
+ struct idpair *idmap)
+{
+ if (!(rold->live & REG_LIVE_READ))
+ /* explored state didn't use this */
return true;
- /* old(off=20,r=10) is equal to cur(off=22,re=22 or 5 or 0)
- * since both cannot be used for packet access and safe(old)
- * pointer has smaller off that could be used for further
- * 'if (ptr > data_end)' check
- * Ex:
- * old(off=20,r=10) and cur(off=22,r=22) and cur(off=22,r=0) mean
- * that we cannot access the packet.
- * The safe range is:
- * [ptr, ptr + range - off)
- * so whenever off >=range, it means no safe bytes from this pointer.
- * When comparing old->off <= cur->off, it means that older code
- * went with smaller offset and that offset was later
- * used to figure out the safe range after 'if (ptr > data_end)' check
- * Say, 'old' state was explored like:
- * ... R3(off=0, r=0)
- * R4 = R3 + 20
- * ... now R4(off=20,r=0) <-- here
- * if (R4 > data_end)
- * ... R4(off=20,r=20), R3(off=0,r=20) and R3 can be used to access.
- * ... the code further went all the way to bpf_exit.
- * Now the 'cur' state at the mark 'here' has R4(off=30,r=0).
- * old_R4(off=20,r=0) equal to cur_R4(off=30,r=0), since if the verifier
- * goes further, such cur_R4 will give larger safe packet range after
- * 'if (R4 > data_end)' and all further insn were already good with r=20,
- * so they will be good with r=30 and we can prune the search.
- */
- if (!env->strict_alignment && old->off <= cur->off &&
- old->off >= old->range && cur->off >= cur->range)
+ if (memcmp(rold, rcur, offsetof(struct bpf_reg_state, live)) == 0)
return true;
+ if (rold->type == NOT_INIT)
+ /* explored state can't have used this */
+ return true;
+ if (rcur->type == NOT_INIT)
+ return false;
+ switch (rold->type) {
+ case SCALAR_VALUE:
+ if (rcur->type == SCALAR_VALUE) {
+ /* new val must satisfy old val knowledge */
+ return range_within(rold, rcur) &&
+ tnum_in(rold->var_off, rcur->var_off);
+ } else {
+ /* if we knew anything about the old value, we're not
+ * equal, because we can't know anything about the
+ * scalar value of the pointer in the new value.
+ */
+ return rold->umin_value == 0 &&
+ rold->umax_value == U64_MAX &&
+ rold->smin_value == S64_MIN &&
+ rold->smax_value == S64_MAX &&
+ tnum_is_unknown(rold->var_off);
+ }
+ case PTR_TO_MAP_VALUE:
+ /* If the new min/max/var_off satisfy the old ones and
+ * everything else matches, we are OK.
+ * We don't care about the 'id' value, because nothing
+ * uses it for PTR_TO_MAP_VALUE (only for ..._OR_NULL)
+ */
+ return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 &&
+ range_within(rold, rcur) &&
+ tnum_in(rold->var_off, rcur->var_off);
+ case PTR_TO_MAP_VALUE_OR_NULL:
+ /* a PTR_TO_MAP_VALUE could be safe to use as a
+ * PTR_TO_MAP_VALUE_OR_NULL into the same map.
+ * However, if the old PTR_TO_MAP_VALUE_OR_NULL then got NULL-
+ * checked, doing so could have affected others with the same
+ * id, and we can't check for that because we lost the id when
+ * we converted to a PTR_TO_MAP_VALUE.
+ */
+ if (rcur->type != PTR_TO_MAP_VALUE_OR_NULL)
+ return false;
+ if (memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)))
+ return false;
+ /* Check our ids match any regs they're supposed to */
+ return check_ids(rold->id, rcur->id, idmap);
+ case PTR_TO_PACKET:
+ if (rcur->type != PTR_TO_PACKET)
+ return false;
+ /* We must have at least as much range as the old ptr
+ * did, so that any accesses which were safe before are
+ * still safe. This is true even if old range < old off,
+ * since someone could have accessed through (ptr - k), or
+ * even done ptr -= k in a register, to get a safe access.
+ */
+ if (rold->range > rcur->range)
+ return false;
+ /* If the offsets don't match, we can't trust our alignment;
+ * nor can we be sure that we won't fall out of range.
+ */
+ if (rold->off != rcur->off)
+ return false;
+ /* id relations must be preserved */
+ if (rold->id && !check_ids(rold->id, rcur->id, idmap))
+ return false;
+ /* new val must satisfy old val knowledge */
+ return range_within(rold, rcur) &&
+ tnum_in(rold->var_off, rcur->var_off);
+ case PTR_TO_CTX:
+ case CONST_PTR_TO_MAP:
+ case PTR_TO_STACK:
+ case PTR_TO_PACKET_END:
+ /* Only valid matches are exact, which memcmp() above
+ * would have accepted
+ */
+ default:
+ /* Don't know what's going on, just say it's not safe */
+ return false;
+ }
+
+ /* Shouldn't get here; if we do, say it's not safe */
+ WARN_ON_ONCE(1);
return false;
}
@@ -2925,44 +3366,18 @@ static bool states_equal(struct bpf_verifier_env *env,
struct bpf_verifier_state *old,
struct bpf_verifier_state *cur)
{
- bool varlen_map_access = env->varlen_map_value_access;
- struct bpf_reg_state *rold, *rcur;
+ struct idpair *idmap;
+ bool ret = false;
int i;
- for (i = 0; i < MAX_BPF_REG; i++) {
- rold = &old->regs[i];
- rcur = &cur->regs[i];
-
- if (memcmp(rold, rcur, sizeof(*rold)) == 0)
- continue;
-
- /* If the ranges were not the same, but everything else was and
- * we didn't do a variable access into a map then we are a-ok.
- */
- if (!varlen_map_access &&
- memcmp(rold, rcur, offsetofend(struct bpf_reg_state, id)) == 0)
- continue;
-
- /* If we didn't map access then again we don't care about the
- * mismatched range values and it's ok if our old type was
- * UNKNOWN and we didn't go to a NOT_INIT'ed reg.
- */
- if (rold->type == NOT_INIT ||
- (!varlen_map_access && rold->type == UNKNOWN_VALUE &&
- rcur->type != NOT_INIT))
- continue;
-
- /* Don't care about the reg->id in this case. */
- if (rold->type == PTR_TO_MAP_VALUE_OR_NULL &&
- rcur->type == PTR_TO_MAP_VALUE_OR_NULL &&
- rold->map_ptr == rcur->map_ptr)
- continue;
-
- if (rold->type == PTR_TO_PACKET && rcur->type == PTR_TO_PACKET &&
- compare_ptrs_to_packet(env, rold, rcur))
- continue;
-
+ idmap = kcalloc(ID_MAP_SIZE, sizeof(struct idpair), GFP_KERNEL);
+ /* If we failed to allocate the idmap, just say it's not safe */
+ if (!idmap)
return false;
+
+ for (i = 0; i < MAX_BPF_REG; i++) {
+ if (!regsafe(&old->regs[i], &cur->regs[i], idmap))
+ goto out_free;
}
for (i = 0; i < MAX_BPF_STACK; i++) {
@@ -2974,35 +3389,104 @@ static bool states_equal(struct bpf_verifier_env *env,
* this verifier states are not equivalent,
* return false to continue verification of this path
*/
- return false;
+ goto out_free;
if (i % BPF_REG_SIZE)
continue;
if (old->stack_slot_type[i] != STACK_SPILL)
continue;
- if (memcmp(&old->spilled_regs[i / BPF_REG_SIZE],
- &cur->spilled_regs[i / BPF_REG_SIZE],
- sizeof(old->spilled_regs[0])))
- /* when explored and current stack slot types are
- * the same, check that stored pointers types
+ if (!regsafe(&old->spilled_regs[i / BPF_REG_SIZE],
+ &cur->spilled_regs[i / BPF_REG_SIZE],
+ idmap))
+ /* when explored and current stack slot are both storing
+ * spilled registers, check that stored pointers types
* are the same as well.
* Ex: explored safe path could have stored
- * (bpf_reg_state) {.type = PTR_TO_STACK, .imm = -8}
+ * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -8}
* but current path has stored:
- * (bpf_reg_state) {.type = PTR_TO_STACK, .imm = -16}
+ * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -16}
* such verifier states are not equivalent.
* return false to continue verification of this path
*/
- return false;
+ goto out_free;
else
continue;
}
- return true;
+ ret = true;
+out_free:
+ kfree(idmap);
+ return ret;
+}
+
+/* A write screens off any subsequent reads; but write marks come from the
+ * straight-line code between a state and its parent. When we arrive at a
+ * jump target (in the first iteration of the propagate_liveness() loop),
+ * we didn't arrive by the straight-line code, so read marks in state must
+ * propagate to parent regardless of state's write marks.
+ */
+static bool do_propagate_liveness(const struct bpf_verifier_state *state,
+ struct bpf_verifier_state *parent)
+{
+ bool writes = parent == state->parent; /* Observe write marks */
+ bool touched = false; /* any changes made? */
+ int i;
+
+ if (!parent)
+ return touched;
+ /* Propagate read liveness of registers... */
+ BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG);
+ /* We don't need to worry about FP liveness because it's read-only */
+ for (i = 0; i < BPF_REG_FP; i++) {
+ if (parent->regs[i].live & REG_LIVE_READ)
+ continue;
+ if (writes && (state->regs[i].live & REG_LIVE_WRITTEN))
+ continue;
+ if (state->regs[i].live & REG_LIVE_READ) {
+ parent->regs[i].live |= REG_LIVE_READ;
+ touched = true;
+ }
+ }
+ /* ... and stack slots */
+ for (i = 0; i < MAX_BPF_STACK / BPF_REG_SIZE; i++) {
+ if (parent->stack_slot_type[i * BPF_REG_SIZE] != STACK_SPILL)
+ continue;
+ if (state->stack_slot_type[i * BPF_REG_SIZE] != STACK_SPILL)
+ continue;
+ if (parent->spilled_regs[i].live & REG_LIVE_READ)
+ continue;
+ if (writes && (state->spilled_regs[i].live & REG_LIVE_WRITTEN))
+ continue;
+ if (state->spilled_regs[i].live & REG_LIVE_READ) {
+ parent->spilled_regs[i].live |= REG_LIVE_READ;
+ touched = true;
+ }
+ }
+ return touched;
+}
+
+/* "parent" is "a state from which we reach the current state", but initially
+ * it is not the state->parent (i.e. "the state whose straight-line code leads
+ * to the current state"), instead it is the state that happened to arrive at
+ * a (prunable) equivalent of the current state. See comment above
+ * do_propagate_liveness() for consequences of this.
+ * This function is just a more efficient way of calling mark_reg_read() or
+ * mark_stack_slot_read() on each reg in "parent" that is read in "state",
+ * though it requires that parent != state->parent in the call arguments.
+ */
+static void propagate_liveness(const struct bpf_verifier_state *state,
+ struct bpf_verifier_state *parent)
+{
+ while (do_propagate_liveness(state, parent)) {
+ /* Something changed, so we need to feed those changes onward */
+ state = parent;
+ parent = state->parent;
+ }
}
static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
{
struct bpf_verifier_state_list *new_sl;
struct bpf_verifier_state_list *sl;
+ int i;
sl = env->explored_states[insn_idx];
if (!sl)
@@ -3012,11 +3496,20 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
return 0;
while (sl != STATE_LIST_MARK) {
- if (states_equal(env, &sl->state, &env->cur_state))
+ if (states_equal(env, &sl->state, &env->cur_state)) {
/* reached equivalent register/stack state,
- * prune the search
+ * prune the search.
+ * Registers read by the continuation are read by us.
+ * If we have any write marks in env->cur_state, they
+ * will prevent corresponding reads in the continuation
+ * from reaching our parent (an explored_state). Our
+ * own state will get the read marks recorded, but
+ * they'll be immediately forgotten as we're pruning
+ * this state and will pop a new one.
*/
+ propagate_liveness(&sl->state, &env->cur_state);
return 1;
+ }
sl = sl->next;
}
@@ -3034,6 +3527,19 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
memcpy(&new_sl->state, &env->cur_state, sizeof(env->cur_state));
new_sl->next = env->explored_states[insn_idx];
env->explored_states[insn_idx] = new_sl;
+ /* connect new state to parentage chain */
+ env->cur_state.parent = &new_sl->state;
+ /* clear write marks in current state: the writes we did are not writes
+ * our child did, so they don't screen off its reads from us.
+ * (There are no read marks in current state, because reads always mark
+ * their parent and current state never has children yet. Only
+ * explored_states can get read marks.)
+ */
+ for (i = 0; i < BPF_REG_FP; i++)
+ env->cur_state.regs[i].live = REG_LIVE_NONE;
+ for (i = 0; i < MAX_BPF_STACK / BPF_REG_SIZE; i++)
+ if (env->cur_state.stack_slot_type[i * BPF_REG_SIZE] == STACK_SPILL)
+ env->cur_state.spilled_regs[i].live = REG_LIVE_NONE;
return 0;
}
@@ -3057,8 +3563,8 @@ static int do_check(struct bpf_verifier_env *env)
bool do_print_state = false;
init_reg_state(regs);
+ state->parent = NULL;
insn_idx = 0;
- env->varlen_map_value_access = false;
for (;;) {
struct bpf_insn *insn;
u8 class;
@@ -3127,11 +3633,11 @@ static int do_check(struct bpf_verifier_env *env)
/* check for reserved fields is already done */
/* check src operand */
- err = check_reg_arg(regs, insn->src_reg, SRC_OP);
+ err = check_reg_arg(env, insn->src_reg, SRC_OP);
if (err)
return err;
- err = check_reg_arg(regs, insn->dst_reg, DST_OP_NO_MARK);
+ err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK);
if (err)
return err;
@@ -3181,11 +3687,11 @@ static int do_check(struct bpf_verifier_env *env)
}
/* check src1 operand */
- err = check_reg_arg(regs, insn->src_reg, SRC_OP);
+ err = check_reg_arg(env, insn->src_reg, SRC_OP);
if (err)
return err;
/* check src2 operand */
- err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
+ err = check_reg_arg(env, insn->dst_reg, SRC_OP);
if (err)
return err;
@@ -3216,7 +3722,7 @@ static int do_check(struct bpf_verifier_env *env)
return -EINVAL;
}
/* check src operand */
- err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
+ err = check_reg_arg(env, insn->dst_reg, SRC_OP);
if (err)
return err;
@@ -3270,7 +3776,7 @@ static int do_check(struct bpf_verifier_env *env)
* of bpf_exit, which means that program wrote
* something into it earlier
*/
- err = check_reg_arg(regs, BPF_REG_0, SRC_OP);
+ err = check_reg_arg(env, BPF_REG_0, SRC_OP);
if (err)
return err;
@@ -3310,7 +3816,6 @@ process_bpf_exit:
verbose("invalid BPF_LD mode\n");
return -EINVAL;
}
- reset_reg_range_values(regs, insn->dst_reg);
} else {
verbose("unknown insn class %d\n", class);
return -EINVAL;
@@ -3669,7 +4174,11 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
continue;
}
- if (ebpf_jit_enabled() && insn->imm == BPF_FUNC_map_lookup_elem) {
+ /* BPF_EMIT_CALL() assumptions in some of the map_gen_lookup
+ * handlers are currently limited to 64 bit only.
+ */
+ if (ebpf_jit_enabled() && BITS_PER_LONG == 64 &&
+ insn->imm == BPF_FUNC_map_lookup_elem) {
map_ptr = env->insn_aux_data[i + delta].map_ptr;
if (map_ptr == BPF_MAP_PTR_POISON ||
!map_ptr->ops->map_gen_lookup)
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index 793565c..5151ff2 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -33,6 +33,9 @@ struct cgroup_taskset {
struct list_head src_csets;
struct list_head dst_csets;
+ /* the number of tasks in the set */
+ int nr_tasks;
+
/* the subsys currently being processed */
int ssid;
@@ -153,6 +156,8 @@ static inline void get_css_set(struct css_set *cset)
bool cgroup_ssid_enabled(int ssid);
bool cgroup_on_dfl(const struct cgroup *cgrp);
+bool cgroup_is_thread_root(struct cgroup *cgrp);
+bool cgroup_is_threaded(struct cgroup *cgrp);
struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root);
struct cgroup *task_cgroup_from_root(struct task_struct *task,
@@ -170,7 +175,7 @@ struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags,
struct cgroup_root *root, unsigned long magic,
struct cgroup_namespace *ns);
-bool cgroup_may_migrate_to(struct cgroup *dst_cgrp);
+int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp);
void cgroup_migrate_finish(struct cgroup_mgctx *mgctx);
void cgroup_migrate_add_src(struct css_set *src_cset, struct cgroup *dst_cgrp,
struct cgroup_mgctx *mgctx);
@@ -180,10 +185,10 @@ int cgroup_migrate(struct task_struct *leader, bool threadgroup,
int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
bool threadgroup);
-ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
- size_t nbytes, loff_t off, bool threadgroup);
-ssize_t cgroup_procs_write(struct kernfs_open_file *of, char *buf, size_t nbytes,
- loff_t off);
+struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup)
+ __acquires(&cgroup_threadgroup_rwsem);
+void cgroup_procs_write_finish(struct task_struct *task)
+ __releases(&cgroup_threadgroup_rwsem);
void cgroup_lock_and_drain_offline(struct cgroup *cgrp);
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 7bf4b15..024085d 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -99,8 +99,9 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
if (cgroup_on_dfl(to))
return -EINVAL;
- if (!cgroup_may_migrate_to(to))
- return -EBUSY;
+ ret = cgroup_migrate_vet_dst(to);
+ if (ret)
+ return ret;
mutex_lock(&cgroup_mutex);
@@ -121,7 +122,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
* ->can_attach() fails.
*/
do {
- css_task_iter_start(&from->self, &it);
+ css_task_iter_start(&from->self, 0, &it);
task = css_task_iter_next(&it);
if (task)
get_task_struct(task);
@@ -373,7 +374,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
if (!array)
return -ENOMEM;
/* now, populate the array */
- css_task_iter_start(&cgrp->self, &it);
+ css_task_iter_start(&cgrp->self, 0, &it);
while ((tsk = css_task_iter_next(&it))) {
if (unlikely(n == length))
break;
@@ -510,10 +511,58 @@ static int cgroup_pidlist_show(struct seq_file *s, void *v)
return 0;
}
-static ssize_t cgroup_tasks_write(struct kernfs_open_file *of,
- char *buf, size_t nbytes, loff_t off)
+static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off,
+ bool threadgroup)
{
- return __cgroup_procs_write(of, buf, nbytes, off, false);
+ struct cgroup *cgrp;
+ struct task_struct *task;
+ const struct cred *cred, *tcred;
+ ssize_t ret;
+
+ cgrp = cgroup_kn_lock_live(of->kn, false);
+ if (!cgrp)
+ return -ENODEV;
+
+ task = cgroup_procs_write_start(buf, threadgroup);
+ ret = PTR_ERR_OR_ZERO(task);
+ if (ret)
+ goto out_unlock;
+
+ /*
+ * Even if we're attaching all tasks in the thread group, we only
+ * need to check permissions on one of them.
+ */
+ cred = current_cred();
+ tcred = get_task_cred(task);
+ if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
+ !uid_eq(cred->euid, tcred->uid) &&
+ !uid_eq(cred->euid, tcred->suid))
+ ret = -EACCES;
+ put_cred(tcred);
+ if (ret)
+ goto out_finish;
+
+ ret = cgroup_attach_task(cgrp, task, threadgroup);
+
+out_finish:
+ cgroup_procs_write_finish(task);
+out_unlock:
+ cgroup_kn_unlock(of->kn);
+
+ return ret ?: nbytes;
+}
+
+static ssize_t cgroup1_procs_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ return __cgroup1_procs_write(of, buf, nbytes, off, true);
+}
+
+static ssize_t cgroup1_tasks_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ return __cgroup1_procs_write(of, buf, nbytes, off, false);
}
static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of,
@@ -592,7 +641,7 @@ struct cftype cgroup1_base_files[] = {
.seq_stop = cgroup_pidlist_stop,
.seq_show = cgroup_pidlist_show,
.private = CGROUP_FILE_PROCS,
- .write = cgroup_procs_write,
+ .write = cgroup1_procs_write,
},
{
.name = "cgroup.clone_children",
@@ -611,7 +660,7 @@ struct cftype cgroup1_base_files[] = {
.seq_stop = cgroup_pidlist_stop,
.seq_show = cgroup_pidlist_show,
.private = CGROUP_FILE_TASKS,
- .write = cgroup_tasks_write,
+ .write = cgroup1_tasks_write,
},
{
.name = "notify_on_release",
@@ -701,7 +750,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
}
rcu_read_unlock();
- css_task_iter_start(&cgrp->self, &it);
+ css_task_iter_start(&cgrp->self, 0, &it);
while ((tsk = css_task_iter_next(&it))) {
switch (tsk->state) {
case TASK_RUNNING:
@@ -846,6 +895,8 @@ static int cgroup1_show_options(struct seq_file *seq, struct kernfs_root *kf_roo
seq_puts(seq, ",noprefix");
if (root->flags & CGRP_ROOT_XATTR)
seq_puts(seq, ",xattr");
+ if (root->flags & CGRP_ROOT_CPUSET_V2_MODE)
+ seq_puts(seq, ",cpuset_v2_mode");
spin_lock(&release_agent_path_lock);
if (strlen(root->release_agent_path))
@@ -900,6 +951,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
opts->cpuset_clone_children = true;
continue;
}
+ if (!strcmp(token, "cpuset_v2_mode")) {
+ opts->flags |= CGRP_ROOT_CPUSET_V2_MODE;
+ continue;
+ }
if (!strcmp(token, "xattr")) {
opts->flags |= CGRP_ROOT_XATTR;
continue;
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 620794a..d6551cd 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -162,6 +162,9 @@ static u16 cgrp_dfl_inhibit_ss_mask;
/* some controllers are implicitly enabled on the default hierarchy */
static u16 cgrp_dfl_implicit_ss_mask;
+/* some controllers can be threaded on the default hierarchy */
+static u16 cgrp_dfl_threaded_ss_mask;
+
/* The list of hierarchy roots */
LIST_HEAD(cgroup_roots);
static int cgroup_root_count;
@@ -316,13 +319,87 @@ static void cgroup_idr_remove(struct idr *idr, int id)
spin_unlock_bh(&cgroup_idr_lock);
}
-static struct cgroup *cgroup_parent(struct cgroup *cgrp)
+static bool cgroup_has_tasks(struct cgroup *cgrp)
{
- struct cgroup_subsys_state *parent_css = cgrp->self.parent;
+ return cgrp->nr_populated_csets;
+}
- if (parent_css)
- return container_of(parent_css, struct cgroup, self);
- return NULL;
+bool cgroup_is_threaded(struct cgroup *cgrp)
+{
+ return cgrp->dom_cgrp != cgrp;
+}
+
+/* can @cgrp host both domain and threaded children? */
+static bool cgroup_is_mixable(struct cgroup *cgrp)
+{
+ /*
+ * Root isn't under domain level resource control exempting it from
+ * the no-internal-process constraint, so it can serve as a thread
+ * root and a parent of resource domains at the same time.
+ */
+ return !cgroup_parent(cgrp);
+}
+
+/* can @cgrp become a thread root? should always be true for a thread root */
+static bool cgroup_can_be_thread_root(struct cgroup *cgrp)
+{
+ /* mixables don't care */
+ if (cgroup_is_mixable(cgrp))
+ return true;
+
+ /* domain roots can't be nested under threaded */
+ if (cgroup_is_threaded(cgrp))
+ return false;
+
+ /* can only have either domain or threaded children */
+ if (cgrp->nr_populated_domain_children)
+ return false;
+
+ /* and no domain controllers can be enabled */
+ if (cgrp->subtree_control & ~cgrp_dfl_threaded_ss_mask)
+ return false;
+
+ return true;
+}
+
+/* is @cgrp root of a threaded subtree? */
+bool cgroup_is_thread_root(struct cgroup *cgrp)
+{
+ /* thread root should be a domain */
+ if (cgroup_is_threaded(cgrp))
+ return false;
+
+ /* a domain w/ threaded children is a thread root */
+ if (cgrp->nr_threaded_children)
+ return true;
+
+ /*
+ * A domain which has tasks and explicit threaded controllers
+ * enabled is a thread root.
+ */
+ if (cgroup_has_tasks(cgrp) &&
+ (cgrp->subtree_control & cgrp_dfl_threaded_ss_mask))
+ return true;
+
+ return false;
+}
+
+/* a domain which isn't connected to the root w/o brekage can't be used */
+static bool cgroup_is_valid_domain(struct cgroup *cgrp)
+{
+ /* the cgroup itself can be a thread root */
+ if (cgroup_is_threaded(cgrp))
+ return false;
+
+ /* but the ancestors can't be unless mixable */
+ while ((cgrp = cgroup_parent(cgrp))) {
+ if (!cgroup_is_mixable(cgrp) && cgroup_is_thread_root(cgrp))
+ return false;
+ if (cgroup_is_threaded(cgrp))
+ return false;
+ }
+
+ return true;
}
/* subsystems visibly enabled on a cgroup */
@@ -331,8 +408,14 @@ static u16 cgroup_control(struct cgroup *cgrp)
struct cgroup *parent = cgroup_parent(cgrp);
u16 root_ss_mask = cgrp->root->subsys_mask;
- if (parent)
- return parent->subtree_control;
+ if (parent) {
+ u16 ss_mask = parent->subtree_control;
+
+ /* threaded cgroups can only have threaded controllers */
+ if (cgroup_is_threaded(cgrp))
+ ss_mask &= cgrp_dfl_threaded_ss_mask;
+ return ss_mask;
+ }
if (cgroup_on_dfl(cgrp))
root_ss_mask &= ~(cgrp_dfl_inhibit_ss_mask |
@@ -345,8 +428,14 @@ static u16 cgroup_ss_mask(struct cgroup *cgrp)
{
struct cgroup *parent = cgroup_parent(cgrp);
- if (parent)
- return parent->subtree_ss_mask;
+ if (parent) {
+ u16 ss_mask = parent->subtree_ss_mask;
+
+ /* threaded cgroups can only have threaded controllers */
+ if (cgroup_is_threaded(cgrp))
+ ss_mask &= cgrp_dfl_threaded_ss_mask;
+ return ss_mask;
+ }
return cgrp->root->subsys_mask;
}
@@ -436,22 +525,12 @@ out_unlock:
return css;
}
-static void __maybe_unused cgroup_get(struct cgroup *cgrp)
-{
- css_get(&cgrp->self);
-}
-
static void cgroup_get_live(struct cgroup *cgrp)
{
WARN_ON_ONCE(cgroup_is_dead(cgrp));
css_get(&cgrp->self);
}
-static bool cgroup_tryget(struct cgroup *cgrp)
-{
- return css_tryget(&cgrp->self);
-}
-
struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
{
struct cgroup *cgrp = of->kn->parent->priv;
@@ -560,9 +639,11 @@ EXPORT_SYMBOL_GPL(of_css);
*/
struct css_set init_css_set = {
.refcount = REFCOUNT_INIT(1),
+ .dom_cset = &init_css_set,
.tasks = LIST_HEAD_INIT(init_css_set.tasks),
.mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks),
.task_iters = LIST_HEAD_INIT(init_css_set.task_iters),
+ .threaded_csets = LIST_HEAD_INIT(init_css_set.threaded_csets),
.cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links),
.mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node),
.mg_node = LIST_HEAD_INIT(init_css_set.mg_node),
@@ -570,6 +651,11 @@ struct css_set init_css_set = {
static int css_set_count = 1; /* 1 for init_css_set */
+static bool css_set_threaded(struct css_set *cset)
+{
+ return cset->dom_cset != cset;
+}
+
/**
* css_set_populated - does a css_set contain any tasks?
* @cset: target css_set
@@ -587,39 +673,48 @@ static bool css_set_populated(struct css_set *cset)
}
/**
- * cgroup_update_populated - updated populated count of a cgroup
+ * cgroup_update_populated - update the populated count of a cgroup
* @cgrp: the target cgroup
* @populated: inc or dec populated count
*
* One of the css_sets associated with @cgrp is either getting its first
- * task or losing the last. Update @cgrp->populated_cnt accordingly. The
- * count is propagated towards root so that a given cgroup's populated_cnt
- * is zero iff the cgroup and all its descendants don't contain any tasks.
+ * task or losing the last. Update @cgrp->nr_populated_* accordingly. The
+ * count is propagated towards root so that a given cgroup's
+ * nr_populated_children is zero iff none of its descendants contain any
+ * tasks.
*
- * @cgrp's interface file "cgroup.populated" is zero if
- * @cgrp->populated_cnt is zero and 1 otherwise. When @cgrp->populated_cnt
- * changes from or to zero, userland is notified that the content of the
- * interface file has changed. This can be used to detect when @cgrp and
- * its descendants become populated or empty.
+ * @cgrp's interface file "cgroup.populated" is zero if both
+ * @cgrp->nr_populated_csets and @cgrp->nr_populated_children are zero and
+ * 1 otherwise. When the sum changes from or to zero, userland is notified
+ * that the content of the interface file has changed. This can be used to
+ * detect when @cgrp and its descendants become populated or empty.
*/
static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
{
+ struct cgroup *child = NULL;
+ int adj = populated ? 1 : -1;
+
lockdep_assert_held(&css_set_lock);
do {
- bool trigger;
+ bool was_populated = cgroup_is_populated(cgrp);
- if (populated)
- trigger = !cgrp->populated_cnt++;
- else
- trigger = !--cgrp->populated_cnt;
+ if (!child) {
+ cgrp->nr_populated_csets += adj;
+ } else {
+ if (cgroup_is_threaded(child))
+ cgrp->nr_populated_threaded_children += adj;
+ else
+ cgrp->nr_populated_domain_children += adj;
+ }
- if (!trigger)
+ if (was_populated == cgroup_is_populated(cgrp))
break;
cgroup1_check_for_release(cgrp);
cgroup_file_notify(&cgrp->events_file);
+ child = cgrp;
cgrp = cgroup_parent(cgrp);
} while (cgrp);
}
@@ -630,7 +725,7 @@ static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
* @populated: whether @cset is populated or depopulated
*
* @cset is either getting the first task or losing the last. Update the
- * ->populated_cnt of all associated cgroups accordingly.
+ * populated counters of all associated cgroups accordingly.
*/
static void css_set_update_populated(struct css_set *cset, bool populated)
{
@@ -653,7 +748,7 @@ static void css_set_update_populated(struct css_set *cset, bool populated)
* css_set, @from_cset can be NULL. If @task is being disassociated
* instead of moved, @to_cset can be NULL.
*
- * This function automatically handles populated_cnt updates and
+ * This function automatically handles populated counter updates and
* css_task_iter adjustments but the caller is responsible for managing
* @from_cset and @to_cset's reference counts.
*/
@@ -737,6 +832,8 @@ void put_css_set_locked(struct css_set *cset)
if (!refcount_dec_and_test(&cset->refcount))
return;
+ WARN_ON_ONCE(!list_empty(&cset->threaded_csets));
+
/* This css_set is dead. unlink it and release cgroup and css refs */
for_each_subsys(ss, ssid) {
list_del(&cset->e_cset_node[ssid]);
@@ -753,6 +850,11 @@ void put_css_set_locked(struct css_set *cset)
kfree(link);
}
+ if (css_set_threaded(cset)) {
+ list_del(&cset->threaded_csets_node);
+ put_css_set_locked(cset->dom_cset);
+ }
+
kfree_rcu(cset, rcu_head);
}
@@ -771,6 +873,7 @@ static bool compare_css_sets(struct css_set *cset,
struct cgroup *new_cgrp,
struct cgroup_subsys_state *template[])
{
+ struct cgroup *new_dfl_cgrp;
struct list_head *l1, *l2;
/*
@@ -781,6 +884,16 @@ static bool compare_css_sets(struct css_set *cset,
if (memcmp(template, cset->subsys, sizeof(cset->subsys)))
return false;
+
+ /* @cset's domain should match the default cgroup's */
+ if (cgroup_on_dfl(new_cgrp))
+ new_dfl_cgrp = new_cgrp;
+ else
+ new_dfl_cgrp = old_cset->dfl_cgrp;
+
+ if (new_dfl_cgrp->dom_cgrp != cset->dom_cset->dfl_cgrp)
+ return false;
+
/*
* Compare cgroup pointers in order to distinguish between
* different cgroups in hierarchies. As different cgroups may
@@ -988,9 +1101,11 @@ static struct css_set *find_css_set(struct css_set *old_cset,
}
refcount_set(&cset->refcount, 1);
+ cset->dom_cset = cset;
INIT_LIST_HEAD(&cset->tasks);
INIT_LIST_HEAD(&cset->mg_tasks);
INIT_LIST_HEAD(&cset->task_iters);
+ INIT_LIST_HEAD(&cset->threaded_csets);
INIT_HLIST_NODE(&cset->hlist);
INIT_LIST_HEAD(&cset->cgrp_links);
INIT_LIST_HEAD(&cset->mg_preload_node);
@@ -1028,6 +1143,28 @@ static struct css_set *find_css_set(struct css_set *old_cset,
spin_unlock_irq(&css_set_lock);
+ /*
+ * If @cset should be threaded, look up the matching dom_cset and
+ * link them up. We first fully initialize @cset then look for the
+ * dom_cset. It's simpler this way and safe as @cset is guaranteed
+ * to stay empty until we return.
+ */
+ if (cgroup_is_threaded(cset->dfl_cgrp)) {
+ struct css_set *dcset;
+
+ dcset = find_css_set(cset, cset->dfl_cgrp->dom_cgrp);
+ if (!dcset) {
+ put_css_set(cset);
+ return NULL;
+ }
+
+ spin_lock_irq(&css_set_lock);
+ cset->dom_cset = dcset;
+ list_add_tail(&cset->threaded_csets_node,
+ &dcset->threaded_csets);
+ spin_unlock_irq(&css_set_lock);
+ }
+
return cset;
}
@@ -1155,6 +1292,8 @@ static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
if (cset == &init_css_set) {
res = &root->cgrp;
+ } else if (root == &cgrp_dfl_root) {
+ res = cset->dfl_cgrp;
} else {
struct cgrp_cset_link *link;
@@ -1670,6 +1809,9 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
mutex_init(&cgrp->pidlist_mutex);
cgrp->self.cgroup = cgrp;
cgrp->self.flags |= CSS_ONLINE;
+ cgrp->dom_cgrp = cgrp;
+ cgrp->max_descendants = INT_MAX;
+ cgrp->max_depth = INT_MAX;
for_each_subsys(ss, ssid)
INIT_LIST_HEAD(&cgrp->e_csets[ssid]);
@@ -1737,7 +1879,8 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags)
&cgroup_kf_syscall_ops : &cgroup1_kf_syscall_ops;
root->kf_root = kernfs_create_root(kf_sops,
- KERNFS_ROOT_CREATE_DEACTIVATED,
+ KERNFS_ROOT_CREATE_DEACTIVATED |
+ KERNFS_ROOT_SUPPORT_EXPORTOP,
root_cgrp);
if (IS_ERR(root->kf_root)) {
ret = PTR_ERR(root->kf_root);
@@ -2006,6 +2149,8 @@ static void cgroup_migrate_add_task(struct task_struct *task,
if (!cset->mg_src_cgrp)
return;
+ mgctx->tset.nr_tasks++;
+
list_move_tail(&task->cg_list, &cset->mg_tasks);
if (list_empty(&cset->mg_node))
list_add_tail(&cset->mg_node,
@@ -2094,21 +2239,19 @@ static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx)
struct css_set *cset, *tmp_cset;
int ssid, failed_ssid, ret;
- /* methods shouldn't be called if no task is actually migrating */
- if (list_empty(&tset->src_csets))
- return 0;
-
/* check that we can legitimately attach to the cgroup */
- do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
- if (ss->can_attach) {
- tset->ssid = ssid;
- ret = ss->can_attach(tset);
- if (ret) {
- failed_ssid = ssid;
- goto out_cancel_attach;
+ if (tset->nr_tasks) {
+ do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
+ if (ss->can_attach) {
+ tset->ssid = ssid;
+ ret = ss->can_attach(tset);
+ if (ret) {
+ failed_ssid = ssid;
+ goto out_cancel_attach;
+ }
}
- }
- } while_each_subsys_mask();
+ } while_each_subsys_mask();
+ }
/*
* Now that we're guaranteed success, proceed to move all tasks to
@@ -2137,25 +2280,29 @@ static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx)
*/
tset->csets = &tset->dst_csets;
- do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
- if (ss->attach) {
- tset->ssid = ssid;
- ss->attach(tset);
- }
- } while_each_subsys_mask();
+ if (tset->nr_tasks) {
+ do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
+ if (ss->attach) {
+ tset->ssid = ssid;
+ ss->attach(tset);
+ }
+ } while_each_subsys_mask();
+ }
ret = 0;
goto out_release_tset;
out_cancel_attach:
- do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
- if (ssid == failed_ssid)
- break;
- if (ss->cancel_attach) {
- tset->ssid = ssid;
- ss->cancel_attach(tset);
- }
- } while_each_subsys_mask();
+ if (tset->nr_tasks) {
+ do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
+ if (ssid == failed_ssid)
+ break;
+ if (ss->cancel_attach) {
+ tset->ssid = ssid;
+ ss->cancel_attach(tset);
+ }
+ } while_each_subsys_mask();
+ }
out_release_tset:
spin_lock_irq(&css_set_lock);
list_splice_init(&tset->dst_csets, &tset->src_csets);
@@ -2168,17 +2315,40 @@ out_release_tset:
}
/**
- * cgroup_may_migrate_to - verify whether a cgroup can be migration destination
+ * cgroup_migrate_vet_dst - verify whether a cgroup can be migration destination
* @dst_cgrp: destination cgroup to test
*
- * On the default hierarchy, except for the root, subtree_control must be
- * zero for migration destination cgroups with tasks so that child cgroups
- * don't compete against tasks.
+ * On the default hierarchy, except for the mixable, (possible) thread root
+ * and threaded cgroups, subtree_control must be zero for migration
+ * destination cgroups with tasks so that child cgroups don't compete
+ * against tasks.
*/
-bool cgroup_may_migrate_to(struct cgroup *dst_cgrp)
+int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp)
{
- return !cgroup_on_dfl(dst_cgrp) || !cgroup_parent(dst_cgrp) ||
- !dst_cgrp->subtree_control;
+ /* v1 doesn't have any restriction */
+ if (!cgroup_on_dfl(dst_cgrp))
+ return 0;
+
+ /* verify @dst_cgrp can host resources */
+ if (!cgroup_is_valid_domain(dst_cgrp->dom_cgrp))
+ return -EOPNOTSUPP;
+
+ /* mixables don't care */
+ if (cgroup_is_mixable(dst_cgrp))
+ return 0;
+
+ /*
+ * If @dst_cgrp is already or can become a thread root or is
+ * threaded, it doesn't matter.
+ */
+ if (cgroup_can_be_thread_root(dst_cgrp) || cgroup_is_threaded(dst_cgrp))
+ return 0;
+
+ /* apply no-internal-process constraint */
+ if (dst_cgrp->subtree_control)
+ return -EBUSY;
+
+ return 0;
}
/**
@@ -2383,8 +2553,9 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
struct task_struct *task;
int ret;
- if (!cgroup_may_migrate_to(dst_cgrp))
- return -EBUSY;
+ ret = cgroup_migrate_vet_dst(dst_cgrp);
+ if (ret)
+ return ret;
/* look up all src csets */
spin_lock_irq(&css_set_lock);
@@ -2411,96 +2582,23 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
return ret;
}
-static int cgroup_procs_write_permission(struct task_struct *task,
- struct cgroup *dst_cgrp,
- struct kernfs_open_file *of)
-{
- struct super_block *sb = of->file->f_path.dentry->d_sb;
- struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
- struct cgroup *root_cgrp = ns->root_cset->dfl_cgrp;
- struct cgroup *src_cgrp, *com_cgrp;
- struct inode *inode;
- int ret;
-
- if (!cgroup_on_dfl(dst_cgrp)) {
- const struct cred *cred = current_cred();
- const struct cred *tcred = get_task_cred(task);
-
- /*
- * even if we're attaching all tasks in the thread group,
- * we only need to check permissions on one of them.
- */
- if (uid_eq(cred->euid, GLOBAL_ROOT_UID) ||
- uid_eq(cred->euid, tcred->uid) ||
- uid_eq(cred->euid, tcred->suid))
- ret = 0;
- else
- ret = -EACCES;
-
- put_cred(tcred);
- return ret;
- }
-
- /* find the source cgroup */
- spin_lock_irq(&css_set_lock);
- src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
- spin_unlock_irq(&css_set_lock);
-
- /* and the common ancestor */
- com_cgrp = src_cgrp;
- while (!cgroup_is_descendant(dst_cgrp, com_cgrp))
- com_cgrp = cgroup_parent(com_cgrp);
-
- /* %current should be authorized to migrate to the common ancestor */
- inode = kernfs_get_inode(sb, com_cgrp->procs_file.kn);
- if (!inode)
- return -ENOMEM;
-
- ret = inode_permission(inode, MAY_WRITE);
- iput(inode);
- if (ret)
- return ret;
-
- /*
- * If namespaces are delegation boundaries, %current must be able
- * to see both source and destination cgroups from its namespace.
- */
- if ((cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) &&
- (!cgroup_is_descendant(src_cgrp, root_cgrp) ||
- !cgroup_is_descendant(dst_cgrp, root_cgrp)))
- return -ENOENT;
-
- return 0;
-}
-
-/*
- * Find the task_struct of the task to attach by vpid and pass it along to the
- * function to attach either it or all tasks in its threadgroup. Will lock
- * cgroup_mutex and threadgroup.
- */
-ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
- size_t nbytes, loff_t off, bool threadgroup)
+struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup)
+ __acquires(&cgroup_threadgroup_rwsem)
{
struct task_struct *tsk;
- struct cgroup_subsys *ss;
- struct cgroup *cgrp;
pid_t pid;
- int ssid, ret;
if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
- return -EINVAL;
-
- cgrp = cgroup_kn_lock_live(of->kn, false);
- if (!cgrp)
- return -ENODEV;
+ return ERR_PTR(-EINVAL);
percpu_down_write(&cgroup_threadgroup_rwsem);
+
rcu_read_lock();
if (pid) {
tsk = find_task_by_vpid(pid);
if (!tsk) {
- ret = -ESRCH;
- goto out_unlock_rcu;
+ tsk = ERR_PTR(-ESRCH);
+ goto out_unlock_threadgroup;
}
} else {
tsk = current;
@@ -2516,35 +2614,33 @@ ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
* cgroup with no rt_runtime allocated. Just say no.
*/
if (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY)) {
- ret = -EINVAL;
- goto out_unlock_rcu;
+ tsk = ERR_PTR(-EINVAL);
+ goto out_unlock_threadgroup;
}
get_task_struct(tsk);
+ goto out_unlock_rcu;
+
+out_unlock_threadgroup:
+ percpu_up_write(&cgroup_threadgroup_rwsem);
+out_unlock_rcu:
rcu_read_unlock();
+ return tsk;
+}
- ret = cgroup_procs_write_permission(tsk, cgrp, of);
- if (!ret)
- ret = cgroup_attach_task(cgrp, tsk, threadgroup);
+void cgroup_procs_write_finish(struct task_struct *task)
+ __releases(&cgroup_threadgroup_rwsem)
+{
+ struct cgroup_subsys *ss;
+ int ssid;
- put_task_struct(tsk);
- goto out_unlock_threadgroup;
+ /* release reference from cgroup_procs_write_start() */
+ put_task_struct(task);
-out_unlock_rcu:
- rcu_read_unlock();
-out_unlock_threadgroup:
percpu_up_write(&cgroup_threadgroup_rwsem);
for_each_subsys(ss, ssid)
if (ss->post_attach)
ss->post_attach();
- cgroup_kn_unlock(of->kn);
- return ret ?: nbytes;
-}
-
-ssize_t cgroup_procs_write(struct kernfs_open_file *of, char *buf, size_t nbytes,
- loff_t off)
-{
- return __cgroup_procs_write(of, buf, nbytes, off, true);
}
static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask)
@@ -2887,6 +2983,46 @@ static void cgroup_finalize_control(struct cgroup *cgrp, int ret)
cgroup_apply_control_disable(cgrp);
}
+static int cgroup_vet_subtree_control_enable(struct cgroup *cgrp, u16 enable)
+{
+ u16 domain_enable = enable & ~cgrp_dfl_threaded_ss_mask;
+
+ /* if nothing is getting enabled, nothing to worry about */
+ if (!enable)
+ return 0;
+
+ /* can @cgrp host any resources? */
+ if (!cgroup_is_valid_domain(cgrp->dom_cgrp))
+ return -EOPNOTSUPP;
+
+ /* mixables don't care */
+ if (cgroup_is_mixable(cgrp))
+ return 0;
+
+ if (domain_enable) {
+ /* can't enable domain controllers inside a thread subtree */
+ if (cgroup_is_thread_root(cgrp) || cgroup_is_threaded(cgrp))
+ return -EOPNOTSUPP;
+ } else {
+ /*
+ * Threaded controllers can handle internal competitions
+ * and are always allowed inside a (prospective) thread
+ * subtree.
+ */
+ if (cgroup_can_be_thread_root(cgrp) || cgroup_is_threaded(cgrp))
+ return 0;
+ }
+
+ /*
+ * Controllers can't be enabled for a cgroup with tasks to avoid
+ * child cgroups competing against tasks.
+ */
+ if (cgroup_has_tasks(cgrp))
+ return -EBUSY;
+
+ return 0;
+}
+
/* change the enabled child controllers for a cgroup in the default hierarchy */
static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
char *buf, size_t nbytes,
@@ -2962,33 +3098,9 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
goto out_unlock;
}
- /*
- * Except for the root, subtree_control must be zero for a cgroup
- * with tasks so that child cgroups don't compete against tasks.
- */
- if (enable && cgroup_parent(cgrp)) {
- struct cgrp_cset_link *link;
-
- /*
- * Because namespaces pin csets too, @cgrp->cset_links
- * might not be empty even when @cgrp is empty. Walk and
- * verify each cset.
- */
- spin_lock_irq(&css_set_lock);
-
- ret = 0;
- list_for_each_entry(link, &cgrp->cset_links, cset_link) {
- if (css_set_populated(link->cset)) {
- ret = -EBUSY;
- break;
- }
- }
-
- spin_unlock_irq(&css_set_lock);
-
- if (ret)
- goto out_unlock;
- }
+ ret = cgroup_vet_subtree_control_enable(cgrp, enable);
+ if (ret)
+ goto out_unlock;
/* save and update control masks and prepare csses */
cgroup_save_control(cgrp);
@@ -2997,16 +3109,182 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
cgrp->subtree_control &= ~disable;
ret = cgroup_apply_control(cgrp);
-
cgroup_finalize_control(cgrp, ret);
+ if (ret)
+ goto out_unlock;
kernfs_activate(cgrp->kn);
- ret = 0;
out_unlock:
cgroup_kn_unlock(of->kn);
return ret ?: nbytes;
}
+/**
+ * cgroup_enable_threaded - make @cgrp threaded
+ * @cgrp: the target cgroup
+ *
+ * Called when "threaded" is written to the cgroup.type interface file and
+ * tries to make @cgrp threaded and join the parent's resource domain.
+ * This function is never called on the root cgroup as cgroup.type doesn't
+ * exist on it.
+ */
+static int cgroup_enable_threaded(struct cgroup *cgrp)
+{
+ struct cgroup *parent = cgroup_parent(cgrp);
+ struct cgroup *dom_cgrp = parent->dom_cgrp;
+ int ret;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ /* noop if already threaded */
+ if (cgroup_is_threaded(cgrp))
+ return 0;
+
+ /* we're joining the parent's domain, ensure its validity */
+ if (!cgroup_is_valid_domain(dom_cgrp) ||
+ !cgroup_can_be_thread_root(dom_cgrp))
+ return -EOPNOTSUPP;
+
+ /*
+ * The following shouldn't cause actual migrations and should
+ * always succeed.
+ */
+ cgroup_save_control(cgrp);
+
+ cgrp->dom_cgrp = dom_cgrp;
+ ret = cgroup_apply_control(cgrp);
+ if (!ret)
+ parent->nr_threaded_children++;
+ else
+ cgrp->dom_cgrp = cgrp;
+
+ cgroup_finalize_control(cgrp, ret);
+ return ret;
+}
+
+static int cgroup_type_show(struct seq_file *seq, void *v)
+{
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+
+ if (cgroup_is_threaded(cgrp))
+ seq_puts(seq, "threaded\n");
+ else if (!cgroup_is_valid_domain(cgrp))
+ seq_puts(seq, "domain invalid\n");
+ else if (cgroup_is_thread_root(cgrp))
+ seq_puts(seq, "domain threaded\n");
+ else
+ seq_puts(seq, "domain\n");
+
+ return 0;
+}
+
+static ssize_t cgroup_type_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off)
+{
+ struct cgroup *cgrp;
+ int ret;
+
+ /* only switching to threaded mode is supported */
+ if (strcmp(strstrip(buf), "threaded"))
+ return -EINVAL;
+
+ cgrp = cgroup_kn_lock_live(of->kn, false);
+ if (!cgrp)
+ return -ENOENT;
+
+ /* threaded can only be enabled */
+ ret = cgroup_enable_threaded(cgrp);
+
+ cgroup_kn_unlock(of->kn);
+ return ret ?: nbytes;
+}
+
+static int cgroup_max_descendants_show(struct seq_file *seq, void *v)
+{
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+ int descendants = READ_ONCE(cgrp->max_descendants);
+
+ if (descendants == INT_MAX)
+ seq_puts(seq, "max\n");
+ else
+ seq_printf(seq, "%d\n", descendants);
+
+ return 0;
+}
+
+static ssize_t cgroup_max_descendants_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct cgroup *cgrp;
+ int descendants;
+ ssize_t ret;
+
+ buf = strstrip(buf);
+ if (!strcmp(buf, "max")) {
+ descendants = INT_MAX;
+ } else {
+ ret = kstrtoint(buf, 0, &descendants);
+ if (ret)
+ return ret;
+ }
+
+ if (descendants < 0)
+ return -ERANGE;
+
+ cgrp = cgroup_kn_lock_live(of->kn, false);
+ if (!cgrp)
+ return -ENOENT;
+
+ cgrp->max_descendants = descendants;
+
+ cgroup_kn_unlock(of->kn);
+
+ return nbytes;
+}
+
+static int cgroup_max_depth_show(struct seq_file *seq, void *v)
+{
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+ int depth = READ_ONCE(cgrp->max_depth);
+
+ if (depth == INT_MAX)
+ seq_puts(seq, "max\n");
+ else
+ seq_printf(seq, "%d\n", depth);
+
+ return 0;
+}
+
+static ssize_t cgroup_max_depth_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct cgroup *cgrp;
+ ssize_t ret;
+ int depth;
+
+ buf = strstrip(buf);
+ if (!strcmp(buf, "max")) {
+ depth = INT_MAX;
+ } else {
+ ret = kstrtoint(buf, 0, &depth);
+ if (ret)
+ return ret;
+ }
+
+ if (depth < 0)
+ return -ERANGE;
+
+ cgrp = cgroup_kn_lock_live(of->kn, false);
+ if (!cgrp)
+ return -ENOENT;
+
+ cgrp->max_depth = depth;
+
+ cgroup_kn_unlock(of->kn);
+
+ return nbytes;
+}
+
static int cgroup_events_show(struct seq_file *seq, void *v)
{
seq_printf(seq, "populated %d\n",
@@ -3014,6 +3292,18 @@ static int cgroup_events_show(struct seq_file *seq, void *v)
return 0;
}
+static int cgroup_stat_show(struct seq_file *seq, void *v)
+{
+ struct cgroup *cgroup = seq_css(seq)->cgroup;
+
+ seq_printf(seq, "nr_descendants %d\n",
+ cgroup->nr_descendants);
+ seq_printf(seq, "nr_dying_descendants %d\n",
+ cgroup->nr_dying_descendants);
+
+ return 0;
+}
+
static int cgroup_file_open(struct kernfs_open_file *of)
{
struct cftype *cft = of->kn->priv;
@@ -3230,7 +3520,6 @@ restart:
static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
{
- LIST_HEAD(pending);
struct cgroup_subsys *ss = cfts[0].ss;
struct cgroup *root = &ss->root->cgrp;
struct cgroup_subsys_state *css;
@@ -3655,6 +3944,58 @@ bool css_has_online_children(struct cgroup_subsys_state *css)
return ret;
}
+static struct css_set *css_task_iter_next_css_set(struct css_task_iter *it)
+{
+ struct list_head *l;
+ struct cgrp_cset_link *link;
+ struct css_set *cset;
+
+ lockdep_assert_held(&css_set_lock);
+
+ /* find the next threaded cset */
+ if (it->tcset_pos) {
+ l = it->tcset_pos->next;
+
+ if (l != it->tcset_head) {
+ it->tcset_pos = l;
+ return container_of(l, struct css_set,
+ threaded_csets_node);
+ }
+
+ it->tcset_pos = NULL;
+ }
+
+ /* find the next cset */
+ l = it->cset_pos;
+ l = l->next;
+ if (l == it->cset_head) {
+ it->cset_pos = NULL;
+ return NULL;
+ }
+
+ if (it->ss) {
+ cset = container_of(l, struct css_set, e_cset_node[it->ss->id]);
+ } else {
+ link = list_entry(l, struct cgrp_cset_link, cset_link);
+ cset = link->cset;
+ }
+
+ it->cset_pos = l;
+
+ /* initialize threaded css_set walking */
+ if (it->flags & CSS_TASK_ITER_THREADED) {
+ if (it->cur_dcset)
+ put_css_set_locked(it->cur_dcset);
+ it->cur_dcset = cset;
+ get_css_set(cset);
+
+ it->tcset_head = &cset->threaded_csets;
+ it->tcset_pos = &cset->threaded_csets;
+ }
+
+ return cset;
+}
+
/**
* css_task_iter_advance_css_set - advance a task itererator to the next css_set
* @it: the iterator to advance
@@ -3663,32 +4004,19 @@ bool css_has_online_children(struct cgroup_subsys_state *css)
*/
static void css_task_iter_advance_css_set(struct css_task_iter *it)
{
- struct list_head *l = it->cset_pos;
- struct cgrp_cset_link *link;
struct css_set *cset;
lockdep_assert_held(&css_set_lock);
/* Advance to the next non-empty css_set */
do {
- l = l->next;
- if (l == it->cset_head) {
- it->cset_pos = NULL;
+ cset = css_task_iter_next_css_set(it);
+ if (!cset) {
it->task_pos = NULL;
return;
}
-
- if (it->ss) {
- cset = container_of(l, struct css_set,
- e_cset_node[it->ss->id]);
- } else {
- link = list_entry(l, struct cgrp_cset_link, cset_link);
- cset = link->cset;
- }
} while (!css_set_populated(cset));
- it->cset_pos = l;
-
if (!list_empty(&cset->tasks))
it->task_pos = cset->tasks.next;
else
@@ -3728,6 +4056,7 @@ static void css_task_iter_advance(struct css_task_iter *it)
lockdep_assert_held(&css_set_lock);
WARN_ON_ONCE(!l);
+repeat:
/*
* Advance iterator to find next entry. cset->tasks is consumed
* first and then ->mg_tasks. After ->mg_tasks, we move onto the
@@ -3742,11 +4071,18 @@ static void css_task_iter_advance(struct css_task_iter *it)
css_task_iter_advance_css_set(it);
else
it->task_pos = l;
+
+ /* if PROCS, skip over tasks which aren't group leaders */
+ if ((it->flags & CSS_TASK_ITER_PROCS) && it->task_pos &&
+ !thread_group_leader(list_entry(it->task_pos, struct task_struct,
+ cg_list)))
+ goto repeat;
}
/**
* css_task_iter_start - initiate task iteration
* @css: the css to walk tasks of
+ * @flags: CSS_TASK_ITER_* flags
* @it: the task iterator to use
*
* Initiate iteration through the tasks of @css. The caller can call
@@ -3754,7 +4090,7 @@ static void css_task_iter_advance(struct css_task_iter *it)
* returns NULL. On completion of iteration, css_task_iter_end() must be
* called.
*/
-void css_task_iter_start(struct cgroup_subsys_state *css,
+void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags,
struct css_task_iter *it)
{
/* no one should try to iterate before mounting cgroups */
@@ -3765,6 +4101,7 @@ void css_task_iter_start(struct cgroup_subsys_state *css,
spin_lock_irq(&css_set_lock);
it->ss = css->ss;
+ it->flags = flags;
if (it->ss)
it->cset_pos = &css->cgroup->e_csets[css->ss->id];
@@ -3822,6 +4159,9 @@ void css_task_iter_end(struct css_task_iter *it)
spin_unlock_irq(&css_set_lock);
}
+ if (it->cur_dcset)
+ put_css_set(it->cur_dcset);
+
if (it->cur_task)
put_task_struct(it->cur_task);
}
@@ -3838,16 +4178,12 @@ static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos)
{
struct kernfs_open_file *of = s->private;
struct css_task_iter *it = of->priv;
- struct task_struct *task;
- do {
- task = css_task_iter_next(it);
- } while (task && !thread_group_leader(task));
-
- return task;
+ return css_task_iter_next(it);
}
-static void *cgroup_procs_start(struct seq_file *s, loff_t *pos)
+static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos,
+ unsigned int iter_flags)
{
struct kernfs_open_file *of = s->private;
struct cgroup *cgrp = seq_css(s)->cgroup;
@@ -3865,24 +4201,169 @@ static void *cgroup_procs_start(struct seq_file *s, loff_t *pos)
if (!it)
return ERR_PTR(-ENOMEM);
of->priv = it;
- css_task_iter_start(&cgrp->self, it);
+ css_task_iter_start(&cgrp->self, iter_flags, it);
} else if (!(*pos)++) {
css_task_iter_end(it);
- css_task_iter_start(&cgrp->self, it);
+ css_task_iter_start(&cgrp->self, iter_flags, it);
}
return cgroup_procs_next(s, NULL, NULL);
}
+static void *cgroup_procs_start(struct seq_file *s, loff_t *pos)
+{
+ struct cgroup *cgrp = seq_css(s)->cgroup;
+
+ /*
+ * All processes of a threaded subtree belong to the domain cgroup
+ * of the subtree. Only threads can be distributed across the
+ * subtree. Reject reads on cgroup.procs in the subtree proper.
+ * They're always empty anyway.
+ */
+ if (cgroup_is_threaded(cgrp))
+ return ERR_PTR(-EOPNOTSUPP);
+
+ return __cgroup_procs_start(s, pos, CSS_TASK_ITER_PROCS |
+ CSS_TASK_ITER_THREADED);
+}
+
static int cgroup_procs_show(struct seq_file *s, void *v)
{
- seq_printf(s, "%d\n", task_tgid_vnr(v));
+ seq_printf(s, "%d\n", task_pid_vnr(v));
+ return 0;
+}
+
+static int cgroup_procs_write_permission(struct cgroup *src_cgrp,
+ struct cgroup *dst_cgrp,
+ struct super_block *sb)
+{
+ struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
+ struct cgroup *com_cgrp = src_cgrp;
+ struct inode *inode;
+ int ret;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ /* find the common ancestor */
+ while (!cgroup_is_descendant(dst_cgrp, com_cgrp))
+ com_cgrp = cgroup_parent(com_cgrp);
+
+ /* %current should be authorized to migrate to the common ancestor */
+ inode = kernfs_get_inode(sb, com_cgrp->procs_file.kn);
+ if (!inode)
+ return -ENOMEM;
+
+ ret = inode_permission(inode, MAY_WRITE);
+ iput(inode);
+ if (ret)
+ return ret;
+
+ /*
+ * If namespaces are delegation boundaries, %current must be able
+ * to see both source and destination cgroups from its namespace.
+ */
+ if ((cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) &&
+ (!cgroup_is_descendant(src_cgrp, ns->root_cset->dfl_cgrp) ||
+ !cgroup_is_descendant(dst_cgrp, ns->root_cset->dfl_cgrp)))
+ return -ENOENT;
+
return 0;
}
+static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct cgroup *src_cgrp, *dst_cgrp;
+ struct task_struct *task;
+ ssize_t ret;
+
+ dst_cgrp = cgroup_kn_lock_live(of->kn, false);
+ if (!dst_cgrp)
+ return -ENODEV;
+
+ task = cgroup_procs_write_start(buf, true);
+ ret = PTR_ERR_OR_ZERO(task);
+ if (ret)
+ goto out_unlock;
+
+ /* find the source cgroup */
+ spin_lock_irq(&css_set_lock);
+ src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
+ spin_unlock_irq(&css_set_lock);
+
+ ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp,
+ of->file->f_path.dentry->d_sb);
+ if (ret)
+ goto out_finish;
+
+ ret = cgroup_attach_task(dst_cgrp, task, true);
+
+out_finish:
+ cgroup_procs_write_finish(task);
+out_unlock:
+ cgroup_kn_unlock(of->kn);
+
+ return ret ?: nbytes;
+}
+
+static void *cgroup_threads_start(struct seq_file *s, loff_t *pos)
+{
+ return __cgroup_procs_start(s, pos, 0);
+}
+
+static ssize_t cgroup_threads_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct cgroup *src_cgrp, *dst_cgrp;
+ struct task_struct *task;
+ ssize_t ret;
+
+ buf = strstrip(buf);
+
+ dst_cgrp = cgroup_kn_lock_live(of->kn, false);
+ if (!dst_cgrp)
+ return -ENODEV;
+
+ task = cgroup_procs_write_start(buf, false);
+ ret = PTR_ERR_OR_ZERO(task);
+ if (ret)
+ goto out_unlock;
+
+ /* find the source cgroup */
+ spin_lock_irq(&css_set_lock);
+ src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
+ spin_unlock_irq(&css_set_lock);
+
+ /* thread migrations follow the cgroup.procs delegation rule */
+ ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp,
+ of->file->f_path.dentry->d_sb);
+ if (ret)
+ goto out_finish;
+
+ /* and must be contained in the same domain */
+ ret = -EOPNOTSUPP;
+ if (src_cgrp->dom_cgrp != dst_cgrp->dom_cgrp)
+ goto out_finish;
+
+ ret = cgroup_attach_task(dst_cgrp, task, false);
+
+out_finish:
+ cgroup_procs_write_finish(task);
+out_unlock:
+ cgroup_kn_unlock(of->kn);
+
+ return ret ?: nbytes;
+}
+
/* cgroup core interface files for the default hierarchy */
static struct cftype cgroup_base_files[] = {
{
+ .name = "cgroup.type",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = cgroup_type_show,
+ .write = cgroup_type_write,
+ },
+ {
.name = "cgroup.procs",
.flags = CFTYPE_NS_DELEGATABLE,
.file_offset = offsetof(struct cgroup, procs_file),
@@ -3893,6 +4374,14 @@ static struct cftype cgroup_base_files[] = {
.write = cgroup_procs_write,
},
{
+ .name = "cgroup.threads",
+ .release = cgroup_procs_release,
+ .seq_start = cgroup_threads_start,
+ .seq_next = cgroup_procs_next,
+ .seq_show = cgroup_procs_show,
+ .write = cgroup_threads_write,
+ },
+ {
.name = "cgroup.controllers",
.seq_show = cgroup_controllers_show,
},
@@ -3908,6 +4397,20 @@ static struct cftype cgroup_base_files[] = {
.file_offset = offsetof(struct cgroup, events_file),
.seq_show = cgroup_events_show,
},
+ {
+ .name = "cgroup.max.descendants",
+ .seq_show = cgroup_max_descendants_show,
+ .write = cgroup_max_descendants_write,
+ },
+ {
+ .name = "cgroup.max.depth",
+ .seq_show = cgroup_max_depth_show,
+ .write = cgroup_max_depth_write,
+ },
+ {
+ .name = "cgroup.stat",
+ .seq_show = cgroup_stat_show,
+ },
{ } /* terminate */
};
@@ -4007,9 +4510,15 @@ static void css_release_work_fn(struct work_struct *work)
if (ss->css_released)
ss->css_released(css);
} else {
+ struct cgroup *tcgrp;
+
/* cgroup release path */
trace_cgroup_release(cgrp);
+ for (tcgrp = cgroup_parent(cgrp); tcgrp;
+ tcgrp = cgroup_parent(tcgrp))
+ tcgrp->nr_dying_descendants--;
+
cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
cgrp->id = -1;
@@ -4096,9 +4605,6 @@ static void offline_css(struct cgroup_subsys_state *css)
if (!(css->flags & CSS_ONLINE))
return;
- if (ss->css_reset)
- ss->css_reset(css);
-
if (ss->css_offline)
ss->css_offline(css);
@@ -4208,9 +4714,13 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
cgrp->root = root;
cgrp->level = level;
- for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp))
+ for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
cgrp->ancestor_ids[tcgrp->level] = tcgrp->id;
+ if (tcgrp != cgrp)
+ tcgrp->nr_descendants++;
+ }
+
if (notify_on_release(parent))
set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
@@ -4251,6 +4761,29 @@ out_free_cgrp:
return ERR_PTR(ret);
}
+static bool cgroup_check_hierarchy_limits(struct cgroup *parent)
+{
+ struct cgroup *cgroup;
+ int ret = false;
+ int level = 1;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ for (cgroup = parent; cgroup; cgroup = cgroup_parent(cgroup)) {
+ if (cgroup->nr_descendants >= cgroup->max_descendants)
+ goto fail;
+
+ if (level > cgroup->max_depth)
+ goto fail;
+
+ level++;
+ }
+
+ ret = true;
+fail:
+ return ret;
+}
+
int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
{
struct cgroup *parent, *cgrp;
@@ -4265,6 +4798,11 @@ int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
if (!parent)
return -ENODEV;
+ if (!cgroup_check_hierarchy_limits(parent)) {
+ ret = -EAGAIN;
+ goto out_unlock;
+ }
+
cgrp = cgroup_create(parent);
if (IS_ERR(cgrp)) {
ret = PTR_ERR(cgrp);
@@ -4416,6 +4954,7 @@ static void kill_css(struct cgroup_subsys_state *css)
static int cgroup_destroy_locked(struct cgroup *cgrp)
__releases(&cgroup_mutex) __acquires(&cgroup_mutex)
{
+ struct cgroup *tcgrp, *parent = cgroup_parent(cgrp);
struct cgroup_subsys_state *css;
struct cgrp_cset_link *link;
int ssid;
@@ -4460,7 +4999,15 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
*/
kernfs_remove(cgrp->kn);
- cgroup1_check_for_release(cgroup_parent(cgrp));
+ if (parent && cgroup_is_threaded(cgrp))
+ parent->nr_threaded_children--;
+
+ for (tcgrp = cgroup_parent(cgrp); tcgrp; tcgrp = cgroup_parent(tcgrp)) {
+ tcgrp->nr_descendants--;
+ tcgrp->nr_dying_descendants++;
+ }
+
+ cgroup1_check_for_release(parent);
/* put the base reference */
percpu_ref_kill(&cgrp->self.refcnt);
@@ -4655,11 +5202,17 @@ int __init cgroup_init(void)
cgrp_dfl_root.subsys_mask |= 1 << ss->id;
+ /* implicit controllers must be threaded too */
+ WARN_ON(ss->implicit_on_dfl && !ss->threaded);
+
if (ss->implicit_on_dfl)
cgrp_dfl_implicit_ss_mask |= 1 << ss->id;
else if (!ss->dfl_cftypes)
cgrp_dfl_inhibit_ss_mask |= 1 << ss->id;
+ if (ss->threaded)
+ cgrp_dfl_threaded_ss_mask |= 1 << ss->id;
+
if (ss->dfl_cftypes == ss->legacy_cftypes) {
WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes));
} else {
@@ -4669,6 +5222,10 @@ int __init cgroup_init(void)
if (ss->bind)
ss->bind(init_css_set.subsys[ssid]);
+
+ mutex_lock(&cgroup_mutex);
+ css_populate_dir(init_css_set.subsys[ssid]);
+ mutex_unlock(&cgroup_mutex);
}
/* init_css_set.subsys[] has been updated, re-hash */
@@ -4700,6 +5257,18 @@ static int __init cgroup_wq_init(void)
}
core_initcall(cgroup_wq_init);
+void cgroup_path_from_kernfs_id(const union kernfs_node_id *id,
+ char *buf, size_t buflen)
+{
+ struct kernfs_node *kn;
+
+ kn = kernfs_get_node_by_id(cgrp_dfl_root.kf_root, id);
+ if (!kn)
+ return;
+ kernfs_path(kn, buf, buflen);
+ kernfs_put(kn);
+}
+
/*
* proc_cgroup_show()
* - Print task's cgroup paths into seq_file, one line for each hierarchy
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index ca8376e..67230ec 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -56,6 +56,7 @@
#include <linux/time64.h>
#include <linux/backing-dev.h>
#include <linux/sort.h>
+#include <linux/oom.h>
#include <linux/uaccess.h>
#include <linux/atomic.h>
@@ -63,6 +64,7 @@
#include <linux/cgroup.h>
#include <linux/wait.h>
+DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key);
DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);
/* See "Frequency meter" comments, below. */
@@ -299,6 +301,16 @@ static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);
static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);
/*
+ * Cgroup v2 behavior is used when on default hierarchy or the
+ * cgroup_v2_mode flag is set.
+ */
+static inline bool is_in_v2_mode(void)
+{
+ return cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
+ (cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE);
+}
+
+/*
* This is ugly, but preserves the userspace API for existing cpuset
* users. If someone tries to mount the "cpuset" filesystem, we
* silently switch it to mount "cgroup" instead
@@ -488,8 +500,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
/* On legacy hiearchy, we must be a subset of our parent cpuset. */
ret = -EACCES;
- if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
- !is_cpuset_subset(trial, par))
+ if (!is_in_v2_mode() && !is_cpuset_subset(trial, par))
goto out;
/*
@@ -576,6 +587,13 @@ static void update_domain_attr_tree(struct sched_domain_attr *dattr,
rcu_read_unlock();
}
+/* Must be called with cpuset_mutex held. */
+static inline int nr_cpusets(void)
+{
+ /* jump label reference count + the top-level cpuset */
+ return static_key_count(&cpusets_enabled_key.key) + 1;
+}
+
/*
* generate_sched_domains()
*
@@ -861,7 +879,7 @@ static void update_tasks_cpumask(struct cpuset *cs)
struct css_task_iter it;
struct task_struct *task;
- css_task_iter_start(&cs->css, &it);
+ css_task_iter_start(&cs->css, 0, &it);
while ((task = css_task_iter_next(&it)))
set_cpus_allowed_ptr(task, cs->effective_cpus);
css_task_iter_end(&it);
@@ -895,8 +913,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
* If it becomes empty, inherit the effective mask of the
* parent, which is guaranteed to have some CPUs.
*/
- if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
- cpumask_empty(new_cpus))
+ if (is_in_v2_mode() && cpumask_empty(new_cpus))
cpumask_copy(new_cpus, parent->effective_cpus);
/* Skip the whole subtree if the cpumask remains the same. */
@@ -913,7 +930,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
cpumask_copy(cp->effective_cpus, new_cpus);
spin_unlock_irq(&callback_lock);
- WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
+ WARN_ON(!is_in_v2_mode() &&
!cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
update_tasks_cpumask(cp);
@@ -1091,7 +1108,7 @@ static void update_tasks_nodemask(struct cpuset *cs)
* It's ok if we rebind the same mm twice; mpol_rebind_mm()
* is idempotent. Also migrate pages in each mm to new nodes.
*/
- css_task_iter_start(&cs->css, &it);
+ css_task_iter_start(&cs->css, 0, &it);
while ((task = css_task_iter_next(&it))) {
struct mm_struct *mm;
bool migrate;
@@ -1149,8 +1166,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
* If it becomes empty, inherit the effective mask of the
* parent, which is guaranteed to have some MEMs.
*/
- if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
- nodes_empty(*new_mems))
+ if (is_in_v2_mode() && nodes_empty(*new_mems))
*new_mems = parent->effective_mems;
/* Skip the whole subtree if the nodemask remains the same. */
@@ -1167,7 +1183,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
cp->effective_mems = *new_mems;
spin_unlock_irq(&callback_lock);
- WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
+ WARN_ON(!is_in_v2_mode() &&
!nodes_equal(cp->mems_allowed, cp->effective_mems));
update_tasks_nodemask(cp);
@@ -1284,7 +1300,7 @@ static void update_tasks_flags(struct cpuset *cs)
struct css_task_iter it;
struct task_struct *task;
- css_task_iter_start(&cs->css, &it);
+ css_task_iter_start(&cs->css, 0, &it);
while ((task = css_task_iter_next(&it)))
cpuset_update_task_spread_flag(cs, task);
css_task_iter_end(&it);
@@ -1459,7 +1475,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
/* allow moving tasks into an empty cpuset if on default hierarchy */
ret = -ENOSPC;
- if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
+ if (!is_in_v2_mode() &&
(cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
goto out_unlock;
@@ -1891,6 +1907,7 @@ static struct cftype files[] = {
{
.name = "memory_pressure",
.read_u64 = cpuset_read_u64,
+ .private = FILE_MEMORY_PRESSURE,
},
{
@@ -1977,7 +1994,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
cpuset_inc();
spin_lock_irq(&callback_lock);
- if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) {
+ if (is_in_v2_mode()) {
cpumask_copy(cs->effective_cpus, parent->effective_cpus);
cs->effective_mems = parent->effective_mems;
}
@@ -2054,7 +2071,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css)
mutex_lock(&cpuset_mutex);
spin_lock_irq(&callback_lock);
- if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) {
+ if (is_in_v2_mode()) {
cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
top_cpuset.mems_allowed = node_possible_map;
} else {
@@ -2248,7 +2265,7 @@ retry:
cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
mems_updated = !nodes_equal(new_mems, cs->effective_mems);
- if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
+ if (is_in_v2_mode())
hotplug_update_tasks(cs, &new_cpus, &new_mems,
cpus_updated, mems_updated);
else
@@ -2279,7 +2296,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
static cpumask_t new_cpus;
static nodemask_t new_mems;
bool cpus_updated, mems_updated;
- bool on_dfl = cgroup_subsys_on_dfl(cpuset_cgrp_subsys);
+ bool on_dfl = is_in_v2_mode();
mutex_lock(&cpuset_mutex);
@@ -2342,13 +2359,7 @@ void cpuset_update_active_cpus(void)
* We're inside cpu hotplug critical region which usually nests
* inside cgroup synchronization. Bounce actual hotplug processing
* to a work item to avoid reverse locking order.
- *
- * We still need to do partition_sched_domains() synchronously;
- * otherwise, the scheduler will get confused and put tasks to the
- * dead CPU. Fall back to the default single domain.
- * cpuset_hotplug_workfn() will rebuild it as necessary.
*/
- partition_sched_domains(1, NULL, NULL);
schedule_work(&cpuset_hotplug_work);
}
@@ -2497,12 +2508,12 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
* If we're in interrupt, yes, we can always allocate. If @node is set in
* current's mems_allowed, yes. If it's not a __GFP_HARDWALL request and this
* node is set in the nearest hardwalled cpuset ancestor to current's cpuset,
- * yes. If current has access to memory reserves due to TIF_MEMDIE, yes.
+ * yes. If current has access to memory reserves as an oom victim, yes.
* Otherwise, no.
*
* GFP_USER allocations are marked with the __GFP_HARDWALL bit,
* and do not allow allocations outside the current tasks cpuset
- * unless the task has been OOM killed as is marked TIF_MEMDIE.
+ * unless the task has been OOM killed.
* GFP_KERNEL allocations are not so marked, so can escape to the
* nearest enclosing hardwalled ancestor cpuset.
*
@@ -2525,7 +2536,7 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
* affect that:
* in_interrupt - any node ok (current task context irrelevant)
* GFP_ATOMIC - any node ok
- * TIF_MEMDIE - any node ok
+ * tsk_is_oom_victim - any node ok
* GFP_KERNEL - any node in enclosing hardwalled cpuset ok
* GFP_USER - only nodes in current tasks mems allowed ok.
*/
@@ -2543,7 +2554,7 @@ bool __cpuset_node_allowed(int node, gfp_t gfp_mask)
* Allow tasks that have access to memory reserves because they have
* been OOM killed to get memory anywhere.
*/
- if (unlikely(test_thread_flag(TIF_MEMDIE)))
+ if (unlikely(tsk_is_oom_victim(current)))
return true;
if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */
return false;
diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c
index dac46af..f661b4c 100644
--- a/kernel/cgroup/debug.c
+++ b/kernel/cgroup/debug.c
@@ -114,27 +114,49 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v)
{
struct cgroup_subsys_state *css = seq_css(seq);
struct cgrp_cset_link *link;
- int dead_cnt = 0, extra_refs = 0;
+ int dead_cnt = 0, extra_refs = 0, threaded_csets = 0;
spin_lock_irq(&css_set_lock);
+
list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
struct css_set *cset = link->cset;
struct task_struct *task;
int count = 0;
int refcnt = refcount_read(&cset->refcount);
- seq_printf(seq, " %d", refcnt);
- if (refcnt - cset->nr_tasks > 0) {
- int extra = refcnt - cset->nr_tasks;
-
- seq_printf(seq, " +%d", extra);
- /*
- * Take out the one additional reference in
- * init_css_set.
- */
- if (cset == &init_css_set)
- extra--;
- extra_refs += extra;
+ /*
+ * Print out the proc_cset and threaded_cset relationship
+ * and highlight difference between refcount and task_count.
+ */
+ seq_printf(seq, "css_set %pK", cset);
+ if (rcu_dereference_protected(cset->dom_cset, 1) != cset) {
+ threaded_csets++;
+ seq_printf(seq, "=>%pK", cset->dom_cset);
+ }
+ if (!list_empty(&cset->threaded_csets)) {
+ struct css_set *tcset;
+ int idx = 0;
+
+ list_for_each_entry(tcset, &cset->threaded_csets,
+ threaded_csets_node) {
+ seq_puts(seq, idx ? "," : "<=");
+ seq_printf(seq, "%pK", tcset);
+ idx++;
+ }
+ } else {
+ seq_printf(seq, " %d", refcnt);
+ if (refcnt - cset->nr_tasks > 0) {
+ int extra = refcnt - cset->nr_tasks;
+
+ seq_printf(seq, " +%d", extra);
+ /*
+ * Take out the one additional reference in
+ * init_css_set.
+ */
+ if (cset == &init_css_set)
+ extra--;
+ extra_refs += extra;
+ }
}
seq_puts(seq, "\n");
@@ -163,10 +185,12 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v)
}
spin_unlock_irq(&css_set_lock);
- if (!dead_cnt && !extra_refs)
+ if (!dead_cnt && !extra_refs && !threaded_csets)
return 0;
seq_puts(seq, "\n");
+ if (threaded_csets)
+ seq_printf(seq, "threaded css_sets = %d\n", threaded_csets);
if (extra_refs)
seq_printf(seq, "extra references = %d\n", extra_refs);
if (dead_cnt)
@@ -352,6 +376,7 @@ static int __init enable_cgroup_debug(char *str)
{
debug_cgrp_subsys.dfl_cftypes = debug_files;
debug_cgrp_subsys.implicit_on_dfl = true;
+ debug_cgrp_subsys.threaded = true;
return 1;
}
__setup("cgroup_debug", enable_cgroup_debug);
diff --git a/kernel/cgroup/freezer.c b/kernel/cgroup/freezer.c
index 1b72d56..0823679 100644
--- a/kernel/cgroup/freezer.c
+++ b/kernel/cgroup/freezer.c
@@ -268,7 +268,7 @@ static void update_if_frozen(struct cgroup_subsys_state *css)
rcu_read_unlock();
/* are all tasks frozen? */
- css_task_iter_start(css, &it);
+ css_task_iter_start(css, 0, &it);
while ((task = css_task_iter_next(&it))) {
if (freezing(task)) {
@@ -320,7 +320,7 @@ static void freeze_cgroup(struct freezer *freezer)
struct css_task_iter it;
struct task_struct *task;
- css_task_iter_start(&freezer->css, &it);
+ css_task_iter_start(&freezer->css, 0, &it);
while ((task = css_task_iter_next(&it)))
freeze_task(task);
css_task_iter_end(&it);
@@ -331,7 +331,7 @@ static void unfreeze_cgroup(struct freezer *freezer)
struct css_task_iter it;
struct task_struct *task;
- css_task_iter_start(&freezer->css, &it);
+ css_task_iter_start(&freezer->css, 0, &it);
while ((task = css_task_iter_next(&it)))
__thaw_task(task);
css_task_iter_end(&it);
diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c
index 2237201..9829c67 100644
--- a/kernel/cgroup/pids.c
+++ b/kernel/cgroup/pids.c
@@ -345,4 +345,5 @@ struct cgroup_subsys pids_cgrp_subsys = {
.free = pids_free,
.legacy_cftypes = pids_files,
.dfl_cftypes = pids_files,
+ .threaded = true,
};
diff --git a/kernel/configs/android-base.config b/kernel/configs/android-base.config
index d708290..d3fd428 100644
--- a/kernel/configs/android-base.config
+++ b/kernel/configs/android-base.config
@@ -10,6 +10,7 @@
# CONFIG_USELIB is not set
CONFIG_ANDROID=y
CONFIG_ANDROID_BINDER_IPC=y
+CONFIG_ANDROID_BINDER_DEVICES=binder,hwbinder,vndbinder
CONFIG_ANDROID_LOW_MEMORY_KILLER=y
CONFIG_ARMV8_DEPRECATED=y
CONFIG_ASHMEM=y
diff --git a/kernel/cpu.c b/kernel/cpu.c
index eee0331..acf5308 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -650,6 +650,7 @@ static int takedown_cpu(unsigned int cpu)
__cpu_die(cpu);
tick_cleanup_dead_cpu(cpu);
+ rcutree_migrate_callbacks(cpu);
return 0;
}
@@ -1252,7 +1253,17 @@ static int cpuhp_store_callbacks(enum cpuhp_state state, const char *name,
struct cpuhp_step *sp;
int ret = 0;
- if (state == CPUHP_AP_ONLINE_DYN || state == CPUHP_BP_PREPARE_DYN) {
+ /*
+ * If name is NULL, then the state gets removed.
+ *
+ * CPUHP_AP_ONLINE_DYN and CPUHP_BP_PREPARE_DYN are handed out on
+ * the first allocation from these dynamic ranges, so the removal
+ * would trigger a new allocation and clear the wrong (already
+ * empty) state, leaving the callbacks of the to be cleared state
+ * dangling, which causes wreckage on the next hotplug operation.
+ */
+ if (name && (state == CPUHP_AP_ONLINE_DYN ||
+ state == CPUHP_BP_PREPARE_DYN)) {
ret = cpuhp_reserve_state(state);
if (ret < 0)
return ret;
diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c
index 009cc9a..67b02e1 100644
--- a/kernel/cpu_pm.c
+++ b/kernel/cpu_pm.c
@@ -22,15 +22,21 @@
#include <linux/spinlock.h>
#include <linux/syscore_ops.h>
-static DEFINE_RWLOCK(cpu_pm_notifier_lock);
-static RAW_NOTIFIER_HEAD(cpu_pm_notifier_chain);
+static ATOMIC_NOTIFIER_HEAD(cpu_pm_notifier_chain);
static int cpu_pm_notify(enum cpu_pm_event event, int nr_to_call, int *nr_calls)
{
int ret;
- ret = __raw_notifier_call_chain(&cpu_pm_notifier_chain, event, NULL,
+ /*
+ * __atomic_notifier_call_chain has a RCU read critical section, which
+ * could be disfunctional in cpu idle. Copy RCU_NONIDLE code to let
+ * RCU know this.
+ */
+ rcu_irq_enter_irqson();
+ ret = __atomic_notifier_call_chain(&cpu_pm_notifier_chain, event, NULL,
nr_to_call, nr_calls);
+ rcu_irq_exit_irqson();
return notifier_to_errno(ret);
}
@@ -47,14 +53,7 @@ static int cpu_pm_notify(enum cpu_pm_event event, int nr_to_call, int *nr_calls)
*/
int cpu_pm_register_notifier(struct notifier_block *nb)
{
- unsigned long flags;
- int ret;
-
- write_lock_irqsave(&cpu_pm_notifier_lock, flags);
- ret = raw_notifier_chain_register(&cpu_pm_notifier_chain, nb);
- write_unlock_irqrestore(&cpu_pm_notifier_lock, flags);
-
- return ret;
+ return atomic_notifier_chain_register(&cpu_pm_notifier_chain, nb);
}
EXPORT_SYMBOL_GPL(cpu_pm_register_notifier);
@@ -69,14 +68,7 @@ EXPORT_SYMBOL_GPL(cpu_pm_register_notifier);
*/
int cpu_pm_unregister_notifier(struct notifier_block *nb)
{
- unsigned long flags;
- int ret;
-
- write_lock_irqsave(&cpu_pm_notifier_lock, flags);
- ret = raw_notifier_chain_unregister(&cpu_pm_notifier_chain, nb);
- write_unlock_irqrestore(&cpu_pm_notifier_lock, flags);
-
- return ret;
+ return atomic_notifier_chain_unregister(&cpu_pm_notifier_chain, nb);
}
EXPORT_SYMBOL_GPL(cpu_pm_unregister_notifier);
@@ -100,7 +92,6 @@ int cpu_pm_enter(void)
int nr_calls;
int ret = 0;
- read_lock(&cpu_pm_notifier_lock);
ret = cpu_pm_notify(CPU_PM_ENTER, -1, &nr_calls);
if (ret)
/*
@@ -108,7 +99,6 @@ int cpu_pm_enter(void)
* PM entry who are notified earlier to prepare for it.
*/
cpu_pm_notify(CPU_PM_ENTER_FAILED, nr_calls - 1, NULL);
- read_unlock(&cpu_pm_notifier_lock);
return ret;
}
@@ -128,13 +118,7 @@ EXPORT_SYMBOL_GPL(cpu_pm_enter);
*/
int cpu_pm_exit(void)
{
- int ret;
-
- read_lock(&cpu_pm_notifier_lock);
- ret = cpu_pm_notify(CPU_PM_EXIT, -1, NULL);
- read_unlock(&cpu_pm_notifier_lock);
-
- return ret;
+ return cpu_pm_notify(CPU_PM_EXIT, -1, NULL);
}
EXPORT_SYMBOL_GPL(cpu_pm_exit);
@@ -159,7 +143,6 @@ int cpu_cluster_pm_enter(void)
int nr_calls;
int ret = 0;
- read_lock(&cpu_pm_notifier_lock);
ret = cpu_pm_notify(CPU_CLUSTER_PM_ENTER, -1, &nr_calls);
if (ret)
/*
@@ -167,7 +150,6 @@ int cpu_cluster_pm_enter(void)
* PM entry who are notified earlier to prepare for it.
*/
cpu_pm_notify(CPU_CLUSTER_PM_ENTER_FAILED, nr_calls - 1, NULL);
- read_unlock(&cpu_pm_notifier_lock);
return ret;
}
@@ -190,13 +172,7 @@ EXPORT_SYMBOL_GPL(cpu_cluster_pm_enter);
*/
int cpu_cluster_pm_exit(void)
{
- int ret;
-
- read_lock(&cpu_pm_notifier_lock);
- ret = cpu_pm_notify(CPU_CLUSTER_PM_EXIT, -1, NULL);
- read_unlock(&cpu_pm_notifier_lock);
-
- return ret;
+ return cpu_pm_notify(CPU_CLUSTER_PM_EXIT, -1, NULL);
}
EXPORT_SYMBOL_GPL(cpu_cluster_pm_exit);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 426c2ff..3e691b7 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1249,26 +1249,31 @@ unclone_ctx(struct perf_event_context *ctx)
return parent_ctx;
}
-static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
+static u32 perf_event_pid_type(struct perf_event *event, struct task_struct *p,
+ enum pid_type type)
{
+ u32 nr;
/*
* only top level events have the pid namespace they were created in
*/
if (event->parent)
event = event->parent;
- return task_tgid_nr_ns(p, event->ns);
+ nr = __task_pid_nr_ns(p, type, event->ns);
+ /* avoid -1 if it is idle thread or runs in another ns */
+ if (!nr && !pid_alive(p))
+ nr = -1;
+ return nr;
}
-static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
+static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
{
- /*
- * only top level events have the pid namespace they were created in
- */
- if (event->parent)
- event = event->parent;
+ return perf_event_pid_type(event, p, __PIDTYPE_TGID);
+}
- return task_pid_nr_ns(p, event->ns);
+static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
+{
+ return perf_event_pid_type(event, p, PIDTYPE_PID);
}
/*
@@ -1570,6 +1575,9 @@ static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
if (sample_type & PERF_SAMPLE_TRANSACTION)
size += sizeof(data->txn);
+ if (sample_type & PERF_SAMPLE_PHYS_ADDR)
+ size += sizeof(data->phys_addr);
+
event->header_size = size;
}
@@ -2217,6 +2225,33 @@ static int group_can_go_on(struct perf_event *event,
return can_add_hw;
}
+/*
+ * Complement to update_event_times(). This computes the tstamp_* values to
+ * continue 'enabled' state from @now, and effectively discards the time
+ * between the prior tstamp_stopped and now (as we were in the OFF state, or
+ * just switched (context) time base).
+ *
+ * This further assumes '@event->state == INACTIVE' (we just came from OFF) and
+ * cannot have been scheduled in yet. And going into INACTIVE state means
+ * '@event->tstamp_stopped = @now'.
+ *
+ * Thus given the rules of update_event_times():
+ *
+ * total_time_enabled = tstamp_stopped - tstamp_enabled
+ * total_time_running = tstamp_stopped - tstamp_running
+ *
+ * We can insert 'tstamp_stopped == now' and reverse them to compute new
+ * tstamp_* values.
+ */
+static void __perf_event_enable_time(struct perf_event *event, u64 now)
+{
+ WARN_ON_ONCE(event->state != PERF_EVENT_STATE_INACTIVE);
+
+ event->tstamp_stopped = now;
+ event->tstamp_enabled = now - event->total_time_enabled;
+ event->tstamp_running = now - event->total_time_running;
+}
+
static void add_event_to_ctx(struct perf_event *event,
struct perf_event_context *ctx)
{
@@ -2224,9 +2259,12 @@ static void add_event_to_ctx(struct perf_event *event,
list_add_event(event, ctx);
perf_group_attach(event);
- event->tstamp_enabled = tstamp;
- event->tstamp_running = tstamp;
- event->tstamp_stopped = tstamp;
+ /*
+ * We can be called with event->state == STATE_OFF when we create with
+ * .disabled = 1. In that case the IOC_ENABLE will call this function.
+ */
+ if (event->state == PERF_EVENT_STATE_INACTIVE)
+ __perf_event_enable_time(event, tstamp);
}
static void ctx_sched_out(struct perf_event_context *ctx,
@@ -2471,10 +2509,11 @@ static void __perf_event_mark_enabled(struct perf_event *event)
u64 tstamp = perf_event_time(event);
event->state = PERF_EVENT_STATE_INACTIVE;
- event->tstamp_enabled = tstamp - event->total_time_enabled;
+ __perf_event_enable_time(event, tstamp);
list_for_each_entry(sub, &event->sibling_list, group_entry) {
+ /* XXX should not be > INACTIVE if event isn't */
if (sub->state >= PERF_EVENT_STATE_INACTIVE)
- sub->tstamp_enabled = tstamp - sub->total_time_enabled;
+ __perf_event_enable_time(sub, tstamp);
}
}
@@ -3180,6 +3219,13 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
return;
perf_ctx_lock(cpuctx, ctx);
+ /*
+ * We must check ctx->nr_events while holding ctx->lock, such
+ * that we serialize against perf_install_in_context().
+ */
+ if (!ctx->nr_events)
+ goto unlock;
+
perf_pmu_disable(ctx->pmu);
/*
* We want to keep the following priority order:
@@ -3193,6 +3239,8 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
perf_event_sched_in(cpuctx, ctx, task);
perf_pmu_enable(ctx->pmu);
+
+unlock:
perf_ctx_unlock(cpuctx, ctx);
}
@@ -3625,10 +3673,7 @@ unlock:
static inline u64 perf_event_count(struct perf_event *event)
{
- if (event->pmu->count)
- return event->pmu->count(event);
-
- return __perf_event_count(event);
+ return local64_read(&event->count) + atomic64_read(&event->child_count);
}
/*
@@ -3659,15 +3704,6 @@ int perf_event_read_local(struct perf_event *event, u64 *value)
goto out;
}
- /*
- * It must not have a pmu::count method, those are not
- * NMI safe.
- */
- if (event->pmu->count) {
- ret = -EOPNOTSUPP;
- goto out;
- }
-
/* If this is a per-task event, it must be for current */
if ((event->attach_state & PERF_ATTACH_TASK) &&
event->hw.target != current) {
@@ -5090,7 +5126,7 @@ static void perf_mmap_open(struct vm_area_struct *vma)
atomic_inc(&event->rb->aux_mmap_count);
if (event->pmu->event_mapped)
- event->pmu->event_mapped(event);
+ event->pmu->event_mapped(event, vma->vm_mm);
}
static void perf_pmu_output_stop(struct perf_event *event);
@@ -5113,7 +5149,7 @@ static void perf_mmap_close(struct vm_area_struct *vma)
unsigned long size = perf_data_size(rb);
if (event->pmu->event_unmapped)
- event->pmu->event_unmapped(event);
+ event->pmu->event_unmapped(event, vma->vm_mm);
/*
* rb->aux_mmap_count will always drop before rb->mmap_count and
@@ -5411,7 +5447,7 @@ aux_unlock:
vma->vm_ops = &perf_mmap_vmops;
if (event->pmu->event_mapped)
- event->pmu->event_mapped(event);
+ event->pmu->event_mapped(event, vma->vm_mm);
return ret;
}
@@ -5972,6 +6008,9 @@ void perf_output_sample(struct perf_output_handle *handle,
}
}
+ if (sample_type & PERF_SAMPLE_PHYS_ADDR)
+ perf_output_put(handle, data->phys_addr);
+
if (!event->attr.watermark) {
int wakeup_events = event->attr.wakeup_events;
@@ -5987,6 +6026,38 @@ void perf_output_sample(struct perf_output_handle *handle,
}
}
+static u64 perf_virt_to_phys(u64 virt)
+{
+ u64 phys_addr = 0;
+ struct page *p = NULL;
+
+ if (!virt)
+ return 0;
+
+ if (virt >= TASK_SIZE) {
+ /* If it's vmalloc()d memory, leave phys_addr as 0 */
+ if (virt_addr_valid((void *)(uintptr_t)virt) &&
+ !(virt >= VMALLOC_START && virt < VMALLOC_END))
+ phys_addr = (u64)virt_to_phys((void *)(uintptr_t)virt);
+ } else {
+ /*
+ * Walking the pages tables for user address.
+ * Interrupts are disabled, so it prevents any tear down
+ * of the page tables.
+ * Try IRQ-safe __get_user_pages_fast first.
+ * If failed, leave phys_addr as 0.
+ */
+ if ((current->mm != NULL) &&
+ (__get_user_pages_fast(virt, 1, 0, &p) == 1))
+ phys_addr = page_to_phys(p) + virt % PAGE_SIZE;
+
+ if (p)
+ put_page(p);
+ }
+
+ return phys_addr;
+}
+
void perf_prepare_sample(struct perf_event_header *header,
struct perf_sample_data *data,
struct perf_event *event,
@@ -6105,6 +6176,9 @@ void perf_prepare_sample(struct perf_event_header *header,
header->size += size;
}
+
+ if (sample_type & PERF_SAMPLE_PHYS_ADDR)
+ data->phys_addr = perf_virt_to_phys(data->addr);
}
static void __always_inline
@@ -7256,6 +7330,11 @@ static void perf_log_throttle(struct perf_event *event, int enable)
perf_output_end(&handle);
}
+void perf_event_itrace_started(struct perf_event *event)
+{
+ event->attach_state |= PERF_ATTACH_ITRACE;
+}
+
static void perf_log_itrace_start(struct perf_event *event)
{
struct perf_output_handle handle;
@@ -7271,7 +7350,7 @@ static void perf_log_itrace_start(struct perf_event *event)
event = event->parent;
if (!(event->pmu->capabilities & PERF_PMU_CAP_ITRACE) ||
- event->hw.itrace_started)
+ event->attach_state & PERF_ATTACH_ITRACE)
return;
rec.header.type = PERF_RECORD_ITRACE_START;
@@ -7875,16 +7954,15 @@ void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx,
}
}
perf_tp_event(call->event.type, count, raw_data, size, regs, head,
- rctx, task);
+ rctx, task, NULL);
}
EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit);
void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
struct pt_regs *regs, struct hlist_head *head, int rctx,
- struct task_struct *task)
+ struct task_struct *task, struct perf_event *event)
{
struct perf_sample_data data;
- struct perf_event *event;
struct perf_raw_record raw = {
.frag = {
@@ -7898,9 +7976,15 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
perf_trace_buf_update(record, event_type);
- hlist_for_each_entry_rcu(event, head, hlist_entry) {
+ /* Use the given event instead of the hlist */
+ if (event) {
if (perf_tp_event_match(event, &data, regs))
perf_swevent_event(event, count, &data, regs);
+ } else {
+ hlist_for_each_entry_rcu(event, head, hlist_entry) {
+ if (perf_tp_event_match(event, &data, regs))
+ perf_swevent_event(event, count, &data, regs);
+ }
}
/*
@@ -8050,7 +8134,7 @@ static void perf_event_free_bpf_handler(struct perf_event *event)
static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
{
- bool is_kprobe, is_tracepoint;
+ bool is_kprobe, is_tracepoint, is_syscall_tp;
struct bpf_prog *prog;
if (event->attr.type != PERF_TYPE_TRACEPOINT)
@@ -8061,7 +8145,8 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE;
is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT;
- if (!is_kprobe && !is_tracepoint)
+ is_syscall_tp = is_syscall_trace_event(event->tp_event);
+ if (!is_kprobe && !is_tracepoint && !is_syscall_tp)
/* bpf programs can only be attached to u/kprobe or tracepoint */
return -EINVAL;
@@ -8070,13 +8155,14 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
return PTR_ERR(prog);
if ((is_kprobe && prog->type != BPF_PROG_TYPE_KPROBE) ||
- (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT)) {
+ (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT) ||
+ (is_syscall_tp && prog->type != BPF_PROG_TYPE_TRACEPOINT)) {
/* valid fd, but invalid bpf program type */
bpf_prog_put(prog);
return -EINVAL;
}
- if (is_tracepoint) {
+ if (is_tracepoint || is_syscall_tp) {
int off = trace_event_get_offsets(event->tp_event);
if (prog->aux->max_ctx_offset > off) {
@@ -9580,6 +9666,8 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
if (ret)
return -EFAULT;
+ attr->size = size;
+
if (attr->__reserved_1)
return -EINVAL;
@@ -9852,6 +9940,11 @@ SYSCALL_DEFINE5(perf_event_open,
return -EINVAL;
}
+ /* Only privileged users can get physical addresses */
+ if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR) &&
+ perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
+ return -EACCES;
+
if (!attr.sample_max_stack)
attr.sample_max_stack = sysctl_perf_event_max_stack;
@@ -10001,28 +10094,27 @@ SYSCALL_DEFINE5(perf_event_open,
goto err_context;
/*
- * Do not allow to attach to a group in a different
- * task or CPU context:
+ * Make sure we're both events for the same CPU;
+ * grouping events for different CPUs is broken; since
+ * you can never concurrently schedule them anyhow.
*/
- if (move_group) {
- /*
- * Make sure we're both on the same task, or both
- * per-cpu events.
- */
- if (group_leader->ctx->task != ctx->task)
- goto err_context;
+ if (group_leader->cpu != event->cpu)
+ goto err_context;
- /*
- * Make sure we're both events for the same CPU;
- * grouping events for different CPUs is broken; since
- * you can never concurrently schedule them anyhow.
- */
- if (group_leader->cpu != event->cpu)
- goto err_context;
- } else {
- if (group_leader->ctx != ctx)
- goto err_context;
- }
+ /*
+ * Make sure we're both on the same task, or both
+ * per-CPU events.
+ */
+ if (group_leader->ctx->task != ctx->task)
+ goto err_context;
+
+ /*
+ * Do not allow to attach to a group in a different task
+ * or CPU context. If we're moving SW events, we'll fix
+ * this up later, so allow that.
+ */
+ if (!move_group && group_leader->ctx != ctx)
+ goto err_context;
/*
* Only a group leader can be exclusive or pinned
@@ -11201,5 +11293,6 @@ struct cgroup_subsys perf_event_cgrp_subsys = {
* controller is not mounted on a legacy hierarchy.
*/
.implicit_on_dfl = true,
+ .threaded = true,
};
#endif /* CONFIG_CGROUP_PERF */
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 486fd78..843e970 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -38,9 +38,9 @@ struct ring_buffer {
struct user_struct *mmap_user;
/* AUX area */
- local_t aux_head;
+ long aux_head;
local_t aux_nest;
- local_t aux_wakeup;
+ long aux_wakeup; /* last aux_watermark boundary crossed by aux_head */
unsigned long aux_pgoff;
int aux_nr_pages;
int aux_overwrite;
@@ -208,7 +208,7 @@ static inline int get_recursion_context(int *recursion)
{
int rctx;
- if (in_nmi())
+ if (unlikely(in_nmi()))
rctx = 3;
else if (in_irq())
rctx = 2;
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index ee97196..af71a84e 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -367,7 +367,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle,
if (WARN_ON_ONCE(local_xchg(&rb->aux_nest, 1)))
goto err_put;
- aux_head = local_read(&rb->aux_head);
+ aux_head = rb->aux_head;
handle->rb = rb;
handle->event = event;
@@ -382,7 +382,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle,
*/
if (!rb->aux_overwrite) {
aux_tail = ACCESS_ONCE(rb->user_page->aux_tail);
- handle->wakeup = local_read(&rb->aux_wakeup) + rb->aux_watermark;
+ handle->wakeup = rb->aux_wakeup + rb->aux_watermark;
if (aux_head - aux_tail < perf_aux_size(rb))
handle->size = CIRC_SPACE(aux_head, aux_tail, perf_aux_size(rb));
@@ -433,12 +433,12 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size)
handle->aux_flags |= PERF_AUX_FLAG_OVERWRITE;
aux_head = handle->head;
- local_set(&rb->aux_head, aux_head);
+ rb->aux_head = aux_head;
} else {
handle->aux_flags &= ~PERF_AUX_FLAG_OVERWRITE;
- aux_head = local_read(&rb->aux_head);
- local_add(size, &rb->aux_head);
+ aux_head = rb->aux_head;
+ rb->aux_head += size;
}
if (size || handle->aux_flags) {
@@ -450,11 +450,10 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size)
handle->aux_flags);
}
- aux_head = rb->user_page->aux_head = local_read(&rb->aux_head);
-
- if (aux_head - local_read(&rb->aux_wakeup) >= rb->aux_watermark) {
+ rb->user_page->aux_head = rb->aux_head;
+ if (rb->aux_head - rb->aux_wakeup >= rb->aux_watermark) {
wakeup = true;
- local_add(rb->aux_watermark, &rb->aux_wakeup);
+ rb->aux_wakeup = rounddown(rb->aux_head, rb->aux_watermark);
}
if (wakeup) {
@@ -478,22 +477,20 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size)
int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size)
{
struct ring_buffer *rb = handle->rb;
- unsigned long aux_head;
if (size > handle->size)
return -ENOSPC;
- local_add(size, &rb->aux_head);
+ rb->aux_head += size;
- aux_head = rb->user_page->aux_head = local_read(&rb->aux_head);
- if (aux_head - local_read(&rb->aux_wakeup) >= rb->aux_watermark) {
+ rb->user_page->aux_head = rb->aux_head;
+ if (rb->aux_head - rb->aux_wakeup >= rb->aux_watermark) {
perf_output_wakeup(handle);
- local_add(rb->aux_watermark, &rb->aux_wakeup);
- handle->wakeup = local_read(&rb->aux_wakeup) +
- rb->aux_watermark;
+ rb->aux_wakeup = rounddown(rb->aux_head, rb->aux_watermark);
+ handle->wakeup = rb->aux_wakeup + rb->aux_watermark;
}
- handle->head = aux_head;
+ handle->head = rb->aux_head;
handle->size -= size;
return 0;
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 0e137f9..267f6ef 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -1262,8 +1262,6 @@ void uprobe_end_dup_mmap(void)
void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm)
{
- newmm->uprobes_state.xol_area = NULL;
-
if (test_bit(MMF_HAS_UPROBES, &oldmm->flags)) {
set_bit(MMF_HAS_UPROBES, &newmm->flags);
/* unconditionally, dup_mmap() skips VM_DONTCOPY vmas */
diff --git a/kernel/exit.c b/kernel/exit.c
index c5548fa..a35d8a1 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -764,7 +764,6 @@ void __noreturn do_exit(long code)
{
struct task_struct *tsk = current;
int group_dead;
- TASKS_RCU(int tasks_rcu_i);
profile_task_exit(tsk);
kcov_task_exit(tsk);
@@ -819,7 +818,8 @@ void __noreturn do_exit(long code)
* Ensure that we must observe the pi_state in exit_mm() ->
* mm_release() -> exit_pi_state_list().
*/
- raw_spin_unlock_wait(&tsk->pi_lock);
+ raw_spin_lock_irq(&tsk->pi_lock);
+ raw_spin_unlock_irq(&tsk->pi_lock);
if (unlikely(in_atomic())) {
pr_info("note: %s[%d] exited with preempt_count %d\n",
@@ -881,9 +881,7 @@ void __noreturn do_exit(long code)
*/
flush_ptrace_hw_breakpoint(tsk);
- TASKS_RCU(preempt_disable());
- TASKS_RCU(tasks_rcu_i = __srcu_read_lock(&tasks_rcu_exit_srcu));
- TASKS_RCU(preempt_enable());
+ exit_tasks_rcu_start();
exit_notify(tsk, group_dead);
proc_exit_connector(tsk);
mpol_put_task_policy(tsk);
@@ -918,8 +916,9 @@ void __noreturn do_exit(long code)
if (tsk->nr_dirtied)
__this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied);
exit_rcu();
- TASKS_RCU(__srcu_read_unlock(&tasks_rcu_exit_srcu, tasks_rcu_i));
+ exit_tasks_rcu_finish();
+ lockdep_free_task(tsk);
do_task_dead();
}
EXPORT_SYMBOL_GPL(do_exit);
diff --git a/kernel/fork.c b/kernel/fork.c
index 17921b0..24a4c0b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -88,6 +88,7 @@
#include <linux/sysctl.h>
#include <linux/kcov.h>
#include <linux/livepatch.h>
+#include <linux/thread_info.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
@@ -217,7 +218,7 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
return s->addr;
}
- stack = __vmalloc_node_range(THREAD_SIZE, THREAD_SIZE,
+ stack = __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN,
VMALLOC_START, VMALLOC_END,
THREADINFO_GFP,
PAGE_KERNEL,
@@ -484,6 +485,8 @@ void __init fork_init(void)
cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache",
NULL, free_vm_stack_cache);
#endif
+
+ lockdep_init_task(&init_task);
}
int __weak arch_dup_task_struct(struct task_struct *dst,
@@ -654,7 +657,12 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
retval = dup_userfaultfd(tmp, &uf);
if (retval)
goto fail_nomem_anon_vma_fork;
- if (anon_vma_fork(tmp, mpnt))
+ if (tmp->vm_flags & VM_WIPEONFORK) {
+ /* VM_WIPEONFORK gets a clean slate in the child. */
+ tmp->anon_vma = NULL;
+ if (anon_vma_prepare(tmp))
+ goto fail_nomem_anon_vma_fork;
+ } else if (anon_vma_fork(tmp, mpnt))
goto fail_nomem_anon_vma_fork;
tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT);
tmp->vm_next = tmp->vm_prev = NULL;
@@ -698,7 +706,8 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
rb_parent = &tmp->vm_rb;
mm->map_count++;
- retval = copy_page_range(mm, oldmm, mpnt);
+ if (!(tmp->vm_flags & VM_WIPEONFORK))
+ retval = copy_page_range(mm, oldmm, mpnt);
if (tmp->vm_ops && tmp->vm_ops->open)
tmp->vm_ops->open(tmp);
@@ -785,6 +794,13 @@ static void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
#endif
}
+static void mm_init_uprobes_state(struct mm_struct *mm)
+{
+#ifdef CONFIG_UPROBES
+ mm->uprobes_state.xol_area = NULL;
+#endif
+}
+
static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
struct user_namespace *user_ns)
{
@@ -806,11 +822,13 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
mm_init_cpumask(mm);
mm_init_aio(mm);
mm_init_owner(mm, p);
+ RCU_INIT_POINTER(mm->exe_file, NULL);
mmu_notifier_mm_init(mm);
- clear_tlb_flush_pending(mm);
+ init_tlb_flush_pending(mm);
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
mm->pmd_huge_pte = NULL;
#endif
+ mm_init_uprobes_state(mm);
if (current->mm) {
mm->flags = current->mm->flags & MMF_INIT_MASK;
@@ -910,7 +928,6 @@ static inline void __mmput(struct mm_struct *mm)
}
if (mm->binfmt)
module_put(mm->binfmt->module);
- set_bit(MMF_OOM_SKIP, &mm->flags);
mmdrop(mm);
}
@@ -926,22 +943,6 @@ void mmput(struct mm_struct *mm)
}
EXPORT_SYMBOL_GPL(mmput);
-#ifdef CONFIG_MMU
-static void mmput_async_fn(struct work_struct *work)
-{
- struct mm_struct *mm = container_of(work, struct mm_struct, async_put_work);
- __mmput(mm);
-}
-
-void mmput_async(struct mm_struct *mm)
-{
- if (atomic_dec_and_test(&mm->mm_users)) {
- INIT_WORK(&mm->async_put_work, mmput_async_fn);
- schedule_work(&mm->async_put_work);
- }
-}
-#endif
-
/**
* set_mm_exe_file - change a reference to the mm's executable file
*
@@ -1691,6 +1692,7 @@ static __latent_entropy struct task_struct *copy_process(
p->lockdep_depth = 0; /* no locks held yet */
p->curr_chain_key = 0;
p->lockdep_recursion = 0;
+ lockdep_init_task(p);
#endif
#ifdef CONFIG_DEBUG_MUTEXES
@@ -1949,6 +1951,7 @@ bad_fork_cleanup_audit:
bad_fork_cleanup_perf:
perf_event_free_task(p);
bad_fork_cleanup_policy:
+ lockdep_free_task(p);
#ifdef CONFIG_NUMA
mpol_put(p->mempolicy);
bad_fork_cleanup_threadgroup_lock:
diff --git a/kernel/futex.c b/kernel/futex.c
index 16dbe4c..3d38eaf 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -670,13 +670,14 @@ again:
* this reference was taken by ihold under the page lock
* pinning the inode in place so i_lock was unnecessary. The
* only way for this check to fail is if the inode was
- * truncated in parallel so warn for now if this happens.
+ * truncated in parallel which is almost certainly an
+ * application bug. In such a case, just retry.
*
* We are not calling into get_futex_key_refs() in file-backed
* cases, therefore a successful atomic_inc return below will
* guarantee that get_futex_key() will still imply smp_mb(); (B).
*/
- if (WARN_ON_ONCE(!atomic_inc_not_zero(&inode->i_count))) {
+ if (!atomic_inc_not_zero(&inode->i_count)) {
rcu_read_unlock();
put_page(page);
@@ -875,6 +876,8 @@ static struct task_struct *futex_find_get_task(pid_t pid)
return p;
}
+#ifdef CONFIG_FUTEX_PI
+
/*
* This task is holding PI mutexes at exit time => bad.
* Kernel cleans up PI-state, but userspace is likely hosed.
@@ -932,6 +935,8 @@ void exit_pi_state_list(struct task_struct *curr)
raw_spin_unlock_irq(&curr->pi_lock);
}
+#endif
+
/*
* We need to check the following states:
*
@@ -1546,6 +1551,45 @@ out:
return ret;
}
+static int futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr)
+{
+ unsigned int op = (encoded_op & 0x70000000) >> 28;
+ unsigned int cmp = (encoded_op & 0x0f000000) >> 24;
+ int oparg = sign_extend32((encoded_op & 0x00fff000) >> 12, 12);
+ int cmparg = sign_extend32(encoded_op & 0x00000fff, 12);
+ int oldval, ret;
+
+ if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) {
+ if (oparg < 0 || oparg > 31)
+ return -EINVAL;
+ oparg = 1 << oparg;
+ }
+
+ if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
+ return -EFAULT;
+
+ ret = arch_futex_atomic_op_inuser(op, oparg, &oldval, uaddr);
+ if (ret)
+ return ret;
+
+ switch (cmp) {
+ case FUTEX_OP_CMP_EQ:
+ return oldval == cmparg;
+ case FUTEX_OP_CMP_NE:
+ return oldval != cmparg;
+ case FUTEX_OP_CMP_LT:
+ return oldval < cmparg;
+ case FUTEX_OP_CMP_GE:
+ return oldval >= cmparg;
+ case FUTEX_OP_CMP_LE:
+ return oldval <= cmparg;
+ case FUTEX_OP_CMP_GT:
+ return oldval > cmparg;
+ default:
+ return -ENOSYS;
+ }
+}
+
/*
* Wake up all waiters hashed on the physical page that is mapped
* to this virtual address:
@@ -1799,6 +1843,15 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
struct futex_q *this, *next;
DEFINE_WAKE_Q(wake_q);
+ /*
+ * When PI not supported: return -ENOSYS if requeue_pi is true,
+ * consequently the compiler knows requeue_pi is always false past
+ * this point which will optimize away all the conditional code
+ * further down.
+ */
+ if (!IS_ENABLED(CONFIG_FUTEX_PI) && requeue_pi)
+ return -ENOSYS;
+
if (requeue_pi) {
/*
* Requeue PI only works on two distinct uaddrs. This
@@ -2594,6 +2647,9 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
struct futex_q q = futex_q_init;
int res, ret;
+ if (!IS_ENABLED(CONFIG_FUTEX_PI))
+ return -ENOSYS;
+
if (refill_pi_state_cache())
return -ENOMEM;
@@ -2773,6 +2829,9 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
struct futex_q *top_waiter;
int ret;
+ if (!IS_ENABLED(CONFIG_FUTEX_PI))
+ return -ENOSYS;
+
retry:
if (get_user(uval, uaddr))
return -EFAULT;
@@ -2983,6 +3042,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
struct futex_q q = futex_q_init;
int res, ret;
+ if (!IS_ENABLED(CONFIG_FUTEX_PI))
+ return -ENOSYS;
+
if (uaddr == uaddr2)
return -EINVAL;
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 27c4e77..a117adf 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -63,11 +63,20 @@ config GENERIC_IRQ_CHIP
config IRQ_DOMAIN
bool
+# Support for simulated interrupts
+config IRQ_SIM
+ bool
+ select IRQ_WORK
+
# Support for hierarchical irq domains
config IRQ_DOMAIN_HIERARCHY
bool
select IRQ_DOMAIN
+# Support for hierarchical fasteoi+edge and fasteoi+level handlers
+config IRQ_FASTEOI_HIERARCHY_HANDLERS
+ bool
+
# Generic IRQ IPI support
config GENERIC_IRQ_IPI
bool
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index e4aef73..1970caf 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -4,6 +4,7 @@ obj-$(CONFIG_IRQ_TIMINGS) += timings.o
obj-$(CONFIG_GENERIC_IRQ_CHIP) += generic-chip.o
obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
obj-$(CONFIG_IRQ_DOMAIN) += irqdomain.o
+obj-$(CONFIG_IRQ_SIM) += irq_sim.o
obj-$(CONFIG_PROC_FS) += proc.o
obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
obj-$(CONFIG_GENERIC_IRQ_MIGRATION) += cpuhotplug.o
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index a3cc37c..f51b7b6 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -1000,7 +1000,7 @@ EXPORT_SYMBOL_GPL(irq_set_chip_and_handler_name);
void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set)
{
- unsigned long flags;
+ unsigned long flags, trigger, tmp;
struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
if (!desc)
@@ -1014,6 +1014,8 @@ void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set)
irq_settings_clr_and_set(desc, clr, set);
+ trigger = irqd_get_trigger_type(&desc->irq_data);
+
irqd_clear(&desc->irq_data, IRQD_NO_BALANCING | IRQD_PER_CPU |
IRQD_TRIGGER_MASK | IRQD_LEVEL | IRQD_MOVE_PCNTXT);
if (irq_settings_has_no_balance_set(desc))
@@ -1025,7 +1027,11 @@ void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set)
if (irq_settings_is_level(desc))
irqd_set(&desc->irq_data, IRQD_LEVEL);
- irqd_set(&desc->irq_data, irq_settings_get_trigger_mask(desc));
+ tmp = irq_settings_get_trigger_mask(desc);
+ if (tmp != IRQ_TYPE_NONE)
+ trigger = tmp;
+
+ irqd_set(&desc->irq_data, trigger);
irq_put_desc_unlock(desc, flags);
}
@@ -1092,6 +1098,112 @@ void irq_cpu_offline(void)
}
#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
+
+#ifdef CONFIG_IRQ_FASTEOI_HIERARCHY_HANDLERS
+/**
+ * handle_fasteoi_ack_irq - irq handler for edge hierarchy
+ * stacked on transparent controllers
+ *
+ * @desc: the interrupt description structure for this irq
+ *
+ * Like handle_fasteoi_irq(), but for use with hierarchy where
+ * the irq_chip also needs to have its ->irq_ack() function
+ * called.
+ */
+void handle_fasteoi_ack_irq(struct irq_desc *desc)
+{
+ struct irq_chip *chip = desc->irq_data.chip;
+
+ raw_spin_lock(&desc->lock);
+
+ if (!irq_may_run(desc))
+ goto out;
+
+ desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
+
+ /*
+ * If its disabled or no action available
+ * then mask it and get out of here:
+ */
+ if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) {
+ desc->istate |= IRQS_PENDING;
+ mask_irq(desc);
+ goto out;
+ }
+
+ kstat_incr_irqs_this_cpu(desc);
+ if (desc->istate & IRQS_ONESHOT)
+ mask_irq(desc);
+
+ /* Start handling the irq */
+ desc->irq_data.chip->irq_ack(&desc->irq_data);
+
+ preflow_handler(desc);
+ handle_irq_event(desc);
+
+ cond_unmask_eoi_irq(desc, chip);
+
+ raw_spin_unlock(&desc->lock);
+ return;
+out:
+ if (!(chip->flags & IRQCHIP_EOI_IF_HANDLED))
+ chip->irq_eoi(&desc->irq_data);
+ raw_spin_unlock(&desc->lock);
+}
+EXPORT_SYMBOL_GPL(handle_fasteoi_ack_irq);
+
+/**
+ * handle_fasteoi_mask_irq - irq handler for level hierarchy
+ * stacked on transparent controllers
+ *
+ * @desc: the interrupt description structure for this irq
+ *
+ * Like handle_fasteoi_irq(), but for use with hierarchy where
+ * the irq_chip also needs to have its ->irq_mask_ack() function
+ * called.
+ */
+void handle_fasteoi_mask_irq(struct irq_desc *desc)
+{
+ struct irq_chip *chip = desc->irq_data.chip;
+
+ raw_spin_lock(&desc->lock);
+ mask_ack_irq(desc);
+
+ if (!irq_may_run(desc))
+ goto out;
+
+ desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
+
+ /*
+ * If its disabled or no action available
+ * then mask it and get out of here:
+ */
+ if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) {
+ desc->istate |= IRQS_PENDING;
+ mask_irq(desc);
+ goto out;
+ }
+
+ kstat_incr_irqs_this_cpu(desc);
+ if (desc->istate & IRQS_ONESHOT)
+ mask_irq(desc);
+
+ preflow_handler(desc);
+ handle_irq_event(desc);
+
+ cond_unmask_eoi_irq(desc, chip);
+
+ raw_spin_unlock(&desc->lock);
+ return;
+out:
+ if (!(chip->flags & IRQCHIP_EOI_IF_HANDLED))
+ chip->irq_eoi(&desc->irq_data);
+ raw_spin_unlock(&desc->lock);
+}
+EXPORT_SYMBOL_GPL(handle_fasteoi_mask_irq);
+
+#endif /* CONFIG_IRQ_FASTEOI_HIERARCHY_HANDLERS */
+
/**
* irq_chip_enable_parent - Enable the parent interrupt (defaults to unmask if
* NULL)
@@ -1105,6 +1217,7 @@ void irq_chip_enable_parent(struct irq_data *data)
else
data->chip->irq_unmask(data);
}
+EXPORT_SYMBOL_GPL(irq_chip_enable_parent);
/**
* irq_chip_disable_parent - Disable the parent interrupt (defaults to mask if
@@ -1119,6 +1232,7 @@ void irq_chip_disable_parent(struct irq_data *data)
else
data->chip->irq_mask(data);
}
+EXPORT_SYMBOL_GPL(irq_chip_disable_parent);
/**
* irq_chip_ack_parent - Acknowledge the parent interrupt
@@ -1181,6 +1295,7 @@ int irq_chip_set_affinity_parent(struct irq_data *data,
return -ENOSYS;
}
+EXPORT_SYMBOL_GPL(irq_chip_set_affinity_parent);
/**
* irq_chip_set_type_parent - Set IRQ type on the parent interrupt
diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c
index aee8f7e..638eb9c 100644
--- a/kernel/irq/cpuhotplug.c
+++ b/kernel/irq/cpuhotplug.c
@@ -95,8 +95,13 @@ static bool migrate_one_irq(struct irq_desc *desc)
affinity = cpu_online_mask;
brokeaff = true;
}
-
- err = irq_do_set_affinity(d, affinity, true);
+ /*
+ * Do not set the force argument of irq_do_set_affinity() as this
+ * disables the masking of offline CPUs from the supplied affinity
+ * mask and therefore might keep/reassign the irq to the outgoing
+ * CPU.
+ */
+ err = irq_do_set_affinity(d, affinity, false);
if (err) {
pr_warn_ratelimited("IRQ%u: set affinity failed(%d).\n",
d->irq, err);
diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c
index 4d384ed..c3fdb36 100644
--- a/kernel/irq/debugfs.c
+++ b/kernel/irq/debugfs.c
@@ -5,6 +5,7 @@
*/
#include <linux/irqdomain.h>
#include <linux/irq.h>
+#include <linux/uaccess.h>
#include "internals.h"
@@ -171,8 +172,55 @@ static int irq_debug_open(struct inode *inode, struct file *file)
return single_open(file, irq_debug_show, inode->i_private);
}
+static ssize_t irq_debug_write(struct file *file, const char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ struct irq_desc *desc = file_inode(file)->i_private;
+ char buf[8] = { 0, };
+ size_t size;
+
+ size = min(sizeof(buf) - 1, count);
+ if (copy_from_user(buf, user_buf, size))
+ return -EFAULT;
+
+ if (!strncmp(buf, "trigger", size)) {
+ unsigned long flags;
+ int err;
+
+ /* Try the HW interface first */
+ err = irq_set_irqchip_state(irq_desc_get_irq(desc),
+ IRQCHIP_STATE_PENDING, true);
+ if (!err)
+ return count;
+
+ /*
+ * Otherwise, try to inject via the resend interface,
+ * which may or may not succeed.
+ */
+ chip_bus_lock(desc);
+ raw_spin_lock_irqsave(&desc->lock, flags);
+
+ if (irq_settings_is_level(desc)) {
+ /* Can't do level, sorry */
+ err = -EINVAL;
+ } else {
+ desc->istate |= IRQS_PENDING;
+ check_irq_resend(desc);
+ err = 0;
+ }
+
+ raw_spin_unlock_irqrestore(&desc->lock, flags);
+ chip_bus_sync_unlock(desc);
+
+ return err ? err : count;
+ }
+
+ return count;
+}
+
static const struct file_operations dfs_irq_ops = {
.open = irq_debug_open,
+ .write = irq_debug_write,
.read = seq_read,
.llseek = seq_lseek,
.release = single_release,
@@ -186,7 +234,7 @@ void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *desc)
return;
sprintf(name, "%d", irq);
- desc->debugfs_file = debugfs_create_file(name, 0444, irq_dir, desc,
+ desc->debugfs_file = debugfs_create_file(name, 0644, irq_dir, desc,
&dfs_irq_ops);
}
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index a2c4805..a4aa390 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -151,7 +151,7 @@ static inline void chip_bus_sync_unlock(struct irq_desc *desc)
#define IRQ_GET_DESC_CHECK_PERCPU (_IRQ_DESC_CHECK | _IRQ_DESC_PERCPU)
#define for_each_action_of_desc(desc, act) \
- for (act = desc->act; act; act = act->next)
+ for (act = desc->action; act; act = act->next)
struct irq_desc *
__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus,
diff --git a/kernel/irq/ipi.c b/kernel/irq/ipi.c
index 1a9abc1..259a22a 100644
--- a/kernel/irq/ipi.c
+++ b/kernel/irq/ipi.c
@@ -165,7 +165,7 @@ irq_hw_number_t ipi_get_hwirq(unsigned int irq, unsigned int cpu)
struct irq_data *data = irq_get_irq_data(irq);
struct cpumask *ipimask = data ? irq_data_get_affinity_mask(data) : NULL;
- if (!data || !ipimask || cpu > nr_cpu_ids)
+ if (!data || !ipimask || cpu >= nr_cpu_ids)
return INVALID_HWIRQ;
if (!cpumask_test_cpu(cpu, ipimask))
@@ -195,7 +195,7 @@ static int ipi_send_verify(struct irq_chip *chip, struct irq_data *data,
if (!chip->ipi_send_single && !chip->ipi_send_mask)
return -EINVAL;
- if (cpu > nr_cpu_ids)
+ if (cpu >= nr_cpu_ids)
return -EINVAL;
if (dest) {
diff --git a/kernel/irq/irq_sim.c b/kernel/irq/irq_sim.c
new file mode 100644
index 0000000..24caabf
--- /dev/null
+++ b/kernel/irq/irq_sim.c
@@ -0,0 +1,164 @@
+/*
+ * Copyright (C) 2017 Bartosz Golaszewski <brgl@bgdev.pl>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ */
+
+#include <linux/irq_sim.h>
+#include <linux/irq.h>
+
+struct irq_sim_devres {
+ struct irq_sim *sim;
+};
+
+static void irq_sim_irqmask(struct irq_data *data)
+{
+ struct irq_sim_irq_ctx *irq_ctx = irq_data_get_irq_chip_data(data);
+
+ irq_ctx->enabled = false;
+}
+
+static void irq_sim_irqunmask(struct irq_data *data)
+{
+ struct irq_sim_irq_ctx *irq_ctx = irq_data_get_irq_chip_data(data);
+
+ irq_ctx->enabled = true;
+}
+
+static struct irq_chip irq_sim_irqchip = {
+ .name = "irq_sim",
+ .irq_mask = irq_sim_irqmask,
+ .irq_unmask = irq_sim_irqunmask,
+};
+
+static void irq_sim_handle_irq(struct irq_work *work)
+{
+ struct irq_sim_work_ctx *work_ctx;
+
+ work_ctx = container_of(work, struct irq_sim_work_ctx, work);
+ handle_simple_irq(irq_to_desc(work_ctx->irq));
+}
+
+/**
+ * irq_sim_init - Initialize the interrupt simulator: allocate a range of
+ * dummy interrupts.
+ *
+ * @sim: The interrupt simulator object to initialize.
+ * @num_irqs: Number of interrupts to allocate
+ *
+ * Returns 0 on success and a negative error number on failure.
+ */
+int irq_sim_init(struct irq_sim *sim, unsigned int num_irqs)
+{
+ int i;
+
+ sim->irqs = kmalloc_array(num_irqs, sizeof(*sim->irqs), GFP_KERNEL);
+ if (!sim->irqs)
+ return -ENOMEM;
+
+ sim->irq_base = irq_alloc_descs(-1, 0, num_irqs, 0);
+ if (sim->irq_base < 0) {
+ kfree(sim->irqs);
+ return sim->irq_base;
+ }
+
+ for (i = 0; i < num_irqs; i++) {
+ sim->irqs[i].irqnum = sim->irq_base + i;
+ sim->irqs[i].enabled = false;
+ irq_set_chip(sim->irq_base + i, &irq_sim_irqchip);
+ irq_set_chip_data(sim->irq_base + i, &sim->irqs[i]);
+ irq_set_handler(sim->irq_base + i, &handle_simple_irq);
+ irq_modify_status(sim->irq_base + i,
+ IRQ_NOREQUEST | IRQ_NOAUTOEN, IRQ_NOPROBE);
+ }
+
+ init_irq_work(&sim->work_ctx.work, irq_sim_handle_irq);
+ sim->irq_count = num_irqs;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(irq_sim_init);
+
+/**
+ * irq_sim_fini - Deinitialize the interrupt simulator: free the interrupt
+ * descriptors and allocated memory.
+ *
+ * @sim: The interrupt simulator to tear down.
+ */
+void irq_sim_fini(struct irq_sim *sim)
+{
+ irq_work_sync(&sim->work_ctx.work);
+ irq_free_descs(sim->irq_base, sim->irq_count);
+ kfree(sim->irqs);
+}
+EXPORT_SYMBOL_GPL(irq_sim_fini);
+
+static void devm_irq_sim_release(struct device *dev, void *res)
+{
+ struct irq_sim_devres *this = res;
+
+ irq_sim_fini(this->sim);
+}
+
+/**
+ * irq_sim_init - Initialize the interrupt simulator for a managed device.
+ *
+ * @dev: Device to initialize the simulator object for.
+ * @sim: The interrupt simulator object to initialize.
+ * @num_irqs: Number of interrupts to allocate
+ *
+ * Returns 0 on success and a negative error number on failure.
+ */
+int devm_irq_sim_init(struct device *dev, struct irq_sim *sim,
+ unsigned int num_irqs)
+{
+ struct irq_sim_devres *dr;
+ int rv;
+
+ dr = devres_alloc(devm_irq_sim_release, sizeof(*dr), GFP_KERNEL);
+ if (!dr)
+ return -ENOMEM;
+
+ rv = irq_sim_init(sim, num_irqs);
+ if (rv) {
+ devres_free(dr);
+ return rv;
+ }
+
+ dr->sim = sim;
+ devres_add(dev, dr);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(devm_irq_sim_init);
+
+/**
+ * irq_sim_fire - Enqueue an interrupt.
+ *
+ * @sim: The interrupt simulator object.
+ * @offset: Offset of the simulated interrupt which should be fired.
+ */
+void irq_sim_fire(struct irq_sim *sim, unsigned int offset)
+{
+ if (sim->irqs[offset].enabled) {
+ sim->work_ctx.irq = irq_sim_irqnum(sim, offset);
+ irq_work_queue(&sim->work_ctx.work);
+ }
+}
+EXPORT_SYMBOL_GPL(irq_sim_fire);
+
+/**
+ * irq_sim_irqnum - Get the allocated number of a dummy interrupt.
+ *
+ * @sim: The interrupt simulator object.
+ * @offset: Offset of the simulated interrupt for which to retrieve
+ * the number.
+ */
+int irq_sim_irqnum(struct irq_sim *sim, unsigned int offset)
+{
+ return sim->irqs[offset].irqnum;
+}
+EXPORT_SYMBOL_GPL(irq_sim_irqnum);
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index f1f2514..e84b705 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -41,6 +41,9 @@ static inline void debugfs_add_domain_dir(struct irq_domain *d) { }
static inline void debugfs_remove_domain_dir(struct irq_domain *d) { }
#endif
+const struct fwnode_operations irqchip_fwnode_ops;
+EXPORT_SYMBOL_GPL(irqchip_fwnode_ops);
+
/**
* irq_domain_alloc_fwnode - Allocate a fwnode_handle suitable for
* identifying an irq domain
@@ -86,7 +89,7 @@ struct fwnode_handle *__irq_domain_alloc_fwnode(unsigned int type, int id,
fwid->type = type;
fwid->name = n;
fwid->data = data;
- fwid->fwnode.type = FWNODE_IRQCHIP;
+ fwid->fwnode.ops = &irqchip_fwnode_ops;
return &fwid->fwnode;
}
EXPORT_SYMBOL_GPL(__irq_domain_alloc_fwnode);
@@ -193,10 +196,8 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size,
}
if (!domain->name) {
- if (fwnode) {
- pr_err("Invalid fwnode type (%d) for irqdomain\n",
- fwnode->type);
- }
+ if (fwnode)
+ pr_err("Invalid fwnode type for irqdomain\n");
domain->name = kasprintf(GFP_KERNEL, "unknown-%d",
atomic_inc_return(&unknown_domains));
if (!domain->name) {
@@ -455,6 +456,31 @@ void irq_set_default_host(struct irq_domain *domain)
}
EXPORT_SYMBOL_GPL(irq_set_default_host);
+static void irq_domain_clear_mapping(struct irq_domain *domain,
+ irq_hw_number_t hwirq)
+{
+ if (hwirq < domain->revmap_size) {
+ domain->linear_revmap[hwirq] = 0;
+ } else {
+ mutex_lock(&revmap_trees_mutex);
+ radix_tree_delete(&domain->revmap_tree, hwirq);
+ mutex_unlock(&revmap_trees_mutex);
+ }
+}
+
+static void irq_domain_set_mapping(struct irq_domain *domain,
+ irq_hw_number_t hwirq,
+ struct irq_data *irq_data)
+{
+ if (hwirq < domain->revmap_size) {
+ domain->linear_revmap[hwirq] = irq_data->irq;
+ } else {
+ mutex_lock(&revmap_trees_mutex);
+ radix_tree_insert(&domain->revmap_tree, hwirq, irq_data);
+ mutex_unlock(&revmap_trees_mutex);
+ }
+}
+
void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq)
{
struct irq_data *irq_data = irq_get_irq_data(irq);
@@ -483,13 +509,7 @@ void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq)
domain->mapcount--;
/* Clear reverse map for this hwirq */
- if (hwirq < domain->revmap_size) {
- domain->linear_revmap[hwirq] = 0;
- } else {
- mutex_lock(&revmap_trees_mutex);
- radix_tree_delete(&domain->revmap_tree, hwirq);
- mutex_unlock(&revmap_trees_mutex);
- }
+ irq_domain_clear_mapping(domain, hwirq);
}
int irq_domain_associate(struct irq_domain *domain, unsigned int virq,
@@ -533,13 +553,7 @@ int irq_domain_associate(struct irq_domain *domain, unsigned int virq,
}
domain->mapcount++;
- if (hwirq < domain->revmap_size) {
- domain->linear_revmap[hwirq] = virq;
- } else {
- mutex_lock(&revmap_trees_mutex);
- radix_tree_insert(&domain->revmap_tree, hwirq, irq_data);
- mutex_unlock(&revmap_trees_mutex);
- }
+ irq_domain_set_mapping(domain, hwirq, irq_data);
mutex_unlock(&irq_domain_mutex);
irq_clear_status_flags(virq, IRQ_NOREQUEST);
@@ -1138,16 +1152,9 @@ static void irq_domain_insert_irq(int virq)
for (data = irq_get_irq_data(virq); data; data = data->parent_data) {
struct irq_domain *domain = data->domain;
- irq_hw_number_t hwirq = data->hwirq;
domain->mapcount++;
- if (hwirq < domain->revmap_size) {
- domain->linear_revmap[hwirq] = virq;
- } else {
- mutex_lock(&revmap_trees_mutex);
- radix_tree_insert(&domain->revmap_tree, hwirq, data);
- mutex_unlock(&revmap_trees_mutex);
- }
+ irq_domain_set_mapping(domain, data->hwirq, data);
/* If not already assigned, give the domain the chip's name */
if (!domain->name && data->chip)
@@ -1171,13 +1178,7 @@ static void irq_domain_remove_irq(int virq)
irq_hw_number_t hwirq = data->hwirq;
domain->mapcount--;
- if (hwirq < domain->revmap_size) {
- domain->linear_revmap[hwirq] = 0;
- } else {
- mutex_lock(&revmap_trees_mutex);
- radix_tree_delete(&domain->revmap_tree, hwirq);
- mutex_unlock(&revmap_trees_mutex);
- }
+ irq_domain_clear_mapping(domain, hwirq);
}
}
@@ -1362,7 +1363,8 @@ static void irq_domain_free_irqs_hierarchy(struct irq_domain *domain,
unsigned int irq_base,
unsigned int nr_irqs)
{
- domain->ops->free(domain, irq_base, nr_irqs);
+ if (domain->ops->free)
+ domain->ops->free(domain, irq_base, nr_irqs);
}
int irq_domain_alloc_irqs_hierarchy(struct irq_domain *domain,
@@ -1448,6 +1450,175 @@ out_free_desc:
return ret;
}
+/* The irq_data was moved, fix the revmap to refer to the new location */
+static void irq_domain_fix_revmap(struct irq_data *d)
+{
+ void **slot;
+
+ if (d->hwirq < d->domain->revmap_size)
+ return; /* Not using radix tree. */
+
+ /* Fix up the revmap. */
+ mutex_lock(&revmap_trees_mutex);
+ slot = radix_tree_lookup_slot(&d->domain->revmap_tree, d->hwirq);
+ if (slot)
+ radix_tree_replace_slot(&d->domain->revmap_tree, slot, d);
+ mutex_unlock(&revmap_trees_mutex);
+}
+
+/**
+ * irq_domain_push_irq() - Push a domain in to the top of a hierarchy.
+ * @domain: Domain to push.
+ * @virq: Irq to push the domain in to.
+ * @arg: Passed to the irq_domain_ops alloc() function.
+ *
+ * For an already existing irqdomain hierarchy, as might be obtained
+ * via a call to pci_enable_msix(), add an additional domain to the
+ * head of the processing chain. Must be called before request_irq()
+ * has been called.
+ */
+int irq_domain_push_irq(struct irq_domain *domain, int virq, void *arg)
+{
+ struct irq_data *child_irq_data;
+ struct irq_data *root_irq_data = irq_get_irq_data(virq);
+ struct irq_desc *desc;
+ int rv = 0;
+
+ /*
+ * Check that no action has been set, which indicates the virq
+ * is in a state where this function doesn't have to deal with
+ * races between interrupt handling and maintaining the
+ * hierarchy. This will catch gross misuse. Attempting to
+ * make the check race free would require holding locks across
+ * calls to struct irq_domain_ops->alloc(), which could lead
+ * to deadlock, so we just do a simple check before starting.
+ */
+ desc = irq_to_desc(virq);
+ if (!desc)
+ return -EINVAL;
+ if (WARN_ON(desc->action))
+ return -EBUSY;
+
+ if (domain == NULL)
+ return -EINVAL;
+
+ if (WARN_ON(!irq_domain_is_hierarchy(domain)))
+ return -EINVAL;
+
+ if (!root_irq_data)
+ return -EINVAL;
+
+ if (domain->parent != root_irq_data->domain)
+ return -EINVAL;
+
+ child_irq_data = kzalloc_node(sizeof(*child_irq_data), GFP_KERNEL,
+ irq_data_get_node(root_irq_data));
+ if (!child_irq_data)
+ return -ENOMEM;
+
+ mutex_lock(&irq_domain_mutex);
+
+ /* Copy the original irq_data. */
+ *child_irq_data = *root_irq_data;
+
+ /*
+ * Overwrite the root_irq_data, which is embedded in struct
+ * irq_desc, with values for this domain.
+ */
+ root_irq_data->parent_data = child_irq_data;
+ root_irq_data->domain = domain;
+ root_irq_data->mask = 0;
+ root_irq_data->hwirq = 0;
+ root_irq_data->chip = NULL;
+ root_irq_data->chip_data = NULL;
+
+ /* May (probably does) set hwirq, chip, etc. */
+ rv = irq_domain_alloc_irqs_hierarchy(domain, virq, 1, arg);
+ if (rv) {
+ /* Restore the original irq_data. */
+ *root_irq_data = *child_irq_data;
+ goto error;
+ }
+
+ irq_domain_fix_revmap(child_irq_data);
+ irq_domain_set_mapping(domain, root_irq_data->hwirq, root_irq_data);
+
+error:
+ mutex_unlock(&irq_domain_mutex);
+
+ return rv;
+}
+EXPORT_SYMBOL_GPL(irq_domain_push_irq);
+
+/**
+ * irq_domain_pop_irq() - Remove a domain from the top of a hierarchy.
+ * @domain: Domain to remove.
+ * @virq: Irq to remove the domain from.
+ *
+ * Undo the effects of a call to irq_domain_push_irq(). Must be
+ * called either before request_irq() or after free_irq().
+ */
+int irq_domain_pop_irq(struct irq_domain *domain, int virq)
+{
+ struct irq_data *root_irq_data = irq_get_irq_data(virq);
+ struct irq_data *child_irq_data;
+ struct irq_data *tmp_irq_data;
+ struct irq_desc *desc;
+
+ /*
+ * Check that no action is set, which indicates the virq is in
+ * a state where this function doesn't have to deal with races
+ * between interrupt handling and maintaining the hierarchy.
+ * This will catch gross misuse. Attempting to make the check
+ * race free would require holding locks across calls to
+ * struct irq_domain_ops->free(), which could lead to
+ * deadlock, so we just do a simple check before starting.
+ */
+ desc = irq_to_desc(virq);
+ if (!desc)
+ return -EINVAL;
+ if (WARN_ON(desc->action))
+ return -EBUSY;
+
+ if (domain == NULL)
+ return -EINVAL;
+
+ if (!root_irq_data)
+ return -EINVAL;
+
+ tmp_irq_data = irq_domain_get_irq_data(domain, virq);
+
+ /* We can only "pop" if this domain is at the top of the list */
+ if (WARN_ON(root_irq_data != tmp_irq_data))
+ return -EINVAL;
+
+ if (WARN_ON(root_irq_data->domain != domain))
+ return -EINVAL;
+
+ child_irq_data = root_irq_data->parent_data;
+ if (WARN_ON(!child_irq_data))
+ return -EINVAL;
+
+ mutex_lock(&irq_domain_mutex);
+
+ root_irq_data->parent_data = NULL;
+
+ irq_domain_clear_mapping(domain, root_irq_data->hwirq);
+ irq_domain_free_irqs_hierarchy(domain, virq, 1);
+
+ /* Restore the original irq_data. */
+ *root_irq_data = *child_irq_data;
+
+ irq_domain_fix_revmap(root_irq_data);
+
+ mutex_unlock(&irq_domain_mutex);
+
+ kfree(child_irq_data);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(irq_domain_pop_irq);
+
/**
* irq_domain_free_irqs - Free IRQ number and associated data structures
* @virq: base IRQ number
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 1d1a5b9..573dc52 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -400,8 +400,18 @@ int irq_set_vcpu_affinity(unsigned int irq, void *vcpu_info)
return -EINVAL;
data = irq_desc_get_irq_data(desc);
- chip = irq_data_get_irq_chip(data);
- if (chip && chip->irq_set_vcpu_affinity)
+ do {
+ chip = irq_data_get_irq_chip(data);
+ if (chip && chip->irq_set_vcpu_affinity)
+ break;
+#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
+ data = data->parent_data;
+#else
+ data = NULL;
+#endif
+ } while (data);
+
+ if (data)
ret = chip->irq_set_vcpu_affinity(data, vcpu_info);
irq_put_desc_unlock(desc, flags);
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 7f9642a..6376b4a 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -61,12 +61,12 @@ static int show_irq_affinity(int type, struct seq_file *m)
case EFFECTIVE:
case EFFECTIVE_LIST:
#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK
- mask = desc->irq_common_data.effective_affinity;
+ mask = irq_data_get_effective_affinity_mask(&desc->irq_data);
break;
-#else
- return -EINVAL;
#endif
- };
+ default:
+ return -EINVAL;
+ }
switch (type) {
case AFFINITY_LIST:
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index d11c506..0bf2e8f5 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -79,29 +79,7 @@ int static_key_count(struct static_key *key)
}
EXPORT_SYMBOL_GPL(static_key_count);
-void static_key_enable(struct static_key *key)
-{
- int count = static_key_count(key);
-
- WARN_ON_ONCE(count < 0 || count > 1);
-
- if (!count)
- static_key_slow_inc(key);
-}
-EXPORT_SYMBOL_GPL(static_key_enable);
-
-void static_key_disable(struct static_key *key)
-{
- int count = static_key_count(key);
-
- WARN_ON_ONCE(count < 0 || count > 1);
-
- if (count)
- static_key_slow_dec(key);
-}
-EXPORT_SYMBOL_GPL(static_key_disable);
-
-void static_key_slow_inc(struct static_key *key)
+static void static_key_slow_inc_cpuslocked(struct static_key *key)
{
int v, v1;
@@ -125,24 +103,87 @@ void static_key_slow_inc(struct static_key *key)
return;
}
- cpus_read_lock();
jump_label_lock();
if (atomic_read(&key->enabled) == 0) {
atomic_set(&key->enabled, -1);
jump_label_update(key);
- atomic_set(&key->enabled, 1);
+ /*
+ * Ensure that if the above cmpxchg loop observes our positive
+ * value, it must also observe all the text changes.
+ */
+ atomic_set_release(&key->enabled, 1);
} else {
atomic_inc(&key->enabled);
}
jump_label_unlock();
+}
+
+void static_key_slow_inc(struct static_key *key)
+{
+ cpus_read_lock();
+ static_key_slow_inc_cpuslocked(key);
cpus_read_unlock();
}
EXPORT_SYMBOL_GPL(static_key_slow_inc);
-static void __static_key_slow_dec(struct static_key *key,
- unsigned long rate_limit, struct delayed_work *work)
+void static_key_enable_cpuslocked(struct static_key *key)
+{
+ STATIC_KEY_CHECK_USE();
+
+ if (atomic_read(&key->enabled) > 0) {
+ WARN_ON_ONCE(atomic_read(&key->enabled) != 1);
+ return;
+ }
+
+ jump_label_lock();
+ if (atomic_read(&key->enabled) == 0) {
+ atomic_set(&key->enabled, -1);
+ jump_label_update(key);
+ /*
+ * See static_key_slow_inc().
+ */
+ atomic_set_release(&key->enabled, 1);
+ }
+ jump_label_unlock();
+}
+EXPORT_SYMBOL_GPL(static_key_enable_cpuslocked);
+
+void static_key_enable(struct static_key *key)
+{
+ cpus_read_lock();
+ static_key_enable_cpuslocked(key);
+ cpus_read_unlock();
+}
+EXPORT_SYMBOL_GPL(static_key_enable);
+
+void static_key_disable_cpuslocked(struct static_key *key)
+{
+ STATIC_KEY_CHECK_USE();
+
+ if (atomic_read(&key->enabled) != 1) {
+ WARN_ON_ONCE(atomic_read(&key->enabled) != 0);
+ return;
+ }
+
+ jump_label_lock();
+ if (atomic_cmpxchg(&key->enabled, 1, 0))
+ jump_label_update(key);
+ jump_label_unlock();
+}
+EXPORT_SYMBOL_GPL(static_key_disable_cpuslocked);
+
+void static_key_disable(struct static_key *key)
{
cpus_read_lock();
+ static_key_disable_cpuslocked(key);
+ cpus_read_unlock();
+}
+EXPORT_SYMBOL_GPL(static_key_disable);
+
+static void static_key_slow_dec_cpuslocked(struct static_key *key,
+ unsigned long rate_limit,
+ struct delayed_work *work)
+{
/*
* The negative count check is valid even when a negative
* key->enabled is in use by static_key_slow_inc(); a
@@ -153,7 +194,6 @@ static void __static_key_slow_dec(struct static_key *key,
if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) {
WARN(atomic_read(&key->enabled) < 0,
"jump label: negative count!\n");
- cpus_read_unlock();
return;
}
@@ -164,6 +204,14 @@ static void __static_key_slow_dec(struct static_key *key,
jump_label_update(key);
}
jump_label_unlock();
+}
+
+static void __static_key_slow_dec(struct static_key *key,
+ unsigned long rate_limit,
+ struct delayed_work *work)
+{
+ cpus_read_lock();
+ static_key_slow_dec_cpuslocked(key, rate_limit, work);
cpus_read_unlock();
}
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 1ae7c41..20fef1a 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -301,7 +301,7 @@ static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
{
struct page *pages;
- pages = alloc_pages(gfp_mask, order);
+ pages = alloc_pages(gfp_mask & ~__GFP_ZERO, order);
if (pages) {
unsigned int count, i;
@@ -310,6 +310,13 @@ static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
count = 1 << order;
for (i = 0; i < count; i++)
SetPageReserved(pages + i);
+
+ arch_kexec_post_alloc_pages(page_address(pages), count,
+ gfp_mask);
+
+ if (gfp_mask & __GFP_ZERO)
+ for (i = 0; i < count; i++)
+ clear_highpage(pages + i);
}
return pages;
@@ -321,6 +328,9 @@ static void kimage_free_pages(struct page *page)
order = page_private(page);
count = 1 << order;
+
+ arch_kexec_pre_free_pages(page_address(page), count);
+
for (i = 0; i < count; i++)
ClearPageReserved(page + i);
__free_pages(page, order);
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 6d016c5..2f37acd 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -71,6 +71,18 @@ static atomic_t kmod_concurrent_max = ATOMIC_INIT(MAX_KMOD_CONCURRENT);
static DECLARE_WAIT_QUEUE_HEAD(kmod_wq);
/*
+ * This is a restriction on having *all* MAX_KMOD_CONCURRENT threads
+ * running at the same time without returning. When this happens we
+ * believe you've somehow ended up with a recursive module dependency
+ * creating a loop.
+ *
+ * We have no option but to fail.
+ *
+ * Userspace should proactively try to detect and prevent these.
+ */
+#define MAX_KMOD_ALL_BUSY_TIMEOUT 5
+
+/*
modprobe_path is set via /proc/sys.
*/
char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe";
@@ -167,8 +179,17 @@ int __request_module(bool wait, const char *fmt, ...)
pr_warn_ratelimited("request_module: kmod_concurrent_max (%u) close to 0 (max_modprobes: %u), for module %s, throttling...",
atomic_read(&kmod_concurrent_max),
MAX_KMOD_CONCURRENT, module_name);
- wait_event_interruptible(kmod_wq,
- atomic_dec_if_positive(&kmod_concurrent_max) >= 0);
+ ret = wait_event_killable_timeout(kmod_wq,
+ atomic_dec_if_positive(&kmod_concurrent_max) >= 0,
+ MAX_KMOD_ALL_BUSY_TIMEOUT * HZ);
+ if (!ret) {
+ pr_warn_ratelimited("request_module: modprobe %s cannot be processed, kmod busy with %d threads for more than %d seconds now",
+ module_name, MAX_KMOD_CONCURRENT, MAX_KMOD_ALL_BUSY_TIMEOUT);
+ return -ETIME;
+ } else if (ret == -ERESTARTSYS) {
+ pr_warn_ratelimited("request_module: sigkill sent for modprobe %s, giving up", module_name);
+ return ret;
+ }
}
trace_module_request(module_name, wait, _RET_IP_);
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 26db528..1c19edf 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -637,6 +637,7 @@ repeat:
schedule();
try_to_freeze();
+ cond_resched();
goto repeat;
}
EXPORT_SYMBOL_GPL(kthread_worker_fn);
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 7d2499b..44c8d0d 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -58,6 +58,10 @@
#define CREATE_TRACE_POINTS
#include <trace/events/lock.h>
+#ifdef CONFIG_LOCKDEP_CROSSRELEASE
+#include <linux/slab.h>
+#endif
+
#ifdef CONFIG_PROVE_LOCKING
int prove_locking = 1;
module_param(prove_locking, int, 0644);
@@ -344,14 +348,12 @@ EXPORT_SYMBOL(lockdep_on);
#if VERBOSE
# define HARDIRQ_VERBOSE 1
# define SOFTIRQ_VERBOSE 1
-# define RECLAIM_VERBOSE 1
#else
# define HARDIRQ_VERBOSE 0
# define SOFTIRQ_VERBOSE 0
-# define RECLAIM_VERBOSE 0
#endif
-#if VERBOSE || HARDIRQ_VERBOSE || SOFTIRQ_VERBOSE || RECLAIM_VERBOSE
+#if VERBOSE || HARDIRQ_VERBOSE || SOFTIRQ_VERBOSE
/*
* Quick filtering for interesting events:
*/
@@ -726,6 +728,18 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
return is_static || static_obj(lock->key) ? NULL : ERR_PTR(-EINVAL);
}
+#ifdef CONFIG_LOCKDEP_CROSSRELEASE
+static void cross_init(struct lockdep_map *lock, int cross);
+static int cross_lock(struct lockdep_map *lock);
+static int lock_acquire_crosslock(struct held_lock *hlock);
+static int lock_release_crosslock(struct lockdep_map *lock);
+#else
+static inline void cross_init(struct lockdep_map *lock, int cross) {}
+static inline int cross_lock(struct lockdep_map *lock) { return 0; }
+static inline int lock_acquire_crosslock(struct held_lock *hlock) { return 2; }
+static inline int lock_release_crosslock(struct lockdep_map *lock) { return 2; }
+#endif
+
/*
* Register a lock's class in the hash-table, if the class is not present
* yet. Otherwise we look it up. We cache the result in the lock object
@@ -1125,22 +1139,41 @@ print_circular_lock_scenario(struct held_lock *src,
printk(KERN_CONT "\n\n");
}
- printk(" Possible unsafe locking scenario:\n\n");
- printk(" CPU0 CPU1\n");
- printk(" ---- ----\n");
- printk(" lock(");
- __print_lock_name(target);
- printk(KERN_CONT ");\n");
- printk(" lock(");
- __print_lock_name(parent);
- printk(KERN_CONT ");\n");
- printk(" lock(");
- __print_lock_name(target);
- printk(KERN_CONT ");\n");
- printk(" lock(");
- __print_lock_name(source);
- printk(KERN_CONT ");\n");
- printk("\n *** DEADLOCK ***\n\n");
+ if (cross_lock(tgt->instance)) {
+ printk(" Possible unsafe locking scenario by crosslock:\n\n");
+ printk(" CPU0 CPU1\n");
+ printk(" ---- ----\n");
+ printk(" lock(");
+ __print_lock_name(parent);
+ printk(KERN_CONT ");\n");
+ printk(" lock(");
+ __print_lock_name(target);
+ printk(KERN_CONT ");\n");
+ printk(" lock(");
+ __print_lock_name(source);
+ printk(KERN_CONT ");\n");
+ printk(" unlock(");
+ __print_lock_name(target);
+ printk(KERN_CONT ");\n");
+ printk("\n *** DEADLOCK ***\n\n");
+ } else {
+ printk(" Possible unsafe locking scenario:\n\n");
+ printk(" CPU0 CPU1\n");
+ printk(" ---- ----\n");
+ printk(" lock(");
+ __print_lock_name(target);
+ printk(KERN_CONT ");\n");
+ printk(" lock(");
+ __print_lock_name(parent);
+ printk(KERN_CONT ");\n");
+ printk(" lock(");
+ __print_lock_name(target);
+ printk(KERN_CONT ");\n");
+ printk(" lock(");
+ __print_lock_name(source);
+ printk(KERN_CONT ");\n");
+ printk("\n *** DEADLOCK ***\n\n");
+ }
}
/*
@@ -1165,7 +1198,12 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth,
pr_warn("%s/%d is trying to acquire lock:\n",
curr->comm, task_pid_nr(curr));
print_lock(check_src);
- pr_warn("\nbut task is already holding lock:\n");
+
+ if (cross_lock(check_tgt->instance))
+ pr_warn("\nbut now in release context of a crosslock acquired at the following:\n");
+ else
+ pr_warn("\nbut task is already holding lock:\n");
+
print_lock(check_tgt);
pr_warn("\nwhich lock already depends on the new lock.\n\n");
pr_warn("\nthe existing dependency chain (in reverse order) is:\n");
@@ -1183,7 +1221,8 @@ static inline int class_equal(struct lock_list *entry, void *data)
static noinline int print_circular_bug(struct lock_list *this,
struct lock_list *target,
struct held_lock *check_src,
- struct held_lock *check_tgt)
+ struct held_lock *check_tgt,
+ struct stack_trace *trace)
{
struct task_struct *curr = current;
struct lock_list *parent;
@@ -1193,7 +1232,9 @@ static noinline int print_circular_bug(struct lock_list *this,
if (!debug_locks_off_graph_unlock() || debug_locks_silent)
return 0;
- if (!save_trace(&this->trace))
+ if (cross_lock(check_tgt->instance))
+ this->trace = *trace;
+ else if (!save_trace(&this->trace))
return 0;
depth = get_lock_depth(target);
@@ -1309,6 +1350,19 @@ check_noncircular(struct lock_list *root, struct lock_class *target,
return result;
}
+static noinline int
+check_redundant(struct lock_list *root, struct lock_class *target,
+ struct lock_list **target_entry)
+{
+ int result;
+
+ debug_atomic_inc(nr_redundant_checks);
+
+ result = __bfs_forwards(root, target, class_equal, target_entry);
+
+ return result;
+}
+
#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
/*
* Forwards and backwards subgraph searching, for the purposes of
@@ -1784,6 +1838,9 @@ check_deadlock(struct task_struct *curr, struct held_lock *next,
if (nest)
return 2;
+ if (cross_lock(prev->instance))
+ continue;
+
return print_deadlock_bug(curr, prev, next);
}
return 1;
@@ -1813,20 +1870,13 @@ check_deadlock(struct task_struct *curr, struct held_lock *next,
*/
static int
check_prev_add(struct task_struct *curr, struct held_lock *prev,
- struct held_lock *next, int distance, int *stack_saved)
+ struct held_lock *next, int distance, struct stack_trace *trace,
+ int (*save)(struct stack_trace *trace))
{
struct lock_list *entry;
int ret;
struct lock_list this;
struct lock_list *uninitialized_var(target_entry);
- /*
- * Static variable, serialized by the graph_lock().
- *
- * We use this static variable to save the stack trace in case
- * we call into this function multiple times due to encountering
- * trylocks in the held lock stack.
- */
- static struct stack_trace trace;
/*
* Prove that the new <prev> -> <next> dependency would not
@@ -1841,7 +1891,7 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
this.parent = NULL;
ret = check_noncircular(&this, hlock_class(prev), &target_entry);
if (unlikely(!ret))
- return print_circular_bug(&this, target_entry, next, prev);
+ return print_circular_bug(&this, target_entry, next, prev, trace);
else if (unlikely(ret < 0))
return print_bfs_bug(ret);
@@ -1870,15 +1920,26 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
if (entry->class == hlock_class(next)) {
if (distance == 1)
entry->distance = 1;
- return 2;
+ return 1;
}
}
- if (!*stack_saved) {
- if (!save_trace(&trace))
- return 0;
- *stack_saved = 1;
+ /*
+ * Is the <prev> -> <next> link redundant?
+ */
+ this.class = hlock_class(prev);
+ this.parent = NULL;
+ ret = check_redundant(&this, hlock_class(next), &target_entry);
+ if (!ret) {
+ debug_atomic_inc(nr_redundant);
+ return 2;
}
+ if (ret < 0)
+ return print_bfs_bug(ret);
+
+
+ if (save && !save(trace))
+ return 0;
/*
* Ok, all validations passed, add the new lock
@@ -1886,14 +1947,14 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
*/
ret = add_lock_to_list(hlock_class(next),
&hlock_class(prev)->locks_after,
- next->acquire_ip, distance, &trace);
+ next->acquire_ip, distance, trace);
if (!ret)
return 0;
ret = add_lock_to_list(hlock_class(prev),
&hlock_class(next)->locks_before,
- next->acquire_ip, distance, &trace);
+ next->acquire_ip, distance, trace);
if (!ret)
return 0;
@@ -1901,8 +1962,6 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
* Debugging printouts:
*/
if (verbose(hlock_class(prev)) || verbose(hlock_class(next))) {
- /* We drop graph lock, so another thread can overwrite trace. */
- *stack_saved = 0;
graph_unlock();
printk("\n new dependency: ");
print_lock_name(hlock_class(prev));
@@ -1910,9 +1969,10 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
print_lock_name(hlock_class(next));
printk(KERN_CONT "\n");
dump_stack();
- return graph_lock();
+ if (!graph_lock())
+ return 0;
}
- return 1;
+ return 2;
}
/*
@@ -1925,8 +1985,9 @@ static int
check_prevs_add(struct task_struct *curr, struct held_lock *next)
{
int depth = curr->lockdep_depth;
- int stack_saved = 0;
struct held_lock *hlock;
+ struct stack_trace trace;
+ int (*save)(struct stack_trace *trace) = save_trace;
/*
* Debugging checks.
@@ -1947,21 +2008,36 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
int distance = curr->lockdep_depth - depth + 1;
hlock = curr->held_locks + depth - 1;
/*
- * Only non-recursive-read entries get new dependencies
- * added:
+ * Only non-crosslock entries get new dependencies added.
+ * Crosslock entries will be added by commit later:
*/
- if (hlock->read != 2 && hlock->check) {
- if (!check_prev_add(curr, hlock, next,
- distance, &stack_saved))
- return 0;
+ if (!cross_lock(hlock->instance)) {
/*
- * Stop after the first non-trylock entry,
- * as non-trylock entries have added their
- * own direct dependencies already, so this
- * lock is connected to them indirectly:
+ * Only non-recursive-read entries get new dependencies
+ * added:
*/
- if (!hlock->trylock)
- break;
+ if (hlock->read != 2 && hlock->check) {
+ int ret = check_prev_add(curr, hlock, next,
+ distance, &trace, save);
+ if (!ret)
+ return 0;
+
+ /*
+ * Stop saving stack_trace if save_trace() was
+ * called at least once:
+ */
+ if (save && ret == 2)
+ save = NULL;
+
+ /*
+ * Stop after the first non-trylock entry,
+ * as non-trylock entries have added their
+ * own direct dependencies already, so this
+ * lock is connected to them indirectly:
+ */
+ if (!hlock->trylock)
+ break;
+ }
}
depth--;
/*
@@ -2126,19 +2202,26 @@ static int check_no_collision(struct task_struct *curr,
}
/*
- * Look up a dependency chain. If the key is not present yet then
- * add it and return 1 - in this case the new dependency chain is
- * validated. If the key is already hashed, return 0.
- * (On return with 1 graph_lock is held.)
+ * This is for building a chain between just two different classes,
+ * instead of adding a new hlock upon current, which is done by
+ * add_chain_cache().
+ *
+ * This can be called in any context with two classes, while
+ * add_chain_cache() must be done within the lock owener's context
+ * since it uses hlock which might be racy in another context.
*/
-static inline int lookup_chain_cache(struct task_struct *curr,
- struct held_lock *hlock,
- u64 chain_key)
+static inline int add_chain_cache_classes(unsigned int prev,
+ unsigned int next,
+ unsigned int irq_context,
+ u64 chain_key)
{
- struct lock_class *class = hlock_class(hlock);
struct hlist_head *hash_head = chainhashentry(chain_key);
struct lock_chain *chain;
- int i, j;
+
+ /*
+ * Allocate a new chain entry from the static array, and add
+ * it to the hash:
+ */
/*
* We might need to take the graph lock, ensure we've got IRQs
@@ -2147,43 +2230,76 @@ static inline int lookup_chain_cache(struct task_struct *curr,
*/
if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
return 0;
+
+ if (unlikely(nr_lock_chains >= MAX_LOCKDEP_CHAINS)) {
+ if (!debug_locks_off_graph_unlock())
+ return 0;
+
+ print_lockdep_off("BUG: MAX_LOCKDEP_CHAINS too low!");
+ dump_stack();
+ return 0;
+ }
+
+ chain = lock_chains + nr_lock_chains++;
+ chain->chain_key = chain_key;
+ chain->irq_context = irq_context;
+ chain->depth = 2;
+ if (likely(nr_chain_hlocks + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) {
+ chain->base = nr_chain_hlocks;
+ nr_chain_hlocks += chain->depth;
+ chain_hlocks[chain->base] = prev - 1;
+ chain_hlocks[chain->base + 1] = next -1;
+ }
+#ifdef CONFIG_DEBUG_LOCKDEP
/*
- * We can walk it lock-free, because entries only get added
- * to the hash:
+ * Important for check_no_collision().
*/
- hlist_for_each_entry_rcu(chain, hash_head, entry) {
- if (chain->chain_key == chain_key) {
-cache_hit:
- debug_atomic_inc(chain_lookup_hits);
- if (!check_no_collision(curr, hlock, chain))
- return 0;
-
- if (very_verbose(class))
- printk("\nhash chain already cached, key: "
- "%016Lx tail class: [%p] %s\n",
- (unsigned long long)chain_key,
- class->key, class->name);
+ else {
+ if (!debug_locks_off_graph_unlock())
return 0;
- }
+
+ print_lockdep_off("BUG: MAX_LOCKDEP_CHAIN_HLOCKS too low!");
+ dump_stack();
+ return 0;
}
- if (very_verbose(class))
- printk("\nnew hash chain, key: %016Lx tail class: [%p] %s\n",
- (unsigned long long)chain_key, class->key, class->name);
+#endif
+
+ hlist_add_head_rcu(&chain->entry, hash_head);
+ debug_atomic_inc(chain_lookup_misses);
+ inc_chains();
+
+ return 1;
+}
+
+/*
+ * Adds a dependency chain into chain hashtable. And must be called with
+ * graph_lock held.
+ *
+ * Return 0 if fail, and graph_lock is released.
+ * Return 1 if succeed, with graph_lock held.
+ */
+static inline int add_chain_cache(struct task_struct *curr,
+ struct held_lock *hlock,
+ u64 chain_key)
+{
+ struct lock_class *class = hlock_class(hlock);
+ struct hlist_head *hash_head = chainhashentry(chain_key);
+ struct lock_chain *chain;
+ int i, j;
+
/*
* Allocate a new chain entry from the static array, and add
* it to the hash:
*/
- if (!graph_lock())
- return 0;
+
/*
- * We have to walk the chain again locked - to avoid duplicates:
+ * We might need to take the graph lock, ensure we've got IRQs
+ * disabled to make this an IRQ-safe lock.. for recursion reasons
+ * lockdep won't complain about its own locking errors.
*/
- hlist_for_each_entry(chain, hash_head, entry) {
- if (chain->chain_key == chain_key) {
- graph_unlock();
- goto cache_hit;
- }
- }
+ if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
+ return 0;
+
if (unlikely(nr_lock_chains >= MAX_LOCKDEP_CHAINS)) {
if (!debug_locks_off_graph_unlock())
return 0;
@@ -2235,6 +2351,78 @@ cache_hit:
return 1;
}
+/*
+ * Look up a dependency chain.
+ */
+static inline struct lock_chain *lookup_chain_cache(u64 chain_key)
+{
+ struct hlist_head *hash_head = chainhashentry(chain_key);
+ struct lock_chain *chain;
+
+ /*
+ * We can walk it lock-free, because entries only get added
+ * to the hash:
+ */
+ hlist_for_each_entry_rcu(chain, hash_head, entry) {
+ if (chain->chain_key == chain_key) {
+ debug_atomic_inc(chain_lookup_hits);
+ return chain;
+ }
+ }
+ return NULL;
+}
+
+/*
+ * If the key is not present yet in dependency chain cache then
+ * add it and return 1 - in this case the new dependency chain is
+ * validated. If the key is already hashed, return 0.
+ * (On return with 1 graph_lock is held.)
+ */
+static inline int lookup_chain_cache_add(struct task_struct *curr,
+ struct held_lock *hlock,
+ u64 chain_key)
+{
+ struct lock_class *class = hlock_class(hlock);
+ struct lock_chain *chain = lookup_chain_cache(chain_key);
+
+ if (chain) {
+cache_hit:
+ if (!check_no_collision(curr, hlock, chain))
+ return 0;
+
+ if (very_verbose(class)) {
+ printk("\nhash chain already cached, key: "
+ "%016Lx tail class: [%p] %s\n",
+ (unsigned long long)chain_key,
+ class->key, class->name);
+ }
+
+ return 0;
+ }
+
+ if (very_verbose(class)) {
+ printk("\nnew hash chain, key: %016Lx tail class: [%p] %s\n",
+ (unsigned long long)chain_key, class->key, class->name);
+ }
+
+ if (!graph_lock())
+ return 0;
+
+ /*
+ * We have to walk the chain again locked - to avoid duplicates:
+ */
+ chain = lookup_chain_cache(chain_key);
+ if (chain) {
+ graph_unlock();
+ goto cache_hit;
+ }
+
+ if (!add_chain_cache(curr, hlock, chain_key))
+ return 0;
+
+ return 1;
+}
+
static int validate_chain(struct task_struct *curr, struct lockdep_map *lock,
struct held_lock *hlock, int chain_head, u64 chain_key)
{
@@ -2245,11 +2433,11 @@ static int validate_chain(struct task_struct *curr, struct lockdep_map *lock,
*
* We look up the chain_key and do the O(N^2) check and update of
* the dependencies only if this is a new dependency chain.
- * (If lookup_chain_cache() returns with 1 it acquires
+ * (If lookup_chain_cache_add() return with 1 it acquires
* graph_lock for us)
*/
if (!hlock->trylock && hlock->check &&
- lookup_chain_cache(curr, hlock, chain_key)) {
+ lookup_chain_cache_add(curr, hlock, chain_key)) {
/*
* Check whether last held lock:
*
@@ -2277,14 +2465,17 @@ static int validate_chain(struct task_struct *curr, struct lockdep_map *lock,
* Add dependency only if this lock is not the head
* of the chain, and if it's not a secondary read-lock:
*/
- if (!chain_head && ret != 2)
+ if (!chain_head && ret != 2) {
if (!check_prevs_add(curr, hlock))
return 0;
+ }
+
graph_unlock();
- } else
- /* after lookup_chain_cache(): */
+ } else {
+ /* after lookup_chain_cache_add(): */
if (unlikely(!debug_locks))
return 0;
+ }
return 1;
}
@@ -2567,14 +2758,6 @@ static int SOFTIRQ_verbose(struct lock_class *class)
return 0;
}
-static int RECLAIM_FS_verbose(struct lock_class *class)
-{
-#if RECLAIM_VERBOSE
- return class_filter(class);
-#endif
- return 0;
-}
-
#define STRICT_READ_CHECKS 1
static int (*state_verbose_f[])(struct lock_class *class) = {
@@ -2870,57 +3053,6 @@ void trace_softirqs_off(unsigned long ip)
debug_atomic_inc(redundant_softirqs_off);
}
-static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags)
-{
- struct task_struct *curr = current;
-
- if (unlikely(!debug_locks))
- return;
-
- gfp_mask = current_gfp_context(gfp_mask);
-
- /* no reclaim without waiting on it */
- if (!(gfp_mask & __GFP_DIRECT_RECLAIM))
- return;
-
- /* this guy won't enter reclaim */
- if ((curr->flags & PF_MEMALLOC) && !(gfp_mask & __GFP_NOMEMALLOC))
- return;
-
- /* We're only interested __GFP_FS allocations for now */
- if (!(gfp_mask & __GFP_FS) || (curr->flags & PF_MEMALLOC_NOFS))
- return;
-
- /*
- * Oi! Can't be having __GFP_FS allocations with IRQs disabled.
- */
- if (DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags)))
- return;
-
- /* Disable lockdep if explicitly requested */
- if (gfp_mask & __GFP_NOLOCKDEP)
- return;
-
- mark_held_locks(curr, RECLAIM_FS);
-}
-
-static void check_flags(unsigned long flags);
-
-void lockdep_trace_alloc(gfp_t gfp_mask)
-{
- unsigned long flags;
-
- if (unlikely(current->lockdep_recursion))
- return;
-
- raw_local_irq_save(flags);
- check_flags(flags);
- current->lockdep_recursion = 1;
- __lockdep_trace_alloc(gfp_mask, flags);
- current->lockdep_recursion = 0;
- raw_local_irq_restore(flags);
-}
-
static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock)
{
/*
@@ -2966,22 +3098,6 @@ static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock)
}
}
- /*
- * We reuse the irq context infrastructure more broadly as a general
- * context checking code. This tests GFP_FS recursion (a lock taken
- * during reclaim for a GFP_FS allocation is held over a GFP_FS
- * allocation).
- */
- if (!hlock->trylock && (curr->lockdep_reclaim_gfp & __GFP_FS)) {
- if (hlock->read) {
- if (!mark_lock(curr, hlock, LOCK_USED_IN_RECLAIM_FS_READ))
- return 0;
- } else {
- if (!mark_lock(curr, hlock, LOCK_USED_IN_RECLAIM_FS))
- return 0;
- }
- }
-
return 1;
}
@@ -3040,10 +3156,6 @@ static inline int separate_irq_context(struct task_struct *curr,
return 0;
}
-void lockdep_trace_alloc(gfp_t gfp_mask)
-{
-}
-
#endif /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */
/*
@@ -3116,7 +3228,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
/*
* Initialize a lock instance's lock-class mapping info:
*/
-void lockdep_init_map(struct lockdep_map *lock, const char *name,
+static void __lockdep_init_map(struct lockdep_map *lock, const char *name,
struct lock_class_key *key, int subclass)
{
int i;
@@ -3174,8 +3286,25 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name,
raw_local_irq_restore(flags);
}
}
+
+void lockdep_init_map(struct lockdep_map *lock, const char *name,
+ struct lock_class_key *key, int subclass)
+{
+ cross_init(lock, 0);
+ __lockdep_init_map(lock, name, key, subclass);
+}
EXPORT_SYMBOL_GPL(lockdep_init_map);
+#ifdef CONFIG_LOCKDEP_CROSSRELEASE
+void lockdep_init_map_crosslock(struct lockdep_map *lock, const char *name,
+ struct lock_class_key *key, int subclass)
+{
+ cross_init(lock, 1);
+ __lockdep_init_map(lock, name, key, subclass);
+}
+EXPORT_SYMBOL_GPL(lockdep_init_map_crosslock);
+#endif
+
struct lock_class_key __lockdep_no_validate__;
EXPORT_SYMBOL_GPL(__lockdep_no_validate__);
@@ -3231,6 +3360,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
int chain_head = 0;
int class_idx;
u64 chain_key;
+ int ret;
if (unlikely(!debug_locks))
return 0;
@@ -3279,7 +3409,8 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
class_idx = class - lock_classes + 1;
- if (depth) {
+ /* TODO: nest_lock is not implemented for crosslock yet. */
+ if (depth && !cross_lock(lock)) {
hlock = curr->held_locks + depth - 1;
if (hlock->class_idx == class_idx && nest_lock) {
if (hlock->references) {
@@ -3367,6 +3498,14 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
if (!validate_chain(curr, lock, hlock, chain_head, chain_key))
return 0;
+ ret = lock_acquire_crosslock(hlock);
+ /*
+ * 2 means normal acquire operations are needed. Otherwise, it's
+ * ok just to return with '0:fail, 1:success'.
+ */
+ if (ret != 2)
+ return ret;
+
curr->curr_chain_key = chain_key;
curr->lockdep_depth++;
check_chain_key(curr);
@@ -3604,11 +3743,19 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
struct task_struct *curr = current;
struct held_lock *hlock;
unsigned int depth;
- int i;
+ int ret, i;
if (unlikely(!debug_locks))
return 0;
+ ret = lock_release_crosslock(lock);
+ /*
+ * 2 means normal release operations are needed. Otherwise, it's
+ * ok just to return with '0:fail, 1:success'.
+ */
+ if (ret != 2)
+ return ret;
+
depth = curr->lockdep_depth;
/*
* So we're all set to release this lock.. wait what lock? We don't
@@ -3952,18 +4099,6 @@ void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie cookie)
}
EXPORT_SYMBOL_GPL(lock_unpin_lock);
-void lockdep_set_current_reclaim_state(gfp_t gfp_mask)
-{
- current->lockdep_reclaim_gfp = current_gfp_context(gfp_mask);
-}
-EXPORT_SYMBOL_GPL(lockdep_set_current_reclaim_state);
-
-void lockdep_clear_current_reclaim_state(void)
-{
- current->lockdep_reclaim_gfp = 0;
-}
-EXPORT_SYMBOL_GPL(lockdep_clear_current_reclaim_state);
-
#ifdef CONFIG_LOCK_STAT
static int
print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock,
@@ -4484,6 +4619,12 @@ asmlinkage __visible void lockdep_sys_exit(void)
curr->comm, curr->pid);
lockdep_print_held_locks(curr);
}
+
+ /*
+ * The lock history for each syscall should be independent. So wipe the
+ * slate clean on return to userspace.
+ */
+ lockdep_invariant_state(false);
}
void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
@@ -4532,3 +4673,488 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
dump_stack();
}
EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious);
+
+#ifdef CONFIG_LOCKDEP_CROSSRELEASE
+
+/*
+ * Crossrelease works by recording a lock history for each thread and
+ * connecting those historic locks that were taken after the
+ * wait_for_completion() in the complete() context.
+ *
+ * Task-A Task-B
+ *
+ * mutex_lock(&A);
+ * mutex_unlock(&A);
+ *
+ * wait_for_completion(&C);
+ * lock_acquire_crosslock();
+ * atomic_inc_return(&cross_gen_id);
+ * |
+ * | mutex_lock(&B);
+ * | mutex_unlock(&B);
+ * |
+ * | complete(&C);
+ * `-- lock_commit_crosslock();
+ *
+ * Which will then add a dependency between B and C.
+ */
+
+#define xhlock(i) (current->xhlocks[(i) % MAX_XHLOCKS_NR])
+
+/*
+ * Whenever a crosslock is held, cross_gen_id will be increased.
+ */
+static atomic_t cross_gen_id; /* Can be wrapped */
+
+/*
+ * Make an entry of the ring buffer invalid.
+ */
+static inline void invalidate_xhlock(struct hist_lock *xhlock)
+{
+ /*
+ * Normally, xhlock->hlock.instance must be !NULL.
+ */
+ xhlock->hlock.instance = NULL;
+}
+
+/*
+ * Lock history stacks; we have 2 nested lock history stacks:
+ *
+ * HARD(IRQ)
+ * SOFT(IRQ)
+ *
+ * The thing is that once we complete a HARD/SOFT IRQ the future task locks
+ * should not depend on any of the locks observed while running the IRQ. So
+ * what we do is rewind the history buffer and erase all our knowledge of that
+ * temporal event.
+ */
+
+void crossrelease_hist_start(enum xhlock_context_t c)
+{
+ struct task_struct *cur = current;
+
+ if (!cur->xhlocks)
+ return;
+
+ cur->xhlock_idx_hist[c] = cur->xhlock_idx;
+ cur->hist_id_save[c] = cur->hist_id;
+}
+
+void crossrelease_hist_end(enum xhlock_context_t c)
+{
+ struct task_struct *cur = current;
+
+ if (cur->xhlocks) {
+ unsigned int idx = cur->xhlock_idx_hist[c];
+ struct hist_lock *h = &xhlock(idx);
+
+ cur->xhlock_idx = idx;
+
+ /* Check if the ring was overwritten. */
+ if (h->hist_id != cur->hist_id_save[c])
+ invalidate_xhlock(h);
+ }
+}
+
+/*
+ * lockdep_invariant_state() is used to annotate independence inside a task, to
+ * make one task look like multiple independent 'tasks'.
+ *
+ * Take for instance workqueues; each work is independent of the last. The
+ * completion of a future work does not depend on the completion of a past work
+ * (in general). Therefore we must not carry that (lock) dependency across
+ * works.
+ *
+ * This is true for many things; pretty much all kthreads fall into this
+ * pattern, where they have an invariant state and future completions do not
+ * depend on past completions. Its just that since they all have the 'same'
+ * form -- the kthread does the same over and over -- it doesn't typically
+ * matter.
+ *
+ * The same is true for system-calls, once a system call is completed (we've
+ * returned to userspace) the next system call does not depend on the lock
+ * history of the previous system call.
+ *
+ * They key property for independence, this invariant state, is that it must be
+ * a point where we hold no locks and have no history. Because if we were to
+ * hold locks, the restore at _end() would not necessarily recover it's history
+ * entry. Similarly, independence per-definition means it does not depend on
+ * prior state.
+ */
+void lockdep_invariant_state(bool force)
+{
+ /*
+ * We call this at an invariant point, no current state, no history.
+ * Verify the former, enforce the latter.
+ */
+ WARN_ON_ONCE(!force && current->lockdep_depth);
+ invalidate_xhlock(&xhlock(current->xhlock_idx));
+}
+
+static int cross_lock(struct lockdep_map *lock)
+{
+ return lock ? lock->cross : 0;
+}
+
+/*
+ * This is needed to decide the relationship between wrapable variables.
+ */
+static inline int before(unsigned int a, unsigned int b)
+{
+ return (int)(a - b) < 0;
+}
+
+static inline struct lock_class *xhlock_class(struct hist_lock *xhlock)
+{
+ return hlock_class(&xhlock->hlock);
+}
+
+static inline struct lock_class *xlock_class(struct cross_lock *xlock)
+{
+ return hlock_class(&xlock->hlock);
+}
+
+/*
+ * Should we check a dependency with previous one?
+ */
+static inline int depend_before(struct held_lock *hlock)
+{
+ return hlock->read != 2 && hlock->check && !hlock->trylock;
+}
+
+/*
+ * Should we check a dependency with next one?
+ */
+static inline int depend_after(struct held_lock *hlock)
+{
+ return hlock->read != 2 && hlock->check;
+}
+
+/*
+ * Check if the xhlock is valid, which would be false if,
+ *
+ * 1. Has not used after initializaion yet.
+ * 2. Got invalidated.
+ *
+ * Remind hist_lock is implemented as a ring buffer.
+ */
+static inline int xhlock_valid(struct hist_lock *xhlock)
+{
+ /*
+ * xhlock->hlock.instance must be !NULL.
+ */
+ return !!xhlock->hlock.instance;
+}
+
+/*
+ * Record a hist_lock entry.
+ *
+ * Irq disable is only required.
+ */
+static void add_xhlock(struct held_lock *hlock)
+{
+ unsigned int idx = ++current->xhlock_idx;
+ struct hist_lock *xhlock = &xhlock(idx);
+
+#ifdef CONFIG_DEBUG_LOCKDEP
+ /*
+ * This can be done locklessly because they are all task-local
+ * state, we must however ensure IRQs are disabled.
+ */
+ WARN_ON_ONCE(!irqs_disabled());
+#endif
+
+ /* Initialize hist_lock's members */
+ xhlock->hlock = *hlock;
+ xhlock->hist_id = ++current->hist_id;
+
+ xhlock->trace.nr_entries = 0;
+ xhlock->trace.max_entries = MAX_XHLOCK_TRACE_ENTRIES;
+ xhlock->trace.entries = xhlock->trace_entries;
+ xhlock->trace.skip = 3;
+ save_stack_trace(&xhlock->trace);
+}
+
+static inline int same_context_xhlock(struct hist_lock *xhlock)
+{
+ return xhlock->hlock.irq_context == task_irq_context(current);
+}
+
+/*
+ * This should be lockless as far as possible because this would be
+ * called very frequently.
+ */
+static void check_add_xhlock(struct held_lock *hlock)
+{
+ /*
+ * Record a hist_lock, only in case that acquisitions ahead
+ * could depend on the held_lock. For example, if the held_lock
+ * is trylock then acquisitions ahead never depends on that.
+ * In that case, we don't need to record it. Just return.
+ */
+ if (!current->xhlocks || !depend_before(hlock))
+ return;
+
+ add_xhlock(hlock);
+}
+
+/*
+ * For crosslock.
+ */
+static int add_xlock(struct held_lock *hlock)
+{
+ struct cross_lock *xlock;
+ unsigned int gen_id;
+
+ if (!graph_lock())
+ return 0;
+
+ xlock = &((struct lockdep_map_cross *)hlock->instance)->xlock;
+
+ /*
+ * When acquisitions for a crosslock are overlapped, we use
+ * nr_acquire to perform commit for them, based on cross_gen_id
+ * of the first acquisition, which allows to add additional
+ * dependencies.
+ *
+ * Moreover, when no acquisition of a crosslock is in progress,
+ * we should not perform commit because the lock might not exist
+ * any more, which might cause incorrect memory access. So we
+ * have to track the number of acquisitions of a crosslock.
+ *
+ * depend_after() is necessary to initialize only the first
+ * valid xlock so that the xlock can be used on its commit.
+ */
+ if (xlock->nr_acquire++ && depend_after(&xlock->hlock))
+ goto unlock;
+
+ gen_id = (unsigned int)atomic_inc_return(&cross_gen_id);
+ xlock->hlock = *hlock;
+ xlock->hlock.gen_id = gen_id;
+unlock:
+ graph_unlock();
+ return 1;
+}
+
+/*
+ * Called for both normal and crosslock acquires. Normal locks will be
+ * pushed on the hist_lock queue. Cross locks will record state and
+ * stop regular lock_acquire() to avoid being placed on the held_lock
+ * stack.
+ *
+ * Return: 0 - failure;
+ * 1 - crosslock, done;
+ * 2 - normal lock, continue to held_lock[] ops.
+ */
+static int lock_acquire_crosslock(struct held_lock *hlock)
+{
+ /*
+ * CONTEXT 1 CONTEXT 2
+ * --------- ---------
+ * lock A (cross)
+ * X = atomic_inc_return(&cross_gen_id)
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ * Y = atomic_read_acquire(&cross_gen_id)
+ * lock B
+ *
+ * atomic_read_acquire() is for ordering between A and B,
+ * IOW, A happens before B, when CONTEXT 2 see Y >= X.
+ *
+ * Pairs with atomic_inc_return() in add_xlock().
+ */
+ hlock->gen_id = (unsigned int)atomic_read_acquire(&cross_gen_id);
+
+ if (cross_lock(hlock->instance))
+ return add_xlock(hlock);
+
+ check_add_xhlock(hlock);
+ return 2;
+}
+
+static int copy_trace(struct stack_trace *trace)
+{
+ unsigned long *buf = stack_trace + nr_stack_trace_entries;
+ unsigned int max_nr = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries;
+ unsigned int nr = min(max_nr, trace->nr_entries);
+
+ trace->nr_entries = nr;
+ memcpy(buf, trace->entries, nr * sizeof(trace->entries[0]));
+ trace->entries = buf;
+ nr_stack_trace_entries += nr;
+
+ if (nr_stack_trace_entries >= MAX_STACK_TRACE_ENTRIES-1) {
+ if (!debug_locks_off_graph_unlock())
+ return 0;
+
+ print_lockdep_off("BUG: MAX_STACK_TRACE_ENTRIES too low!");
+ dump_stack();
+
+ return 0;
+ }
+
+ return 1;
+}
+
+static int commit_xhlock(struct cross_lock *xlock, struct hist_lock *xhlock)
+{
+ unsigned int xid, pid;
+ u64 chain_key;
+
+ xid = xlock_class(xlock) - lock_classes;
+ chain_key = iterate_chain_key((u64)0, xid);
+ pid = xhlock_class(xhlock) - lock_classes;
+ chain_key = iterate_chain_key(chain_key, pid);
+
+ if (lookup_chain_cache(chain_key))
+ return 1;
+
+ if (!add_chain_cache_classes(xid, pid, xhlock->hlock.irq_context,
+ chain_key))
+ return 0;
+
+ if (!check_prev_add(current, &xlock->hlock, &xhlock->hlock, 1,
+ &xhlock->trace, copy_trace))
+ return 0;
+
+ return 1;
+}
+
+static void commit_xhlocks(struct cross_lock *xlock)
+{
+ unsigned int cur = current->xhlock_idx;
+ unsigned int prev_hist_id = xhlock(cur).hist_id;
+ unsigned int i;
+
+ if (!graph_lock())
+ return;
+
+ if (xlock->nr_acquire) {
+ for (i = 0; i < MAX_XHLOCKS_NR; i++) {
+ struct hist_lock *xhlock = &xhlock(cur - i);
+
+ if (!xhlock_valid(xhlock))
+ break;
+
+ if (before(xhlock->hlock.gen_id, xlock->hlock.gen_id))
+ break;
+
+ if (!same_context_xhlock(xhlock))
+ break;
+
+ /*
+ * Filter out the cases where the ring buffer was
+ * overwritten and the current entry has a bigger
+ * hist_id than the previous one, which is impossible
+ * otherwise:
+ */
+ if (unlikely(before(prev_hist_id, xhlock->hist_id)))
+ break;
+
+ prev_hist_id = xhlock->hist_id;
+
+ /*
+ * commit_xhlock() returns 0 with graph_lock already
+ * released if fail.
+ */
+ if (!commit_xhlock(xlock, xhlock))
+ return;
+ }
+ }
+
+ graph_unlock();
+}
+
+void lock_commit_crosslock(struct lockdep_map *lock)
+{
+ struct cross_lock *xlock;
+ unsigned long flags;
+
+ if (unlikely(!debug_locks || current->lockdep_recursion))
+ return;
+
+ if (!current->xhlocks)
+ return;
+
+ /*
+ * Do commit hist_locks with the cross_lock, only in case that
+ * the cross_lock could depend on acquisitions after that.
+ *
+ * For example, if the cross_lock does not have the 'check' flag
+ * then we don't need to check dependencies and commit for that.
+ * Just skip it. In that case, of course, the cross_lock does
+ * not depend on acquisitions ahead, either.
+ *
+ * WARNING: Don't do that in add_xlock() in advance. When an
+ * acquisition context is different from the commit context,
+ * invalid(skipped) cross_lock might be accessed.
+ */
+ if (!depend_after(&((struct lockdep_map_cross *)lock)->xlock.hlock))
+ return;
+
+ raw_local_irq_save(flags);
+ check_flags(flags);
+ current->lockdep_recursion = 1;
+ xlock = &((struct lockdep_map_cross *)lock)->xlock;
+ commit_xhlocks(xlock);
+ current->lockdep_recursion = 0;
+ raw_local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(lock_commit_crosslock);
+
+/*
+ * Return: 0 - failure;
+ * 1 - crosslock, done;
+ * 2 - normal lock, continue to held_lock[] ops.
+ */
+static int lock_release_crosslock(struct lockdep_map *lock)
+{
+ if (cross_lock(lock)) {
+ if (!graph_lock())
+ return 0;
+ ((struct lockdep_map_cross *)lock)->xlock.nr_acquire--;
+ graph_unlock();
+ return 1;
+ }
+ return 2;
+}
+
+static void cross_init(struct lockdep_map *lock, int cross)
+{
+ if (cross)
+ ((struct lockdep_map_cross *)lock)->xlock.nr_acquire = 0;
+
+ lock->cross = cross;
+
+ /*
+ * Crossrelease assumes that the ring buffer size of xhlocks
+ * is aligned with power of 2. So force it on build.
+ */
+ BUILD_BUG_ON(MAX_XHLOCKS_NR & (MAX_XHLOCKS_NR - 1));
+}
+
+void lockdep_init_task(struct task_struct *task)
+{
+ int i;
+
+ task->xhlock_idx = UINT_MAX;
+ task->hist_id = 0;
+
+ for (i = 0; i < XHLOCK_CTX_NR; i++) {
+ task->xhlock_idx_hist[i] = UINT_MAX;
+ task->hist_id_save[i] = 0;
+ }
+
+ task->xhlocks = kzalloc(sizeof(struct hist_lock) * MAX_XHLOCKS_NR,
+ GFP_KERNEL);
+}
+
+void lockdep_free_task(struct task_struct *task)
+{
+ if (task->xhlocks) {
+ void *tmp = task->xhlocks;
+ /* Diable crossrelease for current */
+ task->xhlocks = NULL;
+ kfree(tmp);
+ }
+}
+#endif
diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h
index c08fbd2..1da4669 100644
--- a/kernel/locking/lockdep_internals.h
+++ b/kernel/locking/lockdep_internals.h
@@ -143,6 +143,8 @@ struct lockdep_stats {
int redundant_softirqs_on;
int redundant_softirqs_off;
int nr_unused_locks;
+ int nr_redundant_checks;
+ int nr_redundant;
int nr_cyclic_checks;
int nr_cyclic_check_recursions;
int nr_find_usage_forwards_checks;
diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c
index 6d1fcc7..68d9e26 100644
--- a/kernel/locking/lockdep_proc.c
+++ b/kernel/locking/lockdep_proc.c
@@ -201,6 +201,10 @@ static void lockdep_stats_debug_show(struct seq_file *m)
debug_atomic_read(chain_lookup_hits));
seq_printf(m, " cyclic checks: %11llu\n",
debug_atomic_read(nr_cyclic_checks));
+ seq_printf(m, " redundant checks: %11llu\n",
+ debug_atomic_read(nr_redundant_checks));
+ seq_printf(m, " redundant links: %11llu\n",
+ debug_atomic_read(nr_redundant));
seq_printf(m, " find-mask forwards checks: %11llu\n",
debug_atomic_read(nr_find_usage_forwards_checks));
seq_printf(m, " find-mask backwards checks: %11llu\n",
diff --git a/kernel/locking/lockdep_states.h b/kernel/locking/lockdep_states.h
index 995b0cc..35ca09f 100644
--- a/kernel/locking/lockdep_states.h
+++ b/kernel/locking/lockdep_states.h
@@ -6,4 +6,3 @@
*/
LOCKDEP_STATE(HARDIRQ)
LOCKDEP_STATE(SOFTIRQ)
-LOCKDEP_STATE(RECLAIM_FS)
diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c
index a316794..a74ee6a 100644
--- a/kernel/locking/osq_lock.c
+++ b/kernel/locking/osq_lock.c
@@ -109,6 +109,19 @@ bool osq_lock(struct optimistic_spin_queue *lock)
prev = decode_cpu(old);
node->prev = prev;
+
+ /*
+ * osq_lock() unqueue
+ *
+ * node->prev = prev osq_wait_next()
+ * WMB MB
+ * prev->next = node next->prev = prev // unqueue-C
+ *
+ * Here 'node->prev' and 'next->prev' are the same variable and we need
+ * to ensure these stores happen in-order to avoid corrupting the list.
+ */
+ smp_wmb();
+
WRITE_ONCE(prev->next, node);
/*
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index fd24153..294294c 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -268,123 +268,6 @@ static __always_inline u32 __pv_wait_head_or_lock(struct qspinlock *lock,
#define queued_spin_lock_slowpath native_queued_spin_lock_slowpath
#endif
-/*
- * Various notes on spin_is_locked() and spin_unlock_wait(), which are
- * 'interesting' functions:
- *
- * PROBLEM: some architectures have an interesting issue with atomic ACQUIRE
- * operations in that the ACQUIRE applies to the LOAD _not_ the STORE (ARM64,
- * PPC). Also qspinlock has a similar issue per construction, the setting of
- * the locked byte can be unordered acquiring the lock proper.
- *
- * This gets to be 'interesting' in the following cases, where the /should/s
- * end up false because of this issue.
- *
- *
- * CASE 1:
- *
- * So the spin_is_locked() correctness issue comes from something like:
- *
- * CPU0 CPU1
- *
- * global_lock(); local_lock(i)
- * spin_lock(&G) spin_lock(&L[i])
- * for (i) if (!spin_is_locked(&G)) {
- * spin_unlock_wait(&L[i]); smp_acquire__after_ctrl_dep();
- * return;
- * }
- * // deal with fail
- *
- * Where it is important CPU1 sees G locked or CPU0 sees L[i] locked such
- * that there is exclusion between the two critical sections.
- *
- * The load from spin_is_locked(&G) /should/ be constrained by the ACQUIRE from
- * spin_lock(&L[i]), and similarly the load(s) from spin_unlock_wait(&L[i])
- * /should/ be constrained by the ACQUIRE from spin_lock(&G).
- *
- * Similarly, later stuff is constrained by the ACQUIRE from CTRL+RMB.
- *
- *
- * CASE 2:
- *
- * For spin_unlock_wait() there is a second correctness issue, namely:
- *
- * CPU0 CPU1
- *
- * flag = set;
- * smp_mb(); spin_lock(&l)
- * spin_unlock_wait(&l); if (!flag)
- * // add to lockless list
- * spin_unlock(&l);
- * // iterate lockless list
- *
- * Which wants to ensure that CPU1 will stop adding bits to the list and CPU0
- * will observe the last entry on the list (if spin_unlock_wait() had ACQUIRE
- * semantics etc..)
- *
- * Where flag /should/ be ordered against the locked store of l.
- */
-
-/*
- * queued_spin_lock_slowpath() can (load-)ACQUIRE the lock before
- * issuing an _unordered_ store to set _Q_LOCKED_VAL.
- *
- * This means that the store can be delayed, but no later than the
- * store-release from the unlock. This means that simply observing
- * _Q_LOCKED_VAL is not sufficient to determine if the lock is acquired.
- *
- * There are two paths that can issue the unordered store:
- *
- * (1) clear_pending_set_locked(): *,1,0 -> *,0,1
- *
- * (2) set_locked(): t,0,0 -> t,0,1 ; t != 0
- * atomic_cmpxchg_relaxed(): t,0,0 -> 0,0,1
- *
- * However, in both cases we have other !0 state we've set before to queue
- * ourseves:
- *
- * For (1) we have the atomic_cmpxchg_acquire() that set _Q_PENDING_VAL, our
- * load is constrained by that ACQUIRE to not pass before that, and thus must
- * observe the store.
- *
- * For (2) we have a more intersting scenario. We enqueue ourselves using
- * xchg_tail(), which ends up being a RELEASE. This in itself is not
- * sufficient, however that is followed by an smp_cond_acquire() on the same
- * word, giving a RELEASE->ACQUIRE ordering. This again constrains our load and
- * guarantees we must observe that store.
- *
- * Therefore both cases have other !0 state that is observable before the
- * unordered locked byte store comes through. This means we can use that to
- * wait for the lock store, and then wait for an unlock.
- */
-#ifndef queued_spin_unlock_wait
-void queued_spin_unlock_wait(struct qspinlock *lock)
-{
- u32 val;
-
- for (;;) {
- val = atomic_read(&lock->val);
-
- if (!val) /* not locked, we're done */
- goto done;
-
- if (val & _Q_LOCKED_MASK) /* locked, go wait for unlock */
- break;
-
- /* not locked, but pending, wait until we observe the lock */
- cpu_relax();
- }
-
- /* any unlock is good */
- while (atomic_read(&lock->val) & _Q_LOCKED_MASK)
- cpu_relax();
-
-done:
- smp_acquire__after_ctrl_dep();
-}
-EXPORT_SYMBOL(queued_spin_unlock_wait);
-#endif
-
#endif /* _GEN_PV_LOCK_SLOWPATH */
/**
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index 4ccfcaa..4355568 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -72,7 +72,7 @@ static inline bool pv_queued_spin_steal_lock(struct qspinlock *lock)
struct __qspinlock *l = (void *)lock;
if (!(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) &&
- (cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0)) {
+ (cmpxchg_acquire(&l->locked, 0, _Q_LOCKED_VAL) == 0)) {
qstat_inc(qstat_pv_lock_stealing, true);
return true;
}
@@ -101,16 +101,16 @@ static __always_inline void clear_pending(struct qspinlock *lock)
/*
* The pending bit check in pv_queued_spin_steal_lock() isn't a memory
- * barrier. Therefore, an atomic cmpxchg() is used to acquire the lock
- * just to be sure that it will get it.
+ * barrier. Therefore, an atomic cmpxchg_acquire() is used to acquire the
+ * lock just to be sure that it will get it.
*/
static __always_inline int trylock_clear_pending(struct qspinlock *lock)
{
struct __qspinlock *l = (void *)lock;
return !READ_ONCE(l->locked) &&
- (cmpxchg(&l->locked_pending, _Q_PENDING_VAL, _Q_LOCKED_VAL)
- == _Q_PENDING_VAL);
+ (cmpxchg_acquire(&l->locked_pending, _Q_PENDING_VAL,
+ _Q_LOCKED_VAL) == _Q_PENDING_VAL);
}
#else /* _Q_PENDING_BITS == 8 */
static __always_inline void set_pending(struct qspinlock *lock)
@@ -138,7 +138,7 @@ static __always_inline int trylock_clear_pending(struct qspinlock *lock)
*/
old = val;
new = (val & ~_Q_PENDING_MASK) | _Q_LOCKED_VAL;
- val = atomic_cmpxchg(&lock->val, old, new);
+ val = atomic_cmpxchg_acquire(&lock->val, old, new);
if (val == old)
return 1;
@@ -362,8 +362,18 @@ static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
* observe its next->locked value and advance itself.
*
* Matches with smp_store_mb() and cmpxchg() in pv_wait_node()
+ *
+ * The write to next->locked in arch_mcs_spin_unlock_contended()
+ * must be ordered before the read of pn->state in the cmpxchg()
+ * below for the code to work correctly. To guarantee full ordering
+ * irrespective of the success or failure of the cmpxchg(),
+ * a relaxed version with explicit barrier is used. The control
+ * dependency will order the reading of pn->state before any
+ * subsequent writes.
*/
- if (cmpxchg(&pn->state, vcpu_halted, vcpu_hashed) != vcpu_halted)
+ smp_mb__before_atomic();
+ if (cmpxchg_relaxed(&pn->state, vcpu_halted, vcpu_hashed)
+ != vcpu_halted)
return;
/*
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index 72ad45a..8d039b9 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -40,6 +40,9 @@ struct rt_mutex_waiter {
/*
* Various helpers to access the waiters-tree:
*/
+
+#ifdef CONFIG_RT_MUTEXES
+
static inline int rt_mutex_has_waiters(struct rt_mutex *lock)
{
return !RB_EMPTY_ROOT(&lock->waiters);
@@ -69,6 +72,32 @@ task_top_pi_waiter(struct task_struct *p)
pi_tree_entry);
}
+#else
+
+static inline int rt_mutex_has_waiters(struct rt_mutex *lock)
+{
+ return false;
+}
+
+static inline struct rt_mutex_waiter *
+rt_mutex_top_waiter(struct rt_mutex *lock)
+{
+ return NULL;
+}
+
+static inline int task_has_pi_waiters(struct task_struct *p)
+{
+ return false;
+}
+
+static inline struct rt_mutex_waiter *
+task_top_pi_waiter(struct task_struct *p)
+{
+ return NULL;
+}
+
+#endif
+
/*
* lock->owner state tracking:
*/
diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c
index 20819df..0848634 100644
--- a/kernel/locking/rwsem-spinlock.c
+++ b/kernel/locking/rwsem-spinlock.c
@@ -126,7 +126,7 @@ __rwsem_wake_one_writer(struct rw_semaphore *sem)
/*
* get a read lock on the semaphore
*/
-void __sched __down_read(struct rw_semaphore *sem)
+int __sched __down_read_common(struct rw_semaphore *sem, int state)
{
struct rwsem_waiter waiter;
unsigned long flags;
@@ -140,8 +140,6 @@ void __sched __down_read(struct rw_semaphore *sem)
goto out;
}
- set_current_state(TASK_UNINTERRUPTIBLE);
-
/* set up my own style of waitqueue */
waiter.task = current;
waiter.type = RWSEM_WAITING_FOR_READ;
@@ -149,20 +147,41 @@ void __sched __down_read(struct rw_semaphore *sem)
list_add_tail(&waiter.list, &sem->wait_list);
- /* we don't need to touch the semaphore struct anymore */
- raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
-
/* wait to be given the lock */
for (;;) {
if (!waiter.task)
break;
+ if (signal_pending_state(state, current))
+ goto out_nolock;
+ set_current_state(state);
+ raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
schedule();
- set_current_state(TASK_UNINTERRUPTIBLE);
+ raw_spin_lock_irqsave(&sem->wait_lock, flags);
}
- __set_current_state(TASK_RUNNING);
+ raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
out:
- ;
+ return 0;
+
+out_nolock:
+ /*
+ * We didn't take the lock, so that there is a writer, which
+ * is owner or the first waiter of the sem. If it's a waiter,
+ * it will be woken by current owner. Not need to wake anybody.
+ */
+ list_del(&waiter.list);
+ raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
+ return -EINTR;
+}
+
+void __sched __down_read(struct rw_semaphore *sem)
+{
+ __down_read_common(sem, TASK_UNINTERRUPTIBLE);
+}
+
+int __sched __down_read_killable(struct rw_semaphore *sem)
+{
+ return __down_read_common(sem, TASK_KILLABLE);
}
/*
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 34e727f..02f6606 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -221,8 +221,8 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem,
/*
* Wait for the read lock to be granted
*/
-__visible
-struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
+static inline struct rw_semaphore __sched *
+__rwsem_down_read_failed_common(struct rw_semaphore *sem, int state)
{
long count, adjustment = -RWSEM_ACTIVE_READ_BIAS;
struct rwsem_waiter waiter;
@@ -255,17 +255,44 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
/* wait to be given the lock */
while (true) {
- set_current_state(TASK_UNINTERRUPTIBLE);
+ set_current_state(state);
if (!waiter.task)
break;
+ if (signal_pending_state(state, current)) {
+ raw_spin_lock_irq(&sem->wait_lock);
+ if (waiter.task)
+ goto out_nolock;
+ raw_spin_unlock_irq(&sem->wait_lock);
+ break;
+ }
schedule();
}
__set_current_state(TASK_RUNNING);
return sem;
+out_nolock:
+ list_del(&waiter.list);
+ if (list_empty(&sem->wait_list))
+ atomic_long_add(-RWSEM_WAITING_BIAS, &sem->count);
+ raw_spin_unlock_irq(&sem->wait_lock);
+ __set_current_state(TASK_RUNNING);
+ return ERR_PTR(-EINTR);
+}
+
+__visible struct rw_semaphore * __sched
+rwsem_down_read_failed(struct rw_semaphore *sem)
+{
+ return __rwsem_down_read_failed_common(sem, TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(rwsem_down_read_failed);
+__visible struct rw_semaphore * __sched
+rwsem_down_read_failed_killable(struct rw_semaphore *sem)
+{
+ return __rwsem_down_read_failed_common(sem, TASK_KILLABLE);
+}
+EXPORT_SYMBOL(rwsem_down_read_failed_killable);
+
/*
* This function must be called with the sem->wait_lock held to prevent
* race conditions between checking the rwsem wait list and setting the
diff --git a/kernel/membarrier.c b/kernel/membarrier.c
deleted file mode 100644
index 9f9284f..0000000
--- a/kernel/membarrier.c
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * Copyright (C) 2010, 2015 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
- *
- * membarrier system call
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- */
-
-#include <linux/syscalls.h>
-#include <linux/membarrier.h>
-#include <linux/tick.h>
-
-/*
- * Bitmask made from a "or" of all commands within enum membarrier_cmd,
- * except MEMBARRIER_CMD_QUERY.
- */
-#define MEMBARRIER_CMD_BITMASK (MEMBARRIER_CMD_SHARED)
-
-/**
- * sys_membarrier - issue memory barriers on a set of threads
- * @cmd: Takes command values defined in enum membarrier_cmd.
- * @flags: Currently needs to be 0. For future extensions.
- *
- * If this system call is not implemented, -ENOSYS is returned. If the
- * command specified does not exist, or if the command argument is invalid,
- * this system call returns -EINVAL. For a given command, with flags argument
- * set to 0, this system call is guaranteed to always return the same value
- * until reboot.
- *
- * All memory accesses performed in program order from each targeted thread
- * is guaranteed to be ordered with respect to sys_membarrier(). If we use
- * the semantic "barrier()" to represent a compiler barrier forcing memory
- * accesses to be performed in program order across the barrier, and
- * smp_mb() to represent explicit memory barriers forcing full memory
- * ordering across the barrier, we have the following ordering table for
- * each pair of barrier(), sys_membarrier() and smp_mb():
- *
- * The pair ordering is detailed as (O: ordered, X: not ordered):
- *
- * barrier() smp_mb() sys_membarrier()
- * barrier() X X O
- * smp_mb() X O O
- * sys_membarrier() O O O
- */
-SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
-{
- /* MEMBARRIER_CMD_SHARED is not compatible with nohz_full. */
- if (tick_nohz_full_enabled())
- return -ENOSYS;
- if (unlikely(flags))
- return -EINVAL;
- switch (cmd) {
- case MEMBARRIER_CMD_QUERY:
- return MEMBARRIER_CMD_BITMASK;
- case MEMBARRIER_CMD_SHARED:
- if (num_online_cpus() > 1)
- synchronize_sched();
- return 0;
- default:
- return -EINVAL;
- }
-}
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 124bed7..066e73c 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -34,13 +34,24 @@ static void *arch_memremap_wb(resource_size_t offset, unsigned long size)
}
#endif
-static void *try_ram_remap(resource_size_t offset, size_t size)
+#ifndef arch_memremap_can_ram_remap
+static bool arch_memremap_can_ram_remap(resource_size_t offset, size_t size,
+ unsigned long flags)
+{
+ return true;
+}
+#endif
+
+static void *try_ram_remap(resource_size_t offset, size_t size,
+ unsigned long flags)
{
unsigned long pfn = PHYS_PFN(offset);
/* In the simple case just return the existing linear address */
- if (pfn_valid(pfn) && !PageHighMem(pfn_to_page(pfn)))
+ if (pfn_valid(pfn) && !PageHighMem(pfn_to_page(pfn)) &&
+ arch_memremap_can_ram_remap(offset, size, flags))
return __va(offset);
+
return NULL; /* fallback to arch_memremap_wb */
}
@@ -48,7 +59,8 @@ static void *try_ram_remap(resource_size_t offset, size_t size)
* memremap() - remap an iomem_resource as cacheable memory
* @offset: iomem resource start address
* @size: size of remap
- * @flags: any of MEMREMAP_WB, MEMREMAP_WT and MEMREMAP_WC
+ * @flags: any of MEMREMAP_WB, MEMREMAP_WT, MEMREMAP_WC,
+ * MEMREMAP_ENC, MEMREMAP_DEC
*
* memremap() is "ioremap" for cases where it is known that the resource
* being mapped does not have i/o side effects and the __iomem
@@ -95,7 +107,7 @@ void *memremap(resource_size_t offset, size_t size, unsigned long flags)
* the requested range is potentially in System RAM.
*/
if (is_ram == REGION_INTERSECTS)
- addr = try_ram_remap(offset, size);
+ addr = try_ram_remap(offset, size, flags);
if (!addr)
addr = arch_memremap_wb(offset, size);
}
@@ -182,18 +194,41 @@ struct page_map {
struct vmem_altmap altmap;
};
-static void pgmap_radix_release(struct resource *res)
+static unsigned long order_at(struct resource *res, unsigned long pgoff)
{
- resource_size_t key, align_start, align_size, align_end;
+ unsigned long phys_pgoff = PHYS_PFN(res->start) + pgoff;
+ unsigned long nr_pages, mask;
- align_start = res->start & ~(SECTION_SIZE - 1);
- align_size = ALIGN(resource_size(res), SECTION_SIZE);
- align_end = align_start + align_size - 1;
+ nr_pages = PHYS_PFN(resource_size(res));
+ if (nr_pages == pgoff)
+ return ULONG_MAX;
+
+ /*
+ * What is the largest aligned power-of-2 range available from
+ * this resource pgoff to the end of the resource range,
+ * considering the alignment of the current pgoff?
+ */
+ mask = phys_pgoff | rounddown_pow_of_two(nr_pages - pgoff);
+ if (!mask)
+ return ULONG_MAX;
+
+ return find_first_bit(&mask, BITS_PER_LONG);
+}
+
+#define foreach_order_pgoff(res, order, pgoff) \
+ for (pgoff = 0, order = order_at((res), pgoff); order < ULONG_MAX; \
+ pgoff += 1UL << order, order = order_at((res), pgoff))
+
+static void pgmap_radix_release(struct resource *res)
+{
+ unsigned long pgoff, order;
mutex_lock(&pgmap_lock);
- for (key = res->start; key <= res->end; key += SECTION_SIZE)
- radix_tree_delete(&pgmap_radix, key >> PA_SECTION_SHIFT);
+ foreach_order_pgoff(res, order, pgoff)
+ radix_tree_delete(&pgmap_radix, PHYS_PFN(res->start) + pgoff);
mutex_unlock(&pgmap_lock);
+
+ synchronize_rcu();
}
static unsigned long pfn_first(struct page_map *page_map)
@@ -256,7 +291,7 @@ struct dev_pagemap *find_dev_pagemap(resource_size_t phys)
WARN_ON_ONCE(!rcu_read_lock_held());
- page_map = radix_tree_lookup(&pgmap_radix, phys >> PA_SECTION_SHIFT);
+ page_map = radix_tree_lookup(&pgmap_radix, PHYS_PFN(phys));
return page_map ? &page_map->pgmap : NULL;
}
@@ -281,12 +316,12 @@ struct dev_pagemap *find_dev_pagemap(resource_size_t phys)
void *devm_memremap_pages(struct device *dev, struct resource *res,
struct percpu_ref *ref, struct vmem_altmap *altmap)
{
- resource_size_t key, align_start, align_size, align_end;
+ resource_size_t align_start, align_size, align_end;
+ unsigned long pfn, pgoff, order;
pgprot_t pgprot = PAGE_KERNEL;
struct dev_pagemap *pgmap;
struct page_map *page_map;
int error, nid, is_ram;
- unsigned long pfn;
align_start = res->start & ~(SECTION_SIZE - 1);
align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE)
@@ -325,11 +360,12 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
mutex_lock(&pgmap_lock);
error = 0;
align_end = align_start + align_size - 1;
- for (key = align_start; key <= align_end; key += SECTION_SIZE) {
+
+ foreach_order_pgoff(res, order, pgoff) {
struct dev_pagemap *dup;
rcu_read_lock();
- dup = find_dev_pagemap(key);
+ dup = find_dev_pagemap(res->start + PFN_PHYS(pgoff));
rcu_read_unlock();
if (dup) {
dev_err(dev, "%s: %pr collides with mapping for %s\n",
@@ -337,8 +373,8 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
error = -EBUSY;
break;
}
- error = radix_tree_insert(&pgmap_radix, key >> PA_SECTION_SHIFT,
- page_map);
+ error = __radix_tree_insert(&pgmap_radix,
+ PHYS_PFN(res->start) + pgoff, order, page_map);
if (error) {
dev_err(dev, "%s: failed: %d\n", __func__, error);
break;
diff --git a/kernel/panic.c b/kernel/panic.c
index a58932b..bdd18af 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -26,6 +26,7 @@
#include <linux/nmi.h>
#include <linux/console.h>
#include <linux/bug.h>
+#include <linux/ratelimit.h>
#define PANIC_TIMER_STEP 100
#define PANIC_BLINK_SPD 18
@@ -601,6 +602,17 @@ EXPORT_SYMBOL(__stack_chk_fail);
#endif
+#ifdef CONFIG_ARCH_HAS_REFCOUNT
+void refcount_error_report(struct pt_regs *regs, const char *err)
+{
+ WARN_RATELIMIT(1, "refcount_t %s at %pB in %s[%d], uid/euid: %u/%u\n",
+ err, (void *)instruction_pointer(regs),
+ current->comm, task_pid_nr(current),
+ from_kuid_munged(&init_user_ns, current_uid()),
+ from_kuid_munged(&init_user_ns, current_euid()));
+}
+#endif
+
core_param(panic, panic_timeout, int, 0644);
core_param(pause_on_oops, pause_on_oops, int, 0644);
core_param(panic_on_warn, panic_on_warn, int, 0644);
diff --git a/kernel/pid.c b/kernel/pid.c
index 731c4e5..020dedb 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -527,8 +527,11 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
if (!ns)
ns = task_active_pid_ns(current);
if (likely(pid_alive(task))) {
- if (type != PIDTYPE_PID)
+ if (type != PIDTYPE_PID) {
+ if (type == __PIDTYPE_TGID)
+ type = PIDTYPE_PID;
task = task->group_leader;
+ }
nr = pid_nr_ns(rcu_dereference(task->pids[type].pid), ns);
}
rcu_read_unlock();
@@ -537,12 +540,6 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
}
EXPORT_SYMBOL(__task_pid_nr_ns);
-pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
-{
- return pid_nr_ns(task_tgid(tsk), ns);
-}
-EXPORT_SYMBOL(task_tgid_nr_ns);
-
struct pid_namespace *task_active_pid_ns(struct task_struct *tsk)
{
return ns_of_pid(task_pid(tsk));
@@ -575,13 +572,10 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
*/
void __init pidhash_init(void)
{
- unsigned int pidhash_size;
-
pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18,
HASH_EARLY | HASH_SMALL | HASH_ZERO,
&pidhash_shift, NULL,
0, 4096);
- pidhash_size = 1U << pidhash_shift;
}
void __init pidmap_init(void)
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index e1914c7..a5c36e9 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -651,7 +651,7 @@ static int load_image_and_restore(void)
int error;
unsigned int flags;
- pr_debug("Loading hibernation image.\n");
+ pm_pr_dbg("Loading hibernation image.\n");
lock_device_hotplug();
error = create_basic_memory_bitmaps();
@@ -681,7 +681,7 @@ int hibernate(void)
bool snapshot_test = false;
if (!hibernation_available()) {
- pr_debug("Hibernation not available.\n");
+ pm_pr_dbg("Hibernation not available.\n");
return -EPERM;
}
@@ -692,6 +692,7 @@ int hibernate(void)
goto Unlock;
}
+ pr_info("hibernation entry\n");
pm_prepare_console();
error = __pm_notifier_call_chain(PM_HIBERNATION_PREPARE, -1, &nr_calls);
if (error) {
@@ -727,7 +728,7 @@ int hibernate(void)
else
flags |= SF_CRC32_MODE;
- pr_debug("Writing image.\n");
+ pm_pr_dbg("Writing image.\n");
error = swsusp_write(flags);
swsusp_free();
if (!error) {
@@ -739,7 +740,7 @@ int hibernate(void)
in_suspend = 0;
pm_restore_gfp_mask();
} else {
- pr_debug("Image restored successfully.\n");
+ pm_pr_dbg("Image restored successfully.\n");
}
Free_bitmaps:
@@ -747,7 +748,7 @@ int hibernate(void)
Thaw:
unlock_device_hotplug();
if (snapshot_test) {
- pr_debug("Checking hibernation image\n");
+ pm_pr_dbg("Checking hibernation image\n");
error = swsusp_check();
if (!error)
error = load_image_and_restore();
@@ -762,6 +763,8 @@ int hibernate(void)
atomic_inc(&snapshot_device_available);
Unlock:
unlock_system_sleep();
+ pr_info("hibernation exit\n");
+
return error;
}
@@ -811,7 +814,7 @@ static int software_resume(void)
goto Unlock;
}
- pr_debug("Checking hibernation image partition %s\n", resume_file);
+ pm_pr_dbg("Checking hibernation image partition %s\n", resume_file);
if (resume_delay) {
pr_info("Waiting %dsec before reading resume device ...\n",
@@ -853,10 +856,10 @@ static int software_resume(void)
}
Check_image:
- pr_debug("Hibernation image partition %d:%d present\n",
+ pm_pr_dbg("Hibernation image partition %d:%d present\n",
MAJOR(swsusp_resume_device), MINOR(swsusp_resume_device));
- pr_debug("Looking for hibernation image.\n");
+ pm_pr_dbg("Looking for hibernation image.\n");
error = swsusp_check();
if (error)
goto Unlock;
@@ -868,6 +871,7 @@ static int software_resume(void)
goto Unlock;
}
+ pr_info("resume from hibernation\n");
pm_prepare_console();
error = __pm_notifier_call_chain(PM_RESTORE_PREPARE, -1, &nr_calls);
if (error) {
@@ -875,7 +879,7 @@ static int software_resume(void)
goto Close_Finish;
}
- pr_debug("Preparing processes for restore.\n");
+ pm_pr_dbg("Preparing processes for restore.\n");
error = freeze_processes();
if (error)
goto Close_Finish;
@@ -884,11 +888,12 @@ static int software_resume(void)
Finish:
__pm_notifier_call_chain(PM_POST_RESTORE, nr_calls, NULL);
pm_restore_console();
+ pr_info("resume from hibernation failed (%d)\n", error);
atomic_inc(&snapshot_device_available);
/* For success case, the suspend path will release the lock */
Unlock:
mutex_unlock(&pm_mutex);
- pr_debug("Hibernation image not present or could not be loaded.\n");
+ pm_pr_dbg("Hibernation image not present or could not be loaded.\n");
return error;
Close_Finish:
swsusp_close(FMODE_READ);
@@ -1012,8 +1017,8 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
error = -EINVAL;
if (!error)
- pr_debug("Hibernation mode set to '%s'\n",
- hibernation_modes[mode]);
+ pm_pr_dbg("Hibernation mode set to '%s'\n",
+ hibernation_modes[mode]);
unlock_system_sleep();
return error ? error : n;
}
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 42bd800..3a2ca90 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -150,7 +150,7 @@ static ssize_t mem_sleep_store(struct kobject *kobj, struct kobj_attribute *attr
power_attr(mem_sleep);
#endif /* CONFIG_SUSPEND */
-#ifdef CONFIG_PM_DEBUG
+#ifdef CONFIG_PM_SLEEP_DEBUG
int pm_test_level = TEST_NONE;
static const char * const pm_tests[__TEST_AFTER_LAST] = {
@@ -211,7 +211,7 @@ static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr,
}
power_attr(pm_test);
-#endif /* CONFIG_PM_DEBUG */
+#endif /* CONFIG_PM_SLEEP_DEBUG */
#ifdef CONFIG_DEBUG_FS
static char *suspend_step_name(enum suspend_stat_step step)
@@ -361,6 +361,61 @@ static ssize_t pm_wakeup_irq_show(struct kobject *kobj,
power_attr_ro(pm_wakeup_irq);
+bool pm_debug_messages_on __read_mostly;
+
+static ssize_t pm_debug_messages_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%d\n", pm_debug_messages_on);
+}
+
+static ssize_t pm_debug_messages_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t n)
+{
+ unsigned long val;
+
+ if (kstrtoul(buf, 10, &val))
+ return -EINVAL;
+
+ if (val > 1)
+ return -EINVAL;
+
+ pm_debug_messages_on = !!val;
+ return n;
+}
+
+power_attr(pm_debug_messages);
+
+/**
+ * __pm_pr_dbg - Print a suspend debug message to the kernel log.
+ * @defer: Whether or not to use printk_deferred() to print the message.
+ * @fmt: Message format.
+ *
+ * The message will be emitted if enabled through the pm_debug_messages
+ * sysfs attribute.
+ */
+void __pm_pr_dbg(bool defer, const char *fmt, ...)
+{
+ struct va_format vaf;
+ va_list args;
+
+ if (!pm_debug_messages_on)
+ return;
+
+ va_start(args, fmt);
+
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ if (defer)
+ printk_deferred(KERN_DEBUG "PM: %pV", &vaf);
+ else
+ printk(KERN_DEBUG "PM: %pV", &vaf);
+
+ va_end(args);
+}
+
#else /* !CONFIG_PM_SLEEP_DEBUG */
static inline void pm_print_times_init(void) {}
#endif /* CONFIG_PM_SLEEP_DEBUG */
@@ -691,12 +746,11 @@ static struct attribute * g[] = {
&wake_lock_attr.attr,
&wake_unlock_attr.attr,
#endif
-#ifdef CONFIG_PM_DEBUG
- &pm_test_attr.attr,
-#endif
#ifdef CONFIG_PM_SLEEP_DEBUG
+ &pm_test_attr.attr,
&pm_print_times_attr.attr,
&pm_wakeup_irq_attr.attr,
+ &pm_debug_messages_attr.attr,
#endif
#endif
#ifdef CONFIG_FREEZER
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 7fdc40d..1d2d761 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -192,7 +192,6 @@ extern void swsusp_show_speed(ktime_t, ktime_t, unsigned int, char *);
extern const char * const pm_labels[];
extern const char *pm_states[];
extern const char *mem_sleep_states[];
-extern suspend_state_t mem_sleep_current;
extern int suspend_devices_and_enter(suspend_state_t state);
#else /* !CONFIG_SUSPEND */
@@ -245,7 +244,11 @@ enum {
#define TEST_FIRST TEST_NONE
#define TEST_MAX (__TEST_AFTER_LAST - 1)
+#ifdef CONFIG_PM_SLEEP_DEBUG
extern int pm_test_level;
+#else
+#define pm_test_level (TEST_NONE)
+#endif
#ifdef CONFIG_SUSPEND_FREEZER
static inline int suspend_freeze_processes(void)
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 22231772..0972a8e 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1650,7 +1650,7 @@ static unsigned long minimum_image_size(unsigned long saveable)
{
unsigned long size;
- size = global_page_state(NR_SLAB_RECLAIMABLE)
+ size = global_node_page_state(NR_SLAB_RECLAIMABLE)
+ global_node_page_state(NR_ACTIVE_ANON)
+ global_node_page_state(NR_INACTIVE_ANON)
+ global_node_page_state(NR_ACTIVE_FILE)
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 3ecf275..3e2b4f5 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -8,6 +8,8 @@
* This file is released under the GPLv2.
*/
+#define pr_fmt(fmt) "PM: " fmt
+
#include <linux/string.h>
#include <linux/delay.h>
#include <linux/errno.h>
@@ -33,53 +35,55 @@
#include "power.h"
const char * const pm_labels[] = {
- [PM_SUSPEND_FREEZE] = "freeze",
+ [PM_SUSPEND_TO_IDLE] = "freeze",
[PM_SUSPEND_STANDBY] = "standby",
[PM_SUSPEND_MEM] = "mem",
};
const char *pm_states[PM_SUSPEND_MAX];
static const char * const mem_sleep_labels[] = {
- [PM_SUSPEND_FREEZE] = "s2idle",
+ [PM_SUSPEND_TO_IDLE] = "s2idle",
[PM_SUSPEND_STANDBY] = "shallow",
[PM_SUSPEND_MEM] = "deep",
};
const char *mem_sleep_states[PM_SUSPEND_MAX];
-suspend_state_t mem_sleep_current = PM_SUSPEND_FREEZE;
-static suspend_state_t mem_sleep_default = PM_SUSPEND_MEM;
+suspend_state_t mem_sleep_current = PM_SUSPEND_TO_IDLE;
+suspend_state_t mem_sleep_default = PM_SUSPEND_MAX;
+suspend_state_t pm_suspend_target_state;
+EXPORT_SYMBOL_GPL(pm_suspend_target_state);
unsigned int pm_suspend_global_flags;
EXPORT_SYMBOL_GPL(pm_suspend_global_flags);
static const struct platform_suspend_ops *suspend_ops;
-static const struct platform_freeze_ops *freeze_ops;
-static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head);
+static const struct platform_s2idle_ops *s2idle_ops;
+static DECLARE_WAIT_QUEUE_HEAD(s2idle_wait_head);
-enum freeze_state __read_mostly suspend_freeze_state;
-static DEFINE_SPINLOCK(suspend_freeze_lock);
+enum s2idle_states __read_mostly s2idle_state;
+static DEFINE_SPINLOCK(s2idle_lock);
-void freeze_set_ops(const struct platform_freeze_ops *ops)
+void s2idle_set_ops(const struct platform_s2idle_ops *ops)
{
lock_system_sleep();
- freeze_ops = ops;
+ s2idle_ops = ops;
unlock_system_sleep();
}
-static void freeze_begin(void)
+static void s2idle_begin(void)
{
- suspend_freeze_state = FREEZE_STATE_NONE;
+ s2idle_state = S2IDLE_STATE_NONE;
}
-static void freeze_enter(void)
+static void s2idle_enter(void)
{
- trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_FREEZE, true);
+ trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_TO_IDLE, true);
- spin_lock_irq(&suspend_freeze_lock);
+ spin_lock_irq(&s2idle_lock);
if (pm_wakeup_pending())
goto out;
- suspend_freeze_state = FREEZE_STATE_ENTER;
- spin_unlock_irq(&suspend_freeze_lock);
+ s2idle_state = S2IDLE_STATE_ENTER;
+ spin_unlock_irq(&s2idle_lock);
get_online_cpus();
cpuidle_resume();
@@ -87,56 +91,75 @@ static void freeze_enter(void)
/* Push all the CPUs into the idle loop. */
wake_up_all_idle_cpus();
/* Make the current CPU wait so it can enter the idle loop too. */
- wait_event(suspend_freeze_wait_head,
- suspend_freeze_state == FREEZE_STATE_WAKE);
+ wait_event(s2idle_wait_head,
+ s2idle_state == S2IDLE_STATE_WAKE);
cpuidle_pause();
put_online_cpus();
- spin_lock_irq(&suspend_freeze_lock);
+ spin_lock_irq(&s2idle_lock);
out:
- suspend_freeze_state = FREEZE_STATE_NONE;
- spin_unlock_irq(&suspend_freeze_lock);
+ s2idle_state = S2IDLE_STATE_NONE;
+ spin_unlock_irq(&s2idle_lock);
- trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_FREEZE, false);
+ trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_TO_IDLE, false);
}
static void s2idle_loop(void)
{
- pr_debug("PM: suspend-to-idle\n");
+ pm_pr_dbg("suspend-to-idle\n");
+
+ for (;;) {
+ int error;
+
+ dpm_noirq_begin();
+
+ /*
+ * Suspend-to-idle equals
+ * frozen processes + suspended devices + idle processors.
+ * Thus s2idle_enter() should be called right after
+ * all devices have been suspended.
+ */
+ error = dpm_noirq_suspend_devices(PMSG_SUSPEND);
+ if (!error)
+ s2idle_enter();
+
+ dpm_noirq_resume_devices(PMSG_RESUME);
+ if (error && (error != -EBUSY || !pm_wakeup_pending())) {
+ dpm_noirq_end();
+ break;
+ }
- do {
- freeze_enter();
+ if (s2idle_ops && s2idle_ops->wake)
+ s2idle_ops->wake();
- if (freeze_ops && freeze_ops->wake)
- freeze_ops->wake();
+ dpm_noirq_end();
- dpm_resume_noirq(PMSG_RESUME);
- if (freeze_ops && freeze_ops->sync)
- freeze_ops->sync();
+ if (s2idle_ops && s2idle_ops->sync)
+ s2idle_ops->sync();
if (pm_wakeup_pending())
break;
pm_wakeup_clear(false);
- } while (!dpm_suspend_noirq(PMSG_SUSPEND));
+ }
- pr_debug("PM: resume from suspend-to-idle\n");
+ pm_pr_dbg("resume from suspend-to-idle\n");
}
-void freeze_wake(void)
+void s2idle_wake(void)
{
unsigned long flags;
- spin_lock_irqsave(&suspend_freeze_lock, flags);
- if (suspend_freeze_state > FREEZE_STATE_NONE) {
- suspend_freeze_state = FREEZE_STATE_WAKE;
- wake_up(&suspend_freeze_wait_head);
+ spin_lock_irqsave(&s2idle_lock, flags);
+ if (s2idle_state > S2IDLE_STATE_NONE) {
+ s2idle_state = S2IDLE_STATE_WAKE;
+ wake_up(&s2idle_wait_head);
}
- spin_unlock_irqrestore(&suspend_freeze_lock, flags);
+ spin_unlock_irqrestore(&s2idle_lock, flags);
}
-EXPORT_SYMBOL_GPL(freeze_wake);
+EXPORT_SYMBOL_GPL(s2idle_wake);
static bool valid_state(suspend_state_t state)
{
@@ -152,19 +175,19 @@ void __init pm_states_init(void)
{
/* "mem" and "freeze" are always present in /sys/power/state. */
pm_states[PM_SUSPEND_MEM] = pm_labels[PM_SUSPEND_MEM];
- pm_states[PM_SUSPEND_FREEZE] = pm_labels[PM_SUSPEND_FREEZE];
+ pm_states[PM_SUSPEND_TO_IDLE] = pm_labels[PM_SUSPEND_TO_IDLE];
/*
* Suspend-to-idle should be supported even without any suspend_ops,
* initialize mem_sleep_states[] accordingly here.
*/
- mem_sleep_states[PM_SUSPEND_FREEZE] = mem_sleep_labels[PM_SUSPEND_FREEZE];
+ mem_sleep_states[PM_SUSPEND_TO_IDLE] = mem_sleep_labels[PM_SUSPEND_TO_IDLE];
}
static int __init mem_sleep_default_setup(char *str)
{
suspend_state_t state;
- for (state = PM_SUSPEND_FREEZE; state <= PM_SUSPEND_MEM; state++)
+ for (state = PM_SUSPEND_TO_IDLE; state <= PM_SUSPEND_MEM; state++)
if (mem_sleep_labels[state] &&
!strcmp(str, mem_sleep_labels[state])) {
mem_sleep_default = state;
@@ -193,7 +216,7 @@ void suspend_set_ops(const struct platform_suspend_ops *ops)
}
if (valid_state(PM_SUSPEND_MEM)) {
mem_sleep_states[PM_SUSPEND_MEM] = mem_sleep_labels[PM_SUSPEND_MEM];
- if (mem_sleep_default == PM_SUSPEND_MEM)
+ if (mem_sleep_default >= PM_SUSPEND_MEM)
mem_sleep_current = PM_SUSPEND_MEM;
}
@@ -216,49 +239,49 @@ EXPORT_SYMBOL_GPL(suspend_valid_only_mem);
static bool sleep_state_supported(suspend_state_t state)
{
- return state == PM_SUSPEND_FREEZE || (suspend_ops && suspend_ops->enter);
+ return state == PM_SUSPEND_TO_IDLE || (suspend_ops && suspend_ops->enter);
}
static int platform_suspend_prepare(suspend_state_t state)
{
- return state != PM_SUSPEND_FREEZE && suspend_ops->prepare ?
+ return state != PM_SUSPEND_TO_IDLE && suspend_ops->prepare ?
suspend_ops->prepare() : 0;
}
static int platform_suspend_prepare_late(suspend_state_t state)
{
- return state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->prepare ?
- freeze_ops->prepare() : 0;
+ return state == PM_SUSPEND_TO_IDLE && s2idle_ops && s2idle_ops->prepare ?
+ s2idle_ops->prepare() : 0;
}
static int platform_suspend_prepare_noirq(suspend_state_t state)
{
- return state != PM_SUSPEND_FREEZE && suspend_ops->prepare_late ?
+ return state != PM_SUSPEND_TO_IDLE && suspend_ops->prepare_late ?
suspend_ops->prepare_late() : 0;
}
static void platform_resume_noirq(suspend_state_t state)
{
- if (state != PM_SUSPEND_FREEZE && suspend_ops->wake)
+ if (state != PM_SUSPEND_TO_IDLE && suspend_ops->wake)
suspend_ops->wake();
}
static void platform_resume_early(suspend_state_t state)
{
- if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->restore)
- freeze_ops->restore();
+ if (state == PM_SUSPEND_TO_IDLE && s2idle_ops && s2idle_ops->restore)
+ s2idle_ops->restore();
}
static void platform_resume_finish(suspend_state_t state)
{
- if (state != PM_SUSPEND_FREEZE && suspend_ops->finish)
+ if (state != PM_SUSPEND_TO_IDLE && suspend_ops->finish)
suspend_ops->finish();
}
static int platform_suspend_begin(suspend_state_t state)
{
- if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->begin)
- return freeze_ops->begin();
+ if (state == PM_SUSPEND_TO_IDLE && s2idle_ops && s2idle_ops->begin)
+ return s2idle_ops->begin();
else if (suspend_ops && suspend_ops->begin)
return suspend_ops->begin(state);
else
@@ -267,21 +290,21 @@ static int platform_suspend_begin(suspend_state_t state)
static void platform_resume_end(suspend_state_t state)
{
- if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->end)
- freeze_ops->end();
+ if (state == PM_SUSPEND_TO_IDLE && s2idle_ops && s2idle_ops->end)
+ s2idle_ops->end();
else if (suspend_ops && suspend_ops->end)
suspend_ops->end();
}
static void platform_recover(suspend_state_t state)
{
- if (state != PM_SUSPEND_FREEZE && suspend_ops->recover)
+ if (state != PM_SUSPEND_TO_IDLE && suspend_ops->recover)
suspend_ops->recover();
}
static bool platform_suspend_again(suspend_state_t state)
{
- return state != PM_SUSPEND_FREEZE && suspend_ops->suspend_again ?
+ return state != PM_SUSPEND_TO_IDLE && suspend_ops->suspend_again ?
suspend_ops->suspend_again() : false;
}
@@ -370,16 +393,21 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
error = dpm_suspend_late(PMSG_SUSPEND);
if (error) {
- pr_err("PM: late suspend of devices failed\n");
+ pr_err("late suspend of devices failed\n");
goto Platform_finish;
}
error = platform_suspend_prepare_late(state);
if (error)
goto Devices_early_resume;
+ if (state == PM_SUSPEND_TO_IDLE && pm_test_level != TEST_PLATFORM) {
+ s2idle_loop();
+ goto Platform_early_resume;
+ }
+
error = dpm_suspend_noirq(PMSG_SUSPEND);
if (error) {
- pr_err("PM: noirq suspend of devices failed\n");
+ pr_err("noirq suspend of devices failed\n");
goto Platform_early_resume;
}
error = platform_suspend_prepare_noirq(state);
@@ -389,17 +417,6 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
if (suspend_test(TEST_PLATFORM))
goto Platform_wake;
- /*
- * PM_SUSPEND_FREEZE equals
- * frozen processes + suspended devices + idle processors.
- * Thus we should invoke freeze_enter() soon after
- * all the devices are suspended.
- */
- if (state == PM_SUSPEND_FREEZE) {
- s2idle_loop();
- goto Platform_early_resume;
- }
-
error = disable_nonboot_cpus();
if (error || suspend_test(TEST_CPUS))
goto Enable_cpus;
@@ -456,6 +473,8 @@ int suspend_devices_and_enter(suspend_state_t state)
if (!sleep_state_supported(state))
return -ENOSYS;
+ pm_suspend_target_state = state;
+
error = platform_suspend_begin(state);
if (error)
goto Close;
@@ -464,7 +483,7 @@ int suspend_devices_and_enter(suspend_state_t state)
suspend_test_start();
error = dpm_suspend_start(PMSG_SUSPEND);
if (error) {
- pr_err("PM: Some devices failed to suspend, or early wake event detected\n");
+ pr_err("Some devices failed to suspend, or early wake event detected\n");
goto Recover_platform;
}
suspend_test_finish("suspend devices");
@@ -485,6 +504,7 @@ int suspend_devices_and_enter(suspend_state_t state)
Close:
platform_resume_end(state);
+ pm_suspend_target_state = PM_SUSPEND_ON;
return error;
Recover_platform:
@@ -518,10 +538,10 @@ static int enter_state(suspend_state_t state)
int error;
trace_suspend_resume(TPS("suspend_enter"), state, true);
- if (state == PM_SUSPEND_FREEZE) {
+ if (state == PM_SUSPEND_TO_IDLE) {
#ifdef CONFIG_PM_DEBUG
if (pm_test_level != TEST_NONE && pm_test_level <= TEST_CPUS) {
- pr_warn("PM: Unsupported test mode for suspend to idle, please choose none/freezer/devices/platform.\n");
+ pr_warn("Unsupported test mode for suspend to idle, please choose none/freezer/devices/platform.\n");
return -EAGAIN;
}
#endif
@@ -531,18 +551,18 @@ static int enter_state(suspend_state_t state)
if (!mutex_trylock(&pm_mutex))
return -EBUSY;
- if (state == PM_SUSPEND_FREEZE)
- freeze_begin();
+ if (state == PM_SUSPEND_TO_IDLE)
+ s2idle_begin();
#ifndef CONFIG_SUSPEND_SKIP_SYNC
trace_suspend_resume(TPS("sync_filesystems"), 0, true);
- pr_info("PM: Syncing filesystems ... ");
+ pr_info("Syncing filesystems ... ");
sys_sync();
pr_cont("done.\n");
trace_suspend_resume(TPS("sync_filesystems"), 0, false);
#endif
- pr_debug("PM: Preparing system for sleep (%s)\n", pm_states[state]);
+ pm_pr_dbg("Preparing system for sleep (%s)\n", mem_sleep_labels[state]);
pm_suspend_clear_flags();
error = suspend_prepare(state);
if (error)
@@ -552,13 +572,13 @@ static int enter_state(suspend_state_t state)
goto Finish;
trace_suspend_resume(TPS("suspend_enter"), state, false);
- pr_debug("PM: Suspending system (%s)\n", pm_states[state]);
+ pm_pr_dbg("Suspending system (%s)\n", mem_sleep_labels[state]);
pm_restrict_gfp_mask();
error = suspend_devices_and_enter(state);
pm_restore_gfp_mask();
Finish:
- pr_debug("PM: Finishing wakeup.\n");
+ pm_pr_dbg("Finishing wakeup.\n");
suspend_finish();
Unlock:
mutex_unlock(&pm_mutex);
@@ -579,6 +599,7 @@ int pm_suspend(suspend_state_t state)
if (state <= PM_SUSPEND_ON || state >= PM_SUSPEND_MAX)
return -EINVAL;
+ pr_info("suspend entry (%s)\n", mem_sleep_labels[state]);
error = enter_state(state);
if (error) {
suspend_stats.fail++;
@@ -586,6 +607,7 @@ int pm_suspend(suspend_state_t state)
} else {
suspend_stats.success++;
}
+ pr_info("suspend exit\n");
return error;
}
EXPORT_SYMBOL(pm_suspend);
diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c
index 5db2170..6a897e8 100644
--- a/kernel/power/suspend_test.c
+++ b/kernel/power/suspend_test.c
@@ -104,9 +104,9 @@ repeat:
printk(info_test, pm_states[state]);
status = pm_suspend(state);
if (status < 0)
- state = PM_SUSPEND_FREEZE;
+ state = PM_SUSPEND_TO_IDLE;
}
- if (state == PM_SUSPEND_FREEZE) {
+ if (state == PM_SUSPEND_TO_IDLE) {
printk(info_test, pm_states[state]);
status = pm_suspend(state);
}
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 57d2257..d7cdc42 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -242,8 +242,7 @@ static void hib_end_io(struct bio *bio)
if (bio->bi_status) {
printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n",
- imajor(bio->bi_bdev->bd_inode),
- iminor(bio->bi_bdev->bd_inode),
+ MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)),
(unsigned long long)bio->bi_iter.bi_sector);
}
@@ -270,7 +269,7 @@ static int hib_submit_io(int op, int op_flags, pgoff_t page_off, void *addr,
bio = bio_alloc(__GFP_RECLAIM | __GFP_HIGH, 1);
bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9);
- bio->bi_bdev = hib_resume_bdev;
+ bio_set_dev(bio, hib_resume_bdev);
bio_set_op_attrs(bio, op, op_flags);
if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
index be90c94..9210379 100644
--- a/kernel/rcu/Kconfig
+++ b/kernel/rcu/Kconfig
@@ -69,8 +69,7 @@ config TREE_SRCU
This option selects the full-fledged version of SRCU.
config TASKS_RCU
- bool
- default n
+ def_bool PREEMPT
select SRCU
help
This option enables a task-based RCU implementation that uses
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 808b8c8..e4b43fe 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -356,22 +356,10 @@ do { \
#ifdef CONFIG_TINY_RCU
/* Tiny RCU doesn't expedite, as its purpose in life is instead to be tiny. */
-static inline bool rcu_gp_is_normal(void) /* Internal RCU use. */
-{
- return true;
-}
-static inline bool rcu_gp_is_expedited(void) /* Internal RCU use. */
-{
- return false;
-}
-
-static inline void rcu_expedite_gp(void)
-{
-}
-
-static inline void rcu_unexpedite_gp(void)
-{
-}
+static inline bool rcu_gp_is_normal(void) { return true; }
+static inline bool rcu_gp_is_expedited(void) { return false; }
+static inline void rcu_expedite_gp(void) { }
+static inline void rcu_unexpedite_gp(void) { }
#else /* #ifdef CONFIG_TINY_RCU */
bool rcu_gp_is_normal(void); /* Internal RCU use. */
bool rcu_gp_is_expedited(void); /* Internal RCU use. */
@@ -419,12 +407,8 @@ static inline void rcutorture_get_gp_data(enum rcutorture_type test_type,
*gpnum = 0;
*completed = 0;
}
-static inline void rcutorture_record_test_transition(void)
-{
-}
-static inline void rcutorture_record_progress(unsigned long vernum)
-{
-}
+static inline void rcutorture_record_test_transition(void) { }
+static inline void rcutorture_record_progress(unsigned long vernum) { }
#ifdef CONFIG_RCU_TRACE
void do_trace_rcu_torture_read(const char *rcutorturename,
struct rcu_head *rhp,
@@ -460,92 +444,20 @@ void srcutorture_get_gp_data(enum rcutorture_type test_type,
#endif
#ifdef CONFIG_TINY_RCU
-
-/*
- * Return the number of grace periods started.
- */
-static inline unsigned long rcu_batches_started(void)
-{
- return 0;
-}
-
-/*
- * Return the number of bottom-half grace periods started.
- */
-static inline unsigned long rcu_batches_started_bh(void)
-{
- return 0;
-}
-
-/*
- * Return the number of sched grace periods started.
- */
-static inline unsigned long rcu_batches_started_sched(void)
-{
- return 0;
-}
-
-/*
- * Return the number of grace periods completed.
- */
-static inline unsigned long rcu_batches_completed(void)
-{
- return 0;
-}
-
-/*
- * Return the number of bottom-half grace periods completed.
- */
-static inline unsigned long rcu_batches_completed_bh(void)
-{
- return 0;
-}
-
-/*
- * Return the number of sched grace periods completed.
- */
-static inline unsigned long rcu_batches_completed_sched(void)
-{
- return 0;
-}
-
-/*
- * Return the number of expedited grace periods completed.
- */
-static inline unsigned long rcu_exp_batches_completed(void)
-{
- return 0;
-}
-
-/*
- * Return the number of expedited sched grace periods completed.
- */
-static inline unsigned long rcu_exp_batches_completed_sched(void)
-{
- return 0;
-}
-
-static inline unsigned long srcu_batches_completed(struct srcu_struct *sp)
-{
- return 0;
-}
-
-static inline void rcu_force_quiescent_state(void)
-{
-}
-
-static inline void rcu_bh_force_quiescent_state(void)
-{
-}
-
-static inline void rcu_sched_force_quiescent_state(void)
-{
-}
-
-static inline void show_rcu_gp_kthreads(void)
-{
-}
-
+static inline unsigned long rcu_batches_started(void) { return 0; }
+static inline unsigned long rcu_batches_started_bh(void) { return 0; }
+static inline unsigned long rcu_batches_started_sched(void) { return 0; }
+static inline unsigned long rcu_batches_completed(void) { return 0; }
+static inline unsigned long rcu_batches_completed_bh(void) { return 0; }
+static inline unsigned long rcu_batches_completed_sched(void) { return 0; }
+static inline unsigned long rcu_exp_batches_completed(void) { return 0; }
+static inline unsigned long rcu_exp_batches_completed_sched(void) { return 0; }
+static inline unsigned long
+srcu_batches_completed(struct srcu_struct *sp) { return 0; }
+static inline void rcu_force_quiescent_state(void) { }
+static inline void rcu_bh_force_quiescent_state(void) { }
+static inline void rcu_sched_force_quiescent_state(void) { }
+static inline void show_rcu_gp_kthreads(void) { }
#else /* #ifdef CONFIG_TINY_RCU */
extern unsigned long rcutorture_testseq;
extern unsigned long rcutorture_vernum;
diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
index 2b62a38..7649fcd 100644
--- a/kernel/rcu/rcu_segcblist.c
+++ b/kernel/rcu/rcu_segcblist.c
@@ -36,24 +36,6 @@ void rcu_cblist_init(struct rcu_cblist *rclp)
}
/*
- * Debug function to actually count the number of callbacks.
- * If the number exceeds the limit specified, return -1.
- */
-long rcu_cblist_count_cbs(struct rcu_cblist *rclp, long lim)
-{
- int cnt = 0;
- struct rcu_head **rhpp = &rclp->head;
-
- for (;;) {
- if (!*rhpp)
- return cnt;
- if (++cnt > lim)
- return -1;
- rhpp = &(*rhpp)->next;
- }
-}
-
-/*
* Dequeue the oldest rcu_head structure from the specified callback
* list. This function assumes that the callback is non-lazy, but
* the caller can later invoke rcu_cblist_dequeued_lazy() if it
@@ -103,17 +85,6 @@ void rcu_segcblist_disable(struct rcu_segcblist *rsclp)
}
/*
- * Is the specified segment of the specified rcu_segcblist structure
- * empty of callbacks?
- */
-bool rcu_segcblist_segempty(struct rcu_segcblist *rsclp, int seg)
-{
- if (seg == RCU_DONE_TAIL)
- return &rsclp->head == rsclp->tails[RCU_DONE_TAIL];
- return rsclp->tails[seg - 1] == rsclp->tails[seg];
-}
-
-/*
* Does the specified rcu_segcblist structure contain callbacks that
* are ready to be invoked?
*/
@@ -134,50 +105,6 @@ bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp)
}
/*
- * Dequeue and return the first ready-to-invoke callback. If there
- * are no ready-to-invoke callbacks, return NULL. Disables interrupts
- * to avoid interference. Does not protect from interference from other
- * CPUs or tasks.
- */
-struct rcu_head *rcu_segcblist_dequeue(struct rcu_segcblist *rsclp)
-{
- unsigned long flags;
- int i;
- struct rcu_head *rhp;
-
- local_irq_save(flags);
- if (!rcu_segcblist_ready_cbs(rsclp)) {
- local_irq_restore(flags);
- return NULL;
- }
- rhp = rsclp->head;
- BUG_ON(!rhp);
- rsclp->head = rhp->next;
- for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++) {
- if (rsclp->tails[i] != &rhp->next)
- break;
- rsclp->tails[i] = &rsclp->head;
- }
- smp_mb(); /* Dequeue before decrement for rcu_barrier(). */
- WRITE_ONCE(rsclp->len, rsclp->len - 1);
- local_irq_restore(flags);
- return rhp;
-}
-
-/*
- * Account for the fact that a previously dequeued callback turned out
- * to be marked as lazy.
- */
-void rcu_segcblist_dequeued_lazy(struct rcu_segcblist *rsclp)
-{
- unsigned long flags;
-
- local_irq_save(flags);
- rsclp->len_lazy--;
- local_irq_restore(flags);
-}
-
-/*
* Return a pointer to the first callback in the specified rcu_segcblist
* structure. This is useful for diagnostics.
*/
@@ -203,17 +130,6 @@ struct rcu_head *rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp)
}
/*
- * Does the specified rcu_segcblist structure contain callbacks that
- * have not yet been processed beyond having been posted, that is,
- * does it contain callbacks in its last segment?
- */
-bool rcu_segcblist_new_cbs(struct rcu_segcblist *rsclp)
-{
- return rcu_segcblist_is_enabled(rsclp) &&
- !rcu_segcblist_restempty(rsclp, RCU_NEXT_READY_TAIL);
-}
-
-/*
* Enqueue the specified callback onto the specified rcu_segcblist
* structure, updating accounting as needed. Note that the ->len
* field may be accessed locklessly, hence the WRITE_ONCE().
@@ -503,3 +419,27 @@ bool rcu_segcblist_future_gp_needed(struct rcu_segcblist *rsclp,
return true;
return false;
}
+
+/*
+ * Merge the source rcu_segcblist structure into the destination
+ * rcu_segcblist structure, then initialize the source. Any pending
+ * callbacks from the source get to start over. It is best to
+ * advance and accelerate both the destination and the source
+ * before merging.
+ */
+void rcu_segcblist_merge(struct rcu_segcblist *dst_rsclp,
+ struct rcu_segcblist *src_rsclp)
+{
+ struct rcu_cblist donecbs;
+ struct rcu_cblist pendcbs;
+
+ rcu_cblist_init(&donecbs);
+ rcu_cblist_init(&pendcbs);
+ rcu_segcblist_extract_count(src_rsclp, &donecbs);
+ rcu_segcblist_extract_done_cbs(src_rsclp, &donecbs);
+ rcu_segcblist_extract_pend_cbs(src_rsclp, &pendcbs);
+ rcu_segcblist_insert_count(dst_rsclp, &donecbs);
+ rcu_segcblist_insert_done_cbs(dst_rsclp, &donecbs);
+ rcu_segcblist_insert_pend_cbs(dst_rsclp, &pendcbs);
+ rcu_segcblist_init(src_rsclp);
+}
diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h
index 6e36e36..581c12b 100644
--- a/kernel/rcu/rcu_segcblist.h
+++ b/kernel/rcu/rcu_segcblist.h
@@ -31,29 +31,7 @@ static inline void rcu_cblist_dequeued_lazy(struct rcu_cblist *rclp)
rclp->len_lazy--;
}
-/*
- * Interim function to return rcu_cblist head pointer. Longer term, the
- * rcu_cblist will be used more pervasively, removing the need for this
- * function.
- */
-static inline struct rcu_head *rcu_cblist_head(struct rcu_cblist *rclp)
-{
- return rclp->head;
-}
-
-/*
- * Interim function to return rcu_cblist head pointer. Longer term, the
- * rcu_cblist will be used more pervasively, removing the need for this
- * function.
- */
-static inline struct rcu_head **rcu_cblist_tail(struct rcu_cblist *rclp)
-{
- WARN_ON_ONCE(!rclp->head);
- return rclp->tail;
-}
-
void rcu_cblist_init(struct rcu_cblist *rclp);
-long rcu_cblist_count_cbs(struct rcu_cblist *rclp, long lim);
struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp);
/*
@@ -134,14 +112,10 @@ static inline struct rcu_head **rcu_segcblist_tail(struct rcu_segcblist *rsclp)
void rcu_segcblist_init(struct rcu_segcblist *rsclp);
void rcu_segcblist_disable(struct rcu_segcblist *rsclp);
-bool rcu_segcblist_segempty(struct rcu_segcblist *rsclp, int seg);
bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp);
bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp);
-struct rcu_head *rcu_segcblist_dequeue(struct rcu_segcblist *rsclp);
-void rcu_segcblist_dequeued_lazy(struct rcu_segcblist *rsclp);
struct rcu_head *rcu_segcblist_first_cb(struct rcu_segcblist *rsclp);
struct rcu_head *rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp);
-bool rcu_segcblist_new_cbs(struct rcu_segcblist *rsclp);
void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp,
struct rcu_head *rhp, bool lazy);
bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp,
@@ -162,3 +136,5 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq);
bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq);
bool rcu_segcblist_future_gp_needed(struct rcu_segcblist *rsclp,
unsigned long seq);
+void rcu_segcblist_merge(struct rcu_segcblist *dst_rsclp,
+ struct rcu_segcblist *src_rsclp);
diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
index 3cc1811..1f87a02 100644
--- a/kernel/rcu/rcuperf.c
+++ b/kernel/rcu/rcuperf.c
@@ -317,8 +317,6 @@ static struct rcu_perf_ops sched_ops = {
.name = "sched"
};
-#ifdef CONFIG_TASKS_RCU
-
/*
* Definitions for RCU-tasks perf testing.
*/
@@ -346,24 +344,11 @@ static struct rcu_perf_ops tasks_ops = {
.name = "tasks"
};
-#define RCUPERF_TASKS_OPS &tasks_ops,
-
static bool __maybe_unused torturing_tasks(void)
{
return cur_ops == &tasks_ops;
}
-#else /* #ifdef CONFIG_TASKS_RCU */
-
-#define RCUPERF_TASKS_OPS
-
-static bool __maybe_unused torturing_tasks(void)
-{
- return false;
-}
-
-#endif /* #else #ifdef CONFIG_TASKS_RCU */
-
/*
* If performance tests complete, wait for shutdown to commence.
*/
@@ -658,7 +643,7 @@ rcu_perf_init(void)
int firsterr = 0;
static struct rcu_perf_ops *perf_ops[] = {
&rcu_ops, &rcu_bh_ops, &srcu_ops, &srcud_ops, &sched_ops,
- RCUPERF_TASKS_OPS
+ &tasks_ops,
};
if (!torture_init_begin(perf_type, verbose, &perf_runnable))
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index b8f7f8c..45f2ffbc 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -199,7 +199,8 @@ MODULE_PARM_DESC(torture_runnable, "Start rcutorture at boot");
static u64 notrace rcu_trace_clock_local(void)
{
u64 ts = trace_clock_local();
- unsigned long __maybe_unused ts_rem = do_div(ts, NSEC_PER_USEC);
+
+ (void)do_div(ts, NSEC_PER_USEC);
return ts;
}
#else /* #ifdef CONFIG_RCU_TRACE */
@@ -496,7 +497,7 @@ static struct rcu_torture_ops rcu_busted_ops = {
.fqs = NULL,
.stats = NULL,
.irq_capable = 1,
- .name = "rcu_busted"
+ .name = "busted"
};
/*
@@ -522,7 +523,7 @@ static void srcu_read_delay(struct torture_random_state *rrsp)
delay = torture_random(rrsp) %
(nrealreaders * 2 * longdelay * uspertick);
- if (!delay)
+ if (!delay && in_task())
schedule_timeout_interruptible(longdelay);
else
rcu_read_delay(rrsp);
@@ -561,44 +562,7 @@ static void srcu_torture_barrier(void)
static void srcu_torture_stats(void)
{
- int __maybe_unused cpu;
- int idx;
-
-#ifdef CONFIG_TREE_SRCU
- idx = srcu_ctlp->srcu_idx & 0x1;
- pr_alert("%s%s Tree SRCU per-CPU(idx=%d):",
- torture_type, TORTURE_FLAG, idx);
- for_each_possible_cpu(cpu) {
- unsigned long l0, l1;
- unsigned long u0, u1;
- long c0, c1;
- struct srcu_data *counts;
-
- counts = per_cpu_ptr(srcu_ctlp->sda, cpu);
- u0 = counts->srcu_unlock_count[!idx];
- u1 = counts->srcu_unlock_count[idx];
-
- /*
- * Make sure that a lock is always counted if the corresponding
- * unlock is counted.
- */
- smp_rmb();
-
- l0 = counts->srcu_lock_count[!idx];
- l1 = counts->srcu_lock_count[idx];
-
- c0 = l0 - u0;
- c1 = l1 - u1;
- pr_cont(" %d(%ld,%ld)", cpu, c0, c1);
- }
- pr_cont("\n");
-#elif defined(CONFIG_TINY_SRCU)
- idx = READ_ONCE(srcu_ctlp->srcu_idx) & 0x1;
- pr_alert("%s%s Tiny SRCU per-CPU(idx=%d): (%hd,%hd)\n",
- torture_type, TORTURE_FLAG, idx,
- READ_ONCE(srcu_ctlp->srcu_lock_nesting[!idx]),
- READ_ONCE(srcu_ctlp->srcu_lock_nesting[idx]));
-#endif
+ srcu_torture_stats_print(srcu_ctlp, torture_type, TORTURE_FLAG);
}
static void srcu_torture_synchronize_expedited(void)
@@ -620,6 +584,7 @@ static struct rcu_torture_ops srcu_ops = {
.call = srcu_torture_call,
.cb_barrier = srcu_torture_barrier,
.stats = srcu_torture_stats,
+ .irq_capable = 1,
.name = "srcu"
};
@@ -652,6 +617,7 @@ static struct rcu_torture_ops srcud_ops = {
.call = srcu_torture_call,
.cb_barrier = srcu_torture_barrier,
.stats = srcu_torture_stats,
+ .irq_capable = 1,
.name = "srcud"
};
@@ -696,8 +662,6 @@ static struct rcu_torture_ops sched_ops = {
.name = "sched"
};
-#ifdef CONFIG_TASKS_RCU
-
/*
* Definitions for RCU-tasks torture testing.
*/
@@ -735,24 +699,11 @@ static struct rcu_torture_ops tasks_ops = {
.name = "tasks"
};
-#define RCUTORTURE_TASKS_OPS &tasks_ops,
-
static bool __maybe_unused torturing_tasks(void)
{
return cur_ops == &tasks_ops;
}
-#else /* #ifdef CONFIG_TASKS_RCU */
-
-#define RCUTORTURE_TASKS_OPS
-
-static bool __maybe_unused torturing_tasks(void)
-{
- return false;
-}
-
-#endif /* #else #ifdef CONFIG_TASKS_RCU */
-
/*
* RCU torture priority-boost testing. Runs one real-time thread per
* CPU for moderate bursts, repeatedly registering RCU callbacks and
@@ -1114,6 +1065,11 @@ rcu_torture_fakewriter(void *arg)
return 0;
}
+static void rcu_torture_timer_cb(struct rcu_head *rhp)
+{
+ kfree(rhp);
+}
+
/*
* RCU torture reader from timer handler. Dereferences rcu_torture_current,
* incrementing the corresponding element of the pipeline array. The
@@ -1176,6 +1132,14 @@ static void rcu_torture_timer(unsigned long unused)
__this_cpu_inc(rcu_torture_batch[completed]);
preempt_enable();
cur_ops->readunlock(idx);
+
+ /* Test call_rcu() invocation from interrupt handler. */
+ if (cur_ops->call) {
+ struct rcu_head *rhp = kmalloc(sizeof(*rhp), GFP_NOWAIT);
+
+ if (rhp)
+ cur_ops->call(rhp, rcu_torture_timer_cb);
+ }
}
/*
@@ -1354,11 +1318,12 @@ rcu_torture_stats_print(void)
srcutorture_get_gp_data(cur_ops->ttype, srcu_ctlp,
&flags, &gpnum, &completed);
wtp = READ_ONCE(writer_task);
- pr_alert("??? Writer stall state %s(%d) g%lu c%lu f%#x ->state %#lx\n",
+ pr_alert("??? Writer stall state %s(%d) g%lu c%lu f%#x ->state %#lx cpu %d\n",
rcu_torture_writer_state_getname(),
rcu_torture_writer_state,
gpnum, completed, flags,
- wtp == NULL ? ~0UL : wtp->state);
+ wtp == NULL ? ~0UL : wtp->state,
+ wtp == NULL ? -1 : (int)task_cpu(wtp));
show_rcu_gp_kthreads();
rcu_ftrace_dump(DUMP_ALL);
}
@@ -1749,7 +1714,7 @@ rcu_torture_init(void)
int firsterr = 0;
static struct rcu_torture_ops *torture_ops[] = {
&rcu_ops, &rcu_bh_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops,
- &sched_ops, RCUTORTURE_TASKS_OPS
+ &sched_ops, &tasks_ops,
};
if (!torture_init_begin(torture_type, verbose, &torture_runnable))
diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c
index 1a1c104..76ac5f5 100644
--- a/kernel/rcu/srcutiny.c
+++ b/kernel/rcu/srcutiny.c
@@ -33,6 +33,8 @@
#include "rcu_segcblist.h"
#include "rcu.h"
+int rcu_scheduler_active __read_mostly;
+
static int init_srcu_struct_fields(struct srcu_struct *sp)
{
sp->srcu_lock_nesting[0] = 0;
@@ -193,3 +195,9 @@ void synchronize_srcu(struct srcu_struct *sp)
destroy_rcu_head_on_stack(&rs.head);
}
EXPORT_SYMBOL_GPL(synchronize_srcu);
+
+/* Lockdep diagnostics. */
+void __init rcu_scheduler_starting(void)
+{
+ rcu_scheduler_active = RCU_SCHEDULER_RUNNING;
+}
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index d0ca524..729a870 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -51,6 +51,7 @@ module_param(counter_wrap_check, ulong, 0444);
static void srcu_invoke_callbacks(struct work_struct *work);
static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay);
+static void process_srcu(struct work_struct *work);
/*
* Initialize SRCU combining tree. Note that statically allocated
@@ -896,6 +897,15 @@ static void __synchronize_srcu(struct srcu_struct *sp, bool do_norm)
__call_srcu(sp, &rcu.head, wakeme_after_rcu, do_norm);
wait_for_completion(&rcu.completion);
destroy_rcu_head_on_stack(&rcu.head);
+
+ /*
+ * Make sure that later code is ordered after the SRCU grace
+ * period. This pairs with the raw_spin_lock_irq_rcu_node()
+ * in srcu_invoke_callbacks(). Unlike Tree RCU, this is needed
+ * because the current CPU might have been totally uninvolved with
+ * (and thus unordered against) that grace period.
+ */
+ smp_mb();
}
/**
@@ -1194,7 +1204,7 @@ static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay)
/*
* This is the work-queue function that handles SRCU grace periods.
*/
-void process_srcu(struct work_struct *work)
+static void process_srcu(struct work_struct *work)
{
struct srcu_struct *sp;
@@ -1203,7 +1213,6 @@ void process_srcu(struct work_struct *work)
srcu_advance_state(sp);
srcu_reschedule(sp, srcu_get_delay(sp));
}
-EXPORT_SYMBOL_GPL(process_srcu);
void srcutorture_get_gp_data(enum rcutorture_type test_type,
struct srcu_struct *sp, int *flags,
@@ -1217,6 +1226,43 @@ void srcutorture_get_gp_data(enum rcutorture_type test_type,
}
EXPORT_SYMBOL_GPL(srcutorture_get_gp_data);
+void srcu_torture_stats_print(struct srcu_struct *sp, char *tt, char *tf)
+{
+ int cpu;
+ int idx;
+ unsigned long s0 = 0, s1 = 0;
+
+ idx = sp->srcu_idx & 0x1;
+ pr_alert("%s%s Tree SRCU per-CPU(idx=%d):", tt, tf, idx);
+ for_each_possible_cpu(cpu) {
+ unsigned long l0, l1;
+ unsigned long u0, u1;
+ long c0, c1;
+ struct srcu_data *counts;
+
+ counts = per_cpu_ptr(sp->sda, cpu);
+ u0 = counts->srcu_unlock_count[!idx];
+ u1 = counts->srcu_unlock_count[idx];
+
+ /*
+ * Make sure that a lock is always counted if the corresponding
+ * unlock is counted.
+ */
+ smp_rmb();
+
+ l0 = counts->srcu_lock_count[!idx];
+ l1 = counts->srcu_lock_count[idx];
+
+ c0 = l0 - u0;
+ c1 = l1 - u1;
+ pr_cont(" %d(%ld,%ld)", cpu, c0, c1);
+ s0 += c0;
+ s1 += c1;
+ }
+ pr_cont(" T(%ld,%ld)\n", s0, s1);
+}
+EXPORT_SYMBOL_GPL(srcu_torture_stats_print);
+
static int __init srcu_bootup_announce(void)
{
pr_info("Hierarchical SRCU implementation.\n");
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index f848896..a64eee0 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -56,8 +56,6 @@ static struct rcu_ctrlblk rcu_bh_ctrlblk = {
.curtail = &rcu_bh_ctrlblk.rcucblist,
};
-#include "tiny_plugin.h"
-
void rcu_barrier_bh(void)
{
wait_rcu_gp(call_rcu_bh);
diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h
deleted file mode 100644
index f0a01b2..0000000
--- a/kernel/rcu/tiny_plugin.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * Read-Copy Update mechanism for mutual exclusion, the Bloatwatch edition
- * Internal non-public definitions that provide either classic
- * or preemptible semantics.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, you can access it online at
- * http://www.gnu.org/licenses/gpl-2.0.html.
- *
- * Copyright (c) 2010 Linaro
- *
- * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
- */
-
-#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU)
-#include <linux/kernel_stat.h>
-
-int rcu_scheduler_active __read_mostly;
-EXPORT_SYMBOL_GPL(rcu_scheduler_active);
-
-/*
- * During boot, we forgive RCU lockdep issues. After this function is
- * invoked, we start taking RCU lockdep issues seriously. Note that unlike
- * Tree RCU, Tiny RCU transitions directly from RCU_SCHEDULER_INACTIVE
- * to RCU_SCHEDULER_RUNNING, skipping the RCU_SCHEDULER_INIT stage.
- * The reason for this is that Tiny RCU does not need kthreads, so does
- * not have to care about the fact that the scheduler is half-initialized
- * at a certain phase of the boot process. Unless SRCU is in the mix.
- */
-void __init rcu_scheduler_starting(void)
-{
- WARN_ON(nr_context_switches() > 0);
- rcu_scheduler_active = IS_ENABLED(CONFIG_SRCU)
- ? RCU_SCHEDULER_INIT : RCU_SCHEDULER_RUNNING;
-}
-
-#endif /* #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) */
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 51d4c3a..84fe966 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -97,9 +97,6 @@ struct rcu_state sname##_state = { \
.gp_state = RCU_GP_IDLE, \
.gpnum = 0UL - 300UL, \
.completed = 0UL - 300UL, \
- .orphan_lock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.orphan_lock), \
- .orphan_pend = RCU_CBLIST_INITIALIZER(sname##_state.orphan_pend), \
- .orphan_done = RCU_CBLIST_INITIALIZER(sname##_state.orphan_done), \
.barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
.name = RCU_STATE_NAME(sname), \
.abbr = sabbr, \
@@ -843,13 +840,9 @@ static void rcu_eqs_enter(bool user)
*/
void rcu_idle_enter(void)
{
- unsigned long flags;
-
- local_irq_save(flags);
+ RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_idle_enter() invoked with irqs enabled!!!");
rcu_eqs_enter(false);
- local_irq_restore(flags);
}
-EXPORT_SYMBOL_GPL(rcu_idle_enter);
#ifdef CONFIG_NO_HZ_FULL
/**
@@ -862,7 +855,8 @@ EXPORT_SYMBOL_GPL(rcu_idle_enter);
*/
void rcu_user_enter(void)
{
- rcu_eqs_enter(1);
+ RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_user_enter() invoked with irqs enabled!!!");
+ rcu_eqs_enter(true);
}
#endif /* CONFIG_NO_HZ_FULL */
@@ -955,8 +949,10 @@ static void rcu_eqs_exit(bool user)
if (oldval & DYNTICK_TASK_NEST_MASK) {
rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE;
} else {
+ __this_cpu_inc(disable_rcu_irq_enter);
rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
rcu_eqs_exit_common(oldval, user);
+ __this_cpu_dec(disable_rcu_irq_enter);
}
}
@@ -979,7 +975,6 @@ void rcu_idle_exit(void)
rcu_eqs_exit(false);
local_irq_restore(flags);
}
-EXPORT_SYMBOL_GPL(rcu_idle_exit);
#ifdef CONFIG_NO_HZ_FULL
/**
@@ -1358,12 +1353,13 @@ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp)
j = jiffies;
gpa = READ_ONCE(rsp->gp_activity);
if (j - gpa > 2 * HZ) {
- pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x %s(%d) ->state=%#lx\n",
+ pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x %s(%d) ->state=%#lx ->cpu=%d\n",
rsp->name, j - gpa,
rsp->gpnum, rsp->completed,
rsp->gp_flags,
gp_state_getname(rsp->gp_state), rsp->gp_state,
- rsp->gp_kthread ? rsp->gp_kthread->state : ~0);
+ rsp->gp_kthread ? rsp->gp_kthread->state : ~0,
+ rsp->gp_kthread ? task_cpu(rsp->gp_kthread) : -1);
if (rsp->gp_kthread) {
sched_show_task(rsp->gp_kthread);
wake_up_process(rsp->gp_kthread);
@@ -2067,8 +2063,8 @@ static bool rcu_gp_init(struct rcu_state *rsp)
}
/*
- * Helper function for wait_event_interruptible_timeout() wakeup
- * at force-quiescent-state time.
+ * Helper function for swait_event_idle() wakeup at force-quiescent-state
+ * time.
*/
static bool rcu_gp_fqs_check_wake(struct rcu_state *rsp, int *gfp)
{
@@ -2206,9 +2202,8 @@ static int __noreturn rcu_gp_kthread(void *arg)
READ_ONCE(rsp->gpnum),
TPS("reqwait"));
rsp->gp_state = RCU_GP_WAIT_GPS;
- swait_event_interruptible(rsp->gp_wq,
- READ_ONCE(rsp->gp_flags) &
- RCU_GP_FLAG_INIT);
+ swait_event_idle(rsp->gp_wq, READ_ONCE(rsp->gp_flags) &
+ RCU_GP_FLAG_INIT);
rsp->gp_state = RCU_GP_DONE_GPS;
/* Locking provides needed memory barrier. */
if (rcu_gp_init(rsp))
@@ -2239,7 +2234,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
READ_ONCE(rsp->gpnum),
TPS("fqswait"));
rsp->gp_state = RCU_GP_WAIT_FQS;
- ret = swait_event_interruptible_timeout(rsp->gp_wq,
+ ret = swait_event_idle_timeout(rsp->gp_wq,
rcu_gp_fqs_check_wake(rsp, &gf), j);
rsp->gp_state = RCU_GP_DOING_FQS;
/* Locking provides needed memory barriers. */
@@ -2409,6 +2404,8 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
return;
}
WARN_ON_ONCE(oldmask); /* Any child must be all zeroed! */
+ WARN_ON_ONCE(rnp->level != rcu_num_lvls - 1 &&
+ rcu_preempt_blocked_readers_cgp(rnp));
rnp->qsmask &= ~mask;
trace_rcu_quiescent_state_report(rsp->name, rnp->gpnum,
mask, rnp->qsmask, rnp->level,
@@ -2563,85 +2560,6 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
}
/*
- * Send the specified CPU's RCU callbacks to the orphanage. The
- * specified CPU must be offline, and the caller must hold the
- * ->orphan_lock.
- */
-static void
-rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
- struct rcu_node *rnp, struct rcu_data *rdp)
-{
- lockdep_assert_held(&rsp->orphan_lock);
-
- /* No-CBs CPUs do not have orphanable callbacks. */
- if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || rcu_is_nocb_cpu(rdp->cpu))
- return;
-
- /*
- * Orphan the callbacks. First adjust the counts. This is safe
- * because _rcu_barrier() excludes CPU-hotplug operations, so it
- * cannot be running now. Thus no memory barrier is required.
- */
- rdp->n_cbs_orphaned += rcu_segcblist_n_cbs(&rdp->cblist);
- rcu_segcblist_extract_count(&rdp->cblist, &rsp->orphan_done);
-
- /*
- * Next, move those callbacks still needing a grace period to
- * the orphanage, where some other CPU will pick them up.
- * Some of the callbacks might have gone partway through a grace
- * period, but that is too bad. They get to start over because we
- * cannot assume that grace periods are synchronized across CPUs.
- */
- rcu_segcblist_extract_pend_cbs(&rdp->cblist, &rsp->orphan_pend);
-
- /*
- * Then move the ready-to-invoke callbacks to the orphanage,
- * where some other CPU will pick them up. These will not be
- * required to pass though another grace period: They are done.
- */
- rcu_segcblist_extract_done_cbs(&rdp->cblist, &rsp->orphan_done);
-
- /* Finally, disallow further callbacks on this CPU. */
- rcu_segcblist_disable(&rdp->cblist);
-}
-
-/*
- * Adopt the RCU callbacks from the specified rcu_state structure's
- * orphanage. The caller must hold the ->orphan_lock.
- */
-static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags)
-{
- struct rcu_data *rdp = raw_cpu_ptr(rsp->rda);
-
- lockdep_assert_held(&rsp->orphan_lock);
-
- /* No-CBs CPUs are handled specially. */
- if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) ||
- rcu_nocb_adopt_orphan_cbs(rsp, rdp, flags))
- return;
-
- /* Do the accounting first. */
- rdp->n_cbs_adopted += rsp->orphan_done.len;
- if (rsp->orphan_done.len_lazy != rsp->orphan_done.len)
- rcu_idle_count_callbacks_posted();
- rcu_segcblist_insert_count(&rdp->cblist, &rsp->orphan_done);
-
- /*
- * We do not need a memory barrier here because the only way we
- * can get here if there is an rcu_barrier() in flight is if
- * we are the task doing the rcu_barrier().
- */
-
- /* First adopt the ready-to-invoke callbacks, then the done ones. */
- rcu_segcblist_insert_done_cbs(&rdp->cblist, &rsp->orphan_done);
- WARN_ON_ONCE(rsp->orphan_done.head);
- rcu_segcblist_insert_pend_cbs(&rdp->cblist, &rsp->orphan_pend);
- WARN_ON_ONCE(rsp->orphan_pend.head);
- WARN_ON_ONCE(rcu_segcblist_empty(&rdp->cblist) !=
- !rcu_segcblist_n_cbs(&rdp->cblist));
-}
-
-/*
* Trace the fact that this CPU is going offline.
*/
static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
@@ -2704,14 +2622,12 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
/*
* The CPU has been completely removed, and some other CPU is reporting
- * this fact from process context. Do the remainder of the cleanup,
- * including orphaning the outgoing CPU's RCU callbacks, and also
- * adopting them. There can only be one CPU hotplug operation at a time,
- * so no other CPU can be attempting to update rcu_cpu_kthread_task.
+ * this fact from process context. Do the remainder of the cleanup.
+ * There can only be one CPU hotplug operation at a time, so no need for
+ * explicit locking.
*/
static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
{
- unsigned long flags;
struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */
@@ -2720,18 +2636,6 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
/* Adjust any no-longer-needed kthreads. */
rcu_boost_kthread_setaffinity(rnp, -1);
-
- /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */
- raw_spin_lock_irqsave(&rsp->orphan_lock, flags);
- rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp);
- rcu_adopt_orphan_cbs(rsp, flags);
- raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags);
-
- WARN_ONCE(rcu_segcblist_n_cbs(&rdp->cblist) != 0 ||
- !rcu_segcblist_empty(&rdp->cblist),
- "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, 1stCB=%p\n",
- cpu, rcu_segcblist_n_cbs(&rdp->cblist),
- rcu_segcblist_first_cb(&rdp->cblist));
}
/*
@@ -3569,10 +3473,11 @@ static void rcu_barrier_callback(struct rcu_head *rhp)
struct rcu_state *rsp = rdp->rsp;
if (atomic_dec_and_test(&rsp->barrier_cpu_count)) {
- _rcu_barrier_trace(rsp, "LastCB", -1, rsp->barrier_sequence);
+ _rcu_barrier_trace(rsp, TPS("LastCB"), -1,
+ rsp->barrier_sequence);
complete(&rsp->barrier_completion);
} else {
- _rcu_barrier_trace(rsp, "CB", -1, rsp->barrier_sequence);
+ _rcu_barrier_trace(rsp, TPS("CB"), -1, rsp->barrier_sequence);
}
}
@@ -3584,14 +3489,15 @@ static void rcu_barrier_func(void *type)
struct rcu_state *rsp = type;
struct rcu_data *rdp = raw_cpu_ptr(rsp->rda);
- _rcu_barrier_trace(rsp, "IRQ", -1, rsp->barrier_sequence);
+ _rcu_barrier_trace(rsp, TPS("IRQ"), -1, rsp->barrier_sequence);
rdp->barrier_head.func = rcu_barrier_callback;
debug_rcu_head_queue(&rdp->barrier_head);
if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head, 0)) {
atomic_inc(&rsp->barrier_cpu_count);
} else {
debug_rcu_head_unqueue(&rdp->barrier_head);
- _rcu_barrier_trace(rsp, "IRQNQ", -1, rsp->barrier_sequence);
+ _rcu_barrier_trace(rsp, TPS("IRQNQ"), -1,
+ rsp->barrier_sequence);
}
}
@@ -3605,14 +3511,15 @@ static void _rcu_barrier(struct rcu_state *rsp)
struct rcu_data *rdp;
unsigned long s = rcu_seq_snap(&rsp->barrier_sequence);
- _rcu_barrier_trace(rsp, "Begin", -1, s);
+ _rcu_barrier_trace(rsp, TPS("Begin"), -1, s);
/* Take mutex to serialize concurrent rcu_barrier() requests. */
mutex_lock(&rsp->barrier_mutex);
/* Did someone else do our work for us? */
if (rcu_seq_done(&rsp->barrier_sequence, s)) {
- _rcu_barrier_trace(rsp, "EarlyExit", -1, rsp->barrier_sequence);
+ _rcu_barrier_trace(rsp, TPS("EarlyExit"), -1,
+ rsp->barrier_sequence);
smp_mb(); /* caller's subsequent code after above check. */
mutex_unlock(&rsp->barrier_mutex);
return;
@@ -3620,7 +3527,7 @@ static void _rcu_barrier(struct rcu_state *rsp)
/* Mark the start of the barrier operation. */
rcu_seq_start(&rsp->barrier_sequence);
- _rcu_barrier_trace(rsp, "Inc1", -1, rsp->barrier_sequence);
+ _rcu_barrier_trace(rsp, TPS("Inc1"), -1, rsp->barrier_sequence);
/*
* Initialize the count to one rather than to zero in order to
@@ -3643,10 +3550,10 @@ static void _rcu_barrier(struct rcu_state *rsp)
rdp = per_cpu_ptr(rsp->rda, cpu);
if (rcu_is_nocb_cpu(cpu)) {
if (!rcu_nocb_cpu_needs_barrier(rsp, cpu)) {
- _rcu_barrier_trace(rsp, "OfflineNoCB", cpu,
+ _rcu_barrier_trace(rsp, TPS("OfflineNoCB"), cpu,
rsp->barrier_sequence);
} else {
- _rcu_barrier_trace(rsp, "OnlineNoCB", cpu,
+ _rcu_barrier_trace(rsp, TPS("OnlineNoCB"), cpu,
rsp->barrier_sequence);
smp_mb__before_atomic();
atomic_inc(&rsp->barrier_cpu_count);
@@ -3654,11 +3561,11 @@ static void _rcu_barrier(struct rcu_state *rsp)
rcu_barrier_callback, rsp, cpu, 0);
}
} else if (rcu_segcblist_n_cbs(&rdp->cblist)) {
- _rcu_barrier_trace(rsp, "OnlineQ", cpu,
+ _rcu_barrier_trace(rsp, TPS("OnlineQ"), cpu,
rsp->barrier_sequence);
smp_call_function_single(cpu, rcu_barrier_func, rsp, 1);
} else {
- _rcu_barrier_trace(rsp, "OnlineNQ", cpu,
+ _rcu_barrier_trace(rsp, TPS("OnlineNQ"), cpu,
rsp->barrier_sequence);
}
}
@@ -3675,7 +3582,7 @@ static void _rcu_barrier(struct rcu_state *rsp)
wait_for_completion(&rsp->barrier_completion);
/* Mark the end of the barrier operation. */
- _rcu_barrier_trace(rsp, "Inc2", -1, rsp->barrier_sequence);
+ _rcu_barrier_trace(rsp, TPS("Inc2"), -1, rsp->barrier_sequence);
rcu_seq_end(&rsp->barrier_sequence);
/* Other rcu_barrier() invocations can now safely proceed. */
@@ -3777,8 +3684,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
*/
rnp = rdp->mynode;
raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
- if (!rdp->beenonline)
- WRITE_ONCE(rsp->ncpus, READ_ONCE(rsp->ncpus) + 1);
rdp->beenonline = true; /* We have now been online. */
rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. */
rdp->completed = rnp->completed;
@@ -3882,6 +3787,8 @@ void rcu_cpu_starting(unsigned int cpu)
{
unsigned long flags;
unsigned long mask;
+ int nbits;
+ unsigned long oldmask;
struct rcu_data *rdp;
struct rcu_node *rnp;
struct rcu_state *rsp;
@@ -3892,9 +3799,15 @@ void rcu_cpu_starting(unsigned int cpu)
mask = rdp->grpmask;
raw_spin_lock_irqsave_rcu_node(rnp, flags);
rnp->qsmaskinitnext |= mask;
+ oldmask = rnp->expmaskinitnext;
rnp->expmaskinitnext |= mask;
+ oldmask ^= rnp->expmaskinitnext;
+ nbits = bitmap_weight(&oldmask, BITS_PER_LONG);
+ /* Allow lockless access for expedited grace periods. */
+ smp_store_release(&rsp->ncpus, rsp->ncpus + nbits); /* ^^^ */
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
+ smp_mb(); /* Ensure RCU read-side usage follows above initialization. */
}
#ifdef CONFIG_HOTPLUG_CPU
@@ -3937,6 +3850,50 @@ void rcu_report_dead(unsigned int cpu)
for_each_rcu_flavor(rsp)
rcu_cleanup_dying_idle_cpu(cpu, rsp);
}
+
+/* Migrate the dead CPU's callbacks to the current CPU. */
+static void rcu_migrate_callbacks(int cpu, struct rcu_state *rsp)
+{
+ unsigned long flags;
+ struct rcu_data *my_rdp;
+ struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
+ struct rcu_node *rnp_root = rcu_get_root(rdp->rsp);
+
+ if (rcu_is_nocb_cpu(cpu) || rcu_segcblist_empty(&rdp->cblist))
+ return; /* No callbacks to migrate. */
+
+ local_irq_save(flags);
+ my_rdp = this_cpu_ptr(rsp->rda);
+ if (rcu_nocb_adopt_orphan_cbs(my_rdp, rdp, flags)) {
+ local_irq_restore(flags);
+ return;
+ }
+ raw_spin_lock_rcu_node(rnp_root); /* irqs already disabled. */
+ rcu_advance_cbs(rsp, rnp_root, rdp); /* Leverage recent GPs. */
+ rcu_advance_cbs(rsp, rnp_root, my_rdp); /* Assign GP to pending CBs. */
+ rcu_segcblist_merge(&my_rdp->cblist, &rdp->cblist);
+ WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) !=
+ !rcu_segcblist_n_cbs(&my_rdp->cblist));
+ raw_spin_unlock_irqrestore_rcu_node(rnp_root, flags);
+ WARN_ONCE(rcu_segcblist_n_cbs(&rdp->cblist) != 0 ||
+ !rcu_segcblist_empty(&rdp->cblist),
+ "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, 1stCB=%p\n",
+ cpu, rcu_segcblist_n_cbs(&rdp->cblist),
+ rcu_segcblist_first_cb(&rdp->cblist));
+}
+
+/*
+ * The outgoing CPU has just passed through the dying-idle state,
+ * and we are being invoked from the CPU that was IPIed to continue the
+ * offline operation. We need to migrate the outgoing CPU's callbacks.
+ */
+void rcutree_migrate_callbacks(int cpu)
+{
+ struct rcu_state *rsp;
+
+ for_each_rcu_flavor(rsp)
+ rcu_migrate_callbacks(cpu, rsp);
+}
#endif
/*
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 9af0f31..8e1f285 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -219,8 +219,6 @@ struct rcu_data {
/* qlen at last check for QS forcing */
unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */
unsigned long n_nocbs_invoked; /* count of no-CBs RCU cbs invoked. */
- unsigned long n_cbs_orphaned; /* RCU cbs orphaned by dying CPU */
- unsigned long n_cbs_adopted; /* RCU cbs adopted from dying CPU */
unsigned long n_force_qs_snap;
/* did other CPU force QS recently? */
long blimit; /* Upper limit on a processed batch */
@@ -268,7 +266,9 @@ struct rcu_data {
struct rcu_head **nocb_follower_tail;
struct swait_queue_head nocb_wq; /* For nocb kthreads to sleep on. */
struct task_struct *nocb_kthread;
+ raw_spinlock_t nocb_lock; /* Guard following pair of fields. */
int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */
+ struct timer_list nocb_timer; /* Enforce finite deferral. */
/* The following fields are used by the leader, hence own cacheline. */
struct rcu_head *nocb_gp_head ____cacheline_internodealigned_in_smp;
@@ -350,15 +350,6 @@ struct rcu_state {
/* End of fields guarded by root rcu_node's lock. */
- raw_spinlock_t orphan_lock ____cacheline_internodealigned_in_smp;
- /* Protect following fields. */
- struct rcu_cblist orphan_pend; /* Orphaned callbacks that */
- /* need a grace period. */
- struct rcu_cblist orphan_done; /* Orphaned callbacks that */
- /* are ready to invoke. */
- /* (Contains counts.) */
- /* End of fields guarded by orphan_lock. */
-
struct mutex barrier_mutex; /* Guards barrier fields. */
atomic_t barrier_cpu_count; /* # CPUs waiting on. */
struct completion barrier_completion; /* Wake at barrier end. */
@@ -495,7 +486,7 @@ static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq);
static void rcu_init_one_nocb(struct rcu_node *rnp);
static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
bool lazy, unsigned long flags);
-static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
+static bool rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp,
struct rcu_data *rdp,
unsigned long flags);
static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp);
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index dd21ca4..46d61b5 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -73,7 +73,7 @@ static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp)
unsigned long flags;
unsigned long mask;
unsigned long oldmask;
- int ncpus = READ_ONCE(rsp->ncpus);
+ int ncpus = smp_load_acquire(&rsp->ncpus); /* Order against locking. */
struct rcu_node *rnp;
struct rcu_node *rnp_up;
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 908b309..55bde94 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -180,6 +180,8 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
struct task_struct *t = current;
lockdep_assert_held(&rnp->lock);
+ WARN_ON_ONCE(rdp->mynode != rnp);
+ WARN_ON_ONCE(rnp->level != rcu_num_lvls - 1);
/*
* Decide where to queue the newly blocked task. In theory,
@@ -261,6 +263,10 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
rnp->gp_tasks = &t->rcu_node_entry;
if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD))
rnp->exp_tasks = &t->rcu_node_entry;
+ WARN_ON_ONCE(!(blkd_state & RCU_GP_BLKD) !=
+ !(rnp->qsmask & rdp->grpmask));
+ WARN_ON_ONCE(!(blkd_state & RCU_EXP_BLKD) !=
+ !(rnp->expmask & rdp->grpmask));
raw_spin_unlock_rcu_node(rnp); /* interrupts remain disabled. */
/*
@@ -482,6 +488,7 @@ void rcu_read_unlock_special(struct task_struct *t)
rnp = t->rcu_blocked_node;
raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
WARN_ON_ONCE(rnp != t->rcu_blocked_node);
+ WARN_ON_ONCE(rnp->level != rcu_num_lvls - 1);
empty_norm = !rcu_preempt_blocked_readers_cgp(rnp);
empty_exp = sync_rcu_preempt_exp_done(rnp);
smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
@@ -495,10 +502,10 @@ void rcu_read_unlock_special(struct task_struct *t)
if (&t->rcu_node_entry == rnp->exp_tasks)
rnp->exp_tasks = np;
if (IS_ENABLED(CONFIG_RCU_BOOST)) {
- if (&t->rcu_node_entry == rnp->boost_tasks)
- rnp->boost_tasks = np;
/* Snapshot ->boost_mtx ownership w/rnp->lock held. */
drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx) == t;
+ if (&t->rcu_node_entry == rnp->boost_tasks)
+ rnp->boost_tasks = np;
}
/*
@@ -636,10 +643,17 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp)
*/
static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
{
+ struct task_struct *t;
+
RCU_LOCKDEP_WARN(preemptible(), "rcu_preempt_check_blocked_tasks() invoked with preemption enabled!!!\n");
WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp));
- if (rcu_preempt_has_tasks(rnp))
+ if (rcu_preempt_has_tasks(rnp)) {
rnp->gp_tasks = rnp->blkd_tasks.next;
+ t = container_of(rnp->gp_tasks, struct task_struct,
+ rcu_node_entry);
+ trace_rcu_unlock_preempted_task(TPS("rcu_preempt-GPS"),
+ rnp->gpnum, t->pid);
+ }
WARN_ON_ONCE(rnp->qsmask);
}
@@ -1788,23 +1802,62 @@ bool rcu_is_nocb_cpu(int cpu)
}
/*
- * Kick the leader kthread for this NOCB group.
+ * Kick the leader kthread for this NOCB group. Caller holds ->nocb_lock
+ * and this function releases it.
*/
-static void wake_nocb_leader(struct rcu_data *rdp, bool force)
+static void __wake_nocb_leader(struct rcu_data *rdp, bool force,
+ unsigned long flags)
+ __releases(rdp->nocb_lock)
{
struct rcu_data *rdp_leader = rdp->nocb_leader;
- if (!READ_ONCE(rdp_leader->nocb_kthread))
+ lockdep_assert_held(&rdp->nocb_lock);
+ if (!READ_ONCE(rdp_leader->nocb_kthread)) {
+ raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
return;
- if (READ_ONCE(rdp_leader->nocb_leader_sleep) || force) {
+ }
+ if (rdp_leader->nocb_leader_sleep || force) {
/* Prior smp_mb__after_atomic() orders against prior enqueue. */
WRITE_ONCE(rdp_leader->nocb_leader_sleep, false);
+ del_timer(&rdp->nocb_timer);
+ raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
smp_mb(); /* ->nocb_leader_sleep before swake_up(). */
swake_up(&rdp_leader->nocb_wq);
+ } else {
+ raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
}
}
/*
+ * Kick the leader kthread for this NOCB group, but caller has not
+ * acquired locks.
+ */
+static void wake_nocb_leader(struct rcu_data *rdp, bool force)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
+ __wake_nocb_leader(rdp, force, flags);
+}
+
+/*
+ * Arrange to wake the leader kthread for this NOCB group at some
+ * future time when it is safe to do so.
+ */
+static void wake_nocb_leader_defer(struct rcu_data *rdp, int waketype,
+ const char *reason)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
+ if (rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT)
+ mod_timer(&rdp->nocb_timer, jiffies + 1);
+ WRITE_ONCE(rdp->nocb_defer_wakeup, waketype);
+ trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, reason);
+ raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
+}
+
+/*
* Does the specified CPU need an RCU callback for the specified flavor
* of rcu_barrier()?
*/
@@ -1891,11 +1944,8 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
TPS("WakeEmpty"));
} else {
- WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE);
- /* Store ->nocb_defer_wakeup before ->rcu_urgent_qs. */
- smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true);
- trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
- TPS("WakeEmptyIsDeferred"));
+ wake_nocb_leader_defer(rdp, RCU_NOCB_WAKE,
+ TPS("WakeEmptyIsDeferred"));
}
rdp->qlen_last_fqs_check = 0;
} else if (len > rdp->qlen_last_fqs_check + qhimark) {
@@ -1905,11 +1955,8 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
TPS("WakeOvf"));
} else {
- WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_FORCE);
- /* Store ->nocb_defer_wakeup before ->rcu_urgent_qs. */
- smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true);
- trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
- TPS("WakeOvfIsDeferred"));
+ wake_nocb_leader_defer(rdp, RCU_NOCB_WAKE,
+ TPS("WakeOvfIsDeferred"));
}
rdp->qlen_last_fqs_check = LONG_MAX / 2;
} else {
@@ -1961,30 +2008,19 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
* Adopt orphaned callbacks on a no-CBs CPU, or return 0 if this is
* not a no-CBs CPU.
*/
-static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
+static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp,
struct rcu_data *rdp,
unsigned long flags)
{
- long ql = rsp->orphan_done.len;
- long qll = rsp->orphan_done.len_lazy;
-
- /* If this is not a no-CBs CPU, tell the caller to do it the old way. */
+ RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_nocb_adopt_orphan_cbs() invoked with irqs enabled!!!");
if (!rcu_is_nocb_cpu(smp_processor_id()))
- return false;
-
- /* First, enqueue the donelist, if any. This preserves CB ordering. */
- if (rsp->orphan_done.head) {
- __call_rcu_nocb_enqueue(rdp, rcu_cblist_head(&rsp->orphan_done),
- rcu_cblist_tail(&rsp->orphan_done),
- ql, qll, flags);
- }
- if (rsp->orphan_pend.head) {
- __call_rcu_nocb_enqueue(rdp, rcu_cblist_head(&rsp->orphan_pend),
- rcu_cblist_tail(&rsp->orphan_pend),
- ql, qll, flags);
- }
- rcu_cblist_init(&rsp->orphan_done);
- rcu_cblist_init(&rsp->orphan_pend);
+ return false; /* Not NOCBs CPU, caller must migrate CBs. */
+ __call_rcu_nocb_enqueue(my_rdp, rcu_segcblist_head(&rdp->cblist),
+ rcu_segcblist_tail(&rdp->cblist),
+ rcu_segcblist_n_cbs(&rdp->cblist),
+ rcu_segcblist_n_lazy_cbs(&rdp->cblist), flags);
+ rcu_segcblist_init(&rdp->cblist);
+ rcu_segcblist_disable(&rdp->cblist);
return true;
}
@@ -2031,6 +2067,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
static void nocb_leader_wait(struct rcu_data *my_rdp)
{
bool firsttime = true;
+ unsigned long flags;
bool gotcbs;
struct rcu_data *rdp;
struct rcu_head **tail;
@@ -2039,13 +2076,17 @@ wait_again:
/* Wait for callbacks to appear. */
if (!rcu_nocb_poll) {
- trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Sleep");
+ trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, TPS("Sleep"));
swait_event_interruptible(my_rdp->nocb_wq,
!READ_ONCE(my_rdp->nocb_leader_sleep));
- /* Memory barrier handled by smp_mb() calls below and repoll. */
+ raw_spin_lock_irqsave(&my_rdp->nocb_lock, flags);
+ my_rdp->nocb_leader_sleep = true;
+ WRITE_ONCE(my_rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
+ del_timer(&my_rdp->nocb_timer);
+ raw_spin_unlock_irqrestore(&my_rdp->nocb_lock, flags);
} else if (firsttime) {
firsttime = false; /* Don't drown trace log with "Poll"! */
- trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Poll");
+ trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, TPS("Poll"));
}
/*
@@ -2054,7 +2095,7 @@ wait_again:
* nocb_gp_head, where they await a grace period.
*/
gotcbs = false;
- smp_mb(); /* wakeup before ->nocb_head reads. */
+ smp_mb(); /* wakeup and _sleep before ->nocb_head reads. */
for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) {
rdp->nocb_gp_head = READ_ONCE(rdp->nocb_head);
if (!rdp->nocb_gp_head)
@@ -2066,56 +2107,41 @@ wait_again:
gotcbs = true;
}
- /*
- * If there were no callbacks, sleep a bit, rescan after a
- * memory barrier, and go retry.
- */
+ /* No callbacks? Sleep a bit if polling, and go retry. */
if (unlikely(!gotcbs)) {
- if (!rcu_nocb_poll)
- trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu,
- "WokeEmpty");
WARN_ON(signal_pending(current));
- schedule_timeout_interruptible(1);
-
- /* Rescan in case we were a victim of memory ordering. */
- my_rdp->nocb_leader_sleep = true;
- smp_mb(); /* Ensure _sleep true before scan. */
- for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower)
- if (READ_ONCE(rdp->nocb_head)) {
- /* Found CB, so short-circuit next wait. */
- my_rdp->nocb_leader_sleep = false;
- break;
- }
+ if (rcu_nocb_poll) {
+ schedule_timeout_interruptible(1);
+ } else {
+ trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu,
+ TPS("WokeEmpty"));
+ }
goto wait_again;
}
/* Wait for one grace period. */
rcu_nocb_wait_gp(my_rdp);
- /*
- * We left ->nocb_leader_sleep unset to reduce cache thrashing.
- * We set it now, but recheck for new callbacks while
- * traversing our follower list.
- */
- my_rdp->nocb_leader_sleep = true;
- smp_mb(); /* Ensure _sleep true before scan of ->nocb_head. */
-
/* Each pass through the following loop wakes a follower, if needed. */
for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) {
- if (READ_ONCE(rdp->nocb_head))
+ if (!rcu_nocb_poll &&
+ READ_ONCE(rdp->nocb_head) &&
+ READ_ONCE(my_rdp->nocb_leader_sleep)) {
+ raw_spin_lock_irqsave(&my_rdp->nocb_lock, flags);
my_rdp->nocb_leader_sleep = false;/* No need to sleep.*/
+ raw_spin_unlock_irqrestore(&my_rdp->nocb_lock, flags);
+ }
if (!rdp->nocb_gp_head)
continue; /* No CBs, so no need to wake follower. */
/* Append callbacks to follower's "done" list. */
- tail = xchg(&rdp->nocb_follower_tail, rdp->nocb_gp_tail);
+ raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
+ tail = rdp->nocb_follower_tail;
+ rdp->nocb_follower_tail = rdp->nocb_gp_tail;
*tail = rdp->nocb_gp_head;
- smp_mb__after_atomic(); /* Store *tail before wakeup. */
+ raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
if (rdp != my_rdp && tail == &rdp->nocb_follower_head) {
- /*
- * List was empty, wake up the follower.
- * Memory barriers supplied by atomic_long_add().
- */
+ /* List was empty, so wake up the follower. */
swake_up(&rdp->nocb_wq);
}
}
@@ -2131,28 +2157,16 @@ wait_again:
*/
static void nocb_follower_wait(struct rcu_data *rdp)
{
- bool firsttime = true;
-
for (;;) {
- if (!rcu_nocb_poll) {
- trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
- "FollowerSleep");
- swait_event_interruptible(rdp->nocb_wq,
- READ_ONCE(rdp->nocb_follower_head));
- } else if (firsttime) {
- /* Don't drown trace log with "Poll"! */
- firsttime = false;
- trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "Poll");
- }
+ trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("FollowerSleep"));
+ swait_event_interruptible(rdp->nocb_wq,
+ READ_ONCE(rdp->nocb_follower_head));
if (smp_load_acquire(&rdp->nocb_follower_head)) {
/* ^^^ Ensure CB invocation follows _head test. */
return;
}
- if (!rcu_nocb_poll)
- trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
- "WokeEmpty");
WARN_ON(signal_pending(current));
- schedule_timeout_interruptible(1);
+ trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WokeEmpty"));
}
}
@@ -2165,6 +2179,7 @@ static void nocb_follower_wait(struct rcu_data *rdp)
static int rcu_nocb_kthread(void *arg)
{
int c, cl;
+ unsigned long flags;
struct rcu_head *list;
struct rcu_head *next;
struct rcu_head **tail;
@@ -2179,11 +2194,14 @@ static int rcu_nocb_kthread(void *arg)
nocb_follower_wait(rdp);
/* Pull the ready-to-invoke callbacks onto local list. */
- list = READ_ONCE(rdp->nocb_follower_head);
+ raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
+ list = rdp->nocb_follower_head;
+ rdp->nocb_follower_head = NULL;
+ tail = rdp->nocb_follower_tail;
+ rdp->nocb_follower_tail = &rdp->nocb_follower_head;
+ raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
BUG_ON(!list);
- trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "WokeNonEmpty");
- WRITE_ONCE(rdp->nocb_follower_head, NULL);
- tail = xchg(&rdp->nocb_follower_tail, &rdp->nocb_follower_head);
+ trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WokeNonEmpty"));
/* Each pass through the following loop invokes a callback. */
trace_rcu_batch_start(rdp->rsp->name,
@@ -2226,18 +2244,39 @@ static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp)
}
/* Do a deferred wakeup of rcu_nocb_kthread(). */
-static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
+static void do_nocb_deferred_wakeup_common(struct rcu_data *rdp)
{
+ unsigned long flags;
int ndw;
- if (!rcu_nocb_need_deferred_wakeup(rdp))
+ raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
+ if (!rcu_nocb_need_deferred_wakeup(rdp)) {
+ raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
return;
+ }
ndw = READ_ONCE(rdp->nocb_defer_wakeup);
WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
- wake_nocb_leader(rdp, ndw == RCU_NOCB_WAKE_FORCE);
+ __wake_nocb_leader(rdp, ndw == RCU_NOCB_WAKE_FORCE, flags);
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWake"));
}
+/* Do a deferred wakeup of rcu_nocb_kthread() from a timer handler. */
+static void do_nocb_deferred_wakeup_timer(unsigned long x)
+{
+ do_nocb_deferred_wakeup_common((struct rcu_data *)x);
+}
+
+/*
+ * Do a deferred wakeup of rcu_nocb_kthread() from fastpath.
+ * This means we do an inexact common-case check. Note that if
+ * we miss, ->nocb_timer will eventually clean things up.
+ */
+static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
+{
+ if (rcu_nocb_need_deferred_wakeup(rdp))
+ do_nocb_deferred_wakeup_common(rdp);
+}
+
void __init rcu_init_nohz(void)
{
int cpu;
@@ -2287,6 +2326,9 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
rdp->nocb_tail = &rdp->nocb_head;
init_swait_queue_head(&rdp->nocb_wq);
rdp->nocb_follower_tail = &rdp->nocb_follower_head;
+ raw_spin_lock_init(&rdp->nocb_lock);
+ setup_timer(&rdp->nocb_timer, do_nocb_deferred_wakeup_timer,
+ (unsigned long)rdp);
}
/*
@@ -2459,7 +2501,7 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
return false;
}
-static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
+static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp,
struct rcu_data *rdp,
unsigned long flags)
{
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 00e77c4..5033b66 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -568,7 +568,7 @@ static DECLARE_WAIT_QUEUE_HEAD(rcu_tasks_cbs_wq);
static DEFINE_RAW_SPINLOCK(rcu_tasks_cbs_lock);
/* Track exiting tasks in order to allow them to be waited for. */
-DEFINE_SRCU(tasks_rcu_exit_srcu);
+DEFINE_STATIC_SRCU(tasks_rcu_exit_srcu);
/* Control stall timeouts. Disable with <= 0, otherwise jiffies till stall. */
#define RCU_TASK_STALL_TIMEOUT (HZ * 60 * 10)
@@ -875,6 +875,22 @@ static void rcu_spawn_tasks_kthread(void)
mutex_unlock(&rcu_tasks_kthread_mutex);
}
+/* Do the srcu_read_lock() for the above synchronize_srcu(). */
+void exit_tasks_rcu_start(void)
+{
+ preempt_disable();
+ current->rcu_tasks_idx = __srcu_read_lock(&tasks_rcu_exit_srcu);
+ preempt_enable();
+}
+
+/* Do the srcu_read_unlock() for the above synchronize_srcu(). */
+void exit_tasks_rcu_finish(void)
+{
+ preempt_disable();
+ __srcu_read_unlock(&tasks_rcu_exit_srcu, current->rcu_tasks_idx);
+ preempt_enable();
+}
+
#endif /* #ifdef CONFIG_TASKS_RCU */
#ifndef CONFIG_TINY_RCU
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 53f0164..78f5493 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -25,3 +25,4 @@ obj-$(CONFIG_SCHED_DEBUG) += debug.o
obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
obj-$(CONFIG_CPU_FREQ) += cpufreq.o
obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o
+obj-$(CONFIG_MEMBARRIER) += membarrier.o
diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c
index da39489..de6d7f4 100644
--- a/kernel/sched/autogroup.c
+++ b/kernel/sched/autogroup.c
@@ -71,7 +71,6 @@ static inline struct autogroup *autogroup_create(void)
goto out_fail;
tg = sched_create_group(&root_task_group);
-
if (IS_ERR(tg))
goto out_free;
@@ -101,7 +100,7 @@ out_free:
out_fail:
if (printk_ratelimit()) {
printk(KERN_WARNING "autogroup_create: %s failure.\n",
- ag ? "sched_create_group()" : "kmalloc()");
+ ag ? "sched_create_group()" : "kzalloc()");
}
return autogroup_kref_get(&autogroup_default);
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
index 13fc5ae..cc87307 100644
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -32,6 +32,12 @@ void complete(struct completion *x)
unsigned long flags;
spin_lock_irqsave(&x->wait.lock, flags);
+
+ /*
+ * Perform commit of crossrelease here.
+ */
+ complete_release_commit(x);
+
if (x->done != UINT_MAX)
x->done++;
__wake_up_locked(&x->wait, TASK_NORMAL, 1);
@@ -47,6 +53,13 @@ EXPORT_SYMBOL(complete);
*
* It may be assumed that this function implies a write memory barrier before
* changing the task state if and only if any tasks are woken up.
+ *
+ * Since complete_all() sets the completion of @x permanently to done
+ * to allow multiple waiters to finish, a call to reinit_completion()
+ * must be used on @x if @x is to be used again. The code must make
+ * sure that all waiters have woken and finished before reinitializing
+ * @x. Also note that the function completion_done() can not be used
+ * to know if there are still waiters after complete_all() has been called.
*/
void complete_all(struct completion *x)
{
@@ -92,9 +105,14 @@ __wait_for_common(struct completion *x,
{
might_sleep();
+ complete_acquire(x);
+
spin_lock_irq(&x->wait.lock);
timeout = do_wait_for_common(x, action, timeout, state);
spin_unlock_irq(&x->wait.lock);
+
+ complete_release(x);
+
return timeout;
}
@@ -297,9 +315,12 @@ EXPORT_SYMBOL(try_wait_for_completion);
* Return: 0 if there are waiters (wait_for_completion() in progress)
* 1 if there are no waiters.
*
+ * Note, this will always return true if complete_all() was called on @X.
*/
bool completion_done(struct completion *x)
{
+ unsigned long flags;
+
if (!READ_ONCE(x->done))
return false;
@@ -307,14 +328,9 @@ bool completion_done(struct completion *x)
* If ->done, we need to wait for complete() to release ->wait.lock
* otherwise we can end up freeing the completion before complete()
* is done referencing it.
- *
- * The RMB pairs with complete()'s RELEASE of ->wait.lock and orders
- * the loads of ->done and ->wait.lock such that we cannot observe
- * the lock before complete() acquires it while observing the ->done
- * after it's acquired the lock.
*/
- smp_rmb();
- spin_unlock_wait(&x->wait.lock);
+ spin_lock_irqsave(&x->wait.lock, flags);
+ spin_unlock_irqrestore(&x->wait.lock, flags);
return true;
}
EXPORT_SYMBOL(completion_done);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 17c667b..6d2c7ff 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -951,8 +951,13 @@ struct migration_arg {
static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf,
struct task_struct *p, int dest_cpu)
{
- if (unlikely(!cpu_active(dest_cpu)))
- return rq;
+ if (p->flags & PF_KTHREAD) {
+ if (unlikely(!cpu_online(dest_cpu)))
+ return rq;
+ } else {
+ if (unlikely(!cpu_active(dest_cpu)))
+ return rq;
+ }
/* Affinity changed (again). */
if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
@@ -1967,8 +1972,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
* reordered with p->state check below. This pairs with mb() in
* set_current_state() the waiting thread does.
*/
- smp_mb__before_spinlock();
raw_spin_lock_irqsave(&p->pi_lock, flags);
+ smp_mb__after_spinlock();
if (!(p->state & state))
goto out;
@@ -2069,7 +2074,7 @@ out:
/**
* try_to_wake_up_local - try to wake up a local task with rq lock held
* @p: the thread to be awakened
- * @cookie: context's cookie for pinning
+ * @rf: request-queue flags for pinning
*
* Put @p on the run-queue if it's not already there. The caller must
* ensure that this_rq() is locked, @p is bound to this_rq() and not
@@ -2635,6 +2640,16 @@ static struct rq *finish_task_switch(struct task_struct *prev)
prev_state = prev->state;
vtime_task_switch(prev);
perf_event_task_sched_in(prev, current);
+ /*
+ * The membarrier system call requires a full memory barrier
+ * after storing to rq->curr, before going back to user-space.
+ *
+ * TODO: This smp_mb__after_unlock_lock can go away if PPC end
+ * up adding a full barrier to switch_mm(), or we should figure
+ * out if a smp_mb__after_unlock_lock is really the proper API
+ * to use.
+ */
+ smp_mb__after_unlock_lock();
finish_lock_switch(rq, prev);
finish_arch_post_lock_switch();
@@ -3281,8 +3296,8 @@ static void __sched notrace __schedule(bool preempt)
* can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
* done by the caller to avoid the race with signal_wake_up().
*/
- smp_mb__before_spinlock();
rq_lock(rq, &rf);
+ smp_mb__after_spinlock();
/* Promote REQ to ACT */
rq->clock_update_flags <<= 1;
@@ -3324,6 +3339,21 @@ static void __sched notrace __schedule(bool preempt)
if (likely(prev != next)) {
rq->nr_switches++;
rq->curr = next;
+ /*
+ * The membarrier system call requires each architecture
+ * to have a full memory barrier after updating
+ * rq->curr, before returning to user-space. For TSO
+ * (e.g. x86), the architecture must provide its own
+ * barrier in switch_mm(). For weakly ordered machines
+ * for which spin_unlock() acts as a full memory
+ * barrier, finish_lock_switch() in common code takes
+ * care of this barrier. For weakly ordered machines for
+ * which spin_unlock() acts as a RELEASE barrier (only
+ * arm64 and PowerPC), arm64 has a full barrier in
+ * switch_to(), and PowerPC has
+ * smp_mb__after_unlock_lock() before
+ * finish_lock_switch().
+ */
++*switch_count;
trace_sched_switch(preempt, prev, next);
@@ -3352,8 +3382,8 @@ void __noreturn do_task_dead(void)
* To avoid it, we have to wait for releasing tsk->pi_lock which
* is held by try_to_wake_up()
*/
- smp_mb();
- raw_spin_unlock_wait(&current->pi_lock);
+ raw_spin_lock_irq(&current->pi_lock);
+ raw_spin_unlock_irq(&current->pi_lock);
/* Causes final put_task_struct in finish_task_switch(): */
__set_current_state(TASK_DEAD);
@@ -5103,24 +5133,17 @@ out_unlock:
return retval;
}
-static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
-
void sched_show_task(struct task_struct *p)
{
unsigned long free = 0;
int ppid;
- unsigned long state = p->state;
-
- /* Make sure the string lines up properly with the number of task states: */
- BUILD_BUG_ON(sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1);
if (!try_get_task_stack(p))
return;
- if (state)
- state = __ffs(state) + 1;
- printk(KERN_INFO "%-15.15s %c", p->comm,
- state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
- if (state == TASK_RUNNING)
+
+ printk(KERN_INFO "%-15.15s %c", p->comm, task_state_to_char(p));
+
+ if (p->state == TASK_RUNNING)
printk(KERN_CONT " running task ");
#ifdef CONFIG_DEBUG_STACK_USAGE
free = stack_not_used(p);
@@ -5177,11 +5200,6 @@ void show_state_filter(unsigned long state_filter)
debug_show_all_locks();
}
-void init_idle_bootup_task(struct task_struct *idle)
-{
- idle->sched_class = &idle_sched_class;
-}
-
/**
* init_idle - set up an idle thread for a given CPU
* @idle: task in question
@@ -5438,7 +5456,7 @@ static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf)
*/
next = pick_next_task(rq, &fake_task, rf);
BUG_ON(!next);
- next->sched_class->put_prev_task(rq, next);
+ put_prev_task(rq, next);
/*
* Rules for changing task_struct::cpus_allowed are holding
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index fba235c..8d9562d 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -119,29 +119,29 @@ static inline int cpudl_maximum(struct cpudl *cp)
* @p: the task
* @later_mask: a mask to fill in with the selected CPUs (or NULL)
*
- * Returns: int - best CPU (heap maximum if suitable)
+ * Returns: int - CPUs were found
*/
int cpudl_find(struct cpudl *cp, struct task_struct *p,
struct cpumask *later_mask)
{
- int best_cpu = -1;
const struct sched_dl_entity *dl_se = &p->dl;
if (later_mask &&
cpumask_and(later_mask, cp->free_cpus, &p->cpus_allowed)) {
- best_cpu = cpumask_any(later_mask);
- goto out;
- } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) &&
- dl_time_before(dl_se->deadline, cp->elements[0].dl)) {
- best_cpu = cpudl_maximum(cp);
- if (later_mask)
- cpumask_set_cpu(best_cpu, later_mask);
- }
+ return 1;
+ } else {
+ int best_cpu = cpudl_maximum(cp);
+ WARN_ON(best_cpu != -1 && !cpu_present(best_cpu));
-out:
- WARN_ON(best_cpu != -1 && !cpu_present(best_cpu));
+ if (cpumask_test_cpu(best_cpu, &p->cpus_allowed) &&
+ dl_time_before(dl_se->deadline, cp->elements[0].dl)) {
+ if (later_mask)
+ cpumask_set_cpu(best_cpu, later_mask);
- return best_cpu;
+ return 1;
+ }
+ }
+ return 0;
}
/*
@@ -246,7 +246,6 @@ int cpudl_init(struct cpudl *cp)
{
int i;
- memset(cp, 0, sizeof(*cp));
raw_spin_lock_init(&cp->lock);
cp->size = 0;
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 29a3970..9209d83 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -52,9 +52,11 @@ struct sugov_policy {
struct sugov_cpu {
struct update_util_data update_util;
struct sugov_policy *sg_policy;
+ unsigned int cpu;
- unsigned long iowait_boost;
- unsigned long iowait_boost_max;
+ bool iowait_boost_pending;
+ unsigned int iowait_boost;
+ unsigned int iowait_boost_max;
u64 last_update;
/* The fields below are only needed when sharing a policy. */
@@ -76,6 +78,26 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
{
s64 delta_ns;
+ /*
+ * Since cpufreq_update_util() is called with rq->lock held for
+ * the @target_cpu, our per-cpu data is fully serialized.
+ *
+ * However, drivers cannot in general deal with cross-cpu
+ * requests, so while get_next_freq() will work, our
+ * sugov_update_commit() call may not for the fast switching platforms.
+ *
+ * Hence stop here for remote requests if they aren't supported
+ * by the hardware, as calculating the frequency is pointless if
+ * we cannot in fact act on it.
+ *
+ * For the slow switching platforms, the kthread is always scheduled on
+ * the right set of CPUs and any CPU can find the next frequency and
+ * schedule the kthread.
+ */
+ if (sg_policy->policy->fast_switch_enabled &&
+ !cpufreq_can_do_remote_dvfs(sg_policy->policy))
+ return false;
+
if (sg_policy->work_in_progress)
return false;
@@ -106,7 +128,7 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time,
if (policy->fast_switch_enabled) {
next_freq = cpufreq_driver_fast_switch(policy, next_freq);
- if (next_freq == CPUFREQ_ENTRY_INVALID)
+ if (!next_freq)
return;
policy->cur = next_freq;
@@ -154,12 +176,12 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy,
return cpufreq_driver_resolve_freq(policy, freq);
}
-static void sugov_get_util(unsigned long *util, unsigned long *max)
+static void sugov_get_util(unsigned long *util, unsigned long *max, int cpu)
{
- struct rq *rq = this_rq();
+ struct rq *rq = cpu_rq(cpu);
unsigned long cfs_max;
- cfs_max = arch_scale_cpu_capacity(NULL, smp_processor_id());
+ cfs_max = arch_scale_cpu_capacity(NULL, cpu);
*util = min(rq->cfs.avg.util_avg, cfs_max);
*max = cfs_max;
@@ -169,30 +191,54 @@ static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
unsigned int flags)
{
if (flags & SCHED_CPUFREQ_IOWAIT) {
- sg_cpu->iowait_boost = sg_cpu->iowait_boost_max;
+ if (sg_cpu->iowait_boost_pending)
+ return;
+
+ sg_cpu->iowait_boost_pending = true;
+
+ if (sg_cpu->iowait_boost) {
+ sg_cpu->iowait_boost <<= 1;
+ if (sg_cpu->iowait_boost > sg_cpu->iowait_boost_max)
+ sg_cpu->iowait_boost = sg_cpu->iowait_boost_max;
+ } else {
+ sg_cpu->iowait_boost = sg_cpu->sg_policy->policy->min;
+ }
} else if (sg_cpu->iowait_boost) {
s64 delta_ns = time - sg_cpu->last_update;
/* Clear iowait_boost if the CPU apprears to have been idle. */
- if (delta_ns > TICK_NSEC)
+ if (delta_ns > TICK_NSEC) {
sg_cpu->iowait_boost = 0;
+ sg_cpu->iowait_boost_pending = false;
+ }
}
}
static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util,
unsigned long *max)
{
- unsigned long boost_util = sg_cpu->iowait_boost;
- unsigned long boost_max = sg_cpu->iowait_boost_max;
+ unsigned int boost_util, boost_max;
- if (!boost_util)
+ if (!sg_cpu->iowait_boost)
return;
+ if (sg_cpu->iowait_boost_pending) {
+ sg_cpu->iowait_boost_pending = false;
+ } else {
+ sg_cpu->iowait_boost >>= 1;
+ if (sg_cpu->iowait_boost < sg_cpu->sg_policy->policy->min) {
+ sg_cpu->iowait_boost = 0;
+ return;
+ }
+ }
+
+ boost_util = sg_cpu->iowait_boost;
+ boost_max = sg_cpu->iowait_boost_max;
+
if (*util * boost_max < *max * boost_util) {
*util = boost_util;
*max = boost_max;
}
- sg_cpu->iowait_boost >>= 1;
}
#ifdef CONFIG_NO_HZ_COMMON
@@ -229,7 +275,7 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,
if (flags & SCHED_CPUFREQ_RT_DL) {
next_f = policy->cpuinfo.max_freq;
} else {
- sugov_get_util(&util, &max);
+ sugov_get_util(&util, &max, sg_cpu->cpu);
sugov_iowait_boost(sg_cpu, &util, &max);
next_f = get_next_freq(sg_policy, util, max);
/*
@@ -264,6 +310,7 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
delta_ns = time - j_sg_cpu->last_update;
if (delta_ns > TICK_NSEC) {
j_sg_cpu->iowait_boost = 0;
+ j_sg_cpu->iowait_boost_pending = false;
continue;
}
if (j_sg_cpu->flags & SCHED_CPUFREQ_RT_DL)
@@ -290,7 +337,7 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time,
unsigned long util, max;
unsigned int next_f;
- sugov_get_util(&util, &max);
+ sugov_get_util(&util, &max, sg_cpu->cpu);
raw_spin_lock(&sg_policy->update_lock);
@@ -445,7 +492,11 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy)
}
sg_policy->thread = thread;
- kthread_bind_mask(thread, policy->related_cpus);
+
+ /* Kthread is bound to all CPUs by default */
+ if (!policy->dvfs_possible_from_any_cpu)
+ kthread_bind_mask(thread, policy->related_cpus);
+
init_irq_work(&sg_policy->irq_work, sugov_irq_work);
mutex_init(&sg_policy->work_lock);
@@ -528,16 +579,7 @@ static int sugov_init(struct cpufreq_policy *policy)
goto stop_kthread;
}
- if (policy->transition_delay_us) {
- tunables->rate_limit_us = policy->transition_delay_us;
- } else {
- unsigned int lat;
-
- tunables->rate_limit_us = LATENCY_MULTIPLIER;
- lat = policy->cpuinfo.transition_latency / NSEC_PER_USEC;
- if (lat)
- tunables->rate_limit_us *= lat;
- }
+ tunables->rate_limit_us = cpufreq_policy_transition_delay_us(policy);
policy->governor_data = sg_policy;
sg_policy->tunables = tunables;
@@ -655,6 +697,7 @@ static void sugov_limits(struct cpufreq_policy *policy)
static struct cpufreq_governor schedutil_gov = {
.name = "schedutil",
.owner = THIS_MODULE,
+ .dynamic_switching = true,
.init = sugov_init,
.exit = sugov_exit,
.start = sugov_start,
@@ -671,6 +714,11 @@ struct cpufreq_governor *cpufreq_default_governor(void)
static int __init sugov_register(void)
{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ per_cpu(sugov_cpu, cpu).cpu = cpu;
+
return cpufreq_register_governor(&schedutil_gov);
}
fs_initcall(sugov_register);
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index 981fcd7..2511aba 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -209,8 +209,6 @@ int cpupri_init(struct cpupri *cp)
{
int i;
- memset(cp, 0, sizeof(*cp));
-
for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
struct cpupri_vec *vec = &cp->pri_to_cpu[i];
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 755bd3f..9e38df7 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1136,7 +1136,7 @@ static void update_curr_dl(struct rq *rq)
}
/* kick cpufreq (see the comment in kernel/sched/sched.h). */
- cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_DL);
+ cpufreq_update_util(rq, SCHED_CPUFREQ_DL);
schedstat_set(curr->se.statistics.exec_max,
max(curr->se.statistics.exec_max, delta_exec));
@@ -1594,7 +1594,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
* let's hope p can move out.
*/
if (rq->curr->nr_cpus_allowed == 1 ||
- cpudl_find(&rq->rd->cpudl, rq->curr, NULL) == -1)
+ !cpudl_find(&rq->rd->cpudl, rq->curr, NULL))
return;
/*
@@ -1602,7 +1602,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
* see if it is pushed or pulled somewhere else.
*/
if (p->nr_cpus_allowed != 1 &&
- cpudl_find(&rq->rd->cpudl, p, NULL) != -1)
+ cpudl_find(&rq->rd->cpudl, p, NULL))
return;
resched_curr(rq);
@@ -1655,7 +1655,7 @@ static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
return rb_entry(left, struct sched_dl_entity, rb_node);
}
-struct task_struct *
+static struct task_struct *
pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
struct sched_dl_entity *dl_se;
@@ -1798,7 +1798,7 @@ static int find_later_rq(struct task_struct *task)
struct sched_domain *sd;
struct cpumask *later_mask = this_cpu_cpumask_var_ptr(local_cpu_mask_dl);
int this_cpu = smp_processor_id();
- int best_cpu, cpu = task_cpu(task);
+ int cpu = task_cpu(task);
/* Make sure the mask is initialized first */
if (unlikely(!later_mask))
@@ -1811,17 +1811,14 @@ static int find_later_rq(struct task_struct *task)
* We have to consider system topology and task affinity
* first, then we can look for a suitable cpu.
*/
- best_cpu = cpudl_find(&task_rq(task)->rd->cpudl,
- task, later_mask);
- if (best_cpu == -1)
+ if (!cpudl_find(&task_rq(task)->rd->cpudl, task, later_mask))
return -1;
/*
- * If we are here, some target has been found,
- * the most suitable of which is cached in best_cpu.
- * This is, among the runqueues where the current tasks
- * have later deadlines than the task's one, the rq
- * with the latest possible one.
+ * If we are here, some targets have been found, including
+ * the most suitable which is, among the runqueues where the
+ * current tasks have later deadlines than the task's one, the
+ * rq with the latest possible one.
*
* Now we check how well this matches with task's
* affinity and system topology.
@@ -1841,6 +1838,7 @@ static int find_later_rq(struct task_struct *task)
rcu_read_lock();
for_each_domain(cpu, sd) {
if (sd->flags & SD_WAKE_AFFINE) {
+ int best_cpu;
/*
* If possible, preempting this_cpu is
@@ -1852,12 +1850,15 @@ static int find_later_rq(struct task_struct *task)
return this_cpu;
}
+ best_cpu = cpumask_first_and(later_mask,
+ sched_domain_span(sd));
/*
- * Last chance: if best_cpu is valid and is
- * in the mask, that becomes our choice.
+ * Last chance: if a cpu being in both later_mask
+ * and current sd span is valid, that becomes our
+ * choice. Of course, the latest possible cpu is
+ * already under consideration through later_mask.
*/
- if (best_cpu < nr_cpu_ids &&
- cpumask_test_cpu(best_cpu, sched_domain_span(sd))) {
+ if (best_cpu < nr_cpu_ids) {
rcu_read_unlock();
return best_cpu;
}
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 4fa66de..4a23bbc 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -327,38 +327,78 @@ static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu)
return table;
}
+static cpumask_var_t sd_sysctl_cpus;
static struct ctl_table_header *sd_sysctl_header;
+
void register_sched_domain_sysctl(void)
{
- int i, cpu_num = num_possible_cpus();
- struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
+ static struct ctl_table *cpu_entries;
+ static struct ctl_table **cpu_idx;
char buf[32];
+ int i;
- WARN_ON(sd_ctl_dir[0].child);
- sd_ctl_dir[0].child = entry;
+ if (!cpu_entries) {
+ cpu_entries = sd_alloc_ctl_entry(num_possible_cpus() + 1);
+ if (!cpu_entries)
+ return;
- if (entry == NULL)
- return;
+ WARN_ON(sd_ctl_dir[0].child);
+ sd_ctl_dir[0].child = cpu_entries;
+ }
- for_each_possible_cpu(i) {
- snprintf(buf, 32, "cpu%d", i);
- entry->procname = kstrdup(buf, GFP_KERNEL);
- entry->mode = 0555;
- entry->child = sd_alloc_ctl_cpu_table(i);
- entry++;
+ if (!cpu_idx) {
+ struct ctl_table *e = cpu_entries;
+
+ cpu_idx = kcalloc(nr_cpu_ids, sizeof(struct ctl_table*), GFP_KERNEL);
+ if (!cpu_idx)
+ return;
+
+ /* deal with sparse possible map */
+ for_each_possible_cpu(i) {
+ cpu_idx[i] = e;
+ e++;
+ }
+ }
+
+ if (!cpumask_available(sd_sysctl_cpus)) {
+ if (!alloc_cpumask_var(&sd_sysctl_cpus, GFP_KERNEL))
+ return;
+
+ /* init to possible to not have holes in @cpu_entries */
+ cpumask_copy(sd_sysctl_cpus, cpu_possible_mask);
+ }
+
+ for_each_cpu(i, sd_sysctl_cpus) {
+ struct ctl_table *e = cpu_idx[i];
+
+ if (e->child)
+ sd_free_ctl_entry(&e->child);
+
+ if (!e->procname) {
+ snprintf(buf, 32, "cpu%d", i);
+ e->procname = kstrdup(buf, GFP_KERNEL);
+ }
+ e->mode = 0555;
+ e->child = sd_alloc_ctl_cpu_table(i);
+
+ __cpumask_clear_cpu(i, sd_sysctl_cpus);
}
WARN_ON(sd_sysctl_header);
sd_sysctl_header = register_sysctl_table(sd_ctl_root);
}
+void dirty_sched_domain_sysctl(int cpu)
+{
+ if (cpumask_available(sd_sysctl_cpus))
+ __cpumask_set_cpu(cpu, sd_sysctl_cpus);
+}
+
/* may be called multiple times per register */
void unregister_sched_domain_sysctl(void)
{
unregister_sysctl_table(sd_sysctl_header);
sd_sysctl_header = NULL;
- if (sd_ctl_dir[0].child)
- sd_free_ctl_entry(&sd_ctl_dir[0].child);
}
#endif /* CONFIG_SYSCTL */
#endif /* CONFIG_SMP */
@@ -421,13 +461,15 @@ static char *task_group_path(struct task_group *tg)
}
#endif
+static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
+
static void
print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
{
if (rq->curr == p)
- SEQ_printf(m, "R");
+ SEQ_printf(m, ">R");
else
- SEQ_printf(m, " ");
+ SEQ_printf(m, " %c", task_state_to_char(p));
SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ",
p->comm, task_pid_nr(p),
@@ -456,9 +498,9 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
SEQ_printf(m,
"\nrunnable tasks:\n"
- " task PID tree-key switches prio"
+ " S task PID tree-key switches prio"
" wait-time sum-exec sum-sleep\n"
- "------------------------------------------------------"
+ "-------------------------------------------------------"
"----------------------------------------------------\n");
rcu_read_lock();
@@ -872,11 +914,12 @@ static void sched_show_numa(struct task_struct *p, struct seq_file *m)
#endif
}
-void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
+void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
+ struct seq_file *m)
{
unsigned long nr_switches;
- SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr(p),
+ SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr_ns(p, ns),
get_nr_threads(p));
SEQ_printf(m,
"---------------------------------------------------------"
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c95880e..8bc0a88 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -806,7 +806,7 @@ void post_init_entity_util_avg(struct sched_entity *se)
/*
* For !fair tasks do:
*
- update_cfs_rq_load_avg(now, cfs_rq, false);
+ update_cfs_rq_load_avg(now, cfs_rq);
attach_entity_load_avg(cfs_rq, se);
switched_from_fair(rq, p);
*
@@ -1071,6 +1071,29 @@ unsigned int sysctl_numa_balancing_scan_size = 256;
/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
unsigned int sysctl_numa_balancing_scan_delay = 1000;
+struct numa_group {
+ atomic_t refcount;
+
+ spinlock_t lock; /* nr_tasks, tasks */
+ int nr_tasks;
+ pid_t gid;
+ int active_nodes;
+
+ struct rcu_head rcu;
+ unsigned long total_faults;
+ unsigned long max_faults_cpu;
+ /*
+ * Faults_cpu is used to decide whether memory should move
+ * towards the CPU. As a consequence, these stats are weighted
+ * more by CPU use than by memory faults.
+ */
+ unsigned long *faults_cpu;
+ unsigned long faults[0];
+};
+
+static inline unsigned long group_faults_priv(struct numa_group *ng);
+static inline unsigned long group_faults_shared(struct numa_group *ng);
+
static unsigned int task_nr_scan_windows(struct task_struct *p)
{
unsigned long rss = 0;
@@ -1107,13 +1130,47 @@ static unsigned int task_scan_min(struct task_struct *p)
return max_t(unsigned int, floor, scan);
}
+static unsigned int task_scan_start(struct task_struct *p)
+{
+ unsigned long smin = task_scan_min(p);
+ unsigned long period = smin;
+
+ /* Scale the maximum scan period with the amount of shared memory. */
+ if (p->numa_group) {
+ struct numa_group *ng = p->numa_group;
+ unsigned long shared = group_faults_shared(ng);
+ unsigned long private = group_faults_priv(ng);
+
+ period *= atomic_read(&ng->refcount);
+ period *= shared + 1;
+ period /= private + shared + 1;
+ }
+
+ return max(smin, period);
+}
+
static unsigned int task_scan_max(struct task_struct *p)
{
- unsigned int smin = task_scan_min(p);
- unsigned int smax;
+ unsigned long smin = task_scan_min(p);
+ unsigned long smax;
/* Watch for min being lower than max due to floor calculations */
smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
+
+ /* Scale the maximum scan period with the amount of shared memory. */
+ if (p->numa_group) {
+ struct numa_group *ng = p->numa_group;
+ unsigned long shared = group_faults_shared(ng);
+ unsigned long private = group_faults_priv(ng);
+ unsigned long period = smax;
+
+ period *= atomic_read(&ng->refcount);
+ period *= shared + 1;
+ period /= private + shared + 1;
+
+ smax = max(smax, period);
+ }
+
return max(smin, smax);
}
@@ -1129,26 +1186,6 @@ static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
}
-struct numa_group {
- atomic_t refcount;
-
- spinlock_t lock; /* nr_tasks, tasks */
- int nr_tasks;
- pid_t gid;
- int active_nodes;
-
- struct rcu_head rcu;
- unsigned long total_faults;
- unsigned long max_faults_cpu;
- /*
- * Faults_cpu is used to decide whether memory should move
- * towards the CPU. As a consequence, these stats are weighted
- * more by CPU use than by memory faults.
- */
- unsigned long *faults_cpu;
- unsigned long faults[0];
-};
-
/* Shared or private faults. */
#define NR_NUMA_HINT_FAULT_TYPES 2
@@ -1198,6 +1235,30 @@ static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
}
+static inline unsigned long group_faults_priv(struct numa_group *ng)
+{
+ unsigned long faults = 0;
+ int node;
+
+ for_each_online_node(node) {
+ faults += ng->faults[task_faults_idx(NUMA_MEM, node, 1)];
+ }
+
+ return faults;
+}
+
+static inline unsigned long group_faults_shared(struct numa_group *ng)
+{
+ unsigned long faults = 0;
+ int node;
+
+ for_each_online_node(node) {
+ faults += ng->faults[task_faults_idx(NUMA_MEM, node, 0)];
+ }
+
+ return faults;
+}
+
/*
* A node triggering more than 1/3 as many NUMA faults as the maximum is
* considered part of a numa group's pseudo-interleaving set. Migrations
@@ -1378,7 +1439,7 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
}
-static unsigned long weighted_cpuload(const int cpu);
+static unsigned long weighted_cpuload(struct rq *rq);
static unsigned long source_load(int cpu, int type);
static unsigned long target_load(int cpu, int type);
static unsigned long capacity_of(int cpu);
@@ -1409,7 +1470,7 @@ static void update_numa_stats(struct numa_stats *ns, int nid)
struct rq *rq = cpu_rq(cpu);
ns->nr_running += rq->nr_running;
- ns->load += weighted_cpuload(cpu);
+ ns->load += weighted_cpuload(rq);
ns->compute_capacity += capacity_of(cpu);
cpus++;
@@ -1808,7 +1869,7 @@ static int task_numa_migrate(struct task_struct *p)
* Reset the scan period if the task is being rescheduled on an
* alternative node to recheck if the tasks is now properly placed.
*/
- p->numa_scan_period = task_scan_min(p);
+ p->numa_scan_period = task_scan_start(p);
if (env.best_task == NULL) {
ret = migrate_task_to(p, env.best_cpu);
@@ -1892,7 +1953,7 @@ static void update_task_scan_period(struct task_struct *p,
unsigned long shared, unsigned long private)
{
unsigned int period_slot;
- int ratio;
+ int lr_ratio, ps_ratio;
int diff;
unsigned long remote = p->numa_faults_locality[0];
@@ -1922,25 +1983,36 @@ static void update_task_scan_period(struct task_struct *p,
* >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)
*/
period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
- ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
- if (ratio >= NUMA_PERIOD_THRESHOLD) {
- int slot = ratio - NUMA_PERIOD_THRESHOLD;
+ lr_ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
+ ps_ratio = (private * NUMA_PERIOD_SLOTS) / (private + shared);
+
+ if (ps_ratio >= NUMA_PERIOD_THRESHOLD) {
+ /*
+ * Most memory accesses are local. There is no need to
+ * do fast NUMA scanning, since memory is already local.
+ */
+ int slot = ps_ratio - NUMA_PERIOD_THRESHOLD;
+ if (!slot)
+ slot = 1;
+ diff = slot * period_slot;
+ } else if (lr_ratio >= NUMA_PERIOD_THRESHOLD) {
+ /*
+ * Most memory accesses are shared with other tasks.
+ * There is no point in continuing fast NUMA scanning,
+ * since other tasks may just move the memory elsewhere.
+ */
+ int slot = lr_ratio - NUMA_PERIOD_THRESHOLD;
if (!slot)
slot = 1;
diff = slot * period_slot;
} else {
- diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
-
/*
- * Scale scan rate increases based on sharing. There is an
- * inverse relationship between the degree of sharing and
- * the adjustment made to the scanning period. Broadly
- * speaking the intent is that there is little point
- * scanning faster if shared accesses dominate as it may
- * simply bounce migrations uselessly
+ * Private memory faults exceed (SLOTS-THRESHOLD)/SLOTS,
+ * yet they are not on the local NUMA node. Speed up
+ * NUMA scanning to get the memory moved over.
*/
- ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared + 1));
- diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
+ int ratio = max(lr_ratio, ps_ratio);
+ diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
}
p->numa_scan_period = clamp(p->numa_scan_period + diff,
@@ -2448,7 +2520,7 @@ void task_numa_work(struct callback_head *work)
if (p->numa_scan_period == 0) {
p->numa_scan_period_max = task_scan_max(p);
- p->numa_scan_period = task_scan_min(p);
+ p->numa_scan_period = task_scan_start(p);
}
next_scan = now + msecs_to_jiffies(p->numa_scan_period);
@@ -2576,7 +2648,7 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
if (now > curr->node_stamp + period) {
if (!curr->node_stamp)
- curr->numa_scan_period = task_scan_min(curr);
+ curr->numa_scan_period = task_scan_start(curr);
curr->node_stamp += period;
if (!time_before(jiffies, curr->mm->numa_next_scan)) {
@@ -2586,59 +2658,6 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
}
}
-/*
- * Can a task be moved from prev_cpu to this_cpu without causing a load
- * imbalance that would trigger the load balancer?
- */
-static inline bool numa_wake_affine(struct sched_domain *sd,
- struct task_struct *p, int this_cpu,
- int prev_cpu, int sync)
-{
- struct numa_stats prev_load, this_load;
- s64 this_eff_load, prev_eff_load;
-
- update_numa_stats(&prev_load, cpu_to_node(prev_cpu));
- update_numa_stats(&this_load, cpu_to_node(this_cpu));
-
- /*
- * If sync wakeup then subtract the (maximum possible)
- * effect of the currently running task from the load
- * of the current CPU:
- */
- if (sync) {
- unsigned long current_load = task_h_load(current);
-
- if (this_load.load > current_load)
- this_load.load -= current_load;
- else
- this_load.load = 0;
- }
-
- /*
- * In low-load situations, where this_cpu's node is idle due to the
- * sync cause above having dropped this_load.load to 0, move the task.
- * Moving to an idle socket will not create a bad imbalance.
- *
- * Otherwise check if the nodes are near enough in load to allow this
- * task to be woken on this_cpu's node.
- */
- if (this_load.load > 0) {
- unsigned long task_load = task_h_load(p);
-
- this_eff_load = 100;
- this_eff_load *= prev_load.compute_capacity;
-
- prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
- prev_eff_load *= this_load.compute_capacity;
-
- this_eff_load *= this_load.load + task_load;
- prev_eff_load *= prev_load.load - task_load;
-
- return this_eff_load <= prev_eff_load;
- }
-
- return true;
-}
#else
static void task_tick_numa(struct rq *rq, struct task_struct *curr)
{
@@ -2652,14 +2671,6 @@ static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
{
}
-#ifdef CONFIG_SMP
-static inline bool numa_wake_affine(struct sched_domain *sd,
- struct task_struct *p, int this_cpu,
- int prev_cpu, int sync)
-{
- return true;
-}
-#endif /* !SMP */
#endif /* CONFIG_NUMA_BALANCING */
static void
@@ -2790,6 +2801,31 @@ static inline void update_cfs_shares(struct sched_entity *se)
}
#endif /* CONFIG_FAIR_GROUP_SCHED */
+static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
+{
+ struct rq *rq = rq_of(cfs_rq);
+
+ if (&rq->cfs == cfs_rq) {
+ /*
+ * There are a few boundary cases this might miss but it should
+ * get called often enough that that should (hopefully) not be
+ * a real problem -- added to that it only calls on the local
+ * CPU, so if we enqueue remotely we'll miss an update, but
+ * the next tick/schedule should update.
+ *
+ * It will not get called when we go idle, because the idle
+ * thread is a different class (!fair), nor will the utilization
+ * number include things like RT tasks.
+ *
+ * As is, the util number is not freq-invariant (we'd have to
+ * implement arch_scale_freq_capacity() for that).
+ *
+ * See cpu_util().
+ */
+ cpufreq_update_util(rq, 0);
+ }
+}
+
#ifdef CONFIG_SMP
/*
* Approximate:
@@ -2968,6 +3004,18 @@ ___update_load_avg(u64 now, int cpu, struct sched_avg *sa,
sa->last_update_time += delta << 10;
/*
+ * running is a subset of runnable (weight) so running can't be set if
+ * runnable is clear. But there are some corner cases where the current
+ * se has been already dequeued but cfs_rq->curr still points to it.
+ * This means that weight will be 0 but not running for a sched_entity
+ * but also for a cfs_rq if the latter becomes idle. As an example,
+ * this happens during idle_balance() which calls
+ * update_blocked_averages()
+ */
+ if (!weight)
+ running = 0;
+
+ /*
* Now we know we crossed measurement unit boundaries. The *_avg
* accrues by two steps:
*
@@ -3276,29 +3324,6 @@ static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {}
#endif /* CONFIG_FAIR_GROUP_SCHED */
-static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
-{
- if (&this_rq()->cfs == cfs_rq) {
- /*
- * There are a few boundary cases this might miss but it should
- * get called often enough that that should (hopefully) not be
- * a real problem -- added to that it only calls on the local
- * CPU, so if we enqueue remotely we'll miss an update, but
- * the next tick/schedule should update.
- *
- * It will not get called when we go idle, because the idle
- * thread is a different class (!fair), nor will the utilization
- * number include things like RT tasks.
- *
- * As is, the util number is not freq-invariant (we'd have to
- * implement arch_scale_freq_capacity() for that).
- *
- * See cpu_util().
- */
- cpufreq_update_util(rq_of(cfs_rq), 0);
- }
-}
-
/*
* Unsigned subtract and clamp on underflow.
*
@@ -3320,7 +3345,6 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
* update_cfs_rq_load_avg - update the cfs_rq's load/util averages
* @now: current time, as per cfs_rq_clock_task()
* @cfs_rq: cfs_rq to update
- * @update_freq: should we call cfs_rq_util_change() or will the call do so
*
* The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
* avg. The immediate corollary is that all (fair) tasks must be attached, see
@@ -3334,7 +3358,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
* call update_tg_load_avg() when this function returns true.
*/
static inline int
-update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
+update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
{
struct sched_avg *sa = &cfs_rq->avg;
int decayed, removed_load = 0, removed_util = 0;
@@ -3362,7 +3386,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
cfs_rq->load_last_update_time_copy = sa->last_update_time;
#endif
- if (update_freq && (decayed || removed_util))
+ if (decayed || removed_util)
cfs_rq_util_change(cfs_rq);
return decayed || removed_load;
@@ -3390,7 +3414,7 @@ static inline void update_load_avg(struct sched_entity *se, int flags)
if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
__update_load_avg_se(now, cpu, cfs_rq, se);
- decayed = update_cfs_rq_load_avg(now, cfs_rq, true);
+ decayed = update_cfs_rq_load_avg(now, cfs_rq);
decayed |= propagate_entity_load_avg(se);
if (decayed && (flags & UPDATE_TG))
@@ -3534,7 +3558,7 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf);
#else /* CONFIG_SMP */
static inline int
-update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
+update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
{
return 0;
}
@@ -3544,7 +3568,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
static inline void update_load_avg(struct sched_entity *se, int not_used1)
{
- cpufreq_update_util(rq_of(cfs_rq_of(se)), 0);
+ cfs_rq_util_change(cfs_rq_of(se));
}
static inline void
@@ -4875,7 +4899,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
* passed.
*/
if (p->in_iowait)
- cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_IOWAIT);
+ cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
for_each_sched_entity(se) {
if (se->on_rq)
@@ -5125,9 +5149,9 @@ static void cpu_load_update(struct rq *this_rq, unsigned long this_load,
}
/* Used instead of source_load when we know the type == 0 */
-static unsigned long weighted_cpuload(const int cpu)
+static unsigned long weighted_cpuload(struct rq *rq)
{
- return cfs_rq_runnable_load_avg(&cpu_rq(cpu)->cfs);
+ return cfs_rq_runnable_load_avg(&rq->cfs);
}
#ifdef CONFIG_NO_HZ_COMMON
@@ -5172,7 +5196,7 @@ static void cpu_load_update_idle(struct rq *this_rq)
/*
* bail if there's load or we're actually up-to-date.
*/
- if (weighted_cpuload(cpu_of(this_rq)))
+ if (weighted_cpuload(this_rq))
return;
cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), 0);
@@ -5193,7 +5217,7 @@ void cpu_load_update_nohz_start(void)
* concurrently we'll exit nohz. And cpu_load write can race with
* cpu_load_update_idle() but both updater would be writing the same.
*/
- this_rq->cpu_load[0] = weighted_cpuload(cpu_of(this_rq));
+ this_rq->cpu_load[0] = weighted_cpuload(this_rq);
}
/*
@@ -5209,7 +5233,7 @@ void cpu_load_update_nohz_stop(void)
if (curr_jiffies == this_rq->last_load_update_tick)
return;
- load = weighted_cpuload(cpu_of(this_rq));
+ load = weighted_cpuload(this_rq);
rq_lock(this_rq, &rf);
update_rq_clock(this_rq);
cpu_load_update_nohz(this_rq, curr_jiffies, load);
@@ -5235,7 +5259,7 @@ static void cpu_load_update_periodic(struct rq *this_rq, unsigned long load)
*/
void cpu_load_update_active(struct rq *this_rq)
{
- unsigned long load = weighted_cpuload(cpu_of(this_rq));
+ unsigned long load = weighted_cpuload(this_rq);
if (tick_nohz_tick_stopped())
cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), load);
@@ -5253,7 +5277,7 @@ void cpu_load_update_active(struct rq *this_rq)
static unsigned long source_load(int cpu, int type)
{
struct rq *rq = cpu_rq(cpu);
- unsigned long total = weighted_cpuload(cpu);
+ unsigned long total = weighted_cpuload(rq);
if (type == 0 || !sched_feat(LB_BIAS))
return total;
@@ -5268,7 +5292,7 @@ static unsigned long source_load(int cpu, int type)
static unsigned long target_load(int cpu, int type)
{
struct rq *rq = cpu_rq(cpu);
- unsigned long total = weighted_cpuload(cpu);
+ unsigned long total = weighted_cpuload(rq);
if (type == 0 || !sched_feat(LB_BIAS))
return total;
@@ -5290,7 +5314,7 @@ static unsigned long cpu_avg_load_per_task(int cpu)
{
struct rq *rq = cpu_rq(cpu);
unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
- unsigned long load_avg = weighted_cpuload(cpu);
+ unsigned long load_avg = weighted_cpuload(rq);
if (nr_running)
return load_avg / nr_running;
@@ -5345,20 +5369,115 @@ static int wake_wide(struct task_struct *p)
return 1;
}
+struct llc_stats {
+ unsigned long nr_running;
+ unsigned long load;
+ unsigned long capacity;
+ int has_capacity;
+};
+
+static bool get_llc_stats(struct llc_stats *stats, int cpu)
+{
+ struct sched_domain_shared *sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
+
+ if (!sds)
+ return false;
+
+ stats->nr_running = READ_ONCE(sds->nr_running);
+ stats->load = READ_ONCE(sds->load);
+ stats->capacity = READ_ONCE(sds->capacity);
+ stats->has_capacity = stats->nr_running < per_cpu(sd_llc_size, cpu);
+
+ return true;
+}
+
+/*
+ * Can a task be moved from prev_cpu to this_cpu without causing a load
+ * imbalance that would trigger the load balancer?
+ *
+ * Since we're running on 'stale' values, we might in fact create an imbalance
+ * but recomputing these values is expensive, as that'd mean iteration 2 cache
+ * domains worth of CPUs.
+ */
+static bool
+wake_affine_llc(struct sched_domain *sd, struct task_struct *p,
+ int this_cpu, int prev_cpu, int sync)
+{
+ struct llc_stats prev_stats, this_stats;
+ s64 this_eff_load, prev_eff_load;
+ unsigned long task_load;
+
+ if (!get_llc_stats(&prev_stats, prev_cpu) ||
+ !get_llc_stats(&this_stats, this_cpu))
+ return false;
+
+ /*
+ * If sync wakeup then subtract the (maximum possible)
+ * effect of the currently running task from the load
+ * of the current LLC.
+ */
+ if (sync) {
+ unsigned long current_load = task_h_load(current);
+
+ /* in this case load hits 0 and this LLC is considered 'idle' */
+ if (current_load > this_stats.load)
+ return true;
+
+ this_stats.load -= current_load;
+ }
+
+ /*
+ * The has_capacity stuff is not SMT aware, but by trying to balance
+ * the nr_running on both ends we try and fill the domain at equal
+ * rates, thereby first consuming cores before siblings.
+ */
+
+ /* if the old cache has capacity, stay there */
+ if (prev_stats.has_capacity && prev_stats.nr_running < this_stats.nr_running+1)
+ return false;
+
+ /* if this cache has capacity, come here */
+ if (this_stats.has_capacity && this_stats.nr_running < prev_stats.nr_running+1)
+ return true;
+
+ /*
+ * Check to see if we can move the load without causing too much
+ * imbalance.
+ */
+ task_load = task_h_load(p);
+
+ this_eff_load = 100;
+ this_eff_load *= prev_stats.capacity;
+
+ prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
+ prev_eff_load *= this_stats.capacity;
+
+ this_eff_load *= this_stats.load + task_load;
+ prev_eff_load *= prev_stats.load - task_load;
+
+ return this_eff_load <= prev_eff_load;
+}
+
static int wake_affine(struct sched_domain *sd, struct task_struct *p,
int prev_cpu, int sync)
{
int this_cpu = smp_processor_id();
- bool affine = false;
+ bool affine;
/*
- * Common case: CPUs are in the same socket, and select_idle_sibling()
- * will do its thing regardless of what we return:
+ * Default to no affine wakeups; wake_affine() should not effect a task
+ * placement the load-balancer feels inclined to undo. The conservative
+ * option is therefore to not move tasks when they wake up.
*/
- if (cpus_share_cache(prev_cpu, this_cpu))
- affine = true;
- else
- affine = numa_wake_affine(sd, p, this_cpu, prev_cpu, sync);
+ affine = false;
+
+ /*
+ * If the wakeup is across cache domains, try to evaluate if movement
+ * makes sense, otherwise rely on select_idle_siblings() to do
+ * placement inside the cache domain.
+ */
+ if (!cpus_share_cache(prev_cpu, this_cpu))
+ affine = wake_affine_llc(sd, p, this_cpu, prev_cpu, sync);
schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
if (affine) {
@@ -5550,7 +5669,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
shallowest_idle_cpu = i;
}
} else if (shallowest_idle_cpu == -1) {
- load = weighted_cpuload(i);
+ load = weighted_cpuload(cpu_rq(i));
if (load < min_load || (load == min_load && i == this_cpu)) {
min_load = load;
least_loaded_cpu = i;
@@ -6187,10 +6306,10 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
int new_tasks;
again:
-#ifdef CONFIG_FAIR_GROUP_SCHED
if (!cfs_rq->nr_running)
goto idle;
+#ifdef CONFIG_FAIR_GROUP_SCHED
if (prev->sched_class != &fair_sched_class)
goto simple;
@@ -6220,11 +6339,17 @@ again:
/*
* This call to check_cfs_rq_runtime() will do the
* throttle and dequeue its entity in the parent(s).
- * Therefore the 'simple' nr_running test will indeed
+ * Therefore the nr_running test will indeed
* be correct.
*/
- if (unlikely(check_cfs_rq_runtime(cfs_rq)))
+ if (unlikely(check_cfs_rq_runtime(cfs_rq))) {
+ cfs_rq = &rq->cfs;
+
+ if (!cfs_rq->nr_running)
+ goto idle;
+
goto simple;
+ }
}
se = pick_next_entity(cfs_rq, curr);
@@ -6264,12 +6389,8 @@ again:
return p;
simple:
- cfs_rq = &rq->cfs;
#endif
- if (!cfs_rq->nr_running)
- goto idle;
-
put_prev_task(rq, prev);
do {
@@ -6917,7 +7038,7 @@ static void update_blocked_averages(int cpu)
if (throttled_hierarchy(cfs_rq))
continue;
- if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true))
+ if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq))
update_tg_load_avg(cfs_rq, 0);
/* Propagate pending load changes to the parent, if any: */
@@ -6990,7 +7111,7 @@ static inline void update_blocked_averages(int cpu)
rq_lock_irqsave(rq, &rf);
update_rq_clock(rq);
- update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true);
+ update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
rq_unlock_irqrestore(rq, &rf);
}
@@ -7036,6 +7157,7 @@ struct sg_lb_stats {
struct sd_lb_stats {
struct sched_group *busiest; /* Busiest group in this sd */
struct sched_group *local; /* Local group in this sd */
+ unsigned long total_running;
unsigned long total_load; /* Total load of all groups in sd */
unsigned long total_capacity; /* Total capacity of all groups in sd */
unsigned long avg_load; /* Average load across all groups in sd */
@@ -7055,6 +7177,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
*sds = (struct sd_lb_stats){
.busiest = NULL,
.local = NULL,
+ .total_running = 0UL,
.total_load = 0UL,
.total_capacity = 0UL,
.busiest_stat = {
@@ -7363,7 +7486,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
sgs->nr_numa_running += rq->nr_numa_running;
sgs->nr_preferred_running += rq->nr_preferred_running;
#endif
- sgs->sum_weighted_load += weighted_cpuload(i);
+ sgs->sum_weighted_load += weighted_cpuload(rq);
/*
* No need to call idle_cpu() if nr_running is not 0
*/
@@ -7490,6 +7613,7 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq)
*/
static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
{
+ struct sched_domain_shared *shared = env->sd->shared;
struct sched_domain *child = env->sd->child;
struct sched_group *sg = env->sd->groups;
struct sg_lb_stats *local = &sds->local_stat;
@@ -7546,6 +7670,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
next_group:
/* Now, start updating sd_lb_stats */
+ sds->total_running += sgs->sum_nr_running;
sds->total_load += sgs->group_load;
sds->total_capacity += sgs->group_capacity;
@@ -7561,6 +7686,21 @@ next_group:
env->dst_rq->rd->overload = overload;
}
+ if (!shared)
+ return;
+
+ /*
+ * Since these are sums over groups they can contain some CPUs
+ * multiple times for the NUMA domains.
+ *
+ * Currently only wake_affine_llc() and find_busiest_group()
+ * uses these numbers, only the last is affected by this problem.
+ *
+ * XXX fix that.
+ */
+ WRITE_ONCE(shared->nr_running, sds->total_running);
+ WRITE_ONCE(shared->load, sds->total_load);
+ WRITE_ONCE(shared->capacity, sds->total_capacity);
}
/**
@@ -7790,6 +7930,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
if (!sds.busiest || busiest->sum_nr_running == 0)
goto out_balanced;
+ /* XXX broken for overlapping NUMA groups */
sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load)
/ sds.total_capacity;
@@ -7892,7 +8033,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
capacity = capacity_of(i);
- wl = weighted_cpuload(i);
+ wl = weighted_cpuload(rq);
/*
* When comparing with imbalance, use weighted_cpuload()
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 6c23e30..257f4f0 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -158,7 +158,7 @@ static void cpuidle_idle_call(void)
}
/*
- * Suspend-to-idle ("freeze") is a system state in which all user space
+ * Suspend-to-idle ("s2idle") is a system state in which all user space
* has been frozen, all I/O devices have been suspended and the only
* activity happens here and in iterrupts (if any). In that case bypass
* the cpuidle governor and go stratight for the deepest idle state
@@ -167,9 +167,9 @@ static void cpuidle_idle_call(void)
* until a proper wakeup interrupt happens.
*/
- if (idle_should_freeze() || dev->use_deepest_state) {
- if (idle_should_freeze()) {
- entered_state = cpuidle_enter_freeze(drv, dev);
+ if (idle_should_enter_s2idle() || dev->use_deepest_state) {
+ if (idle_should_enter_s2idle()) {
+ entered_state = cpuidle_enter_s2idle(drv, dev);
if (entered_state > 0) {
local_irq_enable();
goto exit_idle;
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
new file mode 100644
index 0000000..a92fddc
--- /dev/null
+++ b/kernel/sched/membarrier.c
@@ -0,0 +1,152 @@
+/*
+ * Copyright (C) 2010-2017 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+ *
+ * membarrier system call
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/syscalls.h>
+#include <linux/membarrier.h>
+#include <linux/tick.h>
+#include <linux/cpumask.h>
+
+#include "sched.h" /* for cpu_rq(). */
+
+/*
+ * Bitmask made from a "or" of all commands within enum membarrier_cmd,
+ * except MEMBARRIER_CMD_QUERY.
+ */
+#define MEMBARRIER_CMD_BITMASK \
+ (MEMBARRIER_CMD_SHARED | MEMBARRIER_CMD_PRIVATE_EXPEDITED)
+
+static void ipi_mb(void *info)
+{
+ smp_mb(); /* IPIs should be serializing but paranoid. */
+}
+
+static void membarrier_private_expedited(void)
+{
+ int cpu;
+ bool fallback = false;
+ cpumask_var_t tmpmask;
+
+ if (num_online_cpus() == 1)
+ return;
+
+ /*
+ * Matches memory barriers around rq->curr modification in
+ * scheduler.
+ */
+ smp_mb(); /* system call entry is not a mb. */
+
+ /*
+ * Expedited membarrier commands guarantee that they won't
+ * block, hence the GFP_NOWAIT allocation flag and fallback
+ * implementation.
+ */
+ if (!zalloc_cpumask_var(&tmpmask, GFP_NOWAIT)) {
+ /* Fallback for OOM. */
+ fallback = true;
+ }
+
+ cpus_read_lock();
+ for_each_online_cpu(cpu) {
+ struct task_struct *p;
+
+ /*
+ * Skipping the current CPU is OK even through we can be
+ * migrated at any point. The current CPU, at the point
+ * where we read raw_smp_processor_id(), is ensured to
+ * be in program order with respect to the caller
+ * thread. Therefore, we can skip this CPU from the
+ * iteration.
+ */
+ if (cpu == raw_smp_processor_id())
+ continue;
+ rcu_read_lock();
+ p = task_rcu_dereference(&cpu_rq(cpu)->curr);
+ if (p && p->mm == current->mm) {
+ if (!fallback)
+ __cpumask_set_cpu(cpu, tmpmask);
+ else
+ smp_call_function_single(cpu, ipi_mb, NULL, 1);
+ }
+ rcu_read_unlock();
+ }
+ if (!fallback) {
+ smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
+ free_cpumask_var(tmpmask);
+ }
+ cpus_read_unlock();
+
+ /*
+ * Memory barrier on the caller thread _after_ we finished
+ * waiting for the last IPI. Matches memory barriers around
+ * rq->curr modification in scheduler.
+ */
+ smp_mb(); /* exit from system call is not a mb */
+}
+
+/**
+ * sys_membarrier - issue memory barriers on a set of threads
+ * @cmd: Takes command values defined in enum membarrier_cmd.
+ * @flags: Currently needs to be 0. For future extensions.
+ *
+ * If this system call is not implemented, -ENOSYS is returned. If the
+ * command specified does not exist, not available on the running
+ * kernel, or if the command argument is invalid, this system call
+ * returns -EINVAL. For a given command, with flags argument set to 0,
+ * this system call is guaranteed to always return the same value until
+ * reboot.
+ *
+ * All memory accesses performed in program order from each targeted thread
+ * is guaranteed to be ordered with respect to sys_membarrier(). If we use
+ * the semantic "barrier()" to represent a compiler barrier forcing memory
+ * accesses to be performed in program order across the barrier, and
+ * smp_mb() to represent explicit memory barriers forcing full memory
+ * ordering across the barrier, we have the following ordering table for
+ * each pair of barrier(), sys_membarrier() and smp_mb():
+ *
+ * The pair ordering is detailed as (O: ordered, X: not ordered):
+ *
+ * barrier() smp_mb() sys_membarrier()
+ * barrier() X X O
+ * smp_mb() X O O
+ * sys_membarrier() O O O
+ */
+SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
+{
+ if (unlikely(flags))
+ return -EINVAL;
+ switch (cmd) {
+ case MEMBARRIER_CMD_QUERY:
+ {
+ int cmd_mask = MEMBARRIER_CMD_BITMASK;
+
+ if (tick_nohz_full_enabled())
+ cmd_mask &= ~MEMBARRIER_CMD_SHARED;
+ return cmd_mask;
+ }
+ case MEMBARRIER_CMD_SHARED:
+ /* MEMBARRIER_CMD_SHARED is not compatible with nohz_full. */
+ if (tick_nohz_full_enabled())
+ return -EINVAL;
+ if (num_online_cpus() > 1)
+ synchronize_sched();
+ return 0;
+ case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
+ membarrier_private_expedited();
+ return 0;
+ default:
+ return -EINVAL;
+ }
+}
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 45caf93..0af5ca9 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -970,7 +970,7 @@ static void update_curr_rt(struct rq *rq)
return;
/* Kick cpufreq (see the comment in kernel/sched/sched.h). */
- cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_RT);
+ cpufreq_update_util(rq, SCHED_CPUFREQ_RT);
schedstat_set(curr->se.statistics.exec_max,
max(curr->se.statistics.exec_max, delta_exec));
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index eeef1a3..6ed7962 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -769,7 +769,7 @@ struct rq {
#ifdef CONFIG_SCHED_HRTICK
#ifdef CONFIG_SMP
int hrtick_csd_pending;
- struct call_single_data hrtick_csd;
+ call_single_data_t hrtick_csd;
#endif
struct hrtimer hrtick_timer;
#endif
@@ -1120,11 +1120,15 @@ extern int group_balance_cpu(struct sched_group *sg);
#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
void register_sched_domain_sysctl(void);
+void dirty_sched_domain_sysctl(int cpu);
void unregister_sched_domain_sysctl(void);
#else
static inline void register_sched_domain_sysctl(void)
{
}
+static inline void dirty_sched_domain_sysctl(int cpu)
+{
+}
static inline void unregister_sched_domain_sysctl(void)
{
}
@@ -2070,19 +2074,13 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags)
{
struct update_util_data *data;
- data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data));
+ data = rcu_dereference_sched(*per_cpu_ptr(&cpufreq_update_util_data,
+ cpu_of(rq)));
if (data)
data->func(data, rq_clock(rq), flags);
}
-
-static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags)
-{
- if (cpu_of(rq) == smp_processor_id())
- cpufreq_update_util(rq, flags);
-}
#else
static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
-static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags) {}
#endif /* CONFIG_CPU_FREQ */
#ifdef arch_scale_freq_capacity
diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c
index 3d5610d..2227e18 100644
--- a/kernel/sched/swait.c
+++ b/kernel/sched/swait.c
@@ -33,9 +33,6 @@ void swake_up(struct swait_queue_head *q)
{
unsigned long flags;
- if (!swait_active(q))
- return;
-
raw_spin_lock_irqsave(&q->lock, flags);
swake_up_locked(q);
raw_spin_unlock_irqrestore(&q->lock, flags);
@@ -51,9 +48,6 @@ void swake_up_all(struct swait_queue_head *q)
struct swait_queue *curr;
LIST_HEAD(tmp);
- if (!swait_active(q))
- return;
-
raw_spin_lock_irq(&q->lock);
list_splice_init(&q->task_list, &tmp);
while (!list_empty(&tmp)) {
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 79895ae..6f7b439 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -261,8 +261,6 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd)
static int init_rootdomain(struct root_domain *rd)
{
- memset(rd, 0, sizeof(*rd));
-
if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL))
goto out;
if (!zalloc_cpumask_var(&rd->online, GFP_KERNEL))
@@ -311,7 +309,7 @@ static struct root_domain *alloc_rootdomain(void)
{
struct root_domain *rd;
- rd = kmalloc(sizeof(*rd), GFP_KERNEL);
+ rd = kzalloc(sizeof(*rd), GFP_KERNEL);
if (!rd)
return NULL;
@@ -337,7 +335,8 @@ static void free_sched_groups(struct sched_group *sg, int free_sgc)
if (free_sgc && atomic_dec_and_test(&sg->sgc->ref))
kfree(sg->sgc);
- kfree(sg);
+ if (atomic_dec_and_test(&sg->ref))
+ kfree(sg);
sg = tmp;
} while (sg != first);
}
@@ -345,15 +344,12 @@ static void free_sched_groups(struct sched_group *sg, int free_sgc)
static void destroy_sched_domain(struct sched_domain *sd)
{
/*
- * If its an overlapping domain it has private groups, iterate and
- * nuke them all.
+ * A normal sched domain may have multiple group references, an
+ * overlapping domain, having private groups, only one. Iterate,
+ * dropping group/capacity references, freeing where none remain.
*/
- if (sd->flags & SD_OVERLAP) {
- free_sched_groups(sd->groups, 1);
- } else if (atomic_dec_and_test(&sd->groups->ref)) {
- kfree(sd->groups->sgc);
- kfree(sd->groups);
- }
+ free_sched_groups(sd->groups, 1);
+
if (sd->shared && atomic_dec_and_test(&sd->shared->ref))
kfree(sd->shared);
kfree(sd);
@@ -463,6 +459,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
rq_attach_root(rq, rd);
tmp = rq->sd;
rcu_assign_pointer(rq->sd, sd);
+ dirty_sched_domain_sysctl(cpu);
destroy_sched_domains(tmp);
update_top_cache_domain(cpu);
@@ -670,6 +667,7 @@ build_group_from_child_sched_domain(struct sched_domain *sd, int cpu)
else
cpumask_copy(sg_span, sched_domain_span(sd));
+ atomic_inc(&sg->ref);
return sg;
}
@@ -1595,7 +1593,7 @@ static void __sdt_free(const struct cpumask *cpu_map)
}
}
-struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
+static struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
const struct cpumask *cpu_map, struct sched_domain_attr *attr,
struct sched_domain *child, int cpu)
{
@@ -1854,7 +1852,17 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
/* Let the architecture update CPU core mappings: */
new_topology = arch_update_cpu_topology();
- n = doms_new ? ndoms_new : 0;
+ if (!doms_new) {
+ WARN_ON_ONCE(dattr_new);
+ n = 0;
+ doms_new = alloc_sched_domains(1);
+ if (doms_new) {
+ n = 1;
+ cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
+ }
+ } else {
+ n = ndoms_new;
+ }
/* Destroy deleted domains: */
for (i = 0; i < ndoms_cur; i++) {
@@ -1870,11 +1878,10 @@ match1:
}
n = ndoms_cur;
- if (doms_new == NULL) {
+ if (!doms_new) {
n = 0;
doms_new = &fallback_doms;
cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
- WARN_ON_ONCE(dattr_new);
}
/* Build new domains: */
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 17f11c6..d6afed6 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -70,9 +70,10 @@ static void __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode,
list_for_each_entry_safe(curr, next, &wq_head->head, entry) {
unsigned flags = curr->flags;
-
- if (curr->func(curr, mode, wake_flags, key) &&
- (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
+ int ret = curr->func(curr, mode, wake_flags, key);
+ if (ret < 0)
+ break;
+ if (ret && (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
break;
}
}
diff --git a/kernel/signal.c b/kernel/signal.c
index caed913..ed804a4 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1194,7 +1194,11 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
recalc_sigpending_and_wake(t);
}
}
- if (action->sa.sa_handler == SIG_DFL)
+ /*
+ * Don't clear SIGNAL_UNKILLABLE for traced tasks, users won't expect
+ * debugging to leave init killable.
+ */
+ if (action->sa.sa_handler == SIG_DFL && !t->ptrace)
t->signal->flags &= ~SIGNAL_UNKILLABLE;
ret = specific_send_sig_info(sig, info, t);
spin_unlock_irqrestore(&t->sighand->siglock, flags);
@@ -3303,12 +3307,15 @@ SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set)
#ifdef CONFIG_COMPAT
COMPAT_SYSCALL_DEFINE1(sigpending, compat_old_sigset_t __user *, set32)
{
+#ifdef __BIG_ENDIAN
sigset_t set;
- int err = do_sigpending(&set, sizeof(old_sigset_t));
- if (err == 0)
- if (copy_to_user(set32, &set, sizeof(old_sigset_t)))
- err = -EFAULT;
+ int err = do_sigpending(&set, sizeof(set.sig[0]));
+ if (!err)
+ err = put_user(set.sig[0], set32);
return err;
+#else
+ return sys_rt_sigpending((sigset_t __user *)set32, sizeof(*set32));
+#endif
}
#endif
diff --git a/kernel/smp.c b/kernel/smp.c
index 3061483..81cfca9 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -28,7 +28,7 @@ enum {
};
struct call_function_data {
- struct call_single_data __percpu *csd;
+ call_single_data_t __percpu *csd;
cpumask_var_t cpumask;
cpumask_var_t cpumask_ipi;
};
@@ -51,7 +51,7 @@ int smpcfd_prepare_cpu(unsigned int cpu)
free_cpumask_var(cfd->cpumask);
return -ENOMEM;
}
- cfd->csd = alloc_percpu(struct call_single_data);
+ cfd->csd = alloc_percpu(call_single_data_t);
if (!cfd->csd) {
free_cpumask_var(cfd->cpumask);
free_cpumask_var(cfd->cpumask_ipi);
@@ -103,12 +103,12 @@ void __init call_function_init(void)
* previous function call. For multi-cpu calls its even more interesting
* as we'll have to ensure no other cpu is observing our csd.
*/
-static __always_inline void csd_lock_wait(struct call_single_data *csd)
+static __always_inline void csd_lock_wait(call_single_data_t *csd)
{
smp_cond_load_acquire(&csd->flags, !(VAL & CSD_FLAG_LOCK));
}
-static __always_inline void csd_lock(struct call_single_data *csd)
+static __always_inline void csd_lock(call_single_data_t *csd)
{
csd_lock_wait(csd);
csd->flags |= CSD_FLAG_LOCK;
@@ -116,12 +116,12 @@ static __always_inline void csd_lock(struct call_single_data *csd)
/*
* prevent CPU from reordering the above assignment
* to ->flags with any subsequent assignments to other
- * fields of the specified call_single_data structure:
+ * fields of the specified call_single_data_t structure:
*/
smp_wmb();
}
-static __always_inline void csd_unlock(struct call_single_data *csd)
+static __always_inline void csd_unlock(call_single_data_t *csd)
{
WARN_ON(!(csd->flags & CSD_FLAG_LOCK));
@@ -131,14 +131,14 @@ static __always_inline void csd_unlock(struct call_single_data *csd)
smp_store_release(&csd->flags, 0);
}
-static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_data, csd_data);
+static DEFINE_PER_CPU_SHARED_ALIGNED(call_single_data_t, csd_data);
/*
- * Insert a previously allocated call_single_data element
+ * Insert a previously allocated call_single_data_t element
* for execution on the given CPU. data must already have
* ->func, ->info, and ->flags set.
*/
-static int generic_exec_single(int cpu, struct call_single_data *csd,
+static int generic_exec_single(int cpu, call_single_data_t *csd,
smp_call_func_t func, void *info)
{
if (cpu == smp_processor_id()) {
@@ -210,7 +210,7 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline)
{
struct llist_head *head;
struct llist_node *entry;
- struct call_single_data *csd, *csd_next;
+ call_single_data_t *csd, *csd_next;
static bool warned;
WARN_ON(!irqs_disabled());
@@ -268,8 +268,10 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline)
int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
int wait)
{
- struct call_single_data *csd;
- struct call_single_data csd_stack = { .flags = CSD_FLAG_LOCK | CSD_FLAG_SYNCHRONOUS };
+ call_single_data_t *csd;
+ call_single_data_t csd_stack = {
+ .flags = CSD_FLAG_LOCK | CSD_FLAG_SYNCHRONOUS,
+ };
int this_cpu;
int err;
@@ -321,7 +323,7 @@ EXPORT_SYMBOL(smp_call_function_single);
* NOTE: Be careful, there is unfortunately no current debugging facility to
* validate the correctness of this serialization.
*/
-int smp_call_function_single_async(int cpu, struct call_single_data *csd)
+int smp_call_function_single_async(int cpu, call_single_data_t *csd)
{
int err = 0;
@@ -444,7 +446,7 @@ void smp_call_function_many(const struct cpumask *mask,
cpumask_clear(cfd->cpumask_ipi);
for_each_cpu(cpu, cfd->cpumask) {
- struct call_single_data *csd = per_cpu_ptr(cfd->csd, cpu);
+ call_single_data_t *csd = per_cpu_ptr(cfd->csd, cpu);
csd_lock(csd);
if (wait)
@@ -460,7 +462,7 @@ void smp_call_function_many(const struct cpumask *mask,
if (wait) {
for_each_cpu(cpu, cfd->cpumask) {
- struct call_single_data *csd;
+ call_single_data_t *csd;
csd = per_cpu_ptr(cfd->csd, cpu);
csd_lock_wait(csd);
diff --git a/kernel/task_work.c b/kernel/task_work.c
index d513051..836a72a 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -96,20 +96,16 @@ void task_work_run(void)
* work->func() can do task_work_add(), do not set
* work_exited unless the list is empty.
*/
+ raw_spin_lock_irq(&task->pi_lock);
do {
work = READ_ONCE(task->task_works);
head = !work && (task->flags & PF_EXITING) ?
&work_exited : NULL;
} while (cmpxchg(&task->task_works, work, head) != work);
+ raw_spin_unlock_irq(&task->pi_lock);
if (!work)
break;
- /*
- * Synchronize with task_work_cancel(). It can't remove
- * the first entry == work, cmpxchg(task_works) should
- * fail, but it can play with *work and other entries.
- */
- raw_spin_unlock_wait(&task->pi_lock);
do {
next = work->next;
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 0b8ff7d..ec09ce9 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -28,6 +28,7 @@
#include <linux/workqueue.h>
#include <linux/freezer.h>
#include <linux/compat.h>
+#include <linux/module.h>
#include "posix-timers.h"
@@ -56,9 +57,9 @@ static ktime_t freezer_delta;
static DEFINE_SPINLOCK(freezer_delta_lock);
#endif
+#ifdef CONFIG_RTC_CLASS
static struct wakeup_source *ws;
-#ifdef CONFIG_RTC_CLASS
/* rtc timer and device for setting alarm wakeups at suspend */
static struct rtc_timer rtctimer;
static struct rtc_device *rtcdev;
@@ -89,6 +90,7 @@ static int alarmtimer_rtc_add_device(struct device *dev,
{
unsigned long flags;
struct rtc_device *rtc = to_rtc_device(dev);
+ struct wakeup_source *__ws;
if (rtcdev)
return -EBUSY;
@@ -98,13 +100,25 @@ static int alarmtimer_rtc_add_device(struct device *dev,
if (!device_may_wakeup(rtc->dev.parent))
return -1;
+ __ws = wakeup_source_register("alarmtimer");
+
spin_lock_irqsave(&rtcdev_lock, flags);
if (!rtcdev) {
+ if (!try_module_get(rtc->owner)) {
+ spin_unlock_irqrestore(&rtcdev_lock, flags);
+ return -1;
+ }
+
rtcdev = rtc;
/* hold a reference so it doesn't go away */
get_device(dev);
+ ws = __ws;
+ __ws = NULL;
}
spin_unlock_irqrestore(&rtcdev_lock, flags);
+
+ wakeup_source_unregister(__ws);
+
return 0;
}
@@ -860,7 +874,6 @@ static int __init alarmtimer_init(void)
error = PTR_ERR(pdev);
goto out_drv;
}
- ws = wakeup_source_register("alarmtimer");
return 0;
out_drv:
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index a3bd5db..8585ad6 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -799,7 +799,6 @@ static void check_thread_timers(struct task_struct *tsk,
struct list_head *firing)
{
struct list_head *timers = tsk->cpu_timers;
- struct signal_struct *const sig = tsk->signal;
struct task_cputime *tsk_expires = &tsk->cputime_expires;
u64 expires;
unsigned long soft;
@@ -823,10 +822,9 @@ static void check_thread_timers(struct task_struct *tsk,
/*
* Check for the special case thread timers.
*/
- soft = READ_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_cur);
+ soft = task_rlimit(tsk, RLIMIT_RTTIME);
if (soft != RLIM_INFINITY) {
- unsigned long hard =
- READ_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max);
+ unsigned long hard = task_rlimit_max(tsk, RLIMIT_RTTIME);
if (hard != RLIM_INFINITY &&
tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) {
@@ -847,7 +845,8 @@ static void check_thread_timers(struct task_struct *tsk,
*/
if (soft < hard) {
soft += USEC_PER_SEC;
- sig->rlim[RLIMIT_RTTIME].rlim_cur = soft;
+ tsk->signal->rlim[RLIMIT_RTTIME].rlim_cur =
+ soft;
}
if (print_fatal_signals) {
pr_info("RT Watchdog Timeout (soft): %s[%d]\n",
@@ -938,11 +937,10 @@ static void check_process_timers(struct task_struct *tsk,
SIGPROF);
check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime,
SIGVTALRM);
- soft = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
+ soft = task_rlimit(tsk, RLIMIT_CPU);
if (soft != RLIM_INFINITY) {
unsigned long psecs = div_u64(ptime, NSEC_PER_SEC);
- unsigned long hard =
- READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_max);
+ unsigned long hard = task_rlimit_max(tsk, RLIMIT_CPU);
u64 x;
if (psecs >= hard) {
/*
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index cedafa0..8ea4fb3 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -637,9 +637,7 @@ static inline void tk_update_ktime_data(struct timekeeper *tk)
tk->ktime_sec = seconds;
/* Update the monotonic raw base */
- seconds = tk->raw_sec;
- nsec = (u32)(tk->tkr_raw.xtime_nsec >> tk->tkr_raw.shift);
- tk->tkr_raw.base = ns_to_ktime(seconds * NSEC_PER_SEC + nsec);
+ tk->tkr_raw.base = ns_to_ktime(tk->raw_sec * NSEC_PER_SEC);
}
/* must hold timekeeper_lock */
@@ -2066,7 +2064,7 @@ void update_wall_time(void)
goto out;
/* Do some additional sanity checking */
- timekeeping_check_update(real_tk, offset);
+ timekeeping_check_update(tk, offset);
/*
* With NO_HZ we may have to accumulate many cycle_intervals
diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c
index 38bc4d2..0754cad 100644
--- a/kernel/time/timekeeping_debug.c
+++ b/kernel/time/timekeeping_debug.c
@@ -19,6 +19,7 @@
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/seq_file.h>
+#include <linux/suspend.h>
#include <linux/time.h>
#include "timekeeping_internal.h"
@@ -75,7 +76,7 @@ void tk_debug_account_sleep_time(struct timespec64 *t)
int bin = min(fls(t->tv_sec), NUM_BINS-1);
sleep_time_bin[bin]++;
- printk_deferred(KERN_INFO "Suspended for %lld.%03lu seconds\n",
- (s64)t->tv_sec, t->tv_nsec / NSEC_PER_MSEC);
+ pm_deferred_pr_dbg("Timekeeping suspended for %lld.%03lu seconds\n",
+ (s64)t->tv_sec, t->tv_nsec / NSEC_PER_MSEC);
}
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 71ce3f4..f2674a0 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -203,6 +203,7 @@ struct timer_base {
bool migration_enabled;
bool nohz_active;
bool is_idle;
+ bool must_forward_clk;
DECLARE_BITMAP(pending_map, WHEEL_SIZE);
struct hlist_head vectors[WHEEL_SIZE];
} ____cacheline_aligned;
@@ -856,13 +857,19 @@ get_target_base(struct timer_base *base, unsigned tflags)
static inline void forward_timer_base(struct timer_base *base)
{
- unsigned long jnow = READ_ONCE(jiffies);
+ unsigned long jnow;
/*
- * We only forward the base when it's idle and we have a delta between
- * base clock and jiffies.
+ * We only forward the base when we are idle or have just come out of
+ * idle (must_forward_clk logic), and have a delta between base clock
+ * and jiffies. In the common case, run_timers will take care of it.
*/
- if (!base->is_idle || (long) (jnow - base->clk) < 2)
+ if (likely(!base->must_forward_clk))
+ return;
+
+ jnow = READ_ONCE(jiffies);
+ base->must_forward_clk = base->is_idle;
+ if ((long)(jnow - base->clk) < 2)
return;
/*
@@ -938,6 +945,11 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
* same array bucket then just return:
*/
if (timer_pending(timer)) {
+ /*
+ * The downside of this optimization is that it can result in
+ * larger granularity than you would get from adding a new
+ * timer with this expiry.
+ */
if (timer->expires == expires)
return 1;
@@ -948,6 +960,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
* dequeue/enqueue dance.
*/
base = lock_timer_base(timer, &flags);
+ forward_timer_base(base);
clk = base->clk;
idx = calc_wheel_index(expires, clk);
@@ -964,6 +977,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
}
} else {
base = lock_timer_base(timer, &flags);
+ forward_timer_base(base);
}
ret = detach_if_pending(timer, base, false);
@@ -991,12 +1005,10 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
raw_spin_lock(&base->lock);
WRITE_ONCE(timer->flags,
(timer->flags & ~TIMER_BASEMASK) | base->cpu);
+ forward_timer_base(base);
}
}
- /* Try to forward a stale timer base clock */
- forward_timer_base(base);
-
timer->expires = expires;
/*
* If 'idx' was calculated above and the base time did not advance
@@ -1112,6 +1124,7 @@ void add_timer_on(struct timer_list *timer, int cpu)
WRITE_ONCE(timer->flags,
(timer->flags & ~TIMER_BASEMASK) | cpu);
}
+ forward_timer_base(base);
debug_activate(timer, timer->expires);
internal_add_timer(base, timer);
@@ -1495,12 +1508,18 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
base->is_idle = false;
} else {
if (!is_max_delta)
- expires = basem + (nextevt - basej) * TICK_NSEC;
+ expires = basem + (u64)(nextevt - basej) * TICK_NSEC;
/*
- * If we expect to sleep more than a tick, mark the base idle:
+ * If we expect to sleep more than a tick, mark the base idle.
+ * Also the tick is stopped so any added timer must forward
+ * the base clk itself to keep granularity small. This idle
+ * logic is only maintained for the BASE_STD base, deferrable
+ * timers may still see large granularity skew (by design).
*/
- if ((expires - basem) > TICK_NSEC)
+ if ((expires - basem) > TICK_NSEC) {
+ base->must_forward_clk = true;
base->is_idle = true;
+ }
}
raw_spin_unlock(&base->lock);
@@ -1611,6 +1630,19 @@ static __latent_entropy void run_timer_softirq(struct softirq_action *h)
{
struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
+ /*
+ * must_forward_clk must be cleared before running timers so that any
+ * timer functions that call mod_timer will not try to forward the
+ * base. idle trcking / clock forwarding logic is only used with
+ * BASE_STD timers.
+ *
+ * The deferrable base does not do idle tracking at all, so we do
+ * not forward it. This can result in very large variations in
+ * granularity for deferrable timers, but they can be deferred for
+ * long periods due to idle.
+ */
+ base->must_forward_clk = false;
+
__run_timers(base);
if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active)
__run_timers(this_cpu_ptr(&timer_bases[BASE_DEF]));
diff --git a/kernel/torture.c b/kernel/torture.c
index 55de965..637e172 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -117,7 +117,7 @@ bool torture_offline(int cpu, long *n_offl_attempts, long *n_offl_successes,
torture_type, cpu);
(*n_offl_successes)++;
delta = jiffies - starttime;
- sum_offl += delta;
+ *sum_offl += delta;
if (*min_offl < 0) {
*min_offl = delta;
*max_offl = delta;
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index bc364f8..2a685b4 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -27,6 +27,7 @@
#include <linux/time.h>
#include <linux/uaccess.h>
#include <linux/list.h>
+#include <linux/blk-cgroup.h>
#include "../../block/blk.h"
@@ -46,10 +47,16 @@ static __cacheline_aligned_in_smp DEFINE_SPINLOCK(running_trace_lock);
/* Select an alternative, minimalistic output than the original one */
#define TRACE_BLK_OPT_CLASSIC 0x1
+#define TRACE_BLK_OPT_CGROUP 0x2
+#define TRACE_BLK_OPT_CGNAME 0x4
static struct tracer_opt blk_tracer_opts[] = {
/* Default disable the minimalistic output */
{ TRACER_OPT(blk_classic, TRACE_BLK_OPT_CLASSIC) },
+#ifdef CONFIG_BLK_CGROUP
+ { TRACER_OPT(blk_cgroup, TRACE_BLK_OPT_CGROUP) },
+ { TRACER_OPT(blk_cgname, TRACE_BLK_OPT_CGNAME) },
+#endif
{ }
};
@@ -68,7 +75,8 @@ static void blk_unregister_tracepoints(void);
* Send out a notify message.
*/
static void trace_note(struct blk_trace *bt, pid_t pid, int action,
- const void *data, size_t len)
+ const void *data, size_t len,
+ union kernfs_node_id *cgid)
{
struct blk_io_trace *t;
struct ring_buffer_event *event = NULL;
@@ -76,12 +84,13 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action,
int pc = 0;
int cpu = smp_processor_id();
bool blk_tracer = blk_tracer_enabled;
+ ssize_t cgid_len = cgid ? sizeof(*cgid) : 0;
if (blk_tracer) {
buffer = blk_tr->trace_buffer.buffer;
pc = preempt_count();
event = trace_buffer_lock_reserve(buffer, TRACE_BLK,
- sizeof(*t) + len,
+ sizeof(*t) + len + cgid_len,
0, pc);
if (!event)
return;
@@ -92,17 +101,19 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action,
if (!bt->rchan)
return;
- t = relay_reserve(bt->rchan, sizeof(*t) + len);
+ t = relay_reserve(bt->rchan, sizeof(*t) + len + cgid_len);
if (t) {
t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
t->time = ktime_to_ns(ktime_get());
record_it:
t->device = bt->dev;
- t->action = action;
+ t->action = action | (cgid ? __BLK_TN_CGROUP : 0);
t->pid = pid;
t->cpu = cpu;
- t->pdu_len = len;
- memcpy((void *) t + sizeof(*t), data, len);
+ t->pdu_len = len + cgid_len;
+ if (cgid)
+ memcpy((void *)t + sizeof(*t), cgid, cgid_len);
+ memcpy((void *) t + sizeof(*t) + cgid_len, data, len);
if (blk_tracer)
trace_buffer_unlock_commit(blk_tr, buffer, event, 0, pc);
@@ -122,7 +133,7 @@ static void trace_note_tsk(struct task_struct *tsk)
spin_lock_irqsave(&running_trace_lock, flags);
list_for_each_entry(bt, &running_trace_list, running_list) {
trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm,
- sizeof(tsk->comm));
+ sizeof(tsk->comm), NULL);
}
spin_unlock_irqrestore(&running_trace_lock, flags);
}
@@ -139,11 +150,12 @@ static void trace_note_time(struct blk_trace *bt)
words[1] = now.tv_nsec;
local_irq_save(flags);
- trace_note(bt, 0, BLK_TN_TIMESTAMP, words, sizeof(words));
+ trace_note(bt, 0, BLK_TN_TIMESTAMP, words, sizeof(words), NULL);
local_irq_restore(flags);
}
-void __trace_note_message(struct blk_trace *bt, const char *fmt, ...)
+void __trace_note_message(struct blk_trace *bt, struct blkcg *blkcg,
+ const char *fmt, ...)
{
int n;
va_list args;
@@ -167,7 +179,14 @@ void __trace_note_message(struct blk_trace *bt, const char *fmt, ...)
n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args);
va_end(args);
- trace_note(bt, 0, BLK_TN_MESSAGE, buf, n);
+ if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP))
+ blkcg = NULL;
+#ifdef CONFIG_BLK_CGROUP
+ trace_note(bt, 0, BLK_TN_MESSAGE, buf, n,
+ blkcg ? cgroup_get_kernfs_id(blkcg->css.cgroup) : NULL);
+#else
+ trace_note(bt, 0, BLK_TN_MESSAGE, buf, n, NULL);
+#endif
local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(__trace_note_message);
@@ -204,7 +223,7 @@ static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ),
*/
static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
int op, int op_flags, u32 what, int error, int pdu_len,
- void *pdu_data)
+ void *pdu_data, union kernfs_node_id *cgid)
{
struct task_struct *tsk = current;
struct ring_buffer_event *event = NULL;
@@ -215,6 +234,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
pid_t pid;
int cpu, pc = 0;
bool blk_tracer = blk_tracer_enabled;
+ ssize_t cgid_len = cgid ? sizeof(*cgid) : 0;
if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer))
return;
@@ -229,6 +249,8 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
what |= BLK_TC_ACT(BLK_TC_DISCARD);
if (op == REQ_OP_FLUSH)
what |= BLK_TC_ACT(BLK_TC_FLUSH);
+ if (cgid)
+ what |= __BLK_TA_CGROUP;
pid = tsk->pid;
if (act_log_check(bt, what, sector, pid))
@@ -241,7 +263,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
buffer = blk_tr->trace_buffer.buffer;
pc = preempt_count();
event = trace_buffer_lock_reserve(buffer, TRACE_BLK,
- sizeof(*t) + pdu_len,
+ sizeof(*t) + pdu_len + cgid_len,
0, pc);
if (!event)
return;
@@ -258,7 +280,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
* from coming in and stepping on our toes.
*/
local_irq_save(flags);
- t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len);
+ t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len + cgid_len);
if (t) {
sequence = per_cpu_ptr(bt->sequence, cpu);
@@ -280,10 +302,12 @@ record_it:
t->action = what;
t->device = bt->dev;
t->error = error;
- t->pdu_len = pdu_len;
+ t->pdu_len = pdu_len + cgid_len;
+ if (cgid_len)
+ memcpy((void *)t + sizeof(*t), cgid, cgid_len);
if (pdu_len)
- memcpy((void *) t + sizeof(*t), pdu_data, pdu_len);
+ memcpy((void *)t + sizeof(*t) + cgid_len, pdu_data, pdu_len);
if (blk_tracer) {
trace_buffer_unlock_commit(blk_tr, buffer, event, 0, pc);
@@ -359,7 +383,7 @@ static ssize_t blk_msg_write(struct file *filp, const char __user *buffer,
return PTR_ERR(msg);
bt = filp->private_data;
- __trace_note_message(bt, "%s", msg);
+ __trace_note_message(bt, NULL, "%s", msg);
kfree(msg);
return count;
@@ -684,6 +708,36 @@ void blk_trace_shutdown(struct request_queue *q)
}
}
+#ifdef CONFIG_BLK_CGROUP
+static union kernfs_node_id *
+blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio)
+{
+ struct blk_trace *bt = q->blk_trace;
+
+ if (!bt || !(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP))
+ return NULL;
+
+ if (!bio->bi_css)
+ return NULL;
+ return cgroup_get_kernfs_id(bio->bi_css->cgroup);
+}
+#else
+static union kernfs_node_id *
+blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio)
+{
+ return NULL;
+}
+#endif
+
+static union kernfs_node_id *
+blk_trace_request_get_cgid(struct request_queue *q, struct request *rq)
+{
+ if (!rq->bio)
+ return NULL;
+ /* Use the first bio */
+ return blk_trace_bio_get_cgid(q, rq->bio);
+}
+
/*
* blktrace probes
*/
@@ -694,13 +748,15 @@ void blk_trace_shutdown(struct request_queue *q)
* @error: return status to log
* @nr_bytes: number of completed bytes
* @what: the action
+ * @cgid: the cgroup info
*
* Description:
* Records an action against a request. Will log the bio offset + size.
*
**/
static void blk_add_trace_rq(struct request *rq, int error,
- unsigned int nr_bytes, u32 what)
+ unsigned int nr_bytes, u32 what,
+ union kernfs_node_id *cgid)
{
struct blk_trace *bt = rq->q->blk_trace;
@@ -713,32 +769,36 @@ static void blk_add_trace_rq(struct request *rq, int error,
what |= BLK_TC_ACT(BLK_TC_FS);
__blk_add_trace(bt, blk_rq_trace_sector(rq), nr_bytes, req_op(rq),
- rq->cmd_flags, what, error, 0, NULL);
+ rq->cmd_flags, what, error, 0, NULL, cgid);
}
static void blk_add_trace_rq_insert(void *ignore,
struct request_queue *q, struct request *rq)
{
- blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_INSERT);
+ blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_INSERT,
+ blk_trace_request_get_cgid(q, rq));
}
static void blk_add_trace_rq_issue(void *ignore,
struct request_queue *q, struct request *rq)
{
- blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_ISSUE);
+ blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_ISSUE,
+ blk_trace_request_get_cgid(q, rq));
}
static void blk_add_trace_rq_requeue(void *ignore,
struct request_queue *q,
struct request *rq)
{
- blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_REQUEUE);
+ blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_REQUEUE,
+ blk_trace_request_get_cgid(q, rq));
}
static void blk_add_trace_rq_complete(void *ignore, struct request *rq,
int error, unsigned int nr_bytes)
{
- blk_add_trace_rq(rq, error, nr_bytes, BLK_TA_COMPLETE);
+ blk_add_trace_rq(rq, error, nr_bytes, BLK_TA_COMPLETE,
+ blk_trace_request_get_cgid(rq->q, rq));
}
/**
@@ -753,7 +813,7 @@ static void blk_add_trace_rq_complete(void *ignore, struct request *rq,
*
**/
static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
- u32 what, int error)
+ u32 what, int error, union kernfs_node_id *cgid)
{
struct blk_trace *bt = q->blk_trace;
@@ -761,20 +821,22 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
return;
__blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
- bio_op(bio), bio->bi_opf, what, error, 0, NULL);
+ bio_op(bio), bio->bi_opf, what, error, 0, NULL, cgid);
}
static void blk_add_trace_bio_bounce(void *ignore,
struct request_queue *q, struct bio *bio)
{
- blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0);
+ blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0,
+ blk_trace_bio_get_cgid(q, bio));
}
static void blk_add_trace_bio_complete(void *ignore,
struct request_queue *q, struct bio *bio,
int error)
{
- blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error);
+ blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error,
+ blk_trace_bio_get_cgid(q, bio));
}
static void blk_add_trace_bio_backmerge(void *ignore,
@@ -782,7 +844,8 @@ static void blk_add_trace_bio_backmerge(void *ignore,
struct request *rq,
struct bio *bio)
{
- blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0);
+ blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0,
+ blk_trace_bio_get_cgid(q, bio));
}
static void blk_add_trace_bio_frontmerge(void *ignore,
@@ -790,13 +853,15 @@ static void blk_add_trace_bio_frontmerge(void *ignore,
struct request *rq,
struct bio *bio)
{
- blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0);
+ blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0,
+ blk_trace_bio_get_cgid(q, bio));
}
static void blk_add_trace_bio_queue(void *ignore,
struct request_queue *q, struct bio *bio)
{
- blk_add_trace_bio(q, bio, BLK_TA_QUEUE, 0);
+ blk_add_trace_bio(q, bio, BLK_TA_QUEUE, 0,
+ blk_trace_bio_get_cgid(q, bio));
}
static void blk_add_trace_getrq(void *ignore,
@@ -804,13 +869,14 @@ static void blk_add_trace_getrq(void *ignore,
struct bio *bio, int rw)
{
if (bio)
- blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0);
+ blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0,
+ blk_trace_bio_get_cgid(q, bio));
else {
struct blk_trace *bt = q->blk_trace;
if (bt)
__blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_GETRQ, 0, 0,
- NULL);
+ NULL, NULL);
}
}
@@ -820,13 +886,14 @@ static void blk_add_trace_sleeprq(void *ignore,
struct bio *bio, int rw)
{
if (bio)
- blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0);
+ blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0,
+ blk_trace_bio_get_cgid(q, bio));
else {
struct blk_trace *bt = q->blk_trace;
if (bt)
__blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_SLEEPRQ,
- 0, 0, NULL);
+ 0, 0, NULL, NULL);
}
}
@@ -835,7 +902,7 @@ static void blk_add_trace_plug(void *ignore, struct request_queue *q)
struct blk_trace *bt = q->blk_trace;
if (bt)
- __blk_add_trace(bt, 0, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL);
+ __blk_add_trace(bt, 0, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL, NULL);
}
static void blk_add_trace_unplug(void *ignore, struct request_queue *q,
@@ -852,7 +919,7 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q,
else
what = BLK_TA_UNPLUG_TIMER;
- __blk_add_trace(bt, 0, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu);
+ __blk_add_trace(bt, 0, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu, NULL);
}
}
@@ -868,7 +935,7 @@ static void blk_add_trace_split(void *ignore,
__blk_add_trace(bt, bio->bi_iter.bi_sector,
bio->bi_iter.bi_size, bio_op(bio), bio->bi_opf,
BLK_TA_SPLIT, bio->bi_status, sizeof(rpdu),
- &rpdu);
+ &rpdu, blk_trace_bio_get_cgid(q, bio));
}
}
@@ -896,12 +963,12 @@ static void blk_add_trace_bio_remap(void *ignore,
return;
r.device_from = cpu_to_be32(dev);
- r.device_to = cpu_to_be32(bio->bi_bdev->bd_dev);
+ r.device_to = cpu_to_be32(bio_dev(bio));
r.sector_from = cpu_to_be64(from);
__blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
bio_op(bio), bio->bi_opf, BLK_TA_REMAP, bio->bi_status,
- sizeof(r), &r);
+ sizeof(r), &r, blk_trace_bio_get_cgid(q, bio));
}
/**
@@ -934,7 +1001,7 @@ static void blk_add_trace_rq_remap(void *ignore,
__blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq),
rq_data_dir(rq), 0, BLK_TA_REMAP, 0,
- sizeof(r), &r);
+ sizeof(r), &r, blk_trace_request_get_cgid(q, rq));
}
/**
@@ -958,7 +1025,8 @@ void blk_add_driver_data(struct request_queue *q,
return;
__blk_add_trace(bt, blk_rq_trace_sector(rq), blk_rq_bytes(rq), 0, 0,
- BLK_TA_DRV_DATA, 0, len, data);
+ BLK_TA_DRV_DATA, 0, len, data,
+ blk_trace_request_get_cgid(q, rq));
}
EXPORT_SYMBOL_GPL(blk_add_driver_data);
@@ -1031,7 +1099,7 @@ static void fill_rwbs(char *rwbs, const struct blk_io_trace *t)
int i = 0;
int tc = t->action >> BLK_TC_SHIFT;
- if (t->action == BLK_TN_MESSAGE) {
+ if ((t->action & ~__BLK_TN_CGROUP) == BLK_TN_MESSAGE) {
rwbs[i++] = 'N';
goto out;
}
@@ -1066,9 +1134,21 @@ const struct blk_io_trace *te_blk_io_trace(const struct trace_entry *ent)
return (const struct blk_io_trace *)ent;
}
-static inline const void *pdu_start(const struct trace_entry *ent)
+static inline const void *pdu_start(const struct trace_entry *ent, bool has_cg)
+{
+ return (void *)(te_blk_io_trace(ent) + 1) +
+ (has_cg ? sizeof(union kernfs_node_id) : 0);
+}
+
+static inline const void *cgid_start(const struct trace_entry *ent)
+{
+ return (void *)(te_blk_io_trace(ent) + 1);
+}
+
+static inline int pdu_real_len(const struct trace_entry *ent, bool has_cg)
{
- return te_blk_io_trace(ent) + 1;
+ return te_blk_io_trace(ent)->pdu_len -
+ (has_cg ? sizeof(union kernfs_node_id) : 0);
}
static inline u32 t_action(const struct trace_entry *ent)
@@ -1096,16 +1176,16 @@ static inline __u16 t_error(const struct trace_entry *ent)
return te_blk_io_trace(ent)->error;
}
-static __u64 get_pdu_int(const struct trace_entry *ent)
+static __u64 get_pdu_int(const struct trace_entry *ent, bool has_cg)
{
- const __u64 *val = pdu_start(ent);
+ const __u64 *val = pdu_start(ent, has_cg);
return be64_to_cpu(*val);
}
static void get_pdu_remap(const struct trace_entry *ent,
- struct blk_io_trace_remap *r)
+ struct blk_io_trace_remap *r, bool has_cg)
{
- const struct blk_io_trace_remap *__r = pdu_start(ent);
+ const struct blk_io_trace_remap *__r = pdu_start(ent, has_cg);
__u64 sector_from = __r->sector_from;
r->device_from = be32_to_cpu(__r->device_from);
@@ -1113,9 +1193,11 @@ static void get_pdu_remap(const struct trace_entry *ent,
r->sector_from = be64_to_cpu(sector_from);
}
-typedef void (blk_log_action_t) (struct trace_iterator *iter, const char *act);
+typedef void (blk_log_action_t) (struct trace_iterator *iter, const char *act,
+ bool has_cg);
-static void blk_log_action_classic(struct trace_iterator *iter, const char *act)
+static void blk_log_action_classic(struct trace_iterator *iter, const char *act,
+ bool has_cg)
{
char rwbs[RWBS_LEN];
unsigned long long ts = iter->ts;
@@ -1131,24 +1213,43 @@ static void blk_log_action_classic(struct trace_iterator *iter, const char *act)
secs, nsec_rem, iter->ent->pid, act, rwbs);
}
-static void blk_log_action(struct trace_iterator *iter, const char *act)
+static void blk_log_action(struct trace_iterator *iter, const char *act,
+ bool has_cg)
{
char rwbs[RWBS_LEN];
const struct blk_io_trace *t = te_blk_io_trace(iter->ent);
fill_rwbs(rwbs, t);
- trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ",
- MAJOR(t->device), MINOR(t->device), act, rwbs);
-}
-
-static void blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent)
+ if (has_cg) {
+ const union kernfs_node_id *id = cgid_start(iter->ent);
+
+ if (blk_tracer_flags.val & TRACE_BLK_OPT_CGNAME) {
+ char blkcg_name_buf[NAME_MAX + 1] = "<...>";
+
+ cgroup_path_from_kernfs_id(id, blkcg_name_buf,
+ sizeof(blkcg_name_buf));
+ trace_seq_printf(&iter->seq, "%3d,%-3d %s %2s %3s ",
+ MAJOR(t->device), MINOR(t->device),
+ blkcg_name_buf, act, rwbs);
+ } else
+ trace_seq_printf(&iter->seq,
+ "%3d,%-3d %x,%-x %2s %3s ",
+ MAJOR(t->device), MINOR(t->device),
+ id->ino, id->generation, act, rwbs);
+ } else
+ trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ",
+ MAJOR(t->device), MINOR(t->device), act, rwbs);
+}
+
+static void blk_log_dump_pdu(struct trace_seq *s,
+ const struct trace_entry *ent, bool has_cg)
{
const unsigned char *pdu_buf;
int pdu_len;
int i, end;
- pdu_buf = pdu_start(ent);
- pdu_len = te_blk_io_trace(ent)->pdu_len;
+ pdu_buf = pdu_start(ent, has_cg);
+ pdu_len = pdu_real_len(ent, has_cg);
if (!pdu_len)
return;
@@ -1179,7 +1280,7 @@ static void blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent)
trace_seq_puts(s, ") ");
}
-static void blk_log_generic(struct trace_seq *s, const struct trace_entry *ent)
+static void blk_log_generic(struct trace_seq *s, const struct trace_entry *ent, bool has_cg)
{
char cmd[TASK_COMM_LEN];
@@ -1187,7 +1288,7 @@ static void blk_log_generic(struct trace_seq *s, const struct trace_entry *ent)
if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) {
trace_seq_printf(s, "%u ", t_bytes(ent));
- blk_log_dump_pdu(s, ent);
+ blk_log_dump_pdu(s, ent, has_cg);
trace_seq_printf(s, "[%s]\n", cmd);
} else {
if (t_sec(ent))
@@ -1199,10 +1300,10 @@ static void blk_log_generic(struct trace_seq *s, const struct trace_entry *ent)
}
static void blk_log_with_error(struct trace_seq *s,
- const struct trace_entry *ent)
+ const struct trace_entry *ent, bool has_cg)
{
if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) {
- blk_log_dump_pdu(s, ent);
+ blk_log_dump_pdu(s, ent, has_cg);
trace_seq_printf(s, "[%d]\n", t_error(ent));
} else {
if (t_sec(ent))
@@ -1215,18 +1316,18 @@ static void blk_log_with_error(struct trace_seq *s,
}
}
-static void blk_log_remap(struct trace_seq *s, const struct trace_entry *ent)
+static void blk_log_remap(struct trace_seq *s, const struct trace_entry *ent, bool has_cg)
{
struct blk_io_trace_remap r = { .device_from = 0, };
- get_pdu_remap(ent, &r);
+ get_pdu_remap(ent, &r, has_cg);
trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n",
t_sector(ent), t_sec(ent),
MAJOR(r.device_from), MINOR(r.device_from),
(unsigned long long)r.sector_from);
}
-static void blk_log_plug(struct trace_seq *s, const struct trace_entry *ent)
+static void blk_log_plug(struct trace_seq *s, const struct trace_entry *ent, bool has_cg)
{
char cmd[TASK_COMM_LEN];
@@ -1235,30 +1336,31 @@ static void blk_log_plug(struct trace_seq *s, const struct trace_entry *ent)
trace_seq_printf(s, "[%s]\n", cmd);
}
-static void blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent)
+static void blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent, bool has_cg)
{
char cmd[TASK_COMM_LEN];
trace_find_cmdline(ent->pid, cmd);
- trace_seq_printf(s, "[%s] %llu\n", cmd, get_pdu_int(ent));
+ trace_seq_printf(s, "[%s] %llu\n", cmd, get_pdu_int(ent, has_cg));
}
-static void blk_log_split(struct trace_seq *s, const struct trace_entry *ent)
+static void blk_log_split(struct trace_seq *s, const struct trace_entry *ent, bool has_cg)
{
char cmd[TASK_COMM_LEN];
trace_find_cmdline(ent->pid, cmd);
trace_seq_printf(s, "%llu / %llu [%s]\n", t_sector(ent),
- get_pdu_int(ent), cmd);
+ get_pdu_int(ent, has_cg), cmd);
}
-static void blk_log_msg(struct trace_seq *s, const struct trace_entry *ent)
+static void blk_log_msg(struct trace_seq *s, const struct trace_entry *ent,
+ bool has_cg)
{
- const struct blk_io_trace *t = te_blk_io_trace(ent);
- trace_seq_putmem(s, t + 1, t->pdu_len);
+ trace_seq_putmem(s, pdu_start(ent, has_cg),
+ pdu_real_len(ent, has_cg));
trace_seq_putc(s, '\n');
}
@@ -1298,7 +1400,8 @@ static void blk_tracer_reset(struct trace_array *tr)
static const struct {
const char *act[2];
- void (*print)(struct trace_seq *s, const struct trace_entry *ent);
+ void (*print)(struct trace_seq *s, const struct trace_entry *ent,
+ bool has_cg);
} what2act[] = {
[__BLK_TA_QUEUE] = {{ "Q", "queue" }, blk_log_generic },
[__BLK_TA_BACKMERGE] = {{ "M", "backmerge" }, blk_log_generic },
@@ -1326,23 +1429,25 @@ static enum print_line_t print_one_line(struct trace_iterator *iter,
u16 what;
bool long_act;
blk_log_action_t *log_action;
+ bool has_cg;
t = te_blk_io_trace(iter->ent);
- what = t->action & ((1 << BLK_TC_SHIFT) - 1);
+ what = (t->action & ((1 << BLK_TC_SHIFT) - 1)) & ~__BLK_TA_CGROUP;
long_act = !!(tr->trace_flags & TRACE_ITER_VERBOSE);
log_action = classic ? &blk_log_action_classic : &blk_log_action;
+ has_cg = t->action & __BLK_TA_CGROUP;
- if (t->action == BLK_TN_MESSAGE) {
- log_action(iter, long_act ? "message" : "m");
- blk_log_msg(s, iter->ent);
+ if ((t->action & ~__BLK_TN_CGROUP) == BLK_TN_MESSAGE) {
+ log_action(iter, long_act ? "message" : "m", has_cg);
+ blk_log_msg(s, iter->ent, has_cg);
return trace_handle_return(s);
}
if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act)))
trace_seq_printf(s, "Unknown action %x\n", what);
else {
- log_action(iter, what2act[what].act[long_act]);
- what2act[what].print(s, iter->ent);
+ log_action(iter, what2act[what].act[long_act], has_cg);
+ what2act[what].print(s, iter->ent, has_cg);
}
return trace_handle_return(s);
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 3738519..dc498b6 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -204,10 +204,36 @@ BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1,
fmt_cnt++;
}
- return __trace_printk(1/* fake ip will not be printed */, fmt,
- mod[0] == 2 ? arg1 : mod[0] == 1 ? (long) arg1 : (u32) arg1,
- mod[1] == 2 ? arg2 : mod[1] == 1 ? (long) arg2 : (u32) arg2,
- mod[2] == 2 ? arg3 : mod[2] == 1 ? (long) arg3 : (u32) arg3);
+/* Horrid workaround for getting va_list handling working with different
+ * argument type combinations generically for 32 and 64 bit archs.
+ */
+#define __BPF_TP_EMIT() __BPF_ARG3_TP()
+#define __BPF_TP(...) \
+ __trace_printk(1 /* Fake ip will not be printed. */, \
+ fmt, ##__VA_ARGS__)
+
+#define __BPF_ARG1_TP(...) \
+ ((mod[0] == 2 || (mod[0] == 1 && __BITS_PER_LONG == 64)) \
+ ? __BPF_TP(arg1, ##__VA_ARGS__) \
+ : ((mod[0] == 1 || (mod[0] == 0 && __BITS_PER_LONG == 32)) \
+ ? __BPF_TP((long)arg1, ##__VA_ARGS__) \
+ : __BPF_TP((u32)arg1, ##__VA_ARGS__)))
+
+#define __BPF_ARG2_TP(...) \
+ ((mod[1] == 2 || (mod[1] == 1 && __BITS_PER_LONG == 64)) \
+ ? __BPF_ARG1_TP(arg2, ##__VA_ARGS__) \
+ : ((mod[1] == 1 || (mod[1] == 0 && __BITS_PER_LONG == 32)) \
+ ? __BPF_ARG1_TP((long)arg2, ##__VA_ARGS__) \
+ : __BPF_ARG1_TP((u32)arg2, ##__VA_ARGS__)))
+
+#define __BPF_ARG3_TP(...) \
+ ((mod[2] == 2 || (mod[2] == 1 && __BITS_PER_LONG == 64)) \
+ ? __BPF_ARG2_TP(arg3, ##__VA_ARGS__) \
+ : ((mod[2] == 1 || (mod[2] == 0 && __BITS_PER_LONG == 32)) \
+ ? __BPF_ARG2_TP((long)arg3, ##__VA_ARGS__) \
+ : __BPF_ARG2_TP((u32)arg3, ##__VA_ARGS__)))
+
+ return __BPF_TP_EMIT();
}
static const struct bpf_func_proto bpf_trace_printk_proto = {
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 02004ae9..96cea88 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -889,6 +889,10 @@ static int profile_graph_entry(struct ftrace_graph_ent *trace)
function_profile_call(trace->func, 0, NULL, NULL);
+ /* If function graph is shutting down, ret_stack can be NULL */
+ if (!current->ret_stack)
+ return 0;
+
if (index >= 0 && index < FTRACE_RETFUNC_DEPTH)
current->ret_stack[index].subtime = 0;
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 529cc50..81279c6 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -4386,15 +4386,19 @@ EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu);
* the page that was allocated, with the read page of the buffer.
*
* Returns:
- * The page allocated, or NULL on error.
+ * The page allocated, or ERR_PTR
*/
void *ring_buffer_alloc_read_page(struct ring_buffer *buffer, int cpu)
{
- struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
+ struct ring_buffer_per_cpu *cpu_buffer;
struct buffer_data_page *bpage = NULL;
unsigned long flags;
struct page *page;
+ if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ return ERR_PTR(-ENODEV);
+
+ cpu_buffer = buffer->buffers[cpu];
local_irq_save(flags);
arch_spin_lock(&cpu_buffer->lock);
@@ -4412,7 +4416,7 @@ void *ring_buffer_alloc_read_page(struct ring_buffer *buffer, int cpu)
page = alloc_pages_node(cpu_to_node(cpu),
GFP_KERNEL | __GFP_NORETRY, 0);
if (!page)
- return NULL;
+ return ERR_PTR(-ENOMEM);
bpage = page_address(page);
@@ -4467,8 +4471,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_free_read_page);
*
* for example:
* rpage = ring_buffer_alloc_read_page(buffer, cpu);
- * if (!rpage)
- * return error;
+ * if (IS_ERR(rpage))
+ * return PTR_ERR(rpage);
* ret = ring_buffer_read_page(buffer, &rpage, len, cpu, 0);
* if (ret >= 0)
* process_page(rpage, ret);
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index 9fbcaf5..68ee79a 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -113,7 +113,7 @@ static enum event_status read_page(int cpu)
int i;
bpage = ring_buffer_alloc_read_page(buffer, cpu);
- if (!bpage)
+ if (IS_ERR(bpage))
return EVENT_DROPPED;
ret = ring_buffer_read_page(buffer, &bpage, PAGE_SIZE, cpu, 1);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 42b9355..44004d8 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -6598,7 +6598,7 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
{
struct ftrace_buffer_info *info = filp->private_data;
struct trace_iterator *iter = &info->iter;
- ssize_t ret;
+ ssize_t ret = 0;
ssize_t size;
if (!count)
@@ -6612,10 +6612,15 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
if (!info->spare) {
info->spare = ring_buffer_alloc_read_page(iter->trace_buffer->buffer,
iter->cpu_file);
- info->spare_cpu = iter->cpu_file;
+ if (IS_ERR(info->spare)) {
+ ret = PTR_ERR(info->spare);
+ info->spare = NULL;
+ } else {
+ info->spare_cpu = iter->cpu_file;
+ }
}
if (!info->spare)
- return -ENOMEM;
+ return ret;
/* Do we have previous read data to read? */
if (info->read < PAGE_SIZE)
@@ -6790,8 +6795,9 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
ref->ref = 1;
ref->buffer = iter->trace_buffer->buffer;
ref->page = ring_buffer_alloc_read_page(ref->buffer, iter->cpu_file);
- if (!ref->page) {
- ret = -ENOMEM;
+ if (IS_ERR(ref->page)) {
+ ret = PTR_ERR(ref->page);
+ ref->page = NULL;
kfree(ref);
break;
}
@@ -8293,6 +8299,7 @@ __init static int tracer_alloc_buffers(void)
if (ret < 0)
goto out_free_cpumask;
/* Used for event triggers */
+ ret = -ENOMEM;
temp_buffer = ring_buffer_alloc(PAGE_SIZE, RB_FL_OVERWRITE);
if (!temp_buffer)
goto out_rm_hp_state;
@@ -8407,4 +8414,4 @@ __init static int clear_boot_tracer(void)
}
fs_initcall(tracer_init_tracefs);
-late_initcall(clear_boot_tracer);
+late_initcall_sync(clear_boot_tracer);
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 562fa69..13ba2d3 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -306,6 +306,7 @@ static void
perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *ops, struct pt_regs *pt_regs)
{
+ struct perf_event *event;
struct ftrace_entry *entry;
struct hlist_head *head;
struct pt_regs regs;
@@ -329,8 +330,9 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip,
entry->ip = ip;
entry->parent_ip = parent_ip;
+ event = container_of(ops, struct perf_event, ftrace_ops);
perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, TRACE_FN,
- 1, &regs, head, NULL);
+ 1, &regs, head, NULL, event);
#undef ENTRY_SIZE
}
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 59a411f..181e139 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -1959,6 +1959,10 @@ static int create_filter(struct trace_event_call *call,
if (err && set_str)
append_filter_err(ps, filter);
}
+ if (err && !set_str) {
+ free_event_filter(filter);
+ filter = NULL;
+ }
create_filter_finish(ps);
*filterp = filter;
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index c9b5aa1..8a907e1 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1200,7 +1200,7 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
memset(&entry[1], 0, dsize);
store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
- head, NULL);
+ head, NULL, NULL);
}
NOKPROBE_SYMBOL(kprobe_perf_func);
@@ -1236,7 +1236,7 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
entry->ret_ip = (unsigned long)ri->ret_addr;
store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
- head, NULL);
+ head, NULL, NULL);
}
NOKPROBE_SYMBOL(kretprobe_perf_func);
#endif /* CONFIG_PERF_EVENTS */
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 5e10395..9c4eef2 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -559,11 +559,29 @@ static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls);
static int sys_perf_refcount_enter;
static int sys_perf_refcount_exit;
+static int perf_call_bpf_enter(struct bpf_prog *prog, struct pt_regs *regs,
+ struct syscall_metadata *sys_data,
+ struct syscall_trace_enter *rec) {
+ struct syscall_tp_t {
+ unsigned long long regs;
+ unsigned long syscall_nr;
+ unsigned long args[sys_data->nb_args];
+ } param;
+ int i;
+
+ *(struct pt_regs **)&param = regs;
+ param.syscall_nr = rec->nr;
+ for (i = 0; i < sys_data->nb_args; i++)
+ param.args[i] = rec->args[i];
+ return trace_call_bpf(prog, &param);
+}
+
static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
{
struct syscall_metadata *sys_data;
struct syscall_trace_enter *rec;
struct hlist_head *head;
+ struct bpf_prog *prog;
int syscall_nr;
int rctx;
int size;
@@ -578,8 +596,9 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
if (!sys_data)
return;
+ prog = READ_ONCE(sys_data->enter_event->prog);
head = this_cpu_ptr(sys_data->enter_event->perf_events);
- if (hlist_empty(head))
+ if (!prog && hlist_empty(head))
return;
/* get the size after alignment with the u32 buffer size field */
@@ -594,9 +613,16 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
rec->nr = syscall_nr;
syscall_get_arguments(current, regs, 0, sys_data->nb_args,
(unsigned long *)&rec->args);
+
+ if ((prog && !perf_call_bpf_enter(prog, regs, sys_data, rec)) ||
+ hlist_empty(head)) {
+ perf_swevent_put_recursion_context(rctx);
+ return;
+ }
+
perf_trace_buf_submit(rec, size, rctx,
sys_data->enter_event->event.type, 1, regs,
- head, NULL);
+ head, NULL, NULL);
}
static int perf_sysenter_enable(struct trace_event_call *call)
@@ -633,11 +659,26 @@ static void perf_sysenter_disable(struct trace_event_call *call)
mutex_unlock(&syscall_trace_lock);
}
+static int perf_call_bpf_exit(struct bpf_prog *prog, struct pt_regs *regs,
+ struct syscall_trace_exit *rec) {
+ struct syscall_tp_t {
+ unsigned long long regs;
+ unsigned long syscall_nr;
+ unsigned long ret;
+ } param;
+
+ *(struct pt_regs **)&param = regs;
+ param.syscall_nr = rec->nr;
+ param.ret = rec->ret;
+ return trace_call_bpf(prog, &param);
+}
+
static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
{
struct syscall_metadata *sys_data;
struct syscall_trace_exit *rec;
struct hlist_head *head;
+ struct bpf_prog *prog;
int syscall_nr;
int rctx;
int size;
@@ -652,8 +693,9 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
if (!sys_data)
return;
+ prog = READ_ONCE(sys_data->exit_event->prog);
head = this_cpu_ptr(sys_data->exit_event->perf_events);
- if (hlist_empty(head))
+ if (!prog && hlist_empty(head))
return;
/* We can probably do that at build time */
@@ -666,8 +708,15 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
rec->nr = syscall_nr;
rec->ret = syscall_get_return_value(current, regs);
+
+ if ((prog && !perf_call_bpf_exit(prog, regs, rec)) ||
+ hlist_empty(head)) {
+ perf_swevent_put_recursion_context(rctx);
+ return;
+ }
+
perf_trace_buf_submit(rec, size, rctx, sys_data->exit_event->event.type,
- 1, regs, head, NULL);
+ 1, regs, head, NULL, NULL);
}
static int perf_sysexit_enable(struct trace_event_call *call)
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index a7581fe..4525e02 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -1156,7 +1156,7 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,
}
perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
- head, NULL);
+ head, NULL, NULL);
out:
preempt_enable();
}
diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c
index 0a689bb..305039b 100644
--- a/kernel/trace/tracing_map.c
+++ b/kernel/trace/tracing_map.c
@@ -221,16 +221,19 @@ void tracing_map_array_free(struct tracing_map_array *a)
if (!a)
return;
- if (!a->pages) {
- kfree(a);
- return;
- }
+ if (!a->pages)
+ goto free;
for (i = 0; i < a->n_pages; i++) {
if (!a->pages[i])
break;
free_page((unsigned long)a->pages[i]);
}
+
+ kfree(a->pages);
+
+ free:
+ kfree(a);
}
struct tracing_map_array *tracing_map_array_alloc(unsigned int n_elts,
diff --git a/kernel/up.c b/kernel/up.c
index ee81ac9..42c46bf 100644
--- a/kernel/up.c
+++ b/kernel/up.c
@@ -23,7 +23,7 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
}
EXPORT_SYMBOL(smp_call_function_single);
-int smp_call_function_single_async(int cpu, struct call_single_data *csd)
+int smp_call_function_single_async(int cpu, call_single_data_t *csd)
{
unsigned long flags;
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 06d3389..f5d5202 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -240,6 +240,7 @@ static void set_sample_period(void)
* hardlockup detector generates a warning
*/
sample_period = get_softlockup_thresh() * ((u64)NSEC_PER_SEC / 5);
+ watchdog_update_hrtimer_threshold(sample_period);
}
/* Commands for resetting the watchdog */
diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c
index 295a0d8..3a09ea1 100644
--- a/kernel/watchdog_hld.c
+++ b/kernel/watchdog_hld.c
@@ -37,6 +37,62 @@ void arch_touch_nmi_watchdog(void)
}
EXPORT_SYMBOL(arch_touch_nmi_watchdog);
+#ifdef CONFIG_HARDLOCKUP_CHECK_TIMESTAMP
+static DEFINE_PER_CPU(ktime_t, last_timestamp);
+static DEFINE_PER_CPU(unsigned int, nmi_rearmed);
+static ktime_t watchdog_hrtimer_sample_threshold __read_mostly;
+
+void watchdog_update_hrtimer_threshold(u64 period)
+{
+ /*
+ * The hrtimer runs with a period of (watchdog_threshold * 2) / 5
+ *
+ * So it runs effectively with 2.5 times the rate of the NMI
+ * watchdog. That means the hrtimer should fire 2-3 times before
+ * the NMI watchdog expires. The NMI watchdog on x86 is based on
+ * unhalted CPU cycles, so if Turbo-Mode is enabled the CPU cycles
+ * might run way faster than expected and the NMI fires in a
+ * smaller period than the one deduced from the nominal CPU
+ * frequency. Depending on the Turbo-Mode factor this might be fast
+ * enough to get the NMI period smaller than the hrtimer watchdog
+ * period and trigger false positives.
+ *
+ * The sample threshold is used to check in the NMI handler whether
+ * the minimum time between two NMI samples has elapsed. That
+ * prevents false positives.
+ *
+ * Set this to 4/5 of the actual watchdog threshold period so the
+ * hrtimer is guaranteed to fire at least once within the real
+ * watchdog threshold.
+ */
+ watchdog_hrtimer_sample_threshold = period * 2;
+}
+
+static bool watchdog_check_timestamp(void)
+{
+ ktime_t delta, now = ktime_get_mono_fast_ns();
+
+ delta = now - __this_cpu_read(last_timestamp);
+ if (delta < watchdog_hrtimer_sample_threshold) {
+ /*
+ * If ktime is jiffies based, a stalled timer would prevent
+ * jiffies from being incremented and the filter would look
+ * at a stale timestamp and never trigger.
+ */
+ if (__this_cpu_inc_return(nmi_rearmed) < 10)
+ return false;
+ }
+ __this_cpu_write(nmi_rearmed, 0);
+ __this_cpu_write(last_timestamp, now);
+ return true;
+}
+#else
+static inline bool watchdog_check_timestamp(void)
+{
+ return true;
+}
+#endif
+
static struct perf_event_attr wd_hw_attr = {
.type = PERF_TYPE_HARDWARE,
.config = PERF_COUNT_HW_CPU_CYCLES,
@@ -61,6 +117,9 @@ static void watchdog_overflow_callback(struct perf_event *event,
return;
}
+ if (!watchdog_check_timestamp())
+ return;
+
/* check for a hardlockup
* This is done by making sure our timer interrupt
* is incrementing. The timer interrupt should have
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index a86688f..64d0edf 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -21,7 +21,7 @@
* pools for workqueues which are not bound to any specific CPU - the
* number of these backing pools is dynamic.
*
- * Please read Documentation/workqueue.txt for details.
+ * Please read Documentation/core-api/workqueue.rst for details.
*/
#include <linux/export.h>
@@ -2091,8 +2091,30 @@ __acquires(&pool->lock)
spin_unlock_irq(&pool->lock);
- lock_map_acquire_read(&pwq->wq->lockdep_map);
+ lock_map_acquire(&pwq->wq->lockdep_map);
lock_map_acquire(&lockdep_map);
+ /*
+ * Strictly speaking we should mark the invariant state without holding
+ * any locks, that is, before these two lock_map_acquire()'s.
+ *
+ * However, that would result in:
+ *
+ * A(W1)
+ * WFC(C)
+ * A(W1)
+ * C(C)
+ *
+ * Which would create W1->C->W1 dependencies, even though there is no
+ * actual deadlock possible. There are two solutions, using a
+ * read-recursive acquire on the work(queue) 'locks', but this will then
+ * hit the lockdep limitation on recursive locks, or simply discard
+ * these locks.
+ *
+ * AFAICT there is no possible deadlock scenario between the
+ * flush_work() and complete() primitives (except for single-threaded
+ * workqueues), so hiding them isn't a problem.
+ */
+ lockdep_invariant_state(true);
trace_workqueue_execute_start(work);
worker->current_func(work);
/*
@@ -2247,7 +2269,7 @@ sleep:
* event.
*/
worker_enter_idle(worker);
- __set_current_state(TASK_INTERRUPTIBLE);
+ __set_current_state(TASK_IDLE);
spin_unlock_irq(&pool->lock);
schedule();
goto woke_up;
@@ -2289,7 +2311,7 @@ static int rescuer_thread(void *__rescuer)
*/
rescuer->task->flags |= PF_WQ_WORKER;
repeat:
- set_current_state(TASK_INTERRUPTIBLE);
+ set_current_state(TASK_IDLE);
/*
* By the time the rescuer is requested to stop, the workqueue
@@ -2474,7 +2496,16 @@ static void insert_wq_barrier(struct pool_workqueue *pwq,
*/
INIT_WORK_ONSTACK(&barr->work, wq_barrier_func);
__set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work));
- init_completion(&barr->done);
+
+ /*
+ * Explicitly init the crosslock for wq_barrier::done, make its lock
+ * key a subkey of the corresponding work. As a result we won't
+ * build a dependency between wq_barrier::done and unrelated work.
+ */
+ lockdep_init_map_crosslock((struct lockdep_map *)&barr->done.map,
+ "(complete)wq_barr::done",
+ target->lockdep_map.key, 1);
+ __init_completion(&barr->done);
barr->task = current;
/*
@@ -2815,16 +2846,18 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)
spin_unlock_irq(&pool->lock);
/*
- * If @max_active is 1 or rescuer is in use, flushing another work
- * item on the same workqueue may lead to deadlock. Make sure the
- * flusher is not running on the same workqueue by verifying write
- * access.
+ * Force a lock recursion deadlock when using flush_work() inside a
+ * single-threaded or rescuer equipped workqueue.
+ *
+ * For single threaded workqueues the deadlock happens when the work
+ * is after the work issuing the flush_work(). For rescuer equipped
+ * workqueues the deadlock happens when the rescuer stalls, blocking
+ * forward progress.
*/
- if (pwq->wq->saved_max_active == 1 || pwq->wq->rescuer)
+ if (pwq->wq->saved_max_active == 1 || pwq->wq->rescuer) {
lock_map_acquire(&pwq->wq->lockdep_map);
- else
- lock_map_acquire_read(&pwq->wq->lockdep_map);
- lock_map_release(&pwq->wq->lockdep_map);
+ lock_map_release(&pwq->wq->lockdep_map);
+ }
return true;
already_gone:
@@ -3577,6 +3610,13 @@ static bool wq_calc_node_cpumask(const struct workqueue_attrs *attrs, int node,
/* yeap, return possible CPUs in @node that @attrs wants */
cpumask_and(cpumask, attrs->cpumask, wq_numa_possible_cpumask[node]);
+
+ if (cpumask_empty(cpumask)) {
+ pr_warn_once("WARNING: workqueue cpumask: online intersect > "
+ "possible intersect\n");
+ return false;
+ }
+
return !cpumask_equal(cpumask, attrs->cpumask);
use_dfl:
@@ -3744,8 +3784,12 @@ static int apply_workqueue_attrs_locked(struct workqueue_struct *wq,
return -EINVAL;
/* creating multiple pwqs breaks ordering guarantee */
- if (WARN_ON((wq->flags & __WQ_ORDERED) && !list_empty(&wq->pwqs)))
- return -EINVAL;
+ if (!list_empty(&wq->pwqs)) {
+ if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT))
+ return -EINVAL;
+
+ wq->flags &= ~__WQ_ORDERED;
+ }
ctx = apply_wqattrs_prepare(wq, attrs);
if (!ctx)
@@ -3929,6 +3973,16 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
struct workqueue_struct *wq;
struct pool_workqueue *pwq;
+ /*
+ * Unbound && max_active == 1 used to imply ordered, which is no
+ * longer the case on NUMA machines due to per-node pools. While
+ * alloc_ordered_workqueue() is the right way to create an ordered
+ * workqueue, keep the previous behavior to avoid subtle breakages
+ * on NUMA.
+ */
+ if ((flags & WQ_UNBOUND) && max_active == 1)
+ flags |= __WQ_ORDERED;
+
/* see the comment above the definition of WQ_POWER_EFFICIENT */
if ((flags & WQ_POWER_EFFICIENT) && wq_power_efficient)
flags |= WQ_UNBOUND;
@@ -4119,13 +4173,14 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active)
struct pool_workqueue *pwq;
/* disallow meddling with max_active for ordered workqueues */
- if (WARN_ON(wq->flags & __WQ_ORDERED))
+ if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT))
return;
max_active = wq_clamp_max_active(max_active, wq->flags, wq->name);
mutex_lock(&wq->mutex);
+ wq->flags &= ~__WQ_ORDERED;
wq->saved_max_active = max_active;
for_each_pwq(pwq, wq)
@@ -5253,7 +5308,7 @@ int workqueue_sysfs_register(struct workqueue_struct *wq)
* attributes breaks ordering guarantee. Disallow exposing ordered
* workqueues.
*/
- if (WARN_ON(wq->flags & __WQ_ORDERED))
+ if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT))
return -EINVAL;
wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL);
OpenPOWER on IntegriCloud