summary refs log tree commit diff stats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r-- kernel/bpf/core.c 4
-rw-r--r-- kernel/bpf/hashtab.c 64
-rw-r--r-- kernel/bpf/inode.c 20
-rw-r--r-- kernel/bpf/syscall.c 22
-rw-r--r-- kernel/bpf/verifier.c 10
-rw-r--r-- kernel/cgroup.c 207
-rw-r--r-- kernel/cgroup_freezer.c 2
-rw-r--r-- kernel/cgroup_pids.c 6
-rw-r--r-- kernel/cpuset.c 12
-rw-r--r-- kernel/fork.c 7
-rw-r--r-- kernel/power/main.c 17
-rw-r--r-- kernel/power/power.h 9
-rw-r--r-- kernel/sched/core.c 2
-rw-r--r-- kernel/sched/cputime.c 3
-rw-r--r-- kernel/sys_ni.c 1
-rw-r--r-- kernel/sysctl.c 79
-rw-r--r-- kernel/trace/blktrace.c 12
-rw-r--r-- kernel/trace/bpf_trace.c 2
-rw-r--r-- kernel/trace/ftrace.c 451
-rw-r--r-- kernel/trace/ring_buffer.c 57
-rw-r--r-- kernel/trace/trace.h 6
-rw-r--r-- kernel/trace/trace_event_perf.c 2
-rw-r--r-- kernel/trace/trace_events.c 28
-rw-r--r-- kernel/trace/trace_events_trigger.c 25
-rw-r--r-- kernel/user_namespace.c 21
25 files changed, 601 insertions, 468 deletions
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 334b1bd..972d9a8 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -306,10 +306,6 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn)
FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)];
ARG1 = (u64) (unsigned long) ctx;
- /* Registers used in classic BPF programs need to be reset first. */
- regs[BPF_REG_A] = 0;
- regs[BPF_REG_X] = 0;
-
select_insn:
goto *jumptable[insn->code];
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 34777b3..c5b30fd 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -14,11 +14,15 @@
#include <linux/filter.h>
#include <linux/vmalloc.h>
+struct bucket {
+ struct hlist_head head;
+ raw_spinlock_t lock;
+};
+
struct bpf_htab {
struct bpf_map map;
- struct hlist_head *buckets;
- raw_spinlock_t lock;
- u32 count; /* number of elements in this hashtable */
+ struct bucket *buckets;
+ atomic_t count; /* number of elements in this hashtable */
u32 n_buckets; /* number of hash buckets */
u32 elem_size; /* size of each element in bytes */
};
@@ -79,34 +83,35 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
/* prevent zero size kmalloc and check for u32 overflow */
if (htab->n_buckets == 0 ||
- htab->n_buckets > U32_MAX / sizeof(struct hlist_head))
+ htab->n_buckets > U32_MAX / sizeof(struct bucket))
goto free_htab;
- if ((u64) htab->n_buckets * sizeof(struct hlist_head) +
+ if ((u64) htab->n_buckets * sizeof(struct bucket) +
(u64) htab->elem_size * htab->map.max_entries >=
U32_MAX - PAGE_SIZE)
/* make sure page count doesn't overflow */
goto free_htab;
- htab->map.pages = round_up(htab->n_buckets * sizeof(struct hlist_head) +
+ htab->map.pages = round_up(htab->n_buckets * sizeof(struct bucket) +
htab->elem_size * htab->map.max_entries,
PAGE_SIZE) >> PAGE_SHIFT;
err = -ENOMEM;
- htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct hlist_head),
+ htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct bucket),
GFP_USER | __GFP_NOWARN);
if (!htab->buckets) {
- htab->buckets = vmalloc(htab->n_buckets * sizeof(struct hlist_head));
+ htab->buckets = vmalloc(htab->n_buckets * sizeof(struct bucket));
if (!htab->buckets)
goto free_htab;
}
- for (i = 0; i < htab->n_buckets; i++)
- INIT_HLIST_HEAD(&htab->buckets[i]);
+ for (i = 0; i < htab->n_buckets; i++) {
+ INIT_HLIST_HEAD(&htab->buckets[i].head);
+ raw_spin_lock_init(&htab->buckets[i].lock);
+ }
- raw_spin_lock_init(&htab->lock);
- htab->count = 0;
+ atomic_set(&htab->count, 0);
return &htab->map;
@@ -120,11 +125,16 @@ static inline u32 htab_map_hash(const void *key, u32 key_len)
return jhash(key, key_len, 0);
}
-static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
+static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
{
return &htab->buckets[hash & (htab->n_buckets - 1)];
}
+static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
+{
+ return &__select_bucket(htab, hash)->head;
+}
+
static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash,
void *key, u32 key_size)
{
@@ -227,6 +237,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
struct htab_elem *l_new, *l_old;
struct hlist_head *head;
+ struct bucket *b;
unsigned long flags;
u32 key_size;
int ret;
@@ -248,15 +259,15 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
memcpy(l_new->key + round_up(key_size, 8), value, map->value_size);
l_new->hash = htab_map_hash(l_new->key, key_size);
+ b = __select_bucket(htab, l_new->hash);
+ head = &b->head;
/* bpf_map_update_elem() can be called in_irq() */
- raw_spin_lock_irqsave(&htab->lock, flags);
-
- head = select_bucket(htab, l_new->hash);
+ raw_spin_lock_irqsave(&b->lock, flags);
l_old = lookup_elem_raw(head, l_new->hash, key, key_size);
- if (!l_old && unlikely(htab->count >= map->max_entries)) {
+ if (!l_old && unlikely(atomic_read(&htab->count) >= map->max_entries)) {
/* if elem with this 'key' doesn't exist and we've reached
* max_entries limit, fail insertion of new elem
*/
@@ -284,13 +295,13 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
hlist_del_rcu(&l_old->hash_node);
kfree_rcu(l_old, rcu);
} else {
- htab->count++;
+ atomic_inc(&htab->count);
}
- raw_spin_unlock_irqrestore(&htab->lock, flags);
+ raw_spin_unlock_irqrestore(&b->lock, flags);
return 0;
err:
- raw_spin_unlock_irqrestore(&htab->lock, flags);
+ raw_spin_unlock_irqrestore(&b->lock, flags);
kfree(l_new);
return ret;
}
@@ -300,6 +311,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
struct hlist_head *head;
+ struct bucket *b;
struct htab_elem *l;
unsigned long flags;
u32 hash, key_size;
@@ -310,21 +322,21 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
key_size = map->key_size;
hash = htab_map_hash(key, key_size);
+ b = __select_bucket(htab, hash);
+ head = &b->head;
- raw_spin_lock_irqsave(&htab->lock, flags);
-
- head = select_bucket(htab, hash);
+ raw_spin_lock_irqsave(&b->lock, flags);
l = lookup_elem_raw(head, hash, key, key_size);
if (l) {
hlist_del_rcu(&l->hash_node);
- htab->count--;
+ atomic_dec(&htab->count);
kfree_rcu(l, rcu);
ret = 0;
}
- raw_spin_unlock_irqrestore(&htab->lock, flags);
+ raw_spin_unlock_irqrestore(&b->lock, flags);
return ret;
}
@@ -339,7 +351,7 @@ static void delete_all_elements(struct bpf_htab *htab)
hlist_for_each_entry_safe(l, n, head, hash_node) {
hlist_del_rcu(&l->hash_node);
- htab->count--;
+ atomic_dec(&htab->count);
kfree(l);
}
}
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 5a8a797..f2ece3c 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -187,11 +187,31 @@ static int bpf_mkobj(struct inode *dir, struct dentry *dentry, umode_t mode,
}
}
+static int bpf_link(struct dentry *old_dentry, struct inode *dir,
+ struct dentry *new_dentry)
+{
+ if (bpf_dname_reserved(new_dentry))
+ return -EPERM;
+
+ return simple_link(old_dentry, dir, new_dentry);
+}
+
+static int bpf_rename(struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry)
+{
+ if (bpf_dname_reserved(new_dentry))
+ return -EPERM;
+
+ return simple_rename(old_dir, old_dentry, new_dir, new_dentry);
+}
+
static const struct inode_operations bpf_dir_iops = {
.lookup = simple_lookup,
.mknod = bpf_mkobj,
.mkdir = bpf_mkdir,
.rmdir = simple_rmdir,
+ .rename = bpf_rename,
+ .link = bpf_link,
.unlink = simple_unlink,
};
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 3b39550..6373970 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -113,8 +113,28 @@ static int bpf_map_release(struct inode *inode, struct file *filp)
return 0;
}
+#ifdef CONFIG_PROC_FS
+static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
+{
+ const struct bpf_map *map = filp->private_data;
+
+ seq_printf(m,
+ "map_type:\t%u\n"
+ "key_size:\t%u\n"
+ "value_size:\t%u\n"
+ "max_entries:\t%u\n",
+ map->map_type,
+ map->key_size,
+ map->value_size,
+ map->max_entries);
+}
+#endif
+
static const struct file_operations bpf_map_fops = {
- .release = bpf_map_release,
+#ifdef CONFIG_PROC_FS
+ .show_fdinfo = bpf_map_show_fdinfo,
+#endif
+ .release = bpf_map_release,
};
int bpf_map_new_fd(struct bpf_map *map)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index a7945d1..d1d3e8f 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1121,6 +1121,16 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn)
return -EINVAL;
}
+ if ((opcode == BPF_LSH || opcode == BPF_RSH ||
+ opcode == BPF_ARSH) && BPF_SRC(insn->code) == BPF_K) {
+ int size = BPF_CLASS(insn->code) == BPF_ALU64 ? 64 : 32;
+
+ if (insn->imm < 0 || insn->imm >= size) {
+ verbose("invalid shift %d\n", insn->imm);
+ return -EINVAL;
+ }
+ }
+
/* pattern match 'bpf_add Rx, imm' instruction */
if (opcode == BPF_ADD && BPF_CLASS(insn->code) == BPF_ALU64 &&
regs[insn->dst_reg].type == FRAME_PTR &&
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 470f653..c03a640 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -57,8 +57,8 @@
#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
#include <linux/kthread.h>
#include <linux/delay.h>
-
#include <linux/atomic.h>
+#include <net/sock.h>
/*
* pidlists linger the following amount before being destroyed. The goal
@@ -211,6 +211,7 @@ static unsigned long have_free_callback __read_mostly;
/* Ditto for the can_fork callback. */
static unsigned long have_canfork_callback __read_mostly;
+static struct file_system_type cgroup2_fs_type;
static struct cftype cgroup_dfl_base_files[];
static struct cftype cgroup_legacy_base_files[];
@@ -440,11 +441,6 @@ static bool cgroup_tryget(struct cgroup *cgrp)
return css_tryget(&cgrp->self);
}
-static void cgroup_put(struct cgroup *cgrp)
-{
- css_put(&cgrp->self);
-}
-
struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
{
struct cgroup *cgrp = of->kn->parent->priv;
@@ -465,25 +461,6 @@ struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
}
EXPORT_SYMBOL_GPL(of_css);
-/**
- * cgroup_is_descendant - test ancestry
- * @cgrp: the cgroup to be tested
- * @ancestor: possible ancestor of @cgrp
- *
- * Test whether @cgrp is a descendant of @ancestor. It also returns %true
- * if @cgrp == @ancestor. This function is safe to call as long as @cgrp
- * and @ancestor are accessible.
- */
-bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor)
-{
- while (cgrp) {
- if (cgrp == ancestor)
- return true;
- cgrp = cgroup_parent(cgrp);
- }
- return false;
-}
-
static int notify_on_release(const struct cgroup *cgrp)
{
return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
@@ -1647,10 +1624,6 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
all_ss = true;
continue;
}
- if (!strcmp(token, "__DEVEL__sane_behavior")) {
- opts->flags |= CGRP_ROOT_SANE_BEHAVIOR;
- continue;
- }
if (!strcmp(token, "noprefix")) {
opts->flags |= CGRP_ROOT_NOPREFIX;
continue;
@@ -1717,15 +1690,6 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
return -ENOENT;
}
- if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) {
- pr_warn("sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n");
- if (nr_opts != 1) {
- pr_err("sane_behavior: no other mount options allowed\n");
- return -EINVAL;
- }
- return 0;
- }
-
/*
* If the 'all' option was specified select all the subsystems,
* otherwise if 'none', 'name=' and a subsystem name options were
@@ -1924,6 +1888,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
if (ret < 0)
goto out;
root_cgrp->id = ret;
+ root_cgrp->ancestor_ids[0] = ret;
ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, 0,
GFP_KERNEL);
@@ -2004,6 +1969,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
int flags, const char *unused_dev_name,
void *data)
{
+ bool is_v2 = fs_type == &cgroup2_fs_type;
struct super_block *pinned_sb = NULL;
struct cgroup_subsys *ss;
struct cgroup_root *root;
@@ -2020,6 +1986,17 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
if (!use_task_css_set_links)
cgroup_enable_task_cg_lists();
+ if (is_v2) {
+ if (data) {
+ pr_err("cgroup2: unknown option \"%s\"\n", (char *)data);
+ return ERR_PTR(-EINVAL);
+ }
+ cgrp_dfl_root_visible = true;
+ root = &cgrp_dfl_root;
+ cgroup_get(&root->cgrp);
+ goto out_mount;
+ }
+
mutex_lock(&cgroup_mutex);
/* First find the desired set of subsystems */
@@ -2027,15 +2004,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
if (ret)
goto out_unlock;
- /* look for a matching existing root */
- if (opts.flags & CGRP_ROOT_SANE_BEHAVIOR) {
- cgrp_dfl_root_visible = true;
- root = &cgrp_dfl_root;
- cgroup_get(&root->cgrp);
- ret = 0;
- goto out_unlock;
- }
-
/*
* Destruction of cgroup root is asynchronous, so subsystems may
* still be dying after the previous unmount. Let's drain the
@@ -2146,9 +2114,10 @@ out_free:
if (ret)
return ERR_PTR(ret);
-
+out_mount:
dentry = kernfs_mount(fs_type, flags, root->kf_root,
- CGROUP_SUPER_MAGIC, &new_sb);
+ is_v2 ? CGROUP2_SUPER_MAGIC : CGROUP_SUPER_MAGIC,
+ &new_sb);
if (IS_ERR(dentry) || !new_sb)
cgroup_put(&root->cgrp);
@@ -2191,6 +2160,12 @@ static struct file_system_type cgroup_fs_type = {
.kill_sb = cgroup_kill_sb,
};
+static struct file_system_type cgroup2_fs_type = {
+ .name = "cgroup2",
+ .mount = cgroup_mount,
+ .kill_sb = cgroup_kill_sb,
+};
+
/**
* task_cgroup_path - cgroup path of a task in the first cgroup hierarchy
* @task: target task
@@ -4062,7 +4037,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
goto out_err;
/*
- * Migrate tasks one-by-one until @form is empty. This fails iff
+ * Migrate tasks one-by-one until @from is empty. This fails iff
* ->can_attach() fails.
*/
do {
@@ -4903,11 +4878,11 @@ err_free_css:
static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
umode_t mode)
{
- struct cgroup *parent, *cgrp;
+ struct cgroup *parent, *cgrp, *tcgrp;
struct cgroup_root *root;
struct cgroup_subsys *ss;
struct kernfs_node *kn;
- int ssid, ret;
+ int level, ssid, ret;
/* Do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable.
*/
@@ -4918,9 +4893,11 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
if (!parent)
return -ENODEV;
root = parent->root;
+ level = parent->level + 1;
/* allocate the cgroup and its ID, 0 is reserved for the root */
- cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
+ cgrp = kzalloc(sizeof(*cgrp) +
+ sizeof(cgrp->ancestor_ids[0]) * (level + 1), GFP_KERNEL);
if (!cgrp) {
ret = -ENOMEM;
goto out_unlock;
@@ -4944,6 +4921,10 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
cgrp->self.parent = &parent->self;
cgrp->root = root;
+ cgrp->level = level;
+
+ for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp))
+ cgrp->ancestor_ids[tcgrp->level] = tcgrp->id;
if (notify_on_release(parent))
set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
@@ -5188,7 +5169,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
{
struct cgroup_subsys_state *css;
- printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
+ pr_debug("Initializing cgroup subsys %s\n", ss->name);
mutex_lock(&cgroup_mutex);
@@ -5346,6 +5327,7 @@ int __init cgroup_init(void)
WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup"));
WARN_ON(register_filesystem(&cgroup_fs_type));
+ WARN_ON(register_filesystem(&cgroup2_fs_type));
WARN_ON(!proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations));
return 0;
@@ -5489,19 +5471,6 @@ static const struct file_operations proc_cgroupstats_operations = {
.release = single_release,
};
-static void **subsys_canfork_priv_p(void *ss_priv[CGROUP_CANFORK_COUNT], int i)
-{
- if (CGROUP_CANFORK_START <= i && i < CGROUP_CANFORK_END)
- return &ss_priv[i - CGROUP_CANFORK_START];
- return NULL;
-}
-
-static void *subsys_canfork_priv(void *ss_priv[CGROUP_CANFORK_COUNT], int i)
-{
- void **private = subsys_canfork_priv_p(ss_priv, i);
- return private ? *private : NULL;
-}
-
/**
* cgroup_fork - initialize cgroup related fields during copy_process()
* @child: pointer to task_struct of forking parent process.
@@ -5524,14 +5493,13 @@ void cgroup_fork(struct task_struct *child)
* returns an error, the fork aborts with that error code. This allows for
* a cgroup subsystem to conditionally allow or deny new forks.
*/
-int cgroup_can_fork(struct task_struct *child,
- void *ss_priv[CGROUP_CANFORK_COUNT])
+int cgroup_can_fork(struct task_struct *child)
{
struct cgroup_subsys *ss;
int i, j, ret;
for_each_subsys_which(ss, i, &have_canfork_callback) {
- ret = ss->can_fork(child, subsys_canfork_priv_p(ss_priv, i));
+ ret = ss->can_fork(child);
if (ret)
goto out_revert;
}
@@ -5543,7 +5511,7 @@ out_revert:
if (j >= i)
break;
if (ss->cancel_fork)
- ss->cancel_fork(child, subsys_canfork_priv(ss_priv, j));
+ ss->cancel_fork(child);
}
return ret;
@@ -5556,15 +5524,14 @@ out_revert:
* This calls the cancel_fork() callbacks if a fork failed *after*
* cgroup_can_fork() succeded.
*/
-void cgroup_cancel_fork(struct task_struct *child,
- void *ss_priv[CGROUP_CANFORK_COUNT])
+void cgroup_cancel_fork(struct task_struct *child)
{
struct cgroup_subsys *ss;
int i;
for_each_subsys(ss, i)
if (ss->cancel_fork)
- ss->cancel_fork(child, subsys_canfork_priv(ss_priv, i));
+ ss->cancel_fork(child);
}
/**
@@ -5577,8 +5544,7 @@ void cgroup_cancel_fork(struct task_struct *child,
* cgroup_task_iter_start() - to guarantee that the new task ends up on its
* list.
*/
-void cgroup_post_fork(struct task_struct *child,
- void *old_ss_priv[CGROUP_CANFORK_COUNT])
+void cgroup_post_fork(struct task_struct *child)
{
struct cgroup_subsys *ss;
int i;
@@ -5622,7 +5588,7 @@ void cgroup_post_fork(struct task_struct *child,
* and addition to css_set.
*/
for_each_subsys_which(ss, i, &have_fork_callback)
- ss->fork(child, subsys_canfork_priv(old_ss_priv, i));
+ ss->fork(child);
}
/**
@@ -5822,6 +5788,93 @@ struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
return id > 0 ? idr_find(&ss->css_idr, id) : NULL;
}
+/**
+ * cgroup_get_from_path - lookup and get a cgroup from its default hierarchy path
+ * @path: path on the default hierarchy
+ *
+ * Find the cgroup at @path on the default hierarchy, increment its
+ * reference count and return it. Returns pointer to the found cgroup on
+ * success, ERR_PTR(-ENOENT) if @path doesn't exist and ERR_PTR(-ENOTDIR)
+ * if @path points to a non-directory.
+ */
+struct cgroup *cgroup_get_from_path(const char *path)
+{
+ struct kernfs_node *kn;
+ struct cgroup *cgrp;
+
+ mutex_lock(&cgroup_mutex);
+
+ kn = kernfs_walk_and_get(cgrp_dfl_root.cgrp.kn, path);
+ if (kn) {
+ if (kernfs_type(kn) == KERNFS_DIR) {
+ cgrp = kn->priv;
+ cgroup_get(cgrp);
+ } else {
+ cgrp = ERR_PTR(-ENOTDIR);
+ }
+ kernfs_put(kn);
+ } else {
+ cgrp = ERR_PTR(-ENOENT);
+ }
+
+ mutex_unlock(&cgroup_mutex);
+ return cgrp;
+}
+EXPORT_SYMBOL_GPL(cgroup_get_from_path);
+
+/*
+ * sock->sk_cgrp_data handling. For more info, see sock_cgroup_data
+ * definition in cgroup-defs.h.
+ */
+#ifdef CONFIG_SOCK_CGROUP_DATA
+
+#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID)
+
+DEFINE_SPINLOCK(cgroup_sk_update_lock);
+static bool cgroup_sk_alloc_disabled __read_mostly;
+
+void cgroup_sk_alloc_disable(void)
+{
+ if (cgroup_sk_alloc_disabled)
+ return;
+ pr_info("cgroup: disabling cgroup2 socket matching due to net_prio or net_cls activation\n");
+ cgroup_sk_alloc_disabled = true;
+}
+
+#else
+
+#define cgroup_sk_alloc_disabled false
+
+#endif
+
+void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
+{
+ if (cgroup_sk_alloc_disabled)
+ return;
+
+ rcu_read_lock();
+
+ while (true) {
+ struct css_set *cset;
+
+ cset = task_css_set(current);
+ if (likely(cgroup_tryget(cset->dfl_cgrp))) {
+ skcd->val = (unsigned long)cset->dfl_cgrp;
+ break;
+ }
+ cpu_relax();
+ }
+
+ rcu_read_unlock();
+}
+
+void cgroup_sk_free(struct sock_cgroup_data *skcd)
+{
+ cgroup_put(sock_cgroup_ptr(skcd));
+}
+
+#endif /* CONFIG_SOCK_CGROUP_DATA */
+
#ifdef CONFIG_CGROUP_DEBUG
static struct cgroup_subsys_state *
debug_css_alloc(struct cgroup_subsys_state *parent_css)
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index 2d3df82..1b72d56 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -200,7 +200,7 @@ static void freezer_attach(struct cgroup_taskset *tset)
* to do anything as freezer_attach() will put @task into the appropriate
* state.
*/
-static void freezer_fork(struct task_struct *task, void *private)
+static void freezer_fork(struct task_struct *task)
{
struct freezer *freezer;
diff --git a/kernel/cgroup_pids.c b/kernel/cgroup_pids.c
index b50d5a1..303097b 100644
--- a/kernel/cgroup_pids.c
+++ b/kernel/cgroup_pids.c
@@ -134,7 +134,7 @@ static void pids_charge(struct pids_cgroup *pids, int num)
*
* This function follows the set limit. It will fail if the charge would cause
* the new value to exceed the hierarchical limit. Returns 0 if the charge
- * succeded, otherwise -EAGAIN.
+ * succeeded, otherwise -EAGAIN.
*/
static int pids_try_charge(struct pids_cgroup *pids, int num)
{
@@ -209,7 +209,7 @@ static void pids_cancel_attach(struct cgroup_taskset *tset)
* task_css_check(true) in pids_can_fork() and pids_cancel_fork() relies
* on threadgroup_change_begin() held by the copy_process().
*/
-static int pids_can_fork(struct task_struct *task, void **priv_p)
+static int pids_can_fork(struct task_struct *task)
{
struct cgroup_subsys_state *css;
struct pids_cgroup *pids;
@@ -219,7 +219,7 @@ static int pids_can_fork(struct task_struct *task, void **priv_p)
return pids_try_charge(pids, 1);
}
-static void pids_cancel_fork(struct task_struct *task, void *priv)
+static void pids_cancel_fork(struct task_struct *task)
{
struct cgroup_subsys_state *css;
struct pids_cgroup *pids;
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 02a8ea5..3e945fc 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -51,6 +51,7 @@
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/time.h>
+#include <linux/time64.h>
#include <linux/backing-dev.h>
#include <linux/sort.h>
@@ -68,7 +69,7 @@ struct static_key cpusets_enabled_key __read_mostly = STATIC_KEY_INIT_FALSE;
struct fmeter {
int cnt; /* unprocessed events count */
int val; /* most recent output value */
- time_t time; /* clock (secs) when val computed */
+ time64_t time; /* clock (secs) when val computed */
spinlock_t lock; /* guards read or write of above */
};
@@ -1374,7 +1375,7 @@ out:
*/
#define FM_COEF 933 /* coefficient for half-life of 10 secs */
-#define FM_MAXTICKS ((time_t)99) /* useless computing more ticks than this */
+#define FM_MAXTICKS ((u32)99) /* useless computing more ticks than this */
#define FM_MAXCNT 1000000 /* limit cnt to avoid overflow */
#define FM_SCALE 1000 /* faux fixed point scale */
@@ -1390,8 +1391,11 @@ static void fmeter_init(struct fmeter *fmp)
/* Internal meter update - process cnt events and update value */
static void fmeter_update(struct fmeter *fmp)
{
- time_t now = get_seconds();
- time_t ticks = now - fmp->time;
+ time64_t now;
+ u32 ticks;
+
+ now = ktime_get_seconds();
+ ticks = now - fmp->time;
if (ticks == 0)
return;
diff --git a/kernel/fork.c b/kernel/fork.c
index 291b08c..6774e6b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1250,7 +1250,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
{
int retval;
struct task_struct *p;
- void *cgrp_ss_priv[CGROUP_CANFORK_COUNT] = {};
if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
return ERR_PTR(-EINVAL);
@@ -1527,7 +1526,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
* between here and cgroup_post_fork() if an organisation operation is in
* progress.
*/
- retval = cgroup_can_fork(p, cgrp_ss_priv);
+ retval = cgroup_can_fork(p);
if (retval)
goto bad_fork_free_pid;
@@ -1609,7 +1608,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
write_unlock_irq(&tasklist_lock);
proc_fork_connector(p);
- cgroup_post_fork(p, cgrp_ss_priv);
+ cgroup_post_fork(p);
threadgroup_change_end(current);
perf_event_fork(p);
@@ -1619,7 +1618,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
return p;
bad_fork_cancel_cgroup:
- cgroup_cancel_fork(p, cgrp_ss_priv);
+ cgroup_cancel_fork(p);
bad_fork_free_pid:
if (pid != &init_struct_pid)
free_pid(pid);
diff --git a/kernel/power/main.c b/kernel/power/main.c
index b2dd4d9..2794697 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -280,13 +280,7 @@ static ssize_t pm_wakeup_irq_show(struct kobject *kobj,
return pm_wakeup_irq ? sprintf(buf, "%u\n", pm_wakeup_irq) : -ENODATA;
}
-static ssize_t pm_wakeup_irq_store(struct kobject *kobj,
- struct kobj_attribute *attr,
- const char *buf, size_t n)
-{
- return -EINVAL;
-}
-power_attr(pm_wakeup_irq);
+power_attr_ro(pm_wakeup_irq);
#else /* !CONFIG_PM_SLEEP_DEBUG */
static inline void pm_print_times_init(void) {}
@@ -564,14 +558,7 @@ static ssize_t pm_trace_dev_match_show(struct kobject *kobj,
return show_trace_dev_match(buf, PAGE_SIZE);
}
-static ssize_t
-pm_trace_dev_match_store(struct kobject *kobj, struct kobj_attribute *attr,
- const char *buf, size_t n)
-{
- return -EINVAL;
-}
-
-power_attr(pm_trace_dev_match);
+power_attr_ro(pm_trace_dev_match);
#endif /* CONFIG_PM_TRACE */
diff --git a/kernel/power/power.h b/kernel/power/power.h
index caadb56..efe1b3b 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -77,6 +77,15 @@ static struct kobj_attribute _name##_attr = { \
.store = _name##_store, \
}
+#define power_attr_ro(_name) \
+static struct kobj_attribute _name##_attr = { \
+ .attr = { \
+ .name = __stringify(_name), \
+ .mode = S_IRUGO, \
+ }, \
+ .show = _name##_show, \
+}
+
/* Preferred image size in bytes (default 500 MB) */
extern unsigned long image_size;
/* Size of memory reserved for drivers (default SPARE_PAGES x PAGE_SIZE) */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 77d97a6..44253ad 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -8342,7 +8342,7 @@ static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
sched_offline_group(tg);
}
-static void cpu_cgroup_fork(struct task_struct *task, void *private)
+static void cpu_cgroup_fork(struct task_struct *task)
{
sched_move_task(task);
}
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index d5ff5c6..b2ab2ff 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -5,6 +5,9 @@
#include <linux/static_key.h>
#include <linux/context_tracking.h>
#include "sched.h"
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#endif
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 0623787..2c5e3a8 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -174,6 +174,7 @@ cond_syscall(sys_setfsuid);
cond_syscall(sys_setfsgid);
cond_syscall(sys_capget);
cond_syscall(sys_capset);
+cond_syscall(sys_copy_file_range);
/* arch-specific weak syscall entries */
cond_syscall(sys_pciconfig_read);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index dc6858d..5faf89a 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2047,9 +2047,8 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
void *data)
{
int *i, vleft, first = 1, err = 0;
- unsigned long page = 0;
size_t left;
- char *kbuf;
+ char *kbuf = NULL, *p;
if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) {
*lenp = 0;
@@ -2078,15 +2077,9 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
if (left > PAGE_SIZE - 1)
left = PAGE_SIZE - 1;
- page = __get_free_page(GFP_TEMPORARY);
- kbuf = (char *) page;
- if (!kbuf)
- return -ENOMEM;
- if (copy_from_user(kbuf, buffer, left)) {
- err = -EFAULT;
- goto free;
- }
- kbuf[left] = 0;
+ p = kbuf = memdup_user_nul(buffer, left);
+ if (IS_ERR(kbuf))
+ return PTR_ERR(kbuf);
}
for (; left && vleft--; i++, first=0) {
@@ -2094,11 +2087,11 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
bool neg;
if (write) {
- left -= proc_skip_spaces(&kbuf);
+ left -= proc_skip_spaces(&p);
if (!left)
break;
- err = proc_get_long(&kbuf, &left, &lval, &neg,
+ err = proc_get_long(&p, &left, &lval, &neg,
proc_wspace_sep,
sizeof(proc_wspace_sep), NULL);
if (err)
@@ -2125,10 +2118,9 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
if (!write && !first && left && !err)
err = proc_put_char(&buffer, &left, '\n');
if (write && !err && left)
- left -= proc_skip_spaces(&kbuf);
-free:
+ left -= proc_skip_spaces(&p);
if (write) {
- free_page(page);
+ kfree(kbuf);
if (first)
return err ? : -EINVAL;
}
@@ -2310,9 +2302,8 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int
{
unsigned long *i, *min, *max;
int vleft, first = 1, err = 0;
- unsigned long page = 0;
size_t left;
- char *kbuf;
+ char *kbuf = NULL, *p;
if (!data || !table->maxlen || !*lenp || (*ppos && !write)) {
*lenp = 0;
@@ -2340,15 +2331,9 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int
if (left > PAGE_SIZE - 1)
left = PAGE_SIZE - 1;
- page = __get_free_page(GFP_TEMPORARY);
- kbuf = (char *) page;
- if (!kbuf)
- return -ENOMEM;
- if (copy_from_user(kbuf, buffer, left)) {
- err = -EFAULT;
- goto free;
- }
- kbuf[left] = 0;
+ p = kbuf = memdup_user_nul(buffer, left);
+ if (IS_ERR(kbuf))
+ return PTR_ERR(kbuf);
}
for (; left && vleft--; i++, first = 0) {
@@ -2357,9 +2342,9 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int
if (write) {
bool neg;
- left -= proc_skip_spaces(&kbuf);
+ left -= proc_skip_spaces(&p);
- err = proc_get_long(&kbuf, &left, &val, &neg,
+ err = proc_get_long(&p, &left, &val, &neg,
proc_wspace_sep,
sizeof(proc_wspace_sep), NULL);
if (err)
@@ -2385,10 +2370,9 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int
if (!write && !first && left && !err)
err = proc_put_char(&buffer, &left, '\n');
if (write && !err)
- left -= proc_skip_spaces(&kbuf);
-free:
+ left -= proc_skip_spaces(&p);
if (write) {
- free_page(page);
+ kfree(kbuf);
if (first)
return err ? : -EINVAL;
}
@@ -2650,34 +2634,27 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
}
if (write) {
- unsigned long page = 0;
- char *kbuf;
+ char *kbuf, *p;
if (left > PAGE_SIZE - 1)
left = PAGE_SIZE - 1;
- page = __get_free_page(GFP_TEMPORARY);
- kbuf = (char *) page;
- if (!kbuf)
- return -ENOMEM;
- if (copy_from_user(kbuf, buffer, left)) {
- free_page(page);
- return -EFAULT;
- }
- kbuf[left] = 0;
+ p = kbuf = memdup_user_nul(buffer, left);
+ if (IS_ERR(kbuf))
+ return PTR_ERR(kbuf);
tmp_bitmap = kzalloc(BITS_TO_LONGS(bitmap_len) * sizeof(unsigned long),
GFP_KERNEL);
if (!tmp_bitmap) {
- free_page(page);
+ kfree(kbuf);
return -ENOMEM;
}
- proc_skip_char(&kbuf, &left, '\n');
+ proc_skip_char(&p, &left, '\n');
while (!err && left) {
unsigned long val_a, val_b;
bool neg;
- err = proc_get_long(&kbuf, &left, &val_a, &neg, tr_a,
+ err = proc_get_long(&p, &left, &val_a, &neg, tr_a,
sizeof(tr_a), &c);
if (err)
break;
@@ -2688,12 +2665,12 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
val_b = val_a;
if (left) {
- kbuf++;
+ p++;
left--;
}
if (c == '-') {
- err = proc_get_long(&kbuf, &left, &val_b,
+ err = proc_get_long(&p, &left, &val_b,
&neg, tr_b, sizeof(tr_b),
&c);
if (err)
@@ -2704,16 +2681,16 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
break;
}
if (left) {
- kbuf++;
+ p++;
left--;
}
}
bitmap_set(tmp_bitmap, val_a, val_b - val_a + 1);
first = 0;
- proc_skip_char(&kbuf, &left, '\n');
+ proc_skip_char(&p, &left, '\n');
}
- free_page(page);
+ kfree(kbuf);
} else {
unsigned long bit_a, bit_b = 0;
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index a990824..2aeb6ff 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -349,16 +349,10 @@ static ssize_t blk_msg_write(struct file *filp, const char __user *buffer,
if (count >= BLK_TN_MAX_MSG)
return -EINVAL;
- msg = kmalloc(count + 1, GFP_KERNEL);
- if (msg == NULL)
- return -ENOMEM;
-
- if (copy_from_user(msg, buffer, count)) {
- kfree(msg);
- return -EFAULT;
- }
+ msg = memdup_user_nul(buffer, count);
+ if (IS_ERR(msg))
+ return PTR_ERR(msg);
- msg[count] = '\0';
bt = filp->private_data;
__trace_note_message(bt, "%s", msg);
kfree(msg);
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 4228fd3..45dd798 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -316,7 +316,7 @@ static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type
return true;
}
-static struct bpf_verifier_ops kprobe_prog_ops = {
+static const struct bpf_verifier_ops kprobe_prog_ops = {
.get_func_proto = kprobe_prog_func_proto,
.is_valid_access = kprobe_prog_is_valid_access,
};
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 3f743b1..eca592f 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -62,8 +62,6 @@
#define FTRACE_HASH_DEFAULT_BITS 10
#define FTRACE_HASH_MAX_BITS 12
-#define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_CONTROL)
-
#ifdef CONFIG_DYNAMIC_FTRACE
#define INIT_OPS_HASH(opsname) \
.func_hash = &opsname.local_hash, \
@@ -113,14 +111,9 @@ static int ftrace_disabled __read_mostly;
static DEFINE_MUTEX(ftrace_lock);
-static struct ftrace_ops *ftrace_control_list __read_mostly = &ftrace_list_end;
static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end;
ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
static struct ftrace_ops global_ops;
-static struct ftrace_ops control_ops;
-
-static void ftrace_ops_recurs_func(unsigned long ip, unsigned long parent_ip,
- struct ftrace_ops *op, struct pt_regs *regs);
#if ARCH_SUPPORTS_FTRACE_OPS
static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
@@ -203,7 +196,7 @@ void clear_ftrace_function(void)
ftrace_trace_function = ftrace_stub;
}
-static void control_ops_disable_all(struct ftrace_ops *ops)
+static void per_cpu_ops_disable_all(struct ftrace_ops *ops)
{
int cpu;
@@ -211,16 +204,19 @@ static void control_ops_disable_all(struct ftrace_ops *ops)
*per_cpu_ptr(ops->disabled, cpu) = 1;
}
-static int control_ops_alloc(struct ftrace_ops *ops)
+static int per_cpu_ops_alloc(struct ftrace_ops *ops)
{
int __percpu *disabled;
+ if (WARN_ON_ONCE(!(ops->flags & FTRACE_OPS_FL_PER_CPU)))
+ return -EINVAL;
+
disabled = alloc_percpu(int);
if (!disabled)
return -ENOMEM;
ops->disabled = disabled;
- control_ops_disable_all(ops);
+ per_cpu_ops_disable_all(ops);
return 0;
}
@@ -256,10 +252,11 @@ static inline void update_function_graph_func(void) { }
static ftrace_func_t ftrace_ops_get_list_func(struct ftrace_ops *ops)
{
/*
- * If this is a dynamic ops or we force list func,
+ * If this is a dynamic, RCU, or per CPU ops, or we force list func,
* then it needs to call the list anyway.
*/
- if (ops->flags & FTRACE_OPS_FL_DYNAMIC || FTRACE_FORCE_LIST_FUNC)
+ if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_PER_CPU |
+ FTRACE_OPS_FL_RCU) || FTRACE_FORCE_LIST_FUNC)
return ftrace_ops_list_func;
return ftrace_ops_get_func(ops);
@@ -383,26 +380,6 @@ static int remove_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops)
return 0;
}
-static void add_ftrace_list_ops(struct ftrace_ops **list,
- struct ftrace_ops *main_ops,
- struct ftrace_ops *ops)
-{
- int first = *list == &ftrace_list_end;
- add_ftrace_ops(list, ops);
- if (first)
- add_ftrace_ops(&ftrace_ops_list, main_ops);
-}
-
-static int remove_ftrace_list_ops(struct ftrace_ops **list,
- struct ftrace_ops *main_ops,
- struct ftrace_ops *ops)
-{
- int ret = remove_ftrace_ops(list, ops);
- if (!ret && *list == &ftrace_list_end)
- ret = remove_ftrace_ops(&ftrace_ops_list, main_ops);
- return ret;
-}
-
static void ftrace_update_trampoline(struct ftrace_ops *ops);
static int __register_ftrace_function(struct ftrace_ops *ops)
@@ -430,14 +407,12 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
if (!core_kernel_data((unsigned long)ops))
ops->flags |= FTRACE_OPS_FL_DYNAMIC;
- if (ops->flags & FTRACE_OPS_FL_CONTROL) {
- if (control_ops_alloc(ops))
+ if (ops->flags & FTRACE_OPS_FL_PER_CPU) {
+ if (per_cpu_ops_alloc(ops))
return -ENOMEM;
- add_ftrace_list_ops(&ftrace_control_list, &control_ops, ops);
- /* The control_ops needs the trampoline update */
- ops = &control_ops;
- } else
- add_ftrace_ops(&ftrace_ops_list, ops);
+ }
+
+ add_ftrace_ops(&ftrace_ops_list, ops);
/* Always save the function, and reset at unregistering */
ops->saved_func = ops->func;
@@ -460,11 +435,7 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
if (WARN_ON(!(ops->flags & FTRACE_OPS_FL_ENABLED)))
return -EBUSY;
- if (ops->flags & FTRACE_OPS_FL_CONTROL) {
- ret = remove_ftrace_list_ops(&ftrace_control_list,
- &control_ops, ops);
- } else
- ret = remove_ftrace_ops(&ftrace_ops_list, ops);
+ ret = remove_ftrace_ops(&ftrace_ops_list, ops);
if (ret < 0)
return ret;
@@ -1687,6 +1658,9 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
int in_hash = 0;
int match = 0;
+ if (rec->flags & FTRACE_FL_DISABLED)
+ continue;
+
if (all) {
/*
* Only the filter_hash affects all records.
@@ -1940,7 +1914,7 @@ static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops,
return __ftrace_hash_update_ipmodify(ops, old_hash, new_hash);
}
-static void print_ip_ins(const char *fmt, unsigned char *p)
+static void print_ip_ins(const char *fmt, const unsigned char *p)
{
int i;
@@ -1952,6 +1926,31 @@ static void print_ip_ins(const char *fmt, unsigned char *p)
static struct ftrace_ops *
ftrace_find_tramp_ops_any(struct dyn_ftrace *rec);
+static struct ftrace_ops *
+ftrace_find_tramp_ops_next(struct dyn_ftrace *rec, struct ftrace_ops *ops);
+
+enum ftrace_bug_type ftrace_bug_type;
+const void *ftrace_expected;
+
+static void print_bug_type(void)
+{
+ switch (ftrace_bug_type) {
+ case FTRACE_BUG_UNKNOWN:
+ break;
+ case FTRACE_BUG_INIT:
+ pr_info("Initializing ftrace call sites\n");
+ break;
+ case FTRACE_BUG_NOP:
+ pr_info("Setting ftrace call site to NOP\n");
+ break;
+ case FTRACE_BUG_CALL:
+ pr_info("Setting ftrace call site to call ftrace function\n");
+ break;
+ case FTRACE_BUG_UPDATE:
+ pr_info("Updating ftrace call site to call a different ftrace function\n");
+ break;
+ }
+}
/**
* ftrace_bug - report and shutdown function tracer
@@ -1979,8 +1978,12 @@ void ftrace_bug(int failed, struct dyn_ftrace *rec)
FTRACE_WARN_ON_ONCE(1);
pr_info("ftrace failed to modify ");
print_ip_sym(ip);
- print_ip_ins(" actual: ", (unsigned char *)ip);
+ print_ip_ins(" actual: ", (unsigned char *)ip);
pr_cont("\n");
+ if (ftrace_expected) {
+ print_ip_ins(" expected: ", ftrace_expected);
+ pr_cont("\n");
+ }
break;
case -EPERM:
FTRACE_WARN_ON_ONCE(1);
@@ -1992,6 +1995,7 @@ void ftrace_bug(int failed, struct dyn_ftrace *rec)
pr_info("ftrace faulted on unknown error ");
print_ip_sym(ip);
}
+ print_bug_type();
if (rec) {
struct ftrace_ops *ops = NULL;
@@ -2000,15 +2004,19 @@ void ftrace_bug(int failed, struct dyn_ftrace *rec)
rec->flags & FTRACE_FL_REGS ? " R" : " ");
if (rec->flags & FTRACE_FL_TRAMP_EN) {
ops = ftrace_find_tramp_ops_any(rec);
- if (ops)
- pr_cont("\ttramp: %pS",
- (void *)ops->trampoline);
- else
+ if (ops) {
+ do {
+ pr_cont("\ttramp: %pS (%pS)",
+ (void *)ops->trampoline,
+ (void *)ops->func);
+ ops = ftrace_find_tramp_ops_next(rec, ops);
+ } while (ops);
+ } else
pr_cont("\ttramp: ERROR!");
}
ip = ftrace_get_addr_curr(rec);
- pr_cont(" expected tramp: %lx\n", ip);
+ pr_cont("\n expected tramp: %lx\n", ip);
}
}
@@ -2016,6 +2024,11 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)
{
unsigned long flag = 0UL;
+ ftrace_bug_type = FTRACE_BUG_UNKNOWN;
+
+ if (rec->flags & FTRACE_FL_DISABLED)
+ return FTRACE_UPDATE_IGNORE;
+
/*
* If we are updating calls:
*
@@ -2077,9 +2090,12 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)
* from the save regs, to a non-save regs function or
* vice versa, or from a trampoline call.
*/
- if (flag & FTRACE_FL_ENABLED)
+ if (flag & FTRACE_FL_ENABLED) {
+ ftrace_bug_type = FTRACE_BUG_CALL;
return FTRACE_UPDATE_MAKE_CALL;
+ }
+ ftrace_bug_type = FTRACE_BUG_UPDATE;
return FTRACE_UPDATE_MODIFY_CALL;
}
@@ -2096,6 +2112,7 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)
FTRACE_FL_REGS_EN);
}
+ ftrace_bug_type = FTRACE_BUG_NOP;
return FTRACE_UPDATE_MAKE_NOP;
}
@@ -2145,6 +2162,24 @@ ftrace_find_tramp_ops_any(struct dyn_ftrace *rec)
}
static struct ftrace_ops *
+ftrace_find_tramp_ops_next(struct dyn_ftrace *rec,
+ struct ftrace_ops *op)
+{
+ unsigned long ip = rec->ip;
+
+ while_for_each_ftrace_op(op) {
+
+ if (!op->trampoline)
+ continue;
+
+ if (hash_contains_ip(ip, op->func_hash))
+ return op;
+ }
+
+ return NULL;
+}
+
+static struct ftrace_ops *
ftrace_find_tramp_ops_curr(struct dyn_ftrace *rec)
{
struct ftrace_ops *op;
@@ -2307,17 +2342,22 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
ret = ftrace_update_record(rec, enable);
+ ftrace_bug_type = FTRACE_BUG_UNKNOWN;
+
switch (ret) {
case FTRACE_UPDATE_IGNORE:
return 0;
case FTRACE_UPDATE_MAKE_CALL:
+ ftrace_bug_type = FTRACE_BUG_CALL;
return ftrace_make_call(rec, ftrace_addr);
case FTRACE_UPDATE_MAKE_NOP:
+ ftrace_bug_type = FTRACE_BUG_NOP;
return ftrace_make_nop(NULL, rec, ftrace_old_addr);
case FTRACE_UPDATE_MODIFY_CALL:
+ ftrace_bug_type = FTRACE_BUG_UPDATE;
return ftrace_modify_call(rec, ftrace_old_addr, ftrace_addr);
}
@@ -2425,6 +2465,7 @@ ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec)
ret = ftrace_make_nop(mod, rec, MCOUNT_ADDR);
if (ret) {
+ ftrace_bug_type = FTRACE_BUG_INIT;
ftrace_bug(ret, rec);
return 0;
}
@@ -2566,7 +2607,7 @@ void __weak arch_ftrace_trampoline_free(struct ftrace_ops *ops)
{
}
-static void control_ops_free(struct ftrace_ops *ops)
+static void per_cpu_ops_free(struct ftrace_ops *ops)
{
free_percpu(ops->disabled);
}
@@ -2667,13 +2708,13 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
if (!command || !ftrace_enabled) {
/*
- * If these are control ops, they still need their
+ * If these are per_cpu ops, they still need their
* per_cpu field freed. Since, function tracing is
* not currently active, we can just free them
* without synchronizing all CPUs.
*/
- if (ops->flags & FTRACE_OPS_FL_CONTROL)
- control_ops_free(ops);
+ if (ops->flags & FTRACE_OPS_FL_PER_CPU)
+ per_cpu_ops_free(ops);
return 0;
}
@@ -2714,7 +2755,7 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
/*
* Dynamic ops may be freed, we must make sure that all
* callers are done before leaving this function.
- * The same goes for freeing the per_cpu data of the control
+ * The same goes for freeing the per_cpu data of the per_cpu
* ops.
*
* Again, normal synchronize_sched() is not good enough.
@@ -2725,13 +2766,13 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
* infrastructure to do the synchronization, thus we must do it
* ourselves.
*/
- if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_CONTROL)) {
+ if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_PER_CPU)) {
schedule_on_each_cpu(ftrace_sync);
arch_ftrace_trampoline_free(ops);
- if (ops->flags & FTRACE_OPS_FL_CONTROL)
- control_ops_free(ops);
+ if (ops->flags & FTRACE_OPS_FL_PER_CPU)
+ per_cpu_ops_free(ops);
}
return 0;
@@ -2798,9 +2839,9 @@ ops_references_rec(struct ftrace_ops *ops, struct dyn_ftrace *rec)
if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
return 0;
- /* If ops traces all mods, we already accounted for it */
+ /* If ops traces all then it includes this function */
if (ops_traces_mod(ops))
- return 0;
+ return 1;
/* The function must be in the filter */
if (!ftrace_hash_empty(ops->func_hash->filter_hash) &&
@@ -2814,64 +2855,41 @@ ops_references_rec(struct ftrace_ops *ops, struct dyn_ftrace *rec)
return 1;
}
-static int referenced_filters(struct dyn_ftrace *rec)
-{
- struct ftrace_ops *ops;
- int cnt = 0;
-
- for (ops = ftrace_ops_list; ops != &ftrace_list_end; ops = ops->next) {
- if (ops_references_rec(ops, rec))
- cnt++;
- }
-
- return cnt;
-}
-
static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs)
{
struct ftrace_page *pg;
struct dyn_ftrace *p;
cycle_t start, stop;
unsigned long update_cnt = 0;
- unsigned long ref = 0;
- bool test = false;
+ unsigned long rec_flags = 0;
int i;
+ start = ftrace_now(raw_smp_processor_id());
+
/*
- * When adding a module, we need to check if tracers are
- * currently enabled and if they are set to trace all functions.
- * If they are, we need to enable the module functions as well
- * as update the reference counts for those function records.
+ * When a module is loaded, this function is called to convert
+ * the calls to mcount in its text to nops, and also to create
+ * an entry in the ftrace data. Now, if ftrace is activated
+ * after this call, but before the module sets its text to
+ * read-only, the modification of enabling ftrace can fail if
+ * the read-only is done while ftrace is converting the calls.
+ * To prevent this, the module's records are set as disabled
+ * and will be enabled after the call to set the module's text
+ * to read-only.
*/
- if (mod) {
- struct ftrace_ops *ops;
-
- for (ops = ftrace_ops_list;
- ops != &ftrace_list_end; ops = ops->next) {
- if (ops->flags & FTRACE_OPS_FL_ENABLED) {
- if (ops_traces_mod(ops))
- ref++;
- else
- test = true;
- }
- }
- }
-
- start = ftrace_now(raw_smp_processor_id());
+ if (mod)
+ rec_flags |= FTRACE_FL_DISABLED;
for (pg = new_pgs; pg; pg = pg->next) {
for (i = 0; i < pg->index; i++) {
- int cnt = ref;
/* If something went wrong, bail without enabling anything */
if (unlikely(ftrace_disabled))
return -1;
p = &pg->records[i];
- if (test)
- cnt += referenced_filters(p);
- p->flags = cnt;
+ p->flags = rec_flags;
/*
* Do the initial record conversion from mcount jump
@@ -2881,21 +2899,6 @@ static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs)
break;
update_cnt++;
-
- /*
- * If the tracing is enabled, go ahead and enable the record.
- *
- * The reason not to enable the record immediatelly is the
- * inherent check of ftrace_make_nop/ftrace_make_call for
- * correct previous instructions. Making first the NOP
- * conversion puts the module to the correct state, thus
- * passing the ftrace_make_call check.
- */
- if (ftrace_start_up && cnt) {
- int failed = __ftrace_replace_code(p, 1);
- if (failed)
- ftrace_bug(failed, p);
- }
}
}
@@ -3258,7 +3261,7 @@ static int t_show(struct seq_file *m, void *v)
seq_printf(m, "%ps", (void *)rec->ip);
if (iter->flags & FTRACE_ITER_ENABLED) {
- struct ftrace_ops *ops = NULL;
+ struct ftrace_ops *ops;
seq_printf(m, " (%ld)%s%s",
ftrace_rec_count(rec),
@@ -3266,14 +3269,19 @@ static int t_show(struct seq_file *m, void *v)
rec->flags & FTRACE_FL_IPMODIFY ? " I" : " ");
if (rec->flags & FTRACE_FL_TRAMP_EN) {
ops = ftrace_find_tramp_ops_any(rec);
- if (ops)
- seq_printf(m, "\ttramp: %pS",
- (void *)ops->trampoline);
- else
+ if (ops) {
+ do {
+ seq_printf(m, "\ttramp: %pS (%pS)",
+ (void *)ops->trampoline,
+ (void *)ops->func);
+ add_trampoline_func(m, ops, rec);
+ ops = ftrace_find_tramp_ops_next(rec, ops);
+ } while (ops);
+ } else
seq_puts(m, "\ttramp: ERROR!");
-
+ } else {
+ add_trampoline_func(m, NULL, rec);
}
- add_trampoline_func(m, ops, rec);
}
seq_putc(m, '\n');
@@ -4898,6 +4906,19 @@ static int ftrace_process_locs(struct module *mod,
#define next_to_ftrace_page(p) container_of(p, struct ftrace_page, next)
+static int referenced_filters(struct dyn_ftrace *rec)
+{
+ struct ftrace_ops *ops;
+ int cnt = 0;
+
+ for (ops = ftrace_ops_list; ops != &ftrace_list_end; ops = ops->next) {
+ if (ops_references_rec(ops, rec))
+ cnt++;
+ }
+
+ return cnt;
+}
+
void ftrace_release_mod(struct module *mod)
{
struct dyn_ftrace *rec;
@@ -4940,41 +4961,112 @@ void ftrace_release_mod(struct module *mod)
mutex_unlock(&ftrace_lock);
}
-static void ftrace_init_module(struct module *mod,
- unsigned long *start, unsigned long *end)
+static void ftrace_module_enable(struct module *mod)
{
- if (ftrace_disabled || start == end)
- return;
- ftrace_process_locs(mod, start, end);
+ struct dyn_ftrace *rec;
+ struct ftrace_page *pg;
+
+ mutex_lock(&ftrace_lock);
+
+ if (ftrace_disabled)
+ goto out_unlock;
+
+ /*
+ * If the tracing is enabled, go ahead and enable the record.
+ *
+ * The reason not to enable the record immediately is the
+ * inherent check of ftrace_make_nop/ftrace_make_call for
+ * correct previous instructions. Making first the NOP
+ * conversion puts the module to the correct state, thus
+ * passing the ftrace_make_call check.
+ *
+ * We also delay this until after the module code has already set the
+ * text to read-only, as we now need to set it back to read-write
+ * so that we can modify the text.
+ */
+ if (ftrace_start_up)
+ ftrace_arch_code_modify_prepare();
+
+ do_for_each_ftrace_rec(pg, rec) {
+ int cnt;
+ /*
+ * do_for_each_ftrace_rec() is a double loop.
+ * module text shares the pg. If a record is
+ * not part of this module, then skip this pg,
+ * which the "break" will do.
+ */
+ if (!within_module_core(rec->ip, mod))
+ break;
+
+ cnt = 0;
+
+ /*
+ * When adding a module, we need to check if tracers are
+ * currently enabled and if they are, and can trace this record,
+ * we need to enable the module functions as well as update the
+ * reference counts for those function records.
+ */
+ if (ftrace_start_up)
+ cnt += referenced_filters(rec);
+
+ /* This clears FTRACE_FL_DISABLED */
+ rec->flags = cnt;
+
+ if (ftrace_start_up && cnt) {
+ int failed = __ftrace_replace_code(rec, 1);
+ if (failed) {
+ ftrace_bug(failed, rec);
+ goto out_loop;
+ }
+ }
+
+ } while_for_each_ftrace_rec();
+
+ out_loop:
+ if (ftrace_start_up)
+ ftrace_arch_code_modify_post_process();
+
+ out_unlock:
+ mutex_unlock(&ftrace_lock);
}
void ftrace_module_init(struct module *mod)
{
- ftrace_init_module(mod, mod->ftrace_callsites,
- mod->ftrace_callsites +
- mod->num_ftrace_callsites);
+ if (ftrace_disabled || !mod->num_ftrace_callsites)
+ return;
+
+ ftrace_process_locs(mod, mod->ftrace_callsites,
+ mod->ftrace_callsites + mod->num_ftrace_callsites);
}
-static int ftrace_module_notify_exit(struct notifier_block *self,
- unsigned long val, void *data)
+static int ftrace_module_notify(struct notifier_block *self,
+ unsigned long val, void *data)
{
struct module *mod = data;
- if (val == MODULE_STATE_GOING)
+ switch (val) {
+ case MODULE_STATE_COMING:
+ ftrace_module_enable(mod);
+ break;
+ case MODULE_STATE_GOING:
ftrace_release_mod(mod);
+ break;
+ default:
+ break;
+ }
return 0;
}
#else
-static int ftrace_module_notify_exit(struct notifier_block *self,
- unsigned long val, void *data)
+static int ftrace_module_notify(struct notifier_block *self,
+ unsigned long val, void *data)
{
return 0;
}
#endif /* CONFIG_MODULES */
-struct notifier_block ftrace_module_exit_nb = {
- .notifier_call = ftrace_module_notify_exit,
+struct notifier_block ftrace_module_nb = {
+ .notifier_call = ftrace_module_notify,
.priority = INT_MIN, /* Run after anything that can remove kprobes */
};
@@ -5006,7 +5098,7 @@ void __init ftrace_init(void)
__start_mcount_loc,
__stop_mcount_loc);
- ret = register_module_notifier(&ftrace_module_exit_nb);
+ ret = register_module_notifier(&ftrace_module_nb);
if (ret)
pr_warning("Failed to register trace ftrace module exit notifier\n");
@@ -5116,44 +5208,6 @@ void ftrace_reset_array_ops(struct trace_array *tr)
tr->ops->func = ftrace_stub;
}
-static void
-ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip,
- struct ftrace_ops *op, struct pt_regs *regs)
-{
- if (unlikely(trace_recursion_test(TRACE_CONTROL_BIT)))
- return;
-
- /*
- * Some of the ops may be dynamically allocated,
- * they must be freed after a synchronize_sched().
- */
- preempt_disable_notrace();
- trace_recursion_set(TRACE_CONTROL_BIT);
-
- /*
- * Control funcs (perf) uses RCU. Only trace if
- * RCU is currently active.
- */
- if (!rcu_is_watching())
- goto out;
-
- do_for_each_ftrace_op(op, ftrace_control_list) {
- if (!(op->flags & FTRACE_OPS_FL_STUB) &&
- !ftrace_function_local_disabled(op) &&
- ftrace_ops_test(op, ip, regs))
- op->func(ip, parent_ip, op, regs);
- } while_for_each_ftrace_op(op);
- out:
- trace_recursion_clear(TRACE_CONTROL_BIT);
- preempt_enable_notrace();
-}
-
-static struct ftrace_ops control_ops = {
- .func = ftrace_ops_control_func,
- .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED,
- INIT_OPS_HASH(control_ops)
-};
-
static inline void
__ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *ignored, struct pt_regs *regs)
@@ -5170,8 +5224,22 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
* they must be freed after a synchronize_sched().
*/
preempt_disable_notrace();
+
do_for_each_ftrace_op(op, ftrace_ops_list) {
- if (ftrace_ops_test(op, ip, regs)) {
+ /*
+ * Check the following for each ops before calling their func:
+ * if RCU flag is set, then rcu_is_watching() must be true
+ * if PER_CPU is set, then ftrace_function_local_disabled()
+ * must be false
+ * Otherwise test if the ip matches the ops filter
+ *
+ * If any of the above fails then the op->func() is not executed.
+ */
+ if ((!(op->flags & FTRACE_OPS_FL_RCU) || rcu_is_watching()) &&
+ (!(op->flags & FTRACE_OPS_FL_PER_CPU) ||
+ !ftrace_function_local_disabled(op)) &&
+ ftrace_ops_test(op, ip, regs)) {
+
if (FTRACE_WARN_ON(!op->func)) {
pr_warn("op=%p %pS\n", op, op);
goto out;
@@ -5195,7 +5263,7 @@ out:
* being NULL, or CONFIG_DYNAMIC_FTRACE_WITH_REGS.
* Note, CONFIG_DYNAMIC_FTRACE_WITH_REGS expects a full regs to be saved.
* An architecture can pass partial regs with ftrace_ops and still
- * set the ARCH_SUPPORT_FTARCE_OPS.
+ * set the ARCH_SUPPORTS_FTRACE_OPS.
*/
#if ARCH_SUPPORTS_FTRACE_OPS
static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
@@ -5212,20 +5280,29 @@ static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip)
/*
* If there's only one function registered but it does not support
- * recursion, this function will be called by the mcount trampoline.
- * This function will handle recursion protection.
+ * recursion, needs RCU protection and/or requires per cpu handling, then
+ * this function will be called by the mcount trampoline.
*/
-static void ftrace_ops_recurs_func(unsigned long ip, unsigned long parent_ip,
+static void ftrace_ops_assist_func(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op, struct pt_regs *regs)
{
int bit;
+ if ((op->flags & FTRACE_OPS_FL_RCU) && !rcu_is_watching())
+ return;
+
bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX);
if (bit < 0)
return;
- op->func(ip, parent_ip, op, regs);
+ preempt_disable_notrace();
+ if (!(op->flags & FTRACE_OPS_FL_PER_CPU) ||
+ !ftrace_function_local_disabled(op)) {
+ op->func(ip, parent_ip, op, regs);
+ }
+
+ preempt_enable_notrace();
trace_clear_recursion(bit);
}
@@ -5243,12 +5320,12 @@ static void ftrace_ops_recurs_func(unsigned long ip, unsigned long parent_ip,
ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops)
{
/*
- * If the func handles its own recursion, call it directly.
- * Otherwise call the recursion protected function that
- * will call the ftrace ops function.
+ * If the function does not handle recursion, needs to be RCU safe,
+ * or does per cpu logic, then we need to call the assist handler.
*/
- if (!(ops->flags & FTRACE_OPS_FL_RECURSION_SAFE))
- return ftrace_ops_recurs_func;
+ if (!(ops->flags & FTRACE_OPS_FL_RECURSION_SAFE) ||
+ ops->flags & (FTRACE_OPS_FL_RCU | FTRACE_OPS_FL_PER_CPU))
+ return ftrace_ops_assist_func;
return ops->func;
}
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 9c6045a..95181e3 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1001,17 +1001,13 @@ static int rb_head_page_replace(struct buffer_page *old,
/*
* rb_tail_page_update - move the tail page forward
- *
- * Returns 1 if moved tail page, 0 if someone else did.
*/
-static int rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer,
+static void rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer,
struct buffer_page *tail_page,
struct buffer_page *next_page)
{
- struct buffer_page *old_tail;
unsigned long old_entries;
unsigned long old_write;
- int ret = 0;
/*
* The tail page now needs to be moved forward.
@@ -1036,7 +1032,7 @@ static int rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer,
* it is, then it is up to us to update the tail
* pointer.
*/
- if (tail_page == cpu_buffer->tail_page) {
+ if (tail_page == READ_ONCE(cpu_buffer->tail_page)) {
/* Zero the write counter */
unsigned long val = old_write & ~RB_WRITE_MASK;
unsigned long eval = old_entries & ~RB_WRITE_MASK;
@@ -1061,14 +1057,9 @@ static int rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer,
*/
local_set(&next_page->page->commit, 0);
- old_tail = cmpxchg(&cpu_buffer->tail_page,
- tail_page, next_page);
-
- if (old_tail == tail_page)
- ret = 1;
+ /* Again, either we update tail_page or an interrupt does */
+ (void)cmpxchg(&cpu_buffer->tail_page, tail_page, next_page);
}
-
- return ret;
}
static int rb_check_bpage(struct ring_buffer_per_cpu *cpu_buffer,
@@ -2036,12 +2027,15 @@ rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer,
* the tail page would have moved.
*/
if (ret == RB_PAGE_NORMAL) {
+ struct buffer_page *buffer_tail_page;
+
+ buffer_tail_page = READ_ONCE(cpu_buffer->tail_page);
/*
* If the tail had moved passed next, then we need
* to reset the pointer.
*/
- if (cpu_buffer->tail_page != tail_page &&
- cpu_buffer->tail_page != next_page)
+ if (buffer_tail_page != tail_page &&
+ buffer_tail_page != next_page)
rb_head_page_set_normal(cpu_buffer, new_head,
next_page,
RB_PAGE_HEAD);
@@ -2135,6 +2129,8 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
local_sub(length, &tail_page->write);
}
+static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer);
+
/*
* This is the slow path, force gcc not to inline it.
*/
@@ -2147,7 +2143,6 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
struct ring_buffer *buffer = cpu_buffer->buffer;
struct buffer_page *next_page;
int ret;
- u64 ts;
next_page = tail_page;
@@ -2221,20 +2216,17 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
}
}
- ret = rb_tail_page_update(cpu_buffer, tail_page, next_page);
- if (ret) {
- /*
- * Nested commits always have zero deltas, so
- * just reread the time stamp
- */
- ts = rb_time_stamp(buffer);
- next_page->page->time_stamp = ts;
- }
+ rb_tail_page_update(cpu_buffer, tail_page, next_page);
out_again:
rb_reset_tail(cpu_buffer, tail, info);
+ /* Commit what we have for now. */
+ rb_end_commit(cpu_buffer);
+ /* rb_end_commit() decrements committing, so increment it again */
+ local_inc(&cpu_buffer->committing);
+
/* fail and let the caller try again */
return ERR_PTR(-EAGAIN);
@@ -2362,7 +2354,7 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
addr = (unsigned long)event;
addr &= PAGE_MASK;
- bpage = cpu_buffer->tail_page;
+ bpage = READ_ONCE(cpu_buffer->tail_page);
if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) {
unsigned long write_mask =
@@ -2410,7 +2402,7 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
again:
max_count = cpu_buffer->nr_pages * 100;
- while (cpu_buffer->commit_page != cpu_buffer->tail_page) {
+ while (cpu_buffer->commit_page != READ_ONCE(cpu_buffer->tail_page)) {
if (RB_WARN_ON(cpu_buffer, !(--max_count)))
return;
if (RB_WARN_ON(cpu_buffer,
@@ -2419,8 +2411,10 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
local_set(&cpu_buffer->commit_page->page->commit,
rb_page_write(cpu_buffer->commit_page));
rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
- cpu_buffer->write_stamp =
- cpu_buffer->commit_page->page->time_stamp;
+ /* Only update the write stamp if the page has an event */
+ if (rb_page_write(cpu_buffer->commit_page))
+ cpu_buffer->write_stamp =
+ cpu_buffer->commit_page->page->time_stamp;
/* add barrier to keep gcc from optimizing too much */
barrier();
}
@@ -2443,7 +2437,7 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
* and pushed the tail page forward, we will be left with
* a dangling commit that will never go forward.
*/
- if (unlikely(cpu_buffer->commit_page != cpu_buffer->tail_page))
+ if (unlikely(cpu_buffer->commit_page != READ_ONCE(cpu_buffer->tail_page)))
goto again;
}
@@ -2699,7 +2693,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
if (unlikely(info->add_timestamp))
info->length += RB_LEN_TIME_EXTEND;
- tail_page = info->tail_page = cpu_buffer->tail_page;
+ /* Don't let the compiler play games with cpu_buffer->tail_page */
+ tail_page = info->tail_page = READ_ONCE(cpu_buffer->tail_page);
write = local_add_return(info->length, &tail_page->write);
/* set write to only the index of the write */
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 919d9d0..8414fa4 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -363,8 +363,8 @@ struct trace_option_dentry {
* @name: the name chosen to select it on the available_tracers file
* @init: called when one switches to this tracer (echo name > current_tracer)
* @reset: called when one switches to another tracer
- * @start: called when tracing is unpaused (echo 1 > tracing_enabled)
- * @stop: called when tracing is paused (echo 0 > tracing_enabled)
+ * @start: called when tracing is unpaused (echo 1 > tracing_on)
+ * @stop: called when tracing is paused (echo 0 > tracing_on)
* @update_thresh: called when tracing_thresh is updated
* @open: called when the trace file is opened
* @pipe_open: called when the trace_pipe file is opened
@@ -467,8 +467,6 @@ enum {
TRACE_INTERNAL_IRQ_BIT,
TRACE_INTERNAL_SIRQ_BIT,
- TRACE_CONTROL_BIT,
-
TRACE_BRANCH_BIT,
/*
* Abuse of the trace_recursion.
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index cc9f7a9..00df25f 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -334,7 +334,7 @@ static int perf_ftrace_function_register(struct perf_event *event)
{
struct ftrace_ops *ops = &event->ftrace_ops;
- ops->flags |= FTRACE_OPS_FL_CONTROL;
+ ops->flags |= FTRACE_OPS_FL_PER_CPU | FTRACE_OPS_FL_RCU;
ops->func = perf_ftrace_function_call;
return register_ftrace_function(ops);
}
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 4f6ef69..f333e57 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1340,15 +1340,9 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
if (cnt >= PAGE_SIZE)
return -EINVAL;
- buf = (char *)__get_free_page(GFP_TEMPORARY);
- if (!buf)
- return -ENOMEM;
-
- if (copy_from_user(buf, ubuf, cnt)) {
- free_page((unsigned long) buf);
- return -EFAULT;
- }
- buf[cnt] = '\0';
+ buf = memdup_user_nul(ubuf, cnt);
+ if (IS_ERR(buf))
+ return PTR_ERR(buf);
mutex_lock(&event_mutex);
file = event_file_data(filp);
@@ -1356,7 +1350,7 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
err = apply_event_filter(file, buf);
mutex_unlock(&event_mutex);
- free_page((unsigned long) buf);
+ kfree(buf);
if (err < 0)
return err;
@@ -1507,18 +1501,12 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
if (cnt >= PAGE_SIZE)
return -EINVAL;
- buf = (char *)__get_free_page(GFP_TEMPORARY);
- if (!buf)
- return -ENOMEM;
-
- if (copy_from_user(buf, ubuf, cnt)) {
- free_page((unsigned long) buf);
- return -EFAULT;
- }
- buf[cnt] = '\0';
+ buf = memdup_user_nul(ubuf, cnt);
+ if (IS_ERR(buf))
+ return PTR_ERR(buf);
err = apply_subsystem_event_filter(dir, buf);
- free_page((unsigned long) buf);
+ kfree(buf);
if (err < 0)
return err;
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index 42a4009..b38f617 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -237,28 +237,23 @@ static ssize_t event_trigger_regex_write(struct file *file,
if (cnt >= PAGE_SIZE)
return -EINVAL;
- buf = (char *)__get_free_page(GFP_TEMPORARY);
- if (!buf)
- return -ENOMEM;
+ buf = memdup_user_nul(ubuf, cnt);
+ if (IS_ERR(buf))
+ return PTR_ERR(buf);
- if (copy_from_user(buf, ubuf, cnt)) {
- free_page((unsigned long)buf);
- return -EFAULT;
- }
- buf[cnt] = '\0';
strim(buf);
mutex_lock(&event_mutex);
event_file = event_file_data(file);
if (unlikely(!event_file)) {
mutex_unlock(&event_mutex);
- free_page((unsigned long)buf);
+ kfree(buf);
return -ENODEV;
}
ret = trigger_process_regex(event_file, buf);
mutex_unlock(&event_mutex);
- free_page((unsigned long)buf);
+ kfree(buf);
if (ret < 0)
goto out;
@@ -543,11 +538,12 @@ static int register_trigger(char *glob, struct event_trigger_ops *ops,
list_add_rcu(&data->list, &file->triggers);
ret++;
+ update_cond_flag(file);
if (trace_event_trigger_enable_disable(file, 1) < 0) {
list_del_rcu(&data->list);
+ update_cond_flag(file);
ret--;
}
- update_cond_flag(file);
out:
return ret;
}
@@ -575,8 +571,8 @@ static void unregister_trigger(char *glob, struct event_trigger_ops *ops,
if (data->cmd_ops->trigger_type == test->cmd_ops->trigger_type) {
unregistered = true;
list_del_rcu(&data->list);
- update_cond_flag(file);
trace_event_trigger_enable_disable(file, 0);
+ update_cond_flag(file);
break;
}
}
@@ -1319,11 +1315,12 @@ static int event_enable_register_trigger(char *glob,
list_add_rcu(&data->list, &file->triggers);
ret++;
+ update_cond_flag(file);
if (trace_event_trigger_enable_disable(file, 1) < 0) {
list_del_rcu(&data->list);
+ update_cond_flag(file);
ret--;
}
- update_cond_flag(file);
out:
return ret;
}
@@ -1344,8 +1341,8 @@ static void event_enable_unregister_trigger(char *glob,
(enable_data->file == test_enable_data->file)) {
unregistered = true;
list_del_rcu(&data->list);
- update_cond_flag(file);
trace_event_trigger_enable_disable(file, 0);
+ update_cond_flag(file);
break;
}
}
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 88fefa6..9bafc21 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -602,8 +602,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
struct uid_gid_map new_map;
unsigned idx;
struct uid_gid_extent *extent = NULL;
- unsigned long page = 0;
- char *kbuf, *pos, *next_line;
+ char *kbuf = NULL, *pos, *next_line;
ssize_t ret = -EINVAL;
/*
@@ -638,23 +637,18 @@ static ssize_t map_write(struct file *file, const char __user *buf,
if (cap_valid(cap_setid) && !file_ns_capable(file, ns, CAP_SYS_ADMIN))
goto out;
- /* Get a buffer */
- ret = -ENOMEM;
- page = __get_free_page(GFP_TEMPORARY);
- kbuf = (char *) page;
- if (!page)
- goto out;
-
/* Only allow < page size writes at the beginning of the file */
ret = -EINVAL;
if ((*ppos != 0) || (count >= PAGE_SIZE))
goto out;
/* Slurp in the user data */
- ret = -EFAULT;
- if (copy_from_user(kbuf, buf, count))
+ kbuf = memdup_user_nul(buf, count);
+ if (IS_ERR(kbuf)) {
+ ret = PTR_ERR(kbuf);
+ kbuf = NULL;
goto out;
- kbuf[count] = '\0';
+ }
/* Parse the user data */
ret = -EINVAL;
@@ -756,8 +750,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
ret = count;
out:
mutex_unlock(&userns_state_mutex);
- if (page)
- free_page(page);
+ kfree(kbuf);
return ret;
}
OpenPOWER on IntegriCloud