diff options
author | James Morris <jmorris@namei.org> | 2009-06-30 09:10:35 +1000 |
---|---|---|
committer | James Morris <jmorris@namei.org> | 2009-06-30 09:10:35 +1000 |
commit | ac7242142b03421c96b0a2f8d99f146d075614c2 (patch) | |
tree | b0b2ead65858c7a343d38affed86fe815e37e7e9 /kernel | |
parent | 89c86576ecde504da1eeb4f4882b2189ac2f9c4a (diff) | |
parent | 2bfdd79eaa0043346e773ba5f6cfd811ea31b73d (diff) | |
download | op-kernel-dev-ac7242142b03421c96b0a2f8d99f146d075614c2.zip op-kernel-dev-ac7242142b03421c96b0a2f8d99f146d075614c2.tar.gz |
Merge branch 'master' into next
Diffstat (limited to 'kernel')
33 files changed, 1362 insertions, 1050 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 0a32cb2..2093a69 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -69,7 +69,7 @@ obj-$(CONFIG_IKCONFIG) += configs.o obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o obj-$(CONFIG_STOP_MACHINE) += stop_machine.o obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o -obj-$(CONFIG_AUDIT) += audit.o auditfilter.o +obj-$(CONFIG_AUDIT) += audit.o auditfilter.o audit_watch.o obj-$(CONFIG_AUDITSYSCALL) += auditsc.o obj-$(CONFIG_GCOV_KERNEL) += gcov/ obj-$(CONFIG_AUDIT_TREE) += audit_tree.o @@ -96,6 +96,7 @@ obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o obj-$(CONFIG_FUNCTION_TRACER) += trace/ obj-$(CONFIG_TRACING) += trace/ obj-$(CONFIG_X86_DS) += trace/ +obj-$(CONFIG_RING_BUFFER) += trace/ obj-$(CONFIG_SMP) += sched_cpupri.o obj-$(CONFIG_SLOW_WORK) += slow-work.o obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o diff --git a/kernel/audit.c b/kernel/audit.c index 9442c35..defc2e6 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -115,9 +115,6 @@ static atomic_t audit_lost = ATOMIC_INIT(0); /* The netlink socket. */ static struct sock *audit_sock; -/* Inotify handle. */ -struct inotify_handle *audit_ih; - /* Hash for inode-based rules */ struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS]; @@ -136,7 +133,7 @@ static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait); static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait); /* Serialize requests from userspace. */ -static DEFINE_MUTEX(audit_cmd_mutex); +DEFINE_MUTEX(audit_cmd_mutex); /* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting * audit records. Since printk uses a 1024 byte buffer, this buffer @@ -375,6 +372,25 @@ static void audit_hold_skb(struct sk_buff *skb) kfree_skb(skb); } +/* + * For one reason or another this nlh isn't getting delivered to the userspace + * audit daemon, just send it to printk. + */ +static void audit_printk_skb(struct sk_buff *skb) +{ + struct nlmsghdr *nlh = nlmsg_hdr(skb); + char *data = NLMSG_DATA(nlh); + + if (nlh->nlmsg_type != AUDIT_EOE) { + if (printk_ratelimit()) + printk(KERN_NOTICE "type=%d %s\n", nlh->nlmsg_type, data); + else + audit_log_lost("printk limit exceeded\n"); + } + + audit_hold_skb(skb); +} + static void kauditd_send_skb(struct sk_buff *skb) { int err; @@ -427,14 +443,8 @@ static int kauditd_thread(void *dummy) if (skb) { if (audit_pid) kauditd_send_skb(skb); - else { - if (printk_ratelimit()) - printk(KERN_NOTICE "%s\n", skb->data + NLMSG_SPACE(0)); - else - audit_log_lost("printk limit exceeded\n"); - - audit_hold_skb(skb); - } + else + audit_printk_skb(skb); } else { DECLARE_WAITQUEUE(wait, current); set_current_state(TASK_INTERRUPTIBLE); @@ -495,42 +505,25 @@ int audit_send_list(void *_dest) return 0; } -#ifdef CONFIG_AUDIT_TREE -static int prune_tree_thread(void *unused) -{ - mutex_lock(&audit_cmd_mutex); - audit_prune_trees(); - mutex_unlock(&audit_cmd_mutex); - return 0; -} - -void audit_schedule_prune(void) -{ - kthread_run(prune_tree_thread, NULL, "audit_prune_tree"); -} -#endif - struct sk_buff *audit_make_reply(int pid, int seq, int type, int done, int multi, void *payload, int size) { struct sk_buff *skb; struct nlmsghdr *nlh; - int len = NLMSG_SPACE(size); void *data; int flags = multi ? NLM_F_MULTI : 0; int t = done ? NLMSG_DONE : type; - skb = alloc_skb(len, GFP_KERNEL); + skb = nlmsg_new(size, GFP_KERNEL); if (!skb) return NULL; - nlh = NLMSG_PUT(skb, pid, seq, t, size); - nlh->nlmsg_flags = flags; - data = NLMSG_DATA(nlh); + nlh = NLMSG_NEW(skb, pid, seq, t, size, flags); + data = NLMSG_DATA(nlh); memcpy(data, payload, size); return skb; -nlmsg_failure: /* Used by NLMSG_PUT */ +nlmsg_failure: /* Used by NLMSG_NEW */ if (skb) kfree_skb(skb); return NULL; @@ -926,28 +919,29 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) } /* - * Get message from skb (based on rtnetlink_rcv_skb). Each message is - * processed by audit_receive_msg. Malformed skbs with wrong length are - * discarded silently. + * Get message from skb. Each message is processed by audit_receive_msg. + * Malformed skbs with wrong length are discarded silently. */ static void audit_receive_skb(struct sk_buff *skb) { - int err; - struct nlmsghdr *nlh; - u32 rlen; + struct nlmsghdr *nlh; + /* + * len MUST be signed for NLMSG_NEXT to be able to dec it below 0 + * if the nlmsg_len was not aligned + */ + int len; + int err; - while (skb->len >= NLMSG_SPACE(0)) { - nlh = nlmsg_hdr(skb); - if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len) - return; - rlen = NLMSG_ALIGN(nlh->nlmsg_len); - if (rlen > skb->len) - rlen = skb->len; - if ((err = audit_receive_msg(skb, nlh))) { + nlh = nlmsg_hdr(skb); + len = skb->len; + + while (NLMSG_OK(nlh, len)) { + err = audit_receive_msg(skb, nlh); + /* if err or if this message says it wants a response */ + if (err || (nlh->nlmsg_flags & NLM_F_ACK)) netlink_ack(skb, nlh, err); - } else if (nlh->nlmsg_flags & NLM_F_ACK) - netlink_ack(skb, nlh, 0); - skb_pull(skb, rlen); + + nlh = NLMSG_NEXT(nlh, len); } } @@ -959,13 +953,6 @@ static void audit_receive(struct sk_buff *skb) mutex_unlock(&audit_cmd_mutex); } -#ifdef CONFIG_AUDITSYSCALL -static const struct inotify_operations audit_inotify_ops = { - .handle_event = audit_handle_ievent, - .destroy_watch = audit_free_parent, -}; -#endif - /* Initialize audit support at boot time. */ static int __init audit_init(void) { @@ -991,12 +978,6 @@ static int __init audit_init(void) audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized"); -#ifdef CONFIG_AUDITSYSCALL - audit_ih = inotify_init(&audit_inotify_ops); - if (IS_ERR(audit_ih)) - audit_panic("cannot initialize inotify handle"); -#endif - for (i = 0; i < AUDIT_INODE_BUCKETS; i++) INIT_LIST_HEAD(&audit_inode_hash[i]); @@ -1070,18 +1051,20 @@ static struct audit_buffer * audit_buffer_alloc(struct audit_context *ctx, goto err; } - ab->skb = alloc_skb(AUDIT_BUFSIZ, gfp_mask); - if (!ab->skb) - goto err; - ab->ctx = ctx; ab->gfp_mask = gfp_mask; - nlh = (struct nlmsghdr *)skb_put(ab->skb, NLMSG_SPACE(0)); - nlh->nlmsg_type = type; - nlh->nlmsg_flags = 0; - nlh->nlmsg_pid = 0; - nlh->nlmsg_seq = 0; + + ab->skb = nlmsg_new(AUDIT_BUFSIZ, gfp_mask); + if (!ab->skb) + goto nlmsg_failure; + + nlh = NLMSG_NEW(ab->skb, 0, 0, type, 0, 0); + return ab; + +nlmsg_failure: /* Used by NLMSG_NEW */ + kfree_skb(ab->skb); + ab->skb = NULL; err: audit_buffer_free(ab); return NULL; @@ -1452,6 +1435,15 @@ void audit_log_d_path(struct audit_buffer *ab, const char *prefix, kfree(pathname); } +void audit_log_key(struct audit_buffer *ab, char *key) +{ + audit_log_format(ab, " key="); + if (key) + audit_log_untrustedstring(ab, key); + else + audit_log_format(ab, "(null)"); +} + /** * audit_log_end - end one audit record * @ab: the audit_buffer @@ -1475,15 +1467,7 @@ void audit_log_end(struct audit_buffer *ab) skb_queue_tail(&audit_skb_queue, ab->skb); wake_up_interruptible(&kauditd_wait); } else { - if (nlh->nlmsg_type != AUDIT_EOE) { - if (printk_ratelimit()) { - printk(KERN_NOTICE "type=%d %s\n", - nlh->nlmsg_type, - ab->skb->data + NLMSG_SPACE(0)); - } else - audit_log_lost("printk limit exceeded\n"); - } - audit_hold_skb(ab->skb); + audit_printk_skb(ab->skb); } ab->skb = NULL; } diff --git a/kernel/audit.h b/kernel/audit.h index 16f18ca..208687b 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -53,18 +53,7 @@ enum audit_state { }; /* Rule lists */ -struct audit_parent; - -struct audit_watch { - atomic_t count; /* reference count */ - char *path; /* insertion path */ - dev_t dev; /* associated superblock device */ - unsigned long ino; /* associated inode number */ - struct audit_parent *parent; /* associated parent */ - struct list_head wlist; /* entry in parent->watches list */ - struct list_head rules; /* associated rules */ -}; - +struct audit_watch; struct audit_tree; struct audit_chunk; @@ -108,19 +97,28 @@ struct audit_netlink_list { int audit_send_list(void *); -struct inotify_watch; -/* Inotify handle */ -extern struct inotify_handle *audit_ih; - -extern void audit_free_parent(struct inotify_watch *); -extern void audit_handle_ievent(struct inotify_watch *, u32, u32, u32, - const char *, struct inode *); extern int selinux_audit_rule_update(void); extern struct mutex audit_filter_mutex; extern void audit_free_rule_rcu(struct rcu_head *); extern struct list_head audit_filter_list[]; +/* audit watch functions */ +extern unsigned long audit_watch_inode(struct audit_watch *watch); +extern dev_t audit_watch_dev(struct audit_watch *watch); +extern void audit_put_watch(struct audit_watch *watch); +extern void audit_get_watch(struct audit_watch *watch); +extern int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op); +extern int audit_add_watch(struct audit_krule *krule); +extern void audit_remove_watch(struct audit_watch *watch); +extern void audit_remove_watch_rule(struct audit_krule *krule, struct list_head *list); +extern void audit_inotify_unregister(struct list_head *in_list); +extern char *audit_watch_path(struct audit_watch *watch); +extern struct list_head *audit_watch_rules(struct audit_watch *watch); + +extern struct audit_entry *audit_dupe_rule(struct audit_krule *old, + struct audit_watch *watch); + #ifdef CONFIG_AUDIT_TREE extern struct audit_chunk *audit_tree_lookup(const struct inode *); extern void audit_put_chunk(struct audit_chunk *); @@ -130,10 +128,9 @@ extern int audit_add_tree_rule(struct audit_krule *); extern int audit_remove_tree_rule(struct audit_krule *); extern void audit_trim_trees(void); extern int audit_tag_tree(char *old, char *new); -extern void audit_schedule_prune(void); -extern void audit_prune_trees(void); extern const char *audit_tree_path(struct audit_tree *); extern void audit_put_tree(struct audit_tree *); +extern void audit_kill_trees(struct list_head *); #else #define audit_remove_tree_rule(rule) BUG() #define audit_add_tree_rule(rule) -EINVAL @@ -142,6 +139,7 @@ extern void audit_put_tree(struct audit_tree *); #define audit_put_tree(tree) (void)0 #define audit_tag_tree(old, new) -EINVAL #define audit_tree_path(rule) "" /* never called */ +#define audit_kill_trees(list) BUG() #endif extern char *audit_unpack_string(void **, size_t *, size_t); @@ -160,7 +158,10 @@ static inline int audit_signal_info(int sig, struct task_struct *t) return 0; } extern void audit_filter_inodes(struct task_struct *, struct audit_context *); +extern struct list_head *audit_killed_trees(void); #else #define audit_signal_info(s,t) AUDIT_DISABLED #define audit_filter_inodes(t,c) AUDIT_DISABLED #endif + +extern struct mutex audit_cmd_mutex; diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 1f6396d7..2451dc6 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -2,6 +2,7 @@ #include <linux/inotify.h> #include <linux/namei.h> #include <linux/mount.h> +#include <linux/kthread.h> struct audit_tree; struct audit_chunk; @@ -441,13 +442,11 @@ static void kill_rules(struct audit_tree *tree) if (rule->tree) { /* not a half-baked one */ ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); - audit_log_format(ab, "op=remove rule dir="); + audit_log_format(ab, "op="); + audit_log_string(ab, "remove rule"); + audit_log_format(ab, " dir="); audit_log_untrustedstring(ab, rule->tree->pathname); - if (rule->filterkey) { - audit_log_format(ab, " key="); - audit_log_untrustedstring(ab, rule->filterkey); - } else - audit_log_format(ab, " key=(null)"); + audit_log_key(ab, rule->filterkey); audit_log_format(ab, " list=%d res=1", rule->listnr); audit_log_end(ab); rule->tree = NULL; @@ -519,6 +518,8 @@ static void trim_marked(struct audit_tree *tree) } } +static void audit_schedule_prune(void); + /* called with audit_filter_mutex */ int audit_remove_tree_rule(struct audit_krule *rule) { @@ -824,10 +825,11 @@ int audit_tag_tree(char *old, char *new) /* * That gets run when evict_chunk() ends up needing to kill audit_tree. - * Runs from a separate thread, with audit_cmd_mutex held. + * Runs from a separate thread. */ -void audit_prune_trees(void) +static int prune_tree_thread(void *unused) { + mutex_lock(&audit_cmd_mutex); mutex_lock(&audit_filter_mutex); while (!list_empty(&prune_list)) { @@ -844,6 +846,40 @@ void audit_prune_trees(void) } mutex_unlock(&audit_filter_mutex); + mutex_unlock(&audit_cmd_mutex); + return 0; +} + +static void audit_schedule_prune(void) +{ + kthread_run(prune_tree_thread, NULL, "audit_prune_tree"); +} + +/* + * ... and that one is done if evict_chunk() decides to delay until the end + * of syscall. Runs synchronously. + */ +void audit_kill_trees(struct list_head *list) +{ + mutex_lock(&audit_cmd_mutex); + mutex_lock(&audit_filter_mutex); + + while (!list_empty(list)) { + struct audit_tree *victim; + + victim = list_entry(list->next, struct audit_tree, list); + kill_rules(victim); + list_del_init(&victim->list); + + mutex_unlock(&audit_filter_mutex); + + prune_one(victim); + + mutex_lock(&audit_filter_mutex); + } + + mutex_unlock(&audit_filter_mutex); + mutex_unlock(&audit_cmd_mutex); } /* @@ -854,6 +890,8 @@ void audit_prune_trees(void) static void evict_chunk(struct audit_chunk *chunk) { struct audit_tree *owner; + struct list_head *postponed = audit_killed_trees(); + int need_prune = 0; int n; if (chunk->dead) @@ -869,15 +907,21 @@ static void evict_chunk(struct audit_chunk *chunk) owner->root = NULL; list_del_init(&owner->same_root); spin_unlock(&hash_lock); - kill_rules(owner); - list_move(&owner->list, &prune_list); - audit_schedule_prune(); + if (!postponed) { + kill_rules(owner); + list_move(&owner->list, &prune_list); + need_prune = 1; + } else { + list_move(&owner->list, postponed); + } spin_lock(&hash_lock); } list_del_rcu(&chunk->hash); for (n = 0; n < chunk->count; n++) list_del_init(&chunk->owners[n].list); spin_unlock(&hash_lock); + if (need_prune) + audit_schedule_prune(); mutex_unlock(&audit_filter_mutex); } diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c new file mode 100644 index 0000000..0e96dbc --- /dev/null +++ b/kernel/audit_watch.c @@ -0,0 +1,543 @@ +/* audit_watch.c -- watching inodes + * + * Copyright 2003-2009 Red Hat, Inc. + * Copyright 2005 Hewlett-Packard Development Company, L.P. + * Copyright 2005 IBM Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/kernel.h> +#include <linux/audit.h> +#include <linux/kthread.h> +#include <linux/mutex.h> +#include <linux/fs.h> +#include <linux/namei.h> +#include <linux/netlink.h> +#include <linux/sched.h> +#include <linux/inotify.h> +#include <linux/security.h> +#include "audit.h" + +/* + * Reference counting: + * + * audit_parent: lifetime is from audit_init_parent() to receipt of an IN_IGNORED + * event. Each audit_watch holds a reference to its associated parent. + * + * audit_watch: if added to lists, lifetime is from audit_init_watch() to + * audit_remove_watch(). Additionally, an audit_watch may exist + * temporarily to assist in searching existing filter data. Each + * audit_krule holds a reference to its associated watch. + */ + +struct audit_watch { + atomic_t count; /* reference count */ + char *path; /* insertion path */ + dev_t dev; /* associated superblock device */ + unsigned long ino; /* associated inode number */ + struct audit_parent *parent; /* associated parent */ + struct list_head wlist; /* entry in parent->watches list */ + struct list_head rules; /* associated rules */ +}; + +struct audit_parent { + struct list_head ilist; /* entry in inotify registration list */ + struct list_head watches; /* associated watches */ + struct inotify_watch wdata; /* inotify watch data */ + unsigned flags; /* status flags */ +}; + +/* Inotify handle. */ +struct inotify_handle *audit_ih; + +/* + * audit_parent status flags: + * + * AUDIT_PARENT_INVALID - set anytime rules/watches are auto-removed due to + * a filesystem event to ensure we're adding audit watches to a valid parent. + * Technically not needed for IN_DELETE_SELF or IN_UNMOUNT events, as we cannot + * receive them while we have nameidata, but must be used for IN_MOVE_SELF which + * we can receive while holding nameidata. + */ +#define AUDIT_PARENT_INVALID 0x001 + +/* Inotify events we care about. */ +#define AUDIT_IN_WATCH IN_MOVE|IN_CREATE|IN_DELETE|IN_DELETE_SELF|IN_MOVE_SELF + +static void audit_free_parent(struct inotify_watch *i_watch) +{ + struct audit_parent *parent; + + parent = container_of(i_watch, struct audit_parent, wdata); + WARN_ON(!list_empty(&parent->watches)); + kfree(parent); +} + +void audit_get_watch(struct audit_watch *watch) +{ + atomic_inc(&watch->count); +} + +void audit_put_watch(struct audit_watch *watch) +{ + if (atomic_dec_and_test(&watch->count)) { + WARN_ON(watch->parent); + WARN_ON(!list_empty(&watch->rules)); + kfree(watch->path); + kfree(watch); + } +} + +void audit_remove_watch(struct audit_watch *watch) +{ + list_del(&watch->wlist); + put_inotify_watch(&watch->parent->wdata); + watch->parent = NULL; + audit_put_watch(watch); /* match initial get */ +} + +char *audit_watch_path(struct audit_watch *watch) +{ + return watch->path; +} + +struct list_head *audit_watch_rules(struct audit_watch *watch) +{ + return &watch->rules; +} + +unsigned long audit_watch_inode(struct audit_watch *watch) +{ + return watch->ino; +} + +dev_t audit_watch_dev(struct audit_watch *watch) +{ + return watch->dev; +} + +/* Initialize a parent watch entry. */ +static struct audit_parent *audit_init_parent(struct nameidata *ndp) +{ + struct audit_parent *parent; + s32 wd; + + parent = kzalloc(sizeof(*parent), GFP_KERNEL); + if (unlikely(!parent)) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&parent->watches); + parent->flags = 0; + + inotify_init_watch(&parent->wdata); + /* grab a ref so inotify watch hangs around until we take audit_filter_mutex */ + get_inotify_watch(&parent->wdata); + wd = inotify_add_watch(audit_ih, &parent->wdata, + ndp->path.dentry->d_inode, AUDIT_IN_WATCH); + if (wd < 0) { + audit_free_parent(&parent->wdata); + return ERR_PTR(wd); + } + + return parent; +} + +/* Initialize a watch entry. */ +static struct audit_watch *audit_init_watch(char *path) +{ + struct audit_watch *watch; + + watch = kzalloc(sizeof(*watch), GFP_KERNEL); + if (unlikely(!watch)) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&watch->rules); + atomic_set(&watch->count, 1); + watch->path = path; + watch->dev = (dev_t)-1; + watch->ino = (unsigned long)-1; + + return watch; +} + +/* Translate a watch string to kernel respresentation. */ +int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op) +{ + struct audit_watch *watch; + + if (!audit_ih) + return -EOPNOTSUPP; + + if (path[0] != '/' || path[len-1] == '/' || + krule->listnr != AUDIT_FILTER_EXIT || + op != Audit_equal || + krule->inode_f || krule->watch || krule->tree) + return -EINVAL; + + watch = audit_init_watch(path); + if (IS_ERR(watch)) + return PTR_ERR(watch); + + audit_get_watch(watch); + krule->watch = watch; + + return 0; +} + +/* Duplicate the given audit watch. The new watch's rules list is initialized + * to an empty list and wlist is undefined. */ +static struct audit_watch *audit_dupe_watch(struct audit_watch *old) +{ + char *path; + struct audit_watch *new; + + path = kstrdup(old->path, GFP_KERNEL); + if (unlikely(!path)) + return ERR_PTR(-ENOMEM); + + new = audit_init_watch(path); + if (IS_ERR(new)) { + kfree(path); + goto out; + } + + new->dev = old->dev; + new->ino = old->ino; + get_inotify_watch(&old->parent->wdata); + new->parent = old->parent; + +out: + return new; +} + +static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watch *w, char *op) +{ + if (audit_enabled) { + struct audit_buffer *ab; + ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE); + audit_log_format(ab, "auid=%u ses=%u op=", + audit_get_loginuid(current), + audit_get_sessionid(current)); + audit_log_string(ab, op); + audit_log_format(ab, " path="); + audit_log_untrustedstring(ab, w->path); + audit_log_key(ab, r->filterkey); + audit_log_format(ab, " list=%d res=1", r->listnr); + audit_log_end(ab); + } +} + +/* Update inode info in audit rules based on filesystem event. */ +static void audit_update_watch(struct audit_parent *parent, + const char *dname, dev_t dev, + unsigned long ino, unsigned invalidating) +{ + struct audit_watch *owatch, *nwatch, *nextw; + struct audit_krule *r, *nextr; + struct audit_entry *oentry, *nentry; + + mutex_lock(&audit_filter_mutex); + list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) { + if (audit_compare_dname_path(dname, owatch->path, NULL)) + continue; + + /* If the update involves invalidating rules, do the inode-based + * filtering now, so we don't omit records. */ + if (invalidating && current->audit_context) + audit_filter_inodes(current, current->audit_context); + + nwatch = audit_dupe_watch(owatch); + if (IS_ERR(nwatch)) { + mutex_unlock(&audit_filter_mutex); + audit_panic("error updating watch, skipping"); + return; + } + nwatch->dev = dev; + nwatch->ino = ino; + + list_for_each_entry_safe(r, nextr, &owatch->rules, rlist) { + + oentry = container_of(r, struct audit_entry, rule); + list_del(&oentry->rule.rlist); + list_del_rcu(&oentry->list); + + nentry = audit_dupe_rule(&oentry->rule, nwatch); + if (IS_ERR(nentry)) { + list_del(&oentry->rule.list); + audit_panic("error updating watch, removing"); + } else { + int h = audit_hash_ino((u32)ino); + list_add(&nentry->rule.rlist, &nwatch->rules); + list_add_rcu(&nentry->list, &audit_inode_hash[h]); + list_replace(&oentry->rule.list, + &nentry->rule.list); + } + + audit_watch_log_rule_change(r, owatch, "updated rules"); + + call_rcu(&oentry->rcu, audit_free_rule_rcu); + } + + audit_remove_watch(owatch); + goto add_watch_to_parent; /* event applies to a single watch */ + } + mutex_unlock(&audit_filter_mutex); + return; + +add_watch_to_parent: + list_add(&nwatch->wlist, &parent->watches); + mutex_unlock(&audit_filter_mutex); + return; +} + +/* Remove all watches & rules associated with a parent that is going away. */ +static void audit_remove_parent_watches(struct audit_parent *parent) +{ + struct audit_watch *w, *nextw; + struct audit_krule *r, *nextr; + struct audit_entry *e; + + mutex_lock(&audit_filter_mutex); + parent->flags |= AUDIT_PARENT_INVALID; + list_for_each_entry_safe(w, nextw, &parent->watches, wlist) { + list_for_each_entry_safe(r, nextr, &w->rules, rlist) { + e = container_of(r, struct audit_entry, rule); + audit_watch_log_rule_change(r, w, "remove rule"); + list_del(&r->rlist); + list_del(&r->list); + list_del_rcu(&e->list); + call_rcu(&e->rcu, audit_free_rule_rcu); + } + audit_remove_watch(w); + } + mutex_unlock(&audit_filter_mutex); +} + +/* Unregister inotify watches for parents on in_list. + * Generates an IN_IGNORED event. */ +void audit_inotify_unregister(struct list_head *in_list) +{ + struct audit_parent *p, *n; + + list_for_each_entry_safe(p, n, in_list, ilist) { + list_del(&p->ilist); + inotify_rm_watch(audit_ih, &p->wdata); + /* the unpin matching the pin in audit_do_del_rule() */ + unpin_inotify_watch(&p->wdata); + } +} + +/* Get path information necessary for adding watches. */ +static int audit_get_nd(char *path, struct nameidata **ndp, struct nameidata **ndw) +{ + struct nameidata *ndparent, *ndwatch; + int err; + + ndparent = kmalloc(sizeof(*ndparent), GFP_KERNEL); + if (unlikely(!ndparent)) + return -ENOMEM; + + ndwatch = kmalloc(sizeof(*ndwatch), GFP_KERNEL); + if (unlikely(!ndwatch)) { + kfree(ndparent); + return -ENOMEM; + } + + err = path_lookup(path, LOOKUP_PARENT, ndparent); + if (err) { + kfree(ndparent); + kfree(ndwatch); + return err; + } + + err = path_lookup(path, 0, ndwatch); + if (err) { + kfree(ndwatch); + ndwatch = NULL; + } + + *ndp = ndparent; + *ndw = ndwatch; + + return 0; +} + +/* Release resources used for watch path information. */ +static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw) +{ + if (ndp) { + path_put(&ndp->path); + kfree(ndp); + } + if (ndw) { + path_put(&ndw->path); + kfree(ndw); + } +} + +/* Associate the given rule with an existing parent inotify_watch. + * Caller must hold audit_filter_mutex. */ +static void audit_add_to_parent(struct audit_krule *krule, + struct audit_parent *parent) +{ + struct audit_watch *w, *watch = krule->watch; + int watch_found = 0; + + list_for_each_entry(w, &parent->watches, wlist) { + if (strcmp(watch->path, w->path)) + continue; + + watch_found = 1; + + /* put krule's and initial refs to temporary watch */ + audit_put_watch(watch); + audit_put_watch(watch); + + audit_get_watch(w); + krule->watch = watch = w; + break; + } + + if (!watch_found) { + get_inotify_watch(&parent->wdata); + watch->parent = parent; + + list_add(&watch->wlist, &parent->watches); + } + list_add(&krule->rlist, &watch->rules); +} + +/* Find a matching watch entry, or add this one. + * Caller must hold audit_filter_mutex. */ +int audit_add_watch(struct audit_krule *krule) +{ + struct audit_watch *watch = krule->watch; + struct inotify_watch *i_watch; + struct audit_parent *parent; + struct nameidata *ndp = NULL, *ndw = NULL; + int ret = 0; + + mutex_unlock(&audit_filter_mutex); + + /* Avoid calling path_lookup under audit_filter_mutex. */ + ret = audit_get_nd(watch->path, &ndp, &ndw); + if (ret) { + /* caller expects mutex locked */ + mutex_lock(&audit_filter_mutex); + goto error; + } + + /* update watch filter fields */ + if (ndw) { + watch->dev = ndw->path.dentry->d_inode->i_sb->s_dev; + watch->ino = ndw->path.dentry->d_inode->i_ino; + } + + /* The audit_filter_mutex must not be held during inotify calls because + * we hold it during inotify event callback processing. If an existing + * inotify watch is found, inotify_find_watch() grabs a reference before + * returning. + */ + if (inotify_find_watch(audit_ih, ndp->path.dentry->d_inode, + &i_watch) < 0) { + parent = audit_init_parent(ndp); + if (IS_ERR(parent)) { + /* caller expects mutex locked */ + mutex_lock(&audit_filter_mutex); + ret = PTR_ERR(parent); + goto error; + } + } else + parent = container_of(i_watch, struct audit_parent, wdata); + + mutex_lock(&audit_filter_mutex); + + /* parent was moved before we took audit_filter_mutex */ + if (parent->flags & AUDIT_PARENT_INVALID) + ret = -ENOENT; + else + audit_add_to_parent(krule, parent); + + /* match get in audit_init_parent or inotify_find_watch */ + put_inotify_watch(&parent->wdata); + +error: + audit_put_nd(ndp, ndw); /* NULL args OK */ + return ret; + +} + +void audit_remove_watch_rule(struct audit_krule *krule, struct list_head *list) +{ + struct audit_watch *watch = krule->watch; + struct audit_parent *parent = watch->parent; + + list_del(&krule->rlist); + + if (list_empty(&watch->rules)) { + audit_remove_watch(watch); + + if (list_empty(&parent->watches)) { + /* Put parent on the inotify un-registration + * list. Grab a reference before releasing + * audit_filter_mutex, to be released in + * audit_inotify_unregister(). + * If filesystem is going away, just leave + * the sucker alone, eviction will take + * care of it. */ + if (pin_inotify_watch(&parent->wdata)) + list_add(&parent->ilist, list); + } + } +} + +/* Update watch data in audit rules based on inotify events. */ +static void audit_handle_ievent(struct inotify_watch *i_watch, u32 wd, u32 mask, + u32 cookie, const char *dname, struct inode *inode) +{ + struct audit_parent *parent; + + parent = container_of(i_watch, struct audit_parent, wdata); + + if (mask & (IN_CREATE|IN_MOVED_TO) && inode) + audit_update_watch(parent, dname, inode->i_sb->s_dev, + inode->i_ino, 0); + else if (mask & (IN_DELETE|IN_MOVED_FROM)) + audit_update_watch(parent, dname, (dev_t)-1, (unsigned long)-1, 1); + /* inotify automatically removes the watch and sends IN_IGNORED */ + else if (mask & (IN_DELETE_SELF|IN_UNMOUNT)) + audit_remove_parent_watches(parent); + /* inotify does not remove the watch, so remove it manually */ + else if(mask & IN_MOVE_SELF) { + audit_remove_parent_watches(parent); + inotify_remove_watch_locked(audit_ih, i_watch); + } else if (mask & IN_IGNORED) + put_inotify_watch(i_watch); +} + +static const struct inotify_operations audit_inotify_ops = { + .handle_event = audit_handle_ievent, + .destroy_watch = audit_free_parent, +}; + +static int __init audit_watch_init(void) +{ + audit_ih = inotify_init(&audit_inotify_ops); + if (IS_ERR(audit_ih)) + audit_panic("cannot initialize inotify handle"); + return 0; +} +subsys_initcall(audit_watch_init); diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 713098e..a706040 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -27,7 +27,6 @@ #include <linux/namei.h> #include <linux/netlink.h> #include <linux/sched.h> -#include <linux/inotify.h> #include <linux/security.h> #include "audit.h" @@ -44,36 +43,6 @@ * be written directly provided audit_filter_mutex is held. */ -/* - * Reference counting: - * - * audit_parent: lifetime is from audit_init_parent() to receipt of an IN_IGNORED - * event. Each audit_watch holds a reference to its associated parent. - * - * audit_watch: if added to lists, lifetime is from audit_init_watch() to - * audit_remove_watch(). Additionally, an audit_watch may exist - * temporarily to assist in searching existing filter data. Each - * audit_krule holds a reference to its associated watch. - */ - -struct audit_parent { - struct list_head ilist; /* entry in inotify registration list */ - struct list_head watches; /* associated watches */ - struct inotify_watch wdata; /* inotify watch data */ - unsigned flags; /* status flags */ -}; - -/* - * audit_parent status flags: - * - * AUDIT_PARENT_INVALID - set anytime rules/watches are auto-removed due to - * a filesystem event to ensure we're adding audit watches to a valid parent. - * Technically not needed for IN_DELETE_SELF or IN_UNMOUNT events, as we cannot - * receive them while we have nameidata, but must be used for IN_MOVE_SELF which - * we can receive while holding nameidata. - */ -#define AUDIT_PARENT_INVALID 0x001 - /* Audit filter lists, defined in <linux/audit.h> */ struct list_head audit_filter_list[AUDIT_NR_FILTERS] = { LIST_HEAD_INIT(audit_filter_list[0]), @@ -97,41 +66,6 @@ static struct list_head audit_rules_list[AUDIT_NR_FILTERS] = { DEFINE_MUTEX(audit_filter_mutex); -/* Inotify events we care about. */ -#define AUDIT_IN_WATCH IN_MOVE|IN_CREATE|IN_DELETE|IN_DELETE_SELF|IN_MOVE_SELF - -void audit_free_parent(struct inotify_watch *i_watch) -{ - struct audit_parent *parent; - - parent = container_of(i_watch, struct audit_parent, wdata); - WARN_ON(!list_empty(&parent->watches)); - kfree(parent); -} - -static inline void audit_get_watch(struct audit_watch *watch) -{ - atomic_inc(&watch->count); -} - -static void audit_put_watch(struct audit_watch *watch) -{ - if (atomic_dec_and_test(&watch->count)) { - WARN_ON(watch->parent); - WARN_ON(!list_empty(&watch->rules)); - kfree(watch->path); - kfree(watch); - } -} - -static void audit_remove_watch(struct audit_watch *watch) -{ - list_del(&watch->wlist); - put_inotify_watch(&watch->parent->wdata); - watch->parent = NULL; - audit_put_watch(watch); /* match initial get */ -} - static inline void audit_free_rule(struct audit_entry *e) { int i; @@ -156,50 +90,6 @@ void audit_free_rule_rcu(struct rcu_head *head) audit_free_rule(e); } -/* Initialize a parent watch entry. */ -static struct audit_parent *audit_init_parent(struct nameidata *ndp) -{ - struct audit_parent *parent; - s32 wd; - - parent = kzalloc(sizeof(*parent), GFP_KERNEL); - if (unlikely(!parent)) - return ERR_PTR(-ENOMEM); - - INIT_LIST_HEAD(&parent->watches); - parent->flags = 0; - - inotify_init_watch(&parent->wdata); - /* grab a ref so inotify watch hangs around until we take audit_filter_mutex */ - get_inotify_watch(&parent->wdata); - wd = inotify_add_watch(audit_ih, &parent->wdata, - ndp->path.dentry->d_inode, AUDIT_IN_WATCH); - if (wd < 0) { - audit_free_parent(&parent->wdata); - return ERR_PTR(wd); - } - - return parent; -} - -/* Initialize a watch entry. */ -static struct audit_watch *audit_init_watch(char *path) -{ - struct audit_watch *watch; - - watch = kzalloc(sizeof(*watch), GFP_KERNEL); - if (unlikely(!watch)) - return ERR_PTR(-ENOMEM); - - INIT_LIST_HEAD(&watch->rules); - atomic_set(&watch->count, 1); - watch->path = path; - watch->dev = (dev_t)-1; - watch->ino = (unsigned long)-1; - - return watch; -} - /* Initialize an audit filterlist entry. */ static inline struct audit_entry *audit_init_entry(u32 field_count) { @@ -260,31 +150,6 @@ static inline int audit_to_inode(struct audit_krule *krule, return 0; } -/* Translate a watch string to kernel respresentation. */ -static int audit_to_watch(struct audit_krule *krule, char *path, int len, - u32 op) -{ - struct audit_watch *watch; - - if (!audit_ih) - return -EOPNOTSUPP; - - if (path[0] != '/' || path[len-1] == '/' || - krule->listnr != AUDIT_FILTER_EXIT || - op != Audit_equal || - krule->inode_f || krule->watch || krule->tree) - return -EINVAL; - - watch = audit_init_watch(path); - if (IS_ERR(watch)) - return PTR_ERR(watch); - - audit_get_watch(watch); - krule->watch = watch; - - return 0; -} - static __u32 *classes[AUDIT_SYSCALL_CLASSES]; int __init audit_register_class(int class, unsigned *list) @@ -766,7 +631,8 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule) break; case AUDIT_WATCH: data->buflen += data->values[i] = - audit_pack_string(&bufp, krule->watch->path); + audit_pack_string(&bufp, + audit_watch_path(krule->watch)); break; case AUDIT_DIR: data->buflen += data->values[i] = @@ -818,7 +684,8 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b) return 1; break; case AUDIT_WATCH: - if (strcmp(a->watch->path, b->watch->path)) + if (strcmp(audit_watch_path(a->watch), + audit_watch_path(b->watch))) return 1; break; case AUDIT_DIR: @@ -844,32 +711,6 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b) return 0; } -/* Duplicate the given audit watch. The new watch's rules list is initialized - * to an empty list and wlist is undefined. */ -static struct audit_watch *audit_dupe_watch(struct audit_watch *old) -{ - char *path; - struct audit_watch *new; - - path = kstrdup(old->path, GFP_KERNEL); - if (unlikely(!path)) - return ERR_PTR(-ENOMEM); - - new = audit_init_watch(path); - if (IS_ERR(new)) { - kfree(path); - goto out; - } - - new->dev = old->dev; - new->ino = old->ino; - get_inotify_watch(&old->parent->wdata); - new->parent = old->parent; - -out: - return new; -} - /* Duplicate LSM field information. The lsm_rule is opaque, so must be * re-initialized. */ static inline int audit_dupe_lsm_field(struct audit_field *df, @@ -904,8 +745,8 @@ static inline int audit_dupe_lsm_field(struct audit_field *df, * rule with the new rule in the filterlist, then free the old rule. * The rlist element is undefined; list manipulations are handled apart from * the initial copy. */ -static struct audit_entry *audit_dupe_rule(struct audit_krule *old, - struct audit_watch *watch) +struct audit_entry *audit_dupe_rule(struct audit_krule *old, + struct audit_watch *watch) { u32 fcount = old->field_count; struct audit_entry *entry; @@ -977,137 +818,6 @@ static struct audit_entry *audit_dupe_rule(struct audit_krule *old, return entry; } -/* Update inode info in audit rules based on filesystem event. */ -static void audit_update_watch(struct audit_parent *parent, - const char *dname, dev_t dev, - unsigned long ino, unsigned invalidating) -{ - struct audit_watch *owatch, *nwatch, *nextw; - struct audit_krule *r, *nextr; - struct audit_entry *oentry, *nentry; - - mutex_lock(&audit_filter_mutex); - list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) { - if (audit_compare_dname_path(dname, owatch->path, NULL)) - continue; - - /* If the update involves invalidating rules, do the inode-based - * filtering now, so we don't omit records. */ - if (invalidating && current->audit_context) - audit_filter_inodes(current, current->audit_context); - - nwatch = audit_dupe_watch(owatch); - if (IS_ERR(nwatch)) { - mutex_unlock(&audit_filter_mutex); - audit_panic("error updating watch, skipping"); - return; - } - nwatch->dev = dev; - nwatch->ino = ino; - - list_for_each_entry_safe(r, nextr, &owatch->rules, rlist) { - - oentry = container_of(r, struct audit_entry, rule); - list_del(&oentry->rule.rlist); - list_del_rcu(&oentry->list); - - nentry = audit_dupe_rule(&oentry->rule, nwatch); - if (IS_ERR(nentry)) { - list_del(&oentry->rule.list); - audit_panic("error updating watch, removing"); - } else { - int h = audit_hash_ino((u32)ino); - list_add(&nentry->rule.rlist, &nwatch->rules); - list_add_rcu(&nentry->list, &audit_inode_hash[h]); - list_replace(&oentry->rule.list, - &nentry->rule.list); - } - - call_rcu(&oentry->rcu, audit_free_rule_rcu); - } - - if (audit_enabled) { - struct audit_buffer *ab; - ab = audit_log_start(NULL, GFP_NOFS, - AUDIT_CONFIG_CHANGE); - audit_log_format(ab, "auid=%u ses=%u", - audit_get_loginuid(current), - audit_get_sessionid(current)); - audit_log_format(ab, - " op=updated rules specifying path="); - audit_log_untrustedstring(ab, owatch->path); - audit_log_format(ab, " with dev=%u ino=%lu\n", - dev, ino); - audit_log_format(ab, " list=%d res=1", r->listnr); - audit_log_end(ab); - } - audit_remove_watch(owatch); - goto add_watch_to_parent; /* event applies to a single watch */ - } - mutex_unlock(&audit_filter_mutex); - return; - -add_watch_to_parent: - list_add(&nwatch->wlist, &parent->watches); - mutex_unlock(&audit_filter_mutex); - return; -} - -/* Remove all watches & rules associated with a parent that is going away. */ -static void audit_remove_parent_watches(struct audit_parent *parent) -{ - struct audit_watch *w, *nextw; - struct audit_krule *r, *nextr; - struct audit_entry *e; - - mutex_lock(&audit_filter_mutex); - parent->flags |= AUDIT_PARENT_INVALID; - list_for_each_entry_safe(w, nextw, &parent->watches, wlist) { - list_for_each_entry_safe(r, nextr, &w->rules, rlist) { - e = container_of(r, struct audit_entry, rule); - if (audit_enabled) { - struct audit_buffer *ab; - ab = audit_log_start(NULL, GFP_NOFS, - AUDIT_CONFIG_CHANGE); - audit_log_format(ab, "auid=%u ses=%u", - audit_get_loginuid(current), - audit_get_sessionid(current)); - audit_log_format(ab, " op=remove rule path="); - audit_log_untrustedstring(ab, w->path); - if (r->filterkey) { - audit_log_format(ab, " key="); - audit_log_untrustedstring(ab, - r->filterkey); - } else - audit_log_format(ab, " key=(null)"); - audit_log_format(ab, " list=%d res=1", - r->listnr); - audit_log_end(ab); - } - list_del(&r->rlist); - list_del(&r->list); - list_del_rcu(&e->list); - call_rcu(&e->rcu, audit_free_rule_rcu); - } - audit_remove_watch(w); - } - mutex_unlock(&audit_filter_mutex); -} - -/* Unregister inotify watches for parents on in_list. - * Generates an IN_IGNORED event. */ -static void audit_inotify_unregister(struct list_head *in_list) -{ - struct audit_parent *p, *n; - - list_for_each_entry_safe(p, n, in_list, ilist) { - list_del(&p->ilist); - inotify_rm_watch(audit_ih, &p->wdata); - /* the unpin matching the pin in audit_do_del_rule() */ - unpin_inotify_watch(&p->wdata); - } -} - /* Find an existing audit rule. * Caller must hold audit_filter_mutex to prevent stale rule data. */ static struct audit_entry *audit_find_rule(struct audit_entry *entry, @@ -1145,134 +855,6 @@ out: return found; } -/* Get path information necessary for adding watches. */ -static int audit_get_nd(char *path, struct nameidata **ndp, - struct nameidata **ndw) -{ - struct nameidata *ndparent, *ndwatch; - int err; - - ndparent = kmalloc(sizeof(*ndparent), GFP_KERNEL); - if (unlikely(!ndparent)) - return -ENOMEM; - - ndwatch = kmalloc(sizeof(*ndwatch), GFP_KERNEL); - if (unlikely(!ndwatch)) { - kfree(ndparent); - return -ENOMEM; - } - - err = path_lookup(path, LOOKUP_PARENT, ndparent); - if (err) { - kfree(ndparent); - kfree(ndwatch); - return err; - } - - err = path_lookup(path, 0, ndwatch); - if (err) { - kfree(ndwatch); - ndwatch = NULL; - } - - *ndp = ndparent; - *ndw = ndwatch; - - return 0; -} - -/* Release resources used for watch path information. */ -static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw) -{ - if (ndp) { - path_put(&ndp->path); - kfree(ndp); - } - if (ndw) { - path_put(&ndw->path); - kfree(ndw); - } -} - -/* Associate the given rule with an existing parent inotify_watch. - * Caller must hold audit_filter_mutex. */ -static void audit_add_to_parent(struct audit_krule *krule, - struct audit_parent *parent) -{ - struct audit_watch *w, *watch = krule->watch; - int watch_found = 0; - - list_for_each_entry(w, &parent->watches, wlist) { - if (strcmp(watch->path, w->path)) - continue; - - watch_found = 1; - - /* put krule's and initial refs to temporary watch */ - audit_put_watch(watch); - audit_put_watch(watch); - - audit_get_watch(w); - krule->watch = watch = w; - break; - } - - if (!watch_found) { - get_inotify_watch(&parent->wdata); - watch->parent = parent; - - list_add(&watch->wlist, &parent->watches); - } - list_add(&krule->rlist, &watch->rules); -} - -/* Find a matching watch entry, or add this one. - * Caller must hold audit_filter_mutex. */ -static int audit_add_watch(struct audit_krule *krule, struct nameidata *ndp, - struct nameidata *ndw) -{ - struct audit_watch *watch = krule->watch; - struct inotify_watch *i_watch; - struct audit_parent *parent; - int ret = 0; - - /* update watch filter fields */ - if (ndw) { - watch->dev = ndw->path.dentry->d_inode->i_sb->s_dev; - watch->ino = ndw->path.dentry->d_inode->i_ino; - } - - /* The audit_filter_mutex must not be held during inotify calls because - * we hold it during inotify event callback processing. If an existing - * inotify watch is found, inotify_find_watch() grabs a reference before - * returning. - */ - mutex_unlock(&audit_filter_mutex); - - if (inotify_find_watch(audit_ih, ndp->path.dentry->d_inode, - &i_watch) < 0) { - parent = audit_init_parent(ndp); - if (IS_ERR(parent)) { - /* caller expects mutex locked */ - mutex_lock(&audit_filter_mutex); - return PTR_ERR(parent); - } - } else - parent = container_of(i_watch, struct audit_parent, wdata); - - mutex_lock(&audit_filter_mutex); - - /* parent was moved before we took audit_filter_mutex */ - if (parent->flags & AUDIT_PARENT_INVALID) - ret = -ENOENT; - else - audit_add_to_parent(krule, parent); - - /* match get in audit_init_parent or inotify_find_watch */ - put_inotify_watch(&parent->wdata); - return ret; -} - static u64 prio_low = ~0ULL/2; static u64 prio_high = ~0ULL/2 - 1; @@ -1282,7 +864,6 @@ static inline int audit_add_rule(struct audit_entry *entry) struct audit_entry *e; struct audit_watch *watch = entry->rule.watch; struct audit_tree *tree = entry->rule.tree; - struct nameidata *ndp = NULL, *ndw = NULL; struct list_head *list; int h, err; #ifdef CONFIG_AUDITSYSCALL @@ -1296,8 +877,8 @@ static inline int audit_add_rule(struct audit_entry *entry) mutex_lock(&audit_filter_mutex); e = audit_find_rule(entry, &list); - mutex_unlock(&audit_filter_mutex); if (e) { + mutex_unlock(&audit_filter_mutex); err = -EEXIST; /* normally audit_add_tree_rule() will free it on failure */ if (tree) @@ -1305,22 +886,16 @@ static inline int audit_add_rule(struct audit_entry *entry) goto error; } - /* Avoid calling path_lookup under audit_filter_mutex. */ - if (watch) { - err = audit_get_nd(watch->path, &ndp, &ndw); - if (err) - goto error; - } - - mutex_lock(&audit_filter_mutex); if (watch) { /* audit_filter_mutex is dropped and re-taken during this call */ - err = audit_add_watch(&entry->rule, ndp, ndw); + err = audit_add_watch(&entry->rule); if (err) { mutex_unlock(&audit_filter_mutex); goto error; } - h = audit_hash_ino((u32)watch->ino); + /* entry->rule.watch may have changed during audit_add_watch() */ + watch = entry->rule.watch; + h = audit_hash_ino((u32)audit_watch_inode(watch)); list = &audit_inode_hash[h]; } if (tree) { @@ -1358,11 +933,9 @@ static inline int audit_add_rule(struct audit_entry *entry) #endif mutex_unlock(&audit_filter_mutex); - audit_put_nd(ndp, ndw); /* NULL args OK */ return 0; error: - audit_put_nd(ndp, ndw); /* NULL args OK */ if (watch) audit_put_watch(watch); /* tmp watch, matches initial get */ return err; @@ -1372,7 +945,7 @@ error: static inline int audit_del_rule(struct audit_entry *entry) { struct audit_entry *e; - struct audit_watch *watch, *tmp_watch = entry->rule.watch; + struct audit_watch *watch = entry->rule.watch; struct audit_tree *tree = entry->rule.tree; struct list_head *list; LIST_HEAD(inotify_list); @@ -1394,29 +967,8 @@ static inline int audit_del_rule(struct audit_entry *entry) goto out; } - watch = e->rule.watch; - if (watch) { - struct audit_parent *parent = watch->parent; - - list_del(&e->rule.rlist); - - if (list_empty(&watch->rules)) { - audit_remove_watch(watch); - - if (list_empty(&parent->watches)) { - /* Put parent on the inotify un-registration - * list. Grab a reference before releasing - * audit_filter_mutex, to be released in - * audit_inotify_unregister(). - * If filesystem is going away, just leave - * the sucker alone, eviction will take - * care of it. - */ - if (pin_inotify_watch(&parent->wdata)) - list_add(&parent->ilist, &inotify_list); - } - } - } + if (e->rule.watch) + audit_remove_watch_rule(&e->rule, &inotify_list); if (e->rule.tree) audit_remove_tree_rule(&e->rule); @@ -1438,8 +990,8 @@ static inline int audit_del_rule(struct audit_entry *entry) audit_inotify_unregister(&inotify_list); out: - if (tmp_watch) - audit_put_watch(tmp_watch); /* match initial get */ + if (watch) + audit_put_watch(watch); /* match initial get */ if (tree) audit_put_tree(tree); /* that's the temporary one */ @@ -1527,11 +1079,9 @@ static void audit_log_rule_change(uid_t loginuid, u32 sessionid, u32 sid, security_release_secctx(ctx, len); } } - audit_log_format(ab, " op=%s rule key=", action); - if (rule->filterkey) - audit_log_untrustedstring(ab, rule->filterkey); - else - audit_log_format(ab, "(null)"); + audit_log_format(ab, " op="); + audit_log_string(ab, action); + audit_log_key(ab, rule->filterkey); audit_log_format(ab, " list=%d res=%d", rule->listnr, res); audit_log_end(ab); } @@ -1595,7 +1145,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data, return PTR_ERR(entry); err = audit_add_rule(entry); - audit_log_rule_change(loginuid, sessionid, sid, "add", + audit_log_rule_change(loginuid, sessionid, sid, "add rule", &entry->rule, !err); if (err) @@ -1611,7 +1161,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data, return PTR_ERR(entry); err = audit_del_rule(entry); - audit_log_rule_change(loginuid, sessionid, sid, "remove", + audit_log_rule_change(loginuid, sessionid, sid, "remove rule", &entry->rule, !err); audit_free_rule(entry); @@ -1793,7 +1343,7 @@ static int update_lsm_rule(struct audit_krule *r) list_del(&r->list); } else { if (watch) { - list_add(&nentry->rule.rlist, &watch->rules); + list_add(&nentry->rule.rlist, audit_watch_rules(watch)); list_del(&r->rlist); } else if (tree) list_replace_init(&r->rlist, &nentry->rule.rlist); @@ -1829,27 +1379,3 @@ int audit_update_lsm_rules(void) return err; } - -/* Update watch data in audit rules based on inotify events. */ -void audit_handle_ievent(struct inotify_watch *i_watch, u32 wd, u32 mask, - u32 cookie, const char *dname, struct inode *inode) -{ - struct audit_parent *parent; - - parent = container_of(i_watch, struct audit_parent, wdata); - - if (mask & (IN_CREATE|IN_MOVED_TO) && inode) - audit_update_watch(parent, dname, inode->i_sb->s_dev, - inode->i_ino, 0); - else if (mask & (IN_DELETE|IN_MOVED_FROM)) - audit_update_watch(parent, dname, (dev_t)-1, (unsigned long)-1, 1); - /* inotify automatically removes the watch and sends IN_IGNORED */ - else if (mask & (IN_DELETE_SELF|IN_UNMOUNT)) - audit_remove_parent_watches(parent); - /* inotify does not remove the watch, so remove it manually */ - else if(mask & IN_MOVE_SELF) { - audit_remove_parent_watches(parent); - inotify_remove_watch_locked(audit_ih, i_watch); - } else if (mask & IN_IGNORED) - put_inotify_watch(i_watch); -} diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 7d6ac7c..68d3c6a 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -199,6 +199,7 @@ struct audit_context { struct audit_tree_refs *trees, *first_trees; int tree_count; + struct list_head killed_trees; int type; union { @@ -548,9 +549,9 @@ static int audit_filter_rules(struct task_struct *tsk, } break; case AUDIT_WATCH: - if (name && rule->watch->ino != (unsigned long)-1) - result = (name->dev == rule->watch->dev && - name->ino == rule->watch->ino); + if (name && audit_watch_inode(rule->watch) != (unsigned long)-1) + result = (name->dev == audit_watch_dev(rule->watch) && + name->ino == audit_watch_inode(rule->watch)); break; case AUDIT_DIR: if (ctx) @@ -853,6 +854,7 @@ static inline struct audit_context *audit_alloc_context(enum audit_state state) if (!(context = kmalloc(sizeof(*context), GFP_KERNEL))) return NULL; audit_zero_context(context, state); + INIT_LIST_HEAD(&context->killed_trees); return context; } @@ -1024,8 +1026,8 @@ static int audit_log_single_execve_arg(struct audit_context *context, { char arg_num_len_buf[12]; const char __user *tmp_p = p; - /* how many digits are in arg_num? 3 is the length of " a=" */ - size_t arg_num_len = snprintf(arg_num_len_buf, 12, "%d", arg_num) + 3; + /* how many digits are in arg_num? 5 is the length of ' a=""' */ + size_t arg_num_len = snprintf(arg_num_len_buf, 12, "%d", arg_num) + 5; size_t len, len_left, to_send; size_t max_execve_audit_len = MAX_EXECVE_AUDIT_LEN; unsigned int i, has_cntl = 0, too_long = 0; @@ -1137,7 +1139,7 @@ static int audit_log_single_execve_arg(struct audit_context *context, if (has_cntl) audit_log_n_hex(*ab, buf, to_send); else - audit_log_format(*ab, "\"%s\"", buf); + audit_log_string(*ab, buf); p += to_send; len_left -= to_send; @@ -1372,11 +1374,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts audit_log_task_info(ab, tsk); - if (context->filterkey) { - audit_log_format(ab, " key="); - audit_log_untrustedstring(ab, context->filterkey); - } else - audit_log_format(ab, " key=(null)"); + audit_log_key(ab, context->filterkey); audit_log_end(ab); for (aux = context->aux; aux; aux = aux->next) { @@ -1549,6 +1547,8 @@ void audit_free(struct task_struct *tsk) /* that can happen only if we are called from do_exit() */ if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT) audit_log_exit(context, tsk); + if (!list_empty(&context->killed_trees)) + audit_kill_trees(&context->killed_trees); audit_free_context(context); } @@ -1692,6 +1692,9 @@ void audit_syscall_exit(int valid, long return_code) context->in_syscall = 0; context->prio = context->state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0; + if (!list_empty(&context->killed_trees)) + audit_kill_trees(&context->killed_trees); + if (context->previous) { struct audit_context *new_context = context->previous; context->previous = NULL; @@ -2525,3 +2528,11 @@ void audit_core_dumps(long signr) audit_log_format(ab, " sig=%ld", signr); audit_log_end(ab); } + +struct list_head *audit_killed_trees(void) +{ + struct audit_context *ctx = current->audit_context; + if (likely(!ctx || !ctx->in_syscall)) + return NULL; + return &ctx->killed_trees; +} diff --git a/kernel/cpu.c b/kernel/cpu.c index 395b697..8ce1004 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -34,14 +34,11 @@ static struct { * an ongoing cpu hotplug operation. */ int refcount; -} cpu_hotplug; - -void __init cpu_hotplug_init(void) -{ - cpu_hotplug.active_writer = NULL; - mutex_init(&cpu_hotplug.lock); - cpu_hotplug.refcount = 0; -} +} cpu_hotplug = { + .active_writer = NULL, + .lock = __MUTEX_INITIALIZER(cpu_hotplug.lock), + .refcount = 0, +}; #ifdef CONFIG_HOTPLUG_CPU diff --git a/kernel/exit.c b/kernel/exit.c index 13ae640..628d41f 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1197,8 +1197,11 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) } traced = ptrace_reparented(p); - - if (likely(!traced)) { + /* + * It can be ptraced but not reparented, check + * !task_detached() to filter out sub-threads. + */ + if (likely(!traced) && likely(!task_detached(p))) { struct signal_struct *psig; struct signal_struct *sig; diff --git a/kernel/futex.c b/kernel/futex.c index 80b5ce7..794c862 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -284,6 +284,25 @@ void put_futex_key(int fshared, union futex_key *key) drop_futex_key_refs(key); } +/* + * fault_in_user_writeable - fault in user address and verify RW access + * @uaddr: pointer to faulting user space address + * + * Slow path to fixup the fault we just took in the atomic write + * access to @uaddr. + * + * We have no generic implementation of a non destructive write to the + * user address. We know that we faulted in the atomic pagefault + * disabled section so we can as well avoid the #PF overhead by + * calling get_user_pages() right away. + */ +static int fault_in_user_writeable(u32 __user *uaddr) +{ + int ret = get_user_pages(current, current->mm, (unsigned long)uaddr, + 1, 1, 0, NULL, NULL); + return ret < 0 ? ret : 0; +} + /** * futex_top_waiter() - Return the highest priority waiter on a futex * @hb: the hash bucket the futex_q's reside in @@ -896,7 +915,6 @@ retry: retry_private: op_ret = futex_atomic_op_inuser(op, uaddr2); if (unlikely(op_ret < 0)) { - u32 dummy; double_unlock_hb(hb1, hb2); @@ -914,7 +932,7 @@ retry_private: goto out_put_keys; } - ret = get_user(dummy, uaddr2); + ret = fault_in_user_writeable(uaddr2); if (ret) goto out_put_keys; @@ -1204,7 +1222,7 @@ retry_private: double_unlock_hb(hb1, hb2); put_futex_key(fshared, &key2); put_futex_key(fshared, &key1); - ret = get_user(curval2, uaddr2); + ret = fault_in_user_writeable(uaddr2); if (!ret) goto retry; goto out; @@ -1482,7 +1500,7 @@ retry: handle_fault: spin_unlock(q->lock_ptr); - ret = get_user(uval, uaddr); + ret = fault_in_user_writeable(uaddr); spin_lock(q->lock_ptr); @@ -1807,7 +1825,6 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared, { struct hrtimer_sleeper timeout, *to = NULL; struct futex_hash_bucket *hb; - u32 uval; struct futex_q q; int res, ret; @@ -1909,16 +1926,9 @@ out: return ret != -EINTR ? ret : -ERESTARTNOINTR; uaddr_faulted: - /* - * We have to r/w *(int __user *)uaddr, and we have to modify it - * atomically. Therefore, if we continue to fault after get_user() - * below, we need to handle the fault ourselves, while still holding - * the mmap_sem. This can occur if the uaddr is under contention as - * we have to drop the mmap_sem in order to call get_user(). - */ queue_unlock(&q, hb); - ret = get_user(uval, uaddr); + ret = fault_in_user_writeable(uaddr); if (ret) goto out_put_key; @@ -2013,17 +2023,10 @@ out: return ret; pi_faulted: - /* - * We have to r/w *(int __user *)uaddr, and we have to modify it - * atomically. Therefore, if we continue to fault after get_user() - * below, we need to handle the fault ourselves, while still holding - * the mmap_sem. This can occur if the uaddr is under contention as - * we have to drop the mmap_sem in order to call get_user(). - */ spin_unlock(&hb->lock); put_futex_key(fshared, &key); - ret = get_user(uval, uaddr); + ret = fault_in_user_writeable(uaddr); if (!ret) goto retry; diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index aaf5c9d..50da676 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -856,7 +856,7 @@ EXPORT_SYMBOL(free_irq); * still called in hard interrupt context and has to check * whether the interrupt originates from the device. If yes it * needs to disable the interrupt on the device and return - * IRQ_THREAD_WAKE which will wake up the handler thread and run + * IRQ_WAKE_THREAD which will wake up the handler thread and run * @thread_fn. This split handler design is necessary to support * shared interrupts. * diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 29b685f..1a933a2 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -124,7 +124,7 @@ void perf_enable(void) static void get_ctx(struct perf_counter_context *ctx) { - atomic_inc(&ctx->refcount); + WARN_ON(!atomic_inc_not_zero(&ctx->refcount)); } static void free_ctx(struct rcu_head *head) @@ -175,6 +175,11 @@ perf_lock_task_context(struct task_struct *task, unsigned long *flags) spin_unlock_irqrestore(&ctx->lock, *flags); goto retry; } + + if (!atomic_inc_not_zero(&ctx->refcount)) { + spin_unlock_irqrestore(&ctx->lock, *flags); + ctx = NULL; + } } rcu_read_unlock(); return ctx; @@ -193,7 +198,6 @@ static struct perf_counter_context *perf_pin_task_context(struct task_struct *ta ctx = perf_lock_task_context(task, &flags); if (ctx) { ++ctx->pin_count; - get_ctx(ctx); spin_unlock_irqrestore(&ctx->lock, flags); } return ctx; @@ -1283,7 +1287,7 @@ static void perf_ctx_adjust_freq(struct perf_counter_context *ctx) if (!interrupts) { perf_disable(); counter->pmu->disable(counter); - atomic_set(&hwc->period_left, 0); + atomic64_set(&hwc->period_left, 0); counter->pmu->enable(counter); perf_enable(); } @@ -1459,11 +1463,6 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu) put_ctx(parent_ctx); ctx->parent_ctx = NULL; /* no longer a clone */ } - /* - * Get an extra reference before dropping the lock so that - * this context won't get freed if the task exits. - */ - get_ctx(ctx); spin_unlock_irqrestore(&ctx->lock, flags); } @@ -1553,7 +1552,7 @@ static int perf_release(struct inode *inode, struct file *file) static ssize_t perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) { - u64 values[3]; + u64 values[4]; int n; /* @@ -1620,22 +1619,6 @@ static void perf_counter_reset(struct perf_counter *counter) perf_counter_update_userpage(counter); } -static void perf_counter_for_each_sibling(struct perf_counter *counter, - void (*func)(struct perf_counter *)) -{ - struct perf_counter_context *ctx = counter->ctx; - struct perf_counter *sibling; - - WARN_ON_ONCE(ctx->parent_ctx); - mutex_lock(&ctx->mutex); - counter = counter->group_leader; - - func(counter); - list_for_each_entry(sibling, &counter->sibling_list, list_entry) - func(sibling); - mutex_unlock(&ctx->mutex); -} - /* * Holding the top-level counter's child_mutex means that any * descendant process that has inherited this counter will block @@ -1658,14 +1641,18 @@ static void perf_counter_for_each_child(struct perf_counter *counter, static void perf_counter_for_each(struct perf_counter *counter, void (*func)(struct perf_counter *)) { - struct perf_counter *child; + struct perf_counter_context *ctx = counter->ctx; + struct perf_counter *sibling; - WARN_ON_ONCE(counter->ctx->parent_ctx); - mutex_lock(&counter->child_mutex); - perf_counter_for_each_sibling(counter, func); - list_for_each_entry(child, &counter->child_list, child_list) - perf_counter_for_each_sibling(child, func); - mutex_unlock(&counter->child_mutex); + WARN_ON_ONCE(ctx->parent_ctx); + mutex_lock(&ctx->mutex); + counter = counter->group_leader; + + perf_counter_for_each_child(counter, func); + func(counter); + list_for_each_entry(sibling, &counter->sibling_list, list_entry) + perf_counter_for_each_child(counter, func); + mutex_unlock(&ctx->mutex); } static int perf_counter_period(struct perf_counter *counter, u64 __user *arg) @@ -1806,6 +1793,12 @@ static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) struct perf_mmap_data *data; int ret = VM_FAULT_SIGBUS; + if (vmf->flags & FAULT_FLAG_MKWRITE) { + if (vmf->pgoff == 0) + ret = 0; + return ret; + } + rcu_read_lock(); data = rcu_dereference(counter->data); if (!data) @@ -1819,9 +1812,16 @@ static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) if ((unsigned)nr > data->nr_pages) goto unlock; + if (vmf->flags & FAULT_FLAG_WRITE) + goto unlock; + vmf->page = virt_to_page(data->data_pages[nr]); } + get_page(vmf->page); + vmf->page->mapping = vma->vm_file->f_mapping; + vmf->page->index = vmf->pgoff; + ret = 0; unlock: rcu_read_unlock(); @@ -1874,6 +1874,14 @@ fail: return -ENOMEM; } +static void perf_mmap_free_page(unsigned long addr) +{ + struct page *page = virt_to_page(addr); + + page->mapping = NULL; + __free_page(page); +} + static void __perf_mmap_data_free(struct rcu_head *rcu_head) { struct perf_mmap_data *data; @@ -1881,9 +1889,10 @@ static void __perf_mmap_data_free(struct rcu_head *rcu_head) data = container_of(rcu_head, struct perf_mmap_data, rcu_head); - free_page((unsigned long)data->user_page); + perf_mmap_free_page((unsigned long)data->user_page); for (i = 0; i < data->nr_pages; i++) - free_page((unsigned long)data->data_pages[i]); + perf_mmap_free_page((unsigned long)data->data_pages[i]); + kfree(data); } @@ -1920,9 +1929,10 @@ static void perf_mmap_close(struct vm_area_struct *vma) } static struct vm_operations_struct perf_mmap_vmops = { - .open = perf_mmap_open, - .close = perf_mmap_close, - .fault = perf_mmap_fault, + .open = perf_mmap_open, + .close = perf_mmap_close, + .fault = perf_mmap_fault, + .page_mkwrite = perf_mmap_fault, }; static int perf_mmap(struct file *file, struct vm_area_struct *vma) @@ -1936,7 +1946,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) long user_extra, extra; int ret = 0; - if (!(vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_WRITE)) + if (!(vma->vm_flags & VM_SHARED)) return -EINVAL; vma_size = vma->vm_end - vma->vm_start; @@ -1995,10 +2005,12 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) atomic_long_add(user_extra, &user->locked_vm); vma->vm_mm->locked_vm += extra; counter->data->nr_locked = extra; + if (vma->vm_flags & VM_WRITE) + counter->data->writable = 1; + unlock: mutex_unlock(&counter->mmap_mutex); - vma->vm_flags &= ~VM_MAYWRITE; vma->vm_flags |= VM_RESERVED; vma->vm_ops = &perf_mmap_vmops; @@ -2175,11 +2187,38 @@ struct perf_output_handle { unsigned long head; unsigned long offset; int nmi; - int overflow; + int sample; int locked; unsigned long flags; }; +static bool perf_output_space(struct perf_mmap_data *data, + unsigned int offset, unsigned int head) +{ + unsigned long tail; + unsigned long mask; + + if (!data->writable) + return true; + + mask = (data->nr_pages << PAGE_SHIFT) - 1; + /* + * Userspace could choose to issue a mb() before updating the tail + * pointer. So that all reads will be completed before the write is + * issued. + */ + tail = ACCESS_ONCE(data->user_page->data_tail); + smp_rmb(); + + offset = (offset - tail) & mask; + head = (head - tail) & mask; + + if ((int)(head - offset) < 0) + return false; + + return true; +} + static void perf_output_wakeup(struct perf_output_handle *handle) { atomic_set(&handle->data->poll, POLL_IN); @@ -2270,12 +2309,57 @@ out: local_irq_restore(handle->flags); } +static void perf_output_copy(struct perf_output_handle *handle, + const void *buf, unsigned int len) +{ + unsigned int pages_mask; + unsigned int offset; + unsigned int size; + void **pages; + + offset = handle->offset; + pages_mask = handle->data->nr_pages - 1; + pages = handle->data->data_pages; + + do { + unsigned int page_offset; + int nr; + + nr = (offset >> PAGE_SHIFT) & pages_mask; + page_offset = offset & (PAGE_SIZE - 1); + size = min_t(unsigned int, PAGE_SIZE - page_offset, len); + + memcpy(pages[nr] + page_offset, buf, size); + + len -= size; + buf += size; + offset += size; + } while (len); + + handle->offset = offset; + + /* + * Check we didn't copy past our reservation window, taking the + * possible unsigned int wrap into account. + */ + WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0); +} + +#define perf_output_put(handle, x) \ + perf_output_copy((handle), &(x), sizeof(x)) + static int perf_output_begin(struct perf_output_handle *handle, struct perf_counter *counter, unsigned int size, - int nmi, int overflow) + int nmi, int sample) { struct perf_mmap_data *data; unsigned int offset, head; + int have_lost; + struct { + struct perf_event_header header; + u64 id; + u64 lost; + } lost_event; /* * For inherited counters we send all the output towards the parent. @@ -2288,19 +2372,25 @@ static int perf_output_begin(struct perf_output_handle *handle, if (!data) goto out; - handle->data = data; - handle->counter = counter; - handle->nmi = nmi; - handle->overflow = overflow; + handle->data = data; + handle->counter = counter; + handle->nmi = nmi; + handle->sample = sample; if (!data->nr_pages) goto fail; + have_lost = atomic_read(&data->lost); + if (have_lost) + size += sizeof(lost_event); + perf_output_lock(handle); do { offset = head = atomic_long_read(&data->head); head += size; + if (unlikely(!perf_output_space(data, offset, head))) + goto fail; } while (atomic_long_cmpxchg(&data->head, offset, head) != offset); handle->offset = offset; @@ -2309,55 +2399,27 @@ static int perf_output_begin(struct perf_output_handle *handle, if ((offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT)) atomic_set(&data->wakeup, 1); + if (have_lost) { + lost_event.header.type = PERF_EVENT_LOST; + lost_event.header.misc = 0; + lost_event.header.size = sizeof(lost_event); + lost_event.id = counter->id; + lost_event.lost = atomic_xchg(&data->lost, 0); + + perf_output_put(handle, lost_event); + } + return 0; fail: - perf_output_wakeup(handle); + atomic_inc(&data->lost); + perf_output_unlock(handle); out: rcu_read_unlock(); return -ENOSPC; } -static void perf_output_copy(struct perf_output_handle *handle, - const void *buf, unsigned int len) -{ - unsigned int pages_mask; - unsigned int offset; - unsigned int size; - void **pages; - - offset = handle->offset; - pages_mask = handle->data->nr_pages - 1; - pages = handle->data->data_pages; - - do { - unsigned int page_offset; - int nr; - - nr = (offset >> PAGE_SHIFT) & pages_mask; - page_offset = offset & (PAGE_SIZE - 1); - size = min_t(unsigned int, PAGE_SIZE - page_offset, len); - - memcpy(pages[nr] + page_offset, buf, size); - - len -= size; - buf += size; - offset += size; - } while (len); - - handle->offset = offset; - - /* - * Check we didn't copy past our reservation window, taking the - * possible unsigned int wrap into account. - */ - WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0); -} - -#define perf_output_put(handle, x) \ - perf_output_copy((handle), &(x), sizeof(x)) - static void perf_output_end(struct perf_output_handle *handle) { struct perf_counter *counter = handle->counter; @@ -2365,7 +2427,7 @@ static void perf_output_end(struct perf_output_handle *handle) int wakeup_events = counter->attr.wakeup_events; - if (handle->overflow && wakeup_events) { + if (handle->sample && wakeup_events) { int events = atomic_inc_return(&data->events); if (events >= wakeup_events) { atomic_sub(wakeup_events, &data->events); @@ -2970,7 +3032,7 @@ static void perf_log_throttle(struct perf_counter *counter, int enable) } /* - * Generic counter overflow handling. + * Generic counter overflow handling, sampling. */ int perf_counter_overflow(struct perf_counter *counter, int nmi, @@ -3109,20 +3171,15 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) } static void perf_swcounter_overflow(struct perf_counter *counter, - int nmi, struct pt_regs *regs, u64 addr) + int nmi, struct perf_sample_data *data) { - struct perf_sample_data data = { - .regs = regs, - .addr = addr, - .period = counter->hw.last_period, - }; + data->period = counter->hw.last_period; perf_swcounter_update(counter); perf_swcounter_set_period(counter); - if (perf_counter_overflow(counter, nmi, &data)) + if (perf_counter_overflow(counter, nmi, data)) /* soft-disable the counter */ ; - } static int perf_swcounter_is_counting(struct perf_counter *counter) @@ -3187,18 +3244,18 @@ static int perf_swcounter_match(struct perf_counter *counter, } static void perf_swcounter_add(struct perf_counter *counter, u64 nr, - int nmi, struct pt_regs *regs, u64 addr) + int nmi, struct perf_sample_data *data) { int neg = atomic64_add_negative(nr, &counter->hw.count); - if (counter->hw.sample_period && !neg && regs) - perf_swcounter_overflow(counter, nmi, regs, addr); + if (counter->hw.sample_period && !neg && data->regs) + perf_swcounter_overflow(counter, nmi, data); } static void perf_swcounter_ctx_event(struct perf_counter_context *ctx, - enum perf_type_id type, u32 event, - u64 nr, int nmi, struct pt_regs *regs, - u64 addr) + enum perf_type_id type, + u32 event, u64 nr, int nmi, + struct perf_sample_data *data) { struct perf_counter *counter; @@ -3207,8 +3264,8 @@ static void perf_swcounter_ctx_event(struct perf_counter_context *ctx, rcu_read_lock(); list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { - if (perf_swcounter_match(counter, type, event, regs)) - perf_swcounter_add(counter, nr, nmi, regs, addr); + if (perf_swcounter_match(counter, type, event, data->regs)) + perf_swcounter_add(counter, nr, nmi, data); } rcu_read_unlock(); } @@ -3227,9 +3284,9 @@ static int *perf_swcounter_recursion_context(struct perf_cpu_context *cpuctx) return &cpuctx->recursion[0]; } -static void __perf_swcounter_event(enum perf_type_id type, u32 event, - u64 nr, int nmi, struct pt_regs *regs, - u64 addr) +static void do_perf_swcounter_event(enum perf_type_id type, u32 event, + u64 nr, int nmi, + struct perf_sample_data *data) { struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context); int *recursion = perf_swcounter_recursion_context(cpuctx); @@ -3242,7 +3299,7 @@ static void __perf_swcounter_event(enum perf_type_id type, u32 event, barrier(); perf_swcounter_ctx_event(&cpuctx->ctx, type, event, - nr, nmi, regs, addr); + nr, nmi, data); rcu_read_lock(); /* * doesn't really matter which of the child contexts the @@ -3250,7 +3307,7 @@ static void __perf_swcounter_event(enum perf_type_id type, u32 event, */ ctx = rcu_dereference(current->perf_counter_ctxp); if (ctx) - perf_swcounter_ctx_event(ctx, type, event, nr, nmi, regs, addr); + perf_swcounter_ctx_event(ctx, type, event, nr, nmi, data); rcu_read_unlock(); barrier(); @@ -3263,7 +3320,12 @@ out: void perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs, u64 addr) { - __perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, regs, addr); + struct perf_sample_data data = { + .regs = regs, + .addr = addr, + }; + + do_perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, &data); } static void perf_swcounter_read(struct perf_counter *counter) @@ -3404,36 +3466,18 @@ static const struct pmu perf_ops_task_clock = { .read = task_clock_perf_counter_read, }; -/* - * Software counter: cpu migrations - */ -void perf_counter_task_migration(struct task_struct *task, int cpu) -{ - struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); - struct perf_counter_context *ctx; - - perf_swcounter_ctx_event(&cpuctx->ctx, PERF_TYPE_SOFTWARE, - PERF_COUNT_SW_CPU_MIGRATIONS, - 1, 1, NULL, 0); - - ctx = perf_pin_task_context(task); - if (ctx) { - perf_swcounter_ctx_event(ctx, PERF_TYPE_SOFTWARE, - PERF_COUNT_SW_CPU_MIGRATIONS, - 1, 1, NULL, 0); - perf_unpin_context(ctx); - } -} - #ifdef CONFIG_EVENT_PROFILE void perf_tpcounter_event(int event_id) { - struct pt_regs *regs = get_irq_regs(); + struct perf_sample_data data = { + .regs = get_irq_regs(); + .addr = 0, + }; - if (!regs) - regs = task_pt_regs(current); + if (!data.regs) + data.regs = task_pt_regs(current); - __perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, regs, 0); + do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, &data); } EXPORT_SYMBOL_GPL(perf_tpcounter_event); diff --git a/kernel/sched.c b/kernel/sched.c index 247fd0f..7c9098d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1978,7 +1978,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) if (task_hot(p, old_rq->clock, NULL)) schedstat_inc(p, se.nr_forced2_migrations); #endif - perf_counter_task_migration(p, new_cpu); + perf_swcounter_event(PERF_COUNT_SW_CPU_MIGRATIONS, + 1, 1, NULL, 0); } p->se.vruntime -= old_cfsrq->min_vruntime - new_cfsrq->min_vruntime; @@ -7822,7 +7823,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) free_rootdomain(old_rd); } -static int __init_refok init_rootdomain(struct root_domain *rd, bool bootmem) +static int init_rootdomain(struct root_domain *rd, bool bootmem) { gfp_t gfp = GFP_KERNEL; diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c index 7deffc9..e6c2517 100644 --- a/kernel/sched_cpupri.c +++ b/kernel/sched_cpupri.c @@ -152,7 +152,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) * * Returns: -ENOMEM if memory fails. */ -int __init_refok cpupri_init(struct cpupri *cp, bool bootmem) +int cpupri_init(struct cpupri *cp, bool bootmem) { gfp_t gfp = GFP_KERNEL; int i; diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 467ca72..70c7e0b 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -162,7 +162,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) { s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1, spread, rq0_min_vruntime, spread0; - struct rq *rq = &per_cpu(runqueues, cpu); + struct rq *rq = cpu_rq(cpu); struct sched_entity *last; unsigned long flags; @@ -191,7 +191,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) if (last) max_vruntime = last->vruntime; min_vruntime = cfs_rq->min_vruntime; - rq0_min_vruntime = per_cpu(runqueues, 0).cfs.min_vruntime; + rq0_min_vruntime = cpu_rq(0)->cfs.min_vruntime; spin_unlock_irqrestore(&rq->lock, flags); SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime", SPLIT_NS(MIN_vruntime)); @@ -248,7 +248,7 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) static void print_cpu(struct seq_file *m, int cpu) { - struct rq *rq = &per_cpu(runqueues, cpu); + struct rq *rq = cpu_rq(cpu); #ifdef CONFIG_X86 { diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 5f9650e..ba7fd6e 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -430,12 +430,13 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) for_each_sched_entity(se) { struct load_weight *load; + struct load_weight lw; cfs_rq = cfs_rq_of(se); load = &cfs_rq->load; if (unlikely(!se->on_rq)) { - struct load_weight lw = cfs_rq->load; + lw = cfs_rq->load; update_load_add(&lw, se->load.weight); load = &lw; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 62e4ff9..98e0232 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -335,7 +335,10 @@ static struct ctl_table kern_table[] = { .data = &sysctl_timer_migration, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &zero, + .extra2 = &one, }, #endif { @@ -744,6 +747,14 @@ static struct ctl_table kern_table[] = { .proc_handler = &proc_dointvec, }, { + .ctl_name = CTL_UNNUMBERED, + .procname = "panic_on_io_nmi", + .data = &panic_on_io_nmi, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { .ctl_name = KERN_BOOTLOADER_TYPE, .procname = "bootloader_type", .data = &bootloader_type, diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 2aff39c..e0f59a2 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -222,6 +222,15 @@ void tick_nohz_stop_sched_tick(int inidle) cpu = smp_processor_id(); ts = &per_cpu(tick_cpu_sched, cpu); + + /* + * Call to tick_nohz_start_idle stops the last_update_time from being + * updated. Thus, it must not be called in the event we are called from + * irq_exit() with the prior state different than idle. + */ + if (!inidle && !ts->inidle) + goto end; + now = tick_nohz_start_idle(ts); /* @@ -239,9 +248,6 @@ void tick_nohz_stop_sched_tick(int inidle) if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) goto end; - if (!inidle && !ts->inidle) - goto end; - ts->inidle = 1; if (need_resched()) diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c index c994530..4cde8b9 100644 --- a/kernel/time/timer_stats.c +++ b/kernel/time/timer_stats.c @@ -96,7 +96,7 @@ static DEFINE_MUTEX(show_mutex); /* * Collection status, active/inactive: */ -static int __read_mostly active; +int __read_mostly timer_stats_active; /* * Beginning/end timestamps of measurement: @@ -242,7 +242,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf, struct entry *entry, input; unsigned long flags; - if (likely(!active)) + if (likely(!timer_stats_active)) return; lock = &per_cpu(lookup_lock, raw_smp_processor_id()); @@ -254,7 +254,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf, input.timer_flag = timer_flag; spin_lock_irqsave(lock, flags); - if (!active) + if (!timer_stats_active) goto out_unlock; entry = tstat_lookup(&input, comm); @@ -290,7 +290,7 @@ static int tstats_show(struct seq_file *m, void *v) /* * If still active then calculate up to now: */ - if (active) + if (timer_stats_active) time_stop = ktime_get(); time = ktime_sub(time_stop, time_start); @@ -368,18 +368,18 @@ static ssize_t tstats_write(struct file *file, const char __user *buf, mutex_lock(&show_mutex); switch (ctl[0]) { case '0': - if (active) { - active = 0; + if (timer_stats_active) { + timer_stats_active = 0; time_stop = ktime_get(); sync_access(); } break; case '1': - if (!active) { + if (!timer_stats_active) { reset_entries(); time_start = ktime_get(); smp_mb(); - active = 1; + timer_stats_active = 1; } break; default: diff --git a/kernel/timer.c b/kernel/timer.c index 54d3912..0b36b9e 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -380,6 +380,8 @@ static void timer_stats_account_timer(struct timer_list *timer) { unsigned int flag = 0; + if (likely(!timer->start_site)) + return; if (unlikely(tbase_get_deferrable(timer->base))) flag |= TIMER_STATS_FLAG_DEFERRABLE; diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 61071fe..1551f47 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -18,6 +18,13 @@ config HAVE_FUNCTION_TRACER config HAVE_FUNCTION_GRAPH_TRACER bool +config HAVE_FUNCTION_GRAPH_FP_TEST + bool + help + An arch may pass in a unique value (frame pointer) to both the + entering and exiting of a function. On exit, the value is compared + and if it does not match, then it will panic the kernel. + config HAVE_FUNCTION_TRACE_MCOUNT_TEST bool help @@ -121,6 +128,7 @@ config FUNCTION_GRAPH_TRACER bool "Kernel Function Graph Tracer" depends on HAVE_FUNCTION_GRAPH_TRACER depends on FUNCTION_TRACER + depends on !X86_32 || !CC_OPTIMIZE_FOR_SIZE default y help Enable the kernel to trace a function at both its return diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index bb60732..f3716bf 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -291,7 +291,9 @@ function_stat_next(void *v, int idx) pg = (struct ftrace_profile_page *)((unsigned long)rec & PAGE_MASK); again: - rec++; + if (idx != 0) + rec++; + if ((void *)rec >= (void *)&pg->records[pg->index]) { pg = pg->next; if (!pg) @@ -1224,6 +1226,13 @@ static void ftrace_shutdown(int command) return; ftrace_start_up--; + /* + * Just warn in case of unbalance, no need to kill ftrace, it's not + * critical but the ftrace_call callers may be never nopped again after + * further ftrace uses. + */ + WARN_ON_ONCE(ftrace_start_up < 0); + if (!ftrace_start_up) command |= FTRACE_DISABLE_CALLS; @@ -1410,10 +1419,20 @@ static void *t_hash_start(struct seq_file *m, loff_t *pos) { struct ftrace_iterator *iter = m->private; void *p = NULL; + loff_t l; + + if (!(iter->flags & FTRACE_ITER_HASH)) + *pos = 0; iter->flags |= FTRACE_ITER_HASH; - return t_hash_next(m, p, pos); + iter->hidx = 0; + for (l = 0; l <= *pos; ) { + p = t_hash_next(m, p, &l); + if (!p) + break; + } + return p; } static int t_hash_show(struct seq_file *m, void *v) @@ -1460,8 +1479,6 @@ t_next(struct seq_file *m, void *v, loff_t *pos) iter->pg = iter->pg->next; iter->idx = 0; goto retry; - } else { - iter->idx = -1; } } else { rec = &iter->pg->records[iter->idx++]; @@ -1490,6 +1507,7 @@ static void *t_start(struct seq_file *m, loff_t *pos) { struct ftrace_iterator *iter = m->private; void *p = NULL; + loff_t l; mutex_lock(&ftrace_lock); /* @@ -1501,23 +1519,21 @@ static void *t_start(struct seq_file *m, loff_t *pos) if (*pos > 0) return t_hash_start(m, pos); iter->flags |= FTRACE_ITER_PRINTALL; - (*pos)++; return iter; } if (iter->flags & FTRACE_ITER_HASH) return t_hash_start(m, pos); - if (*pos > 0) { - if (iter->idx < 0) - return p; - (*pos)--; - iter->idx--; + iter->pg = ftrace_pages_start; + iter->idx = 0; + for (l = 0; l <= *pos; ) { + p = t_next(m, p, &l); + if (!p) + break; } - p = t_next(m, p, pos); - - if (!p) + if (!p && iter->flags & FTRACE_ITER_FILTER) return t_hash_start(m, pos); return p; @@ -2493,32 +2509,31 @@ int ftrace_graph_count; unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly; static void * -g_next(struct seq_file *m, void *v, loff_t *pos) +__g_next(struct seq_file *m, loff_t *pos) { unsigned long *array = m->private; - int index = *pos; - - (*pos)++; - if (index >= ftrace_graph_count) + if (*pos >= ftrace_graph_count) return NULL; + return &array[*pos]; +} - return &array[index]; +static void * +g_next(struct seq_file *m, void *v, loff_t *pos) +{ + (*pos)++; + return __g_next(m, pos); } static void *g_start(struct seq_file *m, loff_t *pos) { - void *p = NULL; - mutex_lock(&graph_lock); /* Nothing, tell g_show to print all functions are enabled */ if (!ftrace_graph_count && !*pos) return (void *)1; - p = g_next(m, p, pos); - - return p; + return __g_next(m, pos); } static void g_stop(struct seq_file *m, void *p) diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c index 86cdf67..1edaa95 100644 --- a/kernel/trace/kmemtrace.c +++ b/kernel/trace/kmemtrace.c @@ -186,7 +186,7 @@ static int kmem_trace_init(struct trace_array *tr) int cpu; kmemtrace_array = tr; - for_each_cpu_mask(cpu, cpu_possible_map) + for_each_cpu(cpu, cpu_possible_mask) tracing_reset(tr, cpu); kmemtrace_start_probes(); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index dc4dc70..bf27bb7 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -206,6 +206,7 @@ EXPORT_SYMBOL_GPL(tracing_is_on); #define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array)) #define RB_ALIGNMENT 4U #define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) +#define RB_EVNT_MIN_SIZE 8U /* two 32bit words */ /* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */ #define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX @@ -415,6 +416,8 @@ struct ring_buffer_per_cpu { unsigned long overrun; unsigned long read; local_t entries; + local_t committing; + local_t commits; u64 write_stamp; u64 read_stamp; atomic_t record_disabled; @@ -618,12 +621,6 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) kfree(cpu_buffer); } -/* - * Causes compile errors if the struct buffer_page gets bigger - * than the struct page. - */ -extern int ring_buffer_page_too_big(void); - #ifdef CONFIG_HOTPLUG_CPU static int rb_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu); @@ -646,11 +643,6 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, int bsize; int cpu; - /* Paranoid! Optimizes out when all is well */ - if (sizeof(struct buffer_page) > sizeof(struct page)) - ring_buffer_page_too_big(); - - /* keep it in its own cache line */ buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()), GFP_KERNEL); @@ -666,8 +658,8 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, buffer->reader_lock_key = key; /* need at least two pages */ - if (buffer->pages == 1) - buffer->pages++; + if (buffer->pages < 2) + buffer->pages = 2; /* * In case of non-hotplug cpu, if the ring-buffer is allocated @@ -1011,12 +1003,12 @@ rb_event_index(struct ring_buffer_event *event) { unsigned long addr = (unsigned long)event; - return (addr & ~PAGE_MASK) - (PAGE_SIZE - BUF_PAGE_SIZE); + return (addr & ~PAGE_MASK) - BUF_PAGE_HDR_SIZE; } static inline int -rb_is_commit(struct ring_buffer_per_cpu *cpu_buffer, - struct ring_buffer_event *event) +rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event) { unsigned long addr = (unsigned long)event; unsigned long index; @@ -1029,31 +1021,6 @@ rb_is_commit(struct ring_buffer_per_cpu *cpu_buffer, } static void -rb_set_commit_event(struct ring_buffer_per_cpu *cpu_buffer, - struct ring_buffer_event *event) -{ - unsigned long addr = (unsigned long)event; - unsigned long index; - - index = rb_event_index(event); - addr &= PAGE_MASK; - - while (cpu_buffer->commit_page->page != (void *)addr) { - if (RB_WARN_ON(cpu_buffer, - cpu_buffer->commit_page == cpu_buffer->tail_page)) - return; - cpu_buffer->commit_page->page->commit = - cpu_buffer->commit_page->write; - rb_inc_page(cpu_buffer, &cpu_buffer->commit_page); - cpu_buffer->write_stamp = - cpu_buffer->commit_page->page->time_stamp; - } - - /* Now set the commit to the event's index */ - local_set(&cpu_buffer->commit_page->page->commit, index); -} - -static void rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) { /* @@ -1171,6 +1138,60 @@ static unsigned rb_calculate_event_length(unsigned length) return length; } +static inline void +rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page *tail_page, + unsigned long tail, unsigned long length) +{ + struct ring_buffer_event *event; + + /* + * Only the event that crossed the page boundary + * must fill the old tail_page with padding. + */ + if (tail >= BUF_PAGE_SIZE) { + local_sub(length, &tail_page->write); + return; + } + + event = __rb_page_index(tail_page, tail); + kmemcheck_annotate_bitfield(event, bitfield); + + /* + * If this event is bigger than the minimum size, then + * we need to be careful that we don't subtract the + * write counter enough to allow another writer to slip + * in on this page. + * We put in a discarded commit instead, to make sure + * that this space is not used again. + * + * If we are less than the minimum size, we don't need to + * worry about it. + */ + if (tail > (BUF_PAGE_SIZE - RB_EVNT_MIN_SIZE)) { + /* No room for any events */ + + /* Mark the rest of the page with padding */ + rb_event_set_padding(event); + + /* Set the write back to the previous setting */ + local_sub(length, &tail_page->write); + return; + } + + /* Put in a discarded event */ + event->array[0] = (BUF_PAGE_SIZE - tail) - RB_EVNT_HDR_SIZE; + event->type_len = RINGBUF_TYPE_PADDING; + /* time delta must be non zero */ + event->time_delta = 1; + /* Account for this as an entry */ + local_inc(&tail_page->entries); + local_inc(&cpu_buffer->entries); + + /* Set write to end of buffer */ + length = (tail + length) - BUF_PAGE_SIZE; + local_sub(length, &tail_page->write); +} static struct ring_buffer_event * rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, @@ -1180,7 +1201,6 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, { struct buffer_page *next_page, *head_page, *reader_page; struct ring_buffer *buffer = cpu_buffer->buffer; - struct ring_buffer_event *event; bool lock_taken = false; unsigned long flags; @@ -1265,27 +1285,7 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, cpu_buffer->tail_page->page->time_stamp = *ts; } - /* - * The actual tail page has moved forward. - */ - if (tail < BUF_PAGE_SIZE) { - /* Mark the rest of the page with padding */ - event = __rb_page_index(tail_page, tail); - kmemcheck_annotate_bitfield(event, bitfield); - rb_event_set_padding(event); - } - - /* Set the write back to the previous setting */ - local_sub(length, &tail_page->write); - - /* - * If this was a commit entry that failed, - * increment that too - */ - if (tail_page == cpu_buffer->commit_page && - tail == rb_commit_index(cpu_buffer)) { - rb_set_commit_to_write(cpu_buffer); - } + rb_reset_tail(cpu_buffer, tail_page, tail, length); __raw_spin_unlock(&cpu_buffer->lock); local_irq_restore(flags); @@ -1295,7 +1295,7 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, out_reset: /* reset write */ - local_sub(length, &tail_page->write); + rb_reset_tail(cpu_buffer, tail_page, tail, length); if (likely(lock_taken)) __raw_spin_unlock(&cpu_buffer->lock); @@ -1325,9 +1325,6 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, /* We reserved something on the buffer */ - if (RB_WARN_ON(cpu_buffer, write > BUF_PAGE_SIZE)) - return NULL; - event = __rb_page_index(tail_page, tail); kmemcheck_annotate_bitfield(event, bitfield); rb_update_event(event, type, length); @@ -1337,11 +1334,11 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, local_inc(&tail_page->entries); /* - * If this is a commit and the tail is zero, then update - * this page's time stamp. + * If this is the first commit on the page, then update + * its timestamp. */ - if (!tail && rb_is_commit(cpu_buffer, event)) - cpu_buffer->commit_page->page->time_stamp = *ts; + if (!tail) + tail_page->page->time_stamp = *ts; return event; } @@ -1410,16 +1407,16 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, return -EAGAIN; /* Only a commited time event can update the write stamp */ - if (rb_is_commit(cpu_buffer, event)) { + if (rb_event_is_commit(cpu_buffer, event)) { /* - * If this is the first on the page, then we need to - * update the page itself, and just put in a zero. + * If this is the first on the page, then it was + * updated with the page itself. Try to discard it + * and if we can't just make it zero. */ if (rb_event_index(event)) { event->time_delta = *delta & TS_MASK; event->array[0] = *delta >> TS_SHIFT; } else { - cpu_buffer->commit_page->page->time_stamp = *ts; /* try to discard, since we do not need this */ if (!rb_try_to_discard(cpu_buffer, event)) { /* nope, just zero it */ @@ -1445,6 +1442,44 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, return ret; } +static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer) +{ + local_inc(&cpu_buffer->committing); + local_inc(&cpu_buffer->commits); +} + +static void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer) +{ + unsigned long commits; + + if (RB_WARN_ON(cpu_buffer, + !local_read(&cpu_buffer->committing))) + return; + + again: + commits = local_read(&cpu_buffer->commits); + /* synchronize with interrupts */ + barrier(); + if (local_read(&cpu_buffer->committing) == 1) + rb_set_commit_to_write(cpu_buffer); + + local_dec(&cpu_buffer->committing); + + /* synchronize with interrupts */ + barrier(); + + /* + * Need to account for interrupts coming in between the + * updating of the commit page and the clearing of the + * committing counter. + */ + if (unlikely(local_read(&cpu_buffer->commits) != commits) && + !local_read(&cpu_buffer->committing)) { + local_inc(&cpu_buffer->committing); + goto again; + } +} + static struct ring_buffer_event * rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, unsigned long length) @@ -1454,6 +1489,8 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, int commit = 0; int nr_loops = 0; + rb_start_commit(cpu_buffer); + length = rb_calculate_event_length(length); again: /* @@ -1466,7 +1503,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, * Bail! */ if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000)) - return NULL; + goto out_fail; ts = rb_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu); @@ -1497,7 +1534,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, commit = rb_add_time_stamp(cpu_buffer, &ts, &delta); if (commit == -EBUSY) - return NULL; + goto out_fail; if (commit == -EAGAIN) goto again; @@ -1511,30 +1548,23 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, if (unlikely(PTR_ERR(event) == -EAGAIN)) goto again; - if (!event) { - if (unlikely(commit)) - /* - * Ouch! We needed a timestamp and it was commited. But - * we didn't get our event reserved. - */ - rb_set_commit_to_write(cpu_buffer); - return NULL; - } + if (!event) + goto out_fail; - /* - * If the timestamp was commited, make the commit our entry - * now so that we will update it when needed. - */ - if (unlikely(commit)) - rb_set_commit_event(cpu_buffer, event); - else if (!rb_is_commit(cpu_buffer, event)) + if (!rb_event_is_commit(cpu_buffer, event)) delta = 0; event->time_delta = delta; return event; + + out_fail: + rb_end_commit(cpu_buffer); + return NULL; } +#ifdef CONFIG_TRACING + #define TRACE_RECURSIVE_DEPTH 16 static int trace_recursive_lock(void) @@ -1565,6 +1595,13 @@ static void trace_recursive_unlock(void) current->trace_recursion--; } +#else + +#define trace_recursive_lock() (0) +#define trace_recursive_unlock() do { } while (0) + +#endif + static DEFINE_PER_CPU(int, rb_need_resched); /** @@ -1642,13 +1679,14 @@ static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, { local_inc(&cpu_buffer->entries); - /* Only process further if we own the commit */ - if (!rb_is_commit(cpu_buffer, event)) - return; - - cpu_buffer->write_stamp += event->time_delta; + /* + * The event first in the commit queue updates the + * time stamp. + */ + if (rb_event_is_commit(cpu_buffer, event)) + cpu_buffer->write_stamp += event->time_delta; - rb_set_commit_to_write(cpu_buffer); + rb_end_commit(cpu_buffer); } /** @@ -1737,15 +1775,15 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer, /* The event is discarded regardless */ rb_event_discard(event); + cpu = smp_processor_id(); + cpu_buffer = buffer->buffers[cpu]; + /* * This must only be called if the event has not been * committed yet. Thus we can assume that preemption * is still disabled. */ - RB_WARN_ON(buffer, preemptible()); - - cpu = smp_processor_id(); - cpu_buffer = buffer->buffers[cpu]; + RB_WARN_ON(buffer, !local_read(&cpu_buffer->committing)); if (!rb_try_to_discard(cpu_buffer, event)) goto out; @@ -1756,13 +1794,7 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer, */ local_inc(&cpu_buffer->entries); out: - /* - * If a write came in and pushed the tail page - * we still need to update the commit pointer - * if we were the commit. - */ - if (rb_is_commit(cpu_buffer, event)) - rb_set_commit_to_write(cpu_buffer); + rb_end_commit(cpu_buffer); trace_recursive_unlock(); @@ -2446,6 +2478,21 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) } EXPORT_SYMBOL_GPL(ring_buffer_iter_peek); +static inline int rb_ok_to_lock(void) +{ + /* + * If an NMI die dumps out the content of the ring buffer + * do not grab locks. We also permanently disable the ring + * buffer too. A one time deal is all you get from reading + * the ring buffer from an NMI. + */ + if (likely(!in_nmi() && !oops_in_progress)) + return 1; + + tracing_off_permanent(); + return 0; +} + /** * ring_buffer_peek - peek at the next event to be read * @buffer: The ring buffer to read @@ -2461,14 +2508,20 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; struct ring_buffer_event *event; unsigned long flags; + int dolock; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return NULL; + dolock = rb_ok_to_lock(); again: - spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + local_irq_save(flags); + if (dolock) + spin_lock(&cpu_buffer->reader_lock); event = rb_buffer_peek(buffer, cpu, ts); - spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + if (dolock) + spin_unlock(&cpu_buffer->reader_lock); + local_irq_restore(flags); if (event && event->type_len == RINGBUF_TYPE_PADDING) { cpu_relax(); @@ -2520,6 +2573,9 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts) struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_event *event = NULL; unsigned long flags; + int dolock; + + dolock = rb_ok_to_lock(); again: /* might be called in atomic */ @@ -2529,7 +2585,9 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts) goto out; cpu_buffer = buffer->buffers[cpu]; - spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + local_irq_save(flags); + if (dolock) + spin_lock(&cpu_buffer->reader_lock); event = rb_buffer_peek(buffer, cpu, ts); if (!event) @@ -2538,7 +2596,9 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts) rb_advance_reader(cpu_buffer); out_unlock: - spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + if (dolock) + spin_unlock(&cpu_buffer->reader_lock); + local_irq_restore(flags); out: preempt_enable(); @@ -2680,6 +2740,8 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->overrun = 0; cpu_buffer->read = 0; local_set(&cpu_buffer->entries, 0); + local_set(&cpu_buffer->committing, 0); + local_set(&cpu_buffer->commits, 0); cpu_buffer->write_stamp = 0; cpu_buffer->read_stamp = 0; @@ -2734,12 +2796,25 @@ EXPORT_SYMBOL_GPL(ring_buffer_reset); int ring_buffer_empty(struct ring_buffer *buffer) { struct ring_buffer_per_cpu *cpu_buffer; + unsigned long flags; + int dolock; int cpu; + int ret; + + dolock = rb_ok_to_lock(); /* yes this is racy, but if you don't like the race, lock the buffer */ for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; - if (!rb_per_cpu_empty(cpu_buffer)) + local_irq_save(flags); + if (dolock) + spin_lock(&cpu_buffer->reader_lock); + ret = rb_per_cpu_empty(cpu_buffer); + if (dolock) + spin_unlock(&cpu_buffer->reader_lock); + local_irq_restore(flags); + + if (!ret) return 0; } @@ -2755,14 +2830,23 @@ EXPORT_SYMBOL_GPL(ring_buffer_empty); int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; + unsigned long flags; + int dolock; int ret; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return 1; + dolock = rb_ok_to_lock(); + cpu_buffer = buffer->buffers[cpu]; + local_irq_save(flags); + if (dolock) + spin_lock(&cpu_buffer->reader_lock); ret = rb_per_cpu_empty(cpu_buffer); - + if (dolock) + spin_unlock(&cpu_buffer->reader_lock); + local_irq_restore(flags); return ret; } @@ -3029,6 +3113,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer, } EXPORT_SYMBOL_GPL(ring_buffer_read_page); +#ifdef CONFIG_TRACING static ssize_t rb_simple_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) @@ -3096,6 +3181,7 @@ static __init int rb_init_debugfs(void) } fs_initcall(rb_init_debugfs); +#endif #ifdef CONFIG_HOTPLUG_CPU static int rb_cpu_notify(struct notifier_block *self, @@ -3108,7 +3194,7 @@ static int rb_cpu_notify(struct notifier_block *self, switch (action) { case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: - if (cpu_isset(cpu, *buffer->cpumask)) + if (cpumask_test_cpu(cpu, buffer->cpumask)) return NOTIFY_OK; buffer->buffers[cpu] = @@ -3119,7 +3205,7 @@ static int rb_cpu_notify(struct notifier_block *self, return NOTIFY_OK; } smp_wmb(); - cpu_set(cpu, *buffer->cpumask); + cpumask_set_cpu(cpu, buffer->cpumask); break; case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index 8d68e14..573d3cc 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -102,8 +102,10 @@ static enum event_status read_page(int cpu) event = (void *)&rpage->data[i]; switch (event->type_len) { case RINGBUF_TYPE_PADDING: - /* We don't expect any padding */ - KILL_TEST(); + /* failed writes may be discarded events */ + if (!event->time_delta) + KILL_TEST(); + inc = event->array[0] + 4; break; case RINGBUF_TYPE_TIME_EXTEND: inc = 8; @@ -119,7 +121,7 @@ static enum event_status read_page(int cpu) KILL_TEST(); break; } - inc = event->array[0]; + inc = event->array[0] + 4; break; default: entry = ring_buffer_event_data(event); @@ -201,7 +203,7 @@ static void ring_buffer_producer(void) * Hammer the buffer for 10 secs (this may * make the system stall) */ - pr_info("Starting ring buffer hammer\n"); + trace_printk("Starting ring buffer hammer\n"); do_gettimeofday(&start_tv); do { struct ring_buffer_event *event; @@ -237,7 +239,7 @@ static void ring_buffer_producer(void) #endif } while (end_tv.tv_sec < (start_tv.tv_sec + RUN_TIME) && !kill_test); - pr_info("End ring buffer hammer\n"); + trace_printk("End ring buffer hammer\n"); if (consumer) { /* Init both completions here to avoid races */ @@ -260,49 +262,50 @@ static void ring_buffer_producer(void) overruns = ring_buffer_overruns(buffer); if (kill_test) - pr_info("ERROR!\n"); - pr_info("Time: %lld (usecs)\n", time); - pr_info("Overruns: %lld\n", overruns); + trace_printk("ERROR!\n"); + trace_printk("Time: %lld (usecs)\n", time); + trace_printk("Overruns: %lld\n", overruns); if (disable_reader) - pr_info("Read: (reader disabled)\n"); + trace_printk("Read: (reader disabled)\n"); else - pr_info("Read: %ld (by %s)\n", read, + trace_printk("Read: %ld (by %s)\n", read, read_events ? "events" : "pages"); - pr_info("Entries: %lld\n", entries); - pr_info("Total: %lld\n", entries + overruns + read); - pr_info("Missed: %ld\n", missed); - pr_info("Hit: %ld\n", hit); + trace_printk("Entries: %lld\n", entries); + trace_printk("Total: %lld\n", entries + overruns + read); + trace_printk("Missed: %ld\n", missed); + trace_printk("Hit: %ld\n", hit); /* Convert time from usecs to millisecs */ do_div(time, USEC_PER_MSEC); if (time) hit /= (long)time; else - pr_info("TIME IS ZERO??\n"); + trace_printk("TIME IS ZERO??\n"); - pr_info("Entries per millisec: %ld\n", hit); + trace_printk("Entries per millisec: %ld\n", hit); if (hit) { /* Calculate the average time in nanosecs */ avg = NSEC_PER_MSEC / hit; - pr_info("%ld ns per entry\n", avg); + trace_printk("%ld ns per entry\n", avg); } if (missed) { if (time) missed /= (long)time; - pr_info("Total iterations per millisec: %ld\n", hit + missed); + trace_printk("Total iterations per millisec: %ld\n", + hit + missed); /* it is possible that hit + missed will overflow and be zero */ if (!(hit + missed)) { - pr_info("hit + missed overflowed and totalled zero!\n"); + trace_printk("hit + missed overflowed and totalled zero!\n"); hit--; /* make it non zero */ } /* Caculate the average time in nanosecs */ avg = NSEC_PER_MSEC / (hit + missed); - pr_info("%ld ns per entry\n", avg); + trace_printk("%ld ns per entry\n", avg); } } @@ -353,7 +356,7 @@ static int ring_buffer_producer_thread(void *arg) ring_buffer_producer(); - pr_info("Sleeping for 10 secs\n"); + trace_printk("Sleeping for 10 secs\n"); set_current_state(TASK_INTERRUPTIBLE); schedule_timeout(HZ * SLEEP_TIME); __set_current_state(TASK_RUNNING); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c1878bf..3aa0a0d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -284,13 +284,12 @@ void trace_wake_up(void) static int __init set_buf_size(char *str) { unsigned long buf_size; - int ret; if (!str) return 0; - ret = strict_strtoul(str, 0, &buf_size); + buf_size = memparse(str, &str); /* nr_entries can not be zero */ - if (ret < 0 || buf_size == 0) + if (buf_size == 0) return 0; trace_buf_size = buf_size; return 1; @@ -2053,25 +2052,23 @@ static int tracing_open(struct inode *inode, struct file *file) static void * t_next(struct seq_file *m, void *v, loff_t *pos) { - struct tracer *t = m->private; + struct tracer *t = v; (*pos)++; if (t) t = t->next; - m->private = t; - return t; } static void *t_start(struct seq_file *m, loff_t *pos) { - struct tracer *t = m->private; + struct tracer *t; loff_t l = 0; mutex_lock(&trace_types_lock); - for (; t && l < *pos; t = t_next(m, t, &l)) + for (t = trace_types; t && l < *pos; t = t_next(m, t, &l)) ; return t; @@ -2107,18 +2104,10 @@ static struct seq_operations show_traces_seq_ops = { static int show_traces_open(struct inode *inode, struct file *file) { - int ret; - if (tracing_disabled) return -ENODEV; - ret = seq_open(file, &show_traces_seq_ops); - if (!ret) { - struct seq_file *m = file->private_data; - m->private = trace_types; - } - - return ret; + return seq_open(file, &show_traces_seq_ops); } static ssize_t @@ -2191,11 +2180,12 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, if (!alloc_cpumask_var(&tracing_cpumask_new, GFP_KERNEL)) return -ENOMEM; - mutex_lock(&tracing_cpumask_update_lock); err = cpumask_parse_user(ubuf, count, tracing_cpumask_new); if (err) goto err_unlock; + mutex_lock(&tracing_cpumask_update_lock); + local_irq_disable(); __raw_spin_lock(&ftrace_max_lock); for_each_tracing_cpu(cpu) { @@ -2223,8 +2213,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, return count; err_unlock: - mutex_unlock(&tracing_cpumask_update_lock); - free_cpumask_var(tracing_cpumask); + free_cpumask_var(tracing_cpumask_new); return err; } @@ -3626,7 +3615,7 @@ tracing_stats_read(struct file *filp, char __user *ubuf, struct trace_seq *s; unsigned long cnt; - s = kmalloc(sizeof(*s), GFP_ATOMIC); + s = kmalloc(sizeof(*s), GFP_KERNEL); if (!s) return ENOMEM; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 6e735d4..3548ae5 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -597,6 +597,7 @@ print_graph_function(struct trace_iterator *iter) extern struct pid *ftrace_pid_trace; +#ifdef CONFIG_FUNCTION_TRACER static inline int ftrace_trace_task(struct task_struct *task) { if (!ftrace_pid_trace) @@ -604,6 +605,12 @@ static inline int ftrace_trace_task(struct task_struct *task) return test_tsk_trace_trace(task); } +#else +static inline int ftrace_trace_task(struct task_struct *task) +{ + return 1; +} +#endif /* * trace_iterator_flags is an enumeration that defines bit diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index aa08be6..53c8fd3 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -300,10 +300,18 @@ t_next(struct seq_file *m, void *v, loff_t *pos) static void *t_start(struct seq_file *m, loff_t *pos) { + struct ftrace_event_call *call = NULL; + loff_t l; + mutex_lock(&event_mutex); - if (*pos == 0) - m->private = ftrace_events.next; - return t_next(m, NULL, pos); + + m->private = ftrace_events.next; + for (l = 0; l <= *pos; ) { + call = t_next(m, NULL, &l); + if (!call) + break; + } + return call; } static void * @@ -332,10 +340,18 @@ s_next(struct seq_file *m, void *v, loff_t *pos) static void *s_start(struct seq_file *m, loff_t *pos) { + struct ftrace_event_call *call = NULL; + loff_t l; + mutex_lock(&event_mutex); - if (*pos == 0) - m->private = ftrace_events.next; - return s_next(m, NULL, pos); + + m->private = ftrace_events.next; + for (l = 0; l <= *pos; ) { + call = s_next(m, NULL, &l); + if (!call) + break; + } + return call; } static int t_show(struct seq_file *m, void *v) diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index db6e54b..936c621 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -27,8 +27,6 @@ #include "trace.h" #include "trace_output.h" -static DEFINE_MUTEX(filter_mutex); - enum filter_op_ids { OP_OR, @@ -178,7 +176,7 @@ static int filter_pred_string(struct filter_pred *pred, void *event, static int filter_pred_strloc(struct filter_pred *pred, void *event, int val1, int val2) { - int str_loc = *(int *)(event + pred->offset); + unsigned short str_loc = *(unsigned short *)(event + pred->offset); char *addr = (char *)(event + str_loc); int cmp, match; @@ -294,12 +292,12 @@ void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s) { struct event_filter *filter = call->filter; - mutex_lock(&filter_mutex); + mutex_lock(&event_mutex); if (filter->filter_string) trace_seq_printf(s, "%s\n", filter->filter_string); else trace_seq_printf(s, "none\n"); - mutex_unlock(&filter_mutex); + mutex_unlock(&event_mutex); } void print_subsystem_event_filter(struct event_subsystem *system, @@ -307,12 +305,12 @@ void print_subsystem_event_filter(struct event_subsystem *system, { struct event_filter *filter = system->filter; - mutex_lock(&filter_mutex); + mutex_lock(&event_mutex); if (filter->filter_string) trace_seq_printf(s, "%s\n", filter->filter_string); else trace_seq_printf(s, "none\n"); - mutex_unlock(&filter_mutex); + mutex_unlock(&event_mutex); } static struct ftrace_event_field * @@ -381,6 +379,7 @@ void destroy_preds(struct ftrace_event_call *call) filter_free_pred(filter->preds[i]); } kfree(filter->preds); + kfree(filter->filter_string); kfree(filter); call->filter = NULL; } @@ -433,7 +432,6 @@ static void filter_free_subsystem_preds(struct event_subsystem *system) filter->n_preds = 0; } - mutex_lock(&event_mutex); list_for_each_entry(call, &ftrace_events, list) { if (!call->define_fields) continue; @@ -443,7 +441,6 @@ static void filter_free_subsystem_preds(struct event_subsystem *system) remove_filter_string(call->filter); } } - mutex_unlock(&event_mutex); } static int filter_add_pred_fn(struct filter_parse_state *ps, @@ -546,6 +543,7 @@ static int filter_add_pred(struct filter_parse_state *ps, filter_pred_fn_t fn; unsigned long long val; int string_type; + int ret; pred->fn = filter_pred_none; @@ -581,7 +579,11 @@ static int filter_add_pred(struct filter_parse_state *ps, pred->not = 1; return filter_add_pred_fn(ps, call, pred, fn); } else { - if (strict_strtoull(pred->str_val, 0, &val)) { + if (field->is_signed) + ret = strict_strtoll(pred->str_val, 0, &val); + else + ret = strict_strtoull(pred->str_val, 0, &val); + if (ret) { parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0); return -EINVAL; } @@ -625,7 +627,6 @@ static int filter_add_subsystem_pred(struct filter_parse_state *ps, filter->preds[filter->n_preds] = pred; filter->n_preds++; - mutex_lock(&event_mutex); list_for_each_entry(call, &ftrace_events, list) { if (!call->define_fields) @@ -636,14 +637,12 @@ static int filter_add_subsystem_pred(struct filter_parse_state *ps, err = filter_add_pred(ps, call, pred); if (err) { - mutex_unlock(&event_mutex); filter_free_subsystem_preds(system); parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); goto out; } replace_filter_string(call->filter, filter_string); } - mutex_unlock(&event_mutex); out: return err; } @@ -1070,12 +1069,12 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string) struct filter_parse_state *ps; - mutex_lock(&filter_mutex); + mutex_lock(&event_mutex); if (!strcmp(strstrip(filter_string), "0")) { filter_disable_preds(call); remove_filter_string(call->filter); - mutex_unlock(&filter_mutex); + mutex_unlock(&event_mutex); return 0; } @@ -1103,7 +1102,7 @@ out: postfix_clear(ps); kfree(ps); out_unlock: - mutex_unlock(&filter_mutex); + mutex_unlock(&event_mutex); return err; } @@ -1115,12 +1114,12 @@ int apply_subsystem_event_filter(struct event_subsystem *system, struct filter_parse_state *ps; - mutex_lock(&filter_mutex); + mutex_lock(&event_mutex); if (!strcmp(strstrip(filter_string), "0")) { filter_free_subsystem_preds(system); remove_filter_string(system->filter); - mutex_unlock(&filter_mutex); + mutex_unlock(&event_mutex); return 0; } @@ -1148,7 +1147,7 @@ out: postfix_clear(ps); kfree(ps); out_unlock: - mutex_unlock(&filter_mutex); + mutex_unlock(&event_mutex); return err; } diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index c9a0b7d..7402144 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -193,9 +193,11 @@ static void tracing_start_function_trace(void) static void tracing_stop_function_trace(void) { ftrace_function_enabled = 0; - /* OK if they are not registered */ - unregister_ftrace_function(&trace_stack_ops); - unregister_ftrace_function(&trace_ops); + + if (func_flags.val & TRACE_FUNC_OPT_STACK) + unregister_ftrace_function(&trace_stack_ops); + else + unregister_ftrace_function(&trace_ops); } static int func_set_flag(u32 old_flags, u32 bit, int set) @@ -300,8 +302,7 @@ ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip, if (count == -1) seq_printf(m, ":unlimited\n"); else - seq_printf(m, ":count=%ld", count); - seq_putc(m, '\n'); + seq_printf(m, ":count=%ld\n", count); return 0; } diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 8b59241..d2249ab 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -57,7 +57,8 @@ static struct tracer_flags tracer_flags = { /* Add a function return address to the trace stack on thread info.*/ int -ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth) +ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth, + unsigned long frame_pointer) { unsigned long long calltime; int index; @@ -85,6 +86,7 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth) current->ret_stack[index].func = func; current->ret_stack[index].calltime = calltime; current->ret_stack[index].subtime = 0; + current->ret_stack[index].fp = frame_pointer; *depth = index; return 0; @@ -92,7 +94,8 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth) /* Retrieve a function return address to the trace stack on thread info.*/ static void -ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret) +ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret, + unsigned long frame_pointer) { int index; @@ -106,6 +109,31 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret) return; } +#ifdef CONFIG_HAVE_FUNCTION_GRAPH_FP_TEST + /* + * The arch may choose to record the frame pointer used + * and check it here to make sure that it is what we expect it + * to be. If gcc does not set the place holder of the return + * address in the frame pointer, and does a copy instead, then + * the function graph trace will fail. This test detects this + * case. + * + * Currently, x86_32 with optimize for size (-Os) makes the latest + * gcc do the above. + */ + if (unlikely(current->ret_stack[index].fp != frame_pointer)) { + ftrace_graph_stop(); + WARN(1, "Bad frame pointer: expected %lx, received %lx\n" + " from func %pF return to %lx\n", + current->ret_stack[index].fp, + frame_pointer, + (void *)current->ret_stack[index].func, + current->ret_stack[index].ret); + *ret = (unsigned long)panic; + return; + } +#endif + *ret = current->ret_stack[index].ret; trace->func = current->ret_stack[index].func; trace->calltime = current->ret_stack[index].calltime; @@ -117,12 +145,12 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret) * Send the trace to the ring-buffer. * @return the original return address. */ -unsigned long ftrace_return_to_handler(void) +unsigned long ftrace_return_to_handler(unsigned long frame_pointer) { struct ftrace_graph_ret trace; unsigned long ret; - ftrace_pop_return_trace(&trace, &ret); + ftrace_pop_return_trace(&trace, &ret, frame_pointer); trace.rettime = trace_clock_local(); ftrace_graph_return(&trace); barrier(); diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index 9bece96..7b62781 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -155,25 +155,19 @@ int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap) EXPORT_SYMBOL_GPL(__ftrace_vprintk); static void * -t_next(struct seq_file *m, void *v, loff_t *pos) +t_start(struct seq_file *m, loff_t *pos) { - const char **fmt = m->private; - const char **next = fmt; - - (*pos)++; + const char **fmt = __start___trace_bprintk_fmt + *pos; if ((unsigned long)fmt >= (unsigned long)__stop___trace_bprintk_fmt) return NULL; - - next = fmt; - m->private = ++next; - return fmt; } -static void *t_start(struct seq_file *m, loff_t *pos) +static void *t_next(struct seq_file *m, void * v, loff_t *pos) { - return t_next(m, NULL, pos); + (*pos)++; + return t_start(m, pos); } static int t_show(struct seq_file *m, void *v) @@ -224,15 +218,7 @@ static const struct seq_operations show_format_seq_ops = { static int ftrace_formats_open(struct inode *inode, struct file *file) { - int ret; - - ret = seq_open(file, &show_format_seq_ops); - if (!ret) { - struct seq_file *m = file->private_data; - - m->private = __start___trace_bprintk_fmt; - } - return ret; + return seq_open(file, &show_format_seq_ops); } static const struct file_operations ftrace_formats_fops = { diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index c006437..e66f5e4 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -199,17 +199,13 @@ static void *stat_seq_start(struct seq_file *s, loff_t *pos) mutex_lock(&session->stat_mutex); /* If we are in the beginning of the file, print the headers */ - if (!*pos && session->ts->stat_headers) { - (*pos)++; + if (!*pos && session->ts->stat_headers) return SEQ_START_TOKEN; - } node = rb_first(&session->stat_root); for (i = 0; node && i < *pos; i++) node = rb_next(node); - (*pos)++; - return node; } |