diff options
Diffstat (limited to 'kernel')
49 files changed, 4181 insertions, 2135 deletions
diff --git a/kernel/audit.c b/kernel/audit.c index 906ae5a0..34c5a23 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -41,6 +41,8 @@ * Example user-space utilities: http://people.redhat.com/sgrubb/audit/ */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/init.h> #include <asm/types.h> #include <linux/atomic.h> @@ -63,6 +65,7 @@ #include <linux/freezer.h> #include <linux/tty.h> #include <linux/pid_namespace.h> +#include <net/netns/generic.h> #include "audit.h" @@ -76,16 +79,16 @@ static int audit_initialized; #define AUDIT_OFF 0 #define AUDIT_ON 1 #define AUDIT_LOCKED 2 -int audit_enabled; -int audit_ever_enabled; +u32 audit_enabled; +u32 audit_ever_enabled; EXPORT_SYMBOL_GPL(audit_enabled); /* Default state when kernel boots without any parameters. */ -static int audit_default; +static u32 audit_default; /* If auditing cannot proceed, audit_failure selects what happens. */ -static int audit_failure = AUDIT_FAIL_PRINTK; +static u32 audit_failure = AUDIT_FAIL_PRINTK; /* * If audit records are to be written to the netlink socket, audit_pid @@ -93,17 +96,19 @@ static int audit_failure = AUDIT_FAIL_PRINTK; * the portid to use to send netlink messages to that process. */ int audit_pid; -static int audit_nlk_portid; +static __u32 audit_nlk_portid; /* If audit_rate_limit is non-zero, limit the rate of sending audit records * to that number per second. This prevents DoS attacks, but results in * audit records being dropped. */ -static int audit_rate_limit; +static u32 audit_rate_limit; -/* Number of outstanding audit_buffers allowed. */ -static int audit_backlog_limit = 64; -static int audit_backlog_wait_time = 60 * HZ; -static int audit_backlog_wait_overflow = 0; +/* Number of outstanding audit_buffers allowed. + * When set to zero, this means unlimited. */ +static u32 audit_backlog_limit = 64; +#define AUDIT_BACKLOG_WAIT_TIME (60 * HZ) +static u32 audit_backlog_wait_time = AUDIT_BACKLOG_WAIT_TIME; +static u32 audit_backlog_wait_overflow = 0; /* The identity of the user shutting down the audit system. */ kuid_t audit_sig_uid = INVALID_UID; @@ -121,6 +126,7 @@ static atomic_t audit_lost = ATOMIC_INIT(0); /* The netlink socket. */ static struct sock *audit_sock; +int audit_net_id; /* Hash for inode-based rules */ struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS]; @@ -175,27 +181,27 @@ struct audit_buffer { }; struct audit_reply { - int pid; + __u32 portid; + pid_t pid; struct sk_buff *skb; }; -static void audit_set_pid(struct audit_buffer *ab, pid_t pid) +static void audit_set_portid(struct audit_buffer *ab, __u32 portid) { if (ab) { struct nlmsghdr *nlh = nlmsg_hdr(ab->skb); - nlh->nlmsg_pid = pid; + nlh->nlmsg_pid = portid; } } void audit_panic(const char *message) { - switch (audit_failure) - { + switch (audit_failure) { case AUDIT_FAIL_SILENT: break; case AUDIT_FAIL_PRINTK: if (printk_ratelimit()) - printk(KERN_ERR "audit: %s\n", message); + pr_err("%s\n", message); break; case AUDIT_FAIL_PANIC: /* test audit_pid since printk is always losey, why bother? */ @@ -266,9 +272,7 @@ void audit_log_lost(const char *message) if (print) { if (printk_ratelimit()) - printk(KERN_WARNING - "audit: audit_lost=%d audit_rate_limit=%d " - "audit_backlog_limit=%d\n", + pr_warn("audit_lost=%u audit_rate_limit=%u audit_backlog_limit=%u\n", atomic_read(&audit_lost), audit_rate_limit, audit_backlog_limit); @@ -276,7 +280,7 @@ void audit_log_lost(const char *message) } } -static int audit_log_config_change(char *function_name, int new, int old, +static int audit_log_config_change(char *function_name, u32 new, u32 old, int allow_changes) { struct audit_buffer *ab; @@ -285,7 +289,7 @@ static int audit_log_config_change(char *function_name, int new, int old, ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); if (unlikely(!ab)) return rc; - audit_log_format(ab, "%s=%d old=%d", function_name, new, old); + audit_log_format(ab, "%s=%u old=%u", function_name, new, old); audit_log_session_info(ab); rc = audit_log_task_context(ab); if (rc) @@ -295,9 +299,10 @@ static int audit_log_config_change(char *function_name, int new, int old, return rc; } -static int audit_do_config_change(char *function_name, int *to_change, int new) +static int audit_do_config_change(char *function_name, u32 *to_change, u32 new) { - int allow_changes, rc = 0, old = *to_change; + int allow_changes, rc = 0; + u32 old = *to_change; /* check if we are locked */ if (audit_enabled == AUDIT_LOCKED) @@ -320,17 +325,23 @@ static int audit_do_config_change(char *function_name, int *to_change, int new) return rc; } -static int audit_set_rate_limit(int limit) +static int audit_set_rate_limit(u32 limit) { return audit_do_config_change("audit_rate_limit", &audit_rate_limit, limit); } -static int audit_set_backlog_limit(int limit) +static int audit_set_backlog_limit(u32 limit) { return audit_do_config_change("audit_backlog_limit", &audit_backlog_limit, limit); } -static int audit_set_enabled(int state) +static int audit_set_backlog_wait_time(u32 timeout) +{ + return audit_do_config_change("audit_backlog_wait_time", + &audit_backlog_wait_time, timeout); +} + +static int audit_set_enabled(u32 state) { int rc; if (state < AUDIT_OFF || state > AUDIT_LOCKED) @@ -343,7 +354,7 @@ static int audit_set_enabled(int state) return rc; } -static int audit_set_failure(int state) +static int audit_set_failure(u32 state) { if (state != AUDIT_FAIL_SILENT && state != AUDIT_FAIL_PRINTK @@ -365,7 +376,8 @@ static int audit_set_failure(int state) static void audit_hold_skb(struct sk_buff *skb) { if (audit_default && - skb_queue_len(&audit_skb_hold_queue) < audit_backlog_limit) + (!audit_backlog_limit || + skb_queue_len(&audit_skb_hold_queue) < audit_backlog_limit)) skb_queue_tail(&audit_skb_hold_queue, skb); else kfree_skb(skb); @@ -382,7 +394,7 @@ static void audit_printk_skb(struct sk_buff *skb) if (nlh->nlmsg_type != AUDIT_EOE) { if (printk_ratelimit()) - printk(KERN_NOTICE "type=%d %s\n", nlh->nlmsg_type, data); + pr_notice("type=%d %s\n", nlh->nlmsg_type, data); else audit_log_lost("printk limit exceeded\n"); } @@ -398,9 +410,12 @@ static void kauditd_send_skb(struct sk_buff *skb) err = netlink_unicast(audit_sock, skb, audit_nlk_portid, 0); if (err < 0) { BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */ - printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid); - audit_log_lost("auditd disappeared\n"); - audit_pid = 0; + if (audit_pid) { + pr_err("*NO* daemon at audit_pid=%d\n", audit_pid); + audit_log_lost("auditd disappeared\n"); + audit_pid = 0; + audit_sock = NULL; + } /* we might get lucky and get this in the next auditd */ audit_hold_skb(skb); } else @@ -457,8 +472,10 @@ static int kauditd_thread(void *dummy) flush_hold_queue(); skb = skb_dequeue(&audit_skb_queue); - wake_up(&audit_backlog_wait); + if (skb) { + if (skb_queue_len(&audit_skb_queue) <= audit_backlog_limit) + wake_up(&audit_backlog_wait); if (audit_pid) kauditd_send_skb(skb); else @@ -482,22 +499,23 @@ static int kauditd_thread(void *dummy) int audit_send_list(void *_dest) { struct audit_netlink_list *dest = _dest; - int pid = dest->pid; struct sk_buff *skb; + struct net *net = get_net_ns_by_pid(dest->pid); + struct audit_net *aunet = net_generic(net, audit_net_id); /* wait for parent to finish and send an ACK */ mutex_lock(&audit_cmd_mutex); mutex_unlock(&audit_cmd_mutex); while ((skb = __skb_dequeue(&dest->q)) != NULL) - netlink_unicast(audit_sock, skb, pid, 0); + netlink_unicast(aunet->nlsk, skb, dest->portid, 0); kfree(dest); return 0; } -struct sk_buff *audit_make_reply(int pid, int seq, int type, int done, +struct sk_buff *audit_make_reply(__u32 portid, int seq, int type, int done, int multi, const void *payload, int size) { struct sk_buff *skb; @@ -510,7 +528,7 @@ struct sk_buff *audit_make_reply(int pid, int seq, int type, int done, if (!skb) return NULL; - nlh = nlmsg_put(skb, pid, seq, t, size, flags); + nlh = nlmsg_put(skb, portid, seq, t, size, flags); if (!nlh) goto out_kfree_skb; data = nlmsg_data(nlh); @@ -525,19 +543,21 @@ out_kfree_skb: static int audit_send_reply_thread(void *arg) { struct audit_reply *reply = (struct audit_reply *)arg; + struct net *net = get_net_ns_by_pid(reply->pid); + struct audit_net *aunet = net_generic(net, audit_net_id); mutex_lock(&audit_cmd_mutex); mutex_unlock(&audit_cmd_mutex); /* Ignore failure. It'll only happen if the sender goes away, because our timeout is set to infinite. */ - netlink_unicast(audit_sock, reply->skb, reply->pid, 0); + netlink_unicast(aunet->nlsk , reply->skb, reply->portid, 0); kfree(reply); return 0; } /** * audit_send_reply - send an audit reply message via netlink - * @pid: process id to send reply to + * @portid: netlink port to which to send reply * @seq: sequence number * @type: audit message type * @done: done (last) flag @@ -545,11 +565,11 @@ static int audit_send_reply_thread(void *arg) * @payload: payload data * @size: payload size * - * Allocates an skb, builds the netlink message, and sends it to the pid. + * Allocates an skb, builds the netlink message, and sends it to the port id. * No failure notifications. */ -static void audit_send_reply(int pid, int seq, int type, int done, int multi, - const void *payload, int size) +static void audit_send_reply(__u32 portid, int seq, int type, int done, + int multi, const void *payload, int size) { struct sk_buff *skb; struct task_struct *tsk; @@ -559,11 +579,12 @@ static void audit_send_reply(int pid, int seq, int type, int done, int multi, if (!reply) return; - skb = audit_make_reply(pid, seq, type, done, multi, payload, size); + skb = audit_make_reply(portid, seq, type, done, multi, payload, size); if (!skb) goto out; - reply->pid = pid; + reply->portid = portid; + reply->pid = task_pid_vnr(current); reply->skb = skb; tsk = kthread_run(audit_send_reply_thread, reply, "audit_send_reply"); @@ -663,8 +684,12 @@ static void audit_log_feature_change(int which, u32 old_feature, u32 new_feature { struct audit_buffer *ab; + if (audit_enabled == AUDIT_OFF) + return; + ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_FEATURE_CHANGE); - audit_log_format(ab, "feature=%s new=%d old=%d old_lock=%d new_lock=%d res=%d", + audit_log_task_info(ab, current); + audit_log_format(ab, "feature=%s old=%u new=%u old_lock=%u new_lock=%u res=%d", audit_feature_names[which], !!old_feature, !!new_feature, !!old_lock, !!new_lock, res); audit_log_end(ab); @@ -694,7 +719,7 @@ static int audit_set_feature(struct sk_buff *skb) old_lock = af.lock & feature; /* are we changing a locked feature? */ - if ((af.lock & feature) && (new_feature != old_feature)) { + if (old_lock && (new_feature != old_feature)) { audit_log_feature_change(i, old_feature, new_feature, old_lock, new_lock, 0); return -EPERM; @@ -732,7 +757,6 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) { u32 seq; void *data; - struct audit_status *status_get, status_set; int err; struct audit_buffer *ab; u16 msg_type = nlh->nlmsg_type; @@ -758,48 +782,70 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) data = nlmsg_data(nlh); switch (msg_type) { - case AUDIT_GET: - memset(&status_set, 0, sizeof(status_set)); - status_set.enabled = audit_enabled; - status_set.failure = audit_failure; - status_set.pid = audit_pid; - status_set.rate_limit = audit_rate_limit; - status_set.backlog_limit = audit_backlog_limit; - status_set.lost = atomic_read(&audit_lost); - status_set.backlog = skb_queue_len(&audit_skb_queue); + case AUDIT_GET: { + struct audit_status s; + memset(&s, 0, sizeof(s)); + s.enabled = audit_enabled; + s.failure = audit_failure; + s.pid = audit_pid; + s.rate_limit = audit_rate_limit; + s.backlog_limit = audit_backlog_limit; + s.lost = atomic_read(&audit_lost); + s.backlog = skb_queue_len(&audit_skb_queue); + s.version = AUDIT_VERSION_LATEST; + s.backlog_wait_time = audit_backlog_wait_time; audit_send_reply(NETLINK_CB(skb).portid, seq, AUDIT_GET, 0, 0, - &status_set, sizeof(status_set)); + &s, sizeof(s)); break; - case AUDIT_SET: - if (nlmsg_len(nlh) < sizeof(struct audit_status)) - return -EINVAL; - status_get = (struct audit_status *)data; - if (status_get->mask & AUDIT_STATUS_ENABLED) { - err = audit_set_enabled(status_get->enabled); + } + case AUDIT_SET: { + struct audit_status s; + memset(&s, 0, sizeof(s)); + /* guard against past and future API changes */ + memcpy(&s, data, min_t(size_t, sizeof(s), nlmsg_len(nlh))); + if (s.mask & AUDIT_STATUS_ENABLED) { + err = audit_set_enabled(s.enabled); if (err < 0) return err; } - if (status_get->mask & AUDIT_STATUS_FAILURE) { - err = audit_set_failure(status_get->failure); + if (s.mask & AUDIT_STATUS_FAILURE) { + err = audit_set_failure(s.failure); if (err < 0) return err; } - if (status_get->mask & AUDIT_STATUS_PID) { - int new_pid = status_get->pid; + if (s.mask & AUDIT_STATUS_PID) { + int new_pid = s.pid; + if ((!new_pid) && (task_tgid_vnr(current) != audit_pid)) + return -EACCES; if (audit_enabled != AUDIT_OFF) audit_log_config_change("audit_pid", new_pid, audit_pid, 1); audit_pid = new_pid; audit_nlk_portid = NETLINK_CB(skb).portid; + audit_sock = skb->sk; } - if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) { - err = audit_set_rate_limit(status_get->rate_limit); + if (s.mask & AUDIT_STATUS_RATE_LIMIT) { + err = audit_set_rate_limit(s.rate_limit); + if (err < 0) + return err; + } + if (s.mask & AUDIT_STATUS_BACKLOG_LIMIT) { + err = audit_set_backlog_limit(s.backlog_limit); + if (err < 0) + return err; + } + if (s.mask & AUDIT_STATUS_BACKLOG_WAIT_TIME) { + if (sizeof(s) > (size_t)nlh->nlmsg_len) + return -EINVAL; + if (s.backlog_wait_time < 0 || + s.backlog_wait_time > 10*AUDIT_BACKLOG_WAIT_TIME) + return -EINVAL; + err = audit_set_backlog_wait_time(s.backlog_wait_time); if (err < 0) return err; } - if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT) - err = audit_set_backlog_limit(status_get->backlog_limit); break; + } case AUDIT_GET_FEATURE: err = audit_get_feature(skb); if (err) @@ -817,13 +863,14 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) return 0; err = audit_filter_user(msg_type); - if (err == 1) { + if (err == 1) { /* match or error */ err = 0; if (msg_type == AUDIT_USER_TTY) { err = tty_audit_push_current(); if (err) break; } + mutex_unlock(&audit_cmd_mutex); audit_log_common_recv_msg(&ab, msg_type); if (msg_type != AUDIT_USER_TTY) audit_log_format(ab, " msg='%.*s'", @@ -839,8 +886,9 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) size--; audit_log_n_untrustedstring(ab, data, size); } - audit_set_pid(ab, NETLINK_CB(skb).portid); + audit_set_portid(ab, NETLINK_CB(skb).portid); audit_log_end(ab); + mutex_lock(&audit_cmd_mutex); } break; case AUDIT_ADD_RULE: @@ -853,11 +901,12 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) audit_log_end(ab); return -EPERM; } - /* fallthrough */ - case AUDIT_LIST_RULES: - err = audit_receive_filter(msg_type, NETLINK_CB(skb).portid, + err = audit_rule_change(msg_type, NETLINK_CB(skb).portid, seq, data, nlmsg_len(nlh)); break; + case AUDIT_LIST_RULES: + err = audit_list_rules_send(NETLINK_CB(skb).portid, seq); + break; case AUDIT_TRIM: audit_trim_trees(); audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE); @@ -939,20 +988,33 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) break; } case AUDIT_TTY_SET: { - struct audit_tty_status s; + struct audit_tty_status s, old; struct task_struct *tsk = current; + struct audit_buffer *ab; memset(&s, 0, sizeof(s)); /* guard against past and future API changes */ memcpy(&s, data, min_t(size_t, sizeof(s), nlmsg_len(nlh))); + /* check if new data is valid */ if ((s.enabled != 0 && s.enabled != 1) || (s.log_passwd != 0 && s.log_passwd != 1)) - return -EINVAL; + err = -EINVAL; spin_lock(&tsk->sighand->siglock); - tsk->signal->audit_tty = s.enabled; - tsk->signal->audit_tty_log_passwd = s.log_passwd; + old.enabled = tsk->signal->audit_tty; + old.log_passwd = tsk->signal->audit_tty_log_passwd; + if (!err) { + tsk->signal->audit_tty = s.enabled; + tsk->signal->audit_tty_log_passwd = s.log_passwd; + } spin_unlock(&tsk->sighand->siglock); + + audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE); + audit_log_format(ab, " op=tty_set old-enabled=%d new-enabled=%d" + " old-log_passwd=%d new-log_passwd=%d res=%d", + old.enabled, s.enabled, old.log_passwd, + s.log_passwd, !err); + audit_log_end(ab); break; } default: @@ -998,24 +1060,55 @@ static void audit_receive(struct sk_buff *skb) mutex_unlock(&audit_cmd_mutex); } -/* Initialize audit support at boot time. */ -static int __init audit_init(void) +static int __net_init audit_net_init(struct net *net) { - int i; struct netlink_kernel_cfg cfg = { .input = audit_receive, }; + struct audit_net *aunet = net_generic(net, audit_net_id); + + aunet->nlsk = netlink_kernel_create(net, NETLINK_AUDIT, &cfg); + if (aunet->nlsk == NULL) { + audit_panic("cannot initialize netlink socket in namespace"); + return -ENOMEM; + } + aunet->nlsk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; + return 0; +} + +static void __net_exit audit_net_exit(struct net *net) +{ + struct audit_net *aunet = net_generic(net, audit_net_id); + struct sock *sock = aunet->nlsk; + if (sock == audit_sock) { + audit_pid = 0; + audit_sock = NULL; + } + + rcu_assign_pointer(aunet->nlsk, NULL); + synchronize_net(); + netlink_kernel_release(sock); +} + +static struct pernet_operations audit_net_ops __net_initdata = { + .init = audit_net_init, + .exit = audit_net_exit, + .id = &audit_net_id, + .size = sizeof(struct audit_net), +}; + +/* Initialize audit support at boot time. */ +static int __init audit_init(void) +{ + int i; + if (audit_initialized == AUDIT_DISABLED) return 0; - printk(KERN_INFO "audit: initializing netlink socket (%s)\n", - audit_default ? "enabled" : "disabled"); - audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT, &cfg); - if (!audit_sock) - audit_panic("cannot initialize netlink socket"); - else - audit_sock->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; + pr_info("initializing netlink subsys (%s)\n", + audit_default ? "enabled" : "disabled"); + register_pernet_subsys(&audit_net_ops); skb_queue_head_init(&audit_skb_queue); skb_queue_head_init(&audit_skb_hold_queue); @@ -1039,22 +1132,32 @@ static int __init audit_enable(char *str) if (!audit_default) audit_initialized = AUDIT_DISABLED; - printk(KERN_INFO "audit: %s", audit_default ? "enabled" : "disabled"); + pr_info("%s\n", audit_default ? + "enabled (after initialization)" : "disabled (until reboot)"); - if (audit_initialized == AUDIT_INITIALIZED) { - audit_enabled = audit_default; - audit_ever_enabled |= !!audit_default; - } else if (audit_initialized == AUDIT_UNINITIALIZED) { - printk(" (after initialization)"); - } else { - printk(" (until reboot)"); + return 1; +} +__setup("audit=", audit_enable); + +/* Process kernel command-line parameter at boot time. + * audit_backlog_limit=<n> */ +static int __init audit_backlog_limit_set(char *str) +{ + u32 audit_backlog_limit_arg; + + pr_info("audit_backlog_limit: "); + if (kstrtouint(str, 0, &audit_backlog_limit_arg)) { + pr_cont("using default of %u, unable to parse %s\n", + audit_backlog_limit, str); + return 1; } - printk("\n"); + + audit_backlog_limit = audit_backlog_limit_arg; + pr_cont("%d\n", audit_backlog_limit); return 1; } - -__setup("audit=", audit_enable); +__setup("audit_backlog_limit=", audit_backlog_limit_set); static void audit_buffer_free(struct audit_buffer *ab) { @@ -1165,18 +1268,20 @@ static inline void audit_get_stamp(struct audit_context *ctx, /* * Wait for auditd to drain the queue a little */ -static void wait_for_auditd(unsigned long sleep_time) +static long wait_for_auditd(long sleep_time) { DECLARE_WAITQUEUE(wait, current); set_current_state(TASK_UNINTERRUPTIBLE); - add_wait_queue(&audit_backlog_wait, &wait); + add_wait_queue_exclusive(&audit_backlog_wait, &wait); if (audit_backlog_limit && skb_queue_len(&audit_skb_queue) > audit_backlog_limit) - schedule_timeout(sleep_time); + sleep_time = schedule_timeout(sleep_time); __set_current_state(TASK_RUNNING); remove_wait_queue(&audit_backlog_wait, &wait); + + return sleep_time; } /** @@ -1200,7 +1305,8 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, struct audit_buffer *ab = NULL; struct timespec t; unsigned int uninitialized_var(serial); - int reserve; + int reserve = 5; /* Allow atomic callers to go up to five + entries over the normal backlog limit */ unsigned long timeout_start = jiffies; if (audit_initialized != AUDIT_INITIALIZED) @@ -1209,36 +1315,37 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, if (unlikely(audit_filter_type(type))) return NULL; - if (gfp_mask & __GFP_WAIT) - reserve = 0; - else - reserve = 5; /* Allow atomic callers to go up to five - entries over the normal backlog limit */ + if (gfp_mask & __GFP_WAIT) { + if (audit_pid && audit_pid == current->pid) + gfp_mask &= ~__GFP_WAIT; + else + reserve = 0; + } while (audit_backlog_limit && skb_queue_len(&audit_skb_queue) > audit_backlog_limit + reserve) { if (gfp_mask & __GFP_WAIT && audit_backlog_wait_time) { - unsigned long sleep_time; + long sleep_time; - sleep_time = timeout_start + audit_backlog_wait_time - - jiffies; - if ((long)sleep_time > 0) { - wait_for_auditd(sleep_time); - continue; + sleep_time = timeout_start + audit_backlog_wait_time - jiffies; + if (sleep_time > 0) { + sleep_time = wait_for_auditd(sleep_time); + if (sleep_time > 0) + continue; } } if (audit_rate_check() && printk_ratelimit()) - printk(KERN_WARNING - "audit: audit_backlog=%d > " - "audit_backlog_limit=%d\n", - skb_queue_len(&audit_skb_queue), - audit_backlog_limit); + pr_warn("audit_backlog=%d > audit_backlog_limit=%d\n", + skb_queue_len(&audit_skb_queue), + audit_backlog_limit); audit_log_lost("backlog limit exceeded"); audit_backlog_wait_time = audit_backlog_wait_overflow; wake_up(&audit_backlog_wait); return NULL; } + audit_backlog_wait_time = AUDIT_BACKLOG_WAIT_TIME; + ab = audit_buffer_alloc(ctx, gfp_mask, type); if (!ab) { audit_log_lost("out of memory in audit_log_start"); @@ -1356,7 +1463,6 @@ void audit_log_n_hex(struct audit_buffer *ab, const unsigned char *buf, int i, avail, new_len; unsigned char *ptr; struct sk_buff *skb; - static const unsigned char *hex = "0123456789ABCDEF"; if (!ab) return; @@ -1374,10 +1480,8 @@ void audit_log_n_hex(struct audit_buffer *ab, const unsigned char *buf, } ptr = skb_tail_pointer(skb); - for (i=0; i<len; i++) { - *ptr++ = hex[(buf[i] & 0xF0)>>4]; /* Upper nibble */ - *ptr++ = hex[buf[i] & 0x0F]; /* Lower nibble */ - } + for (i = 0; i < len; i++) + ptr = hex_byte_pack_upper(ptr, buf[i]); *ptr = 0; skb_put(skb, len << 1); /* new string is twice the old string */ } @@ -1491,7 +1595,7 @@ void audit_log_d_path(struct audit_buffer *ab, const char *prefix, void audit_log_session_info(struct audit_buffer *ab) { - u32 sessionid = audit_get_sessionid(current); + unsigned int sessionid = audit_get_sessionid(current); uid_t auid = from_kuid(&init_user_ns, audit_get_loginuid(current)); audit_log_format(ab, " auid=%u ses=%u", auid, sessionid); @@ -1716,7 +1820,7 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) audit_log_format(ab, " ppid=%ld pid=%d auid=%u uid=%u gid=%u" " euid=%u suid=%u fsuid=%u" - " egid=%u sgid=%u fsgid=%u ses=%u tty=%s", + " egid=%u sgid=%u fsgid=%u tty=%s ses=%u", sys_getppid(), tsk->pid, from_kuid(&init_user_ns, audit_get_loginuid(tsk)), @@ -1728,7 +1832,7 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) from_kgid(&init_user_ns, cred->egid), from_kgid(&init_user_ns, cred->sgid), from_kgid(&init_user_ns, cred->fsgid), - audit_get_sessionid(tsk), tty); + tty, audit_get_sessionid(tsk)); get_task_comm(name, tsk); audit_log_format(ab, " comm="); @@ -1739,7 +1843,8 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) if (mm->exe_file) audit_log_d_path(ab, " exe=", &mm->exe_file->f_path); up_read(&mm->mmap_sem); - } + } else + audit_log_format(ab, " exe=(null)"); audit_log_task_context(ab); } EXPORT_SYMBOL(audit_log_task_info); diff --git a/kernel/audit.h b/kernel/audit.h index b779642..57cc64d 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -209,7 +209,7 @@ struct audit_context { #endif }; -extern int audit_ever_enabled; +extern u32 audit_ever_enabled; extern void audit_copy_inode(struct audit_names *name, const struct dentry *dentry, @@ -240,18 +240,23 @@ extern int audit_uid_comparator(kuid_t left, u32 op, kuid_t right); extern int audit_gid_comparator(kgid_t left, u32 op, kgid_t right); extern int parent_len(const char *path); extern int audit_compare_dname_path(const char *dname, const char *path, int plen); -extern struct sk_buff * audit_make_reply(int pid, int seq, int type, - int done, int multi, - const void *payload, int size); +extern struct sk_buff *audit_make_reply(__u32 portid, int seq, int type, + int done, int multi, + const void *payload, int size); extern void audit_panic(const char *message); struct audit_netlink_list { - int pid; + __u32 portid; + pid_t pid; struct sk_buff_head q; }; int audit_send_list(void *); +struct audit_net { + struct sock *nlsk; +}; + extern int selinux_audit_rule_update(void); extern struct mutex audit_filter_mutex; diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 43c307d..67ccf0e 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -912,12 +912,13 @@ static void evict_chunk(struct audit_chunk *chunk) } static int audit_tree_handle_event(struct fsnotify_group *group, + struct inode *to_tell, struct fsnotify_mark *inode_mark, - struct fsnotify_mark *vfsmonut_mark, - struct fsnotify_event *event) + struct fsnotify_mark *vfsmount_mark, + u32 mask, void *data, int data_type, + const unsigned char *file_name) { - BUG(); - return -EOPNOTSUPP; + return 0; } static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify_group *group) @@ -933,19 +934,8 @@ static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify BUG_ON(atomic_read(&entry->refcnt) < 1); } -static bool audit_tree_send_event(struct fsnotify_group *group, struct inode *inode, - struct fsnotify_mark *inode_mark, - struct fsnotify_mark *vfsmount_mark, - __u32 mask, void *data, int data_type) -{ - return false; -} - static const struct fsnotify_ops audit_tree_ops = { .handle_event = audit_tree_handle_event, - .should_send_event = audit_tree_send_event, - .free_group_priv = NULL, - .free_event_priv = NULL, .freeing_mark = audit_tree_freeing_mark, }; diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 22831c4..2596fac 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -465,35 +465,27 @@ void audit_remove_watch_rule(struct audit_krule *krule) } } -static bool audit_watch_should_send_event(struct fsnotify_group *group, struct inode *inode, - struct fsnotify_mark *inode_mark, - struct fsnotify_mark *vfsmount_mark, - __u32 mask, void *data, int data_type) -{ - return true; -} - /* Update watch data in audit rules based on fsnotify events. */ static int audit_watch_handle_event(struct fsnotify_group *group, + struct inode *to_tell, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, - struct fsnotify_event *event) + u32 mask, void *data, int data_type, + const unsigned char *dname) { struct inode *inode; - __u32 mask = event->mask; - const char *dname = event->file_name; struct audit_parent *parent; parent = container_of(inode_mark, struct audit_parent, mark); BUG_ON(group != audit_watch_group); - switch (event->data_type) { + switch (data_type) { case (FSNOTIFY_EVENT_PATH): - inode = event->path.dentry->d_inode; + inode = ((struct path *)data)->dentry->d_inode; break; case (FSNOTIFY_EVENT_INODE): - inode = event->inode; + inode = (struct inode *)data; break; default: BUG(); @@ -512,11 +504,7 @@ static int audit_watch_handle_event(struct fsnotify_group *group, } static const struct fsnotify_ops audit_watch_fsnotify_ops = { - .should_send_event = audit_watch_should_send_event, .handle_event = audit_watch_handle_event, - .free_group_priv = NULL, - .freeing_mark = NULL, - .free_event_priv = NULL, }; static int __init audit_watch_init(void) diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 51f3fd4..14a78cc 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -972,7 +972,7 @@ out: } /* List rules using struct audit_rule_data. */ -static void audit_list_rules(int pid, int seq, struct sk_buff_head *q) +static void audit_list_rules(__u32 portid, int seq, struct sk_buff_head *q) { struct sk_buff *skb; struct audit_krule *r; @@ -987,14 +987,15 @@ static void audit_list_rules(int pid, int seq, struct sk_buff_head *q) data = audit_krule_to_data(r); if (unlikely(!data)) break; - skb = audit_make_reply(pid, seq, AUDIT_LIST_RULES, 0, 1, - data, sizeof(*data) + data->buflen); + skb = audit_make_reply(portid, seq, AUDIT_LIST_RULES, + 0, 1, data, + sizeof(*data) + data->buflen); if (skb) skb_queue_tail(q, skb); kfree(data); } } - skb = audit_make_reply(pid, seq, AUDIT_LIST_RULES, 1, 1, NULL, 0); + skb = audit_make_reply(portid, seq, AUDIT_LIST_RULES, 1, 1, NULL, 0); if (skb) skb_queue_tail(q, skb); } @@ -1004,7 +1005,7 @@ static void audit_log_rule_change(char *action, struct audit_krule *rule, int re { struct audit_buffer *ab; uid_t loginuid = from_kuid(&init_user_ns, audit_get_loginuid(current)); - u32 sessionid = audit_get_sessionid(current); + unsigned int sessionid = audit_get_sessionid(current); if (!audit_enabled) return; @@ -1022,45 +1023,20 @@ static void audit_log_rule_change(char *action, struct audit_krule *rule, int re } /** - * audit_receive_filter - apply all rules to the specified message type + * audit_rule_change - apply all rules to the specified message type * @type: audit message type - * @pid: target pid for netlink audit messages + * @portid: target port id for netlink audit messages * @seq: netlink audit message sequence (serial) number * @data: payload data * @datasz: size of payload data */ -int audit_receive_filter(int type, int pid, int seq, void *data, size_t datasz) +int audit_rule_change(int type, __u32 portid, int seq, void *data, + size_t datasz) { - struct task_struct *tsk; - struct audit_netlink_list *dest; int err = 0; struct audit_entry *entry; switch (type) { - case AUDIT_LIST_RULES: - /* We can't just spew out the rules here because we might fill - * the available socket buffer space and deadlock waiting for - * auditctl to read from it... which isn't ever going to - * happen if we're actually running in the context of auditctl - * trying to _send_ the stuff */ - - dest = kmalloc(sizeof(struct audit_netlink_list), GFP_KERNEL); - if (!dest) - return -ENOMEM; - dest->pid = pid; - skb_queue_head_init(&dest->q); - - mutex_lock(&audit_filter_mutex); - audit_list_rules(pid, seq, &dest->q); - mutex_unlock(&audit_filter_mutex); - - tsk = kthread_run(audit_send_list, dest, "audit_send_list"); - if (IS_ERR(tsk)) { - skb_queue_purge(&dest->q); - kfree(dest); - err = PTR_ERR(tsk); - } - break; case AUDIT_ADD_RULE: entry = audit_data_to_entry(data, datasz); if (IS_ERR(entry)) @@ -1087,6 +1063,44 @@ int audit_receive_filter(int type, int pid, int seq, void *data, size_t datasz) return err; } +/** + * audit_list_rules_send - list the audit rules + * @portid: target portid for netlink audit messages + * @seq: netlink audit message sequence (serial) number + */ +int audit_list_rules_send(__u32 portid, int seq) +{ + struct task_struct *tsk; + struct audit_netlink_list *dest; + int err = 0; + + /* We can't just spew out the rules here because we might fill + * the available socket buffer space and deadlock waiting for + * auditctl to read from it... which isn't ever going to + * happen if we're actually running in the context of auditctl + * trying to _send_ the stuff */ + + dest = kmalloc(sizeof(struct audit_netlink_list), GFP_KERNEL); + if (!dest) + return -ENOMEM; + dest->portid = portid; + dest->pid = task_pid_vnr(current); + skb_queue_head_init(&dest->q); + + mutex_lock(&audit_filter_mutex); + audit_list_rules(portid, seq, &dest->q); + mutex_unlock(&audit_filter_mutex); + + tsk = kthread_run(audit_send_list, dest, "audit_send_list"); + if (IS_ERR(tsk)) { + skb_queue_purge(&dest->q); + kfree(dest); + err = PTR_ERR(tsk); + } + + return err; +} + int audit_comparator(u32 left, u32 op, u32 right) { switch (op) { @@ -1276,19 +1290,22 @@ int audit_filter_user(int type) { enum audit_state state = AUDIT_DISABLED; struct audit_entry *e; - int ret = 1; + int rc, ret; + + ret = 1; /* Audit by default */ rcu_read_lock(); list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_USER], list) { - if (audit_filter_user_rules(&e->rule, type, &state)) { - if (state == AUDIT_DISABLED) + rc = audit_filter_user_rules(&e->rule, type, &state); + if (rc) { + if (rc > 0 && state == AUDIT_DISABLED) ret = 0; break; } } rcu_read_unlock(); - return ret; /* Audit by default */ + return ret; } int audit_filter_type(int type) diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 90594c9..10176cd 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1969,18 +1969,24 @@ static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid, int rc) { struct audit_buffer *ab; - uid_t uid, ologinuid, nloginuid; + uid_t uid, oldloginuid, loginuid; + + if (!audit_enabled) + return; uid = from_kuid(&init_user_ns, task_uid(current)); - ologinuid = from_kuid(&init_user_ns, koldloginuid); - nloginuid = from_kuid(&init_user_ns, kloginuid), + oldloginuid = from_kuid(&init_user_ns, koldloginuid); + loginuid = from_kuid(&init_user_ns, kloginuid), ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN); if (!ab) return; - audit_log_format(ab, "pid=%d uid=%u old auid=%u new auid=%u old " - "ses=%u new ses=%u res=%d", current->pid, uid, ologinuid, - nloginuid, oldsessionid, sessionid, !rc); + audit_log_format(ab, "pid=%d uid=%u" + " old-auid=%u new-auid=%u old-ses=%u new-ses=%u" + " res=%d", + current->pid, uid, + oldloginuid, loginuid, oldsessionid, sessionid, + !rc); audit_log_end(ab); } @@ -2008,7 +2014,7 @@ int audit_set_loginuid(kuid_t loginuid) /* are we setting or clearing? */ if (uid_valid(loginuid)) - sessionid = atomic_inc_return(&session_id); + sessionid = (unsigned int)atomic_inc_return(&session_id); task->sessionid = sessionid; task->loginuid = loginuid; @@ -2321,18 +2327,16 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm, /** * __audit_log_capset - store information about the arguments to the capset syscall - * @pid: target pid of the capset call * @new: the new credentials * @old: the old (current) credentials * * Record the aguments userspace sent to sys_capset for later printing by the * audit system if applicable */ -void __audit_log_capset(pid_t pid, - const struct cred *new, const struct cred *old) +void __audit_log_capset(const struct cred *new, const struct cred *old) { struct audit_context *context = current->audit_context; - context->capset.pid = pid; + context->capset.pid = task_pid_nr(current); context->capset.cap.effective = new->cap_effective; context->capset.cap.inheritable = new->cap_effective; context->capset.cap.permitted = new->cap_permitted; @@ -2352,6 +2356,7 @@ static void audit_log_task(struct audit_buffer *ab) kuid_t auid, uid; kgid_t gid; unsigned int sessionid; + struct mm_struct *mm = current->mm; auid = audit_get_loginuid(current); sessionid = audit_get_sessionid(current); @@ -2365,15 +2370,15 @@ static void audit_log_task(struct audit_buffer *ab) audit_log_task_context(ab); audit_log_format(ab, " pid=%d comm=", current->pid); audit_log_untrustedstring(ab, current->comm); + if (mm) { + down_read(&mm->mmap_sem); + if (mm->exe_file) + audit_log_d_path(ab, " exe=", &mm->exe_file->f_path); + up_read(&mm->mmap_sem); + } else + audit_log_format(ab, " exe=(null)"); } -static void audit_log_abend(struct audit_buffer *ab, char *reason, long signr) -{ - audit_log_task(ab); - audit_log_format(ab, " reason="); - audit_log_string(ab, reason); - audit_log_format(ab, " sig=%ld", signr); -} /** * audit_core_dumps - record information about processes that end abnormally * @signr: signal value @@ -2394,7 +2399,8 @@ void audit_core_dumps(long signr) ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND); if (unlikely(!ab)) return; - audit_log_abend(ab, "memory violation", signr); + audit_log_task(ab); + audit_log_format(ab, " sig=%ld", signr); audit_log_end(ab); } diff --git a/kernel/capability.c b/kernel/capability.c index 4e66bf9..34019c5 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -277,7 +277,7 @@ SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data) if (ret < 0) goto error; - audit_log_capset(pid, new, current_cred()); + audit_log_capset(new, current_cred()); return commit_creds(new); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index bc1dcab..e2f46ba 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -41,7 +41,6 @@ #include <linux/rcupdate.h> #include <linux/sched.h> #include <linux/backing-dev.h> -#include <linux/seq_file.h> #include <linux/slab.h> #include <linux/magic.h> #include <linux/spinlock.h> @@ -56,15 +55,20 @@ #include <linux/pid_namespace.h> #include <linux/idr.h> #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ -#include <linux/eventfd.h> -#include <linux/poll.h> #include <linux/flex_array.h> /* used in cgroup_attach_task */ #include <linux/kthread.h> -#include <linux/file.h> #include <linux/atomic.h> /* + * pidlists linger the following amount before being destroyed. The goal + * is avoiding frequent destruction in the middle of consecutive read calls + * Expiring in the middle is a performance problem not a correctness one. + * 1 sec should be enough. + */ +#define CGROUP_PIDLIST_DESTROY_DELAY HZ + +/* * cgroup_mutex is the master lock. Any modification to cgroup or its * hierarchy must be performed while holding it. * @@ -89,6 +93,19 @@ static DEFINE_MUTEX(cgroup_mutex); static DEFINE_MUTEX(cgroup_root_mutex); +#define cgroup_assert_mutex_or_rcu_locked() \ + rcu_lockdep_assert(rcu_read_lock_held() || \ + lockdep_is_held(&cgroup_mutex), \ + "cgroup_mutex or RCU read lock required"); + +#ifdef CONFIG_LOCKDEP +#define cgroup_assert_mutex_or_root_locked() \ + WARN_ON_ONCE(debug_locks && (!lockdep_is_held(&cgroup_mutex) && \ + !lockdep_is_held(&cgroup_root_mutex))) +#else +#define cgroup_assert_mutex_or_root_locked() do { } while (0) +#endif + /* * cgroup destruction makes heavy use of work items and there can be a lot * of concurrent destructions. Use a separate workqueue so that cgroup @@ -98,6 +115,12 @@ static DEFINE_MUTEX(cgroup_root_mutex); static struct workqueue_struct *cgroup_destroy_wq; /* + * pidlist destructions need to be flushed on cgroup destruction. Use a + * separate workqueue as flush domain. + */ +static struct workqueue_struct *cgroup_pidlist_destroy_wq; + +/* * Generate an array of cgroup subsystem pointers. At boot time, this is * populated with the built in subsystems, and modular subsystems are * registered after that. The mutable section of this array is protected by @@ -119,49 +142,6 @@ static struct cgroupfs_root cgroup_dummy_root; /* dummy_top is a shorthand for the dummy hierarchy's top cgroup */ static struct cgroup * const cgroup_dummy_top = &cgroup_dummy_root.top_cgroup; -/* - * cgroupfs file entry, pointed to from leaf dentry->d_fsdata. - */ -struct cfent { - struct list_head node; - struct dentry *dentry; - struct cftype *type; - struct cgroup_subsys_state *css; - - /* file xattrs */ - struct simple_xattrs xattrs; -}; - -/* - * cgroup_event represents events which userspace want to receive. - */ -struct cgroup_event { - /* - * css which the event belongs to. - */ - struct cgroup_subsys_state *css; - /* - * Control file which the event associated. - */ - struct cftype *cft; - /* - * eventfd to signal userspace about the event. - */ - struct eventfd_ctx *eventfd; - /* - * Each of these stored in a list by the cgroup. - */ - struct list_head list; - /* - * All fields below needed to unregister event when - * userspace closes eventfd. - */ - poll_table pt; - wait_queue_head_t *wqh; - wait_queue_t wait; - struct work_struct remove; -}; - /* The list of hierarchy roots */ static LIST_HEAD(cgroup_roots); @@ -200,6 +180,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp); static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], bool is_add); static int cgroup_file_release(struct inode *inode, struct file *file); +static void cgroup_pidlist_destroy_all(struct cgroup *cgrp); /** * cgroup_css - obtain a cgroup's css for the specified subsystem @@ -262,16 +243,32 @@ static int notify_on_release(const struct cgroup *cgrp) } /** + * for_each_css - iterate all css's of a cgroup + * @css: the iteration cursor + * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end + * @cgrp: the target cgroup to iterate css's of + * + * Should be called under cgroup_mutex. + */ +#define for_each_css(css, ssid, cgrp) \ + for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ + if (!((css) = rcu_dereference_check( \ + (cgrp)->subsys[(ssid)], \ + lockdep_is_held(&cgroup_mutex)))) { } \ + else + +/** * for_each_subsys - iterate all loaded cgroup subsystems * @ss: the iteration cursor - * @i: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end + * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end * - * Should be called under cgroup_mutex. + * Iterates through all loaded subsystems. Should be called under + * cgroup_mutex or cgroup_root_mutex. */ -#define for_each_subsys(ss, i) \ - for ((i) = 0; (i) < CGROUP_SUBSYS_COUNT; (i)++) \ - if (({ lockdep_assert_held(&cgroup_mutex); \ - !((ss) = cgroup_subsys[i]); })) { } \ +#define for_each_subsys(ss, ssid) \ + for (({ cgroup_assert_mutex_or_root_locked(); (ssid) = 0; }); \ + (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ + if (!((ss) = cgroup_subsys[(ssid)])) { } \ else /** @@ -286,10 +283,6 @@ static int notify_on_release(const struct cgroup *cgrp) for ((i) = 0; (i) < CGROUP_BUILTIN_SUBSYS_COUNT && \ (((ss) = cgroup_subsys[i]) || true); (i)++) -/* iterate each subsystem attached to a hierarchy */ -#define for_each_root_subsys(root, ss) \ - list_for_each_entry((ss), &(root)->subsys_list, sibling) - /* iterate across the active hierarchies */ #define for_each_active_root(root) \ list_for_each_entry((root), &cgroup_roots, root_list) @@ -863,11 +856,7 @@ static void cgroup_free_fn(struct work_struct *work) */ deactivate_super(cgrp->root->sb); - /* - * if we're getting rid of the cgroup, refcount should ensure - * that there are no pidlists left. - */ - BUG_ON(!list_empty(&cgrp->pidlists)); + cgroup_pidlist_destroy_all(cgrp); simple_xattrs_free(&cgrp->xattrs); @@ -1050,7 +1039,6 @@ static int rebind_subsystems(struct cgroupfs_root *root, cgroup_css(cgroup_dummy_top, ss)); cgroup_css(cgrp, ss)->cgroup = cgrp; - list_move(&ss->sibling, &root->subsys_list); ss->root = root; if (ss->bind) ss->bind(cgroup_css(cgrp, ss)); @@ -1069,7 +1057,6 @@ static int rebind_subsystems(struct cgroupfs_root *root, RCU_INIT_POINTER(cgrp->subsys[i], NULL); cgroup_subsys[i]->root = &cgroup_dummy_root; - list_move(&ss->sibling, &cgroup_dummy_root.subsys_list); /* subsystem is now free - drop reference on module */ module_put(ss->module); @@ -1096,10 +1083,12 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry) { struct cgroupfs_root *root = dentry->d_sb->s_fs_info; struct cgroup_subsys *ss; + int ssid; mutex_lock(&cgroup_root_mutex); - for_each_root_subsys(root, ss) - seq_printf(seq, ",%s", ss->name); + for_each_subsys(ss, ssid) + if (root->subsys_mask & (1 << ssid)) + seq_printf(seq, ",%s", ss->name); if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) seq_puts(seq, ",sane_behavior"); if (root->flags & CGRP_ROOT_NOPREFIX) @@ -1362,8 +1351,6 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) INIT_LIST_HEAD(&cgrp->pidlists); mutex_init(&cgrp->pidlist_mutex); cgrp->dummy_css.cgroup = cgrp; - INIT_LIST_HEAD(&cgrp->event_list); - spin_lock_init(&cgrp->event_list_lock); simple_xattrs_init(&cgrp->xattrs); } @@ -1371,7 +1358,6 @@ static void init_cgroup_root(struct cgroupfs_root *root) { struct cgroup *cgrp = &root->top_cgroup; - INIT_LIST_HEAD(&root->subsys_list); INIT_LIST_HEAD(&root->root_list); root->number_of_cgroups = 1; cgrp->root = root; @@ -1693,7 +1679,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, return ERR_PTR(ret); } -static void cgroup_kill_sb(struct super_block *sb) { +static void cgroup_kill_sb(struct super_block *sb) +{ struct cgroupfs_root *root = sb->s_fs_info; struct cgroup *cgrp = &root->top_cgroup; struct cgrp_cset_link *link, *tmp_link; @@ -1976,8 +1963,8 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, bool threadgroup) { int retval, i, group_size; - struct cgroup_subsys *ss, *failed_ss = NULL; struct cgroupfs_root *root = cgrp->root; + struct cgroup_subsys_state *css, *failed_css = NULL; /* threadgroup list cursor and array */ struct task_struct *leader = tsk; struct task_and_cgroup *tc; @@ -2050,13 +2037,11 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, /* * step 1: check that we can legitimately attach to the cgroup. */ - for_each_root_subsys(root, ss) { - struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); - - if (ss->can_attach) { - retval = ss->can_attach(css, &tset); + for_each_css(css, i, cgrp) { + if (css->ss->can_attach) { + retval = css->ss->can_attach(css, &tset); if (retval) { - failed_ss = ss; + failed_css = css; goto out_cancel_attach; } } @@ -2092,12 +2077,9 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, /* * step 4: do subsystem attach callbacks. */ - for_each_root_subsys(root, ss) { - struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); - - if (ss->attach) - ss->attach(css, &tset); - } + for_each_css(css, i, cgrp) + if (css->ss->attach) + css->ss->attach(css, &tset); /* * step 5: success! and cleanup @@ -2114,13 +2096,11 @@ out_put_css_set_refs: } out_cancel_attach: if (retval) { - for_each_root_subsys(root, ss) { - struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); - - if (ss == failed_ss) + for_each_css(css, i, cgrp) { + if (css == failed_css) break; - if (ss->cancel_attach) - ss->cancel_attach(css, &tset); + if (css->ss->cancel_attach) + css->ss->cancel_attach(css, &tset); } } out_free_group_list: @@ -2148,7 +2128,7 @@ retry_find_task: tsk = find_task_by_vpid(pid); if (!tsk) { rcu_read_unlock(); - ret= -ESRCH; + ret = -ESRCH; goto out_unlock_cgroup; } /* @@ -2260,10 +2240,9 @@ static int cgroup_release_agent_write(struct cgroup_subsys_state *css, return 0; } -static int cgroup_release_agent_show(struct cgroup_subsys_state *css, - struct cftype *cft, struct seq_file *seq) +static int cgroup_release_agent_show(struct seq_file *seq, void *v) { - struct cgroup *cgrp = css->cgroup; + struct cgroup *cgrp = seq_css(seq)->cgroup; if (!cgroup_lock_live_group(cgrp)) return -ENODEV; @@ -2273,174 +2252,129 @@ static int cgroup_release_agent_show(struct cgroup_subsys_state *css, return 0; } -static int cgroup_sane_behavior_show(struct cgroup_subsys_state *css, - struct cftype *cft, struct seq_file *seq) +static int cgroup_sane_behavior_show(struct seq_file *seq, void *v) { - seq_printf(seq, "%d\n", cgroup_sane_behavior(css->cgroup)); + struct cgroup *cgrp = seq_css(seq)->cgroup; + + seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp)); return 0; } /* A buffer size big enough for numbers or short strings */ #define CGROUP_LOCAL_BUFFER_SIZE 64 -static ssize_t cgroup_write_X64(struct cgroup_subsys_state *css, - struct cftype *cft, struct file *file, - const char __user *userbuf, size_t nbytes, - loff_t *unused_ppos) +static ssize_t cgroup_file_write(struct file *file, const char __user *userbuf, + size_t nbytes, loff_t *ppos) { - char buffer[CGROUP_LOCAL_BUFFER_SIZE]; - int retval = 0; - char *end; + struct cfent *cfe = __d_cfe(file->f_dentry); + struct cftype *cft = __d_cft(file->f_dentry); + struct cgroup_subsys_state *css = cfe->css; + size_t max_bytes = cft->max_write_len ?: CGROUP_LOCAL_BUFFER_SIZE - 1; + char *buf; + int ret; - if (!nbytes) - return -EINVAL; - if (nbytes >= sizeof(buffer)) + if (nbytes >= max_bytes) return -E2BIG; - if (copy_from_user(buffer, userbuf, nbytes)) - return -EFAULT; - buffer[nbytes] = 0; /* nul-terminate */ - if (cft->write_u64) { - u64 val = simple_strtoull(strstrip(buffer), &end, 0); - if (*end) - return -EINVAL; - retval = cft->write_u64(css, cft, val); + buf = kmalloc(nbytes + 1, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + if (copy_from_user(buf, userbuf, nbytes)) { + ret = -EFAULT; + goto out_free; + } + + buf[nbytes] = '\0'; + + if (cft->write_string) { + ret = cft->write_string(css, cft, strstrip(buf)); + } else if (cft->write_u64) { + unsigned long long v; + ret = kstrtoull(buf, 0, &v); + if (!ret) + ret = cft->write_u64(css, cft, v); + } else if (cft->write_s64) { + long long v; + ret = kstrtoll(buf, 0, &v); + if (!ret) + ret = cft->write_s64(css, cft, v); + } else if (cft->trigger) { + ret = cft->trigger(css, (unsigned int)cft->private); } else { - s64 val = simple_strtoll(strstrip(buffer), &end, 0); - if (*end) - return -EINVAL; - retval = cft->write_s64(css, cft, val); + ret = -EINVAL; } - if (!retval) - retval = nbytes; - return retval; +out_free: + kfree(buf); + return ret ?: nbytes; } -static ssize_t cgroup_write_string(struct cgroup_subsys_state *css, - struct cftype *cft, struct file *file, - const char __user *userbuf, size_t nbytes, - loff_t *unused_ppos) +/* + * seqfile ops/methods for returning structured data. Currently just + * supports string->u64 maps, but can be extended in future. + */ + +static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos) { - char local_buffer[CGROUP_LOCAL_BUFFER_SIZE]; - int retval = 0; - size_t max_bytes = cft->max_write_len; - char *buffer = local_buffer; + struct cftype *cft = seq_cft(seq); - if (!max_bytes) - max_bytes = sizeof(local_buffer) - 1; - if (nbytes >= max_bytes) - return -E2BIG; - /* Allocate a dynamic buffer if we need one */ - if (nbytes >= sizeof(local_buffer)) { - buffer = kmalloc(nbytes + 1, GFP_KERNEL); - if (buffer == NULL) - return -ENOMEM; - } - if (nbytes && copy_from_user(buffer, userbuf, nbytes)) { - retval = -EFAULT; - goto out; + if (cft->seq_start) { + return cft->seq_start(seq, ppos); + } else { + /* + * The same behavior and code as single_open(). Returns + * !NULL if pos is at the beginning; otherwise, NULL. + */ + return NULL + !*ppos; } - - buffer[nbytes] = 0; /* nul-terminate */ - retval = cft->write_string(css, cft, strstrip(buffer)); - if (!retval) - retval = nbytes; -out: - if (buffer != local_buffer) - kfree(buffer); - return retval; } -static ssize_t cgroup_file_write(struct file *file, const char __user *buf, - size_t nbytes, loff_t *ppos) +static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos) { - struct cfent *cfe = __d_cfe(file->f_dentry); - struct cftype *cft = __d_cft(file->f_dentry); - struct cgroup_subsys_state *css = cfe->css; + struct cftype *cft = seq_cft(seq); - if (cft->write) - return cft->write(css, cft, file, buf, nbytes, ppos); - if (cft->write_u64 || cft->write_s64) - return cgroup_write_X64(css, cft, file, buf, nbytes, ppos); - if (cft->write_string) - return cgroup_write_string(css, cft, file, buf, nbytes, ppos); - if (cft->trigger) { - int ret = cft->trigger(css, (unsigned int)cft->private); - return ret ? ret : nbytes; + if (cft->seq_next) { + return cft->seq_next(seq, v, ppos); + } else { + /* + * The same behavior and code as single_open(), always + * terminate after the initial read. + */ + ++*ppos; + return NULL; } - return -EINVAL; } -static ssize_t cgroup_read_u64(struct cgroup_subsys_state *css, - struct cftype *cft, struct file *file, - char __user *buf, size_t nbytes, loff_t *ppos) +static void cgroup_seqfile_stop(struct seq_file *seq, void *v) { - char tmp[CGROUP_LOCAL_BUFFER_SIZE]; - u64 val = cft->read_u64(css, cft); - int len = sprintf(tmp, "%llu\n", (unsigned long long) val); + struct cftype *cft = seq_cft(seq); - return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); + if (cft->seq_stop) + cft->seq_stop(seq, v); } -static ssize_t cgroup_read_s64(struct cgroup_subsys_state *css, - struct cftype *cft, struct file *file, - char __user *buf, size_t nbytes, loff_t *ppos) +static int cgroup_seqfile_show(struct seq_file *m, void *arg) { - char tmp[CGROUP_LOCAL_BUFFER_SIZE]; - s64 val = cft->read_s64(css, cft); - int len = sprintf(tmp, "%lld\n", (long long) val); + struct cftype *cft = seq_cft(m); + struct cgroup_subsys_state *css = seq_css(m); - return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); -} + if (cft->seq_show) + return cft->seq_show(m, arg); -static ssize_t cgroup_file_read(struct file *file, char __user *buf, - size_t nbytes, loff_t *ppos) -{ - struct cfent *cfe = __d_cfe(file->f_dentry); - struct cftype *cft = __d_cft(file->f_dentry); - struct cgroup_subsys_state *css = cfe->css; - - if (cft->read) - return cft->read(css, cft, file, buf, nbytes, ppos); if (cft->read_u64) - return cgroup_read_u64(css, cft, file, buf, nbytes, ppos); - if (cft->read_s64) - return cgroup_read_s64(css, cft, file, buf, nbytes, ppos); - return -EINVAL; -} - -/* - * seqfile ops/methods for returning structured data. Currently just - * supports string->u64 maps, but can be extended in future. - */ - -static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value) -{ - struct seq_file *sf = cb->state; - return seq_printf(sf, "%s %llu\n", key, (unsigned long long)value); -} - -static int cgroup_seqfile_show(struct seq_file *m, void *arg) -{ - struct cfent *cfe = m->private; - struct cftype *cft = cfe->type; - struct cgroup_subsys_state *css = cfe->css; - - if (cft->read_map) { - struct cgroup_map_cb cb = { - .fill = cgroup_map_add, - .state = m, - }; - return cft->read_map(css, cft, &cb); - } - return cft->read_seq_string(css, cft, m); + seq_printf(m, "%llu\n", cft->read_u64(css, cft)); + else if (cft->read_s64) + seq_printf(m, "%lld\n", cft->read_s64(css, cft)); + else + return -EINVAL; + return 0; } -static const struct file_operations cgroup_seqfile_operations = { - .read = seq_read, - .write = cgroup_file_write, - .llseek = seq_lseek, - .release = cgroup_file_release, +static struct seq_operations cgroup_seq_operations = { + .start = cgroup_seqfile_start, + .next = cgroup_seqfile_next, + .stop = cgroup_seqfile_stop, + .show = cgroup_seqfile_show, }; static int cgroup_file_open(struct inode *inode, struct file *file) @@ -2449,6 +2383,7 @@ static int cgroup_file_open(struct inode *inode, struct file *file) struct cftype *cft = __d_cft(file->f_dentry); struct cgroup *cgrp = __d_cgrp(cfe->dentry->d_parent); struct cgroup_subsys_state *css; + struct cgroup_open_file *of; int err; err = generic_file_open(inode, file); @@ -2478,32 +2413,26 @@ static int cgroup_file_open(struct inode *inode, struct file *file) WARN_ON_ONCE(cfe->css && cfe->css != css); cfe->css = css; - if (cft->read_map || cft->read_seq_string) { - file->f_op = &cgroup_seqfile_operations; - err = single_open(file, cgroup_seqfile_show, cfe); - } else if (cft->open) { - err = cft->open(inode, file); + of = __seq_open_private(file, &cgroup_seq_operations, + sizeof(struct cgroup_open_file)); + if (of) { + of->cfe = cfe; + return 0; } - if (css->ss && err) + if (css->ss) css_put(css); - return err; + return -ENOMEM; } static int cgroup_file_release(struct inode *inode, struct file *file) { struct cfent *cfe = __d_cfe(file->f_dentry); - struct cftype *cft = __d_cft(file->f_dentry); struct cgroup_subsys_state *css = cfe->css; - int ret = 0; - if (cft->release) - ret = cft->release(inode, file); if (css->ss) css_put(css); - if (file->f_op == &cgroup_seqfile_operations) - single_release(inode, file); - return ret; + return seq_release_private(inode, file); } /* @@ -2614,7 +2543,7 @@ static ssize_t cgroup_listxattr(struct dentry *dentry, char *buf, size_t size) } static const struct file_operations cgroup_file_operations = { - .read = cgroup_file_read, + .read = seq_read, .write = cgroup_file_write, .llseek = generic_file_llseek, .open = cgroup_file_open, @@ -2639,16 +2568,6 @@ static const struct inode_operations cgroup_dir_inode_operations = { .removexattr = cgroup_removexattr, }; -/* - * Check if a file is a control file - */ -static inline struct cftype *__file_cft(struct file *file) -{ - if (file_inode(file)->i_fop != &cgroup_file_operations) - return ERR_PTR(-EINVAL); - return __d_cft(file->f_dentry); -} - static int cgroup_create_file(struct dentry *dentry, umode_t mode, struct super_block *sb) { @@ -2706,12 +2625,11 @@ static umode_t cgroup_file_mode(const struct cftype *cft) if (cft->mode) return cft->mode; - if (cft->read || cft->read_u64 || cft->read_s64 || - cft->read_map || cft->read_seq_string) + if (cft->read_u64 || cft->read_s64 || cft->seq_show) mode |= S_IRUGO; - if (cft->write || cft->write_u64 || cft->write_s64 || - cft->write_string || cft->trigger) + if (cft->write_u64 || cft->write_s64 || cft->write_string || + cft->trigger) mode |= S_IWUSR; return mode; @@ -3007,9 +2925,9 @@ static void cgroup_enable_task_cg_lists(void) * @parent_css: css whose children to walk * * This function returns the next child of @parent_css and should be called - * under RCU read lock. The only requirement is that @parent_css and - * @pos_css are accessible. The next sibling is guaranteed to be returned - * regardless of their states. + * under either cgroup_mutex or RCU read lock. The only requirement is + * that @parent_css and @pos_css are accessible. The next sibling is + * guaranteed to be returned regardless of their states. */ struct cgroup_subsys_state * css_next_child(struct cgroup_subsys_state *pos_css, @@ -3019,7 +2937,7 @@ css_next_child(struct cgroup_subsys_state *pos_css, struct cgroup *cgrp = parent_css->cgroup; struct cgroup *next; - WARN_ON_ONCE(!rcu_read_lock_held()); + cgroup_assert_mutex_or_rcu_locked(); /* * @pos could already have been removed. Once a cgroup is removed, @@ -3066,10 +2984,10 @@ EXPORT_SYMBOL_GPL(css_next_child); * to visit for pre-order traversal of @root's descendants. @root is * included in the iteration and the first node to be visited. * - * While this function requires RCU read locking, it doesn't require the - * whole traversal to be contained in a single RCU critical section. This - * function will return the correct next descendant as long as both @pos - * and @root are accessible and @pos is a descendant of @root. + * While this function requires cgroup_mutex or RCU read locking, it + * doesn't require the whole traversal to be contained in a single critical + * section. This function will return the correct next descendant as long + * as both @pos and @root are accessible and @pos is a descendant of @root. */ struct cgroup_subsys_state * css_next_descendant_pre(struct cgroup_subsys_state *pos, @@ -3077,7 +2995,7 @@ css_next_descendant_pre(struct cgroup_subsys_state *pos, { struct cgroup_subsys_state *next; - WARN_ON_ONCE(!rcu_read_lock_held()); + cgroup_assert_mutex_or_rcu_locked(); /* if first iteration, visit @root */ if (!pos) @@ -3108,17 +3026,17 @@ EXPORT_SYMBOL_GPL(css_next_descendant_pre); * is returned. This can be used during pre-order traversal to skip * subtree of @pos. * - * While this function requires RCU read locking, it doesn't require the - * whole traversal to be contained in a single RCU critical section. This - * function will return the correct rightmost descendant as long as @pos is - * accessible. + * While this function requires cgroup_mutex or RCU read locking, it + * doesn't require the whole traversal to be contained in a single critical + * section. This function will return the correct rightmost descendant as + * long as @pos is accessible. */ struct cgroup_subsys_state * css_rightmost_descendant(struct cgroup_subsys_state *pos) { struct cgroup_subsys_state *last, *tmp; - WARN_ON_ONCE(!rcu_read_lock_held()); + cgroup_assert_mutex_or_rcu_locked(); do { last = pos; @@ -3154,10 +3072,11 @@ css_leftmost_descendant(struct cgroup_subsys_state *pos) * to visit for post-order traversal of @root's descendants. @root is * included in the iteration and the last node to be visited. * - * While this function requires RCU read locking, it doesn't require the - * whole traversal to be contained in a single RCU critical section. This - * function will return the correct next descendant as long as both @pos - * and @cgroup are accessible and @pos is a descendant of @cgroup. + * While this function requires cgroup_mutex or RCU read locking, it + * doesn't require the whole traversal to be contained in a single critical + * section. This function will return the correct next descendant as long + * as both @pos and @cgroup are accessible and @pos is a descendant of + * @cgroup. */ struct cgroup_subsys_state * css_next_descendant_post(struct cgroup_subsys_state *pos, @@ -3165,7 +3084,7 @@ css_next_descendant_post(struct cgroup_subsys_state *pos, { struct cgroup_subsys_state *next; - WARN_ON_ONCE(!rcu_read_lock_held()); + cgroup_assert_mutex_or_rcu_locked(); /* if first iteration, visit leftmost descendant which may be @root */ if (!pos) @@ -3504,14 +3423,12 @@ struct cgroup_pidlist { pid_t *list; /* how many elements the above list has */ int length; - /* how many files are using the current array */ - int use_count; /* each of these stored in a list by its cgroup */ struct list_head links; /* pointer to the cgroup we belong to, for list removal purposes */ struct cgroup *owner; - /* protects the other fields */ - struct rw_semaphore rwsem; + /* for delayed destruction */ + struct delayed_work destroy_dwork; }; /* @@ -3527,6 +3444,7 @@ static void *pidlist_allocate(int count) else return kmalloc(count * sizeof(pid_t), GFP_KERNEL); } + static void pidlist_free(void *p) { if (is_vmalloc_addr(p)) @@ -3536,6 +3454,47 @@ static void pidlist_free(void *p) } /* + * Used to destroy all pidlists lingering waiting for destroy timer. None + * should be left afterwards. + */ +static void cgroup_pidlist_destroy_all(struct cgroup *cgrp) +{ + struct cgroup_pidlist *l, *tmp_l; + + mutex_lock(&cgrp->pidlist_mutex); + list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links) + mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0); + mutex_unlock(&cgrp->pidlist_mutex); + + flush_workqueue(cgroup_pidlist_destroy_wq); + BUG_ON(!list_empty(&cgrp->pidlists)); +} + +static void cgroup_pidlist_destroy_work_fn(struct work_struct *work) +{ + struct delayed_work *dwork = to_delayed_work(work); + struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist, + destroy_dwork); + struct cgroup_pidlist *tofree = NULL; + + mutex_lock(&l->owner->pidlist_mutex); + + /* + * Destroy iff we didn't get queued again. The state won't change + * as destroy_dwork can only be queued while locked. + */ + if (!delayed_work_pending(dwork)) { + list_del(&l->links); + pidlist_free(l->list); + put_pid_ns(l->key.ns); + tofree = l; + } + + mutex_unlock(&l->owner->pidlist_mutex); + kfree(tofree); +} + +/* * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries * Returns the number of unique elements. */ @@ -3565,52 +3524,92 @@ after: return dest; } +/* + * The two pid files - task and cgroup.procs - guaranteed that the result + * is sorted, which forced this whole pidlist fiasco. As pid order is + * different per namespace, each namespace needs differently sorted list, + * making it impossible to use, for example, single rbtree of member tasks + * sorted by task pointer. As pidlists can be fairly large, allocating one + * per open file is dangerous, so cgroup had to implement shared pool of + * pidlists keyed by cgroup and namespace. + * + * All this extra complexity was caused by the original implementation + * committing to an entirely unnecessary property. In the long term, we + * want to do away with it. Explicitly scramble sort order if + * sane_behavior so that no such expectation exists in the new interface. + * + * Scrambling is done by swapping every two consecutive bits, which is + * non-identity one-to-one mapping which disturbs sort order sufficiently. + */ +static pid_t pid_fry(pid_t pid) +{ + unsigned a = pid & 0x55555555; + unsigned b = pid & 0xAAAAAAAA; + + return (a << 1) | (b >> 1); +} + +static pid_t cgroup_pid_fry(struct cgroup *cgrp, pid_t pid) +{ + if (cgroup_sane_behavior(cgrp)) + return pid_fry(pid); + else + return pid; +} + static int cmppid(const void *a, const void *b) { return *(pid_t *)a - *(pid_t *)b; } +static int fried_cmppid(const void *a, const void *b) +{ + return pid_fry(*(pid_t *)a) - pid_fry(*(pid_t *)b); +} + +static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, + enum cgroup_filetype type) +{ + struct cgroup_pidlist *l; + /* don't need task_nsproxy() if we're looking at ourself */ + struct pid_namespace *ns = task_active_pid_ns(current); + + lockdep_assert_held(&cgrp->pidlist_mutex); + + list_for_each_entry(l, &cgrp->pidlists, links) + if (l->key.type == type && l->key.ns == ns) + return l; + return NULL; +} + /* * find the appropriate pidlist for our purpose (given procs vs tasks) * returns with the lock on that pidlist already held, and takes care * of the use count, or returns NULL with no locks held if we're out of * memory. */ -static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, - enum cgroup_filetype type) +static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp, + enum cgroup_filetype type) { struct cgroup_pidlist *l; - /* don't need task_nsproxy() if we're looking at ourself */ - struct pid_namespace *ns = task_active_pid_ns(current); - /* - * We can't drop the pidlist_mutex before taking the l->rwsem in case - * the last ref-holder is trying to remove l from the list at the same - * time. Holding the pidlist_mutex precludes somebody taking whichever - * list we find out from under us - compare release_pid_array(). - */ - mutex_lock(&cgrp->pidlist_mutex); - list_for_each_entry(l, &cgrp->pidlists, links) { - if (l->key.type == type && l->key.ns == ns) { - /* make sure l doesn't vanish out from under us */ - down_write(&l->rwsem); - mutex_unlock(&cgrp->pidlist_mutex); - return l; - } - } + lockdep_assert_held(&cgrp->pidlist_mutex); + + l = cgroup_pidlist_find(cgrp, type); + if (l) + return l; + /* entry not found; create a new one */ l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL); - if (!l) { - mutex_unlock(&cgrp->pidlist_mutex); + if (!l) return l; - } - init_rwsem(&l->rwsem); - down_write(&l->rwsem); + + INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn); l->key.type = type; - l->key.ns = get_pid_ns(ns); + /* don't need task_nsproxy() if we're looking at ourself */ + l->key.ns = get_pid_ns(task_active_pid_ns(current)); l->owner = cgrp; list_add(&l->links, &cgrp->pidlists); - mutex_unlock(&cgrp->pidlist_mutex); return l; } @@ -3627,6 +3626,8 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, struct task_struct *tsk; struct cgroup_pidlist *l; + lockdep_assert_held(&cgrp->pidlist_mutex); + /* * If cgroup gets more users after we read count, we won't have * enough space - tough. This race is indistinguishable to the @@ -3653,20 +3654,24 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, css_task_iter_end(&it); length = n; /* now sort & (if procs) strip out duplicates */ - sort(array, length, sizeof(pid_t), cmppid, NULL); + if (cgroup_sane_behavior(cgrp)) + sort(array, length, sizeof(pid_t), fried_cmppid, NULL); + else + sort(array, length, sizeof(pid_t), cmppid, NULL); if (type == CGROUP_FILE_PROCS) length = pidlist_uniq(array, length); - l = cgroup_pidlist_find(cgrp, type); + + l = cgroup_pidlist_find_create(cgrp, type); if (!l) { + mutex_unlock(&cgrp->pidlist_mutex); pidlist_free(array); return -ENOMEM; } - /* store array, freeing old if necessary - lock already held */ + + /* store array, freeing old if necessary */ pidlist_free(l->list); l->list = array; l->length = length; - l->use_count++; - up_write(&l->rwsem); *lp = l; return 0; } @@ -3740,20 +3745,45 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) * after a seek to the start). Use a binary-search to find the * next pid to display, if any */ - struct cgroup_pidlist *l = s->private; + struct cgroup_open_file *of = s->private; + struct cgroup *cgrp = seq_css(s)->cgroup; + struct cgroup_pidlist *l; + enum cgroup_filetype type = seq_cft(s)->private; int index = 0, pid = *pos; - int *iter; + int *iter, ret; + + mutex_lock(&cgrp->pidlist_mutex); + + /* + * !NULL @of->priv indicates that this isn't the first start() + * after open. If the matching pidlist is around, we can use that. + * Look for it. Note that @of->priv can't be used directly. It + * could already have been destroyed. + */ + if (of->priv) + of->priv = cgroup_pidlist_find(cgrp, type); + + /* + * Either this is the first start() after open or the matching + * pidlist has been destroyed inbetween. Create a new one. + */ + if (!of->priv) { + ret = pidlist_array_load(cgrp, type, + (struct cgroup_pidlist **)&of->priv); + if (ret) + return ERR_PTR(ret); + } + l = of->priv; - down_read(&l->rwsem); if (pid) { int end = l->length; while (index < end) { int mid = (index + end) / 2; - if (l->list[mid] == pid) { + if (cgroup_pid_fry(cgrp, l->list[mid]) == pid) { index = mid; break; - } else if (l->list[mid] <= pid) + } else if (cgroup_pid_fry(cgrp, l->list[mid]) <= pid) index = mid + 1; else end = mid; @@ -3764,19 +3794,25 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) return NULL; /* Update the abstract position to be the actual pid that we found */ iter = l->list + index; - *pos = *iter; + *pos = cgroup_pid_fry(cgrp, *iter); return iter; } static void cgroup_pidlist_stop(struct seq_file *s, void *v) { - struct cgroup_pidlist *l = s->private; - up_read(&l->rwsem); + struct cgroup_open_file *of = s->private; + struct cgroup_pidlist *l = of->priv; + + if (l) + mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, + CGROUP_PIDLIST_DESTROY_DELAY); + mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex); } static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) { - struct cgroup_pidlist *l = s->private; + struct cgroup_open_file *of = s->private; + struct cgroup_pidlist *l = of->priv; pid_t *p = v; pid_t *end = l->list + l->length; /* @@ -3787,7 +3823,7 @@ static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) if (p >= end) { return NULL; } else { - *pos = *p; + *pos = cgroup_pid_fry(seq_css(s)->cgroup, *p); return p; } } @@ -3808,92 +3844,6 @@ static const struct seq_operations cgroup_pidlist_seq_operations = { .show = cgroup_pidlist_show, }; -static void cgroup_release_pid_array(struct cgroup_pidlist *l) -{ - /* - * the case where we're the last user of this particular pidlist will - * have us remove it from the cgroup's list, which entails taking the - * mutex. since in pidlist_find the pidlist->lock depends on cgroup-> - * pidlist_mutex, we have to take pidlist_mutex first. - */ - mutex_lock(&l->owner->pidlist_mutex); - down_write(&l->rwsem); - BUG_ON(!l->use_count); - if (!--l->use_count) { - /* we're the last user if refcount is 0; remove and free */ - list_del(&l->links); - mutex_unlock(&l->owner->pidlist_mutex); - pidlist_free(l->list); - put_pid_ns(l->key.ns); - up_write(&l->rwsem); - kfree(l); - return; - } - mutex_unlock(&l->owner->pidlist_mutex); - up_write(&l->rwsem); -} - -static int cgroup_pidlist_release(struct inode *inode, struct file *file) -{ - struct cgroup_pidlist *l; - if (!(file->f_mode & FMODE_READ)) - return 0; - /* - * the seq_file will only be initialized if the file was opened for - * reading; hence we check if it's not null only in that case. - */ - l = ((struct seq_file *)file->private_data)->private; - cgroup_release_pid_array(l); - return seq_release(inode, file); -} - -static const struct file_operations cgroup_pidlist_operations = { - .read = seq_read, - .llseek = seq_lseek, - .write = cgroup_file_write, - .release = cgroup_pidlist_release, -}; - -/* - * The following functions handle opens on a file that displays a pidlist - * (tasks or procs). Prepare an array of the process/thread IDs of whoever's - * in the cgroup. - */ -/* helper function for the two below it */ -static int cgroup_pidlist_open(struct file *file, enum cgroup_filetype type) -{ - struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); - struct cgroup_pidlist *l; - int retval; - - /* Nothing to do for write-only files */ - if (!(file->f_mode & FMODE_READ)) - return 0; - - /* have the array populated */ - retval = pidlist_array_load(cgrp, type, &l); - if (retval) - return retval; - /* configure file information */ - file->f_op = &cgroup_pidlist_operations; - - retval = seq_open(file, &cgroup_pidlist_seq_operations); - if (retval) { - cgroup_release_pid_array(l); - return retval; - } - ((struct seq_file *)file->private_data)->private = l; - return 0; -} -static int cgroup_tasks_open(struct inode *unused, struct file *file) -{ - return cgroup_pidlist_open(file, CGROUP_FILE_TASKS); -} -static int cgroup_procs_open(struct inode *unused, struct file *file) -{ - return cgroup_pidlist_open(file, CGROUP_FILE_PROCS); -} - static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css, struct cftype *cft) { @@ -3928,202 +3878,6 @@ static void cgroup_dput(struct cgroup *cgrp) deactivate_super(sb); } -/* - * Unregister event and free resources. - * - * Gets called from workqueue. - */ -static void cgroup_event_remove(struct work_struct *work) -{ - struct cgroup_event *event = container_of(work, struct cgroup_event, - remove); - struct cgroup_subsys_state *css = event->css; - - remove_wait_queue(event->wqh, &event->wait); - - event->cft->unregister_event(css, event->cft, event->eventfd); - - /* Notify userspace the event is going away. */ - eventfd_signal(event->eventfd, 1); - - eventfd_ctx_put(event->eventfd); - kfree(event); - css_put(css); -} - -/* - * Gets called on POLLHUP on eventfd when user closes it. - * - * Called with wqh->lock held and interrupts disabled. - */ -static int cgroup_event_wake(wait_queue_t *wait, unsigned mode, - int sync, void *key) -{ - struct cgroup_event *event = container_of(wait, - struct cgroup_event, wait); - struct cgroup *cgrp = event->css->cgroup; - unsigned long flags = (unsigned long)key; - - if (flags & POLLHUP) { - /* - * If the event has been detached at cgroup removal, we - * can simply return knowing the other side will cleanup - * for us. - * - * We can't race against event freeing since the other - * side will require wqh->lock via remove_wait_queue(), - * which we hold. - */ - spin_lock(&cgrp->event_list_lock); - if (!list_empty(&event->list)) { - list_del_init(&event->list); - /* - * We are in atomic context, but cgroup_event_remove() - * may sleep, so we have to call it in workqueue. - */ - schedule_work(&event->remove); - } - spin_unlock(&cgrp->event_list_lock); - } - - return 0; -} - -static void cgroup_event_ptable_queue_proc(struct file *file, - wait_queue_head_t *wqh, poll_table *pt) -{ - struct cgroup_event *event = container_of(pt, - struct cgroup_event, pt); - - event->wqh = wqh; - add_wait_queue(wqh, &event->wait); -} - -/* - * Parse input and register new cgroup event handler. - * - * Input must be in format '<event_fd> <control_fd> <args>'. - * Interpretation of args is defined by control file implementation. - */ -static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css, - struct cftype *cft, const char *buffer) -{ - struct cgroup *cgrp = dummy_css->cgroup; - struct cgroup_event *event; - struct cgroup_subsys_state *cfile_css; - unsigned int efd, cfd; - struct fd efile; - struct fd cfile; - char *endp; - int ret; - - efd = simple_strtoul(buffer, &endp, 10); - if (*endp != ' ') - return -EINVAL; - buffer = endp + 1; - - cfd = simple_strtoul(buffer, &endp, 10); - if ((*endp != ' ') && (*endp != '\0')) - return -EINVAL; - buffer = endp + 1; - - event = kzalloc(sizeof(*event), GFP_KERNEL); - if (!event) - return -ENOMEM; - - INIT_LIST_HEAD(&event->list); - init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc); - init_waitqueue_func_entry(&event->wait, cgroup_event_wake); - INIT_WORK(&event->remove, cgroup_event_remove); - - efile = fdget(efd); - if (!efile.file) { - ret = -EBADF; - goto out_kfree; - } - - event->eventfd = eventfd_ctx_fileget(efile.file); - if (IS_ERR(event->eventfd)) { - ret = PTR_ERR(event->eventfd); - goto out_put_efile; - } - - cfile = fdget(cfd); - if (!cfile.file) { - ret = -EBADF; - goto out_put_eventfd; - } - - /* the process need read permission on control file */ - /* AV: shouldn't we check that it's been opened for read instead? */ - ret = inode_permission(file_inode(cfile.file), MAY_READ); - if (ret < 0) - goto out_put_cfile; - - event->cft = __file_cft(cfile.file); - if (IS_ERR(event->cft)) { - ret = PTR_ERR(event->cft); - goto out_put_cfile; - } - - if (!event->cft->ss) { - ret = -EBADF; - goto out_put_cfile; - } - - /* - * Determine the css of @cfile, verify it belongs to the same - * cgroup as cgroup.event_control, and associate @event with it. - * Remaining events are automatically removed on cgroup destruction - * but the removal is asynchronous, so take an extra ref. - */ - rcu_read_lock(); - - ret = -EINVAL; - event->css = cgroup_css(cgrp, event->cft->ss); - cfile_css = css_from_dir(cfile.file->f_dentry->d_parent, event->cft->ss); - if (event->css && event->css == cfile_css && css_tryget(event->css)) - ret = 0; - - rcu_read_unlock(); - if (ret) - goto out_put_cfile; - - if (!event->cft->register_event || !event->cft->unregister_event) { - ret = -EINVAL; - goto out_put_css; - } - - ret = event->cft->register_event(event->css, event->cft, - event->eventfd, buffer); - if (ret) - goto out_put_css; - - efile.file->f_op->poll(efile.file, &event->pt); - - spin_lock(&cgrp->event_list_lock); - list_add(&event->list, &cgrp->event_list); - spin_unlock(&cgrp->event_list_lock); - - fdput(cfile); - fdput(efile); - - return 0; - -out_put_css: - css_put(event->css); -out_put_cfile: - fdput(cfile); -out_put_eventfd: - eventfd_ctx_put(event->eventfd); -out_put_efile: - fdput(efile); -out_kfree: - kfree(event); - - return ret; -} - static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css, struct cftype *cft) { @@ -4143,17 +3897,15 @@ static int cgroup_clone_children_write(struct cgroup_subsys_state *css, static struct cftype cgroup_base_files[] = { { .name = "cgroup.procs", - .open = cgroup_procs_open, + .seq_start = cgroup_pidlist_start, + .seq_next = cgroup_pidlist_next, + .seq_stop = cgroup_pidlist_stop, + .seq_show = cgroup_pidlist_show, + .private = CGROUP_FILE_PROCS, .write_u64 = cgroup_procs_write, - .release = cgroup_pidlist_release, .mode = S_IRUGO | S_IWUSR, }, { - .name = "cgroup.event_control", - .write_string = cgroup_write_event_control, - .mode = S_IWUGO, - }, - { .name = "cgroup.clone_children", .flags = CFTYPE_INSANE, .read_u64 = cgroup_clone_children_read, @@ -4162,7 +3914,7 @@ static struct cftype cgroup_base_files[] = { { .name = "cgroup.sane_behavior", .flags = CFTYPE_ONLY_ON_ROOT, - .read_seq_string = cgroup_sane_behavior_show, + .seq_show = cgroup_sane_behavior_show, }, /* @@ -4173,9 +3925,12 @@ static struct cftype cgroup_base_files[] = { { .name = "tasks", .flags = CFTYPE_INSANE, /* use "procs" instead */ - .open = cgroup_tasks_open, + .seq_start = cgroup_pidlist_start, + .seq_next = cgroup_pidlist_next, + .seq_stop = cgroup_pidlist_stop, + .seq_show = cgroup_pidlist_show, + .private = CGROUP_FILE_TASKS, .write_u64 = cgroup_tasks_write, - .release = cgroup_pidlist_release, .mode = S_IRUGO | S_IWUSR, }, { @@ -4187,7 +3942,7 @@ static struct cftype cgroup_base_files[] = { { .name = "release_agent", .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT, - .read_seq_string = cgroup_release_agent_show, + .seq_show = cgroup_release_agent_show, .write_string = cgroup_release_agent_write, .max_write_len = PATH_MAX, }, @@ -4333,6 +4088,62 @@ static void offline_css(struct cgroup_subsys_state *css) RCU_INIT_POINTER(css->cgroup->subsys[ss->subsys_id], css); } +/** + * create_css - create a cgroup_subsys_state + * @cgrp: the cgroup new css will be associated with + * @ss: the subsys of new css + * + * Create a new css associated with @cgrp - @ss pair. On success, the new + * css is online and installed in @cgrp with all interface files created. + * Returns 0 on success, -errno on failure. + */ +static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) +{ + struct cgroup *parent = cgrp->parent; + struct cgroup_subsys_state *css; + int err; + + lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex); + lockdep_assert_held(&cgroup_mutex); + + css = ss->css_alloc(cgroup_css(parent, ss)); + if (IS_ERR(css)) + return PTR_ERR(css); + + err = percpu_ref_init(&css->refcnt, css_release); + if (err) + goto err_free; + + init_css(css, ss, cgrp); + + err = cgroup_populate_dir(cgrp, 1 << ss->subsys_id); + if (err) + goto err_free; + + err = online_css(css); + if (err) + goto err_free; + + dget(cgrp->dentry); + css_get(css->parent); + + if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && + parent->parent) { + pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n", + current->comm, current->pid, ss->name); + if (!strcmp(ss->name, "memory")) + pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n"); + ss->warned_broken_hierarchy = true; + } + + return 0; + +err_free: + percpu_ref_cancel_init(&css->refcnt); + ss->css_free(css); + return err; +} + /* * cgroup_create - create a cgroup * @parent: cgroup that will be parent of the new cgroup @@ -4344,11 +4155,10 @@ static void offline_css(struct cgroup_subsys_state *css) static long cgroup_create(struct cgroup *parent, struct dentry *dentry, umode_t mode) { - struct cgroup_subsys_state *css_ar[CGROUP_SUBSYS_COUNT] = { }; struct cgroup *cgrp; struct cgroup_name *name; struct cgroupfs_root *root = parent->root; - int err = 0; + int ssid, err = 0; struct cgroup_subsys *ss; struct super_block *sb = root->sb; @@ -4404,23 +4214,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags)) set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); - for_each_root_subsys(root, ss) { - struct cgroup_subsys_state *css; - - css = ss->css_alloc(cgroup_css(parent, ss)); - if (IS_ERR(css)) { - err = PTR_ERR(css); - goto err_free_all; - } - css_ar[ss->subsys_id] = css; - - err = percpu_ref_init(&css->refcnt, css_release); - if (err) - goto err_free_all; - - init_css(css, ss, cgrp); - } - /* * Create directory. cgroup_create_file() returns with the new * directory locked on success so that it can be populated without @@ -4428,7 +4221,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, */ err = cgroup_create_file(dentry, S_IFDIR | mode, sb); if (err < 0) - goto err_free_all; + goto err_unlock; lockdep_assert_held(&dentry->d_inode->i_mutex); cgrp->serial_nr = cgroup_serial_nr_next++; @@ -4440,55 +4233,31 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, /* hold a ref to the parent's dentry */ dget(parent->dentry); - /* creation succeeded, notify subsystems */ - for_each_root_subsys(root, ss) { - struct cgroup_subsys_state *css = css_ar[ss->subsys_id]; - - err = online_css(css); - if (err) - goto err_destroy; - - /* each css holds a ref to the cgroup's dentry and parent css */ - dget(dentry); - css_get(css->parent); - - /* mark it consumed for error path */ - css_ar[ss->subsys_id] = NULL; - - if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && - parent->parent) { - pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n", - current->comm, current->pid, ss->name); - if (!strcmp(ss->name, "memory")) - pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n"); - ss->warned_broken_hierarchy = true; - } - } - + /* + * @cgrp is now fully operational. If something fails after this + * point, it'll be released via the normal destruction path. + */ idr_replace(&root->cgroup_idr, cgrp, cgrp->id); err = cgroup_addrm_files(cgrp, cgroup_base_files, true); if (err) goto err_destroy; - err = cgroup_populate_dir(cgrp, root->subsys_mask); - if (err) - goto err_destroy; + /* let's create and online css's */ + for_each_subsys(ss, ssid) { + if (root->subsys_mask & (1 << ssid)) { + err = create_css(cgrp, ss); + if (err) + goto err_destroy; + } + } mutex_unlock(&cgroup_mutex); mutex_unlock(&cgrp->dentry->d_inode->i_mutex); return 0; -err_free_all: - for_each_root_subsys(root, ss) { - struct cgroup_subsys_state *css = css_ar[ss->subsys_id]; - - if (css) { - percpu_ref_cancel_init(&css->refcnt); - ss->css_free(css); - } - } +err_unlock: mutex_unlock(&cgroup_mutex); /* Release the reference count that we took on the superblock */ deactivate_super(sb); @@ -4501,14 +4270,6 @@ err_free_cgrp: return err; err_destroy: - for_each_root_subsys(root, ss) { - struct cgroup_subsys_state *css = css_ar[ss->subsys_id]; - - if (css) { - percpu_ref_cancel_init(&css->refcnt); - ss->css_free(css); - } - } cgroup_destroy_locked(cgrp); mutex_unlock(&cgroup_mutex); mutex_unlock(&dentry->d_inode->i_mutex); @@ -4631,10 +4392,10 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) __releases(&cgroup_mutex) __acquires(&cgroup_mutex) { struct dentry *d = cgrp->dentry; - struct cgroup_event *event, *tmp; - struct cgroup_subsys *ss; + struct cgroup_subsys_state *css; struct cgroup *child; bool empty; + int ssid; lockdep_assert_held(&d->d_inode->i_mutex); lockdep_assert_held(&cgroup_mutex); @@ -4670,12 +4431,8 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) * will be invoked to perform the rest of destruction once the * percpu refs of all css's are confirmed to be killed. */ - for_each_root_subsys(cgrp->root, ss) { - struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); - - if (css) - kill_css(css); - } + for_each_css(css, ssid, cgrp) + kill_css(css); /* * Mark @cgrp dead. This prevents further task migration and child @@ -4710,18 +4467,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) dget(d); cgroup_d_remove_dir(d); - /* - * Unregister events and notify userspace. - * Notify userspace about cgroup removing only after rmdir of cgroup - * directory to avoid race between userspace and kernelspace. - */ - spin_lock(&cgrp->event_list_lock); - list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) { - list_del_init(&event->list); - schedule_work(&event->remove); - } - spin_unlock(&cgrp->event_list_lock); - return 0; }; @@ -4792,7 +4537,6 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) cgroup_init_cftsets(ss); /* Create the top cgroup state for this subsystem */ - list_add(&ss->sibling, &cgroup_dummy_root.subsys_list); ss->root = &cgroup_dummy_root; css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss)); /* We don't handle early failures gracefully */ @@ -4866,6 +4610,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) cgroup_init_cftsets(ss); mutex_lock(&cgroup_mutex); + mutex_lock(&cgroup_root_mutex); cgroup_subsys[ss->subsys_id] = ss; /* @@ -4877,11 +4622,11 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) if (IS_ERR(css)) { /* failure case - need to deassign the cgroup_subsys[] slot. */ cgroup_subsys[ss->subsys_id] = NULL; + mutex_unlock(&cgroup_root_mutex); mutex_unlock(&cgroup_mutex); return PTR_ERR(css); } - list_add(&ss->sibling, &cgroup_dummy_root.subsys_list); ss->root = &cgroup_dummy_root; /* our new subsystem will be attached to the dummy hierarchy. */ @@ -4911,14 +4656,18 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) write_unlock(&css_set_lock); ret = online_css(css); - if (ret) + if (ret) { + ss->css_free(css); goto err_unload; + } /* success! */ + mutex_unlock(&cgroup_root_mutex); mutex_unlock(&cgroup_mutex); return 0; err_unload: + mutex_unlock(&cgroup_root_mutex); mutex_unlock(&cgroup_mutex); /* @ss can't be mounted here as try_module_get() would fail */ cgroup_unload_subsys(ss); @@ -4937,6 +4686,7 @@ EXPORT_SYMBOL_GPL(cgroup_load_subsys); void cgroup_unload_subsys(struct cgroup_subsys *ss) { struct cgrp_cset_link *link; + struct cgroup_subsys_state *css; BUG_ON(ss->module == NULL); @@ -4948,15 +4698,15 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) BUG_ON(ss->root != &cgroup_dummy_root); mutex_lock(&cgroup_mutex); + mutex_lock(&cgroup_root_mutex); - offline_css(cgroup_css(cgroup_dummy_top, ss)); + css = cgroup_css(cgroup_dummy_top, ss); + if (css) + offline_css(css); /* deassign the subsys_id */ cgroup_subsys[ss->subsys_id] = NULL; - /* remove subsystem from the dummy root's list of subsystems */ - list_del_init(&ss->sibling); - /* * disentangle the css from all css_sets attached to the dummy * top. as in loading, we need to pay our respects to the hashtable @@ -4979,9 +4729,11 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) * need to free before marking as null because ss->css_free needs * the cgrp->subsys pointer to find their state. */ - ss->css_free(cgroup_css(cgroup_dummy_top, ss)); + if (css) + ss->css_free(css); RCU_INIT_POINTER(cgroup_dummy_top->subsys[ss->subsys_id], NULL); + mutex_unlock(&cgroup_root_mutex); mutex_unlock(&cgroup_mutex); } EXPORT_SYMBOL_GPL(cgroup_unload_subsys); @@ -5100,6 +4852,15 @@ static int __init cgroup_wq_init(void) */ cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1); BUG_ON(!cgroup_destroy_wq); + + /* + * Used to destroy pidlists and separate to serve as flush domain. + * Cap @max_active to 1 too. + */ + cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy", + 0, 1); + BUG_ON(!cgroup_pidlist_destroy_wq); + return 0; } core_initcall(cgroup_wq_init); @@ -5143,11 +4904,12 @@ int proc_cgroup_show(struct seq_file *m, void *v) for_each_active_root(root) { struct cgroup_subsys *ss; struct cgroup *cgrp; - int count = 0; + int ssid, count = 0; seq_printf(m, "%d:", root->hierarchy_id); - for_each_root_subsys(root, ss) - seq_printf(m, "%s%s", count++ ? "," : "", ss->name); + for_each_subsys(ss, ssid) + if (root->subsys_mask & (1 << ssid)) + seq_printf(m, "%s%s", count++ ? "," : "", ss->name); if (strlen(root->name)) seq_printf(m, "%sname=%s", count ? "," : "", root->name); @@ -5488,16 +5250,16 @@ __setup("cgroup_disable=", cgroup_disable); * @dentry: directory dentry of interest * @ss: subsystem of interest * - * Must be called under RCU read lock. The caller is responsible for - * pinning the returned css if it needs to be accessed outside the RCU - * critical section. + * Must be called under cgroup_mutex or RCU read lock. The caller is + * responsible for pinning the returned css if it needs to be accessed + * outside the critical section. */ struct cgroup_subsys_state *css_from_dir(struct dentry *dentry, struct cgroup_subsys *ss) { struct cgroup *cgrp; - WARN_ON_ONCE(!rcu_read_lock_held()); + cgroup_assert_mutex_or_rcu_locked(); /* is @dentry a cgroup dir? */ if (!dentry->d_inode || @@ -5520,9 +5282,7 @@ struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss) { struct cgroup *cgrp; - rcu_lockdep_assert(rcu_read_lock_held() || - lockdep_is_held(&cgroup_mutex), - "css_from_id() needs proper protection"); + cgroup_assert_mutex_or_rcu_locked(); cgrp = idr_find(&ss->root->cgroup_idr, id); if (cgrp) @@ -5570,9 +5330,7 @@ static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css, return count; } -static int current_css_set_cg_links_read(struct cgroup_subsys_state *css, - struct cftype *cft, - struct seq_file *seq) +static int current_css_set_cg_links_read(struct seq_file *seq, void *v) { struct cgrp_cset_link *link; struct css_set *cset; @@ -5597,9 +5355,9 @@ static int current_css_set_cg_links_read(struct cgroup_subsys_state *css, } #define MAX_TASKS_SHOWN_PER_CSS 25 -static int cgroup_css_links_read(struct cgroup_subsys_state *css, - struct cftype *cft, struct seq_file *seq) +static int cgroup_css_links_read(struct seq_file *seq, void *v) { + struct cgroup_subsys_state *css = seq_css(seq); struct cgrp_cset_link *link; read_lock(&css_set_lock); @@ -5645,12 +5403,12 @@ static struct cftype debug_files[] = { { .name = "current_css_set_cg_links", - .read_seq_string = current_css_set_cg_links_read, + .seq_show = current_css_set_cg_links_read, }, { .name = "cgroup_css_links", - .read_seq_string = cgroup_css_links_read, + .seq_show = cgroup_css_links_read, }, { diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index f0ff64d..6c3154e 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -301,10 +301,9 @@ out_unlock: spin_unlock_irq(&freezer->lock); } -static int freezer_read(struct cgroup_subsys_state *css, struct cftype *cft, - struct seq_file *m) +static int freezer_read(struct seq_file *m, void *v) { - struct cgroup_subsys_state *pos; + struct cgroup_subsys_state *css = seq_css(m), *pos; rcu_read_lock(); @@ -458,7 +457,7 @@ static struct cftype files[] = { { .name = "state", .flags = CFTYPE_NOT_ON_ROOT, - .read_seq_string = freezer_read, + .seq_show = freezer_read, .write_string = freezer_write, }, { diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 4772034..4410ac6 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1731,66 +1731,41 @@ out_unlock: * used, list of ranges of sequential numbers, is variable length, * and since these maps can change value dynamically, one could read * gibberish by doing partial reads while a list was changing. - * A single large read to a buffer that crosses a page boundary is - * ok, because the result being copied to user land is not recomputed - * across a page fault. */ - -static size_t cpuset_sprintf_cpulist(char *page, struct cpuset *cs) +static int cpuset_common_seq_show(struct seq_file *sf, void *v) { - size_t count; - - mutex_lock(&callback_mutex); - count = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed); - mutex_unlock(&callback_mutex); + struct cpuset *cs = css_cs(seq_css(sf)); + cpuset_filetype_t type = seq_cft(sf)->private; + ssize_t count; + char *buf, *s; + int ret = 0; - return count; -} - -static size_t cpuset_sprintf_memlist(char *page, struct cpuset *cs) -{ - size_t count; + count = seq_get_buf(sf, &buf); + s = buf; mutex_lock(&callback_mutex); - count = nodelist_scnprintf(page, PAGE_SIZE, cs->mems_allowed); - mutex_unlock(&callback_mutex); - - return count; -} - -static ssize_t cpuset_common_file_read(struct cgroup_subsys_state *css, - struct cftype *cft, struct file *file, - char __user *buf, size_t nbytes, - loff_t *ppos) -{ - struct cpuset *cs = css_cs(css); - cpuset_filetype_t type = cft->private; - char *page; - ssize_t retval = 0; - char *s; - - if (!(page = (char *)__get_free_page(GFP_TEMPORARY))) - return -ENOMEM; - - s = page; switch (type) { case FILE_CPULIST: - s += cpuset_sprintf_cpulist(s, cs); + s += cpulist_scnprintf(s, count, cs->cpus_allowed); break; case FILE_MEMLIST: - s += cpuset_sprintf_memlist(s, cs); + s += nodelist_scnprintf(s, count, cs->mems_allowed); break; default: - retval = -EINVAL; - goto out; + ret = -EINVAL; + goto out_unlock; } - *s++ = '\n'; - retval = simple_read_from_buffer(buf, nbytes, ppos, page, s - page); -out: - free_page((unsigned long)page); - return retval; + if (s < buf + count - 1) { + *s++ = '\n'; + seq_commit(sf, s - buf); + } else { + seq_commit(sf, -1); + } +out_unlock: + mutex_unlock(&callback_mutex); + return ret; } static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft) @@ -1847,7 +1822,7 @@ static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft) static struct cftype files[] = { { .name = "cpus", - .read = cpuset_common_file_read, + .seq_show = cpuset_common_seq_show, .write_string = cpuset_write_resmask, .max_write_len = (100U + 6 * NR_CPUS), .private = FILE_CPULIST, @@ -1855,7 +1830,7 @@ static struct cftype files[] = { { .name = "mems", - .read = cpuset_common_file_read, + .seq_show = cpuset_common_seq_show, .write_string = cpuset_write_resmask, .max_write_len = (100U + 6 * MAX_NUMNODES), .private = FILE_MEMLIST, diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 7d2f35e..334b398 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -736,7 +736,8 @@ int kgdb_nmicallback(int cpu, void *regs) return 1; } -int kgdb_nmicallin(int cpu, int trapnr, void *regs, atomic_t *send_ready) +int kgdb_nmicallin(int cpu, int trapnr, void *regs, int err_code, + atomic_t *send_ready) { #ifdef CONFIG_SMP if (!kgdb_io_ready(0) || !send_ready) @@ -750,7 +751,7 @@ int kgdb_nmicallin(int cpu, int trapnr, void *regs, atomic_t *send_ready) ks->cpu = cpu; ks->ex_vector = trapnr; ks->signo = SIGTRAP; - ks->err_code = KGDB_KDB_REASON_SYSTEM_NMI; + ks->err_code = err_code; ks->linux_regs = regs; ks->send_ready = send_ready; kgdb_cpu_enter(ks, regs, DCPU_WANT_MASTER); diff --git a/kernel/debug/debug_core.h b/kernel/debug/debug_core.h index 572aa4f..127d9bc 100644 --- a/kernel/debug/debug_core.h +++ b/kernel/debug/debug_core.h @@ -75,13 +75,11 @@ extern int kdb_stub(struct kgdb_state *ks); extern int kdb_parse(const char *cmdstr); extern int kdb_common_init_state(struct kgdb_state *ks); extern int kdb_common_deinit_state(void); -#define KGDB_KDB_REASON_SYSTEM_NMI KDB_REASON_SYSTEM_NMI #else /* ! CONFIG_KGDB_KDB */ static inline int kdb_stub(struct kgdb_state *ks) { return DBG_PASS_EVENT; } -#define KGDB_KDB_REASON_SYSTEM_NMI 0 #endif /* CONFIG_KGDB_KDB */ #endif /* _DEBUG_CORE_H_ */ diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index b886a5e..307d87c 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1854,6 +1854,10 @@ static void handle_swbp(struct pt_regs *regs) if (unlikely(!test_bit(UPROBE_COPY_INSN, &uprobe->flags))) goto out; + /* Tracing handlers use ->utask to communicate with fetch methods */ + if (!get_utask()) + goto out; + handler_chain(uprobe, regs); if (can_skip_sstep(uprobe, regs)) goto out; diff --git a/kernel/exit.c b/kernel/exit.c index a949819..1e77fc6 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -74,6 +74,7 @@ static void __unhash_process(struct task_struct *p, bool group_dead) __this_cpu_dec(process_counts); } list_del_rcu(&p->thread_group); + list_del_rcu(&p->thread_node); } /* diff --git a/kernel/fork.c b/kernel/fork.c index 294189f..a17621c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -800,14 +800,11 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm) * Allocate a new mm structure and copy contents from the * mm structure of the passed in task structure. */ -struct mm_struct *dup_mm(struct task_struct *tsk) +static struct mm_struct *dup_mm(struct task_struct *tsk) { struct mm_struct *mm, *oldmm = current->mm; int err; - if (!oldmm) - return NULL; - mm = allocate_mm(); if (!mm) goto fail_nomem; @@ -1035,6 +1032,11 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->nr_threads = 1; atomic_set(&sig->live, 1); atomic_set(&sig->sigcnt, 1); + + /* list_add(thread_node, thread_head) without INIT_LIST_HEAD() */ + sig->thread_head = (struct list_head)LIST_HEAD_INIT(tsk->thread_node); + tsk->thread_node = (struct list_head)LIST_HEAD_INIT(sig->thread_head); + init_waitqueue_head(&sig->wait_chldexit); sig->curr_target = tsk; init_sigpending(&sig->shared_pending); @@ -1224,7 +1226,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (!try_module_get(task_thread_info(p)->exec_domain->module)) goto bad_fork_cleanup_count; - p->did_exec = 0; delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ copy_flags(clone_flags, p); INIT_LIST_HEAD(&p->children); @@ -1474,6 +1475,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, atomic_inc(¤t->signal->sigcnt); list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group); + list_add_tail_rcu(&p->thread_node, + &p->signal->thread_head); } attach_pid(p, PIDTYPE_PID); nr_threads++; @@ -1647,7 +1650,7 @@ SYSCALL_DEFINE0(fork) return do_fork(SIGCHLD, 0, 0, NULL, NULL); #else /* can not support in nommu mode */ - return(-EINVAL); + return -EINVAL; #endif } #endif @@ -1655,7 +1658,7 @@ SYSCALL_DEFINE0(fork) #ifdef __ARCH_WANT_SYS_VFORK SYSCALL_DEFINE0(vfork) { - return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0, + return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0, 0, NULL, NULL); } #endif diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 9328b80..0b9c169 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -37,7 +37,7 @@ int __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT; */ unsigned long __read_mostly sysctl_hung_task_timeout_secs = CONFIG_DEFAULT_HUNG_TASK_TIMEOUT; -unsigned long __read_mostly sysctl_hung_task_warnings = 10; +int __read_mostly sysctl_hung_task_warnings = 10; static int __read_mostly did_panic; @@ -98,7 +98,9 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) if (!sysctl_hung_task_warnings) return; - sysctl_hung_task_warnings--; + + if (sysctl_hung_task_warnings > 0) + sysctl_hung_task_warnings--; /* * Ok, the task did not get scheduled for more than 2 minutes, diff --git a/kernel/kexec.c b/kernel/kexec.c index 9c97016..60bafbe 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -932,6 +932,7 @@ static int kimage_load_segment(struct kimage *image, */ struct kimage *kexec_image; struct kimage *kexec_crash_image; +int kexec_load_disabled; static DEFINE_MUTEX(kexec_mutex); @@ -942,7 +943,7 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, int result; /* We only trust the superuser with rebooting the system. */ - if (!capable(CAP_SYS_BOOT)) + if (!capable(CAP_SYS_BOOT) || kexec_load_disabled) return -EPERM; /* @@ -1536,7 +1537,7 @@ void vmcoreinfo_append_str(const char *fmt, ...) size_t r; va_start(args, fmt); - r = vsnprintf(buf, sizeof(buf), fmt, args); + r = vscnprintf(buf, sizeof(buf), fmt, args); va_end(args); r = min(r, vmcoreinfo_max_size - vmcoreinfo_size); diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 9659d38..d945a94 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -126,7 +126,7 @@ static ssize_t vmcoreinfo_show(struct kobject *kobj, { return sprintf(buf, "%lx %x\n", paddr_vmcoreinfo_note(), - (unsigned int)vmcoreinfo_max_size); + (unsigned int)sizeof(vmcoreinfo_note)); } KERNEL_ATTR_RO(vmcoreinfo); diff --git a/kernel/module.c b/kernel/module.c index f5a3b1e..d24fcf2 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -815,10 +815,8 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, return -EFAULT; name[MODULE_NAME_LEN-1] = '\0'; - if (!(flags & O_NONBLOCK)) { - printk(KERN_WARNING - "waiting module removal not supported: please upgrade"); - } + if (!(flags & O_NONBLOCK)) + pr_warn("waiting module removal not supported: please upgrade\n"); if (mutex_lock_interruptible(&module_mutex) != 0) return -EINTR; diff --git a/kernel/padata.c b/kernel/padata.c index 2abd25d..161402f 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -112,7 +112,7 @@ int padata_do_parallel(struct padata_instance *pinst, rcu_read_lock_bh(); - pd = rcu_dereference(pinst->pd); + pd = rcu_dereference_bh(pinst->pd); err = -EINVAL; if (!(pinst->flags & PADATA_INIT) || pinst->flags & PADATA_INVALID) diff --git a/kernel/params.c b/kernel/params.c index c00d5b5..b00142e 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -227,17 +227,10 @@ int parse_args(const char *doing, } /* Lazy bastard, eh? */ -#define STANDARD_PARAM_DEF(name, type, format, tmptype, strtolfn) \ +#define STANDARD_PARAM_DEF(name, type, format, strtolfn) \ int param_set_##name(const char *val, const struct kernel_param *kp) \ { \ - tmptype l; \ - int ret; \ - \ - ret = strtolfn(val, 0, &l); \ - if (ret < 0 || ((type)l != l)) \ - return ret < 0 ? ret : -EINVAL; \ - *((type *)kp->arg) = l; \ - return 0; \ + return strtolfn(val, 0, (type *)kp->arg); \ } \ int param_get_##name(char *buffer, const struct kernel_param *kp) \ { \ @@ -253,13 +246,13 @@ int parse_args(const char *doing, EXPORT_SYMBOL(param_ops_##name) -STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", unsigned long, kstrtoul); -STANDARD_PARAM_DEF(short, short, "%hi", long, kstrtol); -STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", unsigned long, kstrtoul); -STANDARD_PARAM_DEF(int, int, "%i", long, kstrtol); -STANDARD_PARAM_DEF(uint, unsigned int, "%u", unsigned long, kstrtoul); -STANDARD_PARAM_DEF(long, long, "%li", long, kstrtol); -STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, kstrtoul); +STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", kstrtou8); +STANDARD_PARAM_DEF(short, short, "%hi", kstrtos16); +STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", kstrtou16); +STANDARD_PARAM_DEF(int, int, "%i", kstrtoint); +STANDARD_PARAM_DEF(uint, unsigned int, "%u", kstrtouint); +STANDARD_PARAM_DEF(long, long, "%li", kstrtol); +STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", kstrtoul); int param_set_charp(const char *val, const struct kernel_param *kp) { diff --git a/kernel/power/block_io.c b/kernel/power/block_io.c index d09dd10..9a58bc2 100644 --- a/kernel/power/block_io.c +++ b/kernel/power/block_io.c @@ -32,7 +32,7 @@ static int submit(int rw, struct block_device *bdev, sector_t sector, struct bio *bio; bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1); - bio->bi_sector = sector; + bio->bi_iter.bi_sector = sector; bio->bi_bdev = bdev; bio->bi_end_io = end_swap_bio_read; diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 0121dab..37170d4 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -82,6 +82,7 @@ void hibernation_set_ops(const struct platform_hibernation_ops *ops) unlock_system_sleep(); } +EXPORT_SYMBOL_GPL(hibernation_set_ops); static bool entering_platform_hibernation; @@ -293,10 +294,10 @@ static int create_image(int platform_mode) error); /* Restore control flow magically appears here */ restore_processor_state(); - if (!in_suspend) { + if (!in_suspend) events_check_enabled = false; - platform_leave(platform_mode); - } + + platform_leave(platform_mode); Power_up: syscore_resume(); diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index b38109e..d9f61a1 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -637,7 +637,7 @@ __register_nosave_region(unsigned long start_pfn, unsigned long end_pfn, BUG_ON(!region); } else /* This allocation cannot fail */ - region = alloc_bootmem(sizeof(struct nosave_region)); + region = memblock_virt_alloc(sizeof(struct nosave_region), 0); region->start_pfn = start_pfn; region->end_pfn = end_pfn; list_add_tail(®ion->list, &nosave_regions); diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index be7c86b..b1d255f 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -757,14 +757,10 @@ void __init setup_log_buf(int early) return; if (early) { - unsigned long mem; - - mem = memblock_alloc(new_log_buf_len, PAGE_SIZE); - if (!mem) - return; - new_log_buf = __va(mem); + new_log_buf = + memblock_virt_alloc(new_log_buf_len, PAGE_SIZE); } else { - new_log_buf = alloc_bootmem_nopanic(new_log_buf_len); + new_log_buf = memblock_virt_alloc_nopanic(new_log_buf_len, 0); } if (unlikely(!new_log_buf)) { @@ -1599,10 +1595,13 @@ asmlinkage int vprintk_emit(int facility, int level, * either merge it with the current buffer and flush, or if * there was a race with interrupts (prefix == true) then just * flush it out and store this line separately. + * If the preceding printk was from a different task and missed + * a newline, flush and append the newline. */ - if (cont.len && cont.owner == current) { - if (!(lflags & LOG_PREFIX)) - stored = cont_add(facility, level, text, text_len); + if (cont.len) { + if (cont.owner == current && !(lflags & LOG_PREFIX)) + stored = cont_add(facility, level, text, + text_len); cont_flush(LOG_NEWLINE); } diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 802365c..c54609f 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -200,17 +200,6 @@ void wait_rcu_gp(call_rcu_func_t crf) } EXPORT_SYMBOL_GPL(wait_rcu_gp); -#ifdef CONFIG_PROVE_RCU -/* - * wrapper function to avoid #include problems. - */ -int rcu_my_thread_group_empty(void) -{ - return thread_group_empty(current); -} -EXPORT_SYMBOL_GPL(rcu_my_thread_group_empty); -#endif /* #ifdef CONFIG_PROVE_RCU */ - #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD static inline void debug_init_rcu_head(struct rcu_head *head) { diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 81343d6..656cd70 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1108,6 +1108,7 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p) if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task))) goto out; + trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu); ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg); out: @@ -1769,7 +1770,29 @@ void set_numabalancing_state(bool enabled) numabalancing_enabled = enabled; } #endif /* CONFIG_SCHED_DEBUG */ -#endif /* CONFIG_NUMA_BALANCING */ + +#ifdef CONFIG_PROC_SYSCTL +int sysctl_numa_balancing(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct ctl_table t; + int err; + int state = numabalancing_enabled; + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + t = *table; + t.data = &state; + err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); + if (err < 0) + return err; + if (write) + set_numabalancing_state(state); + return err; +} +#endif +#endif /* * fork()/clone()-time setup: @@ -4605,6 +4628,7 @@ int migrate_task_to(struct task_struct *p, int target_cpu) /* TODO: This is not properly updating schedstats */ + trace_sched_move_numa(p, curr_cpu, target_cpu); return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg); } @@ -7854,15 +7878,14 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota) return ret; } -static int cpu_stats_show(struct cgroup_subsys_state *css, struct cftype *cft, - struct cgroup_map_cb *cb) +static int cpu_stats_show(struct seq_file *sf, void *v) { - struct task_group *tg = css_tg(css); + struct task_group *tg = css_tg(seq_css(sf)); struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; - cb->fill(cb, "nr_periods", cfs_b->nr_periods); - cb->fill(cb, "nr_throttled", cfs_b->nr_throttled); - cb->fill(cb, "throttled_time", cfs_b->throttled_time); + seq_printf(sf, "nr_periods %d\n", cfs_b->nr_periods); + seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled); + seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time); return 0; } @@ -7916,7 +7939,7 @@ static struct cftype cpu_files[] = { }, { .name = "stat", - .read_map = cpu_stats_show, + .seq_show = cpu_stats_show, }, #endif #ifdef CONFIG_RT_GROUP_SCHED diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index f64722f..622e081 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -163,10 +163,9 @@ out: return err; } -static int cpuacct_percpu_seq_read(struct cgroup_subsys_state *css, - struct cftype *cft, struct seq_file *m) +static int cpuacct_percpu_seq_show(struct seq_file *m, void *V) { - struct cpuacct *ca = css_ca(css); + struct cpuacct *ca = css_ca(seq_css(m)); u64 percpu; int i; @@ -183,10 +182,9 @@ static const char * const cpuacct_stat_desc[] = { [CPUACCT_STAT_SYSTEM] = "system", }; -static int cpuacct_stats_show(struct cgroup_subsys_state *css, - struct cftype *cft, struct cgroup_map_cb *cb) +static int cpuacct_stats_show(struct seq_file *sf, void *v) { - struct cpuacct *ca = css_ca(css); + struct cpuacct *ca = css_ca(seq_css(sf)); int cpu; s64 val = 0; @@ -196,7 +194,7 @@ static int cpuacct_stats_show(struct cgroup_subsys_state *css, val += kcpustat->cpustat[CPUTIME_NICE]; } val = cputime64_to_clock_t(val); - cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val); + seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_USER], val); val = 0; for_each_online_cpu(cpu) { @@ -207,7 +205,7 @@ static int cpuacct_stats_show(struct cgroup_subsys_state *css, } val = cputime64_to_clock_t(val); - cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val); + seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val); return 0; } @@ -220,11 +218,11 @@ static struct cftype files[] = { }, { .name = "usage_percpu", - .read_seq_string = cpuacct_percpu_seq_read, + .seq_show = cpuacct_percpu_seq_show, }, { .name = "stat", - .read_map = cpuacct_stats_show, + .seq_show = cpuacct_stats_show, }, { } /* terminate */ }; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index efe6457..966cc2b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1250,11 +1250,15 @@ static int task_numa_migrate(struct task_struct *p) p->numa_scan_period = task_scan_min(p); if (env.best_task == NULL) { - int ret = migrate_task_to(p, env.best_cpu); + ret = migrate_task_to(p, env.best_cpu); + if (ret != 0) + trace_sched_stick_numa(p, env.src_cpu, env.best_cpu); return ret; } ret = migrate_swap(p, env.best_task); + if (ret != 0) + trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task)); put_task_struct(env.best_task); return ret; } diff --git a/kernel/signal.c b/kernel/signal.c index 940b30e..52f881d 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2047,8 +2047,8 @@ static bool do_signal_stop(int signr) if (task_set_jobctl_pending(current, signr | gstop)) sig->group_stop_count++; - for (t = next_thread(current); t != current; - t = next_thread(t)) { + t = current; + while_each_thread(current, t) { /* * Setting state to TASK_STOPPED for a group * stop is always done with the siglock held, @@ -3125,8 +3125,7 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) rm_from_queue_full(&mask, &t->signal->shared_pending); do { rm_from_queue_full(&mask, &t->pending); - t = next_thread(t); - } while (t != current); + } while_each_thread(current, t); } } diff --git a/kernel/smp.c b/kernel/smp.c index bd9f940..ffee35b 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -23,17 +23,11 @@ enum { struct call_function_data { struct call_single_data __percpu *csd; cpumask_var_t cpumask; - cpumask_var_t cpumask_ipi; }; static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data); -struct call_single_queue { - struct list_head list; - raw_spinlock_t lock; -}; - -static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_queue, call_single_queue); +static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue); static int hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) @@ -47,14 +41,8 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL, cpu_to_node(cpu))) return notifier_from_errno(-ENOMEM); - if (!zalloc_cpumask_var_node(&cfd->cpumask_ipi, GFP_KERNEL, - cpu_to_node(cpu))) { - free_cpumask_var(cfd->cpumask); - return notifier_from_errno(-ENOMEM); - } cfd->csd = alloc_percpu(struct call_single_data); if (!cfd->csd) { - free_cpumask_var(cfd->cpumask_ipi); free_cpumask_var(cfd->cpumask); return notifier_from_errno(-ENOMEM); } @@ -67,7 +55,6 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) case CPU_DEAD: case CPU_DEAD_FROZEN: free_cpumask_var(cfd->cpumask); - free_cpumask_var(cfd->cpumask_ipi); free_percpu(cfd->csd); break; #endif @@ -85,12 +72,8 @@ void __init call_function_init(void) void *cpu = (void *)(long)smp_processor_id(); int i; - for_each_possible_cpu(i) { - struct call_single_queue *q = &per_cpu(call_single_queue, i); - - raw_spin_lock_init(&q->lock); - INIT_LIST_HEAD(&q->list); - } + for_each_possible_cpu(i) + init_llist_head(&per_cpu(call_single_queue, i)); hotplug_cfd(&hotplug_cfd_notifier, CPU_UP_PREPARE, cpu); register_cpu_notifier(&hotplug_cfd_notifier); @@ -141,18 +124,9 @@ static void csd_unlock(struct call_single_data *csd) */ static void generic_exec_single(int cpu, struct call_single_data *csd, int wait) { - struct call_single_queue *dst = &per_cpu(call_single_queue, cpu); - unsigned long flags; - int ipi; - if (wait) csd->flags |= CSD_FLAG_WAIT; - raw_spin_lock_irqsave(&dst->lock, flags); - ipi = list_empty(&dst->list); - list_add_tail(&csd->list, &dst->list); - raw_spin_unlock_irqrestore(&dst->lock, flags); - /* * The list addition should be visible before sending the IPI * handler locks the list to pull the entry off it because of @@ -164,7 +138,7 @@ static void generic_exec_single(int cpu, struct call_single_data *csd, int wait) * locking and barrier primitives. Generic code isn't really * equipped to do the right thing... */ - if (ipi) + if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu))) arch_send_call_function_single_ipi(cpu); if (wait) @@ -177,27 +151,26 @@ static void generic_exec_single(int cpu, struct call_single_data *csd, int wait) */ void generic_smp_call_function_single_interrupt(void) { - struct call_single_queue *q = &__get_cpu_var(call_single_queue); - LIST_HEAD(list); + struct llist_node *entry, *next; /* * Shouldn't receive this interrupt on a cpu that is not yet online. */ WARN_ON_ONCE(!cpu_online(smp_processor_id())); - raw_spin_lock(&q->lock); - list_replace_init(&q->list, &list); - raw_spin_unlock(&q->lock); + entry = llist_del_all(&__get_cpu_var(call_single_queue)); + entry = llist_reverse_order(entry); - while (!list_empty(&list)) { + while (entry) { struct call_single_data *csd; - csd = list_entry(list.next, struct call_single_data, list); - list_del(&csd->list); + next = entry->next; + csd = llist_entry(entry, struct call_single_data, llist); csd->func(csd->info); - csd_unlock(csd); + + entry = next; } } @@ -402,30 +375,17 @@ void smp_call_function_many(const struct cpumask *mask, if (unlikely(!cpumask_weight(cfd->cpumask))) return; - /* - * After we put an entry into the list, cfd->cpumask may be cleared - * again when another CPU sends another IPI for a SMP function call, so - * cfd->cpumask will be zero. - */ - cpumask_copy(cfd->cpumask_ipi, cfd->cpumask); - for_each_cpu(cpu, cfd->cpumask) { struct call_single_data *csd = per_cpu_ptr(cfd->csd, cpu); - struct call_single_queue *dst = - &per_cpu(call_single_queue, cpu); - unsigned long flags; csd_lock(csd); csd->func = func; csd->info = info; - - raw_spin_lock_irqsave(&dst->lock, flags); - list_add_tail(&csd->list, &dst->list); - raw_spin_unlock_irqrestore(&dst->lock, flags); + llist_add(&csd->llist, &per_cpu(call_single_queue, cpu)); } /* Send a message to all CPUs in the map */ - arch_send_call_function_ipi_mask(cfd->cpumask_ipi); + arch_send_call_function_ipi_mask(cfd->cpumask); if (wait) { for_each_cpu(cpu, cfd->cpumask) { diff --git a/kernel/softirq.c b/kernel/softirq.c index 8a1e6e1..8509670 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -8,6 +8,8 @@ * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903) */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/export.h> #include <linux/kernel_stat.h> #include <linux/interrupt.h> @@ -54,7 +56,7 @@ static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp DEFINE_PER_CPU(struct task_struct *, ksoftirqd); -char *softirq_to_name[NR_SOFTIRQS] = { +const char * const softirq_to_name[NR_SOFTIRQS] = { "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL", "TASKLET", "SCHED", "HRTIMER", "RCU" }; @@ -136,7 +138,6 @@ void _local_bh_enable(void) WARN_ON_ONCE(in_irq()); __local_bh_enable(SOFTIRQ_DISABLE_OFFSET); } - EXPORT_SYMBOL(_local_bh_enable); void __local_bh_enable_ip(unsigned long ip, unsigned int cnt) @@ -153,7 +154,7 @@ void __local_bh_enable_ip(unsigned long ip, unsigned int cnt) /* * Keep preemption disabled until we are done with * softirq processing: - */ + */ preempt_count_sub(cnt - 1); if (unlikely(!in_interrupt() && local_softirq_pending())) { @@ -229,6 +230,7 @@ asmlinkage void __do_softirq(void) struct softirq_action *h; bool in_hardirq; __u32 pending; + int softirq_bit; int cpu; /* @@ -253,30 +255,30 @@ restart: h = softirq_vec; - do { - if (pending & 1) { - unsigned int vec_nr = h - softirq_vec; - int prev_count = preempt_count(); - - kstat_incr_softirqs_this_cpu(vec_nr); - - trace_softirq_entry(vec_nr); - h->action(h); - trace_softirq_exit(vec_nr); - if (unlikely(prev_count != preempt_count())) { - printk(KERN_ERR "huh, entered softirq %u %s %p" - "with preempt_count %08x," - " exited with %08x?\n", vec_nr, - softirq_to_name[vec_nr], h->action, - prev_count, preempt_count()); - preempt_count_set(prev_count); - } + while ((softirq_bit = ffs(pending))) { + unsigned int vec_nr; + int prev_count; - rcu_bh_qs(cpu); + h += softirq_bit - 1; + + vec_nr = h - softirq_vec; + prev_count = preempt_count(); + + kstat_incr_softirqs_this_cpu(vec_nr); + + trace_softirq_entry(vec_nr); + h->action(h); + trace_softirq_exit(vec_nr); + if (unlikely(prev_count != preempt_count())) { + pr_err("huh, entered softirq %u %s %p with preempt_count %08x, exited with %08x?\n", + vec_nr, softirq_to_name[vec_nr], h->action, + prev_count, preempt_count()); + preempt_count_set(prev_count); } + rcu_bh_qs(cpu); h++; - pending >>= 1; - } while (pending); + pending >>= softirq_bit; + } local_irq_disable(); @@ -433,8 +435,7 @@ void open_softirq(int nr, void (*action)(struct softirq_action *)) /* * Tasklets */ -struct tasklet_head -{ +struct tasklet_head { struct tasklet_struct *head; struct tasklet_struct **tail; }; @@ -453,7 +454,6 @@ void __tasklet_schedule(struct tasklet_struct *t) raise_softirq_irqoff(TASKLET_SOFTIRQ); local_irq_restore(flags); } - EXPORT_SYMBOL(__tasklet_schedule); void __tasklet_hi_schedule(struct tasklet_struct *t) @@ -467,7 +467,6 @@ void __tasklet_hi_schedule(struct tasklet_struct *t) raise_softirq_irqoff(HI_SOFTIRQ); local_irq_restore(flags); } - EXPORT_SYMBOL(__tasklet_hi_schedule); void __tasklet_hi_schedule_first(struct tasklet_struct *t) @@ -478,7 +477,6 @@ void __tasklet_hi_schedule_first(struct tasklet_struct *t) __this_cpu_write(tasklet_hi_vec.head, t); __raise_softirq_irqoff(HI_SOFTIRQ); } - EXPORT_SYMBOL(__tasklet_hi_schedule_first); static void tasklet_action(struct softirq_action *a) @@ -498,7 +496,8 @@ static void tasklet_action(struct softirq_action *a) if (tasklet_trylock(t)) { if (!atomic_read(&t->count)) { - if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) + if (!test_and_clear_bit(TASKLET_STATE_SCHED, + &t->state)) BUG(); t->func(t->data); tasklet_unlock(t); @@ -533,7 +532,8 @@ static void tasklet_hi_action(struct softirq_action *a) if (tasklet_trylock(t)) { if (!atomic_read(&t->count)) { - if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) + if (!test_and_clear_bit(TASKLET_STATE_SCHED, + &t->state)) BUG(); t->func(t->data); tasklet_unlock(t); @@ -551,7 +551,6 @@ static void tasklet_hi_action(struct softirq_action *a) } } - void tasklet_init(struct tasklet_struct *t, void (*func)(unsigned long), unsigned long data) { @@ -561,13 +560,12 @@ void tasklet_init(struct tasklet_struct *t, t->func = func; t->data = data; } - EXPORT_SYMBOL(tasklet_init); void tasklet_kill(struct tasklet_struct *t) { if (in_interrupt()) - printk("Attempt to kill tasklet from interrupt\n"); + pr_notice("Attempt to kill tasklet from interrupt\n"); while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) { do { @@ -577,7 +575,6 @@ void tasklet_kill(struct tasklet_struct *t) tasklet_unlock_wait(t); clear_bit(TASKLET_STATE_SCHED, &t->state); } - EXPORT_SYMBOL(tasklet_kill); /* @@ -727,9 +724,8 @@ static void takeover_tasklets(unsigned int cpu) } #endif /* CONFIG_HOTPLUG_CPU */ -static int cpu_callback(struct notifier_block *nfb, - unsigned long action, - void *hcpu) +static int cpu_callback(struct notifier_block *nfb, unsigned long action, + void *hcpu) { switch (action) { #ifdef CONFIG_HOTPLUG_CPU diff --git a/kernel/sys.c b/kernel/sys.c index c723113..c0a58be 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -895,8 +895,7 @@ SYSCALL_DEFINE1(times, struct tms __user *, tbuf) * only important on a multi-user system anyway, to make sure one user * can't send a signal to a process owned by another. -TYT, 12/12/91 * - * Auch. Had to add the 'did_exec' flag to conform completely to POSIX. - * LBT 04.03.94 + * !PF_FORKNOEXEC check to conform completely to POSIX. */ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid) { @@ -932,7 +931,7 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid) if (task_session(p) != task_session(group_leader)) goto out; err = -EACCES; - if (p->did_exec) + if (!(p->flags & PF_FORKNOEXEC)) goto out; } else { err = -ESRCH; @@ -1572,8 +1571,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) t = p; do { accumulate_thread_rusage(t, r); - t = next_thread(t); - } while (t != p); + } while_each_thread(p, t); break; default: diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c8da99f..49e13e1 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -62,6 +62,7 @@ #include <linux/capability.h> #include <linux/binfmts.h> #include <linux/sched/sysctl.h> +#include <linux/kexec.h> #include <asm/uaccess.h> #include <asm/processor.h> @@ -95,8 +96,6 @@ #if defined(CONFIG_SYSCTL) /* External variables not in a header file. */ -extern int sysctl_overcommit_memory; -extern int sysctl_overcommit_ratio; extern int max_threads; extern int suid_dumpable; #ifdef CONFIG_COREDUMP @@ -122,6 +121,8 @@ extern int blk_iopoll_enabled; static int sixty = 60; #endif +static int __maybe_unused neg_one = -1; + static int zero; static int __maybe_unused one = 1; static int __maybe_unused two = 2; @@ -391,6 +392,15 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "numa_balancing", + .data = NULL, /* filled in by handler */ + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sysctl_numa_balancing, + .extra1 = &zero, + .extra2 = &one, + }, #endif /* CONFIG_NUMA_BALANCING */ #endif /* CONFIG_SCHED_DEBUG */ { @@ -607,6 +617,18 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif +#ifdef CONFIG_KEXEC + { + .procname = "kexec_load_disabled", + .data = &kexec_load_disabled, + .maxlen = sizeof(int), + .mode = 0644, + /* only handle a transition from default "0" to "1" */ + .proc_handler = proc_dointvec_minmax, + .extra1 = &one, + .extra2 = &one, + }, +#endif #ifdef CONFIG_MODULES { .procname = "modprobe", @@ -977,9 +999,10 @@ static struct ctl_table kern_table[] = { { .procname = "hung_task_warnings", .data = &sysctl_hung_task_warnings, - .maxlen = sizeof(unsigned long), + .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_doulongvec_minmax, + .proc_handler = proc_dointvec_minmax, + .extra1 = &neg_one, }, #endif #ifdef CONFIG_COMPAT @@ -1121,7 +1144,14 @@ static struct ctl_table vm_table[] = { .data = &sysctl_overcommit_ratio, .maxlen = sizeof(sysctl_overcommit_ratio), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = overcommit_ratio_handler, + }, + { + .procname = "overcommit_kbytes", + .data = &sysctl_overcommit_kbytes, + .maxlen = sizeof(sysctl_overcommit_kbytes), + .mode = 0644, + .proc_handler = overcommit_kbytes_handler, }, { .procname = "page-cluster", diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index d7e2068..1378e84 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -50,6 +50,7 @@ ifeq ($(CONFIG_PERF_EVENTS),y) obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o endif obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o +obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o obj-$(CONFIG_TRACEPOINTS) += power-traces.o ifeq ($(CONFIG_PM_RUNTIME),y) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index f785aef..b418cb0 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -781,8 +781,8 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, if (!error && !bio_flagged(bio, BIO_UPTODATE)) error = EIO; - __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, - error, 0, NULL); + __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, + bio->bi_rw, what, error, 0, NULL); } static void blk_add_trace_bio_bounce(void *ignore, @@ -885,8 +885,9 @@ static void blk_add_trace_split(void *ignore, if (bt) { __be64 rpdu = cpu_to_be64(pdu); - __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, - BLK_TA_SPLIT, !bio_flagged(bio, BIO_UPTODATE), + __blk_add_trace(bt, bio->bi_iter.bi_sector, + bio->bi_iter.bi_size, bio->bi_rw, BLK_TA_SPLIT, + !bio_flagged(bio, BIO_UPTODATE), sizeof(rpdu), &rpdu); } } @@ -918,9 +919,9 @@ static void blk_add_trace_bio_remap(void *ignore, r.device_to = cpu_to_be32(bio->bi_bdev->bd_dev); r.sector_from = cpu_to_be64(from); - __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, - BLK_TA_REMAP, !bio_flagged(bio, BIO_UPTODATE), - sizeof(r), &r); + __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, + bio->bi_rw, BLK_TA_REMAP, + !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r); } /** diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 72a0f81..cd7f76d 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -85,6 +85,8 @@ int function_trace_stop __read_mostly; /* Current function tracing op */ struct ftrace_ops *function_trace_op __read_mostly = &ftrace_list_end; +/* What to set function_trace_op to */ +static struct ftrace_ops *set_function_trace_op; /* List for set_ftrace_pid's pids. */ LIST_HEAD(ftrace_pids); @@ -278,6 +280,29 @@ static void update_global_ops(void) global_ops.func = func; } +static void ftrace_sync(struct work_struct *work) +{ + /* + * This function is just a stub to implement a hard force + * of synchronize_sched(). This requires synchronizing + * tasks even in userspace and idle. + * + * Yes, function tracing is rude. + */ +} + +static void ftrace_sync_ipi(void *data) +{ + /* Probably not needed, but do it anyway */ + smp_rmb(); +} + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +static void update_function_graph_func(void); +#else +static inline void update_function_graph_func(void) { } +#endif + static void update_ftrace_function(void) { ftrace_func_t func; @@ -296,16 +321,61 @@ static void update_ftrace_function(void) !FTRACE_FORCE_LIST_FUNC)) { /* Set the ftrace_ops that the arch callback uses */ if (ftrace_ops_list == &global_ops) - function_trace_op = ftrace_global_list; + set_function_trace_op = ftrace_global_list; else - function_trace_op = ftrace_ops_list; + set_function_trace_op = ftrace_ops_list; func = ftrace_ops_list->func; } else { /* Just use the default ftrace_ops */ - function_trace_op = &ftrace_list_end; + set_function_trace_op = &ftrace_list_end; func = ftrace_ops_list_func; } + /* If there's no change, then do nothing more here */ + if (ftrace_trace_function == func) + return; + + update_function_graph_func(); + + /* + * If we are using the list function, it doesn't care + * about the function_trace_ops. + */ + if (func == ftrace_ops_list_func) { + ftrace_trace_function = func; + /* + * Don't even bother setting function_trace_ops, + * it would be racy to do so anyway. + */ + return; + } + +#ifndef CONFIG_DYNAMIC_FTRACE + /* + * For static tracing, we need to be a bit more careful. + * The function change takes affect immediately. Thus, + * we need to coorditate the setting of the function_trace_ops + * with the setting of the ftrace_trace_function. + * + * Set the function to the list ops, which will call the + * function we want, albeit indirectly, but it handles the + * ftrace_ops and doesn't depend on function_trace_op. + */ + ftrace_trace_function = ftrace_ops_list_func; + /* + * Make sure all CPUs see this. Yes this is slow, but static + * tracing is slow and nasty to have enabled. + */ + schedule_on_each_cpu(ftrace_sync); + /* Now all cpus are using the list ops. */ + function_trace_op = set_function_trace_op; + /* Make sure the function_trace_op is visible on all CPUs */ + smp_wmb(); + /* Nasty way to force a rmb on all cpus */ + smp_call_function(ftrace_sync_ipi, NULL, 1); + /* OK, we are all set to update the ftrace_trace_function now! */ +#endif /* !CONFIG_DYNAMIC_FTRACE */ + ftrace_trace_function = func; } @@ -410,17 +480,6 @@ static int __register_ftrace_function(struct ftrace_ops *ops) return 0; } -static void ftrace_sync(struct work_struct *work) -{ - /* - * This function is just a stub to implement a hard force - * of synchronize_sched(). This requires synchronizing - * tasks even in userspace and idle. - * - * Yes, function tracing is rude. - */ -} - static int __unregister_ftrace_function(struct ftrace_ops *ops) { int ret; @@ -439,20 +498,6 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) } else if (ops->flags & FTRACE_OPS_FL_CONTROL) { ret = remove_ftrace_list_ops(&ftrace_control_list, &control_ops, ops); - if (!ret) { - /* - * The ftrace_ops is now removed from the list, - * so there'll be no new users. We must ensure - * all current users are done before we free - * the control data. - * Note synchronize_sched() is not enough, as we - * use preempt_disable() to do RCU, but the function - * tracer can be called where RCU is not active - * (before user_exit()). - */ - schedule_on_each_cpu(ftrace_sync); - control_ops_free(ops); - } } else ret = remove_ftrace_ops(&ftrace_ops_list, ops); @@ -462,17 +507,6 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) if (ftrace_enabled) update_ftrace_function(); - /* - * Dynamic ops may be freed, we must make sure that all - * callers are done before leaving this function. - * - * Again, normal synchronize_sched() is not good enough. - * We need to do a hard force of sched synchronization. - */ - if (ops->flags & FTRACE_OPS_FL_DYNAMIC) - schedule_on_each_cpu(ftrace_sync); - - return 0; } @@ -1082,19 +1116,6 @@ static __init void ftrace_profile_debugfs(struct dentry *d_tracer) static struct pid * const ftrace_swapper_pid = &init_struct_pid; -loff_t -ftrace_filter_lseek(struct file *file, loff_t offset, int whence) -{ - loff_t ret; - - if (file->f_mode & FMODE_READ) - ret = seq_lseek(file, offset, whence); - else - file->f_pos = ret = 1; - - return ret; -} - #ifdef CONFIG_DYNAMIC_FTRACE #ifndef CONFIG_FTRACE_MCOUNT_RECORD @@ -1992,8 +2013,14 @@ void ftrace_modify_all_code(int command) else if (command & FTRACE_DISABLE_CALLS) ftrace_replace_code(0); - if (update && ftrace_trace_function != ftrace_ops_list_func) + if (update && ftrace_trace_function != ftrace_ops_list_func) { + function_trace_op = set_function_trace_op; + smp_wmb(); + /* If irqs are disabled, we are in stop machine */ + if (!irqs_disabled()) + smp_call_function(ftrace_sync_ipi, NULL, 1); ftrace_update_ftrace_func(ftrace_trace_function); + } if (command & FTRACE_START_FUNC_RET) ftrace_enable_ftrace_graph_caller(); @@ -2156,10 +2183,41 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) command |= FTRACE_UPDATE_TRACE_FUNC; } - if (!command || !ftrace_enabled) + if (!command || !ftrace_enabled) { + /* + * If these are control ops, they still need their + * per_cpu field freed. Since, function tracing is + * not currently active, we can just free them + * without synchronizing all CPUs. + */ + if (ops->flags & FTRACE_OPS_FL_CONTROL) + control_ops_free(ops); return 0; + } ftrace_run_update_code(command); + + /* + * Dynamic ops may be freed, we must make sure that all + * callers are done before leaving this function. + * The same goes for freeing the per_cpu data of the control + * ops. + * + * Again, normal synchronize_sched() is not good enough. + * We need to do a hard force of sched synchronization. + * This is because we use preempt_disable() to do RCU, but + * the function tracers can be called where RCU is not watching + * (like before user_exit()). We can not rely on the RCU + * infrastructure to do the synchronization, thus we must do it + * ourselves. + */ + if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_CONTROL)) { + schedule_on_each_cpu(ftrace_sync); + + if (ops->flags & FTRACE_OPS_FL_CONTROL) + control_ops_free(ops); + } + return 0; } @@ -2739,7 +2797,7 @@ static void ftrace_filter_reset(struct ftrace_hash *hash) * routine, you can use ftrace_filter_write() for the write * routine if @flag has FTRACE_ITER_FILTER set, or * ftrace_notrace_write() if @flag has FTRACE_ITER_NOTRACE set. - * ftrace_filter_lseek() should be used as the lseek routine, and + * tracing_lseek() should be used as the lseek routine, and * release must call ftrace_regex_release(). */ int @@ -3767,7 +3825,7 @@ static const struct file_operations ftrace_filter_fops = { .open = ftrace_filter_open, .read = seq_read, .write = ftrace_filter_write, - .llseek = ftrace_filter_lseek, + .llseek = tracing_lseek, .release = ftrace_regex_release, }; @@ -3775,7 +3833,7 @@ static const struct file_operations ftrace_notrace_fops = { .open = ftrace_notrace_open, .read = seq_read, .write = ftrace_notrace_write, - .llseek = ftrace_filter_lseek, + .llseek = tracing_lseek, .release = ftrace_regex_release, }; @@ -4038,7 +4096,7 @@ static const struct file_operations ftrace_graph_fops = { .open = ftrace_graph_open, .read = seq_read, .write = ftrace_graph_write, - .llseek = ftrace_filter_lseek, + .llseek = tracing_lseek, .release = ftrace_graph_release, }; @@ -4046,7 +4104,7 @@ static const struct file_operations ftrace_graph_notrace_fops = { .open = ftrace_graph_notrace_open, .read = seq_read, .write = ftrace_graph_write, - .llseek = ftrace_filter_lseek, + .llseek = tracing_lseek, .release = ftrace_graph_release, }; #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ @@ -4719,7 +4777,7 @@ static const struct file_operations ftrace_pid_fops = { .open = ftrace_pid_open, .write = ftrace_pid_write, .read = seq_read, - .llseek = ftrace_filter_lseek, + .llseek = tracing_lseek, .release = ftrace_pid_release, }; @@ -4862,6 +4920,7 @@ int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace) trace_func_graph_ret_t ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; trace_func_graph_ent_t ftrace_graph_entry = ftrace_graph_entry_stub; +static trace_func_graph_ent_t __ftrace_graph_entry = ftrace_graph_entry_stub; /* Try to assign a return stack array on FTRACE_RETSTACK_ALLOC_SIZE tasks. */ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list) @@ -5003,6 +5062,30 @@ static struct ftrace_ops fgraph_ops __read_mostly = { FTRACE_OPS_FL_RECURSION_SAFE, }; +static int ftrace_graph_entry_test(struct ftrace_graph_ent *trace) +{ + if (!ftrace_ops_test(&global_ops, trace->func, NULL)) + return 0; + return __ftrace_graph_entry(trace); +} + +/* + * The function graph tracer should only trace the functions defined + * by set_ftrace_filter and set_ftrace_notrace. If another function + * tracer ops is registered, the graph tracer requires testing the + * function against the global ops, and not just trace any function + * that any ftrace_ops registered. + */ +static void update_function_graph_func(void) +{ + if (ftrace_ops_list == &ftrace_list_end || + (ftrace_ops_list == &global_ops && + global_ops.next == &ftrace_list_end)) + ftrace_graph_entry = __ftrace_graph_entry; + else + ftrace_graph_entry = ftrace_graph_entry_test; +} + int register_ftrace_graph(trace_func_graph_ret_t retfunc, trace_func_graph_ent_t entryfunc) { @@ -5027,7 +5110,16 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc, } ftrace_graph_return = retfunc; - ftrace_graph_entry = entryfunc; + + /* + * Update the indirect function to the entryfunc, and the + * function that gets called to the entry_test first. Then + * call the update fgraph entry function to determine if + * the entryfunc should be called directly or not. + */ + __ftrace_graph_entry = entryfunc; + ftrace_graph_entry = ftrace_graph_entry_test; + update_function_graph_func(); ret = ftrace_startup(&fgraph_ops, FTRACE_START_FUNC_RET); @@ -5046,6 +5138,7 @@ void unregister_ftrace_graph(void) ftrace_graph_active--; ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; ftrace_graph_entry = ftrace_graph_entry_stub; + __ftrace_graph_entry = ftrace_graph_entry_stub; ftrace_shutdown(&fgraph_ops, FTRACE_STOP_FUNC_RET); unregister_pm_notifier(&ftrace_suspend_notifier); unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9d20cd9..815c878 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -455,6 +455,9 @@ int __trace_puts(unsigned long ip, const char *str, int size) unsigned long irq_flags; int alloc; + if (unlikely(tracing_selftest_running || tracing_disabled)) + return 0; + alloc = sizeof(*entry) + size + 2; /* possible \n added */ local_save_flags(irq_flags); @@ -495,6 +498,9 @@ int __trace_bputs(unsigned long ip, const char *str) unsigned long irq_flags; int size = sizeof(struct bputs_entry); + if (unlikely(tracing_selftest_running || tracing_disabled)) + return 0; + local_save_flags(irq_flags); buffer = global_trace.trace_buffer.buffer; event = trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size, @@ -595,6 +601,28 @@ void free_snapshot(struct trace_array *tr) } /** + * tracing_alloc_snapshot - allocate snapshot buffer. + * + * This only allocates the snapshot buffer if it isn't already + * allocated - it doesn't also take a snapshot. + * + * This is meant to be used in cases where the snapshot buffer needs + * to be set up for events that can't sleep but need to be able to + * trigger a snapshot. + */ +int tracing_alloc_snapshot(void) +{ + struct trace_array *tr = &global_trace; + int ret; + + ret = alloc_snapshot(tr); + WARN_ON(ret < 0); + + return ret; +} +EXPORT_SYMBOL_GPL(tracing_alloc_snapshot); + +/** * trace_snapshot_alloc - allocate and take a snapshot of the current buffer. * * This is similar to trace_snapshot(), but it will allocate the @@ -607,11 +635,10 @@ void free_snapshot(struct trace_array *tr) */ void tracing_snapshot_alloc(void) { - struct trace_array *tr = &global_trace; int ret; - ret = alloc_snapshot(tr); - if (WARN_ON(ret < 0)) + ret = tracing_alloc_snapshot(); + if (ret < 0) return; tracing_snapshot(); @@ -623,6 +650,12 @@ void tracing_snapshot(void) WARN_ONCE(1, "Snapshot feature not enabled, but internal snapshot used"); } EXPORT_SYMBOL_GPL(tracing_snapshot); +int tracing_alloc_snapshot(void) +{ + WARN_ONCE(1, "Snapshot feature not enabled, but snapshot allocation used"); + return -ENODEV; +} +EXPORT_SYMBOL_GPL(tracing_alloc_snapshot); void tracing_snapshot_alloc(void) { /* Give warning */ @@ -3156,19 +3189,23 @@ tracing_write_stub(struct file *filp, const char __user *ubuf, return count; } -static loff_t tracing_seek(struct file *file, loff_t offset, int origin) +loff_t tracing_lseek(struct file *file, loff_t offset, int whence) { + int ret; + if (file->f_mode & FMODE_READ) - return seq_lseek(file, offset, origin); + ret = seq_lseek(file, offset, whence); else - return 0; + file->f_pos = ret = 0; + + return ret; } static const struct file_operations tracing_fops = { .open = tracing_open, .read = seq_read, .write = tracing_write_stub, - .llseek = tracing_seek, + .llseek = tracing_lseek, .release = tracing_release, }; @@ -3488,60 +3525,103 @@ static const char readme_msg[] = " instances\t\t- Make sub-buffers with: mkdir instances/foo\n" "\t\t\t Remove sub-buffer with rmdir\n" " trace_options\t\t- Set format or modify how tracing happens\n" - "\t\t\t Disable an option by adding a suffix 'no' to the option name\n" + "\t\t\t Disable an option by adding a suffix 'no' to the\n" + "\t\t\t option name\n" #ifdef CONFIG_DYNAMIC_FTRACE "\n available_filter_functions - list of functions that can be filtered on\n" - " set_ftrace_filter\t- echo function name in here to only trace these functions\n" - " accepts: func_full_name, *func_end, func_begin*, *func_middle*\n" - " modules: Can select a group via module\n" - " Format: :mod:<module-name>\n" - " example: echo :mod:ext3 > set_ftrace_filter\n" - " triggers: a command to perform when function is hit\n" - " Format: <function>:<trigger>[:count]\n" - " trigger: traceon, traceoff\n" - " enable_event:<system>:<event>\n" - " disable_event:<system>:<event>\n" + " set_ftrace_filter\t- echo function name in here to only trace these\n" + "\t\t\t functions\n" + "\t accepts: func_full_name, *func_end, func_begin*, *func_middle*\n" + "\t modules: Can select a group via module\n" + "\t Format: :mod:<module-name>\n" + "\t example: echo :mod:ext3 > set_ftrace_filter\n" + "\t triggers: a command to perform when function is hit\n" + "\t Format: <function>:<trigger>[:count]\n" + "\t trigger: traceon, traceoff\n" + "\t\t enable_event:<system>:<event>\n" + "\t\t disable_event:<system>:<event>\n" #ifdef CONFIG_STACKTRACE - " stacktrace\n" + "\t\t stacktrace\n" #endif #ifdef CONFIG_TRACER_SNAPSHOT - " snapshot\n" + "\t\t snapshot\n" #endif - " example: echo do_fault:traceoff > set_ftrace_filter\n" - " echo do_trap:traceoff:3 > set_ftrace_filter\n" - " The first one will disable tracing every time do_fault is hit\n" - " The second will disable tracing at most 3 times when do_trap is hit\n" - " The first time do trap is hit and it disables tracing, the counter\n" - " will decrement to 2. If tracing is already disabled, the counter\n" - " will not decrement. It only decrements when the trigger did work\n" - " To remove trigger without count:\n" - " echo '!<function>:<trigger> > set_ftrace_filter\n" - " To remove trigger with a count:\n" - " echo '!<function>:<trigger>:0 > set_ftrace_filter\n" + "\t example: echo do_fault:traceoff > set_ftrace_filter\n" + "\t echo do_trap:traceoff:3 > set_ftrace_filter\n" + "\t The first one will disable tracing every time do_fault is hit\n" + "\t The second will disable tracing at most 3 times when do_trap is hit\n" + "\t The first time do trap is hit and it disables tracing, the\n" + "\t counter will decrement to 2. If tracing is already disabled,\n" + "\t the counter will not decrement. It only decrements when the\n" + "\t trigger did work\n" + "\t To remove trigger without count:\n" + "\t echo '!<function>:<trigger> > set_ftrace_filter\n" + "\t To remove trigger with a count:\n" + "\t echo '!<function>:<trigger>:0 > set_ftrace_filter\n" " set_ftrace_notrace\t- echo function name in here to never trace.\n" - " accepts: func_full_name, *func_end, func_begin*, *func_middle*\n" - " modules: Can select a group via module command :mod:\n" - " Does not accept triggers\n" + "\t accepts: func_full_name, *func_end, func_begin*, *func_middle*\n" + "\t modules: Can select a group via module command :mod:\n" + "\t Does not accept triggers\n" #endif /* CONFIG_DYNAMIC_FTRACE */ #ifdef CONFIG_FUNCTION_TRACER - " set_ftrace_pid\t- Write pid(s) to only function trace those pids (function)\n" + " set_ftrace_pid\t- Write pid(s) to only function trace those pids\n" + "\t\t (function)\n" #endif #ifdef CONFIG_FUNCTION_GRAPH_TRACER " set_graph_function\t- Trace the nested calls of a function (function_graph)\n" " max_graph_depth\t- Trace a limited depth of nested calls (0 is unlimited)\n" #endif #ifdef CONFIG_TRACER_SNAPSHOT - "\n snapshot\t\t- Like 'trace' but shows the content of the static snapshot buffer\n" - "\t\t\t Read the contents for more information\n" + "\n snapshot\t\t- Like 'trace' but shows the content of the static\n" + "\t\t\t snapshot buffer. Read the contents for more\n" + "\t\t\t information\n" #endif #ifdef CONFIG_STACK_TRACER " stack_trace\t\t- Shows the max stack trace when active\n" " stack_max_size\t- Shows current max stack size that was traced\n" - "\t\t\t Write into this file to reset the max size (trigger a new trace)\n" + "\t\t\t Write into this file to reset the max size (trigger a\n" + "\t\t\t new trace)\n" #ifdef CONFIG_DYNAMIC_FTRACE - " stack_trace_filter\t- Like set_ftrace_filter but limits what stack_trace traces\n" + " stack_trace_filter\t- Like set_ftrace_filter but limits what stack_trace\n" + "\t\t\t traces\n" #endif #endif /* CONFIG_STACK_TRACER */ + " events/\t\t- Directory containing all trace event subsystems:\n" + " enable\t\t- Write 0/1 to enable/disable tracing of all events\n" + " events/<system>/\t- Directory containing all trace events for <system>:\n" + " enable\t\t- Write 0/1 to enable/disable tracing of all <system>\n" + "\t\t\t events\n" + " filter\t\t- If set, only events passing filter are traced\n" + " events/<system>/<event>/\t- Directory containing control files for\n" + "\t\t\t <event>:\n" + " enable\t\t- Write 0/1 to enable/disable tracing of <event>\n" + " filter\t\t- If set, only events passing filter are traced\n" + " trigger\t\t- If set, a command to perform when event is hit\n" + "\t Format: <trigger>[:count][if <filter>]\n" + "\t trigger: traceon, traceoff\n" + "\t enable_event:<system>:<event>\n" + "\t disable_event:<system>:<event>\n" +#ifdef CONFIG_STACKTRACE + "\t\t stacktrace\n" +#endif +#ifdef CONFIG_TRACER_SNAPSHOT + "\t\t snapshot\n" +#endif + "\t example: echo traceoff > events/block/block_unplug/trigger\n" + "\t echo traceoff:3 > events/block/block_unplug/trigger\n" + "\t echo 'enable_event:kmem:kmalloc:3 if nr_rq > 1' > \\\n" + "\t events/block/block_unplug/trigger\n" + "\t The first disables tracing every time block_unplug is hit.\n" + "\t The second disables tracing the first 3 times block_unplug is hit.\n" + "\t The third enables the kmalloc event the first 3 times block_unplug\n" + "\t is hit and has value of greater than 1 for the 'nr_rq' event field.\n" + "\t Like function triggers, the counter is only decremented if it\n" + "\t enabled or disabled tracing.\n" + "\t To remove a trigger without a count:\n" + "\t echo '!<trigger> > <system>/<event>/trigger\n" + "\t To remove a trigger with a count:\n" + "\t echo '!<trigger>:0 > <system>/<event>/trigger\n" + "\t Filters can be ignored when removing a trigger.\n" ; static ssize_t @@ -4212,12 +4292,6 @@ out: return sret; } -static void tracing_pipe_buf_release(struct pipe_inode_info *pipe, - struct pipe_buffer *buf) -{ - __free_page(buf->page); -} - static void tracing_spd_release_pipe(struct splice_pipe_desc *spd, unsigned int idx) { @@ -4229,7 +4303,7 @@ static const struct pipe_buf_operations tracing_pipe_buf_ops = { .map = generic_pipe_buf_map, .unmap = generic_pipe_buf_unmap, .confirm = generic_pipe_buf_confirm, - .release = tracing_pipe_buf_release, + .release = generic_pipe_buf_release, .steal = generic_pipe_buf_steal, .get = generic_pipe_buf_get, }; @@ -4913,7 +4987,7 @@ static const struct file_operations snapshot_fops = { .open = tracing_snapshot_open, .read = seq_read, .write = tracing_snapshot_write, - .llseek = tracing_seek, + .llseek = tracing_lseek, .release = tracing_snapshot_release, }; @@ -5883,6 +5957,8 @@ allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size rb_flags = trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0; + buf->tr = tr; + buf->buffer = ring_buffer_alloc(size, rb_flags); if (!buf->buffer) return -ENOMEM; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index ea189e0..02b592f 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1,3 +1,4 @@ + #ifndef _LINUX_KERNEL_TRACE_H #define _LINUX_KERNEL_TRACE_H @@ -587,6 +588,8 @@ void tracing_start_sched_switch_record(void); int register_tracer(struct tracer *type); int is_tracing_stopped(void); +loff_t tracing_lseek(struct file *file, loff_t offset, int whence); + extern cpumask_var_t __read_mostly tracing_buffer_mask; #define for_each_tracing_cpu(cpu) \ @@ -1020,6 +1023,10 @@ extern int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir, extern void print_subsystem_event_filter(struct event_subsystem *system, struct trace_seq *s); extern int filter_assign_type(const char *type); +extern int create_event_filter(struct ftrace_event_call *call, + char *filter_str, bool set_str, + struct event_filter **filterp); +extern void free_event_filter(struct event_filter *filter); struct ftrace_event_field * trace_find_event_field(struct ftrace_event_call *call, char *name); @@ -1028,9 +1035,195 @@ extern void trace_event_enable_cmd_record(bool enable); extern int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr); extern int event_trace_del_tracer(struct trace_array *tr); +extern struct ftrace_event_file *find_event_file(struct trace_array *tr, + const char *system, + const char *event); + +static inline void *event_file_data(struct file *filp) +{ + return ACCESS_ONCE(file_inode(filp)->i_private); +} + extern struct mutex event_mutex; extern struct list_head ftrace_events; +extern const struct file_operations event_trigger_fops; + +extern int register_trigger_cmds(void); +extern void clear_event_triggers(struct trace_array *tr); + +struct event_trigger_data { + unsigned long count; + int ref; + struct event_trigger_ops *ops; + struct event_command *cmd_ops; + struct event_filter __rcu *filter; + char *filter_str; + void *private_data; + struct list_head list; +}; + +/** + * struct event_trigger_ops - callbacks for trace event triggers + * + * The methods in this structure provide per-event trigger hooks for + * various trigger operations. + * + * All the methods below, except for @init() and @free(), must be + * implemented. + * + * @func: The trigger 'probe' function called when the triggering + * event occurs. The data passed into this callback is the data + * that was supplied to the event_command @reg() function that + * registered the trigger (see struct event_command). + * + * @init: An optional initialization function called for the trigger + * when the trigger is registered (via the event_command reg() + * function). This can be used to perform per-trigger + * initialization such as incrementing a per-trigger reference + * count, for instance. This is usually implemented by the + * generic utility function @event_trigger_init() (see + * trace_event_triggers.c). + * + * @free: An optional de-initialization function called for the + * trigger when the trigger is unregistered (via the + * event_command @reg() function). This can be used to perform + * per-trigger de-initialization such as decrementing a + * per-trigger reference count and freeing corresponding trigger + * data, for instance. This is usually implemented by the + * generic utility function @event_trigger_free() (see + * trace_event_triggers.c). + * + * @print: The callback function invoked to have the trigger print + * itself. This is usually implemented by a wrapper function + * that calls the generic utility function @event_trigger_print() + * (see trace_event_triggers.c). + */ +struct event_trigger_ops { + void (*func)(struct event_trigger_data *data); + int (*init)(struct event_trigger_ops *ops, + struct event_trigger_data *data); + void (*free)(struct event_trigger_ops *ops, + struct event_trigger_data *data); + int (*print)(struct seq_file *m, + struct event_trigger_ops *ops, + struct event_trigger_data *data); +}; + +/** + * struct event_command - callbacks and data members for event commands + * + * Event commands are invoked by users by writing the command name + * into the 'trigger' file associated with a trace event. The + * parameters associated with a specific invocation of an event + * command are used to create an event trigger instance, which is + * added to the list of trigger instances associated with that trace + * event. When the event is hit, the set of triggers associated with + * that event is invoked. + * + * The data members in this structure provide per-event command data + * for various event commands. + * + * All the data members below, except for @post_trigger, must be set + * for each event command. + * + * @name: The unique name that identifies the event command. This is + * the name used when setting triggers via trigger files. + * + * @trigger_type: A unique id that identifies the event command + * 'type'. This value has two purposes, the first to ensure that + * only one trigger of the same type can be set at a given time + * for a particular event e.g. it doesn't make sense to have both + * a traceon and traceoff trigger attached to a single event at + * the same time, so traceon and traceoff have the same type + * though they have different names. The @trigger_type value is + * also used as a bit value for deferring the actual trigger + * action until after the current event is finished. Some + * commands need to do this if they themselves log to the trace + * buffer (see the @post_trigger() member below). @trigger_type + * values are defined by adding new values to the trigger_type + * enum in include/linux/ftrace_event.h. + * + * @post_trigger: A flag that says whether or not this command needs + * to have its action delayed until after the current event has + * been closed. Some triggers need to avoid being invoked while + * an event is currently in the process of being logged, since + * the trigger may itself log data into the trace buffer. Thus + * we make sure the current event is committed before invoking + * those triggers. To do that, the trigger invocation is split + * in two - the first part checks the filter using the current + * trace record; if a command has the @post_trigger flag set, it + * sets a bit for itself in the return value, otherwise it + * directly invokes the trigger. Once all commands have been + * either invoked or set their return flag, the current record is + * either committed or discarded. At that point, if any commands + * have deferred their triggers, those commands are finally + * invoked following the close of the current event. In other + * words, if the event_trigger_ops @func() probe implementation + * itself logs to the trace buffer, this flag should be set, + * otherwise it can be left unspecified. + * + * All the methods below, except for @set_filter(), must be + * implemented. + * + * @func: The callback function responsible for parsing and + * registering the trigger written to the 'trigger' file by the + * user. It allocates the trigger instance and registers it with + * the appropriate trace event. It makes use of the other + * event_command callback functions to orchestrate this, and is + * usually implemented by the generic utility function + * @event_trigger_callback() (see trace_event_triggers.c). + * + * @reg: Adds the trigger to the list of triggers associated with the + * event, and enables the event trigger itself, after + * initializing it (via the event_trigger_ops @init() function). + * This is also where commands can use the @trigger_type value to + * make the decision as to whether or not multiple instances of + * the trigger should be allowed. This is usually implemented by + * the generic utility function @register_trigger() (see + * trace_event_triggers.c). + * + * @unreg: Removes the trigger from the list of triggers associated + * with the event, and disables the event trigger itself, after + * initializing it (via the event_trigger_ops @free() function). + * This is usually implemented by the generic utility function + * @unregister_trigger() (see trace_event_triggers.c). + * + * @set_filter: An optional function called to parse and set a filter + * for the trigger. If no @set_filter() method is set for the + * event command, filters set by the user for the command will be + * ignored. This is usually implemented by the generic utility + * function @set_trigger_filter() (see trace_event_triggers.c). + * + * @get_trigger_ops: The callback function invoked to retrieve the + * event_trigger_ops implementation associated with the command. + */ +struct event_command { + struct list_head list; + char *name; + enum event_trigger_type trigger_type; + bool post_trigger; + int (*func)(struct event_command *cmd_ops, + struct ftrace_event_file *file, + char *glob, char *cmd, char *params); + int (*reg)(char *glob, + struct event_trigger_ops *ops, + struct event_trigger_data *data, + struct ftrace_event_file *file); + void (*unreg)(char *glob, + struct event_trigger_ops *ops, + struct event_trigger_data *data, + struct ftrace_event_file *file); + int (*set_filter)(char *filter_str, + struct event_trigger_data *data, + struct ftrace_event_file *file); + struct event_trigger_ops *(*get_trigger_ops)(char *cmd, char *param); +}; + +extern int trace_event_enable_disable(struct ftrace_event_file *file, + int enable, int soft_disable); +extern int tracing_alloc_snapshot(void); + extern const char *__start___trace_bprintk_fmt[]; extern const char *__stop___trace_bprintk_fmt[]; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index a11800a..e71ffd4 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -342,6 +342,12 @@ static int __ftrace_event_enable_disable(struct ftrace_event_file *file, return ret; } +int trace_event_enable_disable(struct ftrace_event_file *file, + int enable, int soft_disable) +{ + return __ftrace_event_enable_disable(file, enable, soft_disable); +} + static int ftrace_event_enable_disable(struct ftrace_event_file *file, int enable) { @@ -421,11 +427,6 @@ static void remove_subsystem(struct ftrace_subsystem_dir *dir) } } -static void *event_file_data(struct file *filp) -{ - return ACCESS_ONCE(file_inode(filp)->i_private); -} - static void remove_event_file_dir(struct ftrace_event_file *file) { struct dentry *dir = file->dir; @@ -1549,6 +1550,9 @@ event_create_dir(struct dentry *parent, struct ftrace_event_file *file) trace_create_file("filter", 0644, file->dir, file, &ftrace_event_filter_fops); + trace_create_file("trigger", 0644, file->dir, file, + &event_trigger_fops); + trace_create_file("format", 0444, file->dir, call, &ftrace_event_format_fops); @@ -1645,6 +1649,8 @@ trace_create_new_event(struct ftrace_event_call *call, file->event_call = call; file->tr = tr; atomic_set(&file->sm_ref, 0); + atomic_set(&file->tm_ref, 0); + INIT_LIST_HEAD(&file->triggers); list_add(&file->list, &tr->events); return file; @@ -1849,20 +1855,7 @@ __trace_add_event_dirs(struct trace_array *tr) } } -#ifdef CONFIG_DYNAMIC_FTRACE - -/* Avoid typos */ -#define ENABLE_EVENT_STR "enable_event" -#define DISABLE_EVENT_STR "disable_event" - -struct event_probe_data { - struct ftrace_event_file *file; - unsigned long count; - int ref; - bool enable; -}; - -static struct ftrace_event_file * +struct ftrace_event_file * find_event_file(struct trace_array *tr, const char *system, const char *event) { struct ftrace_event_file *file; @@ -1885,6 +1878,19 @@ find_event_file(struct trace_array *tr, const char *system, const char *event) return NULL; } +#ifdef CONFIG_DYNAMIC_FTRACE + +/* Avoid typos */ +#define ENABLE_EVENT_STR "enable_event" +#define DISABLE_EVENT_STR "disable_event" + +struct event_probe_data { + struct ftrace_event_file *file; + unsigned long count; + int ref; + bool enable; +}; + static void event_enable_probe(unsigned long ip, unsigned long parent_ip, void **_data) { @@ -2311,6 +2317,9 @@ int event_trace_del_tracer(struct trace_array *tr) { mutex_lock(&event_mutex); + /* Disable any event triggers and associated soft-disabled events */ + clear_event_triggers(tr); + /* Disable any running events */ __ftrace_set_clr_event_nolock(tr, NULL, NULL, NULL, 0); @@ -2377,6 +2386,8 @@ static __init int event_trace_enable(void) register_event_cmds(); + register_trigger_cmds(); + return 0; } diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 2468f56..8a86319 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -799,6 +799,11 @@ static void __free_filter(struct event_filter *filter) kfree(filter); } +void free_event_filter(struct event_filter *filter) +{ + __free_filter(filter); +} + void destroy_call_preds(struct ftrace_event_call *call) { __free_filter(call->filter); @@ -1938,6 +1943,13 @@ static int create_filter(struct ftrace_event_call *call, return err; } +int create_event_filter(struct ftrace_event_call *call, + char *filter_str, bool set_str, + struct event_filter **filterp) +{ + return create_filter(call, filter_str, set_str, filterp); +} + /** * create_system_filter - create a filter for an event_subsystem * @system: event_subsystem to create a filter for diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c new file mode 100644 index 0000000..8efbb69 --- /dev/null +++ b/kernel/trace/trace_events_trigger.c @@ -0,0 +1,1437 @@ +/* + * trace_events_trigger - trace event triggers + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) 2013 Tom Zanussi <tom.zanussi@linux.intel.com> + */ + +#include <linux/module.h> +#include <linux/ctype.h> +#include <linux/mutex.h> +#include <linux/slab.h> + +#include "trace.h" + +static LIST_HEAD(trigger_commands); +static DEFINE_MUTEX(trigger_cmd_mutex); + +static void +trigger_data_free(struct event_trigger_data *data) +{ + if (data->cmd_ops->set_filter) + data->cmd_ops->set_filter(NULL, data, NULL); + + synchronize_sched(); /* make sure current triggers exit before free */ + kfree(data); +} + +/** + * event_triggers_call - Call triggers associated with a trace event + * @file: The ftrace_event_file associated with the event + * @rec: The trace entry for the event, NULL for unconditional invocation + * + * For each trigger associated with an event, invoke the trigger + * function registered with the associated trigger command. If rec is + * non-NULL, it means that the trigger requires further processing and + * shouldn't be unconditionally invoked. If rec is non-NULL and the + * trigger has a filter associated with it, rec will checked against + * the filter and if the record matches the trigger will be invoked. + * If the trigger is a 'post_trigger', meaning it shouldn't be invoked + * in any case until the current event is written, the trigger + * function isn't invoked but the bit associated with the deferred + * trigger is set in the return value. + * + * Returns an enum event_trigger_type value containing a set bit for + * any trigger that should be deferred, ETT_NONE if nothing to defer. + * + * Called from tracepoint handlers (with rcu_read_lock_sched() held). + * + * Return: an enum event_trigger_type value containing a set bit for + * any trigger that should be deferred, ETT_NONE if nothing to defer. + */ +enum event_trigger_type +event_triggers_call(struct ftrace_event_file *file, void *rec) +{ + struct event_trigger_data *data; + enum event_trigger_type tt = ETT_NONE; + struct event_filter *filter; + + if (list_empty(&file->triggers)) + return tt; + + list_for_each_entry_rcu(data, &file->triggers, list) { + if (!rec) { + data->ops->func(data); + continue; + } + filter = rcu_dereference(data->filter); + if (filter && !filter_match_preds(filter, rec)) + continue; + if (data->cmd_ops->post_trigger) { + tt |= data->cmd_ops->trigger_type; + continue; + } + data->ops->func(data); + } + return tt; +} +EXPORT_SYMBOL_GPL(event_triggers_call); + +/** + * event_triggers_post_call - Call 'post_triggers' for a trace event + * @file: The ftrace_event_file associated with the event + * @tt: enum event_trigger_type containing a set bit for each trigger to invoke + * + * For each trigger associated with an event, invoke the trigger + * function registered with the associated trigger command, if the + * corresponding bit is set in the tt enum passed into this function. + * See @event_triggers_call for details on how those bits are set. + * + * Called from tracepoint handlers (with rcu_read_lock_sched() held). + */ +void +event_triggers_post_call(struct ftrace_event_file *file, + enum event_trigger_type tt) +{ + struct event_trigger_data *data; + + list_for_each_entry_rcu(data, &file->triggers, list) { + if (data->cmd_ops->trigger_type & tt) + data->ops->func(data); + } +} +EXPORT_SYMBOL_GPL(event_triggers_post_call); + +#define SHOW_AVAILABLE_TRIGGERS (void *)(1UL) + +static void *trigger_next(struct seq_file *m, void *t, loff_t *pos) +{ + struct ftrace_event_file *event_file = event_file_data(m->private); + + if (t == SHOW_AVAILABLE_TRIGGERS) + return NULL; + + return seq_list_next(t, &event_file->triggers, pos); +} + +static void *trigger_start(struct seq_file *m, loff_t *pos) +{ + struct ftrace_event_file *event_file; + + /* ->stop() is called even if ->start() fails */ + mutex_lock(&event_mutex); + event_file = event_file_data(m->private); + if (unlikely(!event_file)) + return ERR_PTR(-ENODEV); + + if (list_empty(&event_file->triggers)) + return *pos == 0 ? SHOW_AVAILABLE_TRIGGERS : NULL; + + return seq_list_start(&event_file->triggers, *pos); +} + +static void trigger_stop(struct seq_file *m, void *t) +{ + mutex_unlock(&event_mutex); +} + +static int trigger_show(struct seq_file *m, void *v) +{ + struct event_trigger_data *data; + struct event_command *p; + + if (v == SHOW_AVAILABLE_TRIGGERS) { + seq_puts(m, "# Available triggers:\n"); + seq_putc(m, '#'); + mutex_lock(&trigger_cmd_mutex); + list_for_each_entry_reverse(p, &trigger_commands, list) + seq_printf(m, " %s", p->name); + seq_putc(m, '\n'); + mutex_unlock(&trigger_cmd_mutex); + return 0; + } + + data = list_entry(v, struct event_trigger_data, list); + data->ops->print(m, data->ops, data); + + return 0; +} + +static const struct seq_operations event_triggers_seq_ops = { + .start = trigger_start, + .next = trigger_next, + .stop = trigger_stop, + .show = trigger_show, +}; + +static int event_trigger_regex_open(struct inode *inode, struct file *file) +{ + int ret = 0; + + mutex_lock(&event_mutex); + + if (unlikely(!event_file_data(file))) { + mutex_unlock(&event_mutex); + return -ENODEV; + } + + if (file->f_mode & FMODE_READ) { + ret = seq_open(file, &event_triggers_seq_ops); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = file; + } + } + + mutex_unlock(&event_mutex); + + return ret; +} + +static int trigger_process_regex(struct ftrace_event_file *file, char *buff) +{ + char *command, *next = buff; + struct event_command *p; + int ret = -EINVAL; + + command = strsep(&next, ": \t"); + command = (command[0] != '!') ? command : command + 1; + + mutex_lock(&trigger_cmd_mutex); + list_for_each_entry(p, &trigger_commands, list) { + if (strcmp(p->name, command) == 0) { + ret = p->func(p, file, buff, command, next); + goto out_unlock; + } + } + out_unlock: + mutex_unlock(&trigger_cmd_mutex); + + return ret; +} + +static ssize_t event_trigger_regex_write(struct file *file, + const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct ftrace_event_file *event_file; + ssize_t ret; + char *buf; + + if (!cnt) + return 0; + + if (cnt >= PAGE_SIZE) + return -EINVAL; + + buf = (char *)__get_free_page(GFP_TEMPORARY); + if (!buf) + return -ENOMEM; + + if (copy_from_user(buf, ubuf, cnt)) { + free_page((unsigned long)buf); + return -EFAULT; + } + buf[cnt] = '\0'; + strim(buf); + + mutex_lock(&event_mutex); + event_file = event_file_data(file); + if (unlikely(!event_file)) { + mutex_unlock(&event_mutex); + free_page((unsigned long)buf); + return -ENODEV; + } + ret = trigger_process_regex(event_file, buf); + mutex_unlock(&event_mutex); + + free_page((unsigned long)buf); + if (ret < 0) + goto out; + + *ppos += cnt; + ret = cnt; + out: + return ret; +} + +static int event_trigger_regex_release(struct inode *inode, struct file *file) +{ + mutex_lock(&event_mutex); + + if (file->f_mode & FMODE_READ) + seq_release(inode, file); + + mutex_unlock(&event_mutex); + + return 0; +} + +static ssize_t +event_trigger_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return event_trigger_regex_write(filp, ubuf, cnt, ppos); +} + +static int +event_trigger_open(struct inode *inode, struct file *filp) +{ + return event_trigger_regex_open(inode, filp); +} + +static int +event_trigger_release(struct inode *inode, struct file *file) +{ + return event_trigger_regex_release(inode, file); +} + +const struct file_operations event_trigger_fops = { + .open = event_trigger_open, + .read = seq_read, + .write = event_trigger_write, + .llseek = tracing_lseek, + .release = event_trigger_release, +}; + +/* + * Currently we only register event commands from __init, so mark this + * __init too. + */ +static __init int register_event_command(struct event_command *cmd) +{ + struct event_command *p; + int ret = 0; + + mutex_lock(&trigger_cmd_mutex); + list_for_each_entry(p, &trigger_commands, list) { + if (strcmp(cmd->name, p->name) == 0) { + ret = -EBUSY; + goto out_unlock; + } + } + list_add(&cmd->list, &trigger_commands); + out_unlock: + mutex_unlock(&trigger_cmd_mutex); + + return ret; +} + +/* + * Currently we only unregister event commands from __init, so mark + * this __init too. + */ +static __init int unregister_event_command(struct event_command *cmd) +{ + struct event_command *p, *n; + int ret = -ENODEV; + + mutex_lock(&trigger_cmd_mutex); + list_for_each_entry_safe(p, n, &trigger_commands, list) { + if (strcmp(cmd->name, p->name) == 0) { + ret = 0; + list_del_init(&p->list); + goto out_unlock; + } + } + out_unlock: + mutex_unlock(&trigger_cmd_mutex); + + return ret; +} + +/** + * event_trigger_print - Generic event_trigger_ops @print implementation + * @name: The name of the event trigger + * @m: The seq_file being printed to + * @data: Trigger-specific data + * @filter_str: filter_str to print, if present + * + * Common implementation for event triggers to print themselves. + * + * Usually wrapped by a function that simply sets the @name of the + * trigger command and then invokes this. + * + * Return: 0 on success, errno otherwise + */ +static int +event_trigger_print(const char *name, struct seq_file *m, + void *data, char *filter_str) +{ + long count = (long)data; + + seq_printf(m, "%s", name); + + if (count == -1) + seq_puts(m, ":unlimited"); + else + seq_printf(m, ":count=%ld", count); + + if (filter_str) + seq_printf(m, " if %s\n", filter_str); + else + seq_puts(m, "\n"); + + return 0; +} + +/** + * event_trigger_init - Generic event_trigger_ops @init implementation + * @ops: The trigger ops associated with the trigger + * @data: Trigger-specific data + * + * Common implementation of event trigger initialization. + * + * Usually used directly as the @init method in event trigger + * implementations. + * + * Return: 0 on success, errno otherwise + */ +static int +event_trigger_init(struct event_trigger_ops *ops, + struct event_trigger_data *data) +{ + data->ref++; + return 0; +} + +/** + * event_trigger_free - Generic event_trigger_ops @free implementation + * @ops: The trigger ops associated with the trigger + * @data: Trigger-specific data + * + * Common implementation of event trigger de-initialization. + * + * Usually used directly as the @free method in event trigger + * implementations. + */ +static void +event_trigger_free(struct event_trigger_ops *ops, + struct event_trigger_data *data) +{ + if (WARN_ON_ONCE(data->ref <= 0)) + return; + + data->ref--; + if (!data->ref) + trigger_data_free(data); +} + +static int trace_event_trigger_enable_disable(struct ftrace_event_file *file, + int trigger_enable) +{ + int ret = 0; + + if (trigger_enable) { + if (atomic_inc_return(&file->tm_ref) > 1) + return ret; + set_bit(FTRACE_EVENT_FL_TRIGGER_MODE_BIT, &file->flags); + ret = trace_event_enable_disable(file, 1, 1); + } else { + if (atomic_dec_return(&file->tm_ref) > 0) + return ret; + clear_bit(FTRACE_EVENT_FL_TRIGGER_MODE_BIT, &file->flags); + ret = trace_event_enable_disable(file, 0, 1); + } + + return ret; +} + +/** + * clear_event_triggers - Clear all triggers associated with a trace array + * @tr: The trace array to clear + * + * For each trigger, the triggering event has its tm_ref decremented + * via trace_event_trigger_enable_disable(), and any associated event + * (in the case of enable/disable_event triggers) will have its sm_ref + * decremented via free()->trace_event_enable_disable(). That + * combination effectively reverses the soft-mode/trigger state added + * by trigger registration. + * + * Must be called with event_mutex held. + */ +void +clear_event_triggers(struct trace_array *tr) +{ + struct ftrace_event_file *file; + + list_for_each_entry(file, &tr->events, list) { + struct event_trigger_data *data; + list_for_each_entry_rcu(data, &file->triggers, list) { + trace_event_trigger_enable_disable(file, 0); + if (data->ops->free) + data->ops->free(data->ops, data); + } + } +} + +/** + * update_cond_flag - Set or reset the TRIGGER_COND bit + * @file: The ftrace_event_file associated with the event + * + * If an event has triggers and any of those triggers has a filter or + * a post_trigger, trigger invocation needs to be deferred until after + * the current event has logged its data, and the event should have + * its TRIGGER_COND bit set, otherwise the TRIGGER_COND bit should be + * cleared. + */ +static void update_cond_flag(struct ftrace_event_file *file) +{ + struct event_trigger_data *data; + bool set_cond = false; + + list_for_each_entry_rcu(data, &file->triggers, list) { + if (data->filter || data->cmd_ops->post_trigger) { + set_cond = true; + break; + } + } + + if (set_cond) + set_bit(FTRACE_EVENT_FL_TRIGGER_COND_BIT, &file->flags); + else + clear_bit(FTRACE_EVENT_FL_TRIGGER_COND_BIT, &file->flags); +} + +/** + * register_trigger - Generic event_command @reg implementation + * @glob: The raw string used to register the trigger + * @ops: The trigger ops associated with the trigger + * @data: Trigger-specific data to associate with the trigger + * @file: The ftrace_event_file associated with the event + * + * Common implementation for event trigger registration. + * + * Usually used directly as the @reg method in event command + * implementations. + * + * Return: 0 on success, errno otherwise + */ +static int register_trigger(char *glob, struct event_trigger_ops *ops, + struct event_trigger_data *data, + struct ftrace_event_file *file) +{ + struct event_trigger_data *test; + int ret = 0; + + list_for_each_entry_rcu(test, &file->triggers, list) { + if (test->cmd_ops->trigger_type == data->cmd_ops->trigger_type) { + ret = -EEXIST; + goto out; + } + } + + if (data->ops->init) { + ret = data->ops->init(data->ops, data); + if (ret < 0) + goto out; + } + + list_add_rcu(&data->list, &file->triggers); + ret++; + + if (trace_event_trigger_enable_disable(file, 1) < 0) { + list_del_rcu(&data->list); + ret--; + } + update_cond_flag(file); +out: + return ret; +} + +/** + * unregister_trigger - Generic event_command @unreg implementation + * @glob: The raw string used to register the trigger + * @ops: The trigger ops associated with the trigger + * @test: Trigger-specific data used to find the trigger to remove + * @file: The ftrace_event_file associated with the event + * + * Common implementation for event trigger unregistration. + * + * Usually used directly as the @unreg method in event command + * implementations. + */ +static void unregister_trigger(char *glob, struct event_trigger_ops *ops, + struct event_trigger_data *test, + struct ftrace_event_file *file) +{ + struct event_trigger_data *data; + bool unregistered = false; + + list_for_each_entry_rcu(data, &file->triggers, list) { + if (data->cmd_ops->trigger_type == test->cmd_ops->trigger_type) { + unregistered = true; + list_del_rcu(&data->list); + update_cond_flag(file); + trace_event_trigger_enable_disable(file, 0); + break; + } + } + + if (unregistered && data->ops->free) + data->ops->free(data->ops, data); +} + +/** + * event_trigger_callback - Generic event_command @func implementation + * @cmd_ops: The command ops, used for trigger registration + * @file: The ftrace_event_file associated with the event + * @glob: The raw string used to register the trigger + * @cmd: The cmd portion of the string used to register the trigger + * @param: The params portion of the string used to register the trigger + * + * Common implementation for event command parsing and trigger + * instantiation. + * + * Usually used directly as the @func method in event command + * implementations. + * + * Return: 0 on success, errno otherwise + */ +static int +event_trigger_callback(struct event_command *cmd_ops, + struct ftrace_event_file *file, + char *glob, char *cmd, char *param) +{ + struct event_trigger_data *trigger_data; + struct event_trigger_ops *trigger_ops; + char *trigger = NULL; + char *number; + int ret; + + /* separate the trigger from the filter (t:n [if filter]) */ + if (param && isdigit(param[0])) + trigger = strsep(¶m, " \t"); + + trigger_ops = cmd_ops->get_trigger_ops(cmd, trigger); + + ret = -ENOMEM; + trigger_data = kzalloc(sizeof(*trigger_data), GFP_KERNEL); + if (!trigger_data) + goto out; + + trigger_data->count = -1; + trigger_data->ops = trigger_ops; + trigger_data->cmd_ops = cmd_ops; + INIT_LIST_HEAD(&trigger_data->list); + + if (glob[0] == '!') { + cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file); + kfree(trigger_data); + ret = 0; + goto out; + } + + if (trigger) { + number = strsep(&trigger, ":"); + + ret = -EINVAL; + if (!strlen(number)) + goto out_free; + + /* + * We use the callback data field (which is a pointer) + * as our counter. + */ + ret = kstrtoul(number, 0, &trigger_data->count); + if (ret) + goto out_free; + } + + if (!param) /* if param is non-empty, it's supposed to be a filter */ + goto out_reg; + + if (!cmd_ops->set_filter) + goto out_reg; + + ret = cmd_ops->set_filter(param, trigger_data, file); + if (ret < 0) + goto out_free; + + out_reg: + ret = cmd_ops->reg(glob, trigger_ops, trigger_data, file); + /* + * The above returns on success the # of functions enabled, + * but if it didn't find any functions it returns zero. + * Consider no functions a failure too. + */ + if (!ret) { + ret = -ENOENT; + goto out_free; + } else if (ret < 0) + goto out_free; + ret = 0; + out: + return ret; + + out_free: + if (cmd_ops->set_filter) + cmd_ops->set_filter(NULL, trigger_data, NULL); + kfree(trigger_data); + goto out; +} + +/** + * set_trigger_filter - Generic event_command @set_filter implementation + * @filter_str: The filter string for the trigger, NULL to remove filter + * @trigger_data: Trigger-specific data + * @file: The ftrace_event_file associated with the event + * + * Common implementation for event command filter parsing and filter + * instantiation. + * + * Usually used directly as the @set_filter method in event command + * implementations. + * + * Also used to remove a filter (if filter_str = NULL). + * + * Return: 0 on success, errno otherwise + */ +static int set_trigger_filter(char *filter_str, + struct event_trigger_data *trigger_data, + struct ftrace_event_file *file) +{ + struct event_trigger_data *data = trigger_data; + struct event_filter *filter = NULL, *tmp; + int ret = -EINVAL; + char *s; + + if (!filter_str) /* clear the current filter */ + goto assign; + + s = strsep(&filter_str, " \t"); + + if (!strlen(s) || strcmp(s, "if") != 0) + goto out; + + if (!filter_str) + goto out; + + /* The filter is for the 'trigger' event, not the triggered event */ + ret = create_event_filter(file->event_call, filter_str, false, &filter); + if (ret) + goto out; + assign: + tmp = rcu_access_pointer(data->filter); + + rcu_assign_pointer(data->filter, filter); + + if (tmp) { + /* Make sure the call is done with the filter */ + synchronize_sched(); + free_event_filter(tmp); + } + + kfree(data->filter_str); + data->filter_str = NULL; + + if (filter_str) { + data->filter_str = kstrdup(filter_str, GFP_KERNEL); + if (!data->filter_str) { + free_event_filter(rcu_access_pointer(data->filter)); + data->filter = NULL; + ret = -ENOMEM; + } + } + out: + return ret; +} + +static void +traceon_trigger(struct event_trigger_data *data) +{ + if (tracing_is_on()) + return; + + tracing_on(); +} + +static void +traceon_count_trigger(struct event_trigger_data *data) +{ + if (tracing_is_on()) + return; + + if (!data->count) + return; + + if (data->count != -1) + (data->count)--; + + tracing_on(); +} + +static void +traceoff_trigger(struct event_trigger_data *data) +{ + if (!tracing_is_on()) + return; + + tracing_off(); +} + +static void +traceoff_count_trigger(struct event_trigger_data *data) +{ + if (!tracing_is_on()) + return; + + if (!data->count) + return; + + if (data->count != -1) + (data->count)--; + + tracing_off(); +} + +static int +traceon_trigger_print(struct seq_file *m, struct event_trigger_ops *ops, + struct event_trigger_data *data) +{ + return event_trigger_print("traceon", m, (void *)data->count, + data->filter_str); +} + +static int +traceoff_trigger_print(struct seq_file *m, struct event_trigger_ops *ops, + struct event_trigger_data *data) +{ + return event_trigger_print("traceoff", m, (void *)data->count, + data->filter_str); +} + +static struct event_trigger_ops traceon_trigger_ops = { + .func = traceon_trigger, + .print = traceon_trigger_print, + .init = event_trigger_init, + .free = event_trigger_free, +}; + +static struct event_trigger_ops traceon_count_trigger_ops = { + .func = traceon_count_trigger, + .print = traceon_trigger_print, + .init = event_trigger_init, + .free = event_trigger_free, +}; + +static struct event_trigger_ops traceoff_trigger_ops = { + .func = traceoff_trigger, + .print = traceoff_trigger_print, + .init = event_trigger_init, + .free = event_trigger_free, +}; + +static struct event_trigger_ops traceoff_count_trigger_ops = { + .func = traceoff_count_trigger, + .print = traceoff_trigger_print, + .init = event_trigger_init, + .free = event_trigger_free, +}; + +static struct event_trigger_ops * +onoff_get_trigger_ops(char *cmd, char *param) +{ + struct event_trigger_ops *ops; + + /* we register both traceon and traceoff to this callback */ + if (strcmp(cmd, "traceon") == 0) + ops = param ? &traceon_count_trigger_ops : + &traceon_trigger_ops; + else + ops = param ? &traceoff_count_trigger_ops : + &traceoff_trigger_ops; + + return ops; +} + +static struct event_command trigger_traceon_cmd = { + .name = "traceon", + .trigger_type = ETT_TRACE_ONOFF, + .func = event_trigger_callback, + .reg = register_trigger, + .unreg = unregister_trigger, + .get_trigger_ops = onoff_get_trigger_ops, + .set_filter = set_trigger_filter, +}; + +static struct event_command trigger_traceoff_cmd = { + .name = "traceoff", + .trigger_type = ETT_TRACE_ONOFF, + .func = event_trigger_callback, + .reg = register_trigger, + .unreg = unregister_trigger, + .get_trigger_ops = onoff_get_trigger_ops, + .set_filter = set_trigger_filter, +}; + +#ifdef CONFIG_TRACER_SNAPSHOT +static void +snapshot_trigger(struct event_trigger_data *data) +{ + tracing_snapshot(); +} + +static void +snapshot_count_trigger(struct event_trigger_data *data) +{ + if (!data->count) + return; + + if (data->count != -1) + (data->count)--; + + snapshot_trigger(data); +} + +static int +register_snapshot_trigger(char *glob, struct event_trigger_ops *ops, + struct event_trigger_data *data, + struct ftrace_event_file *file) +{ + int ret = register_trigger(glob, ops, data, file); + + if (ret > 0 && tracing_alloc_snapshot() != 0) { + unregister_trigger(glob, ops, data, file); + ret = 0; + } + + return ret; +} + +static int +snapshot_trigger_print(struct seq_file *m, struct event_trigger_ops *ops, + struct event_trigger_data *data) +{ + return event_trigger_print("snapshot", m, (void *)data->count, + data->filter_str); +} + +static struct event_trigger_ops snapshot_trigger_ops = { + .func = snapshot_trigger, + .print = snapshot_trigger_print, + .init = event_trigger_init, + .free = event_trigger_free, +}; + +static struct event_trigger_ops snapshot_count_trigger_ops = { + .func = snapshot_count_trigger, + .print = snapshot_trigger_print, + .init = event_trigger_init, + .free = event_trigger_free, +}; + +static struct event_trigger_ops * +snapshot_get_trigger_ops(char *cmd, char *param) +{ + return param ? &snapshot_count_trigger_ops : &snapshot_trigger_ops; +} + +static struct event_command trigger_snapshot_cmd = { + .name = "snapshot", + .trigger_type = ETT_SNAPSHOT, + .func = event_trigger_callback, + .reg = register_snapshot_trigger, + .unreg = unregister_trigger, + .get_trigger_ops = snapshot_get_trigger_ops, + .set_filter = set_trigger_filter, +}; + +static __init int register_trigger_snapshot_cmd(void) +{ + int ret; + + ret = register_event_command(&trigger_snapshot_cmd); + WARN_ON(ret < 0); + + return ret; +} +#else +static __init int register_trigger_snapshot_cmd(void) { return 0; } +#endif /* CONFIG_TRACER_SNAPSHOT */ + +#ifdef CONFIG_STACKTRACE +/* + * Skip 3: + * stacktrace_trigger() + * event_triggers_post_call() + * ftrace_raw_event_xxx() + */ +#define STACK_SKIP 3 + +static void +stacktrace_trigger(struct event_trigger_data *data) +{ + trace_dump_stack(STACK_SKIP); +} + +static void +stacktrace_count_trigger(struct event_trigger_data *data) +{ + if (!data->count) + return; + + if (data->count != -1) + (data->count)--; + + stacktrace_trigger(data); +} + +static int +stacktrace_trigger_print(struct seq_file *m, struct event_trigger_ops *ops, + struct event_trigger_data *data) +{ + return event_trigger_print("stacktrace", m, (void *)data->count, + data->filter_str); +} + +static struct event_trigger_ops stacktrace_trigger_ops = { + .func = stacktrace_trigger, + .print = stacktrace_trigger_print, + .init = event_trigger_init, + .free = event_trigger_free, +}; + +static struct event_trigger_ops stacktrace_count_trigger_ops = { + .func = stacktrace_count_trigger, + .print = stacktrace_trigger_print, + .init = event_trigger_init, + .free = event_trigger_free, +}; + +static struct event_trigger_ops * +stacktrace_get_trigger_ops(char *cmd, char *param) +{ + return param ? &stacktrace_count_trigger_ops : &stacktrace_trigger_ops; +} + +static struct event_command trigger_stacktrace_cmd = { + .name = "stacktrace", + .trigger_type = ETT_STACKTRACE, + .post_trigger = true, + .func = event_trigger_callback, + .reg = register_trigger, + .unreg = unregister_trigger, + .get_trigger_ops = stacktrace_get_trigger_ops, + .set_filter = set_trigger_filter, +}; + +static __init int register_trigger_stacktrace_cmd(void) +{ + int ret; + + ret = register_event_command(&trigger_stacktrace_cmd); + WARN_ON(ret < 0); + + return ret; +} +#else +static __init int register_trigger_stacktrace_cmd(void) { return 0; } +#endif /* CONFIG_STACKTRACE */ + +static __init void unregister_trigger_traceon_traceoff_cmds(void) +{ + unregister_event_command(&trigger_traceon_cmd); + unregister_event_command(&trigger_traceoff_cmd); +} + +/* Avoid typos */ +#define ENABLE_EVENT_STR "enable_event" +#define DISABLE_EVENT_STR "disable_event" + +struct enable_trigger_data { + struct ftrace_event_file *file; + bool enable; +}; + +static void +event_enable_trigger(struct event_trigger_data *data) +{ + struct enable_trigger_data *enable_data = data->private_data; + + if (enable_data->enable) + clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &enable_data->file->flags); + else + set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &enable_data->file->flags); +} + +static void +event_enable_count_trigger(struct event_trigger_data *data) +{ + struct enable_trigger_data *enable_data = data->private_data; + + if (!data->count) + return; + + /* Skip if the event is in a state we want to switch to */ + if (enable_data->enable == !(enable_data->file->flags & FTRACE_EVENT_FL_SOFT_DISABLED)) + return; + + if (data->count != -1) + (data->count)--; + + event_enable_trigger(data); +} + +static int +event_enable_trigger_print(struct seq_file *m, struct event_trigger_ops *ops, + struct event_trigger_data *data) +{ + struct enable_trigger_data *enable_data = data->private_data; + + seq_printf(m, "%s:%s:%s", + enable_data->enable ? ENABLE_EVENT_STR : DISABLE_EVENT_STR, + enable_data->file->event_call->class->system, + enable_data->file->event_call->name); + + if (data->count == -1) + seq_puts(m, ":unlimited"); + else + seq_printf(m, ":count=%ld", data->count); + + if (data->filter_str) + seq_printf(m, " if %s\n", data->filter_str); + else + seq_puts(m, "\n"); + + return 0; +} + +static void +event_enable_trigger_free(struct event_trigger_ops *ops, + struct event_trigger_data *data) +{ + struct enable_trigger_data *enable_data = data->private_data; + + if (WARN_ON_ONCE(data->ref <= 0)) + return; + + data->ref--; + if (!data->ref) { + /* Remove the SOFT_MODE flag */ + trace_event_enable_disable(enable_data->file, 0, 1); + module_put(enable_data->file->event_call->mod); + trigger_data_free(data); + kfree(enable_data); + } +} + +static struct event_trigger_ops event_enable_trigger_ops = { + .func = event_enable_trigger, + .print = event_enable_trigger_print, + .init = event_trigger_init, + .free = event_enable_trigger_free, +}; + +static struct event_trigger_ops event_enable_count_trigger_ops = { + .func = event_enable_count_trigger, + .print = event_enable_trigger_print, + .init = event_trigger_init, + .free = event_enable_trigger_free, +}; + +static struct event_trigger_ops event_disable_trigger_ops = { + .func = event_enable_trigger, + .print = event_enable_trigger_print, + .init = event_trigger_init, + .free = event_enable_trigger_free, +}; + +static struct event_trigger_ops event_disable_count_trigger_ops = { + .func = event_enable_count_trigger, + .print = event_enable_trigger_print, + .init = event_trigger_init, + .free = event_enable_trigger_free, +}; + +static int +event_enable_trigger_func(struct event_command *cmd_ops, + struct ftrace_event_file *file, + char *glob, char *cmd, char *param) +{ + struct ftrace_event_file *event_enable_file; + struct enable_trigger_data *enable_data; + struct event_trigger_data *trigger_data; + struct event_trigger_ops *trigger_ops; + struct trace_array *tr = file->tr; + const char *system; + const char *event; + char *trigger; + char *number; + bool enable; + int ret; + + if (!param) + return -EINVAL; + + /* separate the trigger from the filter (s:e:n [if filter]) */ + trigger = strsep(¶m, " \t"); + if (!trigger) + return -EINVAL; + + system = strsep(&trigger, ":"); + if (!trigger) + return -EINVAL; + + event = strsep(&trigger, ":"); + + ret = -EINVAL; + event_enable_file = find_event_file(tr, system, event); + if (!event_enable_file) + goto out; + + enable = strcmp(cmd, ENABLE_EVENT_STR) == 0; + + trigger_ops = cmd_ops->get_trigger_ops(cmd, trigger); + + ret = -ENOMEM; + trigger_data = kzalloc(sizeof(*trigger_data), GFP_KERNEL); + if (!trigger_data) + goto out; + + enable_data = kzalloc(sizeof(*enable_data), GFP_KERNEL); + if (!enable_data) { + kfree(trigger_data); + goto out; + } + + trigger_data->count = -1; + trigger_data->ops = trigger_ops; + trigger_data->cmd_ops = cmd_ops; + INIT_LIST_HEAD(&trigger_data->list); + RCU_INIT_POINTER(trigger_data->filter, NULL); + + enable_data->enable = enable; + enable_data->file = event_enable_file; + trigger_data->private_data = enable_data; + + if (glob[0] == '!') { + cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file); + kfree(trigger_data); + kfree(enable_data); + ret = 0; + goto out; + } + + if (trigger) { + number = strsep(&trigger, ":"); + + ret = -EINVAL; + if (!strlen(number)) + goto out_free; + + /* + * We use the callback data field (which is a pointer) + * as our counter. + */ + ret = kstrtoul(number, 0, &trigger_data->count); + if (ret) + goto out_free; + } + + if (!param) /* if param is non-empty, it's supposed to be a filter */ + goto out_reg; + + if (!cmd_ops->set_filter) + goto out_reg; + + ret = cmd_ops->set_filter(param, trigger_data, file); + if (ret < 0) + goto out_free; + + out_reg: + /* Don't let event modules unload while probe registered */ + ret = try_module_get(event_enable_file->event_call->mod); + if (!ret) { + ret = -EBUSY; + goto out_free; + } + + ret = trace_event_enable_disable(event_enable_file, 1, 1); + if (ret < 0) + goto out_put; + ret = cmd_ops->reg(glob, trigger_ops, trigger_data, file); + /* + * The above returns on success the # of functions enabled, + * but if it didn't find any functions it returns zero. + * Consider no functions a failure too. + */ + if (!ret) { + ret = -ENOENT; + goto out_disable; + } else if (ret < 0) + goto out_disable; + /* Just return zero, not the number of enabled functions */ + ret = 0; + out: + return ret; + + out_disable: + trace_event_enable_disable(event_enable_file, 0, 1); + out_put: + module_put(event_enable_file->event_call->mod); + out_free: + if (cmd_ops->set_filter) + cmd_ops->set_filter(NULL, trigger_data, NULL); + kfree(trigger_data); + kfree(enable_data); + goto out; +} + +static int event_enable_register_trigger(char *glob, + struct event_trigger_ops *ops, + struct event_trigger_data *data, + struct ftrace_event_file *file) +{ + struct enable_trigger_data *enable_data = data->private_data; + struct enable_trigger_data *test_enable_data; + struct event_trigger_data *test; + int ret = 0; + + list_for_each_entry_rcu(test, &file->triggers, list) { + test_enable_data = test->private_data; + if (test_enable_data && + (test_enable_data->file == enable_data->file)) { + ret = -EEXIST; + goto out; + } + } + + if (data->ops->init) { + ret = data->ops->init(data->ops, data); + if (ret < 0) + goto out; + } + + list_add_rcu(&data->list, &file->triggers); + ret++; + + if (trace_event_trigger_enable_disable(file, 1) < 0) { + list_del_rcu(&data->list); + ret--; + } + update_cond_flag(file); +out: + return ret; +} + +static void event_enable_unregister_trigger(char *glob, + struct event_trigger_ops *ops, + struct event_trigger_data *test, + struct ftrace_event_file *file) +{ + struct enable_trigger_data *test_enable_data = test->private_data; + struct enable_trigger_data *enable_data; + struct event_trigger_data *data; + bool unregistered = false; + + list_for_each_entry_rcu(data, &file->triggers, list) { + enable_data = data->private_data; + if (enable_data && + (enable_data->file == test_enable_data->file)) { + unregistered = true; + list_del_rcu(&data->list); + update_cond_flag(file); + trace_event_trigger_enable_disable(file, 0); + break; + } + } + + if (unregistered && data->ops->free) + data->ops->free(data->ops, data); +} + +static struct event_trigger_ops * +event_enable_get_trigger_ops(char *cmd, char *param) +{ + struct event_trigger_ops *ops; + bool enable; + + enable = strcmp(cmd, ENABLE_EVENT_STR) == 0; + + if (enable) + ops = param ? &event_enable_count_trigger_ops : + &event_enable_trigger_ops; + else + ops = param ? &event_disable_count_trigger_ops : + &event_disable_trigger_ops; + + return ops; +} + +static struct event_command trigger_enable_cmd = { + .name = ENABLE_EVENT_STR, + .trigger_type = ETT_EVENT_ENABLE, + .func = event_enable_trigger_func, + .reg = event_enable_register_trigger, + .unreg = event_enable_unregister_trigger, + .get_trigger_ops = event_enable_get_trigger_ops, + .set_filter = set_trigger_filter, +}; + +static struct event_command trigger_disable_cmd = { + .name = DISABLE_EVENT_STR, + .trigger_type = ETT_EVENT_ENABLE, + .func = event_enable_trigger_func, + .reg = event_enable_register_trigger, + .unreg = event_enable_unregister_trigger, + .get_trigger_ops = event_enable_get_trigger_ops, + .set_filter = set_trigger_filter, +}; + +static __init void unregister_trigger_enable_disable_cmds(void) +{ + unregister_event_command(&trigger_enable_cmd); + unregister_event_command(&trigger_disable_cmd); +} + +static __init int register_trigger_enable_disable_cmds(void) +{ + int ret; + + ret = register_event_command(&trigger_enable_cmd); + if (WARN_ON(ret < 0)) + return ret; + ret = register_event_command(&trigger_disable_cmd); + if (WARN_ON(ret < 0)) + unregister_trigger_enable_disable_cmds(); + + return ret; +} + +static __init int register_trigger_traceon_traceoff_cmds(void) +{ + int ret; + + ret = register_event_command(&trigger_traceon_cmd); + if (WARN_ON(ret < 0)) + return ret; + ret = register_event_command(&trigger_traceoff_cmd); + if (WARN_ON(ret < 0)) + unregister_trigger_traceon_traceoff_cmds(); + + return ret; +} + +__init int register_trigger_cmds(void) +{ + register_trigger_traceon_traceoff_cmds(); + register_trigger_snapshot_cmd(); + register_trigger_stacktrace_cmd(); + register_trigger_enable_disable_cmds(); + + return 0; +} diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index dae9541..bdbae45 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -27,18 +27,12 @@ /** * Kprobe event core functions */ -struct trace_probe { +struct trace_kprobe { struct list_head list; struct kretprobe rp; /* Use rp.kp for kprobe use */ unsigned long nhit; - unsigned int flags; /* For TP_FLAG_* */ const char *symbol; /* symbol name */ - struct ftrace_event_class class; - struct ftrace_event_call call; - struct list_head files; - ssize_t size; /* trace entry size */ - unsigned int nr_args; - struct probe_arg args[]; + struct trace_probe tp; }; struct event_file_link { @@ -46,56 +40,46 @@ struct event_file_link { struct list_head list; }; -#define SIZEOF_TRACE_PROBE(n) \ - (offsetof(struct trace_probe, args) + \ +#define SIZEOF_TRACE_KPROBE(n) \ + (offsetof(struct trace_kprobe, tp.args) + \ (sizeof(struct probe_arg) * (n))) -static __kprobes bool trace_probe_is_return(struct trace_probe *tp) +static __kprobes bool trace_kprobe_is_return(struct trace_kprobe *tk) { - return tp->rp.handler != NULL; + return tk->rp.handler != NULL; } -static __kprobes const char *trace_probe_symbol(struct trace_probe *tp) +static __kprobes const char *trace_kprobe_symbol(struct trace_kprobe *tk) { - return tp->symbol ? tp->symbol : "unknown"; + return tk->symbol ? tk->symbol : "unknown"; } -static __kprobes unsigned long trace_probe_offset(struct trace_probe *tp) +static __kprobes unsigned long trace_kprobe_offset(struct trace_kprobe *tk) { - return tp->rp.kp.offset; + return tk->rp.kp.offset; } -static __kprobes bool trace_probe_is_enabled(struct trace_probe *tp) +static __kprobes bool trace_kprobe_has_gone(struct trace_kprobe *tk) { - return !!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE)); + return !!(kprobe_gone(&tk->rp.kp)); } -static __kprobes bool trace_probe_is_registered(struct trace_probe *tp) -{ - return !!(tp->flags & TP_FLAG_REGISTERED); -} - -static __kprobes bool trace_probe_has_gone(struct trace_probe *tp) -{ - return !!(kprobe_gone(&tp->rp.kp)); -} - -static __kprobes bool trace_probe_within_module(struct trace_probe *tp, - struct module *mod) +static __kprobes bool trace_kprobe_within_module(struct trace_kprobe *tk, + struct module *mod) { int len = strlen(mod->name); - const char *name = trace_probe_symbol(tp); + const char *name = trace_kprobe_symbol(tk); return strncmp(mod->name, name, len) == 0 && name[len] == ':'; } -static __kprobes bool trace_probe_is_on_module(struct trace_probe *tp) +static __kprobes bool trace_kprobe_is_on_module(struct trace_kprobe *tk) { - return !!strchr(trace_probe_symbol(tp), ':'); + return !!strchr(trace_kprobe_symbol(tk), ':'); } -static int register_probe_event(struct trace_probe *tp); -static int unregister_probe_event(struct trace_probe *tp); +static int register_kprobe_event(struct trace_kprobe *tk); +static int unregister_kprobe_event(struct trace_kprobe *tk); static DEFINE_MUTEX(probe_lock); static LIST_HEAD(probe_list); @@ -104,45 +88,224 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs); static int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs); +/* Memory fetching by symbol */ +struct symbol_cache { + char *symbol; + long offset; + unsigned long addr; +}; + +unsigned long update_symbol_cache(struct symbol_cache *sc) +{ + sc->addr = (unsigned long)kallsyms_lookup_name(sc->symbol); + + if (sc->addr) + sc->addr += sc->offset; + + return sc->addr; +} + +void free_symbol_cache(struct symbol_cache *sc) +{ + kfree(sc->symbol); + kfree(sc); +} + +struct symbol_cache *alloc_symbol_cache(const char *sym, long offset) +{ + struct symbol_cache *sc; + + if (!sym || strlen(sym) == 0) + return NULL; + + sc = kzalloc(sizeof(struct symbol_cache), GFP_KERNEL); + if (!sc) + return NULL; + + sc->symbol = kstrdup(sym, GFP_KERNEL); + if (!sc->symbol) { + kfree(sc); + return NULL; + } + sc->offset = offset; + update_symbol_cache(sc); + + return sc; +} + +/* + * Kprobes-specific fetch functions + */ +#define DEFINE_FETCH_stack(type) \ +static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\ + void *offset, void *dest) \ +{ \ + *(type *)dest = (type)regs_get_kernel_stack_nth(regs, \ + (unsigned int)((unsigned long)offset)); \ +} +DEFINE_BASIC_FETCH_FUNCS(stack) +/* No string on the stack entry */ +#define fetch_stack_string NULL +#define fetch_stack_string_size NULL + +#define DEFINE_FETCH_memory(type) \ +static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\ + void *addr, void *dest) \ +{ \ + type retval; \ + if (probe_kernel_address(addr, retval)) \ + *(type *)dest = 0; \ + else \ + *(type *)dest = retval; \ +} +DEFINE_BASIC_FETCH_FUNCS(memory) +/* + * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max + * length and relative data location. + */ +static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, + void *addr, void *dest) +{ + long ret; + int maxlen = get_rloc_len(*(u32 *)dest); + u8 *dst = get_rloc_data(dest); + u8 *src = addr; + mm_segment_t old_fs = get_fs(); + + if (!maxlen) + return; + + /* + * Try to get string again, since the string can be changed while + * probing. + */ + set_fs(KERNEL_DS); + pagefault_disable(); + + do + ret = __copy_from_user_inatomic(dst++, src++, 1); + while (dst[-1] && ret == 0 && src - (u8 *)addr < maxlen); + + dst[-1] = '\0'; + pagefault_enable(); + set_fs(old_fs); + + if (ret < 0) { /* Failed to fetch string */ + ((u8 *)get_rloc_data(dest))[0] = '\0'; + *(u32 *)dest = make_data_rloc(0, get_rloc_offs(*(u32 *)dest)); + } else { + *(u32 *)dest = make_data_rloc(src - (u8 *)addr, + get_rloc_offs(*(u32 *)dest)); + } +} + +/* Return the length of string -- including null terminal byte */ +static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs, + void *addr, void *dest) +{ + mm_segment_t old_fs; + int ret, len = 0; + u8 c; + + old_fs = get_fs(); + set_fs(KERNEL_DS); + pagefault_disable(); + + do { + ret = __copy_from_user_inatomic(&c, (u8 *)addr + len, 1); + len++; + } while (c && ret == 0 && len < MAX_STRING_SIZE); + + pagefault_enable(); + set_fs(old_fs); + + if (ret < 0) /* Failed to check the length */ + *(u32 *)dest = 0; + else + *(u32 *)dest = len; +} + +#define DEFINE_FETCH_symbol(type) \ +__kprobes void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs, \ + void *data, void *dest) \ +{ \ + struct symbol_cache *sc = data; \ + if (sc->addr) \ + fetch_memory_##type(regs, (void *)sc->addr, dest); \ + else \ + *(type *)dest = 0; \ +} +DEFINE_BASIC_FETCH_FUNCS(symbol) +DEFINE_FETCH_symbol(string) +DEFINE_FETCH_symbol(string_size) + +/* kprobes don't support file_offset fetch methods */ +#define fetch_file_offset_u8 NULL +#define fetch_file_offset_u16 NULL +#define fetch_file_offset_u32 NULL +#define fetch_file_offset_u64 NULL +#define fetch_file_offset_string NULL +#define fetch_file_offset_string_size NULL + +/* Fetch type information table */ +const struct fetch_type kprobes_fetch_type_table[] = { + /* Special types */ + [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string, + sizeof(u32), 1, "__data_loc char[]"), + [FETCH_TYPE_STRSIZE] = __ASSIGN_FETCH_TYPE("string_size", u32, + string_size, sizeof(u32), 0, "u32"), + /* Basic types */ + ASSIGN_FETCH_TYPE(u8, u8, 0), + ASSIGN_FETCH_TYPE(u16, u16, 0), + ASSIGN_FETCH_TYPE(u32, u32, 0), + ASSIGN_FETCH_TYPE(u64, u64, 0), + ASSIGN_FETCH_TYPE(s8, u8, 1), + ASSIGN_FETCH_TYPE(s16, u16, 1), + ASSIGN_FETCH_TYPE(s32, u32, 1), + ASSIGN_FETCH_TYPE(s64, u64, 1), + + ASSIGN_FETCH_TYPE_END +}; + /* * Allocate new trace_probe and initialize it (including kprobes). */ -static struct trace_probe *alloc_trace_probe(const char *group, +static struct trace_kprobe *alloc_trace_kprobe(const char *group, const char *event, void *addr, const char *symbol, unsigned long offs, int nargs, bool is_return) { - struct trace_probe *tp; + struct trace_kprobe *tk; int ret = -ENOMEM; - tp = kzalloc(SIZEOF_TRACE_PROBE(nargs), GFP_KERNEL); - if (!tp) + tk = kzalloc(SIZEOF_TRACE_KPROBE(nargs), GFP_KERNEL); + if (!tk) return ERR_PTR(ret); if (symbol) { - tp->symbol = kstrdup(symbol, GFP_KERNEL); - if (!tp->symbol) + tk->symbol = kstrdup(symbol, GFP_KERNEL); + if (!tk->symbol) goto error; - tp->rp.kp.symbol_name = tp->symbol; - tp->rp.kp.offset = offs; + tk->rp.kp.symbol_name = tk->symbol; + tk->rp.kp.offset = offs; } else - tp->rp.kp.addr = addr; + tk->rp.kp.addr = addr; if (is_return) - tp->rp.handler = kretprobe_dispatcher; + tk->rp.handler = kretprobe_dispatcher; else - tp->rp.kp.pre_handler = kprobe_dispatcher; + tk->rp.kp.pre_handler = kprobe_dispatcher; if (!event || !is_good_name(event)) { ret = -EINVAL; goto error; } - tp->call.class = &tp->class; - tp->call.name = kstrdup(event, GFP_KERNEL); - if (!tp->call.name) + tk->tp.call.class = &tk->tp.class; + tk->tp.call.name = kstrdup(event, GFP_KERNEL); + if (!tk->tp.call.name) goto error; if (!group || !is_good_name(group)) { @@ -150,42 +313,42 @@ static struct trace_probe *alloc_trace_probe(const char *group, goto error; } - tp->class.system = kstrdup(group, GFP_KERNEL); - if (!tp->class.system) + tk->tp.class.system = kstrdup(group, GFP_KERNEL); + if (!tk->tp.class.system) goto error; - INIT_LIST_HEAD(&tp->list); - INIT_LIST_HEAD(&tp->files); - return tp; + INIT_LIST_HEAD(&tk->list); + INIT_LIST_HEAD(&tk->tp.files); + return tk; error: - kfree(tp->call.name); - kfree(tp->symbol); - kfree(tp); + kfree(tk->tp.call.name); + kfree(tk->symbol); + kfree(tk); return ERR_PTR(ret); } -static void free_trace_probe(struct trace_probe *tp) +static void free_trace_kprobe(struct trace_kprobe *tk) { int i; - for (i = 0; i < tp->nr_args; i++) - traceprobe_free_probe_arg(&tp->args[i]); + for (i = 0; i < tk->tp.nr_args; i++) + traceprobe_free_probe_arg(&tk->tp.args[i]); - kfree(tp->call.class->system); - kfree(tp->call.name); - kfree(tp->symbol); - kfree(tp); + kfree(tk->tp.call.class->system); + kfree(tk->tp.call.name); + kfree(tk->symbol); + kfree(tk); } -static struct trace_probe *find_trace_probe(const char *event, - const char *group) +static struct trace_kprobe *find_trace_kprobe(const char *event, + const char *group) { - struct trace_probe *tp; + struct trace_kprobe *tk; - list_for_each_entry(tp, &probe_list, list) - if (strcmp(tp->call.name, event) == 0 && - strcmp(tp->call.class->system, group) == 0) - return tp; + list_for_each_entry(tk, &probe_list, list) + if (strcmp(tk->tp.call.name, event) == 0 && + strcmp(tk->tp.call.class->system, group) == 0) + return tk; return NULL; } @@ -194,7 +357,7 @@ static struct trace_probe *find_trace_probe(const char *event, * if the file is NULL, enable "perf" handler, or enable "trace" handler. */ static int -enable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file) +enable_trace_kprobe(struct trace_kprobe *tk, struct ftrace_event_file *file) { int ret = 0; @@ -208,17 +371,17 @@ enable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file) } link->file = file; - list_add_tail_rcu(&link->list, &tp->files); + list_add_tail_rcu(&link->list, &tk->tp.files); - tp->flags |= TP_FLAG_TRACE; + tk->tp.flags |= TP_FLAG_TRACE; } else - tp->flags |= TP_FLAG_PROFILE; + tk->tp.flags |= TP_FLAG_PROFILE; - if (trace_probe_is_registered(tp) && !trace_probe_has_gone(tp)) { - if (trace_probe_is_return(tp)) - ret = enable_kretprobe(&tp->rp); + if (trace_probe_is_registered(&tk->tp) && !trace_kprobe_has_gone(tk)) { + if (trace_kprobe_is_return(tk)) + ret = enable_kretprobe(&tk->rp); else - ret = enable_kprobe(&tp->rp.kp); + ret = enable_kprobe(&tk->rp.kp); } out: return ret; @@ -241,14 +404,14 @@ find_event_file_link(struct trace_probe *tp, struct ftrace_event_file *file) * if the file is NULL, disable "perf" handler, or disable "trace" handler. */ static int -disable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file) +disable_trace_kprobe(struct trace_kprobe *tk, struct ftrace_event_file *file) { struct event_file_link *link = NULL; int wait = 0; int ret = 0; if (file) { - link = find_event_file_link(tp, file); + link = find_event_file_link(&tk->tp, file); if (!link) { ret = -EINVAL; goto out; @@ -256,18 +419,18 @@ disable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file) list_del_rcu(&link->list); wait = 1; - if (!list_empty(&tp->files)) + if (!list_empty(&tk->tp.files)) goto out; - tp->flags &= ~TP_FLAG_TRACE; + tk->tp.flags &= ~TP_FLAG_TRACE; } else - tp->flags &= ~TP_FLAG_PROFILE; + tk->tp.flags &= ~TP_FLAG_PROFILE; - if (!trace_probe_is_enabled(tp) && trace_probe_is_registered(tp)) { - if (trace_probe_is_return(tp)) - disable_kretprobe(&tp->rp); + if (!trace_probe_is_enabled(&tk->tp) && trace_probe_is_registered(&tk->tp)) { + if (trace_kprobe_is_return(tk)) + disable_kretprobe(&tk->rp); else - disable_kprobe(&tp->rp.kp); + disable_kprobe(&tk->rp.kp); wait = 1; } out: @@ -288,40 +451,40 @@ disable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file) } /* Internal register function - just handle k*probes and flags */ -static int __register_trace_probe(struct trace_probe *tp) +static int __register_trace_kprobe(struct trace_kprobe *tk) { int i, ret; - if (trace_probe_is_registered(tp)) + if (trace_probe_is_registered(&tk->tp)) return -EINVAL; - for (i = 0; i < tp->nr_args; i++) - traceprobe_update_arg(&tp->args[i]); + for (i = 0; i < tk->tp.nr_args; i++) + traceprobe_update_arg(&tk->tp.args[i]); /* Set/clear disabled flag according to tp->flag */ - if (trace_probe_is_enabled(tp)) - tp->rp.kp.flags &= ~KPROBE_FLAG_DISABLED; + if (trace_probe_is_enabled(&tk->tp)) + tk->rp.kp.flags &= ~KPROBE_FLAG_DISABLED; else - tp->rp.kp.flags |= KPROBE_FLAG_DISABLED; + tk->rp.kp.flags |= KPROBE_FLAG_DISABLED; - if (trace_probe_is_return(tp)) - ret = register_kretprobe(&tp->rp); + if (trace_kprobe_is_return(tk)) + ret = register_kretprobe(&tk->rp); else - ret = register_kprobe(&tp->rp.kp); + ret = register_kprobe(&tk->rp.kp); if (ret == 0) - tp->flags |= TP_FLAG_REGISTERED; + tk->tp.flags |= TP_FLAG_REGISTERED; else { pr_warning("Could not insert probe at %s+%lu: %d\n", - trace_probe_symbol(tp), trace_probe_offset(tp), ret); - if (ret == -ENOENT && trace_probe_is_on_module(tp)) { + trace_kprobe_symbol(tk), trace_kprobe_offset(tk), ret); + if (ret == -ENOENT && trace_kprobe_is_on_module(tk)) { pr_warning("This probe might be able to register after" "target module is loaded. Continue.\n"); ret = 0; } else if (ret == -EILSEQ) { pr_warning("Probing address(0x%p) is not an " "instruction boundary.\n", - tp->rp.kp.addr); + tk->rp.kp.addr); ret = -EINVAL; } } @@ -330,67 +493,67 @@ static int __register_trace_probe(struct trace_probe *tp) } /* Internal unregister function - just handle k*probes and flags */ -static void __unregister_trace_probe(struct trace_probe *tp) +static void __unregister_trace_kprobe(struct trace_kprobe *tk) { - if (trace_probe_is_registered(tp)) { - if (trace_probe_is_return(tp)) - unregister_kretprobe(&tp->rp); + if (trace_probe_is_registered(&tk->tp)) { + if (trace_kprobe_is_return(tk)) + unregister_kretprobe(&tk->rp); else - unregister_kprobe(&tp->rp.kp); - tp->flags &= ~TP_FLAG_REGISTERED; + unregister_kprobe(&tk->rp.kp); + tk->tp.flags &= ~TP_FLAG_REGISTERED; /* Cleanup kprobe for reuse */ - if (tp->rp.kp.symbol_name) - tp->rp.kp.addr = NULL; + if (tk->rp.kp.symbol_name) + tk->rp.kp.addr = NULL; } } /* Unregister a trace_probe and probe_event: call with locking probe_lock */ -static int unregister_trace_probe(struct trace_probe *tp) +static int unregister_trace_kprobe(struct trace_kprobe *tk) { /* Enabled event can not be unregistered */ - if (trace_probe_is_enabled(tp)) + if (trace_probe_is_enabled(&tk->tp)) return -EBUSY; /* Will fail if probe is being used by ftrace or perf */ - if (unregister_probe_event(tp)) + if (unregister_kprobe_event(tk)) return -EBUSY; - __unregister_trace_probe(tp); - list_del(&tp->list); + __unregister_trace_kprobe(tk); + list_del(&tk->list); return 0; } /* Register a trace_probe and probe_event */ -static int register_trace_probe(struct trace_probe *tp) +static int register_trace_kprobe(struct trace_kprobe *tk) { - struct trace_probe *old_tp; + struct trace_kprobe *old_tk; int ret; mutex_lock(&probe_lock); /* Delete old (same name) event if exist */ - old_tp = find_trace_probe(tp->call.name, tp->call.class->system); - if (old_tp) { - ret = unregister_trace_probe(old_tp); + old_tk = find_trace_kprobe(tk->tp.call.name, tk->tp.call.class->system); + if (old_tk) { + ret = unregister_trace_kprobe(old_tk); if (ret < 0) goto end; - free_trace_probe(old_tp); + free_trace_kprobe(old_tk); } /* Register new event */ - ret = register_probe_event(tp); + ret = register_kprobe_event(tk); if (ret) { pr_warning("Failed to register probe event(%d)\n", ret); goto end; } /* Register k*probe */ - ret = __register_trace_probe(tp); + ret = __register_trace_kprobe(tk); if (ret < 0) - unregister_probe_event(tp); + unregister_kprobe_event(tk); else - list_add_tail(&tp->list, &probe_list); + list_add_tail(&tk->list, &probe_list); end: mutex_unlock(&probe_lock); @@ -398,11 +561,11 @@ end: } /* Module notifier call back, checking event on the module */ -static int trace_probe_module_callback(struct notifier_block *nb, +static int trace_kprobe_module_callback(struct notifier_block *nb, unsigned long val, void *data) { struct module *mod = data; - struct trace_probe *tp; + struct trace_kprobe *tk; int ret; if (val != MODULE_STATE_COMING) @@ -410,15 +573,15 @@ static int trace_probe_module_callback(struct notifier_block *nb, /* Update probes on coming module */ mutex_lock(&probe_lock); - list_for_each_entry(tp, &probe_list, list) { - if (trace_probe_within_module(tp, mod)) { + list_for_each_entry(tk, &probe_list, list) { + if (trace_kprobe_within_module(tk, mod)) { /* Don't need to check busy - this should have gone. */ - __unregister_trace_probe(tp); - ret = __register_trace_probe(tp); + __unregister_trace_kprobe(tk); + ret = __register_trace_kprobe(tk); if (ret) pr_warning("Failed to re-register probe %s on" "%s: %d\n", - tp->call.name, mod->name, ret); + tk->tp.call.name, mod->name, ret); } } mutex_unlock(&probe_lock); @@ -426,12 +589,12 @@ static int trace_probe_module_callback(struct notifier_block *nb, return NOTIFY_DONE; } -static struct notifier_block trace_probe_module_nb = { - .notifier_call = trace_probe_module_callback, +static struct notifier_block trace_kprobe_module_nb = { + .notifier_call = trace_kprobe_module_callback, .priority = 1 /* Invoked after kprobe module callback */ }; -static int create_trace_probe(int argc, char **argv) +static int create_trace_kprobe(int argc, char **argv) { /* * Argument syntax: @@ -451,7 +614,7 @@ static int create_trace_probe(int argc, char **argv) * Type of args: * FETCHARG:TYPE : use TYPE instead of unsigned long. */ - struct trace_probe *tp; + struct trace_kprobe *tk; int i, ret = 0; bool is_return = false, is_delete = false; char *symbol = NULL, *event = NULL, *group = NULL; @@ -498,16 +661,16 @@ static int create_trace_probe(int argc, char **argv) return -EINVAL; } mutex_lock(&probe_lock); - tp = find_trace_probe(event, group); - if (!tp) { + tk = find_trace_kprobe(event, group); + if (!tk) { mutex_unlock(&probe_lock); pr_info("Event %s/%s doesn't exist.\n", group, event); return -ENOENT; } /* delete an event */ - ret = unregister_trace_probe(tp); + ret = unregister_trace_kprobe(tk); if (ret == 0) - free_trace_probe(tp); + free_trace_kprobe(tk); mutex_unlock(&probe_lock); return ret; } @@ -554,47 +717,49 @@ static int create_trace_probe(int argc, char **argv) is_return ? 'r' : 'p', addr); event = buf; } - tp = alloc_trace_probe(group, event, addr, symbol, offset, argc, + tk = alloc_trace_kprobe(group, event, addr, symbol, offset, argc, is_return); - if (IS_ERR(tp)) { + if (IS_ERR(tk)) { pr_info("Failed to allocate trace_probe.(%d)\n", - (int)PTR_ERR(tp)); - return PTR_ERR(tp); + (int)PTR_ERR(tk)); + return PTR_ERR(tk); } /* parse arguments */ ret = 0; for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { + struct probe_arg *parg = &tk->tp.args[i]; + /* Increment count for freeing args in error case */ - tp->nr_args++; + tk->tp.nr_args++; /* Parse argument name */ arg = strchr(argv[i], '='); if (arg) { *arg++ = '\0'; - tp->args[i].name = kstrdup(argv[i], GFP_KERNEL); + parg->name = kstrdup(argv[i], GFP_KERNEL); } else { arg = argv[i]; /* If argument name is omitted, set "argN" */ snprintf(buf, MAX_EVENT_NAME_LEN, "arg%d", i + 1); - tp->args[i].name = kstrdup(buf, GFP_KERNEL); + parg->name = kstrdup(buf, GFP_KERNEL); } - if (!tp->args[i].name) { + if (!parg->name) { pr_info("Failed to allocate argument[%d] name.\n", i); ret = -ENOMEM; goto error; } - if (!is_good_name(tp->args[i].name)) { + if (!is_good_name(parg->name)) { pr_info("Invalid argument[%d] name: %s\n", - i, tp->args[i].name); + i, parg->name); ret = -EINVAL; goto error; } - if (traceprobe_conflict_field_name(tp->args[i].name, - tp->args, i)) { + if (traceprobe_conflict_field_name(parg->name, + tk->tp.args, i)) { pr_info("Argument[%d] name '%s' conflicts with " "another field.\n", i, argv[i]); ret = -EINVAL; @@ -602,7 +767,7 @@ static int create_trace_probe(int argc, char **argv) } /* Parse fetch argument */ - ret = traceprobe_parse_probe_arg(arg, &tp->size, &tp->args[i], + ret = traceprobe_parse_probe_arg(arg, &tk->tp.size, parg, is_return, true); if (ret) { pr_info("Parse error at argument[%d]. (%d)\n", i, ret); @@ -610,35 +775,35 @@ static int create_trace_probe(int argc, char **argv) } } - ret = register_trace_probe(tp); + ret = register_trace_kprobe(tk); if (ret) goto error; return 0; error: - free_trace_probe(tp); + free_trace_kprobe(tk); return ret; } -static int release_all_trace_probes(void) +static int release_all_trace_kprobes(void) { - struct trace_probe *tp; + struct trace_kprobe *tk; int ret = 0; mutex_lock(&probe_lock); /* Ensure no probe is in use. */ - list_for_each_entry(tp, &probe_list, list) - if (trace_probe_is_enabled(tp)) { + list_for_each_entry(tk, &probe_list, list) + if (trace_probe_is_enabled(&tk->tp)) { ret = -EBUSY; goto end; } /* TODO: Use batch unregistration */ while (!list_empty(&probe_list)) { - tp = list_entry(probe_list.next, struct trace_probe, list); - ret = unregister_trace_probe(tp); + tk = list_entry(probe_list.next, struct trace_kprobe, list); + ret = unregister_trace_kprobe(tk); if (ret) goto end; - free_trace_probe(tp); + free_trace_kprobe(tk); } end: @@ -666,22 +831,22 @@ static void probes_seq_stop(struct seq_file *m, void *v) static int probes_seq_show(struct seq_file *m, void *v) { - struct trace_probe *tp = v; + struct trace_kprobe *tk = v; int i; - seq_printf(m, "%c", trace_probe_is_return(tp) ? 'r' : 'p'); - seq_printf(m, ":%s/%s", tp->call.class->system, tp->call.name); + seq_printf(m, "%c", trace_kprobe_is_return(tk) ? 'r' : 'p'); + seq_printf(m, ":%s/%s", tk->tp.call.class->system, tk->tp.call.name); - if (!tp->symbol) - seq_printf(m, " 0x%p", tp->rp.kp.addr); - else if (tp->rp.kp.offset) - seq_printf(m, " %s+%u", trace_probe_symbol(tp), - tp->rp.kp.offset); + if (!tk->symbol) + seq_printf(m, " 0x%p", tk->rp.kp.addr); + else if (tk->rp.kp.offset) + seq_printf(m, " %s+%u", trace_kprobe_symbol(tk), + tk->rp.kp.offset); else - seq_printf(m, " %s", trace_probe_symbol(tp)); + seq_printf(m, " %s", trace_kprobe_symbol(tk)); - for (i = 0; i < tp->nr_args; i++) - seq_printf(m, " %s=%s", tp->args[i].name, tp->args[i].comm); + for (i = 0; i < tk->tp.nr_args; i++) + seq_printf(m, " %s=%s", tk->tp.args[i].name, tk->tp.args[i].comm); seq_printf(m, "\n"); return 0; @@ -699,7 +864,7 @@ static int probes_open(struct inode *inode, struct file *file) int ret; if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { - ret = release_all_trace_probes(); + ret = release_all_trace_kprobes(); if (ret < 0) return ret; } @@ -711,7 +876,7 @@ static ssize_t probes_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { return traceprobe_probes_write(file, buffer, count, ppos, - create_trace_probe); + create_trace_kprobe); } static const struct file_operations kprobe_events_ops = { @@ -726,10 +891,10 @@ static const struct file_operations kprobe_events_ops = { /* Probes profiling interfaces */ static int probes_profile_seq_show(struct seq_file *m, void *v) { - struct trace_probe *tp = v; + struct trace_kprobe *tk = v; - seq_printf(m, " %-44s %15lu %15lu\n", tp->call.name, tp->nhit, - tp->rp.kp.nmissed); + seq_printf(m, " %-44s %15lu %15lu\n", tk->tp.call.name, tk->nhit, + tk->rp.kp.nmissed); return 0; } @@ -754,57 +919,9 @@ static const struct file_operations kprobe_profile_ops = { .release = seq_release, }; -/* Sum up total data length for dynamic arraies (strings) */ -static __kprobes int __get_data_size(struct trace_probe *tp, - struct pt_regs *regs) -{ - int i, ret = 0; - u32 len; - - for (i = 0; i < tp->nr_args; i++) - if (unlikely(tp->args[i].fetch_size.fn)) { - call_fetch(&tp->args[i].fetch_size, regs, &len); - ret += len; - } - - return ret; -} - -/* Store the value of each argument */ -static __kprobes void store_trace_args(int ent_size, struct trace_probe *tp, - struct pt_regs *regs, - u8 *data, int maxlen) -{ - int i; - u32 end = tp->size; - u32 *dl; /* Data (relative) location */ - - for (i = 0; i < tp->nr_args; i++) { - if (unlikely(tp->args[i].fetch_size.fn)) { - /* - * First, we set the relative location and - * maximum data length to *dl - */ - dl = (u32 *)(data + tp->args[i].offset); - *dl = make_data_rloc(maxlen, end - tp->args[i].offset); - /* Then try to fetch string or dynamic array data */ - call_fetch(&tp->args[i].fetch, regs, dl); - /* Reduce maximum length */ - end += get_rloc_len(*dl); - maxlen -= get_rloc_len(*dl); - /* Trick here, convert data_rloc to data_loc */ - *dl = convert_rloc_to_loc(*dl, - ent_size + tp->args[i].offset); - } else - /* Just fetching data normally */ - call_fetch(&tp->args[i].fetch, regs, - data + tp->args[i].offset); - } -} - /* Kprobe handler */ static __kprobes void -__kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs, +__kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs, struct ftrace_event_file *ftrace_file) { struct kprobe_trace_entry_head *entry; @@ -812,18 +929,18 @@ __kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs, struct ring_buffer *buffer; int size, dsize, pc; unsigned long irq_flags; - struct ftrace_event_call *call = &tp->call; + struct ftrace_event_call *call = &tk->tp.call; WARN_ON(call != ftrace_file->event_call); - if (test_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &ftrace_file->flags)) + if (ftrace_trigger_soft_disabled(ftrace_file)) return; local_save_flags(irq_flags); pc = preempt_count(); - dsize = __get_data_size(tp, regs); - size = sizeof(*entry) + tp->size + dsize; + dsize = __get_data_size(&tk->tp, regs); + size = sizeof(*entry) + tk->tp.size + dsize; event = trace_event_buffer_lock_reserve(&buffer, ftrace_file, call->event.type, @@ -832,26 +949,25 @@ __kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs, return; entry = ring_buffer_event_data(event); - entry->ip = (unsigned long)tp->rp.kp.addr; - store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); + entry->ip = (unsigned long)tk->rp.kp.addr; + store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); - if (!filter_check_discard(ftrace_file, entry, buffer, event)) - trace_buffer_unlock_commit_regs(buffer, event, - irq_flags, pc, regs); + event_trigger_unlock_commit_regs(ftrace_file, buffer, event, + entry, irq_flags, pc, regs); } static __kprobes void -kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs) +kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs) { struct event_file_link *link; - list_for_each_entry_rcu(link, &tp->files, list) - __kprobe_trace_func(tp, regs, link->file); + list_for_each_entry_rcu(link, &tk->tp.files, list) + __kprobe_trace_func(tk, regs, link->file); } /* Kretprobe handler */ static __kprobes void -__kretprobe_trace_func(struct trace_probe *tp, struct kretprobe_instance *ri, +__kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, struct pt_regs *regs, struct ftrace_event_file *ftrace_file) { @@ -860,18 +976,18 @@ __kretprobe_trace_func(struct trace_probe *tp, struct kretprobe_instance *ri, struct ring_buffer *buffer; int size, pc, dsize; unsigned long irq_flags; - struct ftrace_event_call *call = &tp->call; + struct ftrace_event_call *call = &tk->tp.call; WARN_ON(call != ftrace_file->event_call); - if (test_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &ftrace_file->flags)) + if (ftrace_trigger_soft_disabled(ftrace_file)) return; local_save_flags(irq_flags); pc = preempt_count(); - dsize = __get_data_size(tp, regs); - size = sizeof(*entry) + tp->size + dsize; + dsize = __get_data_size(&tk->tp, regs); + size = sizeof(*entry) + tk->tp.size + dsize; event = trace_event_buffer_lock_reserve(&buffer, ftrace_file, call->event.type, @@ -880,23 +996,22 @@ __kretprobe_trace_func(struct trace_probe *tp, struct kretprobe_instance *ri, return; entry = ring_buffer_event_data(event); - entry->func = (unsigned long)tp->rp.kp.addr; + entry->func = (unsigned long)tk->rp.kp.addr; entry->ret_ip = (unsigned long)ri->ret_addr; - store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); + store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); - if (!filter_check_discard(ftrace_file, entry, buffer, event)) - trace_buffer_unlock_commit_regs(buffer, event, - irq_flags, pc, regs); + event_trigger_unlock_commit_regs(ftrace_file, buffer, event, + entry, irq_flags, pc, regs); } static __kprobes void -kretprobe_trace_func(struct trace_probe *tp, struct kretprobe_instance *ri, +kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, struct pt_regs *regs) { struct event_file_link *link; - list_for_each_entry_rcu(link, &tp->files, list) - __kretprobe_trace_func(tp, ri, regs, link->file); + list_for_each_entry_rcu(link, &tk->tp.files, list) + __kretprobe_trace_func(tk, ri, regs, link->file); } /* Event entry printers */ @@ -983,16 +1098,18 @@ static int kprobe_event_define_fields(struct ftrace_event_call *event_call) { int ret, i; struct kprobe_trace_entry_head field; - struct trace_probe *tp = (struct trace_probe *)event_call->data; + struct trace_kprobe *tk = (struct trace_kprobe *)event_call->data; DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0); /* Set argument names as fields */ - for (i = 0; i < tp->nr_args; i++) { - ret = trace_define_field(event_call, tp->args[i].type->fmttype, - tp->args[i].name, - sizeof(field) + tp->args[i].offset, - tp->args[i].type->size, - tp->args[i].type->is_signed, + for (i = 0; i < tk->tp.nr_args; i++) { + struct probe_arg *parg = &tk->tp.args[i]; + + ret = trace_define_field(event_call, parg->type->fmttype, + parg->name, + sizeof(field) + parg->offset, + parg->type->size, + parg->type->is_signed, FILTER_OTHER); if (ret) return ret; @@ -1004,17 +1121,19 @@ static int kretprobe_event_define_fields(struct ftrace_event_call *event_call) { int ret, i; struct kretprobe_trace_entry_head field; - struct trace_probe *tp = (struct trace_probe *)event_call->data; + struct trace_kprobe *tk = (struct trace_kprobe *)event_call->data; DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0); DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0); /* Set argument names as fields */ - for (i = 0; i < tp->nr_args; i++) { - ret = trace_define_field(event_call, tp->args[i].type->fmttype, - tp->args[i].name, - sizeof(field) + tp->args[i].offset, - tp->args[i].type->size, - tp->args[i].type->is_signed, + for (i = 0; i < tk->tp.nr_args; i++) { + struct probe_arg *parg = &tk->tp.args[i]; + + ret = trace_define_field(event_call, parg->type->fmttype, + parg->name, + sizeof(field) + parg->offset, + parg->type->size, + parg->type->is_signed, FILTER_OTHER); if (ret) return ret; @@ -1022,74 +1141,13 @@ static int kretprobe_event_define_fields(struct ftrace_event_call *event_call) return 0; } -static int __set_print_fmt(struct trace_probe *tp, char *buf, int len) -{ - int i; - int pos = 0; - - const char *fmt, *arg; - - if (!trace_probe_is_return(tp)) { - fmt = "(%lx)"; - arg = "REC->" FIELD_STRING_IP; - } else { - fmt = "(%lx <- %lx)"; - arg = "REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP; - } - - /* When len=0, we just calculate the needed length */ -#define LEN_OR_ZERO (len ? len - pos : 0) - - pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt); - - for (i = 0; i < tp->nr_args; i++) { - pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%s", - tp->args[i].name, tp->args[i].type->fmt); - } - - pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg); - - for (i = 0; i < tp->nr_args; i++) { - if (strcmp(tp->args[i].type->name, "string") == 0) - pos += snprintf(buf + pos, LEN_OR_ZERO, - ", __get_str(%s)", - tp->args[i].name); - else - pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s", - tp->args[i].name); - } - -#undef LEN_OR_ZERO - - /* return the length of print_fmt */ - return pos; -} - -static int set_print_fmt(struct trace_probe *tp) -{ - int len; - char *print_fmt; - - /* First: called with 0 length to calculate the needed length */ - len = __set_print_fmt(tp, NULL, 0); - print_fmt = kmalloc(len + 1, GFP_KERNEL); - if (!print_fmt) - return -ENOMEM; - - /* Second: actually write the @print_fmt */ - __set_print_fmt(tp, print_fmt, len + 1); - tp->call.print_fmt = print_fmt; - - return 0; -} - #ifdef CONFIG_PERF_EVENTS /* Kprobe profile handler */ static __kprobes void -kprobe_perf_func(struct trace_probe *tp, struct pt_regs *regs) +kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) { - struct ftrace_event_call *call = &tp->call; + struct ftrace_event_call *call = &tk->tp.call; struct kprobe_trace_entry_head *entry; struct hlist_head *head; int size, __size, dsize; @@ -1099,8 +1157,8 @@ kprobe_perf_func(struct trace_probe *tp, struct pt_regs *regs) if (hlist_empty(head)) return; - dsize = __get_data_size(tp, regs); - __size = sizeof(*entry) + tp->size + dsize; + dsize = __get_data_size(&tk->tp, regs); + __size = sizeof(*entry) + tk->tp.size + dsize; size = ALIGN(__size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); @@ -1108,18 +1166,18 @@ kprobe_perf_func(struct trace_probe *tp, struct pt_regs *regs) if (!entry) return; - entry->ip = (unsigned long)tp->rp.kp.addr; + entry->ip = (unsigned long)tk->rp.kp.addr; memset(&entry[1], 0, dsize); - store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); + store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL); } /* Kretprobe profile handler */ static __kprobes void -kretprobe_perf_func(struct trace_probe *tp, struct kretprobe_instance *ri, +kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, struct pt_regs *regs) { - struct ftrace_event_call *call = &tp->call; + struct ftrace_event_call *call = &tk->tp.call; struct kretprobe_trace_entry_head *entry; struct hlist_head *head; int size, __size, dsize; @@ -1129,8 +1187,8 @@ kretprobe_perf_func(struct trace_probe *tp, struct kretprobe_instance *ri, if (hlist_empty(head)) return; - dsize = __get_data_size(tp, regs); - __size = sizeof(*entry) + tp->size + dsize; + dsize = __get_data_size(&tk->tp, regs); + __size = sizeof(*entry) + tk->tp.size + dsize; size = ALIGN(__size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); @@ -1138,9 +1196,9 @@ kretprobe_perf_func(struct trace_probe *tp, struct kretprobe_instance *ri, if (!entry) return; - entry->func = (unsigned long)tp->rp.kp.addr; + entry->func = (unsigned long)tk->rp.kp.addr; entry->ret_ip = (unsigned long)ri->ret_addr; - store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); + store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL); } #endif /* CONFIG_PERF_EVENTS */ @@ -1155,20 +1213,20 @@ static __kprobes int kprobe_register(struct ftrace_event_call *event, enum trace_reg type, void *data) { - struct trace_probe *tp = (struct trace_probe *)event->data; + struct trace_kprobe *tk = (struct trace_kprobe *)event->data; struct ftrace_event_file *file = data; switch (type) { case TRACE_REG_REGISTER: - return enable_trace_probe(tp, file); + return enable_trace_kprobe(tk, file); case TRACE_REG_UNREGISTER: - return disable_trace_probe(tp, file); + return disable_trace_kprobe(tk, file); #ifdef CONFIG_PERF_EVENTS case TRACE_REG_PERF_REGISTER: - return enable_trace_probe(tp, NULL); + return enable_trace_kprobe(tk, NULL); case TRACE_REG_PERF_UNREGISTER: - return disable_trace_probe(tp, NULL); + return disable_trace_kprobe(tk, NULL); case TRACE_REG_PERF_OPEN: case TRACE_REG_PERF_CLOSE: case TRACE_REG_PERF_ADD: @@ -1182,15 +1240,15 @@ int kprobe_register(struct ftrace_event_call *event, static __kprobes int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) { - struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); + struct trace_kprobe *tk = container_of(kp, struct trace_kprobe, rp.kp); - tp->nhit++; + tk->nhit++; - if (tp->flags & TP_FLAG_TRACE) - kprobe_trace_func(tp, regs); + if (tk->tp.flags & TP_FLAG_TRACE) + kprobe_trace_func(tk, regs); #ifdef CONFIG_PERF_EVENTS - if (tp->flags & TP_FLAG_PROFILE) - kprobe_perf_func(tp, regs); + if (tk->tp.flags & TP_FLAG_PROFILE) + kprobe_perf_func(tk, regs); #endif return 0; /* We don't tweek kernel, so just return 0 */ } @@ -1198,15 +1256,15 @@ int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) static __kprobes int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs) { - struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); + struct trace_kprobe *tk = container_of(ri->rp, struct trace_kprobe, rp); - tp->nhit++; + tk->nhit++; - if (tp->flags & TP_FLAG_TRACE) - kretprobe_trace_func(tp, ri, regs); + if (tk->tp.flags & TP_FLAG_TRACE) + kretprobe_trace_func(tk, ri, regs); #ifdef CONFIG_PERF_EVENTS - if (tp->flags & TP_FLAG_PROFILE) - kretprobe_perf_func(tp, ri, regs); + if (tk->tp.flags & TP_FLAG_PROFILE) + kretprobe_perf_func(tk, ri, regs); #endif return 0; /* We don't tweek kernel, so just return 0 */ } @@ -1219,21 +1277,21 @@ static struct trace_event_functions kprobe_funcs = { .trace = print_kprobe_event }; -static int register_probe_event(struct trace_probe *tp) +static int register_kprobe_event(struct trace_kprobe *tk) { - struct ftrace_event_call *call = &tp->call; + struct ftrace_event_call *call = &tk->tp.call; int ret; /* Initialize ftrace_event_call */ INIT_LIST_HEAD(&call->class->fields); - if (trace_probe_is_return(tp)) { + if (trace_kprobe_is_return(tk)) { call->event.funcs = &kretprobe_funcs; call->class->define_fields = kretprobe_event_define_fields; } else { call->event.funcs = &kprobe_funcs; call->class->define_fields = kprobe_event_define_fields; } - if (set_print_fmt(tp) < 0) + if (set_print_fmt(&tk->tp, trace_kprobe_is_return(tk)) < 0) return -ENOMEM; ret = register_ftrace_event(&call->event); if (!ret) { @@ -1242,7 +1300,7 @@ static int register_probe_event(struct trace_probe *tp) } call->flags = 0; call->class->reg = kprobe_register; - call->data = tp; + call->data = tk; ret = trace_add_event_call(call); if (ret) { pr_info("Failed to register kprobe event: %s\n", call->name); @@ -1252,14 +1310,14 @@ static int register_probe_event(struct trace_probe *tp) return ret; } -static int unregister_probe_event(struct trace_probe *tp) +static int unregister_kprobe_event(struct trace_kprobe *tk) { int ret; /* tp->event is unregistered in trace_remove_event_call() */ - ret = trace_remove_event_call(&tp->call); + ret = trace_remove_event_call(&tk->tp.call); if (!ret) - kfree(tp->call.print_fmt); + kfree(tk->tp.call.print_fmt); return ret; } @@ -1269,7 +1327,7 @@ static __init int init_kprobe_trace(void) struct dentry *d_tracer; struct dentry *entry; - if (register_module_notifier(&trace_probe_module_nb)) + if (register_module_notifier(&trace_kprobe_module_nb)) return -EINVAL; d_tracer = tracing_init_dentry(); @@ -1309,26 +1367,26 @@ static __used int kprobe_trace_selftest_target(int a1, int a2, int a3, } static struct ftrace_event_file * -find_trace_probe_file(struct trace_probe *tp, struct trace_array *tr) +find_trace_probe_file(struct trace_kprobe *tk, struct trace_array *tr) { struct ftrace_event_file *file; list_for_each_entry(file, &tr->events, list) - if (file->event_call == &tp->call) + if (file->event_call == &tk->tp.call) return file; return NULL; } /* - * Nobody but us can call enable_trace_probe/disable_trace_probe at this + * Nobody but us can call enable_trace_kprobe/disable_trace_kprobe at this * stage, we can do this lockless. */ static __init int kprobe_trace_self_tests_init(void) { int ret, warn = 0; int (*target)(int, int, int, int, int, int); - struct trace_probe *tp; + struct trace_kprobe *tk; struct ftrace_event_file *file; target = kprobe_trace_selftest_target; @@ -1337,44 +1395,44 @@ static __init int kprobe_trace_self_tests_init(void) ret = traceprobe_command("p:testprobe kprobe_trace_selftest_target " "$stack $stack0 +0($stack)", - create_trace_probe); + create_trace_kprobe); if (WARN_ON_ONCE(ret)) { pr_warn("error on probing function entry.\n"); warn++; } else { /* Enable trace point */ - tp = find_trace_probe("testprobe", KPROBE_EVENT_SYSTEM); - if (WARN_ON_ONCE(tp == NULL)) { + tk = find_trace_kprobe("testprobe", KPROBE_EVENT_SYSTEM); + if (WARN_ON_ONCE(tk == NULL)) { pr_warn("error on getting new probe.\n"); warn++; } else { - file = find_trace_probe_file(tp, top_trace_array()); + file = find_trace_probe_file(tk, top_trace_array()); if (WARN_ON_ONCE(file == NULL)) { pr_warn("error on getting probe file.\n"); warn++; } else - enable_trace_probe(tp, file); + enable_trace_kprobe(tk, file); } } ret = traceprobe_command("r:testprobe2 kprobe_trace_selftest_target " - "$retval", create_trace_probe); + "$retval", create_trace_kprobe); if (WARN_ON_ONCE(ret)) { pr_warn("error on probing function return.\n"); warn++; } else { /* Enable trace point */ - tp = find_trace_probe("testprobe2", KPROBE_EVENT_SYSTEM); - if (WARN_ON_ONCE(tp == NULL)) { + tk = find_trace_kprobe("testprobe2", KPROBE_EVENT_SYSTEM); + if (WARN_ON_ONCE(tk == NULL)) { pr_warn("error on getting 2nd new probe.\n"); warn++; } else { - file = find_trace_probe_file(tp, top_trace_array()); + file = find_trace_probe_file(tk, top_trace_array()); if (WARN_ON_ONCE(file == NULL)) { pr_warn("error on getting probe file.\n"); warn++; } else - enable_trace_probe(tp, file); + enable_trace_kprobe(tk, file); } } @@ -1384,46 +1442,46 @@ static __init int kprobe_trace_self_tests_init(void) ret = target(1, 2, 3, 4, 5, 6); /* Disable trace points before removing it */ - tp = find_trace_probe("testprobe", KPROBE_EVENT_SYSTEM); - if (WARN_ON_ONCE(tp == NULL)) { + tk = find_trace_kprobe("testprobe", KPROBE_EVENT_SYSTEM); + if (WARN_ON_ONCE(tk == NULL)) { pr_warn("error on getting test probe.\n"); warn++; } else { - file = find_trace_probe_file(tp, top_trace_array()); + file = find_trace_probe_file(tk, top_trace_array()); if (WARN_ON_ONCE(file == NULL)) { pr_warn("error on getting probe file.\n"); warn++; } else - disable_trace_probe(tp, file); + disable_trace_kprobe(tk, file); } - tp = find_trace_probe("testprobe2", KPROBE_EVENT_SYSTEM); - if (WARN_ON_ONCE(tp == NULL)) { + tk = find_trace_kprobe("testprobe2", KPROBE_EVENT_SYSTEM); + if (WARN_ON_ONCE(tk == NULL)) { pr_warn("error on getting 2nd test probe.\n"); warn++; } else { - file = find_trace_probe_file(tp, top_trace_array()); + file = find_trace_probe_file(tk, top_trace_array()); if (WARN_ON_ONCE(file == NULL)) { pr_warn("error on getting probe file.\n"); warn++; } else - disable_trace_probe(tp, file); + disable_trace_kprobe(tk, file); } - ret = traceprobe_command("-:testprobe", create_trace_probe); + ret = traceprobe_command("-:testprobe", create_trace_kprobe); if (WARN_ON_ONCE(ret)) { pr_warn("error on deleting a probe.\n"); warn++; } - ret = traceprobe_command("-:testprobe2", create_trace_probe); + ret = traceprobe_command("-:testprobe2", create_trace_kprobe); if (WARN_ON_ONCE(ret)) { pr_warn("error on deleting a probe.\n"); warn++; } end: - release_all_trace_probes(); + release_all_trace_kprobes(); if (warn) pr_cont("NG: Some tests are failed. Please check them.\n"); else diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 412e959..8364a42 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -35,46 +35,27 @@ const char *reserved_field_names[] = { FIELD_STRING_FUNC, }; -/* Printing function type */ -#define PRINT_TYPE_FUNC_NAME(type) print_type_##type -#define PRINT_TYPE_FMT_NAME(type) print_type_format_##type - /* Printing in basic type function template */ -#define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt, cast) \ -static __kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \ +#define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt) \ +__kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \ const char *name, \ - void *data, void *ent)\ + void *data, void *ent) \ { \ - return trace_seq_printf(s, " %s=" fmt, name, (cast)*(type *)data);\ + return trace_seq_printf(s, " %s=" fmt, name, *(type *)data); \ } \ -static const char PRINT_TYPE_FMT_NAME(type)[] = fmt; - -DEFINE_BASIC_PRINT_TYPE_FUNC(u8, "%x", unsigned int) -DEFINE_BASIC_PRINT_TYPE_FUNC(u16, "%x", unsigned int) -DEFINE_BASIC_PRINT_TYPE_FUNC(u32, "%lx", unsigned long) -DEFINE_BASIC_PRINT_TYPE_FUNC(u64, "%llx", unsigned long long) -DEFINE_BASIC_PRINT_TYPE_FUNC(s8, "%d", int) -DEFINE_BASIC_PRINT_TYPE_FUNC(s16, "%d", int) -DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%ld", long) -DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%lld", long long) - -static inline void *get_rloc_data(u32 *dl) -{ - return (u8 *)dl + get_rloc_offs(*dl); -} +const char PRINT_TYPE_FMT_NAME(type)[] = fmt; -/* For data_loc conversion */ -static inline void *get_loc_data(u32 *dl, void *ent) -{ - return (u8 *)ent + get_rloc_offs(*dl); -} - -/* For defining macros, define string/string_size types */ -typedef u32 string; -typedef u32 string_size; +DEFINE_BASIC_PRINT_TYPE_FUNC(u8 , "0x%x") +DEFINE_BASIC_PRINT_TYPE_FUNC(u16, "0x%x") +DEFINE_BASIC_PRINT_TYPE_FUNC(u32, "0x%x") +DEFINE_BASIC_PRINT_TYPE_FUNC(u64, "0x%Lx") +DEFINE_BASIC_PRINT_TYPE_FUNC(s8, "%d") +DEFINE_BASIC_PRINT_TYPE_FUNC(s16, "%d") +DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%d") +DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%Ld") /* Print type function for string type */ -static __kprobes int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, +__kprobes int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, const char *name, void *data, void *ent) { @@ -87,18 +68,7 @@ static __kprobes int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, (const char *)get_loc_data(data, ent)); } -static const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\""; - -#define FETCH_FUNC_NAME(method, type) fetch_##method##_##type -/* - * Define macro for basic types - we don't need to define s* types, because - * we have to care only about bitwidth at recording time. - */ -#define DEFINE_BASIC_FETCH_FUNCS(method) \ -DEFINE_FETCH_##method(u8) \ -DEFINE_FETCH_##method(u16) \ -DEFINE_FETCH_##method(u32) \ -DEFINE_FETCH_##method(u64) +const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\""; #define CHECK_FETCH_FUNCS(method, fn) \ (((FETCH_FUNC_NAME(method, u8) == fn) || \ @@ -111,7 +81,7 @@ DEFINE_FETCH_##method(u64) /* Data fetch function templates */ #define DEFINE_FETCH_reg(type) \ -static __kprobes void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, \ +__kprobes void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, \ void *offset, void *dest) \ { \ *(type *)dest = (type)regs_get_register(regs, \ @@ -122,20 +92,8 @@ DEFINE_BASIC_FETCH_FUNCS(reg) #define fetch_reg_string NULL #define fetch_reg_string_size NULL -#define DEFINE_FETCH_stack(type) \ -static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\ - void *offset, void *dest) \ -{ \ - *(type *)dest = (type)regs_get_kernel_stack_nth(regs, \ - (unsigned int)((unsigned long)offset)); \ -} -DEFINE_BASIC_FETCH_FUNCS(stack) -/* No string on the stack entry */ -#define fetch_stack_string NULL -#define fetch_stack_string_size NULL - #define DEFINE_FETCH_retval(type) \ -static __kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs,\ +__kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs, \ void *dummy, void *dest) \ { \ *(type *)dest = (type)regs_return_value(regs); \ @@ -145,150 +103,16 @@ DEFINE_BASIC_FETCH_FUNCS(retval) #define fetch_retval_string NULL #define fetch_retval_string_size NULL -#define DEFINE_FETCH_memory(type) \ -static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\ - void *addr, void *dest) \ -{ \ - type retval; \ - if (probe_kernel_address(addr, retval)) \ - *(type *)dest = 0; \ - else \ - *(type *)dest = retval; \ -} -DEFINE_BASIC_FETCH_FUNCS(memory) -/* - * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max - * length and relative data location. - */ -static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, - void *addr, void *dest) -{ - long ret; - int maxlen = get_rloc_len(*(u32 *)dest); - u8 *dst = get_rloc_data(dest); - u8 *src = addr; - mm_segment_t old_fs = get_fs(); - - if (!maxlen) - return; - - /* - * Try to get string again, since the string can be changed while - * probing. - */ - set_fs(KERNEL_DS); - pagefault_disable(); - - do - ret = __copy_from_user_inatomic(dst++, src++, 1); - while (dst[-1] && ret == 0 && src - (u8 *)addr < maxlen); - - dst[-1] = '\0'; - pagefault_enable(); - set_fs(old_fs); - - if (ret < 0) { /* Failed to fetch string */ - ((u8 *)get_rloc_data(dest))[0] = '\0'; - *(u32 *)dest = make_data_rloc(0, get_rloc_offs(*(u32 *)dest)); - } else { - *(u32 *)dest = make_data_rloc(src - (u8 *)addr, - get_rloc_offs(*(u32 *)dest)); - } -} - -/* Return the length of string -- including null terminal byte */ -static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs, - void *addr, void *dest) -{ - mm_segment_t old_fs; - int ret, len = 0; - u8 c; - - old_fs = get_fs(); - set_fs(KERNEL_DS); - pagefault_disable(); - - do { - ret = __copy_from_user_inatomic(&c, (u8 *)addr + len, 1); - len++; - } while (c && ret == 0 && len < MAX_STRING_SIZE); - - pagefault_enable(); - set_fs(old_fs); - - if (ret < 0) /* Failed to check the length */ - *(u32 *)dest = 0; - else - *(u32 *)dest = len; -} - -/* Memory fetching by symbol */ -struct symbol_cache { - char *symbol; - long offset; - unsigned long addr; -}; - -static unsigned long update_symbol_cache(struct symbol_cache *sc) -{ - sc->addr = (unsigned long)kallsyms_lookup_name(sc->symbol); - - if (sc->addr) - sc->addr += sc->offset; - - return sc->addr; -} - -static void free_symbol_cache(struct symbol_cache *sc) -{ - kfree(sc->symbol); - kfree(sc); -} - -static struct symbol_cache *alloc_symbol_cache(const char *sym, long offset) -{ - struct symbol_cache *sc; - - if (!sym || strlen(sym) == 0) - return NULL; - - sc = kzalloc(sizeof(struct symbol_cache), GFP_KERNEL); - if (!sc) - return NULL; - - sc->symbol = kstrdup(sym, GFP_KERNEL); - if (!sc->symbol) { - kfree(sc); - return NULL; - } - sc->offset = offset; - update_symbol_cache(sc); - - return sc; -} - -#define DEFINE_FETCH_symbol(type) \ -static __kprobes void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs,\ - void *data, void *dest) \ -{ \ - struct symbol_cache *sc = data; \ - if (sc->addr) \ - fetch_memory_##type(regs, (void *)sc->addr, dest); \ - else \ - *(type *)dest = 0; \ -} -DEFINE_BASIC_FETCH_FUNCS(symbol) -DEFINE_FETCH_symbol(string) -DEFINE_FETCH_symbol(string_size) - /* Dereference memory access function */ struct deref_fetch_param { struct fetch_param orig; long offset; + fetch_func_t fetch; + fetch_func_t fetch_size; }; #define DEFINE_FETCH_deref(type) \ -static __kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs,\ +__kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs, \ void *data, void *dest) \ { \ struct deref_fetch_param *dprm = data; \ @@ -296,13 +120,26 @@ static __kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs,\ call_fetch(&dprm->orig, regs, &addr); \ if (addr) { \ addr += dprm->offset; \ - fetch_memory_##type(regs, (void *)addr, dest); \ + dprm->fetch(regs, (void *)addr, dest); \ } else \ *(type *)dest = 0; \ } DEFINE_BASIC_FETCH_FUNCS(deref) DEFINE_FETCH_deref(string) -DEFINE_FETCH_deref(string_size) + +__kprobes void FETCH_FUNC_NAME(deref, string_size)(struct pt_regs *regs, + void *data, void *dest) +{ + struct deref_fetch_param *dprm = data; + unsigned long addr; + + call_fetch(&dprm->orig, regs, &addr); + if (addr && dprm->fetch_size) { + addr += dprm->offset; + dprm->fetch_size(regs, (void *)addr, dest); + } else + *(string_size *)dest = 0; +} static __kprobes void update_deref_fetch_param(struct deref_fetch_param *data) { @@ -329,7 +166,7 @@ struct bitfield_fetch_param { }; #define DEFINE_FETCH_bitfield(type) \ -static __kprobes void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs,\ +__kprobes void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs, \ void *data, void *dest) \ { \ struct bitfield_fetch_param *bprm = data; \ @@ -374,58 +211,8 @@ free_bitfield_fetch_param(struct bitfield_fetch_param *data) kfree(data); } -/* Default (unsigned long) fetch type */ -#define __DEFAULT_FETCH_TYPE(t) u##t -#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t) -#define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG) -#define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE) - -#define ASSIGN_FETCH_FUNC(method, type) \ - [FETCH_MTD_##method] = FETCH_FUNC_NAME(method, type) - -#define __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, _fmttype) \ - {.name = _name, \ - .size = _size, \ - .is_signed = sign, \ - .print = PRINT_TYPE_FUNC_NAME(ptype), \ - .fmt = PRINT_TYPE_FMT_NAME(ptype), \ - .fmttype = _fmttype, \ - .fetch = { \ -ASSIGN_FETCH_FUNC(reg, ftype), \ -ASSIGN_FETCH_FUNC(stack, ftype), \ -ASSIGN_FETCH_FUNC(retval, ftype), \ -ASSIGN_FETCH_FUNC(memory, ftype), \ -ASSIGN_FETCH_FUNC(symbol, ftype), \ -ASSIGN_FETCH_FUNC(deref, ftype), \ -ASSIGN_FETCH_FUNC(bitfield, ftype), \ - } \ - } - -#define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \ - __ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #ptype) - -#define FETCH_TYPE_STRING 0 -#define FETCH_TYPE_STRSIZE 1 - -/* Fetch type information table */ -static const struct fetch_type fetch_type_table[] = { - /* Special types */ - [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string, - sizeof(u32), 1, "__data_loc char[]"), - [FETCH_TYPE_STRSIZE] = __ASSIGN_FETCH_TYPE("string_size", u32, - string_size, sizeof(u32), 0, "u32"), - /* Basic types */ - ASSIGN_FETCH_TYPE(u8, u8, 0), - ASSIGN_FETCH_TYPE(u16, u16, 0), - ASSIGN_FETCH_TYPE(u32, u32, 0), - ASSIGN_FETCH_TYPE(u64, u64, 0), - ASSIGN_FETCH_TYPE(s8, u8, 1), - ASSIGN_FETCH_TYPE(s16, u16, 1), - ASSIGN_FETCH_TYPE(s32, u32, 1), - ASSIGN_FETCH_TYPE(s64, u64, 1), -}; - -static const struct fetch_type *find_fetch_type(const char *type) +static const struct fetch_type *find_fetch_type(const char *type, + const struct fetch_type *ftbl) { int i; @@ -446,44 +233,52 @@ static const struct fetch_type *find_fetch_type(const char *type) switch (bs) { case 8: - return find_fetch_type("u8"); + return find_fetch_type("u8", ftbl); case 16: - return find_fetch_type("u16"); + return find_fetch_type("u16", ftbl); case 32: - return find_fetch_type("u32"); + return find_fetch_type("u32", ftbl); case 64: - return find_fetch_type("u64"); + return find_fetch_type("u64", ftbl); default: goto fail; } } - for (i = 0; i < ARRAY_SIZE(fetch_type_table); i++) - if (strcmp(type, fetch_type_table[i].name) == 0) - return &fetch_type_table[i]; + for (i = 0; ftbl[i].name; i++) { + if (strcmp(type, ftbl[i].name) == 0) + return &ftbl[i]; + } fail: return NULL; } /* Special function : only accept unsigned long */ -static __kprobes void fetch_stack_address(struct pt_regs *regs, - void *dummy, void *dest) +static __kprobes void fetch_kernel_stack_address(struct pt_regs *regs, + void *dummy, void *dest) { *(unsigned long *)dest = kernel_stack_pointer(regs); } +static __kprobes void fetch_user_stack_address(struct pt_regs *regs, + void *dummy, void *dest) +{ + *(unsigned long *)dest = user_stack_pointer(regs); +} + static fetch_func_t get_fetch_size_function(const struct fetch_type *type, - fetch_func_t orig_fn) + fetch_func_t orig_fn, + const struct fetch_type *ftbl) { int i; - if (type != &fetch_type_table[FETCH_TYPE_STRING]) + if (type != &ftbl[FETCH_TYPE_STRING]) return NULL; /* Only string type needs size function */ for (i = 0; i < FETCH_MTD_END; i++) if (type->fetch[i] == orig_fn) - return fetch_type_table[FETCH_TYPE_STRSIZE].fetch[i]; + return ftbl[FETCH_TYPE_STRSIZE].fetch[i]; WARN_ON(1); /* This should not happen */ @@ -516,7 +311,8 @@ int traceprobe_split_symbol_offset(char *symbol, unsigned long *offset) #define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long)) static int parse_probe_vars(char *arg, const struct fetch_type *t, - struct fetch_param *f, bool is_return) + struct fetch_param *f, bool is_return, + bool is_kprobe) { int ret = 0; unsigned long param; @@ -528,13 +324,16 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t, ret = -EINVAL; } else if (strncmp(arg, "stack", 5) == 0) { if (arg[5] == '\0') { - if (strcmp(t->name, DEFAULT_FETCH_TYPE_STR) == 0) - f->fn = fetch_stack_address; + if (strcmp(t->name, DEFAULT_FETCH_TYPE_STR)) + return -EINVAL; + + if (is_kprobe) + f->fn = fetch_kernel_stack_address; else - ret = -EINVAL; + f->fn = fetch_user_stack_address; } else if (isdigit(arg[5])) { ret = kstrtoul(arg + 5, 10, ¶m); - if (ret || param > PARAM_MAX_STACK) + if (ret || (is_kprobe && param > PARAM_MAX_STACK)) ret = -EINVAL; else { f->fn = t->fetch[FETCH_MTD_stack]; @@ -552,20 +351,18 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t, static int parse_probe_arg(char *arg, const struct fetch_type *t, struct fetch_param *f, bool is_return, bool is_kprobe) { + const struct fetch_type *ftbl; unsigned long param; long offset; char *tmp; - int ret; - - ret = 0; + int ret = 0; - /* Until uprobe_events supports only reg arguments */ - if (!is_kprobe && arg[0] != '%') - return -EINVAL; + ftbl = is_kprobe ? kprobes_fetch_type_table : uprobes_fetch_type_table; + BUG_ON(ftbl == NULL); switch (arg[0]) { case '$': - ret = parse_probe_vars(arg + 1, t, f, is_return); + ret = parse_probe_vars(arg + 1, t, f, is_return, is_kprobe); break; case '%': /* named register */ @@ -577,7 +374,7 @@ static int parse_probe_arg(char *arg, const struct fetch_type *t, } break; - case '@': /* memory or symbol */ + case '@': /* memory, file-offset or symbol */ if (isdigit(arg[1])) { ret = kstrtoul(arg + 1, 0, ¶m); if (ret) @@ -585,7 +382,22 @@ static int parse_probe_arg(char *arg, const struct fetch_type *t, f->fn = t->fetch[FETCH_MTD_memory]; f->data = (void *)param; + } else if (arg[1] == '+') { + /* kprobes don't support file offsets */ + if (is_kprobe) + return -EINVAL; + + ret = kstrtol(arg + 2, 0, &offset); + if (ret) + break; + + f->fn = t->fetch[FETCH_MTD_file_offset]; + f->data = (void *)offset; } else { + /* uprobes don't support symbols */ + if (!is_kprobe) + return -EINVAL; + ret = traceprobe_split_symbol_offset(arg + 1, &offset); if (ret) break; @@ -616,7 +428,7 @@ static int parse_probe_arg(char *arg, const struct fetch_type *t, struct deref_fetch_param *dprm; const struct fetch_type *t2; - t2 = find_fetch_type(NULL); + t2 = find_fetch_type(NULL, ftbl); *tmp = '\0'; dprm = kzalloc(sizeof(struct deref_fetch_param), GFP_KERNEL); @@ -624,6 +436,9 @@ static int parse_probe_arg(char *arg, const struct fetch_type *t, return -ENOMEM; dprm->offset = offset; + dprm->fetch = t->fetch[FETCH_MTD_memory]; + dprm->fetch_size = get_fetch_size_function(t, + dprm->fetch, ftbl); ret = parse_probe_arg(arg, t2, &dprm->orig, is_return, is_kprobe); if (ret) @@ -685,9 +500,13 @@ static int __parse_bitfield_probe_arg(const char *bf, int traceprobe_parse_probe_arg(char *arg, ssize_t *size, struct probe_arg *parg, bool is_return, bool is_kprobe) { + const struct fetch_type *ftbl; const char *t; int ret; + ftbl = is_kprobe ? kprobes_fetch_type_table : uprobes_fetch_type_table; + BUG_ON(ftbl == NULL); + if (strlen(arg) > MAX_ARGSTR_LEN) { pr_info("Argument is too long.: %s\n", arg); return -ENOSPC; @@ -702,7 +521,7 @@ int traceprobe_parse_probe_arg(char *arg, ssize_t *size, arg[t - parg->comm] = '\0'; t++; } - parg->type = find_fetch_type(t); + parg->type = find_fetch_type(t, ftbl); if (!parg->type) { pr_info("Unsupported type: %s\n", t); return -EINVAL; @@ -716,7 +535,8 @@ int traceprobe_parse_probe_arg(char *arg, ssize_t *size, if (ret >= 0) { parg->fetch_size.fn = get_fetch_size_function(parg->type, - parg->fetch.fn); + parg->fetch.fn, + ftbl); parg->fetch_size.data = parg->fetch.data; } @@ -837,3 +657,65 @@ out: return ret; } + +static int __set_print_fmt(struct trace_probe *tp, char *buf, int len, + bool is_return) +{ + int i; + int pos = 0; + + const char *fmt, *arg; + + if (!is_return) { + fmt = "(%lx)"; + arg = "REC->" FIELD_STRING_IP; + } else { + fmt = "(%lx <- %lx)"; + arg = "REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP; + } + + /* When len=0, we just calculate the needed length */ +#define LEN_OR_ZERO (len ? len - pos : 0) + + pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt); + + for (i = 0; i < tp->nr_args; i++) { + pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%s", + tp->args[i].name, tp->args[i].type->fmt); + } + + pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg); + + for (i = 0; i < tp->nr_args; i++) { + if (strcmp(tp->args[i].type->name, "string") == 0) + pos += snprintf(buf + pos, LEN_OR_ZERO, + ", __get_str(%s)", + tp->args[i].name); + else + pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s", + tp->args[i].name); + } + +#undef LEN_OR_ZERO + + /* return the length of print_fmt */ + return pos; +} + +int set_print_fmt(struct trace_probe *tp, bool is_return) +{ + int len; + char *print_fmt; + + /* First: called with 0 length to calculate the needed length */ + len = __set_print_fmt(tp, NULL, 0, is_return); + print_fmt = kmalloc(len + 1, GFP_KERNEL); + if (!print_fmt) + return -ENOMEM; + + /* Second: actually write the @print_fmt */ + __set_print_fmt(tp, print_fmt, len + 1, is_return); + tp->call.print_fmt = print_fmt; + + return 0; +} diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 5c7e09d..b73574a 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -81,6 +81,17 @@ */ #define convert_rloc_to_loc(dl, offs) ((u32)(dl) + (offs)) +static inline void *get_rloc_data(u32 *dl) +{ + return (u8 *)dl + get_rloc_offs(*dl); +} + +/* For data_loc conversion */ +static inline void *get_loc_data(u32 *dl, void *ent) +{ + return (u8 *)ent + get_rloc_offs(*dl); +} + /* Data fetch function type */ typedef void (*fetch_func_t)(struct pt_regs *, void *, void *); /* Printing function type */ @@ -95,6 +106,7 @@ enum { FETCH_MTD_symbol, FETCH_MTD_deref, FETCH_MTD_bitfield, + FETCH_MTD_file_offset, FETCH_MTD_END, }; @@ -115,6 +127,148 @@ struct fetch_param { void *data; }; +/* For defining macros, define string/string_size types */ +typedef u32 string; +typedef u32 string_size; + +#define PRINT_TYPE_FUNC_NAME(type) print_type_##type +#define PRINT_TYPE_FMT_NAME(type) print_type_format_##type + +/* Printing in basic type function template */ +#define DECLARE_BASIC_PRINT_TYPE_FUNC(type) \ +__kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \ + const char *name, \ + void *data, void *ent); \ +extern const char PRINT_TYPE_FMT_NAME(type)[] + +DECLARE_BASIC_PRINT_TYPE_FUNC(u8); +DECLARE_BASIC_PRINT_TYPE_FUNC(u16); +DECLARE_BASIC_PRINT_TYPE_FUNC(u32); +DECLARE_BASIC_PRINT_TYPE_FUNC(u64); +DECLARE_BASIC_PRINT_TYPE_FUNC(s8); +DECLARE_BASIC_PRINT_TYPE_FUNC(s16); +DECLARE_BASIC_PRINT_TYPE_FUNC(s32); +DECLARE_BASIC_PRINT_TYPE_FUNC(s64); +DECLARE_BASIC_PRINT_TYPE_FUNC(string); + +#define FETCH_FUNC_NAME(method, type) fetch_##method##_##type + +/* Declare macro for basic types */ +#define DECLARE_FETCH_FUNC(method, type) \ +extern void FETCH_FUNC_NAME(method, type)(struct pt_regs *regs, \ + void *data, void *dest) + +#define DECLARE_BASIC_FETCH_FUNCS(method) \ +DECLARE_FETCH_FUNC(method, u8); \ +DECLARE_FETCH_FUNC(method, u16); \ +DECLARE_FETCH_FUNC(method, u32); \ +DECLARE_FETCH_FUNC(method, u64) + +DECLARE_BASIC_FETCH_FUNCS(reg); +#define fetch_reg_string NULL +#define fetch_reg_string_size NULL + +DECLARE_BASIC_FETCH_FUNCS(retval); +#define fetch_retval_string NULL +#define fetch_retval_string_size NULL + +DECLARE_BASIC_FETCH_FUNCS(symbol); +DECLARE_FETCH_FUNC(symbol, string); +DECLARE_FETCH_FUNC(symbol, string_size); + +DECLARE_BASIC_FETCH_FUNCS(deref); +DECLARE_FETCH_FUNC(deref, string); +DECLARE_FETCH_FUNC(deref, string_size); + +DECLARE_BASIC_FETCH_FUNCS(bitfield); +#define fetch_bitfield_string NULL +#define fetch_bitfield_string_size NULL + +/* + * Define macro for basic types - we don't need to define s* types, because + * we have to care only about bitwidth at recording time. + */ +#define DEFINE_BASIC_FETCH_FUNCS(method) \ +DEFINE_FETCH_##method(u8) \ +DEFINE_FETCH_##method(u16) \ +DEFINE_FETCH_##method(u32) \ +DEFINE_FETCH_##method(u64) + +/* Default (unsigned long) fetch type */ +#define __DEFAULT_FETCH_TYPE(t) u##t +#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t) +#define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG) +#define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE) + +#define ASSIGN_FETCH_FUNC(method, type) \ + [FETCH_MTD_##method] = FETCH_FUNC_NAME(method, type) + +#define __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, _fmttype) \ + {.name = _name, \ + .size = _size, \ + .is_signed = sign, \ + .print = PRINT_TYPE_FUNC_NAME(ptype), \ + .fmt = PRINT_TYPE_FMT_NAME(ptype), \ + .fmttype = _fmttype, \ + .fetch = { \ +ASSIGN_FETCH_FUNC(reg, ftype), \ +ASSIGN_FETCH_FUNC(stack, ftype), \ +ASSIGN_FETCH_FUNC(retval, ftype), \ +ASSIGN_FETCH_FUNC(memory, ftype), \ +ASSIGN_FETCH_FUNC(symbol, ftype), \ +ASSIGN_FETCH_FUNC(deref, ftype), \ +ASSIGN_FETCH_FUNC(bitfield, ftype), \ +ASSIGN_FETCH_FUNC(file_offset, ftype), \ + } \ + } + +#define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \ + __ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #ptype) + +#define ASSIGN_FETCH_TYPE_END {} + +#define FETCH_TYPE_STRING 0 +#define FETCH_TYPE_STRSIZE 1 + +/* + * Fetch type information table. + * It's declared as a weak symbol due to conditional compilation. + */ +extern __weak const struct fetch_type kprobes_fetch_type_table[]; +extern __weak const struct fetch_type uprobes_fetch_type_table[]; + +#ifdef CONFIG_KPROBE_EVENT +struct symbol_cache; +unsigned long update_symbol_cache(struct symbol_cache *sc); +void free_symbol_cache(struct symbol_cache *sc); +struct symbol_cache *alloc_symbol_cache(const char *sym, long offset); +#else +/* uprobes do not support symbol fetch methods */ +#define fetch_symbol_u8 NULL +#define fetch_symbol_u16 NULL +#define fetch_symbol_u32 NULL +#define fetch_symbol_u64 NULL +#define fetch_symbol_string NULL +#define fetch_symbol_string_size NULL + +struct symbol_cache { +}; +static inline unsigned long __used update_symbol_cache(struct symbol_cache *sc) +{ + return 0; +} + +static inline void __used free_symbol_cache(struct symbol_cache *sc) +{ +} + +static inline struct symbol_cache * __used +alloc_symbol_cache(const char *sym, long offset) +{ + return NULL; +} +#endif /* CONFIG_KPROBE_EVENT */ + struct probe_arg { struct fetch_param fetch; struct fetch_param fetch_size; @@ -124,6 +278,26 @@ struct probe_arg { const struct fetch_type *type; /* Type of this argument */ }; +struct trace_probe { + unsigned int flags; /* For TP_FLAG_* */ + struct ftrace_event_class class; + struct ftrace_event_call call; + struct list_head files; + ssize_t size; /* trace entry size */ + unsigned int nr_args; + struct probe_arg args[]; +}; + +static inline bool trace_probe_is_enabled(struct trace_probe *tp) +{ + return !!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE)); +} + +static inline bool trace_probe_is_registered(struct trace_probe *tp) +{ + return !!(tp->flags & TP_FLAG_REGISTERED); +} + static inline __kprobes void call_fetch(struct fetch_param *fprm, struct pt_regs *regs, void *dest) { @@ -158,3 +332,53 @@ extern ssize_t traceprobe_probes_write(struct file *file, int (*createfn)(int, char**)); extern int traceprobe_command(const char *buf, int (*createfn)(int, char**)); + +/* Sum up total data length for dynamic arraies (strings) */ +static inline __kprobes int +__get_data_size(struct trace_probe *tp, struct pt_regs *regs) +{ + int i, ret = 0; + u32 len; + + for (i = 0; i < tp->nr_args; i++) + if (unlikely(tp->args[i].fetch_size.fn)) { + call_fetch(&tp->args[i].fetch_size, regs, &len); + ret += len; + } + + return ret; +} + +/* Store the value of each argument */ +static inline __kprobes void +store_trace_args(int ent_size, struct trace_probe *tp, struct pt_regs *regs, + u8 *data, int maxlen) +{ + int i; + u32 end = tp->size; + u32 *dl; /* Data (relative) location */ + + for (i = 0; i < tp->nr_args; i++) { + if (unlikely(tp->args[i].fetch_size.fn)) { + /* + * First, we set the relative location and + * maximum data length to *dl + */ + dl = (u32 *)(data + tp->args[i].offset); + *dl = make_data_rloc(maxlen, end - tp->args[i].offset); + /* Then try to fetch string or dynamic array data */ + call_fetch(&tp->args[i].fetch, regs, dl); + /* Reduce maximum length */ + end += get_rloc_len(*dl); + maxlen -= get_rloc_len(*dl); + /* Trick here, convert data_rloc to data_loc */ + *dl = convert_rloc_to_loc(*dl, + ent_size + tp->args[i].offset); + } else + /* Just fetching data normally */ + call_fetch(&tp->args[i].fetch, regs, + data + tp->args[i].offset); + } +} + +extern int set_print_fmt(struct trace_probe *tp, bool is_return); diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index b20428c..e6be585 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -382,7 +382,7 @@ static const struct file_operations stack_trace_filter_fops = { .open = stack_trace_filter_open, .read = seq_read, .write = ftrace_filter_write, - .llseek = ftrace_filter_lseek, + .llseek = tracing_lseek, .release = ftrace_regex_release, }; diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index ea90eb5..759d5e0 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -321,7 +321,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) if (!ftrace_file) return; - if (test_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &ftrace_file->flags)) + if (ftrace_trigger_soft_disabled(ftrace_file)) return; sys_data = syscall_nr_to_meta(syscall_nr); @@ -343,9 +343,8 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) entry->nr = syscall_nr; syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args); - if (!filter_check_discard(ftrace_file, entry, buffer, event)) - trace_current_buffer_unlock_commit(buffer, event, - irq_flags, pc); + event_trigger_unlock_commit(ftrace_file, buffer, event, entry, + irq_flags, pc); } static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) @@ -369,7 +368,7 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) if (!ftrace_file) return; - if (test_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &ftrace_file->flags)) + if (ftrace_trigger_soft_disabled(ftrace_file)) return; sys_data = syscall_nr_to_meta(syscall_nr); @@ -390,9 +389,8 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) entry->nr = syscall_nr; entry->ret = syscall_get_return_value(current, regs); - if (!filter_check_discard(ftrace_file, entry, buffer, event)) - trace_current_buffer_unlock_commit(buffer, event, - irq_flags, pc); + event_trigger_unlock_commit(ftrace_file, buffer, event, entry, + irq_flags, pc); } static int reg_event_syscall_enter(struct ftrace_event_file *file, diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index b6dcc42..79e52d9 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -51,22 +51,17 @@ struct trace_uprobe_filter { */ struct trace_uprobe { struct list_head list; - struct ftrace_event_class class; - struct ftrace_event_call call; struct trace_uprobe_filter filter; struct uprobe_consumer consumer; struct inode *inode; char *filename; unsigned long offset; unsigned long nhit; - unsigned int flags; /* For TP_FLAG_* */ - ssize_t size; /* trace entry size */ - unsigned int nr_args; - struct probe_arg args[]; + struct trace_probe tp; }; -#define SIZEOF_TRACE_UPROBE(n) \ - (offsetof(struct trace_uprobe, args) + \ +#define SIZEOF_TRACE_UPROBE(n) \ + (offsetof(struct trace_uprobe, tp.args) + \ (sizeof(struct probe_arg) * (n))) static int register_uprobe_event(struct trace_uprobe *tu); @@ -75,10 +70,151 @@ static int unregister_uprobe_event(struct trace_uprobe *tu); static DEFINE_MUTEX(uprobe_lock); static LIST_HEAD(uprobe_list); +struct uprobe_dispatch_data { + struct trace_uprobe *tu; + unsigned long bp_addr; +}; + static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs); static int uretprobe_dispatcher(struct uprobe_consumer *con, unsigned long func, struct pt_regs *regs); +#ifdef CONFIG_STACK_GROWSUP +static unsigned long adjust_stack_addr(unsigned long addr, unsigned int n) +{ + return addr - (n * sizeof(long)); +} +#else +static unsigned long adjust_stack_addr(unsigned long addr, unsigned int n) +{ + return addr + (n * sizeof(long)); +} +#endif + +static unsigned long get_user_stack_nth(struct pt_regs *regs, unsigned int n) +{ + unsigned long ret; + unsigned long addr = user_stack_pointer(regs); + + addr = adjust_stack_addr(addr, n); + + if (copy_from_user(&ret, (void __force __user *) addr, sizeof(ret))) + return 0; + + return ret; +} + +/* + * Uprobes-specific fetch functions + */ +#define DEFINE_FETCH_stack(type) \ +static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\ + void *offset, void *dest) \ +{ \ + *(type *)dest = (type)get_user_stack_nth(regs, \ + ((unsigned long)offset)); \ +} +DEFINE_BASIC_FETCH_FUNCS(stack) +/* No string on the stack entry */ +#define fetch_stack_string NULL +#define fetch_stack_string_size NULL + +#define DEFINE_FETCH_memory(type) \ +static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\ + void *addr, void *dest) \ +{ \ + type retval; \ + void __user *vaddr = (void __force __user *) addr; \ + \ + if (copy_from_user(&retval, vaddr, sizeof(type))) \ + *(type *)dest = 0; \ + else \ + *(type *) dest = retval; \ +} +DEFINE_BASIC_FETCH_FUNCS(memory) +/* + * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max + * length and relative data location. + */ +static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, + void *addr, void *dest) +{ + long ret; + u32 rloc = *(u32 *)dest; + int maxlen = get_rloc_len(rloc); + u8 *dst = get_rloc_data(dest); + void __user *src = (void __force __user *) addr; + + if (!maxlen) + return; + + ret = strncpy_from_user(dst, src, maxlen); + + if (ret < 0) { /* Failed to fetch string */ + ((u8 *)get_rloc_data(dest))[0] = '\0'; + *(u32 *)dest = make_data_rloc(0, get_rloc_offs(rloc)); + } else { + *(u32 *)dest = make_data_rloc(ret, get_rloc_offs(rloc)); + } +} + +static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs, + void *addr, void *dest) +{ + int len; + void __user *vaddr = (void __force __user *) addr; + + len = strnlen_user(vaddr, MAX_STRING_SIZE); + + if (len == 0 || len > MAX_STRING_SIZE) /* Failed to check length */ + *(u32 *)dest = 0; + else + *(u32 *)dest = len; +} + +static unsigned long translate_user_vaddr(void *file_offset) +{ + unsigned long base_addr; + struct uprobe_dispatch_data *udd; + + udd = (void *) current->utask->vaddr; + + base_addr = udd->bp_addr - udd->tu->offset; + return base_addr + (unsigned long)file_offset; +} + +#define DEFINE_FETCH_file_offset(type) \ +static __kprobes void FETCH_FUNC_NAME(file_offset, type)(struct pt_regs *regs,\ + void *offset, void *dest) \ +{ \ + void *vaddr = (void *)translate_user_vaddr(offset); \ + \ + FETCH_FUNC_NAME(memory, type)(regs, vaddr, dest); \ +} +DEFINE_BASIC_FETCH_FUNCS(file_offset) +DEFINE_FETCH_file_offset(string) +DEFINE_FETCH_file_offset(string_size) + +/* Fetch type information table */ +const struct fetch_type uprobes_fetch_type_table[] = { + /* Special types */ + [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string, + sizeof(u32), 1, "__data_loc char[]"), + [FETCH_TYPE_STRSIZE] = __ASSIGN_FETCH_TYPE("string_size", u32, + string_size, sizeof(u32), 0, "u32"), + /* Basic types */ + ASSIGN_FETCH_TYPE(u8, u8, 0), + ASSIGN_FETCH_TYPE(u16, u16, 0), + ASSIGN_FETCH_TYPE(u32, u32, 0), + ASSIGN_FETCH_TYPE(u64, u64, 0), + ASSIGN_FETCH_TYPE(s8, u8, 1), + ASSIGN_FETCH_TYPE(s16, u16, 1), + ASSIGN_FETCH_TYPE(s32, u32, 1), + ASSIGN_FETCH_TYPE(s64, u64, 1), + + ASSIGN_FETCH_TYPE_END +}; + static inline void init_trace_uprobe_filter(struct trace_uprobe_filter *filter) { rwlock_init(&filter->rwlock); @@ -114,13 +250,13 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret) if (!tu) return ERR_PTR(-ENOMEM); - tu->call.class = &tu->class; - tu->call.name = kstrdup(event, GFP_KERNEL); - if (!tu->call.name) + tu->tp.call.class = &tu->tp.class; + tu->tp.call.name = kstrdup(event, GFP_KERNEL); + if (!tu->tp.call.name) goto error; - tu->class.system = kstrdup(group, GFP_KERNEL); - if (!tu->class.system) + tu->tp.class.system = kstrdup(group, GFP_KERNEL); + if (!tu->tp.class.system) goto error; INIT_LIST_HEAD(&tu->list); @@ -128,11 +264,11 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret) if (is_ret) tu->consumer.ret_handler = uretprobe_dispatcher; init_trace_uprobe_filter(&tu->filter); - tu->call.flags |= TRACE_EVENT_FL_USE_CALL_FILTER; + tu->tp.call.flags |= TRACE_EVENT_FL_USE_CALL_FILTER; return tu; error: - kfree(tu->call.name); + kfree(tu->tp.call.name); kfree(tu); return ERR_PTR(-ENOMEM); @@ -142,12 +278,12 @@ static void free_trace_uprobe(struct trace_uprobe *tu) { int i; - for (i = 0; i < tu->nr_args; i++) - traceprobe_free_probe_arg(&tu->args[i]); + for (i = 0; i < tu->tp.nr_args; i++) + traceprobe_free_probe_arg(&tu->tp.args[i]); iput(tu->inode); - kfree(tu->call.class->system); - kfree(tu->call.name); + kfree(tu->tp.call.class->system); + kfree(tu->tp.call.name); kfree(tu->filename); kfree(tu); } @@ -157,8 +293,8 @@ static struct trace_uprobe *find_probe_event(const char *event, const char *grou struct trace_uprobe *tu; list_for_each_entry(tu, &uprobe_list, list) - if (strcmp(tu->call.name, event) == 0 && - strcmp(tu->call.class->system, group) == 0) + if (strcmp(tu->tp.call.name, event) == 0 && + strcmp(tu->tp.call.class->system, group) == 0) return tu; return NULL; @@ -181,16 +317,16 @@ static int unregister_trace_uprobe(struct trace_uprobe *tu) /* Register a trace_uprobe and probe_event */ static int register_trace_uprobe(struct trace_uprobe *tu) { - struct trace_uprobe *old_tp; + struct trace_uprobe *old_tu; int ret; mutex_lock(&uprobe_lock); /* register as an event */ - old_tp = find_probe_event(tu->call.name, tu->call.class->system); - if (old_tp) { + old_tu = find_probe_event(tu->tp.call.name, tu->tp.call.class->system); + if (old_tu) { /* delete old event */ - ret = unregister_trace_uprobe(old_tp); + ret = unregister_trace_uprobe(old_tu); if (ret) goto end; } @@ -211,7 +347,7 @@ end: /* * Argument syntax: - * - Add uprobe: p|r[:[GRP/]EVENT] PATH:SYMBOL [FETCHARGS] + * - Add uprobe: p|r[:[GRP/]EVENT] PATH:OFFSET [FETCHARGS] * * - Remove uprobe: -:[GRP/]EVENT */ @@ -360,34 +496,36 @@ static int create_trace_uprobe(int argc, char **argv) /* parse arguments */ ret = 0; for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { + struct probe_arg *parg = &tu->tp.args[i]; + /* Increment count for freeing args in error case */ - tu->nr_args++; + tu->tp.nr_args++; /* Parse argument name */ arg = strchr(argv[i], '='); if (arg) { *arg++ = '\0'; - tu->args[i].name = kstrdup(argv[i], GFP_KERNEL); + parg->name = kstrdup(argv[i], GFP_KERNEL); } else { arg = argv[i]; /* If argument name is omitted, set "argN" */ snprintf(buf, MAX_EVENT_NAME_LEN, "arg%d", i + 1); - tu->args[i].name = kstrdup(buf, GFP_KERNEL); + parg->name = kstrdup(buf, GFP_KERNEL); } - if (!tu->args[i].name) { + if (!parg->name) { pr_info("Failed to allocate argument[%d] name.\n", i); ret = -ENOMEM; goto error; } - if (!is_good_name(tu->args[i].name)) { - pr_info("Invalid argument[%d] name: %s\n", i, tu->args[i].name); + if (!is_good_name(parg->name)) { + pr_info("Invalid argument[%d] name: %s\n", i, parg->name); ret = -EINVAL; goto error; } - if (traceprobe_conflict_field_name(tu->args[i].name, tu->args, i)) { + if (traceprobe_conflict_field_name(parg->name, tu->tp.args, i)) { pr_info("Argument[%d] name '%s' conflicts with " "another field.\n", i, argv[i]); ret = -EINVAL; @@ -395,7 +533,8 @@ static int create_trace_uprobe(int argc, char **argv) } /* Parse fetch argument */ - ret = traceprobe_parse_probe_arg(arg, &tu->size, &tu->args[i], false, false); + ret = traceprobe_parse_probe_arg(arg, &tu->tp.size, parg, + is_return, false); if (ret) { pr_info("Parse error at argument[%d]. (%d)\n", i, ret); goto error; @@ -459,11 +598,11 @@ static int probes_seq_show(struct seq_file *m, void *v) char c = is_ret_probe(tu) ? 'r' : 'p'; int i; - seq_printf(m, "%c:%s/%s", c, tu->call.class->system, tu->call.name); + seq_printf(m, "%c:%s/%s", c, tu->tp.call.class->system, tu->tp.call.name); seq_printf(m, " %s:0x%p", tu->filename, (void *)tu->offset); - for (i = 0; i < tu->nr_args; i++) - seq_printf(m, " %s=%s", tu->args[i].name, tu->args[i].comm); + for (i = 0; i < tu->tp.nr_args; i++) + seq_printf(m, " %s=%s", tu->tp.args[i].name, tu->tp.args[i].comm); seq_printf(m, "\n"); return 0; @@ -509,7 +648,7 @@ static int probes_profile_seq_show(struct seq_file *m, void *v) { struct trace_uprobe *tu = v; - seq_printf(m, " %s %-44s %15lu\n", tu->filename, tu->call.name, tu->nhit); + seq_printf(m, " %s %-44s %15lu\n", tu->filename, tu->tp.call.name, tu->nhit); return 0; } @@ -533,21 +672,117 @@ static const struct file_operations uprobe_profile_ops = { .release = seq_release, }; +struct uprobe_cpu_buffer { + struct mutex mutex; + void *buf; +}; +static struct uprobe_cpu_buffer __percpu *uprobe_cpu_buffer; +static int uprobe_buffer_refcnt; + +static int uprobe_buffer_init(void) +{ + int cpu, err_cpu; + + uprobe_cpu_buffer = alloc_percpu(struct uprobe_cpu_buffer); + if (uprobe_cpu_buffer == NULL) + return -ENOMEM; + + for_each_possible_cpu(cpu) { + struct page *p = alloc_pages_node(cpu_to_node(cpu), + GFP_KERNEL, 0); + if (p == NULL) { + err_cpu = cpu; + goto err; + } + per_cpu_ptr(uprobe_cpu_buffer, cpu)->buf = page_address(p); + mutex_init(&per_cpu_ptr(uprobe_cpu_buffer, cpu)->mutex); + } + + return 0; + +err: + for_each_possible_cpu(cpu) { + if (cpu == err_cpu) + break; + free_page((unsigned long)per_cpu_ptr(uprobe_cpu_buffer, cpu)->buf); + } + + free_percpu(uprobe_cpu_buffer); + return -ENOMEM; +} + +static int uprobe_buffer_enable(void) +{ + int ret = 0; + + BUG_ON(!mutex_is_locked(&event_mutex)); + + if (uprobe_buffer_refcnt++ == 0) { + ret = uprobe_buffer_init(); + if (ret < 0) + uprobe_buffer_refcnt--; + } + + return ret; +} + +static void uprobe_buffer_disable(void) +{ + BUG_ON(!mutex_is_locked(&event_mutex)); + + if (--uprobe_buffer_refcnt == 0) { + free_percpu(uprobe_cpu_buffer); + uprobe_cpu_buffer = NULL; + } +} + +static struct uprobe_cpu_buffer *uprobe_buffer_get(void) +{ + struct uprobe_cpu_buffer *ucb; + int cpu; + + cpu = raw_smp_processor_id(); + ucb = per_cpu_ptr(uprobe_cpu_buffer, cpu); + + /* + * Use per-cpu buffers for fastest access, but we might migrate + * so the mutex makes sure we have sole access to it. + */ + mutex_lock(&ucb->mutex); + + return ucb; +} + +static void uprobe_buffer_put(struct uprobe_cpu_buffer *ucb) +{ + mutex_unlock(&ucb->mutex); +} + static void uprobe_trace_print(struct trace_uprobe *tu, unsigned long func, struct pt_regs *regs) { struct uprobe_trace_entry_head *entry; struct ring_buffer_event *event; struct ring_buffer *buffer; + struct uprobe_cpu_buffer *ucb; void *data; - int size, i; - struct ftrace_event_call *call = &tu->call; + int size, dsize, esize; + struct ftrace_event_call *call = &tu->tp.call; + + dsize = __get_data_size(&tu->tp, regs); + esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); - size = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); + if (WARN_ON_ONCE(!uprobe_cpu_buffer || tu->tp.size + dsize > PAGE_SIZE)) + return; + + ucb = uprobe_buffer_get(); + store_trace_args(esize, &tu->tp, regs, ucb->buf, dsize); + + size = esize + tu->tp.size + dsize; event = trace_current_buffer_lock_reserve(&buffer, call->event.type, - size + tu->size, 0, 0); + size, 0, 0); if (!event) - return; + goto out; entry = ring_buffer_event_data(event); if (is_ret_probe(tu)) { @@ -559,11 +794,13 @@ static void uprobe_trace_print(struct trace_uprobe *tu, data = DATAOF_TRACE_ENTRY(entry, false); } - for (i = 0; i < tu->nr_args; i++) - call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); + memcpy(data, ucb->buf, tu->tp.size + dsize); if (!call_filter_check_discard(call, entry, buffer, event)) trace_buffer_unlock_commit(buffer, event, 0, 0); + +out: + uprobe_buffer_put(ucb); } /* uprobe handler */ @@ -591,23 +828,24 @@ print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *e int i; entry = (struct uprobe_trace_entry_head *)iter->ent; - tu = container_of(event, struct trace_uprobe, call.event); + tu = container_of(event, struct trace_uprobe, tp.call.event); if (is_ret_probe(tu)) { - if (!trace_seq_printf(s, "%s: (0x%lx <- 0x%lx)", tu->call.name, + if (!trace_seq_printf(s, "%s: (0x%lx <- 0x%lx)", tu->tp.call.name, entry->vaddr[1], entry->vaddr[0])) goto partial; data = DATAOF_TRACE_ENTRY(entry, true); } else { - if (!trace_seq_printf(s, "%s: (0x%lx)", tu->call.name, + if (!trace_seq_printf(s, "%s: (0x%lx)", tu->tp.call.name, entry->vaddr[0])) goto partial; data = DATAOF_TRACE_ENTRY(entry, false); } - for (i = 0; i < tu->nr_args; i++) { - if (!tu->args[i].type->print(s, tu->args[i].name, - data + tu->args[i].offset, entry)) + for (i = 0; i < tu->tp.nr_args; i++) { + struct probe_arg *parg = &tu->tp.args[i]; + + if (!parg->type->print(s, parg->name, data + parg->offset, entry)) goto partial; } @@ -618,11 +856,6 @@ partial: return TRACE_TYPE_PARTIAL_LINE; } -static inline bool is_trace_uprobe_enabled(struct trace_uprobe *tu) -{ - return tu->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE); -} - typedef bool (*filter_func_t)(struct uprobe_consumer *self, enum uprobe_filter_ctx ctx, struct mm_struct *mm); @@ -632,29 +865,35 @@ probe_event_enable(struct trace_uprobe *tu, int flag, filter_func_t filter) { int ret = 0; - if (is_trace_uprobe_enabled(tu)) + if (trace_probe_is_enabled(&tu->tp)) return -EINTR; + ret = uprobe_buffer_enable(); + if (ret < 0) + return ret; + WARN_ON(!uprobe_filter_is_empty(&tu->filter)); - tu->flags |= flag; + tu->tp.flags |= flag; tu->consumer.filter = filter; ret = uprobe_register(tu->inode, tu->offset, &tu->consumer); if (ret) - tu->flags &= ~flag; + tu->tp.flags &= ~flag; return ret; } static void probe_event_disable(struct trace_uprobe *tu, int flag) { - if (!is_trace_uprobe_enabled(tu)) + if (!trace_probe_is_enabled(&tu->tp)) return; WARN_ON(!uprobe_filter_is_empty(&tu->filter)); uprobe_unregister(tu->inode, tu->offset, &tu->consumer); - tu->flags &= ~flag; + tu->tp.flags &= ~flag; + + uprobe_buffer_disable(); } static int uprobe_event_define_fields(struct ftrace_event_call *event_call) @@ -672,12 +911,12 @@ static int uprobe_event_define_fields(struct ftrace_event_call *event_call) size = SIZEOF_TRACE_ENTRY(false); } /* Set argument names as fields */ - for (i = 0; i < tu->nr_args; i++) { - ret = trace_define_field(event_call, tu->args[i].type->fmttype, - tu->args[i].name, - size + tu->args[i].offset, - tu->args[i].type->size, - tu->args[i].type->is_signed, + for (i = 0; i < tu->tp.nr_args; i++) { + struct probe_arg *parg = &tu->tp.args[i]; + + ret = trace_define_field(event_call, parg->type->fmttype, + parg->name, size + parg->offset, + parg->type->size, parg->type->is_signed, FILTER_OTHER); if (ret) @@ -686,59 +925,6 @@ static int uprobe_event_define_fields(struct ftrace_event_call *event_call) return 0; } -#define LEN_OR_ZERO (len ? len - pos : 0) -static int __set_print_fmt(struct trace_uprobe *tu, char *buf, int len) -{ - const char *fmt, *arg; - int i; - int pos = 0; - - if (is_ret_probe(tu)) { - fmt = "(%lx <- %lx)"; - arg = "REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP; - } else { - fmt = "(%lx)"; - arg = "REC->" FIELD_STRING_IP; - } - - /* When len=0, we just calculate the needed length */ - - pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt); - - for (i = 0; i < tu->nr_args; i++) { - pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%s", - tu->args[i].name, tu->args[i].type->fmt); - } - - pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg); - - for (i = 0; i < tu->nr_args; i++) { - pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s", - tu->args[i].name); - } - - return pos; /* return the length of print_fmt */ -} -#undef LEN_OR_ZERO - -static int set_print_fmt(struct trace_uprobe *tu) -{ - char *print_fmt; - int len; - - /* First: called with 0 length to calculate the needed length */ - len = __set_print_fmt(tu, NULL, 0); - print_fmt = kmalloc(len + 1, GFP_KERNEL); - if (!print_fmt) - return -ENOMEM; - - /* Second: actually write the @print_fmt */ - __set_print_fmt(tu, print_fmt, len + 1); - tu->call.print_fmt = print_fmt; - - return 0; -} - #ifdef CONFIG_PERF_EVENTS static bool __uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm) @@ -831,14 +1017,27 @@ static bool uprobe_perf_filter(struct uprobe_consumer *uc, static void uprobe_perf_print(struct trace_uprobe *tu, unsigned long func, struct pt_regs *regs) { - struct ftrace_event_call *call = &tu->call; + struct ftrace_event_call *call = &tu->tp.call; struct uprobe_trace_entry_head *entry; struct hlist_head *head; + struct uprobe_cpu_buffer *ucb; void *data; - int size, rctx, i; + int size, dsize, esize; + int rctx; - size = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); - size = ALIGN(size + tu->size + sizeof(u32), sizeof(u64)) - sizeof(u32); + dsize = __get_data_size(&tu->tp, regs); + esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); + + if (WARN_ON_ONCE(!uprobe_cpu_buffer)) + return; + + size = esize + tu->tp.size + dsize; + size = ALIGN(size + sizeof(u32), sizeof(u64)) - sizeof(u32); + if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough")) + return; + + ucb = uprobe_buffer_get(); + store_trace_args(esize, &tu->tp, regs, ucb->buf, dsize); preempt_disable(); head = this_cpu_ptr(call->perf_events); @@ -858,12 +1057,18 @@ static void uprobe_perf_print(struct trace_uprobe *tu, data = DATAOF_TRACE_ENTRY(entry, false); } - for (i = 0; i < tu->nr_args; i++) - call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); + memcpy(data, ucb->buf, tu->tp.size + dsize); + + if (size - esize > tu->tp.size + dsize) { + int len = tu->tp.size + dsize; + + memset(data + len, 0, size - esize - len); + } perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL); out: preempt_enable(); + uprobe_buffer_put(ucb); } /* uprobe profile handler */ @@ -921,16 +1126,22 @@ int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type, static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs) { struct trace_uprobe *tu; + struct uprobe_dispatch_data udd; int ret = 0; tu = container_of(con, struct trace_uprobe, consumer); tu->nhit++; - if (tu->flags & TP_FLAG_TRACE) + udd.tu = tu; + udd.bp_addr = instruction_pointer(regs); + + current->utask->vaddr = (unsigned long) &udd; + + if (tu->tp.flags & TP_FLAG_TRACE) ret |= uprobe_trace_func(tu, regs); #ifdef CONFIG_PERF_EVENTS - if (tu->flags & TP_FLAG_PROFILE) + if (tu->tp.flags & TP_FLAG_PROFILE) ret |= uprobe_perf_func(tu, regs); #endif return ret; @@ -940,14 +1151,20 @@ static int uretprobe_dispatcher(struct uprobe_consumer *con, unsigned long func, struct pt_regs *regs) { struct trace_uprobe *tu; + struct uprobe_dispatch_data udd; tu = container_of(con, struct trace_uprobe, consumer); - if (tu->flags & TP_FLAG_TRACE) + udd.tu = tu; + udd.bp_addr = func; + + current->utask->vaddr = (unsigned long) &udd; + + if (tu->tp.flags & TP_FLAG_TRACE) uretprobe_trace_func(tu, func, regs); #ifdef CONFIG_PERF_EVENTS - if (tu->flags & TP_FLAG_PROFILE) + if (tu->tp.flags & TP_FLAG_PROFILE) uretprobe_perf_func(tu, func, regs); #endif return 0; @@ -959,7 +1176,7 @@ static struct trace_event_functions uprobe_funcs = { static int register_uprobe_event(struct trace_uprobe *tu) { - struct ftrace_event_call *call = &tu->call; + struct ftrace_event_call *call = &tu->tp.call; int ret; /* Initialize ftrace_event_call */ @@ -967,7 +1184,7 @@ static int register_uprobe_event(struct trace_uprobe *tu) call->event.funcs = &uprobe_funcs; call->class->define_fields = uprobe_event_define_fields; - if (set_print_fmt(tu) < 0) + if (set_print_fmt(&tu->tp, is_ret_probe(tu)) < 0) return -ENOMEM; ret = register_ftrace_event(&call->event); @@ -994,11 +1211,11 @@ static int unregister_uprobe_event(struct trace_uprobe *tu) int ret; /* tu->event is unregistered in trace_remove_event_call() */ - ret = trace_remove_event_call(&tu->call); + ret = trace_remove_event_call(&tu->tp.call); if (ret) return ret; - kfree(tu->call.print_fmt); - tu->call.print_fmt = NULL; + kfree(tu->tp.call.print_fmt); + tu->tp.call.print_fmt = NULL; return 0; } diff --git a/kernel/workqueue.c b/kernel/workqueue.c index b010eac..82ef9f3 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4789,6 +4789,7 @@ static int workqueue_cpu_down_callback(struct notifier_block *nfb, /* wait for per-cpu unbinding to finish */ flush_work(&unbind_work); + destroy_work_on_stack(&unbind_work); break; } return NOTIFY_OK; @@ -4828,6 +4829,7 @@ long work_on_cpu(int cpu, long (*fn)(void *), void *arg) INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn); schedule_work_on(cpu, &wfc.work); flush_work(&wfc.work); + destroy_work_on_stack(&wfc.work); return wfc.ret; } EXPORT_SYMBOL_GPL(work_on_cpu); |