diff options
author | Frederic Weisbecker <fweisbec@gmail.com> | 2009-10-18 01:09:09 +0200 |
---|---|---|
committer | Frederic Weisbecker <fweisbec@gmail.com> | 2009-10-18 01:12:33 +0200 |
commit | 0f8f86c7bdd1c954fbe153af437a0d91a6c5721a (patch) | |
tree | 94a8d419a470a4f9852ca397bb9bbe48db92ff5c /kernel | |
parent | dca2d6ac09d9ef59ff46820d4f0c94b08a671202 (diff) | |
parent | f39cdf25bf77219676ec5360980ac40b1a7e144a (diff) | |
download | op-kernel-dev-0f8f86c7bdd1c954fbe153af437a0d91a6c5721a.zip op-kernel-dev-0f8f86c7bdd1c954fbe153af437a0d91a6c5721a.tar.gz |
Merge commit 'perf/core' into perf/hw-breakpoint
Conflicts:
kernel/Makefile
kernel/trace/Makefile
kernel/trace/trace.h
samples/Makefile
Merge reason: We need to be uptodate with the perf events development
branch because we plan to rewrite the breakpoints API on top of
perf events.
Diffstat (limited to 'kernel')
113 files changed, 11434 insertions, 10160 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 69943fd..17b575e 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -58,7 +58,6 @@ obj-$(CONFIG_KEXEC) += kexec.o obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o obj-$(CONFIG_COMPAT) += compat.o obj-$(CONFIG_CGROUPS) += cgroup.o -obj-$(CONFIG_CGROUP_DEBUG) += cgroup_debug.o obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o obj-$(CONFIG_CPUSETS) += cpuset.o obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o @@ -87,18 +86,16 @@ obj-$(CONFIG_RELAY) += relay.o obj-$(CONFIG_SYSCTL) += utsname_sysctl.o obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o -obj-$(CONFIG_MARKERS) += marker.o obj-$(CONFIG_TRACEPOINTS) += tracepoint.o obj-$(CONFIG_LATENCYTOP) += latencytop.o -obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o obj-$(CONFIG_FUNCTION_TRACER) += trace/ obj-$(CONFIG_TRACING) += trace/ obj-$(CONFIG_X86_DS) += trace/ obj-$(CONFIG_RING_BUFFER) += trace/ obj-$(CONFIG_SMP) += sched_cpupri.o obj-$(CONFIG_SLOW_WORK) += slow-work.o +obj-$(CONFIG_PERF_EVENTS) += perf_event.o obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o -obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is diff --git a/kernel/audit.c b/kernel/audit.c index defc2e6..5feed23 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -855,18 +855,24 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) break; } case AUDIT_SIGNAL_INFO: - err = security_secid_to_secctx(audit_sig_sid, &ctx, &len); - if (err) - return err; + len = 0; + if (audit_sig_sid) { + err = security_secid_to_secctx(audit_sig_sid, &ctx, &len); + if (err) + return err; + } sig_data = kmalloc(sizeof(*sig_data) + len, GFP_KERNEL); if (!sig_data) { - security_release_secctx(ctx, len); + if (audit_sig_sid) + security_release_secctx(ctx, len); return -ENOMEM; } sig_data->uid = audit_sig_uid; sig_data->pid = audit_sig_pid; - memcpy(sig_data->ctx, ctx, len); - security_release_secctx(ctx, len); + if (audit_sig_sid) { + memcpy(sig_data->ctx, ctx, len); + security_release_secctx(ctx, len); + } audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_SIGNAL_INFO, 0, 0, sig_data, sizeof(*sig_data) + len); kfree(sig_data); diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 0e96dbc..cc7e879 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -45,8 +45,8 @@ struct audit_watch { atomic_t count; /* reference count */ - char *path; /* insertion path */ dev_t dev; /* associated superblock device */ + char *path; /* insertion path */ unsigned long ino; /* associated inode number */ struct audit_parent *parent; /* associated parent */ struct list_head wlist; /* entry in parent->watches list */ diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 68d3c6a..267e484 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -168,12 +168,12 @@ struct audit_context { int in_syscall; /* 1 if task is in a syscall */ enum audit_state state, current_state; unsigned int serial; /* serial number for record */ - struct timespec ctime; /* time of syscall entry */ int major; /* syscall number */ + struct timespec ctime; /* time of syscall entry */ unsigned long argv[4]; /* syscall arguments */ - int return_valid; /* return code is valid */ long return_code;/* syscall return code */ u64 prio; + int return_valid; /* return code is valid */ int name_count; struct audit_names names[AUDIT_NAMES]; char * filterkey; /* key for rule that triggered record */ @@ -198,8 +198,8 @@ struct audit_context { char target_comm[TASK_COMM_LEN]; struct audit_tree_refs *trees, *first_trees; - int tree_count; struct list_head killed_trees; + int tree_count; int type; union { diff --git a/kernel/cgroup.c b/kernel/cgroup.c index c7ece8f..ca83b73 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -23,6 +23,7 @@ */ #include <linux/cgroup.h> +#include <linux/ctype.h> #include <linux/errno.h> #include <linux/fs.h> #include <linux/kernel.h> @@ -48,6 +49,8 @@ #include <linux/namei.h> #include <linux/smp_lock.h> #include <linux/pid_namespace.h> +#include <linux/idr.h> +#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ #include <asm/atomic.h> @@ -60,6 +63,8 @@ static struct cgroup_subsys *subsys[] = { #include <linux/cgroup_subsys.h> }; +#define MAX_CGROUP_ROOT_NAMELEN 64 + /* * A cgroupfs_root represents the root of a cgroup hierarchy, * and may be associated with a superblock to form an active @@ -74,6 +79,9 @@ struct cgroupfs_root { */ unsigned long subsys_bits; + /* Unique id for this hierarchy. */ + int hierarchy_id; + /* The bitmask of subsystems currently attached to this hierarchy */ unsigned long actual_subsys_bits; @@ -94,6 +102,9 @@ struct cgroupfs_root { /* The path to use for release notifications. */ char release_agent_path[PATH_MAX]; + + /* The name for this hierarchy - may be empty */ + char name[MAX_CGROUP_ROOT_NAMELEN]; }; /* @@ -141,6 +152,10 @@ struct css_id { static LIST_HEAD(roots); static int root_count; +static DEFINE_IDA(hierarchy_ida); +static int next_hierarchy_id; +static DEFINE_SPINLOCK(hierarchy_id_lock); + /* dummytop is a shorthand for the dummy hierarchy's top cgroup */ #define dummytop (&rootnode.top_cgroup) @@ -201,6 +216,7 @@ struct cg_cgroup_link { * cgroup, anchored on cgroup->css_sets */ struct list_head cgrp_link_list; + struct cgroup *cgrp; /* * List running through cg_cgroup_links pointing at a * single css_set object, anchored on css_set->cg_links @@ -227,8 +243,11 @@ static int cgroup_subsys_init_idr(struct cgroup_subsys *ss); static DEFINE_RWLOCK(css_set_lock); static int css_set_count; -/* hash table for cgroup groups. This improves the performance to - * find an existing css_set */ +/* + * hash table for cgroup groups. This improves the performance to find + * an existing css_set. This hash doesn't (currently) take into + * account cgroups in empty hierarchies. + */ #define CSS_SET_HASH_BITS 7 #define CSS_SET_TABLE_SIZE (1 << CSS_SET_HASH_BITS) static struct hlist_head css_set_table[CSS_SET_TABLE_SIZE]; @@ -248,48 +267,22 @@ static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[]) return &css_set_table[index]; } +static void free_css_set_rcu(struct rcu_head *obj) +{ + struct css_set *cg = container_of(obj, struct css_set, rcu_head); + kfree(cg); +} + /* We don't maintain the lists running through each css_set to its * task until after the first call to cgroup_iter_start(). This * reduces the fork()/exit() overhead for people who have cgroups * compiled into their kernel but not actually in use */ static int use_task_css_set_links __read_mostly; -/* When we create or destroy a css_set, the operation simply - * takes/releases a reference count on all the cgroups referenced - * by subsystems in this css_set. This can end up multiple-counting - * some cgroups, but that's OK - the ref-count is just a - * busy/not-busy indicator; ensuring that we only count each cgroup - * once would require taking a global lock to ensure that no - * subsystems moved between hierarchies while we were doing so. - * - * Possible TODO: decide at boot time based on the number of - * registered subsystems and the number of CPUs or NUMA nodes whether - * it's better for performance to ref-count every subsystem, or to - * take a global lock and only add one ref count to each hierarchy. - */ - -/* - * unlink a css_set from the list and free it - */ -static void unlink_css_set(struct css_set *cg) +static void __put_css_set(struct css_set *cg, int taskexit) { struct cg_cgroup_link *link; struct cg_cgroup_link *saved_link; - - hlist_del(&cg->hlist); - css_set_count--; - - list_for_each_entry_safe(link, saved_link, &cg->cg_links, - cg_link_list) { - list_del(&link->cg_link_list); - list_del(&link->cgrp_link_list); - kfree(link); - } -} - -static void __put_css_set(struct css_set *cg, int taskexit) -{ - int i; /* * Ensure that the refcount doesn't hit zero while any readers * can see it. Similar to atomic_dec_and_lock(), but for an @@ -302,21 +295,28 @@ static void __put_css_set(struct css_set *cg, int taskexit) write_unlock(&css_set_lock); return; } - unlink_css_set(cg); - write_unlock(&css_set_lock); - rcu_read_lock(); - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - struct cgroup *cgrp = rcu_dereference(cg->subsys[i]->cgroup); + /* This css_set is dead. unlink it and release cgroup refcounts */ + hlist_del(&cg->hlist); + css_set_count--; + + list_for_each_entry_safe(link, saved_link, &cg->cg_links, + cg_link_list) { + struct cgroup *cgrp = link->cgrp; + list_del(&link->cg_link_list); + list_del(&link->cgrp_link_list); if (atomic_dec_and_test(&cgrp->count) && notify_on_release(cgrp)) { if (taskexit) set_bit(CGRP_RELEASABLE, &cgrp->flags); check_for_release(cgrp); } + + kfree(link); } - rcu_read_unlock(); - kfree(cg); + + write_unlock(&css_set_lock); + call_rcu(&cg->rcu_head, free_css_set_rcu); } /* @@ -338,6 +338,78 @@ static inline void put_css_set_taskexit(struct css_set *cg) } /* + * compare_css_sets - helper function for find_existing_css_set(). + * @cg: candidate css_set being tested + * @old_cg: existing css_set for a task + * @new_cgrp: cgroup that's being entered by the task + * @template: desired set of css pointers in css_set (pre-calculated) + * + * Returns true if "cg" matches "old_cg" except for the hierarchy + * which "new_cgrp" belongs to, for which it should match "new_cgrp". + */ +static bool compare_css_sets(struct css_set *cg, + struct css_set *old_cg, + struct cgroup *new_cgrp, + struct cgroup_subsys_state *template[]) +{ + struct list_head *l1, *l2; + + if (memcmp(template, cg->subsys, sizeof(cg->subsys))) { + /* Not all subsystems matched */ + return false; + } + + /* + * Compare cgroup pointers in order to distinguish between + * different cgroups in heirarchies with no subsystems. We + * could get by with just this check alone (and skip the + * memcmp above) but on most setups the memcmp check will + * avoid the need for this more expensive check on almost all + * candidates. + */ + + l1 = &cg->cg_links; + l2 = &old_cg->cg_links; + while (1) { + struct cg_cgroup_link *cgl1, *cgl2; + struct cgroup *cg1, *cg2; + + l1 = l1->next; + l2 = l2->next; + /* See if we reached the end - both lists are equal length. */ + if (l1 == &cg->cg_links) { + BUG_ON(l2 != &old_cg->cg_links); + break; + } else { + BUG_ON(l2 == &old_cg->cg_links); + } + /* Locate the cgroups associated with these links. */ + cgl1 = list_entry(l1, struct cg_cgroup_link, cg_link_list); + cgl2 = list_entry(l2, struct cg_cgroup_link, cg_link_list); + cg1 = cgl1->cgrp; + cg2 = cgl2->cgrp; + /* Hierarchies should be linked in the same order. */ + BUG_ON(cg1->root != cg2->root); + + /* + * If this hierarchy is the hierarchy of the cgroup + * that's changing, then we need to check that this + * css_set points to the new cgroup; if it's any other + * hierarchy, then this css_set should point to the + * same cgroup as the old css_set. + */ + if (cg1->root == new_cgrp->root) { + if (cg1 != new_cgrp) + return false; + } else { + if (cg1 != cg2) + return false; + } + } + return true; +} + +/* * find_existing_css_set() is a helper for * find_css_set(), and checks to see whether an existing * css_set is suitable. @@ -378,10 +450,11 @@ static struct css_set *find_existing_css_set( hhead = css_set_hash(template); hlist_for_each_entry(cg, node, hhead, hlist) { - if (!memcmp(template, cg->subsys, sizeof(cg->subsys))) { - /* All subsystems matched */ - return cg; - } + if (!compare_css_sets(cg, oldcg, cgrp, template)) + continue; + + /* This css_set matches what we need */ + return cg; } /* No existing cgroup group matched */ @@ -435,8 +508,14 @@ static void link_css_set(struct list_head *tmp_cg_links, link = list_first_entry(tmp_cg_links, struct cg_cgroup_link, cgrp_link_list); link->cg = cg; + link->cgrp = cgrp; + atomic_inc(&cgrp->count); list_move(&link->cgrp_link_list, &cgrp->css_sets); - list_add(&link->cg_link_list, &cg->cg_links); + /* + * Always add links to the tail of the list so that the list + * is sorted by order of hierarchy creation + */ + list_add_tail(&link->cg_link_list, &cg->cg_links); } /* @@ -451,11 +530,11 @@ static struct css_set *find_css_set( { struct css_set *res; struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT]; - int i; struct list_head tmp_cg_links; struct hlist_head *hhead; + struct cg_cgroup_link *link; /* First see if we already have a cgroup group that matches * the desired set */ @@ -489,20 +568,12 @@ static struct css_set *find_css_set( write_lock(&css_set_lock); /* Add reference counts and links from the new css_set. */ - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - struct cgroup *cgrp = res->subsys[i]->cgroup; - struct cgroup_subsys *ss = subsys[i]; - atomic_inc(&cgrp->count); - /* - * We want to add a link once per cgroup, so we - * only do it for the first subsystem in each - * hierarchy - */ - if (ss->root->subsys_list.next == &ss->sibling) - link_css_set(&tmp_cg_links, res, cgrp); + list_for_each_entry(link, &oldcg->cg_links, cg_link_list) { + struct cgroup *c = link->cgrp; + if (c->root == cgrp->root) + c = cgrp; + link_css_set(&tmp_cg_links, res, c); } - if (list_empty(&rootnode.subsys_list)) - link_css_set(&tmp_cg_links, res, dummytop); BUG_ON(!list_empty(&tmp_cg_links)); @@ -518,6 +589,41 @@ static struct css_set *find_css_set( } /* + * Return the cgroup for "task" from the given hierarchy. Must be + * called with cgroup_mutex held. + */ +static struct cgroup *task_cgroup_from_root(struct task_struct *task, + struct cgroupfs_root *root) +{ + struct css_set *css; + struct cgroup *res = NULL; + + BUG_ON(!mutex_is_locked(&cgroup_mutex)); + read_lock(&css_set_lock); + /* + * No need to lock the task - since we hold cgroup_mutex the + * task can't change groups, so the only thing that can happen + * is that it exits and its css is set back to init_css_set. + */ + css = task->cgroups; + if (css == &init_css_set) { + res = &root->top_cgroup; + } else { + struct cg_cgroup_link *link; + list_for_each_entry(link, &css->cg_links, cg_link_list) { + struct cgroup *c = link->cgrp; + if (c->root == root) { + res = c; + break; + } + } + } + read_unlock(&css_set_lock); + BUG_ON(!res); + return res; +} + +/* * There is one global cgroup mutex. We also require taking * task_lock() when dereferencing a task's cgroup subsys pointers. * See "The task_lock() exception", at the end of this comment. @@ -596,8 +702,8 @@ void cgroup_unlock(void) static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode); static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); static int cgroup_populate_dir(struct cgroup *cgrp); -static struct inode_operations cgroup_dir_inode_operations; -static struct file_operations proc_cgroupstats_operations; +static const struct inode_operations cgroup_dir_inode_operations; +static const struct file_operations proc_cgroupstats_operations; static struct backing_dev_info cgroup_backing_dev_info = { .name = "cgroup", @@ -677,6 +783,12 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) */ deactivate_super(cgrp->root->sb); + /* + * if we're getting rid of the cgroup, refcount should ensure + * that there are no pidlists left. + */ + BUG_ON(!list_empty(&cgrp->pidlists)); + call_rcu(&cgrp->rcu_head, free_cgroup_rcu); } iput(inode); @@ -841,6 +953,8 @@ static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_puts(seq, ",noprefix"); if (strlen(root->release_agent_path)) seq_printf(seq, ",release_agent=%s", root->release_agent_path); + if (strlen(root->name)) + seq_printf(seq, ",name=%s", root->name); mutex_unlock(&cgroup_mutex); return 0; } @@ -849,6 +963,12 @@ struct cgroup_sb_opts { unsigned long subsys_bits; unsigned long flags; char *release_agent; + char *name; + /* User explicitly requested empty subsystem */ + bool none; + + struct cgroupfs_root *new_root; + }; /* Convert a hierarchy specifier into a bitmask of subsystems and @@ -863,9 +983,7 @@ static int parse_cgroupfs_options(char *data, mask = ~(1UL << cpuset_subsys_id); #endif - opts->subsys_bits = 0; - opts->flags = 0; - opts->release_agent = NULL; + memset(opts, 0, sizeof(*opts)); while ((token = strsep(&o, ",")) != NULL) { if (!*token) @@ -879,17 +997,42 @@ static int parse_cgroupfs_options(char *data, if (!ss->disabled) opts->subsys_bits |= 1ul << i; } + } else if (!strcmp(token, "none")) { + /* Explicitly have no subsystems */ + opts->none = true; } else if (!strcmp(token, "noprefix")) { set_bit(ROOT_NOPREFIX, &opts->flags); } else if (!strncmp(token, "release_agent=", 14)) { /* Specifying two release agents is forbidden */ if (opts->release_agent) return -EINVAL; - opts->release_agent = kzalloc(PATH_MAX, GFP_KERNEL); + opts->release_agent = + kstrndup(token + 14, PATH_MAX, GFP_KERNEL); if (!opts->release_agent) return -ENOMEM; - strncpy(opts->release_agent, token + 14, PATH_MAX - 1); - opts->release_agent[PATH_MAX - 1] = 0; + } else if (!strncmp(token, "name=", 5)) { + int i; + const char *name = token + 5; + /* Can't specify an empty name */ + if (!strlen(name)) + return -EINVAL; + /* Must match [\w.-]+ */ + for (i = 0; i < strlen(name); i++) { + char c = name[i]; + if (isalnum(c)) + continue; + if ((c == '.') || (c == '-') || (c == '_')) + continue; + return -EINVAL; + } + /* Specifying two names is forbidden */ + if (opts->name) + return -EINVAL; + opts->name = kstrndup(name, + MAX_CGROUP_ROOT_NAMELEN, + GFP_KERNEL); + if (!opts->name) + return -ENOMEM; } else { struct cgroup_subsys *ss; int i; @@ -906,6 +1049,8 @@ static int parse_cgroupfs_options(char *data, } } + /* Consistency checks */ + /* * Option noprefix was introduced just for backward compatibility * with the old cpuset, so we allow noprefix only if mounting just @@ -915,8 +1060,16 @@ static int parse_cgroupfs_options(char *data, (opts->subsys_bits & mask)) return -EINVAL; - /* We can't have an empty hierarchy */ - if (!opts->subsys_bits) + + /* Can't specify "none" and some subsystems */ + if (opts->subsys_bits && opts->none) + return -EINVAL; + + /* + * We either have to specify by name or by subsystems. (So all + * empty hierarchies must have a name). + */ + if (!opts->subsys_bits && !opts->name) return -EINVAL; return 0; @@ -944,6 +1097,12 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) goto out_unlock; } + /* Don't allow name to change at remount */ + if (opts.name && strcmp(opts.name, root->name)) { + ret = -EINVAL; + goto out_unlock; + } + ret = rebind_subsystems(root, opts.subsys_bits); if (ret) goto out_unlock; @@ -955,13 +1114,14 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) strcpy(root->release_agent_path, opts.release_agent); out_unlock: kfree(opts.release_agent); + kfree(opts.name); mutex_unlock(&cgroup_mutex); mutex_unlock(&cgrp->dentry->d_inode->i_mutex); unlock_kernel(); return ret; } -static struct super_operations cgroup_ops = { +static const struct super_operations cgroup_ops = { .statfs = simple_statfs, .drop_inode = generic_delete_inode, .show_options = cgroup_show_options, @@ -974,9 +1134,10 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) INIT_LIST_HEAD(&cgrp->children); INIT_LIST_HEAD(&cgrp->css_sets); INIT_LIST_HEAD(&cgrp->release_list); - INIT_LIST_HEAD(&cgrp->pids_list); - init_rwsem(&cgrp->pids_mutex); + INIT_LIST_HEAD(&cgrp->pidlists); + mutex_init(&cgrp->pidlist_mutex); } + static void init_cgroup_root(struct cgroupfs_root *root) { struct cgroup *cgrp = &root->top_cgroup; @@ -988,33 +1149,106 @@ static void init_cgroup_root(struct cgroupfs_root *root) init_cgroup_housekeeping(cgrp); } +static bool init_root_id(struct cgroupfs_root *root) +{ + int ret = 0; + + do { + if (!ida_pre_get(&hierarchy_ida, GFP_KERNEL)) + return false; + spin_lock(&hierarchy_id_lock); + /* Try to allocate the next unused ID */ + ret = ida_get_new_above(&hierarchy_ida, next_hierarchy_id, + &root->hierarchy_id); + if (ret == -ENOSPC) + /* Try again starting from 0 */ + ret = ida_get_new(&hierarchy_ida, &root->hierarchy_id); + if (!ret) { + next_hierarchy_id = root->hierarchy_id + 1; + } else if (ret != -EAGAIN) { + /* Can only get here if the 31-bit IDR is full ... */ + BUG_ON(ret); + } + spin_unlock(&hierarchy_id_lock); + } while (ret); + return true; +} + static int cgroup_test_super(struct super_block *sb, void *data) { - struct cgroupfs_root *new = data; + struct cgroup_sb_opts *opts = data; struct cgroupfs_root *root = sb->s_fs_info; - /* First check subsystems */ - if (new->subsys_bits != root->subsys_bits) - return 0; + /* If we asked for a name then it must match */ + if (opts->name && strcmp(opts->name, root->name)) + return 0; - /* Next check flags */ - if (new->flags != root->flags) + /* + * If we asked for subsystems (or explicitly for no + * subsystems) then they must match + */ + if ((opts->subsys_bits || opts->none) + && (opts->subsys_bits != root->subsys_bits)) return 0; return 1; } +static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts) +{ + struct cgroupfs_root *root; + + if (!opts->subsys_bits && !opts->none) + return NULL; + + root = kzalloc(sizeof(*root), GFP_KERNEL); + if (!root) + return ERR_PTR(-ENOMEM); + + if (!init_root_id(root)) { + kfree(root); + return ERR_PTR(-ENOMEM); + } + init_cgroup_root(root); + + root->subsys_bits = opts->subsys_bits; + root->flags = opts->flags; + if (opts->release_agent) + strcpy(root->release_agent_path, opts->release_agent); + if (opts->name) + strcpy(root->name, opts->name); + return root; +} + +static void cgroup_drop_root(struct cgroupfs_root *root) +{ + if (!root) + return; + + BUG_ON(!root->hierarchy_id); + spin_lock(&hierarchy_id_lock); + ida_remove(&hierarchy_ida, root->hierarchy_id); + spin_unlock(&hierarchy_id_lock); + kfree(root); +} + static int cgroup_set_super(struct super_block *sb, void *data) { int ret; - struct cgroupfs_root *root = data; + struct cgroup_sb_opts *opts = data; + + /* If we don't have a new root, we can't set up a new sb */ + if (!opts->new_root) + return -EINVAL; + + BUG_ON(!opts->subsys_bits && !opts->none); ret = set_anon_super(sb, NULL); if (ret) return ret; - sb->s_fs_info = root; - root->sb = sb; + sb->s_fs_info = opts->new_root; + opts->new_root->sb = sb; sb->s_blocksize = PAGE_CACHE_SIZE; sb->s_blocksize_bits = PAGE_CACHE_SHIFT; @@ -1051,48 +1285,43 @@ static int cgroup_get_sb(struct file_system_type *fs_type, void *data, struct vfsmount *mnt) { struct cgroup_sb_opts opts; + struct cgroupfs_root *root; int ret = 0; struct super_block *sb; - struct cgroupfs_root *root; - struct list_head tmp_cg_links; + struct cgroupfs_root *new_root; /* First find the desired set of subsystems */ ret = parse_cgroupfs_options(data, &opts); - if (ret) { - kfree(opts.release_agent); - return ret; - } - - root = kzalloc(sizeof(*root), GFP_KERNEL); - if (!root) { - kfree(opts.release_agent); - return -ENOMEM; - } + if (ret) + goto out_err; - init_cgroup_root(root); - root->subsys_bits = opts.subsys_bits; - root->flags = opts.flags; - if (opts.release_agent) { - strcpy(root->release_agent_path, opts.release_agent); - kfree(opts.release_agent); + /* + * Allocate a new cgroup root. We may not need it if we're + * reusing an existing hierarchy. + */ + new_root = cgroup_root_from_opts(&opts); + if (IS_ERR(new_root)) { + ret = PTR_ERR(new_root); + goto out_err; } + opts.new_root = new_root; - sb = sget(fs_type, cgroup_test_super, cgroup_set_super, root); - + /* Locate an existing or new sb for this hierarchy */ + sb = sget(fs_type, cgroup_test_super, cgroup_set_super, &opts); if (IS_ERR(sb)) { - kfree(root); - return PTR_ERR(sb); + ret = PTR_ERR(sb); + cgroup_drop_root(opts.new_root); + goto out_err; } - if (sb->s_fs_info != root) { - /* Reusing an existing superblock */ - BUG_ON(sb->s_root == NULL); - kfree(root); - root = NULL; - } else { - /* New superblock */ + root = sb->s_fs_info; + BUG_ON(!root); + if (root == opts.new_root) { + /* We used the new root structure, so this is a new hierarchy */ + struct list_head tmp_cg_links; struct cgroup *root_cgrp = &root->top_cgroup; struct inode *inode; + struct cgroupfs_root *existing_root; int i; BUG_ON(sb->s_root != NULL); @@ -1105,6 +1334,18 @@ static int cgroup_get_sb(struct file_system_type *fs_type, mutex_lock(&inode->i_mutex); mutex_lock(&cgroup_mutex); + if (strlen(root->name)) { + /* Check for name clashes with existing mounts */ + for_each_active_root(existing_root) { + if (!strcmp(existing_root->name, root->name)) { + ret = -EBUSY; + mutex_unlock(&cgroup_mutex); + mutex_unlock(&inode->i_mutex); + goto drop_new_super; + } + } + } + /* * We're accessing css_set_count without locking * css_set_lock here, but that's OK - it can only be @@ -1123,7 +1364,8 @@ static int cgroup_get_sb(struct file_system_type *fs_type, if (ret == -EBUSY) { mutex_unlock(&cgroup_mutex); mutex_unlock(&inode->i_mutex); - goto free_cg_links; + free_cg_links(&tmp_cg_links); + goto drop_new_super; } /* EBUSY should be the only error here */ @@ -1155,17 +1397,27 @@ static int cgroup_get_sb(struct file_system_type *fs_type, BUG_ON(root->number_of_cgroups != 1); cgroup_populate_dir(root_cgrp); - mutex_unlock(&inode->i_mutex); mutex_unlock(&cgroup_mutex); + mutex_unlock(&inode->i_mutex); + } else { + /* + * We re-used an existing hierarchy - the new root (if + * any) is not needed + */ + cgroup_drop_root(opts.new_root); } simple_set_mnt(mnt, sb); + kfree(opts.release_agent); + kfree(opts.name); return 0; - free_cg_links: - free_cg_links(&tmp_cg_links); drop_new_super: deactivate_locked_super(sb); + out_err: + kfree(opts.release_agent); + kfree(opts.name); + return ret; } @@ -1211,7 +1463,7 @@ static void cgroup_kill_sb(struct super_block *sb) { mutex_unlock(&cgroup_mutex); kill_litter_super(sb); - kfree(root); + cgroup_drop_root(root); } static struct file_system_type cgroup_fs_type = { @@ -1276,27 +1528,6 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) return 0; } -/* - * Return the first subsystem attached to a cgroup's hierarchy, and - * its subsystem id. - */ - -static void get_first_subsys(const struct cgroup *cgrp, - struct cgroup_subsys_state **css, int *subsys_id) -{ - const struct cgroupfs_root *root = cgrp->root; - const struct cgroup_subsys *test_ss; - BUG_ON(list_empty(&root->subsys_list)); - test_ss = list_entry(root->subsys_list.next, - struct cgroup_subsys, sibling); - if (css) { - *css = cgrp->subsys[test_ss->subsys_id]; - BUG_ON(!*css); - } - if (subsys_id) - *subsys_id = test_ss->subsys_id; -} - /** * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp' * @cgrp: the cgroup the task is attaching to @@ -1313,18 +1544,15 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) struct css_set *cg; struct css_set *newcg; struct cgroupfs_root *root = cgrp->root; - int subsys_id; - - get_first_subsys(cgrp, NULL, &subsys_id); /* Nothing to do if the task is already in that cgroup */ - oldcgrp = task_cgroup(tsk, subsys_id); + oldcgrp = task_cgroup_from_root(tsk, root); if (cgrp == oldcgrp) return 0; for_each_subsys(root, ss) { if (ss->can_attach) { - retval = ss->can_attach(ss, cgrp, tsk); + retval = ss->can_attach(ss, cgrp, tsk, false); if (retval) return retval; } @@ -1362,7 +1590,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) for_each_subsys(root, ss) { if (ss->attach) - ss->attach(ss, cgrp, oldcgrp, tsk); + ss->attach(ss, cgrp, oldcgrp, tsk, false); } set_bit(CGRP_RELEASABLE, &oldcgrp->flags); synchronize_rcu(); @@ -1423,15 +1651,6 @@ static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid) return ret; } -/* The various types of files and directories in a cgroup file system */ -enum cgroup_filetype { - FILE_ROOT, - FILE_DIR, - FILE_TASKLIST, - FILE_NOTIFY_ON_RELEASE, - FILE_RELEASE_AGENT, -}; - /** * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive. * @cgrp: the cgroup to be checked for liveness @@ -1644,7 +1863,7 @@ static int cgroup_seqfile_release(struct inode *inode, struct file *file) return single_release(inode, file); } -static struct file_operations cgroup_seqfile_operations = { +static const struct file_operations cgroup_seqfile_operations = { .read = seq_read, .write = cgroup_file_write, .llseek = seq_lseek, @@ -1703,7 +1922,7 @@ static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry, return simple_rename(old_dir, old_dentry, new_dir, new_dentry); } -static struct file_operations cgroup_file_operations = { +static const struct file_operations cgroup_file_operations = { .read = cgroup_file_read, .write = cgroup_file_write, .llseek = generic_file_llseek, @@ -1711,7 +1930,7 @@ static struct file_operations cgroup_file_operations = { .release = cgroup_file_release, }; -static struct inode_operations cgroup_dir_inode_operations = { +static const struct inode_operations cgroup_dir_inode_operations = { .lookup = simple_lookup, .mkdir = cgroup_mkdir, .rmdir = cgroup_rmdir, @@ -1876,7 +2095,7 @@ int cgroup_task_count(const struct cgroup *cgrp) * the start of a css_set */ static void cgroup_advance_iter(struct cgroup *cgrp, - struct cgroup_iter *it) + struct cgroup_iter *it) { struct list_head *l = it->cg_link; struct cg_cgroup_link *link; @@ -2129,7 +2348,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) } /* - * Stuff for reading the 'tasks' file. + * Stuff for reading the 'tasks'/'procs' files. * * Reading this file can return large amounts of data if a cgroup has * *lots* of attached tasks. So it may need several calls to read(), @@ -2139,27 +2358,196 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) */ /* - * Load into 'pidarray' up to 'npids' of the tasks using cgroup - * 'cgrp'. Return actual number of pids loaded. No need to - * task_lock(p) when reading out p->cgroup, since we're in an RCU - * read section, so the css_set can't go away, and is - * immutable after creation. + * The following two functions "fix" the issue where there are more pids + * than kmalloc will give memory for; in such cases, we use vmalloc/vfree. + * TODO: replace with a kernel-wide solution to this problem + */ +#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2)) +static void *pidlist_allocate(int count) +{ + if (PIDLIST_TOO_LARGE(count)) + return vmalloc(count * sizeof(pid_t)); + else + return kmalloc(count * sizeof(pid_t), GFP_KERNEL); +} +static void pidlist_free(void *p) +{ + if (is_vmalloc_addr(p)) + vfree(p); + else + kfree(p); +} +static void *pidlist_resize(void *p, int newcount) +{ + void *newlist; + /* note: if new alloc fails, old p will still be valid either way */ + if (is_vmalloc_addr(p)) { + newlist = vmalloc(newcount * sizeof(pid_t)); + if (!newlist) + return NULL; + memcpy(newlist, p, newcount * sizeof(pid_t)); + vfree(p); + } else { + newlist = krealloc(p, newcount * sizeof(pid_t), GFP_KERNEL); + } + return newlist; +} + +/* + * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries + * If the new stripped list is sufficiently smaller and there's enough memory + * to allocate a new buffer, will let go of the unneeded memory. Returns the + * number of unique elements. + */ +/* is the size difference enough that we should re-allocate the array? */ +#define PIDLIST_REALLOC_DIFFERENCE(old, new) ((old) - PAGE_SIZE >= (new)) +static int pidlist_uniq(pid_t **p, int length) +{ + int src, dest = 1; + pid_t *list = *p; + pid_t *newlist; + + /* + * we presume the 0th element is unique, so i starts at 1. trivial + * edge cases first; no work needs to be done for either + */ + if (length == 0 || length == 1) + return length; + /* src and dest walk down the list; dest counts unique elements */ + for (src = 1; src < length; src++) { + /* find next unique element */ + while (list[src] == list[src-1]) { + src++; + if (src == length) + goto after; + } + /* dest always points to where the next unique element goes */ + list[dest] = list[src]; + dest++; + } +after: + /* + * if the length difference is large enough, we want to allocate a + * smaller buffer to save memory. if this fails due to out of memory, + * we'll just stay with what we've got. + */ + if (PIDLIST_REALLOC_DIFFERENCE(length, dest)) { + newlist = pidlist_resize(list, dest); + if (newlist) + *p = newlist; + } + return dest; +} + +static int cmppid(const void *a, const void *b) +{ + return *(pid_t *)a - *(pid_t *)b; +} + +/* + * find the appropriate pidlist for our purpose (given procs vs tasks) + * returns with the lock on that pidlist already held, and takes care + * of the use count, or returns NULL with no locks held if we're out of + * memory. */ -static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cgrp) +static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, + enum cgroup_filetype type) { - int n = 0, pid; + struct cgroup_pidlist *l; + /* don't need task_nsproxy() if we're looking at ourself */ + struct pid_namespace *ns = get_pid_ns(current->nsproxy->pid_ns); + /* + * We can't drop the pidlist_mutex before taking the l->mutex in case + * the last ref-holder is trying to remove l from the list at the same + * time. Holding the pidlist_mutex precludes somebody taking whichever + * list we find out from under us - compare release_pid_array(). + */ + mutex_lock(&cgrp->pidlist_mutex); + list_for_each_entry(l, &cgrp->pidlists, links) { + if (l->key.type == type && l->key.ns == ns) { + /* found a matching list - drop the extra refcount */ + put_pid_ns(ns); + /* make sure l doesn't vanish out from under us */ + down_write(&l->mutex); + mutex_unlock(&cgrp->pidlist_mutex); + l->use_count++; + return l; + } + } + /* entry not found; create a new one */ + l = kmalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL); + if (!l) { + mutex_unlock(&cgrp->pidlist_mutex); + put_pid_ns(ns); + return l; + } + init_rwsem(&l->mutex); + down_write(&l->mutex); + l->key.type = type; + l->key.ns = ns; + l->use_count = 0; /* don't increment here */ + l->list = NULL; + l->owner = cgrp; + list_add(&l->links, &cgrp->pidlists); + mutex_unlock(&cgrp->pidlist_mutex); + return l; +} + +/* + * Load a cgroup's pidarray with either procs' tgids or tasks' pids + */ +static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, + struct cgroup_pidlist **lp) +{ + pid_t *array; + int length; + int pid, n = 0; /* used for populating the array */ struct cgroup_iter it; struct task_struct *tsk; + struct cgroup_pidlist *l; + + /* + * If cgroup gets more users after we read count, we won't have + * enough space - tough. This race is indistinguishable to the + * caller from the case that the additional cgroup users didn't + * show up until sometime later on. + */ + length = cgroup_task_count(cgrp); + array = pidlist_allocate(length); + if (!array) + return -ENOMEM; + /* now, populate the array */ cgroup_iter_start(cgrp, &it); while ((tsk = cgroup_iter_next(cgrp, &it))) { - if (unlikely(n == npids)) + if (unlikely(n == length)) break; - pid = task_pid_vnr(tsk); - if (pid > 0) - pidarray[n++] = pid; + /* get tgid or pid for procs or tasks file respectively */ + if (type == CGROUP_FILE_PROCS) + pid = task_tgid_vnr(tsk); + else + pid = task_pid_vnr(tsk); + if (pid > 0) /* make sure to only use valid results */ + array[n++] = pid; } cgroup_iter_end(cgrp, &it); - return n; + length = n; + /* now sort & (if procs) strip out duplicates */ + sort(array, length, sizeof(pid_t), cmppid, NULL); + if (type == CGROUP_FILE_PROCS) + length = pidlist_uniq(&array, length); + l = cgroup_pidlist_find(cgrp, type); + if (!l) { + pidlist_free(array); + return -ENOMEM; + } + /* store array, freeing old if necessary - lock already held */ + pidlist_free(l->list); + l->list = array; + l->length = length; + l->use_count++; + up_write(&l->mutex); + *lp = l; + return 0; } /** @@ -2216,37 +2604,14 @@ err: return ret; } -/* - * Cache pids for all threads in the same pid namespace that are - * opening the same "tasks" file. - */ -struct cgroup_pids { - /* The node in cgrp->pids_list */ - struct list_head list; - /* The cgroup those pids belong to */ - struct cgroup *cgrp; - /* The namepsace those pids belong to */ - struct pid_namespace *ns; - /* Array of process ids in the cgroup */ - pid_t *tasks_pids; - /* How many files are using the this tasks_pids array */ - int use_count; - /* Length of the current tasks_pids array */ - int length; -}; - -static int cmppid(const void *a, const void *b) -{ - return *(pid_t *)a - *(pid_t *)b; -} /* - * seq_file methods for the "tasks" file. The seq_file position is the + * seq_file methods for the tasks/procs files. The seq_file position is the * next pid to display; the seq_file iterator is a pointer to the pid - * in the cgroup->tasks_pids array. + * in the cgroup->l->list array. */ -static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos) +static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) { /* * Initially we receive a position value that corresponds to @@ -2254,48 +2619,45 @@ static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos) * after a seek to the start). Use a binary-search to find the * next pid to display, if any */ - struct cgroup_pids *cp = s->private; - struct cgroup *cgrp = cp->cgrp; + struct cgroup_pidlist *l = s->private; int index = 0, pid = *pos; int *iter; - down_read(&cgrp->pids_mutex); + down_read(&l->mutex); if (pid) { - int end = cp->length; + int end = l->length; while (index < end) { int mid = (index + end) / 2; - if (cp->tasks_pids[mid] == pid) { + if (l->list[mid] == pid) { index = mid; break; - } else if (cp->tasks_pids[mid] <= pid) + } else if (l->list[mid] <= pid) index = mid + 1; else end = mid; } } /* If we're off the end of the array, we're done */ - if (index >= cp->length) + if (index >= l->length) return NULL; /* Update the abstract position to be the actual pid that we found */ - iter = cp->tasks_pids + index; + iter = l->list + index; *pos = *iter; return iter; } -static void cgroup_tasks_stop(struct seq_file *s, void *v) +static void cgroup_pidlist_stop(struct seq_file *s, void *v) { - struct cgroup_pids *cp = s->private; - struct cgroup *cgrp = cp->cgrp; - up_read(&cgrp->pids_mutex); + struct cgroup_pidlist *l = s->private; + up_read(&l->mutex); } -static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos) +static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) { - struct cgroup_pids *cp = s->private; - int *p = v; - int *end = cp->tasks_pids + cp->length; - + struct cgroup_pidlist *l = s->private; + pid_t *p = v; + pid_t *end = l->list + l->length; /* * Advance to the next pid in the array. If this goes off the * end, we're done @@ -2309,124 +2671,107 @@ static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos) } } -static int cgroup_tasks_show(struct seq_file *s, void *v) +static int cgroup_pidlist_show(struct seq_file *s, void *v) { return seq_printf(s, "%d\n", *(int *)v); } -static struct seq_operations cgroup_tasks_seq_operations = { - .start = cgroup_tasks_start, - .stop = cgroup_tasks_stop, - .next = cgroup_tasks_next, - .show = cgroup_tasks_show, +/* + * seq_operations functions for iterating on pidlists through seq_file - + * independent of whether it's tasks or procs + */ +static const struct seq_operations cgroup_pidlist_seq_operations = { + .start = cgroup_pidlist_start, + .stop = cgroup_pidlist_stop, + .next = cgroup_pidlist_next, + .show = cgroup_pidlist_show, }; -static void release_cgroup_pid_array(struct cgroup_pids *cp) +static void cgroup_release_pid_array(struct cgroup_pidlist *l) { - struct cgroup *cgrp = cp->cgrp; - - down_write(&cgrp->pids_mutex); - BUG_ON(!cp->use_count); - if (!--cp->use_count) { - list_del(&cp->list); - put_pid_ns(cp->ns); - kfree(cp->tasks_pids); - kfree(cp); + /* + * the case where we're the last user of this particular pidlist will + * have us remove it from the cgroup's list, which entails taking the + * mutex. since in pidlist_find the pidlist->lock depends on cgroup-> + * pidlist_mutex, we have to take pidlist_mutex first. + */ + mutex_lock(&l->owner->pidlist_mutex); + down_write(&l->mutex); + BUG_ON(!l->use_count); + if (!--l->use_count) { + /* we're the last user if refcount is 0; remove and free */ + list_del(&l->links); + mutex_unlock(&l->owner->pidlist_mutex); + pidlist_free(l->list); + put_pid_ns(l->key.ns); + up_write(&l->mutex); + kfree(l); + return; } - up_write(&cgrp->pids_mutex); + mutex_unlock(&l->owner->pidlist_mutex); + up_write(&l->mutex); } -static int cgroup_tasks_release(struct inode *inode, struct file *file) +static int cgroup_pidlist_release(struct inode *inode, struct file *file) { - struct seq_file *seq; - struct cgroup_pids *cp; - + struct cgroup_pidlist *l; if (!(file->f_mode & FMODE_READ)) return 0; - - seq = file->private_data; - cp = seq->private; - - release_cgroup_pid_array(cp); + /* + * the seq_file will only be initialized if the file was opened for + * reading; hence we check if it's not null only in that case. + */ + l = ((struct seq_file *)file->private_data)->private; + cgroup_release_pid_array(l); return seq_release(inode, file); } -static struct file_operations cgroup_tasks_operations = { +static const struct file_operations cgroup_pidlist_operations = { .read = seq_read, .llseek = seq_lseek, .write = cgroup_file_write, - .release = cgroup_tasks_release, + .release = cgroup_pidlist_release, }; /* - * Handle an open on 'tasks' file. Prepare an array containing the - * process id's of tasks currently attached to the cgroup being opened. + * The following functions handle opens on a file that displays a pidlist + * (tasks or procs). Prepare an array of the process/thread IDs of whoever's + * in the cgroup. */ - -static int cgroup_tasks_open(struct inode *unused, struct file *file) +/* helper function for the two below it */ +static int cgroup_pidlist_open(struct file *file, enum cgroup_filetype type) { struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); - struct pid_namespace *ns = current->nsproxy->pid_ns; - struct cgroup_pids *cp; - pid_t *pidarray; - int npids; + struct cgroup_pidlist *l; int retval; /* Nothing to do for write-only files */ if (!(file->f_mode & FMODE_READ)) return 0; - /* - * If cgroup gets more users after we read count, we won't have - * enough space - tough. This race is indistinguishable to the - * caller from the case that the additional cgroup users didn't - * show up until sometime later on. - */ - npids = cgroup_task_count(cgrp); - pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL); - if (!pidarray) - return -ENOMEM; - npids = pid_array_load(pidarray, npids, cgrp); - sort(pidarray, npids, sizeof(pid_t), cmppid, NULL); - - /* - * Store the array in the cgroup, freeing the old - * array if necessary - */ - down_write(&cgrp->pids_mutex); - - list_for_each_entry(cp, &cgrp->pids_list, list) { - if (ns == cp->ns) - goto found; - } - - cp = kzalloc(sizeof(*cp), GFP_KERNEL); - if (!cp) { - up_write(&cgrp->pids_mutex); - kfree(pidarray); - return -ENOMEM; - } - cp->cgrp = cgrp; - cp->ns = ns; - get_pid_ns(ns); - list_add(&cp->list, &cgrp->pids_list); -found: - kfree(cp->tasks_pids); - cp->tasks_pids = pidarray; - cp->length = npids; - cp->use_count++; - up_write(&cgrp->pids_mutex); - - file->f_op = &cgroup_tasks_operations; + /* have the array populated */ + retval = pidlist_array_load(cgrp, type, &l); + if (retval) + return retval; + /* configure file information */ + file->f_op = &cgroup_pidlist_operations; - retval = seq_open(file, &cgroup_tasks_seq_operations); + retval = seq_open(file, &cgroup_pidlist_seq_operations); if (retval) { - release_cgroup_pid_array(cp); + cgroup_release_pid_array(l); return retval; } - ((struct seq_file *)file->private_data)->private = cp; + ((struct seq_file *)file->private_data)->private = l; return 0; } +static int cgroup_tasks_open(struct inode *unused, struct file *file) +{ + return cgroup_pidlist_open(file, CGROUP_FILE_TASKS); +} +static int cgroup_procs_open(struct inode *unused, struct file *file) +{ + return cgroup_pidlist_open(file, CGROUP_FILE_PROCS); +} static u64 cgroup_read_notify_on_release(struct cgroup *cgrp, struct cftype *cft) @@ -2449,21 +2794,27 @@ static int cgroup_write_notify_on_release(struct cgroup *cgrp, /* * for the common functions, 'private' gives the type of file */ +/* for hysterical raisins, we can't put this on the older files */ +#define CGROUP_FILE_GENERIC_PREFIX "cgroup." static struct cftype files[] = { { .name = "tasks", .open = cgroup_tasks_open, .write_u64 = cgroup_tasks_write, - .release = cgroup_tasks_release, - .private = FILE_TASKLIST, + .release = cgroup_pidlist_release, .mode = S_IRUGO | S_IWUSR, }, - + { + .name = CGROUP_FILE_GENERIC_PREFIX "procs", + .open = cgroup_procs_open, + /* .write_u64 = cgroup_procs_write, TODO */ + .release = cgroup_pidlist_release, + .mode = S_IRUGO, + }, { .name = "notify_on_release", .read_u64 = cgroup_read_notify_on_release, .write_u64 = cgroup_write_notify_on_release, - .private = FILE_NOTIFY_ON_RELEASE, }, }; @@ -2472,7 +2823,6 @@ static struct cftype cft_release_agent = { .read_seq_string = cgroup_release_agent_show, .write_string = cgroup_release_agent_write, .max_write_len = PATH_MAX, - .private = FILE_RELEASE_AGENT, }; static int cgroup_populate_dir(struct cgroup *cgrp) @@ -2879,6 +3229,7 @@ int __init cgroup_init_early(void) init_task.cgroups = &init_css_set; init_css_set_link.cg = &init_css_set; + init_css_set_link.cgrp = dummytop; list_add(&init_css_set_link.cgrp_link_list, &rootnode.top_cgroup.css_sets); list_add(&init_css_set_link.cg_link_list, @@ -2933,7 +3284,7 @@ int __init cgroup_init(void) /* Add init_css_set to the hash table */ hhead = css_set_hash(init_css_set.subsys); hlist_add_head(&init_css_set.hlist, hhead); - + BUG_ON(!init_root_id(&rootnode)); err = register_filesystem(&cgroup_fs_type); if (err < 0) goto out; @@ -2986,15 +3337,16 @@ static int proc_cgroup_show(struct seq_file *m, void *v) for_each_active_root(root) { struct cgroup_subsys *ss; struct cgroup *cgrp; - int subsys_id; int count = 0; - seq_printf(m, "%lu:", root->subsys_bits); + seq_printf(m, "%d:", root->hierarchy_id); for_each_subsys(root, ss) seq_printf(m, "%s%s", count++ ? "," : "", ss->name); + if (strlen(root->name)) + seq_printf(m, "%sname=%s", count ? "," : "", + root->name); seq_putc(m, ':'); - get_first_subsys(&root->top_cgroup, NULL, &subsys_id); - cgrp = task_cgroup(tsk, subsys_id); + cgrp = task_cgroup_from_root(tsk, root); retval = cgroup_path(cgrp, buf, PAGE_SIZE); if (retval < 0) goto out_unlock; @@ -3017,7 +3369,7 @@ static int cgroup_open(struct inode *inode, struct file *file) return single_open(file, proc_cgroup_show, pid); } -struct file_operations proc_cgroup_operations = { +const struct file_operations proc_cgroup_operations = { .open = cgroup_open, .read = seq_read, .llseek = seq_lseek, @@ -3033,8 +3385,8 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v) mutex_lock(&cgroup_mutex); for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; - seq_printf(m, "%s\t%lu\t%d\t%d\n", - ss->name, ss->root->subsys_bits, + seq_printf(m, "%s\t%d\t%d\t%d\n", + ss->name, ss->root->hierarchy_id, ss->root->number_of_cgroups, !ss->disabled); } mutex_unlock(&cgroup_mutex); @@ -3046,7 +3398,7 @@ static int cgroupstats_open(struct inode *inode, struct file *file) return single_open(file, proc_cgroupstats_show, NULL); } -static struct file_operations proc_cgroupstats_operations = { +static const struct file_operations proc_cgroupstats_operations = { .open = cgroupstats_open, .read = seq_read, .llseek = seq_lseek, @@ -3320,13 +3672,11 @@ int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task) { int ret; struct cgroup *target; - int subsys_id; if (cgrp == dummytop) return 1; - get_first_subsys(cgrp, NULL, &subsys_id); - target = task_cgroup(task, subsys_id); + target = task_cgroup_from_root(task, cgrp->root); while (cgrp != target && cgrp!= cgrp->top_cgroup) cgrp = cgrp->parent; ret = (cgrp == target); @@ -3358,8 +3708,10 @@ static void check_for_release(struct cgroup *cgrp) void __css_put(struct cgroup_subsys_state *css) { struct cgroup *cgrp = css->cgroup; + int val; rcu_read_lock(); - if (atomic_dec_return(&css->refcnt) == 1) { + val = atomic_dec_return(&css->refcnt); + if (val == 1) { if (notify_on_release(cgrp)) { set_bit(CGRP_RELEASABLE, &cgrp->flags); check_for_release(cgrp); @@ -3367,6 +3719,7 @@ void __css_put(struct cgroup_subsys_state *css) cgroup_wakeup_rmdir_waiter(cgrp); } rcu_read_unlock(); + WARN_ON_ONCE(val < 1); } /* @@ -3693,3 +4046,154 @@ css_get_next(struct cgroup_subsys *ss, int id, return ret; } +#ifdef CONFIG_CGROUP_DEBUG +static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss, + struct cgroup *cont) +{ + struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); + + if (!css) + return ERR_PTR(-ENOMEM); + + return css; +} + +static void debug_destroy(struct cgroup_subsys *ss, struct cgroup *cont) +{ + kfree(cont->subsys[debug_subsys_id]); +} + +static u64 cgroup_refcount_read(struct cgroup *cont, struct cftype *cft) +{ + return atomic_read(&cont->count); +} + +static u64 debug_taskcount_read(struct cgroup *cont, struct cftype *cft) +{ + return cgroup_task_count(cont); +} + +static u64 current_css_set_read(struct cgroup *cont, struct cftype *cft) +{ + return (u64)(unsigned long)current->cgroups; +} + +static u64 current_css_set_refcount_read(struct cgroup *cont, + struct cftype *cft) +{ + u64 count; + + rcu_read_lock(); + count = atomic_read(¤t->cgroups->refcount); + rcu_read_unlock(); + return count; +} + +static int current_css_set_cg_links_read(struct cgroup *cont, + struct cftype *cft, + struct seq_file *seq) +{ + struct cg_cgroup_link *link; + struct css_set *cg; + + read_lock(&css_set_lock); + rcu_read_lock(); + cg = rcu_dereference(current->cgroups); + list_for_each_entry(link, &cg->cg_links, cg_link_list) { + struct cgroup *c = link->cgrp; + const char *name; + + if (c->dentry) + name = c->dentry->d_name.name; + else + name = "?"; + seq_printf(seq, "Root %d group %s\n", + c->root->hierarchy_id, name); + } + rcu_read_unlock(); + read_unlock(&css_set_lock); + return 0; +} + +#define MAX_TASKS_SHOWN_PER_CSS 25 +static int cgroup_css_links_read(struct cgroup *cont, + struct cftype *cft, + struct seq_file *seq) +{ + struct cg_cgroup_link *link; + + read_lock(&css_set_lock); + list_for_each_entry(link, &cont->css_sets, cgrp_link_list) { + struct css_set *cg = link->cg; + struct task_struct *task; + int count = 0; + seq_printf(seq, "css_set %p\n", cg); + list_for_each_entry(task, &cg->tasks, cg_list) { + if (count++ > MAX_TASKS_SHOWN_PER_CSS) { + seq_puts(seq, " ...\n"); + break; + } else { + seq_printf(seq, " task %d\n", + task_pid_vnr(task)); + } + } + } + read_unlock(&css_set_lock); + return 0; +} + +static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft) +{ + return test_bit(CGRP_RELEASABLE, &cgrp->flags); +} + +static struct cftype debug_files[] = { + { + .name = "cgroup_refcount", + .read_u64 = cgroup_refcount_read, + }, + { + .name = "taskcount", + .read_u64 = debug_taskcount_read, + }, + + { + .name = "current_css_set", + .read_u64 = current_css_set_read, + }, + + { + .name = "current_css_set_refcount", + .read_u64 = current_css_set_refcount_read, + }, + + { + .name = "current_css_set_cg_links", + .read_seq_string = current_css_set_cg_links_read, + }, + + { + .name = "cgroup_css_links", + .read_seq_string = cgroup_css_links_read, + }, + + { + .name = "releasable", + .read_u64 = releasable_read, + }, +}; + +static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont) +{ + return cgroup_add_files(cont, ss, debug_files, + ARRAY_SIZE(debug_files)); +} + +struct cgroup_subsys debug_subsys = { + .name = "debug", + .create = debug_create, + .destroy = debug_destroy, + .populate = debug_populate, + .subsys_id = debug_subsys_id, +}; +#endif /* CONFIG_CGROUP_DEBUG */ diff --git a/kernel/cgroup_debug.c b/kernel/cgroup_debug.c deleted file mode 100644 index 0c92d79..0000000 --- a/kernel/cgroup_debug.c +++ /dev/null @@ -1,105 +0,0 @@ -/* - * kernel/cgroup_debug.c - Example cgroup subsystem that - * exposes debug info - * - * Copyright (C) Google Inc, 2007 - * - * Developed by Paul Menage (menage@google.com) - * - */ - -#include <linux/cgroup.h> -#include <linux/fs.h> -#include <linux/slab.h> -#include <linux/rcupdate.h> - -#include <asm/atomic.h> - -static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss, - struct cgroup *cont) -{ - struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); - - if (!css) - return ERR_PTR(-ENOMEM); - - return css; -} - -static void debug_destroy(struct cgroup_subsys *ss, struct cgroup *cont) -{ - kfree(cont->subsys[debug_subsys_id]); -} - -static u64 cgroup_refcount_read(struct cgroup *cont, struct cftype *cft) -{ - return atomic_read(&cont->count); -} - -static u64 taskcount_read(struct cgroup *cont, struct cftype *cft) -{ - u64 count; - - count = cgroup_task_count(cont); - return count; -} - -static u64 current_css_set_read(struct cgroup *cont, struct cftype *cft) -{ - return (u64)(long)current->cgroups; -} - -static u64 current_css_set_refcount_read(struct cgroup *cont, - struct cftype *cft) -{ - u64 count; - - rcu_read_lock(); - count = atomic_read(¤t->cgroups->refcount); - rcu_read_unlock(); - return count; -} - -static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft) -{ - return test_bit(CGRP_RELEASABLE, &cgrp->flags); -} - -static struct cftype files[] = { - { - .name = "cgroup_refcount", - .read_u64 = cgroup_refcount_read, - }, - { - .name = "taskcount", - .read_u64 = taskcount_read, - }, - - { - .name = "current_css_set", - .read_u64 = current_css_set_read, - }, - - { - .name = "current_css_set_refcount", - .read_u64 = current_css_set_refcount_read, - }, - - { - .name = "releasable", - .read_u64 = releasable_read, - }, -}; - -static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont) -{ - return cgroup_add_files(cont, ss, files, ARRAY_SIZE(files)); -} - -struct cgroup_subsys debug_subsys = { - .name = "debug", - .create = debug_create, - .destroy = debug_destroy, - .populate = debug_populate, - .subsys_id = debug_subsys_id, -}; diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index fb249e2..59e9ef6 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -159,7 +159,7 @@ static bool is_task_frozen_enough(struct task_struct *task) */ static int freezer_can_attach(struct cgroup_subsys *ss, struct cgroup *new_cgroup, - struct task_struct *task) + struct task_struct *task, bool threadgroup) { struct freezer *freezer; @@ -177,6 +177,19 @@ static int freezer_can_attach(struct cgroup_subsys *ss, if (freezer->state == CGROUP_FROZEN) return -EBUSY; + if (threadgroup) { + struct task_struct *c; + + rcu_read_lock(); + list_for_each_entry_rcu(c, &task->thread_group, thread_group) { + if (is_task_frozen_enough(c)) { + rcu_read_unlock(); + return -EBUSY; + } + } + rcu_read_unlock(); + } + return 0; } diff --git a/kernel/cpu.c b/kernel/cpu.c index 8ce1004..6ba0f1e 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -401,6 +401,7 @@ int disable_nonboot_cpus(void) break; } } + if (!error) { BUG_ON(num_online_cpus() > 1); /* Make sure the CPUs won't be enabled by someone else */ @@ -413,6 +414,14 @@ int disable_nonboot_cpus(void) return error; } +void __weak arch_enable_nonboot_cpus_begin(void) +{ +} + +void __weak arch_enable_nonboot_cpus_end(void) +{ +} + void __ref enable_nonboot_cpus(void) { int cpu, error; @@ -424,6 +433,9 @@ void __ref enable_nonboot_cpus(void) goto out; printk("Enabling non-boot CPUs ...\n"); + + arch_enable_nonboot_cpus_begin(); + for_each_cpu(cpu, frozen_cpus) { error = _cpu_up(cpu, 1); if (!error) { @@ -432,6 +444,9 @@ void __ref enable_nonboot_cpus(void) } printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error); } + + arch_enable_nonboot_cpus_end(); + cpumask_clear(frozen_cpus); out: cpu_maps_update_done(); diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 7e75a41..b5cb469 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1324,9 +1324,10 @@ static int fmeter_getrate(struct fmeter *fmp) static cpumask_var_t cpus_attach; /* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */ -static int cpuset_can_attach(struct cgroup_subsys *ss, - struct cgroup *cont, struct task_struct *tsk) +static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont, + struct task_struct *tsk, bool threadgroup) { + int ret; struct cpuset *cs = cgroup_cs(cont); if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) @@ -1343,18 +1344,51 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, if (tsk->flags & PF_THREAD_BOUND) return -EINVAL; - return security_task_setscheduler(tsk, 0, NULL); + ret = security_task_setscheduler(tsk, 0, NULL); + if (ret) + return ret; + if (threadgroup) { + struct task_struct *c; + + rcu_read_lock(); + list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { + ret = security_task_setscheduler(c, 0, NULL); + if (ret) { + rcu_read_unlock(); + return ret; + } + } + rcu_read_unlock(); + } + return 0; +} + +static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to, + struct cpuset *cs) +{ + int err; + /* + * can_attach beforehand should guarantee that this doesn't fail. + * TODO: have a better way to handle failure here + */ + err = set_cpus_allowed_ptr(tsk, cpus_attach); + WARN_ON_ONCE(err); + + task_lock(tsk); + cpuset_change_task_nodemask(tsk, to); + task_unlock(tsk); + cpuset_update_task_spread_flag(cs, tsk); + } -static void cpuset_attach(struct cgroup_subsys *ss, - struct cgroup *cont, struct cgroup *oldcont, - struct task_struct *tsk) +static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont, + struct cgroup *oldcont, struct task_struct *tsk, + bool threadgroup) { nodemask_t from, to; struct mm_struct *mm; struct cpuset *cs = cgroup_cs(cont); struct cpuset *oldcs = cgroup_cs(oldcont); - int err; if (cs == &top_cpuset) { cpumask_copy(cpus_attach, cpu_possible_mask); @@ -1363,15 +1397,19 @@ static void cpuset_attach(struct cgroup_subsys *ss, guarantee_online_cpus(cs, cpus_attach); guarantee_online_mems(cs, &to); } - err = set_cpus_allowed_ptr(tsk, cpus_attach); - if (err) - return; - task_lock(tsk); - cpuset_change_task_nodemask(tsk, &to); - task_unlock(tsk); - cpuset_update_task_spread_flag(cs, tsk); + /* do per-task migration stuff possibly for each in the threadgroup */ + cpuset_attach_task(tsk, &to, cs); + if (threadgroup) { + struct task_struct *c; + rcu_read_lock(); + list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { + cpuset_attach_task(c, &to, cs); + } + rcu_read_unlock(); + } + /* change mm; only needs to be done once even if threadgroup */ from = oldcs->mems_allowed; to = cs->mems_allowed; mm = get_task_mm(tsk); diff --git a/kernel/cred.c b/kernel/cred.c index d7f7a01..dd76cfe 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -782,6 +782,25 @@ EXPORT_SYMBOL(set_create_files_as); #ifdef CONFIG_DEBUG_CREDENTIALS +bool creds_are_invalid(const struct cred *cred) +{ + if (cred->magic != CRED_MAGIC) + return true; + if (atomic_read(&cred->usage) < atomic_read(&cred->subscribers)) + return true; +#ifdef CONFIG_SECURITY_SELINUX + if (selinux_is_enabled()) { + if ((unsigned long) cred->security < PAGE_SIZE) + return true; + if ((*(u32 *)cred->security & 0xffffff00) == + (POISON_FREE << 24 | POISON_FREE << 16 | POISON_FREE << 8)) + return true; + } +#endif + return false; +} +EXPORT_SYMBOL(creds_are_invalid); + /* * dump invalid credentials */ diff --git a/kernel/delayacct.c b/kernel/delayacct.c index abb6e17..ead9b61 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -15,6 +15,7 @@ #include <linux/sched.h> #include <linux/slab.h> +#include <linux/taskstats.h> #include <linux/time.h> #include <linux/sysctl.h> #include <linux/delayacct.h> diff --git a/kernel/dma-coherent.c b/kernel/dma-coherent.c deleted file mode 100644 index 962a3b5..0000000 --- a/kernel/dma-coherent.c +++ /dev/null @@ -1,176 +0,0 @@ -/* - * Coherent per-device memory handling. - * Borrowed from i386 - */ -#include <linux/kernel.h> -#include <linux/dma-mapping.h> - -struct dma_coherent_mem { - void *virt_base; - u32 device_base; - int size; - int flags; - unsigned long *bitmap; -}; - -int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr, - dma_addr_t device_addr, size_t size, int flags) -{ - void __iomem *mem_base = NULL; - int pages = size >> PAGE_SHIFT; - int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long); - - if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0) - goto out; - if (!size) - goto out; - if (dev->dma_mem) - goto out; - - /* FIXME: this routine just ignores DMA_MEMORY_INCLUDES_CHILDREN */ - - mem_base = ioremap(bus_addr, size); - if (!mem_base) - goto out; - - dev->dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL); - if (!dev->dma_mem) - goto out; - dev->dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL); - if (!dev->dma_mem->bitmap) - goto free1_out; - - dev->dma_mem->virt_base = mem_base; - dev->dma_mem->device_base = device_addr; - dev->dma_mem->size = pages; - dev->dma_mem->flags = flags; - - if (flags & DMA_MEMORY_MAP) - return DMA_MEMORY_MAP; - - return DMA_MEMORY_IO; - - free1_out: - kfree(dev->dma_mem); - out: - if (mem_base) - iounmap(mem_base); - return 0; -} -EXPORT_SYMBOL(dma_declare_coherent_memory); - -void dma_release_declared_memory(struct device *dev) -{ - struct dma_coherent_mem *mem = dev->dma_mem; - - if (!mem) - return; - dev->dma_mem = NULL; - iounmap(mem->virt_base); - kfree(mem->bitmap); - kfree(mem); -} -EXPORT_SYMBOL(dma_release_declared_memory); - -void *dma_mark_declared_memory_occupied(struct device *dev, - dma_addr_t device_addr, size_t size) -{ - struct dma_coherent_mem *mem = dev->dma_mem; - int pos, err; - - size += device_addr & ~PAGE_MASK; - - if (!mem) - return ERR_PTR(-EINVAL); - - pos = (device_addr - mem->device_base) >> PAGE_SHIFT; - err = bitmap_allocate_region(mem->bitmap, pos, get_order(size)); - if (err != 0) - return ERR_PTR(err); - return mem->virt_base + (pos << PAGE_SHIFT); -} -EXPORT_SYMBOL(dma_mark_declared_memory_occupied); - -/** - * dma_alloc_from_coherent() - try to allocate memory from the per-device coherent area - * - * @dev: device from which we allocate memory - * @size: size of requested memory area - * @dma_handle: This will be filled with the correct dma handle - * @ret: This pointer will be filled with the virtual address - * to allocated area. - * - * This function should be only called from per-arch dma_alloc_coherent() - * to support allocation from per-device coherent memory pools. - * - * Returns 0 if dma_alloc_coherent should continue with allocating from - * generic memory areas, or !0 if dma_alloc_coherent should return @ret. - */ -int dma_alloc_from_coherent(struct device *dev, ssize_t size, - dma_addr_t *dma_handle, void **ret) -{ - struct dma_coherent_mem *mem; - int order = get_order(size); - int pageno; - - if (!dev) - return 0; - mem = dev->dma_mem; - if (!mem) - return 0; - - *ret = NULL; - - if (unlikely(size > (mem->size << PAGE_SHIFT))) - goto err; - - pageno = bitmap_find_free_region(mem->bitmap, mem->size, order); - if (unlikely(pageno < 0)) - goto err; - - /* - * Memory was found in the per-device area. - */ - *dma_handle = mem->device_base + (pageno << PAGE_SHIFT); - *ret = mem->virt_base + (pageno << PAGE_SHIFT); - memset(*ret, 0, size); - - return 1; - -err: - /* - * In the case where the allocation can not be satisfied from the - * per-device area, try to fall back to generic memory if the - * constraints allow it. - */ - return mem->flags & DMA_MEMORY_EXCLUSIVE; -} -EXPORT_SYMBOL(dma_alloc_from_coherent); - -/** - * dma_release_from_coherent() - try to free the memory allocated from per-device coherent memory pool - * @dev: device from which the memory was allocated - * @order: the order of pages allocated - * @vaddr: virtual address of allocated pages - * - * This checks whether the memory was allocated from the per-device - * coherent memory pool and if so, releases that memory. - * - * Returns 1 if we correctly released the memory, or 0 if - * dma_release_coherent() should proceed with releasing memory from - * generic pools. - */ -int dma_release_from_coherent(struct device *dev, int order, void *vaddr) -{ - struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL; - - if (mem && vaddr >= mem->virt_base && vaddr < - (mem->virt_base + (mem->size << PAGE_SHIFT))) { - int page = (vaddr - mem->virt_base) >> PAGE_SHIFT; - - bitmap_release_region(mem->bitmap, page, order); - return 1; - } - return 0; -} -EXPORT_SYMBOL(dma_release_from_coherent); diff --git a/kernel/exit.c b/kernel/exit.c index ae5d866..e61891f 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -47,7 +47,7 @@ #include <linux/tracehook.h> #include <linux/fs_struct.h> #include <linux/init_task.h> -#include <linux/perf_counter.h> +#include <linux/perf_event.h> #include <trace/events/sched.h> #include <asm/uaccess.h> @@ -154,8 +154,8 @@ static void delayed_put_task_struct(struct rcu_head *rhp) { struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); -#ifdef CONFIG_PERF_COUNTERS - WARN_ON_ONCE(tsk->perf_counter_ctxp); +#ifdef CONFIG_PERF_EVENTS + WARN_ON_ONCE(tsk->perf_event_ctxp); #endif trace_sched_process_free(tsk); put_task_struct(tsk); @@ -359,8 +359,10 @@ void __set_special_pids(struct pid *pid) { struct task_struct *curr = current->group_leader; - if (task_session(curr) != pid) + if (task_session(curr) != pid) { change_pid(curr, PIDTYPE_SID, pid); + proc_sid_connector(curr); + } if (task_pgrp(curr) != pid) change_pid(curr, PIDTYPE_PGID, pid); @@ -945,6 +947,8 @@ NORET_TYPE void do_exit(long code) if (group_dead) { hrtimer_cancel(&tsk->signal->real_timer); exit_itimers(tsk->signal); + if (tsk->mm) + setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm); } acct_collect(code, group_dead); if (group_dead) @@ -972,8 +976,6 @@ NORET_TYPE void do_exit(long code) disassociate_ctty(1); module_put(task_thread_info(tsk)->exec_domain->module); - if (tsk->binfmt) - module_put(tsk->binfmt->module); proc_exit_connector(tsk); @@ -981,7 +983,7 @@ NORET_TYPE void do_exit(long code) * Flush inherited counters to the parent - before the parent * gets woken up by child-exit notifications. */ - perf_counter_exit_task(tsk); + perf_event_exit_task(tsk); exit_notify(tsk, group_dead); #ifdef CONFIG_NUMA @@ -989,8 +991,6 @@ NORET_TYPE void do_exit(long code) tsk->mempolicy = NULL; #endif #ifdef CONFIG_FUTEX - if (unlikely(!list_empty(&tsk->pi_state_list))) - exit_pi_state_list(tsk); if (unlikely(current->pi_state_cache)) kfree(current->pi_state_cache); #endif @@ -1093,28 +1093,28 @@ struct wait_opts { int __user *wo_stat; struct rusage __user *wo_rusage; + wait_queue_t child_wait; int notask_error; }; -static struct pid *task_pid_type(struct task_struct *task, enum pid_type type) +static inline +struct pid *task_pid_type(struct task_struct *task, enum pid_type type) { - struct pid *pid = NULL; - if (type == PIDTYPE_PID) - pid = task->pids[type].pid; - else if (type < PIDTYPE_MAX) - pid = task->group_leader->pids[type].pid; - return pid; + if (type != PIDTYPE_PID) + task = task->group_leader; + return task->pids[type].pid; } -static int eligible_child(struct wait_opts *wo, struct task_struct *p) +static int eligible_pid(struct wait_opts *wo, struct task_struct *p) { - int err; - - if (wo->wo_type < PIDTYPE_MAX) { - if (task_pid_type(p, wo->wo_type) != wo->wo_pid) - return 0; - } + return wo->wo_type == PIDTYPE_MAX || + task_pid_type(p, wo->wo_type) == wo->wo_pid; +} +static int eligible_child(struct wait_opts *wo, struct task_struct *p) +{ + if (!eligible_pid(wo, p)) + return 0; /* Wait for all children (clone and not) if __WALL is set; * otherwise, wait for clone children *only* if __WCLONE is * set; otherwise, wait for non-clone children *only*. (Note: @@ -1124,10 +1124,6 @@ static int eligible_child(struct wait_opts *wo, struct task_struct *p) && !(wo->wo_flags & __WALL)) return 0; - err = security_task_wait(p); - if (err) - return err; - return 1; } @@ -1140,18 +1136,20 @@ static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p, put_task_struct(p); infop = wo->wo_info; - if (!retval) - retval = put_user(SIGCHLD, &infop->si_signo); - if (!retval) - retval = put_user(0, &infop->si_errno); - if (!retval) - retval = put_user((short)why, &infop->si_code); - if (!retval) - retval = put_user(pid, &infop->si_pid); - if (!retval) - retval = put_user(uid, &infop->si_uid); - if (!retval) - retval = put_user(status, &infop->si_status); + if (infop) { + if (!retval) + retval = put_user(SIGCHLD, &infop->si_signo); + if (!retval) + retval = put_user(0, &infop->si_errno); + if (!retval) + retval = put_user((short)why, &infop->si_code); + if (!retval) + retval = put_user(pid, &infop->si_pid); + if (!retval) + retval = put_user(uid, &infop->si_uid); + if (!retval) + retval = put_user(status, &infop->si_status); + } if (!retval) retval = pid; return retval; @@ -1208,6 +1206,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) if (likely(!traced) && likely(!task_detached(p))) { struct signal_struct *psig; struct signal_struct *sig; + unsigned long maxrss; /* * The resource counters for the group leader are in its @@ -1256,6 +1255,9 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) psig->coublock += task_io_get_oublock(p) + sig->oublock + sig->coublock; + maxrss = max(sig->maxrss, sig->cmaxrss); + if (psig->cmaxrss < maxrss) + psig->cmaxrss = maxrss; task_io_accounting_add(&psig->ioac, &p->ioac); task_io_accounting_add(&psig->ioac, &sig->ioac); spin_unlock_irq(&p->real_parent->sighand->siglock); @@ -1477,13 +1479,14 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) * then ->notask_error is 0 if @p is an eligible child, * or another error from security_task_wait(), or still -ECHILD. */ -static int wait_consider_task(struct wait_opts *wo, struct task_struct *parent, - int ptrace, struct task_struct *p) +static int wait_consider_task(struct wait_opts *wo, int ptrace, + struct task_struct *p) { int ret = eligible_child(wo, p); if (!ret) return ret; + ret = security_task_wait(p); if (unlikely(ret < 0)) { /* * If we have not yet seen any eligible child, @@ -1545,7 +1548,7 @@ static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk) * Do not consider detached threads. */ if (!task_detached(p)) { - int ret = wait_consider_task(wo, tsk, 0, p); + int ret = wait_consider_task(wo, 0, p); if (ret) return ret; } @@ -1559,7 +1562,7 @@ static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk) struct task_struct *p; list_for_each_entry(p, &tsk->ptraced, ptrace_entry) { - int ret = wait_consider_task(wo, tsk, 1, p); + int ret = wait_consider_task(wo, 1, p); if (ret) return ret; } @@ -1567,15 +1570,38 @@ static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk) return 0; } +static int child_wait_callback(wait_queue_t *wait, unsigned mode, + int sync, void *key) +{ + struct wait_opts *wo = container_of(wait, struct wait_opts, + child_wait); + struct task_struct *p = key; + + if (!eligible_pid(wo, p)) + return 0; + + if ((wo->wo_flags & __WNOTHREAD) && wait->private != p->parent) + return 0; + + return default_wake_function(wait, mode, sync, key); +} + +void __wake_up_parent(struct task_struct *p, struct task_struct *parent) +{ + __wake_up_sync_key(&parent->signal->wait_chldexit, + TASK_INTERRUPTIBLE, 1, p); +} + static long do_wait(struct wait_opts *wo) { - DECLARE_WAITQUEUE(wait, current); struct task_struct *tsk; int retval; trace_sched_process_wait(wo->wo_pid); - add_wait_queue(¤t->signal->wait_chldexit,&wait); + init_waitqueue_func_entry(&wo->child_wait, child_wait_callback); + wo->child_wait.private = current; + add_wait_queue(¤t->signal->wait_chldexit, &wo->child_wait); repeat: /* * If there is nothing that can match our critiera just get out. @@ -1616,32 +1642,7 @@ notask: } end: __set_current_state(TASK_RUNNING); - remove_wait_queue(¤t->signal->wait_chldexit,&wait); - if (wo->wo_info) { - struct siginfo __user *infop = wo->wo_info; - - if (retval > 0) - retval = 0; - else { - /* - * For a WNOHANG return, clear out all the fields - * we would set so the user can easily tell the - * difference. - */ - if (!retval) - retval = put_user(0, &infop->si_signo); - if (!retval) - retval = put_user(0, &infop->si_errno); - if (!retval) - retval = put_user(0, &infop->si_code); - if (!retval) - retval = put_user(0, &infop->si_pid); - if (!retval) - retval = put_user(0, &infop->si_uid); - if (!retval) - retval = put_user(0, &infop->si_status); - } - } + remove_wait_queue(¤t->signal->wait_chldexit, &wo->child_wait); return retval; } @@ -1686,6 +1687,29 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, wo.wo_stat = NULL; wo.wo_rusage = ru; ret = do_wait(&wo); + + if (ret > 0) { + ret = 0; + } else if (infop) { + /* + * For a WNOHANG return, clear out all the fields + * we would set so the user can easily tell the + * difference. + */ + if (!ret) + ret = put_user(0, &infop->si_signo); + if (!ret) + ret = put_user(0, &infop->si_errno); + if (!ret) + ret = put_user(0, &infop->si_code); + if (!ret) + ret = put_user(0, &infop->si_pid); + if (!ret) + ret = put_user(0, &infop->si_uid); + if (!ret) + ret = put_user(0, &infop->si_status); + } + put_pid(pid); /* avoid REGPARM breakage on x86: */ diff --git a/kernel/fork.c b/kernel/fork.c index bfee931..4c20fff 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -49,6 +49,7 @@ #include <linux/ftrace.h> #include <linux/profile.h> #include <linux/rmap.h> +#include <linux/ksm.h> #include <linux/acct.h> #include <linux/tsacct_kern.h> #include <linux/cn_proc.h> @@ -61,7 +62,8 @@ #include <linux/blkdev.h> #include <linux/fs_struct.h> #include <linux/magic.h> -#include <linux/perf_counter.h> +#include <linux/perf_event.h> +#include <linux/posix-timers.h> #include <asm/pgtable.h> #include <asm/pgalloc.h> @@ -136,9 +138,17 @@ struct kmem_cache *vm_area_cachep; /* SLAB cache for mm_struct structures (tsk->mm) */ static struct kmem_cache *mm_cachep; +static void account_kernel_stack(struct thread_info *ti, int account) +{ + struct zone *zone = page_zone(virt_to_page(ti)); + + mod_zone_page_state(zone, NR_KERNEL_STACK, account); +} + void free_task(struct task_struct *tsk) { prop_local_destroy_single(&tsk->dirties); + account_kernel_stack(tsk->stack, -1); free_thread_info(tsk->stack); rt_mutex_debug_task_free(tsk); ftrace_graph_exit_task(tsk); @@ -253,6 +263,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) tsk->btrace_seq = 0; #endif tsk->splice_pipe = NULL; + + account_kernel_stack(ti, 1); + return tsk; out: @@ -288,6 +301,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) rb_link = &mm->mm_rb.rb_node; rb_parent = NULL; pprev = &mm->mmap; + retval = ksm_fork(mm, oldmm); + if (retval) + goto out; for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { struct file *file; @@ -418,22 +434,30 @@ __setup("coredump_filter=", coredump_filter_setup); #include <linux/init_task.h> +static void mm_init_aio(struct mm_struct *mm) +{ +#ifdef CONFIG_AIO + spin_lock_init(&mm->ioctx_lock); + INIT_HLIST_HEAD(&mm->ioctx_list); +#endif +} + static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) { atomic_set(&mm->mm_users, 1); atomic_set(&mm->mm_count, 1); init_rwsem(&mm->mmap_sem); INIT_LIST_HEAD(&mm->mmlist); - mm->flags = (current->mm) ? current->mm->flags : default_dump_filter; + mm->flags = (current->mm) ? + (current->mm->flags & MMF_INIT_MASK) : default_dump_filter; mm->core_state = NULL; mm->nr_ptes = 0; set_mm_counter(mm, file_rss, 0); set_mm_counter(mm, anon_rss, 0); spin_lock_init(&mm->page_table_lock); - spin_lock_init(&mm->ioctx_lock); - INIT_HLIST_HEAD(&mm->ioctx_list); mm->free_area_cache = TASK_UNMAPPED_BASE; mm->cached_hole_size = ~0UL; + mm_init_aio(mm); mm_init_owner(mm, p); if (likely(!mm_alloc_pgd(mm))) { @@ -485,6 +509,7 @@ void mmput(struct mm_struct *mm) if (atomic_dec_and_test(&mm->mm_users)) { exit_aio(mm); + ksm_exit(mm); exit_mmap(mm); set_mm_exe_file(mm, NULL); if (!list_empty(&mm->mmlist)) { @@ -493,6 +518,8 @@ void mmput(struct mm_struct *mm) spin_unlock(&mmlist_lock); } put_swap_token(mm); + if (mm->binfmt) + module_put(mm->binfmt->module); mmdrop(mm); } } @@ -543,12 +570,18 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm) /* Get rid of any futexes when releasing the mm */ #ifdef CONFIG_FUTEX - if (unlikely(tsk->robust_list)) + if (unlikely(tsk->robust_list)) { exit_robust_list(tsk); + tsk->robust_list = NULL; + } #ifdef CONFIG_COMPAT - if (unlikely(tsk->compat_robust_list)) + if (unlikely(tsk->compat_robust_list)) { compat_exit_robust_list(tsk); + tsk->compat_robust_list = NULL; + } #endif + if (unlikely(!list_empty(&tsk->pi_state_list))) + exit_pi_state_list(tsk); #endif /* Get rid of any cached register state */ @@ -618,9 +651,14 @@ struct mm_struct *dup_mm(struct task_struct *tsk) mm->hiwater_rss = get_mm_rss(mm); mm->hiwater_vm = mm->total_vm; + if (mm->binfmt && !try_module_get(mm->binfmt->module)) + goto free_pt; + return mm; free_pt: + /* don't put binfmt in mmput, we haven't got module yet */ + mm->binfmt = NULL; mmput(mm); fail_nomem: @@ -788,10 +826,10 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig) thread_group_cputime_init(sig); /* Expiration times and increments. */ - sig->it_virt_expires = cputime_zero; - sig->it_virt_incr = cputime_zero; - sig->it_prof_expires = cputime_zero; - sig->it_prof_incr = cputime_zero; + sig->it[CPUCLOCK_PROF].expires = cputime_zero; + sig->it[CPUCLOCK_PROF].incr = cputime_zero; + sig->it[CPUCLOCK_VIRT].expires = cputime_zero; + sig->it[CPUCLOCK_VIRT].incr = cputime_zero; /* Cached expiration times. */ sig->cputime_expires.prof_exp = cputime_zero; @@ -849,6 +887,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; + sig->maxrss = sig->cmaxrss = 0; task_io_accounting_init(&sig->ioac); sig->sum_sched_runtime = 0; taskstats_tgid_init(sig); @@ -863,6 +902,8 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) tty_audit_fork(sig); + sig->oom_adj = current->signal->oom_adj; + return 0; } @@ -958,6 +999,16 @@ static struct task_struct *copy_process(unsigned long clone_flags, if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM)) return ERR_PTR(-EINVAL); + /* + * Siblings of global init remain as zombies on exit since they are + * not reaped by their parent (swapper). To solve this and to avoid + * multi-rooted process trees, prevent global and container-inits + * from creating siblings. + */ + if ((clone_flags & CLONE_PARENT) && + current->signal->flags & SIGNAL_UNKILLABLE) + return ERR_PTR(-EINVAL); + retval = security_task_create(clone_flags); if (retval) goto fork_out; @@ -999,9 +1050,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (!try_module_get(task_thread_info(p)->exec_domain->module)) goto bad_fork_cleanup_count; - if (p->binfmt && !try_module_get(p->binfmt->module)) - goto bad_fork_cleanup_put_domain; - p->did_exec = 0; delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ copy_flags(clone_flags, p); @@ -1075,10 +1123,12 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->bts = NULL; + p->stack_start = stack_start; + /* Perform scheduler related setup. Assign this task to a CPU. */ sched_fork(p, clone_flags); - retval = perf_counter_init_task(p); + retval = perf_event_init_task(p); if (retval) goto bad_fork_cleanup_policy; @@ -1253,7 +1303,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, write_unlock_irq(&tasklist_lock); proc_fork_connector(p); cgroup_post_fork(p); - perf_counter_fork(p); + perf_event_fork(p); return p; bad_fork_free_pid: @@ -1280,16 +1330,13 @@ bad_fork_cleanup_semundo: bad_fork_cleanup_audit: audit_free(p); bad_fork_cleanup_policy: - perf_counter_free_task(p); + perf_event_free_task(p); #ifdef CONFIG_NUMA mpol_put(p->mempolicy); bad_fork_cleanup_cgroup: #endif cgroup_exit(p, cgroup_callbacks_done); delayacct_tsk_free(p); - if (p->binfmt) - module_put(p->binfmt->module); -bad_fork_cleanup_put_domain: module_put(task_thread_info(p)->exec_domain->module); bad_fork_cleanup_count: atomic_dec(&p->cred->user->processes); diff --git a/kernel/futex.c b/kernel/futex.c index 248dd11..4949d33 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -89,36 +89,36 @@ struct futex_pi_state { union futex_key key; }; -/* - * We use this hashed waitqueue instead of a normal wait_queue_t, so +/** + * struct futex_q - The hashed futex queue entry, one per waiting task + * @task: the task waiting on the futex + * @lock_ptr: the hash bucket lock + * @key: the key the futex is hashed on + * @pi_state: optional priority inheritance state + * @rt_waiter: rt_waiter storage for use with requeue_pi + * @requeue_pi_key: the requeue_pi target futex key + * @bitset: bitset for the optional bitmasked wakeup + * + * We use this hashed waitqueue, instead of a normal wait_queue_t, so * we can wake only the relevant ones (hashed queues may be shared). * * A futex_q has a woken state, just like tasks have TASK_RUNNING. * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0. * The order of wakup is always to make the first condition true, then - * wake up q->waiter, then make the second condition true. + * the second. + * + * PI futexes are typically woken before they are removed from the hash list via + * the rt_mutex code. See unqueue_me_pi(). */ struct futex_q { struct plist_node list; - /* Waiter reference */ - struct task_struct *task; - /* Which hash list lock to use: */ + struct task_struct *task; spinlock_t *lock_ptr; - - /* Key which the futex is hashed on: */ union futex_key key; - - /* Optional priority inheritance state: */ struct futex_pi_state *pi_state; - - /* rt_waiter storage for requeue_pi: */ struct rt_mutex_waiter *rt_waiter; - - /* The expected requeue pi target futex key: */ union futex_key *requeue_pi_key; - - /* Bitset for the optional bitmasked wakeup */ u32 bitset; }; @@ -198,11 +198,12 @@ static void drop_futex_key_refs(union futex_key *key) } /** - * get_futex_key - Get parameters which are the keys for a futex. - * @uaddr: virtual address of the futex - * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED - * @key: address where result is stored. - * @rw: mapping needs to be read/write (values: VERIFY_READ, VERIFY_WRITE) + * get_futex_key() - Get parameters which are the keys for a futex + * @uaddr: virtual address of the futex + * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED + * @key: address where result is stored. + * @rw: mapping needs to be read/write (values: VERIFY_READ, + * VERIFY_WRITE) * * Returns a negative error code or 0 * The key words are stored in *key on success. @@ -288,8 +289,8 @@ void put_futex_key(int fshared, union futex_key *key) drop_futex_key_refs(key); } -/* - * fault_in_user_writeable - fault in user address and verify RW access +/** + * fault_in_user_writeable() - Fault in user address and verify RW access * @uaddr: pointer to faulting user space address * * Slow path to fixup the fault we just took in the atomic write @@ -309,8 +310,8 @@ static int fault_in_user_writeable(u32 __user *uaddr) /** * futex_top_waiter() - Return the highest priority waiter on a futex - * @hb: the hash bucket the futex_q's reside in - * @key: the futex key (to distinguish it from other futex futex_q's) + * @hb: the hash bucket the futex_q's reside in + * @key: the futex key (to distinguish it from other futex futex_q's) * * Must be called with the hb lock held. */ @@ -588,7 +589,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, } /** - * futex_lock_pi_atomic() - atomic work required to acquire a pi aware futex + * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex * @uaddr: the pi futex user address * @hb: the pi futex hash bucket * @key: the futex key associated with uaddr and hb @@ -915,8 +916,8 @@ retry: hb1 = hash_futex(&key1); hb2 = hash_futex(&key2); - double_lock_hb(hb1, hb2); retry_private: + double_lock_hb(hb1, hb2); op_ret = futex_atomic_op_inuser(op, uaddr2); if (unlikely(op_ret < 0)) { @@ -1011,9 +1012,9 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1, /** * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue - * q: the futex_q - * key: the key of the requeue target futex - * hb: the hash_bucket of the requeue target futex + * @q: the futex_q + * @key: the key of the requeue target futex + * @hb: the hash_bucket of the requeue target futex * * During futex_requeue, with requeue_pi=1, it is possible to acquire the * target futex if it is uncontended or via a lock steal. Set the futex_q key @@ -1350,6 +1351,25 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q) return hb; } +static inline void +queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb) +{ + spin_unlock(&hb->lock); + drop_futex_key_refs(&q->key); +} + +/** + * queue_me() - Enqueue the futex_q on the futex_hash_bucket + * @q: The futex_q to enqueue + * @hb: The destination hash bucket + * + * The hb->lock must be held by the caller, and is released here. A call to + * queue_me() is typically paired with exactly one call to unqueue_me(). The + * exceptions involve the PI related operations, which may use unqueue_me_pi() + * or nothing if the unqueue is done as part of the wake process and the unqueue + * state is implicit in the state of woken task (see futex_wait_requeue_pi() for + * an example). + */ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) { int prio; @@ -1373,19 +1393,17 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) spin_unlock(&hb->lock); } -static inline void -queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb) -{ - spin_unlock(&hb->lock); - drop_futex_key_refs(&q->key); -} - -/* - * queue_me and unqueue_me must be called as a pair, each - * exactly once. They are called with the hashed spinlock held. +/** + * unqueue_me() - Remove the futex_q from its futex_hash_bucket + * @q: The futex_q to unqueue + * + * The q->lock_ptr must not be held by the caller. A call to unqueue_me() must + * be paired with exactly one earlier call to queue_me(). + * + * Returns: + * 1 - if the futex_q was still queued (and we removed unqueued it) + * 0 - if the futex_q was already removed by the waking thread */ - -/* Return 1 if we were still queued (ie. 0 means we were woken) */ static int unqueue_me(struct futex_q *q) { spinlock_t *lock_ptr; @@ -1638,17 +1656,14 @@ out: static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, struct hrtimer_sleeper *timeout) { - queue_me(q, hb); - /* - * There might have been scheduling since the queue_me(), as we - * cannot hold a spinlock across the get_user() in case it - * faults, and we cannot just set TASK_INTERRUPTIBLE state when - * queueing ourselves into the futex hash. This code thus has to - * rely on the futex_wake() code removing us from hash when it - * wakes us up. + * The task state is guaranteed to be set before another task can + * wake it. set_current_state() is implemented using set_mb() and + * queue_me() calls spin_unlock() upon completion, both serializing + * access to the hash list and forcing another memory barrier. */ set_current_state(TASK_INTERRUPTIBLE); + queue_me(q, hb); /* Arm the timer */ if (timeout) { @@ -1658,8 +1673,8 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, } /* - * !plist_node_empty() is safe here without any lock. - * q.lock_ptr != 0 is not safe, because of ordering against wakeup. + * If we have been removed from the hash list, then another task + * has tried to wake us, and we can skip the call to schedule(). */ if (likely(!plist_node_empty(&q->list))) { /* @@ -2102,7 +2117,6 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, * Unqueue the futex_q and determine which it was. */ plist_del(&q->list, &q->list.plist); - drop_futex_key_refs(&q->key); if (timeout && !timeout->task) ret = -ETIMEDOUT; @@ -2114,12 +2128,12 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, /** * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2 - * @uaddr: the futex we initialyl wait on (non-pi) + * @uaddr: the futex we initially wait on (non-pi) * @fshared: whether the futexes are shared (1) or not (0). They must be * the same type, no requeueing from private to shared, etc. * @val: the expected value of uaddr * @abs_time: absolute timeout - * @bitset: 32 bit wakeup bitset set by userspace, defaults to all. + * @bitset: 32 bit wakeup bitset set by userspace, defaults to all * @clockrt: whether to use CLOCK_REALTIME (1) or CLOCK_MONOTONIC (0) * @uaddr2: the pi futex we will take prior to returning to user-space * @@ -2246,7 +2260,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, res = fixup_owner(uaddr2, fshared, &q, !ret); /* * If fixup_owner() returned an error, proprogate that. If it - * acquired the lock, clear our -ETIMEDOUT or -EINTR. + * acquired the lock, clear -ETIMEDOUT or -EINTR. */ if (res) ret = (res < 0) ? res : 0; @@ -2302,9 +2316,9 @@ out: */ /** - * sys_set_robust_list - set the robust-futex list head of a task - * @head: pointer to the list-head - * @len: length of the list-head, as userspace expects + * sys_set_robust_list() - Set the robust-futex list head of a task + * @head: pointer to the list-head + * @len: length of the list-head, as userspace expects */ SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head, size_t, len) @@ -2323,10 +2337,10 @@ SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head, } /** - * sys_get_robust_list - get the robust-futex list head of a task - * @pid: pid of the process [zero for current task] - * @head_ptr: pointer to a list-head pointer, the kernel fills it in - * @len_ptr: pointer to a length field, the kernel fills in the header size + * sys_get_robust_list() - Get the robust-futex list head of a task + * @pid: pid of the process [zero for current task] + * @head_ptr: pointer to a list-head pointer, the kernel fills it in + * @len_ptr: pointer to a length field, the kernel fills in the header size */ SYSCALL_DEFINE3(get_robust_list, int, pid, struct robust_list_head __user * __user *, head_ptr, diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index 22e9dcf..70a298d 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig @@ -34,7 +34,7 @@ config GCOV_KERNEL config GCOV_PROFILE_ALL bool "Profile entire Kernel" depends on GCOV_KERNEL - depends on S390 || X86 + depends on S390 || X86 || (PPC && EXPERIMENTAL) || MICROBLAZE default n ---help--- This options activates profiling for the entire kernel. diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 05071bf..3e1c36e 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -48,36 +48,7 @@ #include <asm/uaccess.h> -/** - * ktime_get - get the monotonic time in ktime_t format - * - * returns the time in ktime_t format - */ -ktime_t ktime_get(void) -{ - struct timespec now; - - ktime_get_ts(&now); - - return timespec_to_ktime(now); -} -EXPORT_SYMBOL_GPL(ktime_get); - -/** - * ktime_get_real - get the real (wall-) time in ktime_t format - * - * returns the time in ktime_t format - */ -ktime_t ktime_get_real(void) -{ - struct timespec now; - - getnstimeofday(&now); - - return timespec_to_ktime(now); -} - -EXPORT_SYMBOL_GPL(ktime_get_real); +#include <trace/events/timer.h> /* * The timer bases: @@ -106,31 +77,6 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = } }; -/** - * ktime_get_ts - get the monotonic clock in timespec format - * @ts: pointer to timespec variable - * - * The function calculates the monotonic clock from the realtime - * clock and the wall_to_monotonic offset and stores the result - * in normalized timespec format in the variable pointed to by @ts. - */ -void ktime_get_ts(struct timespec *ts) -{ - struct timespec tomono; - unsigned long seq; - - do { - seq = read_seqbegin(&xtime_lock); - getnstimeofday(ts); - tomono = wall_to_monotonic; - - } while (read_seqretry(&xtime_lock, seq)); - - set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec, - ts->tv_nsec + tomono.tv_nsec); -} -EXPORT_SYMBOL_GPL(ktime_get_ts); - /* * Get the coarse grained time at the softirq based on xtime and * wall_to_monotonic. @@ -498,6 +444,26 @@ static inline void debug_hrtimer_activate(struct hrtimer *timer) { } static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { } #endif +static inline void +debug_init(struct hrtimer *timer, clockid_t clockid, + enum hrtimer_mode mode) +{ + debug_hrtimer_init(timer); + trace_hrtimer_init(timer, clockid, mode); +} + +static inline void debug_activate(struct hrtimer *timer) +{ + debug_hrtimer_activate(timer); + trace_hrtimer_start(timer); +} + +static inline void debug_deactivate(struct hrtimer *timer) +{ + debug_hrtimer_deactivate(timer); + trace_hrtimer_cancel(timer); +} + /* High resolution timer related functions */ #ifdef CONFIG_HIGH_RES_TIMERS @@ -543,13 +509,14 @@ static inline int hrtimer_hres_active(void) * next event * Called with interrupts disabled and base->lock held */ -static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base) +static void +hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) { int i; struct hrtimer_clock_base *base = cpu_base->clock_base; - ktime_t expires; + ktime_t expires, expires_next; - cpu_base->expires_next.tv64 = KTIME_MAX; + expires_next.tv64 = KTIME_MAX; for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { struct hrtimer *timer; @@ -565,10 +532,15 @@ static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base) */ if (expires.tv64 < 0) expires.tv64 = 0; - if (expires.tv64 < cpu_base->expires_next.tv64) - cpu_base->expires_next = expires; + if (expires.tv64 < expires_next.tv64) + expires_next = expires; } + if (skip_equal && expires_next.tv64 == cpu_base->expires_next.tv64) + return; + + cpu_base->expires_next.tv64 = expires_next.tv64; + if (cpu_base->expires_next.tv64 != KTIME_MAX) tick_program_event(cpu_base->expires_next, 1); } @@ -651,7 +623,7 @@ static void retrigger_next_event(void *arg) base->clock_base[CLOCK_REALTIME].offset = timespec_to_ktime(realtime_offset); - hrtimer_force_reprogram(base); + hrtimer_force_reprogram(base, 0); spin_unlock(&base->lock); } @@ -754,8 +726,6 @@ static int hrtimer_switch_to_hres(void) /* "Retrigger" the interrupt to get things going */ retrigger_next_event(NULL); local_irq_restore(flags); - printk(KERN_DEBUG "Switched to high resolution mode on CPU %d\n", - smp_processor_id()); return 1; } @@ -764,7 +734,8 @@ static int hrtimer_switch_to_hres(void) static inline int hrtimer_hres_active(void) { return 0; } static inline int hrtimer_is_hres_enabled(void) { return 0; } static inline int hrtimer_switch_to_hres(void) { return 0; } -static inline void hrtimer_force_reprogram(struct hrtimer_cpu_base *base) { } +static inline void +hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { } static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, struct hrtimer_clock_base *base, int wakeup) @@ -854,7 +825,7 @@ static int enqueue_hrtimer(struct hrtimer *timer, struct hrtimer *entry; int leftmost = 1; - debug_hrtimer_activate(timer); + debug_activate(timer); /* * Find the right place in the rbtree: @@ -907,19 +878,29 @@ static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, unsigned long newstate, int reprogram) { - if (timer->state & HRTIMER_STATE_ENQUEUED) { - /* - * Remove the timer from the rbtree and replace the - * first entry pointer if necessary. - */ - if (base->first == &timer->node) { - base->first = rb_next(&timer->node); - /* Reprogram the clock event device. if enabled */ - if (reprogram && hrtimer_hres_active()) - hrtimer_force_reprogram(base->cpu_base); + if (!(timer->state & HRTIMER_STATE_ENQUEUED)) + goto out; + + /* + * Remove the timer from the rbtree and replace the first + * entry pointer if necessary. + */ + if (base->first == &timer->node) { + base->first = rb_next(&timer->node); +#ifdef CONFIG_HIGH_RES_TIMERS + /* Reprogram the clock event device. if enabled */ + if (reprogram && hrtimer_hres_active()) { + ktime_t expires; + + expires = ktime_sub(hrtimer_get_expires(timer), + base->offset); + if (base->cpu_base->expires_next.tv64 == expires.tv64) + hrtimer_force_reprogram(base->cpu_base, 1); } - rb_erase(&timer->node, &base->active); +#endif } + rb_erase(&timer->node, &base->active); +out: timer->state = newstate; } @@ -940,7 +921,7 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base) * reprogramming happens in the interrupt handler. This is a * rare case and less expensive than a smp call. */ - debug_hrtimer_deactivate(timer); + debug_deactivate(timer); timer_stats_hrtimer_clear_start_info(timer); reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases); __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, @@ -1155,7 +1136,6 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, clock_id = CLOCK_MONOTONIC; timer->base = &cpu_base->clock_base[clock_id]; - INIT_LIST_HEAD(&timer->cb_entry); hrtimer_init_timer_hres(timer); #ifdef CONFIG_TIMER_STATS @@ -1174,7 +1154,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, enum hrtimer_mode mode) { - debug_hrtimer_init(timer); + debug_init(timer, clock_id, mode); __hrtimer_init(timer, clock_id, mode); } EXPORT_SYMBOL_GPL(hrtimer_init); @@ -1198,7 +1178,7 @@ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp) } EXPORT_SYMBOL_GPL(hrtimer_get_res); -static void __run_hrtimer(struct hrtimer *timer) +static void __run_hrtimer(struct hrtimer *timer, ktime_t *now) { struct hrtimer_clock_base *base = timer->base; struct hrtimer_cpu_base *cpu_base = base->cpu_base; @@ -1207,7 +1187,7 @@ static void __run_hrtimer(struct hrtimer *timer) WARN_ON(!irqs_disabled()); - debug_hrtimer_deactivate(timer); + debug_deactivate(timer); __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0); timer_stats_account_hrtimer(timer); fn = timer->function; @@ -1218,7 +1198,9 @@ static void __run_hrtimer(struct hrtimer *timer) * the timer base. */ spin_unlock(&cpu_base->lock); + trace_hrtimer_expire_entry(timer, now); restart = fn(timer); + trace_hrtimer_expire_exit(timer); spin_lock(&cpu_base->lock); /* @@ -1329,7 +1311,7 @@ void hrtimer_interrupt(struct clock_event_device *dev) break; } - __run_hrtimer(timer); + __run_hrtimer(timer, &basenow); } base++; } @@ -1451,7 +1433,7 @@ void hrtimer_run_queues(void) hrtimer_get_expires_tv64(timer)) break; - __run_hrtimer(timer); + __run_hrtimer(timer, &base->softirq_time); } spin_unlock(&cpu_base->lock); } @@ -1628,7 +1610,7 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, while ((node = rb_first(&old_base->active))) { timer = rb_entry(node, struct hrtimer, node); BUG_ON(hrtimer_callback_running(timer)); - debug_hrtimer_deactivate(timer); + debug_deactivate(timer); /* * Mark it as STATE_MIGRATE not INACTIVE otherwise the diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 022a492..d4e8417 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -171,12 +171,12 @@ static unsigned long timeout_jiffies(unsigned long timeout) * Process updating of timeout sysctl */ int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, + void __user *buffer, size_t *lenp, loff_t *ppos) { int ret; - ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos); + ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); if (ret || !write) goto out; diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index a81cf80..17c71bb 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -11,6 +11,7 @@ */ #include <linux/irq.h> +#include <linux/sched.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/random.h> diff --git a/kernel/itimer.c b/kernel/itimer.c index 58762f7..b03451e 100644 --- a/kernel/itimer.c +++ b/kernel/itimer.c @@ -12,6 +12,7 @@ #include <linux/time.h> #include <linux/posix-timers.h> #include <linux/hrtimer.h> +#include <trace/events/timer.h> #include <asm/uaccess.h> @@ -41,10 +42,43 @@ static struct timeval itimer_get_remtime(struct hrtimer *timer) return ktime_to_timeval(rem); } +static void get_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, + struct itimerval *const value) +{ + cputime_t cval, cinterval; + struct cpu_itimer *it = &tsk->signal->it[clock_id]; + + spin_lock_irq(&tsk->sighand->siglock); + + cval = it->expires; + cinterval = it->incr; + if (!cputime_eq(cval, cputime_zero)) { + struct task_cputime cputime; + cputime_t t; + + thread_group_cputimer(tsk, &cputime); + if (clock_id == CPUCLOCK_PROF) + t = cputime_add(cputime.utime, cputime.stime); + else + /* CPUCLOCK_VIRT */ + t = cputime.utime; + + if (cputime_le(cval, t)) + /* about to fire */ + cval = cputime_one_jiffy; + else + cval = cputime_sub(cval, t); + } + + spin_unlock_irq(&tsk->sighand->siglock); + + cputime_to_timeval(cval, &value->it_value); + cputime_to_timeval(cinterval, &value->it_interval); +} + int do_getitimer(int which, struct itimerval *value) { struct task_struct *tsk = current; - cputime_t cinterval, cval; switch (which) { case ITIMER_REAL: @@ -55,44 +89,10 @@ int do_getitimer(int which, struct itimerval *value) spin_unlock_irq(&tsk->sighand->siglock); break; case ITIMER_VIRTUAL: - spin_lock_irq(&tsk->sighand->siglock); - cval = tsk->signal->it_virt_expires; - cinterval = tsk->signal->it_virt_incr; - if (!cputime_eq(cval, cputime_zero)) { - struct task_cputime cputime; - cputime_t utime; - - thread_group_cputimer(tsk, &cputime); - utime = cputime.utime; - if (cputime_le(cval, utime)) { /* about to fire */ - cval = jiffies_to_cputime(1); - } else { - cval = cputime_sub(cval, utime); - } - } - spin_unlock_irq(&tsk->sighand->siglock); - cputime_to_timeval(cval, &value->it_value); - cputime_to_timeval(cinterval, &value->it_interval); + get_cpu_itimer(tsk, CPUCLOCK_VIRT, value); break; case ITIMER_PROF: - spin_lock_irq(&tsk->sighand->siglock); - cval = tsk->signal->it_prof_expires; - cinterval = tsk->signal->it_prof_incr; - if (!cputime_eq(cval, cputime_zero)) { - struct task_cputime times; - cputime_t ptime; - - thread_group_cputimer(tsk, ×); - ptime = cputime_add(times.utime, times.stime); - if (cputime_le(cval, ptime)) { /* about to fire */ - cval = jiffies_to_cputime(1); - } else { - cval = cputime_sub(cval, ptime); - } - } - spin_unlock_irq(&tsk->sighand->siglock); - cputime_to_timeval(cval, &value->it_value); - cputime_to_timeval(cinterval, &value->it_interval); + get_cpu_itimer(tsk, CPUCLOCK_PROF, value); break; default: return(-EINVAL); @@ -123,11 +123,62 @@ enum hrtimer_restart it_real_fn(struct hrtimer *timer) struct signal_struct *sig = container_of(timer, struct signal_struct, real_timer); + trace_itimer_expire(ITIMER_REAL, sig->leader_pid, 0); kill_pid_info(SIGALRM, SEND_SIG_PRIV, sig->leader_pid); return HRTIMER_NORESTART; } +static inline u32 cputime_sub_ns(cputime_t ct, s64 real_ns) +{ + struct timespec ts; + s64 cpu_ns; + + cputime_to_timespec(ct, &ts); + cpu_ns = timespec_to_ns(&ts); + + return (cpu_ns <= real_ns) ? 0 : cpu_ns - real_ns; +} + +static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, + const struct itimerval *const value, + struct itimerval *const ovalue) +{ + cputime_t cval, nval, cinterval, ninterval; + s64 ns_ninterval, ns_nval; + struct cpu_itimer *it = &tsk->signal->it[clock_id]; + + nval = timeval_to_cputime(&value->it_value); + ns_nval = timeval_to_ns(&value->it_value); + ninterval = timeval_to_cputime(&value->it_interval); + ns_ninterval = timeval_to_ns(&value->it_interval); + + it->incr_error = cputime_sub_ns(ninterval, ns_ninterval); + it->error = cputime_sub_ns(nval, ns_nval); + + spin_lock_irq(&tsk->sighand->siglock); + + cval = it->expires; + cinterval = it->incr; + if (!cputime_eq(cval, cputime_zero) || + !cputime_eq(nval, cputime_zero)) { + if (cputime_gt(nval, cputime_zero)) + nval = cputime_add(nval, cputime_one_jiffy); + set_process_cpu_timer(tsk, clock_id, &nval, &cval); + } + it->expires = nval; + it->incr = ninterval; + trace_itimer_state(clock_id == CPUCLOCK_VIRT ? + ITIMER_VIRTUAL : ITIMER_PROF, value, nval); + + spin_unlock_irq(&tsk->sighand->siglock); + + if (ovalue) { + cputime_to_timeval(cval, &ovalue->it_value); + cputime_to_timeval(cinterval, &ovalue->it_interval); + } +} + /* * Returns true if the timeval is in canonical form */ @@ -139,7 +190,6 @@ int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue) struct task_struct *tsk = current; struct hrtimer *timer; ktime_t expires; - cputime_t cval, cinterval, nval, ninterval; /* * Validate the timevals in value. @@ -171,51 +221,14 @@ again: } else tsk->signal->it_real_incr.tv64 = 0; + trace_itimer_state(ITIMER_REAL, value, 0); spin_unlock_irq(&tsk->sighand->siglock); break; case ITIMER_VIRTUAL: - nval = timeval_to_cputime(&value->it_value); - ninterval = timeval_to_cputime(&value->it_interval); - spin_lock_irq(&tsk->sighand->siglock); - cval = tsk->signal->it_virt_expires; - cinterval = tsk->signal->it_virt_incr; - if (!cputime_eq(cval, cputime_zero) || - !cputime_eq(nval, cputime_zero)) { - if (cputime_gt(nval, cputime_zero)) - nval = cputime_add(nval, - jiffies_to_cputime(1)); - set_process_cpu_timer(tsk, CPUCLOCK_VIRT, - &nval, &cval); - } - tsk->signal->it_virt_expires = nval; - tsk->signal->it_virt_incr = ninterval; - spin_unlock_irq(&tsk->sighand->siglock); - if (ovalue) { - cputime_to_timeval(cval, &ovalue->it_value); - cputime_to_timeval(cinterval, &ovalue->it_interval); - } + set_cpu_itimer(tsk, CPUCLOCK_VIRT, value, ovalue); break; case ITIMER_PROF: - nval = timeval_to_cputime(&value->it_value); - ninterval = timeval_to_cputime(&value->it_interval); - spin_lock_irq(&tsk->sighand->siglock); - cval = tsk->signal->it_prof_expires; - cinterval = tsk->signal->it_prof_incr; - if (!cputime_eq(cval, cputime_zero) || - !cputime_eq(nval, cputime_zero)) { - if (cputime_gt(nval, cputime_zero)) - nval = cputime_add(nval, - jiffies_to_cputime(1)); - set_process_cpu_timer(tsk, CPUCLOCK_PROF, - &nval, &cval); - } - tsk->signal->it_prof_expires = nval; - tsk->signal->it_prof_incr = ninterval; - spin_unlock_irq(&tsk->sighand->siglock); - if (ovalue) { - cputime_to_timeval(cval, &ovalue->it_value); - cputime_to_timeval(cinterval, &ovalue->it_interval); - } + set_cpu_itimer(tsk, CPUCLOCK_PROF, value, ovalue); break; default: return -EINVAL; diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 3a29dbe..8b6b8b6 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -59,7 +59,8 @@ static inline int is_kernel_inittext(unsigned long addr) static inline int is_kernel_text(unsigned long addr) { - if (addr >= (unsigned long)_stext && addr <= (unsigned long)_etext) + if ((addr >= (unsigned long)_stext && addr <= (unsigned long)_etext) || + arch_is_kernel_text(addr)) return 1; return in_gate_area_no_task(addr); } diff --git a/kernel/kfifo.c b/kernel/kfifo.c index 26539e3..3765ff3 100644 --- a/kernel/kfifo.c +++ b/kernel/kfifo.c @@ -117,7 +117,7 @@ EXPORT_SYMBOL(kfifo_free); * writer, you don't need extra locking to use these functions. */ unsigned int __kfifo_put(struct kfifo *fifo, - unsigned char *buffer, unsigned int len) + const unsigned char *buffer, unsigned int len) { unsigned int l; diff --git a/kernel/kprobes.c b/kernel/kprobes.c index ef177d6..5240d75 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1321,7 +1321,7 @@ static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v) return 0; } -static struct seq_operations kprobes_seq_ops = { +static const struct seq_operations kprobes_seq_ops = { .start = kprobe_seq_start, .next = kprobe_seq_next, .stop = kprobe_seq_stop, @@ -1333,7 +1333,7 @@ static int __kprobes kprobes_open(struct inode *inode, struct file *filp) return seq_open(filp, &kprobes_seq_ops); } -static struct file_operations debugfs_kprobes_operations = { +static const struct file_operations debugfs_kprobes_operations = { .open = kprobes_open, .read = seq_read, .llseek = seq_lseek, @@ -1515,7 +1515,7 @@ static ssize_t write_enabled_file_bool(struct file *file, return count; } -static struct file_operations fops_kp = { +static const struct file_operations fops_kp = { .read = read_enabled_file_bool, .write = write_enabled_file_bool, }; diff --git a/kernel/lockdep.c b/kernel/lockdep.c index f74d2d7..9af5672 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -142,6 +142,11 @@ static inline struct lock_class *hlock_class(struct held_lock *hlock) #ifdef CONFIG_LOCK_STAT static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], lock_stats); +static inline u64 lockstat_clock(void) +{ + return cpu_clock(smp_processor_id()); +} + static int lock_point(unsigned long points[], unsigned long ip) { int i; @@ -158,7 +163,7 @@ static int lock_point(unsigned long points[], unsigned long ip) return i; } -static void lock_time_inc(struct lock_time *lt, s64 time) +static void lock_time_inc(struct lock_time *lt, u64 time) { if (time > lt->max) lt->max = time; @@ -234,12 +239,12 @@ static void put_lock_stats(struct lock_class_stats *stats) static void lock_release_holdtime(struct held_lock *hlock) { struct lock_class_stats *stats; - s64 holdtime; + u64 holdtime; if (!lock_stat) return; - holdtime = sched_clock() - hlock->holdtime_stamp; + holdtime = lockstat_clock() - hlock->holdtime_stamp; stats = get_lock_stats(hlock_class(hlock)); if (hlock->read) @@ -578,6 +583,9 @@ static int static_obj(void *obj) if ((addr >= start) && (addr < end)) return 1; + if (arch_is_kernel_data(addr)) + return 1; + #ifdef CONFIG_SMP /* * percpu var? @@ -2789,7 +2797,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, hlock->references = references; #ifdef CONFIG_LOCK_STAT hlock->waittime_stamp = 0; - hlock->holdtime_stamp = sched_clock(); + hlock->holdtime_stamp = lockstat_clock(); #endif if (check == 2 && !mark_irqflags(curr, hlock)) @@ -3319,7 +3327,7 @@ found_it: if (hlock->instance != lock) return; - hlock->waittime_stamp = sched_clock(); + hlock->waittime_stamp = lockstat_clock(); contention_point = lock_point(hlock_class(hlock)->contention_point, ip); contending_point = lock_point(hlock_class(hlock)->contending_point, @@ -3342,8 +3350,7 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip) struct held_lock *hlock, *prev_hlock; struct lock_class_stats *stats; unsigned int depth; - u64 now; - s64 waittime = 0; + u64 now, waittime = 0; int i, cpu; depth = curr->lockdep_depth; @@ -3371,7 +3378,7 @@ found_it: cpu = smp_processor_id(); if (hlock->waittime_stamp) { - now = sched_clock(); + now = lockstat_clock(); waittime = now - hlock->waittime_stamp; hlock->holdtime_stamp = now; } diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index d4b3dbc..d4aba4f 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c @@ -594,7 +594,7 @@ static int ls_show(struct seq_file *m, void *v) return 0; } -static struct seq_operations lockstat_ops = { +static const struct seq_operations lockstat_ops = { .start = ls_start, .next = ls_next, .stop = ls_stop, diff --git a/kernel/marker.c b/kernel/marker.c deleted file mode 100644 index ea54f26..0000000 --- a/kernel/marker.c +++ /dev/null @@ -1,930 +0,0 @@ -/* - * Copyright (C) 2007 Mathieu Desnoyers - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - */ -#include <linux/module.h> -#include <linux/mutex.h> -#include <linux/types.h> -#include <linux/jhash.h> -#include <linux/list.h> -#include <linux/rcupdate.h> -#include <linux/marker.h> -#include <linux/err.h> -#include <linux/slab.h> - -extern struct marker __start___markers[]; -extern struct marker __stop___markers[]; - -/* Set to 1 to enable marker debug output */ -static const int marker_debug; - -/* - * markers_mutex nests inside module_mutex. Markers mutex protects the builtin - * and module markers and the hash table. - */ -static DEFINE_MUTEX(markers_mutex); - -/* - * Marker hash table, containing the active markers. - * Protected by module_mutex. - */ -#define MARKER_HASH_BITS 6 -#define MARKER_TABLE_SIZE (1 << MARKER_HASH_BITS) -static struct hlist_head marker_table[MARKER_TABLE_SIZE]; - -/* - * Note about RCU : - * It is used to make sure every handler has finished using its private data - * between two consecutive operation (add or remove) on a given marker. It is - * also used to delay the free of multiple probes array until a quiescent state - * is reached. - * marker entries modifications are protected by the markers_mutex. - */ -struct marker_entry { - struct hlist_node hlist; - char *format; - /* Probe wrapper */ - void (*call)(const struct marker *mdata, void *call_private, ...); - struct marker_probe_closure single; - struct marker_probe_closure *multi; - int refcount; /* Number of times armed. 0 if disarmed. */ - struct rcu_head rcu; - void *oldptr; - int rcu_pending; - unsigned char ptype:1; - unsigned char format_allocated:1; - char name[0]; /* Contains name'\0'format'\0' */ -}; - -/** - * __mark_empty_function - Empty probe callback - * @probe_private: probe private data - * @call_private: call site private data - * @fmt: format string - * @...: variable argument list - * - * Empty callback provided as a probe to the markers. By providing this to a - * disabled marker, we make sure the execution flow is always valid even - * though the function pointer change and the marker enabling are two distinct - * operations that modifies the execution flow of preemptible code. - */ -notrace void __mark_empty_function(void *probe_private, void *call_private, - const char *fmt, va_list *args) -{ -} -EXPORT_SYMBOL_GPL(__mark_empty_function); - -/* - * marker_probe_cb Callback that prepares the variable argument list for probes. - * @mdata: pointer of type struct marker - * @call_private: caller site private data - * @...: Variable argument list. - * - * Since we do not use "typical" pointer based RCU in the 1 argument case, we - * need to put a full smp_rmb() in this branch. This is why we do not use - * rcu_dereference() for the pointer read. - */ -notrace void marker_probe_cb(const struct marker *mdata, - void *call_private, ...) -{ - va_list args; - char ptype; - - /* - * rcu_read_lock_sched does two things : disabling preemption to make - * sure the teardown of the callbacks can be done correctly when they - * are in modules and they insure RCU read coherency. - */ - rcu_read_lock_sched_notrace(); - ptype = mdata->ptype; - if (likely(!ptype)) { - marker_probe_func *func; - /* Must read the ptype before ptr. They are not data dependant, - * so we put an explicit smp_rmb() here. */ - smp_rmb(); - func = mdata->single.func; - /* Must read the ptr before private data. They are not data - * dependant, so we put an explicit smp_rmb() here. */ - smp_rmb(); - va_start(args, call_private); - func(mdata->single.probe_private, call_private, mdata->format, - &args); - va_end(args); - } else { - struct marker_probe_closure *multi; - int i; - /* - * Read mdata->ptype before mdata->multi. - */ - smp_rmb(); - multi = mdata->multi; - /* - * multi points to an array, therefore accessing the array - * depends on reading multi. However, even in this case, - * we must insure that the pointer is read _before_ the array - * data. Same as rcu_dereference, but we need a full smp_rmb() - * in the fast path, so put the explicit barrier here. - */ - smp_read_barrier_depends(); - for (i = 0; multi[i].func; i++) { - va_start(args, call_private); - multi[i].func(multi[i].probe_private, call_private, - mdata->format, &args); - va_end(args); - } - } - rcu_read_unlock_sched_notrace(); -} -EXPORT_SYMBOL_GPL(marker_probe_cb); - -/* - * marker_probe_cb Callback that does not prepare the variable argument list. - * @mdata: pointer of type struct marker - * @call_private: caller site private data - * @...: Variable argument list. - * - * Should be connected to markers "MARK_NOARGS". - */ -static notrace void marker_probe_cb_noarg(const struct marker *mdata, - void *call_private, ...) -{ - va_list args; /* not initialized */ - char ptype; - - rcu_read_lock_sched_notrace(); - ptype = mdata->ptype; - if (likely(!ptype)) { - marker_probe_func *func; - /* Must read the ptype before ptr. They are not data dependant, - * so we put an explicit smp_rmb() here. */ - smp_rmb(); - func = mdata->single.func; - /* Must read the ptr before private data. They are not data - * dependant, so we put an explicit smp_rmb() here. */ - smp_rmb(); - func(mdata->single.probe_private, call_private, mdata->format, - &args); - } else { - struct marker_probe_closure *multi; - int i; - /* - * Read mdata->ptype before mdata->multi. - */ - smp_rmb(); - multi = mdata->multi; - /* - * multi points to an array, therefore accessing the array - * depends on reading multi. However, even in this case, - * we must insure that the pointer is read _before_ the array - * data. Same as rcu_dereference, but we need a full smp_rmb() - * in the fast path, so put the explicit barrier here. - */ - smp_read_barrier_depends(); - for (i = 0; multi[i].func; i++) - multi[i].func(multi[i].probe_private, call_private, - mdata->format, &args); - } - rcu_read_unlock_sched_notrace(); -} - -static void free_old_closure(struct rcu_head *head) -{ - struct marker_entry *entry = container_of(head, - struct marker_entry, rcu); - kfree(entry->oldptr); - /* Make sure we free the data before setting the pending flag to 0 */ - smp_wmb(); - entry->rcu_pending = 0; -} - -static void debug_print_probes(struct marker_entry *entry) -{ - int i; - - if (!marker_debug) - return; - - if (!entry->ptype) { - printk(KERN_DEBUG "Single probe : %p %p\n", - entry->single.func, - entry->single.probe_private); - } else { - for (i = 0; entry->multi[i].func; i++) - printk(KERN_DEBUG "Multi probe %d : %p %p\n", i, - entry->multi[i].func, - entry->multi[i].probe_private); - } -} - -static struct marker_probe_closure * -marker_entry_add_probe(struct marker_entry *entry, - marker_probe_func *probe, void *probe_private) -{ - int nr_probes = 0; - struct marker_probe_closure *old, *new; - - WARN_ON(!probe); - - debug_print_probes(entry); - old = entry->multi; - if (!entry->ptype) { - if (entry->single.func == probe && - entry->single.probe_private == probe_private) - return ERR_PTR(-EBUSY); - if (entry->single.func == __mark_empty_function) { - /* 0 -> 1 probes */ - entry->single.func = probe; - entry->single.probe_private = probe_private; - entry->refcount = 1; - entry->ptype = 0; - debug_print_probes(entry); - return NULL; - } else { - /* 1 -> 2 probes */ - nr_probes = 1; - old = NULL; - } - } else { - /* (N -> N+1), (N != 0, 1) probes */ - for (nr_probes = 0; old[nr_probes].func; nr_probes++) - if (old[nr_probes].func == probe - && old[nr_probes].probe_private - == probe_private) - return ERR_PTR(-EBUSY); - } - /* + 2 : one for new probe, one for NULL func */ - new = kzalloc((nr_probes + 2) * sizeof(struct marker_probe_closure), - GFP_KERNEL); - if (new == NULL) - return ERR_PTR(-ENOMEM); - if (!old) - new[0] = entry->single; - else - memcpy(new, old, - nr_probes * sizeof(struct marker_probe_closure)); - new[nr_probes].func = probe; - new[nr_probes].probe_private = probe_private; - entry->refcount = nr_probes + 1; - entry->multi = new; - entry->ptype = 1; - debug_print_probes(entry); - return old; -} - -static struct marker_probe_closure * -marker_entry_remove_probe(struct marker_entry *entry, - marker_probe_func *probe, void *probe_private) -{ - int nr_probes = 0, nr_del = 0, i; - struct marker_probe_closure *old, *new; - - old = entry->multi; - - debug_print_probes(entry); - if (!entry->ptype) { - /* 0 -> N is an error */ - WARN_ON(entry->single.func == __mark_empty_function); - /* 1 -> 0 probes */ - WARN_ON(probe && entry->single.func != probe); - WARN_ON(entry->single.probe_private != probe_private); - entry->single.func = __mark_empty_function; - entry->refcount = 0; - entry->ptype = 0; - debug_print_probes(entry); - return NULL; - } else { - /* (N -> M), (N > 1, M >= 0) probes */ - for (nr_probes = 0; old[nr_probes].func; nr_probes++) { - if ((!probe || old[nr_probes].func == probe) - && old[nr_probes].probe_private - == probe_private) - nr_del++; - } - } - - if (nr_probes - nr_del == 0) { - /* N -> 0, (N > 1) */ - entry->single.func = __mark_empty_function; - entry->refcount = 0; - entry->ptype = 0; - } else if (nr_probes - nr_del == 1) { - /* N -> 1, (N > 1) */ - for (i = 0; old[i].func; i++) - if ((probe && old[i].func != probe) || - old[i].probe_private != probe_private) - entry->single = old[i]; - entry->refcount = 1; - entry->ptype = 0; - } else { - int j = 0; - /* N -> M, (N > 1, M > 1) */ - /* + 1 for NULL */ - new = kzalloc((nr_probes - nr_del + 1) - * sizeof(struct marker_probe_closure), GFP_KERNEL); - if (new == NULL) - return ERR_PTR(-ENOMEM); - for (i = 0; old[i].func; i++) - if ((probe && old[i].func != probe) || - old[i].probe_private != probe_private) - new[j++] = old[i]; - entry->refcount = nr_probes - nr_del; - entry->ptype = 1; - entry->multi = new; - } - debug_print_probes(entry); - return old; -} - -/* - * Get marker if the marker is present in the marker hash table. - * Must be called with markers_mutex held. - * Returns NULL if not present. - */ -static struct marker_entry *get_marker(const char *name) -{ - struct hlist_head *head; - struct hlist_node *node; - struct marker_entry *e; - u32 hash = jhash(name, strlen(name), 0); - - head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)]; - hlist_for_each_entry(e, node, head, hlist) { - if (!strcmp(name, e->name)) - return e; - } - return NULL; -} - -/* - * Add the marker to the marker hash table. Must be called with markers_mutex - * held. - */ -static struct marker_entry *add_marker(const char *name, const char *format) -{ - struct hlist_head *head; - struct hlist_node *node; - struct marker_entry *e; - size_t name_len = strlen(name) + 1; - size_t format_len = 0; - u32 hash = jhash(name, name_len-1, 0); - - if (format) - format_len = strlen(format) + 1; - head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)]; - hlist_for_each_entry(e, node, head, hlist) { - if (!strcmp(name, e->name)) { - printk(KERN_NOTICE - "Marker %s busy\n", name); - return ERR_PTR(-EBUSY); /* Already there */ - } - } - /* - * Using kmalloc here to allocate a variable length element. Could - * cause some memory fragmentation if overused. - */ - e = kmalloc(sizeof(struct marker_entry) + name_len + format_len, - GFP_KERNEL); - if (!e) - return ERR_PTR(-ENOMEM); - memcpy(&e->name[0], name, name_len); - if (format) { - e->format = &e->name[name_len]; - memcpy(e->format, format, format_len); - if (strcmp(e->format, MARK_NOARGS) == 0) - e->call = marker_probe_cb_noarg; - else - e->call = marker_probe_cb; - trace_mark(core_marker_format, "name %s format %s", - e->name, e->format); - } else { - e->format = NULL; - e->call = marker_probe_cb; - } - e->single.func = __mark_empty_function; - e->single.probe_private = NULL; - e->multi = NULL; - e->ptype = 0; - e->format_allocated = 0; - e->refcount = 0; - e->rcu_pending = 0; - hlist_add_head(&e->hlist, head); - return e; -} - -/* - * Remove the marker from the marker hash table. Must be called with mutex_lock - * held. - */ -static int remove_marker(const char *name) -{ - struct hlist_head *head; - struct hlist_node *node; - struct marker_entry *e; - int found = 0; - size_t len = strlen(name) + 1; - u32 hash = jhash(name, len-1, 0); - - head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)]; - hlist_for_each_entry(e, node, head, hlist) { - if (!strcmp(name, e->name)) { - found = 1; - break; - } - } - if (!found) - return -ENOENT; - if (e->single.func != __mark_empty_function) - return -EBUSY; - hlist_del(&e->hlist); - if (e->format_allocated) - kfree(e->format); - /* Make sure the call_rcu has been executed */ - if (e->rcu_pending) - rcu_barrier_sched(); - kfree(e); - return 0; -} - -/* - * Set the mark_entry format to the format found in the element. - */ -static int marker_set_format(struct marker_entry *entry, const char *format) -{ - entry->format = kstrdup(format, GFP_KERNEL); - if (!entry->format) - return -ENOMEM; - entry->format_allocated = 1; - - trace_mark(core_marker_format, "name %s format %s", - entry->name, entry->format); - return 0; -} - -/* - * Sets the probe callback corresponding to one marker. - */ -static int set_marker(struct marker_entry *entry, struct marker *elem, - int active) -{ - int ret = 0; - WARN_ON(strcmp(entry->name, elem->name) != 0); - - if (entry->format) { - if (strcmp(entry->format, elem->format) != 0) { - printk(KERN_NOTICE - "Format mismatch for probe %s " - "(%s), marker (%s)\n", - entry->name, - entry->format, - elem->format); - return -EPERM; - } - } else { - ret = marker_set_format(entry, elem->format); - if (ret) - return ret; - } - - /* - * probe_cb setup (statically known) is done here. It is - * asynchronous with the rest of execution, therefore we only - * pass from a "safe" callback (with argument) to an "unsafe" - * callback (does not set arguments). - */ - elem->call = entry->call; - /* - * Sanity check : - * We only update the single probe private data when the ptr is - * set to a _non_ single probe! (0 -> 1 and N -> 1, N != 1) - */ - WARN_ON(elem->single.func != __mark_empty_function - && elem->single.probe_private != entry->single.probe_private - && !elem->ptype); - elem->single.probe_private = entry->single.probe_private; - /* - * Make sure the private data is valid when we update the - * single probe ptr. - */ - smp_wmb(); - elem->single.func = entry->single.func; - /* - * We also make sure that the new probe callbacks array is consistent - * before setting a pointer to it. - */ - rcu_assign_pointer(elem->multi, entry->multi); - /* - * Update the function or multi probe array pointer before setting the - * ptype. - */ - smp_wmb(); - elem->ptype = entry->ptype; - - if (elem->tp_name && (active ^ elem->state)) { - WARN_ON(!elem->tp_cb); - /* - * It is ok to directly call the probe registration because type - * checking has been done in the __trace_mark_tp() macro. - */ - - if (active) { - /* - * try_module_get should always succeed because we hold - * lock_module() to get the tp_cb address. - */ - ret = try_module_get(__module_text_address( - (unsigned long)elem->tp_cb)); - BUG_ON(!ret); - ret = tracepoint_probe_register_noupdate( - elem->tp_name, - elem->tp_cb); - } else { - ret = tracepoint_probe_unregister_noupdate( - elem->tp_name, - elem->tp_cb); - /* - * tracepoint_probe_update_all() must be called - * before the module containing tp_cb is unloaded. - */ - module_put(__module_text_address( - (unsigned long)elem->tp_cb)); - } - } - elem->state = active; - - return ret; -} - -/* - * Disable a marker and its probe callback. - * Note: only waiting an RCU period after setting elem->call to the empty - * function insures that the original callback is not used anymore. This insured - * by rcu_read_lock_sched around the call site. - */ -static void disable_marker(struct marker *elem) -{ - int ret; - - /* leave "call" as is. It is known statically. */ - if (elem->tp_name && elem->state) { - WARN_ON(!elem->tp_cb); - /* - * It is ok to directly call the probe registration because type - * checking has been done in the __trace_mark_tp() macro. - */ - ret = tracepoint_probe_unregister_noupdate(elem->tp_name, - elem->tp_cb); - WARN_ON(ret); - /* - * tracepoint_probe_update_all() must be called - * before the module containing tp_cb is unloaded. - */ - module_put(__module_text_address((unsigned long)elem->tp_cb)); - } - elem->state = 0; - elem->single.func = __mark_empty_function; - /* Update the function before setting the ptype */ - smp_wmb(); - elem->ptype = 0; /* single probe */ - /* - * Leave the private data and id there, because removal is racy and - * should be done only after an RCU period. These are never used until - * the next initialization anyway. - */ -} - -/** - * marker_update_probe_range - Update a probe range - * @begin: beginning of the range - * @end: end of the range - * - * Updates the probe callback corresponding to a range of markers. - */ -void marker_update_probe_range(struct marker *begin, - struct marker *end) -{ - struct marker *iter; - struct marker_entry *mark_entry; - - mutex_lock(&markers_mutex); - for (iter = begin; iter < end; iter++) { - mark_entry = get_marker(iter->name); - if (mark_entry) { - set_marker(mark_entry, iter, !!mark_entry->refcount); - /* - * ignore error, continue - */ - } else { - disable_marker(iter); - } - } - mutex_unlock(&markers_mutex); -} - -/* - * Update probes, removing the faulty probes. - * - * Internal callback only changed before the first probe is connected to it. - * Single probe private data can only be changed on 0 -> 1 and 2 -> 1 - * transitions. All other transitions will leave the old private data valid. - * This makes the non-atomicity of the callback/private data updates valid. - * - * "special case" updates : - * 0 -> 1 callback - * 1 -> 0 callback - * 1 -> 2 callbacks - * 2 -> 1 callbacks - * Other updates all behave the same, just like the 2 -> 3 or 3 -> 2 updates. - * Site effect : marker_set_format may delete the marker entry (creating a - * replacement). - */ -static void marker_update_probes(void) -{ - /* Core kernel markers */ - marker_update_probe_range(__start___markers, __stop___markers); - /* Markers in modules. */ - module_update_markers(); - tracepoint_probe_update_all(); -} - -/** - * marker_probe_register - Connect a probe to a marker - * @name: marker name - * @format: format string - * @probe: probe handler - * @probe_private: probe private data - * - * private data must be a valid allocated memory address, or NULL. - * Returns 0 if ok, error value on error. - * The probe address must at least be aligned on the architecture pointer size. - */ -int marker_probe_register(const char *name, const char *format, - marker_probe_func *probe, void *probe_private) -{ - struct marker_entry *entry; - int ret = 0; - struct marker_probe_closure *old; - - mutex_lock(&markers_mutex); - entry = get_marker(name); - if (!entry) { - entry = add_marker(name, format); - if (IS_ERR(entry)) - ret = PTR_ERR(entry); - } else if (format) { - if (!entry->format) - ret = marker_set_format(entry, format); - else if (strcmp(entry->format, format)) - ret = -EPERM; - } - if (ret) - goto end; - - /* - * If we detect that a call_rcu is pending for this marker, - * make sure it's executed now. - */ - if (entry->rcu_pending) - rcu_barrier_sched(); - old = marker_entry_add_probe(entry, probe, probe_private); - if (IS_ERR(old)) { - ret = PTR_ERR(old); - goto end; - } - mutex_unlock(&markers_mutex); - marker_update_probes(); - mutex_lock(&markers_mutex); - entry = get_marker(name); - if (!entry) - goto end; - if (entry->rcu_pending) - rcu_barrier_sched(); - entry->oldptr = old; - entry->rcu_pending = 1; - /* write rcu_pending before calling the RCU callback */ - smp_wmb(); - call_rcu_sched(&entry->rcu, free_old_closure); -end: - mutex_unlock(&markers_mutex); - return ret; -} -EXPORT_SYMBOL_GPL(marker_probe_register); - -/** - * marker_probe_unregister - Disconnect a probe from a marker - * @name: marker name - * @probe: probe function pointer - * @probe_private: probe private data - * - * Returns the private data given to marker_probe_register, or an ERR_PTR(). - * We do not need to call a synchronize_sched to make sure the probes have - * finished running before doing a module unload, because the module unload - * itself uses stop_machine(), which insures that every preempt disabled section - * have finished. - */ -int marker_probe_unregister(const char *name, - marker_probe_func *probe, void *probe_private) -{ - struct marker_entry *entry; - struct marker_probe_closure *old; - int ret = -ENOENT; - - mutex_lock(&markers_mutex); - entry = get_marker(name); - if (!entry) - goto end; - if (entry->rcu_pending) - rcu_barrier_sched(); - old = marker_entry_remove_probe(entry, probe, probe_private); - mutex_unlock(&markers_mutex); - marker_update_probes(); - mutex_lock(&markers_mutex); - entry = get_marker(name); - if (!entry) - goto end; - if (entry->rcu_pending) - rcu_barrier_sched(); - entry->oldptr = old; - entry->rcu_pending = 1; - /* write rcu_pending before calling the RCU callback */ - smp_wmb(); - call_rcu_sched(&entry->rcu, free_old_closure); - remove_marker(name); /* Ignore busy error message */ - ret = 0; -end: - mutex_unlock(&markers_mutex); - return ret; -} -EXPORT_SYMBOL_GPL(marker_probe_unregister); - -static struct marker_entry * -get_marker_from_private_data(marker_probe_func *probe, void *probe_private) -{ - struct marker_entry *entry; - unsigned int i; - struct hlist_head *head; - struct hlist_node *node; - - for (i = 0; i < MARKER_TABLE_SIZE; i++) { - head = &marker_table[i]; - hlist_for_each_entry(entry, node, head, hlist) { - if (!entry->ptype) { - if (entry->single.func == probe - && entry->single.probe_private - == probe_private) - return entry; - } else { - struct marker_probe_closure *closure; - closure = entry->multi; - for (i = 0; closure[i].func; i++) { - if (closure[i].func == probe && - closure[i].probe_private - == probe_private) - return entry; - } - } - } - } - return NULL; -} - -/** - * marker_probe_unregister_private_data - Disconnect a probe from a marker - * @probe: probe function - * @probe_private: probe private data - * - * Unregister a probe by providing the registered private data. - * Only removes the first marker found in hash table. - * Return 0 on success or error value. - * We do not need to call a synchronize_sched to make sure the probes have - * finished running before doing a module unload, because the module unload - * itself uses stop_machine(), which insures that every preempt disabled section - * have finished. - */ -int marker_probe_unregister_private_data(marker_probe_func *probe, - void *probe_private) -{ - struct marker_entry *entry; - int ret = 0; - struct marker_probe_closure *old; - - mutex_lock(&markers_mutex); - entry = get_marker_from_private_data(probe, probe_private); - if (!entry) { - ret = -ENOENT; - goto end; - } - if (entry->rcu_pending) - rcu_barrier_sched(); - old = marker_entry_remove_probe(entry, NULL, probe_private); - mutex_unlock(&markers_mutex); - marker_update_probes(); - mutex_lock(&markers_mutex); - entry = get_marker_from_private_data(probe, probe_private); - if (!entry) - goto end; - if (entry->rcu_pending) - rcu_barrier_sched(); - entry->oldptr = old; - entry->rcu_pending = 1; - /* write rcu_pending before calling the RCU callback */ - smp_wmb(); - call_rcu_sched(&entry->rcu, free_old_closure); - remove_marker(entry->name); /* Ignore busy error message */ -end: - mutex_unlock(&markers_mutex); - return ret; -} -EXPORT_SYMBOL_GPL(marker_probe_unregister_private_data); - -/** - * marker_get_private_data - Get a marker's probe private data - * @name: marker name - * @probe: probe to match - * @num: get the nth matching probe's private data - * - * Returns the nth private data pointer (starting from 0) matching, or an - * ERR_PTR. - * Returns the private data pointer, or an ERR_PTR. - * The private data pointer should _only_ be dereferenced if the caller is the - * owner of the data, or its content could vanish. This is mostly used to - * confirm that a caller is the owner of a registered probe. - */ -void *marker_get_private_data(const char *name, marker_probe_func *probe, - int num) -{ - struct hlist_head *head; - struct hlist_node *node; - struct marker_entry *e; - size_t name_len = strlen(name) + 1; - u32 hash = jhash(name, name_len-1, 0); - int i; - - head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)]; - hlist_for_each_entry(e, node, head, hlist) { - if (!strcmp(name, e->name)) { - if (!e->ptype) { - if (num == 0 && e->single.func == probe) - return e->single.probe_private; - } else { - struct marker_probe_closure *closure; - int match = 0; - closure = e->multi; - for (i = 0; closure[i].func; i++) { - if (closure[i].func != probe) - continue; - if (match++ == num) - return closure[i].probe_private; - } - } - break; - } - } - return ERR_PTR(-ENOENT); -} -EXPORT_SYMBOL_GPL(marker_get_private_data); - -#ifdef CONFIG_MODULES - -int marker_module_notify(struct notifier_block *self, - unsigned long val, void *data) -{ - struct module *mod = data; - - switch (val) { - case MODULE_STATE_COMING: - marker_update_probe_range(mod->markers, - mod->markers + mod->num_markers); - break; - case MODULE_STATE_GOING: - marker_update_probe_range(mod->markers, - mod->markers + mod->num_markers); - break; - } - return 0; -} - -struct notifier_block marker_module_nb = { - .notifier_call = marker_module_notify, - .priority = 0, -}; - -static int init_markers(void) -{ - return register_module_notifier(&marker_module_nb); -} -__initcall(init_markers); - -#endif /* CONFIG_MODULES */ diff --git a/kernel/module.c b/kernel/module.c index 46580ed..8b7d880 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -47,6 +47,7 @@ #include <linux/rculist.h> #include <asm/uaccess.h> #include <asm/cacheflush.h> +#include <asm/mmu_context.h> #include <linux/license.h> #include <asm/sections.h> #include <linux/tracepoint.h> @@ -369,7 +370,7 @@ EXPORT_SYMBOL_GPL(find_module); #ifdef CONFIG_SMP -#ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA +#ifndef CONFIG_HAVE_LEGACY_PER_CPU_AREA static void *percpu_modalloc(unsigned long size, unsigned long align, const char *name) @@ -394,7 +395,7 @@ static void percpu_modfree(void *freeme) free_percpu(freeme); } -#else /* ... !CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ +#else /* ... CONFIG_HAVE_LEGACY_PER_CPU_AREA */ /* Number of blocks used and allocated. */ static unsigned int pcpu_num_used, pcpu_num_allocated; @@ -540,7 +541,7 @@ static int percpu_modinit(void) } __initcall(percpu_modinit); -#endif /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ +#endif /* CONFIG_HAVE_LEGACY_PER_CPU_AREA */ static unsigned int find_pcpusec(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, @@ -1535,6 +1536,10 @@ static void free_module(struct module *mod) /* Finally, free the core (containing the module structure) */ module_free(mod, mod->module_core); + +#ifdef CONFIG_MPU + update_protections(current->mm); +#endif } void *__symbol_get(const char *symbol) @@ -1792,6 +1797,17 @@ static void setup_modinfo(struct module *mod, Elf_Shdr *sechdrs, } } +static void free_modinfo(struct module *mod) +{ + struct module_attribute *attr; + int i; + + for (i = 0; (attr = modinfo_attrs[i]); i++) { + if (attr->free) + attr->free(mod); + } +} + #ifdef CONFIG_KALLSYMS /* lookup symbol in given range of kernel_symbols */ @@ -1857,13 +1873,93 @@ static char elf_type(const Elf_Sym *sym, return '?'; } +static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs, + unsigned int shnum) +{ + const Elf_Shdr *sec; + + if (src->st_shndx == SHN_UNDEF + || src->st_shndx >= shnum + || !src->st_name) + return false; + + sec = sechdrs + src->st_shndx; + if (!(sec->sh_flags & SHF_ALLOC) +#ifndef CONFIG_KALLSYMS_ALL + || !(sec->sh_flags & SHF_EXECINSTR) +#endif + || (sec->sh_entsize & INIT_OFFSET_MASK)) + return false; + + return true; +} + +static unsigned long layout_symtab(struct module *mod, + Elf_Shdr *sechdrs, + unsigned int symindex, + unsigned int strindex, + const Elf_Ehdr *hdr, + const char *secstrings, + unsigned long *pstroffs, + unsigned long *strmap) +{ + unsigned long symoffs; + Elf_Shdr *symsect = sechdrs + symindex; + Elf_Shdr *strsect = sechdrs + strindex; + const Elf_Sym *src; + const char *strtab; + unsigned int i, nsrc, ndst; + + /* Put symbol section at end of init part of module. */ + symsect->sh_flags |= SHF_ALLOC; + symsect->sh_entsize = get_offset(mod, &mod->init_size, symsect, + symindex) | INIT_OFFSET_MASK; + DEBUGP("\t%s\n", secstrings + symsect->sh_name); + + src = (void *)hdr + symsect->sh_offset; + nsrc = symsect->sh_size / sizeof(*src); + strtab = (void *)hdr + strsect->sh_offset; + for (ndst = i = 1; i < nsrc; ++i, ++src) + if (is_core_symbol(src, sechdrs, hdr->e_shnum)) { + unsigned int j = src->st_name; + + while(!__test_and_set_bit(j, strmap) && strtab[j]) + ++j; + ++ndst; + } + + /* Append room for core symbols at end of core part. */ + symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1); + mod->core_size = symoffs + ndst * sizeof(Elf_Sym); + + /* Put string table section at end of init part of module. */ + strsect->sh_flags |= SHF_ALLOC; + strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect, + strindex) | INIT_OFFSET_MASK; + DEBUGP("\t%s\n", secstrings + strsect->sh_name); + + /* Append room for core symbols' strings at end of core part. */ + *pstroffs = mod->core_size; + __set_bit(0, strmap); + mod->core_size += bitmap_weight(strmap, strsect->sh_size); + + return symoffs; +} + static void add_kallsyms(struct module *mod, Elf_Shdr *sechdrs, + unsigned int shnum, unsigned int symindex, unsigned int strindex, - const char *secstrings) + unsigned long symoffs, + unsigned long stroffs, + const char *secstrings, + unsigned long *strmap) { - unsigned int i; + unsigned int i, ndst; + const Elf_Sym *src; + Elf_Sym *dst; + char *s; mod->symtab = (void *)sechdrs[symindex].sh_addr; mod->num_symtab = sechdrs[symindex].sh_size / sizeof(Elf_Sym); @@ -1873,13 +1969,46 @@ static void add_kallsyms(struct module *mod, for (i = 0; i < mod->num_symtab; i++) mod->symtab[i].st_info = elf_type(&mod->symtab[i], sechdrs, secstrings, mod); + + mod->core_symtab = dst = mod->module_core + symoffs; + src = mod->symtab; + *dst = *src; + for (ndst = i = 1; i < mod->num_symtab; ++i, ++src) { + if (!is_core_symbol(src, sechdrs, shnum)) + continue; + dst[ndst] = *src; + dst[ndst].st_name = bitmap_weight(strmap, dst[ndst].st_name); + ++ndst; + } + mod->core_num_syms = ndst; + + mod->core_strtab = s = mod->module_core + stroffs; + for (*s = 0, i = 1; i < sechdrs[strindex].sh_size; ++i) + if (test_bit(i, strmap)) + *++s = mod->strtab[i]; } #else +static inline unsigned long layout_symtab(struct module *mod, + Elf_Shdr *sechdrs, + unsigned int symindex, + unsigned int strindex, + const Elf_Ehdr *hdr, + const char *secstrings, + unsigned long *pstroffs, + unsigned long *strmap) +{ + return 0; +} + static inline void add_kallsyms(struct module *mod, Elf_Shdr *sechdrs, + unsigned int shnum, unsigned int symindex, unsigned int strindex, - const char *secstrings) + unsigned long symoffs, + unsigned long stroffs, + const char *secstrings, + const unsigned long *strmap) { } #endif /* CONFIG_KALLSYMS */ @@ -1954,6 +2083,8 @@ static noinline struct module *load_module(void __user *umod, struct module *mod; long err = 0; void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ + unsigned long symoffs, stroffs, *strmap; + mm_segment_t old_fs; DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n", @@ -2035,11 +2166,6 @@ static noinline struct module *load_module(void __user *umod, /* Don't keep modinfo and version sections. */ sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC; sechdrs[versindex].sh_flags &= ~(unsigned long)SHF_ALLOC; -#ifdef CONFIG_KALLSYMS - /* Keep symbol and string tables for decoding later. */ - sechdrs[symindex].sh_flags |= SHF_ALLOC; - sechdrs[strindex].sh_flags |= SHF_ALLOC; -#endif /* Check module struct version now, before we try to use module. */ if (!check_modstruct_version(sechdrs, versindex, mod)) { @@ -2075,6 +2201,13 @@ static noinline struct module *load_module(void __user *umod, goto free_hdr; } + strmap = kzalloc(BITS_TO_LONGS(sechdrs[strindex].sh_size) + * sizeof(long), GFP_KERNEL); + if (!strmap) { + err = -ENOMEM; + goto free_mod; + } + if (find_module(mod->name)) { err = -EEXIST; goto free_mod; @@ -2104,6 +2237,8 @@ static noinline struct module *load_module(void __user *umod, this is done generically; there doesn't appear to be any special cases for the architectures. */ layout_sections(mod, hdr, sechdrs, secstrings); + symoffs = layout_symtab(mod, sechdrs, symindex, strindex, hdr, + secstrings, &stroffs, strmap); /* Do the allocs. */ ptr = module_alloc_update_bounds(mod->core_size); @@ -2237,10 +2372,6 @@ static noinline struct module *load_module(void __user *umod, sizeof(*mod->ctors), &mod->num_ctors); #endif -#ifdef CONFIG_MARKERS - mod->markers = section_objs(hdr, sechdrs, secstrings, "__markers", - sizeof(*mod->markers), &mod->num_markers); -#endif #ifdef CONFIG_TRACEPOINTS mod->tracepoints = section_objs(hdr, sechdrs, secstrings, "__tracepoints", @@ -2312,7 +2443,10 @@ static noinline struct module *load_module(void __user *umod, percpu_modcopy(mod->percpu, (void *)sechdrs[pcpuindex].sh_addr, sechdrs[pcpuindex].sh_size); - add_kallsyms(mod, sechdrs, symindex, strindex, secstrings); + add_kallsyms(mod, sechdrs, hdr->e_shnum, symindex, strindex, + symoffs, stroffs, secstrings, strmap); + kfree(strmap); + strmap = NULL; if (!mod->taints) { struct _ddebug *debug; @@ -2384,13 +2518,14 @@ static noinline struct module *load_module(void __user *umod, synchronize_sched(); module_arch_cleanup(mod); cleanup: + free_modinfo(mod); kobject_del(&mod->mkobj.kobj); kobject_put(&mod->mkobj.kobj); free_unload: module_unload_free(mod); #if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) - free_init: percpu_modfree(mod->refptr); + free_init: #endif module_free(mod, mod->module_init); free_core: @@ -2401,6 +2536,7 @@ static noinline struct module *load_module(void __user *umod, percpu_modfree(percpu); free_mod: kfree(args); + kfree(strmap); free_hdr: vfree(hdr); return ERR_PTR(err); @@ -2490,6 +2626,11 @@ SYSCALL_DEFINE3(init_module, void __user *, umod, /* Drop initial reference. */ module_put(mod); trim_init_extable(mod); +#ifdef CONFIG_KALLSYMS + mod->num_symtab = mod->core_num_syms; + mod->symtab = mod->core_symtab; + mod->strtab = mod->core_strtab; +#endif module_free(mod, mod->module_init); mod->module_init = NULL; mod->init_size = 0; @@ -2951,27 +3092,12 @@ void module_layout(struct module *mod, struct modversion_info *ver, struct kernel_param *kp, struct kernel_symbol *ks, - struct marker *marker, struct tracepoint *tp) { } EXPORT_SYMBOL(module_layout); #endif -#ifdef CONFIG_MARKERS -void module_update_markers(void) -{ - struct module *mod; - - mutex_lock(&module_mutex); - list_for_each_entry(mod, &modules, list) - if (!mod->taints) - marker_update_probe_range(mod->markers, - mod->markers + mod->num_markers); - mutex_unlock(&module_mutex); -} -#endif - #ifdef CONFIG_TRACEPOINTS void module_update_tracepoints(void) { diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c index 50d022e..ec815a9 100644 --- a/kernel/mutex-debug.c +++ b/kernel/mutex-debug.c @@ -16,6 +16,7 @@ #include <linux/delay.h> #include <linux/module.h> #include <linux/poison.h> +#include <linux/sched.h> #include <linux/spinlock.h> #include <linux/kallsyms.h> #include <linux/interrupt.h> diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c index 5aa854f..2a5dfec 100644 --- a/kernel/ns_cgroup.c +++ b/kernel/ns_cgroup.c @@ -42,8 +42,8 @@ int ns_cgroup_clone(struct task_struct *task, struct pid *pid) * (hence either you are in the same cgroup as task, or in an * ancestor cgroup thereof) */ -static int ns_can_attach(struct cgroup_subsys *ss, - struct cgroup *new_cgroup, struct task_struct *task) +static int ns_can_attach(struct cgroup_subsys *ss, struct cgroup *new_cgroup, + struct task_struct *task, bool threadgroup) { if (current != task) { if (!capable(CAP_SYS_ADMIN)) @@ -56,6 +56,18 @@ static int ns_can_attach(struct cgroup_subsys *ss, if (!cgroup_is_descendant(new_cgroup, task)) return -EPERM; + if (threadgroup) { + struct task_struct *c; + rcu_read_lock(); + list_for_each_entry_rcu(c, &task->thread_group, thread_group) { + if (!cgroup_is_descendant(new_cgroup, c)) { + rcu_read_unlock(); + return -EPERM; + } + } + rcu_read_unlock(); + } + return 0; } diff --git a/kernel/panic.c b/kernel/panic.c index 512ab73..96b45d0 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -90,6 +90,8 @@ NORET_TYPE void panic(const char * fmt, ...) atomic_notifier_call_chain(&panic_notifier_list, 0, buf); + bust_spinlocks(0); + if (!panic_blink) panic_blink = no_blink; @@ -136,7 +138,6 @@ NORET_TYPE void panic(const char * fmt, ...) mdelay(1); i++; } - bust_spinlocks(0); } EXPORT_SYMBOL(panic); @@ -177,7 +178,7 @@ static const struct tnt tnts[] = { * 'W' - Taint on warning. * 'C' - modules from drivers/staging are loaded. * - * The string is overwritten by the next call to print_taint(). + * The string is overwritten by the next call to print_tainted(). */ const char *print_tainted(void) { diff --git a/kernel/params.c b/kernel/params.c index 7f6912c..9da58ea 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -23,6 +23,7 @@ #include <linux/device.h> #include <linux/err.h> #include <linux/slab.h> +#include <linux/ctype.h> #if 0 #define DEBUGP printk @@ -87,7 +88,7 @@ static char *next_arg(char *args, char **param, char **val) } for (i = 0; args[i]; i++) { - if (args[i] == ' ' && !in_quote) + if (isspace(args[i]) && !in_quote) break; if (equals == 0) { if (args[i] == '=') @@ -121,7 +122,7 @@ static char *next_arg(char *args, char **param, char **val) next = args + i; /* Chew up trailing spaces. */ - while (*next == ' ') + while (isspace(*next)) next++; return next; } @@ -138,7 +139,7 @@ int parse_args(const char *name, DEBUGP("Parsing ARGS: %s\n", args); /* Chew leading spaces */ - while (*args == ' ') + while (isspace(*args)) args++; while (*args) { diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c deleted file mode 100644 index e0d91fd..0000000 --- a/kernel/perf_counter.c +++ /dev/null @@ -1,4962 +0,0 @@ -/* - * Performance counter core code - * - * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> - * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar - * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> - * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> - * - * For licensing details see kernel-base/COPYING - */ - -#include <linux/fs.h> -#include <linux/mm.h> -#include <linux/cpu.h> -#include <linux/smp.h> -#include <linux/file.h> -#include <linux/poll.h> -#include <linux/sysfs.h> -#include <linux/dcache.h> -#include <linux/percpu.h> -#include <linux/ptrace.h> -#include <linux/vmstat.h> -#include <linux/hardirq.h> -#include <linux/rculist.h> -#include <linux/uaccess.h> -#include <linux/syscalls.h> -#include <linux/anon_inodes.h> -#include <linux/kernel_stat.h> -#include <linux/perf_counter.h> - -#include <asm/irq_regs.h> - -/* - * Each CPU has a list of per CPU counters: - */ -DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context); - -int perf_max_counters __read_mostly = 1; -static int perf_reserved_percpu __read_mostly; -static int perf_overcommit __read_mostly = 1; - -static atomic_t nr_counters __read_mostly; -static atomic_t nr_mmap_counters __read_mostly; -static atomic_t nr_comm_counters __read_mostly; -static atomic_t nr_task_counters __read_mostly; - -/* - * perf counter paranoia level: - * -1 - not paranoid at all - * 0 - disallow raw tracepoint access for unpriv - * 1 - disallow cpu counters for unpriv - * 2 - disallow kernel profiling for unpriv - */ -int sysctl_perf_counter_paranoid __read_mostly = 1; - -static inline bool perf_paranoid_tracepoint_raw(void) -{ - return sysctl_perf_counter_paranoid > -1; -} - -static inline bool perf_paranoid_cpu(void) -{ - return sysctl_perf_counter_paranoid > 0; -} - -static inline bool perf_paranoid_kernel(void) -{ - return sysctl_perf_counter_paranoid > 1; -} - -int sysctl_perf_counter_mlock __read_mostly = 512; /* 'free' kb per user */ - -/* - * max perf counter sample rate - */ -int sysctl_perf_counter_sample_rate __read_mostly = 100000; - -static atomic64_t perf_counter_id; - -/* - * Lock for (sysadmin-configurable) counter reservations: - */ -static DEFINE_SPINLOCK(perf_resource_lock); - -/* - * Architecture provided APIs - weak aliases: - */ -extern __weak const struct pmu *hw_perf_counter_init(struct perf_counter *counter) -{ - return NULL; -} - -void __weak hw_perf_disable(void) { barrier(); } -void __weak hw_perf_enable(void) { barrier(); } - -void __weak hw_perf_counter_setup(int cpu) { barrier(); } -void __weak hw_perf_counter_setup_online(int cpu) { barrier(); } - -int __weak -hw_perf_group_sched_in(struct perf_counter *group_leader, - struct perf_cpu_context *cpuctx, - struct perf_counter_context *ctx, int cpu) -{ - return 0; -} - -void __weak perf_counter_print_debug(void) { } - -static DEFINE_PER_CPU(int, disable_count); - -void __perf_disable(void) -{ - __get_cpu_var(disable_count)++; -} - -bool __perf_enable(void) -{ - return !--__get_cpu_var(disable_count); -} - -void perf_disable(void) -{ - __perf_disable(); - hw_perf_disable(); -} - -void perf_enable(void) -{ - if (__perf_enable()) - hw_perf_enable(); -} - -static void get_ctx(struct perf_counter_context *ctx) -{ - WARN_ON(!atomic_inc_not_zero(&ctx->refcount)); -} - -static void free_ctx(struct rcu_head *head) -{ - struct perf_counter_context *ctx; - - ctx = container_of(head, struct perf_counter_context, rcu_head); - kfree(ctx); -} - -static void put_ctx(struct perf_counter_context *ctx) -{ - if (atomic_dec_and_test(&ctx->refcount)) { - if (ctx->parent_ctx) - put_ctx(ctx->parent_ctx); - if (ctx->task) - put_task_struct(ctx->task); - call_rcu(&ctx->rcu_head, free_ctx); - } -} - -static void unclone_ctx(struct perf_counter_context *ctx) -{ - if (ctx->parent_ctx) { - put_ctx(ctx->parent_ctx); - ctx->parent_ctx = NULL; - } -} - -/* - * If we inherit counters we want to return the parent counter id - * to userspace. - */ -static u64 primary_counter_id(struct perf_counter *counter) -{ - u64 id = counter->id; - - if (counter->parent) - id = counter->parent->id; - - return id; -} - -/* - * Get the perf_counter_context for a task and lock it. - * This has to cope with with the fact that until it is locked, - * the context could get moved to another task. - */ -static struct perf_counter_context * -perf_lock_task_context(struct task_struct *task, unsigned long *flags) -{ - struct perf_counter_context *ctx; - - rcu_read_lock(); - retry: - ctx = rcu_dereference(task->perf_counter_ctxp); - if (ctx) { - /* - * If this context is a clone of another, it might - * get swapped for another underneath us by - * perf_counter_task_sched_out, though the - * rcu_read_lock() protects us from any context - * getting freed. Lock the context and check if it - * got swapped before we could get the lock, and retry - * if so. If we locked the right context, then it - * can't get swapped on us any more. - */ - spin_lock_irqsave(&ctx->lock, *flags); - if (ctx != rcu_dereference(task->perf_counter_ctxp)) { - spin_unlock_irqrestore(&ctx->lock, *flags); - goto retry; - } - - if (!atomic_inc_not_zero(&ctx->refcount)) { - spin_unlock_irqrestore(&ctx->lock, *flags); - ctx = NULL; - } - } - rcu_read_unlock(); - return ctx; -} - -/* - * Get the context for a task and increment its pin_count so it - * can't get swapped to another task. This also increments its - * reference count so that the context can't get freed. - */ -static struct perf_counter_context *perf_pin_task_context(struct task_struct *task) -{ - struct perf_counter_context *ctx; - unsigned long flags; - - ctx = perf_lock_task_context(task, &flags); - if (ctx) { - ++ctx->pin_count; - spin_unlock_irqrestore(&ctx->lock, flags); - } - return ctx; -} - -static void perf_unpin_context(struct perf_counter_context *ctx) -{ - unsigned long flags; - - spin_lock_irqsave(&ctx->lock, flags); - --ctx->pin_count; - spin_unlock_irqrestore(&ctx->lock, flags); - put_ctx(ctx); -} - -/* - * Add a counter from the lists for its context. - * Must be called with ctx->mutex and ctx->lock held. - */ -static void -list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx) -{ - struct perf_counter *group_leader = counter->group_leader; - - /* - * Depending on whether it is a standalone or sibling counter, - * add it straight to the context's counter list, or to the group - * leader's sibling list: - */ - if (group_leader == counter) - list_add_tail(&counter->list_entry, &ctx->counter_list); - else { - list_add_tail(&counter->list_entry, &group_leader->sibling_list); - group_leader->nr_siblings++; - } - - list_add_rcu(&counter->event_entry, &ctx->event_list); - ctx->nr_counters++; - if (counter->attr.inherit_stat) - ctx->nr_stat++; -} - -/* - * Remove a counter from the lists for its context. - * Must be called with ctx->mutex and ctx->lock held. - */ -static void -list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx) -{ - struct perf_counter *sibling, *tmp; - - if (list_empty(&counter->list_entry)) - return; - ctx->nr_counters--; - if (counter->attr.inherit_stat) - ctx->nr_stat--; - - list_del_init(&counter->list_entry); - list_del_rcu(&counter->event_entry); - - if (counter->group_leader != counter) - counter->group_leader->nr_siblings--; - - /* - * If this was a group counter with sibling counters then - * upgrade the siblings to singleton counters by adding them - * to the context list directly: - */ - list_for_each_entry_safe(sibling, tmp, - &counter->sibling_list, list_entry) { - - list_move_tail(&sibling->list_entry, &ctx->counter_list); - sibling->group_leader = sibling; - } -} - -static void -counter_sched_out(struct perf_counter *counter, - struct perf_cpu_context *cpuctx, - struct perf_counter_context *ctx) -{ - if (counter->state != PERF_COUNTER_STATE_ACTIVE) - return; - - counter->state = PERF_COUNTER_STATE_INACTIVE; - if (counter->pending_disable) { - counter->pending_disable = 0; - counter->state = PERF_COUNTER_STATE_OFF; - } - counter->tstamp_stopped = ctx->time; - counter->pmu->disable(counter); - counter->oncpu = -1; - - if (!is_software_counter(counter)) - cpuctx->active_oncpu--; - ctx->nr_active--; - if (counter->attr.exclusive || !cpuctx->active_oncpu) - cpuctx->exclusive = 0; -} - -static void -group_sched_out(struct perf_counter *group_counter, - struct perf_cpu_context *cpuctx, - struct perf_counter_context *ctx) -{ - struct perf_counter *counter; - - if (group_counter->state != PERF_COUNTER_STATE_ACTIVE) - return; - - counter_sched_out(group_counter, cpuctx, ctx); - - /* - * Schedule out siblings (if any): - */ - list_for_each_entry(counter, &group_counter->sibling_list, list_entry) - counter_sched_out(counter, cpuctx, ctx); - - if (group_counter->attr.exclusive) - cpuctx->exclusive = 0; -} - -/* - * Cross CPU call to remove a performance counter - * - * We disable the counter on the hardware level first. After that we - * remove it from the context list. - */ -static void __perf_counter_remove_from_context(void *info) -{ - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); - struct perf_counter *counter = info; - struct perf_counter_context *ctx = counter->ctx; - - /* - * If this is a task context, we need to check whether it is - * the current task context of this cpu. If not it has been - * scheduled out before the smp call arrived. - */ - if (ctx->task && cpuctx->task_ctx != ctx) - return; - - spin_lock(&ctx->lock); - /* - * Protect the list operation against NMI by disabling the - * counters on a global level. - */ - perf_disable(); - - counter_sched_out(counter, cpuctx, ctx); - - list_del_counter(counter, ctx); - - if (!ctx->task) { - /* - * Allow more per task counters with respect to the - * reservation: - */ - cpuctx->max_pertask = - min(perf_max_counters - ctx->nr_counters, - perf_max_counters - perf_reserved_percpu); - } - - perf_enable(); - spin_unlock(&ctx->lock); -} - - -/* - * Remove the counter from a task's (or a CPU's) list of counters. - * - * Must be called with ctx->mutex held. - * - * CPU counters are removed with a smp call. For task counters we only - * call when the task is on a CPU. - * - * If counter->ctx is a cloned context, callers must make sure that - * every task struct that counter->ctx->task could possibly point to - * remains valid. This is OK when called from perf_release since - * that only calls us on the top-level context, which can't be a clone. - * When called from perf_counter_exit_task, it's OK because the - * context has been detached from its task. - */ -static void perf_counter_remove_from_context(struct perf_counter *counter) -{ - struct perf_counter_context *ctx = counter->ctx; - struct task_struct *task = ctx->task; - - if (!task) { - /* - * Per cpu counters are removed via an smp call and - * the removal is always sucessful. - */ - smp_call_function_single(counter->cpu, - __perf_counter_remove_from_context, - counter, 1); - return; - } - -retry: - task_oncpu_function_call(task, __perf_counter_remove_from_context, - counter); - - spin_lock_irq(&ctx->lock); - /* - * If the context is active we need to retry the smp call. - */ - if (ctx->nr_active && !list_empty(&counter->list_entry)) { - spin_unlock_irq(&ctx->lock); - goto retry; - } - - /* - * The lock prevents that this context is scheduled in so we - * can remove the counter safely, if the call above did not - * succeed. - */ - if (!list_empty(&counter->list_entry)) { - list_del_counter(counter, ctx); - } - spin_unlock_irq(&ctx->lock); -} - -static inline u64 perf_clock(void) -{ - return cpu_clock(smp_processor_id()); -} - -/* - * Update the record of the current time in a context. - */ -static void update_context_time(struct perf_counter_context *ctx) -{ - u64 now = perf_clock(); - - ctx->time += now - ctx->timestamp; - ctx->timestamp = now; -} - -/* - * Update the total_time_enabled and total_time_running fields for a counter. - */ -static void update_counter_times(struct perf_counter *counter) -{ - struct perf_counter_context *ctx = counter->ctx; - u64 run_end; - - if (counter->state < PERF_COUNTER_STATE_INACTIVE || - counter->group_leader->state < PERF_COUNTER_STATE_INACTIVE) - return; - - counter->total_time_enabled = ctx->time - counter->tstamp_enabled; - - if (counter->state == PERF_COUNTER_STATE_INACTIVE) - run_end = counter->tstamp_stopped; - else - run_end = ctx->time; - - counter->total_time_running = run_end - counter->tstamp_running; -} - -/* - * Update total_time_enabled and total_time_running for all counters in a group. - */ -static void update_group_times(struct perf_counter *leader) -{ - struct perf_counter *counter; - - update_counter_times(leader); - list_for_each_entry(counter, &leader->sibling_list, list_entry) - update_counter_times(counter); -} - -/* - * Cross CPU call to disable a performance counter - */ -static void __perf_counter_disable(void *info) -{ - struct perf_counter *counter = info; - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); - struct perf_counter_context *ctx = counter->ctx; - - /* - * If this is a per-task counter, need to check whether this - * counter's task is the current task on this cpu. - */ - if (ctx->task && cpuctx->task_ctx != ctx) - return; - - spin_lock(&ctx->lock); - - /* - * If the counter is on, turn it off. - * If it is in error state, leave it in error state. - */ - if (counter->state >= PERF_COUNTER_STATE_INACTIVE) { - update_context_time(ctx); - update_group_times(counter); - if (counter == counter->group_leader) - group_sched_out(counter, cpuctx, ctx); - else - counter_sched_out(counter, cpuctx, ctx); - counter->state = PERF_COUNTER_STATE_OFF; - } - - spin_unlock(&ctx->lock); -} - -/* - * Disable a counter. - * - * If counter->ctx is a cloned context, callers must make sure that - * every task struct that counter->ctx->task could possibly point to - * remains valid. This condition is satisifed when called through - * perf_counter_for_each_child or perf_counter_for_each because they - * hold the top-level counter's child_mutex, so any descendant that - * goes to exit will block in sync_child_counter. - * When called from perf_pending_counter it's OK because counter->ctx - * is the current context on this CPU and preemption is disabled, - * hence we can't get into perf_counter_task_sched_out for this context. - */ -static void perf_counter_disable(struct perf_counter *counter) -{ - struct perf_counter_context *ctx = counter->ctx; - struct task_struct *task = ctx->task; - - if (!task) { - /* - * Disable the counter on the cpu that it's on - */ - smp_call_function_single(counter->cpu, __perf_counter_disable, - counter, 1); - return; - } - - retry: - task_oncpu_function_call(task, __perf_counter_disable, counter); - - spin_lock_irq(&ctx->lock); - /* - * If the counter is still active, we need to retry the cross-call. - */ - if (counter->state == PERF_COUNTER_STATE_ACTIVE) { - spin_unlock_irq(&ctx->lock); - goto retry; - } - - /* - * Since we have the lock this context can't be scheduled - * in, so we can change the state safely. - */ - if (counter->state == PERF_COUNTER_STATE_INACTIVE) { - update_group_times(counter); - counter->state = PERF_COUNTER_STATE_OFF; - } - - spin_unlock_irq(&ctx->lock); -} - -static int -counter_sched_in(struct perf_counter *counter, - struct perf_cpu_context *cpuctx, - struct perf_counter_context *ctx, - int cpu) -{ - if (counter->state <= PERF_COUNTER_STATE_OFF) - return 0; - - counter->state = PERF_COUNTER_STATE_ACTIVE; - counter->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */ - /* - * The new state must be visible before we turn it on in the hardware: - */ - smp_wmb(); - - if (counter->pmu->enable(counter)) { - counter->state = PERF_COUNTER_STATE_INACTIVE; - counter->oncpu = -1; - return -EAGAIN; - } - - counter->tstamp_running += ctx->time - counter->tstamp_stopped; - - if (!is_software_counter(counter)) - cpuctx->active_oncpu++; - ctx->nr_active++; - - if (counter->attr.exclusive) - cpuctx->exclusive = 1; - - return 0; -} - -static int -group_sched_in(struct perf_counter *group_counter, - struct perf_cpu_context *cpuctx, - struct perf_counter_context *ctx, - int cpu) -{ - struct perf_counter *counter, *partial_group; - int ret; - - if (group_counter->state == PERF_COUNTER_STATE_OFF) - return 0; - - ret = hw_perf_group_sched_in(group_counter, cpuctx, ctx, cpu); - if (ret) - return ret < 0 ? ret : 0; - - if (counter_sched_in(group_counter, cpuctx, ctx, cpu)) - return -EAGAIN; - - /* - * Schedule in siblings as one group (if any): - */ - list_for_each_entry(counter, &group_counter->sibling_list, list_entry) { - if (counter_sched_in(counter, cpuctx, ctx, cpu)) { - partial_group = counter; - goto group_error; - } - } - - return 0; - -group_error: - /* - * Groups can be scheduled in as one unit only, so undo any - * partial group before returning: - */ - list_for_each_entry(counter, &group_counter->sibling_list, list_entry) { - if (counter == partial_group) - break; - counter_sched_out(counter, cpuctx, ctx); - } - counter_sched_out(group_counter, cpuctx, ctx); - - return -EAGAIN; -} - -/* - * Return 1 for a group consisting entirely of software counters, - * 0 if the group contains any hardware counters. - */ -static int is_software_only_group(struct perf_counter *leader) -{ - struct perf_counter *counter; - - if (!is_software_counter(leader)) - return 0; - - list_for_each_entry(counter, &leader->sibling_list, list_entry) - if (!is_software_counter(counter)) - return 0; - - return 1; -} - -/* - * Work out whether we can put this counter group on the CPU now. - */ -static int group_can_go_on(struct perf_counter *counter, - struct perf_cpu_context *cpuctx, - int can_add_hw) -{ - /* - * Groups consisting entirely of software counters can always go on. - */ - if (is_software_only_group(counter)) - return 1; - /* - * If an exclusive group is already on, no other hardware - * counters can go on. - */ - if (cpuctx->exclusive) - return 0; - /* - * If this group is exclusive and there are already - * counters on the CPU, it can't go on. - */ - if (counter->attr.exclusive && cpuctx->active_oncpu) - return 0; - /* - * Otherwise, try to add it if all previous groups were able - * to go on. - */ - return can_add_hw; -} - -static void add_counter_to_ctx(struct perf_counter *counter, - struct perf_counter_context *ctx) -{ - list_add_counter(counter, ctx); - counter->tstamp_enabled = ctx->time; - counter->tstamp_running = ctx->time; - counter->tstamp_stopped = ctx->time; -} - -/* - * Cross CPU call to install and enable a performance counter - * - * Must be called with ctx->mutex held - */ -static void __perf_install_in_context(void *info) -{ - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); - struct perf_counter *counter = info; - struct perf_counter_context *ctx = counter->ctx; - struct perf_counter *leader = counter->group_leader; - int cpu = smp_processor_id(); - int err; - - /* - * If this is a task context, we need to check whether it is - * the current task context of this cpu. If not it has been - * scheduled out before the smp call arrived. - * Or possibly this is the right context but it isn't - * on this cpu because it had no counters. - */ - if (ctx->task && cpuctx->task_ctx != ctx) { - if (cpuctx->task_ctx || ctx->task != current) - return; - cpuctx->task_ctx = ctx; - } - - spin_lock(&ctx->lock); - ctx->is_active = 1; - update_context_time(ctx); - - /* - * Protect the list operation against NMI by disabling the - * counters on a global level. NOP for non NMI based counters. - */ - perf_disable(); - - add_counter_to_ctx(counter, ctx); - - /* - * Don't put the counter on if it is disabled or if - * it is in a group and the group isn't on. - */ - if (counter->state != PERF_COUNTER_STATE_INACTIVE || - (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE)) - goto unlock; - - /* - * An exclusive counter can't go on if there are already active - * hardware counters, and no hardware counter can go on if there - * is already an exclusive counter on. - */ - if (!group_can_go_on(counter, cpuctx, 1)) - err = -EEXIST; - else - err = counter_sched_in(counter, cpuctx, ctx, cpu); - - if (err) { - /* - * This counter couldn't go on. If it is in a group - * then we have to pull the whole group off. - * If the counter group is pinned then put it in error state. - */ - if (leader != counter) - group_sched_out(leader, cpuctx, ctx); - if (leader->attr.pinned) { - update_group_times(leader); - leader->state = PERF_COUNTER_STATE_ERROR; - } - } - - if (!err && !ctx->task && cpuctx->max_pertask) - cpuctx->max_pertask--; - - unlock: - perf_enable(); - - spin_unlock(&ctx->lock); -} - -/* - * Attach a performance counter to a context - * - * First we add the counter to the list with the hardware enable bit - * in counter->hw_config cleared. - * - * If the counter is attached to a task which is on a CPU we use a smp - * call to enable it in the task context. The task might have been - * scheduled away, but we check this in the smp call again. - * - * Must be called with ctx->mutex held. - */ -static void -perf_install_in_context(struct perf_counter_context *ctx, - struct perf_counter *counter, - int cpu) -{ - struct task_struct *task = ctx->task; - - if (!task) { - /* - * Per cpu counters are installed via an smp call and - * the install is always sucessful. - */ - smp_call_function_single(cpu, __perf_install_in_context, - counter, 1); - return; - } - -retry: - task_oncpu_function_call(task, __perf_install_in_context, - counter); - - spin_lock_irq(&ctx->lock); - /* - * we need to retry the smp call. - */ - if (ctx->is_active && list_empty(&counter->list_entry)) { - spin_unlock_irq(&ctx->lock); - goto retry; - } - - /* - * The lock prevents that this context is scheduled in so we - * can add the counter safely, if it the call above did not - * succeed. - */ - if (list_empty(&counter->list_entry)) - add_counter_to_ctx(counter, ctx); - spin_unlock_irq(&ctx->lock); -} - -/* - * Put a counter into inactive state and update time fields. - * Enabling the leader of a group effectively enables all - * the group members that aren't explicitly disabled, so we - * have to update their ->tstamp_enabled also. - * Note: this works for group members as well as group leaders - * since the non-leader members' sibling_lists will be empty. - */ -static void __perf_counter_mark_enabled(struct perf_counter *counter, - struct perf_counter_context *ctx) -{ - struct perf_counter *sub; - - counter->state = PERF_COUNTER_STATE_INACTIVE; - counter->tstamp_enabled = ctx->time - counter->total_time_enabled; - list_for_each_entry(sub, &counter->sibling_list, list_entry) - if (sub->state >= PERF_COUNTER_STATE_INACTIVE) - sub->tstamp_enabled = - ctx->time - sub->total_time_enabled; -} - -/* - * Cross CPU call to enable a performance counter - */ -static void __perf_counter_enable(void *info) -{ - struct perf_counter *counter = info; - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); - struct perf_counter_context *ctx = counter->ctx; - struct perf_counter *leader = counter->group_leader; - int err; - - /* - * If this is a per-task counter, need to check whether this - * counter's task is the current task on this cpu. - */ - if (ctx->task && cpuctx->task_ctx != ctx) { - if (cpuctx->task_ctx || ctx->task != current) - return; - cpuctx->task_ctx = ctx; - } - - spin_lock(&ctx->lock); - ctx->is_active = 1; - update_context_time(ctx); - - if (counter->state >= PERF_COUNTER_STATE_INACTIVE) - goto unlock; - __perf_counter_mark_enabled(counter, ctx); - - /* - * If the counter is in a group and isn't the group leader, - * then don't put it on unless the group is on. - */ - if (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE) - goto unlock; - - if (!group_can_go_on(counter, cpuctx, 1)) { - err = -EEXIST; - } else { - perf_disable(); - if (counter == leader) - err = group_sched_in(counter, cpuctx, ctx, - smp_processor_id()); - else - err = counter_sched_in(counter, cpuctx, ctx, - smp_processor_id()); - perf_enable(); - } - - if (err) { - /* - * If this counter can't go on and it's part of a - * group, then the whole group has to come off. - */ - if (leader != counter) - group_sched_out(leader, cpuctx, ctx); - if (leader->attr.pinned) { - update_group_times(leader); - leader->state = PERF_COUNTER_STATE_ERROR; - } - } - - unlock: - spin_unlock(&ctx->lock); -} - -/* - * Enable a counter. - * - * If counter->ctx is a cloned context, callers must make sure that - * every task struct that counter->ctx->task could possibly point to - * remains valid. This condition is satisfied when called through - * perf_counter_for_each_child or perf_counter_for_each as described - * for perf_counter_disable. - */ -static void perf_counter_enable(struct perf_counter *counter) -{ - struct perf_counter_context *ctx = counter->ctx; - struct task_struct *task = ctx->task; - - if (!task) { - /* - * Enable the counter on the cpu that it's on - */ - smp_call_function_single(counter->cpu, __perf_counter_enable, - counter, 1); - return; - } - - spin_lock_irq(&ctx->lock); - if (counter->state >= PERF_COUNTER_STATE_INACTIVE) - goto out; - - /* - * If the counter is in error state, clear that first. - * That way, if we see the counter in error state below, we - * know that it has gone back into error state, as distinct - * from the task having been scheduled away before the - * cross-call arrived. - */ - if (counter->state == PERF_COUNTER_STATE_ERROR) - counter->state = PERF_COUNTER_STATE_OFF; - - retry: - spin_unlock_irq(&ctx->lock); - task_oncpu_function_call(task, __perf_counter_enable, counter); - - spin_lock_irq(&ctx->lock); - - /* - * If the context is active and the counter is still off, - * we need to retry the cross-call. - */ - if (ctx->is_active && counter->state == PERF_COUNTER_STATE_OFF) - goto retry; - - /* - * Since we have the lock this context can't be scheduled - * in, so we can change the state safely. - */ - if (counter->state == PERF_COUNTER_STATE_OFF) - __perf_counter_mark_enabled(counter, ctx); - - out: - spin_unlock_irq(&ctx->lock); -} - -static int perf_counter_refresh(struct perf_counter *counter, int refresh) -{ - /* - * not supported on inherited counters - */ - if (counter->attr.inherit) - return -EINVAL; - - atomic_add(refresh, &counter->event_limit); - perf_counter_enable(counter); - - return 0; -} - -void __perf_counter_sched_out(struct perf_counter_context *ctx, - struct perf_cpu_context *cpuctx) -{ - struct perf_counter *counter; - - spin_lock(&ctx->lock); - ctx->is_active = 0; - if (likely(!ctx->nr_counters)) - goto out; - update_context_time(ctx); - - perf_disable(); - if (ctx->nr_active) { - list_for_each_entry(counter, &ctx->counter_list, list_entry) { - if (counter != counter->group_leader) - counter_sched_out(counter, cpuctx, ctx); - else - group_sched_out(counter, cpuctx, ctx); - } - } - perf_enable(); - out: - spin_unlock(&ctx->lock); -} - -/* - * Test whether two contexts are equivalent, i.e. whether they - * have both been cloned from the same version of the same context - * and they both have the same number of enabled counters. - * If the number of enabled counters is the same, then the set - * of enabled counters should be the same, because these are both - * inherited contexts, therefore we can't access individual counters - * in them directly with an fd; we can only enable/disable all - * counters via prctl, or enable/disable all counters in a family - * via ioctl, which will have the same effect on both contexts. - */ -static int context_equiv(struct perf_counter_context *ctx1, - struct perf_counter_context *ctx2) -{ - return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx - && ctx1->parent_gen == ctx2->parent_gen - && !ctx1->pin_count && !ctx2->pin_count; -} - -static void __perf_counter_read(void *counter); - -static void __perf_counter_sync_stat(struct perf_counter *counter, - struct perf_counter *next_counter) -{ - u64 value; - - if (!counter->attr.inherit_stat) - return; - - /* - * Update the counter value, we cannot use perf_counter_read() - * because we're in the middle of a context switch and have IRQs - * disabled, which upsets smp_call_function_single(), however - * we know the counter must be on the current CPU, therefore we - * don't need to use it. - */ - switch (counter->state) { - case PERF_COUNTER_STATE_ACTIVE: - __perf_counter_read(counter); - break; - - case PERF_COUNTER_STATE_INACTIVE: - update_counter_times(counter); - break; - - default: - break; - } - - /* - * In order to keep per-task stats reliable we need to flip the counter - * values when we flip the contexts. - */ - value = atomic64_read(&next_counter->count); - value = atomic64_xchg(&counter->count, value); - atomic64_set(&next_counter->count, value); - - swap(counter->total_time_enabled, next_counter->total_time_enabled); - swap(counter->total_time_running, next_counter->total_time_running); - - /* - * Since we swizzled the values, update the user visible data too. - */ - perf_counter_update_userpage(counter); - perf_counter_update_userpage(next_counter); -} - -#define list_next_entry(pos, member) \ - list_entry(pos->member.next, typeof(*pos), member) - -static void perf_counter_sync_stat(struct perf_counter_context *ctx, - struct perf_counter_context *next_ctx) -{ - struct perf_counter *counter, *next_counter; - - if (!ctx->nr_stat) - return; - - counter = list_first_entry(&ctx->event_list, - struct perf_counter, event_entry); - - next_counter = list_first_entry(&next_ctx->event_list, - struct perf_counter, event_entry); - - while (&counter->event_entry != &ctx->event_list && - &next_counter->event_entry != &next_ctx->event_list) { - - __perf_counter_sync_stat(counter, next_counter); - - counter = list_next_entry(counter, event_entry); - next_counter = list_next_entry(next_counter, event_entry); - } -} - -/* - * Called from scheduler to remove the counters of the current task, - * with interrupts disabled. - * - * We stop each counter and update the counter value in counter->count. - * - * This does not protect us against NMI, but disable() - * sets the disabled bit in the control field of counter _before_ - * accessing the counter control register. If a NMI hits, then it will - * not restart the counter. - */ -void perf_counter_task_sched_out(struct task_struct *task, - struct task_struct *next, int cpu) -{ - struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); - struct perf_counter_context *ctx = task->perf_counter_ctxp; - struct perf_counter_context *next_ctx; - struct perf_counter_context *parent; - struct pt_regs *regs; - int do_switch = 1; - - regs = task_pt_regs(task); - perf_swcounter_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0); - - if (likely(!ctx || !cpuctx->task_ctx)) - return; - - update_context_time(ctx); - - rcu_read_lock(); - parent = rcu_dereference(ctx->parent_ctx); - next_ctx = next->perf_counter_ctxp; - if (parent && next_ctx && - rcu_dereference(next_ctx->parent_ctx) == parent) { - /* - * Looks like the two contexts are clones, so we might be - * able to optimize the context switch. We lock both - * contexts and check that they are clones under the - * lock (including re-checking that neither has been - * uncloned in the meantime). It doesn't matter which - * order we take the locks because no other cpu could - * be trying to lock both of these tasks. - */ - spin_lock(&ctx->lock); - spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); - if (context_equiv(ctx, next_ctx)) { - /* - * XXX do we need a memory barrier of sorts - * wrt to rcu_dereference() of perf_counter_ctxp - */ - task->perf_counter_ctxp = next_ctx; - next->perf_counter_ctxp = ctx; - ctx->task = next; - next_ctx->task = task; - do_switch = 0; - - perf_counter_sync_stat(ctx, next_ctx); - } - spin_unlock(&next_ctx->lock); - spin_unlock(&ctx->lock); - } - rcu_read_unlock(); - - if (do_switch) { - __perf_counter_sched_out(ctx, cpuctx); - cpuctx->task_ctx = NULL; - } -} - -/* - * Called with IRQs disabled - */ -static void __perf_counter_task_sched_out(struct perf_counter_context *ctx) -{ - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); - - if (!cpuctx->task_ctx) - return; - - if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) - return; - - __perf_counter_sched_out(ctx, cpuctx); - cpuctx->task_ctx = NULL; -} - -/* - * Called with IRQs disabled - */ -static void perf_counter_cpu_sched_out(struct perf_cpu_context *cpuctx) -{ - __perf_counter_sched_out(&cpuctx->ctx, cpuctx); -} - -static void -__perf_counter_sched_in(struct perf_counter_context *ctx, - struct perf_cpu_context *cpuctx, int cpu) -{ - struct perf_counter *counter; - int can_add_hw = 1; - - spin_lock(&ctx->lock); - ctx->is_active = 1; - if (likely(!ctx->nr_counters)) - goto out; - - ctx->timestamp = perf_clock(); - - perf_disable(); - - /* - * First go through the list and put on any pinned groups - * in order to give them the best chance of going on. - */ - list_for_each_entry(counter, &ctx->counter_list, list_entry) { - if (counter->state <= PERF_COUNTER_STATE_OFF || - !counter->attr.pinned) - continue; - if (counter->cpu != -1 && counter->cpu != cpu) - continue; - - if (counter != counter->group_leader) - counter_sched_in(counter, cpuctx, ctx, cpu); - else { - if (group_can_go_on(counter, cpuctx, 1)) - group_sched_in(counter, cpuctx, ctx, cpu); - } - - /* - * If this pinned group hasn't been scheduled, - * put it in error state. - */ - if (counter->state == PERF_COUNTER_STATE_INACTIVE) { - update_group_times(counter); - counter->state = PERF_COUNTER_STATE_ERROR; - } - } - - list_for_each_entry(counter, &ctx->counter_list, list_entry) { - /* - * Ignore counters in OFF or ERROR state, and - * ignore pinned counters since we did them already. - */ - if (counter->state <= PERF_COUNTER_STATE_OFF || - counter->attr.pinned) - continue; - - /* - * Listen to the 'cpu' scheduling filter constraint - * of counters: - */ - if (counter->cpu != -1 && counter->cpu != cpu) - continue; - - if (counter != counter->group_leader) { - if (counter_sched_in(counter, cpuctx, ctx, cpu)) - can_add_hw = 0; - } else { - if (group_can_go_on(counter, cpuctx, can_add_hw)) { - if (group_sched_in(counter, cpuctx, ctx, cpu)) - can_add_hw = 0; - } - } - } - perf_enable(); - out: - spin_unlock(&ctx->lock); -} - -/* - * Called from scheduler to add the counters of the current task - * with interrupts disabled. - * - * We restore the counter value and then enable it. - * - * This does not protect us against NMI, but enable() - * sets the enabled bit in the control field of counter _before_ - * accessing the counter control register. If a NMI hits, then it will - * keep the counter running. - */ -void perf_counter_task_sched_in(struct task_struct *task, int cpu) -{ - struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); - struct perf_counter_context *ctx = task->perf_counter_ctxp; - - if (likely(!ctx)) - return; - if (cpuctx->task_ctx == ctx) - return; - __perf_counter_sched_in(ctx, cpuctx, cpu); - cpuctx->task_ctx = ctx; -} - -static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu) -{ - struct perf_counter_context *ctx = &cpuctx->ctx; - - __perf_counter_sched_in(ctx, cpuctx, cpu); -} - -#define MAX_INTERRUPTS (~0ULL) - -static void perf_log_throttle(struct perf_counter *counter, int enable); - -static void perf_adjust_period(struct perf_counter *counter, u64 events) -{ - struct hw_perf_counter *hwc = &counter->hw; - u64 period, sample_period; - s64 delta; - - events *= hwc->sample_period; - period = div64_u64(events, counter->attr.sample_freq); - - delta = (s64)(period - hwc->sample_period); - delta = (delta + 7) / 8; /* low pass filter */ - - sample_period = hwc->sample_period + delta; - - if (!sample_period) - sample_period = 1; - - hwc->sample_period = sample_period; -} - -static void perf_ctx_adjust_freq(struct perf_counter_context *ctx) -{ - struct perf_counter *counter; - struct hw_perf_counter *hwc; - u64 interrupts, freq; - - spin_lock(&ctx->lock); - list_for_each_entry(counter, &ctx->counter_list, list_entry) { - if (counter->state != PERF_COUNTER_STATE_ACTIVE) - continue; - - hwc = &counter->hw; - - interrupts = hwc->interrupts; - hwc->interrupts = 0; - - /* - * unthrottle counters on the tick - */ - if (interrupts == MAX_INTERRUPTS) { - perf_log_throttle(counter, 1); - counter->pmu->unthrottle(counter); - interrupts = 2*sysctl_perf_counter_sample_rate/HZ; - } - - if (!counter->attr.freq || !counter->attr.sample_freq) - continue; - - /* - * if the specified freq < HZ then we need to skip ticks - */ - if (counter->attr.sample_freq < HZ) { - freq = counter->attr.sample_freq; - - hwc->freq_count += freq; - hwc->freq_interrupts += interrupts; - - if (hwc->freq_count < HZ) - continue; - - interrupts = hwc->freq_interrupts; - hwc->freq_interrupts = 0; - hwc->freq_count -= HZ; - } else - freq = HZ; - - perf_adjust_period(counter, freq * interrupts); - - /* - * In order to avoid being stalled by an (accidental) huge - * sample period, force reset the sample period if we didn't - * get any events in this freq period. - */ - if (!interrupts) { - perf_disable(); - counter->pmu->disable(counter); - atomic64_set(&hwc->period_left, 0); - counter->pmu->enable(counter); - perf_enable(); - } - } - spin_unlock(&ctx->lock); -} - -/* - * Round-robin a context's counters: - */ -static void rotate_ctx(struct perf_counter_context *ctx) -{ - struct perf_counter *counter; - - if (!ctx->nr_counters) - return; - - spin_lock(&ctx->lock); - /* - * Rotate the first entry last (works just fine for group counters too): - */ - perf_disable(); - list_for_each_entry(counter, &ctx->counter_list, list_entry) { - list_move_tail(&counter->list_entry, &ctx->counter_list); - break; - } - perf_enable(); - - spin_unlock(&ctx->lock); -} - -void perf_counter_task_tick(struct task_struct *curr, int cpu) -{ - struct perf_cpu_context *cpuctx; - struct perf_counter_context *ctx; - - if (!atomic_read(&nr_counters)) - return; - - cpuctx = &per_cpu(perf_cpu_context, cpu); - ctx = curr->perf_counter_ctxp; - - perf_ctx_adjust_freq(&cpuctx->ctx); - if (ctx) - perf_ctx_adjust_freq(ctx); - - perf_counter_cpu_sched_out(cpuctx); - if (ctx) - __perf_counter_task_sched_out(ctx); - - rotate_ctx(&cpuctx->ctx); - if (ctx) - rotate_ctx(ctx); - - perf_counter_cpu_sched_in(cpuctx, cpu); - if (ctx) - perf_counter_task_sched_in(curr, cpu); -} - -/* - * Enable all of a task's counters that have been marked enable-on-exec. - * This expects task == current. - */ -static void perf_counter_enable_on_exec(struct task_struct *task) -{ - struct perf_counter_context *ctx; - struct perf_counter *counter; - unsigned long flags; - int enabled = 0; - - local_irq_save(flags); - ctx = task->perf_counter_ctxp; - if (!ctx || !ctx->nr_counters) - goto out; - - __perf_counter_task_sched_out(ctx); - - spin_lock(&ctx->lock); - - list_for_each_entry(counter, &ctx->counter_list, list_entry) { - if (!counter->attr.enable_on_exec) - continue; - counter->attr.enable_on_exec = 0; - if (counter->state >= PERF_COUNTER_STATE_INACTIVE) - continue; - __perf_counter_mark_enabled(counter, ctx); - enabled = 1; - } - - /* - * Unclone this context if we enabled any counter. - */ - if (enabled) - unclone_ctx(ctx); - - spin_unlock(&ctx->lock); - - perf_counter_task_sched_in(task, smp_processor_id()); - out: - local_irq_restore(flags); -} - -/* - * Cross CPU call to read the hardware counter - */ -static void __perf_counter_read(void *info) -{ - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); - struct perf_counter *counter = info; - struct perf_counter_context *ctx = counter->ctx; - unsigned long flags; - - /* - * If this is a task context, we need to check whether it is - * the current task context of this cpu. If not it has been - * scheduled out before the smp call arrived. In that case - * counter->count would have been updated to a recent sample - * when the counter was scheduled out. - */ - if (ctx->task && cpuctx->task_ctx != ctx) - return; - - local_irq_save(flags); - if (ctx->is_active) - update_context_time(ctx); - counter->pmu->read(counter); - update_counter_times(counter); - local_irq_restore(flags); -} - -static u64 perf_counter_read(struct perf_counter *counter) -{ - /* - * If counter is enabled and currently active on a CPU, update the - * value in the counter structure: - */ - if (counter->state == PERF_COUNTER_STATE_ACTIVE) { - smp_call_function_single(counter->oncpu, - __perf_counter_read, counter, 1); - } else if (counter->state == PERF_COUNTER_STATE_INACTIVE) { - update_counter_times(counter); - } - - return atomic64_read(&counter->count); -} - -/* - * Initialize the perf_counter context in a task_struct: - */ -static void -__perf_counter_init_context(struct perf_counter_context *ctx, - struct task_struct *task) -{ - memset(ctx, 0, sizeof(*ctx)); - spin_lock_init(&ctx->lock); - mutex_init(&ctx->mutex); - INIT_LIST_HEAD(&ctx->counter_list); - INIT_LIST_HEAD(&ctx->event_list); - atomic_set(&ctx->refcount, 1); - ctx->task = task; -} - -static struct perf_counter_context *find_get_context(pid_t pid, int cpu) -{ - struct perf_counter_context *ctx; - struct perf_cpu_context *cpuctx; - struct task_struct *task; - unsigned long flags; - int err; - - /* - * If cpu is not a wildcard then this is a percpu counter: - */ - if (cpu != -1) { - /* Must be root to operate on a CPU counter: */ - if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) - return ERR_PTR(-EACCES); - - if (cpu < 0 || cpu > num_possible_cpus()) - return ERR_PTR(-EINVAL); - - /* - * We could be clever and allow to attach a counter to an - * offline CPU and activate it when the CPU comes up, but - * that's for later. - */ - if (!cpu_isset(cpu, cpu_online_map)) - return ERR_PTR(-ENODEV); - - cpuctx = &per_cpu(perf_cpu_context, cpu); - ctx = &cpuctx->ctx; - get_ctx(ctx); - - return ctx; - } - - rcu_read_lock(); - if (!pid) - task = current; - else - task = find_task_by_vpid(pid); - if (task) - get_task_struct(task); - rcu_read_unlock(); - - if (!task) - return ERR_PTR(-ESRCH); - - /* - * Can't attach counters to a dying task. - */ - err = -ESRCH; - if (task->flags & PF_EXITING) - goto errout; - - /* Reuse ptrace permission checks for now. */ - err = -EACCES; - if (!ptrace_may_access(task, PTRACE_MODE_READ)) - goto errout; - - retry: - ctx = perf_lock_task_context(task, &flags); - if (ctx) { - unclone_ctx(ctx); - spin_unlock_irqrestore(&ctx->lock, flags); - } - - if (!ctx) { - ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL); - err = -ENOMEM; - if (!ctx) - goto errout; - __perf_counter_init_context(ctx, task); - get_ctx(ctx); - if (cmpxchg(&task->perf_counter_ctxp, NULL, ctx)) { - /* - * We raced with some other task; use - * the context they set. - */ - kfree(ctx); - goto retry; - } - get_task_struct(task); - } - - put_task_struct(task); - return ctx; - - errout: - put_task_struct(task); - return ERR_PTR(err); -} - -static void free_counter_rcu(struct rcu_head *head) -{ - struct perf_counter *counter; - - counter = container_of(head, struct perf_counter, rcu_head); - if (counter->ns) - put_pid_ns(counter->ns); - kfree(counter); -} - -static void perf_pending_sync(struct perf_counter *counter); - -static void free_counter(struct perf_counter *counter) -{ - perf_pending_sync(counter); - - if (!counter->parent) { - atomic_dec(&nr_counters); - if (counter->attr.mmap) - atomic_dec(&nr_mmap_counters); - if (counter->attr.comm) - atomic_dec(&nr_comm_counters); - if (counter->attr.task) - atomic_dec(&nr_task_counters); - } - - if (counter->output) { - fput(counter->output->filp); - counter->output = NULL; - } - - if (counter->destroy) - counter->destroy(counter); - - put_ctx(counter->ctx); - call_rcu(&counter->rcu_head, free_counter_rcu); -} - -/* - * Called when the last reference to the file is gone. - */ -static int perf_release(struct inode *inode, struct file *file) -{ - struct perf_counter *counter = file->private_data; - struct perf_counter_context *ctx = counter->ctx; - - file->private_data = NULL; - - WARN_ON_ONCE(ctx->parent_ctx); - mutex_lock(&ctx->mutex); - perf_counter_remove_from_context(counter); - mutex_unlock(&ctx->mutex); - - mutex_lock(&counter->owner->perf_counter_mutex); - list_del_init(&counter->owner_entry); - mutex_unlock(&counter->owner->perf_counter_mutex); - put_task_struct(counter->owner); - - free_counter(counter); - - return 0; -} - -static int perf_counter_read_size(struct perf_counter *counter) -{ - int entry = sizeof(u64); /* value */ - int size = 0; - int nr = 1; - - if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) - size += sizeof(u64); - - if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) - size += sizeof(u64); - - if (counter->attr.read_format & PERF_FORMAT_ID) - entry += sizeof(u64); - - if (counter->attr.read_format & PERF_FORMAT_GROUP) { - nr += counter->group_leader->nr_siblings; - size += sizeof(u64); - } - - size += entry * nr; - - return size; -} - -static u64 perf_counter_read_value(struct perf_counter *counter) -{ - struct perf_counter *child; - u64 total = 0; - - total += perf_counter_read(counter); - list_for_each_entry(child, &counter->child_list, child_list) - total += perf_counter_read(child); - - return total; -} - -static int perf_counter_read_entry(struct perf_counter *counter, - u64 read_format, char __user *buf) -{ - int n = 0, count = 0; - u64 values[2]; - - values[n++] = perf_counter_read_value(counter); - if (read_format & PERF_FORMAT_ID) - values[n++] = primary_counter_id(counter); - - count = n * sizeof(u64); - - if (copy_to_user(buf, values, count)) - return -EFAULT; - - return count; -} - -static int perf_counter_read_group(struct perf_counter *counter, - u64 read_format, char __user *buf) -{ - struct perf_counter *leader = counter->group_leader, *sub; - int n = 0, size = 0, err = -EFAULT; - u64 values[3]; - - values[n++] = 1 + leader->nr_siblings; - if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { - values[n++] = leader->total_time_enabled + - atomic64_read(&leader->child_total_time_enabled); - } - if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { - values[n++] = leader->total_time_running + - atomic64_read(&leader->child_total_time_running); - } - - size = n * sizeof(u64); - - if (copy_to_user(buf, values, size)) - return -EFAULT; - - err = perf_counter_read_entry(leader, read_format, buf + size); - if (err < 0) - return err; - - size += err; - - list_for_each_entry(sub, &leader->sibling_list, list_entry) { - err = perf_counter_read_entry(sub, read_format, - buf + size); - if (err < 0) - return err; - - size += err; - } - - return size; -} - -static int perf_counter_read_one(struct perf_counter *counter, - u64 read_format, char __user *buf) -{ - u64 values[4]; - int n = 0; - - values[n++] = perf_counter_read_value(counter); - if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { - values[n++] = counter->total_time_enabled + - atomic64_read(&counter->child_total_time_enabled); - } - if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { - values[n++] = counter->total_time_running + - atomic64_read(&counter->child_total_time_running); - } - if (read_format & PERF_FORMAT_ID) - values[n++] = primary_counter_id(counter); - - if (copy_to_user(buf, values, n * sizeof(u64))) - return -EFAULT; - - return n * sizeof(u64); -} - -/* - * Read the performance counter - simple non blocking version for now - */ -static ssize_t -perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) -{ - u64 read_format = counter->attr.read_format; - int ret; - - /* - * Return end-of-file for a read on a counter that is in - * error state (i.e. because it was pinned but it couldn't be - * scheduled on to the CPU at some point). - */ - if (counter->state == PERF_COUNTER_STATE_ERROR) - return 0; - - if (count < perf_counter_read_size(counter)) - return -ENOSPC; - - WARN_ON_ONCE(counter->ctx->parent_ctx); - mutex_lock(&counter->child_mutex); - if (read_format & PERF_FORMAT_GROUP) - ret = perf_counter_read_group(counter, read_format, buf); - else - ret = perf_counter_read_one(counter, read_format, buf); - mutex_unlock(&counter->child_mutex); - - return ret; -} - -static ssize_t -perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) -{ - struct perf_counter *counter = file->private_data; - - return perf_read_hw(counter, buf, count); -} - -static unsigned int perf_poll(struct file *file, poll_table *wait) -{ - struct perf_counter *counter = file->private_data; - struct perf_mmap_data *data; - unsigned int events = POLL_HUP; - - rcu_read_lock(); - data = rcu_dereference(counter->data); - if (data) - events = atomic_xchg(&data->poll, 0); - rcu_read_unlock(); - - poll_wait(file, &counter->waitq, wait); - - return events; -} - -static void perf_counter_reset(struct perf_counter *counter) -{ - (void)perf_counter_read(counter); - atomic64_set(&counter->count, 0); - perf_counter_update_userpage(counter); -} - -/* - * Holding the top-level counter's child_mutex means that any - * descendant process that has inherited this counter will block - * in sync_child_counter if it goes to exit, thus satisfying the - * task existence requirements of perf_counter_enable/disable. - */ -static void perf_counter_for_each_child(struct perf_counter *counter, - void (*func)(struct perf_counter *)) -{ - struct perf_counter *child; - - WARN_ON_ONCE(counter->ctx->parent_ctx); - mutex_lock(&counter->child_mutex); - func(counter); - list_for_each_entry(child, &counter->child_list, child_list) - func(child); - mutex_unlock(&counter->child_mutex); -} - -static void perf_counter_for_each(struct perf_counter *counter, - void (*func)(struct perf_counter *)) -{ - struct perf_counter_context *ctx = counter->ctx; - struct perf_counter *sibling; - - WARN_ON_ONCE(ctx->parent_ctx); - mutex_lock(&ctx->mutex); - counter = counter->group_leader; - - perf_counter_for_each_child(counter, func); - func(counter); - list_for_each_entry(sibling, &counter->sibling_list, list_entry) - perf_counter_for_each_child(counter, func); - mutex_unlock(&ctx->mutex); -} - -static int perf_counter_period(struct perf_counter *counter, u64 __user *arg) -{ - struct perf_counter_context *ctx = counter->ctx; - unsigned long size; - int ret = 0; - u64 value; - - if (!counter->attr.sample_period) - return -EINVAL; - - size = copy_from_user(&value, arg, sizeof(value)); - if (size != sizeof(value)) - return -EFAULT; - - if (!value) - return -EINVAL; - - spin_lock_irq(&ctx->lock); - if (counter->attr.freq) { - if (value > sysctl_perf_counter_sample_rate) { - ret = -EINVAL; - goto unlock; - } - - counter->attr.sample_freq = value; - } else { - counter->attr.sample_period = value; - counter->hw.sample_period = value; - } -unlock: - spin_unlock_irq(&ctx->lock); - - return ret; -} - -int perf_counter_set_output(struct perf_counter *counter, int output_fd); - -static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) -{ - struct perf_counter *counter = file->private_data; - void (*func)(struct perf_counter *); - u32 flags = arg; - - switch (cmd) { - case PERF_COUNTER_IOC_ENABLE: - func = perf_counter_enable; - break; - case PERF_COUNTER_IOC_DISABLE: - func = perf_counter_disable; - break; - case PERF_COUNTER_IOC_RESET: - func = perf_counter_reset; - break; - - case PERF_COUNTER_IOC_REFRESH: - return perf_counter_refresh(counter, arg); - - case PERF_COUNTER_IOC_PERIOD: - return perf_counter_period(counter, (u64 __user *)arg); - - case PERF_COUNTER_IOC_SET_OUTPUT: - return perf_counter_set_output(counter, arg); - - default: - return -ENOTTY; - } - - if (flags & PERF_IOC_FLAG_GROUP) - perf_counter_for_each(counter, func); - else - perf_counter_for_each_child(counter, func); - - return 0; -} - -int perf_counter_task_enable(void) -{ - struct perf_counter *counter; - - mutex_lock(¤t->perf_counter_mutex); - list_for_each_entry(counter, ¤t->perf_counter_list, owner_entry) - perf_counter_for_each_child(counter, perf_counter_enable); - mutex_unlock(¤t->perf_counter_mutex); - - return 0; -} - -int perf_counter_task_disable(void) -{ - struct perf_counter *counter; - - mutex_lock(¤t->perf_counter_mutex); - list_for_each_entry(counter, ¤t->perf_counter_list, owner_entry) - perf_counter_for_each_child(counter, perf_counter_disable); - mutex_unlock(¤t->perf_counter_mutex); - - return 0; -} - -#ifndef PERF_COUNTER_INDEX_OFFSET -# define PERF_COUNTER_INDEX_OFFSET 0 -#endif - -static int perf_counter_index(struct perf_counter *counter) -{ - if (counter->state != PERF_COUNTER_STATE_ACTIVE) - return 0; - - return counter->hw.idx + 1 - PERF_COUNTER_INDEX_OFFSET; -} - -/* - * Callers need to ensure there can be no nesting of this function, otherwise - * the seqlock logic goes bad. We can not serialize this because the arch - * code calls this from NMI context. - */ -void perf_counter_update_userpage(struct perf_counter *counter) -{ - struct perf_counter_mmap_page *userpg; - struct perf_mmap_data *data; - - rcu_read_lock(); - data = rcu_dereference(counter->data); - if (!data) - goto unlock; - - userpg = data->user_page; - - /* - * Disable preemption so as to not let the corresponding user-space - * spin too long if we get preempted. - */ - preempt_disable(); - ++userpg->lock; - barrier(); - userpg->index = perf_counter_index(counter); - userpg->offset = atomic64_read(&counter->count); - if (counter->state == PERF_COUNTER_STATE_ACTIVE) - userpg->offset -= atomic64_read(&counter->hw.prev_count); - - userpg->time_enabled = counter->total_time_enabled + - atomic64_read(&counter->child_total_time_enabled); - - userpg->time_running = counter->total_time_running + - atomic64_read(&counter->child_total_time_running); - - barrier(); - ++userpg->lock; - preempt_enable(); -unlock: - rcu_read_unlock(); -} - -static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) -{ - struct perf_counter *counter = vma->vm_file->private_data; - struct perf_mmap_data *data; - int ret = VM_FAULT_SIGBUS; - - if (vmf->flags & FAULT_FLAG_MKWRITE) { - if (vmf->pgoff == 0) - ret = 0; - return ret; - } - - rcu_read_lock(); - data = rcu_dereference(counter->data); - if (!data) - goto unlock; - - if (vmf->pgoff == 0) { - vmf->page = virt_to_page(data->user_page); - } else { - int nr = vmf->pgoff - 1; - - if ((unsigned)nr > data->nr_pages) - goto unlock; - - if (vmf->flags & FAULT_FLAG_WRITE) - goto unlock; - - vmf->page = virt_to_page(data->data_pages[nr]); - } - - get_page(vmf->page); - vmf->page->mapping = vma->vm_file->f_mapping; - vmf->page->index = vmf->pgoff; - - ret = 0; -unlock: - rcu_read_unlock(); - - return ret; -} - -static int perf_mmap_data_alloc(struct perf_counter *counter, int nr_pages) -{ - struct perf_mmap_data *data; - unsigned long size; - int i; - - WARN_ON(atomic_read(&counter->mmap_count)); - - size = sizeof(struct perf_mmap_data); - size += nr_pages * sizeof(void *); - - data = kzalloc(size, GFP_KERNEL); - if (!data) - goto fail; - - data->user_page = (void *)get_zeroed_page(GFP_KERNEL); - if (!data->user_page) - goto fail_user_page; - - for (i = 0; i < nr_pages; i++) { - data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL); - if (!data->data_pages[i]) - goto fail_data_pages; - } - - data->nr_pages = nr_pages; - atomic_set(&data->lock, -1); - - rcu_assign_pointer(counter->data, data); - - return 0; - -fail_data_pages: - for (i--; i >= 0; i--) - free_page((unsigned long)data->data_pages[i]); - - free_page((unsigned long)data->user_page); - -fail_user_page: - kfree(data); - -fail: - return -ENOMEM; -} - -static void perf_mmap_free_page(unsigned long addr) -{ - struct page *page = virt_to_page((void *)addr); - - page->mapping = NULL; - __free_page(page); -} - -static void __perf_mmap_data_free(struct rcu_head *rcu_head) -{ - struct perf_mmap_data *data; - int i; - - data = container_of(rcu_head, struct perf_mmap_data, rcu_head); - - perf_mmap_free_page((unsigned long)data->user_page); - for (i = 0; i < data->nr_pages; i++) - perf_mmap_free_page((unsigned long)data->data_pages[i]); - - kfree(data); -} - -static void perf_mmap_data_free(struct perf_counter *counter) -{ - struct perf_mmap_data *data = counter->data; - - WARN_ON(atomic_read(&counter->mmap_count)); - - rcu_assign_pointer(counter->data, NULL); - call_rcu(&data->rcu_head, __perf_mmap_data_free); -} - -static void perf_mmap_open(struct vm_area_struct *vma) -{ - struct perf_counter *counter = vma->vm_file->private_data; - - atomic_inc(&counter->mmap_count); -} - -static void perf_mmap_close(struct vm_area_struct *vma) -{ - struct perf_counter *counter = vma->vm_file->private_data; - - WARN_ON_ONCE(counter->ctx->parent_ctx); - if (atomic_dec_and_mutex_lock(&counter->mmap_count, &counter->mmap_mutex)) { - struct user_struct *user = current_user(); - - atomic_long_sub(counter->data->nr_pages + 1, &user->locked_vm); - vma->vm_mm->locked_vm -= counter->data->nr_locked; - perf_mmap_data_free(counter); - mutex_unlock(&counter->mmap_mutex); - } -} - -static struct vm_operations_struct perf_mmap_vmops = { - .open = perf_mmap_open, - .close = perf_mmap_close, - .fault = perf_mmap_fault, - .page_mkwrite = perf_mmap_fault, -}; - -static int perf_mmap(struct file *file, struct vm_area_struct *vma) -{ - struct perf_counter *counter = file->private_data; - unsigned long user_locked, user_lock_limit; - struct user_struct *user = current_user(); - unsigned long locked, lock_limit; - unsigned long vma_size; - unsigned long nr_pages; - long user_extra, extra; - int ret = 0; - - if (!(vma->vm_flags & VM_SHARED)) - return -EINVAL; - - vma_size = vma->vm_end - vma->vm_start; - nr_pages = (vma_size / PAGE_SIZE) - 1; - - /* - * If we have data pages ensure they're a power-of-two number, so we - * can do bitmasks instead of modulo. - */ - if (nr_pages != 0 && !is_power_of_2(nr_pages)) - return -EINVAL; - - if (vma_size != PAGE_SIZE * (1 + nr_pages)) - return -EINVAL; - - if (vma->vm_pgoff != 0) - return -EINVAL; - - WARN_ON_ONCE(counter->ctx->parent_ctx); - mutex_lock(&counter->mmap_mutex); - if (counter->output) { - ret = -EINVAL; - goto unlock; - } - - if (atomic_inc_not_zero(&counter->mmap_count)) { - if (nr_pages != counter->data->nr_pages) - ret = -EINVAL; - goto unlock; - } - - user_extra = nr_pages + 1; - user_lock_limit = sysctl_perf_counter_mlock >> (PAGE_SHIFT - 10); - - /* - * Increase the limit linearly with more CPUs: - */ - user_lock_limit *= num_online_cpus(); - - user_locked = atomic_long_read(&user->locked_vm) + user_extra; - - extra = 0; - if (user_locked > user_lock_limit) - extra = user_locked - user_lock_limit; - - lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; - lock_limit >>= PAGE_SHIFT; - locked = vma->vm_mm->locked_vm + extra; - - if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) { - ret = -EPERM; - goto unlock; - } - - WARN_ON(counter->data); - ret = perf_mmap_data_alloc(counter, nr_pages); - if (ret) - goto unlock; - - atomic_set(&counter->mmap_count, 1); - atomic_long_add(user_extra, &user->locked_vm); - vma->vm_mm->locked_vm += extra; - counter->data->nr_locked = extra; - if (vma->vm_flags & VM_WRITE) - counter->data->writable = 1; - -unlock: - mutex_unlock(&counter->mmap_mutex); - - vma->vm_flags |= VM_RESERVED; - vma->vm_ops = &perf_mmap_vmops; - - return ret; -} - -static int perf_fasync(int fd, struct file *filp, int on) -{ - struct inode *inode = filp->f_path.dentry->d_inode; - struct perf_counter *counter = filp->private_data; - int retval; - - mutex_lock(&inode->i_mutex); - retval = fasync_helper(fd, filp, on, &counter->fasync); - mutex_unlock(&inode->i_mutex); - - if (retval < 0) - return retval; - - return 0; -} - -static const struct file_operations perf_fops = { - .release = perf_release, - .read = perf_read, - .poll = perf_poll, - .unlocked_ioctl = perf_ioctl, - .compat_ioctl = perf_ioctl, - .mmap = perf_mmap, - .fasync = perf_fasync, -}; - -/* - * Perf counter wakeup - * - * If there's data, ensure we set the poll() state and publish everything - * to user-space before waking everybody up. - */ - -void perf_counter_wakeup(struct perf_counter *counter) -{ - wake_up_all(&counter->waitq); - - if (counter->pending_kill) { - kill_fasync(&counter->fasync, SIGIO, counter->pending_kill); - counter->pending_kill = 0; - } -} - -/* - * Pending wakeups - * - * Handle the case where we need to wakeup up from NMI (or rq->lock) context. - * - * The NMI bit means we cannot possibly take locks. Therefore, maintain a - * single linked list and use cmpxchg() to add entries lockless. - */ - -static void perf_pending_counter(struct perf_pending_entry *entry) -{ - struct perf_counter *counter = container_of(entry, - struct perf_counter, pending); - - if (counter->pending_disable) { - counter->pending_disable = 0; - __perf_counter_disable(counter); - } - - if (counter->pending_wakeup) { - counter->pending_wakeup = 0; - perf_counter_wakeup(counter); - } -} - -#define PENDING_TAIL ((struct perf_pending_entry *)-1UL) - -static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = { - PENDING_TAIL, -}; - -static void perf_pending_queue(struct perf_pending_entry *entry, - void (*func)(struct perf_pending_entry *)) -{ - struct perf_pending_entry **head; - - if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL) - return; - - entry->func = func; - - head = &get_cpu_var(perf_pending_head); - - do { - entry->next = *head; - } while (cmpxchg(head, entry->next, entry) != entry->next); - - set_perf_counter_pending(); - - put_cpu_var(perf_pending_head); -} - -static int __perf_pending_run(void) -{ - struct perf_pending_entry *list; - int nr = 0; - - list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL); - while (list != PENDING_TAIL) { - void (*func)(struct perf_pending_entry *); - struct perf_pending_entry *entry = list; - - list = list->next; - - func = entry->func; - entry->next = NULL; - /* - * Ensure we observe the unqueue before we issue the wakeup, - * so that we won't be waiting forever. - * -- see perf_not_pending(). - */ - smp_wmb(); - - func(entry); - nr++; - } - - return nr; -} - -static inline int perf_not_pending(struct perf_counter *counter) -{ - /* - * If we flush on whatever cpu we run, there is a chance we don't - * need to wait. - */ - get_cpu(); - __perf_pending_run(); - put_cpu(); - - /* - * Ensure we see the proper queue state before going to sleep - * so that we do not miss the wakeup. -- see perf_pending_handle() - */ - smp_rmb(); - return counter->pending.next == NULL; -} - -static void perf_pending_sync(struct perf_counter *counter) -{ - wait_event(counter->waitq, perf_not_pending(counter)); -} - -void perf_counter_do_pending(void) -{ - __perf_pending_run(); -} - -/* - * Callchain support -- arch specific - */ - -__weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) -{ - return NULL; -} - -/* - * Output - */ - -struct perf_output_handle { - struct perf_counter *counter; - struct perf_mmap_data *data; - unsigned long head; - unsigned long offset; - int nmi; - int sample; - int locked; - unsigned long flags; -}; - -static bool perf_output_space(struct perf_mmap_data *data, - unsigned int offset, unsigned int head) -{ - unsigned long tail; - unsigned long mask; - - if (!data->writable) - return true; - - mask = (data->nr_pages << PAGE_SHIFT) - 1; - /* - * Userspace could choose to issue a mb() before updating the tail - * pointer. So that all reads will be completed before the write is - * issued. - */ - tail = ACCESS_ONCE(data->user_page->data_tail); - smp_rmb(); - - offset = (offset - tail) & mask; - head = (head - tail) & mask; - - if ((int)(head - offset) < 0) - return false; - - return true; -} - -static void perf_output_wakeup(struct perf_output_handle *handle) -{ - atomic_set(&handle->data->poll, POLL_IN); - - if (handle->nmi) { - handle->counter->pending_wakeup = 1; - perf_pending_queue(&handle->counter->pending, - perf_pending_counter); - } else - perf_counter_wakeup(handle->counter); -} - -/* - * Curious locking construct. - * - * We need to ensure a later event doesn't publish a head when a former - * event isn't done writing. However since we need to deal with NMIs we - * cannot fully serialize things. - * - * What we do is serialize between CPUs so we only have to deal with NMI - * nesting on a single CPU. - * - * We only publish the head (and generate a wakeup) when the outer-most - * event completes. - */ -static void perf_output_lock(struct perf_output_handle *handle) -{ - struct perf_mmap_data *data = handle->data; - int cpu; - - handle->locked = 0; - - local_irq_save(handle->flags); - cpu = smp_processor_id(); - - if (in_nmi() && atomic_read(&data->lock) == cpu) - return; - - while (atomic_cmpxchg(&data->lock, -1, cpu) != -1) - cpu_relax(); - - handle->locked = 1; -} - -static void perf_output_unlock(struct perf_output_handle *handle) -{ - struct perf_mmap_data *data = handle->data; - unsigned long head; - int cpu; - - data->done_head = data->head; - - if (!handle->locked) - goto out; - -again: - /* - * The xchg implies a full barrier that ensures all writes are done - * before we publish the new head, matched by a rmb() in userspace when - * reading this position. - */ - while ((head = atomic_long_xchg(&data->done_head, 0))) - data->user_page->data_head = head; - - /* - * NMI can happen here, which means we can miss a done_head update. - */ - - cpu = atomic_xchg(&data->lock, -1); - WARN_ON_ONCE(cpu != smp_processor_id()); - - /* - * Therefore we have to validate we did not indeed do so. - */ - if (unlikely(atomic_long_read(&data->done_head))) { - /* - * Since we had it locked, we can lock it again. - */ - while (atomic_cmpxchg(&data->lock, -1, cpu) != -1) - cpu_relax(); - - goto again; - } - - if (atomic_xchg(&data->wakeup, 0)) - perf_output_wakeup(handle); -out: - local_irq_restore(handle->flags); -} - -static void perf_output_copy(struct perf_output_handle *handle, - const void *buf, unsigned int len) -{ - unsigned int pages_mask; - unsigned int offset; - unsigned int size; - void **pages; - - offset = handle->offset; - pages_mask = handle->data->nr_pages - 1; - pages = handle->data->data_pages; - - do { - unsigned int page_offset; - int nr; - - nr = (offset >> PAGE_SHIFT) & pages_mask; - page_offset = offset & (PAGE_SIZE - 1); - size = min_t(unsigned int, PAGE_SIZE - page_offset, len); - - memcpy(pages[nr] + page_offset, buf, size); - - len -= size; - buf += size; - offset += size; - } while (len); - - handle->offset = offset; - - /* - * Check we didn't copy past our reservation window, taking the - * possible unsigned int wrap into account. - */ - WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0); -} - -#define perf_output_put(handle, x) \ - perf_output_copy((handle), &(x), sizeof(x)) - -static int perf_output_begin(struct perf_output_handle *handle, - struct perf_counter *counter, unsigned int size, - int nmi, int sample) -{ - struct perf_counter *output_counter; - struct perf_mmap_data *data; - unsigned int offset, head; - int have_lost; - struct { - struct perf_event_header header; - u64 id; - u64 lost; - } lost_event; - - rcu_read_lock(); - /* - * For inherited counters we send all the output towards the parent. - */ - if (counter->parent) - counter = counter->parent; - - output_counter = rcu_dereference(counter->output); - if (output_counter) - counter = output_counter; - - data = rcu_dereference(counter->data); - if (!data) - goto out; - - handle->data = data; - handle->counter = counter; - handle->nmi = nmi; - handle->sample = sample; - - if (!data->nr_pages) - goto fail; - - have_lost = atomic_read(&data->lost); - if (have_lost) - size += sizeof(lost_event); - - perf_output_lock(handle); - - do { - offset = head = atomic_long_read(&data->head); - head += size; - if (unlikely(!perf_output_space(data, offset, head))) - goto fail; - } while (atomic_long_cmpxchg(&data->head, offset, head) != offset); - - handle->offset = offset; - handle->head = head; - - if ((offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT)) - atomic_set(&data->wakeup, 1); - - if (have_lost) { - lost_event.header.type = PERF_EVENT_LOST; - lost_event.header.misc = 0; - lost_event.header.size = sizeof(lost_event); - lost_event.id = counter->id; - lost_event.lost = atomic_xchg(&data->lost, 0); - - perf_output_put(handle, lost_event); - } - - return 0; - -fail: - atomic_inc(&data->lost); - perf_output_unlock(handle); -out: - rcu_read_unlock(); - - return -ENOSPC; -} - -static void perf_output_end(struct perf_output_handle *handle) -{ - struct perf_counter *counter = handle->counter; - struct perf_mmap_data *data = handle->data; - - int wakeup_events = counter->attr.wakeup_events; - - if (handle->sample && wakeup_events) { - int events = atomic_inc_return(&data->events); - if (events >= wakeup_events) { - atomic_sub(wakeup_events, &data->events); - atomic_set(&data->wakeup, 1); - } - } - - perf_output_unlock(handle); - rcu_read_unlock(); -} - -static u32 perf_counter_pid(struct perf_counter *counter, struct task_struct *p) -{ - /* - * only top level counters have the pid namespace they were created in - */ - if (counter->parent) - counter = counter->parent; - - return task_tgid_nr_ns(p, counter->ns); -} - -static u32 perf_counter_tid(struct perf_counter *counter, struct task_struct *p) -{ - /* - * only top level counters have the pid namespace they were created in - */ - if (counter->parent) - counter = counter->parent; - - return task_pid_nr_ns(p, counter->ns); -} - -static void perf_output_read_one(struct perf_output_handle *handle, - struct perf_counter *counter) -{ - u64 read_format = counter->attr.read_format; - u64 values[4]; - int n = 0; - - values[n++] = atomic64_read(&counter->count); - if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { - values[n++] = counter->total_time_enabled + - atomic64_read(&counter->child_total_time_enabled); - } - if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { - values[n++] = counter->total_time_running + - atomic64_read(&counter->child_total_time_running); - } - if (read_format & PERF_FORMAT_ID) - values[n++] = primary_counter_id(counter); - - perf_output_copy(handle, values, n * sizeof(u64)); -} - -/* - * XXX PERF_FORMAT_GROUP vs inherited counters seems difficult. - */ -static void perf_output_read_group(struct perf_output_handle *handle, - struct perf_counter *counter) -{ - struct perf_counter *leader = counter->group_leader, *sub; - u64 read_format = counter->attr.read_format; - u64 values[5]; - int n = 0; - - values[n++] = 1 + leader->nr_siblings; - - if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) - values[n++] = leader->total_time_enabled; - - if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) - values[n++] = leader->total_time_running; - - if (leader != counter) - leader->pmu->read(leader); - - values[n++] = atomic64_read(&leader->count); - if (read_format & PERF_FORMAT_ID) - values[n++] = primary_counter_id(leader); - - perf_output_copy(handle, values, n * sizeof(u64)); - - list_for_each_entry(sub, &leader->sibling_list, list_entry) { - n = 0; - - if (sub != counter) - sub->pmu->read(sub); - - values[n++] = atomic64_read(&sub->count); - if (read_format & PERF_FORMAT_ID) - values[n++] = primary_counter_id(sub); - - perf_output_copy(handle, values, n * sizeof(u64)); - } -} - -static void perf_output_read(struct perf_output_handle *handle, - struct perf_counter *counter) -{ - if (counter->attr.read_format & PERF_FORMAT_GROUP) - perf_output_read_group(handle, counter); - else - perf_output_read_one(handle, counter); -} - -void perf_counter_output(struct perf_counter *counter, int nmi, - struct perf_sample_data *data) -{ - int ret; - u64 sample_type = counter->attr.sample_type; - struct perf_output_handle handle; - struct perf_event_header header; - u64 ip; - struct { - u32 pid, tid; - } tid_entry; - struct perf_callchain_entry *callchain = NULL; - int callchain_size = 0; - u64 time; - struct { - u32 cpu, reserved; - } cpu_entry; - - header.type = PERF_EVENT_SAMPLE; - header.size = sizeof(header); - - header.misc = 0; - header.misc |= perf_misc_flags(data->regs); - - if (sample_type & PERF_SAMPLE_IP) { - ip = perf_instruction_pointer(data->regs); - header.size += sizeof(ip); - } - - if (sample_type & PERF_SAMPLE_TID) { - /* namespace issues */ - tid_entry.pid = perf_counter_pid(counter, current); - tid_entry.tid = perf_counter_tid(counter, current); - - header.size += sizeof(tid_entry); - } - - if (sample_type & PERF_SAMPLE_TIME) { - /* - * Maybe do better on x86 and provide cpu_clock_nmi() - */ - time = sched_clock(); - - header.size += sizeof(u64); - } - - if (sample_type & PERF_SAMPLE_ADDR) - header.size += sizeof(u64); - - if (sample_type & PERF_SAMPLE_ID) - header.size += sizeof(u64); - - if (sample_type & PERF_SAMPLE_STREAM_ID) - header.size += sizeof(u64); - - if (sample_type & PERF_SAMPLE_CPU) { - header.size += sizeof(cpu_entry); - - cpu_entry.cpu = raw_smp_processor_id(); - cpu_entry.reserved = 0; - } - - if (sample_type & PERF_SAMPLE_PERIOD) - header.size += sizeof(u64); - - if (sample_type & PERF_SAMPLE_READ) - header.size += perf_counter_read_size(counter); - - if (sample_type & PERF_SAMPLE_CALLCHAIN) { - callchain = perf_callchain(data->regs); - - if (callchain) { - callchain_size = (1 + callchain->nr) * sizeof(u64); - header.size += callchain_size; - } else - header.size += sizeof(u64); - } - - if (sample_type & PERF_SAMPLE_RAW) { - int size = sizeof(u32); - - if (data->raw) - size += data->raw->size; - else - size += sizeof(u32); - - WARN_ON_ONCE(size & (sizeof(u64)-1)); - header.size += size; - } - - ret = perf_output_begin(&handle, counter, header.size, nmi, 1); - if (ret) - return; - - perf_output_put(&handle, header); - - if (sample_type & PERF_SAMPLE_IP) - perf_output_put(&handle, ip); - - if (sample_type & PERF_SAMPLE_TID) - perf_output_put(&handle, tid_entry); - - if (sample_type & PERF_SAMPLE_TIME) - perf_output_put(&handle, time); - - if (sample_type & PERF_SAMPLE_ADDR) - perf_output_put(&handle, data->addr); - - if (sample_type & PERF_SAMPLE_ID) { - u64 id = primary_counter_id(counter); - - perf_output_put(&handle, id); - } - - if (sample_type & PERF_SAMPLE_STREAM_ID) - perf_output_put(&handle, counter->id); - - if (sample_type & PERF_SAMPLE_CPU) - perf_output_put(&handle, cpu_entry); - - if (sample_type & PERF_SAMPLE_PERIOD) - perf_output_put(&handle, data->period); - - if (sample_type & PERF_SAMPLE_READ) - perf_output_read(&handle, counter); - - if (sample_type & PERF_SAMPLE_CALLCHAIN) { - if (callchain) - perf_output_copy(&handle, callchain, callchain_size); - else { - u64 nr = 0; - perf_output_put(&handle, nr); - } - } - - if (sample_type & PERF_SAMPLE_RAW) { - if (data->raw) { - perf_output_put(&handle, data->raw->size); - perf_output_copy(&handle, data->raw->data, data->raw->size); - } else { - struct { - u32 size; - u32 data; - } raw = { - .size = sizeof(u32), - .data = 0, - }; - perf_output_put(&handle, raw); - } - } - - perf_output_end(&handle); -} - -/* - * read event - */ - -struct perf_read_event { - struct perf_event_header header; - - u32 pid; - u32 tid; -}; - -static void -perf_counter_read_event(struct perf_counter *counter, - struct task_struct *task) -{ - struct perf_output_handle handle; - struct perf_read_event event = { - .header = { - .type = PERF_EVENT_READ, - .misc = 0, - .size = sizeof(event) + perf_counter_read_size(counter), - }, - .pid = perf_counter_pid(counter, task), - .tid = perf_counter_tid(counter, task), - }; - int ret; - - ret = perf_output_begin(&handle, counter, event.header.size, 0, 0); - if (ret) - return; - - perf_output_put(&handle, event); - perf_output_read(&handle, counter); - - perf_output_end(&handle); -} - -/* - * task tracking -- fork/exit - * - * enabled by: attr.comm | attr.mmap | attr.task - */ - -struct perf_task_event { - struct task_struct *task; - struct perf_counter_context *task_ctx; - - struct { - struct perf_event_header header; - - u32 pid; - u32 ppid; - u32 tid; - u32 ptid; - } event; -}; - -static void perf_counter_task_output(struct perf_counter *counter, - struct perf_task_event *task_event) -{ - struct perf_output_handle handle; - int size = task_event->event.header.size; - struct task_struct *task = task_event->task; - int ret = perf_output_begin(&handle, counter, size, 0, 0); - - if (ret) - return; - - task_event->event.pid = perf_counter_pid(counter, task); - task_event->event.ppid = perf_counter_pid(counter, current); - - task_event->event.tid = perf_counter_tid(counter, task); - task_event->event.ptid = perf_counter_tid(counter, current); - - perf_output_put(&handle, task_event->event); - perf_output_end(&handle); -} - -static int perf_counter_task_match(struct perf_counter *counter) -{ - if (counter->attr.comm || counter->attr.mmap || counter->attr.task) - return 1; - - return 0; -} - -static void perf_counter_task_ctx(struct perf_counter_context *ctx, - struct perf_task_event *task_event) -{ - struct perf_counter *counter; - - if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) - return; - - rcu_read_lock(); - list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { - if (perf_counter_task_match(counter)) - perf_counter_task_output(counter, task_event); - } - rcu_read_unlock(); -} - -static void perf_counter_task_event(struct perf_task_event *task_event) -{ - struct perf_cpu_context *cpuctx; - struct perf_counter_context *ctx = task_event->task_ctx; - - cpuctx = &get_cpu_var(perf_cpu_context); - perf_counter_task_ctx(&cpuctx->ctx, task_event); - put_cpu_var(perf_cpu_context); - - rcu_read_lock(); - if (!ctx) - ctx = rcu_dereference(task_event->task->perf_counter_ctxp); - if (ctx) - perf_counter_task_ctx(ctx, task_event); - rcu_read_unlock(); -} - -static void perf_counter_task(struct task_struct *task, - struct perf_counter_context *task_ctx, - int new) -{ - struct perf_task_event task_event; - - if (!atomic_read(&nr_comm_counters) && - !atomic_read(&nr_mmap_counters) && - !atomic_read(&nr_task_counters)) - return; - - task_event = (struct perf_task_event){ - .task = task, - .task_ctx = task_ctx, - .event = { - .header = { - .type = new ? PERF_EVENT_FORK : PERF_EVENT_EXIT, - .misc = 0, - .size = sizeof(task_event.event), - }, - /* .pid */ - /* .ppid */ - /* .tid */ - /* .ptid */ - }, - }; - - perf_counter_task_event(&task_event); -} - -void perf_counter_fork(struct task_struct *task) -{ - perf_counter_task(task, NULL, 1); -} - -/* - * comm tracking - */ - -struct perf_comm_event { - struct task_struct *task; - char *comm; - int comm_size; - - struct { - struct perf_event_header header; - - u32 pid; - u32 tid; - } event; -}; - -static void perf_counter_comm_output(struct perf_counter *counter, - struct perf_comm_event *comm_event) -{ - struct perf_output_handle handle; - int size = comm_event->event.header.size; - int ret = perf_output_begin(&handle, counter, size, 0, 0); - - if (ret) - return; - - comm_event->event.pid = perf_counter_pid(counter, comm_event->task); - comm_event->event.tid = perf_counter_tid(counter, comm_event->task); - - perf_output_put(&handle, comm_event->event); - perf_output_copy(&handle, comm_event->comm, - comm_event->comm_size); - perf_output_end(&handle); -} - -static int perf_counter_comm_match(struct perf_counter *counter) -{ - if (counter->attr.comm) - return 1; - - return 0; -} - -static void perf_counter_comm_ctx(struct perf_counter_context *ctx, - struct perf_comm_event *comm_event) -{ - struct perf_counter *counter; - - if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) - return; - - rcu_read_lock(); - list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { - if (perf_counter_comm_match(counter)) - perf_counter_comm_output(counter, comm_event); - } - rcu_read_unlock(); -} - -static void perf_counter_comm_event(struct perf_comm_event *comm_event) -{ - struct perf_cpu_context *cpuctx; - struct perf_counter_context *ctx; - unsigned int size; - char comm[TASK_COMM_LEN]; - - memset(comm, 0, sizeof(comm)); - strncpy(comm, comm_event->task->comm, sizeof(comm)); - size = ALIGN(strlen(comm)+1, sizeof(u64)); - - comm_event->comm = comm; - comm_event->comm_size = size; - - comm_event->event.header.size = sizeof(comm_event->event) + size; - - cpuctx = &get_cpu_var(perf_cpu_context); - perf_counter_comm_ctx(&cpuctx->ctx, comm_event); - put_cpu_var(perf_cpu_context); - - rcu_read_lock(); - /* - * doesn't really matter which of the child contexts the - * events ends up in. - */ - ctx = rcu_dereference(current->perf_counter_ctxp); - if (ctx) - perf_counter_comm_ctx(ctx, comm_event); - rcu_read_unlock(); -} - -void perf_counter_comm(struct task_struct *task) -{ - struct perf_comm_event comm_event; - - if (task->perf_counter_ctxp) - perf_counter_enable_on_exec(task); - - if (!atomic_read(&nr_comm_counters)) - return; - - comm_event = (struct perf_comm_event){ - .task = task, - /* .comm */ - /* .comm_size */ - .event = { - .header = { - .type = PERF_EVENT_COMM, - .misc = 0, - /* .size */ - }, - /* .pid */ - /* .tid */ - }, - }; - - perf_counter_comm_event(&comm_event); -} - -/* - * mmap tracking - */ - -struct perf_mmap_event { - struct vm_area_struct *vma; - - const char *file_name; - int file_size; - - struct { - struct perf_event_header header; - - u32 pid; - u32 tid; - u64 start; - u64 len; - u64 pgoff; - } event; -}; - -static void perf_counter_mmap_output(struct perf_counter *counter, - struct perf_mmap_event *mmap_event) -{ - struct perf_output_handle handle; - int size = mmap_event->event.header.size; - int ret = perf_output_begin(&handle, counter, size, 0, 0); - - if (ret) - return; - - mmap_event->event.pid = perf_counter_pid(counter, current); - mmap_event->event.tid = perf_counter_tid(counter, current); - - perf_output_put(&handle, mmap_event->event); - perf_output_copy(&handle, mmap_event->file_name, - mmap_event->file_size); - perf_output_end(&handle); -} - -static int perf_counter_mmap_match(struct perf_counter *counter, - struct perf_mmap_event *mmap_event) -{ - if (counter->attr.mmap) - return 1; - - return 0; -} - -static void perf_counter_mmap_ctx(struct perf_counter_context *ctx, - struct perf_mmap_event *mmap_event) -{ - struct perf_counter *counter; - - if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) - return; - - rcu_read_lock(); - list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { - if (perf_counter_mmap_match(counter, mmap_event)) - perf_counter_mmap_output(counter, mmap_event); - } - rcu_read_unlock(); -} - -static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event) -{ - struct perf_cpu_context *cpuctx; - struct perf_counter_context *ctx; - struct vm_area_struct *vma = mmap_event->vma; - struct file *file = vma->vm_file; - unsigned int size; - char tmp[16]; - char *buf = NULL; - const char *name; - - memset(tmp, 0, sizeof(tmp)); - - if (file) { - /* - * d_path works from the end of the buffer backwards, so we - * need to add enough zero bytes after the string to handle - * the 64bit alignment we do later. - */ - buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL); - if (!buf) { - name = strncpy(tmp, "//enomem", sizeof(tmp)); - goto got_name; - } - name = d_path(&file->f_path, buf, PATH_MAX); - if (IS_ERR(name)) { - name = strncpy(tmp, "//toolong", sizeof(tmp)); - goto got_name; - } - } else { - if (arch_vma_name(mmap_event->vma)) { - name = strncpy(tmp, arch_vma_name(mmap_event->vma), - sizeof(tmp)); - goto got_name; - } - - if (!vma->vm_mm) { - name = strncpy(tmp, "[vdso]", sizeof(tmp)); - goto got_name; - } - - name = strncpy(tmp, "//anon", sizeof(tmp)); - goto got_name; - } - -got_name: - size = ALIGN(strlen(name)+1, sizeof(u64)); - - mmap_event->file_name = name; - mmap_event->file_size = size; - - mmap_event->event.header.size = sizeof(mmap_event->event) + size; - - cpuctx = &get_cpu_var(perf_cpu_context); - perf_counter_mmap_ctx(&cpuctx->ctx, mmap_event); - put_cpu_var(perf_cpu_context); - - rcu_read_lock(); - /* - * doesn't really matter which of the child contexts the - * events ends up in. - */ - ctx = rcu_dereference(current->perf_counter_ctxp); - if (ctx) - perf_counter_mmap_ctx(ctx, mmap_event); - rcu_read_unlock(); - - kfree(buf); -} - -void __perf_counter_mmap(struct vm_area_struct *vma) -{ - struct perf_mmap_event mmap_event; - - if (!atomic_read(&nr_mmap_counters)) - return; - - mmap_event = (struct perf_mmap_event){ - .vma = vma, - /* .file_name */ - /* .file_size */ - .event = { - .header = { - .type = PERF_EVENT_MMAP, - .misc = 0, - /* .size */ - }, - /* .pid */ - /* .tid */ - .start = vma->vm_start, - .len = vma->vm_end - vma->vm_start, - .pgoff = vma->vm_pgoff, - }, - }; - - perf_counter_mmap_event(&mmap_event); -} - -/* - * IRQ throttle logging - */ - -static void perf_log_throttle(struct perf_counter *counter, int enable) -{ - struct perf_output_handle handle; - int ret; - - struct { - struct perf_event_header header; - u64 time; - u64 id; - u64 stream_id; - } throttle_event = { - .header = { - .type = PERF_EVENT_THROTTLE, - .misc = 0, - .size = sizeof(throttle_event), - }, - .time = sched_clock(), - .id = primary_counter_id(counter), - .stream_id = counter->id, - }; - - if (enable) - throttle_event.header.type = PERF_EVENT_UNTHROTTLE; - - ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 1, 0); - if (ret) - return; - - perf_output_put(&handle, throttle_event); - perf_output_end(&handle); -} - -/* - * Generic counter overflow handling, sampling. - */ - -int perf_counter_overflow(struct perf_counter *counter, int nmi, - struct perf_sample_data *data) -{ - int events = atomic_read(&counter->event_limit); - int throttle = counter->pmu->unthrottle != NULL; - struct hw_perf_counter *hwc = &counter->hw; - int ret = 0; - - if (!throttle) { - hwc->interrupts++; - } else { - if (hwc->interrupts != MAX_INTERRUPTS) { - hwc->interrupts++; - if (HZ * hwc->interrupts > - (u64)sysctl_perf_counter_sample_rate) { - hwc->interrupts = MAX_INTERRUPTS; - perf_log_throttle(counter, 0); - ret = 1; - } - } else { - /* - * Keep re-disabling counters even though on the previous - * pass we disabled it - just in case we raced with a - * sched-in and the counter got enabled again: - */ - ret = 1; - } - } - - if (counter->attr.freq) { - u64 now = sched_clock(); - s64 delta = now - hwc->freq_stamp; - - hwc->freq_stamp = now; - - if (delta > 0 && delta < TICK_NSEC) - perf_adjust_period(counter, NSEC_PER_SEC / (int)delta); - } - - /* - * XXX event_limit might not quite work as expected on inherited - * counters - */ - - counter->pending_kill = POLL_IN; - if (events && atomic_dec_and_test(&counter->event_limit)) { - ret = 1; - counter->pending_kill = POLL_HUP; - if (nmi) { - counter->pending_disable = 1; - perf_pending_queue(&counter->pending, - perf_pending_counter); - } else - perf_counter_disable(counter); - } - - perf_counter_output(counter, nmi, data); - return ret; -} - -/* - * Generic software counter infrastructure - */ - -/* - * We directly increment counter->count and keep a second value in - * counter->hw.period_left to count intervals. This period counter - * is kept in the range [-sample_period, 0] so that we can use the - * sign as trigger. - */ - -static u64 perf_swcounter_set_period(struct perf_counter *counter) -{ - struct hw_perf_counter *hwc = &counter->hw; - u64 period = hwc->last_period; - u64 nr, offset; - s64 old, val; - - hwc->last_period = hwc->sample_period; - -again: - old = val = atomic64_read(&hwc->period_left); - if (val < 0) - return 0; - - nr = div64_u64(period + val, period); - offset = nr * period; - val -= offset; - if (atomic64_cmpxchg(&hwc->period_left, old, val) != old) - goto again; - - return nr; -} - -static void perf_swcounter_overflow(struct perf_counter *counter, - int nmi, struct perf_sample_data *data) -{ - struct hw_perf_counter *hwc = &counter->hw; - u64 overflow; - - data->period = counter->hw.last_period; - overflow = perf_swcounter_set_period(counter); - - if (hwc->interrupts == MAX_INTERRUPTS) - return; - - for (; overflow; overflow--) { - if (perf_counter_overflow(counter, nmi, data)) { - /* - * We inhibit the overflow from happening when - * hwc->interrupts == MAX_INTERRUPTS. - */ - break; - } - } -} - -static void perf_swcounter_unthrottle(struct perf_counter *counter) -{ - /* - * Nothing to do, we already reset hwc->interrupts. - */ -} - -static void perf_swcounter_add(struct perf_counter *counter, u64 nr, - int nmi, struct perf_sample_data *data) -{ - struct hw_perf_counter *hwc = &counter->hw; - - atomic64_add(nr, &counter->count); - - if (!hwc->sample_period) - return; - - if (!data->regs) - return; - - if (!atomic64_add_negative(nr, &hwc->period_left)) - perf_swcounter_overflow(counter, nmi, data); -} - -static int perf_swcounter_is_counting(struct perf_counter *counter) -{ - /* - * The counter is active, we're good! - */ - if (counter->state == PERF_COUNTER_STATE_ACTIVE) - return 1; - - /* - * The counter is off/error, not counting. - */ - if (counter->state != PERF_COUNTER_STATE_INACTIVE) - return 0; - - /* - * The counter is inactive, if the context is active - * we're part of a group that didn't make it on the 'pmu', - * not counting. - */ - if (counter->ctx->is_active) - return 0; - - /* - * We're inactive and the context is too, this means the - * task is scheduled out, we're counting events that happen - * to us, like migration events. - */ - return 1; -} - -static int perf_swcounter_match(struct perf_counter *counter, - enum perf_type_id type, - u32 event, struct pt_regs *regs) -{ - if (!perf_swcounter_is_counting(counter)) - return 0; - - if (counter->attr.type != type) - return 0; - if (counter->attr.config != event) - return 0; - - if (regs) { - if (counter->attr.exclude_user && user_mode(regs)) - return 0; - - if (counter->attr.exclude_kernel && !user_mode(regs)) - return 0; - } - - return 1; -} - -static void perf_swcounter_ctx_event(struct perf_counter_context *ctx, - enum perf_type_id type, - u32 event, u64 nr, int nmi, - struct perf_sample_data *data) -{ - struct perf_counter *counter; - - if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) - return; - - rcu_read_lock(); - list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { - if (perf_swcounter_match(counter, type, event, data->regs)) - perf_swcounter_add(counter, nr, nmi, data); - } - rcu_read_unlock(); -} - -static int *perf_swcounter_recursion_context(struct perf_cpu_context *cpuctx) -{ - if (in_nmi()) - return &cpuctx->recursion[3]; - - if (in_irq()) - return &cpuctx->recursion[2]; - - if (in_softirq()) - return &cpuctx->recursion[1]; - - return &cpuctx->recursion[0]; -} - -static void do_perf_swcounter_event(enum perf_type_id type, u32 event, - u64 nr, int nmi, - struct perf_sample_data *data) -{ - struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context); - int *recursion = perf_swcounter_recursion_context(cpuctx); - struct perf_counter_context *ctx; - - if (*recursion) - goto out; - - (*recursion)++; - barrier(); - - perf_swcounter_ctx_event(&cpuctx->ctx, type, event, - nr, nmi, data); - rcu_read_lock(); - /* - * doesn't really matter which of the child contexts the - * events ends up in. - */ - ctx = rcu_dereference(current->perf_counter_ctxp); - if (ctx) - perf_swcounter_ctx_event(ctx, type, event, nr, nmi, data); - rcu_read_unlock(); - - barrier(); - (*recursion)--; - -out: - put_cpu_var(perf_cpu_context); -} - -void __perf_swcounter_event(u32 event, u64 nr, int nmi, - struct pt_regs *regs, u64 addr) -{ - struct perf_sample_data data = { - .regs = regs, - .addr = addr, - }; - - do_perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, &data); -} - -static void perf_swcounter_read(struct perf_counter *counter) -{ -} - -static int perf_swcounter_enable(struct perf_counter *counter) -{ - struct hw_perf_counter *hwc = &counter->hw; - - if (hwc->sample_period) { - hwc->last_period = hwc->sample_period; - perf_swcounter_set_period(counter); - } - return 0; -} - -static void perf_swcounter_disable(struct perf_counter *counter) -{ -} - -static const struct pmu perf_ops_generic = { - .enable = perf_swcounter_enable, - .disable = perf_swcounter_disable, - .read = perf_swcounter_read, - .unthrottle = perf_swcounter_unthrottle, -}; - -/* - * hrtimer based swcounter callback - */ - -static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) -{ - enum hrtimer_restart ret = HRTIMER_RESTART; - struct perf_sample_data data; - struct perf_counter *counter; - u64 period; - - counter = container_of(hrtimer, struct perf_counter, hw.hrtimer); - counter->pmu->read(counter); - - data.addr = 0; - data.regs = get_irq_regs(); - /* - * In case we exclude kernel IPs or are somehow not in interrupt - * context, provide the next best thing, the user IP. - */ - if ((counter->attr.exclude_kernel || !data.regs) && - !counter->attr.exclude_user) - data.regs = task_pt_regs(current); - - if (data.regs) { - if (perf_counter_overflow(counter, 0, &data)) - ret = HRTIMER_NORESTART; - } - - period = max_t(u64, 10000, counter->hw.sample_period); - hrtimer_forward_now(hrtimer, ns_to_ktime(period)); - - return ret; -} - -/* - * Software counter: cpu wall time clock - */ - -static void cpu_clock_perf_counter_update(struct perf_counter *counter) -{ - int cpu = raw_smp_processor_id(); - s64 prev; - u64 now; - - now = cpu_clock(cpu); - prev = atomic64_read(&counter->hw.prev_count); - atomic64_set(&counter->hw.prev_count, now); - atomic64_add(now - prev, &counter->count); -} - -static int cpu_clock_perf_counter_enable(struct perf_counter *counter) -{ - struct hw_perf_counter *hwc = &counter->hw; - int cpu = raw_smp_processor_id(); - - atomic64_set(&hwc->prev_count, cpu_clock(cpu)); - hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - hwc->hrtimer.function = perf_swcounter_hrtimer; - if (hwc->sample_period) { - u64 period = max_t(u64, 10000, hwc->sample_period); - __hrtimer_start_range_ns(&hwc->hrtimer, - ns_to_ktime(period), 0, - HRTIMER_MODE_REL, 0); - } - - return 0; -} - -static void cpu_clock_perf_counter_disable(struct perf_counter *counter) -{ - if (counter->hw.sample_period) - hrtimer_cancel(&counter->hw.hrtimer); - cpu_clock_perf_counter_update(counter); -} - -static void cpu_clock_perf_counter_read(struct perf_counter *counter) -{ - cpu_clock_perf_counter_update(counter); -} - -static const struct pmu perf_ops_cpu_clock = { - .enable = cpu_clock_perf_counter_enable, - .disable = cpu_clock_perf_counter_disable, - .read = cpu_clock_perf_counter_read, -}; - -/* - * Software counter: task time clock - */ - -static void task_clock_perf_counter_update(struct perf_counter *counter, u64 now) -{ - u64 prev; - s64 delta; - - prev = atomic64_xchg(&counter->hw.prev_count, now); - delta = now - prev; - atomic64_add(delta, &counter->count); -} - -static int task_clock_perf_counter_enable(struct perf_counter *counter) -{ - struct hw_perf_counter *hwc = &counter->hw; - u64 now; - - now = counter->ctx->time; - - atomic64_set(&hwc->prev_count, now); - hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - hwc->hrtimer.function = perf_swcounter_hrtimer; - if (hwc->sample_period) { - u64 period = max_t(u64, 10000, hwc->sample_period); - __hrtimer_start_range_ns(&hwc->hrtimer, - ns_to_ktime(period), 0, - HRTIMER_MODE_REL, 0); - } - - return 0; -} - -static void task_clock_perf_counter_disable(struct perf_counter *counter) -{ - if (counter->hw.sample_period) - hrtimer_cancel(&counter->hw.hrtimer); - task_clock_perf_counter_update(counter, counter->ctx->time); - -} - -static void task_clock_perf_counter_read(struct perf_counter *counter) -{ - u64 time; - - if (!in_nmi()) { - update_context_time(counter->ctx); - time = counter->ctx->time; - } else { - u64 now = perf_clock(); - u64 delta = now - counter->ctx->timestamp; - time = counter->ctx->time + delta; - } - - task_clock_perf_counter_update(counter, time); -} - -static const struct pmu perf_ops_task_clock = { - .enable = task_clock_perf_counter_enable, - .disable = task_clock_perf_counter_disable, - .read = task_clock_perf_counter_read, -}; - -#ifdef CONFIG_EVENT_PROFILE -void perf_tpcounter_event(int event_id, u64 addr, u64 count, void *record, - int entry_size) -{ - struct perf_raw_record raw = { - .size = entry_size, - .data = record, - }; - - struct perf_sample_data data = { - .regs = get_irq_regs(), - .addr = addr, - .raw = &raw, - }; - - if (!data.regs) - data.regs = task_pt_regs(current); - - do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, &data); -} -EXPORT_SYMBOL_GPL(perf_tpcounter_event); - -extern int ftrace_profile_enable(int); -extern void ftrace_profile_disable(int); - -static void tp_perf_counter_destroy(struct perf_counter *counter) -{ - ftrace_profile_disable(counter->attr.config); -} - -static const struct pmu *tp_perf_counter_init(struct perf_counter *counter) -{ - /* - * Raw tracepoint data is a severe data leak, only allow root to - * have these. - */ - if ((counter->attr.sample_type & PERF_SAMPLE_RAW) && - perf_paranoid_tracepoint_raw() && - !capable(CAP_SYS_ADMIN)) - return ERR_PTR(-EPERM); - - if (ftrace_profile_enable(counter->attr.config)) - return NULL; - - counter->destroy = tp_perf_counter_destroy; - - return &perf_ops_generic; -} -#else -static const struct pmu *tp_perf_counter_init(struct perf_counter *counter) -{ - return NULL; -} -#endif - -atomic_t perf_swcounter_enabled[PERF_COUNT_SW_MAX]; - -static void sw_perf_counter_destroy(struct perf_counter *counter) -{ - u64 event = counter->attr.config; - - WARN_ON(counter->parent); - - atomic_dec(&perf_swcounter_enabled[event]); -} - -static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) -{ - const struct pmu *pmu = NULL; - u64 event = counter->attr.config; - - /* - * Software counters (currently) can't in general distinguish - * between user, kernel and hypervisor events. - * However, context switches and cpu migrations are considered - * to be kernel events, and page faults are never hypervisor - * events. - */ - switch (event) { - case PERF_COUNT_SW_CPU_CLOCK: - pmu = &perf_ops_cpu_clock; - - break; - case PERF_COUNT_SW_TASK_CLOCK: - /* - * If the user instantiates this as a per-cpu counter, - * use the cpu_clock counter instead. - */ - if (counter->ctx->task) - pmu = &perf_ops_task_clock; - else - pmu = &perf_ops_cpu_clock; - - break; - case PERF_COUNT_SW_PAGE_FAULTS: - case PERF_COUNT_SW_PAGE_FAULTS_MIN: - case PERF_COUNT_SW_PAGE_FAULTS_MAJ: - case PERF_COUNT_SW_CONTEXT_SWITCHES: - case PERF_COUNT_SW_CPU_MIGRATIONS: - if (!counter->parent) { - atomic_inc(&perf_swcounter_enabled[event]); - counter->destroy = sw_perf_counter_destroy; - } - pmu = &perf_ops_generic; - break; - } - - return pmu; -} - -/* - * Allocate and initialize a counter structure - */ -static struct perf_counter * -perf_counter_alloc(struct perf_counter_attr *attr, - int cpu, - struct perf_counter_context *ctx, - struct perf_counter *group_leader, - struct perf_counter *parent_counter, - gfp_t gfpflags) -{ - const struct pmu *pmu; - struct perf_counter *counter; - struct hw_perf_counter *hwc; - long err; - - counter = kzalloc(sizeof(*counter), gfpflags); - if (!counter) - return ERR_PTR(-ENOMEM); - - /* - * Single counters are their own group leaders, with an - * empty sibling list: - */ - if (!group_leader) - group_leader = counter; - - mutex_init(&counter->child_mutex); - INIT_LIST_HEAD(&counter->child_list); - - INIT_LIST_HEAD(&counter->list_entry); - INIT_LIST_HEAD(&counter->event_entry); - INIT_LIST_HEAD(&counter->sibling_list); - init_waitqueue_head(&counter->waitq); - - mutex_init(&counter->mmap_mutex); - - counter->cpu = cpu; - counter->attr = *attr; - counter->group_leader = group_leader; - counter->pmu = NULL; - counter->ctx = ctx; - counter->oncpu = -1; - - counter->parent = parent_counter; - - counter->ns = get_pid_ns(current->nsproxy->pid_ns); - counter->id = atomic64_inc_return(&perf_counter_id); - - counter->state = PERF_COUNTER_STATE_INACTIVE; - - if (attr->disabled) - counter->state = PERF_COUNTER_STATE_OFF; - - pmu = NULL; - - hwc = &counter->hw; - hwc->sample_period = attr->sample_period; - if (attr->freq && attr->sample_freq) - hwc->sample_period = 1; - hwc->last_period = hwc->sample_period; - - atomic64_set(&hwc->period_left, hwc->sample_period); - - /* - * we currently do not support PERF_FORMAT_GROUP on inherited counters - */ - if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) - goto done; - - switch (attr->type) { - case PERF_TYPE_RAW: - case PERF_TYPE_HARDWARE: - case PERF_TYPE_HW_CACHE: - pmu = hw_perf_counter_init(counter); - break; - - case PERF_TYPE_SOFTWARE: - pmu = sw_perf_counter_init(counter); - break; - - case PERF_TYPE_TRACEPOINT: - pmu = tp_perf_counter_init(counter); - break; - - default: - break; - } -done: - err = 0; - if (!pmu) - err = -EINVAL; - else if (IS_ERR(pmu)) - err = PTR_ERR(pmu); - - if (err) { - if (counter->ns) - put_pid_ns(counter->ns); - kfree(counter); - return ERR_PTR(err); - } - - counter->pmu = pmu; - - if (!counter->parent) { - atomic_inc(&nr_counters); - if (counter->attr.mmap) - atomic_inc(&nr_mmap_counters); - if (counter->attr.comm) - atomic_inc(&nr_comm_counters); - if (counter->attr.task) - atomic_inc(&nr_task_counters); - } - - return counter; -} - -static int perf_copy_attr(struct perf_counter_attr __user *uattr, - struct perf_counter_attr *attr) -{ - int ret; - u32 size; - - if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0)) - return -EFAULT; - - /* - * zero the full structure, so that a short copy will be nice. - */ - memset(attr, 0, sizeof(*attr)); - - ret = get_user(size, &uattr->size); - if (ret) - return ret; - - if (size > PAGE_SIZE) /* silly large */ - goto err_size; - - if (!size) /* abi compat */ - size = PERF_ATTR_SIZE_VER0; - - if (size < PERF_ATTR_SIZE_VER0) - goto err_size; - - /* - * If we're handed a bigger struct than we know of, - * ensure all the unknown bits are 0. - */ - if (size > sizeof(*attr)) { - unsigned long val; - unsigned long __user *addr; - unsigned long __user *end; - - addr = PTR_ALIGN((void __user *)uattr + sizeof(*attr), - sizeof(unsigned long)); - end = PTR_ALIGN((void __user *)uattr + size, - sizeof(unsigned long)); - - for (; addr < end; addr += sizeof(unsigned long)) { - ret = get_user(val, addr); - if (ret) - return ret; - if (val) - goto err_size; - } - } - - ret = copy_from_user(attr, uattr, size); - if (ret) - return -EFAULT; - - /* - * If the type exists, the corresponding creation will verify - * the attr->config. - */ - if (attr->type >= PERF_TYPE_MAX) - return -EINVAL; - - if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3) - return -EINVAL; - - if (attr->sample_type & ~(PERF_SAMPLE_MAX-1)) - return -EINVAL; - - if (attr->read_format & ~(PERF_FORMAT_MAX-1)) - return -EINVAL; - -out: - return ret; - -err_size: - put_user(sizeof(*attr), &uattr->size); - ret = -E2BIG; - goto out; -} - -int perf_counter_set_output(struct perf_counter *counter, int output_fd) -{ - struct perf_counter *output_counter = NULL; - struct file *output_file = NULL; - struct perf_counter *old_output; - int fput_needed = 0; - int ret = -EINVAL; - - if (!output_fd) - goto set; - - output_file = fget_light(output_fd, &fput_needed); - if (!output_file) - return -EBADF; - - if (output_file->f_op != &perf_fops) - goto out; - - output_counter = output_file->private_data; - - /* Don't chain output fds */ - if (output_counter->output) - goto out; - - /* Don't set an output fd when we already have an output channel */ - if (counter->data) - goto out; - - atomic_long_inc(&output_file->f_count); - -set: - mutex_lock(&counter->mmap_mutex); - old_output = counter->output; - rcu_assign_pointer(counter->output, output_counter); - mutex_unlock(&counter->mmap_mutex); - - if (old_output) { - /* - * we need to make sure no existing perf_output_*() - * is still referencing this counter. - */ - synchronize_rcu(); - fput(old_output->filp); - } - - ret = 0; -out: - fput_light(output_file, fput_needed); - return ret; -} - -/** - * sys_perf_counter_open - open a performance counter, associate it to a task/cpu - * - * @attr_uptr: event type attributes for monitoring/sampling - * @pid: target pid - * @cpu: target cpu - * @group_fd: group leader counter fd - */ -SYSCALL_DEFINE5(perf_counter_open, - struct perf_counter_attr __user *, attr_uptr, - pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) -{ - struct perf_counter *counter, *group_leader; - struct perf_counter_attr attr; - struct perf_counter_context *ctx; - struct file *counter_file = NULL; - struct file *group_file = NULL; - int fput_needed = 0; - int fput_needed2 = 0; - int err; - - /* for future expandability... */ - if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT)) - return -EINVAL; - - err = perf_copy_attr(attr_uptr, &attr); - if (err) - return err; - - if (!attr.exclude_kernel) { - if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) - return -EACCES; - } - - if (attr.freq) { - if (attr.sample_freq > sysctl_perf_counter_sample_rate) - return -EINVAL; - } - - /* - * Get the target context (task or percpu): - */ - ctx = find_get_context(pid, cpu); - if (IS_ERR(ctx)) - return PTR_ERR(ctx); - - /* - * Look up the group leader (we will attach this counter to it): - */ - group_leader = NULL; - if (group_fd != -1 && !(flags & PERF_FLAG_FD_NO_GROUP)) { - err = -EINVAL; - group_file = fget_light(group_fd, &fput_needed); - if (!group_file) - goto err_put_context; - if (group_file->f_op != &perf_fops) - goto err_put_context; - - group_leader = group_file->private_data; - /* - * Do not allow a recursive hierarchy (this new sibling - * becoming part of another group-sibling): - */ - if (group_leader->group_leader != group_leader) - goto err_put_context; - /* - * Do not allow to attach to a group in a different - * task or CPU context: - */ - if (group_leader->ctx != ctx) - goto err_put_context; - /* - * Only a group leader can be exclusive or pinned - */ - if (attr.exclusive || attr.pinned) - goto err_put_context; - } - - counter = perf_counter_alloc(&attr, cpu, ctx, group_leader, - NULL, GFP_KERNEL); - err = PTR_ERR(counter); - if (IS_ERR(counter)) - goto err_put_context; - - err = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0); - if (err < 0) - goto err_free_put_context; - - counter_file = fget_light(err, &fput_needed2); - if (!counter_file) - goto err_free_put_context; - - if (flags & PERF_FLAG_FD_OUTPUT) { - err = perf_counter_set_output(counter, group_fd); - if (err) - goto err_fput_free_put_context; - } - - counter->filp = counter_file; - WARN_ON_ONCE(ctx->parent_ctx); - mutex_lock(&ctx->mutex); - perf_install_in_context(ctx, counter, cpu); - ++ctx->generation; - mutex_unlock(&ctx->mutex); - - counter->owner = current; - get_task_struct(current); - mutex_lock(¤t->perf_counter_mutex); - list_add_tail(&counter->owner_entry, ¤t->perf_counter_list); - mutex_unlock(¤t->perf_counter_mutex); - -err_fput_free_put_context: - fput_light(counter_file, fput_needed2); - -err_free_put_context: - if (err < 0) - kfree(counter); - -err_put_context: - if (err < 0) - put_ctx(ctx); - - fput_light(group_file, fput_needed); - - return err; -} - -/* - * inherit a counter from parent task to child task: - */ -static struct perf_counter * -inherit_counter(struct perf_counter *parent_counter, - struct task_struct *parent, - struct perf_counter_context *parent_ctx, - struct task_struct *child, - struct perf_counter *group_leader, - struct perf_counter_context *child_ctx) -{ - struct perf_counter *child_counter; - - /* - * Instead of creating recursive hierarchies of counters, - * we link inherited counters back to the original parent, - * which has a filp for sure, which we use as the reference - * count: - */ - if (parent_counter->parent) - parent_counter = parent_counter->parent; - - child_counter = perf_counter_alloc(&parent_counter->attr, - parent_counter->cpu, child_ctx, - group_leader, parent_counter, - GFP_KERNEL); - if (IS_ERR(child_counter)) - return child_counter; - get_ctx(child_ctx); - - /* - * Make the child state follow the state of the parent counter, - * not its attr.disabled bit. We hold the parent's mutex, - * so we won't race with perf_counter_{en, dis}able_family. - */ - if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE) - child_counter->state = PERF_COUNTER_STATE_INACTIVE; - else - child_counter->state = PERF_COUNTER_STATE_OFF; - - if (parent_counter->attr.freq) - child_counter->hw.sample_period = parent_counter->hw.sample_period; - - /* - * Link it up in the child's context: - */ - add_counter_to_ctx(child_counter, child_ctx); - - /* - * Get a reference to the parent filp - we will fput it - * when the child counter exits. This is safe to do because - * we are in the parent and we know that the filp still - * exists and has a nonzero count: - */ - atomic_long_inc(&parent_counter->filp->f_count); - - /* - * Link this into the parent counter's child list - */ - WARN_ON_ONCE(parent_counter->ctx->parent_ctx); - mutex_lock(&parent_counter->child_mutex); - list_add_tail(&child_counter->child_list, &parent_counter->child_list); - mutex_unlock(&parent_counter->child_mutex); - - return child_counter; -} - -static int inherit_group(struct perf_counter *parent_counter, - struct task_struct *parent, - struct perf_counter_context *parent_ctx, - struct task_struct *child, - struct perf_counter_context *child_ctx) -{ - struct perf_counter *leader; - struct perf_counter *sub; - struct perf_counter *child_ctr; - - leader = inherit_counter(parent_counter, parent, parent_ctx, - child, NULL, child_ctx); - if (IS_ERR(leader)) - return PTR_ERR(leader); - list_for_each_entry(sub, &parent_counter->sibling_list, list_entry) { - child_ctr = inherit_counter(sub, parent, parent_ctx, - child, leader, child_ctx); - if (IS_ERR(child_ctr)) - return PTR_ERR(child_ctr); - } - return 0; -} - -static void sync_child_counter(struct perf_counter *child_counter, - struct task_struct *child) -{ - struct perf_counter *parent_counter = child_counter->parent; - u64 child_val; - - if (child_counter->attr.inherit_stat) - perf_counter_read_event(child_counter, child); - - child_val = atomic64_read(&child_counter->count); - - /* - * Add back the child's count to the parent's count: - */ - atomic64_add(child_val, &parent_counter->count); - atomic64_add(child_counter->total_time_enabled, - &parent_counter->child_total_time_enabled); - atomic64_add(child_counter->total_time_running, - &parent_counter->child_total_time_running); - - /* - * Remove this counter from the parent's list - */ - WARN_ON_ONCE(parent_counter->ctx->parent_ctx); - mutex_lock(&parent_counter->child_mutex); - list_del_init(&child_counter->child_list); - mutex_unlock(&parent_counter->child_mutex); - - /* - * Release the parent counter, if this was the last - * reference to it. - */ - fput(parent_counter->filp); -} - -static void -__perf_counter_exit_task(struct perf_counter *child_counter, - struct perf_counter_context *child_ctx, - struct task_struct *child) -{ - struct perf_counter *parent_counter; - - update_counter_times(child_counter); - perf_counter_remove_from_context(child_counter); - - parent_counter = child_counter->parent; - /* - * It can happen that parent exits first, and has counters - * that are still around due to the child reference. These - * counters need to be zapped - but otherwise linger. - */ - if (parent_counter) { - sync_child_counter(child_counter, child); - free_counter(child_counter); - } -} - -/* - * When a child task exits, feed back counter values to parent counters. - */ -void perf_counter_exit_task(struct task_struct *child) -{ - struct perf_counter *child_counter, *tmp; - struct perf_counter_context *child_ctx; - unsigned long flags; - - if (likely(!child->perf_counter_ctxp)) { - perf_counter_task(child, NULL, 0); - return; - } - - local_irq_save(flags); - /* - * We can't reschedule here because interrupts are disabled, - * and either child is current or it is a task that can't be - * scheduled, so we are now safe from rescheduling changing - * our context. - */ - child_ctx = child->perf_counter_ctxp; - __perf_counter_task_sched_out(child_ctx); - - /* - * Take the context lock here so that if find_get_context is - * reading child->perf_counter_ctxp, we wait until it has - * incremented the context's refcount before we do put_ctx below. - */ - spin_lock(&child_ctx->lock); - child->perf_counter_ctxp = NULL; - /* - * If this context is a clone; unclone it so it can't get - * swapped to another process while we're removing all - * the counters from it. - */ - unclone_ctx(child_ctx); - spin_unlock_irqrestore(&child_ctx->lock, flags); - - /* - * Report the task dead after unscheduling the counters so that we - * won't get any samples after PERF_EVENT_EXIT. We can however still - * get a few PERF_EVENT_READ events. - */ - perf_counter_task(child, child_ctx, 0); - - /* - * We can recurse on the same lock type through: - * - * __perf_counter_exit_task() - * sync_child_counter() - * fput(parent_counter->filp) - * perf_release() - * mutex_lock(&ctx->mutex) - * - * But since its the parent context it won't be the same instance. - */ - mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING); - -again: - list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list, - list_entry) - __perf_counter_exit_task(child_counter, child_ctx, child); - - /* - * If the last counter was a group counter, it will have appended all - * its siblings to the list, but we obtained 'tmp' before that which - * will still point to the list head terminating the iteration. - */ - if (!list_empty(&child_ctx->counter_list)) - goto again; - - mutex_unlock(&child_ctx->mutex); - - put_ctx(child_ctx); -} - -/* - * free an unexposed, unused context as created by inheritance by - * init_task below, used by fork() in case of fail. - */ -void perf_counter_free_task(struct task_struct *task) -{ - struct perf_counter_context *ctx = task->perf_counter_ctxp; - struct perf_counter *counter, *tmp; - - if (!ctx) - return; - - mutex_lock(&ctx->mutex); -again: - list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry) { - struct perf_counter *parent = counter->parent; - - if (WARN_ON_ONCE(!parent)) - continue; - - mutex_lock(&parent->child_mutex); - list_del_init(&counter->child_list); - mutex_unlock(&parent->child_mutex); - - fput(parent->filp); - - list_del_counter(counter, ctx); - free_counter(counter); - } - - if (!list_empty(&ctx->counter_list)) - goto again; - - mutex_unlock(&ctx->mutex); - - put_ctx(ctx); -} - -/* - * Initialize the perf_counter context in task_struct - */ -int perf_counter_init_task(struct task_struct *child) -{ - struct perf_counter_context *child_ctx, *parent_ctx; - struct perf_counter_context *cloned_ctx; - struct perf_counter *counter; - struct task_struct *parent = current; - int inherited_all = 1; - int ret = 0; - - child->perf_counter_ctxp = NULL; - - mutex_init(&child->perf_counter_mutex); - INIT_LIST_HEAD(&child->perf_counter_list); - - if (likely(!parent->perf_counter_ctxp)) - return 0; - - /* - * This is executed from the parent task context, so inherit - * counters that have been marked for cloning. - * First allocate and initialize a context for the child. - */ - - child_ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL); - if (!child_ctx) - return -ENOMEM; - - __perf_counter_init_context(child_ctx, child); - child->perf_counter_ctxp = child_ctx; - get_task_struct(child); - - /* - * If the parent's context is a clone, pin it so it won't get - * swapped under us. - */ - parent_ctx = perf_pin_task_context(parent); - - /* - * No need to check if parent_ctx != NULL here; since we saw - * it non-NULL earlier, the only reason for it to become NULL - * is if we exit, and since we're currently in the middle of - * a fork we can't be exiting at the same time. - */ - - /* - * Lock the parent list. No need to lock the child - not PID - * hashed yet and not running, so nobody can access it. - */ - mutex_lock(&parent_ctx->mutex); - - /* - * We dont have to disable NMIs - we are only looking at - * the list, not manipulating it: - */ - list_for_each_entry_rcu(counter, &parent_ctx->event_list, event_entry) { - if (counter != counter->group_leader) - continue; - - if (!counter->attr.inherit) { - inherited_all = 0; - continue; - } - - ret = inherit_group(counter, parent, parent_ctx, - child, child_ctx); - if (ret) { - inherited_all = 0; - break; - } - } - - if (inherited_all) { - /* - * Mark the child context as a clone of the parent - * context, or of whatever the parent is a clone of. - * Note that if the parent is a clone, it could get - * uncloned at any point, but that doesn't matter - * because the list of counters and the generation - * count can't have changed since we took the mutex. - */ - cloned_ctx = rcu_dereference(parent_ctx->parent_ctx); - if (cloned_ctx) { - child_ctx->parent_ctx = cloned_ctx; - child_ctx->parent_gen = parent_ctx->parent_gen; - } else { - child_ctx->parent_ctx = parent_ctx; - child_ctx->parent_gen = parent_ctx->generation; - } - get_ctx(child_ctx->parent_ctx); - } - - mutex_unlock(&parent_ctx->mutex); - - perf_unpin_context(parent_ctx); - - return ret; -} - -static void __cpuinit perf_counter_init_cpu(int cpu) -{ - struct perf_cpu_context *cpuctx; - - cpuctx = &per_cpu(perf_cpu_context, cpu); - __perf_counter_init_context(&cpuctx->ctx, NULL); - - spin_lock(&perf_resource_lock); - cpuctx->max_pertask = perf_max_counters - perf_reserved_percpu; - spin_unlock(&perf_resource_lock); - - hw_perf_counter_setup(cpu); -} - -#ifdef CONFIG_HOTPLUG_CPU -static void __perf_counter_exit_cpu(void *info) -{ - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); - struct perf_counter_context *ctx = &cpuctx->ctx; - struct perf_counter *counter, *tmp; - - list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry) - __perf_counter_remove_from_context(counter); -} -static void perf_counter_exit_cpu(int cpu) -{ - struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); - struct perf_counter_context *ctx = &cpuctx->ctx; - - mutex_lock(&ctx->mutex); - smp_call_function_single(cpu, __perf_counter_exit_cpu, NULL, 1); - mutex_unlock(&ctx->mutex); -} -#else -static inline void perf_counter_exit_cpu(int cpu) { } -#endif - -static int __cpuinit -perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) -{ - unsigned int cpu = (long)hcpu; - - switch (action) { - - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - perf_counter_init_cpu(cpu); - break; - - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - hw_perf_counter_setup_online(cpu); - break; - - case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: - perf_counter_exit_cpu(cpu); - break; - - default: - break; - } - - return NOTIFY_OK; -} - -/* - * This has to have a higher priority than migration_notifier in sched.c. - */ -static struct notifier_block __cpuinitdata perf_cpu_nb = { - .notifier_call = perf_cpu_notify, - .priority = 20, -}; - -void __init perf_counter_init(void) -{ - perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE, - (void *)(long)smp_processor_id()); - perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE, - (void *)(long)smp_processor_id()); - register_cpu_notifier(&perf_cpu_nb); -} - -static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf) -{ - return sprintf(buf, "%d\n", perf_reserved_percpu); -} - -static ssize_t -perf_set_reserve_percpu(struct sysdev_class *class, - const char *buf, - size_t count) -{ - struct perf_cpu_context *cpuctx; - unsigned long val; - int err, cpu, mpt; - - err = strict_strtoul(buf, 10, &val); - if (err) - return err; - if (val > perf_max_counters) - return -EINVAL; - - spin_lock(&perf_resource_lock); - perf_reserved_percpu = val; - for_each_online_cpu(cpu) { - cpuctx = &per_cpu(perf_cpu_context, cpu); - spin_lock_irq(&cpuctx->ctx.lock); - mpt = min(perf_max_counters - cpuctx->ctx.nr_counters, - perf_max_counters - perf_reserved_percpu); - cpuctx->max_pertask = mpt; - spin_unlock_irq(&cpuctx->ctx.lock); - } - spin_unlock(&perf_resource_lock); - - return count; -} - -static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf) -{ - return sprintf(buf, "%d\n", perf_overcommit); -} - -static ssize_t -perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count) -{ - unsigned long val; - int err; - - err = strict_strtoul(buf, 10, &val); - if (err) - return err; - if (val > 1) - return -EINVAL; - - spin_lock(&perf_resource_lock); - perf_overcommit = val; - spin_unlock(&perf_resource_lock); - - return count; -} - -static SYSDEV_CLASS_ATTR( - reserve_percpu, - 0644, - perf_show_reserve_percpu, - perf_set_reserve_percpu - ); - -static SYSDEV_CLASS_ATTR( - overcommit, - 0644, - perf_show_overcommit, - perf_set_overcommit - ); - -static struct attribute *perfclass_attrs[] = { - &attr_reserve_percpu.attr, - &attr_overcommit.attr, - NULL -}; - -static struct attribute_group perfclass_attr_group = { - .attrs = perfclass_attrs, - .name = "perf_counters", -}; - -static int __init perf_counter_sysfs_init(void) -{ - return sysfs_create_group(&cpu_sysdev_class.kset.kobj, - &perfclass_attr_group); -} -device_initcall(perf_counter_sysfs_init); diff --git a/kernel/perf_event.c b/kernel/perf_event.c new file mode 100644 index 0000000..12b5ec3 --- /dev/null +++ b/kernel/perf_event.c @@ -0,0 +1,5174 @@ +/* + * Performance events core code: + * + * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> + * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> + * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> + * + * For licensing details see kernel-base/COPYING + */ + +#include <linux/fs.h> +#include <linux/mm.h> +#include <linux/cpu.h> +#include <linux/smp.h> +#include <linux/file.h> +#include <linux/poll.h> +#include <linux/sysfs.h> +#include <linux/dcache.h> +#include <linux/percpu.h> +#include <linux/ptrace.h> +#include <linux/vmstat.h> +#include <linux/vmalloc.h> +#include <linux/hardirq.h> +#include <linux/rculist.h> +#include <linux/uaccess.h> +#include <linux/syscalls.h> +#include <linux/anon_inodes.h> +#include <linux/kernel_stat.h> +#include <linux/perf_event.h> +#include <linux/ftrace_event.h> + +#include <asm/irq_regs.h> + +/* + * Each CPU has a list of per CPU events: + */ +DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context); + +int perf_max_events __read_mostly = 1; +static int perf_reserved_percpu __read_mostly; +static int perf_overcommit __read_mostly = 1; + +static atomic_t nr_events __read_mostly; +static atomic_t nr_mmap_events __read_mostly; +static atomic_t nr_comm_events __read_mostly; +static atomic_t nr_task_events __read_mostly; + +/* + * perf event paranoia level: + * -1 - not paranoid at all + * 0 - disallow raw tracepoint access for unpriv + * 1 - disallow cpu events for unpriv + * 2 - disallow kernel profiling for unpriv + */ +int sysctl_perf_event_paranoid __read_mostly = 1; + +static inline bool perf_paranoid_tracepoint_raw(void) +{ + return sysctl_perf_event_paranoid > -1; +} + +static inline bool perf_paranoid_cpu(void) +{ + return sysctl_perf_event_paranoid > 0; +} + +static inline bool perf_paranoid_kernel(void) +{ + return sysctl_perf_event_paranoid > 1; +} + +int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */ + +/* + * max perf event sample rate + */ +int sysctl_perf_event_sample_rate __read_mostly = 100000; + +static atomic64_t perf_event_id; + +/* + * Lock for (sysadmin-configurable) event reservations: + */ +static DEFINE_SPINLOCK(perf_resource_lock); + +/* + * Architecture provided APIs - weak aliases: + */ +extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event) +{ + return NULL; +} + +void __weak hw_perf_disable(void) { barrier(); } +void __weak hw_perf_enable(void) { barrier(); } + +void __weak hw_perf_event_setup(int cpu) { barrier(); } +void __weak hw_perf_event_setup_online(int cpu) { barrier(); } + +int __weak +hw_perf_group_sched_in(struct perf_event *group_leader, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx, int cpu) +{ + return 0; +} + +void __weak perf_event_print_debug(void) { } + +static DEFINE_PER_CPU(int, perf_disable_count); + +void __perf_disable(void) +{ + __get_cpu_var(perf_disable_count)++; +} + +bool __perf_enable(void) +{ + return !--__get_cpu_var(perf_disable_count); +} + +void perf_disable(void) +{ + __perf_disable(); + hw_perf_disable(); +} + +void perf_enable(void) +{ + if (__perf_enable()) + hw_perf_enable(); +} + +static void get_ctx(struct perf_event_context *ctx) +{ + WARN_ON(!atomic_inc_not_zero(&ctx->refcount)); +} + +static void free_ctx(struct rcu_head *head) +{ + struct perf_event_context *ctx; + + ctx = container_of(head, struct perf_event_context, rcu_head); + kfree(ctx); +} + +static void put_ctx(struct perf_event_context *ctx) +{ + if (atomic_dec_and_test(&ctx->refcount)) { + if (ctx->parent_ctx) + put_ctx(ctx->parent_ctx); + if (ctx->task) + put_task_struct(ctx->task); + call_rcu(&ctx->rcu_head, free_ctx); + } +} + +static void unclone_ctx(struct perf_event_context *ctx) +{ + if (ctx->parent_ctx) { + put_ctx(ctx->parent_ctx); + ctx->parent_ctx = NULL; + } +} + +/* + * If we inherit events we want to return the parent event id + * to userspace. + */ +static u64 primary_event_id(struct perf_event *event) +{ + u64 id = event->id; + + if (event->parent) + id = event->parent->id; + + return id; +} + +/* + * Get the perf_event_context for a task and lock it. + * This has to cope with with the fact that until it is locked, + * the context could get moved to another task. + */ +static struct perf_event_context * +perf_lock_task_context(struct task_struct *task, unsigned long *flags) +{ + struct perf_event_context *ctx; + + rcu_read_lock(); + retry: + ctx = rcu_dereference(task->perf_event_ctxp); + if (ctx) { + /* + * If this context is a clone of another, it might + * get swapped for another underneath us by + * perf_event_task_sched_out, though the + * rcu_read_lock() protects us from any context + * getting freed. Lock the context and check if it + * got swapped before we could get the lock, and retry + * if so. If we locked the right context, then it + * can't get swapped on us any more. + */ + spin_lock_irqsave(&ctx->lock, *flags); + if (ctx != rcu_dereference(task->perf_event_ctxp)) { + spin_unlock_irqrestore(&ctx->lock, *flags); + goto retry; + } + + if (!atomic_inc_not_zero(&ctx->refcount)) { + spin_unlock_irqrestore(&ctx->lock, *flags); + ctx = NULL; + } + } + rcu_read_unlock(); + return ctx; +} + +/* + * Get the context for a task and increment its pin_count so it + * can't get swapped to another task. This also increments its + * reference count so that the context can't get freed. + */ +static struct perf_event_context *perf_pin_task_context(struct task_struct *task) +{ + struct perf_event_context *ctx; + unsigned long flags; + + ctx = perf_lock_task_context(task, &flags); + if (ctx) { + ++ctx->pin_count; + spin_unlock_irqrestore(&ctx->lock, flags); + } + return ctx; +} + +static void perf_unpin_context(struct perf_event_context *ctx) +{ + unsigned long flags; + + spin_lock_irqsave(&ctx->lock, flags); + --ctx->pin_count; + spin_unlock_irqrestore(&ctx->lock, flags); + put_ctx(ctx); +} + +/* + * Add a event from the lists for its context. + * Must be called with ctx->mutex and ctx->lock held. + */ +static void +list_add_event(struct perf_event *event, struct perf_event_context *ctx) +{ + struct perf_event *group_leader = event->group_leader; + + /* + * Depending on whether it is a standalone or sibling event, + * add it straight to the context's event list, or to the group + * leader's sibling list: + */ + if (group_leader == event) + list_add_tail(&event->group_entry, &ctx->group_list); + else { + list_add_tail(&event->group_entry, &group_leader->sibling_list); + group_leader->nr_siblings++; + } + + list_add_rcu(&event->event_entry, &ctx->event_list); + ctx->nr_events++; + if (event->attr.inherit_stat) + ctx->nr_stat++; +} + +/* + * Remove a event from the lists for its context. + * Must be called with ctx->mutex and ctx->lock held. + */ +static void +list_del_event(struct perf_event *event, struct perf_event_context *ctx) +{ + struct perf_event *sibling, *tmp; + + if (list_empty(&event->group_entry)) + return; + ctx->nr_events--; + if (event->attr.inherit_stat) + ctx->nr_stat--; + + list_del_init(&event->group_entry); + list_del_rcu(&event->event_entry); + + if (event->group_leader != event) + event->group_leader->nr_siblings--; + + /* + * If this was a group event with sibling events then + * upgrade the siblings to singleton events by adding them + * to the context list directly: + */ + list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) { + + list_move_tail(&sibling->group_entry, &ctx->group_list); + sibling->group_leader = sibling; + } +} + +static void +event_sched_out(struct perf_event *event, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx) +{ + if (event->state != PERF_EVENT_STATE_ACTIVE) + return; + + event->state = PERF_EVENT_STATE_INACTIVE; + if (event->pending_disable) { + event->pending_disable = 0; + event->state = PERF_EVENT_STATE_OFF; + } + event->tstamp_stopped = ctx->time; + event->pmu->disable(event); + event->oncpu = -1; + + if (!is_software_event(event)) + cpuctx->active_oncpu--; + ctx->nr_active--; + if (event->attr.exclusive || !cpuctx->active_oncpu) + cpuctx->exclusive = 0; +} + +static void +group_sched_out(struct perf_event *group_event, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx) +{ + struct perf_event *event; + + if (group_event->state != PERF_EVENT_STATE_ACTIVE) + return; + + event_sched_out(group_event, cpuctx, ctx); + + /* + * Schedule out siblings (if any): + */ + list_for_each_entry(event, &group_event->sibling_list, group_entry) + event_sched_out(event, cpuctx, ctx); + + if (group_event->attr.exclusive) + cpuctx->exclusive = 0; +} + +/* + * Cross CPU call to remove a performance event + * + * We disable the event on the hardware level first. After that we + * remove it from the context list. + */ +static void __perf_event_remove_from_context(void *info) +{ + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + struct perf_event *event = info; + struct perf_event_context *ctx = event->ctx; + + /* + * If this is a task context, we need to check whether it is + * the current task context of this cpu. If not it has been + * scheduled out before the smp call arrived. + */ + if (ctx->task && cpuctx->task_ctx != ctx) + return; + + spin_lock(&ctx->lock); + /* + * Protect the list operation against NMI by disabling the + * events on a global level. + */ + perf_disable(); + + event_sched_out(event, cpuctx, ctx); + + list_del_event(event, ctx); + + if (!ctx->task) { + /* + * Allow more per task events with respect to the + * reservation: + */ + cpuctx->max_pertask = + min(perf_max_events - ctx->nr_events, + perf_max_events - perf_reserved_percpu); + } + + perf_enable(); + spin_unlock(&ctx->lock); +} + + +/* + * Remove the event from a task's (or a CPU's) list of events. + * + * Must be called with ctx->mutex held. + * + * CPU events are removed with a smp call. For task events we only + * call when the task is on a CPU. + * + * If event->ctx is a cloned context, callers must make sure that + * every task struct that event->ctx->task could possibly point to + * remains valid. This is OK when called from perf_release since + * that only calls us on the top-level context, which can't be a clone. + * When called from perf_event_exit_task, it's OK because the + * context has been detached from its task. + */ +static void perf_event_remove_from_context(struct perf_event *event) +{ + struct perf_event_context *ctx = event->ctx; + struct task_struct *task = ctx->task; + + if (!task) { + /* + * Per cpu events are removed via an smp call and + * the removal is always sucessful. + */ + smp_call_function_single(event->cpu, + __perf_event_remove_from_context, + event, 1); + return; + } + +retry: + task_oncpu_function_call(task, __perf_event_remove_from_context, + event); + + spin_lock_irq(&ctx->lock); + /* + * If the context is active we need to retry the smp call. + */ + if (ctx->nr_active && !list_empty(&event->group_entry)) { + spin_unlock_irq(&ctx->lock); + goto retry; + } + + /* + * The lock prevents that this context is scheduled in so we + * can remove the event safely, if the call above did not + * succeed. + */ + if (!list_empty(&event->group_entry)) { + list_del_event(event, ctx); + } + spin_unlock_irq(&ctx->lock); +} + +static inline u64 perf_clock(void) +{ + return cpu_clock(smp_processor_id()); +} + +/* + * Update the record of the current time in a context. + */ +static void update_context_time(struct perf_event_context *ctx) +{ + u64 now = perf_clock(); + + ctx->time += now - ctx->timestamp; + ctx->timestamp = now; +} + +/* + * Update the total_time_enabled and total_time_running fields for a event. + */ +static void update_event_times(struct perf_event *event) +{ + struct perf_event_context *ctx = event->ctx; + u64 run_end; + + if (event->state < PERF_EVENT_STATE_INACTIVE || + event->group_leader->state < PERF_EVENT_STATE_INACTIVE) + return; + + event->total_time_enabled = ctx->time - event->tstamp_enabled; + + if (event->state == PERF_EVENT_STATE_INACTIVE) + run_end = event->tstamp_stopped; + else + run_end = ctx->time; + + event->total_time_running = run_end - event->tstamp_running; +} + +/* + * Update total_time_enabled and total_time_running for all events in a group. + */ +static void update_group_times(struct perf_event *leader) +{ + struct perf_event *event; + + update_event_times(leader); + list_for_each_entry(event, &leader->sibling_list, group_entry) + update_event_times(event); +} + +/* + * Cross CPU call to disable a performance event + */ +static void __perf_event_disable(void *info) +{ + struct perf_event *event = info; + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + struct perf_event_context *ctx = event->ctx; + + /* + * If this is a per-task event, need to check whether this + * event's task is the current task on this cpu. + */ + if (ctx->task && cpuctx->task_ctx != ctx) + return; + + spin_lock(&ctx->lock); + + /* + * If the event is on, turn it off. + * If it is in error state, leave it in error state. + */ + if (event->state >= PERF_EVENT_STATE_INACTIVE) { + update_context_time(ctx); + update_group_times(event); + if (event == event->group_leader) + group_sched_out(event, cpuctx, ctx); + else + event_sched_out(event, cpuctx, ctx); + event->state = PERF_EVENT_STATE_OFF; + } + + spin_unlock(&ctx->lock); +} + +/* + * Disable a event. + * + * If event->ctx is a cloned context, callers must make sure that + * every task struct that event->ctx->task could possibly point to + * remains valid. This condition is satisifed when called through + * perf_event_for_each_child or perf_event_for_each because they + * hold the top-level event's child_mutex, so any descendant that + * goes to exit will block in sync_child_event. + * When called from perf_pending_event it's OK because event->ctx + * is the current context on this CPU and preemption is disabled, + * hence we can't get into perf_event_task_sched_out for this context. + */ +static void perf_event_disable(struct perf_event *event) +{ + struct perf_event_context *ctx = event->ctx; + struct task_struct *task = ctx->task; + + if (!task) { + /* + * Disable the event on the cpu that it's on + */ + smp_call_function_single(event->cpu, __perf_event_disable, + event, 1); + return; + } + + retry: + task_oncpu_function_call(task, __perf_event_disable, event); + + spin_lock_irq(&ctx->lock); + /* + * If the event is still active, we need to retry the cross-call. + */ + if (event->state == PERF_EVENT_STATE_ACTIVE) { + spin_unlock_irq(&ctx->lock); + goto retry; + } + + /* + * Since we have the lock this context can't be scheduled + * in, so we can change the state safely. + */ + if (event->state == PERF_EVENT_STATE_INACTIVE) { + update_group_times(event); + event->state = PERF_EVENT_STATE_OFF; + } + + spin_unlock_irq(&ctx->lock); +} + +static int +event_sched_in(struct perf_event *event, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx, + int cpu) +{ + if (event->state <= PERF_EVENT_STATE_OFF) + return 0; + + event->state = PERF_EVENT_STATE_ACTIVE; + event->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */ + /* + * The new state must be visible before we turn it on in the hardware: + */ + smp_wmb(); + + if (event->pmu->enable(event)) { + event->state = PERF_EVENT_STATE_INACTIVE; + event->oncpu = -1; + return -EAGAIN; + } + + event->tstamp_running += ctx->time - event->tstamp_stopped; + + if (!is_software_event(event)) + cpuctx->active_oncpu++; + ctx->nr_active++; + + if (event->attr.exclusive) + cpuctx->exclusive = 1; + + return 0; +} + +static int +group_sched_in(struct perf_event *group_event, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx, + int cpu) +{ + struct perf_event *event, *partial_group; + int ret; + + if (group_event->state == PERF_EVENT_STATE_OFF) + return 0; + + ret = hw_perf_group_sched_in(group_event, cpuctx, ctx, cpu); + if (ret) + return ret < 0 ? ret : 0; + + if (event_sched_in(group_event, cpuctx, ctx, cpu)) + return -EAGAIN; + + /* + * Schedule in siblings as one group (if any): + */ + list_for_each_entry(event, &group_event->sibling_list, group_entry) { + if (event_sched_in(event, cpuctx, ctx, cpu)) { + partial_group = event; + goto group_error; + } + } + + return 0; + +group_error: + /* + * Groups can be scheduled in as one unit only, so undo any + * partial group before returning: + */ + list_for_each_entry(event, &group_event->sibling_list, group_entry) { + if (event == partial_group) + break; + event_sched_out(event, cpuctx, ctx); + } + event_sched_out(group_event, cpuctx, ctx); + + return -EAGAIN; +} + +/* + * Return 1 for a group consisting entirely of software events, + * 0 if the group contains any hardware events. + */ +static int is_software_only_group(struct perf_event *leader) +{ + struct perf_event *event; + + if (!is_software_event(leader)) + return 0; + + list_for_each_entry(event, &leader->sibling_list, group_entry) + if (!is_software_event(event)) + return 0; + + return 1; +} + +/* + * Work out whether we can put this event group on the CPU now. + */ +static int group_can_go_on(struct perf_event *event, + struct perf_cpu_context *cpuctx, + int can_add_hw) +{ + /* + * Groups consisting entirely of software events can always go on. + */ + if (is_software_only_group(event)) + return 1; + /* + * If an exclusive group is already on, no other hardware + * events can go on. + */ + if (cpuctx->exclusive) + return 0; + /* + * If this group is exclusive and there are already + * events on the CPU, it can't go on. + */ + if (event->attr.exclusive && cpuctx->active_oncpu) + return 0; + /* + * Otherwise, try to add it if all previous groups were able + * to go on. + */ + return can_add_hw; +} + +static void add_event_to_ctx(struct perf_event *event, + struct perf_event_context *ctx) +{ + list_add_event(event, ctx); + event->tstamp_enabled = ctx->time; + event->tstamp_running = ctx->time; + event->tstamp_stopped = ctx->time; +} + +/* + * Cross CPU call to install and enable a performance event + * + * Must be called with ctx->mutex held + */ +static void __perf_install_in_context(void *info) +{ + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + struct perf_event *event = info; + struct perf_event_context *ctx = event->ctx; + struct perf_event *leader = event->group_leader; + int cpu = smp_processor_id(); + int err; + + /* + * If this is a task context, we need to check whether it is + * the current task context of this cpu. If not it has been + * scheduled out before the smp call arrived. + * Or possibly this is the right context but it isn't + * on this cpu because it had no events. + */ + if (ctx->task && cpuctx->task_ctx != ctx) { + if (cpuctx->task_ctx || ctx->task != current) + return; + cpuctx->task_ctx = ctx; + } + + spin_lock(&ctx->lock); + ctx->is_active = 1; + update_context_time(ctx); + + /* + * Protect the list operation against NMI by disabling the + * events on a global level. NOP for non NMI based events. + */ + perf_disable(); + + add_event_to_ctx(event, ctx); + + /* + * Don't put the event on if it is disabled or if + * it is in a group and the group isn't on. + */ + if (event->state != PERF_EVENT_STATE_INACTIVE || + (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)) + goto unlock; + + /* + * An exclusive event can't go on if there are already active + * hardware events, and no hardware event can go on if there + * is already an exclusive event on. + */ + if (!group_can_go_on(event, cpuctx, 1)) + err = -EEXIST; + else + err = event_sched_in(event, cpuctx, ctx, cpu); + + if (err) { + /* + * This event couldn't go on. If it is in a group + * then we have to pull the whole group off. + * If the event group is pinned then put it in error state. + */ + if (leader != event) + group_sched_out(leader, cpuctx, ctx); + if (leader->attr.pinned) { + update_group_times(leader); + leader->state = PERF_EVENT_STATE_ERROR; + } + } + + if (!err && !ctx->task && cpuctx->max_pertask) + cpuctx->max_pertask--; + + unlock: + perf_enable(); + + spin_unlock(&ctx->lock); +} + +/* + * Attach a performance event to a context + * + * First we add the event to the list with the hardware enable bit + * in event->hw_config cleared. + * + * If the event is attached to a task which is on a CPU we use a smp + * call to enable it in the task context. The task might have been + * scheduled away, but we check this in the smp call again. + * + * Must be called with ctx->mutex held. + */ +static void +perf_install_in_context(struct perf_event_context *ctx, + struct perf_event *event, + int cpu) +{ + struct task_struct *task = ctx->task; + + if (!task) { + /* + * Per cpu events are installed via an smp call and + * the install is always sucessful. + */ + smp_call_function_single(cpu, __perf_install_in_context, + event, 1); + return; + } + +retry: + task_oncpu_function_call(task, __perf_install_in_context, + event); + + spin_lock_irq(&ctx->lock); + /* + * we need to retry the smp call. + */ + if (ctx->is_active && list_empty(&event->group_entry)) { + spin_unlock_irq(&ctx->lock); + goto retry; + } + + /* + * The lock prevents that this context is scheduled in so we + * can add the event safely, if it the call above did not + * succeed. + */ + if (list_empty(&event->group_entry)) + add_event_to_ctx(event, ctx); + spin_unlock_irq(&ctx->lock); +} + +/* + * Put a event into inactive state and update time fields. + * Enabling the leader of a group effectively enables all + * the group members that aren't explicitly disabled, so we + * have to update their ->tstamp_enabled also. + * Note: this works for group members as well as group leaders + * since the non-leader members' sibling_lists will be empty. + */ +static void __perf_event_mark_enabled(struct perf_event *event, + struct perf_event_context *ctx) +{ + struct perf_event *sub; + + event->state = PERF_EVENT_STATE_INACTIVE; + event->tstamp_enabled = ctx->time - event->total_time_enabled; + list_for_each_entry(sub, &event->sibling_list, group_entry) + if (sub->state >= PERF_EVENT_STATE_INACTIVE) + sub->tstamp_enabled = + ctx->time - sub->total_time_enabled; +} + +/* + * Cross CPU call to enable a performance event + */ +static void __perf_event_enable(void *info) +{ + struct perf_event *event = info; + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + struct perf_event_context *ctx = event->ctx; + struct perf_event *leader = event->group_leader; + int err; + + /* + * If this is a per-task event, need to check whether this + * event's task is the current task on this cpu. + */ + if (ctx->task && cpuctx->task_ctx != ctx) { + if (cpuctx->task_ctx || ctx->task != current) + return; + cpuctx->task_ctx = ctx; + } + + spin_lock(&ctx->lock); + ctx->is_active = 1; + update_context_time(ctx); + + if (event->state >= PERF_EVENT_STATE_INACTIVE) + goto unlock; + __perf_event_mark_enabled(event, ctx); + + /* + * If the event is in a group and isn't the group leader, + * then don't put it on unless the group is on. + */ + if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) + goto unlock; + + if (!group_can_go_on(event, cpuctx, 1)) { + err = -EEXIST; + } else { + perf_disable(); + if (event == leader) + err = group_sched_in(event, cpuctx, ctx, + smp_processor_id()); + else + err = event_sched_in(event, cpuctx, ctx, + smp_processor_id()); + perf_enable(); + } + + if (err) { + /* + * If this event can't go on and it's part of a + * group, then the whole group has to come off. + */ + if (leader != event) + group_sched_out(leader, cpuctx, ctx); + if (leader->attr.pinned) { + update_group_times(leader); + leader->state = PERF_EVENT_STATE_ERROR; + } + } + + unlock: + spin_unlock(&ctx->lock); +} + +/* + * Enable a event. + * + * If event->ctx is a cloned context, callers must make sure that + * every task struct that event->ctx->task could possibly point to + * remains valid. This condition is satisfied when called through + * perf_event_for_each_child or perf_event_for_each as described + * for perf_event_disable. + */ +static void perf_event_enable(struct perf_event *event) +{ + struct perf_event_context *ctx = event->ctx; + struct task_struct *task = ctx->task; + + if (!task) { + /* + * Enable the event on the cpu that it's on + */ + smp_call_function_single(event->cpu, __perf_event_enable, + event, 1); + return; + } + + spin_lock_irq(&ctx->lock); + if (event->state >= PERF_EVENT_STATE_INACTIVE) + goto out; + + /* + * If the event is in error state, clear that first. + * That way, if we see the event in error state below, we + * know that it has gone back into error state, as distinct + * from the task having been scheduled away before the + * cross-call arrived. + */ + if (event->state == PERF_EVENT_STATE_ERROR) + event->state = PERF_EVENT_STATE_OFF; + + retry: + spin_unlock_irq(&ctx->lock); + task_oncpu_function_call(task, __perf_event_enable, event); + + spin_lock_irq(&ctx->lock); + + /* + * If the context is active and the event is still off, + * we need to retry the cross-call. + */ + if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) + goto retry; + + /* + * Since we have the lock this context can't be scheduled + * in, so we can change the state safely. + */ + if (event->state == PERF_EVENT_STATE_OFF) + __perf_event_mark_enabled(event, ctx); + + out: + spin_unlock_irq(&ctx->lock); +} + +static int perf_event_refresh(struct perf_event *event, int refresh) +{ + /* + * not supported on inherited events + */ + if (event->attr.inherit) + return -EINVAL; + + atomic_add(refresh, &event->event_limit); + perf_event_enable(event); + + return 0; +} + +void __perf_event_sched_out(struct perf_event_context *ctx, + struct perf_cpu_context *cpuctx) +{ + struct perf_event *event; + + spin_lock(&ctx->lock); + ctx->is_active = 0; + if (likely(!ctx->nr_events)) + goto out; + update_context_time(ctx); + + perf_disable(); + if (ctx->nr_active) + list_for_each_entry(event, &ctx->group_list, group_entry) + group_sched_out(event, cpuctx, ctx); + + perf_enable(); + out: + spin_unlock(&ctx->lock); +} + +/* + * Test whether two contexts are equivalent, i.e. whether they + * have both been cloned from the same version of the same context + * and they both have the same number of enabled events. + * If the number of enabled events is the same, then the set + * of enabled events should be the same, because these are both + * inherited contexts, therefore we can't access individual events + * in them directly with an fd; we can only enable/disable all + * events via prctl, or enable/disable all events in a family + * via ioctl, which will have the same effect on both contexts. + */ +static int context_equiv(struct perf_event_context *ctx1, + struct perf_event_context *ctx2) +{ + return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx + && ctx1->parent_gen == ctx2->parent_gen + && !ctx1->pin_count && !ctx2->pin_count; +} + +static void __perf_event_read(void *event); + +static void __perf_event_sync_stat(struct perf_event *event, + struct perf_event *next_event) +{ + u64 value; + + if (!event->attr.inherit_stat) + return; + + /* + * Update the event value, we cannot use perf_event_read() + * because we're in the middle of a context switch and have IRQs + * disabled, which upsets smp_call_function_single(), however + * we know the event must be on the current CPU, therefore we + * don't need to use it. + */ + switch (event->state) { + case PERF_EVENT_STATE_ACTIVE: + __perf_event_read(event); + break; + + case PERF_EVENT_STATE_INACTIVE: + update_event_times(event); + break; + + default: + break; + } + + /* + * In order to keep per-task stats reliable we need to flip the event + * values when we flip the contexts. + */ + value = atomic64_read(&next_event->count); + value = atomic64_xchg(&event->count, value); + atomic64_set(&next_event->count, value); + + swap(event->total_time_enabled, next_event->total_time_enabled); + swap(event->total_time_running, next_event->total_time_running); + + /* + * Since we swizzled the values, update the user visible data too. + */ + perf_event_update_userpage(event); + perf_event_update_userpage(next_event); +} + +#define list_next_entry(pos, member) \ + list_entry(pos->member.next, typeof(*pos), member) + +static void perf_event_sync_stat(struct perf_event_context *ctx, + struct perf_event_context *next_ctx) +{ + struct perf_event *event, *next_event; + + if (!ctx->nr_stat) + return; + + event = list_first_entry(&ctx->event_list, + struct perf_event, event_entry); + + next_event = list_first_entry(&next_ctx->event_list, + struct perf_event, event_entry); + + while (&event->event_entry != &ctx->event_list && + &next_event->event_entry != &next_ctx->event_list) { + + __perf_event_sync_stat(event, next_event); + + event = list_next_entry(event, event_entry); + next_event = list_next_entry(next_event, event_entry); + } +} + +/* + * Called from scheduler to remove the events of the current task, + * with interrupts disabled. + * + * We stop each event and update the event value in event->count. + * + * This does not protect us against NMI, but disable() + * sets the disabled bit in the control field of event _before_ + * accessing the event control register. If a NMI hits, then it will + * not restart the event. + */ +void perf_event_task_sched_out(struct task_struct *task, + struct task_struct *next, int cpu) +{ + struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); + struct perf_event_context *ctx = task->perf_event_ctxp; + struct perf_event_context *next_ctx; + struct perf_event_context *parent; + struct pt_regs *regs; + int do_switch = 1; + + regs = task_pt_regs(task); + perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0); + + if (likely(!ctx || !cpuctx->task_ctx)) + return; + + update_context_time(ctx); + + rcu_read_lock(); + parent = rcu_dereference(ctx->parent_ctx); + next_ctx = next->perf_event_ctxp; + if (parent && next_ctx && + rcu_dereference(next_ctx->parent_ctx) == parent) { + /* + * Looks like the two contexts are clones, so we might be + * able to optimize the context switch. We lock both + * contexts and check that they are clones under the + * lock (including re-checking that neither has been + * uncloned in the meantime). It doesn't matter which + * order we take the locks because no other cpu could + * be trying to lock both of these tasks. + */ + spin_lock(&ctx->lock); + spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); + if (context_equiv(ctx, next_ctx)) { + /* + * XXX do we need a memory barrier of sorts + * wrt to rcu_dereference() of perf_event_ctxp + */ + task->perf_event_ctxp = next_ctx; + next->perf_event_ctxp = ctx; + ctx->task = next; + next_ctx->task = task; + do_switch = 0; + + perf_event_sync_stat(ctx, next_ctx); + } + spin_unlock(&next_ctx->lock); + spin_unlock(&ctx->lock); + } + rcu_read_unlock(); + + if (do_switch) { + __perf_event_sched_out(ctx, cpuctx); + cpuctx->task_ctx = NULL; + } +} + +/* + * Called with IRQs disabled + */ +static void __perf_event_task_sched_out(struct perf_event_context *ctx) +{ + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + + if (!cpuctx->task_ctx) + return; + + if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) + return; + + __perf_event_sched_out(ctx, cpuctx); + cpuctx->task_ctx = NULL; +} + +/* + * Called with IRQs disabled + */ +static void perf_event_cpu_sched_out(struct perf_cpu_context *cpuctx) +{ + __perf_event_sched_out(&cpuctx->ctx, cpuctx); +} + +static void +__perf_event_sched_in(struct perf_event_context *ctx, + struct perf_cpu_context *cpuctx, int cpu) +{ + struct perf_event *event; + int can_add_hw = 1; + + spin_lock(&ctx->lock); + ctx->is_active = 1; + if (likely(!ctx->nr_events)) + goto out; + + ctx->timestamp = perf_clock(); + + perf_disable(); + + /* + * First go through the list and put on any pinned groups + * in order to give them the best chance of going on. + */ + list_for_each_entry(event, &ctx->group_list, group_entry) { + if (event->state <= PERF_EVENT_STATE_OFF || + !event->attr.pinned) + continue; + if (event->cpu != -1 && event->cpu != cpu) + continue; + + if (group_can_go_on(event, cpuctx, 1)) + group_sched_in(event, cpuctx, ctx, cpu); + + /* + * If this pinned group hasn't been scheduled, + * put it in error state. + */ + if (event->state == PERF_EVENT_STATE_INACTIVE) { + update_group_times(event); + event->state = PERF_EVENT_STATE_ERROR; + } + } + + list_for_each_entry(event, &ctx->group_list, group_entry) { + /* + * Ignore events in OFF or ERROR state, and + * ignore pinned events since we did them already. + */ + if (event->state <= PERF_EVENT_STATE_OFF || + event->attr.pinned) + continue; + + /* + * Listen to the 'cpu' scheduling filter constraint + * of events: + */ + if (event->cpu != -1 && event->cpu != cpu) + continue; + + if (group_can_go_on(event, cpuctx, can_add_hw)) + if (group_sched_in(event, cpuctx, ctx, cpu)) + can_add_hw = 0; + } + perf_enable(); + out: + spin_unlock(&ctx->lock); +} + +/* + * Called from scheduler to add the events of the current task + * with interrupts disabled. + * + * We restore the event value and then enable it. + * + * This does not protect us against NMI, but enable() + * sets the enabled bit in the control field of event _before_ + * accessing the event control register. If a NMI hits, then it will + * keep the event running. + */ +void perf_event_task_sched_in(struct task_struct *task, int cpu) +{ + struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); + struct perf_event_context *ctx = task->perf_event_ctxp; + + if (likely(!ctx)) + return; + if (cpuctx->task_ctx == ctx) + return; + __perf_event_sched_in(ctx, cpuctx, cpu); + cpuctx->task_ctx = ctx; +} + +static void perf_event_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu) +{ + struct perf_event_context *ctx = &cpuctx->ctx; + + __perf_event_sched_in(ctx, cpuctx, cpu); +} + +#define MAX_INTERRUPTS (~0ULL) + +static void perf_log_throttle(struct perf_event *event, int enable); + +static void perf_adjust_period(struct perf_event *event, u64 events) +{ + struct hw_perf_event *hwc = &event->hw; + u64 period, sample_period; + s64 delta; + + events *= hwc->sample_period; + period = div64_u64(events, event->attr.sample_freq); + + delta = (s64)(period - hwc->sample_period); + delta = (delta + 7) / 8; /* low pass filter */ + + sample_period = hwc->sample_period + delta; + + if (!sample_period) + sample_period = 1; + + hwc->sample_period = sample_period; +} + +static void perf_ctx_adjust_freq(struct perf_event_context *ctx) +{ + struct perf_event *event; + struct hw_perf_event *hwc; + u64 interrupts, freq; + + spin_lock(&ctx->lock); + list_for_each_entry(event, &ctx->group_list, group_entry) { + if (event->state != PERF_EVENT_STATE_ACTIVE) + continue; + + hwc = &event->hw; + + interrupts = hwc->interrupts; + hwc->interrupts = 0; + + /* + * unthrottle events on the tick + */ + if (interrupts == MAX_INTERRUPTS) { + perf_log_throttle(event, 1); + event->pmu->unthrottle(event); + interrupts = 2*sysctl_perf_event_sample_rate/HZ; + } + + if (!event->attr.freq || !event->attr.sample_freq) + continue; + + /* + * if the specified freq < HZ then we need to skip ticks + */ + if (event->attr.sample_freq < HZ) { + freq = event->attr.sample_freq; + + hwc->freq_count += freq; + hwc->freq_interrupts += interrupts; + + if (hwc->freq_count < HZ) + continue; + + interrupts = hwc->freq_interrupts; + hwc->freq_interrupts = 0; + hwc->freq_count -= HZ; + } else + freq = HZ; + + perf_adjust_period(event, freq * interrupts); + + /* + * In order to avoid being stalled by an (accidental) huge + * sample period, force reset the sample period if we didn't + * get any events in this freq period. + */ + if (!interrupts) { + perf_disable(); + event->pmu->disable(event); + atomic64_set(&hwc->period_left, 0); + event->pmu->enable(event); + perf_enable(); + } + } + spin_unlock(&ctx->lock); +} + +/* + * Round-robin a context's events: + */ +static void rotate_ctx(struct perf_event_context *ctx) +{ + struct perf_event *event; + + if (!ctx->nr_events) + return; + + spin_lock(&ctx->lock); + /* + * Rotate the first entry last (works just fine for group events too): + */ + perf_disable(); + list_for_each_entry(event, &ctx->group_list, group_entry) { + list_move_tail(&event->group_entry, &ctx->group_list); + break; + } + perf_enable(); + + spin_unlock(&ctx->lock); +} + +void perf_event_task_tick(struct task_struct *curr, int cpu) +{ + struct perf_cpu_context *cpuctx; + struct perf_event_context *ctx; + + if (!atomic_read(&nr_events)) + return; + + cpuctx = &per_cpu(perf_cpu_context, cpu); + ctx = curr->perf_event_ctxp; + + perf_ctx_adjust_freq(&cpuctx->ctx); + if (ctx) + perf_ctx_adjust_freq(ctx); + + perf_event_cpu_sched_out(cpuctx); + if (ctx) + __perf_event_task_sched_out(ctx); + + rotate_ctx(&cpuctx->ctx); + if (ctx) + rotate_ctx(ctx); + + perf_event_cpu_sched_in(cpuctx, cpu); + if (ctx) + perf_event_task_sched_in(curr, cpu); +} + +/* + * Enable all of a task's events that have been marked enable-on-exec. + * This expects task == current. + */ +static void perf_event_enable_on_exec(struct task_struct *task) +{ + struct perf_event_context *ctx; + struct perf_event *event; + unsigned long flags; + int enabled = 0; + + local_irq_save(flags); + ctx = task->perf_event_ctxp; + if (!ctx || !ctx->nr_events) + goto out; + + __perf_event_task_sched_out(ctx); + + spin_lock(&ctx->lock); + + list_for_each_entry(event, &ctx->group_list, group_entry) { + if (!event->attr.enable_on_exec) + continue; + event->attr.enable_on_exec = 0; + if (event->state >= PERF_EVENT_STATE_INACTIVE) + continue; + __perf_event_mark_enabled(event, ctx); + enabled = 1; + } + + /* + * Unclone this context if we enabled any event. + */ + if (enabled) + unclone_ctx(ctx); + + spin_unlock(&ctx->lock); + + perf_event_task_sched_in(task, smp_processor_id()); + out: + local_irq_restore(flags); +} + +/* + * Cross CPU call to read the hardware event + */ +static void __perf_event_read(void *info) +{ + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + struct perf_event *event = info; + struct perf_event_context *ctx = event->ctx; + unsigned long flags; + + /* + * If this is a task context, we need to check whether it is + * the current task context of this cpu. If not it has been + * scheduled out before the smp call arrived. In that case + * event->count would have been updated to a recent sample + * when the event was scheduled out. + */ + if (ctx->task && cpuctx->task_ctx != ctx) + return; + + local_irq_save(flags); + if (ctx->is_active) + update_context_time(ctx); + event->pmu->read(event); + update_event_times(event); + local_irq_restore(flags); +} + +static u64 perf_event_read(struct perf_event *event) +{ + /* + * If event is enabled and currently active on a CPU, update the + * value in the event structure: + */ + if (event->state == PERF_EVENT_STATE_ACTIVE) { + smp_call_function_single(event->oncpu, + __perf_event_read, event, 1); + } else if (event->state == PERF_EVENT_STATE_INACTIVE) { + update_event_times(event); + } + + return atomic64_read(&event->count); +} + +/* + * Initialize the perf_event context in a task_struct: + */ +static void +__perf_event_init_context(struct perf_event_context *ctx, + struct task_struct *task) +{ + memset(ctx, 0, sizeof(*ctx)); + spin_lock_init(&ctx->lock); + mutex_init(&ctx->mutex); + INIT_LIST_HEAD(&ctx->group_list); + INIT_LIST_HEAD(&ctx->event_list); + atomic_set(&ctx->refcount, 1); + ctx->task = task; +} + +static struct perf_event_context *find_get_context(pid_t pid, int cpu) +{ + struct perf_event_context *ctx; + struct perf_cpu_context *cpuctx; + struct task_struct *task; + unsigned long flags; + int err; + + /* + * If cpu is not a wildcard then this is a percpu event: + */ + if (cpu != -1) { + /* Must be root to operate on a CPU event: */ + if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EACCES); + + if (cpu < 0 || cpu > num_possible_cpus()) + return ERR_PTR(-EINVAL); + + /* + * We could be clever and allow to attach a event to an + * offline CPU and activate it when the CPU comes up, but + * that's for later. + */ + if (!cpu_isset(cpu, cpu_online_map)) + return ERR_PTR(-ENODEV); + + cpuctx = &per_cpu(perf_cpu_context, cpu); + ctx = &cpuctx->ctx; + get_ctx(ctx); + + return ctx; + } + + rcu_read_lock(); + if (!pid) + task = current; + else + task = find_task_by_vpid(pid); + if (task) + get_task_struct(task); + rcu_read_unlock(); + + if (!task) + return ERR_PTR(-ESRCH); + + /* + * Can't attach events to a dying task. + */ + err = -ESRCH; + if (task->flags & PF_EXITING) + goto errout; + + /* Reuse ptrace permission checks for now. */ + err = -EACCES; + if (!ptrace_may_access(task, PTRACE_MODE_READ)) + goto errout; + + retry: + ctx = perf_lock_task_context(task, &flags); + if (ctx) { + unclone_ctx(ctx); + spin_unlock_irqrestore(&ctx->lock, flags); + } + + if (!ctx) { + ctx = kmalloc(sizeof(struct perf_event_context), GFP_KERNEL); + err = -ENOMEM; + if (!ctx) + goto errout; + __perf_event_init_context(ctx, task); + get_ctx(ctx); + if (cmpxchg(&task->perf_event_ctxp, NULL, ctx)) { + /* + * We raced with some other task; use + * the context they set. + */ + kfree(ctx); + goto retry; + } + get_task_struct(task); + } + + put_task_struct(task); + return ctx; + + errout: + put_task_struct(task); + return ERR_PTR(err); +} + +static void perf_event_free_filter(struct perf_event *event); + +static void free_event_rcu(struct rcu_head *head) +{ + struct perf_event *event; + + event = container_of(head, struct perf_event, rcu_head); + if (event->ns) + put_pid_ns(event->ns); + perf_event_free_filter(event); + kfree(event); +} + +static void perf_pending_sync(struct perf_event *event); + +static void free_event(struct perf_event *event) +{ + perf_pending_sync(event); + + if (!event->parent) { + atomic_dec(&nr_events); + if (event->attr.mmap) + atomic_dec(&nr_mmap_events); + if (event->attr.comm) + atomic_dec(&nr_comm_events); + if (event->attr.task) + atomic_dec(&nr_task_events); + } + + if (event->output) { + fput(event->output->filp); + event->output = NULL; + } + + if (event->destroy) + event->destroy(event); + + put_ctx(event->ctx); + call_rcu(&event->rcu_head, free_event_rcu); +} + +/* + * Called when the last reference to the file is gone. + */ +static int perf_release(struct inode *inode, struct file *file) +{ + struct perf_event *event = file->private_data; + struct perf_event_context *ctx = event->ctx; + + file->private_data = NULL; + + WARN_ON_ONCE(ctx->parent_ctx); + mutex_lock(&ctx->mutex); + perf_event_remove_from_context(event); + mutex_unlock(&ctx->mutex); + + mutex_lock(&event->owner->perf_event_mutex); + list_del_init(&event->owner_entry); + mutex_unlock(&event->owner->perf_event_mutex); + put_task_struct(event->owner); + + free_event(event); + + return 0; +} + +static int perf_event_read_size(struct perf_event *event) +{ + int entry = sizeof(u64); /* value */ + int size = 0; + int nr = 1; + + if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) + size += sizeof(u64); + + if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) + size += sizeof(u64); + + if (event->attr.read_format & PERF_FORMAT_ID) + entry += sizeof(u64); + + if (event->attr.read_format & PERF_FORMAT_GROUP) { + nr += event->group_leader->nr_siblings; + size += sizeof(u64); + } + + size += entry * nr; + + return size; +} + +static u64 perf_event_read_value(struct perf_event *event) +{ + struct perf_event *child; + u64 total = 0; + + total += perf_event_read(event); + list_for_each_entry(child, &event->child_list, child_list) + total += perf_event_read(child); + + return total; +} + +static int perf_event_read_entry(struct perf_event *event, + u64 read_format, char __user *buf) +{ + int n = 0, count = 0; + u64 values[2]; + + values[n++] = perf_event_read_value(event); + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_event_id(event); + + count = n * sizeof(u64); + + if (copy_to_user(buf, values, count)) + return -EFAULT; + + return count; +} + +static int perf_event_read_group(struct perf_event *event, + u64 read_format, char __user *buf) +{ + struct perf_event *leader = event->group_leader, *sub; + int n = 0, size = 0, err = -EFAULT; + u64 values[3]; + + values[n++] = 1 + leader->nr_siblings; + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { + values[n++] = leader->total_time_enabled + + atomic64_read(&leader->child_total_time_enabled); + } + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { + values[n++] = leader->total_time_running + + atomic64_read(&leader->child_total_time_running); + } + + size = n * sizeof(u64); + + if (copy_to_user(buf, values, size)) + return -EFAULT; + + err = perf_event_read_entry(leader, read_format, buf + size); + if (err < 0) + return err; + + size += err; + + list_for_each_entry(sub, &leader->sibling_list, group_entry) { + err = perf_event_read_entry(sub, read_format, + buf + size); + if (err < 0) + return err; + + size += err; + } + + return size; +} + +static int perf_event_read_one(struct perf_event *event, + u64 read_format, char __user *buf) +{ + u64 values[4]; + int n = 0; + + values[n++] = perf_event_read_value(event); + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { + values[n++] = event->total_time_enabled + + atomic64_read(&event->child_total_time_enabled); + } + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { + values[n++] = event->total_time_running + + atomic64_read(&event->child_total_time_running); + } + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_event_id(event); + + if (copy_to_user(buf, values, n * sizeof(u64))) + return -EFAULT; + + return n * sizeof(u64); +} + +/* + * Read the performance event - simple non blocking version for now + */ +static ssize_t +perf_read_hw(struct perf_event *event, char __user *buf, size_t count) +{ + u64 read_format = event->attr.read_format; + int ret; + + /* + * Return end-of-file for a read on a event that is in + * error state (i.e. because it was pinned but it couldn't be + * scheduled on to the CPU at some point). + */ + if (event->state == PERF_EVENT_STATE_ERROR) + return 0; + + if (count < perf_event_read_size(event)) + return -ENOSPC; + + WARN_ON_ONCE(event->ctx->parent_ctx); + mutex_lock(&event->child_mutex); + if (read_format & PERF_FORMAT_GROUP) + ret = perf_event_read_group(event, read_format, buf); + else + ret = perf_event_read_one(event, read_format, buf); + mutex_unlock(&event->child_mutex); + + return ret; +} + +static ssize_t +perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) +{ + struct perf_event *event = file->private_data; + + return perf_read_hw(event, buf, count); +} + +static unsigned int perf_poll(struct file *file, poll_table *wait) +{ + struct perf_event *event = file->private_data; + struct perf_mmap_data *data; + unsigned int events = POLL_HUP; + + rcu_read_lock(); + data = rcu_dereference(event->data); + if (data) + events = atomic_xchg(&data->poll, 0); + rcu_read_unlock(); + + poll_wait(file, &event->waitq, wait); + + return events; +} + +static void perf_event_reset(struct perf_event *event) +{ + (void)perf_event_read(event); + atomic64_set(&event->count, 0); + perf_event_update_userpage(event); +} + +/* + * Holding the top-level event's child_mutex means that any + * descendant process that has inherited this event will block + * in sync_child_event if it goes to exit, thus satisfying the + * task existence requirements of perf_event_enable/disable. + */ +static void perf_event_for_each_child(struct perf_event *event, + void (*func)(struct perf_event *)) +{ + struct perf_event *child; + + WARN_ON_ONCE(event->ctx->parent_ctx); + mutex_lock(&event->child_mutex); + func(event); + list_for_each_entry(child, &event->child_list, child_list) + func(child); + mutex_unlock(&event->child_mutex); +} + +static void perf_event_for_each(struct perf_event *event, + void (*func)(struct perf_event *)) +{ + struct perf_event_context *ctx = event->ctx; + struct perf_event *sibling; + + WARN_ON_ONCE(ctx->parent_ctx); + mutex_lock(&ctx->mutex); + event = event->group_leader; + + perf_event_for_each_child(event, func); + func(event); + list_for_each_entry(sibling, &event->sibling_list, group_entry) + perf_event_for_each_child(event, func); + mutex_unlock(&ctx->mutex); +} + +static int perf_event_period(struct perf_event *event, u64 __user *arg) +{ + struct perf_event_context *ctx = event->ctx; + unsigned long size; + int ret = 0; + u64 value; + + if (!event->attr.sample_period) + return -EINVAL; + + size = copy_from_user(&value, arg, sizeof(value)); + if (size != sizeof(value)) + return -EFAULT; + + if (!value) + return -EINVAL; + + spin_lock_irq(&ctx->lock); + if (event->attr.freq) { + if (value > sysctl_perf_event_sample_rate) { + ret = -EINVAL; + goto unlock; + } + + event->attr.sample_freq = value; + } else { + event->attr.sample_period = value; + event->hw.sample_period = value; + } +unlock: + spin_unlock_irq(&ctx->lock); + + return ret; +} + +static int perf_event_set_output(struct perf_event *event, int output_fd); +static int perf_event_set_filter(struct perf_event *event, void __user *arg); + +static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct perf_event *event = file->private_data; + void (*func)(struct perf_event *); + u32 flags = arg; + + switch (cmd) { + case PERF_EVENT_IOC_ENABLE: + func = perf_event_enable; + break; + case PERF_EVENT_IOC_DISABLE: + func = perf_event_disable; + break; + case PERF_EVENT_IOC_RESET: + func = perf_event_reset; + break; + + case PERF_EVENT_IOC_REFRESH: + return perf_event_refresh(event, arg); + + case PERF_EVENT_IOC_PERIOD: + return perf_event_period(event, (u64 __user *)arg); + + case PERF_EVENT_IOC_SET_OUTPUT: + return perf_event_set_output(event, arg); + + case PERF_EVENT_IOC_SET_FILTER: + return perf_event_set_filter(event, (void __user *)arg); + + default: + return -ENOTTY; + } + + if (flags & PERF_IOC_FLAG_GROUP) + perf_event_for_each(event, func); + else + perf_event_for_each_child(event, func); + + return 0; +} + +int perf_event_task_enable(void) +{ + struct perf_event *event; + + mutex_lock(¤t->perf_event_mutex); + list_for_each_entry(event, ¤t->perf_event_list, owner_entry) + perf_event_for_each_child(event, perf_event_enable); + mutex_unlock(¤t->perf_event_mutex); + + return 0; +} + +int perf_event_task_disable(void) +{ + struct perf_event *event; + + mutex_lock(¤t->perf_event_mutex); + list_for_each_entry(event, ¤t->perf_event_list, owner_entry) + perf_event_for_each_child(event, perf_event_disable); + mutex_unlock(¤t->perf_event_mutex); + + return 0; +} + +#ifndef PERF_EVENT_INDEX_OFFSET +# define PERF_EVENT_INDEX_OFFSET 0 +#endif + +static int perf_event_index(struct perf_event *event) +{ + if (event->state != PERF_EVENT_STATE_ACTIVE) + return 0; + + return event->hw.idx + 1 - PERF_EVENT_INDEX_OFFSET; +} + +/* + * Callers need to ensure there can be no nesting of this function, otherwise + * the seqlock logic goes bad. We can not serialize this because the arch + * code calls this from NMI context. + */ +void perf_event_update_userpage(struct perf_event *event) +{ + struct perf_event_mmap_page *userpg; + struct perf_mmap_data *data; + + rcu_read_lock(); + data = rcu_dereference(event->data); + if (!data) + goto unlock; + + userpg = data->user_page; + + /* + * Disable preemption so as to not let the corresponding user-space + * spin too long if we get preempted. + */ + preempt_disable(); + ++userpg->lock; + barrier(); + userpg->index = perf_event_index(event); + userpg->offset = atomic64_read(&event->count); + if (event->state == PERF_EVENT_STATE_ACTIVE) + userpg->offset -= atomic64_read(&event->hw.prev_count); + + userpg->time_enabled = event->total_time_enabled + + atomic64_read(&event->child_total_time_enabled); + + userpg->time_running = event->total_time_running + + atomic64_read(&event->child_total_time_running); + + barrier(); + ++userpg->lock; + preempt_enable(); +unlock: + rcu_read_unlock(); +} + +static unsigned long perf_data_size(struct perf_mmap_data *data) +{ + return data->nr_pages << (PAGE_SHIFT + data->data_order); +} + +#ifndef CONFIG_PERF_USE_VMALLOC + +/* + * Back perf_mmap() with regular GFP_KERNEL-0 pages. + */ + +static struct page * +perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff) +{ + if (pgoff > data->nr_pages) + return NULL; + + if (pgoff == 0) + return virt_to_page(data->user_page); + + return virt_to_page(data->data_pages[pgoff - 1]); +} + +static struct perf_mmap_data * +perf_mmap_data_alloc(struct perf_event *event, int nr_pages) +{ + struct perf_mmap_data *data; + unsigned long size; + int i; + + WARN_ON(atomic_read(&event->mmap_count)); + + size = sizeof(struct perf_mmap_data); + size += nr_pages * sizeof(void *); + + data = kzalloc(size, GFP_KERNEL); + if (!data) + goto fail; + + data->user_page = (void *)get_zeroed_page(GFP_KERNEL); + if (!data->user_page) + goto fail_user_page; + + for (i = 0; i < nr_pages; i++) { + data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL); + if (!data->data_pages[i]) + goto fail_data_pages; + } + + data->data_order = 0; + data->nr_pages = nr_pages; + + return data; + +fail_data_pages: + for (i--; i >= 0; i--) + free_page((unsigned long)data->data_pages[i]); + + free_page((unsigned long)data->user_page); + +fail_user_page: + kfree(data); + +fail: + return NULL; +} + +static void perf_mmap_free_page(unsigned long addr) +{ + struct page *page = virt_to_page((void *)addr); + + page->mapping = NULL; + __free_page(page); +} + +static void perf_mmap_data_free(struct perf_mmap_data *data) +{ + int i; + + perf_mmap_free_page((unsigned long)data->user_page); + for (i = 0; i < data->nr_pages; i++) + perf_mmap_free_page((unsigned long)data->data_pages[i]); +} + +#else + +/* + * Back perf_mmap() with vmalloc memory. + * + * Required for architectures that have d-cache aliasing issues. + */ + +static struct page * +perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff) +{ + if (pgoff > (1UL << data->data_order)) + return NULL; + + return vmalloc_to_page((void *)data->user_page + pgoff * PAGE_SIZE); +} + +static void perf_mmap_unmark_page(void *addr) +{ + struct page *page = vmalloc_to_page(addr); + + page->mapping = NULL; +} + +static void perf_mmap_data_free_work(struct work_struct *work) +{ + struct perf_mmap_data *data; + void *base; + int i, nr; + + data = container_of(work, struct perf_mmap_data, work); + nr = 1 << data->data_order; + + base = data->user_page; + for (i = 0; i < nr + 1; i++) + perf_mmap_unmark_page(base + (i * PAGE_SIZE)); + + vfree(base); +} + +static void perf_mmap_data_free(struct perf_mmap_data *data) +{ + schedule_work(&data->work); +} + +static struct perf_mmap_data * +perf_mmap_data_alloc(struct perf_event *event, int nr_pages) +{ + struct perf_mmap_data *data; + unsigned long size; + void *all_buf; + + WARN_ON(atomic_read(&event->mmap_count)); + + size = sizeof(struct perf_mmap_data); + size += sizeof(void *); + + data = kzalloc(size, GFP_KERNEL); + if (!data) + goto fail; + + INIT_WORK(&data->work, perf_mmap_data_free_work); + + all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE); + if (!all_buf) + goto fail_all_buf; + + data->user_page = all_buf; + data->data_pages[0] = all_buf + PAGE_SIZE; + data->data_order = ilog2(nr_pages); + data->nr_pages = 1; + + return data; + +fail_all_buf: + kfree(data); + +fail: + return NULL; +} + +#endif + +static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct perf_event *event = vma->vm_file->private_data; + struct perf_mmap_data *data; + int ret = VM_FAULT_SIGBUS; + + if (vmf->flags & FAULT_FLAG_MKWRITE) { + if (vmf->pgoff == 0) + ret = 0; + return ret; + } + + rcu_read_lock(); + data = rcu_dereference(event->data); + if (!data) + goto unlock; + + if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE)) + goto unlock; + + vmf->page = perf_mmap_to_page(data, vmf->pgoff); + if (!vmf->page) + goto unlock; + + get_page(vmf->page); + vmf->page->mapping = vma->vm_file->f_mapping; + vmf->page->index = vmf->pgoff; + + ret = 0; +unlock: + rcu_read_unlock(); + + return ret; +} + +static void +perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data) +{ + long max_size = perf_data_size(data); + + atomic_set(&data->lock, -1); + + if (event->attr.watermark) { + data->watermark = min_t(long, max_size, + event->attr.wakeup_watermark); + } + + if (!data->watermark) + data->watermark = max_t(long, PAGE_SIZE, max_size / 2); + + + rcu_assign_pointer(event->data, data); +} + +static void perf_mmap_data_free_rcu(struct rcu_head *rcu_head) +{ + struct perf_mmap_data *data; + + data = container_of(rcu_head, struct perf_mmap_data, rcu_head); + perf_mmap_data_free(data); + kfree(data); +} + +static void perf_mmap_data_release(struct perf_event *event) +{ + struct perf_mmap_data *data = event->data; + + WARN_ON(atomic_read(&event->mmap_count)); + + rcu_assign_pointer(event->data, NULL); + call_rcu(&data->rcu_head, perf_mmap_data_free_rcu); +} + +static void perf_mmap_open(struct vm_area_struct *vma) +{ + struct perf_event *event = vma->vm_file->private_data; + + atomic_inc(&event->mmap_count); +} + +static void perf_mmap_close(struct vm_area_struct *vma) +{ + struct perf_event *event = vma->vm_file->private_data; + + WARN_ON_ONCE(event->ctx->parent_ctx); + if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) { + unsigned long size = perf_data_size(event->data); + struct user_struct *user = current_user(); + + atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm); + vma->vm_mm->locked_vm -= event->data->nr_locked; + perf_mmap_data_release(event); + mutex_unlock(&event->mmap_mutex); + } +} + +static const struct vm_operations_struct perf_mmap_vmops = { + .open = perf_mmap_open, + .close = perf_mmap_close, + .fault = perf_mmap_fault, + .page_mkwrite = perf_mmap_fault, +}; + +static int perf_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct perf_event *event = file->private_data; + unsigned long user_locked, user_lock_limit; + struct user_struct *user = current_user(); + unsigned long locked, lock_limit; + struct perf_mmap_data *data; + unsigned long vma_size; + unsigned long nr_pages; + long user_extra, extra; + int ret = 0; + + if (!(vma->vm_flags & VM_SHARED)) + return -EINVAL; + + vma_size = vma->vm_end - vma->vm_start; + nr_pages = (vma_size / PAGE_SIZE) - 1; + + /* + * If we have data pages ensure they're a power-of-two number, so we + * can do bitmasks instead of modulo. + */ + if (nr_pages != 0 && !is_power_of_2(nr_pages)) + return -EINVAL; + + if (vma_size != PAGE_SIZE * (1 + nr_pages)) + return -EINVAL; + + if (vma->vm_pgoff != 0) + return -EINVAL; + + WARN_ON_ONCE(event->ctx->parent_ctx); + mutex_lock(&event->mmap_mutex); + if (event->output) { + ret = -EINVAL; + goto unlock; + } + + if (atomic_inc_not_zero(&event->mmap_count)) { + if (nr_pages != event->data->nr_pages) + ret = -EINVAL; + goto unlock; + } + + user_extra = nr_pages + 1; + user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10); + + /* + * Increase the limit linearly with more CPUs: + */ + user_lock_limit *= num_online_cpus(); + + user_locked = atomic_long_read(&user->locked_vm) + user_extra; + + extra = 0; + if (user_locked > user_lock_limit) + extra = user_locked - user_lock_limit; + + lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; + lock_limit >>= PAGE_SHIFT; + locked = vma->vm_mm->locked_vm + extra; + + if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() && + !capable(CAP_IPC_LOCK)) { + ret = -EPERM; + goto unlock; + } + + WARN_ON(event->data); + + data = perf_mmap_data_alloc(event, nr_pages); + ret = -ENOMEM; + if (!data) + goto unlock; + + ret = 0; + perf_mmap_data_init(event, data); + + atomic_set(&event->mmap_count, 1); + atomic_long_add(user_extra, &user->locked_vm); + vma->vm_mm->locked_vm += extra; + event->data->nr_locked = extra; + if (vma->vm_flags & VM_WRITE) + event->data->writable = 1; + +unlock: + mutex_unlock(&event->mmap_mutex); + + vma->vm_flags |= VM_RESERVED; + vma->vm_ops = &perf_mmap_vmops; + + return ret; +} + +static int perf_fasync(int fd, struct file *filp, int on) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + struct perf_event *event = filp->private_data; + int retval; + + mutex_lock(&inode->i_mutex); + retval = fasync_helper(fd, filp, on, &event->fasync); + mutex_unlock(&inode->i_mutex); + + if (retval < 0) + return retval; + + return 0; +} + +static const struct file_operations perf_fops = { + .release = perf_release, + .read = perf_read, + .poll = perf_poll, + .unlocked_ioctl = perf_ioctl, + .compat_ioctl = perf_ioctl, + .mmap = perf_mmap, + .fasync = perf_fasync, +}; + +/* + * Perf event wakeup + * + * If there's data, ensure we set the poll() state and publish everything + * to user-space before waking everybody up. + */ + +void perf_event_wakeup(struct perf_event *event) +{ + wake_up_all(&event->waitq); + + if (event->pending_kill) { + kill_fasync(&event->fasync, SIGIO, event->pending_kill); + event->pending_kill = 0; + } +} + +/* + * Pending wakeups + * + * Handle the case where we need to wakeup up from NMI (or rq->lock) context. + * + * The NMI bit means we cannot possibly take locks. Therefore, maintain a + * single linked list and use cmpxchg() to add entries lockless. + */ + +static void perf_pending_event(struct perf_pending_entry *entry) +{ + struct perf_event *event = container_of(entry, + struct perf_event, pending); + + if (event->pending_disable) { + event->pending_disable = 0; + __perf_event_disable(event); + } + + if (event->pending_wakeup) { + event->pending_wakeup = 0; + perf_event_wakeup(event); + } +} + +#define PENDING_TAIL ((struct perf_pending_entry *)-1UL) + +static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = { + PENDING_TAIL, +}; + +static void perf_pending_queue(struct perf_pending_entry *entry, + void (*func)(struct perf_pending_entry *)) +{ + struct perf_pending_entry **head; + + if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL) + return; + + entry->func = func; + + head = &get_cpu_var(perf_pending_head); + + do { + entry->next = *head; + } while (cmpxchg(head, entry->next, entry) != entry->next); + + set_perf_event_pending(); + + put_cpu_var(perf_pending_head); +} + +static int __perf_pending_run(void) +{ + struct perf_pending_entry *list; + int nr = 0; + + list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL); + while (list != PENDING_TAIL) { + void (*func)(struct perf_pending_entry *); + struct perf_pending_entry *entry = list; + + list = list->next; + + func = entry->func; + entry->next = NULL; + /* + * Ensure we observe the unqueue before we issue the wakeup, + * so that we won't be waiting forever. + * -- see perf_not_pending(). + */ + smp_wmb(); + + func(entry); + nr++; + } + + return nr; +} + +static inline int perf_not_pending(struct perf_event *event) +{ + /* + * If we flush on whatever cpu we run, there is a chance we don't + * need to wait. + */ + get_cpu(); + __perf_pending_run(); + put_cpu(); + + /* + * Ensure we see the proper queue state before going to sleep + * so that we do not miss the wakeup. -- see perf_pending_handle() + */ + smp_rmb(); + return event->pending.next == NULL; +} + +static void perf_pending_sync(struct perf_event *event) +{ + wait_event(event->waitq, perf_not_pending(event)); +} + +void perf_event_do_pending(void) +{ + __perf_pending_run(); +} + +/* + * Callchain support -- arch specific + */ + +__weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) +{ + return NULL; +} + +/* + * Output + */ +static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail, + unsigned long offset, unsigned long head) +{ + unsigned long mask; + + if (!data->writable) + return true; + + mask = perf_data_size(data) - 1; + + offset = (offset - tail) & mask; + head = (head - tail) & mask; + + if ((int)(head - offset) < 0) + return false; + + return true; +} + +static void perf_output_wakeup(struct perf_output_handle *handle) +{ + atomic_set(&handle->data->poll, POLL_IN); + + if (handle->nmi) { + handle->event->pending_wakeup = 1; + perf_pending_queue(&handle->event->pending, + perf_pending_event); + } else + perf_event_wakeup(handle->event); +} + +/* + * Curious locking construct. + * + * We need to ensure a later event_id doesn't publish a head when a former + * event_id isn't done writing. However since we need to deal with NMIs we + * cannot fully serialize things. + * + * What we do is serialize between CPUs so we only have to deal with NMI + * nesting on a single CPU. + * + * We only publish the head (and generate a wakeup) when the outer-most + * event_id completes. + */ +static void perf_output_lock(struct perf_output_handle *handle) +{ + struct perf_mmap_data *data = handle->data; + int cpu; + + handle->locked = 0; + + local_irq_save(handle->flags); + cpu = smp_processor_id(); + + if (in_nmi() && atomic_read(&data->lock) == cpu) + return; + + while (atomic_cmpxchg(&data->lock, -1, cpu) != -1) + cpu_relax(); + + handle->locked = 1; +} + +static void perf_output_unlock(struct perf_output_handle *handle) +{ + struct perf_mmap_data *data = handle->data; + unsigned long head; + int cpu; + + data->done_head = data->head; + + if (!handle->locked) + goto out; + +again: + /* + * The xchg implies a full barrier that ensures all writes are done + * before we publish the new head, matched by a rmb() in userspace when + * reading this position. + */ + while ((head = atomic_long_xchg(&data->done_head, 0))) + data->user_page->data_head = head; + + /* + * NMI can happen here, which means we can miss a done_head update. + */ + + cpu = atomic_xchg(&data->lock, -1); + WARN_ON_ONCE(cpu != smp_processor_id()); + + /* + * Therefore we have to validate we did not indeed do so. + */ + if (unlikely(atomic_long_read(&data->done_head))) { + /* + * Since we had it locked, we can lock it again. + */ + while (atomic_cmpxchg(&data->lock, -1, cpu) != -1) + cpu_relax(); + + goto again; + } + + if (atomic_xchg(&data->wakeup, 0)) + perf_output_wakeup(handle); +out: + local_irq_restore(handle->flags); +} + +void perf_output_copy(struct perf_output_handle *handle, + const void *buf, unsigned int len) +{ + unsigned int pages_mask; + unsigned long offset; + unsigned int size; + void **pages; + + offset = handle->offset; + pages_mask = handle->data->nr_pages - 1; + pages = handle->data->data_pages; + + do { + unsigned long page_offset; + unsigned long page_size; + int nr; + + nr = (offset >> PAGE_SHIFT) & pages_mask; + page_size = 1UL << (handle->data->data_order + PAGE_SHIFT); + page_offset = offset & (page_size - 1); + size = min_t(unsigned int, page_size - page_offset, len); + + memcpy(pages[nr] + page_offset, buf, size); + + len -= size; + buf += size; + offset += size; + } while (len); + + handle->offset = offset; + + /* + * Check we didn't copy past our reservation window, taking the + * possible unsigned int wrap into account. + */ + WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0); +} + +int perf_output_begin(struct perf_output_handle *handle, + struct perf_event *event, unsigned int size, + int nmi, int sample) +{ + struct perf_event *output_event; + struct perf_mmap_data *data; + unsigned long tail, offset, head; + int have_lost; + struct { + struct perf_event_header header; + u64 id; + u64 lost; + } lost_event; + + rcu_read_lock(); + /* + * For inherited events we send all the output towards the parent. + */ + if (event->parent) + event = event->parent; + + output_event = rcu_dereference(event->output); + if (output_event) + event = output_event; + + data = rcu_dereference(event->data); + if (!data) + goto out; + + handle->data = data; + handle->event = event; + handle->nmi = nmi; + handle->sample = sample; + + if (!data->nr_pages) + goto fail; + + have_lost = atomic_read(&data->lost); + if (have_lost) + size += sizeof(lost_event); + + perf_output_lock(handle); + + do { + /* + * Userspace could choose to issue a mb() before updating the + * tail pointer. So that all reads will be completed before the + * write is issued. + */ + tail = ACCESS_ONCE(data->user_page->data_tail); + smp_rmb(); + offset = head = atomic_long_read(&data->head); + head += size; + if (unlikely(!perf_output_space(data, tail, offset, head))) + goto fail; + } while (atomic_long_cmpxchg(&data->head, offset, head) != offset); + + handle->offset = offset; + handle->head = head; + + if (head - tail > data->watermark) + atomic_set(&data->wakeup, 1); + + if (have_lost) { + lost_event.header.type = PERF_RECORD_LOST; + lost_event.header.misc = 0; + lost_event.header.size = sizeof(lost_event); + lost_event.id = event->id; + lost_event.lost = atomic_xchg(&data->lost, 0); + + perf_output_put(handle, lost_event); + } + + return 0; + +fail: + atomic_inc(&data->lost); + perf_output_unlock(handle); +out: + rcu_read_unlock(); + + return -ENOSPC; +} + +void perf_output_end(struct perf_output_handle *handle) +{ + struct perf_event *event = handle->event; + struct perf_mmap_data *data = handle->data; + + int wakeup_events = event->attr.wakeup_events; + + if (handle->sample && wakeup_events) { + int events = atomic_inc_return(&data->events); + if (events >= wakeup_events) { + atomic_sub(wakeup_events, &data->events); + atomic_set(&data->wakeup, 1); + } + } + + perf_output_unlock(handle); + rcu_read_unlock(); +} + +static u32 perf_event_pid(struct perf_event *event, struct task_struct *p) +{ + /* + * only top level events have the pid namespace they were created in + */ + if (event->parent) + event = event->parent; + + return task_tgid_nr_ns(p, event->ns); +} + +static u32 perf_event_tid(struct perf_event *event, struct task_struct *p) +{ + /* + * only top level events have the pid namespace they were created in + */ + if (event->parent) + event = event->parent; + + return task_pid_nr_ns(p, event->ns); +} + +static void perf_output_read_one(struct perf_output_handle *handle, + struct perf_event *event) +{ + u64 read_format = event->attr.read_format; + u64 values[4]; + int n = 0; + + values[n++] = atomic64_read(&event->count); + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { + values[n++] = event->total_time_enabled + + atomic64_read(&event->child_total_time_enabled); + } + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { + values[n++] = event->total_time_running + + atomic64_read(&event->child_total_time_running); + } + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_event_id(event); + + perf_output_copy(handle, values, n * sizeof(u64)); +} + +/* + * XXX PERF_FORMAT_GROUP vs inherited events seems difficult. + */ +static void perf_output_read_group(struct perf_output_handle *handle, + struct perf_event *event) +{ + struct perf_event *leader = event->group_leader, *sub; + u64 read_format = event->attr.read_format; + u64 values[5]; + int n = 0; + + values[n++] = 1 + leader->nr_siblings; + + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) + values[n++] = leader->total_time_enabled; + + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) + values[n++] = leader->total_time_running; + + if (leader != event) + leader->pmu->read(leader); + + values[n++] = atomic64_read(&leader->count); + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_event_id(leader); + + perf_output_copy(handle, values, n * sizeof(u64)); + + list_for_each_entry(sub, &leader->sibling_list, group_entry) { + n = 0; + + if (sub != event) + sub->pmu->read(sub); + + values[n++] = atomic64_read(&sub->count); + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_event_id(sub); + + perf_output_copy(handle, values, n * sizeof(u64)); + } +} + +static void perf_output_read(struct perf_output_handle *handle, + struct perf_event *event) +{ + if (event->attr.read_format & PERF_FORMAT_GROUP) + perf_output_read_group(handle, event); + else + perf_output_read_one(handle, event); +} + +void perf_output_sample(struct perf_output_handle *handle, + struct perf_event_header *header, + struct perf_sample_data *data, + struct perf_event *event) +{ + u64 sample_type = data->type; + + perf_output_put(handle, *header); + + if (sample_type & PERF_SAMPLE_IP) + perf_output_put(handle, data->ip); + + if (sample_type & PERF_SAMPLE_TID) + perf_output_put(handle, data->tid_entry); + + if (sample_type & PERF_SAMPLE_TIME) + perf_output_put(handle, data->time); + + if (sample_type & PERF_SAMPLE_ADDR) + perf_output_put(handle, data->addr); + + if (sample_type & PERF_SAMPLE_ID) + perf_output_put(handle, data->id); + + if (sample_type & PERF_SAMPLE_STREAM_ID) + perf_output_put(handle, data->stream_id); + + if (sample_type & PERF_SAMPLE_CPU) + perf_output_put(handle, data->cpu_entry); + + if (sample_type & PERF_SAMPLE_PERIOD) + perf_output_put(handle, data->period); + + if (sample_type & PERF_SAMPLE_READ) + perf_output_read(handle, event); + + if (sample_type & PERF_SAMPLE_CALLCHAIN) { + if (data->callchain) { + int size = 1; + + if (data->callchain) + size += data->callchain->nr; + + size *= sizeof(u64); + + perf_output_copy(handle, data->callchain, size); + } else { + u64 nr = 0; + perf_output_put(handle, nr); + } + } + + if (sample_type & PERF_SAMPLE_RAW) { + if (data->raw) { + perf_output_put(handle, data->raw->size); + perf_output_copy(handle, data->raw->data, + data->raw->size); + } else { + struct { + u32 size; + u32 data; + } raw = { + .size = sizeof(u32), + .data = 0, + }; + perf_output_put(handle, raw); + } + } +} + +void perf_prepare_sample(struct perf_event_header *header, + struct perf_sample_data *data, + struct perf_event *event, + struct pt_regs *regs) +{ + u64 sample_type = event->attr.sample_type; + + data->type = sample_type; + + header->type = PERF_RECORD_SAMPLE; + header->size = sizeof(*header); + + header->misc = 0; + header->misc |= perf_misc_flags(regs); + + if (sample_type & PERF_SAMPLE_IP) { + data->ip = perf_instruction_pointer(regs); + + header->size += sizeof(data->ip); + } + + if (sample_type & PERF_SAMPLE_TID) { + /* namespace issues */ + data->tid_entry.pid = perf_event_pid(event, current); + data->tid_entry.tid = perf_event_tid(event, current); + + header->size += sizeof(data->tid_entry); + } + + if (sample_type & PERF_SAMPLE_TIME) { + data->time = perf_clock(); + + header->size += sizeof(data->time); + } + + if (sample_type & PERF_SAMPLE_ADDR) + header->size += sizeof(data->addr); + + if (sample_type & PERF_SAMPLE_ID) { + data->id = primary_event_id(event); + + header->size += sizeof(data->id); + } + + if (sample_type & PERF_SAMPLE_STREAM_ID) { + data->stream_id = event->id; + + header->size += sizeof(data->stream_id); + } + + if (sample_type & PERF_SAMPLE_CPU) { + data->cpu_entry.cpu = raw_smp_processor_id(); + data->cpu_entry.reserved = 0; + + header->size += sizeof(data->cpu_entry); + } + + if (sample_type & PERF_SAMPLE_PERIOD) + header->size += sizeof(data->period); + + if (sample_type & PERF_SAMPLE_READ) + header->size += perf_event_read_size(event); + + if (sample_type & PERF_SAMPLE_CALLCHAIN) { + int size = 1; + + data->callchain = perf_callchain(regs); + + if (data->callchain) + size += data->callchain->nr; + + header->size += size * sizeof(u64); + } + + if (sample_type & PERF_SAMPLE_RAW) { + int size = sizeof(u32); + + if (data->raw) + size += data->raw->size; + else + size += sizeof(u32); + + WARN_ON_ONCE(size & (sizeof(u64)-1)); + header->size += size; + } +} + +static void perf_event_output(struct perf_event *event, int nmi, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + struct perf_output_handle handle; + struct perf_event_header header; + + perf_prepare_sample(&header, data, event, regs); + + if (perf_output_begin(&handle, event, header.size, nmi, 1)) + return; + + perf_output_sample(&handle, &header, data, event); + + perf_output_end(&handle); +} + +/* + * read event_id + */ + +struct perf_read_event { + struct perf_event_header header; + + u32 pid; + u32 tid; +}; + +static void +perf_event_read_event(struct perf_event *event, + struct task_struct *task) +{ + struct perf_output_handle handle; + struct perf_read_event read_event = { + .header = { + .type = PERF_RECORD_READ, + .misc = 0, + .size = sizeof(read_event) + perf_event_read_size(event), + }, + .pid = perf_event_pid(event, task), + .tid = perf_event_tid(event, task), + }; + int ret; + + ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0); + if (ret) + return; + + perf_output_put(&handle, read_event); + perf_output_read(&handle, event); + + perf_output_end(&handle); +} + +/* + * task tracking -- fork/exit + * + * enabled by: attr.comm | attr.mmap | attr.task + */ + +struct perf_task_event { + struct task_struct *task; + struct perf_event_context *task_ctx; + + struct { + struct perf_event_header header; + + u32 pid; + u32 ppid; + u32 tid; + u32 ptid; + u64 time; + } event_id; +}; + +static void perf_event_task_output(struct perf_event *event, + struct perf_task_event *task_event) +{ + struct perf_output_handle handle; + int size; + struct task_struct *task = task_event->task; + int ret; + + size = task_event->event_id.header.size; + ret = perf_output_begin(&handle, event, size, 0, 0); + + if (ret) + return; + + task_event->event_id.pid = perf_event_pid(event, task); + task_event->event_id.ppid = perf_event_pid(event, current); + + task_event->event_id.tid = perf_event_tid(event, task); + task_event->event_id.ptid = perf_event_tid(event, current); + + task_event->event_id.time = perf_clock(); + + perf_output_put(&handle, task_event->event_id); + + perf_output_end(&handle); +} + +static int perf_event_task_match(struct perf_event *event) +{ + if (event->attr.comm || event->attr.mmap || event->attr.task) + return 1; + + return 0; +} + +static void perf_event_task_ctx(struct perf_event_context *ctx, + struct perf_task_event *task_event) +{ + struct perf_event *event; + + if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) + return; + + rcu_read_lock(); + list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { + if (perf_event_task_match(event)) + perf_event_task_output(event, task_event); + } + rcu_read_unlock(); +} + +static void perf_event_task_event(struct perf_task_event *task_event) +{ + struct perf_cpu_context *cpuctx; + struct perf_event_context *ctx = task_event->task_ctx; + + cpuctx = &get_cpu_var(perf_cpu_context); + perf_event_task_ctx(&cpuctx->ctx, task_event); + put_cpu_var(perf_cpu_context); + + rcu_read_lock(); + if (!ctx) + ctx = rcu_dereference(task_event->task->perf_event_ctxp); + if (ctx) + perf_event_task_ctx(ctx, task_event); + rcu_read_unlock(); +} + +static void perf_event_task(struct task_struct *task, + struct perf_event_context *task_ctx, + int new) +{ + struct perf_task_event task_event; + + if (!atomic_read(&nr_comm_events) && + !atomic_read(&nr_mmap_events) && + !atomic_read(&nr_task_events)) + return; + + task_event = (struct perf_task_event){ + .task = task, + .task_ctx = task_ctx, + .event_id = { + .header = { + .type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT, + .misc = 0, + .size = sizeof(task_event.event_id), + }, + /* .pid */ + /* .ppid */ + /* .tid */ + /* .ptid */ + }, + }; + + perf_event_task_event(&task_event); +} + +void perf_event_fork(struct task_struct *task) +{ + perf_event_task(task, NULL, 1); +} + +/* + * comm tracking + */ + +struct perf_comm_event { + struct task_struct *task; + char *comm; + int comm_size; + + struct { + struct perf_event_header header; + + u32 pid; + u32 tid; + } event_id; +}; + +static void perf_event_comm_output(struct perf_event *event, + struct perf_comm_event *comm_event) +{ + struct perf_output_handle handle; + int size = comm_event->event_id.header.size; + int ret = perf_output_begin(&handle, event, size, 0, 0); + + if (ret) + return; + + comm_event->event_id.pid = perf_event_pid(event, comm_event->task); + comm_event->event_id.tid = perf_event_tid(event, comm_event->task); + + perf_output_put(&handle, comm_event->event_id); + perf_output_copy(&handle, comm_event->comm, + comm_event->comm_size); + perf_output_end(&handle); +} + +static int perf_event_comm_match(struct perf_event *event) +{ + if (event->attr.comm) + return 1; + + return 0; +} + +static void perf_event_comm_ctx(struct perf_event_context *ctx, + struct perf_comm_event *comm_event) +{ + struct perf_event *event; + + if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) + return; + + rcu_read_lock(); + list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { + if (perf_event_comm_match(event)) + perf_event_comm_output(event, comm_event); + } + rcu_read_unlock(); +} + +static void perf_event_comm_event(struct perf_comm_event *comm_event) +{ + struct perf_cpu_context *cpuctx; + struct perf_event_context *ctx; + unsigned int size; + char comm[TASK_COMM_LEN]; + + memset(comm, 0, sizeof(comm)); + strncpy(comm, comm_event->task->comm, sizeof(comm)); + size = ALIGN(strlen(comm)+1, sizeof(u64)); + + comm_event->comm = comm; + comm_event->comm_size = size; + + comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; + + cpuctx = &get_cpu_var(perf_cpu_context); + perf_event_comm_ctx(&cpuctx->ctx, comm_event); + put_cpu_var(perf_cpu_context); + + rcu_read_lock(); + /* + * doesn't really matter which of the child contexts the + * events ends up in. + */ + ctx = rcu_dereference(current->perf_event_ctxp); + if (ctx) + perf_event_comm_ctx(ctx, comm_event); + rcu_read_unlock(); +} + +void perf_event_comm(struct task_struct *task) +{ + struct perf_comm_event comm_event; + + if (task->perf_event_ctxp) + perf_event_enable_on_exec(task); + + if (!atomic_read(&nr_comm_events)) + return; + + comm_event = (struct perf_comm_event){ + .task = task, + /* .comm */ + /* .comm_size */ + .event_id = { + .header = { + .type = PERF_RECORD_COMM, + .misc = 0, + /* .size */ + }, + /* .pid */ + /* .tid */ + }, + }; + + perf_event_comm_event(&comm_event); +} + +/* + * mmap tracking + */ + +struct perf_mmap_event { + struct vm_area_struct *vma; + + const char *file_name; + int file_size; + + struct { + struct perf_event_header header; + + u32 pid; + u32 tid; + u64 start; + u64 len; + u64 pgoff; + } event_id; +}; + +static void perf_event_mmap_output(struct perf_event *event, + struct perf_mmap_event *mmap_event) +{ + struct perf_output_handle handle; + int size = mmap_event->event_id.header.size; + int ret = perf_output_begin(&handle, event, size, 0, 0); + + if (ret) + return; + + mmap_event->event_id.pid = perf_event_pid(event, current); + mmap_event->event_id.tid = perf_event_tid(event, current); + + perf_output_put(&handle, mmap_event->event_id); + perf_output_copy(&handle, mmap_event->file_name, + mmap_event->file_size); + perf_output_end(&handle); +} + +static int perf_event_mmap_match(struct perf_event *event, + struct perf_mmap_event *mmap_event) +{ + if (event->attr.mmap) + return 1; + + return 0; +} + +static void perf_event_mmap_ctx(struct perf_event_context *ctx, + struct perf_mmap_event *mmap_event) +{ + struct perf_event *event; + + if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) + return; + + rcu_read_lock(); + list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { + if (perf_event_mmap_match(event, mmap_event)) + perf_event_mmap_output(event, mmap_event); + } + rcu_read_unlock(); +} + +static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) +{ + struct perf_cpu_context *cpuctx; + struct perf_event_context *ctx; + struct vm_area_struct *vma = mmap_event->vma; + struct file *file = vma->vm_file; + unsigned int size; + char tmp[16]; + char *buf = NULL; + const char *name; + + memset(tmp, 0, sizeof(tmp)); + + if (file) { + /* + * d_path works from the end of the buffer backwards, so we + * need to add enough zero bytes after the string to handle + * the 64bit alignment we do later. + */ + buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL); + if (!buf) { + name = strncpy(tmp, "//enomem", sizeof(tmp)); + goto got_name; + } + name = d_path(&file->f_path, buf, PATH_MAX); + if (IS_ERR(name)) { + name = strncpy(tmp, "//toolong", sizeof(tmp)); + goto got_name; + } + } else { + if (arch_vma_name(mmap_event->vma)) { + name = strncpy(tmp, arch_vma_name(mmap_event->vma), + sizeof(tmp)); + goto got_name; + } + + if (!vma->vm_mm) { + name = strncpy(tmp, "[vdso]", sizeof(tmp)); + goto got_name; + } + + name = strncpy(tmp, "//anon", sizeof(tmp)); + goto got_name; + } + +got_name: + size = ALIGN(strlen(name)+1, sizeof(u64)); + + mmap_event->file_name = name; + mmap_event->file_size = size; + + mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; + + cpuctx = &get_cpu_var(perf_cpu_context); + perf_event_mmap_ctx(&cpuctx->ctx, mmap_event); + put_cpu_var(perf_cpu_context); + + rcu_read_lock(); + /* + * doesn't really matter which of the child contexts the + * events ends up in. + */ + ctx = rcu_dereference(current->perf_event_ctxp); + if (ctx) + perf_event_mmap_ctx(ctx, mmap_event); + rcu_read_unlock(); + + kfree(buf); +} + +void __perf_event_mmap(struct vm_area_struct *vma) +{ + struct perf_mmap_event mmap_event; + + if (!atomic_read(&nr_mmap_events)) + return; + + mmap_event = (struct perf_mmap_event){ + .vma = vma, + /* .file_name */ + /* .file_size */ + .event_id = { + .header = { + .type = PERF_RECORD_MMAP, + .misc = 0, + /* .size */ + }, + /* .pid */ + /* .tid */ + .start = vma->vm_start, + .len = vma->vm_end - vma->vm_start, + .pgoff = vma->vm_pgoff, + }, + }; + + perf_event_mmap_event(&mmap_event); +} + +/* + * IRQ throttle logging + */ + +static void perf_log_throttle(struct perf_event *event, int enable) +{ + struct perf_output_handle handle; + int ret; + + struct { + struct perf_event_header header; + u64 time; + u64 id; + u64 stream_id; + } throttle_event = { + .header = { + .type = PERF_RECORD_THROTTLE, + .misc = 0, + .size = sizeof(throttle_event), + }, + .time = perf_clock(), + .id = primary_event_id(event), + .stream_id = event->id, + }; + + if (enable) + throttle_event.header.type = PERF_RECORD_UNTHROTTLE; + + ret = perf_output_begin(&handle, event, sizeof(throttle_event), 1, 0); + if (ret) + return; + + perf_output_put(&handle, throttle_event); + perf_output_end(&handle); +} + +/* + * Generic event overflow handling, sampling. + */ + +static int __perf_event_overflow(struct perf_event *event, int nmi, + int throttle, struct perf_sample_data *data, + struct pt_regs *regs) +{ + int events = atomic_read(&event->event_limit); + struct hw_perf_event *hwc = &event->hw; + int ret = 0; + + throttle = (throttle && event->pmu->unthrottle != NULL); + + if (!throttle) { + hwc->interrupts++; + } else { + if (hwc->interrupts != MAX_INTERRUPTS) { + hwc->interrupts++; + if (HZ * hwc->interrupts > + (u64)sysctl_perf_event_sample_rate) { + hwc->interrupts = MAX_INTERRUPTS; + perf_log_throttle(event, 0); + ret = 1; + } + } else { + /* + * Keep re-disabling events even though on the previous + * pass we disabled it - just in case we raced with a + * sched-in and the event got enabled again: + */ + ret = 1; + } + } + + if (event->attr.freq) { + u64 now = perf_clock(); + s64 delta = now - hwc->freq_stamp; + + hwc->freq_stamp = now; + + if (delta > 0 && delta < TICK_NSEC) + perf_adjust_period(event, NSEC_PER_SEC / (int)delta); + } + + /* + * XXX event_limit might not quite work as expected on inherited + * events + */ + + event->pending_kill = POLL_IN; + if (events && atomic_dec_and_test(&event->event_limit)) { + ret = 1; + event->pending_kill = POLL_HUP; + if (nmi) { + event->pending_disable = 1; + perf_pending_queue(&event->pending, + perf_pending_event); + } else + perf_event_disable(event); + } + + perf_event_output(event, nmi, data, regs); + return ret; +} + +int perf_event_overflow(struct perf_event *event, int nmi, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + return __perf_event_overflow(event, nmi, 1, data, regs); +} + +/* + * Generic software event infrastructure + */ + +/* + * We directly increment event->count and keep a second value in + * event->hw.period_left to count intervals. This period event + * is kept in the range [-sample_period, 0] so that we can use the + * sign as trigger. + */ + +static u64 perf_swevent_set_period(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + u64 period = hwc->last_period; + u64 nr, offset; + s64 old, val; + + hwc->last_period = hwc->sample_period; + +again: + old = val = atomic64_read(&hwc->period_left); + if (val < 0) + return 0; + + nr = div64_u64(period + val, period); + offset = nr * period; + val -= offset; + if (atomic64_cmpxchg(&hwc->period_left, old, val) != old) + goto again; + + return nr; +} + +static void perf_swevent_overflow(struct perf_event *event, + int nmi, struct perf_sample_data *data, + struct pt_regs *regs) +{ + struct hw_perf_event *hwc = &event->hw; + int throttle = 0; + u64 overflow; + + data->period = event->hw.last_period; + overflow = perf_swevent_set_period(event); + + if (hwc->interrupts == MAX_INTERRUPTS) + return; + + for (; overflow; overflow--) { + if (__perf_event_overflow(event, nmi, throttle, + data, regs)) { + /* + * We inhibit the overflow from happening when + * hwc->interrupts == MAX_INTERRUPTS. + */ + break; + } + throttle = 1; + } +} + +static void perf_swevent_unthrottle(struct perf_event *event) +{ + /* + * Nothing to do, we already reset hwc->interrupts. + */ +} + +static void perf_swevent_add(struct perf_event *event, u64 nr, + int nmi, struct perf_sample_data *data, + struct pt_regs *regs) +{ + struct hw_perf_event *hwc = &event->hw; + + atomic64_add(nr, &event->count); + + if (!hwc->sample_period) + return; + + if (!regs) + return; + + if (!atomic64_add_negative(nr, &hwc->period_left)) + perf_swevent_overflow(event, nmi, data, regs); +} + +static int perf_swevent_is_counting(struct perf_event *event) +{ + /* + * The event is active, we're good! + */ + if (event->state == PERF_EVENT_STATE_ACTIVE) + return 1; + + /* + * The event is off/error, not counting. + */ + if (event->state != PERF_EVENT_STATE_INACTIVE) + return 0; + + /* + * The event is inactive, if the context is active + * we're part of a group that didn't make it on the 'pmu', + * not counting. + */ + if (event->ctx->is_active) + return 0; + + /* + * We're inactive and the context is too, this means the + * task is scheduled out, we're counting events that happen + * to us, like migration events. + */ + return 1; +} + +static int perf_tp_event_match(struct perf_event *event, + struct perf_sample_data *data); + +static int perf_swevent_match(struct perf_event *event, + enum perf_type_id type, + u32 event_id, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + if (!perf_swevent_is_counting(event)) + return 0; + + if (event->attr.type != type) + return 0; + if (event->attr.config != event_id) + return 0; + + if (regs) { + if (event->attr.exclude_user && user_mode(regs)) + return 0; + + if (event->attr.exclude_kernel && !user_mode(regs)) + return 0; + } + + if (event->attr.type == PERF_TYPE_TRACEPOINT && + !perf_tp_event_match(event, data)) + return 0; + + return 1; +} + +static void perf_swevent_ctx_event(struct perf_event_context *ctx, + enum perf_type_id type, + u32 event_id, u64 nr, int nmi, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + struct perf_event *event; + + if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) + return; + + rcu_read_lock(); + list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { + if (perf_swevent_match(event, type, event_id, data, regs)) + perf_swevent_add(event, nr, nmi, data, regs); + } + rcu_read_unlock(); +} + +static int *perf_swevent_recursion_context(struct perf_cpu_context *cpuctx) +{ + if (in_nmi()) + return &cpuctx->recursion[3]; + + if (in_irq()) + return &cpuctx->recursion[2]; + + if (in_softirq()) + return &cpuctx->recursion[1]; + + return &cpuctx->recursion[0]; +} + +static void do_perf_sw_event(enum perf_type_id type, u32 event_id, + u64 nr, int nmi, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context); + int *recursion = perf_swevent_recursion_context(cpuctx); + struct perf_event_context *ctx; + + if (*recursion) + goto out; + + (*recursion)++; + barrier(); + + perf_swevent_ctx_event(&cpuctx->ctx, type, event_id, + nr, nmi, data, regs); + rcu_read_lock(); + /* + * doesn't really matter which of the child contexts the + * events ends up in. + */ + ctx = rcu_dereference(current->perf_event_ctxp); + if (ctx) + perf_swevent_ctx_event(ctx, type, event_id, nr, nmi, data, regs); + rcu_read_unlock(); + + barrier(); + (*recursion)--; + +out: + put_cpu_var(perf_cpu_context); +} + +void __perf_sw_event(u32 event_id, u64 nr, int nmi, + struct pt_regs *regs, u64 addr) +{ + struct perf_sample_data data = { + .addr = addr, + }; + + do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, + &data, regs); +} + +static void perf_swevent_read(struct perf_event *event) +{ +} + +static int perf_swevent_enable(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (hwc->sample_period) { + hwc->last_period = hwc->sample_period; + perf_swevent_set_period(event); + } + return 0; +} + +static void perf_swevent_disable(struct perf_event *event) +{ +} + +static const struct pmu perf_ops_generic = { + .enable = perf_swevent_enable, + .disable = perf_swevent_disable, + .read = perf_swevent_read, + .unthrottle = perf_swevent_unthrottle, +}; + +/* + * hrtimer based swevent callback + */ + +static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) +{ + enum hrtimer_restart ret = HRTIMER_RESTART; + struct perf_sample_data data; + struct pt_regs *regs; + struct perf_event *event; + u64 period; + + event = container_of(hrtimer, struct perf_event, hw.hrtimer); + event->pmu->read(event); + + data.addr = 0; + regs = get_irq_regs(); + /* + * In case we exclude kernel IPs or are somehow not in interrupt + * context, provide the next best thing, the user IP. + */ + if ((event->attr.exclude_kernel || !regs) && + !event->attr.exclude_user) + regs = task_pt_regs(current); + + if (regs) { + if (perf_event_overflow(event, 0, &data, regs)) + ret = HRTIMER_NORESTART; + } + + period = max_t(u64, 10000, event->hw.sample_period); + hrtimer_forward_now(hrtimer, ns_to_ktime(period)); + + return ret; +} + +/* + * Software event: cpu wall time clock + */ + +static void cpu_clock_perf_event_update(struct perf_event *event) +{ + int cpu = raw_smp_processor_id(); + s64 prev; + u64 now; + + now = cpu_clock(cpu); + prev = atomic64_read(&event->hw.prev_count); + atomic64_set(&event->hw.prev_count, now); + atomic64_add(now - prev, &event->count); +} + +static int cpu_clock_perf_event_enable(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + int cpu = raw_smp_processor_id(); + + atomic64_set(&hwc->prev_count, cpu_clock(cpu)); + hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hwc->hrtimer.function = perf_swevent_hrtimer; + if (hwc->sample_period) { + u64 period = max_t(u64, 10000, hwc->sample_period); + __hrtimer_start_range_ns(&hwc->hrtimer, + ns_to_ktime(period), 0, + HRTIMER_MODE_REL, 0); + } + + return 0; +} + +static void cpu_clock_perf_event_disable(struct perf_event *event) +{ + if (event->hw.sample_period) + hrtimer_cancel(&event->hw.hrtimer); + cpu_clock_perf_event_update(event); +} + +static void cpu_clock_perf_event_read(struct perf_event *event) +{ + cpu_clock_perf_event_update(event); +} + +static const struct pmu perf_ops_cpu_clock = { + .enable = cpu_clock_perf_event_enable, + .disable = cpu_clock_perf_event_disable, + .read = cpu_clock_perf_event_read, +}; + +/* + * Software event: task time clock + */ + +static void task_clock_perf_event_update(struct perf_event *event, u64 now) +{ + u64 prev; + s64 delta; + + prev = atomic64_xchg(&event->hw.prev_count, now); + delta = now - prev; + atomic64_add(delta, &event->count); +} + +static int task_clock_perf_event_enable(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + u64 now; + + now = event->ctx->time; + + atomic64_set(&hwc->prev_count, now); + hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hwc->hrtimer.function = perf_swevent_hrtimer; + if (hwc->sample_period) { + u64 period = max_t(u64, 10000, hwc->sample_period); + __hrtimer_start_range_ns(&hwc->hrtimer, + ns_to_ktime(period), 0, + HRTIMER_MODE_REL, 0); + } + + return 0; +} + +static void task_clock_perf_event_disable(struct perf_event *event) +{ + if (event->hw.sample_period) + hrtimer_cancel(&event->hw.hrtimer); + task_clock_perf_event_update(event, event->ctx->time); + +} + +static void task_clock_perf_event_read(struct perf_event *event) +{ + u64 time; + + if (!in_nmi()) { + update_context_time(event->ctx); + time = event->ctx->time; + } else { + u64 now = perf_clock(); + u64 delta = now - event->ctx->timestamp; + time = event->ctx->time + delta; + } + + task_clock_perf_event_update(event, time); +} + +static const struct pmu perf_ops_task_clock = { + .enable = task_clock_perf_event_enable, + .disable = task_clock_perf_event_disable, + .read = task_clock_perf_event_read, +}; + +#ifdef CONFIG_EVENT_PROFILE + +void perf_tp_event(int event_id, u64 addr, u64 count, void *record, + int entry_size) +{ + struct perf_raw_record raw = { + .size = entry_size, + .data = record, + }; + + struct perf_sample_data data = { + .addr = addr, + .raw = &raw, + }; + + struct pt_regs *regs = get_irq_regs(); + + if (!regs) + regs = task_pt_regs(current); + + do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, + &data, regs); +} +EXPORT_SYMBOL_GPL(perf_tp_event); + +static int perf_tp_event_match(struct perf_event *event, + struct perf_sample_data *data) +{ + void *record = data->raw->data; + + if (likely(!event->filter) || filter_match_preds(event->filter, record)) + return 1; + return 0; +} + +static void tp_perf_event_destroy(struct perf_event *event) +{ + ftrace_profile_disable(event->attr.config); +} + +static const struct pmu *tp_perf_event_init(struct perf_event *event) +{ + /* + * Raw tracepoint data is a severe data leak, only allow root to + * have these. + */ + if ((event->attr.sample_type & PERF_SAMPLE_RAW) && + perf_paranoid_tracepoint_raw() && + !capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + + if (ftrace_profile_enable(event->attr.config)) + return NULL; + + event->destroy = tp_perf_event_destroy; + + return &perf_ops_generic; +} + +static int perf_event_set_filter(struct perf_event *event, void __user *arg) +{ + char *filter_str; + int ret; + + if (event->attr.type != PERF_TYPE_TRACEPOINT) + return -EINVAL; + + filter_str = strndup_user(arg, PAGE_SIZE); + if (IS_ERR(filter_str)) + return PTR_ERR(filter_str); + + ret = ftrace_profile_set_filter(event, event->attr.config, filter_str); + + kfree(filter_str); + return ret; +} + +static void perf_event_free_filter(struct perf_event *event) +{ + ftrace_profile_free_filter(event); +} + +#else + +static int perf_tp_event_match(struct perf_event *event, + struct perf_sample_data *data) +{ + return 1; +} + +static const struct pmu *tp_perf_event_init(struct perf_event *event) +{ + return NULL; +} + +static int perf_event_set_filter(struct perf_event *event, void __user *arg) +{ + return -ENOENT; +} + +static void perf_event_free_filter(struct perf_event *event) +{ +} + +#endif /* CONFIG_EVENT_PROFILE */ + +atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX]; + +static void sw_perf_event_destroy(struct perf_event *event) +{ + u64 event_id = event->attr.config; + + WARN_ON(event->parent); + + atomic_dec(&perf_swevent_enabled[event_id]); +} + +static const struct pmu *sw_perf_event_init(struct perf_event *event) +{ + const struct pmu *pmu = NULL; + u64 event_id = event->attr.config; + + /* + * Software events (currently) can't in general distinguish + * between user, kernel and hypervisor events. + * However, context switches and cpu migrations are considered + * to be kernel events, and page faults are never hypervisor + * events. + */ + switch (event_id) { + case PERF_COUNT_SW_CPU_CLOCK: + pmu = &perf_ops_cpu_clock; + + break; + case PERF_COUNT_SW_TASK_CLOCK: + /* + * If the user instantiates this as a per-cpu event, + * use the cpu_clock event instead. + */ + if (event->ctx->task) + pmu = &perf_ops_task_clock; + else + pmu = &perf_ops_cpu_clock; + + break; + case PERF_COUNT_SW_PAGE_FAULTS: + case PERF_COUNT_SW_PAGE_FAULTS_MIN: + case PERF_COUNT_SW_PAGE_FAULTS_MAJ: + case PERF_COUNT_SW_CONTEXT_SWITCHES: + case PERF_COUNT_SW_CPU_MIGRATIONS: + if (!event->parent) { + atomic_inc(&perf_swevent_enabled[event_id]); + event->destroy = sw_perf_event_destroy; + } + pmu = &perf_ops_generic; + break; + } + + return pmu; +} + +/* + * Allocate and initialize a event structure + */ +static struct perf_event * +perf_event_alloc(struct perf_event_attr *attr, + int cpu, + struct perf_event_context *ctx, + struct perf_event *group_leader, + struct perf_event *parent_event, + gfp_t gfpflags) +{ + const struct pmu *pmu; + struct perf_event *event; + struct hw_perf_event *hwc; + long err; + + event = kzalloc(sizeof(*event), gfpflags); + if (!event) + return ERR_PTR(-ENOMEM); + + /* + * Single events are their own group leaders, with an + * empty sibling list: + */ + if (!group_leader) + group_leader = event; + + mutex_init(&event->child_mutex); + INIT_LIST_HEAD(&event->child_list); + + INIT_LIST_HEAD(&event->group_entry); + INIT_LIST_HEAD(&event->event_entry); + INIT_LIST_HEAD(&event->sibling_list); + init_waitqueue_head(&event->waitq); + + mutex_init(&event->mmap_mutex); + + event->cpu = cpu; + event->attr = *attr; + event->group_leader = group_leader; + event->pmu = NULL; + event->ctx = ctx; + event->oncpu = -1; + + event->parent = parent_event; + + event->ns = get_pid_ns(current->nsproxy->pid_ns); + event->id = atomic64_inc_return(&perf_event_id); + + event->state = PERF_EVENT_STATE_INACTIVE; + + if (attr->disabled) + event->state = PERF_EVENT_STATE_OFF; + + pmu = NULL; + + hwc = &event->hw; + hwc->sample_period = attr->sample_period; + if (attr->freq && attr->sample_freq) + hwc->sample_period = 1; + hwc->last_period = hwc->sample_period; + + atomic64_set(&hwc->period_left, hwc->sample_period); + + /* + * we currently do not support PERF_FORMAT_GROUP on inherited events + */ + if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) + goto done; + + switch (attr->type) { + case PERF_TYPE_RAW: + case PERF_TYPE_HARDWARE: + case PERF_TYPE_HW_CACHE: + pmu = hw_perf_event_init(event); + break; + + case PERF_TYPE_SOFTWARE: + pmu = sw_perf_event_init(event); + break; + + case PERF_TYPE_TRACEPOINT: + pmu = tp_perf_event_init(event); + break; + + default: + break; + } +done: + err = 0; + if (!pmu) + err = -EINVAL; + else if (IS_ERR(pmu)) + err = PTR_ERR(pmu); + + if (err) { + if (event->ns) + put_pid_ns(event->ns); + kfree(event); + return ERR_PTR(err); + } + + event->pmu = pmu; + + if (!event->parent) { + atomic_inc(&nr_events); + if (event->attr.mmap) + atomic_inc(&nr_mmap_events); + if (event->attr.comm) + atomic_inc(&nr_comm_events); + if (event->attr.task) + atomic_inc(&nr_task_events); + } + + return event; +} + +static int perf_copy_attr(struct perf_event_attr __user *uattr, + struct perf_event_attr *attr) +{ + u32 size; + int ret; + + if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0)) + return -EFAULT; + + /* + * zero the full structure, so that a short copy will be nice. + */ + memset(attr, 0, sizeof(*attr)); + + ret = get_user(size, &uattr->size); + if (ret) + return ret; + + if (size > PAGE_SIZE) /* silly large */ + goto err_size; + + if (!size) /* abi compat */ + size = PERF_ATTR_SIZE_VER0; + + if (size < PERF_ATTR_SIZE_VER0) + goto err_size; + + /* + * If we're handed a bigger struct than we know of, + * ensure all the unknown bits are 0 - i.e. new + * user-space does not rely on any kernel feature + * extensions we dont know about yet. + */ + if (size > sizeof(*attr)) { + unsigned char __user *addr; + unsigned char __user *end; + unsigned char val; + + addr = (void __user *)uattr + sizeof(*attr); + end = (void __user *)uattr + size; + + for (; addr < end; addr++) { + ret = get_user(val, addr); + if (ret) + return ret; + if (val) + goto err_size; + } + size = sizeof(*attr); + } + + ret = copy_from_user(attr, uattr, size); + if (ret) + return -EFAULT; + + /* + * If the type exists, the corresponding creation will verify + * the attr->config. + */ + if (attr->type >= PERF_TYPE_MAX) + return -EINVAL; + + if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3) + return -EINVAL; + + if (attr->sample_type & ~(PERF_SAMPLE_MAX-1)) + return -EINVAL; + + if (attr->read_format & ~(PERF_FORMAT_MAX-1)) + return -EINVAL; + +out: + return ret; + +err_size: + put_user(sizeof(*attr), &uattr->size); + ret = -E2BIG; + goto out; +} + +static int perf_event_set_output(struct perf_event *event, int output_fd) +{ + struct perf_event *output_event = NULL; + struct file *output_file = NULL; + struct perf_event *old_output; + int fput_needed = 0; + int ret = -EINVAL; + + if (!output_fd) + goto set; + + output_file = fget_light(output_fd, &fput_needed); + if (!output_file) + return -EBADF; + + if (output_file->f_op != &perf_fops) + goto out; + + output_event = output_file->private_data; + + /* Don't chain output fds */ + if (output_event->output) + goto out; + + /* Don't set an output fd when we already have an output channel */ + if (event->data) + goto out; + + atomic_long_inc(&output_file->f_count); + +set: + mutex_lock(&event->mmap_mutex); + old_output = event->output; + rcu_assign_pointer(event->output, output_event); + mutex_unlock(&event->mmap_mutex); + + if (old_output) { + /* + * we need to make sure no existing perf_output_*() + * is still referencing this event. + */ + synchronize_rcu(); + fput(old_output->filp); + } + + ret = 0; +out: + fput_light(output_file, fput_needed); + return ret; +} + +/** + * sys_perf_event_open - open a performance event, associate it to a task/cpu + * + * @attr_uptr: event_id type attributes for monitoring/sampling + * @pid: target pid + * @cpu: target cpu + * @group_fd: group leader event fd + */ +SYSCALL_DEFINE5(perf_event_open, + struct perf_event_attr __user *, attr_uptr, + pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) +{ + struct perf_event *event, *group_leader; + struct perf_event_attr attr; + struct perf_event_context *ctx; + struct file *event_file = NULL; + struct file *group_file = NULL; + int fput_needed = 0; + int fput_needed2 = 0; + int err; + + /* for future expandability... */ + if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT)) + return -EINVAL; + + err = perf_copy_attr(attr_uptr, &attr); + if (err) + return err; + + if (!attr.exclude_kernel) { + if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) + return -EACCES; + } + + if (attr.freq) { + if (attr.sample_freq > sysctl_perf_event_sample_rate) + return -EINVAL; + } + + /* + * Get the target context (task or percpu): + */ + ctx = find_get_context(pid, cpu); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + /* + * Look up the group leader (we will attach this event to it): + */ + group_leader = NULL; + if (group_fd != -1 && !(flags & PERF_FLAG_FD_NO_GROUP)) { + err = -EINVAL; + group_file = fget_light(group_fd, &fput_needed); + if (!group_file) + goto err_put_context; + if (group_file->f_op != &perf_fops) + goto err_put_context; + + group_leader = group_file->private_data; + /* + * Do not allow a recursive hierarchy (this new sibling + * becoming part of another group-sibling): + */ + if (group_leader->group_leader != group_leader) + goto err_put_context; + /* + * Do not allow to attach to a group in a different + * task or CPU context: + */ + if (group_leader->ctx != ctx) + goto err_put_context; + /* + * Only a group leader can be exclusive or pinned + */ + if (attr.exclusive || attr.pinned) + goto err_put_context; + } + + event = perf_event_alloc(&attr, cpu, ctx, group_leader, + NULL, GFP_KERNEL); + err = PTR_ERR(event); + if (IS_ERR(event)) + goto err_put_context; + + err = anon_inode_getfd("[perf_event]", &perf_fops, event, 0); + if (err < 0) + goto err_free_put_context; + + event_file = fget_light(err, &fput_needed2); + if (!event_file) + goto err_free_put_context; + + if (flags & PERF_FLAG_FD_OUTPUT) { + err = perf_event_set_output(event, group_fd); + if (err) + goto err_fput_free_put_context; + } + + event->filp = event_file; + WARN_ON_ONCE(ctx->parent_ctx); + mutex_lock(&ctx->mutex); + perf_install_in_context(ctx, event, cpu); + ++ctx->generation; + mutex_unlock(&ctx->mutex); + + event->owner = current; + get_task_struct(current); + mutex_lock(¤t->perf_event_mutex); + list_add_tail(&event->owner_entry, ¤t->perf_event_list); + mutex_unlock(¤t->perf_event_mutex); + +err_fput_free_put_context: + fput_light(event_file, fput_needed2); + +err_free_put_context: + if (err < 0) + kfree(event); + +err_put_context: + if (err < 0) + put_ctx(ctx); + + fput_light(group_file, fput_needed); + + return err; +} + +/* + * inherit a event from parent task to child task: + */ +static struct perf_event * +inherit_event(struct perf_event *parent_event, + struct task_struct *parent, + struct perf_event_context *parent_ctx, + struct task_struct *child, + struct perf_event *group_leader, + struct perf_event_context *child_ctx) +{ + struct perf_event *child_event; + + /* + * Instead of creating recursive hierarchies of events, + * we link inherited events back to the original parent, + * which has a filp for sure, which we use as the reference + * count: + */ + if (parent_event->parent) + parent_event = parent_event->parent; + + child_event = perf_event_alloc(&parent_event->attr, + parent_event->cpu, child_ctx, + group_leader, parent_event, + GFP_KERNEL); + if (IS_ERR(child_event)) + return child_event; + get_ctx(child_ctx); + + /* + * Make the child state follow the state of the parent event, + * not its attr.disabled bit. We hold the parent's mutex, + * so we won't race with perf_event_{en, dis}able_family. + */ + if (parent_event->state >= PERF_EVENT_STATE_INACTIVE) + child_event->state = PERF_EVENT_STATE_INACTIVE; + else + child_event->state = PERF_EVENT_STATE_OFF; + + if (parent_event->attr.freq) + child_event->hw.sample_period = parent_event->hw.sample_period; + + /* + * Link it up in the child's context: + */ + add_event_to_ctx(child_event, child_ctx); + + /* + * Get a reference to the parent filp - we will fput it + * when the child event exits. This is safe to do because + * we are in the parent and we know that the filp still + * exists and has a nonzero count: + */ + atomic_long_inc(&parent_event->filp->f_count); + + /* + * Link this into the parent event's child list + */ + WARN_ON_ONCE(parent_event->ctx->parent_ctx); + mutex_lock(&parent_event->child_mutex); + list_add_tail(&child_event->child_list, &parent_event->child_list); + mutex_unlock(&parent_event->child_mutex); + + return child_event; +} + +static int inherit_group(struct perf_event *parent_event, + struct task_struct *parent, + struct perf_event_context *parent_ctx, + struct task_struct *child, + struct perf_event_context *child_ctx) +{ + struct perf_event *leader; + struct perf_event *sub; + struct perf_event *child_ctr; + + leader = inherit_event(parent_event, parent, parent_ctx, + child, NULL, child_ctx); + if (IS_ERR(leader)) + return PTR_ERR(leader); + list_for_each_entry(sub, &parent_event->sibling_list, group_entry) { + child_ctr = inherit_event(sub, parent, parent_ctx, + child, leader, child_ctx); + if (IS_ERR(child_ctr)) + return PTR_ERR(child_ctr); + } + return 0; +} + +static void sync_child_event(struct perf_event *child_event, + struct task_struct *child) +{ + struct perf_event *parent_event = child_event->parent; + u64 child_val; + + if (child_event->attr.inherit_stat) + perf_event_read_event(child_event, child); + + child_val = atomic64_read(&child_event->count); + + /* + * Add back the child's count to the parent's count: + */ + atomic64_add(child_val, &parent_event->count); + atomic64_add(child_event->total_time_enabled, + &parent_event->child_total_time_enabled); + atomic64_add(child_event->total_time_running, + &parent_event->child_total_time_running); + + /* + * Remove this event from the parent's list + */ + WARN_ON_ONCE(parent_event->ctx->parent_ctx); + mutex_lock(&parent_event->child_mutex); + list_del_init(&child_event->child_list); + mutex_unlock(&parent_event->child_mutex); + + /* + * Release the parent event, if this was the last + * reference to it. + */ + fput(parent_event->filp); +} + +static void +__perf_event_exit_task(struct perf_event *child_event, + struct perf_event_context *child_ctx, + struct task_struct *child) +{ + struct perf_event *parent_event; + + update_event_times(child_event); + perf_event_remove_from_context(child_event); + + parent_event = child_event->parent; + /* + * It can happen that parent exits first, and has events + * that are still around due to the child reference. These + * events need to be zapped - but otherwise linger. + */ + if (parent_event) { + sync_child_event(child_event, child); + free_event(child_event); + } +} + +/* + * When a child task exits, feed back event values to parent events. + */ +void perf_event_exit_task(struct task_struct *child) +{ + struct perf_event *child_event, *tmp; + struct perf_event_context *child_ctx; + unsigned long flags; + + if (likely(!child->perf_event_ctxp)) { + perf_event_task(child, NULL, 0); + return; + } + + local_irq_save(flags); + /* + * We can't reschedule here because interrupts are disabled, + * and either child is current or it is a task that can't be + * scheduled, so we are now safe from rescheduling changing + * our context. + */ + child_ctx = child->perf_event_ctxp; + __perf_event_task_sched_out(child_ctx); + + /* + * Take the context lock here so that if find_get_context is + * reading child->perf_event_ctxp, we wait until it has + * incremented the context's refcount before we do put_ctx below. + */ + spin_lock(&child_ctx->lock); + child->perf_event_ctxp = NULL; + /* + * If this context is a clone; unclone it so it can't get + * swapped to another process while we're removing all + * the events from it. + */ + unclone_ctx(child_ctx); + spin_unlock_irqrestore(&child_ctx->lock, flags); + + /* + * Report the task dead after unscheduling the events so that we + * won't get any samples after PERF_RECORD_EXIT. We can however still + * get a few PERF_RECORD_READ events. + */ + perf_event_task(child, child_ctx, 0); + + /* + * We can recurse on the same lock type through: + * + * __perf_event_exit_task() + * sync_child_event() + * fput(parent_event->filp) + * perf_release() + * mutex_lock(&ctx->mutex) + * + * But since its the parent context it won't be the same instance. + */ + mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING); + +again: + list_for_each_entry_safe(child_event, tmp, &child_ctx->group_list, + group_entry) + __perf_event_exit_task(child_event, child_ctx, child); + + /* + * If the last event was a group event, it will have appended all + * its siblings to the list, but we obtained 'tmp' before that which + * will still point to the list head terminating the iteration. + */ + if (!list_empty(&child_ctx->group_list)) + goto again; + + mutex_unlock(&child_ctx->mutex); + + put_ctx(child_ctx); +} + +/* + * free an unexposed, unused context as created by inheritance by + * init_task below, used by fork() in case of fail. + */ +void perf_event_free_task(struct task_struct *task) +{ + struct perf_event_context *ctx = task->perf_event_ctxp; + struct perf_event *event, *tmp; + + if (!ctx) + return; + + mutex_lock(&ctx->mutex); +again: + list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry) { + struct perf_event *parent = event->parent; + + if (WARN_ON_ONCE(!parent)) + continue; + + mutex_lock(&parent->child_mutex); + list_del_init(&event->child_list); + mutex_unlock(&parent->child_mutex); + + fput(parent->filp); + + list_del_event(event, ctx); + free_event(event); + } + + if (!list_empty(&ctx->group_list)) + goto again; + + mutex_unlock(&ctx->mutex); + + put_ctx(ctx); +} + +/* + * Initialize the perf_event context in task_struct + */ +int perf_event_init_task(struct task_struct *child) +{ + struct perf_event_context *child_ctx, *parent_ctx; + struct perf_event_context *cloned_ctx; + struct perf_event *event; + struct task_struct *parent = current; + int inherited_all = 1; + int ret = 0; + + child->perf_event_ctxp = NULL; + + mutex_init(&child->perf_event_mutex); + INIT_LIST_HEAD(&child->perf_event_list); + + if (likely(!parent->perf_event_ctxp)) + return 0; + + /* + * This is executed from the parent task context, so inherit + * events that have been marked for cloning. + * First allocate and initialize a context for the child. + */ + + child_ctx = kmalloc(sizeof(struct perf_event_context), GFP_KERNEL); + if (!child_ctx) + return -ENOMEM; + + __perf_event_init_context(child_ctx, child); + child->perf_event_ctxp = child_ctx; + get_task_struct(child); + + /* + * If the parent's context is a clone, pin it so it won't get + * swapped under us. + */ + parent_ctx = perf_pin_task_context(parent); + + /* + * No need to check if parent_ctx != NULL here; since we saw + * it non-NULL earlier, the only reason for it to become NULL + * is if we exit, and since we're currently in the middle of + * a fork we can't be exiting at the same time. + */ + + /* + * Lock the parent list. No need to lock the child - not PID + * hashed yet and not running, so nobody can access it. + */ + mutex_lock(&parent_ctx->mutex); + + /* + * We dont have to disable NMIs - we are only looking at + * the list, not manipulating it: + */ + list_for_each_entry(event, &parent_ctx->group_list, group_entry) { + + if (!event->attr.inherit) { + inherited_all = 0; + continue; + } + + ret = inherit_group(event, parent, parent_ctx, + child, child_ctx); + if (ret) { + inherited_all = 0; + break; + } + } + + if (inherited_all) { + /* + * Mark the child context as a clone of the parent + * context, or of whatever the parent is a clone of. + * Note that if the parent is a clone, it could get + * uncloned at any point, but that doesn't matter + * because the list of events and the generation + * count can't have changed since we took the mutex. + */ + cloned_ctx = rcu_dereference(parent_ctx->parent_ctx); + if (cloned_ctx) { + child_ctx->parent_ctx = cloned_ctx; + child_ctx->parent_gen = parent_ctx->parent_gen; + } else { + child_ctx->parent_ctx = parent_ctx; + child_ctx->parent_gen = parent_ctx->generation; + } + get_ctx(child_ctx->parent_ctx); + } + + mutex_unlock(&parent_ctx->mutex); + + perf_unpin_context(parent_ctx); + + return ret; +} + +static void __cpuinit perf_event_init_cpu(int cpu) +{ + struct perf_cpu_context *cpuctx; + + cpuctx = &per_cpu(perf_cpu_context, cpu); + __perf_event_init_context(&cpuctx->ctx, NULL); + + spin_lock(&perf_resource_lock); + cpuctx->max_pertask = perf_max_events - perf_reserved_percpu; + spin_unlock(&perf_resource_lock); + + hw_perf_event_setup(cpu); +} + +#ifdef CONFIG_HOTPLUG_CPU +static void __perf_event_exit_cpu(void *info) +{ + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + struct perf_event_context *ctx = &cpuctx->ctx; + struct perf_event *event, *tmp; + + list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry) + __perf_event_remove_from_context(event); +} +static void perf_event_exit_cpu(int cpu) +{ + struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); + struct perf_event_context *ctx = &cpuctx->ctx; + + mutex_lock(&ctx->mutex); + smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1); + mutex_unlock(&ctx->mutex); +} +#else +static inline void perf_event_exit_cpu(int cpu) { } +#endif + +static int __cpuinit +perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) +{ + unsigned int cpu = (long)hcpu; + + switch (action) { + + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + perf_event_init_cpu(cpu); + break; + + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + hw_perf_event_setup_online(cpu); + break; + + case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: + perf_event_exit_cpu(cpu); + break; + + default: + break; + } + + return NOTIFY_OK; +} + +/* + * This has to have a higher priority than migration_notifier in sched.c. + */ +static struct notifier_block __cpuinitdata perf_cpu_nb = { + .notifier_call = perf_cpu_notify, + .priority = 20, +}; + +void __init perf_event_init(void) +{ + perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE, + (void *)(long)smp_processor_id()); + perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE, + (void *)(long)smp_processor_id()); + register_cpu_notifier(&perf_cpu_nb); +} + +static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf) +{ + return sprintf(buf, "%d\n", perf_reserved_percpu); +} + +static ssize_t +perf_set_reserve_percpu(struct sysdev_class *class, + const char *buf, + size_t count) +{ + struct perf_cpu_context *cpuctx; + unsigned long val; + int err, cpu, mpt; + + err = strict_strtoul(buf, 10, &val); + if (err) + return err; + if (val > perf_max_events) + return -EINVAL; + + spin_lock(&perf_resource_lock); + perf_reserved_percpu = val; + for_each_online_cpu(cpu) { + cpuctx = &per_cpu(perf_cpu_context, cpu); + spin_lock_irq(&cpuctx->ctx.lock); + mpt = min(perf_max_events - cpuctx->ctx.nr_events, + perf_max_events - perf_reserved_percpu); + cpuctx->max_pertask = mpt; + spin_unlock_irq(&cpuctx->ctx.lock); + } + spin_unlock(&perf_resource_lock); + + return count; +} + +static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf) +{ + return sprintf(buf, "%d\n", perf_overcommit); +} + +static ssize_t +perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count) +{ + unsigned long val; + int err; + + err = strict_strtoul(buf, 10, &val); + if (err) + return err; + if (val > 1) + return -EINVAL; + + spin_lock(&perf_resource_lock); + perf_overcommit = val; + spin_unlock(&perf_resource_lock); + + return count; +} + +static SYSDEV_CLASS_ATTR( + reserve_percpu, + 0644, + perf_show_reserve_percpu, + perf_set_reserve_percpu + ); + +static SYSDEV_CLASS_ATTR( + overcommit, + 0644, + perf_show_overcommit, + perf_set_overcommit + ); + +static struct attribute *perfclass_attrs[] = { + &attr_reserve_percpu.attr, + &attr_overcommit.attr, + NULL +}; + +static struct attribute_group perfclass_attr_group = { + .attrs = perfclass_attrs, + .name = "perf_events", +}; + +static int __init perf_event_sysfs_init(void) +{ + return sysfs_create_group(&cpu_sysdev_class.kset.kobj, + &perfclass_attr_group); +} +device_initcall(perf_event_sysfs_init); diff --git a/kernel/pid.c b/kernel/pid.c index 31310b5..d3f722d 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -40,7 +40,7 @@ #define pid_hashfn(nr, ns) \ hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift) static struct hlist_head *pid_hash; -static int pidhash_shift; +static unsigned int pidhash_shift = 4; struct pid init_struct_pid = INIT_STRUCT_PID; int pid_max = PID_MAX_DEFAULT; @@ -499,19 +499,12 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns) void __init pidhash_init(void) { int i, pidhash_size; - unsigned long megabytes = nr_kernel_pages >> (20 - PAGE_SHIFT); - pidhash_shift = max(4, fls(megabytes * 4)); - pidhash_shift = min(12, pidhash_shift); + pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18, + HASH_EARLY | HASH_SMALL, + &pidhash_shift, NULL, 4096); pidhash_size = 1 << pidhash_shift; - printk("PID hash table entries: %d (order: %d, %Zd bytes)\n", - pidhash_size, pidhash_shift, - pidhash_size * sizeof(struct hlist_head)); - - pid_hash = alloc_bootmem(pidhash_size * sizeof(*(pid_hash))); - if (!pid_hash) - panic("Could not alloc pidhash!\n"); for (i = 0; i < pidhash_size; i++) INIT_HLIST_HEAD(&pid_hash[i]); } diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 821722a..86b3796 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -118,7 +118,7 @@ struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old { if (!(flags & CLONE_NEWPID)) return get_pid_ns(old_ns); - if (flags & CLONE_THREAD) + if (flags & (CLONE_THREAD|CLONE_PARENT)) return ERR_PTR(-EINVAL); return create_pid_namespace(old_ns); } diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index e33a21c..5c9dc22 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -8,17 +8,18 @@ #include <linux/math64.h> #include <asm/uaccess.h> #include <linux/kernel_stat.h> +#include <trace/events/timer.h> /* * Called after updating RLIMIT_CPU to set timer expiration if necessary. */ void update_rlimit_cpu(unsigned long rlim_new) { - cputime_t cputime; + cputime_t cputime = secs_to_cputime(rlim_new); + struct signal_struct *const sig = current->signal; - cputime = secs_to_cputime(rlim_new); - if (cputime_eq(current->signal->it_prof_expires, cputime_zero) || - cputime_gt(current->signal->it_prof_expires, cputime)) { + if (cputime_eq(sig->it[CPUCLOCK_PROF].expires, cputime_zero) || + cputime_gt(sig->it[CPUCLOCK_PROF].expires, cputime)) { spin_lock_irq(¤t->sighand->siglock); set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL); spin_unlock_irq(¤t->sighand->siglock); @@ -542,6 +543,17 @@ static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now) now); } +static inline int expires_gt(cputime_t expires, cputime_t new_exp) +{ + return cputime_eq(expires, cputime_zero) || + cputime_gt(expires, new_exp); +} + +static inline int expires_le(cputime_t expires, cputime_t new_exp) +{ + return !cputime_eq(expires, cputime_zero) && + cputime_le(expires, new_exp); +} /* * Insert the timer on the appropriate list before any timers that * expire later. This must be called with the tasklist_lock held @@ -586,34 +598,32 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now) */ if (CPUCLOCK_PERTHREAD(timer->it_clock)) { + union cpu_time_count *exp = &nt->expires; + switch (CPUCLOCK_WHICH(timer->it_clock)) { default: BUG(); case CPUCLOCK_PROF: - if (cputime_eq(p->cputime_expires.prof_exp, - cputime_zero) || - cputime_gt(p->cputime_expires.prof_exp, - nt->expires.cpu)) - p->cputime_expires.prof_exp = - nt->expires.cpu; + if (expires_gt(p->cputime_expires.prof_exp, + exp->cpu)) + p->cputime_expires.prof_exp = exp->cpu; break; case CPUCLOCK_VIRT: - if (cputime_eq(p->cputime_expires.virt_exp, - cputime_zero) || - cputime_gt(p->cputime_expires.virt_exp, - nt->expires.cpu)) - p->cputime_expires.virt_exp = - nt->expires.cpu; + if (expires_gt(p->cputime_expires.virt_exp, + exp->cpu)) + p->cputime_expires.virt_exp = exp->cpu; break; case CPUCLOCK_SCHED: if (p->cputime_expires.sched_exp == 0 || - p->cputime_expires.sched_exp > - nt->expires.sched) + p->cputime_expires.sched_exp > exp->sched) p->cputime_expires.sched_exp = - nt->expires.sched; + exp->sched; break; } } else { + struct signal_struct *const sig = p->signal; + union cpu_time_count *exp = &timer->it.cpu.expires; + /* * For a process timer, set the cached expiration time. */ @@ -621,30 +631,23 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now) default: BUG(); case CPUCLOCK_VIRT: - if (!cputime_eq(p->signal->it_virt_expires, - cputime_zero) && - cputime_lt(p->signal->it_virt_expires, - timer->it.cpu.expires.cpu)) + if (expires_le(sig->it[CPUCLOCK_VIRT].expires, + exp->cpu)) break; - p->signal->cputime_expires.virt_exp = - timer->it.cpu.expires.cpu; + sig->cputime_expires.virt_exp = exp->cpu; break; case CPUCLOCK_PROF: - if (!cputime_eq(p->signal->it_prof_expires, - cputime_zero) && - cputime_lt(p->signal->it_prof_expires, - timer->it.cpu.expires.cpu)) + if (expires_le(sig->it[CPUCLOCK_PROF].expires, + exp->cpu)) break; - i = p->signal->rlim[RLIMIT_CPU].rlim_cur; + i = sig->rlim[RLIMIT_CPU].rlim_cur; if (i != RLIM_INFINITY && - i <= cputime_to_secs(timer->it.cpu.expires.cpu)) + i <= cputime_to_secs(exp->cpu)) break; - p->signal->cputime_expires.prof_exp = - timer->it.cpu.expires.cpu; + sig->cputime_expires.prof_exp = exp->cpu; break; case CPUCLOCK_SCHED: - p->signal->cputime_expires.sched_exp = - timer->it.cpu.expires.sched; + sig->cputime_expires.sched_exp = exp->sched; break; } } @@ -1071,6 +1074,40 @@ static void stop_process_timers(struct task_struct *tsk) spin_unlock_irqrestore(&cputimer->lock, flags); } +static u32 onecputick; + +static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, + cputime_t *expires, cputime_t cur_time, int signo) +{ + if (cputime_eq(it->expires, cputime_zero)) + return; + + if (cputime_ge(cur_time, it->expires)) { + if (!cputime_eq(it->incr, cputime_zero)) { + it->expires = cputime_add(it->expires, it->incr); + it->error += it->incr_error; + if (it->error >= onecputick) { + it->expires = cputime_sub(it->expires, + cputime_one_jiffy); + it->error -= onecputick; + } + } else { + it->expires = cputime_zero; + } + + trace_itimer_expire(signo == SIGPROF ? + ITIMER_PROF : ITIMER_VIRTUAL, + tsk->signal->leader_pid, cur_time); + __group_send_sig_info(signo, SEND_SIG_PRIV, tsk); + } + + if (!cputime_eq(it->expires, cputime_zero) && + (cputime_eq(*expires, cputime_zero) || + cputime_lt(it->expires, *expires))) { + *expires = it->expires; + } +} + /* * Check for any per-thread CPU timers that have fired and move them * off the tsk->*_timers list onto the firing list. Per-thread timers @@ -1090,10 +1127,10 @@ static void check_process_timers(struct task_struct *tsk, * Don't sample the current process CPU clocks if there are no timers. */ if (list_empty(&timers[CPUCLOCK_PROF]) && - cputime_eq(sig->it_prof_expires, cputime_zero) && + cputime_eq(sig->it[CPUCLOCK_PROF].expires, cputime_zero) && sig->rlim[RLIMIT_CPU].rlim_cur == RLIM_INFINITY && list_empty(&timers[CPUCLOCK_VIRT]) && - cputime_eq(sig->it_virt_expires, cputime_zero) && + cputime_eq(sig->it[CPUCLOCK_VIRT].expires, cputime_zero) && list_empty(&timers[CPUCLOCK_SCHED])) { stop_process_timers(tsk); return; @@ -1153,38 +1190,11 @@ static void check_process_timers(struct task_struct *tsk, /* * Check for the special case process timers. */ - if (!cputime_eq(sig->it_prof_expires, cputime_zero)) { - if (cputime_ge(ptime, sig->it_prof_expires)) { - /* ITIMER_PROF fires and reloads. */ - sig->it_prof_expires = sig->it_prof_incr; - if (!cputime_eq(sig->it_prof_expires, cputime_zero)) { - sig->it_prof_expires = cputime_add( - sig->it_prof_expires, ptime); - } - __group_send_sig_info(SIGPROF, SEND_SIG_PRIV, tsk); - } - if (!cputime_eq(sig->it_prof_expires, cputime_zero) && - (cputime_eq(prof_expires, cputime_zero) || - cputime_lt(sig->it_prof_expires, prof_expires))) { - prof_expires = sig->it_prof_expires; - } - } - if (!cputime_eq(sig->it_virt_expires, cputime_zero)) { - if (cputime_ge(utime, sig->it_virt_expires)) { - /* ITIMER_VIRTUAL fires and reloads. */ - sig->it_virt_expires = sig->it_virt_incr; - if (!cputime_eq(sig->it_virt_expires, cputime_zero)) { - sig->it_virt_expires = cputime_add( - sig->it_virt_expires, utime); - } - __group_send_sig_info(SIGVTALRM, SEND_SIG_PRIV, tsk); - } - if (!cputime_eq(sig->it_virt_expires, cputime_zero) && - (cputime_eq(virt_expires, cputime_zero) || - cputime_lt(sig->it_virt_expires, virt_expires))) { - virt_expires = sig->it_virt_expires; - } - } + check_cpu_itimer(tsk, &sig->it[CPUCLOCK_PROF], &prof_expires, ptime, + SIGPROF); + check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime, + SIGVTALRM); + if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) { unsigned long psecs = cputime_to_secs(ptime); cputime_t x; @@ -1457,7 +1467,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, if (!cputime_eq(*oldval, cputime_zero)) { if (cputime_le(*oldval, now.cpu)) { /* Just about to fire. */ - *oldval = jiffies_to_cputime(1); + *oldval = cputime_one_jiffy; } else { *oldval = cputime_sub(*oldval, now.cpu); } @@ -1703,10 +1713,15 @@ static __init int init_posix_cpu_timers(void) .nsleep = thread_cpu_nsleep, .nsleep_restart = thread_cpu_nsleep_restart, }; + struct timespec ts; register_posix_clock(CLOCK_PROCESS_CPUTIME_ID, &process); register_posix_clock(CLOCK_THREAD_CPUTIME_ID, &thread); + cputime_to_timespec(cputime_one_jiffy, &ts); + onecputick = ts.tv_nsec; + WARN_ON(ts.tv_sec != 0); + return 0; } __initcall(init_posix_cpu_timers); diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index d089d05..4954407 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -242,6 +242,25 @@ static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp) return 0; } + +static int posix_get_realtime_coarse(clockid_t which_clock, struct timespec *tp) +{ + *tp = current_kernel_time(); + return 0; +} + +static int posix_get_monotonic_coarse(clockid_t which_clock, + struct timespec *tp) +{ + *tp = get_monotonic_coarse(); + return 0; +} + +int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp) +{ + *tp = ktime_to_timespec(KTIME_LOW_RES); + return 0; +} /* * Initialize everything, well, just everything in Posix clocks/timers ;) */ @@ -262,10 +281,26 @@ static __init int init_posix_timers(void) .timer_create = no_timer_create, .nsleep = no_nsleep, }; + struct k_clock clock_realtime_coarse = { + .clock_getres = posix_get_coarse_res, + .clock_get = posix_get_realtime_coarse, + .clock_set = do_posix_clock_nosettime, + .timer_create = no_timer_create, + .nsleep = no_nsleep, + }; + struct k_clock clock_monotonic_coarse = { + .clock_getres = posix_get_coarse_res, + .clock_get = posix_get_monotonic_coarse, + .clock_set = do_posix_clock_nosettime, + .timer_create = no_timer_create, + .nsleep = no_nsleep, + }; register_posix_clock(CLOCK_REALTIME, &clock_realtime); register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic); register_posix_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw); + register_posix_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse); + register_posix_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse); posix_timers_cache = kmem_cache_create("posix_timers_cache", sizeof (struct k_itimer), 0, SLAB_PANIC, diff --git a/kernel/power/console.c b/kernel/power/console.c index a3961b20..5187136 100644 --- a/kernel/power/console.c +++ b/kernel/power/console.c @@ -14,56 +14,13 @@ #define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1) static int orig_fgconsole, orig_kmsg; -static int disable_vt_switch; - -/* - * Normally during a suspend, we allocate a new console and switch to it. - * When we resume, we switch back to the original console. This switch - * can be slow, so on systems where the framebuffer can handle restoration - * of video registers anyways, there's little point in doing the console - * switch. This function allows you to disable it by passing it '0'. - */ -void pm_set_vt_switch(int do_switch) -{ - acquire_console_sem(); - disable_vt_switch = !do_switch; - release_console_sem(); -} -EXPORT_SYMBOL(pm_set_vt_switch); int pm_prepare_console(void) { - acquire_console_sem(); - - if (disable_vt_switch) { - release_console_sem(); - return 0; - } - - orig_fgconsole = fg_console; - - if (vc_allocate(SUSPEND_CONSOLE)) { - /* we can't have a free VC for now. Too bad, - * we don't want to mess the screen for now. */ - release_console_sem(); + orig_fgconsole = vt_move_to_console(SUSPEND_CONSOLE, 1); + if (orig_fgconsole < 0) return 1; - } - if (set_console(SUSPEND_CONSOLE)) { - /* - * We're unable to switch to the SUSPEND_CONSOLE. - * Let the calling function know so it can decide - * what to do. - */ - release_console_sem(); - return 1; - } - release_console_sem(); - - if (vt_waitactive(SUSPEND_CONSOLE)) { - pr_debug("Suspend: Can't switch VCs."); - return 1; - } orig_kmsg = kmsg_redirect; kmsg_redirect = SUSPEND_CONSOLE; return 0; @@ -71,19 +28,9 @@ int pm_prepare_console(void) void pm_restore_console(void) { - acquire_console_sem(); - if (disable_vt_switch) { - release_console_sem(); - return; - } - set_console(orig_fgconsole); - release_console_sem(); - - if (vt_waitactive(orig_fgconsole)) { - pr_debug("Resume: Can't switch VCs."); - return; + if (orig_fgconsole >= 0) { + vt_move_to_console(orig_fgconsole, 0); + kmsg_redirect = orig_kmsg; } - - kmsg_redirect = orig_kmsg; } #endif diff --git a/kernel/power/process.c b/kernel/power/process.c index da2072d..cc2e553 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -9,6 +9,7 @@ #undef DEBUG #include <linux/interrupt.h> +#include <linux/oom.h> #include <linux/suspend.h> #include <linux/module.h> #include <linux/syscalls.h> diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 97955b0..36cb168 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -619,7 +619,7 @@ __register_nosave_region(unsigned long start_pfn, unsigned long end_pfn, BUG_ON(!region); } else /* This allocation cannot fail */ - region = alloc_bootmem_low(sizeof(struct nosave_region)); + region = alloc_bootmem(sizeof(struct nosave_region)); region->start_pfn = start_pfn; region->end_pfn = end_pfn; list_add_tail(®ion->list, &nosave_regions); diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 8ba052c..b101cdc 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -13,7 +13,6 @@ #include <linux/module.h> #include <linux/file.h> -#include <linux/utsname.h> #include <linux/delay.h> #include <linux/bitops.h> #include <linux/genhd.h> diff --git a/kernel/printk.c b/kernel/printk.c index 602033a..f38b07f 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -206,12 +206,11 @@ __setup("log_buf_len=", log_buf_len_setup); #ifdef CONFIG_BOOT_PRINTK_DELAY static unsigned int boot_delay; /* msecs delay after each printk during bootup */ -static unsigned long long printk_delay_msec; /* per msec, based on boot_delay */ +static unsigned long long loops_per_msec; /* based on boot_delay */ static int __init boot_delay_setup(char *str) { unsigned long lpj; - unsigned long long loops_per_msec; lpj = preset_lpj ? preset_lpj : 1000000; /* some guess */ loops_per_msec = (unsigned long long)lpj / 1000 * HZ; @@ -220,10 +219,9 @@ static int __init boot_delay_setup(char *str) if (boot_delay > 10 * 1000) boot_delay = 0; - printk_delay_msec = loops_per_msec; - printk(KERN_DEBUG "boot_delay: %u, preset_lpj: %ld, lpj: %lu, " - "HZ: %d, printk_delay_msec: %llu\n", - boot_delay, preset_lpj, lpj, HZ, printk_delay_msec); + pr_debug("boot_delay: %u, preset_lpj: %ld, lpj: %lu, " + "HZ: %d, loops_per_msec: %llu\n", + boot_delay, preset_lpj, lpj, HZ, loops_per_msec); return 1; } __setup("boot_delay=", boot_delay_setup); @@ -236,7 +234,7 @@ static void boot_delay_msec(void) if (boot_delay == 0 || system_state != SYSTEM_BOOTING) return; - k = (unsigned long long)printk_delay_msec * boot_delay; + k = (unsigned long long)loops_per_msec * boot_delay; timeout = jiffies + msecs_to_jiffies(boot_delay); while (k) { @@ -655,6 +653,20 @@ static int recursion_bug; static int new_text_line = 1; static char printk_buf[1024]; +int printk_delay_msec __read_mostly; + +static inline void printk_delay(void) +{ + if (unlikely(printk_delay_msec)) { + int m = printk_delay_msec; + + while (m--) { + mdelay(1); + touch_nmi_watchdog(); + } + } +} + asmlinkage int vprintk(const char *fmt, va_list args) { int printed_len = 0; @@ -664,6 +676,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) char *p; boot_delay_msec(); + printk_delay(); preempt_disable(); /* This stops the holder of console_sem just where we want him */ diff --git a/kernel/profile.c b/kernel/profile.c index 419250e..a55d3a3 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -442,48 +442,51 @@ void profile_tick(int type) #ifdef CONFIG_PROC_FS #include <linux/proc_fs.h> +#include <linux/seq_file.h> #include <asm/uaccess.h> -static int prof_cpu_mask_read_proc(char *page, char **start, off_t off, - int count, int *eof, void *data) +static int prof_cpu_mask_proc_show(struct seq_file *m, void *v) { - int len = cpumask_scnprintf(page, count, data); - if (count - len < 2) - return -EINVAL; - len += sprintf(page + len, "\n"); - return len; + seq_cpumask(m, prof_cpu_mask); + seq_putc(m, '\n'); + return 0; } -static int prof_cpu_mask_write_proc(struct file *file, - const char __user *buffer, unsigned long count, void *data) +static int prof_cpu_mask_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, prof_cpu_mask_proc_show, NULL); +} + +static ssize_t prof_cpu_mask_proc_write(struct file *file, + const char __user *buffer, size_t count, loff_t *pos) { - struct cpumask *mask = data; - unsigned long full_count = count, err; cpumask_var_t new_value; + int err; if (!alloc_cpumask_var(&new_value, GFP_KERNEL)) return -ENOMEM; err = cpumask_parse_user(buffer, count, new_value); if (!err) { - cpumask_copy(mask, new_value); - err = full_count; + cpumask_copy(prof_cpu_mask, new_value); + err = count; } free_cpumask_var(new_value); return err; } +static const struct file_operations prof_cpu_mask_proc_fops = { + .open = prof_cpu_mask_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .write = prof_cpu_mask_proc_write, +}; + void create_prof_cpu_mask(struct proc_dir_entry *root_irq_dir) { - struct proc_dir_entry *entry; - /* create /proc/irq/prof_cpu_mask */ - entry = create_proc_entry("prof_cpu_mask", 0600, root_irq_dir); - if (!entry) - return; - entry->data = prof_cpu_mask; - entry->read_proc = prof_cpu_mask_read_proc; - entry->write_proc = prof_cpu_mask_write_proc; + proc_create("prof_cpu_mask", 0600, root_irq_dir, &prof_cpu_mask_proc_fops); } /* diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 307c285..23bd09c 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -266,9 +266,10 @@ static int ignoring_children(struct sighand_struct *sigh) * or self-reaping. Do notification now if it would have happened earlier. * If it should reap itself, return true. * - * If it's our own child, there is no notification to do. - * But if our normal children self-reap, then this child - * was prevented by ptrace and we must reap it now. + * If it's our own child, there is no notification to do. But if our normal + * children self-reap, then this child was prevented by ptrace and we must + * reap it now, in that case we must also wake up sub-threads sleeping in + * do_wait(). */ static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p) { @@ -278,8 +279,10 @@ static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p) if (!task_detached(p) && thread_group_empty(p)) { if (!same_thread_group(p->real_parent, tracer)) do_notify_parent(p, p->exit_signal); - else if (ignoring_children(tracer->sighand)) + else if (ignoring_children(tracer->sighand)) { + __wake_up_parent(p, tracer); p->exit_signal = -1; + } } if (task_detached(p)) { /* Mark it as in the process of being reaped. */ diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index bd5d5c8..4001833 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -19,7 +19,7 @@ * * Authors: Dipankar Sarma <dipankar@in.ibm.com> * Manfred Spraul <manfred@colorfullife.com> - * + * * Based on the original work by Paul McKenney <paulmck@us.ibm.com> * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. * Papers: @@ -27,7 +27,7 @@ * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001) * * For detailed explanation of Read-Copy Update mechanism see - - * http://lse.sourceforge.net/locking/rcupdate.html + * http://lse.sourceforge.net/locking/rcupdate.html * */ #include <linux/types.h> @@ -46,22 +46,15 @@ #include <linux/module.h> #include <linux/kernel_stat.h> -enum rcu_barrier { - RCU_BARRIER_STD, - RCU_BARRIER_BH, - RCU_BARRIER_SCHED, -}; +#ifdef CONFIG_DEBUG_LOCK_ALLOC +static struct lock_class_key rcu_lock_key; +struct lockdep_map rcu_lock_map = + STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key); +EXPORT_SYMBOL_GPL(rcu_lock_map); +#endif -static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; -static atomic_t rcu_barrier_cpu_count; -static DEFINE_MUTEX(rcu_barrier_mutex); -static struct completion rcu_barrier_completion; int rcu_scheduler_active __read_mostly; -static atomic_t rcu_migrate_type_count = ATOMIC_INIT(0); -static struct rcu_head rcu_migrate_head[3]; -static DECLARE_WAIT_QUEUE_HEAD(rcu_migrate_wq); - /* * Awaken the corresponding synchronize_rcu() instance now that a * grace period has elapsed. @@ -74,6 +67,8 @@ void wakeme_after_rcu(struct rcu_head *head) complete(&rcu->completion); } +#ifdef CONFIG_TREE_PREEMPT_RCU + /** * synchronize_rcu - wait until a grace period has elapsed. * @@ -87,7 +82,7 @@ void synchronize_rcu(void) { struct rcu_synchronize rcu; - if (rcu_blocking_is_gp()) + if (!rcu_scheduler_active) return; init_completion(&rcu.completion); @@ -98,6 +93,46 @@ void synchronize_rcu(void) } EXPORT_SYMBOL_GPL(synchronize_rcu); +#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ + +/** + * synchronize_sched - wait until an rcu-sched grace period has elapsed. + * + * Control will return to the caller some time after a full rcu-sched + * grace period has elapsed, in other words after all currently executing + * rcu-sched read-side critical sections have completed. These read-side + * critical sections are delimited by rcu_read_lock_sched() and + * rcu_read_unlock_sched(), and may be nested. Note that preempt_disable(), + * local_irq_disable(), and so on may be used in place of + * rcu_read_lock_sched(). + * + * This means that all preempt_disable code sequences, including NMI and + * hardware-interrupt handlers, in progress on entry will have completed + * before this primitive returns. However, this does not guarantee that + * softirq handlers will have completed, since in some kernels, these + * handlers can run in process context, and can block. + * + * This primitive provides the guarantees made by the (now removed) + * synchronize_kernel() API. In contrast, synchronize_rcu() only + * guarantees that rcu_read_lock() sections will have completed. + * In "classic RCU", these two guarantees happen to be one and + * the same, but can differ in realtime RCU implementations. + */ +void synchronize_sched(void) +{ + struct rcu_synchronize rcu; + + if (rcu_blocking_is_gp()) + return; + + init_completion(&rcu.completion); + /* Will wake me after RCU finished. */ + call_rcu_sched(&rcu.head, wakeme_after_rcu); + /* Wait for it. */ + wait_for_completion(&rcu.completion); +} +EXPORT_SYMBOL_GPL(synchronize_sched); + /** * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed. * @@ -122,129 +157,10 @@ void synchronize_rcu_bh(void) } EXPORT_SYMBOL_GPL(synchronize_rcu_bh); -static void rcu_barrier_callback(struct rcu_head *notused) -{ - if (atomic_dec_and_test(&rcu_barrier_cpu_count)) - complete(&rcu_barrier_completion); -} - -/* - * Called with preemption disabled, and from cross-cpu IRQ context. - */ -static void rcu_barrier_func(void *type) -{ - int cpu = smp_processor_id(); - struct rcu_head *head = &per_cpu(rcu_barrier_head, cpu); - - atomic_inc(&rcu_barrier_cpu_count); - switch ((enum rcu_barrier)type) { - case RCU_BARRIER_STD: - call_rcu(head, rcu_barrier_callback); - break; - case RCU_BARRIER_BH: - call_rcu_bh(head, rcu_barrier_callback); - break; - case RCU_BARRIER_SCHED: - call_rcu_sched(head, rcu_barrier_callback); - break; - } -} - -static inline void wait_migrated_callbacks(void) -{ - wait_event(rcu_migrate_wq, !atomic_read(&rcu_migrate_type_count)); - smp_mb(); /* In case we didn't sleep. */ -} - -/* - * Orchestrate the specified type of RCU barrier, waiting for all - * RCU callbacks of the specified type to complete. - */ -static void _rcu_barrier(enum rcu_barrier type) -{ - BUG_ON(in_interrupt()); - /* Take cpucontrol mutex to protect against CPU hotplug */ - mutex_lock(&rcu_barrier_mutex); - init_completion(&rcu_barrier_completion); - /* - * Initialize rcu_barrier_cpu_count to 1, then invoke - * rcu_barrier_func() on each CPU, so that each CPU also has - * incremented rcu_barrier_cpu_count. Only then is it safe to - * decrement rcu_barrier_cpu_count -- otherwise the first CPU - * might complete its grace period before all of the other CPUs - * did their increment, causing this function to return too - * early. - */ - atomic_set(&rcu_barrier_cpu_count, 1); - on_each_cpu(rcu_barrier_func, (void *)type, 1); - if (atomic_dec_and_test(&rcu_barrier_cpu_count)) - complete(&rcu_barrier_completion); - wait_for_completion(&rcu_barrier_completion); - mutex_unlock(&rcu_barrier_mutex); - wait_migrated_callbacks(); -} - -/** - * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete. - */ -void rcu_barrier(void) -{ - _rcu_barrier(RCU_BARRIER_STD); -} -EXPORT_SYMBOL_GPL(rcu_barrier); - -/** - * rcu_barrier_bh - Wait until all in-flight call_rcu_bh() callbacks complete. - */ -void rcu_barrier_bh(void) -{ - _rcu_barrier(RCU_BARRIER_BH); -} -EXPORT_SYMBOL_GPL(rcu_barrier_bh); - -/** - * rcu_barrier_sched - Wait for in-flight call_rcu_sched() callbacks. - */ -void rcu_barrier_sched(void) -{ - _rcu_barrier(RCU_BARRIER_SCHED); -} -EXPORT_SYMBOL_GPL(rcu_barrier_sched); - -static void rcu_migrate_callback(struct rcu_head *notused) -{ - if (atomic_dec_and_test(&rcu_migrate_type_count)) - wake_up(&rcu_migrate_wq); -} - -extern int rcu_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu); - static int __cpuinit rcu_barrier_cpu_hotplug(struct notifier_block *self, unsigned long action, void *hcpu) { - rcu_cpu_notify(self, action, hcpu); - if (action == CPU_DYING) { - /* - * preempt_disable() in on_each_cpu() prevents stop_machine(), - * so when "on_each_cpu(rcu_barrier_func, (void *)type, 1);" - * returns, all online cpus have queued rcu_barrier_func(), - * and the dead cpu(if it exist) queues rcu_migrate_callback()s. - * - * These callbacks ensure _rcu_barrier() waits for all - * RCU callbacks of the specified type to complete. - */ - atomic_set(&rcu_migrate_type_count, 3); - call_rcu_bh(rcu_migrate_head, rcu_migrate_callback); - call_rcu_sched(rcu_migrate_head + 1, rcu_migrate_callback); - call_rcu(rcu_migrate_head + 2, rcu_migrate_callback); - } else if (action == CPU_DOWN_PREPARE) { - /* Don't need to wait until next removal operation. */ - /* rcu_migrate_head is protected by cpu_add_remove_lock */ - wait_migrated_callbacks(); - } - - return NOTIFY_OK; + return rcu_cpu_notify(self, action, hcpu); } void __init rcu_init(void) diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index b33db53..697c0a0 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -18,7 +18,7 @@ * Copyright (C) IBM Corporation, 2005, 2006 * * Authors: Paul E. McKenney <paulmck@us.ibm.com> - * Josh Triplett <josh@freedesktop.org> + * Josh Triplett <josh@freedesktop.org> * * See also: Documentation/RCU/torture.txt */ @@ -50,7 +50,7 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and " - "Josh Triplett <josh@freedesktop.org>"); + "Josh Triplett <josh@freedesktop.org>"); static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */ static int nfakewriters = 4; /* # fake writer threads */ @@ -110,8 +110,8 @@ struct rcu_torture { }; static LIST_HEAD(rcu_torture_freelist); -static struct rcu_torture *rcu_torture_current = NULL; -static long rcu_torture_current_version = 0; +static struct rcu_torture *rcu_torture_current; +static long rcu_torture_current_version; static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; static DEFINE_SPINLOCK(rcu_torture_lock); static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) = @@ -124,11 +124,11 @@ static atomic_t n_rcu_torture_alloc_fail; static atomic_t n_rcu_torture_free; static atomic_t n_rcu_torture_mberror; static atomic_t n_rcu_torture_error; -static long n_rcu_torture_timers = 0; +static long n_rcu_torture_timers; static struct list_head rcu_torture_removed; static cpumask_var_t shuffle_tmp_mask; -static int stutter_pause_test = 0; +static int stutter_pause_test; #if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE) #define RCUTORTURE_RUNNABLE_INIT 1 @@ -267,7 +267,8 @@ struct rcu_torture_ops { int irq_capable; char *name; }; -static struct rcu_torture_ops *cur_ops = NULL; + +static struct rcu_torture_ops *cur_ops; /* * Definitions for rcu torture testing. @@ -281,14 +282,17 @@ static int rcu_torture_read_lock(void) __acquires(RCU) static void rcu_read_delay(struct rcu_random_state *rrsp) { - long delay; - const long longdelay = 200; + const unsigned long shortdelay_us = 200; + const unsigned long longdelay_ms = 50; - /* We want there to be long-running readers, but not all the time. */ + /* We want a short delay sometimes to make a reader delay the grace + * period, and we want a long delay occasionally to trigger + * force_quiescent_state. */ - delay = rcu_random(rrsp) % (nrealreaders * 2 * longdelay); - if (!delay) - udelay(longdelay); + if (!(rcu_random(rrsp) % (nrealreaders * 2000 * longdelay_ms))) + mdelay(longdelay_ms); + if (!(rcu_random(rrsp) % (nrealreaders * 2 * shortdelay_us))) + udelay(shortdelay_us); } static void rcu_torture_read_unlock(int idx) __releases(RCU) @@ -339,8 +343,8 @@ static struct rcu_torture_ops rcu_ops = { .sync = synchronize_rcu, .cb_barrier = rcu_barrier, .stats = NULL, - .irq_capable = 1, - .name = "rcu" + .irq_capable = 1, + .name = "rcu" }; static void rcu_sync_torture_deferred_free(struct rcu_torture *p) @@ -602,8 +606,6 @@ static struct rcu_torture_ops sched_ops_sync = { .name = "sched_sync" }; -extern int rcu_expedited_torture_stats(char *page); - static struct rcu_torture_ops sched_expedited_ops = { .init = rcu_sync_torture_init, .cleanup = NULL, @@ -638,14 +640,15 @@ rcu_torture_writer(void *arg) do { schedule_timeout_uninterruptible(1); - if ((rp = rcu_torture_alloc()) == NULL) + rp = rcu_torture_alloc(); + if (rp == NULL) continue; rp->rtort_pipe_count = 0; udelay(rcu_random(&rand) & 0x3ff); old_rp = rcu_torture_current; rp->rtort_mbtest = 1; rcu_assign_pointer(rcu_torture_current, rp); - smp_wmb(); + smp_wmb(); /* Mods to old_rp must follow rcu_assign_pointer() */ if (old_rp) { i = old_rp->rtort_pipe_count; if (i > RCU_TORTURE_PIPE_LEN) @@ -1110,7 +1113,7 @@ rcu_torture_init(void) printk(KERN_ALERT "rcutorture: invalid torture type: \"%s\"\n", torture_type); mutex_unlock(&fullstop_mutex); - return (-EINVAL); + return -EINVAL; } if (cur_ops->init) cur_ops->init(); /* no "goto unwind" prior to this point!!! */ @@ -1161,7 +1164,7 @@ rcu_torture_init(void) goto unwind; } fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]), - GFP_KERNEL); + GFP_KERNEL); if (fakewriter_tasks == NULL) { VERBOSE_PRINTK_ERRSTRING("out of memory"); firsterr = -ENOMEM; @@ -1170,7 +1173,7 @@ rcu_torture_init(void) for (i = 0; i < nfakewriters; i++) { VERBOSE_PRINTK_STRING("Creating rcu_torture_fakewriter task"); fakewriter_tasks[i] = kthread_run(rcu_torture_fakewriter, NULL, - "rcu_torture_fakewriter"); + "rcu_torture_fakewriter"); if (IS_ERR(fakewriter_tasks[i])) { firsterr = PTR_ERR(fakewriter_tasks[i]); VERBOSE_PRINTK_ERRSTRING("Failed to create fakewriter"); diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 6b11b07..705f02a 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -25,7 +25,7 @@ * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. * * For detailed explanation of Read-Copy Update mechanism see - - * Documentation/RCU + * Documentation/RCU */ #include <linux/types.h> #include <linux/kernel.h> @@ -49,13 +49,6 @@ #include "rcutree.h" -#ifdef CONFIG_DEBUG_LOCK_ALLOC -static struct lock_class_key rcu_lock_key; -struct lockdep_map rcu_lock_map = - STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key); -EXPORT_SYMBOL_GPL(rcu_lock_map); -#endif - /* Data structures. */ #define RCU_STATE_INITIALIZER(name) { \ @@ -70,6 +63,9 @@ EXPORT_SYMBOL_GPL(rcu_lock_map); .gpnum = -300, \ .completed = -300, \ .onofflock = __SPIN_LOCK_UNLOCKED(&name.onofflock), \ + .orphan_cbs_list = NULL, \ + .orphan_cbs_tail = &name.orphan_cbs_list, \ + .orphan_qlen = 0, \ .fqslock = __SPIN_LOCK_UNLOCKED(&name.fqslock), \ .n_force_qs = 0, \ .n_force_qs_ngp = 0, \ @@ -81,24 +77,16 @@ DEFINE_PER_CPU(struct rcu_data, rcu_sched_data); struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); -extern long rcu_batches_completed_sched(void); -static struct rcu_node *rcu_get_root(struct rcu_state *rsp); -static void cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, - struct rcu_node *rnp, unsigned long flags); -static void cpu_quiet_msk_finish(struct rcu_state *rsp, unsigned long flags); -#ifdef CONFIG_HOTPLUG_CPU -static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp); -#endif /* #ifdef CONFIG_HOTPLUG_CPU */ -static void __rcu_process_callbacks(struct rcu_state *rsp, - struct rcu_data *rdp); -static void __call_rcu(struct rcu_head *head, - void (*func)(struct rcu_head *rcu), - struct rcu_state *rsp); -static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp); -static void __cpuinit rcu_init_percpu_data(int cpu, struct rcu_state *rsp, - int preemptable); -#include "rcutree_plugin.h" +/* + * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s + * permit this function to be invoked without holding the root rcu_node + * structure's ->lock, but of course results can be subject to change. + */ +static int rcu_gp_in_progress(struct rcu_state *rsp) +{ + return ACCESS_ONCE(rsp->completed) != ACCESS_ONCE(rsp->gpnum); +} /* * Note a quiescent state. Because we do not need to know @@ -107,27 +95,23 @@ static void __cpuinit rcu_init_percpu_data(int cpu, struct rcu_state *rsp, */ void rcu_sched_qs(int cpu) { - unsigned long flags; struct rcu_data *rdp; - local_irq_save(flags); rdp = &per_cpu(rcu_sched_data, cpu); - rdp->passed_quiesc = 1; rdp->passed_quiesc_completed = rdp->completed; - rcu_preempt_qs(cpu); - local_irq_restore(flags); + barrier(); + rdp->passed_quiesc = 1; + rcu_preempt_note_context_switch(cpu); } void rcu_bh_qs(int cpu) { - unsigned long flags; struct rcu_data *rdp; - local_irq_save(flags); rdp = &per_cpu(rcu_bh_data, cpu); - rdp->passed_quiesc = 1; rdp->passed_quiesc_completed = rdp->completed; - local_irq_restore(flags); + barrier(); + rdp->passed_quiesc = 1; } #ifdef CONFIG_NO_HZ @@ -141,6 +125,10 @@ static int blimit = 10; /* Maximum callbacks per softirq. */ static int qhimark = 10000; /* If this many pending, ignore blimit. */ static int qlowmark = 100; /* Once only this many pending, use blimit. */ +module_param(blimit, int, 0); +module_param(qhimark, int, 0); +module_param(qlowmark, int, 0); + static void force_quiescent_state(struct rcu_state *rsp, int relaxed); static int rcu_pending(int cpu); @@ -177,9 +165,7 @@ cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp) static int cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) { - /* ACCESS_ONCE() because we are accessing outside of lock. */ - return *rdp->nxttail[RCU_DONE_TAIL] && - ACCESS_ONCE(rsp->completed) == ACCESS_ONCE(rsp->gpnum); + return *rdp->nxttail[RCU_DONE_TAIL] && !rcu_gp_in_progress(rsp); } /* @@ -373,7 +359,7 @@ static long dyntick_recall_completed(struct rcu_state *rsp) /* * Snapshot the specified CPU's dynticks counter so that we can later * credit them with an implicit quiescent state. Return 1 if this CPU - * is already in a quiescent state courtesy of dynticks idle mode. + * is in dynticks idle mode, which is an extended quiescent state. */ static int dyntick_save_progress_counter(struct rcu_data *rdp) { @@ -479,30 +465,34 @@ static void print_other_cpu_stall(struct rcu_state *rsp) long delta; unsigned long flags; struct rcu_node *rnp = rcu_get_root(rsp); - struct rcu_node *rnp_cur = rsp->level[NUM_RCU_LVLS - 1]; - struct rcu_node *rnp_end = &rsp->node[NUM_RCU_NODES]; /* Only let one CPU complain about others per time interval. */ spin_lock_irqsave(&rnp->lock, flags); delta = jiffies - rsp->jiffies_stall; - if (delta < RCU_STALL_RAT_DELAY || rsp->gpnum == rsp->completed) { + if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) { spin_unlock_irqrestore(&rnp->lock, flags); return; } rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK; + + /* + * Now rat on any tasks that got kicked up to the root rcu_node + * due to CPU offlining. + */ + rcu_print_task_stall(rnp); spin_unlock_irqrestore(&rnp->lock, flags); /* OK, time to rat on our buddy... */ printk(KERN_ERR "INFO: RCU detected CPU stalls:"); - for (; rnp_cur < rnp_end; rnp_cur++) { + rcu_for_each_leaf_node(rsp, rnp) { rcu_print_task_stall(rnp); - if (rnp_cur->qsmask == 0) + if (rnp->qsmask == 0) continue; - for (cpu = 0; cpu <= rnp_cur->grphi - rnp_cur->grplo; cpu++) - if (rnp_cur->qsmask & (1UL << cpu)) - printk(" %d", rnp_cur->grplo + cpu); + for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) + if (rnp->qsmask & (1UL << cpu)) + printk(" %d", rnp->grplo + cpu); } printk(" (detected by %d, t=%ld jiffies)\n", smp_processor_id(), (long)(jiffies - rsp->gp_start)); @@ -541,8 +531,7 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) /* We haven't checked in, so go dump stack. */ print_cpu_stall(rsp); - } else if (rsp->gpnum != rsp->completed && - delta >= RCU_STALL_RAT_DELAY) { + } else if (rcu_gp_in_progress(rsp) && delta >= RCU_STALL_RAT_DELAY) { /* They had two time units to dump stack, so complain. */ print_other_cpu_stall(rsp); @@ -605,8 +594,6 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) { struct rcu_data *rdp = rsp->rda[smp_processor_id()]; struct rcu_node *rnp = rcu_get_root(rsp); - struct rcu_node *rnp_cur; - struct rcu_node *rnp_end; if (!cpu_needs_another_gp(rsp, rdp)) { spin_unlock_irqrestore(&rnp->lock, flags); @@ -615,6 +602,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) /* Advance to a new grace period and initialize state. */ rsp->gpnum++; + WARN_ON_ONCE(rsp->signaled == RCU_GP_INIT); rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */ rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; record_gp_stall_check_time(rsp); @@ -622,16 +610,24 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) note_new_gpnum(rsp, rdp); /* - * Because we are first, we know that all our callbacks will - * be covered by this upcoming grace period, even the ones - * that were registered arbitrarily recently. + * Because this CPU just now started the new grace period, we know + * that all of its callbacks will be covered by this upcoming grace + * period, even the ones that were registered arbitrarily recently. + * Therefore, advance all outstanding callbacks to RCU_WAIT_TAIL. + * + * Other CPUs cannot be sure exactly when the grace period started. + * Therefore, their recently registered callbacks must pass through + * an additional RCU_NEXT_READY stage, so that they will be handled + * by the next RCU grace period. */ rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; /* Special-case the common single-level case. */ if (NUM_RCU_NODES == 1) { + rcu_preempt_check_blocked_tasks(rnp); rnp->qsmask = rnp->qsmaskinit; + rnp->gpnum = rsp->gpnum; rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */ spin_unlock_irqrestore(&rnp->lock, flags); return; @@ -644,42 +640,28 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) spin_lock(&rsp->onofflock); /* irqs already disabled. */ /* - * Set the quiescent-state-needed bits in all the non-leaf RCU - * nodes for all currently online CPUs. This operation relies - * on the layout of the hierarchy within the rsp->node[] array. - * Note that other CPUs will access only the leaves of the - * hierarchy, which still indicate that no grace period is in - * progress. In addition, we have excluded CPU-hotplug operations. - * - * We therefore do not need to hold any locks. Any required - * memory barriers will be supplied by the locks guarding the - * leaf rcu_nodes in the hierarchy. - */ - - rnp_end = rsp->level[NUM_RCU_LVLS - 1]; - for (rnp_cur = &rsp->node[0]; rnp_cur < rnp_end; rnp_cur++) - rnp_cur->qsmask = rnp_cur->qsmaskinit; - - /* - * Now set up the leaf nodes. Here we must be careful. First, - * we need to hold the lock in order to exclude other CPUs, which - * might be contending for the leaf nodes' locks. Second, as - * soon as we initialize a given leaf node, its CPUs might run - * up the rest of the hierarchy. We must therefore acquire locks - * for each node that we touch during this stage. (But we still - * are excluding CPU-hotplug operations.) + * Set the quiescent-state-needed bits in all the rcu_node + * structures for all currently online CPUs in breadth-first + * order, starting from the root rcu_node structure. This + * operation relies on the layout of the hierarchy within the + * rsp->node[] array. Note that other CPUs will access only + * the leaves of the hierarchy, which still indicate that no + * grace period is in progress, at least until the corresponding + * leaf node has been initialized. In addition, we have excluded + * CPU-hotplug operations. * * Note that the grace period cannot complete until we finish * the initialization process, as there will be at least one * qsmask bit set in the root node until that time, namely the - * one corresponding to this CPU. + * one corresponding to this CPU, due to the fact that we have + * irqs disabled. */ - rnp_end = &rsp->node[NUM_RCU_NODES]; - rnp_cur = rsp->level[NUM_RCU_LVLS - 1]; - for (; rnp_cur < rnp_end; rnp_cur++) { - spin_lock(&rnp_cur->lock); /* irqs already disabled. */ - rnp_cur->qsmask = rnp_cur->qsmaskinit; - spin_unlock(&rnp_cur->lock); /* irqs already disabled. */ + rcu_for_each_node_breadth_first(rsp, rnp) { + spin_lock(&rnp->lock); /* irqs already disabled. */ + rcu_preempt_check_blocked_tasks(rnp); + rnp->qsmask = rnp->qsmaskinit; + rnp->gpnum = rsp->gpnum; + spin_unlock(&rnp->lock); /* irqs already disabled. */ } rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */ @@ -720,8 +702,9 @@ rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp) * hold rnp->lock, as required by rcu_start_gp(), which will release it. */ static void cpu_quiet_msk_finish(struct rcu_state *rsp, unsigned long flags) - __releases(rnp->lock) + __releases(rcu_get_root(rsp)->lock) { + WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); rsp->completed = rsp->gpnum; rcu_process_gp_end(rsp, rsp->rda[smp_processor_id()]); rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */ @@ -739,6 +722,8 @@ cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp, unsigned long flags) __releases(rnp->lock) { + struct rcu_node *rnp_c; + /* Walk up the rcu_node hierarchy. */ for (;;) { if (!(rnp->qsmask & mask)) { @@ -762,8 +747,10 @@ cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp, break; } spin_unlock_irqrestore(&rnp->lock, flags); + rnp_c = rnp; rnp = rnp->parent; spin_lock_irqsave(&rnp->lock, flags); + WARN_ON_ONCE(rnp_c->qsmask); } /* @@ -776,10 +763,10 @@ cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp, /* * Record a quiescent state for the specified CPU, which must either be - * the current CPU or an offline CPU. The lastcomp argument is used to - * make sure we are still in the grace period of interest. We don't want - * to end the current grace period based on quiescent states detected in - * an earlier grace period! + * the current CPU. The lastcomp argument is used to make sure we are + * still in the grace period of interest. We don't want to end the current + * grace period based on quiescent states detected in an earlier grace + * period! */ static void cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp) @@ -814,7 +801,6 @@ cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp) * This GP can't end until cpu checks in, so all of our * callbacks can be processed during the next GP. */ - rdp = rsp->rda[smp_processor_id()]; rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; cpu_quiet_msk(mask, rsp, rnp, flags); /* releases rnp->lock */ @@ -855,24 +841,70 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) #ifdef CONFIG_HOTPLUG_CPU /* + * Move a dying CPU's RCU callbacks to the ->orphan_cbs_list for the + * specified flavor of RCU. The callbacks will be adopted by the next + * _rcu_barrier() invocation or by the CPU_DEAD notifier, whichever + * comes first. Because this is invoked from the CPU_DYING notifier, + * irqs are already disabled. + */ +static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp) +{ + int i; + struct rcu_data *rdp = rsp->rda[smp_processor_id()]; + + if (rdp->nxtlist == NULL) + return; /* irqs disabled, so comparison is stable. */ + spin_lock(&rsp->onofflock); /* irqs already disabled. */ + *rsp->orphan_cbs_tail = rdp->nxtlist; + rsp->orphan_cbs_tail = rdp->nxttail[RCU_NEXT_TAIL]; + rdp->nxtlist = NULL; + for (i = 0; i < RCU_NEXT_SIZE; i++) + rdp->nxttail[i] = &rdp->nxtlist; + rsp->orphan_qlen += rdp->qlen; + rdp->qlen = 0; + spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ +} + +/* + * Adopt previously orphaned RCU callbacks. + */ +static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) +{ + unsigned long flags; + struct rcu_data *rdp; + + spin_lock_irqsave(&rsp->onofflock, flags); + rdp = rsp->rda[smp_processor_id()]; + if (rsp->orphan_cbs_list == NULL) { + spin_unlock_irqrestore(&rsp->onofflock, flags); + return; + } + *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list; + rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_tail; + rdp->qlen += rsp->orphan_qlen; + rsp->orphan_cbs_list = NULL; + rsp->orphan_cbs_tail = &rsp->orphan_cbs_list; + rsp->orphan_qlen = 0; + spin_unlock_irqrestore(&rsp->onofflock, flags); +} + +/* * Remove the outgoing CPU from the bitmasks in the rcu_node hierarchy * and move all callbacks from the outgoing CPU to the current one. */ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) { - int i; unsigned long flags; long lastcomp; unsigned long mask; struct rcu_data *rdp = rsp->rda[cpu]; - struct rcu_data *rdp_me; struct rcu_node *rnp; /* Exclude any attempts to start a new grace period. */ spin_lock_irqsave(&rsp->onofflock, flags); /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ - rnp = rdp->mynode; + rnp = rdp->mynode; /* this is the outgoing CPU's rnp. */ mask = rdp->grpmask; /* rnp->grplo is constant. */ do { spin_lock(&rnp->lock); /* irqs already disabled. */ @@ -881,42 +913,16 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) spin_unlock(&rnp->lock); /* irqs remain disabled. */ break; } - rcu_preempt_offline_tasks(rsp, rnp); + rcu_preempt_offline_tasks(rsp, rnp, rdp); mask = rnp->grpmask; spin_unlock(&rnp->lock); /* irqs remain disabled. */ rnp = rnp->parent; } while (rnp != NULL); lastcomp = rsp->completed; - spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ - - /* Being offline is a quiescent state, so go record it. */ - cpu_quiet(cpu, rsp, rdp, lastcomp); + spin_unlock_irqrestore(&rsp->onofflock, flags); - /* - * Move callbacks from the outgoing CPU to the running CPU. - * Note that the outgoing CPU is now quiscent, so it is now - * (uncharacteristically) safe to access its rcu_data structure. - * Note also that we must carefully retain the order of the - * outgoing CPU's callbacks in order for rcu_barrier() to work - * correctly. Finally, note that we start all the callbacks - * afresh, even those that have passed through a grace period - * and are therefore ready to invoke. The theory is that hotplug - * events are rare, and that if they are frequent enough to - * indefinitely delay callbacks, you have far worse things to - * be worrying about. - */ - rdp_me = rsp->rda[smp_processor_id()]; - if (rdp->nxtlist != NULL) { - *rdp_me->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist; - rdp_me->nxttail[RCU_NEXT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; - rdp->nxtlist = NULL; - for (i = 0; i < RCU_NEXT_SIZE; i++) - rdp->nxttail[i] = &rdp->nxtlist; - rdp_me->qlen += rdp->qlen; - rdp->qlen = 0; - } - local_irq_restore(flags); + rcu_adopt_orphan_cbs(rsp); } /* @@ -934,6 +940,14 @@ static void rcu_offline_cpu(int cpu) #else /* #ifdef CONFIG_HOTPLUG_CPU */ +static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp) +{ +} + +static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) +{ +} + static void rcu_offline_cpu(int cpu) { } @@ -1066,33 +1080,32 @@ static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp, int cpu; unsigned long flags; unsigned long mask; - struct rcu_node *rnp_cur = rsp->level[NUM_RCU_LVLS - 1]; - struct rcu_node *rnp_end = &rsp->node[NUM_RCU_NODES]; + struct rcu_node *rnp; - for (; rnp_cur < rnp_end; rnp_cur++) { + rcu_for_each_leaf_node(rsp, rnp) { mask = 0; - spin_lock_irqsave(&rnp_cur->lock, flags); + spin_lock_irqsave(&rnp->lock, flags); if (rsp->completed != lastcomp) { - spin_unlock_irqrestore(&rnp_cur->lock, flags); + spin_unlock_irqrestore(&rnp->lock, flags); return 1; } - if (rnp_cur->qsmask == 0) { - spin_unlock_irqrestore(&rnp_cur->lock, flags); + if (rnp->qsmask == 0) { + spin_unlock_irqrestore(&rnp->lock, flags); continue; } - cpu = rnp_cur->grplo; + cpu = rnp->grplo; bit = 1; - for (; cpu <= rnp_cur->grphi; cpu++, bit <<= 1) { - if ((rnp_cur->qsmask & bit) != 0 && f(rsp->rda[cpu])) + for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { + if ((rnp->qsmask & bit) != 0 && f(rsp->rda[cpu])) mask |= bit; } if (mask != 0 && rsp->completed == lastcomp) { - /* cpu_quiet_msk() releases rnp_cur->lock. */ - cpu_quiet_msk(mask, rsp, rnp_cur, flags); + /* cpu_quiet_msk() releases rnp->lock. */ + cpu_quiet_msk(mask, rsp, rnp, flags); continue; } - spin_unlock_irqrestore(&rnp_cur->lock, flags); + spin_unlock_irqrestore(&rnp->lock, flags); } return 0; } @@ -1108,7 +1121,7 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed) struct rcu_node *rnp = rcu_get_root(rsp); u8 signaled; - if (ACCESS_ONCE(rsp->completed) == ACCESS_ONCE(rsp->gpnum)) + if (!rcu_gp_in_progress(rsp)) return; /* No grace period in progress, nothing to force. */ if (!spin_trylock_irqsave(&rsp->fqslock, flags)) { rsp->n_force_qs_lh++; /* Inexact, can lose counts. Tough! */ @@ -1267,7 +1280,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), rdp->nxttail[RCU_NEXT_TAIL] = &head->next; /* Start a new grace period if one not already started. */ - if (ACCESS_ONCE(rsp->completed) == ACCESS_ONCE(rsp->gpnum)) { + if (!rcu_gp_in_progress(rsp)) { unsigned long nestflag; struct rcu_node *rnp_root = rcu_get_root(rsp); @@ -1347,7 +1360,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) } /* Has an RCU GP gone long enough to send resched IPIs &c? */ - if (ACCESS_ONCE(rsp->completed) != ACCESS_ONCE(rsp->gpnum) && + if (rcu_gp_in_progress(rsp) && ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)) { rdp->n_rp_need_fqs++; return 1; @@ -1384,6 +1397,82 @@ int rcu_needs_cpu(int cpu) rcu_preempt_needs_cpu(cpu); } +static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; +static atomic_t rcu_barrier_cpu_count; +static DEFINE_MUTEX(rcu_barrier_mutex); +static struct completion rcu_barrier_completion; + +static void rcu_barrier_callback(struct rcu_head *notused) +{ + if (atomic_dec_and_test(&rcu_barrier_cpu_count)) + complete(&rcu_barrier_completion); +} + +/* + * Called with preemption disabled, and from cross-cpu IRQ context. + */ +static void rcu_barrier_func(void *type) +{ + int cpu = smp_processor_id(); + struct rcu_head *head = &per_cpu(rcu_barrier_head, cpu); + void (*call_rcu_func)(struct rcu_head *head, + void (*func)(struct rcu_head *head)); + + atomic_inc(&rcu_barrier_cpu_count); + call_rcu_func = type; + call_rcu_func(head, rcu_barrier_callback); +} + +/* + * Orchestrate the specified type of RCU barrier, waiting for all + * RCU callbacks of the specified type to complete. + */ +static void _rcu_barrier(struct rcu_state *rsp, + void (*call_rcu_func)(struct rcu_head *head, + void (*func)(struct rcu_head *head))) +{ + BUG_ON(in_interrupt()); + /* Take mutex to serialize concurrent rcu_barrier() requests. */ + mutex_lock(&rcu_barrier_mutex); + init_completion(&rcu_barrier_completion); + /* + * Initialize rcu_barrier_cpu_count to 1, then invoke + * rcu_barrier_func() on each CPU, so that each CPU also has + * incremented rcu_barrier_cpu_count. Only then is it safe to + * decrement rcu_barrier_cpu_count -- otherwise the first CPU + * might complete its grace period before all of the other CPUs + * did their increment, causing this function to return too + * early. + */ + atomic_set(&rcu_barrier_cpu_count, 1); + preempt_disable(); /* stop CPU_DYING from filling orphan_cbs_list */ + rcu_adopt_orphan_cbs(rsp); + on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1); + preempt_enable(); /* CPU_DYING can again fill orphan_cbs_list */ + if (atomic_dec_and_test(&rcu_barrier_cpu_count)) + complete(&rcu_barrier_completion); + wait_for_completion(&rcu_barrier_completion); + mutex_unlock(&rcu_barrier_mutex); +} + +/** + * rcu_barrier_bh - Wait until all in-flight call_rcu_bh() callbacks complete. + */ +void rcu_barrier_bh(void) +{ + _rcu_barrier(&rcu_bh_state, call_rcu_bh); +} +EXPORT_SYMBOL_GPL(rcu_barrier_bh); + +/** + * rcu_barrier_sched - Wait for in-flight call_rcu_sched() callbacks. + */ +void rcu_barrier_sched(void) +{ + _rcu_barrier(&rcu_sched_state, call_rcu_sched); +} +EXPORT_SYMBOL_GPL(rcu_barrier_sched); + /* * Do boot-time initialization of a CPU's per-CPU RCU data. */ @@ -1457,20 +1546,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable) rnp = rnp->parent; } while (rnp != NULL && !(rnp->qsmaskinit & mask)); - spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ - - /* - * A new grace period might start here. If so, we will be part of - * it, and its gpnum will be greater than ours, so we will - * participate. It is also possible for the gpnum to have been - * incremented before this function was called, and the bitmasks - * to not be filled out until now, in which case we will also - * participate due to our gpnum being behind. - */ - - /* Since it is coming online, the CPU is in a quiescent state. */ - cpu_quiet(cpu, rsp, rdp, lastcomp); - local_irq_restore(flags); + spin_unlock_irqrestore(&rsp->onofflock, flags); } static void __cpuinit rcu_online_cpu(int cpu) @@ -1493,6 +1569,22 @@ int __cpuinit rcu_cpu_notify(struct notifier_block *self, case CPU_UP_PREPARE_FROZEN: rcu_online_cpu(cpu); break; + case CPU_DYING: + case CPU_DYING_FROZEN: + /* + * preempt_disable() in _rcu_barrier() prevents stop_machine(), + * so when "on_each_cpu(rcu_barrier_func, (void *)type, 1);" + * returns, all online cpus have queued rcu_barrier_func(). + * The dying CPU clears its cpu_online_mask bit and + * moves all of its RCU callbacks to ->orphan_cbs_list + * in the context of stop_machine(), so subsequent calls + * to _rcu_barrier() will adopt these callbacks and only + * then queue rcu_barrier_func() on all remaining CPUs. + */ + rcu_send_cbs_to_orphanage(&rcu_bh_state); + rcu_send_cbs_to_orphanage(&rcu_sched_state); + rcu_preempt_send_cbs_to_orphanage(); + break; case CPU_DEAD: case CPU_DEAD_FROZEN: case CPU_UP_CANCELED: @@ -1555,7 +1647,8 @@ static void __init rcu_init_one(struct rcu_state *rsp) cpustride *= rsp->levelspread[i]; rnp = rsp->level[i]; for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) { - spin_lock_init(&rnp->lock); + if (rnp != rcu_get_root(rsp)) + spin_lock_init(&rnp->lock); rnp->gpnum = 0; rnp->qsmask = 0; rnp->qsmaskinit = 0; @@ -1578,6 +1671,7 @@ static void __init rcu_init_one(struct rcu_state *rsp) INIT_LIST_HEAD(&rnp->blocked_tasks[1]); } } + spin_lock_init(&rcu_get_root(rsp)->lock); } /* @@ -1587,6 +1681,10 @@ static void __init rcu_init_one(struct rcu_state *rsp) */ #define RCU_INIT_FLAVOR(rsp, rcu_data) \ do { \ + int i; \ + int j; \ + struct rcu_node *rnp; \ + \ rcu_init_one(rsp); \ rnp = (rsp)->level[NUM_RCU_LVLS - 1]; \ j = 0; \ @@ -1599,31 +1697,8 @@ do { \ } \ } while (0) -#ifdef CONFIG_TREE_PREEMPT_RCU - -void __init __rcu_init_preempt(void) -{ - int i; /* All used by RCU_INIT_FLAVOR(). */ - int j; - struct rcu_node *rnp; - - RCU_INIT_FLAVOR(&rcu_preempt_state, rcu_preempt_data); -} - -#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ - -void __init __rcu_init_preempt(void) -{ -} - -#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ - void __init __rcu_init(void) { - int i; /* All used by RCU_INIT_FLAVOR(). */ - int j; - struct rcu_node *rnp; - rcu_bootup_announce(); #ifdef CONFIG_RCU_CPU_STALL_DETECTOR printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n"); @@ -1634,6 +1709,4 @@ void __init __rcu_init(void) open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); } -module_param(blimit, int, 0); -module_param(qhimark, int, 0); -module_param(qlowmark, int, 0); +#include "rcutree_plugin.h" diff --git a/kernel/rcutree.h b/kernel/rcutree.h index bf8a6f9..b40ac57 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -48,14 +48,14 @@ #elif NR_CPUS <= RCU_FANOUT_SQ # define NUM_RCU_LVLS 2 # define NUM_RCU_LVL_0 1 -# define NUM_RCU_LVL_1 (((NR_CPUS) + RCU_FANOUT - 1) / RCU_FANOUT) +# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT) # define NUM_RCU_LVL_2 (NR_CPUS) # define NUM_RCU_LVL_3 0 #elif NR_CPUS <= RCU_FANOUT_CUBE # define NUM_RCU_LVLS 3 # define NUM_RCU_LVL_0 1 -# define NUM_RCU_LVL_1 (((NR_CPUS) + RCU_FANOUT_SQ - 1) / RCU_FANOUT_SQ) -# define NUM_RCU_LVL_2 (((NR_CPUS) + (RCU_FANOUT) - 1) / (RCU_FANOUT)) +# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_SQ) +# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT) # define NUM_RCU_LVL_3 NR_CPUS #else # error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" @@ -79,15 +79,21 @@ struct rcu_dynticks { * Definition for node within the RCU grace-period-detection hierarchy. */ struct rcu_node { - spinlock_t lock; + spinlock_t lock; /* Root rcu_node's lock protects some */ + /* rcu_state fields as well as following. */ long gpnum; /* Current grace period for this node. */ /* This will either be equal to or one */ /* behind the root rcu_node's gpnum. */ unsigned long qsmask; /* CPUs or groups that need to switch in */ /* order for current grace period to proceed.*/ + /* In leaf rcu_node, each bit corresponds to */ + /* an rcu_data structure, otherwise, each */ + /* bit corresponds to a child rcu_node */ + /* structure. */ unsigned long qsmaskinit; /* Per-GP initialization for qsmask. */ unsigned long grpmask; /* Mask to apply to parent qsmask. */ + /* Only one bit will be set in this mask. */ int grplo; /* lowest-numbered CPU or group here. */ int grphi; /* highest-numbered CPU or group here. */ u8 grpnum; /* CPU/group number for next level up. */ @@ -95,8 +101,23 @@ struct rcu_node { struct rcu_node *parent; struct list_head blocked_tasks[2]; /* Tasks blocked in RCU read-side critsect. */ + /* Grace period number (->gpnum) x blocked */ + /* by tasks on the (x & 0x1) element of the */ + /* blocked_tasks[] array. */ } ____cacheline_internodealigned_in_smp; +/* + * Do a full breadth-first scan of the rcu_node structures for the + * specified rcu_state structure. + */ +#define rcu_for_each_node_breadth_first(rsp, rnp) \ + for ((rnp) = &(rsp)->node[0]; \ + (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++) + +#define rcu_for_each_leaf_node(rsp, rnp) \ + for ((rnp) = (rsp)->level[NUM_RCU_LVLS - 1]; \ + (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++) + /* Index values for nxttail array in struct rcu_data. */ #define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */ #define RCU_WAIT_TAIL 1 /* Also RCU_NEXT_READY head. */ @@ -126,23 +147,26 @@ struct rcu_data { * Any of the partitions might be empty, in which case the * pointer to that partition will be equal to the pointer for * the following partition. When the list is empty, all of - * the nxttail elements point to nxtlist, which is NULL. + * the nxttail elements point to the ->nxtlist pointer itself, + * which in that case is NULL. * - * [*nxttail[RCU_NEXT_READY_TAIL], NULL = *nxttail[RCU_NEXT_TAIL]): - * Entries that might have arrived after current GP ended - * [*nxttail[RCU_WAIT_TAIL], *nxttail[RCU_NEXT_READY_TAIL]): - * Entries known to have arrived before current GP ended - * [*nxttail[RCU_DONE_TAIL], *nxttail[RCU_WAIT_TAIL]): - * Entries that batch # <= ->completed - 1: waiting for current GP * [nxtlist, *nxttail[RCU_DONE_TAIL]): * Entries that batch # <= ->completed * The grace period for these entries has completed, and * the other grace-period-completed entries may be moved * here temporarily in rcu_process_callbacks(). + * [*nxttail[RCU_DONE_TAIL], *nxttail[RCU_WAIT_TAIL]): + * Entries that batch # <= ->completed - 1: waiting for current GP + * [*nxttail[RCU_WAIT_TAIL], *nxttail[RCU_NEXT_READY_TAIL]): + * Entries known to have arrived before current GP ended + * [*nxttail[RCU_NEXT_READY_TAIL], *nxttail[RCU_NEXT_TAIL]): + * Entries that might have arrived after current GP ended + * Note that the value of *nxttail[RCU_NEXT_TAIL] will + * always be NULL, as this is the end of the list. */ struct rcu_head *nxtlist; struct rcu_head **nxttail[RCU_NEXT_SIZE]; - long qlen; /* # of queued callbacks */ + long qlen; /* # of queued callbacks */ long blimit; /* Upper limit on a processed batch */ #ifdef CONFIG_NO_HZ @@ -216,8 +240,19 @@ struct rcu_state { /* Force QS state. */ long gpnum; /* Current gp number. */ long completed; /* # of last completed gp. */ + + /* End of fields guarded by root rcu_node's lock. */ + spinlock_t onofflock; /* exclude on/offline and */ - /* starting new GP. */ + /* starting new GP. Also */ + /* protects the following */ + /* orphan_cbs fields. */ + struct rcu_head *orphan_cbs_list; /* list of rcu_head structs */ + /* orphaned by all CPUs in */ + /* a given leaf rcu_node */ + /* going offline. */ + struct rcu_head **orphan_cbs_tail; /* And tail pointer. */ + long orphan_qlen; /* Number of orphaned cbs. */ spinlock_t fqslock; /* Only one task forcing */ /* quiescent states. */ unsigned long jiffies_force_qs; /* Time at which to invoke */ @@ -255,5 +290,30 @@ extern struct rcu_state rcu_preempt_state; DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data); #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ -#endif /* #ifdef RCU_TREE_NONCORE */ +#else /* #ifdef RCU_TREE_NONCORE */ + +/* Forward declarations for rcutree_plugin.h */ +static inline void rcu_bootup_announce(void); +long rcu_batches_completed(void); +static void rcu_preempt_note_context_switch(int cpu); +static int rcu_preempted_readers(struct rcu_node *rnp); +#ifdef CONFIG_RCU_CPU_STALL_DETECTOR +static void rcu_print_task_stall(struct rcu_node *rnp); +#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ +static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); +#ifdef CONFIG_HOTPLUG_CPU +static void rcu_preempt_offline_tasks(struct rcu_state *rsp, + struct rcu_node *rnp, + struct rcu_data *rdp); +static void rcu_preempt_offline_cpu(int cpu); +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ +static void rcu_preempt_check_callbacks(int cpu); +static void rcu_preempt_process_callbacks(void); +void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); +static int rcu_preempt_pending(int cpu); +static int rcu_preempt_needs_cpu(int cpu); +static void __cpuinit rcu_preempt_init_percpu_data(int cpu); +static void rcu_preempt_send_cbs_to_orphanage(void); +static void __init __rcu_init_preempt(void); +#endif /* #else #ifdef RCU_TREE_NONCORE */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 4778936..c0cb783 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -64,22 +64,31 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed); * not in a quiescent state. There might be any number of tasks blocked * while in an RCU read-side critical section. */ -static void rcu_preempt_qs_record(int cpu) +static void rcu_preempt_qs(int cpu) { struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); - rdp->passed_quiesc = 1; rdp->passed_quiesc_completed = rdp->completed; + barrier(); + rdp->passed_quiesc = 1; } /* - * We have entered the scheduler or are between softirqs in ksoftirqd. - * If we are in an RCU read-side critical section, we need to reflect - * that in the state of the rcu_node structure corresponding to this CPU. - * Caller must disable hardirqs. + * We have entered the scheduler, and the current task might soon be + * context-switched away from. If this task is in an RCU read-side + * critical section, we will no longer be able to rely on the CPU to + * record that fact, so we enqueue the task on the appropriate entry + * of the blocked_tasks[] array. The task will dequeue itself when + * it exits the outermost enclosing RCU read-side critical section. + * Therefore, the current grace period cannot be permitted to complete + * until the blocked_tasks[] entry indexed by the low-order bit of + * rnp->gpnum empties. + * + * Caller must disable preemption. */ -static void rcu_preempt_qs(int cpu) +static void rcu_preempt_note_context_switch(int cpu) { struct task_struct *t = current; + unsigned long flags; int phase; struct rcu_data *rdp; struct rcu_node *rnp; @@ -90,7 +99,7 @@ static void rcu_preempt_qs(int cpu) /* Possibly blocking in an RCU read-side critical section. */ rdp = rcu_preempt_state.rda[cpu]; rnp = rdp->mynode; - spin_lock(&rnp->lock); + spin_lock_irqsave(&rnp->lock, flags); t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; t->rcu_blocked_node = rnp; @@ -103,11 +112,15 @@ static void rcu_preempt_qs(int cpu) * state for the current grace period), then as long * as that task remains queued, the current grace period * cannot end. + * + * But first, note that the current CPU must still be + * on line! */ - phase = !(rnp->qsmask & rdp->grpmask) ^ (rnp->gpnum & 0x1); + WARN_ON_ONCE((rdp->grpmask & rnp->qsmaskinit) == 0); + WARN_ON_ONCE(!list_empty(&t->rcu_node_entry)); + phase = (rnp->gpnum + !(rnp->qsmask & rdp->grpmask)) & 0x1; list_add(&t->rcu_node_entry, &rnp->blocked_tasks[phase]); - smp_mb(); /* Ensure later ctxt swtch seen after above. */ - spin_unlock(&rnp->lock); + spin_unlock_irqrestore(&rnp->lock, flags); } /* @@ -119,9 +132,10 @@ static void rcu_preempt_qs(int cpu) * grace period, then the fact that the task has been enqueued * means that we continue to block the current grace period. */ - rcu_preempt_qs_record(cpu); - t->rcu_read_unlock_special &= ~(RCU_READ_UNLOCK_NEED_QS | - RCU_READ_UNLOCK_GOT_QS); + rcu_preempt_qs(cpu); + local_irq_save(flags); + t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; + local_irq_restore(flags); } /* @@ -136,6 +150,16 @@ void __rcu_read_lock(void) } EXPORT_SYMBOL_GPL(__rcu_read_lock); +/* + * Check for preempted RCU readers blocking the current grace period + * for the specified rcu_node structure. If the caller needs a reliable + * answer, it must hold the rcu_node's ->lock. + */ +static int rcu_preempted_readers(struct rcu_node *rnp) +{ + return !list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1]); +} + static void rcu_read_unlock_special(struct task_struct *t) { int empty; @@ -157,7 +181,7 @@ static void rcu_read_unlock_special(struct task_struct *t) special = t->rcu_read_unlock_special; if (special & RCU_READ_UNLOCK_NEED_QS) { t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; - t->rcu_read_unlock_special |= RCU_READ_UNLOCK_GOT_QS; + rcu_preempt_qs(smp_processor_id()); } /* Hardware IRQ handlers cannot block. */ @@ -177,12 +201,12 @@ static void rcu_read_unlock_special(struct task_struct *t) */ for (;;) { rnp = t->rcu_blocked_node; - spin_lock(&rnp->lock); + spin_lock(&rnp->lock); /* irqs already disabled. */ if (rnp == t->rcu_blocked_node) break; - spin_unlock(&rnp->lock); + spin_unlock(&rnp->lock); /* irqs remain disabled. */ } - empty = list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1]); + empty = !rcu_preempted_readers(rnp); list_del_init(&t->rcu_node_entry); t->rcu_blocked_node = NULL; @@ -193,10 +217,9 @@ static void rcu_read_unlock_special(struct task_struct *t) * drop rnp->lock and restore irq. */ if (!empty && rnp->qsmask == 0 && - list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1])) { - t->rcu_read_unlock_special &= - ~(RCU_READ_UNLOCK_NEED_QS | - RCU_READ_UNLOCK_GOT_QS); + !rcu_preempted_readers(rnp)) { + struct rcu_node *rnp_p; + if (rnp->parent == NULL) { /* Only one rcu_node in the tree. */ cpu_quiet_msk_finish(&rcu_preempt_state, flags); @@ -205,9 +228,10 @@ static void rcu_read_unlock_special(struct task_struct *t) /* Report up the rest of the hierarchy. */ mask = rnp->grpmask; spin_unlock_irqrestore(&rnp->lock, flags); - rnp = rnp->parent; - spin_lock_irqsave(&rnp->lock, flags); - cpu_quiet_msk(mask, &rcu_preempt_state, rnp, flags); + rnp_p = rnp->parent; + spin_lock_irqsave(&rnp_p->lock, flags); + WARN_ON_ONCE(rnp->qsmask); + cpu_quiet_msk(mask, &rcu_preempt_state, rnp_p, flags); return; } spin_unlock(&rnp->lock); @@ -243,12 +267,12 @@ static void rcu_print_task_stall(struct rcu_node *rnp) { unsigned long flags; struct list_head *lp; - int phase = rnp->gpnum & 0x1; + int phase; struct task_struct *t; - if (!list_empty(&rnp->blocked_tasks[phase])) { + if (rcu_preempted_readers(rnp)) { spin_lock_irqsave(&rnp->lock, flags); - phase = rnp->gpnum & 0x1; /* re-read under lock. */ + phase = rnp->gpnum & 0x1; lp = &rnp->blocked_tasks[phase]; list_for_each_entry(t, lp, rcu_node_entry) printk(" P%d", t->pid); @@ -259,13 +283,16 @@ static void rcu_print_task_stall(struct rcu_node *rnp) #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ /* - * Check for preempted RCU readers for the specified rcu_node structure. - * If the caller needs a reliable answer, it must hold the rcu_node's - * >lock. + * Check that the list of blocked tasks for the newly completed grace + * period is in fact empty. It is a serious bug to complete a grace + * period that still has RCU readers blocked! This function must be + * invoked -before- updating this rnp's ->gpnum, and the rnp's ->lock + * must be held by the caller. */ -static int rcu_preempted_readers(struct rcu_node *rnp) +static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) { - return !list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1]); + WARN_ON_ONCE(rcu_preempted_readers(rnp)); + WARN_ON_ONCE(rnp->qsmask); } #ifdef CONFIG_HOTPLUG_CPU @@ -280,7 +307,8 @@ static int rcu_preempted_readers(struct rcu_node *rnp) * The caller must hold rnp->lock with irqs disabled. */ static void rcu_preempt_offline_tasks(struct rcu_state *rsp, - struct rcu_node *rnp) + struct rcu_node *rnp, + struct rcu_data *rdp) { int i; struct list_head *lp; @@ -292,6 +320,9 @@ static void rcu_preempt_offline_tasks(struct rcu_state *rsp, WARN_ONCE(1, "Last CPU thought to be offlined?"); return; /* Shouldn't happen: at least one CPU online. */ } + WARN_ON_ONCE(rnp != rdp->mynode && + (!list_empty(&rnp->blocked_tasks[0]) || + !list_empty(&rnp->blocked_tasks[1]))); /* * Move tasks up to root rcu_node. Rely on the fact that the @@ -335,20 +366,12 @@ static void rcu_preempt_check_callbacks(int cpu) struct task_struct *t = current; if (t->rcu_read_lock_nesting == 0) { - t->rcu_read_unlock_special &= - ~(RCU_READ_UNLOCK_NEED_QS | RCU_READ_UNLOCK_GOT_QS); - rcu_preempt_qs_record(cpu); + t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; + rcu_preempt_qs(cpu); return; } - if (per_cpu(rcu_preempt_data, cpu).qs_pending) { - if (t->rcu_read_unlock_special & RCU_READ_UNLOCK_GOT_QS) { - rcu_preempt_qs_record(cpu); - t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_GOT_QS; - } else if (!(t->rcu_read_unlock_special & - RCU_READ_UNLOCK_NEED_QS)) { - t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS; - } - } + if (per_cpu(rcu_preempt_data, cpu).qs_pending) + t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS; } /* @@ -387,6 +410,15 @@ static int rcu_preempt_needs_cpu(int cpu) return !!per_cpu(rcu_preempt_data, cpu).nxtlist; } +/** + * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete. + */ +void rcu_barrier(void) +{ + _rcu_barrier(&rcu_preempt_state, call_rcu); +} +EXPORT_SYMBOL_GPL(rcu_barrier); + /* * Initialize preemptable RCU's per-CPU data. */ @@ -396,6 +428,22 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu) } /* + * Move preemptable RCU's callbacks to ->orphan_cbs_list. + */ +static void rcu_preempt_send_cbs_to_orphanage(void) +{ + rcu_send_cbs_to_orphanage(&rcu_preempt_state); +} + +/* + * Initialize preemptable RCU's state structures. + */ +static void __init __rcu_init_preempt(void) +{ + RCU_INIT_FLAVOR(&rcu_preempt_state, rcu_preempt_data); +} + +/* * Check for a task exiting while in a preemptable-RCU read-side * critical section, clean up if so. No need to issue warnings, * as debug_check_no_locks_held() already does this if lockdep @@ -434,8 +482,17 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed); * Because preemptable RCU does not exist, we never have to check for * CPUs being in quiescent states. */ -static void rcu_preempt_qs(int cpu) +static void rcu_preempt_note_context_switch(int cpu) +{ +} + +/* + * Because preemptable RCU does not exist, there are never any preempted + * RCU readers. + */ +static int rcu_preempted_readers(struct rcu_node *rnp) { + return 0; } #ifdef CONFIG_RCU_CPU_STALL_DETECTOR @@ -451,12 +508,13 @@ static void rcu_print_task_stall(struct rcu_node *rnp) #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ /* - * Because preemptable RCU does not exist, there are never any preempted - * RCU readers. + * Because there is no preemptable RCU, there can be no readers blocked, + * so there is no need to check for blocked tasks. So check only for + * bogus qsmask values. */ -static int rcu_preempted_readers(struct rcu_node *rnp) +static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) { - return 0; + WARN_ON_ONCE(rnp->qsmask); } #ifdef CONFIG_HOTPLUG_CPU @@ -466,7 +524,8 @@ static int rcu_preempted_readers(struct rcu_node *rnp) * tasks that were blocked within RCU read-side critical sections. */ static void rcu_preempt_offline_tasks(struct rcu_state *rsp, - struct rcu_node *rnp) + struct rcu_node *rnp, + struct rcu_data *rdp) { } @@ -484,7 +543,7 @@ static void rcu_preempt_offline_cpu(int cpu) * Because preemptable RCU does not exist, it never has any callbacks * to check. */ -void rcu_preempt_check_callbacks(int cpu) +static void rcu_preempt_check_callbacks(int cpu) { } @@ -492,7 +551,7 @@ void rcu_preempt_check_callbacks(int cpu) * Because preemptable RCU does not exist, it never has any callbacks * to process. */ -void rcu_preempt_process_callbacks(void) +static void rcu_preempt_process_callbacks(void) { } @@ -522,6 +581,16 @@ static int rcu_preempt_needs_cpu(int cpu) } /* + * Because preemptable RCU does not exist, rcu_barrier() is just + * another name for rcu_barrier_sched(). + */ +void rcu_barrier(void) +{ + rcu_barrier_sched(); +} +EXPORT_SYMBOL_GPL(rcu_barrier); + +/* * Because preemptable RCU does not exist, there is no per-CPU * data to initialize. */ @@ -529,4 +598,18 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu) { } +/* + * Because there is no preemptable RCU, there are no callbacks to move. + */ +static void rcu_preempt_send_cbs_to_orphanage(void) +{ +} + +/* + * Because preemptable RCU does not exist, it need not be initialized. + */ +static void __init __rcu_init_preempt(void) +{ +} + #endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 0ea1bff..4b31c77 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -20,7 +20,7 @@ * Papers: http://www.rdrop.com/users/paulmck/RCU * * For detailed explanation of Read-Copy Update mechanism see - - * Documentation/RCU + * Documentation/RCU * */ #include <linux/types.h> @@ -93,7 +93,7 @@ static int rcudata_open(struct inode *inode, struct file *file) return single_open(file, show_rcudata, NULL); } -static struct file_operations rcudata_fops = { +static const struct file_operations rcudata_fops = { .owner = THIS_MODULE, .open = rcudata_open, .read = seq_read, @@ -145,7 +145,7 @@ static int rcudata_csv_open(struct inode *inode, struct file *file) return single_open(file, show_rcudata_csv, NULL); } -static struct file_operations rcudata_csv_fops = { +static const struct file_operations rcudata_csv_fops = { .owner = THIS_MODULE, .open = rcudata_csv_open, .read = seq_read, @@ -159,13 +159,13 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) struct rcu_node *rnp; seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x " - "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n", + "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld\n", rsp->completed, rsp->gpnum, rsp->signaled, (long)(rsp->jiffies_force_qs - jiffies), (int)(jiffies & 0xffff), rsp->n_force_qs, rsp->n_force_qs_ngp, rsp->n_force_qs - rsp->n_force_qs_ngp, - rsp->n_force_qs_lh); + rsp->n_force_qs_lh, rsp->orphan_qlen); for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) { if (rnp->level != level) { seq_puts(m, "\n"); @@ -196,7 +196,7 @@ static int rcuhier_open(struct inode *inode, struct file *file) return single_open(file, show_rcuhier, NULL); } -static struct file_operations rcuhier_fops = { +static const struct file_operations rcuhier_fops = { .owner = THIS_MODULE, .open = rcuhier_open, .read = seq_read, @@ -222,7 +222,7 @@ static int rcugp_open(struct inode *inode, struct file *file) return single_open(file, show_rcugp, NULL); } -static struct file_operations rcugp_fops = { +static const struct file_operations rcugp_fops = { .owner = THIS_MODULE, .open = rcugp_open, .read = seq_read, @@ -276,7 +276,7 @@ static int rcu_pending_open(struct inode *inode, struct file *file) return single_open(file, show_rcu_pending, NULL); } -static struct file_operations rcu_pending_fops = { +static const struct file_operations rcu_pending_fops = { .owner = THIS_MODULE, .open = rcu_pending_open, .read = seq_read, diff --git a/kernel/relay.c b/kernel/relay.c index bc18854..760c262 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -60,7 +60,7 @@ static int relay_buf_fault(struct vm_area_struct *vma, struct vm_fault *vmf) /* * vm_ops for relay file mappings. */ -static struct vm_operations_struct relay_file_mmap_ops = { +static const struct vm_operations_struct relay_file_mmap_ops = { .fault = relay_buf_fault, .close = relay_file_mmap_close, }; diff --git a/kernel/res_counter.c b/kernel/res_counter.c index e1338f0..bcdabf3 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c @@ -19,6 +19,7 @@ void res_counter_init(struct res_counter *counter, struct res_counter *parent) { spin_lock_init(&counter->lock); counter->limit = RESOURCE_MAX; + counter->soft_limit = RESOURCE_MAX; counter->parent = parent; } @@ -101,6 +102,8 @@ res_counter_member(struct res_counter *counter, int member) return &counter->limit; case RES_FAILCNT: return &counter->failcnt; + case RES_SOFT_LIMIT: + return &counter->soft_limit; }; BUG(); diff --git a/kernel/resource.c b/kernel/resource.c index 78b0872..fb11a58 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -223,13 +223,13 @@ int release_resource(struct resource *old) EXPORT_SYMBOL(release_resource); -#if defined(CONFIG_MEMORY_HOTPLUG) && !defined(CONFIG_ARCH_HAS_WALK_MEMORY) +#if !defined(CONFIG_ARCH_HAS_WALK_MEMORY) /* * Finds the lowest memory reosurce exists within [res->start.res->end) - * the caller must specify res->start, res->end, res->flags. + * the caller must specify res->start, res->end, res->flags and "name". * If found, returns 0, res is overwritten, if not found, returns -1. */ -static int find_next_system_ram(struct resource *res) +static int find_next_system_ram(struct resource *res, char *name) { resource_size_t start, end; struct resource *p; @@ -245,6 +245,8 @@ static int find_next_system_ram(struct resource *res) /* system ram is just marked as IORESOURCE_MEM */ if (p->flags != res->flags) continue; + if (name && strcmp(p->name, name)) + continue; if (p->start > end) { p = NULL; break; @@ -262,19 +264,26 @@ static int find_next_system_ram(struct resource *res) res->end = p->end; return 0; } -int -walk_memory_resource(unsigned long start_pfn, unsigned long nr_pages, void *arg, - int (*func)(unsigned long, unsigned long, void *)) + +/* + * This function calls callback against all memory range of "System RAM" + * which are marked as IORESOURCE_MEM and IORESOUCE_BUSY. + * Now, this function is only for "System RAM". + */ +int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, + void *arg, int (*func)(unsigned long, unsigned long, void *)) { struct resource res; unsigned long pfn, len; u64 orig_end; int ret = -1; + res.start = (u64) start_pfn << PAGE_SHIFT; res.end = ((u64)(start_pfn + nr_pages) << PAGE_SHIFT) - 1; res.flags = IORESOURCE_MEM | IORESOURCE_BUSY; orig_end = res.end; - while ((res.start < res.end) && (find_next_system_ram(&res) >= 0)) { + while ((res.start < res.end) && + (find_next_system_ram(&res, "System RAM") >= 0)) { pfn = (unsigned long)(res.start >> PAGE_SHIFT); len = (unsigned long)((res.end + 1 - res.start) >> PAGE_SHIFT); ret = (*func)(pfn, len, arg); diff --git a/kernel/sched.c b/kernel/sched.c index e27a536..e886895 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -39,7 +39,7 @@ #include <linux/completion.h> #include <linux/kernel_stat.h> #include <linux/debug_locks.h> -#include <linux/perf_counter.h> +#include <linux/perf_event.h> #include <linux/security.h> #include <linux/notifier.h> #include <linux/profile.h> @@ -119,8 +119,6 @@ */ #define RUNTIME_INF ((u64)~0ULL) -static void double_rq_lock(struct rq *rq1, struct rq *rq2); - static inline int rt_policy(int policy) { if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR)) @@ -295,12 +293,12 @@ struct task_group root_task_group; /* Default task group's sched entity on each cpu */ static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); /* Default task group's cfs_rq on each cpu */ -static DEFINE_PER_CPU(struct cfs_rq, init_tg_cfs_rq) ____cacheline_aligned_in_smp; +static DEFINE_PER_CPU_SHARED_ALIGNED(struct cfs_rq, init_tg_cfs_rq); #endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); -static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; +static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq); #endif /* CONFIG_RT_GROUP_SCHED */ #else /* !CONFIG_USER_SCHED */ #define root_task_group init_task_group @@ -378,13 +376,6 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) #else -#ifdef CONFIG_SMP -static int root_task_group_empty(void) -{ - return 1; -} -#endif - static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } static inline struct task_group *task_group(struct task_struct *p) { @@ -514,14 +505,6 @@ struct root_domain { #ifdef CONFIG_SMP struct cpupri cpupri; #endif -#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) - /* - * Preferred wake up cpu nominated by sched_mc balance that will be - * used when most cpus are idle in the system indicating overall very - * low system utilisation. Triggered at POWERSAVINGS_BALANCE_WAKEUP(2) - */ - unsigned int sched_mc_preferred_wakeup_cpu; -#endif }; /* @@ -646,9 +629,10 @@ struct rq { static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); -static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync) +static inline +void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) { - rq->curr->sched_class->check_preempt_curr(rq, p, sync); + rq->curr->sched_class->check_preempt_curr(rq, p, flags); } static inline int cpu_of(struct rq *rq) @@ -692,20 +676,15 @@ inline void update_rq_clock(struct rq *rq) /** * runqueue_is_locked + * @cpu: the processor in question. * * Returns true if the current cpu runqueue is locked. * This interface allows printk to be called with the runqueue lock * held and know whether or not it is OK to wake up the klogd. */ -int runqueue_is_locked(void) +int runqueue_is_locked(int cpu) { - int cpu = get_cpu(); - struct rq *rq = cpu_rq(cpu); - int ret; - - ret = spin_is_locked(&rq->lock); - put_cpu(); - return ret; + return spin_is_locked(&cpu_rq(cpu)->lock); } /* @@ -802,7 +781,7 @@ static int sched_feat_open(struct inode *inode, struct file *filp) return single_open(filp, sched_feat_show, NULL); } -static struct file_operations sched_feat_fops = { +static const struct file_operations sched_feat_fops = { .open = sched_feat_open, .write = sched_feat_write, .read = seq_read, @@ -1509,8 +1488,65 @@ static int tg_nop(struct task_group *tg, void *data) #endif #ifdef CONFIG_SMP -static unsigned long source_load(int cpu, int type); -static unsigned long target_load(int cpu, int type); +/* Used instead of source_load when we know the type == 0 */ +static unsigned long weighted_cpuload(const int cpu) +{ + return cpu_rq(cpu)->load.weight; +} + +/* + * Return a low guess at the load of a migration-source cpu weighted + * according to the scheduling class and "nice" value. + * + * We want to under-estimate the load of migration sources, to + * balance conservatively. + */ +static unsigned long source_load(int cpu, int type) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long total = weighted_cpuload(cpu); + + if (type == 0 || !sched_feat(LB_BIAS)) + return total; + + return min(rq->cpu_load[type-1], total); +} + +/* + * Return a high guess at the load of a migration-target cpu weighted + * according to the scheduling class and "nice" value. + */ +static unsigned long target_load(int cpu, int type) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long total = weighted_cpuload(cpu); + + if (type == 0 || !sched_feat(LB_BIAS)) + return total; + + return max(rq->cpu_load[type-1], total); +} + +static struct sched_group *group_of(int cpu) +{ + struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd); + + if (!sd) + return NULL; + + return sd->groups; +} + +static unsigned long power_of(int cpu) +{ + struct sched_group *group = group_of(cpu); + + if (!group) + return SCHED_LOAD_SCALE; + + return group->cpu_power; +} + static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); static unsigned long cpu_avg_load_per_task(int cpu) @@ -1695,6 +1731,8 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd) #ifdef CONFIG_PREEMPT +static void double_rq_lock(struct rq *rq1, struct rq *rq2); + /* * fair double_lock_balance: Safely acquires both rq->locks in a fair * way at the expense of forcing extra atomic operations in all @@ -1959,13 +1997,6 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, } #ifdef CONFIG_SMP - -/* Used instead of source_load when we know the type == 0 */ -static unsigned long weighted_cpuload(const int cpu) -{ - return cpu_rq(cpu)->load.weight; -} - /* * Is this task likely cache-hot: */ @@ -2023,7 +2054,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) if (task_hot(p, old_rq->clock, NULL)) schedstat_inc(p, se.nr_forced2_migrations); #endif - perf_swcounter_event(PERF_COUNT_SW_CPU_MIGRATIONS, + perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0); } p->se.vruntime -= old_cfsrq->min_vruntime - @@ -2239,185 +2270,6 @@ void kick_process(struct task_struct *p) preempt_enable(); } EXPORT_SYMBOL_GPL(kick_process); - -/* - * Return a low guess at the load of a migration-source cpu weighted - * according to the scheduling class and "nice" value. - * - * We want to under-estimate the load of migration sources, to - * balance conservatively. - */ -static unsigned long source_load(int cpu, int type) -{ - struct rq *rq = cpu_rq(cpu); - unsigned long total = weighted_cpuload(cpu); - - if (type == 0 || !sched_feat(LB_BIAS)) - return total; - - return min(rq->cpu_load[type-1], total); -} - -/* - * Return a high guess at the load of a migration-target cpu weighted - * according to the scheduling class and "nice" value. - */ -static unsigned long target_load(int cpu, int type) -{ - struct rq *rq = cpu_rq(cpu); - unsigned long total = weighted_cpuload(cpu); - - if (type == 0 || !sched_feat(LB_BIAS)) - return total; - - return max(rq->cpu_load[type-1], total); -} - -/* - * find_idlest_group finds and returns the least busy CPU group within the - * domain. - */ -static struct sched_group * -find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) -{ - struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups; - unsigned long min_load = ULONG_MAX, this_load = 0; - int load_idx = sd->forkexec_idx; - int imbalance = 100 + (sd->imbalance_pct-100)/2; - - do { - unsigned long load, avg_load; - int local_group; - int i; - - /* Skip over this group if it has no CPUs allowed */ - if (!cpumask_intersects(sched_group_cpus(group), - &p->cpus_allowed)) - continue; - - local_group = cpumask_test_cpu(this_cpu, - sched_group_cpus(group)); - - /* Tally up the load of all CPUs in the group */ - avg_load = 0; - - for_each_cpu(i, sched_group_cpus(group)) { - /* Bias balancing toward cpus of our domain */ - if (local_group) - load = source_load(i, load_idx); - else - load = target_load(i, load_idx); - - avg_load += load; - } - - /* Adjust by relative CPU power of the group */ - avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; - - if (local_group) { - this_load = avg_load; - this = group; - } else if (avg_load < min_load) { - min_load = avg_load; - idlest = group; - } - } while (group = group->next, group != sd->groups); - - if (!idlest || 100*this_load < imbalance*min_load) - return NULL; - return idlest; -} - -/* - * find_idlest_cpu - find the idlest cpu among the cpus in group. - */ -static int -find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) -{ - unsigned long load, min_load = ULONG_MAX; - int idlest = -1; - int i; - - /* Traverse only the allowed CPUs */ - for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) { - load = weighted_cpuload(i); - - if (load < min_load || (load == min_load && i == this_cpu)) { - min_load = load; - idlest = i; - } - } - - return idlest; -} - -/* - * sched_balance_self: balance the current task (running on cpu) in domains - * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and - * SD_BALANCE_EXEC. - * - * Balance, ie. select the least loaded group. - * - * Returns the target CPU number, or the same CPU if no balancing is needed. - * - * preempt must be disabled. - */ -static int sched_balance_self(int cpu, int flag) -{ - struct task_struct *t = current; - struct sched_domain *tmp, *sd = NULL; - - for_each_domain(cpu, tmp) { - /* - * If power savings logic is enabled for a domain, stop there. - */ - if (tmp->flags & SD_POWERSAVINGS_BALANCE) - break; - if (tmp->flags & flag) - sd = tmp; - } - - if (sd) - update_shares(sd); - - while (sd) { - struct sched_group *group; - int new_cpu, weight; - - if (!(sd->flags & flag)) { - sd = sd->child; - continue; - } - - group = find_idlest_group(sd, t, cpu); - if (!group) { - sd = sd->child; - continue; - } - - new_cpu = find_idlest_cpu(group, t, cpu); - if (new_cpu == -1 || new_cpu == cpu) { - /* Now try balancing at a lower domain level of cpu */ - sd = sd->child; - continue; - } - - /* Now try balancing at a lower domain level of new_cpu */ - cpu = new_cpu; - weight = cpumask_weight(sched_domain_span(sd)); - sd = NULL; - for_each_domain(cpu, tmp) { - if (weight <= cpumask_weight(sched_domain_span(tmp))) - break; - if (tmp->flags & flag) - sd = tmp; - } - /* while loop will break here if sd == NULL */ - } - - return cpu; -} - #endif /* CONFIG_SMP */ /** @@ -2455,37 +2307,22 @@ void task_oncpu_function_call(struct task_struct *p, * * returns failure only if the task is already active. */ -static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) +static int try_to_wake_up(struct task_struct *p, unsigned int state, + int wake_flags) { int cpu, orig_cpu, this_cpu, success = 0; unsigned long flags; - long old_state; - struct rq *rq; + struct rq *rq, *orig_rq; if (!sched_feat(SYNC_WAKEUPS)) - sync = 0; + wake_flags &= ~WF_SYNC; -#ifdef CONFIG_SMP - if (sched_feat(LB_WAKEUP_UPDATE) && !root_task_group_empty()) { - struct sched_domain *sd; - - this_cpu = raw_smp_processor_id(); - cpu = task_cpu(p); - - for_each_domain(this_cpu, sd) { - if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { - update_shares(sd); - break; - } - } - } -#endif + this_cpu = get_cpu(); smp_wmb(); - rq = task_rq_lock(p, &flags); + rq = orig_rq = task_rq_lock(p, &flags); update_rq_clock(rq); - old_state = p->state; - if (!(old_state & state)) + if (!(p->state & state)) goto out; if (p->se.on_rq) @@ -2493,27 +2330,33 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) cpu = task_cpu(p); orig_cpu = cpu; - this_cpu = smp_processor_id(); #ifdef CONFIG_SMP if (unlikely(task_running(rq, p))) goto out_activate; - cpu = p->sched_class->select_task_rq(p, sync); - if (cpu != orig_cpu) { + /* + * In order to handle concurrent wakeups and release the rq->lock + * we put the task in TASK_WAKING state. + * + * First fix up the nr_uninterruptible count: + */ + if (task_contributes_to_load(p)) + rq->nr_uninterruptible--; + p->state = TASK_WAKING; + task_rq_unlock(rq, &flags); + + cpu = p->sched_class->select_task_rq(p, SD_BALANCE_WAKE, wake_flags); + if (cpu != orig_cpu) set_task_cpu(p, cpu); - task_rq_unlock(rq, &flags); - /* might preempt at this point */ - rq = task_rq_lock(p, &flags); - old_state = p->state; - if (!(old_state & state)) - goto out; - if (p->se.on_rq) - goto out_running; - this_cpu = smp_processor_id(); - cpu = task_cpu(p); - } + rq = task_rq_lock(p, &flags); + + if (rq != orig_rq) + update_rq_clock(rq); + + WARN_ON(p->state != TASK_WAKING); + cpu = task_cpu(p); #ifdef CONFIG_SCHEDSTATS schedstat_inc(rq, ttwu_count); @@ -2533,7 +2376,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) out_activate: #endif /* CONFIG_SMP */ schedstat_inc(p, se.nr_wakeups); - if (sync) + if (wake_flags & WF_SYNC) schedstat_inc(p, se.nr_wakeups_sync); if (orig_cpu != cpu) schedstat_inc(p, se.nr_wakeups_migrate); @@ -2562,7 +2405,7 @@ out_activate: out_running: trace_sched_wakeup(rq, p, success); - check_preempt_curr(rq, p, sync); + check_preempt_curr(rq, p, wake_flags); p->state = TASK_RUNNING; #ifdef CONFIG_SMP @@ -2571,6 +2414,7 @@ out_running: #endif out: task_rq_unlock(rq, &flags); + put_cpu(); return success; } @@ -2613,6 +2457,7 @@ static void __sched_fork(struct task_struct *p) p->se.avg_overlap = 0; p->se.start_runtime = 0; p->se.avg_wakeup = sysctl_sched_wakeup_granularity; + p->se.avg_running = 0; #ifdef CONFIG_SCHEDSTATS p->se.wait_start = 0; @@ -2674,28 +2519,18 @@ void sched_fork(struct task_struct *p, int clone_flags) __sched_fork(p); -#ifdef CONFIG_SMP - cpu = sched_balance_self(cpu, SD_BALANCE_FORK); -#endif - set_task_cpu(p, cpu); - - /* - * Make sure we do not leak PI boosting priority to the child. - */ - p->prio = current->normal_prio; - /* * Revert to default priority/policy on fork if requested. */ if (unlikely(p->sched_reset_on_fork)) { - if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) + if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) { p->policy = SCHED_NORMAL; - - if (p->normal_prio < DEFAULT_PRIO) - p->prio = DEFAULT_PRIO; + p->normal_prio = p->static_prio; + } if (PRIO_TO_NICE(p->static_prio) < 0) { p->static_prio = NICE_TO_PRIO(0); + p->normal_prio = p->static_prio; set_load_weight(p); } @@ -2706,9 +2541,19 @@ void sched_fork(struct task_struct *p, int clone_flags) p->sched_reset_on_fork = 0; } + /* + * Make sure we do not leak PI boosting priority to the child. + */ + p->prio = current->normal_prio; + if (!rt_prio(p->prio)) p->sched_class = &fair_sched_class; +#ifdef CONFIG_SMP + cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0); +#endif + set_task_cpu(p, cpu); + #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) if (likely(sched_info_on())) memset(&p->sched_info, 0, sizeof(p->sched_info)); @@ -2741,8 +2586,6 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) BUG_ON(p->state != TASK_RUNNING); update_rq_clock(rq); - p->prio = effective_prio(p); - if (!p->sched_class->task_new || !current->se.on_rq) { activate_task(rq, p, 0); } else { @@ -2754,7 +2597,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) inc_nr_running(rq); } trace_sched_wakeup_new(rq, p, 1); - check_preempt_curr(rq, p, 0); + check_preempt_curr(rq, p, WF_FORK); #ifdef CONFIG_SMP if (p->sched_class->task_wake_up) p->sched_class->task_wake_up(rq, p); @@ -2878,7 +2721,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) */ prev_state = prev->state; finish_arch_switch(prev); - perf_counter_task_sched_in(current, cpu_of(rq)); + perf_event_task_sched_in(current, cpu_of(rq)); finish_lock_switch(rq, prev); fire_sched_in_preempt_notifiers(current); @@ -3064,6 +2907,19 @@ unsigned long nr_iowait(void) return sum; } +unsigned long nr_iowait_cpu(void) +{ + struct rq *this = this_rq(); + return atomic_read(&this->nr_iowait); +} + +unsigned long this_cpu_load(void) +{ + struct rq *this = this_rq(); + return this->cpu_load[0]; +} + + /* Variables and functions for calc_load */ static atomic_long_t calc_load_tasks; static unsigned long calc_load_update; @@ -3263,7 +3119,7 @@ out: void sched_exec(void) { int new_cpu, this_cpu = get_cpu(); - new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC); + new_cpu = current->sched_class->select_task_rq(current, SD_BALANCE_EXEC, 0); put_cpu(); if (new_cpu != this_cpu) sched_migrate_task(current, new_cpu); @@ -3683,11 +3539,6 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, *imbalance = sds->min_load_per_task; sds->busiest = sds->group_min; - if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) { - cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu = - group_first_cpu(sds->group_leader); - } - return 1; } @@ -3711,7 +3562,18 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, } #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ -unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu) + +unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) +{ + return SCHED_LOAD_SCALE; +} + +unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu) +{ + return default_scale_freq_power(sd, cpu); +} + +unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) { unsigned long weight = cpumask_weight(sched_domain_span(sd)); unsigned long smt_gain = sd->smt_gain; @@ -3721,6 +3583,11 @@ unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu) return smt_gain; } +unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu) +{ + return default_scale_smt_power(sd, cpu); +} + unsigned long scale_rt_power(int cpu) { struct rq *rq = cpu_rq(cpu); @@ -3745,10 +3612,19 @@ static void update_cpu_power(struct sched_domain *sd, int cpu) unsigned long power = SCHED_LOAD_SCALE; struct sched_group *sdg = sd->groups; - /* here we could scale based on cpufreq */ + if (sched_feat(ARCH_POWER)) + power *= arch_scale_freq_power(sd, cpu); + else + power *= default_scale_freq_power(sd, cpu); + + power >>= SCHED_LOAD_SHIFT; if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { - power *= arch_scale_smt_power(sd, cpu); + if (sched_feat(ARCH_POWER)) + power *= arch_scale_smt_power(sd, cpu); + else + power *= default_scale_smt_power(sd, cpu); + power >>= SCHED_LOAD_SHIFT; } @@ -3785,6 +3661,7 @@ static void update_group_power(struct sched_domain *sd, int cpu) /** * update_sg_lb_stats - Update sched_group's statistics for load balancing. + * @sd: The sched_domain whose statistics are to be updated. * @group: sched_group whose statistics are to be updated. * @this_cpu: Cpu for which load balance is currently performed. * @idle: Idle status of this_cpu @@ -4161,26 +4038,6 @@ ret: return NULL; } -static struct sched_group *group_of(int cpu) -{ - struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd); - - if (!sd) - return NULL; - - return sd->groups; -} - -static unsigned long power_of(int cpu) -{ - struct sched_group *group = group_of(cpu); - - if (!group) - return SCHED_LOAD_SCALE; - - return group->cpu_power; -} - /* * find_busiest_queue - find the busiest runqueue among the cpus in group. */ @@ -5239,17 +5096,16 @@ void account_idle_time(cputime_t cputime) */ void account_process_tick(struct task_struct *p, int user_tick) { - cputime_t one_jiffy = jiffies_to_cputime(1); - cputime_t one_jiffy_scaled = cputime_to_scaled(one_jiffy); + cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); struct rq *rq = this_rq(); if (user_tick) - account_user_time(p, one_jiffy, one_jiffy_scaled); + account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) - account_system_time(p, HARDIRQ_OFFSET, one_jiffy, + account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy, one_jiffy_scaled); else - account_idle_time(one_jiffy); + account_idle_time(cputime_one_jiffy); } /* @@ -5353,7 +5209,7 @@ void scheduler_tick(void) curr->sched_class->task_tick(rq, curr, 0); spin_unlock(&rq->lock); - perf_counter_task_tick(curr, cpu); + perf_event_task_tick(curr, cpu); #ifdef CONFIG_SMP rq->idle_at_tick = idle_cpu(cpu); @@ -5465,14 +5321,13 @@ static inline void schedule_debug(struct task_struct *prev) #endif } -static void put_prev_task(struct rq *rq, struct task_struct *prev) +static void put_prev_task(struct rq *rq, struct task_struct *p) { - if (prev->state == TASK_RUNNING) { - u64 runtime = prev->se.sum_exec_runtime; + u64 runtime = p->se.sum_exec_runtime - p->se.prev_sum_exec_runtime; - runtime -= prev->se.prev_sum_exec_runtime; - runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost); + update_avg(&p->se.avg_running, runtime); + if (p->state == TASK_RUNNING) { /* * In order to avoid avg_overlap growing stale when we are * indeed overlapping and hence not getting put to sleep, grow @@ -5482,9 +5337,12 @@ static void put_prev_task(struct rq *rq, struct task_struct *prev) * correlates to the amount of cache footprint a task can * build up. */ - update_avg(&prev->se.avg_overlap, runtime); + runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost); + update_avg(&p->se.avg_overlap, runtime); + } else { + update_avg(&p->se.avg_running, 0); } - prev->sched_class->put_prev_task(rq, prev); + p->sched_class->put_prev_task(rq, p); } /* @@ -5567,7 +5425,7 @@ need_resched_nonpreemptible: if (likely(prev != next)) { sched_info_switch(prev, next); - perf_counter_task_sched_out(prev, next, cpu); + perf_event_task_sched_out(prev, next, cpu); rq->nr_switches++; rq->curr = next; @@ -5716,10 +5574,10 @@ asmlinkage void __sched preempt_schedule_irq(void) #endif /* CONFIG_PREEMPT */ -int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, +int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, void *key) { - return try_to_wake_up(curr->private, mode, sync); + return try_to_wake_up(curr->private, mode, wake_flags); } EXPORT_SYMBOL(default_wake_function); @@ -5733,14 +5591,14 @@ EXPORT_SYMBOL(default_wake_function); * zero in this (rare) case, and we handle it by continuing to scan the queue. */ static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, - int nr_exclusive, int sync, void *key) + int nr_exclusive, int wake_flags, void *key) { wait_queue_t *curr, *next; list_for_each_entry_safe(curr, next, &q->task_list, task_list) { unsigned flags = curr->flags; - if (curr->func(curr, mode, sync, key) && + if (curr->func(curr, mode, wake_flags, key) && (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) break; } @@ -5801,16 +5659,16 @@ void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, int nr_exclusive, void *key) { unsigned long flags; - int sync = 1; + int wake_flags = WF_SYNC; if (unlikely(!q)) return; if (unlikely(!nr_exclusive)) - sync = 0; + wake_flags = 0; spin_lock_irqsave(&q->lock, flags); - __wake_up_common(q, mode, nr_exclusive, sync, key); + __wake_up_common(q, mode, nr_exclusive, wake_flags, key); spin_unlock_irqrestore(&q->lock, flags); } EXPORT_SYMBOL_GPL(__wake_up_sync_key); @@ -6866,9 +6724,6 @@ EXPORT_SYMBOL(yield); /* * This task is about to go to sleep on IO. Increment rq->nr_iowait so * that process accounting knows that this is a task in IO wait state. - * - * But don't do that if it is a deliberate, throttling IO wait (this task - * has set its backing_dev_info: the queue against which it should throttle) */ void __sched io_schedule(void) { @@ -6977,23 +6832,8 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, if (retval) goto out_unlock; - /* - * Time slice is 0 for SCHED_FIFO tasks and for SCHED_OTHER - * tasks that are on an otherwise idle runqueue: - */ - time_slice = 0; - if (p->policy == SCHED_RR) { - time_slice = DEF_TIMESLICE; - } else if (p->policy != SCHED_FIFO) { - struct sched_entity *se = &p->se; - unsigned long flags; - struct rq *rq; + time_slice = p->sched_class->get_rr_interval(p); - rq = task_rq_lock(p, &flags); - if (rq->cfs.load.weight) - time_slice = NS_TO_JIFFIES(sched_slice(&rq->cfs, se)); - task_rq_unlock(rq, &flags); - } read_unlock(&tasklist_lock); jiffies_to_timespec(time_slice, &t); retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; @@ -7844,7 +7684,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) /* * Register at high priority so that task migration (migrate_all_tasks) * happens before everything else. This has to be lower priority than - * the notifier in the perf_counter subsystem, though. + * the notifier in the perf_event subsystem, though. */ static struct notifier_block __cpuinitdata migration_notifier = { .notifier_call = migration_call, @@ -8000,9 +7840,7 @@ static int sd_degenerate(struct sched_domain *sd) } /* Following flags don't use groups */ - if (sd->flags & (SD_WAKE_IDLE | - SD_WAKE_AFFINE | - SD_WAKE_BALANCE)) + if (sd->flags & (SD_WAKE_AFFINE)) return 0; return 1; @@ -8019,10 +7857,6 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) return 0; - /* Does parent contain flags not in child? */ - /* WAKE_BALANCE is a subset of WAKE_AFFINE */ - if (cflags & SD_WAKE_AFFINE) - pflags &= ~SD_WAKE_BALANCE; /* Flags needing groups don't count if only 1 group in parent */ if (parent->groups == parent->groups->next) { pflags &= ~(SD_LOAD_BALANCE | @@ -8708,10 +8542,10 @@ static void set_domain_attribute(struct sched_domain *sd, request = attr->relax_domain_level; if (request < sd->level) { /* turn off idle balance on this domain */ - sd->flags &= ~(SD_WAKE_IDLE|SD_BALANCE_NEWIDLE); + sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); } else { /* turn on idle balance on this domain */ - sd->flags |= (SD_WAKE_IDLE_FAR|SD_BALANCE_NEWIDLE); + sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); } } @@ -9329,6 +9163,7 @@ void __init sched_init_smp(void) cpumask_var_t non_isolated_cpus; alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); + alloc_cpumask_var(&fallback_doms, GFP_KERNEL); #if defined(CONFIG_NUMA) sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **), @@ -9360,7 +9195,6 @@ void __init sched_init_smp(void) sched_init_granularity(); free_cpumask_var(non_isolated_cpus); - alloc_cpumask_var(&fallback_doms, GFP_KERNEL); init_sched_rt_class(); } #else @@ -9707,7 +9541,7 @@ void __init sched_init(void) alloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); #endif /* SMP */ - perf_counter_init(); + perf_event_init(); scheduler_running = 1; } @@ -10479,7 +10313,7 @@ static int sched_rt_global_constraints(void) #endif /* CONFIG_RT_GROUP_SCHED */ int sched_rt_handler(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, size_t *lenp, + void __user *buffer, size_t *lenp, loff_t *ppos) { int ret; @@ -10490,7 +10324,7 @@ int sched_rt_handler(struct ctl_table *table, int write, old_period = sysctl_sched_rt_period; old_runtime = sysctl_sched_rt_runtime; - ret = proc_dointvec(table, write, filp, buffer, lenp, ppos); + ret = proc_dointvec(table, write, buffer, lenp, ppos); if (!ret && write) { ret = sched_rt_global_constraints(); @@ -10544,8 +10378,7 @@ cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) } static int -cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct task_struct *tsk) +cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk) { #ifdef CONFIG_RT_GROUP_SCHED if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk)) @@ -10555,15 +10388,45 @@ cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, if (tsk->sched_class != &fair_sched_class) return -EINVAL; #endif + return 0; +} +static int +cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, + struct task_struct *tsk, bool threadgroup) +{ + int retval = cpu_cgroup_can_attach_task(cgrp, tsk); + if (retval) + return retval; + if (threadgroup) { + struct task_struct *c; + rcu_read_lock(); + list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { + retval = cpu_cgroup_can_attach_task(cgrp, c); + if (retval) { + rcu_read_unlock(); + return retval; + } + } + rcu_read_unlock(); + } return 0; } static void cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup *old_cont, struct task_struct *tsk) + struct cgroup *old_cont, struct task_struct *tsk, + bool threadgroup) { sched_move_task(tsk); + if (threadgroup) { + struct task_struct *c; + rcu_read_lock(); + list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { + sched_move_task(c); + } + rcu_read_unlock(); + } } #ifdef CONFIG_FAIR_GROUP_SCHED diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index e1d16c9..479ce56 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -48,13 +48,6 @@ static __read_mostly int sched_clock_running; __read_mostly int sched_clock_stable; struct sched_clock_data { - /* - * Raw spinlock - this is a special case: this might be called - * from within instrumentation code so we dont want to do any - * instrumentation ourselves. - */ - raw_spinlock_t lock; - u64 tick_raw; u64 tick_gtod; u64 clock; @@ -80,7 +73,6 @@ void sched_clock_init(void) for_each_possible_cpu(cpu) { struct sched_clock_data *scd = cpu_sdc(cpu); - scd->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; scd->tick_raw = 0; scd->tick_gtod = ktime_now; scd->clock = ktime_now; @@ -109,14 +101,19 @@ static inline u64 wrap_max(u64 x, u64 y) * - filter out backward motion * - use the GTOD tick value to create a window to filter crazy TSC values */ -static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now) +static u64 sched_clock_local(struct sched_clock_data *scd) { - s64 delta = now - scd->tick_raw; - u64 clock, min_clock, max_clock; + u64 now, clock, old_clock, min_clock, max_clock; + s64 delta; +again: + now = sched_clock(); + delta = now - scd->tick_raw; if (unlikely(delta < 0)) delta = 0; + old_clock = scd->clock; + /* * scd->clock = clamp(scd->tick_gtod + delta, * max(scd->tick_gtod, scd->clock), @@ -124,84 +121,73 @@ static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now) */ clock = scd->tick_gtod + delta; - min_clock = wrap_max(scd->tick_gtod, scd->clock); - max_clock = wrap_max(scd->clock, scd->tick_gtod + TICK_NSEC); + min_clock = wrap_max(scd->tick_gtod, old_clock); + max_clock = wrap_max(old_clock, scd->tick_gtod + TICK_NSEC); clock = wrap_max(clock, min_clock); clock = wrap_min(clock, max_clock); - scd->clock = clock; + if (cmpxchg64(&scd->clock, old_clock, clock) != old_clock) + goto again; - return scd->clock; + return clock; } -static void lock_double_clock(struct sched_clock_data *data1, - struct sched_clock_data *data2) +static u64 sched_clock_remote(struct sched_clock_data *scd) { - if (data1 < data2) { - __raw_spin_lock(&data1->lock); - __raw_spin_lock(&data2->lock); + struct sched_clock_data *my_scd = this_scd(); + u64 this_clock, remote_clock; + u64 *ptr, old_val, val; + + sched_clock_local(my_scd); +again: + this_clock = my_scd->clock; + remote_clock = scd->clock; + + /* + * Use the opportunity that we have both locks + * taken to couple the two clocks: we take the + * larger time as the latest time for both + * runqueues. (this creates monotonic movement) + */ + if (likely((s64)(remote_clock - this_clock) < 0)) { + ptr = &scd->clock; + old_val = remote_clock; + val = this_clock; } else { - __raw_spin_lock(&data2->lock); - __raw_spin_lock(&data1->lock); + /* + * Should be rare, but possible: + */ + ptr = &my_scd->clock; + old_val = this_clock; + val = remote_clock; } + + if (cmpxchg64(ptr, old_val, val) != old_val) + goto again; + + return val; } u64 sched_clock_cpu(int cpu) { - u64 now, clock, this_clock, remote_clock; struct sched_clock_data *scd; + u64 clock; + + WARN_ON_ONCE(!irqs_disabled()); if (sched_clock_stable) return sched_clock(); - scd = cpu_sdc(cpu); - - /* - * Normally this is not called in NMI context - but if it is, - * trying to do any locking here is totally lethal. - */ - if (unlikely(in_nmi())) - return scd->clock; - if (unlikely(!sched_clock_running)) return 0ull; - WARN_ON_ONCE(!irqs_disabled()); - now = sched_clock(); - - if (cpu != raw_smp_processor_id()) { - struct sched_clock_data *my_scd = this_scd(); - - lock_double_clock(scd, my_scd); - - this_clock = __update_sched_clock(my_scd, now); - remote_clock = scd->clock; - - /* - * Use the opportunity that we have both locks - * taken to couple the two clocks: we take the - * larger time as the latest time for both - * runqueues. (this creates monotonic movement) - */ - if (likely((s64)(remote_clock - this_clock) < 0)) { - clock = this_clock; - scd->clock = clock; - } else { - /* - * Should be rare, but possible: - */ - clock = remote_clock; - my_scd->clock = remote_clock; - } - - __raw_spin_unlock(&my_scd->lock); - } else { - __raw_spin_lock(&scd->lock); - clock = __update_sched_clock(scd, now); - } + scd = cpu_sdc(cpu); - __raw_spin_unlock(&scd->lock); + if (cpu != smp_processor_id()) + clock = sched_clock_remote(scd); + else + clock = sched_clock_local(scd); return clock; } @@ -223,11 +209,9 @@ void sched_clock_tick(void) now_gtod = ktime_to_ns(ktime_get()); now = sched_clock(); - __raw_spin_lock(&scd->lock); scd->tick_raw = now; scd->tick_gtod = now_gtod; - __update_sched_clock(scd, now); - __raw_spin_unlock(&scd->lock); + sched_clock_local(scd); } /* diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 5ddbd08..efb8440 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -395,6 +395,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) PN(se.sum_exec_runtime); PN(se.avg_overlap); PN(se.avg_wakeup); + PN(se.avg_running); nr_switches = p->nvcsw + p->nivcsw; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index aa7f841..4e777b4 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -384,10 +384,10 @@ static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) #ifdef CONFIG_SCHED_DEBUG int sched_nr_latency_handler(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, size_t *lenp, + void __user *buffer, size_t *lenp, loff_t *ppos) { - int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); + int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (ret || !write) return ret; @@ -513,6 +513,7 @@ static void update_curr(struct cfs_rq *cfs_rq) if (entity_is_task(curr)) { struct task_struct *curtask = task_of(curr); + trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime); cpuacct_charge(curtask, delta_exec); account_group_exec_runtime(curtask, delta_exec); } @@ -709,24 +710,28 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) if (initial && sched_feat(START_DEBIT)) vruntime += sched_vslice(cfs_rq, se); - if (!initial) { - /* sleeps upto a single latency don't count. */ - if (sched_feat(NEW_FAIR_SLEEPERS)) { - unsigned long thresh = sysctl_sched_latency; + /* sleeps up to a single latency don't count. */ + if (!initial && sched_feat(FAIR_SLEEPERS)) { + unsigned long thresh = sysctl_sched_latency; - /* - * Convert the sleeper threshold into virtual time. - * SCHED_IDLE is a special sub-class. We care about - * fairness only relative to other SCHED_IDLE tasks, - * all of which have the same weight. - */ - if (sched_feat(NORMALIZED_SLEEPER) && - (!entity_is_task(se) || - task_of(se)->policy != SCHED_IDLE)) - thresh = calc_delta_fair(thresh, se); + /* + * Convert the sleeper threshold into virtual time. + * SCHED_IDLE is a special sub-class. We care about + * fairness only relative to other SCHED_IDLE tasks, + * all of which have the same weight. + */ + if (sched_feat(NORMALIZED_SLEEPER) && (!entity_is_task(se) || + task_of(se)->policy != SCHED_IDLE)) + thresh = calc_delta_fair(thresh, se); - vruntime -= thresh; - } + /* + * Halve their sleep time's effect, to allow + * for a gentler effect of sleepers: + */ + if (sched_feat(GENTLE_FAIR_SLEEPERS)) + thresh >>= 1; + + vruntime -= thresh; } /* ensure we never gain time by being placed backwards. */ @@ -757,10 +762,10 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) { - if (cfs_rq->last == se) + if (!se || cfs_rq->last == se) cfs_rq->last = NULL; - if (cfs_rq->next == se) + if (!se || cfs_rq->next == se) cfs_rq->next = NULL; } @@ -1062,83 +1067,6 @@ static void yield_task_fair(struct rq *rq) se->vruntime = rightmost->vruntime + 1; } -/* - * wake_idle() will wake a task on an idle cpu if task->cpu is - * not idle and an idle cpu is available. The span of cpus to - * search starts with cpus closest then further out as needed, - * so we always favor a closer, idle cpu. - * Domains may include CPUs that are not usable for migration, - * hence we need to mask them out (rq->rd->online) - * - * Returns the CPU we should wake onto. - */ -#if defined(ARCH_HAS_SCHED_WAKE_IDLE) - -#define cpu_rd_active(cpu, rq) cpumask_test_cpu(cpu, rq->rd->online) - -static int wake_idle(int cpu, struct task_struct *p) -{ - struct sched_domain *sd; - int i; - unsigned int chosen_wakeup_cpu; - int this_cpu; - struct rq *task_rq = task_rq(p); - - /* - * At POWERSAVINGS_BALANCE_WAKEUP level, if both this_cpu and prev_cpu - * are idle and this is not a kernel thread and this task's affinity - * allows it to be moved to preferred cpu, then just move! - */ - - this_cpu = smp_processor_id(); - chosen_wakeup_cpu = - cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu; - - if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP && - idle_cpu(cpu) && idle_cpu(this_cpu) && - p->mm && !(p->flags & PF_KTHREAD) && - cpu_isset(chosen_wakeup_cpu, p->cpus_allowed)) - return chosen_wakeup_cpu; - - /* - * If it is idle, then it is the best cpu to run this task. - * - * This cpu is also the best, if it has more than one task already. - * Siblings must be also busy(in most cases) as they didn't already - * pickup the extra load from this cpu and hence we need not check - * sibling runqueue info. This will avoid the checks and cache miss - * penalities associated with that. - */ - if (idle_cpu(cpu) || cpu_rq(cpu)->cfs.nr_running > 1) - return cpu; - - for_each_domain(cpu, sd) { - if ((sd->flags & SD_WAKE_IDLE) - || ((sd->flags & SD_WAKE_IDLE_FAR) - && !task_hot(p, task_rq->clock, sd))) { - for_each_cpu_and(i, sched_domain_span(sd), - &p->cpus_allowed) { - if (cpu_rd_active(i, task_rq) && idle_cpu(i)) { - if (i != task_cpu(p)) { - schedstat_inc(p, - se.nr_wakeups_idle); - } - return i; - } - } - } else { - break; - } - } - return cpu; -} -#else /* !ARCH_HAS_SCHED_WAKE_IDLE*/ -static inline int wake_idle(int cpu, struct task_struct *p) -{ - return cpu; -} -#endif - #ifdef CONFIG_SMP #ifdef CONFIG_FAIR_GROUP_SCHED @@ -1225,25 +1153,34 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu, #endif -static int -wake_affine(struct sched_domain *this_sd, struct rq *this_rq, - struct task_struct *p, int prev_cpu, int this_cpu, int sync, - int idx, unsigned long load, unsigned long this_load, - unsigned int imbalance) +static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) { - struct task_struct *curr = this_rq->curr; - struct task_group *tg; - unsigned long tl = this_load; + struct task_struct *curr = current; + unsigned long this_load, load; + int idx, this_cpu, prev_cpu; unsigned long tl_per_task; + unsigned int imbalance; + struct task_group *tg; unsigned long weight; int balanced; - if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS)) - return 0; + idx = sd->wake_idx; + this_cpu = smp_processor_id(); + prev_cpu = task_cpu(p); + load = source_load(prev_cpu, idx); + this_load = target_load(this_cpu, idx); - if (sync && (curr->se.avg_overlap > sysctl_sched_migration_cost || - p->se.avg_overlap > sysctl_sched_migration_cost)) - sync = 0; + if (sync) { + if (sched_feat(SYNC_LESS) && + (curr->se.avg_overlap > sysctl_sched_migration_cost || + p->se.avg_overlap > sysctl_sched_migration_cost)) + sync = 0; + } else { + if (sched_feat(SYNC_MORE) && + (curr->se.avg_overlap < sysctl_sched_migration_cost && + p->se.avg_overlap < sysctl_sched_migration_cost)) + sync = 1; + } /* * If sync wakeup then subtract the (maximum possible) @@ -1254,24 +1191,26 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq, tg = task_group(current); weight = current->se.load.weight; - tl += effective_load(tg, this_cpu, -weight, -weight); + this_load += effective_load(tg, this_cpu, -weight, -weight); load += effective_load(tg, prev_cpu, 0, -weight); } tg = task_group(p); weight = p->se.load.weight; + imbalance = 100 + (sd->imbalance_pct - 100) / 2; + /* * In low-load situations, where prev_cpu is idle and this_cpu is idle - * due to the sync cause above having dropped tl to 0, we'll always have - * an imbalance, but there's really nothing you can do about that, so - * that's good too. + * due to the sync cause above having dropped this_load to 0, we'll + * always have an imbalance, but there's really nothing you can do + * about that, so that's good too. * * Otherwise check if either cpus are near enough in load to allow this * task to be woken on this_cpu. */ - balanced = !tl || - 100*(tl + effective_load(tg, this_cpu, weight, weight)) <= + balanced = !this_load || + 100*(this_load + effective_load(tg, this_cpu, weight, weight)) <= imbalance*(load + effective_load(tg, prev_cpu, 0, weight)); /* @@ -1285,14 +1224,15 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq, schedstat_inc(p, se.nr_wakeups_affine_attempts); tl_per_task = cpu_avg_load_per_task(this_cpu); - if (balanced || (tl <= load && tl + target_load(prev_cpu, idx) <= - tl_per_task)) { + if (balanced || + (this_load <= load && + this_load + target_load(prev_cpu, idx) <= tl_per_task)) { /* * This domain has SD_WAKE_AFFINE and * p is cache cold in this domain, and * there is no bad imbalance. */ - schedstat_inc(this_sd, ttwu_move_affine); + schedstat_inc(sd, ttwu_move_affine); schedstat_inc(p, se.nr_wakeups_affine); return 1; @@ -1300,65 +1240,216 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq, return 0; } -static int select_task_rq_fair(struct task_struct *p, int sync) +/* + * find_idlest_group finds and returns the least busy CPU group within the + * domain. + */ +static struct sched_group * +find_idlest_group(struct sched_domain *sd, struct task_struct *p, + int this_cpu, int load_idx) { - struct sched_domain *sd, *this_sd = NULL; - int prev_cpu, this_cpu, new_cpu; - unsigned long load, this_load; - struct rq *this_rq; - unsigned int imbalance; - int idx; + struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups; + unsigned long min_load = ULONG_MAX, this_load = 0; + int imbalance = 100 + (sd->imbalance_pct-100)/2; - prev_cpu = task_cpu(p); - this_cpu = smp_processor_id(); - this_rq = cpu_rq(this_cpu); - new_cpu = prev_cpu; + do { + unsigned long load, avg_load; + int local_group; + int i; - /* - * 'this_sd' is the first domain that both - * this_cpu and prev_cpu are present in: - */ - for_each_domain(this_cpu, sd) { - if (cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) { - this_sd = sd; - break; + /* Skip over this group if it has no CPUs allowed */ + if (!cpumask_intersects(sched_group_cpus(group), + &p->cpus_allowed)) + continue; + + local_group = cpumask_test_cpu(this_cpu, + sched_group_cpus(group)); + + /* Tally up the load of all CPUs in the group */ + avg_load = 0; + + for_each_cpu(i, sched_group_cpus(group)) { + /* Bias balancing toward cpus of our domain */ + if (local_group) + load = source_load(i, load_idx); + else + load = target_load(i, load_idx); + + avg_load += load; + } + + /* Adjust by relative CPU power of the group */ + avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; + + if (local_group) { + this_load = avg_load; + this = group; + } else if (avg_load < min_load) { + min_load = avg_load; + idlest = group; + } + } while (group = group->next, group != sd->groups); + + if (!idlest || 100*this_load < imbalance*min_load) + return NULL; + return idlest; +} + +/* + * find_idlest_cpu - find the idlest cpu among the cpus in group. + */ +static int +find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) +{ + unsigned long load, min_load = ULONG_MAX; + int idlest = -1; + int i; + + /* Traverse only the allowed CPUs */ + for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) { + load = weighted_cpuload(i); + + if (load < min_load || (load == min_load && i == this_cpu)) { + min_load = load; + idlest = i; } } - if (unlikely(!cpumask_test_cpu(this_cpu, &p->cpus_allowed))) - goto out; + return idlest; +} - /* - * Check for affine wakeup and passive balancing possibilities. - */ - if (!this_sd) +/* + * sched_balance_self: balance the current task (running on cpu) in domains + * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and + * SD_BALANCE_EXEC. + * + * Balance, ie. select the least loaded group. + * + * Returns the target CPU number, or the same CPU if no balancing is needed. + * + * preempt must be disabled. + */ +static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) +{ + struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; + int cpu = smp_processor_id(); + int prev_cpu = task_cpu(p); + int new_cpu = cpu; + int want_affine = 0; + int want_sd = 1; + int sync = wake_flags & WF_SYNC; + + if (sd_flag & SD_BALANCE_WAKE) { + if (sched_feat(AFFINE_WAKEUPS) && + cpumask_test_cpu(cpu, &p->cpus_allowed)) + want_affine = 1; + new_cpu = prev_cpu; + } + + rcu_read_lock(); + for_each_domain(cpu, tmp) { + /* + * If power savings logic is enabled for a domain, see if we + * are not overloaded, if so, don't balance wider. + */ + if (tmp->flags & (SD_POWERSAVINGS_BALANCE|SD_PREFER_LOCAL)) { + unsigned long power = 0; + unsigned long nr_running = 0; + unsigned long capacity; + int i; + + for_each_cpu(i, sched_domain_span(tmp)) { + power += power_of(i); + nr_running += cpu_rq(i)->cfs.nr_running; + } + + capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE); + + if (tmp->flags & SD_POWERSAVINGS_BALANCE) + nr_running /= 2; + + if (nr_running < capacity) + want_sd = 0; + } + + if (want_affine && (tmp->flags & SD_WAKE_AFFINE) && + cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) { + + affine_sd = tmp; + want_affine = 0; + } + + if (!want_sd && !want_affine) + break; + + if (!(tmp->flags & sd_flag)) + continue; + + if (want_sd) + sd = tmp; + } + + if (sched_feat(LB_SHARES_UPDATE)) { + /* + * Pick the largest domain to update shares over + */ + tmp = sd; + if (affine_sd && (!tmp || + cpumask_weight(sched_domain_span(affine_sd)) > + cpumask_weight(sched_domain_span(sd)))) + tmp = affine_sd; + + if (tmp) + update_shares(tmp); + } + + if (affine_sd && wake_affine(affine_sd, p, sync)) { + new_cpu = cpu; goto out; + } - idx = this_sd->wake_idx; + while (sd) { + int load_idx = sd->forkexec_idx; + struct sched_group *group; + int weight; - imbalance = 100 + (this_sd->imbalance_pct - 100) / 2; + if (!(sd->flags & sd_flag)) { + sd = sd->child; + continue; + } - load = source_load(prev_cpu, idx); - this_load = target_load(this_cpu, idx); + if (sd_flag & SD_BALANCE_WAKE) + load_idx = sd->wake_idx; - if (wake_affine(this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx, - load, this_load, imbalance)) - return this_cpu; + group = find_idlest_group(sd, p, cpu, load_idx); + if (!group) { + sd = sd->child; + continue; + } - /* - * Start passive balancing when half the imbalance_pct - * limit is reached. - */ - if (this_sd->flags & SD_WAKE_BALANCE) { - if (imbalance*this_load <= 100*load) { - schedstat_inc(this_sd, ttwu_move_balance); - schedstat_inc(p, se.nr_wakeups_passive); - return this_cpu; + new_cpu = find_idlest_cpu(group, p, cpu); + if (new_cpu == -1 || new_cpu == cpu) { + /* Now try balancing at a lower domain level of cpu */ + sd = sd->child; + continue; + } + + /* Now try balancing at a lower domain level of new_cpu */ + cpu = new_cpu; + weight = cpumask_weight(sched_domain_span(sd)); + sd = NULL; + for_each_domain(cpu, tmp) { + if (weight <= cpumask_weight(sched_domain_span(tmp))) + break; + if (tmp->flags & sd_flag) + sd = tmp; } + /* while loop will break here if sd == NULL */ } out: - return wake_idle(new_cpu, p); + rcu_read_unlock(); + return new_cpu; } #endif /* CONFIG_SMP */ @@ -1471,11 +1562,12 @@ static void set_next_buddy(struct sched_entity *se) /* * Preempt the current task with a newly woken task if needed: */ -static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync) +static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) { struct task_struct *curr = rq->curr; struct sched_entity *se = &curr->se, *pse = &p->se; struct cfs_rq *cfs_rq = task_cfs_rq(curr); + int sync = wake_flags & WF_SYNC; update_curr(cfs_rq); @@ -1501,7 +1593,8 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync) */ if (sched_feat(LAST_BUDDY) && likely(se->on_rq && curr != rq->idle)) set_last_buddy(se); - set_next_buddy(pse); + if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK)) + set_next_buddy(pse); /* * We can come here with TIF_NEED_RESCHED already set from new task @@ -1523,16 +1616,25 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync) return; } - if (!sched_feat(WAKEUP_PREEMPT)) - return; - - if (sched_feat(WAKEUP_OVERLAP) && (sync || - (se->avg_overlap < sysctl_sched_migration_cost && - pse->avg_overlap < sysctl_sched_migration_cost))) { + if ((sched_feat(WAKEUP_SYNC) && sync) || + (sched_feat(WAKEUP_OVERLAP) && + (se->avg_overlap < sysctl_sched_migration_cost && + pse->avg_overlap < sysctl_sched_migration_cost))) { resched_task(curr); return; } + if (sched_feat(WAKEUP_RUNNING)) { + if (pse->avg_running < se->avg_running) { + set_next_buddy(pse); + resched_task(curr); + return; + } + } + + if (!sched_feat(WAKEUP_PREEMPT)) + return; + find_matching_se(&se, &pse); BUG_ON(!pse); @@ -1555,8 +1657,13 @@ static struct task_struct *pick_next_task_fair(struct rq *rq) /* * If se was a buddy, clear it so that it will have to earn * the favour again. + * + * If se was not a buddy, clear the buddies because neither + * was elegible to run, let them earn it again. + * + * IOW. unconditionally clear buddies. */ - __clear_buddies(cfs_rq, se); + __clear_buddies(cfs_rq, NULL); set_next_entity(cfs_rq, se); cfs_rq = group_cfs_rq(se); } while (cfs_rq); @@ -1832,6 +1939,25 @@ static void moved_group_fair(struct task_struct *p) } #endif +unsigned int get_rr_interval_fair(struct task_struct *task) +{ + struct sched_entity *se = &task->se; + unsigned long flags; + struct rq *rq; + unsigned int rr_interval = 0; + + /* + * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise + * idle runqueue: + */ + rq = task_rq_lock(task, &flags); + if (rq->cfs.load.weight) + rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se)); + task_rq_unlock(rq, &flags); + + return rr_interval; +} + /* * All the scheduling class methods: */ @@ -1860,6 +1986,8 @@ static const struct sched_class fair_sched_class = { .prio_changed = prio_changed_fair, .switched_to = switched_to_fair, + .get_rr_interval = get_rr_interval_fair, + #ifdef CONFIG_FAIR_GROUP_SCHED .moved_group = moved_group_fair, #endif diff --git a/kernel/sched_features.h b/kernel/sched_features.h index e2dc63a..0d94083 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -1,17 +1,123 @@ -SCHED_FEAT(NEW_FAIR_SLEEPERS, 0) +/* + * Disregards a certain amount of sleep time (sched_latency_ns) and + * considers the task to be running during that period. This gives it + * a service deficit on wakeup, allowing it to run sooner. + */ +SCHED_FEAT(FAIR_SLEEPERS, 1) + +/* + * Only give sleepers 50% of their service deficit. This allows + * them to run sooner, but does not allow tons of sleepers to + * rip the spread apart. + */ +SCHED_FEAT(GENTLE_FAIR_SLEEPERS, 1) + +/* + * By not normalizing the sleep time, heavy tasks get an effective + * longer period, and lighter task an effective shorter period they + * are considered running. + */ SCHED_FEAT(NORMALIZED_SLEEPER, 0) -SCHED_FEAT(ADAPTIVE_GRAN, 1) -SCHED_FEAT(WAKEUP_PREEMPT, 1) + +/* + * Place new tasks ahead so that they do not starve already running + * tasks + */ SCHED_FEAT(START_DEBIT, 1) + +/* + * Should wakeups try to preempt running tasks. + */ +SCHED_FEAT(WAKEUP_PREEMPT, 1) + +/* + * Compute wakeup_gran based on task behaviour, clipped to + * [0, sched_wakeup_gran_ns] + */ +SCHED_FEAT(ADAPTIVE_GRAN, 1) + +/* + * When converting the wakeup granularity to virtual time, do it such + * that heavier tasks preempting a lighter task have an edge. + */ +SCHED_FEAT(ASYM_GRAN, 1) + +/* + * Always wakeup-preempt SYNC wakeups, see SYNC_WAKEUPS. + */ +SCHED_FEAT(WAKEUP_SYNC, 0) + +/* + * Wakeup preempt based on task behaviour. Tasks that do not overlap + * don't get preempted. + */ +SCHED_FEAT(WAKEUP_OVERLAP, 0) + +/* + * Wakeup preemption towards tasks that run short + */ +SCHED_FEAT(WAKEUP_RUNNING, 0) + +/* + * Use the SYNC wakeup hint, pipes and the likes use this to indicate + * the remote end is likely to consume the data we just wrote, and + * therefore has cache benefit from being placed on the same cpu, see + * also AFFINE_WAKEUPS. + */ +SCHED_FEAT(SYNC_WAKEUPS, 1) + +/* + * Based on load and program behaviour, see if it makes sense to place + * a newly woken task on the same cpu as the task that woke it -- + * improve cache locality. Typically used with SYNC wakeups as + * generated by pipes and the like, see also SYNC_WAKEUPS. + */ SCHED_FEAT(AFFINE_WAKEUPS, 1) + +/* + * Weaken SYNC hint based on overlap + */ +SCHED_FEAT(SYNC_LESS, 1) + +/* + * Add SYNC hint based on overlap + */ +SCHED_FEAT(SYNC_MORE, 0) + +/* + * Prefer to schedule the task we woke last (assuming it failed + * wakeup-preemption), since its likely going to consume data we + * touched, increases cache locality. + */ +SCHED_FEAT(NEXT_BUDDY, 0) + +/* + * Prefer to schedule the task that ran last (when we did + * wake-preempt) as that likely will touch the same data, increases + * cache locality. + */ +SCHED_FEAT(LAST_BUDDY, 1) + +/* + * Consider buddies to be cache hot, decreases the likelyness of a + * cache buddy being migrated away, increases cache locality. + */ SCHED_FEAT(CACHE_HOT_BUDDY, 1) -SCHED_FEAT(SYNC_WAKEUPS, 1) + +/* + * Use arch dependent cpu power functions + */ +SCHED_FEAT(ARCH_POWER, 0) + SCHED_FEAT(HRTICK, 0) SCHED_FEAT(DOUBLE_TICK, 0) -SCHED_FEAT(ASYM_GRAN, 1) SCHED_FEAT(LB_BIAS, 1) -SCHED_FEAT(LB_WAKEUP_UPDATE, 1) +SCHED_FEAT(LB_SHARES_UPDATE, 1) SCHED_FEAT(ASYM_EFF_LOAD, 1) -SCHED_FEAT(WAKEUP_OVERLAP, 0) -SCHED_FEAT(LAST_BUDDY, 1) + +/* + * Spin-wait on mutex acquisition when the mutex owner is running on + * another cpu -- assumes that when the owner is running, it will soon + * release the lock. Decreases scheduling overhead. + */ SCHED_FEAT(OWNER_SPIN, 1) diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index 499672c..b133a28 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c @@ -6,7 +6,7 @@ */ #ifdef CONFIG_SMP -static int select_task_rq_idle(struct task_struct *p, int sync) +static int select_task_rq_idle(struct task_struct *p, int sd_flag, int flags) { return task_cpu(p); /* IDLE tasks as never migrated */ } @@ -14,7 +14,7 @@ static int select_task_rq_idle(struct task_struct *p, int sync) /* * Idle tasks are unconditionally rescheduled: */ -static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int sync) +static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags) { resched_task(rq->idle); } @@ -97,6 +97,11 @@ static void prio_changed_idle(struct rq *rq, struct task_struct *p, check_preempt_curr(rq, p, 0); } +unsigned int get_rr_interval_idle(struct task_struct *task) +{ + return 0; +} + /* * Simple, special scheduling class for the per-CPU idle tasks: */ @@ -122,6 +127,8 @@ static const struct sched_class idle_sched_class = { .set_curr_task = set_curr_task_idle, .task_tick = task_tick_idle, + .get_rr_interval = get_rr_interval_idle, + .prio_changed = prio_changed_idle, .switched_to = switched_to_idle, diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 2eb4bd6..a4d790c 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -938,10 +938,13 @@ static void yield_task_rt(struct rq *rq) #ifdef CONFIG_SMP static int find_lowest_rq(struct task_struct *task); -static int select_task_rq_rt(struct task_struct *p, int sync) +static int select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) { struct rq *rq = task_rq(p); + if (sd_flag != SD_BALANCE_WAKE) + return smp_processor_id(); + /* * If the current task is an RT task, then * try to see if we can wake this RT task up on another @@ -999,7 +1002,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) /* * Preempt the current task with a newly woken task if needed: */ -static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int sync) +static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags) { if (p->prio < rq->curr->prio) { resched_task(rq->curr); @@ -1731,6 +1734,17 @@ static void set_curr_task_rt(struct rq *rq) dequeue_pushable_task(rq, p); } +unsigned int get_rr_interval_rt(struct task_struct *task) +{ + /* + * Time slice is 0 for SCHED_FIFO tasks + */ + if (task->policy == SCHED_RR) + return DEF_TIMESLICE; + else + return 0; +} + static const struct sched_class rt_sched_class = { .next = &fair_sched_class, .enqueue_task = enqueue_task_rt, @@ -1759,6 +1773,8 @@ static const struct sched_class rt_sched_class = { .set_curr_task = set_curr_task_rt, .task_tick = task_tick_rt, + .get_rr_interval = get_rr_interval_rt, + .prio_changed = prio_changed_rt, .switched_to = switched_to_rt, }; diff --git a/kernel/signal.c b/kernel/signal.c index 64c5dee..6705320 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -705,7 +705,7 @@ static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns) if (why) { /* - * The first thread which returns from finish_stop() + * The first thread which returns from do_signal_stop() * will take ->siglock, notice SIGNAL_CLD_MASK, and * notify its parent. See get_signal_to_deliver(). */ @@ -971,6 +971,20 @@ specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t) return send_signal(sig, info, t, 0); } +int do_send_sig_info(int sig, struct siginfo *info, struct task_struct *p, + bool group) +{ + unsigned long flags; + int ret = -ESRCH; + + if (lock_task_sighand(p, &flags)) { + ret = send_signal(sig, info, p, group); + unlock_task_sighand(p, &flags); + } + + return ret; +} + /* * Force a signal that the process can't ignore: if necessary * we unblock the signal and change any SIG_IGN to SIG_DFL. @@ -1036,12 +1050,6 @@ void zap_other_threads(struct task_struct *p) } } -int __fatal_signal_pending(struct task_struct *tsk) -{ - return sigismember(&tsk->pending.signal, SIGKILL); -} -EXPORT_SYMBOL(__fatal_signal_pending); - struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long *flags) { struct sighand_struct *sighand; @@ -1068,18 +1076,10 @@ struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long */ int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) { - unsigned long flags; - int ret; + int ret = check_kill_permission(sig, info, p); - ret = check_kill_permission(sig, info, p); - - if (!ret && sig) { - ret = -ESRCH; - if (lock_task_sighand(p, &flags)) { - ret = __group_send_sig_info(sig, info, p); - unlock_task_sighand(p, &flags); - } - } + if (!ret && sig) + ret = do_send_sig_info(sig, info, p, true); return ret; } @@ -1224,15 +1224,9 @@ static int kill_something_info(int sig, struct siginfo *info, pid_t pid) * These are for backward compatibility with the rest of the kernel source. */ -/* - * The caller must ensure the task can't exit. - */ int send_sig_info(int sig, struct siginfo *info, struct task_struct *p) { - int ret; - unsigned long flags; - /* * Make sure legacy kernel users don't send in bad values * (normal paths check this in check_kill_permission). @@ -1240,10 +1234,7 @@ send_sig_info(int sig, struct siginfo *info, struct task_struct *p) if (!valid_signal(sig)) return -EINVAL; - spin_lock_irqsave(&p->sighand->siglock, flags); - ret = specific_send_sig_info(sig, info, p); - spin_unlock_irqrestore(&p->sighand->siglock, flags); - return ret; + return do_send_sig_info(sig, info, p, false); } #define __si_special(priv) \ @@ -1383,15 +1374,6 @@ ret: } /* - * Wake up any threads in the parent blocked in wait* syscalls. - */ -static inline void __wake_up_parent(struct task_struct *p, - struct task_struct *parent) -{ - wake_up_interruptible_sync(&parent->signal->wait_chldexit); -} - -/* * Let a parent know about the death of a child. * For a stopped/continued status change, use do_notify_parent_cldstop instead. * @@ -1673,29 +1655,6 @@ void ptrace_notify(int exit_code) spin_unlock_irq(¤t->sighand->siglock); } -static void -finish_stop(int stop_count) -{ - /* - * If there are no other threads in the group, or if there is - * a group stop in progress and we are the last to stop, - * report to the parent. When ptraced, every thread reports itself. - */ - if (tracehook_notify_jctl(stop_count == 0, CLD_STOPPED)) { - read_lock(&tasklist_lock); - do_notify_parent_cldstop(current, CLD_STOPPED); - read_unlock(&tasklist_lock); - } - - do { - schedule(); - } while (try_to_freeze()); - /* - * Now we don't run again until continued. - */ - current->exit_code = 0; -} - /* * This performs the stopping for SIGSTOP and other stop signals. * We have to stop all threads in the thread group. @@ -1705,15 +1664,9 @@ finish_stop(int stop_count) static int do_signal_stop(int signr) { struct signal_struct *sig = current->signal; - int stop_count; + int notify; - if (sig->group_stop_count > 0) { - /* - * There is a group stop in progress. We don't need to - * start another one. - */ - stop_count = --sig->group_stop_count; - } else { + if (!sig->group_stop_count) { struct task_struct *t; if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED) || @@ -1725,7 +1678,7 @@ static int do_signal_stop(int signr) */ sig->group_exit_code = signr; - stop_count = 0; + sig->group_stop_count = 1; for (t = next_thread(current); t != current; t = next_thread(t)) /* * Setting state to TASK_STOPPED for a group @@ -1734,19 +1687,44 @@ static int do_signal_stop(int signr) */ if (!(t->flags & PF_EXITING) && !task_is_stopped_or_traced(t)) { - stop_count++; + sig->group_stop_count++; signal_wake_up(t, 0); } - sig->group_stop_count = stop_count; } + /* + * If there are no other threads in the group, or if there is + * a group stop in progress and we are the last to stop, report + * to the parent. When ptraced, every thread reports itself. + */ + notify = sig->group_stop_count == 1 ? CLD_STOPPED : 0; + notify = tracehook_notify_jctl(notify, CLD_STOPPED); + /* + * tracehook_notify_jctl() can drop and reacquire siglock, so + * we keep ->group_stop_count != 0 before the call. If SIGCONT + * or SIGKILL comes in between ->group_stop_count == 0. + */ + if (sig->group_stop_count) { + if (!--sig->group_stop_count) + sig->flags = SIGNAL_STOP_STOPPED; + current->exit_code = sig->group_exit_code; + __set_current_state(TASK_STOPPED); + } + spin_unlock_irq(¤t->sighand->siglock); - if (stop_count == 0) - sig->flags = SIGNAL_STOP_STOPPED; - current->exit_code = sig->group_exit_code; - __set_current_state(TASK_STOPPED); + if (notify) { + read_lock(&tasklist_lock); + do_notify_parent_cldstop(current, notify); + read_unlock(&tasklist_lock); + } + + /* Now we don't run again until woken by SIGCONT or SIGKILL */ + do { + schedule(); + } while (try_to_freeze()); + + tracehook_finish_jctl(); + current->exit_code = 0; - spin_unlock_irq(¤t->sighand->siglock); - finish_stop(stop_count); return 1; } @@ -1815,14 +1793,15 @@ relock: int why = (signal->flags & SIGNAL_STOP_CONTINUED) ? CLD_CONTINUED : CLD_STOPPED; signal->flags &= ~SIGNAL_CLD_MASK; - spin_unlock_irq(&sighand->siglock); - if (unlikely(!tracehook_notify_jctl(1, why))) - goto relock; + why = tracehook_notify_jctl(why, CLD_CONTINUED); + spin_unlock_irq(&sighand->siglock); - read_lock(&tasklist_lock); - do_notify_parent_cldstop(current->group_leader, why); - read_unlock(&tasklist_lock); + if (why) { + read_lock(&tasklist_lock); + do_notify_parent_cldstop(current->group_leader, why); + read_unlock(&tasklist_lock); + } goto relock; } @@ -1987,14 +1966,14 @@ void exit_signals(struct task_struct *tsk) if (unlikely(tsk->signal->group_stop_count) && !--tsk->signal->group_stop_count) { tsk->signal->flags = SIGNAL_STOP_STOPPED; - group_stop = 1; + group_stop = tracehook_notify_jctl(CLD_STOPPED, CLD_STOPPED); } out: spin_unlock_irq(&tsk->sighand->siglock); - if (unlikely(group_stop) && tracehook_notify_jctl(1, CLD_STOPPED)) { + if (unlikely(group_stop)) { read_lock(&tasklist_lock); - do_notify_parent_cldstop(tsk, CLD_STOPPED); + do_notify_parent_cldstop(tsk, group_stop); read_unlock(&tasklist_lock); } } @@ -2290,7 +2269,6 @@ static int do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info) { struct task_struct *p; - unsigned long flags; int error = -ESRCH; rcu_read_lock(); @@ -2300,14 +2278,16 @@ do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info) /* * The null signal is a permissions and process existence * probe. No signal is actually delivered. - * - * If lock_task_sighand() fails we pretend the task dies - * after receiving the signal. The window is tiny, and the - * signal is private anyway. */ - if (!error && sig && lock_task_sighand(p, &flags)) { - error = specific_send_sig_info(sig, info, p); - unlock_task_sighand(p, &flags); + if (!error && sig) { + error = do_send_sig_info(sig, info, p, false); + /* + * If lock_task_sighand() failed we pretend the task + * dies after receiving the signal. The window is tiny, + * and the signal is private anyway. + */ + if (unlikely(error == -ESRCH)) + error = 0; } } rcu_read_unlock(); diff --git a/kernel/slow-work.c b/kernel/slow-work.c index 09d7519..0d31135 100644 --- a/kernel/slow-work.c +++ b/kernel/slow-work.c @@ -26,10 +26,10 @@ static void slow_work_cull_timeout(unsigned long); static void slow_work_oom_timeout(unsigned long); #ifdef CONFIG_SYSCTL -static int slow_work_min_threads_sysctl(struct ctl_table *, int, struct file *, +static int slow_work_min_threads_sysctl(struct ctl_table *, int, void __user *, size_t *, loff_t *); -static int slow_work_max_threads_sysctl(struct ctl_table *, int , struct file *, +static int slow_work_max_threads_sysctl(struct ctl_table *, int , void __user *, size_t *, loff_t *); #endif @@ -493,10 +493,10 @@ static void slow_work_oom_timeout(unsigned long data) * Handle adjustment of the minimum number of threads */ static int slow_work_min_threads_sysctl(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, + void __user *buffer, size_t *lenp, loff_t *ppos) { - int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); + int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); int n; if (ret == 0) { @@ -521,10 +521,10 @@ static int slow_work_min_threads_sysctl(struct ctl_table *table, int write, * Handle adjustment of the maximum number of threads */ static int slow_work_max_threads_sysctl(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, + void __user *buffer, size_t *lenp, loff_t *ppos) { - int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); + int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); int n; if (ret == 0) { diff --git a/kernel/smp.c b/kernel/smp.c index 94188b8..c9d1c78 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -29,8 +29,7 @@ enum { struct call_function_data { struct call_single_data csd; - spinlock_t lock; - unsigned int refs; + atomic_t refs; cpumask_var_t cpumask; }; @@ -39,9 +38,7 @@ struct call_single_queue { spinlock_t lock; }; -static DEFINE_PER_CPU(struct call_function_data, cfd_data) = { - .lock = __SPIN_LOCK_UNLOCKED(cfd_data.lock), -}; +static DEFINE_PER_CPU(struct call_function_data, cfd_data); static int hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) @@ -177,6 +174,11 @@ void generic_smp_call_function_interrupt(void) int cpu = get_cpu(); /* + * Shouldn't receive this interrupt on a cpu that is not yet online. + */ + WARN_ON_ONCE(!cpu_online(cpu)); + + /* * Ensure entry is visible on call_function_queue after we have * entered the IPI. See comment in smp_call_function_many. * If we don't have this, then we may miss an entry on the list @@ -191,25 +193,18 @@ void generic_smp_call_function_interrupt(void) list_for_each_entry_rcu(data, &call_function.queue, csd.list) { int refs; - spin_lock(&data->lock); - if (!cpumask_test_cpu(cpu, data->cpumask)) { - spin_unlock(&data->lock); + if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) continue; - } - cpumask_clear_cpu(cpu, data->cpumask); - spin_unlock(&data->lock); data->csd.func(data->csd.info); - spin_lock(&data->lock); - WARN_ON(data->refs == 0); - refs = --data->refs; + refs = atomic_dec_return(&data->refs); + WARN_ON(refs < 0); if (!refs) { spin_lock(&call_function.lock); list_del_rcu(&data->csd.list); spin_unlock(&call_function.lock); } - spin_unlock(&data->lock); if (refs) continue; @@ -230,6 +225,11 @@ void generic_smp_call_function_single_interrupt(void) unsigned int data_flags; LIST_HEAD(list); + /* + * Shouldn't receive this interrupt on a cpu that is not yet online. + */ + WARN_ON_ONCE(!cpu_online(smp_processor_id())); + spin_lock(&q->lock); list_replace_init(&q->list, &list); spin_unlock(&q->lock); @@ -285,8 +285,14 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, */ this_cpu = get_cpu(); - /* Can deadlock when called with interrupts disabled */ - WARN_ON_ONCE(irqs_disabled() && !oops_in_progress); + /* + * Can deadlock when called with interrupts disabled. + * We allow cpu's that are not yet online though, as no one else can + * send smp call function interrupt to this cpu and as such deadlocks + * can't happen. + */ + WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled() + && !oops_in_progress); if (cpu == this_cpu) { local_irq_save(flags); @@ -329,19 +335,18 @@ void __smp_call_function_single(int cpu, struct call_single_data *data, { csd_lock(data); - /* Can deadlock when called with interrupts disabled */ - WARN_ON_ONCE(wait && irqs_disabled() && !oops_in_progress); + /* + * Can deadlock when called with interrupts disabled. + * We allow cpu's that are not yet online though, as no one else can + * send smp call function interrupt to this cpu and as such deadlocks + * can't happen. + */ + WARN_ON_ONCE(cpu_online(smp_processor_id()) && wait && irqs_disabled() + && !oops_in_progress); generic_exec_single(cpu, data, wait); } -/* Deprecated: shim for archs using old arch_send_call_function_ipi API. */ - -#ifndef arch_send_call_function_ipi_mask -# define arch_send_call_function_ipi_mask(maskp) \ - arch_send_call_function_ipi(*(maskp)) -#endif - /** * smp_call_function_many(): Run a function on a set of other CPUs. * @mask: The set of cpus to run on (only runs on online subset). @@ -365,8 +370,14 @@ void smp_call_function_many(const struct cpumask *mask, unsigned long flags; int cpu, next_cpu, this_cpu = smp_processor_id(); - /* Can deadlock when called with interrupts disabled */ - WARN_ON_ONCE(irqs_disabled() && !oops_in_progress); + /* + * Can deadlock when called with interrupts disabled. + * We allow cpu's that are not yet online though, as no one else can + * send smp call function interrupt to this cpu and as such deadlocks + * can't happen. + */ + WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled() + && !oops_in_progress); /* So, what's a CPU they want? Ignoring this one. */ cpu = cpumask_first_and(mask, cpu_online_mask); @@ -391,23 +402,20 @@ void smp_call_function_many(const struct cpumask *mask, data = &__get_cpu_var(cfd_data); csd_lock(&data->csd); - spin_lock_irqsave(&data->lock, flags); data->csd.func = func; data->csd.info = info; cpumask_and(data->cpumask, mask, cpu_online_mask); cpumask_clear_cpu(this_cpu, data->cpumask); - data->refs = cpumask_weight(data->cpumask); + atomic_set(&data->refs, cpumask_weight(data->cpumask)); - spin_lock(&call_function.lock); + spin_lock_irqsave(&call_function.lock, flags); /* * Place entry at the _HEAD_ of the list, so that any cpu still * observing the entry in generic_smp_call_function_interrupt() * will not miss any other list entries: */ list_add_rcu(&data->csd.list, &call_function.queue); - spin_unlock(&call_function.lock); - - spin_unlock_irqrestore(&data->lock, flags); + spin_unlock_irqrestore(&call_function.lock, flags); /* * Make the list addition visible before sending the ipi. diff --git a/kernel/softirq.c b/kernel/softirq.c index 7db2506..f8749e5 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -57,7 +57,7 @@ static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp static DEFINE_PER_CPU(struct task_struct *, ksoftirqd); char *softirq_to_name[NR_SOFTIRQS] = { - "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", + "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL", "TASKLET", "SCHED", "HRTIMER", "RCU" }; diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 88796c3..81324d1 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -90,11 +90,11 @@ void touch_all_softlockup_watchdogs(void) EXPORT_SYMBOL(touch_all_softlockup_watchdogs); int proc_dosoftlockup_thresh(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, + void __user *buffer, size_t *lenp, loff_t *ppos) { touch_all_softlockup_watchdogs(); - return proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); + return proc_dointvec_minmax(table, write, buffer, lenp, ppos); } /* diff --git a/kernel/sys.c b/kernel/sys.c index b3f1097..255475d 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -14,7 +14,7 @@ #include <linux/prctl.h> #include <linux/highuid.h> #include <linux/fs.h> -#include <linux/perf_counter.h> +#include <linux/perf_event.h> #include <linux/resource.h> #include <linux/kernel.h> #include <linux/kexec.h> @@ -1338,6 +1338,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) unsigned long flags; cputime_t utime, stime; struct task_cputime cputime; + unsigned long maxrss = 0; memset((char *) r, 0, sizeof *r); utime = stime = cputime_zero; @@ -1346,6 +1347,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) utime = task_utime(current); stime = task_stime(current); accumulate_thread_rusage(p, r); + maxrss = p->signal->maxrss; goto out; } @@ -1363,6 +1365,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) r->ru_majflt = p->signal->cmaj_flt; r->ru_inblock = p->signal->cinblock; r->ru_oublock = p->signal->coublock; + maxrss = p->signal->cmaxrss; if (who == RUSAGE_CHILDREN) break; @@ -1377,6 +1380,8 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) r->ru_majflt += p->signal->maj_flt; r->ru_inblock += p->signal->inblock; r->ru_oublock += p->signal->oublock; + if (maxrss < p->signal->maxrss) + maxrss = p->signal->maxrss; t = p; do { accumulate_thread_rusage(t, r); @@ -1392,6 +1397,15 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) out: cputime_to_timeval(utime, &r->ru_utime); cputime_to_timeval(stime, &r->ru_stime); + + if (who != RUSAGE_CHILDREN) { + struct mm_struct *mm = get_task_mm(p); + if (mm) { + setmax_mm_hiwater_rss(&maxrss, mm); + mmput(mm); + } + } + r->ru_maxrss = maxrss * (PAGE_SIZE / 1024); /* convert pages to KBs */ } int getrusage(struct task_struct *p, int who, struct rusage __user *ru) @@ -1511,11 +1525,11 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, case PR_SET_TSC: error = SET_TSC_CTL(arg2); break; - case PR_TASK_PERF_COUNTERS_DISABLE: - error = perf_counter_task_disable(); + case PR_TASK_PERF_EVENTS_DISABLE: + error = perf_event_task_disable(); break; - case PR_TASK_PERF_COUNTERS_ENABLE: - error = perf_counter_task_enable(); + case PR_TASK_PERF_EVENTS_ENABLE: + error = perf_event_task_enable(); break; case PR_GET_TIMERSLACK: error = current->timer_slack_ns; @@ -1528,6 +1542,28 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, current->timer_slack_ns = arg2; error = 0; break; + case PR_MCE_KILL: + if (arg4 | arg5) + return -EINVAL; + switch (arg2) { + case 0: + if (arg3 != 0) + return -EINVAL; + current->flags &= ~PF_MCE_PROCESS; + break; + case 1: + current->flags |= PF_MCE_PROCESS; + if (arg3 != 0) + current->flags |= PF_MCE_EARLY; + else + current->flags &= ~PF_MCE_EARLY; + break; + default: + return -EINVAL; + } + error = 0; + break; + default: error = -EINVAL; break; diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 68320f6..e06d0b8 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -49,6 +49,7 @@ cond_syscall(sys_sendmsg); cond_syscall(compat_sys_sendmsg); cond_syscall(sys_recvmsg); cond_syscall(compat_sys_recvmsg); +cond_syscall(compat_sys_recvfrom); cond_syscall(sys_socketcall); cond_syscall(sys_futex); cond_syscall(compat_sys_futex); @@ -177,4 +178,4 @@ cond_syscall(sys_eventfd); cond_syscall(sys_eventfd2); /* performance counters: */ -cond_syscall(sys_perf_counter_open); +cond_syscall(sys_perf_event_open); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 6bb59f7..0d949c5 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -26,7 +26,6 @@ #include <linux/proc_fs.h> #include <linux/security.h> #include <linux/ctype.h> -#include <linux/utsname.h> #include <linux/kmemcheck.h> #include <linux/smp_lock.h> #include <linux/fs.h> @@ -50,7 +49,7 @@ #include <linux/reboot.h> #include <linux/ftrace.h> #include <linux/slow-work.h> -#include <linux/perf_counter.h> +#include <linux/perf_event.h> #include <asm/uaccess.h> #include <asm/processor.h> @@ -77,6 +76,7 @@ extern int max_threads; extern int core_uses_pid; extern int suid_dumpable; extern char core_pattern[]; +extern unsigned int core_pipe_limit; extern int pid_max; extern int min_free_kbytes; extern int pid_max_min, pid_max_max; @@ -91,7 +91,9 @@ extern int sysctl_nr_trim_pages; #ifdef CONFIG_RCU_TORTURE_TEST extern int rcutorture_runnable; #endif /* #ifdef CONFIG_RCU_TORTURE_TEST */ +#ifdef CONFIG_BLOCK extern int blk_iopoll_enabled; +#endif /* Constants used for minimum and maximum */ #ifdef CONFIG_DETECT_SOFTLOCKUP @@ -104,6 +106,9 @@ static int __maybe_unused one = 1; static int __maybe_unused two = 2; static unsigned long one_ul = 1; static int one_hundred = 100; +#ifdef CONFIG_PRINTK +static int ten_thousand = 10000; +#endif /* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */ static unsigned long dirty_bytes_min = 2 * PAGE_SIZE; @@ -158,9 +163,9 @@ extern int max_lock_depth; #endif #ifdef CONFIG_PROC_SYSCTL -static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp, +static int proc_do_cad_pid(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); -static int proc_taint(struct ctl_table *table, int write, struct file *filp, +static int proc_taint(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); #endif @@ -419,6 +424,14 @@ static struct ctl_table kern_table[] = { .proc_handler = &proc_dostring, .strategy = &sysctl_string, }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "core_pipe_limit", + .data = &core_pipe_limit, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, #ifdef CONFIG_PROC_SYSCTL { .procname = "tainted", @@ -720,6 +733,17 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "printk_delay", + .data = &printk_delay_msec, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &zero, + .extra2 = &ten_thousand, + }, #endif { .ctl_name = KERN_NGROUPS_MAX, @@ -962,28 +986,28 @@ static struct ctl_table kern_table[] = { .child = slow_work_sysctls, }, #endif -#ifdef CONFIG_PERF_COUNTERS +#ifdef CONFIG_PERF_EVENTS { .ctl_name = CTL_UNNUMBERED, - .procname = "perf_counter_paranoid", - .data = &sysctl_perf_counter_paranoid, - .maxlen = sizeof(sysctl_perf_counter_paranoid), + .procname = "perf_event_paranoid", + .data = &sysctl_perf_event_paranoid, + .maxlen = sizeof(sysctl_perf_event_paranoid), .mode = 0644, .proc_handler = &proc_dointvec, }, { .ctl_name = CTL_UNNUMBERED, - .procname = "perf_counter_mlock_kb", - .data = &sysctl_perf_counter_mlock, - .maxlen = sizeof(sysctl_perf_counter_mlock), + .procname = "perf_event_mlock_kb", + .data = &sysctl_perf_event_mlock, + .maxlen = sizeof(sysctl_perf_event_mlock), .mode = 0644, .proc_handler = &proc_dointvec, }, { .ctl_name = CTL_UNNUMBERED, - .procname = "perf_counter_max_sample_rate", - .data = &sysctl_perf_counter_sample_rate, - .maxlen = sizeof(sysctl_perf_counter_sample_rate), + .procname = "perf_event_max_sample_rate", + .data = &sysctl_perf_event_sample_rate, + .maxlen = sizeof(sysctl_perf_event_sample_rate), .mode = 0644, .proc_handler = &proc_dointvec, }, @@ -998,6 +1022,7 @@ static struct ctl_table kern_table[] = { .proc_handler = &proc_dointvec, }, #endif +#ifdef CONFIG_BLOCK { .ctl_name = CTL_UNNUMBERED, .procname = "blk_iopoll", @@ -1006,6 +1031,7 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, +#endif /* * NOTE: do not add new entries to this table unless you have read * Documentation/sysctl/ctl_unnumbered.txt @@ -1372,6 +1398,31 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = &scan_unevictable_handler, }, +#ifdef CONFIG_MEMORY_FAILURE + { + .ctl_name = CTL_UNNUMBERED, + .procname = "memory_failure_early_kill", + .data = &sysctl_memory_failure_early_kill, + .maxlen = sizeof(sysctl_memory_failure_early_kill), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &zero, + .extra2 = &one, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "memory_failure_recovery", + .data = &sysctl_memory_failure_recovery, + .maxlen = sizeof(sysctl_memory_failure_recovery), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &zero, + .extra2 = &one, + }, +#endif + /* * NOTE: do not add new entries to this table unless you have read * Documentation/sysctl/ctl_unnumbered.txt @@ -2200,7 +2251,7 @@ void sysctl_head_put(struct ctl_table_header *head) #ifdef CONFIG_PROC_SYSCTL static int _proc_do_string(void* data, int maxlen, int write, - struct file *filp, void __user *buffer, + void __user *buffer, size_t *lenp, loff_t *ppos) { size_t len; @@ -2261,7 +2312,6 @@ static int _proc_do_string(void* data, int maxlen, int write, * proc_dostring - read a string sysctl * @table: the sysctl table * @write: %TRUE if this is a write to the sysctl file - * @filp: the file structure * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position @@ -2275,10 +2325,10 @@ static int _proc_do_string(void* data, int maxlen, int write, * * Returns 0 on success. */ -int proc_dostring(struct ctl_table *table, int write, struct file *filp, +int proc_dostring(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - return _proc_do_string(table->data, table->maxlen, write, filp, + return _proc_do_string(table->data, table->maxlen, write, buffer, lenp, ppos); } @@ -2303,7 +2353,7 @@ static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp, } static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, - int write, struct file *filp, void __user *buffer, + int write, void __user *buffer, size_t *lenp, loff_t *ppos, int (*conv)(int *negp, unsigned long *lvalp, int *valp, int write, void *data), @@ -2410,13 +2460,13 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, #undef TMPBUFLEN } -static int do_proc_dointvec(struct ctl_table *table, int write, struct file *filp, +static int do_proc_dointvec(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos, int (*conv)(int *negp, unsigned long *lvalp, int *valp, int write, void *data), void *data) { - return __do_proc_dointvec(table->data, table, write, filp, + return __do_proc_dointvec(table->data, table, write, buffer, lenp, ppos, conv, data); } @@ -2424,7 +2474,6 @@ static int do_proc_dointvec(struct ctl_table *table, int write, struct file *fil * proc_dointvec - read a vector of integers * @table: the sysctl table * @write: %TRUE if this is a write to the sysctl file - * @filp: the file structure * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position @@ -2434,10 +2483,10 @@ static int do_proc_dointvec(struct ctl_table *table, int write, struct file *fil * * Returns 0 on success. */ -int proc_dointvec(struct ctl_table *table, int write, struct file *filp, +int proc_dointvec(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, + return do_proc_dointvec(table,write,buffer,lenp,ppos, NULL,NULL); } @@ -2445,7 +2494,7 @@ int proc_dointvec(struct ctl_table *table, int write, struct file *filp, * Taint values can only be increased * This means we can safely use a temporary. */ -static int proc_taint(struct ctl_table *table, int write, struct file *filp, +static int proc_taint(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table t; @@ -2457,7 +2506,7 @@ static int proc_taint(struct ctl_table *table, int write, struct file *filp, t = *table; t.data = &tmptaint; - err = proc_doulongvec_minmax(&t, write, filp, buffer, lenp, ppos); + err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos); if (err < 0) return err; @@ -2509,7 +2558,6 @@ static int do_proc_dointvec_minmax_conv(int *negp, unsigned long *lvalp, * proc_dointvec_minmax - read a vector of integers with min/max values * @table: the sysctl table * @write: %TRUE if this is a write to the sysctl file - * @filp: the file structure * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position @@ -2522,19 +2570,18 @@ static int do_proc_dointvec_minmax_conv(int *negp, unsigned long *lvalp, * * Returns 0 on success. */ -int proc_dointvec_minmax(struct ctl_table *table, int write, struct file *filp, +int proc_dointvec_minmax(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { struct do_proc_dointvec_minmax_conv_param param = { .min = (int *) table->extra1, .max = (int *) table->extra2, }; - return do_proc_dointvec(table, write, filp, buffer, lenp, ppos, + return do_proc_dointvec(table, write, buffer, lenp, ppos, do_proc_dointvec_minmax_conv, ¶m); } static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write, - struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos, unsigned long convmul, @@ -2639,21 +2686,19 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int } static int do_proc_doulongvec_minmax(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos, unsigned long convmul, unsigned long convdiv) { return __do_proc_doulongvec_minmax(table->data, table, write, - filp, buffer, lenp, ppos, convmul, convdiv); + buffer, lenp, ppos, convmul, convdiv); } /** * proc_doulongvec_minmax - read a vector of long integers with min/max values * @table: the sysctl table * @write: %TRUE if this is a write to the sysctl file - * @filp: the file structure * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position @@ -2666,17 +2711,16 @@ static int do_proc_doulongvec_minmax(struct ctl_table *table, int write, * * Returns 0 on success. */ -int proc_doulongvec_minmax(struct ctl_table *table, int write, struct file *filp, +int proc_doulongvec_minmax(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - return do_proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos, 1l, 1l); + return do_proc_doulongvec_minmax(table, write, buffer, lenp, ppos, 1l, 1l); } /** * proc_doulongvec_ms_jiffies_minmax - read a vector of millisecond values with min/max values * @table: the sysctl table * @write: %TRUE if this is a write to the sysctl file - * @filp: the file structure * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position @@ -2691,11 +2735,10 @@ int proc_doulongvec_minmax(struct ctl_table *table, int write, struct file *filp * Returns 0 on success. */ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { - return do_proc_doulongvec_minmax(table, write, filp, buffer, + return do_proc_doulongvec_minmax(table, write, buffer, lenp, ppos, HZ, 1000l); } @@ -2771,7 +2814,6 @@ static int do_proc_dointvec_ms_jiffies_conv(int *negp, unsigned long *lvalp, * proc_dointvec_jiffies - read a vector of integers as seconds * @table: the sysctl table * @write: %TRUE if this is a write to the sysctl file - * @filp: the file structure * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position @@ -2783,10 +2825,10 @@ static int do_proc_dointvec_ms_jiffies_conv(int *negp, unsigned long *lvalp, * * Returns 0 on success. */ -int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp, +int proc_dointvec_jiffies(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, + return do_proc_dointvec(table,write,buffer,lenp,ppos, do_proc_dointvec_jiffies_conv,NULL); } @@ -2794,7 +2836,6 @@ int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp, * proc_dointvec_userhz_jiffies - read a vector of integers as 1/USER_HZ seconds * @table: the sysctl table * @write: %TRUE if this is a write to the sysctl file - * @filp: the file structure * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: pointer to the file position @@ -2806,10 +2847,10 @@ int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp, * * Returns 0 on success. */ -int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file *filp, +int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, + return do_proc_dointvec(table,write,buffer,lenp,ppos, do_proc_dointvec_userhz_jiffies_conv,NULL); } @@ -2817,7 +2858,6 @@ int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file * proc_dointvec_ms_jiffies - read a vector of integers as 1 milliseconds * @table: the sysctl table * @write: %TRUE if this is a write to the sysctl file - * @filp: the file structure * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position @@ -2830,14 +2870,14 @@ int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file * * Returns 0 on success. */ -int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, struct file *filp, +int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - return do_proc_dointvec(table, write, filp, buffer, lenp, ppos, + return do_proc_dointvec(table, write, buffer, lenp, ppos, do_proc_dointvec_ms_jiffies_conv, NULL); } -static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp, +static int proc_do_cad_pid(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { struct pid *new_pid; @@ -2846,7 +2886,7 @@ static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp tmp = pid_vnr(cad_pid); - r = __do_proc_dointvec(&tmp, table, write, filp, buffer, + r = __do_proc_dointvec(&tmp, table, write, buffer, lenp, ppos, NULL, NULL); if (r || !write) return r; @@ -2861,50 +2901,49 @@ static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp #else /* CONFIG_PROC_FS */ -int proc_dostring(struct ctl_table *table, int write, struct file *filp, +int proc_dostring(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } -int proc_dointvec(struct ctl_table *table, int write, struct file *filp, +int proc_dointvec(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } -int proc_dointvec_minmax(struct ctl_table *table, int write, struct file *filp, +int proc_dointvec_minmax(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } -int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp, +int proc_dointvec_jiffies(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } -int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file *filp, +int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } -int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, struct file *filp, +int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } -int proc_doulongvec_minmax(struct ctl_table *table, int write, struct file *filp, +int proc_doulongvec_minmax(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { diff --git a/kernel/time.c b/kernel/time.c index 2951194..2e2e469 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -370,13 +370,20 @@ EXPORT_SYMBOL(mktime); * 0 <= tv_nsec < NSEC_PER_SEC * For negative values only the tv_sec field is negative ! */ -void set_normalized_timespec(struct timespec *ts, time_t sec, long nsec) +void set_normalized_timespec(struct timespec *ts, time_t sec, s64 nsec) { while (nsec >= NSEC_PER_SEC) { + /* + * The following asm() prevents the compiler from + * optimising this loop into a modulo operation. See + * also __iter_div_u64_rem() in include/linux/time.h + */ + asm("" : "+rm"(nsec)); nsec -= NSEC_PER_SEC; ++sec; } while (nsec < 0) { + asm("" : "+rm"(nsec)); nsec += NSEC_PER_SEC; --sec; } diff --git a/kernel/time/Makefile b/kernel/time/Makefile index 0b0a636..ee26662 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -1,4 +1,4 @@ -obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o +obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o timeconv.o obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 7466cb8..5e18c6a 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -21,7 +21,6 @@ * * TODO WishList: * o Allow clocksource drivers to be unregistered - * o get rid of clocksource_jiffies extern */ #include <linux/clocksource.h> @@ -30,6 +29,7 @@ #include <linux/module.h> #include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */ #include <linux/tick.h> +#include <linux/kthread.h> void timecounter_init(struct timecounter *tc, const struct cyclecounter *cc, @@ -107,50 +107,35 @@ u64 timecounter_cyc2time(struct timecounter *tc, } EXPORT_SYMBOL(timecounter_cyc2time); -/* XXX - Would like a better way for initializing curr_clocksource */ -extern struct clocksource clocksource_jiffies; - /*[Clocksource internal variables]--------- * curr_clocksource: - * currently selected clocksource. Initialized to clocksource_jiffies. - * next_clocksource: - * pending next selected clocksource. + * currently selected clocksource. * clocksource_list: * linked list with the registered clocksources - * clocksource_lock: - * protects manipulations to curr_clocksource and next_clocksource - * and the clocksource_list + * clocksource_mutex: + * protects manipulations to curr_clocksource and the clocksource_list * override_name: * Name of the user-specified clocksource. */ -static struct clocksource *curr_clocksource = &clocksource_jiffies; -static struct clocksource *next_clocksource; -static struct clocksource *clocksource_override; +static struct clocksource *curr_clocksource; static LIST_HEAD(clocksource_list); -static DEFINE_SPINLOCK(clocksource_lock); +static DEFINE_MUTEX(clocksource_mutex); static char override_name[32]; static int finished_booting; -/* clocksource_done_booting - Called near the end of core bootup - * - * Hack to avoid lots of clocksource churn at boot time. - * We use fs_initcall because we want this to start before - * device_initcall but after subsys_initcall. - */ -static int __init clocksource_done_booting(void) -{ - finished_booting = 1; - return 0; -} -fs_initcall(clocksource_done_booting); - #ifdef CONFIG_CLOCKSOURCE_WATCHDOG +static void clocksource_watchdog_work(struct work_struct *work); + static LIST_HEAD(watchdog_list); static struct clocksource *watchdog; static struct timer_list watchdog_timer; +static DECLARE_WORK(watchdog_work, clocksource_watchdog_work); static DEFINE_SPINLOCK(watchdog_lock); static cycle_t watchdog_last; -static unsigned long watchdog_resumed; +static int watchdog_running; + +static int clocksource_watchdog_kthread(void *data); +static void __clocksource_change_rating(struct clocksource *cs, int rating); /* * Interval: 0.5sec Threshold: 0.0625s @@ -158,135 +143,249 @@ static unsigned long watchdog_resumed; #define WATCHDOG_INTERVAL (HZ >> 1) #define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 4) -static void clocksource_ratewd(struct clocksource *cs, int64_t delta) +static void clocksource_watchdog_work(struct work_struct *work) { - if (delta > -WATCHDOG_THRESHOLD && delta < WATCHDOG_THRESHOLD) - return; + /* + * If kthread_run fails the next watchdog scan over the + * watchdog_list will find the unstable clock again. + */ + kthread_run(clocksource_watchdog_kthread, NULL, "kwatchdog"); +} + +static void __clocksource_unstable(struct clocksource *cs) +{ + cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG); + cs->flags |= CLOCK_SOURCE_UNSTABLE; + if (finished_booting) + schedule_work(&watchdog_work); +} +static void clocksource_unstable(struct clocksource *cs, int64_t delta) +{ printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n", cs->name, delta); - cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG); - clocksource_change_rating(cs, 0); - list_del(&cs->wd_list); + __clocksource_unstable(cs); +} + +/** + * clocksource_mark_unstable - mark clocksource unstable via watchdog + * @cs: clocksource to be marked unstable + * + * This function is called instead of clocksource_change_rating from + * cpu hotplug code to avoid a deadlock between the clocksource mutex + * and the cpu hotplug mutex. It defers the update of the clocksource + * to the watchdog thread. + */ +void clocksource_mark_unstable(struct clocksource *cs) +{ + unsigned long flags; + + spin_lock_irqsave(&watchdog_lock, flags); + if (!(cs->flags & CLOCK_SOURCE_UNSTABLE)) { + if (list_empty(&cs->wd_list)) + list_add(&cs->wd_list, &watchdog_list); + __clocksource_unstable(cs); + } + spin_unlock_irqrestore(&watchdog_lock, flags); } static void clocksource_watchdog(unsigned long data) { - struct clocksource *cs, *tmp; + struct clocksource *cs; cycle_t csnow, wdnow; int64_t wd_nsec, cs_nsec; - int resumed; + int next_cpu; spin_lock(&watchdog_lock); - - resumed = test_and_clear_bit(0, &watchdog_resumed); + if (!watchdog_running) + goto out; wdnow = watchdog->read(watchdog); - wd_nsec = cyc2ns(watchdog, (wdnow - watchdog_last) & watchdog->mask); + wd_nsec = clocksource_cyc2ns((wdnow - watchdog_last) & watchdog->mask, + watchdog->mult, watchdog->shift); watchdog_last = wdnow; - list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) { - csnow = cs->read(cs); + list_for_each_entry(cs, &watchdog_list, wd_list) { - if (unlikely(resumed)) { - cs->wd_last = csnow; + /* Clocksource already marked unstable? */ + if (cs->flags & CLOCK_SOURCE_UNSTABLE) { + if (finished_booting) + schedule_work(&watchdog_work); continue; } - /* Initialized ? */ + csnow = cs->read(cs); + + /* Clocksource initialized ? */ if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) { - if ((cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) && - (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) { - cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; - /* - * We just marked the clocksource as - * highres-capable, notify the rest of the - * system as well so that we transition - * into high-res mode: - */ - tick_clock_notify(); - } cs->flags |= CLOCK_SOURCE_WATCHDOG; cs->wd_last = csnow; - } else { - cs_nsec = cyc2ns(cs, (csnow - cs->wd_last) & cs->mask); - cs->wd_last = csnow; - /* Check the delta. Might remove from the list ! */ - clocksource_ratewd(cs, cs_nsec - wd_nsec); + continue; } - } - if (!list_empty(&watchdog_list)) { - /* - * Cycle through CPUs to check if the CPUs stay - * synchronized to each other. - */ - int next_cpu = cpumask_next(raw_smp_processor_id(), - cpu_online_mask); + /* Check the deviation from the watchdog clocksource. */ + cs_nsec = clocksource_cyc2ns((csnow - cs->wd_last) & + cs->mask, cs->mult, cs->shift); + cs->wd_last = csnow; + if (abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) { + clocksource_unstable(cs, cs_nsec - wd_nsec); + continue; + } - if (next_cpu >= nr_cpu_ids) - next_cpu = cpumask_first(cpu_online_mask); - watchdog_timer.expires += WATCHDOG_INTERVAL; - add_timer_on(&watchdog_timer, next_cpu); + if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && + (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) && + (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) { + cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; + /* + * We just marked the clocksource as highres-capable, + * notify the rest of the system as well so that we + * transition into high-res mode: + */ + tick_clock_notify(); + } } + + /* + * Cycle through CPUs to check if the CPUs stay synchronized + * to each other. + */ + next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask); + if (next_cpu >= nr_cpu_ids) + next_cpu = cpumask_first(cpu_online_mask); + watchdog_timer.expires += WATCHDOG_INTERVAL; + add_timer_on(&watchdog_timer, next_cpu); +out: spin_unlock(&watchdog_lock); } + +static inline void clocksource_start_watchdog(void) +{ + if (watchdog_running || !watchdog || list_empty(&watchdog_list)) + return; + init_timer(&watchdog_timer); + watchdog_timer.function = clocksource_watchdog; + watchdog_last = watchdog->read(watchdog); + watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; + add_timer_on(&watchdog_timer, cpumask_first(cpu_online_mask)); + watchdog_running = 1; +} + +static inline void clocksource_stop_watchdog(void) +{ + if (!watchdog_running || (watchdog && !list_empty(&watchdog_list))) + return; + del_timer(&watchdog_timer); + watchdog_running = 0; +} + +static inline void clocksource_reset_watchdog(void) +{ + struct clocksource *cs; + + list_for_each_entry(cs, &watchdog_list, wd_list) + cs->flags &= ~CLOCK_SOURCE_WATCHDOG; +} + static void clocksource_resume_watchdog(void) { - set_bit(0, &watchdog_resumed); + unsigned long flags; + + spin_lock_irqsave(&watchdog_lock, flags); + clocksource_reset_watchdog(); + spin_unlock_irqrestore(&watchdog_lock, flags); } -static void clocksource_check_watchdog(struct clocksource *cs) +static void clocksource_enqueue_watchdog(struct clocksource *cs) { - struct clocksource *cse; unsigned long flags; spin_lock_irqsave(&watchdog_lock, flags); if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) { - int started = !list_empty(&watchdog_list); - + /* cs is a clocksource to be watched. */ list_add(&cs->wd_list, &watchdog_list); - if (!started && watchdog) { - watchdog_last = watchdog->read(watchdog); - watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; - add_timer_on(&watchdog_timer, - cpumask_first(cpu_online_mask)); - } + cs->flags &= ~CLOCK_SOURCE_WATCHDOG; } else { + /* cs is a watchdog. */ if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; - + /* Pick the best watchdog. */ if (!watchdog || cs->rating > watchdog->rating) { - if (watchdog) - del_timer(&watchdog_timer); watchdog = cs; - init_timer(&watchdog_timer); - watchdog_timer.function = clocksource_watchdog; - /* Reset watchdog cycles */ - list_for_each_entry(cse, &watchdog_list, wd_list) - cse->flags &= ~CLOCK_SOURCE_WATCHDOG; - /* Start if list is not empty */ - if (!list_empty(&watchdog_list)) { - watchdog_last = watchdog->read(watchdog); - watchdog_timer.expires = - jiffies + WATCHDOG_INTERVAL; - add_timer_on(&watchdog_timer, - cpumask_first(cpu_online_mask)); - } + clocksource_reset_watchdog(); + } + } + /* Check if the watchdog timer needs to be started. */ + clocksource_start_watchdog(); + spin_unlock_irqrestore(&watchdog_lock, flags); +} + +static void clocksource_dequeue_watchdog(struct clocksource *cs) +{ + struct clocksource *tmp; + unsigned long flags; + + spin_lock_irqsave(&watchdog_lock, flags); + if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) { + /* cs is a watched clocksource. */ + list_del_init(&cs->wd_list); + } else if (cs == watchdog) { + /* Reset watchdog cycles */ + clocksource_reset_watchdog(); + /* Current watchdog is removed. Find an alternative. */ + watchdog = NULL; + list_for_each_entry(tmp, &clocksource_list, list) { + if (tmp == cs || tmp->flags & CLOCK_SOURCE_MUST_VERIFY) + continue; + if (!watchdog || tmp->rating > watchdog->rating) + watchdog = tmp; } } + cs->flags &= ~CLOCK_SOURCE_WATCHDOG; + /* Check if the watchdog timer needs to be stopped. */ + clocksource_stop_watchdog(); spin_unlock_irqrestore(&watchdog_lock, flags); } -#else -static void clocksource_check_watchdog(struct clocksource *cs) + +static int clocksource_watchdog_kthread(void *data) +{ + struct clocksource *cs, *tmp; + unsigned long flags; + LIST_HEAD(unstable); + + mutex_lock(&clocksource_mutex); + spin_lock_irqsave(&watchdog_lock, flags); + list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) + if (cs->flags & CLOCK_SOURCE_UNSTABLE) { + list_del_init(&cs->wd_list); + list_add(&cs->wd_list, &unstable); + } + /* Check if the watchdog timer needs to be stopped. */ + clocksource_stop_watchdog(); + spin_unlock_irqrestore(&watchdog_lock, flags); + + /* Needs to be done outside of watchdog lock */ + list_for_each_entry_safe(cs, tmp, &unstable, wd_list) { + list_del_init(&cs->wd_list); + __clocksource_change_rating(cs, 0); + } + mutex_unlock(&clocksource_mutex); + return 0; +} + +#else /* CONFIG_CLOCKSOURCE_WATCHDOG */ + +static void clocksource_enqueue_watchdog(struct clocksource *cs) { if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; } +static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { } static inline void clocksource_resume_watchdog(void) { } -#endif +static inline int clocksource_watchdog_kthread(void *data) { return 0; } + +#endif /* CONFIG_CLOCKSOURCE_WATCHDOG */ /** * clocksource_resume - resume the clocksource(s) @@ -294,18 +393,12 @@ static inline void clocksource_resume_watchdog(void) { } void clocksource_resume(void) { struct clocksource *cs; - unsigned long flags; - spin_lock_irqsave(&clocksource_lock, flags); - - list_for_each_entry(cs, &clocksource_list, list) { + list_for_each_entry(cs, &clocksource_list, list) if (cs->resume) cs->resume(); - } clocksource_resume_watchdog(); - - spin_unlock_irqrestore(&clocksource_lock, flags); } /** @@ -320,75 +413,94 @@ void clocksource_touch_watchdog(void) clocksource_resume_watchdog(); } +#ifdef CONFIG_GENERIC_TIME + /** - * clocksource_get_next - Returns the selected clocksource + * clocksource_select - Select the best clocksource available * + * Private function. Must hold clocksource_mutex when called. + * + * Select the clocksource with the best rating, or the clocksource, + * which is selected by userspace override. */ -struct clocksource *clocksource_get_next(void) +static void clocksource_select(void) { - unsigned long flags; + struct clocksource *best, *cs; - spin_lock_irqsave(&clocksource_lock, flags); - if (next_clocksource && finished_booting) { - curr_clocksource = next_clocksource; - next_clocksource = NULL; + if (!finished_booting || list_empty(&clocksource_list)) + return; + /* First clocksource on the list has the best rating. */ + best = list_first_entry(&clocksource_list, struct clocksource, list); + /* Check for the override clocksource. */ + list_for_each_entry(cs, &clocksource_list, list) { + if (strcmp(cs->name, override_name) != 0) + continue; + /* + * Check to make sure we don't switch to a non-highres + * capable clocksource if the tick code is in oneshot + * mode (highres or nohz) + */ + if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && + tick_oneshot_mode_active()) { + /* Override clocksource cannot be used. */ + printk(KERN_WARNING "Override clocksource %s is not " + "HRT compatible. Cannot switch while in " + "HRT/NOHZ mode\n", cs->name); + override_name[0] = 0; + } else + /* Override clocksource can be used. */ + best = cs; + break; + } + if (curr_clocksource != best) { + printk(KERN_INFO "Switching to clocksource %s\n", best->name); + curr_clocksource = best; + timekeeping_notify(curr_clocksource); } - spin_unlock_irqrestore(&clocksource_lock, flags); - - return curr_clocksource; } -/** - * select_clocksource - Selects the best registered clocksource. - * - * Private function. Must hold clocksource_lock when called. +#else /* CONFIG_GENERIC_TIME */ + +static inline void clocksource_select(void) { } + +#endif + +/* + * clocksource_done_booting - Called near the end of core bootup * - * Select the clocksource with the best rating, or the clocksource, - * which is selected by userspace override. + * Hack to avoid lots of clocksource churn at boot time. + * We use fs_initcall because we want this to start before + * device_initcall but after subsys_initcall. */ -static struct clocksource *select_clocksource(void) +static int __init clocksource_done_booting(void) { - struct clocksource *next; - - if (list_empty(&clocksource_list)) - return NULL; - - if (clocksource_override) - next = clocksource_override; - else - next = list_entry(clocksource_list.next, struct clocksource, - list); + finished_booting = 1; - if (next == curr_clocksource) - return NULL; + /* + * Run the watchdog first to eliminate unstable clock sources + */ + clocksource_watchdog_kthread(NULL); - return next; + mutex_lock(&clocksource_mutex); + clocksource_select(); + mutex_unlock(&clocksource_mutex); + return 0; } +fs_initcall(clocksource_done_booting); /* * Enqueue the clocksource sorted by rating */ -static int clocksource_enqueue(struct clocksource *c) +static void clocksource_enqueue(struct clocksource *cs) { - struct list_head *tmp, *entry = &clocksource_list; + struct list_head *entry = &clocksource_list; + struct clocksource *tmp; - list_for_each(tmp, &clocksource_list) { - struct clocksource *cs; - - cs = list_entry(tmp, struct clocksource, list); - if (cs == c) - return -EBUSY; + list_for_each_entry(tmp, &clocksource_list, list) /* Keep track of the place, where to insert */ - if (cs->rating >= c->rating) - entry = tmp; - } - list_add(&c->list, entry); - - if (strlen(c->name) == strlen(override_name) && - !strcmp(c->name, override_name)) - clocksource_override = c; - - return 0; + if (tmp->rating >= cs->rating) + entry = &tmp->list; + list_add(&cs->list, entry); } /** @@ -397,52 +509,48 @@ static int clocksource_enqueue(struct clocksource *c) * * Returns -EBUSY if registration fails, zero otherwise. */ -int clocksource_register(struct clocksource *c) +int clocksource_register(struct clocksource *cs) { - unsigned long flags; - int ret; - - spin_lock_irqsave(&clocksource_lock, flags); - ret = clocksource_enqueue(c); - if (!ret) - next_clocksource = select_clocksource(); - spin_unlock_irqrestore(&clocksource_lock, flags); - if (!ret) - clocksource_check_watchdog(c); - return ret; + mutex_lock(&clocksource_mutex); + clocksource_enqueue(cs); + clocksource_select(); + clocksource_enqueue_watchdog(cs); + mutex_unlock(&clocksource_mutex); + return 0; } EXPORT_SYMBOL(clocksource_register); +static void __clocksource_change_rating(struct clocksource *cs, int rating) +{ + list_del(&cs->list); + cs->rating = rating; + clocksource_enqueue(cs); + clocksource_select(); +} + /** * clocksource_change_rating - Change the rating of a registered clocksource - * */ void clocksource_change_rating(struct clocksource *cs, int rating) { - unsigned long flags; - - spin_lock_irqsave(&clocksource_lock, flags); - list_del(&cs->list); - cs->rating = rating; - clocksource_enqueue(cs); - next_clocksource = select_clocksource(); - spin_unlock_irqrestore(&clocksource_lock, flags); + mutex_lock(&clocksource_mutex); + __clocksource_change_rating(cs, rating); + mutex_unlock(&clocksource_mutex); } +EXPORT_SYMBOL(clocksource_change_rating); /** * clocksource_unregister - remove a registered clocksource */ void clocksource_unregister(struct clocksource *cs) { - unsigned long flags; - - spin_lock_irqsave(&clocksource_lock, flags); + mutex_lock(&clocksource_mutex); + clocksource_dequeue_watchdog(cs); list_del(&cs->list); - if (clocksource_override == cs) - clocksource_override = NULL; - next_clocksource = select_clocksource(); - spin_unlock_irqrestore(&clocksource_lock, flags); + clocksource_select(); + mutex_unlock(&clocksource_mutex); } +EXPORT_SYMBOL(clocksource_unregister); #ifdef CONFIG_SYSFS /** @@ -458,9 +566,9 @@ sysfs_show_current_clocksources(struct sys_device *dev, { ssize_t count = 0; - spin_lock_irq(&clocksource_lock); + mutex_lock(&clocksource_mutex); count = snprintf(buf, PAGE_SIZE, "%s\n", curr_clocksource->name); - spin_unlock_irq(&clocksource_lock); + mutex_unlock(&clocksource_mutex); return count; } @@ -478,9 +586,7 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev, struct sysdev_attribute *attr, const char *buf, size_t count) { - struct clocksource *ovr = NULL; size_t ret = count; - int len; /* strings from sysfs write are not 0 terminated! */ if (count >= sizeof(override_name)) @@ -490,44 +596,14 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev, if (buf[count-1] == '\n') count--; - spin_lock_irq(&clocksource_lock); + mutex_lock(&clocksource_mutex); if (count > 0) memcpy(override_name, buf, count); override_name[count] = 0; + clocksource_select(); - len = strlen(override_name); - if (len) { - struct clocksource *cs; - - ovr = clocksource_override; - /* try to select it: */ - list_for_each_entry(cs, &clocksource_list, list) { - if (strlen(cs->name) == len && - !strcmp(cs->name, override_name)) - ovr = cs; - } - } - - /* - * Check to make sure we don't switch to a non-highres capable - * clocksource if the tick code is in oneshot mode (highres or nohz) - */ - if (tick_oneshot_mode_active() && ovr && - !(ovr->flags & CLOCK_SOURCE_VALID_FOR_HRES)) { - printk(KERN_WARNING "%s clocksource is not HRT compatible. " - "Cannot switch while in HRT/NOHZ mode\n", ovr->name); - ovr = NULL; - override_name[0] = 0; - } - - /* Reselect, when the override name has changed */ - if (ovr != clocksource_override) { - clocksource_override = ovr; - next_clocksource = select_clocksource(); - } - - spin_unlock_irq(&clocksource_lock); + mutex_unlock(&clocksource_mutex); return ret; } @@ -547,7 +623,7 @@ sysfs_show_available_clocksources(struct sys_device *dev, struct clocksource *src; ssize_t count = 0; - spin_lock_irq(&clocksource_lock); + mutex_lock(&clocksource_mutex); list_for_each_entry(src, &clocksource_list, list) { /* * Don't show non-HRES clocksource if the tick code is @@ -559,7 +635,7 @@ sysfs_show_available_clocksources(struct sys_device *dev, max((ssize_t)PAGE_SIZE - count, (ssize_t)0), "%s ", src->name); } - spin_unlock_irq(&clocksource_lock); + mutex_unlock(&clocksource_mutex); count += snprintf(buf + count, max((ssize_t)PAGE_SIZE - count, (ssize_t)0), "\n"); @@ -614,11 +690,10 @@ device_initcall(init_clocksource_sysfs); */ static int __init boot_override_clocksource(char* str) { - unsigned long flags; - spin_lock_irqsave(&clocksource_lock, flags); + mutex_lock(&clocksource_mutex); if (str) strlcpy(override_name, str, sizeof(override_name)); - spin_unlock_irqrestore(&clocksource_lock, flags); + mutex_unlock(&clocksource_mutex); return 1; } diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index c3f6c30..5404a84 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -61,7 +61,6 @@ struct clocksource clocksource_jiffies = { .read = jiffies_read, .mask = 0xffffffff, /*32bits*/ .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */ - .mult_orig = NSEC_PER_JIFFY << JIFFIES_SHIFT, .shift = JIFFIES_SHIFT, }; @@ -71,3 +70,8 @@ static int __init init_jiffies_clocksource(void) } core_initcall(init_jiffies_clocksource); + +struct clocksource * __init __weak clocksource_default_clock(void) +{ + return &clocksource_jiffies; +} diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 7fc6437..4800f93 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -194,8 +194,7 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer) case TIME_OK: break; case TIME_INS: - xtime.tv_sec--; - wall_to_monotonic.tv_sec++; + timekeeping_leap_insert(-1); time_state = TIME_OOP; printk(KERN_NOTICE "Clock: inserting leap second 23:59:60 UTC\n"); @@ -203,9 +202,8 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer) res = HRTIMER_RESTART; break; case TIME_DEL: - xtime.tv_sec++; + timekeeping_leap_insert(1); time_tai--; - wall_to_monotonic.tv_sec--; time_state = TIME_WAIT; printk(KERN_NOTICE "Clock: deleting leap second 23:59:59 UTC\n"); @@ -219,7 +217,6 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer) time_state = TIME_OK; break; } - update_vsyscall(&xtime, clock); write_sequnlock(&xtime_lock); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index e0f59a2..89aed59 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -231,6 +231,13 @@ void tick_nohz_stop_sched_tick(int inidle) if (!inidle && !ts->inidle) goto end; + /* + * Set ts->inidle unconditionally. Even if the system did not + * switch to NOHZ mode the cpu frequency governers rely on the + * update of the idle time accounting in tick_nohz_start_idle(). + */ + ts->inidle = 1; + now = tick_nohz_start_idle(ts); /* @@ -248,8 +255,6 @@ void tick_nohz_stop_sched_tick(int inidle) if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) goto end; - ts->inidle = 1; - if (need_resched()) goto end; diff --git a/kernel/time/timeconv.c b/kernel/time/timeconv.c new file mode 100644 index 0000000..86628e7 --- /dev/null +++ b/kernel/time/timeconv.c @@ -0,0 +1,127 @@ +/* + * Copyright (C) 1993, 1994, 1995, 1996, 1997 Free Software Foundation, Inc. + * This file is part of the GNU C Library. + * Contributed by Paul Eggert (eggert@twinsun.com). + * + * The GNU C Library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * The GNU C Library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public + * License along with the GNU C Library; see the file COPYING.LIB. If not, + * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + */ + +/* + * Converts the calendar time to broken-down time representation + * Based on code from glibc-2.6 + * + * 2009-7-14: + * Moved from glibc-2.6 to kernel by Zhaolei<zhaolei@cn.fujitsu.com> + */ + +#include <linux/time.h> +#include <linux/module.h> + +/* + * Nonzero if YEAR is a leap year (every 4 years, + * except every 100th isn't, and every 400th is). + */ +static int __isleap(long year) +{ + return (year) % 4 == 0 && ((year) % 100 != 0 || (year) % 400 == 0); +} + +/* do a mathdiv for long type */ +static long math_div(long a, long b) +{ + return a / b - (a % b < 0); +} + +/* How many leap years between y1 and y2, y1 must less or equal to y2 */ +static long leaps_between(long y1, long y2) +{ + long leaps1 = math_div(y1 - 1, 4) - math_div(y1 - 1, 100) + + math_div(y1 - 1, 400); + long leaps2 = math_div(y2 - 1, 4) - math_div(y2 - 1, 100) + + math_div(y2 - 1, 400); + return leaps2 - leaps1; +} + +/* How many days come before each month (0-12). */ +static const unsigned short __mon_yday[2][13] = { + /* Normal years. */ + {0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 365}, + /* Leap years. */ + {0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335, 366} +}; + +#define SECS_PER_HOUR (60 * 60) +#define SECS_PER_DAY (SECS_PER_HOUR * 24) + +/** + * time_to_tm - converts the calendar time to local broken-down time + * + * @totalsecs the number of seconds elapsed since 00:00:00 on January 1, 1970, + * Coordinated Universal Time (UTC). + * @offset offset seconds adding to totalsecs. + * @result pointer to struct tm variable to receive broken-down time + */ +void time_to_tm(time_t totalsecs, int offset, struct tm *result) +{ + long days, rem, y; + const unsigned short *ip; + + days = totalsecs / SECS_PER_DAY; + rem = totalsecs % SECS_PER_DAY; + rem += offset; + while (rem < 0) { + rem += SECS_PER_DAY; + --days; + } + while (rem >= SECS_PER_DAY) { + rem -= SECS_PER_DAY; + ++days; + } + + result->tm_hour = rem / SECS_PER_HOUR; + rem %= SECS_PER_HOUR; + result->tm_min = rem / 60; + result->tm_sec = rem % 60; + + /* January 1, 1970 was a Thursday. */ + result->tm_wday = (4 + days) % 7; + if (result->tm_wday < 0) + result->tm_wday += 7; + + y = 1970; + + while (days < 0 || days >= (__isleap(y) ? 366 : 365)) { + /* Guess a corrected year, assuming 365 days per year. */ + long yg = y + math_div(days, 365); + + /* Adjust DAYS and Y to match the guessed year. */ + days -= (yg - y) * 365 + leaps_between(y, yg); + y = yg; + } + + result->tm_year = y - 1900; + + result->tm_yday = days; + + ip = __mon_yday[__isleap(y)]; + for (y = 11; days < ip[y]; y--) + continue; + days -= ip[y]; + + result->tm_mon = y; + result->tm_mday = days + 1; +} +EXPORT_SYMBOL(time_to_tm); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index e8c77d9..c3a4e29 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -13,12 +13,123 @@ #include <linux/percpu.h> #include <linux/init.h> #include <linux/mm.h> +#include <linux/sched.h> #include <linux/sysdev.h> #include <linux/clocksource.h> #include <linux/jiffies.h> #include <linux/time.h> #include <linux/tick.h> +#include <linux/stop_machine.h> + +/* Structure holding internal timekeeping values. */ +struct timekeeper { + /* Current clocksource used for timekeeping. */ + struct clocksource *clock; + /* The shift value of the current clocksource. */ + int shift; + + /* Number of clock cycles in one NTP interval. */ + cycle_t cycle_interval; + /* Number of clock shifted nano seconds in one NTP interval. */ + u64 xtime_interval; + /* Raw nano seconds accumulated per NTP interval. */ + u32 raw_interval; + + /* Clock shifted nano seconds remainder not stored in xtime.tv_nsec. */ + u64 xtime_nsec; + /* Difference between accumulated time and NTP time in ntp + * shifted nano seconds. */ + s64 ntp_error; + /* Shift conversion between clock shifted nano seconds and + * ntp shifted nano seconds. */ + int ntp_error_shift; + /* NTP adjusted clock multiplier */ + u32 mult; +}; + +struct timekeeper timekeeper; + +/** + * timekeeper_setup_internals - Set up internals to use clocksource clock. + * + * @clock: Pointer to clocksource. + * + * Calculates a fixed cycle/nsec interval for a given clocksource/adjustment + * pair and interval request. + * + * Unless you're the timekeeping code, you should not be using this! + */ +static void timekeeper_setup_internals(struct clocksource *clock) +{ + cycle_t interval; + u64 tmp; + + timekeeper.clock = clock; + clock->cycle_last = clock->read(clock); + /* Do the ns -> cycle conversion first, using original mult */ + tmp = NTP_INTERVAL_LENGTH; + tmp <<= clock->shift; + tmp += clock->mult/2; + do_div(tmp, clock->mult); + if (tmp == 0) + tmp = 1; + + interval = (cycle_t) tmp; + timekeeper.cycle_interval = interval; + + /* Go back from cycles -> shifted ns */ + timekeeper.xtime_interval = (u64) interval * clock->mult; + timekeeper.raw_interval = + ((u64) interval * clock->mult) >> clock->shift; + + timekeeper.xtime_nsec = 0; + timekeeper.shift = clock->shift; + + timekeeper.ntp_error = 0; + timekeeper.ntp_error_shift = NTP_SCALE_SHIFT - clock->shift; + + /* + * The timekeeper keeps its own mult values for the currently + * active clocksource. These value will be adjusted via NTP + * to counteract clock drifting. + */ + timekeeper.mult = clock->mult; +} + +/* Timekeeper helper functions. */ +static inline s64 timekeeping_get_ns(void) +{ + cycle_t cycle_now, cycle_delta; + struct clocksource *clock; + + /* read clocksource: */ + clock = timekeeper.clock; + cycle_now = clock->read(clock); + + /* calculate the delta since the last update_wall_time: */ + cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; + + /* return delta convert to nanoseconds using ntp adjusted mult. */ + return clocksource_cyc2ns(cycle_delta, timekeeper.mult, + timekeeper.shift); +} + +static inline s64 timekeeping_get_ns_raw(void) +{ + cycle_t cycle_now, cycle_delta; + struct clocksource *clock; + + /* read clocksource: */ + clock = timekeeper.clock; + cycle_now = clock->read(clock); + + /* calculate the delta since the last update_wall_time: */ + cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; + + /* return delta convert to nanoseconds using ntp adjusted mult. */ + return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); +} /* * This read-write spinlock protects us from races in SMP while @@ -44,7 +155,12 @@ __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); */ struct timespec xtime __attribute__ ((aligned (16))); struct timespec wall_to_monotonic __attribute__ ((aligned (16))); -static unsigned long total_sleep_time; /* seconds */ +static struct timespec total_sleep_time; + +/* + * The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. + */ +struct timespec raw_time; /* flag for if timekeeping is suspended */ int __read_mostly timekeeping_suspended; @@ -56,35 +172,44 @@ void update_xtime_cache(u64 nsec) timespec_add_ns(&xtime_cache, nsec); } -struct clocksource *clock; - +/* must hold xtime_lock */ +void timekeeping_leap_insert(int leapsecond) +{ + xtime.tv_sec += leapsecond; + wall_to_monotonic.tv_sec -= leapsecond; + update_vsyscall(&xtime, timekeeper.clock); +} #ifdef CONFIG_GENERIC_TIME + /** - * clocksource_forward_now - update clock to the current time + * timekeeping_forward_now - update clock to the current time * * Forward the current clock to update its state since the last call to * update_wall_time(). This is useful before significant clock changes, * as it avoids having to deal with this time offset explicitly. */ -static void clocksource_forward_now(void) +static void timekeeping_forward_now(void) { cycle_t cycle_now, cycle_delta; + struct clocksource *clock; s64 nsec; - cycle_now = clocksource_read(clock); + clock = timekeeper.clock; + cycle_now = clock->read(clock); cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; clock->cycle_last = cycle_now; - nsec = cyc2ns(clock, cycle_delta); + nsec = clocksource_cyc2ns(cycle_delta, timekeeper.mult, + timekeeper.shift); /* If arch requires, add in gettimeoffset() */ nsec += arch_gettimeoffset(); timespec_add_ns(&xtime, nsec); - nsec = ((s64)cycle_delta * clock->mult_orig) >> clock->shift; - clock->raw_time.tv_nsec += nsec; + nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); + timespec_add_ns(&raw_time, nsec); } /** @@ -95,7 +220,6 @@ static void clocksource_forward_now(void) */ void getnstimeofday(struct timespec *ts) { - cycle_t cycle_now, cycle_delta; unsigned long seq; s64 nsecs; @@ -105,15 +229,7 @@ void getnstimeofday(struct timespec *ts) seq = read_seqbegin(&xtime_lock); *ts = xtime; - - /* read clocksource: */ - cycle_now = clocksource_read(clock); - - /* calculate the delta since the last update_wall_time: */ - cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; - - /* convert to nanoseconds: */ - nsecs = cyc2ns(clock, cycle_delta); + nsecs = timekeeping_get_ns(); /* If arch requires, add in gettimeoffset() */ nsecs += arch_gettimeoffset(); @@ -125,6 +241,57 @@ void getnstimeofday(struct timespec *ts) EXPORT_SYMBOL(getnstimeofday); +ktime_t ktime_get(void) +{ + unsigned int seq; + s64 secs, nsecs; + + WARN_ON(timekeeping_suspended); + + do { + seq = read_seqbegin(&xtime_lock); + secs = xtime.tv_sec + wall_to_monotonic.tv_sec; + nsecs = xtime.tv_nsec + wall_to_monotonic.tv_nsec; + nsecs += timekeeping_get_ns(); + + } while (read_seqretry(&xtime_lock, seq)); + /* + * Use ktime_set/ktime_add_ns to create a proper ktime on + * 32-bit architectures without CONFIG_KTIME_SCALAR. + */ + return ktime_add_ns(ktime_set(secs, 0), nsecs); +} +EXPORT_SYMBOL_GPL(ktime_get); + +/** + * ktime_get_ts - get the monotonic clock in timespec format + * @ts: pointer to timespec variable + * + * The function calculates the monotonic clock from the realtime + * clock and the wall_to_monotonic offset and stores the result + * in normalized timespec format in the variable pointed to by @ts. + */ +void ktime_get_ts(struct timespec *ts) +{ + struct timespec tomono; + unsigned int seq; + s64 nsecs; + + WARN_ON(timekeeping_suspended); + + do { + seq = read_seqbegin(&xtime_lock); + *ts = xtime; + tomono = wall_to_monotonic; + nsecs = timekeeping_get_ns(); + + } while (read_seqretry(&xtime_lock, seq)); + + set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec, + ts->tv_nsec + tomono.tv_nsec + nsecs); +} +EXPORT_SYMBOL_GPL(ktime_get_ts); + /** * do_gettimeofday - Returns the time of day in a timeval * @tv: pointer to the timeval to be set @@ -157,7 +324,7 @@ int do_settimeofday(struct timespec *tv) write_seqlock_irqsave(&xtime_lock, flags); - clocksource_forward_now(); + timekeeping_forward_now(); ts_delta.tv_sec = tv->tv_sec - xtime.tv_sec; ts_delta.tv_nsec = tv->tv_nsec - xtime.tv_nsec; @@ -167,10 +334,10 @@ int do_settimeofday(struct timespec *tv) update_xtime_cache(0); - clock->error = 0; + timekeeper.ntp_error = 0; ntp_clear(); - update_vsyscall(&xtime, clock); + update_vsyscall(&xtime, timekeeper.clock); write_sequnlock_irqrestore(&xtime_lock, flags); @@ -187,44 +354,97 @@ EXPORT_SYMBOL(do_settimeofday); * * Accumulates current time interval and initializes new clocksource */ -static void change_clocksource(void) +static int change_clocksource(void *data) { struct clocksource *new, *old; - new = clocksource_get_next(); + new = (struct clocksource *) data; + + timekeeping_forward_now(); + if (!new->enable || new->enable(new) == 0) { + old = timekeeper.clock; + timekeeper_setup_internals(new); + if (old->disable) + old->disable(old); + } + return 0; +} - if (clock == new) +/** + * timekeeping_notify - Install a new clock source + * @clock: pointer to the clock source + * + * This function is called from clocksource.c after a new, better clock + * source has been registered. The caller holds the clocksource_mutex. + */ +void timekeeping_notify(struct clocksource *clock) +{ + if (timekeeper.clock == clock) return; + stop_machine(change_clocksource, clock, NULL); + tick_clock_notify(); +} - clocksource_forward_now(); +#else /* GENERIC_TIME */ - if (clocksource_enable(new)) - return; +static inline void timekeeping_forward_now(void) { } - new->raw_time = clock->raw_time; - old = clock; - clock = new; - clocksource_disable(old); +/** + * ktime_get - get the monotonic time in ktime_t format + * + * returns the time in ktime_t format + */ +ktime_t ktime_get(void) +{ + struct timespec now; - clock->cycle_last = 0; - clock->cycle_last = clocksource_read(clock); - clock->error = 0; - clock->xtime_nsec = 0; - clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); + ktime_get_ts(&now); - tick_clock_notify(); + return timespec_to_ktime(now); +} +EXPORT_SYMBOL_GPL(ktime_get); - /* - * We're holding xtime lock and waking up klogd would deadlock - * us on enqueue. So no printing! - printk(KERN_INFO "Time: %s clocksource has been installed.\n", - clock->name); - */ +/** + * ktime_get_ts - get the monotonic clock in timespec format + * @ts: pointer to timespec variable + * + * The function calculates the monotonic clock from the realtime + * clock and the wall_to_monotonic offset and stores the result + * in normalized timespec format in the variable pointed to by @ts. + */ +void ktime_get_ts(struct timespec *ts) +{ + struct timespec tomono; + unsigned long seq; + + do { + seq = read_seqbegin(&xtime_lock); + getnstimeofday(ts); + tomono = wall_to_monotonic; + + } while (read_seqretry(&xtime_lock, seq)); + + set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec, + ts->tv_nsec + tomono.tv_nsec); } -#else -static inline void clocksource_forward_now(void) { } -static inline void change_clocksource(void) { } -#endif +EXPORT_SYMBOL_GPL(ktime_get_ts); + +#endif /* !GENERIC_TIME */ + +/** + * ktime_get_real - get the real (wall-) time in ktime_t format + * + * returns the time in ktime_t format + */ +ktime_t ktime_get_real(void) +{ + struct timespec now; + + getnstimeofday(&now); + + return timespec_to_ktime(now); +} +EXPORT_SYMBOL_GPL(ktime_get_real); /** * getrawmonotonic - Returns the raw monotonic time in a timespec @@ -236,21 +456,11 @@ void getrawmonotonic(struct timespec *ts) { unsigned long seq; s64 nsecs; - cycle_t cycle_now, cycle_delta; do { seq = read_seqbegin(&xtime_lock); - - /* read clocksource: */ - cycle_now = clocksource_read(clock); - - /* calculate the delta since the last update_wall_time: */ - cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; - - /* convert to nanoseconds: */ - nsecs = ((s64)cycle_delta * clock->mult_orig) >> clock->shift; - - *ts = clock->raw_time; + nsecs = timekeeping_get_ns_raw(); + *ts = raw_time; } while (read_seqretry(&xtime_lock, seq)); @@ -270,7 +480,7 @@ int timekeeping_valid_for_hres(void) do { seq = read_seqbegin(&xtime_lock); - ret = clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; + ret = timekeeper.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; } while (read_seqretry(&xtime_lock, seq)); @@ -278,17 +488,33 @@ int timekeeping_valid_for_hres(void) } /** - * read_persistent_clock - Return time in seconds from the persistent clock. + * read_persistent_clock - Return time from the persistent clock. * * Weak dummy function for arches that do not yet support it. - * Returns seconds from epoch using the battery backed persistent clock. - * Returns zero if unsupported. + * Reads the time from the battery backed persistent clock. + * Returns a timespec with tv_sec=0 and tv_nsec=0 if unsupported. * * XXX - Do be sure to remove it once all arches implement it. */ -unsigned long __attribute__((weak)) read_persistent_clock(void) +void __attribute__((weak)) read_persistent_clock(struct timespec *ts) { - return 0; + ts->tv_sec = 0; + ts->tv_nsec = 0; +} + +/** + * read_boot_clock - Return time of the system start. + * + * Weak dummy function for arches that do not yet support it. + * Function to read the exact time the system has been started. + * Returns a timespec with tv_sec=0 and tv_nsec=0 if unsupported. + * + * XXX - Do be sure to remove it once all arches implement it. + */ +void __attribute__((weak)) read_boot_clock(struct timespec *ts) +{ + ts->tv_sec = 0; + ts->tv_nsec = 0; } /* @@ -296,29 +522,40 @@ unsigned long __attribute__((weak)) read_persistent_clock(void) */ void __init timekeeping_init(void) { + struct clocksource *clock; unsigned long flags; - unsigned long sec = read_persistent_clock(); + struct timespec now, boot; + + read_persistent_clock(&now); + read_boot_clock(&boot); write_seqlock_irqsave(&xtime_lock, flags); ntp_init(); - clock = clocksource_get_next(); - clocksource_enable(clock); - clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); - clock->cycle_last = clocksource_read(clock); - - xtime.tv_sec = sec; - xtime.tv_nsec = 0; + clock = clocksource_default_clock(); + if (clock->enable) + clock->enable(clock); + timekeeper_setup_internals(clock); + + xtime.tv_sec = now.tv_sec; + xtime.tv_nsec = now.tv_nsec; + raw_time.tv_sec = 0; + raw_time.tv_nsec = 0; + if (boot.tv_sec == 0 && boot.tv_nsec == 0) { + boot.tv_sec = xtime.tv_sec; + boot.tv_nsec = xtime.tv_nsec; + } set_normalized_timespec(&wall_to_monotonic, - -xtime.tv_sec, -xtime.tv_nsec); + -boot.tv_sec, -boot.tv_nsec); update_xtime_cache(0); - total_sleep_time = 0; + total_sleep_time.tv_sec = 0; + total_sleep_time.tv_nsec = 0; write_sequnlock_irqrestore(&xtime_lock, flags); } /* time in seconds when suspend began */ -static unsigned long timekeeping_suspend_time; +static struct timespec timekeeping_suspend_time; /** * timekeeping_resume - Resumes the generic timekeeping subsystem. @@ -331,24 +568,24 @@ static unsigned long timekeeping_suspend_time; static int timekeeping_resume(struct sys_device *dev) { unsigned long flags; - unsigned long now = read_persistent_clock(); + struct timespec ts; + + read_persistent_clock(&ts); clocksource_resume(); write_seqlock_irqsave(&xtime_lock, flags); - if (now && (now > timekeeping_suspend_time)) { - unsigned long sleep_length = now - timekeeping_suspend_time; - - xtime.tv_sec += sleep_length; - wall_to_monotonic.tv_sec -= sleep_length; - total_sleep_time += sleep_length; + if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) { + ts = timespec_sub(ts, timekeeping_suspend_time); + xtime = timespec_add_safe(xtime, ts); + wall_to_monotonic = timespec_sub(wall_to_monotonic, ts); + total_sleep_time = timespec_add_safe(total_sleep_time, ts); } update_xtime_cache(0); /* re-base the last cycle value */ - clock->cycle_last = 0; - clock->cycle_last = clocksource_read(clock); - clock->error = 0; + timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); + timekeeper.ntp_error = 0; timekeeping_suspended = 0; write_sequnlock_irqrestore(&xtime_lock, flags); @@ -366,10 +603,10 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state) { unsigned long flags; - timekeeping_suspend_time = read_persistent_clock(); + read_persistent_clock(&timekeeping_suspend_time); write_seqlock_irqsave(&xtime_lock, flags); - clocksource_forward_now(); + timekeeping_forward_now(); timekeeping_suspended = 1; write_sequnlock_irqrestore(&xtime_lock, flags); @@ -404,7 +641,7 @@ device_initcall(timekeeping_init_device); * If the error is already larger, we look ahead even further * to compensate for late or lost adjustments. */ -static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, +static __always_inline int timekeeping_bigadjust(s64 error, s64 *interval, s64 *offset) { s64 tick_error, i; @@ -420,7 +657,7 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, * here. This is tuned so that an error of about 1 msec is adjusted * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks). */ - error2 = clock->error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ); + error2 = timekeeper.ntp_error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ); error2 = abs(error2); for (look_ahead = 0; error2 > 0; look_ahead++) error2 >>= 2; @@ -429,8 +666,8 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, * Now calculate the error in (1 << look_ahead) ticks, but first * remove the single look ahead already included in the error. */ - tick_error = tick_length >> (NTP_SCALE_SHIFT - clock->shift + 1); - tick_error -= clock->xtime_interval >> 1; + tick_error = tick_length >> (timekeeper.ntp_error_shift + 1); + tick_error -= timekeeper.xtime_interval >> 1; error = ((error - tick_error) >> look_ahead) + tick_error; /* Finally calculate the adjustment shift value. */ @@ -455,18 +692,18 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, * this is optimized for the most common adjustments of -1,0,1, * for other values we can do a bit more work. */ -static void clocksource_adjust(s64 offset) +static void timekeeping_adjust(s64 offset) { - s64 error, interval = clock->cycle_interval; + s64 error, interval = timekeeper.cycle_interval; int adj; - error = clock->error >> (NTP_SCALE_SHIFT - clock->shift - 1); + error = timekeeper.ntp_error >> (timekeeper.ntp_error_shift - 1); if (error > interval) { error >>= 2; if (likely(error <= interval)) adj = 1; else - adj = clocksource_bigadjust(error, &interval, &offset); + adj = timekeeping_bigadjust(error, &interval, &offset); } else if (error < -interval) { error >>= 2; if (likely(error >= -interval)) { @@ -474,15 +711,15 @@ static void clocksource_adjust(s64 offset) interval = -interval; offset = -offset; } else - adj = clocksource_bigadjust(error, &interval, &offset); + adj = timekeeping_bigadjust(error, &interval, &offset); } else return; - clock->mult += adj; - clock->xtime_interval += interval; - clock->xtime_nsec -= offset; - clock->error -= (interval - offset) << - (NTP_SCALE_SHIFT - clock->shift); + timekeeper.mult += adj; + timekeeper.xtime_interval += interval; + timekeeper.xtime_nsec -= offset; + timekeeper.ntp_error -= (interval - offset) << + timekeeper.ntp_error_shift; } /** @@ -492,53 +729,59 @@ static void clocksource_adjust(s64 offset) */ void update_wall_time(void) { + struct clocksource *clock; cycle_t offset; + u64 nsecs; /* Make sure we're fully resumed: */ if (unlikely(timekeeping_suspended)) return; + clock = timekeeper.clock; #ifdef CONFIG_GENERIC_TIME - offset = (clocksource_read(clock) - clock->cycle_last) & clock->mask; + offset = (clock->read(clock) - clock->cycle_last) & clock->mask; #else - offset = clock->cycle_interval; + offset = timekeeper.cycle_interval; #endif - clock->xtime_nsec = (s64)xtime.tv_nsec << clock->shift; + timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.shift; /* normally this loop will run just once, however in the * case of lost or late ticks, it will accumulate correctly. */ - while (offset >= clock->cycle_interval) { + while (offset >= timekeeper.cycle_interval) { + u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift; + /* accumulate one interval */ - offset -= clock->cycle_interval; - clock->cycle_last += clock->cycle_interval; + offset -= timekeeper.cycle_interval; + clock->cycle_last += timekeeper.cycle_interval; - clock->xtime_nsec += clock->xtime_interval; - if (clock->xtime_nsec >= (u64)NSEC_PER_SEC << clock->shift) { - clock->xtime_nsec -= (u64)NSEC_PER_SEC << clock->shift; + timekeeper.xtime_nsec += timekeeper.xtime_interval; + if (timekeeper.xtime_nsec >= nsecps) { + timekeeper.xtime_nsec -= nsecps; xtime.tv_sec++; second_overflow(); } - clock->raw_time.tv_nsec += clock->raw_interval; - if (clock->raw_time.tv_nsec >= NSEC_PER_SEC) { - clock->raw_time.tv_nsec -= NSEC_PER_SEC; - clock->raw_time.tv_sec++; + raw_time.tv_nsec += timekeeper.raw_interval; + if (raw_time.tv_nsec >= NSEC_PER_SEC) { + raw_time.tv_nsec -= NSEC_PER_SEC; + raw_time.tv_sec++; } /* accumulate error between NTP and clock interval */ - clock->error += tick_length; - clock->error -= clock->xtime_interval << (NTP_SCALE_SHIFT - clock->shift); + timekeeper.ntp_error += tick_length; + timekeeper.ntp_error -= timekeeper.xtime_interval << + timekeeper.ntp_error_shift; } /* correct the clock when NTP error is too big */ - clocksource_adjust(offset); + timekeeping_adjust(offset); /* * Since in the loop above, we accumulate any amount of time * in xtime_nsec over a second into xtime.tv_sec, its possible for * xtime_nsec to be fairly small after the loop. Further, if we're - * slightly speeding the clocksource up in clocksource_adjust(), + * slightly speeding the clocksource up in timekeeping_adjust(), * its possible the required corrective factor to xtime_nsec could * cause it to underflow. * @@ -550,24 +793,25 @@ void update_wall_time(void) * We'll correct this error next time through this function, when * xtime_nsec is not as small. */ - if (unlikely((s64)clock->xtime_nsec < 0)) { - s64 neg = -(s64)clock->xtime_nsec; - clock->xtime_nsec = 0; - clock->error += neg << (NTP_SCALE_SHIFT - clock->shift); + if (unlikely((s64)timekeeper.xtime_nsec < 0)) { + s64 neg = -(s64)timekeeper.xtime_nsec; + timekeeper.xtime_nsec = 0; + timekeeper.ntp_error += neg << timekeeper.ntp_error_shift; } /* store full nanoseconds into xtime after rounding it up and * add the remainder to the error difference. */ - xtime.tv_nsec = ((s64)clock->xtime_nsec >> clock->shift) + 1; - clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift; - clock->error += clock->xtime_nsec << (NTP_SCALE_SHIFT - clock->shift); + xtime.tv_nsec = ((s64) timekeeper.xtime_nsec >> timekeeper.shift) + 1; + timekeeper.xtime_nsec -= (s64) xtime.tv_nsec << timekeeper.shift; + timekeeper.ntp_error += timekeeper.xtime_nsec << + timekeeper.ntp_error_shift; - update_xtime_cache(cyc2ns(clock, offset)); + nsecs = clocksource_cyc2ns(offset, timekeeper.mult, timekeeper.shift); + update_xtime_cache(nsecs); /* check to see if there is a new clocksource to use */ - change_clocksource(); - update_vsyscall(&xtime, clock); + update_vsyscall(&xtime, timekeeper.clock); } /** @@ -583,9 +827,12 @@ void update_wall_time(void) */ void getboottime(struct timespec *ts) { - set_normalized_timespec(ts, - - (wall_to_monotonic.tv_sec + total_sleep_time), - - wall_to_monotonic.tv_nsec); + struct timespec boottime = { + .tv_sec = wall_to_monotonic.tv_sec + total_sleep_time.tv_sec, + .tv_nsec = wall_to_monotonic.tv_nsec + total_sleep_time.tv_nsec + }; + + set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec); } /** @@ -594,7 +841,7 @@ void getboottime(struct timespec *ts) */ void monotonic_to_bootbased(struct timespec *ts) { - ts->tv_sec += total_sleep_time; + *ts = timespec_add_safe(*ts, total_sleep_time); } unsigned long get_seconds(void) @@ -603,6 +850,10 @@ unsigned long get_seconds(void) } EXPORT_SYMBOL(get_seconds); +struct timespec __current_kernel_time(void) +{ + return xtime_cache; +} struct timespec current_kernel_time(void) { @@ -618,3 +869,20 @@ struct timespec current_kernel_time(void) return now; } EXPORT_SYMBOL(current_kernel_time); + +struct timespec get_monotonic_coarse(void) +{ + struct timespec now, mono; + unsigned long seq; + + do { + seq = read_seqbegin(&xtime_lock); + + now = xtime_cache; + mono = wall_to_monotonic; + } while (read_seqretry(&xtime_lock, seq)); + + set_normalized_timespec(&now, now.tv_sec + mono.tv_sec, + now.tv_nsec + mono.tv_nsec); + return now; +} diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index fddd69d..1b5b7aa 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -275,7 +275,7 @@ static int timer_list_open(struct inode *inode, struct file *filp) return single_open(filp, timer_list_show, NULL); } -static struct file_operations timer_list_fops = { +static const struct file_operations timer_list_fops = { .open = timer_list_open, .read = seq_read, .llseek = seq_lseek, diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c index 4cde8b9..ee5681f 100644 --- a/kernel/time/timer_stats.c +++ b/kernel/time/timer_stats.c @@ -395,7 +395,7 @@ static int tstats_open(struct inode *inode, struct file *filp) return single_open(filp, tstats_show, NULL); } -static struct file_operations tstats_fops = { +static const struct file_operations tstats_fops = { .open = tstats_open, .read = seq_read, .write = tstats_write, diff --git a/kernel/timer.c b/kernel/timer.c index a3d25f415..5db5a8d 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -37,7 +37,7 @@ #include <linux/delay.h> #include <linux/tick.h> #include <linux/kallsyms.h> -#include <linux/perf_counter.h> +#include <linux/perf_event.h> #include <linux/sched.h> #include <asm/uaccess.h> @@ -46,6 +46,9 @@ #include <asm/timex.h> #include <asm/io.h> +#define CREATE_TRACE_POINTS +#include <trace/events/timer.h> + u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES; EXPORT_SYMBOL(jiffies_64); @@ -72,6 +75,7 @@ struct tvec_base { spinlock_t lock; struct timer_list *running_timer; unsigned long timer_jiffies; + unsigned long next_timer; struct tvec_root tv1; struct tvec tv2; struct tvec tv3; @@ -520,6 +524,25 @@ static inline void debug_timer_activate(struct timer_list *timer) { } static inline void debug_timer_deactivate(struct timer_list *timer) { } #endif +static inline void debug_init(struct timer_list *timer) +{ + debug_timer_init(timer); + trace_timer_init(timer); +} + +static inline void +debug_activate(struct timer_list *timer, unsigned long expires) +{ + debug_timer_activate(timer); + trace_timer_start(timer, expires); +} + +static inline void debug_deactivate(struct timer_list *timer) +{ + debug_timer_deactivate(timer); + trace_timer_cancel(timer); +} + static void __init_timer(struct timer_list *timer, const char *name, struct lock_class_key *key) @@ -548,7 +571,7 @@ void init_timer_key(struct timer_list *timer, const char *name, struct lock_class_key *key) { - debug_timer_init(timer); + debug_init(timer); __init_timer(timer, name, key); } EXPORT_SYMBOL(init_timer_key); @@ -567,7 +590,7 @@ static inline void detach_timer(struct timer_list *timer, { struct list_head *entry = &timer->entry; - debug_timer_deactivate(timer); + debug_deactivate(timer); __list_del(entry->prev, entry->next); if (clear_pending) @@ -622,13 +645,16 @@ __mod_timer(struct timer_list *timer, unsigned long expires, if (timer_pending(timer)) { detach_timer(timer, 0); + if (timer->expires == base->next_timer && + !tbase_get_deferrable(timer->base)) + base->next_timer = base->timer_jiffies; ret = 1; } else { if (pending_only) goto out_unlock; } - debug_timer_activate(timer); + debug_activate(timer, expires); new_base = __get_cpu_var(tvec_bases); @@ -663,6 +689,9 @@ __mod_timer(struct timer_list *timer, unsigned long expires, } timer->expires = expires; + if (time_before(timer->expires, base->next_timer) && + !tbase_get_deferrable(timer->base)) + base->next_timer = timer->expires; internal_add_timer(base, timer); out_unlock: @@ -780,7 +809,10 @@ void add_timer_on(struct timer_list *timer, int cpu) BUG_ON(timer_pending(timer) || !timer->function); spin_lock_irqsave(&base->lock, flags); timer_set_base(timer, base); - debug_timer_activate(timer); + debug_activate(timer, timer->expires); + if (time_before(timer->expires, base->next_timer) && + !tbase_get_deferrable(timer->base)) + base->next_timer = timer->expires; internal_add_timer(base, timer); /* * Check whether the other CPU is idle and needs to be @@ -817,6 +849,9 @@ int del_timer(struct timer_list *timer) base = lock_timer_base(timer, &flags); if (timer_pending(timer)) { detach_timer(timer, 1); + if (timer->expires == base->next_timer && + !tbase_get_deferrable(timer->base)) + base->next_timer = base->timer_jiffies; ret = 1; } spin_unlock_irqrestore(&base->lock, flags); @@ -850,6 +885,9 @@ int try_to_del_timer_sync(struct timer_list *timer) ret = 0; if (timer_pending(timer)) { detach_timer(timer, 1); + if (timer->expires == base->next_timer && + !tbase_get_deferrable(timer->base)) + base->next_timer = base->timer_jiffies; ret = 1; } out: @@ -984,7 +1022,9 @@ static inline void __run_timers(struct tvec_base *base) */ lock_map_acquire(&lockdep_map); + trace_timer_expire_entry(timer); fn(data); + trace_timer_expire_exit(timer); lock_map_release(&lockdep_map); @@ -1007,8 +1047,8 @@ static inline void __run_timers(struct tvec_base *base) #ifdef CONFIG_NO_HZ /* * Find out when the next timer event is due to happen. This - * is used on S/390 to stop all activity when a cpus is idle. - * This functions needs to be called disabled. + * is used on S/390 to stop all activity when a CPU is idle. + * This function needs to be called with interrupts disabled. */ static unsigned long __next_timer_interrupt(struct tvec_base *base) { @@ -1134,7 +1174,9 @@ unsigned long get_next_timer_interrupt(unsigned long now) unsigned long expires; spin_lock(&base->lock); - expires = __next_timer_interrupt(base); + if (time_before_eq(base->next_timer, base->timer_jiffies)) + base->next_timer = __next_timer_interrupt(base); + expires = base->next_timer; spin_unlock(&base->lock); if (time_before_eq(expires, now)) @@ -1169,7 +1211,7 @@ static void run_timer_softirq(struct softirq_action *h) { struct tvec_base *base = __get_cpu_var(tvec_bases); - perf_counter_do_pending(); + perf_event_do_pending(); hrtimer_run_pending(); @@ -1522,6 +1564,7 @@ static int __cpuinit init_timers_cpu(int cpu) INIT_LIST_HEAD(base->tv1.vec + j); base->timer_jiffies = jiffies; + base->next_timer = base->timer_jiffies; return 0; } @@ -1534,6 +1577,9 @@ static void migrate_timer_list(struct tvec_base *new_base, struct list_head *hea timer = list_first_entry(head, struct timer_list, entry); detach_timer(timer, 0); timer_set_base(timer, new_base); + if (time_before(timer->expires, new_base->next_timer) && + !tbase_get_deferrable(timer->base)) + new_base->next_timer = timer->expires; internal_add_timer(new_base, timer); } } diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 5efeb42..06c3d5b 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -11,12 +11,18 @@ config NOP_TRACER config HAVE_FTRACE_NMI_ENTER bool + help + See Documentation/trace/ftrace-implementation.txt config HAVE_FUNCTION_TRACER bool + help + See Documentation/trace/ftrace-implementation.txt config HAVE_FUNCTION_GRAPH_TRACER bool + help + See Documentation/trace/ftrace-implementation.txt config HAVE_FUNCTION_GRAPH_FP_TEST bool @@ -28,21 +34,25 @@ config HAVE_FUNCTION_GRAPH_FP_TEST config HAVE_FUNCTION_TRACE_MCOUNT_TEST bool help - This gets selected when the arch tests the function_trace_stop - variable at the mcount call site. Otherwise, this variable - is tested by the called function. + See Documentation/trace/ftrace-implementation.txt config HAVE_DYNAMIC_FTRACE bool + help + See Documentation/trace/ftrace-implementation.txt config HAVE_FTRACE_MCOUNT_RECORD bool + help + See Documentation/trace/ftrace-implementation.txt config HAVE_HW_BRANCH_TRACER bool config HAVE_SYSCALL_TRACEPOINTS bool + help + See Documentation/trace/ftrace-implementation.txt config TRACER_MAX_TRACE bool @@ -73,7 +83,7 @@ config RING_BUFFER_ALLOW_SWAP # This allows those options to appear when no other tracer is selected. But the # options do not appear when something else selects it. We need the two options # GENERIC_TRACER and TRACING to avoid circular dependencies to accomplish the -# hidding of the automatic options options. +# hidding of the automatic options. config TRACING bool @@ -490,6 +500,18 @@ config FTRACE_STARTUP_TEST functioning properly. It will do tests on all the configured tracers of ftrace. +config EVENT_TRACE_TEST_SYSCALLS + bool "Run selftest on syscall events" + depends on FTRACE_STARTUP_TEST + help + This option will also enable testing every syscall event. + It only enables the event and disables it and runs various loads + with the event enabled. This adds a bit more time for kernel boot + up since it runs this on every system call defined. + + TBD - enable a way to actually call the syscalls as we test their + events + config MMIOTRACE bool "Memory mapped IO tracing" depends on HAVE_MMIOTRACE_SUPPORT && PCI diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index ce3b1cd..0f84c52 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -42,7 +42,6 @@ obj-$(CONFIG_BOOT_TRACER) += trace_boot.o obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o -obj-$(CONFIG_POWER_TRACER) += trace_power.o obj-$(CONFIG_KMEMTRACE) += kmemtrace.o obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o @@ -55,5 +54,6 @@ obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o obj-$(CONFIG_KSYM_TRACER) += trace_ksym.o +obj-$(CONFIG_EVENT_TRACING) += power-traces.o libftrace-y := ftrace.o diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 3eb159c..d9d6206 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -856,6 +856,37 @@ static void blk_add_trace_remap(struct request_queue *q, struct bio *bio, } /** + * blk_add_trace_rq_remap - Add a trace for a request-remap operation + * @q: queue the io is for + * @rq: the source request + * @dev: target device + * @from: source sector + * + * Description: + * Device mapper remaps request to other devices. + * Add a trace for that action. + * + **/ +static void blk_add_trace_rq_remap(struct request_queue *q, + struct request *rq, dev_t dev, + sector_t from) +{ + struct blk_trace *bt = q->blk_trace; + struct blk_io_trace_remap r; + + if (likely(!bt)) + return; + + r.device_from = cpu_to_be32(dev); + r.device_to = cpu_to_be32(disk_devt(rq->rq_disk)); + r.sector_from = cpu_to_be64(from); + + __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), + rq_data_dir(rq), BLK_TA_REMAP, !!rq->errors, + sizeof(r), &r); +} + +/** * blk_add_driver_data - Add binary message with driver-specific data * @q: queue the io is for * @rq: io request @@ -922,10 +953,13 @@ static void blk_register_tracepoints(void) WARN_ON(ret); ret = register_trace_block_remap(blk_add_trace_remap); WARN_ON(ret); + ret = register_trace_block_rq_remap(blk_add_trace_rq_remap); + WARN_ON(ret); } static void blk_unregister_tracepoints(void) { + unregister_trace_block_rq_remap(blk_add_trace_rq_remap); unregister_trace_block_remap(blk_add_trace_remap); unregister_trace_block_split(blk_add_trace_split); unregister_trace_block_unplug_io(blk_add_trace_unplug_io); @@ -1657,6 +1691,11 @@ int blk_trace_init_sysfs(struct device *dev) return sysfs_create_group(&dev->kobj, &blk_trace_attr_group); } +void blk_trace_remove_sysfs(struct device *dev) +{ + sysfs_remove_group(&dev->kobj, &blk_trace_attr_group); +} + #endif /* CONFIG_BLK_DEV_IO_TRACE */ #ifdef CONFIG_EVENT_TRACING diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 8c804e2..b10c0d9 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -60,6 +60,13 @@ static int last_ftrace_enabled; /* Quick disabling of function tracer. */ int function_trace_stop; +/* List for set_ftrace_pid's pids. */ +LIST_HEAD(ftrace_pids); +struct ftrace_pid { + struct list_head list; + struct pid *pid; +}; + /* * ftrace_disabled is set when an anomaly is discovered. * ftrace_disabled is much stronger than ftrace_enabled. @@ -78,6 +85,10 @@ ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub; ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +static int ftrace_set_func(unsigned long *array, int *idx, char *buffer); +#endif + static void ftrace_list_func(unsigned long ip, unsigned long parent_ip) { struct ftrace_ops *op = ftrace_list; @@ -155,7 +166,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops) else func = ftrace_list_func; - if (ftrace_pid_trace) { + if (!list_empty(&ftrace_pids)) { set_ftrace_pid_function(func); func = ftrace_pid_func; } @@ -203,7 +214,7 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) if (ftrace_list->next == &ftrace_list_end) { ftrace_func_t func = ftrace_list->func; - if (ftrace_pid_trace) { + if (!list_empty(&ftrace_pids)) { set_ftrace_pid_function(func); func = ftrace_pid_func; } @@ -225,9 +236,13 @@ static void ftrace_update_pid_func(void) if (ftrace_trace_function == ftrace_stub) return; +#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST func = ftrace_trace_function; +#else + func = __ftrace_trace_function; +#endif - if (ftrace_pid_trace) { + if (!list_empty(&ftrace_pids)) { set_ftrace_pid_function(func); func = ftrace_pid_func; } else { @@ -817,8 +832,6 @@ static __init void ftrace_profile_debugfs(struct dentry *d_tracer) } #endif /* CONFIG_FUNCTION_PROFILER */ -/* set when tracing only a pid */ -struct pid *ftrace_pid_trace; static struct pid * const ftrace_swapper_pid = &init_struct_pid; #ifdef CONFIG_DYNAMIC_FTRACE @@ -1074,14 +1087,9 @@ static void ftrace_replace_code(int enable) failed = __ftrace_replace_code(rec, enable); if (failed) { rec->flags |= FTRACE_FL_FAILED; - if ((system_state == SYSTEM_BOOTING) || - !core_kernel_text(rec->ip)) { - ftrace_free_rec(rec); - } else { - ftrace_bug(failed, rec->ip); - /* Stop processing */ - return; - } + ftrace_bug(failed, rec->ip); + /* Stop processing */ + return; } } while_for_each_ftrace_rec(); } @@ -1262,12 +1270,34 @@ static int ftrace_update_code(struct module *mod) ftrace_new_addrs = p->newlist; p->flags = 0L; - /* convert record (i.e, patch mcount-call with NOP) */ - if (ftrace_code_disable(mod, p)) { - p->flags |= FTRACE_FL_CONVERTED; - ftrace_update_cnt++; - } else + /* + * Do the initial record convertion from mcount jump + * to the NOP instructions. + */ + if (!ftrace_code_disable(mod, p)) { ftrace_free_rec(p); + continue; + } + + p->flags |= FTRACE_FL_CONVERTED; + ftrace_update_cnt++; + + /* + * If the tracing is enabled, go ahead and enable the record. + * + * The reason not to enable the record immediatelly is the + * inherent check of ftrace_make_nop/ftrace_make_call for + * correct previous instructions. Making first the NOP + * conversion puts the module to the correct state, thus + * passing the ftrace_make_call check. + */ + if (ftrace_start_up) { + int failed = __ftrace_replace_code(p, 1); + if (failed) { + ftrace_bug(failed, p->ip); + ftrace_free_rec(p); + } + } } stop = ftrace_now(raw_smp_processor_id()); @@ -1323,11 +1353,10 @@ static int __init ftrace_dyn_table_alloc(unsigned long num_to_init) enum { FTRACE_ITER_FILTER = (1 << 0), - FTRACE_ITER_CONT = (1 << 1), - FTRACE_ITER_NOTRACE = (1 << 2), - FTRACE_ITER_FAILURES = (1 << 3), - FTRACE_ITER_PRINTALL = (1 << 4), - FTRACE_ITER_HASH = (1 << 5), + FTRACE_ITER_NOTRACE = (1 << 1), + FTRACE_ITER_FAILURES = (1 << 2), + FTRACE_ITER_PRINTALL = (1 << 3), + FTRACE_ITER_HASH = (1 << 4), }; #define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ @@ -1337,8 +1366,7 @@ struct ftrace_iterator { int hidx; int idx; unsigned flags; - unsigned char buffer[FTRACE_BUFF_MAX+1]; - unsigned buffer_idx; + struct trace_parser parser; }; static void * @@ -1407,7 +1435,7 @@ static int t_hash_show(struct seq_file *m, void *v) if (rec->ops->print) return rec->ops->print(m, rec->ip, rec->ops, rec->data); - seq_printf(m, "%pf:%pf", (void *)rec->ip, (void *)rec->ops->func); + seq_printf(m, "%ps:%ps", (void *)rec->ip, (void *)rec->ops->func); if (rec->data) seq_printf(m, ":%p", rec->data); @@ -1517,12 +1545,12 @@ static int t_show(struct seq_file *m, void *v) if (!rec) return 0; - seq_printf(m, "%pf\n", (void *)rec->ip); + seq_printf(m, "%ps\n", (void *)rec->ip); return 0; } -static struct seq_operations show_ftrace_seq_ops = { +static const struct seq_operations show_ftrace_seq_ops = { .start = t_start, .next = t_next, .stop = t_stop, @@ -1604,6 +1632,11 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable) if (!iter) return -ENOMEM; + if (trace_parser_get_init(&iter->parser, FTRACE_BUFF_MAX)) { + kfree(iter); + return -ENOMEM; + } + mutex_lock(&ftrace_regex_lock); if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) @@ -1618,8 +1651,10 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable) if (!ret) { struct seq_file *m = file->private_data; m->private = iter; - } else + } else { + trace_parser_put(&iter->parser); kfree(iter); + } } else file->private_data = iter; mutex_unlock(&ftrace_regex_lock); @@ -1652,60 +1687,6 @@ ftrace_regex_lseek(struct file *file, loff_t offset, int origin) return ret; } -enum { - MATCH_FULL, - MATCH_FRONT_ONLY, - MATCH_MIDDLE_ONLY, - MATCH_END_ONLY, -}; - -/* - * (static function - no need for kernel doc) - * - * Pass in a buffer containing a glob and this function will - * set search to point to the search part of the buffer and - * return the type of search it is (see enum above). - * This does modify buff. - * - * Returns enum type. - * search returns the pointer to use for comparison. - * not returns 1 if buff started with a '!' - * 0 otherwise. - */ -static int -ftrace_setup_glob(char *buff, int len, char **search, int *not) -{ - int type = MATCH_FULL; - int i; - - if (buff[0] == '!') { - *not = 1; - buff++; - len--; - } else - *not = 0; - - *search = buff; - - for (i = 0; i < len; i++) { - if (buff[i] == '*') { - if (!i) { - *search = buff + 1; - type = MATCH_END_ONLY; - } else { - if (type == MATCH_END_ONLY) - type = MATCH_MIDDLE_ONLY; - else - type = MATCH_FRONT_ONLY; - buff[i] = 0; - break; - } - } - } - - return type; -} - static int ftrace_match(char *str, char *regex, int len, int type) { int matched = 0; @@ -1754,7 +1735,7 @@ static void ftrace_match_records(char *buff, int len, int enable) int not; flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; - type = ftrace_setup_glob(buff, len, &search, ¬); + type = filter_parse_regex(buff, len, &search, ¬); search_len = strlen(search); @@ -1822,7 +1803,7 @@ static void ftrace_match_module_records(char *buff, char *mod, int enable) } if (strlen(buff)) { - type = ftrace_setup_glob(buff, strlen(buff), &search, ¬); + type = filter_parse_regex(buff, strlen(buff), &search, ¬); search_len = strlen(search); } @@ -1987,7 +1968,7 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, int count = 0; char *search; - type = ftrace_setup_glob(glob, strlen(glob), &search, ¬); + type = filter_parse_regex(glob, strlen(glob), &search, ¬); len = strlen(search); /* we do not support '!' for function probes */ @@ -2059,12 +2040,12 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, int i, len = 0; char *search; - if (glob && (strcmp(glob, "*") || !strlen(glob))) + if (glob && (strcmp(glob, "*") == 0 || !strlen(glob))) glob = NULL; - else { + else if (glob) { int not; - type = ftrace_setup_glob(glob, strlen(glob), &search, ¬); + type = filter_parse_regex(glob, strlen(glob), &search, ¬); len = strlen(search); /* we do not support '!' for function probes */ @@ -2196,11 +2177,10 @@ ftrace_regex_write(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos, int enable) { struct ftrace_iterator *iter; - char ch; - size_t read = 0; - ssize_t ret; + struct trace_parser *parser; + ssize_t ret, read; - if (!cnt || cnt < 0) + if (!cnt) return 0; mutex_lock(&ftrace_regex_lock); @@ -2211,72 +2191,23 @@ ftrace_regex_write(struct file *file, const char __user *ubuf, } else iter = file->private_data; - if (!*ppos) { - iter->flags &= ~FTRACE_ITER_CONT; - iter->buffer_idx = 0; - } - - ret = get_user(ch, ubuf++); - if (ret) - goto out; - read++; - cnt--; + parser = &iter->parser; + read = trace_get_user(parser, ubuf, cnt, ppos); - /* - * If the parser haven't finished with the last write, - * continue reading the user input without skipping spaces. - */ - if (!(iter->flags & FTRACE_ITER_CONT)) { - /* skip white space */ - while (cnt && isspace(ch)) { - ret = get_user(ch, ubuf++); - if (ret) - goto out; - read++; - cnt--; - } - - /* only spaces were written */ - if (isspace(ch)) { - *ppos += read; - ret = read; - goto out; - } - - iter->buffer_idx = 0; - } - - while (cnt && !isspace(ch)) { - if (iter->buffer_idx < FTRACE_BUFF_MAX) - iter->buffer[iter->buffer_idx++] = ch; - else { - ret = -EINVAL; - goto out; - } - ret = get_user(ch, ubuf++); + if (read >= 0 && trace_parser_loaded(parser) && + !trace_parser_cont(parser)) { + ret = ftrace_process_regex(parser->buffer, + parser->idx, enable); if (ret) goto out; - read++; - cnt--; - } - if (isspace(ch)) { - iter->buffer[iter->buffer_idx] = 0; - ret = ftrace_process_regex(iter->buffer, - iter->buffer_idx, enable); - if (ret) - goto out; - iter->buffer_idx = 0; - } else { - iter->flags |= FTRACE_ITER_CONT; - iter->buffer[iter->buffer_idx++] = ch; + trace_parser_clear(parser); } - *ppos += read; ret = read; - out: - mutex_unlock(&ftrace_regex_lock); + mutex_unlock(&ftrace_regex_lock); +out: return ret; } @@ -2343,6 +2274,7 @@ void ftrace_set_notrace(unsigned char *buf, int len, int reset) #define FTRACE_FILTER_SIZE COMMAND_LINE_SIZE static char ftrace_notrace_buf[FTRACE_FILTER_SIZE] __initdata; static char ftrace_filter_buf[FTRACE_FILTER_SIZE] __initdata; +static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata; static int __init set_ftrace_notrace(char *str) { @@ -2358,6 +2290,31 @@ static int __init set_ftrace_filter(char *str) } __setup("ftrace_filter=", set_ftrace_filter); +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +static int __init set_graph_function(char *str) +{ + strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE); + return 1; +} +__setup("ftrace_graph_filter=", set_graph_function); + +static void __init set_ftrace_early_graph(char *buf) +{ + int ret; + char *func; + + while (buf) { + func = strsep(&buf, ","); + /* we allow only one expression at a time */ + ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count, + func); + if (ret) + printk(KERN_DEBUG "ftrace: function %s not " + "traceable\n", func); + } +} +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + static void __init set_ftrace_early_filter(char *buf, int enable) { char *func; @@ -2374,6 +2331,10 @@ static void __init set_ftrace_early_filters(void) set_ftrace_early_filter(ftrace_filter_buf, 1); if (ftrace_notrace_buf[0]) set_ftrace_early_filter(ftrace_notrace_buf, 0); +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + if (ftrace_graph_buf[0]) + set_ftrace_early_graph(ftrace_graph_buf); +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ } static int @@ -2381,6 +2342,7 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable) { struct seq_file *m = (struct seq_file *)file->private_data; struct ftrace_iterator *iter; + struct trace_parser *parser; mutex_lock(&ftrace_regex_lock); if (file->f_mode & FMODE_READ) { @@ -2390,9 +2352,10 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable) } else iter = file->private_data; - if (iter->buffer_idx) { - iter->buffer[iter->buffer_idx] = 0; - ftrace_match_records(iter->buffer, iter->buffer_idx, enable); + parser = &iter->parser; + if (trace_parser_loaded(parser)) { + parser->buffer[parser->idx] = 0; + ftrace_match_records(parser->buffer, parser->idx, enable); } mutex_lock(&ftrace_lock); @@ -2400,7 +2363,9 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable) ftrace_run_update_code(FTRACE_ENABLE_CALLS); mutex_unlock(&ftrace_lock); + trace_parser_put(parser); kfree(iter); + mutex_unlock(&ftrace_regex_lock); return 0; } @@ -2457,11 +2422,9 @@ unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly; static void * __g_next(struct seq_file *m, loff_t *pos) { - unsigned long *array = m->private; - if (*pos >= ftrace_graph_count) return NULL; - return &array[*pos]; + return &ftrace_graph_funcs[*pos]; } static void * @@ -2499,12 +2462,12 @@ static int g_show(struct seq_file *m, void *v) return 0; } - seq_printf(m, "%pf\n", v); + seq_printf(m, "%ps\n", (void *)*ptr); return 0; } -static struct seq_operations ftrace_graph_seq_ops = { +static const struct seq_operations ftrace_graph_seq_ops = { .start = g_start, .next = g_next, .stop = g_stop, @@ -2525,16 +2488,10 @@ ftrace_graph_open(struct inode *inode, struct file *file) ftrace_graph_count = 0; memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs)); } + mutex_unlock(&graph_lock); - if (file->f_mode & FMODE_READ) { + if (file->f_mode & FMODE_READ) ret = seq_open(file, &ftrace_graph_seq_ops); - if (!ret) { - struct seq_file *m = file->private_data; - m->private = ftrace_graph_funcs; - } - } else - file->private_data = ftrace_graph_funcs; - mutex_unlock(&graph_lock); return ret; } @@ -2563,7 +2520,7 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer) return -ENODEV; /* decode regex */ - type = ftrace_setup_glob(buffer, strlen(buffer), &search, ¬); + type = filter_parse_regex(buffer, strlen(buffer), &search, ¬); if (not) return -EINVAL; @@ -2602,12 +2559,8 @@ static ssize_t ftrace_graph_write(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos) { - unsigned char buffer[FTRACE_BUFF_MAX+1]; - unsigned long *array; - size_t read = 0; - ssize_t ret; - int index = 0; - char ch; + struct trace_parser parser; + ssize_t read, ret; if (!cnt || cnt < 0) return 0; @@ -2616,60 +2569,31 @@ ftrace_graph_write(struct file *file, const char __user *ubuf, if (ftrace_graph_count >= FTRACE_GRAPH_MAX_FUNCS) { ret = -EBUSY; - goto out; + goto out_unlock; } - if (file->f_mode & FMODE_READ) { - struct seq_file *m = file->private_data; - array = m->private; - } else - array = file->private_data; - - ret = get_user(ch, ubuf++); - if (ret) - goto out; - read++; - cnt--; - - /* skip white space */ - while (cnt && isspace(ch)) { - ret = get_user(ch, ubuf++); - if (ret) - goto out; - read++; - cnt--; + if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) { + ret = -ENOMEM; + goto out_unlock; } - if (isspace(ch)) { - *ppos += read; - ret = read; - goto out; - } + read = trace_get_user(&parser, ubuf, cnt, ppos); - while (cnt && !isspace(ch)) { - if (index < FTRACE_BUFF_MAX) - buffer[index++] = ch; - else { - ret = -EINVAL; - goto out; - } - ret = get_user(ch, ubuf++); + if (read >= 0 && trace_parser_loaded((&parser))) { + parser.buffer[parser.idx] = 0; + + /* we allow only one expression at a time */ + ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count, + parser.buffer); if (ret) - goto out; - read++; - cnt--; + goto out_free; } - buffer[index] = 0; - - /* we allow only one expression at a time */ - ret = ftrace_set_func(array, &ftrace_graph_count, buffer); - if (ret) - goto out; - - file->f_pos += read; ret = read; - out: + +out_free: + trace_parser_put(&parser); +out_unlock: mutex_unlock(&graph_lock); return ret; @@ -2707,7 +2631,7 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) return 0; } -static int ftrace_convert_nops(struct module *mod, +static int ftrace_process_locs(struct module *mod, unsigned long *start, unsigned long *end) { @@ -2740,19 +2664,17 @@ static int ftrace_convert_nops(struct module *mod, } #ifdef CONFIG_MODULES -void ftrace_release(void *start, void *end) +void ftrace_release_mod(struct module *mod) { struct dyn_ftrace *rec; struct ftrace_page *pg; - unsigned long s = (unsigned long)start; - unsigned long e = (unsigned long)end; - if (ftrace_disabled || !start || start == end) + if (ftrace_disabled) return; mutex_lock(&ftrace_lock); do_for_each_ftrace_rec(pg, rec) { - if ((rec->ip >= s) && (rec->ip < e)) { + if (within_module_core(rec->ip, mod)) { /* * rec->ip is changed in ftrace_free_rec() * It should not between s and e if record was freed. @@ -2769,7 +2691,7 @@ static void ftrace_init_module(struct module *mod, { if (ftrace_disabled || start == end) return; - ftrace_convert_nops(mod, start, end); + ftrace_process_locs(mod, start, end); } static int ftrace_module_notify(struct notifier_block *self, @@ -2784,9 +2706,7 @@ static int ftrace_module_notify(struct notifier_block *self, mod->num_ftrace_callsites); break; case MODULE_STATE_GOING: - ftrace_release(mod->ftrace_callsites, - mod->ftrace_callsites + - mod->num_ftrace_callsites); + ftrace_release_mod(mod); break; } @@ -2832,7 +2752,7 @@ void __init ftrace_init(void) last_ftrace_enabled = ftrace_enabled = 1; - ret = ftrace_convert_nops(NULL, + ret = ftrace_process_locs(NULL, __start_mcount_loc, __stop_mcount_loc); @@ -2865,23 +2785,6 @@ static inline void ftrace_startup_enable(int command) { } # define ftrace_shutdown_sysctl() do { } while (0) #endif /* CONFIG_DYNAMIC_FTRACE */ -static ssize_t -ftrace_pid_read(struct file *file, char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - char buf[64]; - int r; - - if (ftrace_pid_trace == ftrace_swapper_pid) - r = sprintf(buf, "swapper tasks\n"); - else if (ftrace_pid_trace) - r = sprintf(buf, "%u\n", pid_vnr(ftrace_pid_trace)); - else - r = sprintf(buf, "no pid\n"); - - return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); -} - static void clear_ftrace_swapper(void) { struct task_struct *p; @@ -2932,14 +2835,12 @@ static void set_ftrace_pid(struct pid *pid) rcu_read_unlock(); } -static void clear_ftrace_pid_task(struct pid **pid) +static void clear_ftrace_pid_task(struct pid *pid) { - if (*pid == ftrace_swapper_pid) + if (pid == ftrace_swapper_pid) clear_ftrace_swapper(); else - clear_ftrace_pid(*pid); - - *pid = NULL; + clear_ftrace_pid(pid); } static void set_ftrace_pid_task(struct pid *pid) @@ -2950,11 +2851,140 @@ static void set_ftrace_pid_task(struct pid *pid) set_ftrace_pid(pid); } +static int ftrace_pid_add(int p) +{ + struct pid *pid; + struct ftrace_pid *fpid; + int ret = -EINVAL; + + mutex_lock(&ftrace_lock); + + if (!p) + pid = ftrace_swapper_pid; + else + pid = find_get_pid(p); + + if (!pid) + goto out; + + ret = 0; + + list_for_each_entry(fpid, &ftrace_pids, list) + if (fpid->pid == pid) + goto out_put; + + ret = -ENOMEM; + + fpid = kmalloc(sizeof(*fpid), GFP_KERNEL); + if (!fpid) + goto out_put; + + list_add(&fpid->list, &ftrace_pids); + fpid->pid = pid; + + set_ftrace_pid_task(pid); + + ftrace_update_pid_func(); + ftrace_startup_enable(0); + + mutex_unlock(&ftrace_lock); + return 0; + +out_put: + if (pid != ftrace_swapper_pid) + put_pid(pid); + +out: + mutex_unlock(&ftrace_lock); + return ret; +} + +static void ftrace_pid_reset(void) +{ + struct ftrace_pid *fpid, *safe; + + mutex_lock(&ftrace_lock); + list_for_each_entry_safe(fpid, safe, &ftrace_pids, list) { + struct pid *pid = fpid->pid; + + clear_ftrace_pid_task(pid); + + list_del(&fpid->list); + kfree(fpid); + } + + ftrace_update_pid_func(); + ftrace_startup_enable(0); + + mutex_unlock(&ftrace_lock); +} + +static void *fpid_start(struct seq_file *m, loff_t *pos) +{ + mutex_lock(&ftrace_lock); + + if (list_empty(&ftrace_pids) && (!*pos)) + return (void *) 1; + + return seq_list_start(&ftrace_pids, *pos); +} + +static void *fpid_next(struct seq_file *m, void *v, loff_t *pos) +{ + if (v == (void *)1) + return NULL; + + return seq_list_next(v, &ftrace_pids, pos); +} + +static void fpid_stop(struct seq_file *m, void *p) +{ + mutex_unlock(&ftrace_lock); +} + +static int fpid_show(struct seq_file *m, void *v) +{ + const struct ftrace_pid *fpid = list_entry(v, struct ftrace_pid, list); + + if (v == (void *)1) { + seq_printf(m, "no pid\n"); + return 0; + } + + if (fpid->pid == ftrace_swapper_pid) + seq_printf(m, "swapper tasks\n"); + else + seq_printf(m, "%u\n", pid_vnr(fpid->pid)); + + return 0; +} + +static const struct seq_operations ftrace_pid_sops = { + .start = fpid_start, + .next = fpid_next, + .stop = fpid_stop, + .show = fpid_show, +}; + +static int +ftrace_pid_open(struct inode *inode, struct file *file) +{ + int ret = 0; + + if ((file->f_mode & FMODE_WRITE) && + (file->f_flags & O_TRUNC)) + ftrace_pid_reset(); + + if (file->f_mode & FMODE_READ) + ret = seq_open(file, &ftrace_pid_sops); + + return ret; +} + static ssize_t ftrace_pid_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - struct pid *pid; char buf[64]; long val; int ret; @@ -2967,57 +2997,38 @@ ftrace_pid_write(struct file *filp, const char __user *ubuf, buf[cnt] = 0; + /* + * Allow "echo > set_ftrace_pid" or "echo -n '' > set_ftrace_pid" + * to clean the filter quietly. + */ + strstrip(buf); + if (strlen(buf) == 0) + return 1; + ret = strict_strtol(buf, 10, &val); if (ret < 0) return ret; - mutex_lock(&ftrace_lock); - if (val < 0) { - /* disable pid tracing */ - if (!ftrace_pid_trace) - goto out; - - clear_ftrace_pid_task(&ftrace_pid_trace); - - } else { - /* swapper task is special */ - if (!val) { - pid = ftrace_swapper_pid; - if (pid == ftrace_pid_trace) - goto out; - } else { - pid = find_get_pid(val); - - if (pid == ftrace_pid_trace) { - put_pid(pid); - goto out; - } - } - - if (ftrace_pid_trace) - clear_ftrace_pid_task(&ftrace_pid_trace); - - if (!pid) - goto out; + ret = ftrace_pid_add(val); - ftrace_pid_trace = pid; - - set_ftrace_pid_task(ftrace_pid_trace); - } - - /* update the function call */ - ftrace_update_pid_func(); - ftrace_startup_enable(0); + return ret ? ret : cnt; +} - out: - mutex_unlock(&ftrace_lock); +static int +ftrace_pid_release(struct inode *inode, struct file *file) +{ + if (file->f_mode & FMODE_READ) + seq_release(inode, file); - return cnt; + return 0; } static const struct file_operations ftrace_pid_fops = { - .read = ftrace_pid_read, - .write = ftrace_pid_write, + .open = ftrace_pid_open, + .write = ftrace_pid_write, + .read = seq_read, + .llseek = seq_lseek, + .release = ftrace_pid_release, }; static __init int ftrace_init_debugfs(void) @@ -3100,7 +3111,7 @@ int unregister_ftrace_function(struct ftrace_ops *ops) int ftrace_enable_sysctl(struct ctl_table *table, int write, - struct file *file, void __user *buffer, size_t *lenp, + void __user *buffer, size_t *lenp, loff_t *ppos) { int ret; @@ -3110,7 +3121,7 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, mutex_lock(&ftrace_lock); - ret = proc_dointvec(table, write, file, buffer, lenp, ppos); + ret = proc_dointvec(table, write, buffer, lenp, ppos); if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled)) goto out; diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c index 81b1645..a91da69 100644 --- a/kernel/trace/kmemtrace.c +++ b/kernel/trace/kmemtrace.c @@ -501,7 +501,7 @@ static int __init init_kmem_tracer(void) return 1; } - if (!register_tracer(&kmem_tracer)) { + if (register_tracer(&kmem_tracer) != 0) { pr_warning("Warning: could not register the kmem tracer\n"); return 1; } diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c new file mode 100644 index 0000000..e06c6e3 --- /dev/null +++ b/kernel/trace/power-traces.c @@ -0,0 +1,20 @@ +/* + * Power trace points + * + * Copyright (C) 2009 Arjan van de Ven <arjan@linux.intel.com> + */ + +#include <linux/string.h> +#include <linux/types.h> +#include <linux/workqueue.h> +#include <linux/sched.h> +#include <linux/module.h> +#include <linux/slab.h> + +#define CREATE_TRACE_POINTS +#include <trace/events/power.h> + +EXPORT_TRACEPOINT_SYMBOL_GPL(power_start); +EXPORT_TRACEPOINT_SYMBOL_GPL(power_end); +EXPORT_TRACEPOINT_SYMBOL_GPL(power_frequency); + diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 454e74e..e43c928 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -201,8 +201,6 @@ int tracing_is_on(void) } EXPORT_SYMBOL_GPL(tracing_is_on); -#include "trace.h" - #define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array)) #define RB_ALIGNMENT 4U #define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) @@ -399,18 +397,21 @@ int ring_buffer_print_page_header(struct trace_seq *s) int ret; ret = trace_seq_printf(s, "\tfield: u64 timestamp;\t" - "offset:0;\tsize:%u;\n", - (unsigned int)sizeof(field.time_stamp)); + "offset:0;\tsize:%u;\tsigned:%u;\n", + (unsigned int)sizeof(field.time_stamp), + (unsigned int)is_signed_type(u64)); ret = trace_seq_printf(s, "\tfield: local_t commit;\t" - "offset:%u;\tsize:%u;\n", + "offset:%u;\tsize:%u;\tsigned:%u;\n", (unsigned int)offsetof(typeof(field), commit), - (unsigned int)sizeof(field.commit)); + (unsigned int)sizeof(field.commit), + (unsigned int)is_signed_type(long)); ret = trace_seq_printf(s, "\tfield: char data;\t" - "offset:%u;\tsize:%u;\n", + "offset:%u;\tsize:%u;\tsigned:%u;\n", (unsigned int)offsetof(typeof(field), data), - (unsigned int)BUF_PAGE_SIZE); + (unsigned int)BUF_PAGE_SIZE, + (unsigned int)is_signed_type(char)); return ret; } @@ -701,8 +702,8 @@ static int rb_head_page_set(struct ring_buffer_per_cpu *cpu_buffer, val &= ~RB_FLAG_MASK; - ret = (unsigned long)cmpxchg(&list->next, - val | old_flag, val | new_flag); + ret = cmpxchg((unsigned long *)&list->next, + val | old_flag, val | new_flag); /* check if the reader took the page */ if ((ret & ~RB_FLAG_MASK) != val) @@ -794,7 +795,7 @@ static int rb_head_page_replace(struct buffer_page *old, val = *ptr & ~RB_FLAG_MASK; val |= RB_PAGE_HEAD; - ret = cmpxchg(ptr, val, &new->list); + ret = cmpxchg(ptr, val, (unsigned long)&new->list); return ret == val; } @@ -2997,15 +2998,12 @@ static void rb_advance_iter(struct ring_buffer_iter *iter) } static struct ring_buffer_event * -rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) +rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts) { - struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_event *event; struct buffer_page *reader; int nr_loops = 0; - cpu_buffer = buffer->buffers[cpu]; - again: /* * We repeat when a timestamp is encountered. It is possible @@ -3049,7 +3047,7 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) case RINGBUF_TYPE_DATA: if (ts) { *ts = cpu_buffer->read_stamp + event->time_delta; - ring_buffer_normalize_time_stamp(buffer, + ring_buffer_normalize_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu, ts); } return event; @@ -3168,7 +3166,7 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) local_irq_save(flags); if (dolock) spin_lock(&cpu_buffer->reader_lock); - event = rb_buffer_peek(buffer, cpu, ts); + event = rb_buffer_peek(cpu_buffer, ts); if (event && event->type_len == RINGBUF_TYPE_PADDING) rb_advance_reader(cpu_buffer); if (dolock) @@ -3237,7 +3235,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts) if (dolock) spin_lock(&cpu_buffer->reader_lock); - event = rb_buffer_peek(buffer, cpu, ts); + event = rb_buffer_peek(cpu_buffer, ts); if (event) rb_advance_reader(cpu_buffer); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 5c75dee..026e715 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -125,19 +125,19 @@ int ftrace_dump_on_oops; static int tracing_set_tracer(const char *buf); -#define BOOTUP_TRACER_SIZE 100 -static char bootup_tracer_buf[BOOTUP_TRACER_SIZE] __initdata; +#define MAX_TRACER_SIZE 100 +static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata; static char *default_bootup_tracer; -static int __init set_ftrace(char *str) +static int __init set_cmdline_ftrace(char *str) { - strncpy(bootup_tracer_buf, str, BOOTUP_TRACER_SIZE); + strncpy(bootup_tracer_buf, str, MAX_TRACER_SIZE); default_bootup_tracer = bootup_tracer_buf; /* We are using ftrace early, expand it */ ring_buffer_expanded = 1; return 1; } -__setup("ftrace=", set_ftrace); +__setup("ftrace=", set_cmdline_ftrace); static int __init set_ftrace_dump_on_oops(char *str) { @@ -242,13 +242,6 @@ static struct tracer *trace_types __read_mostly; static struct tracer *current_trace __read_mostly; /* - * max_tracer_type_len is used to simplify the allocating of - * buffers to read userspace tracer names. We keep track of - * the longest tracer name registered. - */ -static int max_tracer_type_len; - -/* * trace_types_lock is used to protect the trace_types list. * This lock is also used to keep user access serialized. * Accesses from userspace will grab this lock while userspace @@ -275,12 +268,18 @@ static DEFINE_SPINLOCK(tracing_start_lock); */ void trace_wake_up(void) { + int cpu; + + if (trace_flags & TRACE_ITER_BLOCK) + return; /* * The runqueue_is_locked() can fail, but this is the best we * have for now: */ - if (!(trace_flags & TRACE_ITER_BLOCK) && !runqueue_is_locked()) + cpu = get_cpu(); + if (!runqueue_is_locked(cpu)) wake_up(&trace_wait); + put_cpu(); } static int __init set_buf_size(char *str) @@ -339,6 +338,112 @@ static struct { int trace_clock_id; +/* + * trace_parser_get_init - gets the buffer for trace parser + */ +int trace_parser_get_init(struct trace_parser *parser, int size) +{ + memset(parser, 0, sizeof(*parser)); + + parser->buffer = kmalloc(size, GFP_KERNEL); + if (!parser->buffer) + return 1; + + parser->size = size; + return 0; +} + +/* + * trace_parser_put - frees the buffer for trace parser + */ +void trace_parser_put(struct trace_parser *parser) +{ + kfree(parser->buffer); +} + +/* + * trace_get_user - reads the user input string separated by space + * (matched by isspace(ch)) + * + * For each string found the 'struct trace_parser' is updated, + * and the function returns. + * + * Returns number of bytes read. + * + * See kernel/trace/trace.h for 'struct trace_parser' details. + */ +int trace_get_user(struct trace_parser *parser, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char ch; + size_t read = 0; + ssize_t ret; + + if (!*ppos) + trace_parser_clear(parser); + + ret = get_user(ch, ubuf++); + if (ret) + goto out; + + read++; + cnt--; + + /* + * The parser is not finished with the last write, + * continue reading the user input without skipping spaces. + */ + if (!parser->cont) { + /* skip white space */ + while (cnt && isspace(ch)) { + ret = get_user(ch, ubuf++); + if (ret) + goto out; + read++; + cnt--; + } + + /* only spaces were written */ + if (isspace(ch)) { + *ppos += read; + ret = read; + goto out; + } + + parser->idx = 0; + } + + /* read the non-space input */ + while (cnt && !isspace(ch)) { + if (parser->idx < parser->size - 1) + parser->buffer[parser->idx++] = ch; + else { + ret = -EINVAL; + goto out; + } + ret = get_user(ch, ubuf++); + if (ret) + goto out; + read++; + cnt--; + } + + /* We either got finished input or we have to wait for another call. */ + if (isspace(ch)) { + parser->buffer[parser->idx] = 0; + parser->cont = false; + } else { + parser->cont = true; + parser->buffer[parser->idx++] = ch; + } + + *ppos += read; + ret = read; + +out: + return ret; +} + ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt) { int len; @@ -513,7 +618,6 @@ __releases(kernel_lock) __acquires(kernel_lock) { struct tracer *t; - int len; int ret = 0; if (!type->name) { @@ -521,6 +625,11 @@ __acquires(kernel_lock) return -1; } + if (strlen(type->name) > MAX_TRACER_SIZE) { + pr_info("Tracer has a name longer than %d\n", MAX_TRACER_SIZE); + return -1; + } + /* * When this gets called we hold the BKL which means that * preemption is disabled. Various trace selftests however @@ -535,7 +644,7 @@ __acquires(kernel_lock) for (t = trace_types; t; t = t->next) { if (strcmp(type->name, t->name) == 0) { /* already found */ - pr_info("Trace %s already registered\n", + pr_info("Tracer %s already registered\n", type->name); ret = -1; goto out; @@ -586,9 +695,6 @@ __acquires(kernel_lock) type->next = trace_types; trace_types = type; - len = strlen(type->name); - if (len > max_tracer_type_len) - max_tracer_type_len = len; out: tracing_selftest_running = false; @@ -597,7 +703,7 @@ __acquires(kernel_lock) if (ret || !default_bootup_tracer) goto out_unlock; - if (strncmp(default_bootup_tracer, type->name, BOOTUP_TRACER_SIZE)) + if (strncmp(default_bootup_tracer, type->name, MAX_TRACER_SIZE)) goto out_unlock; printk(KERN_INFO "Starting tracer '%s'\n", type->name); @@ -619,14 +725,13 @@ __acquires(kernel_lock) void unregister_tracer(struct tracer *type) { struct tracer **t; - int len; mutex_lock(&trace_types_lock); for (t = &trace_types; *t; t = &(*t)->next) { if (*t == type) goto found; } - pr_info("Trace %s not registered\n", type->name); + pr_info("Tracer %s not registered\n", type->name); goto out; found: @@ -639,17 +744,7 @@ void unregister_tracer(struct tracer *type) current_trace->stop(&global_trace); current_trace = &nop_trace; } - - if (strlen(type->name) != max_tracer_type_len) - goto out; - - max_tracer_type_len = 0; - for (t = &trace_types; *t; t = &(*t)->next) { - len = strlen((*t)->name); - if (len > max_tracer_type_len) - max_tracer_type_len = len; - } - out: +out: mutex_unlock(&trace_types_lock); } @@ -719,6 +814,11 @@ static void trace_init_cmdlines(void) cmdline_idx = 0; } +int is_tracing_stopped(void) +{ + return trace_stop_count; +} + /** * ftrace_off_permanent - disable all ftrace code permanently * @@ -886,7 +986,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, entry->preempt_count = pc & 0xff; entry->pid = (tsk) ? tsk->pid : 0; - entry->tgid = (tsk) ? tsk->tgid : 0; + entry->lock_depth = (tsk) ? tsk->lock_depth : 0; entry->flags = #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | @@ -1068,6 +1168,7 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) return; entry = ring_buffer_event_data(event); + entry->tgid = current->tgid; memset(&entry->caller, 0, sizeof(entry->caller)); trace.nr_entries = 0; @@ -1094,6 +1195,7 @@ ftrace_trace_special(void *__tr, unsigned long arg1, unsigned long arg2, unsigned long arg3, int pc) { + struct ftrace_event_call *call = &event_special; struct ring_buffer_event *event; struct trace_array *tr = __tr; struct ring_buffer *buffer = tr->buffer; @@ -1107,7 +1209,9 @@ ftrace_trace_special(void *__tr, entry->arg1 = arg1; entry->arg2 = arg2; entry->arg3 = arg3; - trace_buffer_unlock_commit(buffer, event, 0, pc); + + if (!filter_check_discard(call, entry, buffer, event)) + trace_buffer_unlock_commit(buffer, event, 0, pc); } void @@ -1289,7 +1393,7 @@ int trace_array_vprintk(struct trace_array *tr, int trace_vprintk(unsigned long ip, const char *fmt, va_list args) { - return trace_array_printk(&global_trace, ip, fmt, args); + return trace_array_vprintk(&global_trace, ip, fmt, args); } EXPORT_SYMBOL_GPL(trace_vprintk); @@ -1530,10 +1634,10 @@ static void print_lat_help_header(struct seq_file *m) seq_puts(m, "# | / _----=> need-resched \n"); seq_puts(m, "# || / _---=> hardirq/softirq \n"); seq_puts(m, "# ||| / _--=> preempt-depth \n"); - seq_puts(m, "# |||| / \n"); - seq_puts(m, "# ||||| delay \n"); - seq_puts(m, "# cmd pid ||||| time | caller \n"); - seq_puts(m, "# \\ / ||||| \\ | / \n"); + seq_puts(m, "# |||| /_--=> lock-depth \n"); + seq_puts(m, "# |||||/ delay \n"); + seq_puts(m, "# cmd pid |||||| time | caller \n"); + seq_puts(m, "# \\ / |||||| \\ | / \n"); } static void print_func_help_header(struct seq_file *m) @@ -1845,7 +1949,7 @@ static int s_show(struct seq_file *m, void *v) return 0; } -static struct seq_operations tracer_seq_ops = { +static const struct seq_operations tracer_seq_ops = { .start = s_start, .next = s_next, .stop = s_stop, @@ -1880,11 +1984,9 @@ __tracing_open(struct inode *inode, struct file *file) if (current_trace) *iter->trace = *current_trace; - if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) + if (!zalloc_cpumask_var(&iter->started, GFP_KERNEL)) goto fail; - cpumask_clear(iter->started); - if (current_trace && current_trace->print_max) iter->tr = &max_tr; else @@ -2059,7 +2161,7 @@ static int t_show(struct seq_file *m, void *v) return 0; } -static struct seq_operations show_traces_seq_ops = { +static const struct seq_operations show_traces_seq_ops = { .start = t_start, .next = t_next, .stop = t_stop, @@ -2489,7 +2591,7 @@ static ssize_t tracing_set_trace_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { - char buf[max_tracer_type_len+2]; + char buf[MAX_TRACER_SIZE+2]; int r; mutex_lock(&trace_types_lock); @@ -2639,15 +2741,15 @@ static ssize_t tracing_set_trace_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - char buf[max_tracer_type_len+1]; + char buf[MAX_TRACER_SIZE+1]; int i; size_t ret; int err; ret = cnt; - if (cnt > max_tracer_type_len) - cnt = max_tracer_type_len; + if (cnt > MAX_TRACER_SIZE) + cnt = MAX_TRACER_SIZE; if (copy_from_user(&buf, ubuf, cnt)) return -EFAULT; @@ -4285,7 +4387,7 @@ __init static int tracer_alloc_buffers(void) if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL)) goto out_free_buffer_mask; - if (!alloc_cpumask_var(&tracing_reader_cpumask, GFP_KERNEL)) + if (!zalloc_cpumask_var(&tracing_reader_cpumask, GFP_KERNEL)) goto out_free_tracing_cpumask; /* To save memory, keep the ring buffer size to its minimum */ @@ -4296,7 +4398,6 @@ __init static int tracer_alloc_buffers(void) cpumask_copy(tracing_buffer_mask, cpu_possible_mask); cpumask_copy(tracing_cpumask, cpu_all_mask); - cpumask_clear(tracing_reader_cpumask); /* TODO: make the number of buffers hot pluggable with CPUS */ global_trace.buffer = ring_buffer_alloc(ring_buf_size, diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index ea7e0bc..91c3d0e 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -7,10 +7,10 @@ #include <linux/clocksource.h> #include <linux/ring_buffer.h> #include <linux/mmiotrace.h> +#include <linux/tracepoint.h> #include <linux/ftrace.h> #include <trace/boot.h> #include <linux/kmemtrace.h> -#include <trace/power.h> #include <linux/trace_seq.h> #include <linux/ftrace_event.h> @@ -40,164 +40,60 @@ enum trace_type { TRACE_HW_BRANCHES, TRACE_KMEM_ALLOC, TRACE_KMEM_FREE, - TRACE_POWER, TRACE_BLK, TRACE_KSYM, __TRACE_LAST_TYPE, }; -/* - * Function trace entry - function address and parent function addres: - */ -struct ftrace_entry { - struct trace_entry ent; - unsigned long ip; - unsigned long parent_ip; -}; - -/* Function call entry */ -struct ftrace_graph_ent_entry { - struct trace_entry ent; - struct ftrace_graph_ent graph_ent; +enum kmemtrace_type_id { + KMEMTRACE_TYPE_KMALLOC = 0, /* kmalloc() or kfree(). */ + KMEMTRACE_TYPE_CACHE, /* kmem_cache_*(). */ + KMEMTRACE_TYPE_PAGES, /* __get_free_pages() and friends. */ }; -/* Function return entry */ -struct ftrace_graph_ret_entry { - struct trace_entry ent; - struct ftrace_graph_ret ret; -}; extern struct tracer boot_tracer; -/* - * Context switch trace entry - which task (and prio) we switched from/to: - */ -struct ctx_switch_entry { - struct trace_entry ent; - unsigned int prev_pid; - unsigned char prev_prio; - unsigned char prev_state; - unsigned int next_pid; - unsigned char next_prio; - unsigned char next_state; - unsigned int next_cpu; -}; - -/* - * Special (free-form) trace entry: - */ -struct special_entry { - struct trace_entry ent; - unsigned long arg1; - unsigned long arg2; - unsigned long arg3; -}; - -/* - * Stack-trace entry: - */ - -#define FTRACE_STACK_ENTRIES 8 - -struct stack_entry { - struct trace_entry ent; - unsigned long caller[FTRACE_STACK_ENTRIES]; -}; - -struct userstack_entry { - struct trace_entry ent; - unsigned long caller[FTRACE_STACK_ENTRIES]; -}; - -/* - * trace_printk entry: - */ -struct bprint_entry { - struct trace_entry ent; - unsigned long ip; - const char *fmt; - u32 buf[]; -}; - -struct print_entry { - struct trace_entry ent; - unsigned long ip; - char buf[]; -}; - -#define TRACE_OLD_SIZE 88 - -struct trace_field_cont { - unsigned char type; - /* Temporary till we get rid of this completely */ - char buf[TRACE_OLD_SIZE - 1]; -}; +#undef __field +#define __field(type, item) type item; -struct trace_mmiotrace_rw { - struct trace_entry ent; - struct mmiotrace_rw rw; -}; +#undef __field_struct +#define __field_struct(type, item) __field(type, item) -struct trace_mmiotrace_map { - struct trace_entry ent; - struct mmiotrace_map map; -}; +#undef __field_desc +#define __field_desc(type, container, item) -struct trace_boot_call { - struct trace_entry ent; - struct boot_trace_call boot_call; -}; +#undef __array +#define __array(type, item, size) type item[size]; -struct trace_boot_ret { - struct trace_entry ent; - struct boot_trace_ret boot_ret; -}; +#undef __array_desc +#define __array_desc(type, container, item, size) -#define TRACE_FUNC_SIZE 30 -#define TRACE_FILE_SIZE 20 -struct trace_branch { - struct trace_entry ent; - unsigned line; - char func[TRACE_FUNC_SIZE+1]; - char file[TRACE_FILE_SIZE+1]; - char correct; -}; +#undef __dynamic_array +#define __dynamic_array(type, item) type item[]; -struct hw_branch_entry { - struct trace_entry ent; - u64 from; - u64 to; -}; +#undef F_STRUCT +#define F_STRUCT(args...) args -struct trace_power { - struct trace_entry ent; - struct power_trace state_data; -}; +#undef FTRACE_ENTRY +#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ + struct struct_name { \ + struct trace_entry ent; \ + tstruct \ + } -enum kmemtrace_type_id { - KMEMTRACE_TYPE_KMALLOC = 0, /* kmalloc() or kfree(). */ - KMEMTRACE_TYPE_CACHE, /* kmem_cache_*(). */ - KMEMTRACE_TYPE_PAGES, /* __get_free_pages() and friends. */ -}; +#undef TP_ARGS +#define TP_ARGS(args...) args -struct kmemtrace_alloc_entry { - struct trace_entry ent; - enum kmemtrace_type_id type_id; - unsigned long call_site; - const void *ptr; - size_t bytes_req; - size_t bytes_alloc; - gfp_t gfp_flags; - int node; -}; +#undef FTRACE_ENTRY_DUP +#define FTRACE_ENTRY_DUP(name, name_struct, id, tstruct, printk) -struct kmemtrace_free_entry { - struct trace_entry ent; - enum kmemtrace_type_id type_id; - unsigned long call_site; - const void *ptr; -}; +#include "trace_entries.h" +/* + * syscalls are special, and need special handling, this is why + * they are not included in trace_entries.h + */ struct syscall_trace_enter { struct trace_entry ent; int nr; @@ -210,23 +106,12 @@ struct syscall_trace_exit { unsigned long ret; }; -#define KSYM_SELFTEST_ENTRY "ksym_selftest_dummy" -extern int process_new_ksym_entry(char *ksymname, int op, unsigned long addr); - -struct ksym_trace_entry { - struct trace_entry ent; - unsigned long ip; - unsigned char type; - char ksym_name[KSYM_NAME_LEN]; - char cmd[TASK_COMM_LEN]; -}; - /* * trace_flag_type is an enumeration that holds different * states when a trace occurs. These are: * IRQS_OFF - interrupts were disabled * IRQS_NOSUPPORT - arch does not support irqs_disabled_flags - * NEED_RESCED - reschedule is requested + * NEED_RESCHED - reschedule is requested * HARDIRQ - inside an interrupt handler * SOFTIRQ - inside a softirq handler */ @@ -325,7 +210,6 @@ extern void __ftrace_bad_type(void); IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \ TRACE_GRAPH_RET); \ IF_ASSIGN(var, ent, struct hw_branch_entry, TRACE_HW_BRANCHES);\ - IF_ASSIGN(var, ent, struct trace_power, TRACE_POWER); \ IF_ASSIGN(var, ent, struct kmemtrace_alloc_entry, \ TRACE_KMEM_ALLOC); \ IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \ @@ -406,7 +290,6 @@ struct tracer { struct tracer *next; int print_max; struct tracer_flags *flags; - struct tracer_stat *stats; }; @@ -485,6 +368,10 @@ void tracing_stop_sched_switch_record(void); void tracing_start_sched_switch_record(void); int register_tracer(struct tracer *type); void unregister_tracer(struct tracer *type); +int is_tracing_stopped(void); + +#define KSYM_SELFTEST_ENTRY "ksym_selftest_dummy" +extern int process_new_ksym_entry(char *ksymname, int op, unsigned long addr); extern unsigned long nsecs_to_usecs(unsigned long nsecs); @@ -525,20 +412,6 @@ static inline void __trace_stack(struct trace_array *tr, unsigned long flags, extern cycle_t ftrace_now(int cpu); -#ifdef CONFIG_CONTEXT_SWITCH_TRACER -typedef void -(*tracer_switch_func_t)(void *private, - void *__rq, - struct task_struct *prev, - struct task_struct *next); - -struct tracer_switch_ops { - tracer_switch_func_t func; - void *private; - struct tracer_switch_ops *next; -}; -#endif /* CONFIG_CONTEXT_SWITCH_TRACER */ - extern void trace_find_cmdline(int pid, char comm[]); #ifdef CONFIG_DYNAMIC_FTRACE @@ -621,10 +494,6 @@ static inline int ftrace_graph_addr(unsigned long addr) return 0; } #else -static inline int ftrace_trace_addr(unsigned long addr) -{ - return 1; -} static inline int ftrace_graph_addr(unsigned long addr) { return 1; @@ -638,12 +507,12 @@ print_graph_function(struct trace_iterator *iter) } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ -extern struct pid *ftrace_pid_trace; +extern struct list_head ftrace_pids; #ifdef CONFIG_FUNCTION_TRACER static inline int ftrace_trace_task(struct task_struct *task) { - if (!ftrace_pid_trace) + if (list_empty(&ftrace_pids)) return 1; return test_tsk_trace_trace(task); @@ -656,6 +525,41 @@ static inline int ftrace_trace_task(struct task_struct *task) #endif /* + * struct trace_parser - servers for reading the user input separated by spaces + * @cont: set if the input is not complete - no final space char was found + * @buffer: holds the parsed user input + * @idx: user input lenght + * @size: buffer size + */ +struct trace_parser { + bool cont; + char *buffer; + unsigned idx; + unsigned size; +}; + +static inline bool trace_parser_loaded(struct trace_parser *parser) +{ + return (parser->idx != 0); +} + +static inline bool trace_parser_cont(struct trace_parser *parser) +{ + return parser->cont; +} + +static inline void trace_parser_clear(struct trace_parser *parser) +{ + parser->cont = false; + parser->idx = 0; +} + +extern int trace_parser_get_init(struct trace_parser *parser, int size); +extern void trace_parser_put(struct trace_parser *parser); +extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf, + size_t cnt, loff_t *ppos); + +/* * trace_iterator_flags is an enumeration that defines bit * positions into trace_flags that controls the output. * @@ -790,7 +694,6 @@ struct event_filter { int n_preds; struct filter_pred **preds; char *filter_string; - bool no_reset; }; struct event_subsystem { @@ -802,22 +705,40 @@ struct event_subsystem { }; struct filter_pred; +struct regex; typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event, int val1, int val2); -struct filter_pred { - filter_pred_fn_t fn; - u64 val; - char str_val[MAX_FILTER_STR_VAL]; - int str_len; - char *field_name; - int offset; - int not; - int op; - int pop_n; +typedef int (*regex_match_func)(char *str, struct regex *r, int len); + +enum regex_type { + MATCH_FULL = 0, + MATCH_FRONT_ONLY, + MATCH_MIDDLE_ONLY, + MATCH_END_ONLY, +}; + +struct regex { + char pattern[MAX_FILTER_STR_VAL]; + int len; + int field_len; + regex_match_func match; }; +struct filter_pred { + filter_pred_fn_t fn; + u64 val; + struct regex regex; + char *field_name; + int offset; + int not; + int op; + int pop_n; +}; + +extern enum regex_type +filter_parse_regex(char *buff, int len, char **search, int *not); extern void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s); extern int apply_event_filter(struct ftrace_event_call *call, @@ -833,7 +754,8 @@ filter_check_discard(struct ftrace_event_call *call, void *rec, struct ring_buffer *buffer, struct ring_buffer_event *event) { - if (unlikely(call->filter_active) && !filter_match_preds(call, rec)) { + if (unlikely(call->filter_active) && + !filter_match_preds(call->filter, rec)) { ring_buffer_discard_commit(buffer, event); return 1; } @@ -841,58 +763,18 @@ filter_check_discard(struct ftrace_event_call *call, void *rec, return 0; } -#define DEFINE_COMPARISON_PRED(type) \ -static int filter_pred_##type(struct filter_pred *pred, void *event, \ - int val1, int val2) \ -{ \ - type *addr = (type *)(event + pred->offset); \ - type val = (type)pred->val; \ - int match = 0; \ - \ - switch (pred->op) { \ - case OP_LT: \ - match = (*addr < val); \ - break; \ - case OP_LE: \ - match = (*addr <= val); \ - break; \ - case OP_GT: \ - match = (*addr > val); \ - break; \ - case OP_GE: \ - match = (*addr >= val); \ - break; \ - default: \ - break; \ - } \ - \ - return match; \ -} - -#define DEFINE_EQUALITY_PRED(size) \ -static int filter_pred_##size(struct filter_pred *pred, void *event, \ - int val1, int val2) \ -{ \ - u##size *addr = (u##size *)(event + pred->offset); \ - u##size val = (u##size)pred->val; \ - int match; \ - \ - match = (val == *addr) ^ pred->not; \ - \ - return match; \ -} - extern struct mutex event_mutex; extern struct list_head ftrace_events; extern const char *__start___trace_bprintk_fmt[]; extern const char *__stop___trace_bprintk_fmt[]; -#undef TRACE_EVENT_FORMAT -#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ +#undef FTRACE_ENTRY +#define FTRACE_ENTRY(call, struct_name, id, tstruct, print) \ extern struct ftrace_event_call event_##call; -#undef TRACE_EVENT_FORMAT_NOFILTER -#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, tpfmt) -#include "trace_event_types.h" +#undef FTRACE_ENTRY_DUP +#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print) \ + FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print)) +#include "trace_entries.h" #endif /* _LINUX_KERNEL_TRACE_H */ diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index 19bfc75..c21d5f3 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -129,6 +129,7 @@ struct tracer boot_tracer __read_mostly = void trace_boot_call(struct boot_trace_call *bt, initcall_t fn) { + struct ftrace_event_call *call = &event_boot_call; struct ring_buffer_event *event; struct ring_buffer *buffer; struct trace_boot_call *entry; @@ -150,13 +151,15 @@ void trace_boot_call(struct boot_trace_call *bt, initcall_t fn) goto out; entry = ring_buffer_event_data(event); entry->boot_call = *bt; - trace_buffer_unlock_commit(buffer, event, 0, 0); + if (!filter_check_discard(call, entry, buffer, event)) + trace_buffer_unlock_commit(buffer, event, 0, 0); out: preempt_enable(); } void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn) { + struct ftrace_event_call *call = &event_boot_ret; struct ring_buffer_event *event; struct ring_buffer *buffer; struct trace_boot_ret *entry; @@ -175,7 +178,8 @@ void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn) goto out; entry = ring_buffer_event_data(event); entry->boot_ret = *bt; - trace_buffer_unlock_commit(buffer, event, 0, 0); + if (!filter_check_discard(call, entry, buffer, event)) + trace_buffer_unlock_commit(buffer, event, 0, 0); out: preempt_enable(); } diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 7a7a9fd..4a194f0 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -34,6 +34,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) struct trace_array *tr = branch_tracer; struct ring_buffer_event *event; struct trace_branch *entry; + struct ring_buffer *buffer; unsigned long flags; int cpu, pc; const char *p; @@ -54,7 +55,8 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) goto out; pc = preempt_count(); - event = trace_buffer_lock_reserve(tr, TRACE_BRANCH, + buffer = tr->buffer; + event = trace_buffer_lock_reserve(buffer, TRACE_BRANCH, sizeof(*entry), flags, pc); if (!event) goto out; @@ -74,8 +76,8 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) entry->line = f->line; entry->correct = val == expect; - if (!filter_check_discard(call, entry, tr->buffer, event)) - ring_buffer_unlock_commit(tr->buffer, event); + if (!filter_check_discard(call, entry, buffer, event)) + ring_buffer_unlock_commit(buffer, event); out: atomic_dec(&tr->data[cpu]->disabled); diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index b588fd8..20c5f92 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c @@ -66,10 +66,14 @@ u64 notrace trace_clock(void) * Used by plugins that need globally coherent timestamps. */ -static u64 prev_trace_clock_time; - -static raw_spinlock_t trace_clock_lock ____cacheline_aligned_in_smp = - (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; +/* keep prev_time and lock in the same cacheline. */ +static struct { + u64 prev_time; + raw_spinlock_t lock; +} trace_clock_struct ____cacheline_aligned_in_smp = + { + .lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED, + }; u64 notrace trace_clock_global(void) { @@ -88,19 +92,19 @@ u64 notrace trace_clock_global(void) if (unlikely(in_nmi())) goto out; - __raw_spin_lock(&trace_clock_lock); + __raw_spin_lock(&trace_clock_struct.lock); /* * TODO: if this happens often then maybe we should reset - * my_scd->clock to prev_trace_clock_time+1, to make sure + * my_scd->clock to prev_time+1, to make sure * we start ticking with the local clock from now on? */ - if ((s64)(now - prev_trace_clock_time) < 0) - now = prev_trace_clock_time + 1; + if ((s64)(now - trace_clock_struct.prev_time) < 0) + now = trace_clock_struct.prev_time + 1; - prev_trace_clock_time = now; + trace_clock_struct.prev_time = now; - __raw_spin_unlock(&trace_clock_lock); + __raw_spin_unlock(&trace_clock_struct.lock); out: raw_local_irq_restore(flags); diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h new file mode 100644 index 0000000..e19747d --- /dev/null +++ b/kernel/trace/trace_entries.h @@ -0,0 +1,382 @@ +/* + * This file defines the trace event structures that go into the ring + * buffer directly. They are created via macros so that changes for them + * appear in the format file. Using macros will automate this process. + * + * The macro used to create a ftrace data structure is: + * + * FTRACE_ENTRY( name, struct_name, id, structure, print ) + * + * @name: the name used the event name, as well as the name of + * the directory that holds the format file. + * + * @struct_name: the name of the structure that is created. + * + * @id: The event identifier that is used to detect what event + * this is from the ring buffer. + * + * @structure: the structure layout + * + * - __field( type, item ) + * This is equivalent to declaring + * type item; + * in the structure. + * - __array( type, item, size ) + * This is equivalent to declaring + * type item[size]; + * in the structure. + * + * * for structures within structures, the format of the internal + * structure is layed out. This allows the internal structure + * to be deciphered for the format file. Although these macros + * may become out of sync with the internal structure, they + * will create a compile error if it happens. Since the + * internel structures are just tracing helpers, this is not + * an issue. + * + * When an internal structure is used, it should use: + * + * __field_struct( type, item ) + * + * instead of __field. This will prevent it from being shown in + * the output file. The fields in the structure should use. + * + * __field_desc( type, container, item ) + * __array_desc( type, container, item, len ) + * + * type, item and len are the same as __field and __array, but + * container is added. This is the name of the item in + * __field_struct that this is describing. + * + * + * @print: the print format shown to users in the format file. + */ + +/* + * Function trace entry - function address and parent function addres: + */ +FTRACE_ENTRY(function, ftrace_entry, + + TRACE_FN, + + F_STRUCT( + __field( unsigned long, ip ) + __field( unsigned long, parent_ip ) + ), + + F_printk(" %lx <-- %lx", __entry->ip, __entry->parent_ip) +); + +/* Function call entry */ +FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry, + + TRACE_GRAPH_ENT, + + F_STRUCT( + __field_struct( struct ftrace_graph_ent, graph_ent ) + __field_desc( unsigned long, graph_ent, func ) + __field_desc( int, graph_ent, depth ) + ), + + F_printk("--> %lx (%d)", __entry->func, __entry->depth) +); + +/* Function return entry */ +FTRACE_ENTRY(funcgraph_exit, ftrace_graph_ret_entry, + + TRACE_GRAPH_RET, + + F_STRUCT( + __field_struct( struct ftrace_graph_ret, ret ) + __field_desc( unsigned long, ret, func ) + __field_desc( unsigned long long, ret, calltime) + __field_desc( unsigned long long, ret, rettime ) + __field_desc( unsigned long, ret, overrun ) + __field_desc( int, ret, depth ) + ), + + F_printk("<-- %lx (%d) (start: %llx end: %llx) over: %d", + __entry->func, __entry->depth, + __entry->calltime, __entry->rettime, + __entry->depth) +); + +/* + * Context switch trace entry - which task (and prio) we switched from/to: + * + * This is used for both wakeup and context switches. We only want + * to create one structure, but we need two outputs for it. + */ +#define FTRACE_CTX_FIELDS \ + __field( unsigned int, prev_pid ) \ + __field( unsigned char, prev_prio ) \ + __field( unsigned char, prev_state ) \ + __field( unsigned int, next_pid ) \ + __field( unsigned char, next_prio ) \ + __field( unsigned char, next_state ) \ + __field( unsigned int, next_cpu ) + +FTRACE_ENTRY(context_switch, ctx_switch_entry, + + TRACE_CTX, + + F_STRUCT( + FTRACE_CTX_FIELDS + ), + + F_printk("%u:%u:%u ==> %u:%u:%u [%03u]", + __entry->prev_pid, __entry->prev_prio, __entry->prev_state, + __entry->next_pid, __entry->next_prio, __entry->next_state, + __entry->next_cpu + ) +); + +/* + * FTRACE_ENTRY_DUP only creates the format file, it will not + * create another structure. + */ +FTRACE_ENTRY_DUP(wakeup, ctx_switch_entry, + + TRACE_WAKE, + + F_STRUCT( + FTRACE_CTX_FIELDS + ), + + F_printk("%u:%u:%u ==+ %u:%u:%u [%03u]", + __entry->prev_pid, __entry->prev_prio, __entry->prev_state, + __entry->next_pid, __entry->next_prio, __entry->next_state, + __entry->next_cpu + ) +); + +/* + * Special (free-form) trace entry: + */ +FTRACE_ENTRY(special, special_entry, + + TRACE_SPECIAL, + + F_STRUCT( + __field( unsigned long, arg1 ) + __field( unsigned long, arg2 ) + __field( unsigned long, arg3 ) + ), + + F_printk("(%08lx) (%08lx) (%08lx)", + __entry->arg1, __entry->arg2, __entry->arg3) +); + +/* + * Stack-trace entry: + */ + +#define FTRACE_STACK_ENTRIES 8 + +FTRACE_ENTRY(kernel_stack, stack_entry, + + TRACE_STACK, + + F_STRUCT( + __array( unsigned long, caller, FTRACE_STACK_ENTRIES ) + ), + + F_printk("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n" + "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n", + __entry->caller[0], __entry->caller[1], __entry->caller[2], + __entry->caller[3], __entry->caller[4], __entry->caller[5], + __entry->caller[6], __entry->caller[7]) +); + +FTRACE_ENTRY(user_stack, userstack_entry, + + TRACE_USER_STACK, + + F_STRUCT( + __field( unsigned int, tgid ) + __array( unsigned long, caller, FTRACE_STACK_ENTRIES ) + ), + + F_printk("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n" + "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n", + __entry->caller[0], __entry->caller[1], __entry->caller[2], + __entry->caller[3], __entry->caller[4], __entry->caller[5], + __entry->caller[6], __entry->caller[7]) +); + +/* + * trace_printk entry: + */ +FTRACE_ENTRY(bprint, bprint_entry, + + TRACE_BPRINT, + + F_STRUCT( + __field( unsigned long, ip ) + __field( const char *, fmt ) + __dynamic_array( u32, buf ) + ), + + F_printk("%08lx fmt:%p", + __entry->ip, __entry->fmt) +); + +FTRACE_ENTRY(print, print_entry, + + TRACE_PRINT, + + F_STRUCT( + __field( unsigned long, ip ) + __dynamic_array( char, buf ) + ), + + F_printk("%08lx %s", + __entry->ip, __entry->buf) +); + +FTRACE_ENTRY(mmiotrace_rw, trace_mmiotrace_rw, + + TRACE_MMIO_RW, + + F_STRUCT( + __field_struct( struct mmiotrace_rw, rw ) + __field_desc( resource_size_t, rw, phys ) + __field_desc( unsigned long, rw, value ) + __field_desc( unsigned long, rw, pc ) + __field_desc( int, rw, map_id ) + __field_desc( unsigned char, rw, opcode ) + __field_desc( unsigned char, rw, width ) + ), + + F_printk("%lx %lx %lx %d %x %x", + (unsigned long)__entry->phys, __entry->value, __entry->pc, + __entry->map_id, __entry->opcode, __entry->width) +); + +FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map, + + TRACE_MMIO_MAP, + + F_STRUCT( + __field_struct( struct mmiotrace_map, map ) + __field_desc( resource_size_t, map, phys ) + __field_desc( unsigned long, map, virt ) + __field_desc( unsigned long, map, len ) + __field_desc( int, map, map_id ) + __field_desc( unsigned char, map, opcode ) + ), + + F_printk("%lx %lx %lx %d %x", + (unsigned long)__entry->phys, __entry->virt, __entry->len, + __entry->map_id, __entry->opcode) +); + +FTRACE_ENTRY(boot_call, trace_boot_call, + + TRACE_BOOT_CALL, + + F_STRUCT( + __field_struct( struct boot_trace_call, boot_call ) + __field_desc( pid_t, boot_call, caller ) + __array_desc( char, boot_call, func, KSYM_SYMBOL_LEN) + ), + + F_printk("%d %s", __entry->caller, __entry->func) +); + +FTRACE_ENTRY(boot_ret, trace_boot_ret, + + TRACE_BOOT_RET, + + F_STRUCT( + __field_struct( struct boot_trace_ret, boot_ret ) + __array_desc( char, boot_ret, func, KSYM_SYMBOL_LEN) + __field_desc( int, boot_ret, result ) + __field_desc( unsigned long, boot_ret, duration ) + ), + + F_printk("%s %d %lx", + __entry->func, __entry->result, __entry->duration) +); + +#define TRACE_FUNC_SIZE 30 +#define TRACE_FILE_SIZE 20 + +FTRACE_ENTRY(branch, trace_branch, + + TRACE_BRANCH, + + F_STRUCT( + __field( unsigned int, line ) + __array( char, func, TRACE_FUNC_SIZE+1 ) + __array( char, file, TRACE_FILE_SIZE+1 ) + __field( char, correct ) + ), + + F_printk("%u:%s:%s (%u)", + __entry->line, + __entry->func, __entry->file, __entry->correct) +); + +FTRACE_ENTRY(hw_branch, hw_branch_entry, + + TRACE_HW_BRANCHES, + + F_STRUCT( + __field( u64, from ) + __field( u64, to ) + ), + + F_printk("from: %llx to: %llx", __entry->from, __entry->to) +); + +FTRACE_ENTRY(kmem_alloc, kmemtrace_alloc_entry, + + TRACE_KMEM_ALLOC, + + F_STRUCT( + __field( enum kmemtrace_type_id, type_id ) + __field( unsigned long, call_site ) + __field( const void *, ptr ) + __field( size_t, bytes_req ) + __field( size_t, bytes_alloc ) + __field( gfp_t, gfp_flags ) + __field( int, node ) + ), + + F_printk("type:%u call_site:%lx ptr:%p req:%zi alloc:%zi" + " flags:%x node:%d", + __entry->type_id, __entry->call_site, __entry->ptr, + __entry->bytes_req, __entry->bytes_alloc, + __entry->gfp_flags, __entry->node) +); + +FTRACE_ENTRY(kmem_free, kmemtrace_free_entry, + + TRACE_KMEM_FREE, + + F_STRUCT( + __field( enum kmemtrace_type_id, type_id ) + __field( unsigned long, call_site ) + __field( const void *, ptr ) + ), + + F_printk("type:%u call_site:%lx ptr:%p", + __entry->type_id, __entry->call_site, __entry->ptr) +); + +FTRACE_ENTRY(ksym_trace, ksym_trace_entry, + + TRACE_KSYM, + + F_STRUCT( + __field( unsigned long, ip ) + __field( unsigned char, type ) + __array( char , ksym_name, KSYM_NAME_LEN ) + __array( char , cmd, TASK_COMM_LEN ) + ), + + F_printk("ip: %pF type: %d ksym_name: %s cmd: %s", + (void *)__entry->ip, (unsigned int)__entry->type, + __entry->ksym_name, __entry->cmd) +); diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c index 11ba5bb..8d5c171 100644 --- a/kernel/trace/trace_event_profile.c +++ b/kernel/trace/trace_event_profile.c @@ -5,8 +5,65 @@ * */ +#include <linux/module.h> #include "trace.h" +/* + * We can't use a size but a type in alloc_percpu() + * So let's create a dummy type that matches the desired size + */ +typedef struct {char buf[FTRACE_MAX_PROFILE_SIZE];} profile_buf_t; + +char *trace_profile_buf; +EXPORT_SYMBOL_GPL(trace_profile_buf); + +char *trace_profile_buf_nmi; +EXPORT_SYMBOL_GPL(trace_profile_buf_nmi); + +/* Count the events in use (per event id, not per instance) */ +static int total_profile_count; + +static int ftrace_profile_enable_event(struct ftrace_event_call *event) +{ + char *buf; + int ret = -ENOMEM; + + if (atomic_inc_return(&event->profile_count)) + return 0; + + if (!total_profile_count) { + buf = (char *)alloc_percpu(profile_buf_t); + if (!buf) + goto fail_buf; + + rcu_assign_pointer(trace_profile_buf, buf); + + buf = (char *)alloc_percpu(profile_buf_t); + if (!buf) + goto fail_buf_nmi; + + rcu_assign_pointer(trace_profile_buf_nmi, buf); + } + + ret = event->profile_enable(); + if (!ret) { + total_profile_count++; + return 0; + } + +fail_buf_nmi: + if (!total_profile_count) { + free_percpu(trace_profile_buf_nmi); + free_percpu(trace_profile_buf); + trace_profile_buf_nmi = NULL; + trace_profile_buf = NULL; + } +fail_buf: + atomic_dec(&event->profile_count); + + return ret; +} + int ftrace_profile_enable(int event_id) { struct ftrace_event_call *event; @@ -14,8 +71,9 @@ int ftrace_profile_enable(int event_id) mutex_lock(&event_mutex); list_for_each_entry(event, &ftrace_events, list) { - if (event->id == event_id && event->profile_enable) { - ret = event->profile_enable(event); + if (event->id == event_id && event->profile_enable && + try_module_get(event->mod)) { + ret = ftrace_profile_enable_event(event); break; } } @@ -24,6 +82,33 @@ int ftrace_profile_enable(int event_id) return ret; } +static void ftrace_profile_disable_event(struct ftrace_event_call *event) +{ + char *buf, *nmi_buf; + + if (!atomic_add_negative(-1, &event->profile_count)) + return; + + event->profile_disable(); + + if (!--total_profile_count) { + buf = trace_profile_buf; + rcu_assign_pointer(trace_profile_buf, NULL); + + nmi_buf = trace_profile_buf_nmi; + rcu_assign_pointer(trace_profile_buf_nmi, NULL); + + /* + * Ensure every events in profiling have finished before + * releasing the buffers + */ + synchronize_sched(); + + free_percpu(buf); + free_percpu(nmi_buf); + } +} + void ftrace_profile_disable(int event_id) { struct ftrace_event_call *event; @@ -31,7 +116,8 @@ void ftrace_profile_disable(int event_id) mutex_lock(&event_mutex); list_for_each_entry(event, &ftrace_events, list) { if (event->id == event_id) { - event->profile_disable(event); + ftrace_profile_disable_event(event); + module_put(event->mod); break; } } diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h deleted file mode 100644 index 6db005e..0000000 --- a/kernel/trace/trace_event_types.h +++ /dev/null @@ -1,178 +0,0 @@ -#undef TRACE_SYSTEM -#define TRACE_SYSTEM ftrace - -/* - * We cheat and use the proto type field as the ID - * and args as the entry type (minus 'struct') - */ -TRACE_EVENT_FORMAT(function, TRACE_FN, ftrace_entry, ignore, - TRACE_STRUCT( - TRACE_FIELD(unsigned long, ip, ip) - TRACE_FIELD(unsigned long, parent_ip, parent_ip) - ), - TP_RAW_FMT(" %lx <-- %lx") -); - -TRACE_EVENT_FORMAT(funcgraph_entry, TRACE_GRAPH_ENT, - ftrace_graph_ent_entry, ignore, - TRACE_STRUCT( - TRACE_FIELD(unsigned long, graph_ent.func, func) - TRACE_FIELD(int, graph_ent.depth, depth) - ), - TP_RAW_FMT("--> %lx (%d)") -); - -TRACE_EVENT_FORMAT(funcgraph_exit, TRACE_GRAPH_RET, - ftrace_graph_ret_entry, ignore, - TRACE_STRUCT( - TRACE_FIELD(unsigned long, ret.func, func) - TRACE_FIELD(unsigned long long, ret.calltime, calltime) - TRACE_FIELD(unsigned long long, ret.rettime, rettime) - TRACE_FIELD(unsigned long, ret.overrun, overrun) - TRACE_FIELD(int, ret.depth, depth) - ), - TP_RAW_FMT("<-- %lx (%d)") -); - -TRACE_EVENT_FORMAT(wakeup, TRACE_WAKE, ctx_switch_entry, ignore, - TRACE_STRUCT( - TRACE_FIELD(unsigned int, prev_pid, prev_pid) - TRACE_FIELD(unsigned char, prev_prio, prev_prio) - TRACE_FIELD(unsigned char, prev_state, prev_state) - TRACE_FIELD(unsigned int, next_pid, next_pid) - TRACE_FIELD(unsigned char, next_prio, next_prio) - TRACE_FIELD(unsigned char, next_state, next_state) - TRACE_FIELD(unsigned int, next_cpu, next_cpu) - ), - TP_RAW_FMT("%u:%u:%u ==+ %u:%u:%u [%03u]") -); - -TRACE_EVENT_FORMAT(context_switch, TRACE_CTX, ctx_switch_entry, ignore, - TRACE_STRUCT( - TRACE_FIELD(unsigned int, prev_pid, prev_pid) - TRACE_FIELD(unsigned char, prev_prio, prev_prio) - TRACE_FIELD(unsigned char, prev_state, prev_state) - TRACE_FIELD(unsigned int, next_pid, next_pid) - TRACE_FIELD(unsigned char, next_prio, next_prio) - TRACE_FIELD(unsigned char, next_state, next_state) - TRACE_FIELD(unsigned int, next_cpu, next_cpu) - ), - TP_RAW_FMT("%u:%u:%u ==+ %u:%u:%u [%03u]") -); - -TRACE_EVENT_FORMAT_NOFILTER(special, TRACE_SPECIAL, special_entry, ignore, - TRACE_STRUCT( - TRACE_FIELD(unsigned long, arg1, arg1) - TRACE_FIELD(unsigned long, arg2, arg2) - TRACE_FIELD(unsigned long, arg3, arg3) - ), - TP_RAW_FMT("(%08lx) (%08lx) (%08lx)") -); - -/* - * Stack-trace entry: - */ - -/* #define FTRACE_STACK_ENTRIES 8 */ - -TRACE_EVENT_FORMAT(kernel_stack, TRACE_STACK, stack_entry, ignore, - TRACE_STRUCT( - TRACE_FIELD(unsigned long, caller[0], stack0) - TRACE_FIELD(unsigned long, caller[1], stack1) - TRACE_FIELD(unsigned long, caller[2], stack2) - TRACE_FIELD(unsigned long, caller[3], stack3) - TRACE_FIELD(unsigned long, caller[4], stack4) - TRACE_FIELD(unsigned long, caller[5], stack5) - TRACE_FIELD(unsigned long, caller[6], stack6) - TRACE_FIELD(unsigned long, caller[7], stack7) - ), - TP_RAW_FMT("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n" - "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n") -); - -TRACE_EVENT_FORMAT(user_stack, TRACE_USER_STACK, userstack_entry, ignore, - TRACE_STRUCT( - TRACE_FIELD(unsigned long, caller[0], stack0) - TRACE_FIELD(unsigned long, caller[1], stack1) - TRACE_FIELD(unsigned long, caller[2], stack2) - TRACE_FIELD(unsigned long, caller[3], stack3) - TRACE_FIELD(unsigned long, caller[4], stack4) - TRACE_FIELD(unsigned long, caller[5], stack5) - TRACE_FIELD(unsigned long, caller[6], stack6) - TRACE_FIELD(unsigned long, caller[7], stack7) - ), - TP_RAW_FMT("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n" - "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n") -); - -TRACE_EVENT_FORMAT(bprint, TRACE_BPRINT, bprint_entry, ignore, - TRACE_STRUCT( - TRACE_FIELD(unsigned long, ip, ip) - TRACE_FIELD(char *, fmt, fmt) - TRACE_FIELD_ZERO_CHAR(buf) - ), - TP_RAW_FMT("%08lx (%d) fmt:%p %s") -); - -TRACE_EVENT_FORMAT(print, TRACE_PRINT, print_entry, ignore, - TRACE_STRUCT( - TRACE_FIELD(unsigned long, ip, ip) - TRACE_FIELD_ZERO_CHAR(buf) - ), - TP_RAW_FMT("%08lx (%d) fmt:%p %s") -); - -TRACE_EVENT_FORMAT(branch, TRACE_BRANCH, trace_branch, ignore, - TRACE_STRUCT( - TRACE_FIELD(unsigned int, line, line) - TRACE_FIELD_SPECIAL(char func[TRACE_FUNC_SIZE+1], func, - TRACE_FUNC_SIZE+1, func) - TRACE_FIELD_SPECIAL(char file[TRACE_FUNC_SIZE+1], file, - TRACE_FUNC_SIZE+1, file) - TRACE_FIELD(char, correct, correct) - ), - TP_RAW_FMT("%u:%s:%s (%u)") -); - -TRACE_EVENT_FORMAT(hw_branch, TRACE_HW_BRANCHES, hw_branch_entry, ignore, - TRACE_STRUCT( - TRACE_FIELD(u64, from, from) - TRACE_FIELD(u64, to, to) - ), - TP_RAW_FMT("from: %llx to: %llx") -); - -TRACE_EVENT_FORMAT(power, TRACE_POWER, trace_power, ignore, - TRACE_STRUCT( - TRACE_FIELD_SIGN(ktime_t, state_data.stamp, stamp, 1) - TRACE_FIELD_SIGN(ktime_t, state_data.end, end, 1) - TRACE_FIELD(int, state_data.type, type) - TRACE_FIELD(int, state_data.state, state) - ), - TP_RAW_FMT("%llx->%llx type:%u state:%u") -); - -TRACE_EVENT_FORMAT(kmem_alloc, TRACE_KMEM_ALLOC, kmemtrace_alloc_entry, ignore, - TRACE_STRUCT( - TRACE_FIELD(enum kmemtrace_type_id, type_id, type_id) - TRACE_FIELD(unsigned long, call_site, call_site) - TRACE_FIELD(const void *, ptr, ptr) - TRACE_FIELD(size_t, bytes_req, bytes_req) - TRACE_FIELD(size_t, bytes_alloc, bytes_alloc) - TRACE_FIELD(gfp_t, gfp_flags, gfp_flags) - TRACE_FIELD(int, node, node) - ), - TP_RAW_FMT("type:%u call_site:%lx ptr:%p req:%lu alloc:%lu" - " flags:%x node:%d") -); - -TRACE_EVENT_FORMAT(kmem_free, TRACE_KMEM_FREE, kmemtrace_free_entry, ignore, - TRACE_STRUCT( - TRACE_FIELD(enum kmemtrace_type_id, type_id, type_id) - TRACE_FIELD(unsigned long, call_site, call_site) - TRACE_FIELD(const void *, ptr, ptr) - ), - TP_RAW_FMT("type:%u call_site:%lx ptr:%p") -); - -#undef TRACE_SYSTEM diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 78b1ed2..7c18d15 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -21,6 +21,7 @@ #include "trace_output.h" +#undef TRACE_SYSTEM #define TRACE_SYSTEM "TRACE_SYSTEM" DEFINE_MUTEX(event_mutex); @@ -86,7 +87,7 @@ int trace_define_common_fields(struct ftrace_event_call *call) __common_field(unsigned char, flags); __common_field(unsigned char, preempt_count); __common_field(int, pid); - __common_field(int, tgid); + __common_field(int, lock_depth); return ret; } @@ -230,73 +231,38 @@ static ssize_t ftrace_event_write(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos) { - size_t read = 0; - int i, set = 1; - ssize_t ret; - char *buf; - char ch; + struct trace_parser parser; + ssize_t read, ret; - if (!cnt || cnt < 0) + if (!cnt) return 0; ret = tracing_update_buffers(); if (ret < 0) return ret; - ret = get_user(ch, ubuf++); - if (ret) - return ret; - read++; - cnt--; - - /* skip white space */ - while (cnt && isspace(ch)) { - ret = get_user(ch, ubuf++); - if (ret) - return ret; - read++; - cnt--; - } - - /* Only white space found? */ - if (isspace(ch)) { - file->f_pos += read; - ret = read; - return ret; - } - - buf = kmalloc(EVENT_BUF_SIZE+1, GFP_KERNEL); - if (!buf) + if (trace_parser_get_init(&parser, EVENT_BUF_SIZE + 1)) return -ENOMEM; - if (cnt > EVENT_BUF_SIZE) - cnt = EVENT_BUF_SIZE; + read = trace_get_user(&parser, ubuf, cnt, ppos); + + if (read >= 0 && trace_parser_loaded((&parser))) { + int set = 1; - i = 0; - while (cnt && !isspace(ch)) { - if (!i && ch == '!') + if (*parser.buffer == '!') set = 0; - else - buf[i++] = ch; - ret = get_user(ch, ubuf++); + parser.buffer[parser.idx] = 0; + + ret = ftrace_set_clr_event(parser.buffer + !set, set); if (ret) - goto out_free; - read++; - cnt--; + goto out_put; } - buf[i] = 0; - - file->f_pos += read; - - ret = ftrace_set_clr_event(buf, set); - if (ret) - goto out_free; ret = read; - out_free: - kfree(buf); + out_put: + trace_parser_put(&parser); return ret; } @@ -304,42 +270,32 @@ ftrace_event_write(struct file *file, const char __user *ubuf, static void * t_next(struct seq_file *m, void *v, loff_t *pos) { - struct list_head *list = m->private; - struct ftrace_event_call *call; + struct ftrace_event_call *call = v; (*pos)++; - for (;;) { - if (list == &ftrace_events) - return NULL; - - call = list_entry(list, struct ftrace_event_call, list); - + list_for_each_entry_continue(call, &ftrace_events, list) { /* * The ftrace subsystem is for showing formats only. * They can not be enabled or disabled via the event files. */ if (call->regfunc) - break; - - list = list->next; + return call; } - m->private = list->next; - - return call; + return NULL; } static void *t_start(struct seq_file *m, loff_t *pos) { - struct ftrace_event_call *call = NULL; + struct ftrace_event_call *call; loff_t l; mutex_lock(&event_mutex); - m->private = ftrace_events.next; + call = list_entry(&ftrace_events, struct ftrace_event_call, list); for (l = 0; l <= *pos; ) { - call = t_next(m, NULL, &l); + call = t_next(m, call, &l); if (!call) break; } @@ -349,37 +305,28 @@ static void *t_start(struct seq_file *m, loff_t *pos) static void * s_next(struct seq_file *m, void *v, loff_t *pos) { - struct list_head *list = m->private; - struct ftrace_event_call *call; + struct ftrace_event_call *call = v; (*pos)++; - retry: - if (list == &ftrace_events) - return NULL; - - call = list_entry(list, struct ftrace_event_call, list); - - if (!call->enabled) { - list = list->next; - goto retry; + list_for_each_entry_continue(call, &ftrace_events, list) { + if (call->enabled) + return call; } - m->private = list->next; - - return call; + return NULL; } static void *s_start(struct seq_file *m, loff_t *pos) { - struct ftrace_event_call *call = NULL; + struct ftrace_event_call *call; loff_t l; mutex_lock(&event_mutex); - m->private = ftrace_events.next; + call = list_entry(&ftrace_events, struct ftrace_event_call, list); for (l = 0; l <= *pos; ) { - call = s_next(m, NULL, &l); + call = s_next(m, call, &l); if (!call) break; } @@ -560,7 +507,7 @@ extern char *__bad_type_size(void); #define FIELD(type, name) \ sizeof(type) != sizeof(field.name) ? __bad_type_size() : \ #type, "common_" #name, offsetof(typeof(field), name), \ - sizeof(field.name) + sizeof(field.name), is_signed_type(type) static int trace_write_header(struct trace_seq *s) { @@ -568,17 +515,17 @@ static int trace_write_header(struct trace_seq *s) /* struct trace_entry */ return trace_seq_printf(s, - "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" - "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" - "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" - "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" - "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" - "\n", - FIELD(unsigned short, type), - FIELD(unsigned char, flags), - FIELD(unsigned char, preempt_count), - FIELD(int, pid), - FIELD(int, tgid)); + "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n" + "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n" + "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n" + "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n" + "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n" + "\n", + FIELD(unsigned short, type), + FIELD(unsigned char, flags), + FIELD(unsigned char, preempt_count), + FIELD(int, pid), + FIELD(int, lock_depth)); } static ssize_t @@ -931,9 +878,9 @@ event_subsystem_dir(const char *name, struct dentry *d_events) "'%s/filter' entry\n", name); } - entry = trace_create_file("enable", 0644, system->entry, - (void *)system->name, - &ftrace_system_enable_fops); + trace_create_file("enable", 0644, system->entry, + (void *)system->name, + &ftrace_system_enable_fops); return system->entry; } @@ -945,7 +892,6 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, const struct file_operations *filter, const struct file_operations *format) { - struct dentry *entry; int ret; /* @@ -963,12 +909,12 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, } if (call->regfunc) - entry = trace_create_file("enable", 0644, call->dir, call, - enable); + trace_create_file("enable", 0644, call->dir, call, + enable); if (call->id && call->profile_enable) - entry = trace_create_file("id", 0444, call->dir, call, - id); + trace_create_file("id", 0444, call->dir, call, + id); if (call->define_fields) { ret = call->define_fields(call); @@ -977,16 +923,16 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, " events/%s\n", call->name); return ret; } - entry = trace_create_file("filter", 0644, call->dir, call, - filter); + trace_create_file("filter", 0644, call->dir, call, + filter); } /* A trace may not want to export its format */ if (!call->show_format) return 0; - entry = trace_create_file("format", 0444, call->dir, call, - format); + trace_create_file("format", 0444, call->dir, call, + format); return 0; } @@ -1187,7 +1133,7 @@ static int trace_module_notify(struct notifier_block *self, } #endif /* CONFIG_MODULES */ -struct notifier_block trace_module_nb = { +static struct notifier_block trace_module_nb = { .notifier_call = trace_module_notify, .priority = 0, }; @@ -1359,6 +1305,18 @@ static __init void event_trace_self_tests(void) if (!call->regfunc) continue; +/* + * Testing syscall events here is pretty useless, but + * we still do it if configured. But this is time consuming. + * What we really need is a user thread to perform the + * syscalls as we test. + */ +#ifndef CONFIG_EVENT_TRACE_TEST_SYSCALLS + if (call->system && + strcmp(call->system, "syscalls") == 0) + continue; +#endif + pr_info("Testing event %s: ", call->name); /* @@ -1432,7 +1390,7 @@ static __init void event_trace_self_tests(void) #ifdef CONFIG_FUNCTION_TRACER -static DEFINE_PER_CPU(atomic_t, test_event_disable); +static DEFINE_PER_CPU(atomic_t, ftrace_test_event_disable); static void function_test_events_call(unsigned long ip, unsigned long parent_ip) @@ -1449,7 +1407,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip) pc = preempt_count(); resched = ftrace_preempt_disable(); cpu = raw_smp_processor_id(); - disabled = atomic_inc_return(&per_cpu(test_event_disable, cpu)); + disabled = atomic_inc_return(&per_cpu(ftrace_test_event_disable, cpu)); if (disabled != 1) goto out; @@ -1468,7 +1426,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip) trace_nowake_buffer_unlock_commit(buffer, event, flags, pc); out: - atomic_dec(&per_cpu(test_event_disable, cpu)); + atomic_dec(&per_cpu(ftrace_test_event_disable, cpu)); ftrace_preempt_enable(resched); } diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 93660fb..21d3475 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -18,11 +18,10 @@ * Copyright (C) 2009 Tom Zanussi <tzanussi@gmail.com> */ -#include <linux/debugfs.h> -#include <linux/uaccess.h> #include <linux/module.h> #include <linux/ctype.h> #include <linux/mutex.h> +#include <linux/perf_event.h> #include "trace.h" #include "trace_output.h" @@ -31,6 +30,7 @@ enum filter_op_ids { OP_OR, OP_AND, + OP_GLOB, OP_NE, OP_EQ, OP_LT, @@ -48,16 +48,17 @@ struct filter_op { }; static struct filter_op filter_ops[] = { - { OP_OR, "||", 1 }, - { OP_AND, "&&", 2 }, - { OP_NE, "!=", 4 }, - { OP_EQ, "==", 4 }, - { OP_LT, "<", 5 }, - { OP_LE, "<=", 5 }, - { OP_GT, ">", 5 }, - { OP_GE, ">=", 5 }, - { OP_NONE, "OP_NONE", 0 }, - { OP_OPEN_PAREN, "(", 0 }, + { OP_OR, "||", 1 }, + { OP_AND, "&&", 2 }, + { OP_GLOB, "~", 4 }, + { OP_NE, "!=", 4 }, + { OP_EQ, "==", 4 }, + { OP_LT, "<", 5 }, + { OP_LE, "<=", 5 }, + { OP_GT, ">", 5 }, + { OP_GE, ">=", 5 }, + { OP_NONE, "OP_NONE", 0 }, + { OP_OPEN_PAREN, "(", 0 }, }; enum { @@ -121,6 +122,47 @@ struct filter_parse_state { } operand; }; +#define DEFINE_COMPARISON_PRED(type) \ +static int filter_pred_##type(struct filter_pred *pred, void *event, \ + int val1, int val2) \ +{ \ + type *addr = (type *)(event + pred->offset); \ + type val = (type)pred->val; \ + int match = 0; \ + \ + switch (pred->op) { \ + case OP_LT: \ + match = (*addr < val); \ + break; \ + case OP_LE: \ + match = (*addr <= val); \ + break; \ + case OP_GT: \ + match = (*addr > val); \ + break; \ + case OP_GE: \ + match = (*addr >= val); \ + break; \ + default: \ + break; \ + } \ + \ + return match; \ +} + +#define DEFINE_EQUALITY_PRED(size) \ +static int filter_pred_##size(struct filter_pred *pred, void *event, \ + int val1, int val2) \ +{ \ + u##size *addr = (u##size *)(event + pred->offset); \ + u##size val = (u##size)pred->val; \ + int match; \ + \ + match = (val == *addr) ^ pred->not; \ + \ + return match; \ +} + DEFINE_COMPARISON_PRED(s64); DEFINE_COMPARISON_PRED(u64); DEFINE_COMPARISON_PRED(s32); @@ -156,9 +198,9 @@ static int filter_pred_string(struct filter_pred *pred, void *event, char *addr = (char *)(event + pred->offset); int cmp, match; - cmp = strncmp(addr, pred->str_val, pred->str_len); + cmp = pred->regex.match(addr, &pred->regex, pred->regex.field_len); - match = (!cmp) ^ pred->not; + match = cmp ^ pred->not; return match; } @@ -170,9 +212,9 @@ static int filter_pred_pchar(struct filter_pred *pred, void *event, char **addr = (char **)(event + pred->offset); int cmp, match; - cmp = strncmp(*addr, pred->str_val, pred->str_len); + cmp = pred->regex.match(*addr, &pred->regex, pred->regex.field_len); - match = (!cmp) ^ pred->not; + match = cmp ^ pred->not; return match; } @@ -196,9 +238,9 @@ static int filter_pred_strloc(struct filter_pred *pred, void *event, char *addr = (char *)(event + str_loc); int cmp, match; - cmp = strncmp(addr, pred->str_val, str_len); + cmp = pred->regex.match(addr, &pred->regex, str_len); - match = (!cmp) ^ pred->not; + match = cmp ^ pred->not; return match; } @@ -209,10 +251,121 @@ static int filter_pred_none(struct filter_pred *pred, void *event, return 0; } +/* Basic regex callbacks */ +static int regex_match_full(char *str, struct regex *r, int len) +{ + if (strncmp(str, r->pattern, len) == 0) + return 1; + return 0; +} + +static int regex_match_front(char *str, struct regex *r, int len) +{ + if (strncmp(str, r->pattern, len) == 0) + return 1; + return 0; +} + +static int regex_match_middle(char *str, struct regex *r, int len) +{ + if (strstr(str, r->pattern)) + return 1; + return 0; +} + +static int regex_match_end(char *str, struct regex *r, int len) +{ + char *ptr = strstr(str, r->pattern); + + if (ptr && (ptr[r->len] == 0)) + return 1; + return 0; +} + +/** + * filter_parse_regex - parse a basic regex + * @buff: the raw regex + * @len: length of the regex + * @search: will point to the beginning of the string to compare + * @not: tell whether the match will have to be inverted + * + * This passes in a buffer containing a regex and this function will + * set search to point to the search part of the buffer and + * return the type of search it is (see enum above). + * This does modify buff. + * + * Returns enum type. + * search returns the pointer to use for comparison. + * not returns 1 if buff started with a '!' + * 0 otherwise. + */ +enum regex_type filter_parse_regex(char *buff, int len, char **search, int *not) +{ + int type = MATCH_FULL; + int i; + + if (buff[0] == '!') { + *not = 1; + buff++; + len--; + } else + *not = 0; + + *search = buff; + + for (i = 0; i < len; i++) { + if (buff[i] == '*') { + if (!i) { + *search = buff + 1; + type = MATCH_END_ONLY; + } else { + if (type == MATCH_END_ONLY) + type = MATCH_MIDDLE_ONLY; + else + type = MATCH_FRONT_ONLY; + buff[i] = 0; + break; + } + } + } + + return type; +} + +static void filter_build_regex(struct filter_pred *pred) +{ + struct regex *r = &pred->regex; + char *search; + enum regex_type type = MATCH_FULL; + int not = 0; + + if (pred->op == OP_GLOB) { + type = filter_parse_regex(r->pattern, r->len, &search, ¬); + r->len = strlen(search); + memmove(r->pattern, search, r->len+1); + } + + switch (type) { + case MATCH_FULL: + r->match = regex_match_full; + break; + case MATCH_FRONT_ONLY: + r->match = regex_match_front; + break; + case MATCH_MIDDLE_ONLY: + r->match = regex_match_middle; + break; + case MATCH_END_ONLY: + r->match = regex_match_end; + break; + } + + pred->not ^= not; +} + /* return 1 if event matches, 0 otherwise (discard) */ -int filter_match_preds(struct ftrace_event_call *call, void *rec) +int filter_match_preds(struct event_filter *filter, void *rec) { - struct event_filter *filter = call->filter; int match, top = 0, val1 = 0, val2 = 0; int stack[MAX_FILTER_PRED]; struct filter_pred *pred; @@ -355,7 +508,7 @@ static void filter_clear_pred(struct filter_pred *pred) { kfree(pred->field_name); pred->field_name = NULL; - pred->str_len = 0; + pred->regex.len = 0; } static int filter_set_pred(struct filter_pred *dest, @@ -385,9 +538,8 @@ static void filter_disable_preds(struct ftrace_event_call *call) filter->preds[i]->fn = filter_pred_none; } -void destroy_preds(struct ftrace_event_call *call) +static void __free_preds(struct event_filter *filter) { - struct event_filter *filter = call->filter; int i; if (!filter) @@ -400,21 +552,24 @@ void destroy_preds(struct ftrace_event_call *call) kfree(filter->preds); kfree(filter->filter_string); kfree(filter); +} + +void destroy_preds(struct ftrace_event_call *call) +{ + __free_preds(call->filter); call->filter = NULL; + call->filter_active = 0; } -static int init_preds(struct ftrace_event_call *call) +static struct event_filter *__alloc_preds(void) { struct event_filter *filter; struct filter_pred *pred; int i; - if (call->filter) - return 0; - - filter = call->filter = kzalloc(sizeof(*filter), GFP_KERNEL); - if (!call->filter) - return -ENOMEM; + filter = kzalloc(sizeof(*filter), GFP_KERNEL); + if (!filter) + return ERR_PTR(-ENOMEM); filter->n_preds = 0; @@ -430,12 +585,24 @@ static int init_preds(struct ftrace_event_call *call) filter->preds[i] = pred; } - return 0; + return filter; oom: - destroy_preds(call); + __free_preds(filter); + return ERR_PTR(-ENOMEM); +} - return -ENOMEM; +static int init_preds(struct ftrace_event_call *call) +{ + if (call->filter) + return 0; + + call->filter_active = 0; + call->filter = __alloc_preds(); + if (IS_ERR(call->filter)) + return PTR_ERR(call->filter); + + return 0; } static int init_subsystem_preds(struct event_subsystem *system) @@ -458,14 +625,7 @@ static int init_subsystem_preds(struct event_subsystem *system) return 0; } -enum { - FILTER_DISABLE_ALL, - FILTER_INIT_NO_RESET, - FILTER_SKIP_NO_RESET, -}; - -static void filter_free_subsystem_preds(struct event_subsystem *system, - int flag) +static void filter_free_subsystem_preds(struct event_subsystem *system) { struct ftrace_event_call *call; @@ -476,14 +636,6 @@ static void filter_free_subsystem_preds(struct event_subsystem *system, if (strcmp(call->system, system->name) != 0) continue; - if (flag == FILTER_INIT_NO_RESET) { - call->filter->no_reset = false; - continue; - } - - if (flag == FILTER_SKIP_NO_RESET && call->filter->no_reset) - continue; - filter_disable_preds(call); remove_filter_string(call->filter); } @@ -491,10 +643,10 @@ static void filter_free_subsystem_preds(struct event_subsystem *system, static int filter_add_pred_fn(struct filter_parse_state *ps, struct ftrace_event_call *call, + struct event_filter *filter, struct filter_pred *pred, filter_pred_fn_t fn) { - struct event_filter *filter = call->filter; int idx, err; if (filter->n_preds == MAX_FILTER_PRED) { @@ -509,7 +661,6 @@ static int filter_add_pred_fn(struct filter_parse_state *ps, return err; filter->n_preds++; - call->filter_active = 1; return 0; } @@ -534,7 +685,10 @@ static bool is_string_field(struct ftrace_event_field *field) static int is_legal_op(struct ftrace_event_field *field, int op) { - if (is_string_field(field) && (op != OP_EQ && op != OP_NE)) + if (is_string_field(field) && + (op != OP_EQ && op != OP_NE && op != OP_GLOB)) + return 0; + if (!is_string_field(field) && op == OP_GLOB) return 0; return 1; @@ -585,6 +739,7 @@ static filter_pred_fn_t select_comparison_fn(int op, int field_size, static int filter_add_pred(struct filter_parse_state *ps, struct ftrace_event_call *call, + struct event_filter *filter, struct filter_pred *pred, bool dry_run) { @@ -619,21 +774,22 @@ static int filter_add_pred(struct filter_parse_state *ps, } if (is_string_field(field)) { - pred->str_len = field->size; + filter_build_regex(pred); - if (field->filter_type == FILTER_STATIC_STRING) + if (field->filter_type == FILTER_STATIC_STRING) { fn = filter_pred_string; - else if (field->filter_type == FILTER_DYN_STRING) + pred->regex.field_len = field->size; + } else if (field->filter_type == FILTER_DYN_STRING) fn = filter_pred_strloc; else { fn = filter_pred_pchar; - pred->str_len = strlen(pred->str_val); + pred->regex.field_len = strlen(pred->regex.pattern); } } else { if (field->is_signed) - ret = strict_strtoll(pred->str_val, 0, &val); + ret = strict_strtoll(pred->regex.pattern, 0, &val); else - ret = strict_strtoull(pred->str_val, 0, &val); + ret = strict_strtoull(pred->regex.pattern, 0, &val); if (ret) { parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0); return -EINVAL; @@ -653,45 +809,7 @@ static int filter_add_pred(struct filter_parse_state *ps, add_pred_fn: if (!dry_run) - return filter_add_pred_fn(ps, call, pred, fn); - return 0; -} - -static int filter_add_subsystem_pred(struct filter_parse_state *ps, - struct event_subsystem *system, - struct filter_pred *pred, - char *filter_string, - bool dry_run) -{ - struct ftrace_event_call *call; - int err = 0; - bool fail = true; - - list_for_each_entry(call, &ftrace_events, list) { - - if (!call->define_fields) - continue; - - if (strcmp(call->system, system->name)) - continue; - - if (call->filter->no_reset) - continue; - - err = filter_add_pred(ps, call, pred, dry_run); - if (err) - call->filter->no_reset = true; - else - fail = false; - - if (!dry_run) - replace_filter_string(call->filter, filter_string); - } - - if (fail) { - parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); - return err; - } + return filter_add_pred_fn(ps, call, filter, pred, fn); return 0; } @@ -892,8 +1010,9 @@ static void postfix_clear(struct filter_parse_state *ps) while (!list_empty(&ps->postfix)) { elt = list_first_entry(&ps->postfix, struct postfix_elt, list); - kfree(elt->operand); list_del(&elt->list); + kfree(elt->operand); + kfree(elt); } } @@ -1003,8 +1122,8 @@ static struct filter_pred *create_pred(int op, char *operand1, char *operand2) return NULL; } - strcpy(pred->str_val, operand2); - pred->str_len = strlen(operand2); + strcpy(pred->regex.pattern, operand2); + pred->regex.len = strlen(pred->regex.pattern); pred->op = op; @@ -1048,8 +1167,8 @@ static int check_preds(struct filter_parse_state *ps) return 0; } -static int replace_preds(struct event_subsystem *system, - struct ftrace_event_call *call, +static int replace_preds(struct ftrace_event_call *call, + struct event_filter *filter, struct filter_parse_state *ps, char *filter_string, bool dry_run) @@ -1096,11 +1215,7 @@ static int replace_preds(struct event_subsystem *system, add_pred: if (!pred) return -ENOMEM; - if (call) - err = filter_add_pred(ps, call, pred, false); - else - err = filter_add_subsystem_pred(ps, system, pred, - filter_string, dry_run); + err = filter_add_pred(ps, call, filter, pred, dry_run); filter_free_pred(pred); if (err) return err; @@ -1111,10 +1226,50 @@ add_pred: return 0; } -int apply_event_filter(struct ftrace_event_call *call, char *filter_string) +static int replace_system_preds(struct event_subsystem *system, + struct filter_parse_state *ps, + char *filter_string) { + struct event_filter *filter = system->filter; + struct ftrace_event_call *call; + bool fail = true; int err; + list_for_each_entry(call, &ftrace_events, list) { + + if (!call->define_fields) + continue; + + if (strcmp(call->system, system->name) != 0) + continue; + + /* try to see if the filter can be applied */ + err = replace_preds(call, filter, ps, filter_string, true); + if (err) + continue; + + /* really apply the filter */ + filter_disable_preds(call); + err = replace_preds(call, filter, ps, filter_string, false); + if (err) + filter_disable_preds(call); + else { + call->filter_active = 1; + replace_filter_string(filter, filter_string); + } + fail = false; + } + + if (fail) { + parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); + return -EINVAL; + } + return 0; +} + +int apply_event_filter(struct ftrace_event_call *call, char *filter_string) +{ + int err; struct filter_parse_state *ps; mutex_lock(&event_mutex); @@ -1126,8 +1281,7 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string) if (!strcmp(strstrip(filter_string), "0")) { filter_disable_preds(call); remove_filter_string(call->filter); - mutex_unlock(&event_mutex); - return 0; + goto out_unlock; } err = -ENOMEM; @@ -1145,10 +1299,11 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string) goto out; } - err = replace_preds(NULL, call, ps, filter_string, false); + err = replace_preds(call, call->filter, ps, filter_string, false); if (err) append_filter_err(ps, call->filter); - + else + call->filter_active = 1; out: filter_opstack_clear(ps); postfix_clear(ps); @@ -1163,7 +1318,6 @@ int apply_subsystem_event_filter(struct event_subsystem *system, char *filter_string) { int err; - struct filter_parse_state *ps; mutex_lock(&event_mutex); @@ -1173,10 +1327,9 @@ int apply_subsystem_event_filter(struct event_subsystem *system, goto out_unlock; if (!strcmp(strstrip(filter_string), "0")) { - filter_free_subsystem_preds(system, FILTER_DISABLE_ALL); + filter_free_subsystem_preds(system); remove_filter_string(system->filter); - mutex_unlock(&event_mutex); - return 0; + goto out_unlock; } err = -ENOMEM; @@ -1193,31 +1346,87 @@ int apply_subsystem_event_filter(struct event_subsystem *system, goto out; } - filter_free_subsystem_preds(system, FILTER_INIT_NO_RESET); - - /* try to see the filter can be applied to which events */ - err = replace_preds(system, NULL, ps, filter_string, true); - if (err) { + err = replace_system_preds(system, ps, filter_string); + if (err) append_filter_err(ps, system->filter); - goto out; + +out: + filter_opstack_clear(ps); + postfix_clear(ps); + kfree(ps); +out_unlock: + mutex_unlock(&event_mutex); + + return err; +} + +#ifdef CONFIG_EVENT_PROFILE + +void ftrace_profile_free_filter(struct perf_event *event) +{ + struct event_filter *filter = event->filter; + + event->filter = NULL; + __free_preds(filter); +} + +int ftrace_profile_set_filter(struct perf_event *event, int event_id, + char *filter_str) +{ + int err; + struct event_filter *filter; + struct filter_parse_state *ps; + struct ftrace_event_call *call = NULL; + + mutex_lock(&event_mutex); + + list_for_each_entry(call, &ftrace_events, list) { + if (call->id == event_id) + break; } - filter_free_subsystem_preds(system, FILTER_SKIP_NO_RESET); + err = -EINVAL; + if (!call) + goto out_unlock; + + err = -EEXIST; + if (event->filter) + goto out_unlock; - /* really apply the filter to the events */ - err = replace_preds(system, NULL, ps, filter_string, false); - if (err) { - append_filter_err(ps, system->filter); - filter_free_subsystem_preds(system, 2); + filter = __alloc_preds(); + if (IS_ERR(filter)) { + err = PTR_ERR(filter); + goto out_unlock; } -out: + err = -ENOMEM; + ps = kzalloc(sizeof(*ps), GFP_KERNEL); + if (!ps) + goto free_preds; + + parse_init(ps, filter_ops, filter_str); + err = filter_parse(ps); + if (err) + goto free_ps; + + err = replace_preds(call, filter, ps, filter_str, false); + if (!err) + event->filter = filter; + +free_ps: filter_opstack_clear(ps); postfix_clear(ps); kfree(ps); + +free_preds: + if (err) + __free_preds(filter); + out_unlock: mutex_unlock(&event_mutex); return err; } +#endif /* CONFIG_EVENT_PROFILE */ + diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index df1bf6e..31da218 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -15,146 +15,128 @@ #include "trace_output.h" +#undef TRACE_SYSTEM +#define TRACE_SYSTEM ftrace -#undef TRACE_STRUCT -#define TRACE_STRUCT(args...) args +/* not needed for this file */ +#undef __field_struct +#define __field_struct(type, item) -extern void __bad_type_size(void); +#undef __field +#define __field(type, item) type item; -#undef TRACE_FIELD -#define TRACE_FIELD(type, item, assign) \ - if (sizeof(type) != sizeof(field.item)) \ - __bad_type_size(); \ +#undef __field_desc +#define __field_desc(type, container, item) type item; + +#undef __array +#define __array(type, item, size) type item[size]; + +#undef __array_desc +#define __array_desc(type, container, item, size) type item[size]; + +#undef __dynamic_array +#define __dynamic_array(type, item) type item[]; + +#undef F_STRUCT +#define F_STRUCT(args...) args + +#undef F_printk +#define F_printk(fmt, args...) fmt, args + +#undef FTRACE_ENTRY +#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ +struct ____ftrace_##name { \ + tstruct \ +}; \ +static void __used ____ftrace_check_##name(void) \ +{ \ + struct ____ftrace_##name *__entry = NULL; \ + \ + /* force cmpile-time check on F_printk() */ \ + printk(print); \ +} + +#undef FTRACE_ENTRY_DUP +#define FTRACE_ENTRY_DUP(name, struct_name, id, tstruct, print) \ + FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print)) + +#include "trace_entries.h" + + +#undef __field +#define __field(type, item) \ ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ - "offset:%u;\tsize:%u;\n", \ - (unsigned int)offsetof(typeof(field), item), \ - (unsigned int)sizeof(field.item)); \ + "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \ + offsetof(typeof(field), item), \ + sizeof(field.item), is_signed_type(type)); \ if (!ret) \ return 0; +#undef __field_desc +#define __field_desc(type, container, item) \ + ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ + "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \ + offsetof(typeof(field), container.item), \ + sizeof(field.container.item), \ + is_signed_type(type)); \ + if (!ret) \ + return 0; -#undef TRACE_FIELD_SPECIAL -#define TRACE_FIELD_SPECIAL(type_item, item, len, cmd) \ - ret = trace_seq_printf(s, "\tfield special:" #type_item ";\t" \ - "offset:%u;\tsize:%u;\n", \ - (unsigned int)offsetof(typeof(field), item), \ - (unsigned int)sizeof(field.item)); \ +#undef __array +#define __array(type, item, len) \ + ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \ + "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \ + offsetof(typeof(field), item), \ + sizeof(field.item), is_signed_type(type)); \ if (!ret) \ return 0; -#undef TRACE_FIELD_ZERO_CHAR -#define TRACE_FIELD_ZERO_CHAR(item) \ - ret = trace_seq_printf(s, "\tfield:char " #item ";\t" \ - "offset:%u;\tsize:0;\n", \ - (unsigned int)offsetof(typeof(field), item)); \ +#undef __array_desc +#define __array_desc(type, container, item, len) \ + ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \ + "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \ + offsetof(typeof(field), container.item), \ + sizeof(field.container.item), \ + is_signed_type(type)); \ if (!ret) \ return 0; -#undef TRACE_FIELD_SIGN -#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \ - TRACE_FIELD(type, item, assign) +#undef __dynamic_array +#define __dynamic_array(type, item) \ + ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ + "offset:%zu;\tsize:0;\tsigned:%u;\n", \ + offsetof(typeof(field), item), \ + is_signed_type(type)); \ + if (!ret) \ + return 0; -#undef TP_RAW_FMT -#define TP_RAW_FMT(args...) args +#undef F_printk +#define F_printk(fmt, args...) "%s, %s\n", #fmt, __stringify(args) -#undef TRACE_EVENT_FORMAT -#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ -static int \ -ftrace_format_##call(struct ftrace_event_call *unused, \ - struct trace_seq *s) \ -{ \ - struct args field; \ - int ret; \ - \ - tstruct; \ - \ - trace_seq_printf(s, "\nprint fmt: \"%s\"\n", tpfmt); \ - \ - return ret; \ -} +#undef __entry +#define __entry REC -#undef TRACE_EVENT_FORMAT_NOFILTER -#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \ - tpfmt) \ +#undef FTRACE_ENTRY +#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ static int \ -ftrace_format_##call(struct ftrace_event_call *unused, \ - struct trace_seq *s) \ +ftrace_format_##name(struct ftrace_event_call *unused, \ + struct trace_seq *s) \ { \ - struct args field; \ - int ret; \ + struct struct_name field __attribute__((unused)); \ + int ret = 0; \ \ tstruct; \ \ - trace_seq_printf(s, "\nprint fmt: \"%s\"\n", tpfmt); \ + trace_seq_printf(s, "\nprint fmt: " print); \ \ return ret; \ } -#include "trace_event_types.h" - -#undef TRACE_ZERO_CHAR -#define TRACE_ZERO_CHAR(arg) - -#undef TRACE_FIELD -#define TRACE_FIELD(type, item, assign)\ - entry->item = assign; - -#undef TRACE_FIELD -#define TRACE_FIELD(type, item, assign)\ - entry->item = assign; - -#undef TRACE_FIELD_SIGN -#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \ - TRACE_FIELD(type, item, assign) - -#undef TP_CMD -#define TP_CMD(cmd...) cmd - -#undef TRACE_ENTRY -#define TRACE_ENTRY entry - -#undef TRACE_FIELD_SPECIAL -#define TRACE_FIELD_SPECIAL(type_item, item, len, cmd) \ - cmd; - -#undef TRACE_EVENT_FORMAT -#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ -int ftrace_define_fields_##call(struct ftrace_event_call *event_call); \ -static int ftrace_raw_init_event_##call(void); \ - \ -struct ftrace_event_call __used \ -__attribute__((__aligned__(4))) \ -__attribute__((section("_ftrace_events"))) event_##call = { \ - .name = #call, \ - .id = proto, \ - .system = __stringify(TRACE_SYSTEM), \ - .raw_init = ftrace_raw_init_event_##call, \ - .show_format = ftrace_format_##call, \ - .define_fields = ftrace_define_fields_##call, \ -}; \ -static int ftrace_raw_init_event_##call(void) \ -{ \ - INIT_LIST_HEAD(&event_##call.fields); \ - return 0; \ -} \ - -#undef TRACE_EVENT_FORMAT_NOFILTER -#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \ - tpfmt) \ - \ -struct ftrace_event_call __used \ -__attribute__((__aligned__(4))) \ -__attribute__((section("_ftrace_events"))) event_##call = { \ - .name = #call, \ - .id = proto, \ - .system = __stringify(TRACE_SYSTEM), \ - .show_format = ftrace_format_##call, \ -}; +#include "trace_entries.h" -#include "trace_event_types.h" -#undef TRACE_FIELD -#define TRACE_FIELD(type, item, assign) \ +#undef __field +#define __field(type, item) \ ret = trace_define_field(event_call, #type, #item, \ offsetof(typeof(field), item), \ sizeof(field.item), \ @@ -162,32 +144,45 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ if (ret) \ return ret; -#undef TRACE_FIELD_SPECIAL -#define TRACE_FIELD_SPECIAL(type, item, len, cmd) \ +#undef __field_desc +#define __field_desc(type, container, item) \ + ret = trace_define_field(event_call, #type, #item, \ + offsetof(typeof(field), \ + container.item), \ + sizeof(field.container.item), \ + is_signed_type(type), FILTER_OTHER); \ + if (ret) \ + return ret; + +#undef __array +#define __array(type, item, len) \ + BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ ret = trace_define_field(event_call, #type "[" #len "]", #item, \ offsetof(typeof(field), item), \ sizeof(field.item), 0, FILTER_OTHER); \ if (ret) \ return ret; -#undef TRACE_FIELD_SIGN -#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \ - ret = trace_define_field(event_call, #type, #item, \ - offsetof(typeof(field), item), \ - sizeof(field.item), is_signed, \ +#undef __array_desc +#define __array_desc(type, container, item, len) \ + BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ + ret = trace_define_field(event_call, #type "[" #len "]", #item, \ + offsetof(typeof(field), \ + container.item), \ + sizeof(field.container.item), 0, \ FILTER_OTHER); \ if (ret) \ return ret; -#undef TRACE_FIELD_ZERO_CHAR -#define TRACE_FIELD_ZERO_CHAR(item) +#undef __dynamic_array +#define __dynamic_array(type, item) -#undef TRACE_EVENT_FORMAT -#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ +#undef FTRACE_ENTRY +#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ int \ -ftrace_define_fields_##call(struct ftrace_event_call *event_call) \ +ftrace_define_fields_##name(struct ftrace_event_call *event_call) \ { \ - struct args field; \ + struct struct_name field; \ int ret; \ \ ret = trace_define_common_fields(event_call); \ @@ -199,8 +194,42 @@ ftrace_define_fields_##call(struct ftrace_event_call *event_call) \ return ret; \ } -#undef TRACE_EVENT_FORMAT_NOFILTER -#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \ - tpfmt) +#include "trace_entries.h" + + +#undef __field +#define __field(type, item) + +#undef __field_desc +#define __field_desc(type, container, item) + +#undef __array +#define __array(type, item, len) + +#undef __array_desc +#define __array_desc(type, container, item, len) + +#undef __dynamic_array +#define __dynamic_array(type, item) + +#undef FTRACE_ENTRY +#define FTRACE_ENTRY(call, struct_name, type, tstruct, print) \ +static int ftrace_raw_init_event_##call(void); \ + \ +struct ftrace_event_call __used \ +__attribute__((__aligned__(4))) \ +__attribute__((section("_ftrace_events"))) event_##call = { \ + .name = #call, \ + .id = type, \ + .system = __stringify(TRACE_SYSTEM), \ + .raw_init = ftrace_raw_init_event_##call, \ + .show_format = ftrace_format_##call, \ + .define_fields = ftrace_define_fields_##call, \ +}; \ +static int ftrace_raw_init_event_##call(void) \ +{ \ + INIT_LIST_HEAD(&event_##call.fields); \ + return 0; \ +} \ -#include "trace_event_types.h" +#include "trace_entries.h" diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 5b01b94..b3f3776 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -290,7 +290,7 @@ ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip, { long count = (long)data; - seq_printf(m, "%pf:", (void *)ip); + seq_printf(m, "%ps:", (void *)ip); if (ops == &traceon_probe_ops) seq_printf(m, "traceon"); diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index b3749a2..45e6c01 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -124,7 +124,7 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret, if (unlikely(current->ret_stack[index].fp != frame_pointer)) { ftrace_graph_stop(); WARN(1, "Bad frame pointer: expected %lx, received %lx\n" - " from func %pF return to %lx\n", + " from func %ps return to %lx\n", current->ret_stack[index].fp, frame_pointer, (void *)current->ret_stack[index].func, @@ -364,6 +364,15 @@ print_graph_proc(struct trace_seq *s, pid_t pid) } +static enum print_line_t +print_graph_lat_fmt(struct trace_seq *s, struct trace_entry *entry) +{ + if (!trace_seq_putc(s, ' ')) + return 0; + + return trace_print_lat_fmt(s, entry); +} + /* If the pid changed since the last trace, output this event */ static enum print_line_t verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data) @@ -521,6 +530,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr, if (ret == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; } + /* Proc */ if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) { ret = print_graph_proc(s, pid); @@ -659,7 +669,7 @@ print_graph_entry_leaf(struct trace_iterator *iter, return TRACE_TYPE_PARTIAL_LINE; } - ret = trace_seq_printf(s, "%pf();\n", (void *)call->func); + ret = trace_seq_printf(s, "%ps();\n", (void *)call->func); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -702,7 +712,7 @@ print_graph_entry_nested(struct trace_iterator *iter, return TRACE_TYPE_PARTIAL_LINE; } - ret = trace_seq_printf(s, "%pf() {\n", (void *)call->func); + ret = trace_seq_printf(s, "%ps() {\n", (void *)call->func); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -758,6 +768,13 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, return TRACE_TYPE_PARTIAL_LINE; } + /* Latency format */ + if (trace_flags & TRACE_ITER_LATENCY_FMT) { + ret = print_graph_lat_fmt(s, ent); + if (ret == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + } + return 0; } @@ -952,28 +969,59 @@ print_graph_function(struct trace_iterator *iter) return TRACE_TYPE_HANDLED; } +static void print_lat_header(struct seq_file *s) +{ + static const char spaces[] = " " /* 16 spaces */ + " " /* 4 spaces */ + " "; /* 17 spaces */ + int size = 0; + + if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) + size += 16; + if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) + size += 4; + if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) + size += 17; + + seq_printf(s, "#%.*s _-----=> irqs-off \n", size, spaces); + seq_printf(s, "#%.*s / _----=> need-resched \n", size, spaces); + seq_printf(s, "#%.*s| / _---=> hardirq/softirq \n", size, spaces); + seq_printf(s, "#%.*s|| / _--=> preempt-depth \n", size, spaces); + seq_printf(s, "#%.*s||| / _-=> lock-depth \n", size, spaces); + seq_printf(s, "#%.*s|||| / \n", size, spaces); +} + static void print_graph_headers(struct seq_file *s) { + int lat = trace_flags & TRACE_ITER_LATENCY_FMT; + + if (lat) + print_lat_header(s); + /* 1st line */ - seq_printf(s, "# "); + seq_printf(s, "#"); if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) seq_printf(s, " TIME "); if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) - seq_printf(s, "CPU"); + seq_printf(s, " CPU"); if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) - seq_printf(s, " TASK/PID "); + seq_printf(s, " TASK/PID "); + if (lat) + seq_printf(s, "|||||"); if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) seq_printf(s, " DURATION "); seq_printf(s, " FUNCTION CALLS\n"); /* 2nd line */ - seq_printf(s, "# "); + seq_printf(s, "#"); if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) seq_printf(s, " | "); if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) - seq_printf(s, "| "); + seq_printf(s, " | "); if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) - seq_printf(s, " | | "); + seq_printf(s, " | | "); + if (lat) + seq_printf(s, "|||||"); if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) seq_printf(s, " | | "); seq_printf(s, " | | | |\n"); diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c index ca7d7c4..69543a9 100644 --- a/kernel/trace/trace_hw_branches.c +++ b/kernel/trace/trace_hw_branches.c @@ -155,7 +155,7 @@ static enum print_line_t bts_trace_print_line(struct trace_iterator *iter) seq_print_ip_sym(seq, it->from, symflags) && trace_seq_printf(seq, "\n")) return TRACE_TYPE_HANDLED; - return TRACE_TYPE_PARTIAL_LINE;; + return TRACE_TYPE_PARTIAL_LINE; } return TRACE_TYPE_UNHANDLED; } @@ -165,6 +165,7 @@ void trace_hw_branch(u64 from, u64 to) struct ftrace_event_call *call = &event_hw_branch; struct trace_array *tr = hw_branch_trace; struct ring_buffer_event *event; + struct ring_buffer *buf; struct hw_branch_entry *entry; unsigned long irq1; int cpu; @@ -180,7 +181,8 @@ void trace_hw_branch(u64 from, u64 to) if (atomic_inc_return(&tr->data[cpu]->disabled) != 1) goto out; - event = trace_buffer_lock_reserve(tr, TRACE_HW_BRANCHES, + buf = tr->buffer; + event = trace_buffer_lock_reserve(buf, TRACE_HW_BRANCHES, sizeof(*entry), 0, 0); if (!event) goto out; @@ -189,8 +191,8 @@ void trace_hw_branch(u64 from, u64 to) entry->ent.type = TRACE_HW_BRANCHES; entry->from = from; entry->to = to; - if (!filter_check_discard(call, entry, tr->buffer, event)) - trace_buffer_unlock_commit(tr, event, 0, 0); + if (!filter_check_discard(call, entry, buf, event)) + trace_buffer_unlock_commit(buf, event, 0, 0); out: atomic_dec(&tr->data[cpu]->disabled); diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 5555b75..3aa7eaa 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -129,15 +129,10 @@ check_critical_timing(struct trace_array *tr, unsigned long parent_ip, int cpu) { - unsigned long latency, t0, t1; cycle_t T0, T1, delta; unsigned long flags; int pc; - /* - * usecs conversion is slow so we try to delay the conversion - * as long as possible: - */ T0 = data->preempt_timestamp; T1 = ftrace_now(cpu); delta = T1-T0; @@ -157,18 +152,15 @@ check_critical_timing(struct trace_array *tr, trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); - latency = nsecs_to_usecs(delta); - if (data->critical_sequence != max_sequence) goto out_unlock; - tracing_max_latency = delta; - t0 = nsecs_to_usecs(T0); - t1 = nsecs_to_usecs(T1); - data->critical_end = parent_ip; - update_max_tr_single(tr, current, cpu); + if (likely(!is_tracing_stopped())) { + tracing_max_latency = delta; + update_max_tr_single(tr, current, cpu); + } max_sequence++; diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index c4c9bbd..0acd834 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -307,6 +307,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr, struct trace_array_cpu *data, struct mmiotrace_rw *rw) { + struct ftrace_event_call *call = &event_mmiotrace_rw; struct ring_buffer *buffer = tr->buffer; struct ring_buffer_event *event; struct trace_mmiotrace_rw *entry; @@ -320,7 +321,9 @@ static void __trace_mmiotrace_rw(struct trace_array *tr, } entry = ring_buffer_event_data(event); entry->rw = *rw; - trace_buffer_unlock_commit(buffer, event, 0, pc); + + if (!filter_check_discard(call, entry, buffer, event)) + trace_buffer_unlock_commit(buffer, event, 0, pc); } void mmio_trace_rw(struct mmiotrace_rw *rw) @@ -334,6 +337,7 @@ static void __trace_mmiotrace_map(struct trace_array *tr, struct trace_array_cpu *data, struct mmiotrace_map *map) { + struct ftrace_event_call *call = &event_mmiotrace_map; struct ring_buffer *buffer = tr->buffer; struct ring_buffer_event *event; struct trace_mmiotrace_map *entry; @@ -347,7 +351,9 @@ static void __trace_mmiotrace_map(struct trace_array *tr, } entry = ring_buffer_event_data(event); entry->map = *map; - trace_buffer_unlock_commit(buffer, event, 0, pc); + + if (!filter_check_discard(call, entry, buffer, event)) + trace_buffer_unlock_commit(buffer, event, 0, pc); } void mmio_trace_mapping(struct mmiotrace_map *map) diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index e0c2545..ed17565 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -407,7 +407,7 @@ seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s, * since individual threads might have already quit! */ rcu_read_lock(); - task = find_task_by_vpid(entry->ent.tgid); + task = find_task_by_vpid(entry->tgid); if (task) mm = get_task_mm(task); rcu_read_unlock(); @@ -460,18 +460,23 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) return ret; } -static int -lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu) +/** + * trace_print_lat_fmt - print the irq, preempt and lockdep fields + * @s: trace seq struct to write to + * @entry: The trace entry field from the ring buffer + * + * Prints the generic fields of irqs off, in hard or softirq, preempt + * count and lock depth. + */ +int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) { int hardirq, softirq; - char comm[TASK_COMM_LEN]; + int ret; - trace_find_cmdline(entry->pid, comm); hardirq = entry->flags & TRACE_FLAG_HARDIRQ; softirq = entry->flags & TRACE_FLAG_SOFTIRQ; - if (!trace_seq_printf(s, "%8.8s-%-5d %3d%c%c%c", - comm, entry->pid, cpu, + if (!trace_seq_printf(s, "%c%c%c", (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' : '.', @@ -482,8 +487,31 @@ lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu) return 0; if (entry->preempt_count) - return trace_seq_printf(s, "%x", entry->preempt_count); - return trace_seq_puts(s, "."); + ret = trace_seq_printf(s, "%x", entry->preempt_count); + else + ret = trace_seq_putc(s, '.'); + + if (!ret) + return 0; + + if (entry->lock_depth < 0) + return trace_seq_putc(s, '.'); + + return trace_seq_printf(s, "%d", entry->lock_depth); +} + +static int +lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu) +{ + char comm[TASK_COMM_LEN]; + + trace_find_cmdline(entry->pid, comm); + + if (!trace_seq_printf(s, "%8.8s-%-5d %3d", + comm, entry->pid, cpu)) + return 0; + + return trace_print_lat_fmt(s, entry); } static unsigned long preempt_mark_thresh = 100; @@ -857,7 +885,7 @@ static int trace_ctxwake_raw(struct trace_iterator *iter, char S) trace_assign_type(field, iter->ent); if (!S) - task_state_char(field->prev_state); + S = task_state_char(field->prev_state); T = task_state_char(field->next_state); if (!trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n", field->prev_pid, @@ -892,7 +920,7 @@ static int trace_ctxwake_hex(struct trace_iterator *iter, char S) trace_assign_type(field, iter->ent); if (!S) - task_state_char(field->prev_state); + S = task_state_char(field->prev_state); T = task_state_char(field->next_state); SEQ_PUT_HEX_FIELD_RET(s, field->prev_pid); diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index d38bec4..9d91c72 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -26,6 +26,8 @@ extern struct trace_event *ftrace_find_event(int type); extern enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags); +extern int +trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry); /* used by module unregistering */ extern int __unregister_ftrace_event(struct trace_event *event); diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c deleted file mode 100644 index fe1a00f..0000000 --- a/kernel/trace/trace_power.c +++ /dev/null @@ -1,218 +0,0 @@ -/* - * ring buffer based C-state tracer - * - * Arjan van de Ven <arjan@linux.intel.com> - * Copyright (C) 2008 Intel Corporation - * - * Much is borrowed from trace_boot.c which is - * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com> - * - */ - -#include <linux/init.h> -#include <linux/debugfs.h> -#include <trace/power.h> -#include <linux/kallsyms.h> -#include <linux/module.h> - -#include "trace.h" -#include "trace_output.h" - -static struct trace_array *power_trace; -static int __read_mostly trace_power_enabled; - -static void probe_power_start(struct power_trace *it, unsigned int type, - unsigned int level) -{ - if (!trace_power_enabled) - return; - - memset(it, 0, sizeof(struct power_trace)); - it->state = level; - it->type = type; - it->stamp = ktime_get(); -} - - -static void probe_power_end(struct power_trace *it) -{ - struct ftrace_event_call *call = &event_power; - struct ring_buffer_event *event; - struct ring_buffer *buffer; - struct trace_power *entry; - struct trace_array_cpu *data; - struct trace_array *tr = power_trace; - - if (!trace_power_enabled) - return; - - buffer = tr->buffer; - - preempt_disable(); - it->end = ktime_get(); - data = tr->data[smp_processor_id()]; - - event = trace_buffer_lock_reserve(buffer, TRACE_POWER, - sizeof(*entry), 0, 0); - if (!event) - goto out; - entry = ring_buffer_event_data(event); - entry->state_data = *it; - if (!filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit(buffer, event, 0, 0); - out: - preempt_enable(); -} - -static void probe_power_mark(struct power_trace *it, unsigned int type, - unsigned int level) -{ - struct ftrace_event_call *call = &event_power; - struct ring_buffer_event *event; - struct ring_buffer *buffer; - struct trace_power *entry; - struct trace_array_cpu *data; - struct trace_array *tr = power_trace; - - if (!trace_power_enabled) - return; - - buffer = tr->buffer; - - memset(it, 0, sizeof(struct power_trace)); - it->state = level; - it->type = type; - it->stamp = ktime_get(); - preempt_disable(); - it->end = it->stamp; - data = tr->data[smp_processor_id()]; - - event = trace_buffer_lock_reserve(buffer, TRACE_POWER, - sizeof(*entry), 0, 0); - if (!event) - goto out; - entry = ring_buffer_event_data(event); - entry->state_data = *it; - if (!filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit(buffer, event, 0, 0); - out: - preempt_enable(); -} - -static int tracing_power_register(void) -{ - int ret; - - ret = register_trace_power_start(probe_power_start); - if (ret) { - pr_info("power trace: Couldn't activate tracepoint" - " probe to trace_power_start\n"); - return ret; - } - ret = register_trace_power_end(probe_power_end); - if (ret) { - pr_info("power trace: Couldn't activate tracepoint" - " probe to trace_power_end\n"); - goto fail_start; - } - ret = register_trace_power_mark(probe_power_mark); - if (ret) { - pr_info("power trace: Couldn't activate tracepoint" - " probe to trace_power_mark\n"); - goto fail_end; - } - return ret; -fail_end: - unregister_trace_power_end(probe_power_end); -fail_start: - unregister_trace_power_start(probe_power_start); - return ret; -} - -static void start_power_trace(struct trace_array *tr) -{ - trace_power_enabled = 1; -} - -static void stop_power_trace(struct trace_array *tr) -{ - trace_power_enabled = 0; -} - -static void power_trace_reset(struct trace_array *tr) -{ - trace_power_enabled = 0; - unregister_trace_power_start(probe_power_start); - unregister_trace_power_end(probe_power_end); - unregister_trace_power_mark(probe_power_mark); -} - - -static int power_trace_init(struct trace_array *tr) -{ - power_trace = tr; - - trace_power_enabled = 1; - tracing_power_register(); - - tracing_reset_online_cpus(tr); - return 0; -} - -static enum print_line_t power_print_line(struct trace_iterator *iter) -{ - int ret = 0; - struct trace_entry *entry = iter->ent; - struct trace_power *field ; - struct power_trace *it; - struct trace_seq *s = &iter->seq; - struct timespec stamp; - struct timespec duration; - - trace_assign_type(field, entry); - it = &field->state_data; - stamp = ktime_to_timespec(it->stamp); - duration = ktime_to_timespec(ktime_sub(it->end, it->stamp)); - - if (entry->type == TRACE_POWER) { - if (it->type == POWER_CSTATE) - ret = trace_seq_printf(s, "[%5ld.%09ld] CSTATE: Going to C%i on cpu %i for %ld.%09ld\n", - stamp.tv_sec, - stamp.tv_nsec, - it->state, iter->cpu, - duration.tv_sec, - duration.tv_nsec); - if (it->type == POWER_PSTATE) - ret = trace_seq_printf(s, "[%5ld.%09ld] PSTATE: Going to P%i on cpu %i\n", - stamp.tv_sec, - stamp.tv_nsec, - it->state, iter->cpu); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - return TRACE_TYPE_HANDLED; - } - return TRACE_TYPE_UNHANDLED; -} - -static void power_print_header(struct seq_file *s) -{ - seq_puts(s, "# TIMESTAMP STATE EVENT\n"); - seq_puts(s, "# | | |\n"); -} - -static struct tracer power_tracer __read_mostly = -{ - .name = "power", - .init = power_trace_init, - .start = start_power_trace, - .stop = stop_power_trace, - .reset = power_trace_reset, - .print_line = power_print_line, - .print_header = power_print_header, -}; - -static int init_power_trace(void) -{ - return register_tracer(&power_tracer); -} -device_initcall(init_power_trace); diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index 687699d..2547d88 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -11,7 +11,6 @@ #include <linux/ftrace.h> #include <linux/string.h> #include <linux/module.h> -#include <linux/marker.h> #include <linux/mutex.h> #include <linux/ctype.h> #include <linux/list.h> diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index ad69f10..26185d7 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -24,6 +24,7 @@ static int __read_mostly tracer_enabled; static struct task_struct *wakeup_task; static int wakeup_cpu; +static int wakeup_current_cpu; static unsigned wakeup_prio = -1; static int wakeup_rt; @@ -56,33 +57,23 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) resched = ftrace_preempt_disable(); cpu = raw_smp_processor_id(); + if (cpu != wakeup_current_cpu) + goto out_enable; + data = tr->data[cpu]; disabled = atomic_inc_return(&data->disabled); if (unlikely(disabled != 1)) goto out; local_irq_save(flags); - __raw_spin_lock(&wakeup_lock); - - if (unlikely(!wakeup_task)) - goto unlock; - - /* - * The task can't disappear because it needs to - * wake up first, and we have the wakeup_lock. - */ - if (task_cpu(wakeup_task) != cpu) - goto unlock; trace_function(tr, ip, parent_ip, flags, pc); - unlock: - __raw_spin_unlock(&wakeup_lock); local_irq_restore(flags); out: atomic_dec(&data->disabled); - + out_enable: ftrace_preempt_enable(resched); } @@ -107,11 +98,18 @@ static int report_latency(cycle_t delta) return 1; } +static void probe_wakeup_migrate_task(struct task_struct *task, int cpu) +{ + if (task != wakeup_task) + return; + + wakeup_current_cpu = cpu; +} + static void notrace probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) { - unsigned long latency = 0, t0 = 0, t1 = 0; struct trace_array_cpu *data; cycle_t T0, T1, delta; unsigned long flags; @@ -157,10 +155,6 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev, trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc); tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc); - /* - * usecs conversion is slow so we try to delay the conversion - * as long as possible: - */ T0 = data->preempt_timestamp; T1 = ftrace_now(cpu); delta = T1-T0; @@ -168,13 +162,10 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev, if (!report_latency(delta)) goto out_unlock; - latency = nsecs_to_usecs(delta); - - tracing_max_latency = delta; - t0 = nsecs_to_usecs(T0); - t1 = nsecs_to_usecs(T1); - - update_max_tr(wakeup_trace, wakeup_task, wakeup_cpu); + if (likely(!is_tracing_stopped())) { + tracing_max_latency = delta; + update_max_tr(wakeup_trace, wakeup_task, wakeup_cpu); + } out_unlock: __wakeup_reset(wakeup_trace); @@ -244,6 +235,7 @@ probe_wakeup(struct rq *rq, struct task_struct *p, int success) __wakeup_reset(wakeup_trace); wakeup_cpu = task_cpu(p); + wakeup_current_cpu = wakeup_cpu; wakeup_prio = p->prio; wakeup_task = p; @@ -293,6 +285,13 @@ static void start_wakeup_tracer(struct trace_array *tr) goto fail_deprobe_wake_new; } + ret = register_trace_sched_migrate_task(probe_wakeup_migrate_task); + if (ret) { + pr_info("wakeup trace: Couldn't activate tracepoint" + " probe to kernel_sched_migrate_task\n"); + return; + } + wakeup_reset(tr); /* @@ -325,6 +324,7 @@ static void stop_wakeup_tracer(struct trace_array *tr) unregister_trace_sched_switch(probe_wakeup_sched_switch); unregister_trace_sched_wakeup_new(probe_wakeup); unregister_trace_sched_wakeup(probe_wakeup); + unregister_trace_sched_migrate_task(probe_wakeup_migrate_task); } static int __wakeup_tracer_init(struct trace_array *tr) diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 0f6facb..8504ac7 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -296,14 +296,14 @@ static const struct file_operations stack_trace_fops = { int stack_trace_sysctl(struct ctl_table *table, int write, - struct file *file, void __user *buffer, size_t *lenp, + void __user *buffer, size_t *lenp, loff_t *ppos) { int ret; mutex_lock(&stack_sysctl_mutex); - ret = proc_dointvec(table, write, file, buffer, lenp, ppos); + ret = proc_dointvec(table, write, buffer, lenp, ppos); if (ret || !write || (last_stack_tracer_enabled == !!stack_tracer_enabled)) diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 8712ce3..d00d1a8 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -2,7 +2,7 @@ #include <trace/events/syscalls.h> #include <linux/kernel.h> #include <linux/ftrace.h> -#include <linux/perf_counter.h> +#include <linux/perf_event.h> #include <asm/syscall.h> #include "trace_output.h" @@ -14,6 +14,69 @@ static int sys_refcount_exit; static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls); static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls); +extern unsigned long __start_syscalls_metadata[]; +extern unsigned long __stop_syscalls_metadata[]; + +static struct syscall_metadata **syscalls_metadata; + +static struct syscall_metadata *find_syscall_meta(unsigned long syscall) +{ + struct syscall_metadata *start; + struct syscall_metadata *stop; + char str[KSYM_SYMBOL_LEN]; + + + start = (struct syscall_metadata *)__start_syscalls_metadata; + stop = (struct syscall_metadata *)__stop_syscalls_metadata; + kallsyms_lookup(syscall, NULL, NULL, NULL, str); + + for ( ; start < stop; start++) { + /* + * Only compare after the "sys" prefix. Archs that use + * syscall wrappers may have syscalls symbols aliases prefixed + * with "SyS" instead of "sys", leading to an unwanted + * mismatch. + */ + if (start->name && !strcmp(start->name + 3, str + 3)) + return start; + } + return NULL; +} + +static struct syscall_metadata *syscall_nr_to_meta(int nr) +{ + if (!syscalls_metadata || nr >= NR_syscalls || nr < 0) + return NULL; + + return syscalls_metadata[nr]; +} + +int syscall_name_to_nr(char *name) +{ + int i; + + if (!syscalls_metadata) + return -1; + + for (i = 0; i < NR_syscalls; i++) { + if (syscalls_metadata[i]) { + if (!strcmp(syscalls_metadata[i]->name, name)) + return i; + } + } + return -1; +} + +void set_syscall_enter_id(int num, int id) +{ + syscalls_metadata[num]->enter_id = id; +} + +void set_syscall_exit_id(int num, int id) +{ + syscalls_metadata[num]->exit_id = id; +} + enum print_line_t print_syscall_enter(struct trace_iterator *iter, int flags) { @@ -103,7 +166,8 @@ extern char *__bad_type_size(void); #define SYSCALL_FIELD(type, name) \ sizeof(type) != sizeof(trace.name) ? \ __bad_type_size() : \ - #type, #name, offsetof(typeof(trace), name), sizeof(trace.name) + #type, #name, offsetof(typeof(trace), name), \ + sizeof(trace.name), is_signed_type(type) int syscall_enter_format(struct ftrace_event_call *call, struct trace_seq *s) { @@ -120,7 +184,8 @@ int syscall_enter_format(struct ftrace_event_call *call, struct trace_seq *s) if (!entry) return 0; - ret = trace_seq_printf(s, "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n", + ret = trace_seq_printf(s, "\tfield:%s %s;\toffset:%zu;\tsize:%zu;" + "\tsigned:%u;\n", SYSCALL_FIELD(int, nr)); if (!ret) return 0; @@ -130,8 +195,10 @@ int syscall_enter_format(struct ftrace_event_call *call, struct trace_seq *s) entry->args[i]); if (!ret) return 0; - ret = trace_seq_printf(s, "\toffset:%d;\tsize:%zu;\n", offset, - sizeof(unsigned long)); + ret = trace_seq_printf(s, "\toffset:%d;\tsize:%zu;" + "\tsigned:%u;\n", offset, + sizeof(unsigned long), + is_signed_type(unsigned long)); if (!ret) return 0; offset += sizeof(unsigned long); @@ -163,10 +230,12 @@ int syscall_exit_format(struct ftrace_event_call *call, struct trace_seq *s) struct syscall_trace_exit trace; ret = trace_seq_printf(s, - "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" - "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n", + "\tfield:%s %s;\toffset:%zu;\tsize:%zu;" + "\tsigned:%u;\n" + "\tfield:%s %s;\toffset:%zu;\tsize:%zu;" + "\tsigned:%u;\n", SYSCALL_FIELD(int, nr), - SYSCALL_FIELD(unsigned long, ret)); + SYSCALL_FIELD(long, ret)); if (!ret) return 0; @@ -212,7 +281,7 @@ int syscall_exit_define_fields(struct ftrace_event_call *call) if (ret) return ret; - ret = trace_define_field(call, SYSCALL_FIELD(unsigned long, ret), 0, + ret = trace_define_field(call, SYSCALL_FIELD(long, ret), FILTER_OTHER); return ret; @@ -375,6 +444,29 @@ struct trace_event event_syscall_exit = { .trace = print_syscall_exit, }; +int __init init_ftrace_syscalls(void) +{ + struct syscall_metadata *meta; + unsigned long addr; + int i; + + syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) * + NR_syscalls, GFP_KERNEL); + if (!syscalls_metadata) { + WARN_ON(1); + return -ENOMEM; + } + + for (i = 0; i < NR_syscalls; i++) { + addr = arch_syscall_addr(i); + meta = find_syscall_meta(addr); + syscalls_metadata[i] = meta; + } + + return 0; +} +core_initcall(init_ftrace_syscalls); + #ifdef CONFIG_EVENT_PROFILE static DECLARE_BITMAP(enabled_prof_enter_syscalls, NR_syscalls); @@ -384,10 +476,13 @@ static int sys_prof_refcount_exit; static void prof_syscall_enter(struct pt_regs *regs, long id) { - struct syscall_trace_enter *rec; struct syscall_metadata *sys_data; + struct syscall_trace_enter *rec; + unsigned long flags; + char *raw_data; int syscall_nr; int size; + int cpu; syscall_nr = syscall_get_nr(current, regs); if (!test_bit(syscall_nr, enabled_prof_enter_syscalls)) @@ -402,20 +497,38 @@ static void prof_syscall_enter(struct pt_regs *regs, long id) size = ALIGN(size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); - do { - char raw_data[size]; + if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE, + "profile buffer not large enough")) + return; + + /* Protect the per cpu buffer, begin the rcu read side */ + local_irq_save(flags); - /* zero the dead bytes from align to not leak stack to user */ - *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; + cpu = smp_processor_id(); - rec = (struct syscall_trace_enter *) raw_data; - tracing_generic_entry_update(&rec->ent, 0, 0); - rec->ent.type = sys_data->enter_id; - rec->nr = syscall_nr; - syscall_get_arguments(current, regs, 0, sys_data->nb_args, - (unsigned long *)&rec->args); - perf_tpcounter_event(sys_data->enter_id, 0, 1, rec, size); - } while(0); + if (in_nmi()) + raw_data = rcu_dereference(trace_profile_buf_nmi); + else + raw_data = rcu_dereference(trace_profile_buf); + + if (!raw_data) + goto end; + + raw_data = per_cpu_ptr(raw_data, cpu); + + /* zero the dead bytes from align to not leak stack to user */ + *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; + + rec = (struct syscall_trace_enter *) raw_data; + tracing_generic_entry_update(&rec->ent, 0, 0); + rec->ent.type = sys_data->enter_id; + rec->nr = syscall_nr; + syscall_get_arguments(current, regs, 0, sys_data->nb_args, + (unsigned long *)&rec->args); + perf_tp_event(sys_data->enter_id, 0, 1, rec, size); + +end: + local_irq_restore(flags); } int reg_prof_syscall_enter(char *name) @@ -460,8 +573,12 @@ void unreg_prof_syscall_enter(char *name) static void prof_syscall_exit(struct pt_regs *regs, long ret) { struct syscall_metadata *sys_data; - struct syscall_trace_exit rec; + struct syscall_trace_exit *rec; + unsigned long flags; int syscall_nr; + char *raw_data; + int size; + int cpu; syscall_nr = syscall_get_nr(current, regs); if (!test_bit(syscall_nr, enabled_prof_exit_syscalls)) @@ -471,12 +588,46 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret) if (!sys_data) return; - tracing_generic_entry_update(&rec.ent, 0, 0); - rec.ent.type = sys_data->exit_id; - rec.nr = syscall_nr; - rec.ret = syscall_get_return_value(current, regs); + /* We can probably do that at build time */ + size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64)); + size -= sizeof(u32); + + /* + * Impossible, but be paranoid with the future + * How to put this check outside runtime? + */ + if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE, + "exit event has grown above profile buffer size")) + return; + + /* Protect the per cpu buffer, begin the rcu read side */ + local_irq_save(flags); + cpu = smp_processor_id(); + + if (in_nmi()) + raw_data = rcu_dereference(trace_profile_buf_nmi); + else + raw_data = rcu_dereference(trace_profile_buf); + + if (!raw_data) + goto end; + + raw_data = per_cpu_ptr(raw_data, cpu); + + /* zero the dead bytes from align to not leak stack to user */ + *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; + + rec = (struct syscall_trace_exit *)raw_data; - perf_tpcounter_event(sys_data->exit_id, 0, 1, &rec, sizeof(rec)); + tracing_generic_entry_update(&rec->ent, 0, 0); + rec->ent.type = sys_data->exit_id; + rec->nr = syscall_nr; + rec->ret = syscall_get_return_value(current, regs); + + perf_tp_event(sys_data->exit_id, 0, 1, rec, size); + +end: + local_irq_restore(flags); } int reg_prof_syscall_exit(char *name) diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 9489a0a..cc89be5 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -48,7 +48,7 @@ static struct hlist_head tracepoint_table[TRACEPOINT_TABLE_SIZE]; /* * Note about RCU : - * It is used to to delay the free of multiple probes array until a quiescent + * It is used to delay the free of multiple probes array until a quiescent * state is reached. * Tracepoint entries modifications are protected by the tracepoints_mutex. */ diff --git a/kernel/uid16.c b/kernel/uid16.c index 0314501..4192098 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c @@ -4,7 +4,6 @@ */ #include <linux/mm.h> -#include <linux/utsname.h> #include <linux/mman.h> #include <linux/notifier.h> #include <linux/reboot.h> diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c index 92359cc..69eae35 100644 --- a/kernel/utsname_sysctl.c +++ b/kernel/utsname_sysctl.c @@ -42,14 +42,14 @@ static void put_uts(ctl_table *table, int write, void *which) * Special case of dostring for the UTS structure. This has locks * to observe. Should this be in kernel/sys.c ???? */ -static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, +static int proc_do_uts_string(ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table uts_table; int r; memcpy(&uts_table, table, sizeof(uts_table)); uts_table.data = get_uts(table, write); - r = proc_dostring(&uts_table,write,filp,buffer,lenp, ppos); + r = proc_dostring(&uts_table,write,buffer,lenp, ppos); put_uts(table, write, uts_table.data); return r; } diff --git a/kernel/workqueue.c b/kernel/workqueue.c index addfe2d..ccefe57 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -640,6 +640,24 @@ int schedule_delayed_work(struct delayed_work *dwork, EXPORT_SYMBOL(schedule_delayed_work); /** + * flush_delayed_work - block until a dwork_struct's callback has terminated + * @dwork: the delayed work which is to be flushed + * + * Any timeout is cancelled, and any pending work is run immediately. + */ +void flush_delayed_work(struct delayed_work *dwork) +{ + if (del_timer(&dwork->timer)) { + struct cpu_workqueue_struct *cwq; + cwq = wq_per_cpu(keventd_wq, get_cpu()); + __queue_work(cwq, &dwork->work); + put_cpu(); + } + flush_work(&dwork->work); +} +EXPORT_SYMBOL(flush_delayed_work); + +/** * schedule_delayed_work_on - queue work in global workqueue on CPU after delay * @cpu: cpu to use * @dwork: job to be done |