diff options
Diffstat (limited to 'kernel')
67 files changed, 3674 insertions, 3455 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 3d9c7e2..b8d4cd8 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -58,7 +58,6 @@ obj-$(CONFIG_KEXEC) += kexec.o obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o obj-$(CONFIG_COMPAT) += compat.o obj-$(CONFIG_CGROUPS) += cgroup.o -obj-$(CONFIG_CGROUP_DEBUG) += cgroup_debug.o obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o obj-$(CONFIG_CPUSETS) += cpuset.o obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o @@ -87,7 +86,6 @@ obj-$(CONFIG_RELAY) += relay.o obj-$(CONFIG_SYSCTL) += utsname_sysctl.o obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o -obj-$(CONFIG_MARKERS) += marker.o obj-$(CONFIG_TRACEPOINTS) += tracepoint.o obj-$(CONFIG_LATENCYTOP) += latencytop.o obj-$(CONFIG_FUNCTION_TRACER) += trace/ @@ -96,7 +94,7 @@ obj-$(CONFIG_X86_DS) += trace/ obj-$(CONFIG_RING_BUFFER) += trace/ obj-$(CONFIG_SMP) += sched_cpupri.o obj-$(CONFIG_SLOW_WORK) += slow-work.o -obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o +obj-$(CONFIG_PERF_EVENTS) += perf_event.o ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is diff --git a/kernel/audit.c b/kernel/audit.c index defc2e6..5feed23 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -855,18 +855,24 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) break; } case AUDIT_SIGNAL_INFO: - err = security_secid_to_secctx(audit_sig_sid, &ctx, &len); - if (err) - return err; + len = 0; + if (audit_sig_sid) { + err = security_secid_to_secctx(audit_sig_sid, &ctx, &len); + if (err) + return err; + } sig_data = kmalloc(sizeof(*sig_data) + len, GFP_KERNEL); if (!sig_data) { - security_release_secctx(ctx, len); + if (audit_sig_sid) + security_release_secctx(ctx, len); return -ENOMEM; } sig_data->uid = audit_sig_uid; sig_data->pid = audit_sig_pid; - memcpy(sig_data->ctx, ctx, len); - security_release_secctx(ctx, len); + if (audit_sig_sid) { + memcpy(sig_data->ctx, ctx, len); + security_release_secctx(ctx, len); + } audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_SIGNAL_INFO, 0, 0, sig_data, sizeof(*sig_data) + len); kfree(sig_data); diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 0e96dbc..cc7e879 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -45,8 +45,8 @@ struct audit_watch { atomic_t count; /* reference count */ - char *path; /* insertion path */ dev_t dev; /* associated superblock device */ + char *path; /* insertion path */ unsigned long ino; /* associated inode number */ struct audit_parent *parent; /* associated parent */ struct list_head wlist; /* entry in parent->watches list */ diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 68d3c6a..267e484 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -168,12 +168,12 @@ struct audit_context { int in_syscall; /* 1 if task is in a syscall */ enum audit_state state, current_state; unsigned int serial; /* serial number for record */ - struct timespec ctime; /* time of syscall entry */ int major; /* syscall number */ + struct timespec ctime; /* time of syscall entry */ unsigned long argv[4]; /* syscall arguments */ - int return_valid; /* return code is valid */ long return_code;/* syscall return code */ u64 prio; + int return_valid; /* return code is valid */ int name_count; struct audit_names names[AUDIT_NAMES]; char * filterkey; /* key for rule that triggered record */ @@ -198,8 +198,8 @@ struct audit_context { char target_comm[TASK_COMM_LEN]; struct audit_tree_refs *trees, *first_trees; - int tree_count; struct list_head killed_trees; + int tree_count; int type; union { diff --git a/kernel/cgroup.c b/kernel/cgroup.c index c7ece8f..7ccba4b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -23,6 +23,7 @@ */ #include <linux/cgroup.h> +#include <linux/ctype.h> #include <linux/errno.h> #include <linux/fs.h> #include <linux/kernel.h> @@ -48,6 +49,8 @@ #include <linux/namei.h> #include <linux/smp_lock.h> #include <linux/pid_namespace.h> +#include <linux/idr.h> +#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ #include <asm/atomic.h> @@ -60,6 +63,8 @@ static struct cgroup_subsys *subsys[] = { #include <linux/cgroup_subsys.h> }; +#define MAX_CGROUP_ROOT_NAMELEN 64 + /* * A cgroupfs_root represents the root of a cgroup hierarchy, * and may be associated with a superblock to form an active @@ -74,6 +79,9 @@ struct cgroupfs_root { */ unsigned long subsys_bits; + /* Unique id for this hierarchy. */ + int hierarchy_id; + /* The bitmask of subsystems currently attached to this hierarchy */ unsigned long actual_subsys_bits; @@ -94,6 +102,9 @@ struct cgroupfs_root { /* The path to use for release notifications. */ char release_agent_path[PATH_MAX]; + + /* The name for this hierarchy - may be empty */ + char name[MAX_CGROUP_ROOT_NAMELEN]; }; /* @@ -141,6 +152,10 @@ struct css_id { static LIST_HEAD(roots); static int root_count; +static DEFINE_IDA(hierarchy_ida); +static int next_hierarchy_id; +static DEFINE_SPINLOCK(hierarchy_id_lock); + /* dummytop is a shorthand for the dummy hierarchy's top cgroup */ #define dummytop (&rootnode.top_cgroup) @@ -201,6 +216,7 @@ struct cg_cgroup_link { * cgroup, anchored on cgroup->css_sets */ struct list_head cgrp_link_list; + struct cgroup *cgrp; /* * List running through cg_cgroup_links pointing at a * single css_set object, anchored on css_set->cg_links @@ -227,8 +243,11 @@ static int cgroup_subsys_init_idr(struct cgroup_subsys *ss); static DEFINE_RWLOCK(css_set_lock); static int css_set_count; -/* hash table for cgroup groups. This improves the performance to - * find an existing css_set */ +/* + * hash table for cgroup groups. This improves the performance to find + * an existing css_set. This hash doesn't (currently) take into + * account cgroups in empty hierarchies. + */ #define CSS_SET_HASH_BITS 7 #define CSS_SET_TABLE_SIZE (1 << CSS_SET_HASH_BITS) static struct hlist_head css_set_table[CSS_SET_TABLE_SIZE]; @@ -248,48 +267,22 @@ static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[]) return &css_set_table[index]; } +static void free_css_set_rcu(struct rcu_head *obj) +{ + struct css_set *cg = container_of(obj, struct css_set, rcu_head); + kfree(cg); +} + /* We don't maintain the lists running through each css_set to its * task until after the first call to cgroup_iter_start(). This * reduces the fork()/exit() overhead for people who have cgroups * compiled into their kernel but not actually in use */ static int use_task_css_set_links __read_mostly; -/* When we create or destroy a css_set, the operation simply - * takes/releases a reference count on all the cgroups referenced - * by subsystems in this css_set. This can end up multiple-counting - * some cgroups, but that's OK - the ref-count is just a - * busy/not-busy indicator; ensuring that we only count each cgroup - * once would require taking a global lock to ensure that no - * subsystems moved between hierarchies while we were doing so. - * - * Possible TODO: decide at boot time based on the number of - * registered subsystems and the number of CPUs or NUMA nodes whether - * it's better for performance to ref-count every subsystem, or to - * take a global lock and only add one ref count to each hierarchy. - */ - -/* - * unlink a css_set from the list and free it - */ -static void unlink_css_set(struct css_set *cg) +static void __put_css_set(struct css_set *cg, int taskexit) { struct cg_cgroup_link *link; struct cg_cgroup_link *saved_link; - - hlist_del(&cg->hlist); - css_set_count--; - - list_for_each_entry_safe(link, saved_link, &cg->cg_links, - cg_link_list) { - list_del(&link->cg_link_list); - list_del(&link->cgrp_link_list); - kfree(link); - } -} - -static void __put_css_set(struct css_set *cg, int taskexit) -{ - int i; /* * Ensure that the refcount doesn't hit zero while any readers * can see it. Similar to atomic_dec_and_lock(), but for an @@ -302,21 +295,28 @@ static void __put_css_set(struct css_set *cg, int taskexit) write_unlock(&css_set_lock); return; } - unlink_css_set(cg); - write_unlock(&css_set_lock); - rcu_read_lock(); - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - struct cgroup *cgrp = rcu_dereference(cg->subsys[i]->cgroup); + /* This css_set is dead. unlink it and release cgroup refcounts */ + hlist_del(&cg->hlist); + css_set_count--; + + list_for_each_entry_safe(link, saved_link, &cg->cg_links, + cg_link_list) { + struct cgroup *cgrp = link->cgrp; + list_del(&link->cg_link_list); + list_del(&link->cgrp_link_list); if (atomic_dec_and_test(&cgrp->count) && notify_on_release(cgrp)) { if (taskexit) set_bit(CGRP_RELEASABLE, &cgrp->flags); check_for_release(cgrp); } + + kfree(link); } - rcu_read_unlock(); - kfree(cg); + + write_unlock(&css_set_lock); + call_rcu(&cg->rcu_head, free_css_set_rcu); } /* @@ -338,6 +338,78 @@ static inline void put_css_set_taskexit(struct css_set *cg) } /* + * compare_css_sets - helper function for find_existing_css_set(). + * @cg: candidate css_set being tested + * @old_cg: existing css_set for a task + * @new_cgrp: cgroup that's being entered by the task + * @template: desired set of css pointers in css_set (pre-calculated) + * + * Returns true if "cg" matches "old_cg" except for the hierarchy + * which "new_cgrp" belongs to, for which it should match "new_cgrp". + */ +static bool compare_css_sets(struct css_set *cg, + struct css_set *old_cg, + struct cgroup *new_cgrp, + struct cgroup_subsys_state *template[]) +{ + struct list_head *l1, *l2; + + if (memcmp(template, cg->subsys, sizeof(cg->subsys))) { + /* Not all subsystems matched */ + return false; + } + + /* + * Compare cgroup pointers in order to distinguish between + * different cgroups in heirarchies with no subsystems. We + * could get by with just this check alone (and skip the + * memcmp above) but on most setups the memcmp check will + * avoid the need for this more expensive check on almost all + * candidates. + */ + + l1 = &cg->cg_links; + l2 = &old_cg->cg_links; + while (1) { + struct cg_cgroup_link *cgl1, *cgl2; + struct cgroup *cg1, *cg2; + + l1 = l1->next; + l2 = l2->next; + /* See if we reached the end - both lists are equal length. */ + if (l1 == &cg->cg_links) { + BUG_ON(l2 != &old_cg->cg_links); + break; + } else { + BUG_ON(l2 == &old_cg->cg_links); + } + /* Locate the cgroups associated with these links. */ + cgl1 = list_entry(l1, struct cg_cgroup_link, cg_link_list); + cgl2 = list_entry(l2, struct cg_cgroup_link, cg_link_list); + cg1 = cgl1->cgrp; + cg2 = cgl2->cgrp; + /* Hierarchies should be linked in the same order. */ + BUG_ON(cg1->root != cg2->root); + + /* + * If this hierarchy is the hierarchy of the cgroup + * that's changing, then we need to check that this + * css_set points to the new cgroup; if it's any other + * hierarchy, then this css_set should point to the + * same cgroup as the old css_set. + */ + if (cg1->root == new_cgrp->root) { + if (cg1 != new_cgrp) + return false; + } else { + if (cg1 != cg2) + return false; + } + } + return true; +} + +/* * find_existing_css_set() is a helper for * find_css_set(), and checks to see whether an existing * css_set is suitable. @@ -378,10 +450,11 @@ static struct css_set *find_existing_css_set( hhead = css_set_hash(template); hlist_for_each_entry(cg, node, hhead, hlist) { - if (!memcmp(template, cg->subsys, sizeof(cg->subsys))) { - /* All subsystems matched */ - return cg; - } + if (!compare_css_sets(cg, oldcg, cgrp, template)) + continue; + + /* This css_set matches what we need */ + return cg; } /* No existing cgroup group matched */ @@ -435,8 +508,14 @@ static void link_css_set(struct list_head *tmp_cg_links, link = list_first_entry(tmp_cg_links, struct cg_cgroup_link, cgrp_link_list); link->cg = cg; + link->cgrp = cgrp; + atomic_inc(&cgrp->count); list_move(&link->cgrp_link_list, &cgrp->css_sets); - list_add(&link->cg_link_list, &cg->cg_links); + /* + * Always add links to the tail of the list so that the list + * is sorted by order of hierarchy creation + */ + list_add_tail(&link->cg_link_list, &cg->cg_links); } /* @@ -451,11 +530,11 @@ static struct css_set *find_css_set( { struct css_set *res; struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT]; - int i; struct list_head tmp_cg_links; struct hlist_head *hhead; + struct cg_cgroup_link *link; /* First see if we already have a cgroup group that matches * the desired set */ @@ -489,20 +568,12 @@ static struct css_set *find_css_set( write_lock(&css_set_lock); /* Add reference counts and links from the new css_set. */ - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - struct cgroup *cgrp = res->subsys[i]->cgroup; - struct cgroup_subsys *ss = subsys[i]; - atomic_inc(&cgrp->count); - /* - * We want to add a link once per cgroup, so we - * only do it for the first subsystem in each - * hierarchy - */ - if (ss->root->subsys_list.next == &ss->sibling) - link_css_set(&tmp_cg_links, res, cgrp); + list_for_each_entry(link, &oldcg->cg_links, cg_link_list) { + struct cgroup *c = link->cgrp; + if (c->root == cgrp->root) + c = cgrp; + link_css_set(&tmp_cg_links, res, c); } - if (list_empty(&rootnode.subsys_list)) - link_css_set(&tmp_cg_links, res, dummytop); BUG_ON(!list_empty(&tmp_cg_links)); @@ -518,6 +589,41 @@ static struct css_set *find_css_set( } /* + * Return the cgroup for "task" from the given hierarchy. Must be + * called with cgroup_mutex held. + */ +static struct cgroup *task_cgroup_from_root(struct task_struct *task, + struct cgroupfs_root *root) +{ + struct css_set *css; + struct cgroup *res = NULL; + + BUG_ON(!mutex_is_locked(&cgroup_mutex)); + read_lock(&css_set_lock); + /* + * No need to lock the task - since we hold cgroup_mutex the + * task can't change groups, so the only thing that can happen + * is that it exits and its css is set back to init_css_set. + */ + css = task->cgroups; + if (css == &init_css_set) { + res = &root->top_cgroup; + } else { + struct cg_cgroup_link *link; + list_for_each_entry(link, &css->cg_links, cg_link_list) { + struct cgroup *c = link->cgrp; + if (c->root == root) { + res = c; + break; + } + } + } + read_unlock(&css_set_lock); + BUG_ON(!res); + return res; +} + +/* * There is one global cgroup mutex. We also require taking * task_lock() when dereferencing a task's cgroup subsys pointers. * See "The task_lock() exception", at the end of this comment. @@ -596,7 +702,7 @@ void cgroup_unlock(void) static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode); static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); static int cgroup_populate_dir(struct cgroup *cgrp); -static struct inode_operations cgroup_dir_inode_operations; +static const struct inode_operations cgroup_dir_inode_operations; static struct file_operations proc_cgroupstats_operations; static struct backing_dev_info cgroup_backing_dev_info = { @@ -677,6 +783,12 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) */ deactivate_super(cgrp->root->sb); + /* + * if we're getting rid of the cgroup, refcount should ensure + * that there are no pidlists left. + */ + BUG_ON(!list_empty(&cgrp->pidlists)); + call_rcu(&cgrp->rcu_head, free_cgroup_rcu); } iput(inode); @@ -841,6 +953,8 @@ static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_puts(seq, ",noprefix"); if (strlen(root->release_agent_path)) seq_printf(seq, ",release_agent=%s", root->release_agent_path); + if (strlen(root->name)) + seq_printf(seq, ",name=%s", root->name); mutex_unlock(&cgroup_mutex); return 0; } @@ -849,6 +963,12 @@ struct cgroup_sb_opts { unsigned long subsys_bits; unsigned long flags; char *release_agent; + char *name; + /* User explicitly requested empty subsystem */ + bool none; + + struct cgroupfs_root *new_root; + }; /* Convert a hierarchy specifier into a bitmask of subsystems and @@ -863,9 +983,7 @@ static int parse_cgroupfs_options(char *data, mask = ~(1UL << cpuset_subsys_id); #endif - opts->subsys_bits = 0; - opts->flags = 0; - opts->release_agent = NULL; + memset(opts, 0, sizeof(*opts)); while ((token = strsep(&o, ",")) != NULL) { if (!*token) @@ -879,17 +997,42 @@ static int parse_cgroupfs_options(char *data, if (!ss->disabled) opts->subsys_bits |= 1ul << i; } + } else if (!strcmp(token, "none")) { + /* Explicitly have no subsystems */ + opts->none = true; } else if (!strcmp(token, "noprefix")) { set_bit(ROOT_NOPREFIX, &opts->flags); } else if (!strncmp(token, "release_agent=", 14)) { /* Specifying two release agents is forbidden */ if (opts->release_agent) return -EINVAL; - opts->release_agent = kzalloc(PATH_MAX, GFP_KERNEL); + opts->release_agent = + kstrndup(token + 14, PATH_MAX, GFP_KERNEL); if (!opts->release_agent) return -ENOMEM; - strncpy(opts->release_agent, token + 14, PATH_MAX - 1); - opts->release_agent[PATH_MAX - 1] = 0; + } else if (!strncmp(token, "name=", 5)) { + int i; + const char *name = token + 5; + /* Can't specify an empty name */ + if (!strlen(name)) + return -EINVAL; + /* Must match [\w.-]+ */ + for (i = 0; i < strlen(name); i++) { + char c = name[i]; + if (isalnum(c)) + continue; + if ((c == '.') || (c == '-') || (c == '_')) + continue; + return -EINVAL; + } + /* Specifying two names is forbidden */ + if (opts->name) + return -EINVAL; + opts->name = kstrndup(name, + MAX_CGROUP_ROOT_NAMELEN, + GFP_KERNEL); + if (!opts->name) + return -ENOMEM; } else { struct cgroup_subsys *ss; int i; @@ -906,6 +1049,8 @@ static int parse_cgroupfs_options(char *data, } } + /* Consistency checks */ + /* * Option noprefix was introduced just for backward compatibility * with the old cpuset, so we allow noprefix only if mounting just @@ -915,8 +1060,16 @@ static int parse_cgroupfs_options(char *data, (opts->subsys_bits & mask)) return -EINVAL; - /* We can't have an empty hierarchy */ - if (!opts->subsys_bits) + + /* Can't specify "none" and some subsystems */ + if (opts->subsys_bits && opts->none) + return -EINVAL; + + /* + * We either have to specify by name or by subsystems. (So all + * empty hierarchies must have a name). + */ + if (!opts->subsys_bits && !opts->name) return -EINVAL; return 0; @@ -944,6 +1097,12 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) goto out_unlock; } + /* Don't allow name to change at remount */ + if (opts.name && strcmp(opts.name, root->name)) { + ret = -EINVAL; + goto out_unlock; + } + ret = rebind_subsystems(root, opts.subsys_bits); if (ret) goto out_unlock; @@ -955,13 +1114,14 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) strcpy(root->release_agent_path, opts.release_agent); out_unlock: kfree(opts.release_agent); + kfree(opts.name); mutex_unlock(&cgroup_mutex); mutex_unlock(&cgrp->dentry->d_inode->i_mutex); unlock_kernel(); return ret; } -static struct super_operations cgroup_ops = { +static const struct super_operations cgroup_ops = { .statfs = simple_statfs, .drop_inode = generic_delete_inode, .show_options = cgroup_show_options, @@ -974,9 +1134,10 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) INIT_LIST_HEAD(&cgrp->children); INIT_LIST_HEAD(&cgrp->css_sets); INIT_LIST_HEAD(&cgrp->release_list); - INIT_LIST_HEAD(&cgrp->pids_list); - init_rwsem(&cgrp->pids_mutex); + INIT_LIST_HEAD(&cgrp->pidlists); + mutex_init(&cgrp->pidlist_mutex); } + static void init_cgroup_root(struct cgroupfs_root *root) { struct cgroup *cgrp = &root->top_cgroup; @@ -988,33 +1149,106 @@ static void init_cgroup_root(struct cgroupfs_root *root) init_cgroup_housekeeping(cgrp); } +static bool init_root_id(struct cgroupfs_root *root) +{ + int ret = 0; + + do { + if (!ida_pre_get(&hierarchy_ida, GFP_KERNEL)) + return false; + spin_lock(&hierarchy_id_lock); + /* Try to allocate the next unused ID */ + ret = ida_get_new_above(&hierarchy_ida, next_hierarchy_id, + &root->hierarchy_id); + if (ret == -ENOSPC) + /* Try again starting from 0 */ + ret = ida_get_new(&hierarchy_ida, &root->hierarchy_id); + if (!ret) { + next_hierarchy_id = root->hierarchy_id + 1; + } else if (ret != -EAGAIN) { + /* Can only get here if the 31-bit IDR is full ... */ + BUG_ON(ret); + } + spin_unlock(&hierarchy_id_lock); + } while (ret); + return true; +} + static int cgroup_test_super(struct super_block *sb, void *data) { - struct cgroupfs_root *new = data; + struct cgroup_sb_opts *opts = data; struct cgroupfs_root *root = sb->s_fs_info; - /* First check subsystems */ - if (new->subsys_bits != root->subsys_bits) - return 0; + /* If we asked for a name then it must match */ + if (opts->name && strcmp(opts->name, root->name)) + return 0; - /* Next check flags */ - if (new->flags != root->flags) + /* + * If we asked for subsystems (or explicitly for no + * subsystems) then they must match + */ + if ((opts->subsys_bits || opts->none) + && (opts->subsys_bits != root->subsys_bits)) return 0; return 1; } +static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts) +{ + struct cgroupfs_root *root; + + if (!opts->subsys_bits && !opts->none) + return NULL; + + root = kzalloc(sizeof(*root), GFP_KERNEL); + if (!root) + return ERR_PTR(-ENOMEM); + + if (!init_root_id(root)) { + kfree(root); + return ERR_PTR(-ENOMEM); + } + init_cgroup_root(root); + + root->subsys_bits = opts->subsys_bits; + root->flags = opts->flags; + if (opts->release_agent) + strcpy(root->release_agent_path, opts->release_agent); + if (opts->name) + strcpy(root->name, opts->name); + return root; +} + +static void cgroup_drop_root(struct cgroupfs_root *root) +{ + if (!root) + return; + + BUG_ON(!root->hierarchy_id); + spin_lock(&hierarchy_id_lock); + ida_remove(&hierarchy_ida, root->hierarchy_id); + spin_unlock(&hierarchy_id_lock); + kfree(root); +} + static int cgroup_set_super(struct super_block *sb, void *data) { int ret; - struct cgroupfs_root *root = data; + struct cgroup_sb_opts *opts = data; + + /* If we don't have a new root, we can't set up a new sb */ + if (!opts->new_root) + return -EINVAL; + + BUG_ON(!opts->subsys_bits && !opts->none); ret = set_anon_super(sb, NULL); if (ret) return ret; - sb->s_fs_info = root; - root->sb = sb; + sb->s_fs_info = opts->new_root; + opts->new_root->sb = sb; sb->s_blocksize = PAGE_CACHE_SIZE; sb->s_blocksize_bits = PAGE_CACHE_SHIFT; @@ -1051,48 +1285,43 @@ static int cgroup_get_sb(struct file_system_type *fs_type, void *data, struct vfsmount *mnt) { struct cgroup_sb_opts opts; + struct cgroupfs_root *root; int ret = 0; struct super_block *sb; - struct cgroupfs_root *root; - struct list_head tmp_cg_links; + struct cgroupfs_root *new_root; /* First find the desired set of subsystems */ ret = parse_cgroupfs_options(data, &opts); - if (ret) { - kfree(opts.release_agent); - return ret; - } - - root = kzalloc(sizeof(*root), GFP_KERNEL); - if (!root) { - kfree(opts.release_agent); - return -ENOMEM; - } + if (ret) + goto out_err; - init_cgroup_root(root); - root->subsys_bits = opts.subsys_bits; - root->flags = opts.flags; - if (opts.release_agent) { - strcpy(root->release_agent_path, opts.release_agent); - kfree(opts.release_agent); + /* + * Allocate a new cgroup root. We may not need it if we're + * reusing an existing hierarchy. + */ + new_root = cgroup_root_from_opts(&opts); + if (IS_ERR(new_root)) { + ret = PTR_ERR(new_root); + goto out_err; } + opts.new_root = new_root; - sb = sget(fs_type, cgroup_test_super, cgroup_set_super, root); - + /* Locate an existing or new sb for this hierarchy */ + sb = sget(fs_type, cgroup_test_super, cgroup_set_super, &opts); if (IS_ERR(sb)) { - kfree(root); - return PTR_ERR(sb); + ret = PTR_ERR(sb); + cgroup_drop_root(opts.new_root); + goto out_err; } - if (sb->s_fs_info != root) { - /* Reusing an existing superblock */ - BUG_ON(sb->s_root == NULL); - kfree(root); - root = NULL; - } else { - /* New superblock */ + root = sb->s_fs_info; + BUG_ON(!root); + if (root == opts.new_root) { + /* We used the new root structure, so this is a new hierarchy */ + struct list_head tmp_cg_links; struct cgroup *root_cgrp = &root->top_cgroup; struct inode *inode; + struct cgroupfs_root *existing_root; int i; BUG_ON(sb->s_root != NULL); @@ -1105,6 +1334,18 @@ static int cgroup_get_sb(struct file_system_type *fs_type, mutex_lock(&inode->i_mutex); mutex_lock(&cgroup_mutex); + if (strlen(root->name)) { + /* Check for name clashes with existing mounts */ + for_each_active_root(existing_root) { + if (!strcmp(existing_root->name, root->name)) { + ret = -EBUSY; + mutex_unlock(&cgroup_mutex); + mutex_unlock(&inode->i_mutex); + goto drop_new_super; + } + } + } + /* * We're accessing css_set_count without locking * css_set_lock here, but that's OK - it can only be @@ -1123,7 +1364,8 @@ static int cgroup_get_sb(struct file_system_type *fs_type, if (ret == -EBUSY) { mutex_unlock(&cgroup_mutex); mutex_unlock(&inode->i_mutex); - goto free_cg_links; + free_cg_links(&tmp_cg_links); + goto drop_new_super; } /* EBUSY should be the only error here */ @@ -1155,17 +1397,27 @@ static int cgroup_get_sb(struct file_system_type *fs_type, BUG_ON(root->number_of_cgroups != 1); cgroup_populate_dir(root_cgrp); - mutex_unlock(&inode->i_mutex); mutex_unlock(&cgroup_mutex); + mutex_unlock(&inode->i_mutex); + } else { + /* + * We re-used an existing hierarchy - the new root (if + * any) is not needed + */ + cgroup_drop_root(opts.new_root); } simple_set_mnt(mnt, sb); + kfree(opts.release_agent); + kfree(opts.name); return 0; - free_cg_links: - free_cg_links(&tmp_cg_links); drop_new_super: deactivate_locked_super(sb); + out_err: + kfree(opts.release_agent); + kfree(opts.name); + return ret; } @@ -1211,7 +1463,7 @@ static void cgroup_kill_sb(struct super_block *sb) { mutex_unlock(&cgroup_mutex); kill_litter_super(sb); - kfree(root); + cgroup_drop_root(root); } static struct file_system_type cgroup_fs_type = { @@ -1276,27 +1528,6 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) return 0; } -/* - * Return the first subsystem attached to a cgroup's hierarchy, and - * its subsystem id. - */ - -static void get_first_subsys(const struct cgroup *cgrp, - struct cgroup_subsys_state **css, int *subsys_id) -{ - const struct cgroupfs_root *root = cgrp->root; - const struct cgroup_subsys *test_ss; - BUG_ON(list_empty(&root->subsys_list)); - test_ss = list_entry(root->subsys_list.next, - struct cgroup_subsys, sibling); - if (css) { - *css = cgrp->subsys[test_ss->subsys_id]; - BUG_ON(!*css); - } - if (subsys_id) - *subsys_id = test_ss->subsys_id; -} - /** * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp' * @cgrp: the cgroup the task is attaching to @@ -1313,18 +1544,15 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) struct css_set *cg; struct css_set *newcg; struct cgroupfs_root *root = cgrp->root; - int subsys_id; - - get_first_subsys(cgrp, NULL, &subsys_id); /* Nothing to do if the task is already in that cgroup */ - oldcgrp = task_cgroup(tsk, subsys_id); + oldcgrp = task_cgroup_from_root(tsk, root); if (cgrp == oldcgrp) return 0; for_each_subsys(root, ss) { if (ss->can_attach) { - retval = ss->can_attach(ss, cgrp, tsk); + retval = ss->can_attach(ss, cgrp, tsk, false); if (retval) return retval; } @@ -1362,7 +1590,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) for_each_subsys(root, ss) { if (ss->attach) - ss->attach(ss, cgrp, oldcgrp, tsk); + ss->attach(ss, cgrp, oldcgrp, tsk, false); } set_bit(CGRP_RELEASABLE, &oldcgrp->flags); synchronize_rcu(); @@ -1423,15 +1651,6 @@ static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid) return ret; } -/* The various types of files and directories in a cgroup file system */ -enum cgroup_filetype { - FILE_ROOT, - FILE_DIR, - FILE_TASKLIST, - FILE_NOTIFY_ON_RELEASE, - FILE_RELEASE_AGENT, -}; - /** * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive. * @cgrp: the cgroup to be checked for liveness @@ -1711,7 +1930,7 @@ static struct file_operations cgroup_file_operations = { .release = cgroup_file_release, }; -static struct inode_operations cgroup_dir_inode_operations = { +static const struct inode_operations cgroup_dir_inode_operations = { .lookup = simple_lookup, .mkdir = cgroup_mkdir, .rmdir = cgroup_rmdir, @@ -1876,7 +2095,7 @@ int cgroup_task_count(const struct cgroup *cgrp) * the start of a css_set */ static void cgroup_advance_iter(struct cgroup *cgrp, - struct cgroup_iter *it) + struct cgroup_iter *it) { struct list_head *l = it->cg_link; struct cg_cgroup_link *link; @@ -2129,7 +2348,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) } /* - * Stuff for reading the 'tasks' file. + * Stuff for reading the 'tasks'/'procs' files. * * Reading this file can return large amounts of data if a cgroup has * *lots* of attached tasks. So it may need several calls to read(), @@ -2139,27 +2358,196 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) */ /* - * Load into 'pidarray' up to 'npids' of the tasks using cgroup - * 'cgrp'. Return actual number of pids loaded. No need to - * task_lock(p) when reading out p->cgroup, since we're in an RCU - * read section, so the css_set can't go away, and is - * immutable after creation. + * The following two functions "fix" the issue where there are more pids + * than kmalloc will give memory for; in such cases, we use vmalloc/vfree. + * TODO: replace with a kernel-wide solution to this problem + */ +#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2)) +static void *pidlist_allocate(int count) +{ + if (PIDLIST_TOO_LARGE(count)) + return vmalloc(count * sizeof(pid_t)); + else + return kmalloc(count * sizeof(pid_t), GFP_KERNEL); +} +static void pidlist_free(void *p) +{ + if (is_vmalloc_addr(p)) + vfree(p); + else + kfree(p); +} +static void *pidlist_resize(void *p, int newcount) +{ + void *newlist; + /* note: if new alloc fails, old p will still be valid either way */ + if (is_vmalloc_addr(p)) { + newlist = vmalloc(newcount * sizeof(pid_t)); + if (!newlist) + return NULL; + memcpy(newlist, p, newcount * sizeof(pid_t)); + vfree(p); + } else { + newlist = krealloc(p, newcount * sizeof(pid_t), GFP_KERNEL); + } + return newlist; +} + +/* + * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries + * If the new stripped list is sufficiently smaller and there's enough memory + * to allocate a new buffer, will let go of the unneeded memory. Returns the + * number of unique elements. + */ +/* is the size difference enough that we should re-allocate the array? */ +#define PIDLIST_REALLOC_DIFFERENCE(old, new) ((old) - PAGE_SIZE >= (new)) +static int pidlist_uniq(pid_t **p, int length) +{ + int src, dest = 1; + pid_t *list = *p; + pid_t *newlist; + + /* + * we presume the 0th element is unique, so i starts at 1. trivial + * edge cases first; no work needs to be done for either + */ + if (length == 0 || length == 1) + return length; + /* src and dest walk down the list; dest counts unique elements */ + for (src = 1; src < length; src++) { + /* find next unique element */ + while (list[src] == list[src-1]) { + src++; + if (src == length) + goto after; + } + /* dest always points to where the next unique element goes */ + list[dest] = list[src]; + dest++; + } +after: + /* + * if the length difference is large enough, we want to allocate a + * smaller buffer to save memory. if this fails due to out of memory, + * we'll just stay with what we've got. + */ + if (PIDLIST_REALLOC_DIFFERENCE(length, dest)) { + newlist = pidlist_resize(list, dest); + if (newlist) + *p = newlist; + } + return dest; +} + +static int cmppid(const void *a, const void *b) +{ + return *(pid_t *)a - *(pid_t *)b; +} + +/* + * find the appropriate pidlist for our purpose (given procs vs tasks) + * returns with the lock on that pidlist already held, and takes care + * of the use count, or returns NULL with no locks held if we're out of + * memory. */ -static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cgrp) +static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, + enum cgroup_filetype type) { - int n = 0, pid; + struct cgroup_pidlist *l; + /* don't need task_nsproxy() if we're looking at ourself */ + struct pid_namespace *ns = get_pid_ns(current->nsproxy->pid_ns); + /* + * We can't drop the pidlist_mutex before taking the l->mutex in case + * the last ref-holder is trying to remove l from the list at the same + * time. Holding the pidlist_mutex precludes somebody taking whichever + * list we find out from under us - compare release_pid_array(). + */ + mutex_lock(&cgrp->pidlist_mutex); + list_for_each_entry(l, &cgrp->pidlists, links) { + if (l->key.type == type && l->key.ns == ns) { + /* found a matching list - drop the extra refcount */ + put_pid_ns(ns); + /* make sure l doesn't vanish out from under us */ + down_write(&l->mutex); + mutex_unlock(&cgrp->pidlist_mutex); + l->use_count++; + return l; + } + } + /* entry not found; create a new one */ + l = kmalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL); + if (!l) { + mutex_unlock(&cgrp->pidlist_mutex); + put_pid_ns(ns); + return l; + } + init_rwsem(&l->mutex); + down_write(&l->mutex); + l->key.type = type; + l->key.ns = ns; + l->use_count = 0; /* don't increment here */ + l->list = NULL; + l->owner = cgrp; + list_add(&l->links, &cgrp->pidlists); + mutex_unlock(&cgrp->pidlist_mutex); + return l; +} + +/* + * Load a cgroup's pidarray with either procs' tgids or tasks' pids + */ +static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, + struct cgroup_pidlist **lp) +{ + pid_t *array; + int length; + int pid, n = 0; /* used for populating the array */ struct cgroup_iter it; struct task_struct *tsk; + struct cgroup_pidlist *l; + + /* + * If cgroup gets more users after we read count, we won't have + * enough space - tough. This race is indistinguishable to the + * caller from the case that the additional cgroup users didn't + * show up until sometime later on. + */ + length = cgroup_task_count(cgrp); + array = pidlist_allocate(length); + if (!array) + return -ENOMEM; + /* now, populate the array */ cgroup_iter_start(cgrp, &it); while ((tsk = cgroup_iter_next(cgrp, &it))) { - if (unlikely(n == npids)) + if (unlikely(n == length)) break; - pid = task_pid_vnr(tsk); - if (pid > 0) - pidarray[n++] = pid; + /* get tgid or pid for procs or tasks file respectively */ + if (type == CGROUP_FILE_PROCS) + pid = task_tgid_vnr(tsk); + else + pid = task_pid_vnr(tsk); + if (pid > 0) /* make sure to only use valid results */ + array[n++] = pid; } cgroup_iter_end(cgrp, &it); - return n; + length = n; + /* now sort & (if procs) strip out duplicates */ + sort(array, length, sizeof(pid_t), cmppid, NULL); + if (type == CGROUP_FILE_PROCS) + length = pidlist_uniq(&array, length); + l = cgroup_pidlist_find(cgrp, type); + if (!l) { + pidlist_free(array); + return -ENOMEM; + } + /* store array, freeing old if necessary - lock already held */ + pidlist_free(l->list); + l->list = array; + l->length = length; + l->use_count++; + up_write(&l->mutex); + *lp = l; + return 0; } /** @@ -2216,37 +2604,14 @@ err: return ret; } -/* - * Cache pids for all threads in the same pid namespace that are - * opening the same "tasks" file. - */ -struct cgroup_pids { - /* The node in cgrp->pids_list */ - struct list_head list; - /* The cgroup those pids belong to */ - struct cgroup *cgrp; - /* The namepsace those pids belong to */ - struct pid_namespace *ns; - /* Array of process ids in the cgroup */ - pid_t *tasks_pids; - /* How many files are using the this tasks_pids array */ - int use_count; - /* Length of the current tasks_pids array */ - int length; -}; - -static int cmppid(const void *a, const void *b) -{ - return *(pid_t *)a - *(pid_t *)b; -} /* - * seq_file methods for the "tasks" file. The seq_file position is the + * seq_file methods for the tasks/procs files. The seq_file position is the * next pid to display; the seq_file iterator is a pointer to the pid - * in the cgroup->tasks_pids array. + * in the cgroup->l->list array. */ -static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos) +static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) { /* * Initially we receive a position value that corresponds to @@ -2254,48 +2619,45 @@ static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos) * after a seek to the start). Use a binary-search to find the * next pid to display, if any */ - struct cgroup_pids *cp = s->private; - struct cgroup *cgrp = cp->cgrp; + struct cgroup_pidlist *l = s->private; int index = 0, pid = *pos; int *iter; - down_read(&cgrp->pids_mutex); + down_read(&l->mutex); if (pid) { - int end = cp->length; + int end = l->length; while (index < end) { int mid = (index + end) / 2; - if (cp->tasks_pids[mid] == pid) { + if (l->list[mid] == pid) { index = mid; break; - } else if (cp->tasks_pids[mid] <= pid) + } else if (l->list[mid] <= pid) index = mid + 1; else end = mid; } } /* If we're off the end of the array, we're done */ - if (index >= cp->length) + if (index >= l->length) return NULL; /* Update the abstract position to be the actual pid that we found */ - iter = cp->tasks_pids + index; + iter = l->list + index; *pos = *iter; return iter; } -static void cgroup_tasks_stop(struct seq_file *s, void *v) +static void cgroup_pidlist_stop(struct seq_file *s, void *v) { - struct cgroup_pids *cp = s->private; - struct cgroup *cgrp = cp->cgrp; - up_read(&cgrp->pids_mutex); + struct cgroup_pidlist *l = s->private; + up_read(&l->mutex); } -static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos) +static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) { - struct cgroup_pids *cp = s->private; - int *p = v; - int *end = cp->tasks_pids + cp->length; - + struct cgroup_pidlist *l = s->private; + pid_t *p = v; + pid_t *end = l->list + l->length; /* * Advance to the next pid in the array. If this goes off the * end, we're done @@ -2309,124 +2671,107 @@ static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos) } } -static int cgroup_tasks_show(struct seq_file *s, void *v) +static int cgroup_pidlist_show(struct seq_file *s, void *v) { return seq_printf(s, "%d\n", *(int *)v); } -static struct seq_operations cgroup_tasks_seq_operations = { - .start = cgroup_tasks_start, - .stop = cgroup_tasks_stop, - .next = cgroup_tasks_next, - .show = cgroup_tasks_show, +/* + * seq_operations functions for iterating on pidlists through seq_file - + * independent of whether it's tasks or procs + */ +static const struct seq_operations cgroup_pidlist_seq_operations = { + .start = cgroup_pidlist_start, + .stop = cgroup_pidlist_stop, + .next = cgroup_pidlist_next, + .show = cgroup_pidlist_show, }; -static void release_cgroup_pid_array(struct cgroup_pids *cp) +static void cgroup_release_pid_array(struct cgroup_pidlist *l) { - struct cgroup *cgrp = cp->cgrp; - - down_write(&cgrp->pids_mutex); - BUG_ON(!cp->use_count); - if (!--cp->use_count) { - list_del(&cp->list); - put_pid_ns(cp->ns); - kfree(cp->tasks_pids); - kfree(cp); + /* + * the case where we're the last user of this particular pidlist will + * have us remove it from the cgroup's list, which entails taking the + * mutex. since in pidlist_find the pidlist->lock depends on cgroup-> + * pidlist_mutex, we have to take pidlist_mutex first. + */ + mutex_lock(&l->owner->pidlist_mutex); + down_write(&l->mutex); + BUG_ON(!l->use_count); + if (!--l->use_count) { + /* we're the last user if refcount is 0; remove and free */ + list_del(&l->links); + mutex_unlock(&l->owner->pidlist_mutex); + pidlist_free(l->list); + put_pid_ns(l->key.ns); + up_write(&l->mutex); + kfree(l); + return; } - up_write(&cgrp->pids_mutex); + mutex_unlock(&l->owner->pidlist_mutex); + up_write(&l->mutex); } -static int cgroup_tasks_release(struct inode *inode, struct file *file) +static int cgroup_pidlist_release(struct inode *inode, struct file *file) { - struct seq_file *seq; - struct cgroup_pids *cp; - + struct cgroup_pidlist *l; if (!(file->f_mode & FMODE_READ)) return 0; - - seq = file->private_data; - cp = seq->private; - - release_cgroup_pid_array(cp); + /* + * the seq_file will only be initialized if the file was opened for + * reading; hence we check if it's not null only in that case. + */ + l = ((struct seq_file *)file->private_data)->private; + cgroup_release_pid_array(l); return seq_release(inode, file); } -static struct file_operations cgroup_tasks_operations = { +static const struct file_operations cgroup_pidlist_operations = { .read = seq_read, .llseek = seq_lseek, .write = cgroup_file_write, - .release = cgroup_tasks_release, + .release = cgroup_pidlist_release, }; /* - * Handle an open on 'tasks' file. Prepare an array containing the - * process id's of tasks currently attached to the cgroup being opened. + * The following functions handle opens on a file that displays a pidlist + * (tasks or procs). Prepare an array of the process/thread IDs of whoever's + * in the cgroup. */ - -static int cgroup_tasks_open(struct inode *unused, struct file *file) +/* helper function for the two below it */ +static int cgroup_pidlist_open(struct file *file, enum cgroup_filetype type) { struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); - struct pid_namespace *ns = current->nsproxy->pid_ns; - struct cgroup_pids *cp; - pid_t *pidarray; - int npids; + struct cgroup_pidlist *l; int retval; /* Nothing to do for write-only files */ if (!(file->f_mode & FMODE_READ)) return 0; - /* - * If cgroup gets more users after we read count, we won't have - * enough space - tough. This race is indistinguishable to the - * caller from the case that the additional cgroup users didn't - * show up until sometime later on. - */ - npids = cgroup_task_count(cgrp); - pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL); - if (!pidarray) - return -ENOMEM; - npids = pid_array_load(pidarray, npids, cgrp); - sort(pidarray, npids, sizeof(pid_t), cmppid, NULL); - - /* - * Store the array in the cgroup, freeing the old - * array if necessary - */ - down_write(&cgrp->pids_mutex); - - list_for_each_entry(cp, &cgrp->pids_list, list) { - if (ns == cp->ns) - goto found; - } - - cp = kzalloc(sizeof(*cp), GFP_KERNEL); - if (!cp) { - up_write(&cgrp->pids_mutex); - kfree(pidarray); - return -ENOMEM; - } - cp->cgrp = cgrp; - cp->ns = ns; - get_pid_ns(ns); - list_add(&cp->list, &cgrp->pids_list); -found: - kfree(cp->tasks_pids); - cp->tasks_pids = pidarray; - cp->length = npids; - cp->use_count++; - up_write(&cgrp->pids_mutex); - - file->f_op = &cgroup_tasks_operations; + /* have the array populated */ + retval = pidlist_array_load(cgrp, type, &l); + if (retval) + return retval; + /* configure file information */ + file->f_op = &cgroup_pidlist_operations; - retval = seq_open(file, &cgroup_tasks_seq_operations); + retval = seq_open(file, &cgroup_pidlist_seq_operations); if (retval) { - release_cgroup_pid_array(cp); + cgroup_release_pid_array(l); return retval; } - ((struct seq_file *)file->private_data)->private = cp; + ((struct seq_file *)file->private_data)->private = l; return 0; } +static int cgroup_tasks_open(struct inode *unused, struct file *file) +{ + return cgroup_pidlist_open(file, CGROUP_FILE_TASKS); +} +static int cgroup_procs_open(struct inode *unused, struct file *file) +{ + return cgroup_pidlist_open(file, CGROUP_FILE_PROCS); +} static u64 cgroup_read_notify_on_release(struct cgroup *cgrp, struct cftype *cft) @@ -2449,21 +2794,27 @@ static int cgroup_write_notify_on_release(struct cgroup *cgrp, /* * for the common functions, 'private' gives the type of file */ +/* for hysterical raisins, we can't put this on the older files */ +#define CGROUP_FILE_GENERIC_PREFIX "cgroup." static struct cftype files[] = { { .name = "tasks", .open = cgroup_tasks_open, .write_u64 = cgroup_tasks_write, - .release = cgroup_tasks_release, - .private = FILE_TASKLIST, + .release = cgroup_pidlist_release, .mode = S_IRUGO | S_IWUSR, }, - + { + .name = CGROUP_FILE_GENERIC_PREFIX "procs", + .open = cgroup_procs_open, + /* .write_u64 = cgroup_procs_write, TODO */ + .release = cgroup_pidlist_release, + .mode = S_IRUGO, + }, { .name = "notify_on_release", .read_u64 = cgroup_read_notify_on_release, .write_u64 = cgroup_write_notify_on_release, - .private = FILE_NOTIFY_ON_RELEASE, }, }; @@ -2472,7 +2823,6 @@ static struct cftype cft_release_agent = { .read_seq_string = cgroup_release_agent_show, .write_string = cgroup_release_agent_write, .max_write_len = PATH_MAX, - .private = FILE_RELEASE_AGENT, }; static int cgroup_populate_dir(struct cgroup *cgrp) @@ -2879,6 +3229,7 @@ int __init cgroup_init_early(void) init_task.cgroups = &init_css_set; init_css_set_link.cg = &init_css_set; + init_css_set_link.cgrp = dummytop; list_add(&init_css_set_link.cgrp_link_list, &rootnode.top_cgroup.css_sets); list_add(&init_css_set_link.cg_link_list, @@ -2933,7 +3284,7 @@ int __init cgroup_init(void) /* Add init_css_set to the hash table */ hhead = css_set_hash(init_css_set.subsys); hlist_add_head(&init_css_set.hlist, hhead); - + BUG_ON(!init_root_id(&rootnode)); err = register_filesystem(&cgroup_fs_type); if (err < 0) goto out; @@ -2986,15 +3337,16 @@ static int proc_cgroup_show(struct seq_file *m, void *v) for_each_active_root(root) { struct cgroup_subsys *ss; struct cgroup *cgrp; - int subsys_id; int count = 0; - seq_printf(m, "%lu:", root->subsys_bits); + seq_printf(m, "%d:", root->hierarchy_id); for_each_subsys(root, ss) seq_printf(m, "%s%s", count++ ? "," : "", ss->name); + if (strlen(root->name)) + seq_printf(m, "%sname=%s", count ? "," : "", + root->name); seq_putc(m, ':'); - get_first_subsys(&root->top_cgroup, NULL, &subsys_id); - cgrp = task_cgroup(tsk, subsys_id); + cgrp = task_cgroup_from_root(tsk, root); retval = cgroup_path(cgrp, buf, PAGE_SIZE); if (retval < 0) goto out_unlock; @@ -3033,8 +3385,8 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v) mutex_lock(&cgroup_mutex); for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; - seq_printf(m, "%s\t%lu\t%d\t%d\n", - ss->name, ss->root->subsys_bits, + seq_printf(m, "%s\t%d\t%d\t%d\n", + ss->name, ss->root->hierarchy_id, ss->root->number_of_cgroups, !ss->disabled); } mutex_unlock(&cgroup_mutex); @@ -3320,13 +3672,11 @@ int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task) { int ret; struct cgroup *target; - int subsys_id; if (cgrp == dummytop) return 1; - get_first_subsys(cgrp, NULL, &subsys_id); - target = task_cgroup(task, subsys_id); + target = task_cgroup_from_root(task, cgrp->root); while (cgrp != target && cgrp!= cgrp->top_cgroup) cgrp = cgrp->parent; ret = (cgrp == target); @@ -3693,3 +4043,154 @@ css_get_next(struct cgroup_subsys *ss, int id, return ret; } +#ifdef CONFIG_CGROUP_DEBUG +static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss, + struct cgroup *cont) +{ + struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); + + if (!css) + return ERR_PTR(-ENOMEM); + + return css; +} + +static void debug_destroy(struct cgroup_subsys *ss, struct cgroup *cont) +{ + kfree(cont->subsys[debug_subsys_id]); +} + +static u64 cgroup_refcount_read(struct cgroup *cont, struct cftype *cft) +{ + return atomic_read(&cont->count); +} + +static u64 debug_taskcount_read(struct cgroup *cont, struct cftype *cft) +{ + return cgroup_task_count(cont); +} + +static u64 current_css_set_read(struct cgroup *cont, struct cftype *cft) +{ + return (u64)(unsigned long)current->cgroups; +} + +static u64 current_css_set_refcount_read(struct cgroup *cont, + struct cftype *cft) +{ + u64 count; + + rcu_read_lock(); + count = atomic_read(¤t->cgroups->refcount); + rcu_read_unlock(); + return count; +} + +static int current_css_set_cg_links_read(struct cgroup *cont, + struct cftype *cft, + struct seq_file *seq) +{ + struct cg_cgroup_link *link; + struct css_set *cg; + + read_lock(&css_set_lock); + rcu_read_lock(); + cg = rcu_dereference(current->cgroups); + list_for_each_entry(link, &cg->cg_links, cg_link_list) { + struct cgroup *c = link->cgrp; + const char *name; + + if (c->dentry) + name = c->dentry->d_name.name; + else + name = "?"; + seq_printf(seq, "Root %d group %s\n", + c->root->hierarchy_id, name); + } + rcu_read_unlock(); + read_unlock(&css_set_lock); + return 0; +} + +#define MAX_TASKS_SHOWN_PER_CSS 25 +static int cgroup_css_links_read(struct cgroup *cont, + struct cftype *cft, + struct seq_file *seq) +{ + struct cg_cgroup_link *link; + + read_lock(&css_set_lock); + list_for_each_entry(link, &cont->css_sets, cgrp_link_list) { + struct css_set *cg = link->cg; + struct task_struct *task; + int count = 0; + seq_printf(seq, "css_set %p\n", cg); + list_for_each_entry(task, &cg->tasks, cg_list) { + if (count++ > MAX_TASKS_SHOWN_PER_CSS) { + seq_puts(seq, " ...\n"); + break; + } else { + seq_printf(seq, " task %d\n", + task_pid_vnr(task)); + } + } + } + read_unlock(&css_set_lock); + return 0; +} + +static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft) +{ + return test_bit(CGRP_RELEASABLE, &cgrp->flags); +} + +static struct cftype debug_files[] = { + { + .name = "cgroup_refcount", + .read_u64 = cgroup_refcount_read, + }, + { + .name = "taskcount", + .read_u64 = debug_taskcount_read, + }, + + { + .name = "current_css_set", + .read_u64 = current_css_set_read, + }, + + { + .name = "current_css_set_refcount", + .read_u64 = current_css_set_refcount_read, + }, + + { + .name = "current_css_set_cg_links", + .read_seq_string = current_css_set_cg_links_read, + }, + + { + .name = "cgroup_css_links", + .read_seq_string = cgroup_css_links_read, + }, + + { + .name = "releasable", + .read_u64 = releasable_read, + }, +}; + +static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont) +{ + return cgroup_add_files(cont, ss, debug_files, + ARRAY_SIZE(debug_files)); +} + +struct cgroup_subsys debug_subsys = { + .name = "debug", + .create = debug_create, + .destroy = debug_destroy, + .populate = debug_populate, + .subsys_id = debug_subsys_id, +}; +#endif /* CONFIG_CGROUP_DEBUG */ diff --git a/kernel/cgroup_debug.c b/kernel/cgroup_debug.c deleted file mode 100644 index 0c92d79..0000000 --- a/kernel/cgroup_debug.c +++ /dev/null @@ -1,105 +0,0 @@ -/* - * kernel/cgroup_debug.c - Example cgroup subsystem that - * exposes debug info - * - * Copyright (C) Google Inc, 2007 - * - * Developed by Paul Menage (menage@google.com) - * - */ - -#include <linux/cgroup.h> -#include <linux/fs.h> -#include <linux/slab.h> -#include <linux/rcupdate.h> - -#include <asm/atomic.h> - -static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss, - struct cgroup *cont) -{ - struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); - - if (!css) - return ERR_PTR(-ENOMEM); - - return css; -} - -static void debug_destroy(struct cgroup_subsys *ss, struct cgroup *cont) -{ - kfree(cont->subsys[debug_subsys_id]); -} - -static u64 cgroup_refcount_read(struct cgroup *cont, struct cftype *cft) -{ - return atomic_read(&cont->count); -} - -static u64 taskcount_read(struct cgroup *cont, struct cftype *cft) -{ - u64 count; - - count = cgroup_task_count(cont); - return count; -} - -static u64 current_css_set_read(struct cgroup *cont, struct cftype *cft) -{ - return (u64)(long)current->cgroups; -} - -static u64 current_css_set_refcount_read(struct cgroup *cont, - struct cftype *cft) -{ - u64 count; - - rcu_read_lock(); - count = atomic_read(¤t->cgroups->refcount); - rcu_read_unlock(); - return count; -} - -static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft) -{ - return test_bit(CGRP_RELEASABLE, &cgrp->flags); -} - -static struct cftype files[] = { - { - .name = "cgroup_refcount", - .read_u64 = cgroup_refcount_read, - }, - { - .name = "taskcount", - .read_u64 = taskcount_read, - }, - - { - .name = "current_css_set", - .read_u64 = current_css_set_read, - }, - - { - .name = "current_css_set_refcount", - .read_u64 = current_css_set_refcount_read, - }, - - { - .name = "releasable", - .read_u64 = releasable_read, - }, -}; - -static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont) -{ - return cgroup_add_files(cont, ss, files, ARRAY_SIZE(files)); -} - -struct cgroup_subsys debug_subsys = { - .name = "debug", - .create = debug_create, - .destroy = debug_destroy, - .populate = debug_populate, - .subsys_id = debug_subsys_id, -}; diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index fb249e2..59e9ef6 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -159,7 +159,7 @@ static bool is_task_frozen_enough(struct task_struct *task) */ static int freezer_can_attach(struct cgroup_subsys *ss, struct cgroup *new_cgroup, - struct task_struct *task) + struct task_struct *task, bool threadgroup) { struct freezer *freezer; @@ -177,6 +177,19 @@ static int freezer_can_attach(struct cgroup_subsys *ss, if (freezer->state == CGROUP_FROZEN) return -EBUSY; + if (threadgroup) { + struct task_struct *c; + + rcu_read_lock(); + list_for_each_entry_rcu(c, &task->thread_group, thread_group) { + if (is_task_frozen_enough(c)) { + rcu_read_unlock(); + return -EBUSY; + } + } + rcu_read_unlock(); + } + return 0; } diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 7e75a41..b5cb469 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1324,9 +1324,10 @@ static int fmeter_getrate(struct fmeter *fmp) static cpumask_var_t cpus_attach; /* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */ -static int cpuset_can_attach(struct cgroup_subsys *ss, - struct cgroup *cont, struct task_struct *tsk) +static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont, + struct task_struct *tsk, bool threadgroup) { + int ret; struct cpuset *cs = cgroup_cs(cont); if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) @@ -1343,18 +1344,51 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, if (tsk->flags & PF_THREAD_BOUND) return -EINVAL; - return security_task_setscheduler(tsk, 0, NULL); + ret = security_task_setscheduler(tsk, 0, NULL); + if (ret) + return ret; + if (threadgroup) { + struct task_struct *c; + + rcu_read_lock(); + list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { + ret = security_task_setscheduler(c, 0, NULL); + if (ret) { + rcu_read_unlock(); + return ret; + } + } + rcu_read_unlock(); + } + return 0; +} + +static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to, + struct cpuset *cs) +{ + int err; + /* + * can_attach beforehand should guarantee that this doesn't fail. + * TODO: have a better way to handle failure here + */ + err = set_cpus_allowed_ptr(tsk, cpus_attach); + WARN_ON_ONCE(err); + + task_lock(tsk); + cpuset_change_task_nodemask(tsk, to); + task_unlock(tsk); + cpuset_update_task_spread_flag(cs, tsk); + } -static void cpuset_attach(struct cgroup_subsys *ss, - struct cgroup *cont, struct cgroup *oldcont, - struct task_struct *tsk) +static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont, + struct cgroup *oldcont, struct task_struct *tsk, + bool threadgroup) { nodemask_t from, to; struct mm_struct *mm; struct cpuset *cs = cgroup_cs(cont); struct cpuset *oldcs = cgroup_cs(oldcont); - int err; if (cs == &top_cpuset) { cpumask_copy(cpus_attach, cpu_possible_mask); @@ -1363,15 +1397,19 @@ static void cpuset_attach(struct cgroup_subsys *ss, guarantee_online_cpus(cs, cpus_attach); guarantee_online_mems(cs, &to); } - err = set_cpus_allowed_ptr(tsk, cpus_attach); - if (err) - return; - task_lock(tsk); - cpuset_change_task_nodemask(tsk, &to); - task_unlock(tsk); - cpuset_update_task_spread_flag(cs, tsk); + /* do per-task migration stuff possibly for each in the threadgroup */ + cpuset_attach_task(tsk, &to, cs); + if (threadgroup) { + struct task_struct *c; + rcu_read_lock(); + list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { + cpuset_attach_task(c, &to, cs); + } + rcu_read_unlock(); + } + /* change mm; only needs to be done once even if threadgroup */ from = oldcs->mems_allowed; to = cs->mems_allowed; mm = get_task_mm(tsk); diff --git a/kernel/cred.c b/kernel/cred.c index d7f7a01..dd76cfe 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -782,6 +782,25 @@ EXPORT_SYMBOL(set_create_files_as); #ifdef CONFIG_DEBUG_CREDENTIALS +bool creds_are_invalid(const struct cred *cred) +{ + if (cred->magic != CRED_MAGIC) + return true; + if (atomic_read(&cred->usage) < atomic_read(&cred->subscribers)) + return true; +#ifdef CONFIG_SECURITY_SELINUX + if (selinux_is_enabled()) { + if ((unsigned long) cred->security < PAGE_SIZE) + return true; + if ((*(u32 *)cred->security & 0xffffff00) == + (POISON_FREE << 24 | POISON_FREE << 16 | POISON_FREE << 8)) + return true; + } +#endif + return false; +} +EXPORT_SYMBOL(creds_are_invalid); + /* * dump invalid credentials */ diff --git a/kernel/exit.c b/kernel/exit.c index ae5d866..5859f59 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -47,7 +47,7 @@ #include <linux/tracehook.h> #include <linux/fs_struct.h> #include <linux/init_task.h> -#include <linux/perf_counter.h> +#include <linux/perf_event.h> #include <trace/events/sched.h> #include <asm/uaccess.h> @@ -154,8 +154,8 @@ static void delayed_put_task_struct(struct rcu_head *rhp) { struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); -#ifdef CONFIG_PERF_COUNTERS - WARN_ON_ONCE(tsk->perf_counter_ctxp); +#ifdef CONFIG_PERF_EVENTS + WARN_ON_ONCE(tsk->perf_event_ctxp); #endif trace_sched_process_free(tsk); put_task_struct(tsk); @@ -359,8 +359,10 @@ void __set_special_pids(struct pid *pid) { struct task_struct *curr = current->group_leader; - if (task_session(curr) != pid) + if (task_session(curr) != pid) { change_pid(curr, PIDTYPE_SID, pid); + proc_sid_connector(curr); + } if (task_pgrp(curr) != pid) change_pid(curr, PIDTYPE_PGID, pid); @@ -945,6 +947,8 @@ NORET_TYPE void do_exit(long code) if (group_dead) { hrtimer_cancel(&tsk->signal->real_timer); exit_itimers(tsk->signal); + if (tsk->mm) + setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm); } acct_collect(code, group_dead); if (group_dead) @@ -972,8 +976,6 @@ NORET_TYPE void do_exit(long code) disassociate_ctty(1); module_put(task_thread_info(tsk)->exec_domain->module); - if (tsk->binfmt) - module_put(tsk->binfmt->module); proc_exit_connector(tsk); @@ -981,7 +983,7 @@ NORET_TYPE void do_exit(long code) * Flush inherited counters to the parent - before the parent * gets woken up by child-exit notifications. */ - perf_counter_exit_task(tsk); + perf_event_exit_task(tsk); exit_notify(tsk, group_dead); #ifdef CONFIG_NUMA @@ -1093,28 +1095,28 @@ struct wait_opts { int __user *wo_stat; struct rusage __user *wo_rusage; + wait_queue_t child_wait; int notask_error; }; -static struct pid *task_pid_type(struct task_struct *task, enum pid_type type) +static inline +struct pid *task_pid_type(struct task_struct *task, enum pid_type type) { - struct pid *pid = NULL; - if (type == PIDTYPE_PID) - pid = task->pids[type].pid; - else if (type < PIDTYPE_MAX) - pid = task->group_leader->pids[type].pid; - return pid; + if (type != PIDTYPE_PID) + task = task->group_leader; + return task->pids[type].pid; } -static int eligible_child(struct wait_opts *wo, struct task_struct *p) +static int eligible_pid(struct wait_opts *wo, struct task_struct *p) { - int err; - - if (wo->wo_type < PIDTYPE_MAX) { - if (task_pid_type(p, wo->wo_type) != wo->wo_pid) - return 0; - } + return wo->wo_type == PIDTYPE_MAX || + task_pid_type(p, wo->wo_type) == wo->wo_pid; +} +static int eligible_child(struct wait_opts *wo, struct task_struct *p) +{ + if (!eligible_pid(wo, p)) + return 0; /* Wait for all children (clone and not) if __WALL is set; * otherwise, wait for clone children *only* if __WCLONE is * set; otherwise, wait for non-clone children *only*. (Note: @@ -1124,10 +1126,6 @@ static int eligible_child(struct wait_opts *wo, struct task_struct *p) && !(wo->wo_flags & __WALL)) return 0; - err = security_task_wait(p); - if (err) - return err; - return 1; } @@ -1140,18 +1138,20 @@ static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p, put_task_struct(p); infop = wo->wo_info; - if (!retval) - retval = put_user(SIGCHLD, &infop->si_signo); - if (!retval) - retval = put_user(0, &infop->si_errno); - if (!retval) - retval = put_user((short)why, &infop->si_code); - if (!retval) - retval = put_user(pid, &infop->si_pid); - if (!retval) - retval = put_user(uid, &infop->si_uid); - if (!retval) - retval = put_user(status, &infop->si_status); + if (infop) { + if (!retval) + retval = put_user(SIGCHLD, &infop->si_signo); + if (!retval) + retval = put_user(0, &infop->si_errno); + if (!retval) + retval = put_user((short)why, &infop->si_code); + if (!retval) + retval = put_user(pid, &infop->si_pid); + if (!retval) + retval = put_user(uid, &infop->si_uid); + if (!retval) + retval = put_user(status, &infop->si_status); + } if (!retval) retval = pid; return retval; @@ -1208,6 +1208,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) if (likely(!traced) && likely(!task_detached(p))) { struct signal_struct *psig; struct signal_struct *sig; + unsigned long maxrss; /* * The resource counters for the group leader are in its @@ -1256,6 +1257,9 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) psig->coublock += task_io_get_oublock(p) + sig->oublock + sig->coublock; + maxrss = max(sig->maxrss, sig->cmaxrss); + if (psig->cmaxrss < maxrss) + psig->cmaxrss = maxrss; task_io_accounting_add(&psig->ioac, &p->ioac); task_io_accounting_add(&psig->ioac, &sig->ioac); spin_unlock_irq(&p->real_parent->sighand->siglock); @@ -1477,13 +1481,14 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) * then ->notask_error is 0 if @p is an eligible child, * or another error from security_task_wait(), or still -ECHILD. */ -static int wait_consider_task(struct wait_opts *wo, struct task_struct *parent, - int ptrace, struct task_struct *p) +static int wait_consider_task(struct wait_opts *wo, int ptrace, + struct task_struct *p) { int ret = eligible_child(wo, p); if (!ret) return ret; + ret = security_task_wait(p); if (unlikely(ret < 0)) { /* * If we have not yet seen any eligible child, @@ -1545,7 +1550,7 @@ static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk) * Do not consider detached threads. */ if (!task_detached(p)) { - int ret = wait_consider_task(wo, tsk, 0, p); + int ret = wait_consider_task(wo, 0, p); if (ret) return ret; } @@ -1559,7 +1564,7 @@ static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk) struct task_struct *p; list_for_each_entry(p, &tsk->ptraced, ptrace_entry) { - int ret = wait_consider_task(wo, tsk, 1, p); + int ret = wait_consider_task(wo, 1, p); if (ret) return ret; } @@ -1567,15 +1572,38 @@ static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk) return 0; } +static int child_wait_callback(wait_queue_t *wait, unsigned mode, + int sync, void *key) +{ + struct wait_opts *wo = container_of(wait, struct wait_opts, + child_wait); + struct task_struct *p = key; + + if (!eligible_pid(wo, p)) + return 0; + + if ((wo->wo_flags & __WNOTHREAD) && wait->private != p->parent) + return 0; + + return default_wake_function(wait, mode, sync, key); +} + +void __wake_up_parent(struct task_struct *p, struct task_struct *parent) +{ + __wake_up_sync_key(&parent->signal->wait_chldexit, + TASK_INTERRUPTIBLE, 1, p); +} + static long do_wait(struct wait_opts *wo) { - DECLARE_WAITQUEUE(wait, current); struct task_struct *tsk; int retval; trace_sched_process_wait(wo->wo_pid); - add_wait_queue(¤t->signal->wait_chldexit,&wait); + init_waitqueue_func_entry(&wo->child_wait, child_wait_callback); + wo->child_wait.private = current; + add_wait_queue(¤t->signal->wait_chldexit, &wo->child_wait); repeat: /* * If there is nothing that can match our critiera just get out. @@ -1616,32 +1644,7 @@ notask: } end: __set_current_state(TASK_RUNNING); - remove_wait_queue(¤t->signal->wait_chldexit,&wait); - if (wo->wo_info) { - struct siginfo __user *infop = wo->wo_info; - - if (retval > 0) - retval = 0; - else { - /* - * For a WNOHANG return, clear out all the fields - * we would set so the user can easily tell the - * difference. - */ - if (!retval) - retval = put_user(0, &infop->si_signo); - if (!retval) - retval = put_user(0, &infop->si_errno); - if (!retval) - retval = put_user(0, &infop->si_code); - if (!retval) - retval = put_user(0, &infop->si_pid); - if (!retval) - retval = put_user(0, &infop->si_uid); - if (!retval) - retval = put_user(0, &infop->si_status); - } - } + remove_wait_queue(¤t->signal->wait_chldexit, &wo->child_wait); return retval; } @@ -1686,6 +1689,29 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, wo.wo_stat = NULL; wo.wo_rusage = ru; ret = do_wait(&wo); + + if (ret > 0) { + ret = 0; + } else if (infop) { + /* + * For a WNOHANG return, clear out all the fields + * we would set so the user can easily tell the + * difference. + */ + if (!ret) + ret = put_user(0, &infop->si_signo); + if (!ret) + ret = put_user(0, &infop->si_errno); + if (!ret) + ret = put_user(0, &infop->si_code); + if (!ret) + ret = put_user(0, &infop->si_pid); + if (!ret) + ret = put_user(0, &infop->si_uid); + if (!ret) + ret = put_user(0, &infop->si_status); + } + put_pid(pid); /* avoid REGPARM breakage on x86: */ diff --git a/kernel/fork.c b/kernel/fork.c index bfee931..266c6af 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -49,6 +49,7 @@ #include <linux/ftrace.h> #include <linux/profile.h> #include <linux/rmap.h> +#include <linux/ksm.h> #include <linux/acct.h> #include <linux/tsacct_kern.h> #include <linux/cn_proc.h> @@ -61,7 +62,8 @@ #include <linux/blkdev.h> #include <linux/fs_struct.h> #include <linux/magic.h> -#include <linux/perf_counter.h> +#include <linux/perf_event.h> +#include <linux/posix-timers.h> #include <asm/pgtable.h> #include <asm/pgalloc.h> @@ -136,9 +138,17 @@ struct kmem_cache *vm_area_cachep; /* SLAB cache for mm_struct structures (tsk->mm) */ static struct kmem_cache *mm_cachep; +static void account_kernel_stack(struct thread_info *ti, int account) +{ + struct zone *zone = page_zone(virt_to_page(ti)); + + mod_zone_page_state(zone, NR_KERNEL_STACK, account); +} + void free_task(struct task_struct *tsk) { prop_local_destroy_single(&tsk->dirties); + account_kernel_stack(tsk->stack, -1); free_thread_info(tsk->stack); rt_mutex_debug_task_free(tsk); ftrace_graph_exit_task(tsk); @@ -253,6 +263,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) tsk->btrace_seq = 0; #endif tsk->splice_pipe = NULL; + + account_kernel_stack(ti, 1); + return tsk; out: @@ -288,6 +301,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) rb_link = &mm->mm_rb.rb_node; rb_parent = NULL; pprev = &mm->mmap; + retval = ksm_fork(mm, oldmm); + if (retval) + goto out; for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { struct file *file; @@ -418,22 +434,30 @@ __setup("coredump_filter=", coredump_filter_setup); #include <linux/init_task.h> +static void mm_init_aio(struct mm_struct *mm) +{ +#ifdef CONFIG_AIO + spin_lock_init(&mm->ioctx_lock); + INIT_HLIST_HEAD(&mm->ioctx_list); +#endif +} + static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) { atomic_set(&mm->mm_users, 1); atomic_set(&mm->mm_count, 1); init_rwsem(&mm->mmap_sem); INIT_LIST_HEAD(&mm->mmlist); - mm->flags = (current->mm) ? current->mm->flags : default_dump_filter; + mm->flags = (current->mm) ? + (current->mm->flags & MMF_INIT_MASK) : default_dump_filter; mm->core_state = NULL; mm->nr_ptes = 0; set_mm_counter(mm, file_rss, 0); set_mm_counter(mm, anon_rss, 0); spin_lock_init(&mm->page_table_lock); - spin_lock_init(&mm->ioctx_lock); - INIT_HLIST_HEAD(&mm->ioctx_list); mm->free_area_cache = TASK_UNMAPPED_BASE; mm->cached_hole_size = ~0UL; + mm_init_aio(mm); mm_init_owner(mm, p); if (likely(!mm_alloc_pgd(mm))) { @@ -485,6 +509,7 @@ void mmput(struct mm_struct *mm) if (atomic_dec_and_test(&mm->mm_users)) { exit_aio(mm); + ksm_exit(mm); exit_mmap(mm); set_mm_exe_file(mm, NULL); if (!list_empty(&mm->mmlist)) { @@ -493,6 +518,8 @@ void mmput(struct mm_struct *mm) spin_unlock(&mmlist_lock); } put_swap_token(mm); + if (mm->binfmt) + module_put(mm->binfmt->module); mmdrop(mm); } } @@ -618,9 +645,14 @@ struct mm_struct *dup_mm(struct task_struct *tsk) mm->hiwater_rss = get_mm_rss(mm); mm->hiwater_vm = mm->total_vm; + if (mm->binfmt && !try_module_get(mm->binfmt->module)) + goto free_pt; + return mm; free_pt: + /* don't put binfmt in mmput, we haven't got module yet */ + mm->binfmt = NULL; mmput(mm); fail_nomem: @@ -788,10 +820,10 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig) thread_group_cputime_init(sig); /* Expiration times and increments. */ - sig->it_virt_expires = cputime_zero; - sig->it_virt_incr = cputime_zero; - sig->it_prof_expires = cputime_zero; - sig->it_prof_incr = cputime_zero; + sig->it[CPUCLOCK_PROF].expires = cputime_zero; + sig->it[CPUCLOCK_PROF].incr = cputime_zero; + sig->it[CPUCLOCK_VIRT].expires = cputime_zero; + sig->it[CPUCLOCK_VIRT].incr = cputime_zero; /* Cached expiration times. */ sig->cputime_expires.prof_exp = cputime_zero; @@ -849,6 +881,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; + sig->maxrss = sig->cmaxrss = 0; task_io_accounting_init(&sig->ioac); sig->sum_sched_runtime = 0; taskstats_tgid_init(sig); @@ -863,6 +896,8 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) tty_audit_fork(sig); + sig->oom_adj = current->signal->oom_adj; + return 0; } @@ -958,6 +993,16 @@ static struct task_struct *copy_process(unsigned long clone_flags, if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM)) return ERR_PTR(-EINVAL); + /* + * Siblings of global init remain as zombies on exit since they are + * not reaped by their parent (swapper). To solve this and to avoid + * multi-rooted process trees, prevent global and container-inits + * from creating siblings. + */ + if ((clone_flags & CLONE_PARENT) && + current->signal->flags & SIGNAL_UNKILLABLE) + return ERR_PTR(-EINVAL); + retval = security_task_create(clone_flags); if (retval) goto fork_out; @@ -999,9 +1044,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (!try_module_get(task_thread_info(p)->exec_domain->module)) goto bad_fork_cleanup_count; - if (p->binfmt && !try_module_get(p->binfmt->module)) - goto bad_fork_cleanup_put_domain; - p->did_exec = 0; delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ copy_flags(clone_flags, p); @@ -1075,10 +1117,12 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->bts = NULL; + p->stack_start = stack_start; + /* Perform scheduler related setup. Assign this task to a CPU. */ sched_fork(p, clone_flags); - retval = perf_counter_init_task(p); + retval = perf_event_init_task(p); if (retval) goto bad_fork_cleanup_policy; @@ -1253,7 +1297,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, write_unlock_irq(&tasklist_lock); proc_fork_connector(p); cgroup_post_fork(p); - perf_counter_fork(p); + perf_event_fork(p); return p; bad_fork_free_pid: @@ -1280,16 +1324,13 @@ bad_fork_cleanup_semundo: bad_fork_cleanup_audit: audit_free(p); bad_fork_cleanup_policy: - perf_counter_free_task(p); + perf_event_free_task(p); #ifdef CONFIG_NUMA mpol_put(p->mempolicy); bad_fork_cleanup_cgroup: #endif cgroup_exit(p, cgroup_callbacks_done); delayacct_tsk_free(p); - if (p->binfmt) - module_put(p->binfmt->module); -bad_fork_cleanup_put_domain: module_put(task_thread_info(p)->exec_domain->module); bad_fork_cleanup_count: atomic_dec(&p->cred->user->processes); diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index c03f221..e5d98ce 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -48,6 +48,8 @@ #include <asm/uaccess.h> +#include <trace/events/timer.h> + /* * The timer bases: * @@ -442,6 +444,26 @@ static inline void debug_hrtimer_activate(struct hrtimer *timer) { } static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { } #endif +static inline void +debug_init(struct hrtimer *timer, clockid_t clockid, + enum hrtimer_mode mode) +{ + debug_hrtimer_init(timer); + trace_hrtimer_init(timer, clockid, mode); +} + +static inline void debug_activate(struct hrtimer *timer) +{ + debug_hrtimer_activate(timer); + trace_hrtimer_start(timer); +} + +static inline void debug_deactivate(struct hrtimer *timer) +{ + debug_hrtimer_deactivate(timer); + trace_hrtimer_cancel(timer); +} + /* High resolution timer related functions */ #ifdef CONFIG_HIGH_RES_TIMERS @@ -798,7 +820,7 @@ static int enqueue_hrtimer(struct hrtimer *timer, struct hrtimer *entry; int leftmost = 1; - debug_hrtimer_activate(timer); + debug_activate(timer); /* * Find the right place in the rbtree: @@ -884,7 +906,7 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base) * reprogramming happens in the interrupt handler. This is a * rare case and less expensive than a smp call. */ - debug_hrtimer_deactivate(timer); + debug_deactivate(timer); timer_stats_hrtimer_clear_start_info(timer); reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases); __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, @@ -1117,7 +1139,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, enum hrtimer_mode mode) { - debug_hrtimer_init(timer); + debug_init(timer, clock_id, mode); __hrtimer_init(timer, clock_id, mode); } EXPORT_SYMBOL_GPL(hrtimer_init); @@ -1141,7 +1163,7 @@ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp) } EXPORT_SYMBOL_GPL(hrtimer_get_res); -static void __run_hrtimer(struct hrtimer *timer) +static void __run_hrtimer(struct hrtimer *timer, ktime_t *now) { struct hrtimer_clock_base *base = timer->base; struct hrtimer_cpu_base *cpu_base = base->cpu_base; @@ -1150,7 +1172,7 @@ static void __run_hrtimer(struct hrtimer *timer) WARN_ON(!irqs_disabled()); - debug_hrtimer_deactivate(timer); + debug_deactivate(timer); __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0); timer_stats_account_hrtimer(timer); fn = timer->function; @@ -1161,7 +1183,9 @@ static void __run_hrtimer(struct hrtimer *timer) * the timer base. */ spin_unlock(&cpu_base->lock); + trace_hrtimer_expire_entry(timer, now); restart = fn(timer); + trace_hrtimer_expire_exit(timer); spin_lock(&cpu_base->lock); /* @@ -1272,7 +1296,7 @@ void hrtimer_interrupt(struct clock_event_device *dev) break; } - __run_hrtimer(timer); + __run_hrtimer(timer, &basenow); } base++; } @@ -1394,7 +1418,7 @@ void hrtimer_run_queues(void) hrtimer_get_expires_tv64(timer)) break; - __run_hrtimer(timer); + __run_hrtimer(timer, &base->softirq_time); } spin_unlock(&cpu_base->lock); } @@ -1571,7 +1595,7 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, while ((node = rb_first(&old_base->active))) { timer = rb_entry(node, struct hrtimer, node); BUG_ON(hrtimer_callback_running(timer)); - debug_hrtimer_deactivate(timer); + debug_deactivate(timer); /* * Mark it as STATE_MIGRATE not INACTIVE otherwise the diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 022a492..d4e8417 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -171,12 +171,12 @@ static unsigned long timeout_jiffies(unsigned long timeout) * Process updating of timeout sysctl */ int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, + void __user *buffer, size_t *lenp, loff_t *ppos) { int ret; - ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos); + ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); if (ret || !write) goto out; diff --git a/kernel/itimer.c b/kernel/itimer.c index 58762f7..b03451e 100644 --- a/kernel/itimer.c +++ b/kernel/itimer.c @@ -12,6 +12,7 @@ #include <linux/time.h> #include <linux/posix-timers.h> #include <linux/hrtimer.h> +#include <trace/events/timer.h> #include <asm/uaccess.h> @@ -41,10 +42,43 @@ static struct timeval itimer_get_remtime(struct hrtimer *timer) return ktime_to_timeval(rem); } +static void get_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, + struct itimerval *const value) +{ + cputime_t cval, cinterval; + struct cpu_itimer *it = &tsk->signal->it[clock_id]; + + spin_lock_irq(&tsk->sighand->siglock); + + cval = it->expires; + cinterval = it->incr; + if (!cputime_eq(cval, cputime_zero)) { + struct task_cputime cputime; + cputime_t t; + + thread_group_cputimer(tsk, &cputime); + if (clock_id == CPUCLOCK_PROF) + t = cputime_add(cputime.utime, cputime.stime); + else + /* CPUCLOCK_VIRT */ + t = cputime.utime; + + if (cputime_le(cval, t)) + /* about to fire */ + cval = cputime_one_jiffy; + else + cval = cputime_sub(cval, t); + } + + spin_unlock_irq(&tsk->sighand->siglock); + + cputime_to_timeval(cval, &value->it_value); + cputime_to_timeval(cinterval, &value->it_interval); +} + int do_getitimer(int which, struct itimerval *value) { struct task_struct *tsk = current; - cputime_t cinterval, cval; switch (which) { case ITIMER_REAL: @@ -55,44 +89,10 @@ int do_getitimer(int which, struct itimerval *value) spin_unlock_irq(&tsk->sighand->siglock); break; case ITIMER_VIRTUAL: - spin_lock_irq(&tsk->sighand->siglock); - cval = tsk->signal->it_virt_expires; - cinterval = tsk->signal->it_virt_incr; - if (!cputime_eq(cval, cputime_zero)) { - struct task_cputime cputime; - cputime_t utime; - - thread_group_cputimer(tsk, &cputime); - utime = cputime.utime; - if (cputime_le(cval, utime)) { /* about to fire */ - cval = jiffies_to_cputime(1); - } else { - cval = cputime_sub(cval, utime); - } - } - spin_unlock_irq(&tsk->sighand->siglock); - cputime_to_timeval(cval, &value->it_value); - cputime_to_timeval(cinterval, &value->it_interval); + get_cpu_itimer(tsk, CPUCLOCK_VIRT, value); break; case ITIMER_PROF: - spin_lock_irq(&tsk->sighand->siglock); - cval = tsk->signal->it_prof_expires; - cinterval = tsk->signal->it_prof_incr; - if (!cputime_eq(cval, cputime_zero)) { - struct task_cputime times; - cputime_t ptime; - - thread_group_cputimer(tsk, ×); - ptime = cputime_add(times.utime, times.stime); - if (cputime_le(cval, ptime)) { /* about to fire */ - cval = jiffies_to_cputime(1); - } else { - cval = cputime_sub(cval, ptime); - } - } - spin_unlock_irq(&tsk->sighand->siglock); - cputime_to_timeval(cval, &value->it_value); - cputime_to_timeval(cinterval, &value->it_interval); + get_cpu_itimer(tsk, CPUCLOCK_PROF, value); break; default: return(-EINVAL); @@ -123,11 +123,62 @@ enum hrtimer_restart it_real_fn(struct hrtimer *timer) struct signal_struct *sig = container_of(timer, struct signal_struct, real_timer); + trace_itimer_expire(ITIMER_REAL, sig->leader_pid, 0); kill_pid_info(SIGALRM, SEND_SIG_PRIV, sig->leader_pid); return HRTIMER_NORESTART; } +static inline u32 cputime_sub_ns(cputime_t ct, s64 real_ns) +{ + struct timespec ts; + s64 cpu_ns; + + cputime_to_timespec(ct, &ts); + cpu_ns = timespec_to_ns(&ts); + + return (cpu_ns <= real_ns) ? 0 : cpu_ns - real_ns; +} + +static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, + const struct itimerval *const value, + struct itimerval *const ovalue) +{ + cputime_t cval, nval, cinterval, ninterval; + s64 ns_ninterval, ns_nval; + struct cpu_itimer *it = &tsk->signal->it[clock_id]; + + nval = timeval_to_cputime(&value->it_value); + ns_nval = timeval_to_ns(&value->it_value); + ninterval = timeval_to_cputime(&value->it_interval); + ns_ninterval = timeval_to_ns(&value->it_interval); + + it->incr_error = cputime_sub_ns(ninterval, ns_ninterval); + it->error = cputime_sub_ns(nval, ns_nval); + + spin_lock_irq(&tsk->sighand->siglock); + + cval = it->expires; + cinterval = it->incr; + if (!cputime_eq(cval, cputime_zero) || + !cputime_eq(nval, cputime_zero)) { + if (cputime_gt(nval, cputime_zero)) + nval = cputime_add(nval, cputime_one_jiffy); + set_process_cpu_timer(tsk, clock_id, &nval, &cval); + } + it->expires = nval; + it->incr = ninterval; + trace_itimer_state(clock_id == CPUCLOCK_VIRT ? + ITIMER_VIRTUAL : ITIMER_PROF, value, nval); + + spin_unlock_irq(&tsk->sighand->siglock); + + if (ovalue) { + cputime_to_timeval(cval, &ovalue->it_value); + cputime_to_timeval(cinterval, &ovalue->it_interval); + } +} + /* * Returns true if the timeval is in canonical form */ @@ -139,7 +190,6 @@ int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue) struct task_struct *tsk = current; struct hrtimer *timer; ktime_t expires; - cputime_t cval, cinterval, nval, ninterval; /* * Validate the timevals in value. @@ -171,51 +221,14 @@ again: } else tsk->signal->it_real_incr.tv64 = 0; + trace_itimer_state(ITIMER_REAL, value, 0); spin_unlock_irq(&tsk->sighand->siglock); break; case ITIMER_VIRTUAL: - nval = timeval_to_cputime(&value->it_value); - ninterval = timeval_to_cputime(&value->it_interval); - spin_lock_irq(&tsk->sighand->siglock); - cval = tsk->signal->it_virt_expires; - cinterval = tsk->signal->it_virt_incr; - if (!cputime_eq(cval, cputime_zero) || - !cputime_eq(nval, cputime_zero)) { - if (cputime_gt(nval, cputime_zero)) - nval = cputime_add(nval, - jiffies_to_cputime(1)); - set_process_cpu_timer(tsk, CPUCLOCK_VIRT, - &nval, &cval); - } - tsk->signal->it_virt_expires = nval; - tsk->signal->it_virt_incr = ninterval; - spin_unlock_irq(&tsk->sighand->siglock); - if (ovalue) { - cputime_to_timeval(cval, &ovalue->it_value); - cputime_to_timeval(cinterval, &ovalue->it_interval); - } + set_cpu_itimer(tsk, CPUCLOCK_VIRT, value, ovalue); break; case ITIMER_PROF: - nval = timeval_to_cputime(&value->it_value); - ninterval = timeval_to_cputime(&value->it_interval); - spin_lock_irq(&tsk->sighand->siglock); - cval = tsk->signal->it_prof_expires; - cinterval = tsk->signal->it_prof_incr; - if (!cputime_eq(cval, cputime_zero) || - !cputime_eq(nval, cputime_zero)) { - if (cputime_gt(nval, cputime_zero)) - nval = cputime_add(nval, - jiffies_to_cputime(1)); - set_process_cpu_timer(tsk, CPUCLOCK_PROF, - &nval, &cval); - } - tsk->signal->it_prof_expires = nval; - tsk->signal->it_prof_incr = ninterval; - spin_unlock_irq(&tsk->sighand->siglock); - if (ovalue) { - cputime_to_timeval(cval, &ovalue->it_value); - cputime_to_timeval(cinterval, &ovalue->it_interval); - } + set_cpu_itimer(tsk, CPUCLOCK_PROF, value, ovalue); break; default: return -EINVAL; diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 3a29dbe..8b6b8b6 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -59,7 +59,8 @@ static inline int is_kernel_inittext(unsigned long addr) static inline int is_kernel_text(unsigned long addr) { - if (addr >= (unsigned long)_stext && addr <= (unsigned long)_etext) + if ((addr >= (unsigned long)_stext && addr <= (unsigned long)_etext) || + arch_is_kernel_text(addr)) return 1; return in_gate_area_no_task(addr); } diff --git a/kernel/kprobes.c b/kernel/kprobes.c index ef177d6..cfadc12 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1321,7 +1321,7 @@ static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v) return 0; } -static struct seq_operations kprobes_seq_ops = { +static const struct seq_operations kprobes_seq_ops = { .start = kprobe_seq_start, .next = kprobe_seq_next, .stop = kprobe_seq_stop, diff --git a/kernel/lockdep.c b/kernel/lockdep.c index f74d2d7..3815ac1d 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -578,6 +578,9 @@ static int static_obj(void *obj) if ((addr >= start) && (addr < end)) return 1; + if (arch_is_kernel_data(addr)) + return 1; + #ifdef CONFIG_SMP /* * percpu var? diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index d4b3dbc..d4aba4f 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c @@ -594,7 +594,7 @@ static int ls_show(struct seq_file *m, void *v) return 0; } -static struct seq_operations lockstat_ops = { +static const struct seq_operations lockstat_ops = { .start = ls_start, .next = ls_next, .stop = ls_stop, diff --git a/kernel/marker.c b/kernel/marker.c deleted file mode 100644 index ea54f26..0000000 --- a/kernel/marker.c +++ /dev/null @@ -1,930 +0,0 @@ -/* - * Copyright (C) 2007 Mathieu Desnoyers - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - */ -#include <linux/module.h> -#include <linux/mutex.h> -#include <linux/types.h> -#include <linux/jhash.h> -#include <linux/list.h> -#include <linux/rcupdate.h> -#include <linux/marker.h> -#include <linux/err.h> -#include <linux/slab.h> - -extern struct marker __start___markers[]; -extern struct marker __stop___markers[]; - -/* Set to 1 to enable marker debug output */ -static const int marker_debug; - -/* - * markers_mutex nests inside module_mutex. Markers mutex protects the builtin - * and module markers and the hash table. - */ -static DEFINE_MUTEX(markers_mutex); - -/* - * Marker hash table, containing the active markers. - * Protected by module_mutex. - */ -#define MARKER_HASH_BITS 6 -#define MARKER_TABLE_SIZE (1 << MARKER_HASH_BITS) -static struct hlist_head marker_table[MARKER_TABLE_SIZE]; - -/* - * Note about RCU : - * It is used to make sure every handler has finished using its private data - * between two consecutive operation (add or remove) on a given marker. It is - * also used to delay the free of multiple probes array until a quiescent state - * is reached. - * marker entries modifications are protected by the markers_mutex. - */ -struct marker_entry { - struct hlist_node hlist; - char *format; - /* Probe wrapper */ - void (*call)(const struct marker *mdata, void *call_private, ...); - struct marker_probe_closure single; - struct marker_probe_closure *multi; - int refcount; /* Number of times armed. 0 if disarmed. */ - struct rcu_head rcu; - void *oldptr; - int rcu_pending; - unsigned char ptype:1; - unsigned char format_allocated:1; - char name[0]; /* Contains name'\0'format'\0' */ -}; - -/** - * __mark_empty_function - Empty probe callback - * @probe_private: probe private data - * @call_private: call site private data - * @fmt: format string - * @...: variable argument list - * - * Empty callback provided as a probe to the markers. By providing this to a - * disabled marker, we make sure the execution flow is always valid even - * though the function pointer change and the marker enabling are two distinct - * operations that modifies the execution flow of preemptible code. - */ -notrace void __mark_empty_function(void *probe_private, void *call_private, - const char *fmt, va_list *args) -{ -} -EXPORT_SYMBOL_GPL(__mark_empty_function); - -/* - * marker_probe_cb Callback that prepares the variable argument list for probes. - * @mdata: pointer of type struct marker - * @call_private: caller site private data - * @...: Variable argument list. - * - * Since we do not use "typical" pointer based RCU in the 1 argument case, we - * need to put a full smp_rmb() in this branch. This is why we do not use - * rcu_dereference() for the pointer read. - */ -notrace void marker_probe_cb(const struct marker *mdata, - void *call_private, ...) -{ - va_list args; - char ptype; - - /* - * rcu_read_lock_sched does two things : disabling preemption to make - * sure the teardown of the callbacks can be done correctly when they - * are in modules and they insure RCU read coherency. - */ - rcu_read_lock_sched_notrace(); - ptype = mdata->ptype; - if (likely(!ptype)) { - marker_probe_func *func; - /* Must read the ptype before ptr. They are not data dependant, - * so we put an explicit smp_rmb() here. */ - smp_rmb(); - func = mdata->single.func; - /* Must read the ptr before private data. They are not data - * dependant, so we put an explicit smp_rmb() here. */ - smp_rmb(); - va_start(args, call_private); - func(mdata->single.probe_private, call_private, mdata->format, - &args); - va_end(args); - } else { - struct marker_probe_closure *multi; - int i; - /* - * Read mdata->ptype before mdata->multi. - */ - smp_rmb(); - multi = mdata->multi; - /* - * multi points to an array, therefore accessing the array - * depends on reading multi. However, even in this case, - * we must insure that the pointer is read _before_ the array - * data. Same as rcu_dereference, but we need a full smp_rmb() - * in the fast path, so put the explicit barrier here. - */ - smp_read_barrier_depends(); - for (i = 0; multi[i].func; i++) { - va_start(args, call_private); - multi[i].func(multi[i].probe_private, call_private, - mdata->format, &args); - va_end(args); - } - } - rcu_read_unlock_sched_notrace(); -} -EXPORT_SYMBOL_GPL(marker_probe_cb); - -/* - * marker_probe_cb Callback that does not prepare the variable argument list. - * @mdata: pointer of type struct marker - * @call_private: caller site private data - * @...: Variable argument list. - * - * Should be connected to markers "MARK_NOARGS". - */ -static notrace void marker_probe_cb_noarg(const struct marker *mdata, - void *call_private, ...) -{ - va_list args; /* not initialized */ - char ptype; - - rcu_read_lock_sched_notrace(); - ptype = mdata->ptype; - if (likely(!ptype)) { - marker_probe_func *func; - /* Must read the ptype before ptr. They are not data dependant, - * so we put an explicit smp_rmb() here. */ - smp_rmb(); - func = mdata->single.func; - /* Must read the ptr before private data. They are not data - * dependant, so we put an explicit smp_rmb() here. */ - smp_rmb(); - func(mdata->single.probe_private, call_private, mdata->format, - &args); - } else { - struct marker_probe_closure *multi; - int i; - /* - * Read mdata->ptype before mdata->multi. - */ - smp_rmb(); - multi = mdata->multi; - /* - * multi points to an array, therefore accessing the array - * depends on reading multi. However, even in this case, - * we must insure that the pointer is read _before_ the array - * data. Same as rcu_dereference, but we need a full smp_rmb() - * in the fast path, so put the explicit barrier here. - */ - smp_read_barrier_depends(); - for (i = 0; multi[i].func; i++) - multi[i].func(multi[i].probe_private, call_private, - mdata->format, &args); - } - rcu_read_unlock_sched_notrace(); -} - -static void free_old_closure(struct rcu_head *head) -{ - struct marker_entry *entry = container_of(head, - struct marker_entry, rcu); - kfree(entry->oldptr); - /* Make sure we free the data before setting the pending flag to 0 */ - smp_wmb(); - entry->rcu_pending = 0; -} - -static void debug_print_probes(struct marker_entry *entry) -{ - int i; - - if (!marker_debug) - return; - - if (!entry->ptype) { - printk(KERN_DEBUG "Single probe : %p %p\n", - entry->single.func, - entry->single.probe_private); - } else { - for (i = 0; entry->multi[i].func; i++) - printk(KERN_DEBUG "Multi probe %d : %p %p\n", i, - entry->multi[i].func, - entry->multi[i].probe_private); - } -} - -static struct marker_probe_closure * -marker_entry_add_probe(struct marker_entry *entry, - marker_probe_func *probe, void *probe_private) -{ - int nr_probes = 0; - struct marker_probe_closure *old, *new; - - WARN_ON(!probe); - - debug_print_probes(entry); - old = entry->multi; - if (!entry->ptype) { - if (entry->single.func == probe && - entry->single.probe_private == probe_private) - return ERR_PTR(-EBUSY); - if (entry->single.func == __mark_empty_function) { - /* 0 -> 1 probes */ - entry->single.func = probe; - entry->single.probe_private = probe_private; - entry->refcount = 1; - entry->ptype = 0; - debug_print_probes(entry); - return NULL; - } else { - /* 1 -> 2 probes */ - nr_probes = 1; - old = NULL; - } - } else { - /* (N -> N+1), (N != 0, 1) probes */ - for (nr_probes = 0; old[nr_probes].func; nr_probes++) - if (old[nr_probes].func == probe - && old[nr_probes].probe_private - == probe_private) - return ERR_PTR(-EBUSY); - } - /* + 2 : one for new probe, one for NULL func */ - new = kzalloc((nr_probes + 2) * sizeof(struct marker_probe_closure), - GFP_KERNEL); - if (new == NULL) - return ERR_PTR(-ENOMEM); - if (!old) - new[0] = entry->single; - else - memcpy(new, old, - nr_probes * sizeof(struct marker_probe_closure)); - new[nr_probes].func = probe; - new[nr_probes].probe_private = probe_private; - entry->refcount = nr_probes + 1; - entry->multi = new; - entry->ptype = 1; - debug_print_probes(entry); - return old; -} - -static struct marker_probe_closure * -marker_entry_remove_probe(struct marker_entry *entry, - marker_probe_func *probe, void *probe_private) -{ - int nr_probes = 0, nr_del = 0, i; - struct marker_probe_closure *old, *new; - - old = entry->multi; - - debug_print_probes(entry); - if (!entry->ptype) { - /* 0 -> N is an error */ - WARN_ON(entry->single.func == __mark_empty_function); - /* 1 -> 0 probes */ - WARN_ON(probe && entry->single.func != probe); - WARN_ON(entry->single.probe_private != probe_private); - entry->single.func = __mark_empty_function; - entry->refcount = 0; - entry->ptype = 0; - debug_print_probes(entry); - return NULL; - } else { - /* (N -> M), (N > 1, M >= 0) probes */ - for (nr_probes = 0; old[nr_probes].func; nr_probes++) { - if ((!probe || old[nr_probes].func == probe) - && old[nr_probes].probe_private - == probe_private) - nr_del++; - } - } - - if (nr_probes - nr_del == 0) { - /* N -> 0, (N > 1) */ - entry->single.func = __mark_empty_function; - entry->refcount = 0; - entry->ptype = 0; - } else if (nr_probes - nr_del == 1) { - /* N -> 1, (N > 1) */ - for (i = 0; old[i].func; i++) - if ((probe && old[i].func != probe) || - old[i].probe_private != probe_private) - entry->single = old[i]; - entry->refcount = 1; - entry->ptype = 0; - } else { - int j = 0; - /* N -> M, (N > 1, M > 1) */ - /* + 1 for NULL */ - new = kzalloc((nr_probes - nr_del + 1) - * sizeof(struct marker_probe_closure), GFP_KERNEL); - if (new == NULL) - return ERR_PTR(-ENOMEM); - for (i = 0; old[i].func; i++) - if ((probe && old[i].func != probe) || - old[i].probe_private != probe_private) - new[j++] = old[i]; - entry->refcount = nr_probes - nr_del; - entry->ptype = 1; - entry->multi = new; - } - debug_print_probes(entry); - return old; -} - -/* - * Get marker if the marker is present in the marker hash table. - * Must be called with markers_mutex held. - * Returns NULL if not present. - */ -static struct marker_entry *get_marker(const char *name) -{ - struct hlist_head *head; - struct hlist_node *node; - struct marker_entry *e; - u32 hash = jhash(name, strlen(name), 0); - - head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)]; - hlist_for_each_entry(e, node, head, hlist) { - if (!strcmp(name, e->name)) - return e; - } - return NULL; -} - -/* - * Add the marker to the marker hash table. Must be called with markers_mutex - * held. - */ -static struct marker_entry *add_marker(const char *name, const char *format) -{ - struct hlist_head *head; - struct hlist_node *node; - struct marker_entry *e; - size_t name_len = strlen(name) + 1; - size_t format_len = 0; - u32 hash = jhash(name, name_len-1, 0); - - if (format) - format_len = strlen(format) + 1; - head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)]; - hlist_for_each_entry(e, node, head, hlist) { - if (!strcmp(name, e->name)) { - printk(KERN_NOTICE - "Marker %s busy\n", name); - return ERR_PTR(-EBUSY); /* Already there */ - } - } - /* - * Using kmalloc here to allocate a variable length element. Could - * cause some memory fragmentation if overused. - */ - e = kmalloc(sizeof(struct marker_entry) + name_len + format_len, - GFP_KERNEL); - if (!e) - return ERR_PTR(-ENOMEM); - memcpy(&e->name[0], name, name_len); - if (format) { - e->format = &e->name[name_len]; - memcpy(e->format, format, format_len); - if (strcmp(e->format, MARK_NOARGS) == 0) - e->call = marker_probe_cb_noarg; - else - e->call = marker_probe_cb; - trace_mark(core_marker_format, "name %s format %s", - e->name, e->format); - } else { - e->format = NULL; - e->call = marker_probe_cb; - } - e->single.func = __mark_empty_function; - e->single.probe_private = NULL; - e->multi = NULL; - e->ptype = 0; - e->format_allocated = 0; - e->refcount = 0; - e->rcu_pending = 0; - hlist_add_head(&e->hlist, head); - return e; -} - -/* - * Remove the marker from the marker hash table. Must be called with mutex_lock - * held. - */ -static int remove_marker(const char *name) -{ - struct hlist_head *head; - struct hlist_node *node; - struct marker_entry *e; - int found = 0; - size_t len = strlen(name) + 1; - u32 hash = jhash(name, len-1, 0); - - head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)]; - hlist_for_each_entry(e, node, head, hlist) { - if (!strcmp(name, e->name)) { - found = 1; - break; - } - } - if (!found) - return -ENOENT; - if (e->single.func != __mark_empty_function) - return -EBUSY; - hlist_del(&e->hlist); - if (e->format_allocated) - kfree(e->format); - /* Make sure the call_rcu has been executed */ - if (e->rcu_pending) - rcu_barrier_sched(); - kfree(e); - return 0; -} - -/* - * Set the mark_entry format to the format found in the element. - */ -static int marker_set_format(struct marker_entry *entry, const char *format) -{ - entry->format = kstrdup(format, GFP_KERNEL); - if (!entry->format) - return -ENOMEM; - entry->format_allocated = 1; - - trace_mark(core_marker_format, "name %s format %s", - entry->name, entry->format); - return 0; -} - -/* - * Sets the probe callback corresponding to one marker. - */ -static int set_marker(struct marker_entry *entry, struct marker *elem, - int active) -{ - int ret = 0; - WARN_ON(strcmp(entry->name, elem->name) != 0); - - if (entry->format) { - if (strcmp(entry->format, elem->format) != 0) { - printk(KERN_NOTICE - "Format mismatch for probe %s " - "(%s), marker (%s)\n", - entry->name, - entry->format, - elem->format); - return -EPERM; - } - } else { - ret = marker_set_format(entry, elem->format); - if (ret) - return ret; - } - - /* - * probe_cb setup (statically known) is done here. It is - * asynchronous with the rest of execution, therefore we only - * pass from a "safe" callback (with argument) to an "unsafe" - * callback (does not set arguments). - */ - elem->call = entry->call; - /* - * Sanity check : - * We only update the single probe private data when the ptr is - * set to a _non_ single probe! (0 -> 1 and N -> 1, N != 1) - */ - WARN_ON(elem->single.func != __mark_empty_function - && elem->single.probe_private != entry->single.probe_private - && !elem->ptype); - elem->single.probe_private = entry->single.probe_private; - /* - * Make sure the private data is valid when we update the - * single probe ptr. - */ - smp_wmb(); - elem->single.func = entry->single.func; - /* - * We also make sure that the new probe callbacks array is consistent - * before setting a pointer to it. - */ - rcu_assign_pointer(elem->multi, entry->multi); - /* - * Update the function or multi probe array pointer before setting the - * ptype. - */ - smp_wmb(); - elem->ptype = entry->ptype; - - if (elem->tp_name && (active ^ elem->state)) { - WARN_ON(!elem->tp_cb); - /* - * It is ok to directly call the probe registration because type - * checking has been done in the __trace_mark_tp() macro. - */ - - if (active) { - /* - * try_module_get should always succeed because we hold - * lock_module() to get the tp_cb address. - */ - ret = try_module_get(__module_text_address( - (unsigned long)elem->tp_cb)); - BUG_ON(!ret); - ret = tracepoint_probe_register_noupdate( - elem->tp_name, - elem->tp_cb); - } else { - ret = tracepoint_probe_unregister_noupdate( - elem->tp_name, - elem->tp_cb); - /* - * tracepoint_probe_update_all() must be called - * before the module containing tp_cb is unloaded. - */ - module_put(__module_text_address( - (unsigned long)elem->tp_cb)); - } - } - elem->state = active; - - return ret; -} - -/* - * Disable a marker and its probe callback. - * Note: only waiting an RCU period after setting elem->call to the empty - * function insures that the original callback is not used anymore. This insured - * by rcu_read_lock_sched around the call site. - */ -static void disable_marker(struct marker *elem) -{ - int ret; - - /* leave "call" as is. It is known statically. */ - if (elem->tp_name && elem->state) { - WARN_ON(!elem->tp_cb); - /* - * It is ok to directly call the probe registration because type - * checking has been done in the __trace_mark_tp() macro. - */ - ret = tracepoint_probe_unregister_noupdate(elem->tp_name, - elem->tp_cb); - WARN_ON(ret); - /* - * tracepoint_probe_update_all() must be called - * before the module containing tp_cb is unloaded. - */ - module_put(__module_text_address((unsigned long)elem->tp_cb)); - } - elem->state = 0; - elem->single.func = __mark_empty_function; - /* Update the function before setting the ptype */ - smp_wmb(); - elem->ptype = 0; /* single probe */ - /* - * Leave the private data and id there, because removal is racy and - * should be done only after an RCU period. These are never used until - * the next initialization anyway. - */ -} - -/** - * marker_update_probe_range - Update a probe range - * @begin: beginning of the range - * @end: end of the range - * - * Updates the probe callback corresponding to a range of markers. - */ -void marker_update_probe_range(struct marker *begin, - struct marker *end) -{ - struct marker *iter; - struct marker_entry *mark_entry; - - mutex_lock(&markers_mutex); - for (iter = begin; iter < end; iter++) { - mark_entry = get_marker(iter->name); - if (mark_entry) { - set_marker(mark_entry, iter, !!mark_entry->refcount); - /* - * ignore error, continue - */ - } else { - disable_marker(iter); - } - } - mutex_unlock(&markers_mutex); -} - -/* - * Update probes, removing the faulty probes. - * - * Internal callback only changed before the first probe is connected to it. - * Single probe private data can only be changed on 0 -> 1 and 2 -> 1 - * transitions. All other transitions will leave the old private data valid. - * This makes the non-atomicity of the callback/private data updates valid. - * - * "special case" updates : - * 0 -> 1 callback - * 1 -> 0 callback - * 1 -> 2 callbacks - * 2 -> 1 callbacks - * Other updates all behave the same, just like the 2 -> 3 or 3 -> 2 updates. - * Site effect : marker_set_format may delete the marker entry (creating a - * replacement). - */ -static void marker_update_probes(void) -{ - /* Core kernel markers */ - marker_update_probe_range(__start___markers, __stop___markers); - /* Markers in modules. */ - module_update_markers(); - tracepoint_probe_update_all(); -} - -/** - * marker_probe_register - Connect a probe to a marker - * @name: marker name - * @format: format string - * @probe: probe handler - * @probe_private: probe private data - * - * private data must be a valid allocated memory address, or NULL. - * Returns 0 if ok, error value on error. - * The probe address must at least be aligned on the architecture pointer size. - */ -int marker_probe_register(const char *name, const char *format, - marker_probe_func *probe, void *probe_private) -{ - struct marker_entry *entry; - int ret = 0; - struct marker_probe_closure *old; - - mutex_lock(&markers_mutex); - entry = get_marker(name); - if (!entry) { - entry = add_marker(name, format); - if (IS_ERR(entry)) - ret = PTR_ERR(entry); - } else if (format) { - if (!entry->format) - ret = marker_set_format(entry, format); - else if (strcmp(entry->format, format)) - ret = -EPERM; - } - if (ret) - goto end; - - /* - * If we detect that a call_rcu is pending for this marker, - * make sure it's executed now. - */ - if (entry->rcu_pending) - rcu_barrier_sched(); - old = marker_entry_add_probe(entry, probe, probe_private); - if (IS_ERR(old)) { - ret = PTR_ERR(old); - goto end; - } - mutex_unlock(&markers_mutex); - marker_update_probes(); - mutex_lock(&markers_mutex); - entry = get_marker(name); - if (!entry) - goto end; - if (entry->rcu_pending) - rcu_barrier_sched(); - entry->oldptr = old; - entry->rcu_pending = 1; - /* write rcu_pending before calling the RCU callback */ - smp_wmb(); - call_rcu_sched(&entry->rcu, free_old_closure); -end: - mutex_unlock(&markers_mutex); - return ret; -} -EXPORT_SYMBOL_GPL(marker_probe_register); - -/** - * marker_probe_unregister - Disconnect a probe from a marker - * @name: marker name - * @probe: probe function pointer - * @probe_private: probe private data - * - * Returns the private data given to marker_probe_register, or an ERR_PTR(). - * We do not need to call a synchronize_sched to make sure the probes have - * finished running before doing a module unload, because the module unload - * itself uses stop_machine(), which insures that every preempt disabled section - * have finished. - */ -int marker_probe_unregister(const char *name, - marker_probe_func *probe, void *probe_private) -{ - struct marker_entry *entry; - struct marker_probe_closure *old; - int ret = -ENOENT; - - mutex_lock(&markers_mutex); - entry = get_marker(name); - if (!entry) - goto end; - if (entry->rcu_pending) - rcu_barrier_sched(); - old = marker_entry_remove_probe(entry, probe, probe_private); - mutex_unlock(&markers_mutex); - marker_update_probes(); - mutex_lock(&markers_mutex); - entry = get_marker(name); - if (!entry) - goto end; - if (entry->rcu_pending) - rcu_barrier_sched(); - entry->oldptr = old; - entry->rcu_pending = 1; - /* write rcu_pending before calling the RCU callback */ - smp_wmb(); - call_rcu_sched(&entry->rcu, free_old_closure); - remove_marker(name); /* Ignore busy error message */ - ret = 0; -end: - mutex_unlock(&markers_mutex); - return ret; -} -EXPORT_SYMBOL_GPL(marker_probe_unregister); - -static struct marker_entry * -get_marker_from_private_data(marker_probe_func *probe, void *probe_private) -{ - struct marker_entry *entry; - unsigned int i; - struct hlist_head *head; - struct hlist_node *node; - - for (i = 0; i < MARKER_TABLE_SIZE; i++) { - head = &marker_table[i]; - hlist_for_each_entry(entry, node, head, hlist) { - if (!entry->ptype) { - if (entry->single.func == probe - && entry->single.probe_private - == probe_private) - return entry; - } else { - struct marker_probe_closure *closure; - closure = entry->multi; - for (i = 0; closure[i].func; i++) { - if (closure[i].func == probe && - closure[i].probe_private - == probe_private) - return entry; - } - } - } - } - return NULL; -} - -/** - * marker_probe_unregister_private_data - Disconnect a probe from a marker - * @probe: probe function - * @probe_private: probe private data - * - * Unregister a probe by providing the registered private data. - * Only removes the first marker found in hash table. - * Return 0 on success or error value. - * We do not need to call a synchronize_sched to make sure the probes have - * finished running before doing a module unload, because the module unload - * itself uses stop_machine(), which insures that every preempt disabled section - * have finished. - */ -int marker_probe_unregister_private_data(marker_probe_func *probe, - void *probe_private) -{ - struct marker_entry *entry; - int ret = 0; - struct marker_probe_closure *old; - - mutex_lock(&markers_mutex); - entry = get_marker_from_private_data(probe, probe_private); - if (!entry) { - ret = -ENOENT; - goto end; - } - if (entry->rcu_pending) - rcu_barrier_sched(); - old = marker_entry_remove_probe(entry, NULL, probe_private); - mutex_unlock(&markers_mutex); - marker_update_probes(); - mutex_lock(&markers_mutex); - entry = get_marker_from_private_data(probe, probe_private); - if (!entry) - goto end; - if (entry->rcu_pending) - rcu_barrier_sched(); - entry->oldptr = old; - entry->rcu_pending = 1; - /* write rcu_pending before calling the RCU callback */ - smp_wmb(); - call_rcu_sched(&entry->rcu, free_old_closure); - remove_marker(entry->name); /* Ignore busy error message */ -end: - mutex_unlock(&markers_mutex); - return ret; -} -EXPORT_SYMBOL_GPL(marker_probe_unregister_private_data); - -/** - * marker_get_private_data - Get a marker's probe private data - * @name: marker name - * @probe: probe to match - * @num: get the nth matching probe's private data - * - * Returns the nth private data pointer (starting from 0) matching, or an - * ERR_PTR. - * Returns the private data pointer, or an ERR_PTR. - * The private data pointer should _only_ be dereferenced if the caller is the - * owner of the data, or its content could vanish. This is mostly used to - * confirm that a caller is the owner of a registered probe. - */ -void *marker_get_private_data(const char *name, marker_probe_func *probe, - int num) -{ - struct hlist_head *head; - struct hlist_node *node; - struct marker_entry *e; - size_t name_len = strlen(name) + 1; - u32 hash = jhash(name, name_len-1, 0); - int i; - - head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)]; - hlist_for_each_entry(e, node, head, hlist) { - if (!strcmp(name, e->name)) { - if (!e->ptype) { - if (num == 0 && e->single.func == probe) - return e->single.probe_private; - } else { - struct marker_probe_closure *closure; - int match = 0; - closure = e->multi; - for (i = 0; closure[i].func; i++) { - if (closure[i].func != probe) - continue; - if (match++ == num) - return closure[i].probe_private; - } - } - break; - } - } - return ERR_PTR(-ENOENT); -} -EXPORT_SYMBOL_GPL(marker_get_private_data); - -#ifdef CONFIG_MODULES - -int marker_module_notify(struct notifier_block *self, - unsigned long val, void *data) -{ - struct module *mod = data; - - switch (val) { - case MODULE_STATE_COMING: - marker_update_probe_range(mod->markers, - mod->markers + mod->num_markers); - break; - case MODULE_STATE_GOING: - marker_update_probe_range(mod->markers, - mod->markers + mod->num_markers); - break; - } - return 0; -} - -struct notifier_block marker_module_nb = { - .notifier_call = marker_module_notify, - .priority = 0, -}; - -static int init_markers(void) -{ - return register_module_notifier(&marker_module_nb); -} -__initcall(init_markers); - -#endif /* CONFIG_MODULES */ diff --git a/kernel/module.c b/kernel/module.c index 05ce49c..5a29397 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -47,6 +47,7 @@ #include <linux/rculist.h> #include <asm/uaccess.h> #include <asm/cacheflush.h> +#include <asm/mmu_context.h> #include <linux/license.h> #include <asm/sections.h> #include <linux/tracepoint.h> @@ -1535,6 +1536,10 @@ static void free_module(struct module *mod) /* Finally, free the core (containing the module structure) */ module_free(mod, mod->module_core); + +#ifdef CONFIG_MPU + update_protections(current->mm); +#endif } void *__symbol_get(const char *symbol) @@ -1792,6 +1797,17 @@ static void setup_modinfo(struct module *mod, Elf_Shdr *sechdrs, } } +static void free_modinfo(struct module *mod) +{ + struct module_attribute *attr; + int i; + + for (i = 0; (attr = modinfo_attrs[i]); i++) { + if (attr->free) + attr->free(mod); + } +} + #ifdef CONFIG_KALLSYMS /* lookup symbol in given range of kernel_symbols */ @@ -1857,13 +1873,93 @@ static char elf_type(const Elf_Sym *sym, return '?'; } +static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs, + unsigned int shnum) +{ + const Elf_Shdr *sec; + + if (src->st_shndx == SHN_UNDEF + || src->st_shndx >= shnum + || !src->st_name) + return false; + + sec = sechdrs + src->st_shndx; + if (!(sec->sh_flags & SHF_ALLOC) +#ifndef CONFIG_KALLSYMS_ALL + || !(sec->sh_flags & SHF_EXECINSTR) +#endif + || (sec->sh_entsize & INIT_OFFSET_MASK)) + return false; + + return true; +} + +static unsigned long layout_symtab(struct module *mod, + Elf_Shdr *sechdrs, + unsigned int symindex, + unsigned int strindex, + const Elf_Ehdr *hdr, + const char *secstrings, + unsigned long *pstroffs, + unsigned long *strmap) +{ + unsigned long symoffs; + Elf_Shdr *symsect = sechdrs + symindex; + Elf_Shdr *strsect = sechdrs + strindex; + const Elf_Sym *src; + const char *strtab; + unsigned int i, nsrc, ndst; + + /* Put symbol section at end of init part of module. */ + symsect->sh_flags |= SHF_ALLOC; + symsect->sh_entsize = get_offset(mod, &mod->init_size, symsect, + symindex) | INIT_OFFSET_MASK; + DEBUGP("\t%s\n", secstrings + symsect->sh_name); + + src = (void *)hdr + symsect->sh_offset; + nsrc = symsect->sh_size / sizeof(*src); + strtab = (void *)hdr + strsect->sh_offset; + for (ndst = i = 1; i < nsrc; ++i, ++src) + if (is_core_symbol(src, sechdrs, hdr->e_shnum)) { + unsigned int j = src->st_name; + + while(!__test_and_set_bit(j, strmap) && strtab[j]) + ++j; + ++ndst; + } + + /* Append room for core symbols at end of core part. */ + symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1); + mod->core_size = symoffs + ndst * sizeof(Elf_Sym); + + /* Put string table section at end of init part of module. */ + strsect->sh_flags |= SHF_ALLOC; + strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect, + strindex) | INIT_OFFSET_MASK; + DEBUGP("\t%s\n", secstrings + strsect->sh_name); + + /* Append room for core symbols' strings at end of core part. */ + *pstroffs = mod->core_size; + __set_bit(0, strmap); + mod->core_size += bitmap_weight(strmap, strsect->sh_size); + + return symoffs; +} + static void add_kallsyms(struct module *mod, Elf_Shdr *sechdrs, + unsigned int shnum, unsigned int symindex, unsigned int strindex, - const char *secstrings) + unsigned long symoffs, + unsigned long stroffs, + const char *secstrings, + unsigned long *strmap) { - unsigned int i; + unsigned int i, ndst; + const Elf_Sym *src; + Elf_Sym *dst; + char *s; mod->symtab = (void *)sechdrs[symindex].sh_addr; mod->num_symtab = sechdrs[symindex].sh_size / sizeof(Elf_Sym); @@ -1873,13 +1969,44 @@ static void add_kallsyms(struct module *mod, for (i = 0; i < mod->num_symtab; i++) mod->symtab[i].st_info = elf_type(&mod->symtab[i], sechdrs, secstrings, mod); + + mod->core_symtab = dst = mod->module_core + symoffs; + src = mod->symtab; + *dst = *src; + for (ndst = i = 1; i < mod->num_symtab; ++i, ++src) { + if (!is_core_symbol(src, sechdrs, shnum)) + continue; + dst[ndst] = *src; + dst[ndst].st_name = bitmap_weight(strmap, dst[ndst].st_name); + ++ndst; + } + mod->core_num_syms = ndst; + + mod->core_strtab = s = mod->module_core + stroffs; + for (*s = 0, i = 1; i < sechdrs[strindex].sh_size; ++i) + if (test_bit(i, strmap)) + *++s = mod->strtab[i]; } #else +static inline unsigned long layout_symtab(struct module *mod, + Elf_Shdr *sechdrs, + unsigned int symindex, + unsigned int strindex, + const Elf_Hdr *hdr, + const char *secstrings, + unsigned long *pstroffs, + unsigned long *strmap) +{ +} static inline void add_kallsyms(struct module *mod, Elf_Shdr *sechdrs, + unsigned int shnum, unsigned int symindex, unsigned int strindex, - const char *secstrings) + unsigned long symoffs, + unsigned long stroffs, + const char *secstrings, + const unsigned long *strmap) { } #endif /* CONFIG_KALLSYMS */ @@ -1954,6 +2081,9 @@ static noinline struct module *load_module(void __user *umod, struct module *mod; long err = 0; void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ +#ifdef CONFIG_KALLSYMS + unsigned long symoffs, stroffs, *strmap; +#endif mm_segment_t old_fs; DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n", @@ -2035,11 +2165,6 @@ static noinline struct module *load_module(void __user *umod, /* Don't keep modinfo and version sections. */ sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC; sechdrs[versindex].sh_flags &= ~(unsigned long)SHF_ALLOC; -#ifdef CONFIG_KALLSYMS - /* Keep symbol and string tables for decoding later. */ - sechdrs[symindex].sh_flags |= SHF_ALLOC; - sechdrs[strindex].sh_flags |= SHF_ALLOC; -#endif /* Check module struct version now, before we try to use module. */ if (!check_modstruct_version(sechdrs, versindex, mod)) { @@ -2075,6 +2200,13 @@ static noinline struct module *load_module(void __user *umod, goto free_hdr; } + strmap = kzalloc(BITS_TO_LONGS(sechdrs[strindex].sh_size) + * sizeof(long), GFP_KERNEL); + if (!strmap) { + err = -ENOMEM; + goto free_mod; + } + if (find_module(mod->name)) { err = -EEXIST; goto free_mod; @@ -2104,6 +2236,8 @@ static noinline struct module *load_module(void __user *umod, this is done generically; there doesn't appear to be any special cases for the architectures. */ layout_sections(mod, hdr, sechdrs, secstrings); + symoffs = layout_symtab(mod, sechdrs, symindex, strindex, hdr, + secstrings, &stroffs, strmap); /* Do the allocs. */ ptr = module_alloc_update_bounds(mod->core_size); @@ -2237,10 +2371,6 @@ static noinline struct module *load_module(void __user *umod, sizeof(*mod->ctors), &mod->num_ctors); #endif -#ifdef CONFIG_MARKERS - mod->markers = section_objs(hdr, sechdrs, secstrings, "__markers", - sizeof(*mod->markers), &mod->num_markers); -#endif #ifdef CONFIG_TRACEPOINTS mod->tracepoints = section_objs(hdr, sechdrs, secstrings, "__tracepoints", @@ -2312,7 +2442,10 @@ static noinline struct module *load_module(void __user *umod, percpu_modcopy(mod->percpu, (void *)sechdrs[pcpuindex].sh_addr, sechdrs[pcpuindex].sh_size); - add_kallsyms(mod, sechdrs, symindex, strindex, secstrings); + add_kallsyms(mod, sechdrs, hdr->e_shnum, symindex, strindex, + symoffs, stroffs, secstrings, strmap); + kfree(strmap); + strmap = NULL; if (!mod->taints) { struct _ddebug *debug; @@ -2384,13 +2517,14 @@ static noinline struct module *load_module(void __user *umod, synchronize_sched(); module_arch_cleanup(mod); cleanup: + free_modinfo(mod); kobject_del(&mod->mkobj.kobj); kobject_put(&mod->mkobj.kobj); free_unload: module_unload_free(mod); #if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) - free_init: percpu_modfree(mod->refptr); + free_init: #endif module_free(mod, mod->module_init); free_core: @@ -2401,6 +2535,7 @@ static noinline struct module *load_module(void __user *umod, percpu_modfree(percpu); free_mod: kfree(args); + kfree(strmap); free_hdr: vfree(hdr); return ERR_PTR(err); @@ -2490,6 +2625,11 @@ SYSCALL_DEFINE3(init_module, void __user *, umod, /* Drop initial reference. */ module_put(mod); trim_init_extable(mod); +#ifdef CONFIG_KALLSYMS + mod->num_symtab = mod->core_num_syms; + mod->symtab = mod->core_symtab; + mod->strtab = mod->core_strtab; +#endif module_free(mod, mod->module_init); mod->module_init = NULL; mod->init_size = 0; @@ -2958,20 +3098,6 @@ void module_layout(struct module *mod, EXPORT_SYMBOL(module_layout); #endif -#ifdef CONFIG_MARKERS -void module_update_markers(void) -{ - struct module *mod; - - mutex_lock(&module_mutex); - list_for_each_entry(mod, &modules, list) - if (!mod->taints) - marker_update_probe_range(mod->markers, - mod->markers + mod->num_markers); - mutex_unlock(&module_mutex); -} -#endif - #ifdef CONFIG_TRACEPOINTS void module_update_tracepoints(void) { diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c index 5aa854f..2a5dfec 100644 --- a/kernel/ns_cgroup.c +++ b/kernel/ns_cgroup.c @@ -42,8 +42,8 @@ int ns_cgroup_clone(struct task_struct *task, struct pid *pid) * (hence either you are in the same cgroup as task, or in an * ancestor cgroup thereof) */ -static int ns_can_attach(struct cgroup_subsys *ss, - struct cgroup *new_cgroup, struct task_struct *task) +static int ns_can_attach(struct cgroup_subsys *ss, struct cgroup *new_cgroup, + struct task_struct *task, bool threadgroup) { if (current != task) { if (!capable(CAP_SYS_ADMIN)) @@ -56,6 +56,18 @@ static int ns_can_attach(struct cgroup_subsys *ss, if (!cgroup_is_descendant(new_cgroup, task)) return -EPERM; + if (threadgroup) { + struct task_struct *c; + rcu_read_lock(); + list_for_each_entry_rcu(c, &task->thread_group, thread_group) { + if (!cgroup_is_descendant(new_cgroup, c)) { + rcu_read_unlock(); + return -EPERM; + } + } + rcu_read_unlock(); + } + return 0; } diff --git a/kernel/panic.c b/kernel/panic.c index 512ab73..bcdef26 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -177,7 +177,7 @@ static const struct tnt tnts[] = { * 'W' - Taint on warning. * 'C' - modules from drivers/staging are loaded. * - * The string is overwritten by the next call to print_taint(). + * The string is overwritten by the next call to print_tainted(). */ const char *print_tainted(void) { diff --git a/kernel/params.c b/kernel/params.c index 7f6912c..9da58ea 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -23,6 +23,7 @@ #include <linux/device.h> #include <linux/err.h> #include <linux/slab.h> +#include <linux/ctype.h> #if 0 #define DEBUGP printk @@ -87,7 +88,7 @@ static char *next_arg(char *args, char **param, char **val) } for (i = 0; args[i]; i++) { - if (args[i] == ' ' && !in_quote) + if (isspace(args[i]) && !in_quote) break; if (equals == 0) { if (args[i] == '=') @@ -121,7 +122,7 @@ static char *next_arg(char *args, char **param, char **val) next = args + i; /* Chew up trailing spaces. */ - while (*next == ' ') + while (isspace(*next)) next++; return next; } @@ -138,7 +139,7 @@ int parse_args(const char *name, DEBUGP("Parsing ARGS: %s\n", args); /* Chew leading spaces */ - while (*args == ' ') + while (isspace(*args)) args++; while (*args) { diff --git a/kernel/perf_counter.c b/kernel/perf_event.c index cc768ab..76ac4db 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_event.c @@ -1,12 +1,12 @@ /* - * Performance counter core code + * Performance events core code: * * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> * - * For licensing details see kernel-base/COPYING + * For licensing details see kernel-base/COPYING */ #include <linux/fs.h> @@ -26,66 +26,66 @@ #include <linux/syscalls.h> #include <linux/anon_inodes.h> #include <linux/kernel_stat.h> -#include <linux/perf_counter.h> +#include <linux/perf_event.h> #include <asm/irq_regs.h> /* - * Each CPU has a list of per CPU counters: + * Each CPU has a list of per CPU events: */ DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context); -int perf_max_counters __read_mostly = 1; +int perf_max_events __read_mostly = 1; static int perf_reserved_percpu __read_mostly; static int perf_overcommit __read_mostly = 1; -static atomic_t nr_counters __read_mostly; -static atomic_t nr_mmap_counters __read_mostly; -static atomic_t nr_comm_counters __read_mostly; -static atomic_t nr_task_counters __read_mostly; +static atomic_t nr_events __read_mostly; +static atomic_t nr_mmap_events __read_mostly; +static atomic_t nr_comm_events __read_mostly; +static atomic_t nr_task_events __read_mostly; /* - * perf counter paranoia level: + * perf event paranoia level: * -1 - not paranoid at all * 0 - disallow raw tracepoint access for unpriv - * 1 - disallow cpu counters for unpriv + * 1 - disallow cpu events for unpriv * 2 - disallow kernel profiling for unpriv */ -int sysctl_perf_counter_paranoid __read_mostly = 1; +int sysctl_perf_event_paranoid __read_mostly = 1; static inline bool perf_paranoid_tracepoint_raw(void) { - return sysctl_perf_counter_paranoid > -1; + return sysctl_perf_event_paranoid > -1; } static inline bool perf_paranoid_cpu(void) { - return sysctl_perf_counter_paranoid > 0; + return sysctl_perf_event_paranoid > 0; } static inline bool perf_paranoid_kernel(void) { - return sysctl_perf_counter_paranoid > 1; + return sysctl_perf_event_paranoid > 1; } -int sysctl_perf_counter_mlock __read_mostly = 512; /* 'free' kb per user */ +int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */ /* - * max perf counter sample rate + * max perf event sample rate */ -int sysctl_perf_counter_sample_rate __read_mostly = 100000; +int sysctl_perf_event_sample_rate __read_mostly = 100000; -static atomic64_t perf_counter_id; +static atomic64_t perf_event_id; /* - * Lock for (sysadmin-configurable) counter reservations: + * Lock for (sysadmin-configurable) event reservations: */ static DEFINE_SPINLOCK(perf_resource_lock); /* * Architecture provided APIs - weak aliases: */ -extern __weak const struct pmu *hw_perf_counter_init(struct perf_counter *counter) +extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event) { return NULL; } @@ -93,18 +93,18 @@ extern __weak const struct pmu *hw_perf_counter_init(struct perf_counter *counte void __weak hw_perf_disable(void) { barrier(); } void __weak hw_perf_enable(void) { barrier(); } -void __weak hw_perf_counter_setup(int cpu) { barrier(); } -void __weak hw_perf_counter_setup_online(int cpu) { barrier(); } +void __weak hw_perf_event_setup(int cpu) { barrier(); } +void __weak hw_perf_event_setup_online(int cpu) { barrier(); } int __weak -hw_perf_group_sched_in(struct perf_counter *group_leader, +hw_perf_group_sched_in(struct perf_event *group_leader, struct perf_cpu_context *cpuctx, - struct perf_counter_context *ctx, int cpu) + struct perf_event_context *ctx, int cpu) { return 0; } -void __weak perf_counter_print_debug(void) { } +void __weak perf_event_print_debug(void) { } static DEFINE_PER_CPU(int, perf_disable_count); @@ -130,20 +130,20 @@ void perf_enable(void) hw_perf_enable(); } -static void get_ctx(struct perf_counter_context *ctx) +static void get_ctx(struct perf_event_context *ctx) { WARN_ON(!atomic_inc_not_zero(&ctx->refcount)); } static void free_ctx(struct rcu_head *head) { - struct perf_counter_context *ctx; + struct perf_event_context *ctx; - ctx = container_of(head, struct perf_counter_context, rcu_head); + ctx = container_of(head, struct perf_event_context, rcu_head); kfree(ctx); } -static void put_ctx(struct perf_counter_context *ctx) +static void put_ctx(struct perf_event_context *ctx) { if (atomic_dec_and_test(&ctx->refcount)) { if (ctx->parent_ctx) @@ -154,7 +154,7 @@ static void put_ctx(struct perf_counter_context *ctx) } } -static void unclone_ctx(struct perf_counter_context *ctx) +static void unclone_ctx(struct perf_event_context *ctx) { if (ctx->parent_ctx) { put_ctx(ctx->parent_ctx); @@ -163,37 +163,37 @@ static void unclone_ctx(struct perf_counter_context *ctx) } /* - * If we inherit counters we want to return the parent counter id + * If we inherit events we want to return the parent event id * to userspace. */ -static u64 primary_counter_id(struct perf_counter *counter) +static u64 primary_event_id(struct perf_event *event) { - u64 id = counter->id; + u64 id = event->id; - if (counter->parent) - id = counter->parent->id; + if (event->parent) + id = event->parent->id; return id; } /* - * Get the perf_counter_context for a task and lock it. + * Get the perf_event_context for a task and lock it. * This has to cope with with the fact that until it is locked, * the context could get moved to another task. */ -static struct perf_counter_context * +static struct perf_event_context * perf_lock_task_context(struct task_struct *task, unsigned long *flags) { - struct perf_counter_context *ctx; + struct perf_event_context *ctx; rcu_read_lock(); retry: - ctx = rcu_dereference(task->perf_counter_ctxp); + ctx = rcu_dereference(task->perf_event_ctxp); if (ctx) { /* * If this context is a clone of another, it might * get swapped for another underneath us by - * perf_counter_task_sched_out, though the + * perf_event_task_sched_out, though the * rcu_read_lock() protects us from any context * getting freed. Lock the context and check if it * got swapped before we could get the lock, and retry @@ -201,7 +201,7 @@ perf_lock_task_context(struct task_struct *task, unsigned long *flags) * can't get swapped on us any more. */ spin_lock_irqsave(&ctx->lock, *flags); - if (ctx != rcu_dereference(task->perf_counter_ctxp)) { + if (ctx != rcu_dereference(task->perf_event_ctxp)) { spin_unlock_irqrestore(&ctx->lock, *flags); goto retry; } @@ -220,9 +220,9 @@ perf_lock_task_context(struct task_struct *task, unsigned long *flags) * can't get swapped to another task. This also increments its * reference count so that the context can't get freed. */ -static struct perf_counter_context *perf_pin_task_context(struct task_struct *task) +static struct perf_event_context *perf_pin_task_context(struct task_struct *task) { - struct perf_counter_context *ctx; + struct perf_event_context *ctx; unsigned long flags; ctx = perf_lock_task_context(task, &flags); @@ -233,7 +233,7 @@ static struct perf_counter_context *perf_pin_task_context(struct task_struct *ta return ctx; } -static void perf_unpin_context(struct perf_counter_context *ctx) +static void perf_unpin_context(struct perf_event_context *ctx) { unsigned long flags; @@ -244,123 +244,122 @@ static void perf_unpin_context(struct perf_counter_context *ctx) } /* - * Add a counter from the lists for its context. + * Add a event from the lists for its context. * Must be called with ctx->mutex and ctx->lock held. */ static void -list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx) +list_add_event(struct perf_event *event, struct perf_event_context *ctx) { - struct perf_counter *group_leader = counter->group_leader; + struct perf_event *group_leader = event->group_leader; /* - * Depending on whether it is a standalone or sibling counter, - * add it straight to the context's counter list, or to the group + * Depending on whether it is a standalone or sibling event, + * add it straight to the context's event list, or to the group * leader's sibling list: */ - if (group_leader == counter) - list_add_tail(&counter->list_entry, &ctx->counter_list); + if (group_leader == event) + list_add_tail(&event->group_entry, &ctx->group_list); else { - list_add_tail(&counter->list_entry, &group_leader->sibling_list); + list_add_tail(&event->group_entry, &group_leader->sibling_list); group_leader->nr_siblings++; } - list_add_rcu(&counter->event_entry, &ctx->event_list); - ctx->nr_counters++; - if (counter->attr.inherit_stat) + list_add_rcu(&event->event_entry, &ctx->event_list); + ctx->nr_events++; + if (event->attr.inherit_stat) ctx->nr_stat++; } /* - * Remove a counter from the lists for its context. + * Remove a event from the lists for its context. * Must be called with ctx->mutex and ctx->lock held. */ static void -list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx) +list_del_event(struct perf_event *event, struct perf_event_context *ctx) { - struct perf_counter *sibling, *tmp; + struct perf_event *sibling, *tmp; - if (list_empty(&counter->list_entry)) + if (list_empty(&event->group_entry)) return; - ctx->nr_counters--; - if (counter->attr.inherit_stat) + ctx->nr_events--; + if (event->attr.inherit_stat) ctx->nr_stat--; - list_del_init(&counter->list_entry); - list_del_rcu(&counter->event_entry); + list_del_init(&event->group_entry); + list_del_rcu(&event->event_entry); - if (counter->group_leader != counter) - counter->group_leader->nr_siblings--; + if (event->group_leader != event) + event->group_leader->nr_siblings--; /* - * If this was a group counter with sibling counters then - * upgrade the siblings to singleton counters by adding them + * If this was a group event with sibling events then + * upgrade the siblings to singleton events by adding them * to the context list directly: */ - list_for_each_entry_safe(sibling, tmp, - &counter->sibling_list, list_entry) { + list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) { - list_move_tail(&sibling->list_entry, &ctx->counter_list); + list_move_tail(&sibling->group_entry, &ctx->group_list); sibling->group_leader = sibling; } } static void -counter_sched_out(struct perf_counter *counter, +event_sched_out(struct perf_event *event, struct perf_cpu_context *cpuctx, - struct perf_counter_context *ctx) + struct perf_event_context *ctx) { - if (counter->state != PERF_COUNTER_STATE_ACTIVE) + if (event->state != PERF_EVENT_STATE_ACTIVE) return; - counter->state = PERF_COUNTER_STATE_INACTIVE; - if (counter->pending_disable) { - counter->pending_disable = 0; - counter->state = PERF_COUNTER_STATE_OFF; + event->state = PERF_EVENT_STATE_INACTIVE; + if (event->pending_disable) { + event->pending_disable = 0; + event->state = PERF_EVENT_STATE_OFF; } - counter->tstamp_stopped = ctx->time; - counter->pmu->disable(counter); - counter->oncpu = -1; + event->tstamp_stopped = ctx->time; + event->pmu->disable(event); + event->oncpu = -1; - if (!is_software_counter(counter)) + if (!is_software_event(event)) cpuctx->active_oncpu--; ctx->nr_active--; - if (counter->attr.exclusive || !cpuctx->active_oncpu) + if (event->attr.exclusive || !cpuctx->active_oncpu) cpuctx->exclusive = 0; } static void -group_sched_out(struct perf_counter *group_counter, +group_sched_out(struct perf_event *group_event, struct perf_cpu_context *cpuctx, - struct perf_counter_context *ctx) + struct perf_event_context *ctx) { - struct perf_counter *counter; + struct perf_event *event; - if (group_counter->state != PERF_COUNTER_STATE_ACTIVE) + if (group_event->state != PERF_EVENT_STATE_ACTIVE) return; - counter_sched_out(group_counter, cpuctx, ctx); + event_sched_out(group_event, cpuctx, ctx); /* * Schedule out siblings (if any): */ - list_for_each_entry(counter, &group_counter->sibling_list, list_entry) - counter_sched_out(counter, cpuctx, ctx); + list_for_each_entry(event, &group_event->sibling_list, group_entry) + event_sched_out(event, cpuctx, ctx); - if (group_counter->attr.exclusive) + if (group_event->attr.exclusive) cpuctx->exclusive = 0; } /* - * Cross CPU call to remove a performance counter + * Cross CPU call to remove a performance event * - * We disable the counter on the hardware level first. After that we + * We disable the event on the hardware level first. After that we * remove it from the context list. */ -static void __perf_counter_remove_from_context(void *info) +static void __perf_event_remove_from_context(void *info) { struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); - struct perf_counter *counter = info; - struct perf_counter_context *ctx = counter->ctx; + struct perf_event *event = info; + struct perf_event_context *ctx = event->ctx; /* * If this is a task context, we need to check whether it is @@ -373,22 +372,22 @@ static void __perf_counter_remove_from_context(void *info) spin_lock(&ctx->lock); /* * Protect the list operation against NMI by disabling the - * counters on a global level. + * events on a global level. */ perf_disable(); - counter_sched_out(counter, cpuctx, ctx); + event_sched_out(event, cpuctx, ctx); - list_del_counter(counter, ctx); + list_del_event(event, ctx); if (!ctx->task) { /* - * Allow more per task counters with respect to the + * Allow more per task events with respect to the * reservation: */ cpuctx->max_pertask = - min(perf_max_counters - ctx->nr_counters, - perf_max_counters - perf_reserved_percpu); + min(perf_max_events - ctx->nr_events, + perf_max_events - perf_reserved_percpu); } perf_enable(); @@ -397,56 +396,56 @@ static void __perf_counter_remove_from_context(void *info) /* - * Remove the counter from a task's (or a CPU's) list of counters. + * Remove the event from a task's (or a CPU's) list of events. * * Must be called with ctx->mutex held. * - * CPU counters are removed with a smp call. For task counters we only + * CPU events are removed with a smp call. For task events we only * call when the task is on a CPU. * - * If counter->ctx is a cloned context, callers must make sure that - * every task struct that counter->ctx->task could possibly point to + * If event->ctx is a cloned context, callers must make sure that + * every task struct that event->ctx->task could possibly point to * remains valid. This is OK when called from perf_release since * that only calls us on the top-level context, which can't be a clone. - * When called from perf_counter_exit_task, it's OK because the + * When called from perf_event_exit_task, it's OK because the * context has been detached from its task. */ -static void perf_counter_remove_from_context(struct perf_counter *counter) +static void perf_event_remove_from_context(struct perf_event *event) { - struct perf_counter_context *ctx = counter->ctx; + struct perf_event_context *ctx = event->ctx; struct task_struct *task = ctx->task; if (!task) { /* - * Per cpu counters are removed via an smp call and + * Per cpu events are removed via an smp call and * the removal is always sucessful. */ - smp_call_function_single(counter->cpu, - __perf_counter_remove_from_context, - counter, 1); + smp_call_function_single(event->cpu, + __perf_event_remove_from_context, + event, 1); return; } retry: - task_oncpu_function_call(task, __perf_counter_remove_from_context, - counter); + task_oncpu_function_call(task, __perf_event_remove_from_context, + event); spin_lock_irq(&ctx->lock); /* * If the context is active we need to retry the smp call. */ - if (ctx->nr_active && !list_empty(&counter->list_entry)) { + if (ctx->nr_active && !list_empty(&event->group_entry)) { spin_unlock_irq(&ctx->lock); goto retry; } /* * The lock prevents that this context is scheduled in so we - * can remove the counter safely, if the call above did not + * can remove the event safely, if the call above did not * succeed. */ - if (!list_empty(&counter->list_entry)) { - list_del_counter(counter, ctx); + if (!list_empty(&event->group_entry)) { + list_del_event(event, ctx); } spin_unlock_irq(&ctx->lock); } @@ -459,7 +458,7 @@ static inline u64 perf_clock(void) /* * Update the record of the current time in a context. */ -static void update_context_time(struct perf_counter_context *ctx) +static void update_context_time(struct perf_event_context *ctx) { u64 now = perf_clock(); @@ -468,51 +467,51 @@ static void update_context_time(struct perf_counter_context *ctx) } /* - * Update the total_time_enabled and total_time_running fields for a counter. + * Update the total_time_enabled and total_time_running fields for a event. */ -static void update_counter_times(struct perf_counter *counter) +static void update_event_times(struct perf_event *event) { - struct perf_counter_context *ctx = counter->ctx; + struct perf_event_context *ctx = event->ctx; u64 run_end; - if (counter->state < PERF_COUNTER_STATE_INACTIVE || - counter->group_leader->state < PERF_COUNTER_STATE_INACTIVE) + if (event->state < PERF_EVENT_STATE_INACTIVE || + event->group_leader->state < PERF_EVENT_STATE_INACTIVE) return; - counter->total_time_enabled = ctx->time - counter->tstamp_enabled; + event->total_time_enabled = ctx->time - event->tstamp_enabled; - if (counter->state == PERF_COUNTER_STATE_INACTIVE) - run_end = counter->tstamp_stopped; + if (event->state == PERF_EVENT_STATE_INACTIVE) + run_end = event->tstamp_stopped; else run_end = ctx->time; - counter->total_time_running = run_end - counter->tstamp_running; + event->total_time_running = run_end - event->tstamp_running; } /* - * Update total_time_enabled and total_time_running for all counters in a group. + * Update total_time_enabled and total_time_running for all events in a group. */ -static void update_group_times(struct perf_counter *leader) +static void update_group_times(struct perf_event *leader) { - struct perf_counter *counter; + struct perf_event *event; - update_counter_times(leader); - list_for_each_entry(counter, &leader->sibling_list, list_entry) - update_counter_times(counter); + update_event_times(leader); + list_for_each_entry(event, &leader->sibling_list, group_entry) + update_event_times(event); } /* - * Cross CPU call to disable a performance counter + * Cross CPU call to disable a performance event */ -static void __perf_counter_disable(void *info) +static void __perf_event_disable(void *info) { - struct perf_counter *counter = info; + struct perf_event *event = info; struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); - struct perf_counter_context *ctx = counter->ctx; + struct perf_event_context *ctx = event->ctx; /* - * If this is a per-task counter, need to check whether this - * counter's task is the current task on this cpu. + * If this is a per-task event, need to check whether this + * event's task is the current task on this cpu. */ if (ctx->task && cpuctx->task_ctx != ctx) return; @@ -520,57 +519,57 @@ static void __perf_counter_disable(void *info) spin_lock(&ctx->lock); /* - * If the counter is on, turn it off. + * If the event is on, turn it off. * If it is in error state, leave it in error state. */ - if (counter->state >= PERF_COUNTER_STATE_INACTIVE) { + if (event->state >= PERF_EVENT_STATE_INACTIVE) { update_context_time(ctx); - update_group_times(counter); - if (counter == counter->group_leader) - group_sched_out(counter, cpuctx, ctx); + update_group_times(event); + if (event == event->group_leader) + group_sched_out(event, cpuctx, ctx); else - counter_sched_out(counter, cpuctx, ctx); - counter->state = PERF_COUNTER_STATE_OFF; + event_sched_out(event, cpuctx, ctx); + event->state = PERF_EVENT_STATE_OFF; } spin_unlock(&ctx->lock); } /* - * Disable a counter. + * Disable a event. * - * If counter->ctx is a cloned context, callers must make sure that - * every task struct that counter->ctx->task could possibly point to + * If event->ctx is a cloned context, callers must make sure that + * every task struct that event->ctx->task could possibly point to * remains valid. This condition is satisifed when called through - * perf_counter_for_each_child or perf_counter_for_each because they - * hold the top-level counter's child_mutex, so any descendant that - * goes to exit will block in sync_child_counter. - * When called from perf_pending_counter it's OK because counter->ctx + * perf_event_for_each_child or perf_event_for_each because they + * hold the top-level event's child_mutex, so any descendant that + * goes to exit will block in sync_child_event. + * When called from perf_pending_event it's OK because event->ctx * is the current context on this CPU and preemption is disabled, - * hence we can't get into perf_counter_task_sched_out for this context. + * hence we can't get into perf_event_task_sched_out for this context. */ -static void perf_counter_disable(struct perf_counter *counter) +static void perf_event_disable(struct perf_event *event) { - struct perf_counter_context *ctx = counter->ctx; + struct perf_event_context *ctx = event->ctx; struct task_struct *task = ctx->task; if (!task) { /* - * Disable the counter on the cpu that it's on + * Disable the event on the cpu that it's on */ - smp_call_function_single(counter->cpu, __perf_counter_disable, - counter, 1); + smp_call_function_single(event->cpu, __perf_event_disable, + event, 1); return; } retry: - task_oncpu_function_call(task, __perf_counter_disable, counter); + task_oncpu_function_call(task, __perf_event_disable, event); spin_lock_irq(&ctx->lock); /* - * If the counter is still active, we need to retry the cross-call. + * If the event is still active, we need to retry the cross-call. */ - if (counter->state == PERF_COUNTER_STATE_ACTIVE) { + if (event->state == PERF_EVENT_STATE_ACTIVE) { spin_unlock_irq(&ctx->lock); goto retry; } @@ -579,73 +578,73 @@ static void perf_counter_disable(struct perf_counter *counter) * Since we have the lock this context can't be scheduled * in, so we can change the state safely. */ - if (counter->state == PERF_COUNTER_STATE_INACTIVE) { - update_group_times(counter); - counter->state = PERF_COUNTER_STATE_OFF; + if (event->state == PERF_EVENT_STATE_INACTIVE) { + update_group_times(event); + event->state = PERF_EVENT_STATE_OFF; } spin_unlock_irq(&ctx->lock); } static int -counter_sched_in(struct perf_counter *counter, +event_sched_in(struct perf_event *event, struct perf_cpu_context *cpuctx, - struct perf_counter_context *ctx, + struct perf_event_context *ctx, int cpu) { - if (counter->state <= PERF_COUNTER_STATE_OFF) + if (event->state <= PERF_EVENT_STATE_OFF) return 0; - counter->state = PERF_COUNTER_STATE_ACTIVE; - counter->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */ + event->state = PERF_EVENT_STATE_ACTIVE; + event->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */ /* * The new state must be visible before we turn it on in the hardware: */ smp_wmb(); - if (counter->pmu->enable(counter)) { - counter->state = PERF_COUNTER_STATE_INACTIVE; - counter->oncpu = -1; + if (event->pmu->enable(event)) { + event->state = PERF_EVENT_STATE_INACTIVE; + event->oncpu = -1; return -EAGAIN; } - counter->tstamp_running += ctx->time - counter->tstamp_stopped; + event->tstamp_running += ctx->time - event->tstamp_stopped; - if (!is_software_counter(counter)) + if (!is_software_event(event)) cpuctx->active_oncpu++; ctx->nr_active++; - if (counter->attr.exclusive) + if (event->attr.exclusive) cpuctx->exclusive = 1; return 0; } static int -group_sched_in(struct perf_counter *group_counter, +group_sched_in(struct perf_event *group_event, struct perf_cpu_context *cpuctx, - struct perf_counter_context *ctx, + struct perf_event_context *ctx, int cpu) { - struct perf_counter *counter, *partial_group; + struct perf_event *event, *partial_group; int ret; - if (group_counter->state == PERF_COUNTER_STATE_OFF) + if (group_event->state == PERF_EVENT_STATE_OFF) return 0; - ret = hw_perf_group_sched_in(group_counter, cpuctx, ctx, cpu); + ret = hw_perf_group_sched_in(group_event, cpuctx, ctx, cpu); if (ret) return ret < 0 ? ret : 0; - if (counter_sched_in(group_counter, cpuctx, ctx, cpu)) + if (event_sched_in(group_event, cpuctx, ctx, cpu)) return -EAGAIN; /* * Schedule in siblings as one group (if any): */ - list_for_each_entry(counter, &group_counter->sibling_list, list_entry) { - if (counter_sched_in(counter, cpuctx, ctx, cpu)) { - partial_group = counter; + list_for_each_entry(event, &group_event->sibling_list, group_entry) { + if (event_sched_in(event, cpuctx, ctx, cpu)) { + partial_group = event; goto group_error; } } @@ -657,57 +656,57 @@ group_error: * Groups can be scheduled in as one unit only, so undo any * partial group before returning: */ - list_for_each_entry(counter, &group_counter->sibling_list, list_entry) { - if (counter == partial_group) + list_for_each_entry(event, &group_event->sibling_list, group_entry) { + if (event == partial_group) break; - counter_sched_out(counter, cpuctx, ctx); + event_sched_out(event, cpuctx, ctx); } - counter_sched_out(group_counter, cpuctx, ctx); + event_sched_out(group_event, cpuctx, ctx); return -EAGAIN; } /* - * Return 1 for a group consisting entirely of software counters, - * 0 if the group contains any hardware counters. + * Return 1 for a group consisting entirely of software events, + * 0 if the group contains any hardware events. */ -static int is_software_only_group(struct perf_counter *leader) +static int is_software_only_group(struct perf_event *leader) { - struct perf_counter *counter; + struct perf_event *event; - if (!is_software_counter(leader)) + if (!is_software_event(leader)) return 0; - list_for_each_entry(counter, &leader->sibling_list, list_entry) - if (!is_software_counter(counter)) + list_for_each_entry(event, &leader->sibling_list, group_entry) + if (!is_software_event(event)) return 0; return 1; } /* - * Work out whether we can put this counter group on the CPU now. + * Work out whether we can put this event group on the CPU now. */ -static int group_can_go_on(struct perf_counter *counter, +static int group_can_go_on(struct perf_event *event, struct perf_cpu_context *cpuctx, int can_add_hw) { /* - * Groups consisting entirely of software counters can always go on. + * Groups consisting entirely of software events can always go on. */ - if (is_software_only_group(counter)) + if (is_software_only_group(event)) return 1; /* * If an exclusive group is already on, no other hardware - * counters can go on. + * events can go on. */ if (cpuctx->exclusive) return 0; /* * If this group is exclusive and there are already - * counters on the CPU, it can't go on. + * events on the CPU, it can't go on. */ - if (counter->attr.exclusive && cpuctx->active_oncpu) + if (event->attr.exclusive && cpuctx->active_oncpu) return 0; /* * Otherwise, try to add it if all previous groups were able @@ -716,26 +715,26 @@ static int group_can_go_on(struct perf_counter *counter, return can_add_hw; } -static void add_counter_to_ctx(struct perf_counter *counter, - struct perf_counter_context *ctx) +static void add_event_to_ctx(struct perf_event *event, + struct perf_event_context *ctx) { - list_add_counter(counter, ctx); - counter->tstamp_enabled = ctx->time; - counter->tstamp_running = ctx->time; - counter->tstamp_stopped = ctx->time; + list_add_event(event, ctx); + event->tstamp_enabled = ctx->time; + event->tstamp_running = ctx->time; + event->tstamp_stopped = ctx->time; } /* - * Cross CPU call to install and enable a performance counter + * Cross CPU call to install and enable a performance event * * Must be called with ctx->mutex held */ static void __perf_install_in_context(void *info) { struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); - struct perf_counter *counter = info; - struct perf_counter_context *ctx = counter->ctx; - struct perf_counter *leader = counter->group_leader; + struct perf_event *event = info; + struct perf_event_context *ctx = event->ctx; + struct perf_event *leader = event->group_leader; int cpu = smp_processor_id(); int err; @@ -744,7 +743,7 @@ static void __perf_install_in_context(void *info) * the current task context of this cpu. If not it has been * scheduled out before the smp call arrived. * Or possibly this is the right context but it isn't - * on this cpu because it had no counters. + * on this cpu because it had no events. */ if (ctx->task && cpuctx->task_ctx != ctx) { if (cpuctx->task_ctx || ctx->task != current) @@ -758,41 +757,41 @@ static void __perf_install_in_context(void *info) /* * Protect the list operation against NMI by disabling the - * counters on a global level. NOP for non NMI based counters. + * events on a global level. NOP for non NMI based events. */ perf_disable(); - add_counter_to_ctx(counter, ctx); + add_event_to_ctx(event, ctx); /* - * Don't put the counter on if it is disabled or if + * Don't put the event on if it is disabled or if * it is in a group and the group isn't on. */ - if (counter->state != PERF_COUNTER_STATE_INACTIVE || - (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE)) + if (event->state != PERF_EVENT_STATE_INACTIVE || + (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)) goto unlock; /* - * An exclusive counter can't go on if there are already active - * hardware counters, and no hardware counter can go on if there - * is already an exclusive counter on. + * An exclusive event can't go on if there are already active + * hardware events, and no hardware event can go on if there + * is already an exclusive event on. */ - if (!group_can_go_on(counter, cpuctx, 1)) + if (!group_can_go_on(event, cpuctx, 1)) err = -EEXIST; else - err = counter_sched_in(counter, cpuctx, ctx, cpu); + err = event_sched_in(event, cpuctx, ctx, cpu); if (err) { /* - * This counter couldn't go on. If it is in a group + * This event couldn't go on. If it is in a group * then we have to pull the whole group off. - * If the counter group is pinned then put it in error state. + * If the event group is pinned then put it in error state. */ - if (leader != counter) + if (leader != event) group_sched_out(leader, cpuctx, ctx); if (leader->attr.pinned) { update_group_times(leader); - leader->state = PERF_COUNTER_STATE_ERROR; + leader->state = PERF_EVENT_STATE_ERROR; } } @@ -806,92 +805,92 @@ static void __perf_install_in_context(void *info) } /* - * Attach a performance counter to a context + * Attach a performance event to a context * - * First we add the counter to the list with the hardware enable bit - * in counter->hw_config cleared. + * First we add the event to the list with the hardware enable bit + * in event->hw_config cleared. * - * If the counter is attached to a task which is on a CPU we use a smp + * If the event is attached to a task which is on a CPU we use a smp * call to enable it in the task context. The task might have been * scheduled away, but we check this in the smp call again. * * Must be called with ctx->mutex held. */ static void -perf_install_in_context(struct perf_counter_context *ctx, - struct perf_counter *counter, +perf_install_in_context(struct perf_event_context *ctx, + struct perf_event *event, int cpu) { struct task_struct *task = ctx->task; if (!task) { /* - * Per cpu counters are installed via an smp call and + * Per cpu events are installed via an smp call and * the install is always sucessful. */ smp_call_function_single(cpu, __perf_install_in_context, - counter, 1); + event, 1); return; } retry: task_oncpu_function_call(task, __perf_install_in_context, - counter); + event); spin_lock_irq(&ctx->lock); /* * we need to retry the smp call. */ - if (ctx->is_active && list_empty(&counter->list_entry)) { + if (ctx->is_active && list_empty(&event->group_entry)) { spin_unlock_irq(&ctx->lock); goto retry; } /* * The lock prevents that this context is scheduled in so we - * can add the counter safely, if it the call above did not + * can add the event safely, if it the call above did not * succeed. */ - if (list_empty(&counter->list_entry)) - add_counter_to_ctx(counter, ctx); + if (list_empty(&event->group_entry)) + add_event_to_ctx(event, ctx); spin_unlock_irq(&ctx->lock); } /* - * Put a counter into inactive state and update time fields. + * Put a event into inactive state and update time fields. * Enabling the leader of a group effectively enables all * the group members that aren't explicitly disabled, so we * have to update their ->tstamp_enabled also. * Note: this works for group members as well as group leaders * since the non-leader members' sibling_lists will be empty. */ -static void __perf_counter_mark_enabled(struct perf_counter *counter, - struct perf_counter_context *ctx) +static void __perf_event_mark_enabled(struct perf_event *event, + struct perf_event_context *ctx) { - struct perf_counter *sub; + struct perf_event *sub; - counter->state = PERF_COUNTER_STATE_INACTIVE; - counter->tstamp_enabled = ctx->time - counter->total_time_enabled; - list_for_each_entry(sub, &counter->sibling_list, list_entry) - if (sub->state >= PERF_COUNTER_STATE_INACTIVE) + event->state = PERF_EVENT_STATE_INACTIVE; + event->tstamp_enabled = ctx->time - event->total_time_enabled; + list_for_each_entry(sub, &event->sibling_list, group_entry) + if (sub->state >= PERF_EVENT_STATE_INACTIVE) sub->tstamp_enabled = ctx->time - sub->total_time_enabled; } /* - * Cross CPU call to enable a performance counter + * Cross CPU call to enable a performance event */ -static void __perf_counter_enable(void *info) +static void __perf_event_enable(void *info) { - struct perf_counter *counter = info; + struct perf_event *event = info; struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); - struct perf_counter_context *ctx = counter->ctx; - struct perf_counter *leader = counter->group_leader; + struct perf_event_context *ctx = event->ctx; + struct perf_event *leader = event->group_leader; int err; /* - * If this is a per-task counter, need to check whether this - * counter's task is the current task on this cpu. + * If this is a per-task event, need to check whether this + * event's task is the current task on this cpu. */ if (ctx->task && cpuctx->task_ctx != ctx) { if (cpuctx->task_ctx || ctx->task != current) @@ -903,40 +902,40 @@ static void __perf_counter_enable(void *info) ctx->is_active = 1; update_context_time(ctx); - if (counter->state >= PERF_COUNTER_STATE_INACTIVE) + if (event->state >= PERF_EVENT_STATE_INACTIVE) goto unlock; - __perf_counter_mark_enabled(counter, ctx); + __perf_event_mark_enabled(event, ctx); /* - * If the counter is in a group and isn't the group leader, + * If the event is in a group and isn't the group leader, * then don't put it on unless the group is on. */ - if (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE) + if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) goto unlock; - if (!group_can_go_on(counter, cpuctx, 1)) { + if (!group_can_go_on(event, cpuctx, 1)) { err = -EEXIST; } else { perf_disable(); - if (counter == leader) - err = group_sched_in(counter, cpuctx, ctx, + if (event == leader) + err = group_sched_in(event, cpuctx, ctx, smp_processor_id()); else - err = counter_sched_in(counter, cpuctx, ctx, + err = event_sched_in(event, cpuctx, ctx, smp_processor_id()); perf_enable(); } if (err) { /* - * If this counter can't go on and it's part of a + * If this event can't go on and it's part of a * group, then the whole group has to come off. */ - if (leader != counter) + if (leader != event) group_sched_out(leader, cpuctx, ctx); if (leader->attr.pinned) { update_group_times(leader); - leader->state = PERF_COUNTER_STATE_ERROR; + leader->state = PERF_EVENT_STATE_ERROR; } } @@ -945,98 +944,98 @@ static void __perf_counter_enable(void *info) } /* - * Enable a counter. + * Enable a event. * - * If counter->ctx is a cloned context, callers must make sure that - * every task struct that counter->ctx->task could possibly point to + * If event->ctx is a cloned context, callers must make sure that + * every task struct that event->ctx->task could possibly point to * remains valid. This condition is satisfied when called through - * perf_counter_for_each_child or perf_counter_for_each as described - * for perf_counter_disable. + * perf_event_for_each_child or perf_event_for_each as described + * for perf_event_disable. */ -static void perf_counter_enable(struct perf_counter *counter) +static void perf_event_enable(struct perf_event *event) { - struct perf_counter_context *ctx = counter->ctx; + struct perf_event_context *ctx = event->ctx; struct task_struct *task = ctx->task; if (!task) { /* - * Enable the counter on the cpu that it's on + * Enable the event on the cpu that it's on */ - smp_call_function_single(counter->cpu, __perf_counter_enable, - counter, 1); + smp_call_function_single(event->cpu, __perf_event_enable, + event, 1); return; } spin_lock_irq(&ctx->lock); - if (counter->state >= PERF_COUNTER_STATE_INACTIVE) + if (event->state >= PERF_EVENT_STATE_INACTIVE) goto out; /* - * If the counter is in error state, clear that first. - * That way, if we see the counter in error state below, we + * If the event is in error state, clear that first. + * That way, if we see the event in error state below, we * know that it has gone back into error state, as distinct * from the task having been scheduled away before the * cross-call arrived. */ - if (counter->state == PERF_COUNTER_STATE_ERROR) - counter->state = PERF_COUNTER_STATE_OFF; + if (event->state == PERF_EVENT_STATE_ERROR) + event->state = PERF_EVENT_STATE_OFF; retry: spin_unlock_irq(&ctx->lock); - task_oncpu_function_call(task, __perf_counter_enable, counter); + task_oncpu_function_call(task, __perf_event_enable, event); spin_lock_irq(&ctx->lock); /* - * If the context is active and the counter is still off, + * If the context is active and the event is still off, * we need to retry the cross-call. */ - if (ctx->is_active && counter->state == PERF_COUNTER_STATE_OFF) + if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) goto retry; /* * Since we have the lock this context can't be scheduled * in, so we can change the state safely. */ - if (counter->state == PERF_COUNTER_STATE_OFF) - __perf_counter_mark_enabled(counter, ctx); + if (event->state == PERF_EVENT_STATE_OFF) + __perf_event_mark_enabled(event, ctx); out: spin_unlock_irq(&ctx->lock); } -static int perf_counter_refresh(struct perf_counter *counter, int refresh) +static int perf_event_refresh(struct perf_event *event, int refresh) { /* - * not supported on inherited counters + * not supported on inherited events */ - if (counter->attr.inherit) + if (event->attr.inherit) return -EINVAL; - atomic_add(refresh, &counter->event_limit); - perf_counter_enable(counter); + atomic_add(refresh, &event->event_limit); + perf_event_enable(event); return 0; } -void __perf_counter_sched_out(struct perf_counter_context *ctx, +void __perf_event_sched_out(struct perf_event_context *ctx, struct perf_cpu_context *cpuctx) { - struct perf_counter *counter; + struct perf_event *event; spin_lock(&ctx->lock); ctx->is_active = 0; - if (likely(!ctx->nr_counters)) + if (likely(!ctx->nr_events)) goto out; update_context_time(ctx); perf_disable(); if (ctx->nr_active) { - list_for_each_entry(counter, &ctx->counter_list, list_entry) { - if (counter != counter->group_leader) - counter_sched_out(counter, cpuctx, ctx); + list_for_each_entry(event, &ctx->group_list, group_entry) { + if (event != event->group_leader) + event_sched_out(event, cpuctx, ctx); else - group_sched_out(counter, cpuctx, ctx); + group_sched_out(event, cpuctx, ctx); } } perf_enable(); @@ -1047,46 +1046,46 @@ void __perf_counter_sched_out(struct perf_counter_context *ctx, /* * Test whether two contexts are equivalent, i.e. whether they * have both been cloned from the same version of the same context - * and they both have the same number of enabled counters. - * If the number of enabled counters is the same, then the set - * of enabled counters should be the same, because these are both - * inherited contexts, therefore we can't access individual counters + * and they both have the same number of enabled events. + * If the number of enabled events is the same, then the set + * of enabled events should be the same, because these are both + * inherited contexts, therefore we can't access individual events * in them directly with an fd; we can only enable/disable all - * counters via prctl, or enable/disable all counters in a family + * events via prctl, or enable/disable all events in a family * via ioctl, which will have the same effect on both contexts. */ -static int context_equiv(struct perf_counter_context *ctx1, - struct perf_counter_context *ctx2) +static int context_equiv(struct perf_event_context *ctx1, + struct perf_event_context *ctx2) { return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx && ctx1->parent_gen == ctx2->parent_gen && !ctx1->pin_count && !ctx2->pin_count; } -static void __perf_counter_read(void *counter); +static void __perf_event_read(void *event); -static void __perf_counter_sync_stat(struct perf_counter *counter, - struct perf_counter *next_counter) +static void __perf_event_sync_stat(struct perf_event *event, + struct perf_event *next_event) { u64 value; - if (!counter->attr.inherit_stat) + if (!event->attr.inherit_stat) return; /* - * Update the counter value, we cannot use perf_counter_read() + * Update the event value, we cannot use perf_event_read() * because we're in the middle of a context switch and have IRQs * disabled, which upsets smp_call_function_single(), however - * we know the counter must be on the current CPU, therefore we + * we know the event must be on the current CPU, therefore we * don't need to use it. */ - switch (counter->state) { - case PERF_COUNTER_STATE_ACTIVE: - __perf_counter_read(counter); + switch (event->state) { + case PERF_EVENT_STATE_ACTIVE: + __perf_event_read(event); break; - case PERF_COUNTER_STATE_INACTIVE: - update_counter_times(counter); + case PERF_EVENT_STATE_INACTIVE: + update_event_times(event); break; default: @@ -1094,73 +1093,73 @@ static void __perf_counter_sync_stat(struct perf_counter *counter, } /* - * In order to keep per-task stats reliable we need to flip the counter + * In order to keep per-task stats reliable we need to flip the event * values when we flip the contexts. */ - value = atomic64_read(&next_counter->count); - value = atomic64_xchg(&counter->count, value); - atomic64_set(&next_counter->count, value); + value = atomic64_read(&next_event->count); + value = atomic64_xchg(&event->count, value); + atomic64_set(&next_event->count, value); - swap(counter->total_time_enabled, next_counter->total_time_enabled); - swap(counter->total_time_running, next_counter->total_time_running); + swap(event->total_time_enabled, next_event->total_time_enabled); + swap(event->total_time_running, next_event->total_time_running); /* * Since we swizzled the values, update the user visible data too. */ - perf_counter_update_userpage(counter); - perf_counter_update_userpage(next_counter); + perf_event_update_userpage(event); + perf_event_update_userpage(next_event); } #define list_next_entry(pos, member) \ list_entry(pos->member.next, typeof(*pos), member) -static void perf_counter_sync_stat(struct perf_counter_context *ctx, - struct perf_counter_context *next_ctx) +static void perf_event_sync_stat(struct perf_event_context *ctx, + struct perf_event_context *next_ctx) { - struct perf_counter *counter, *next_counter; + struct perf_event *event, *next_event; if (!ctx->nr_stat) return; - counter = list_first_entry(&ctx->event_list, - struct perf_counter, event_entry); + event = list_first_entry(&ctx->event_list, + struct perf_event, event_entry); - next_counter = list_first_entry(&next_ctx->event_list, - struct perf_counter, event_entry); + next_event = list_first_entry(&next_ctx->event_list, + struct perf_event, event_entry); - while (&counter->event_entry != &ctx->event_list && - &next_counter->event_entry != &next_ctx->event_list) { + while (&event->event_entry != &ctx->event_list && + &next_event->event_entry != &next_ctx->event_list) { - __perf_counter_sync_stat(counter, next_counter); + __perf_event_sync_stat(event, next_event); - counter = list_next_entry(counter, event_entry); - next_counter = list_next_entry(next_counter, event_entry); + event = list_next_entry(event, event_entry); + next_event = list_next_entry(next_event, event_entry); } } /* - * Called from scheduler to remove the counters of the current task, + * Called from scheduler to remove the events of the current task, * with interrupts disabled. * - * We stop each counter and update the counter value in counter->count. + * We stop each event and update the event value in event->count. * * This does not protect us against NMI, but disable() - * sets the disabled bit in the control field of counter _before_ - * accessing the counter control register. If a NMI hits, then it will - * not restart the counter. + * sets the disabled bit in the control field of event _before_ + * accessing the event control register. If a NMI hits, then it will + * not restart the event. */ -void perf_counter_task_sched_out(struct task_struct *task, +void perf_event_task_sched_out(struct task_struct *task, struct task_struct *next, int cpu) { struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); - struct perf_counter_context *ctx = task->perf_counter_ctxp; - struct perf_counter_context *next_ctx; - struct perf_counter_context *parent; + struct perf_event_context *ctx = task->perf_event_ctxp; + struct perf_event_context *next_ctx; + struct perf_event_context *parent; struct pt_regs *regs; int do_switch = 1; regs = task_pt_regs(task); - perf_swcounter_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0); + perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0); if (likely(!ctx || !cpuctx->task_ctx)) return; @@ -1169,7 +1168,7 @@ void perf_counter_task_sched_out(struct task_struct *task, rcu_read_lock(); parent = rcu_dereference(ctx->parent_ctx); - next_ctx = next->perf_counter_ctxp; + next_ctx = next->perf_event_ctxp; if (parent && next_ctx && rcu_dereference(next_ctx->parent_ctx) == parent) { /* @@ -1186,15 +1185,15 @@ void perf_counter_task_sched_out(struct task_struct *task, if (context_equiv(ctx, next_ctx)) { /* * XXX do we need a memory barrier of sorts - * wrt to rcu_dereference() of perf_counter_ctxp + * wrt to rcu_dereference() of perf_event_ctxp */ - task->perf_counter_ctxp = next_ctx; - next->perf_counter_ctxp = ctx; + task->perf_event_ctxp = next_ctx; + next->perf_event_ctxp = ctx; ctx->task = next; next_ctx->task = task; do_switch = 0; - perf_counter_sync_stat(ctx, next_ctx); + perf_event_sync_stat(ctx, next_ctx); } spin_unlock(&next_ctx->lock); spin_unlock(&ctx->lock); @@ -1202,7 +1201,7 @@ void perf_counter_task_sched_out(struct task_struct *task, rcu_read_unlock(); if (do_switch) { - __perf_counter_sched_out(ctx, cpuctx); + __perf_event_sched_out(ctx, cpuctx); cpuctx->task_ctx = NULL; } } @@ -1210,7 +1209,7 @@ void perf_counter_task_sched_out(struct task_struct *task, /* * Called with IRQs disabled */ -static void __perf_counter_task_sched_out(struct perf_counter_context *ctx) +static void __perf_event_task_sched_out(struct perf_event_context *ctx) { struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); @@ -1220,28 +1219,28 @@ static void __perf_counter_task_sched_out(struct perf_counter_context *ctx) if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) return; - __perf_counter_sched_out(ctx, cpuctx); + __perf_event_sched_out(ctx, cpuctx); cpuctx->task_ctx = NULL; } /* * Called with IRQs disabled */ -static void perf_counter_cpu_sched_out(struct perf_cpu_context *cpuctx) +static void perf_event_cpu_sched_out(struct perf_cpu_context *cpuctx) { - __perf_counter_sched_out(&cpuctx->ctx, cpuctx); + __perf_event_sched_out(&cpuctx->ctx, cpuctx); } static void -__perf_counter_sched_in(struct perf_counter_context *ctx, +__perf_event_sched_in(struct perf_event_context *ctx, struct perf_cpu_context *cpuctx, int cpu) { - struct perf_counter *counter; + struct perf_event *event; int can_add_hw = 1; spin_lock(&ctx->lock); ctx->is_active = 1; - if (likely(!ctx->nr_counters)) + if (likely(!ctx->nr_events)) goto out; ctx->timestamp = perf_clock(); @@ -1252,52 +1251,52 @@ __perf_counter_sched_in(struct perf_counter_context *ctx, * First go through the list and put on any pinned groups * in order to give them the best chance of going on. */ - list_for_each_entry(counter, &ctx->counter_list, list_entry) { - if (counter->state <= PERF_COUNTER_STATE_OFF || - !counter->attr.pinned) + list_for_each_entry(event, &ctx->group_list, group_entry) { + if (event->state <= PERF_EVENT_STATE_OFF || + !event->attr.pinned) continue; - if (counter->cpu != -1 && counter->cpu != cpu) + if (event->cpu != -1 && event->cpu != cpu) continue; - if (counter != counter->group_leader) - counter_sched_in(counter, cpuctx, ctx, cpu); + if (event != event->group_leader) + event_sched_in(event, cpuctx, ctx, cpu); else { - if (group_can_go_on(counter, cpuctx, 1)) - group_sched_in(counter, cpuctx, ctx, cpu); + if (group_can_go_on(event, cpuctx, 1)) + group_sched_in(event, cpuctx, ctx, cpu); } /* * If this pinned group hasn't been scheduled, * put it in error state. */ - if (counter->state == PERF_COUNTER_STATE_INACTIVE) { - update_group_times(counter); - counter->state = PERF_COUNTER_STATE_ERROR; + if (event->state == PERF_EVENT_STATE_INACTIVE) { + update_group_times(event); + event->state = PERF_EVENT_STATE_ERROR; } } - list_for_each_entry(counter, &ctx->counter_list, list_entry) { + list_for_each_entry(event, &ctx->group_list, group_entry) { /* - * Ignore counters in OFF or ERROR state, and - * ignore pinned counters since we did them already. + * Ignore events in OFF or ERROR state, and + * ignore pinned events since we did them already. */ - if (counter->state <= PERF_COUNTER_STATE_OFF || - counter->attr.pinned) + if (event->state <= PERF_EVENT_STATE_OFF || + event->attr.pinned) continue; /* * Listen to the 'cpu' scheduling filter constraint - * of counters: + * of events: */ - if (counter->cpu != -1 && counter->cpu != cpu) + if (event->cpu != -1 && event->cpu != cpu) continue; - if (counter != counter->group_leader) { - if (counter_sched_in(counter, cpuctx, ctx, cpu)) + if (event != event->group_leader) { + if (event_sched_in(event, cpuctx, ctx, cpu)) can_add_hw = 0; } else { - if (group_can_go_on(counter, cpuctx, can_add_hw)) { - if (group_sched_in(counter, cpuctx, ctx, cpu)) + if (group_can_go_on(event, cpuctx, can_add_hw)) { + if (group_sched_in(event, cpuctx, ctx, cpu)) can_add_hw = 0; } } @@ -1308,48 +1307,48 @@ __perf_counter_sched_in(struct perf_counter_context *ctx, } /* - * Called from scheduler to add the counters of the current task + * Called from scheduler to add the events of the current task * with interrupts disabled. * - * We restore the counter value and then enable it. + * We restore the event value and then enable it. * * This does not protect us against NMI, but enable() - * sets the enabled bit in the control field of counter _before_ - * accessing the counter control register. If a NMI hits, then it will - * keep the counter running. + * sets the enabled bit in the control field of event _before_ + * accessing the event control register. If a NMI hits, then it will + * keep the event running. */ -void perf_counter_task_sched_in(struct task_struct *task, int cpu) +void perf_event_task_sched_in(struct task_struct *task, int cpu) { struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); - struct perf_counter_context *ctx = task->perf_counter_ctxp; + struct perf_event_context *ctx = task->perf_event_ctxp; if (likely(!ctx)) return; if (cpuctx->task_ctx == ctx) return; - __perf_counter_sched_in(ctx, cpuctx, cpu); + __perf_event_sched_in(ctx, cpuctx, cpu); cpuctx->task_ctx = ctx; } -static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu) +static void perf_event_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu) { - struct perf_counter_context *ctx = &cpuctx->ctx; + struct perf_event_context *ctx = &cpuctx->ctx; - __perf_counter_sched_in(ctx, cpuctx, cpu); + __perf_event_sched_in(ctx, cpuctx, cpu); } #define MAX_INTERRUPTS (~0ULL) -static void perf_log_throttle(struct perf_counter *counter, int enable); +static void perf_log_throttle(struct perf_event *event, int enable); -static void perf_adjust_period(struct perf_counter *counter, u64 events) +static void perf_adjust_period(struct perf_event *event, u64 events) { - struct hw_perf_counter *hwc = &counter->hw; + struct hw_perf_event *hwc = &event->hw; u64 period, sample_period; s64 delta; events *= hwc->sample_period; - period = div64_u64(events, counter->attr.sample_freq); + period = div64_u64(events, event->attr.sample_freq); delta = (s64)(period - hwc->sample_period); delta = (delta + 7) / 8; /* low pass filter */ @@ -1362,39 +1361,39 @@ static void perf_adjust_period(struct perf_counter *counter, u64 events) hwc->sample_period = sample_period; } -static void perf_ctx_adjust_freq(struct perf_counter_context *ctx) +static void perf_ctx_adjust_freq(struct perf_event_context *ctx) { - struct perf_counter *counter; - struct hw_perf_counter *hwc; + struct perf_event *event; + struct hw_perf_event *hwc; u64 interrupts, freq; spin_lock(&ctx->lock); - list_for_each_entry(counter, &ctx->counter_list, list_entry) { - if (counter->state != PERF_COUNTER_STATE_ACTIVE) + list_for_each_entry(event, &ctx->group_list, group_entry) { + if (event->state != PERF_EVENT_STATE_ACTIVE) continue; - hwc = &counter->hw; + hwc = &event->hw; interrupts = hwc->interrupts; hwc->interrupts = 0; /* - * unthrottle counters on the tick + * unthrottle events on the tick */ if (interrupts == MAX_INTERRUPTS) { - perf_log_throttle(counter, 1); - counter->pmu->unthrottle(counter); - interrupts = 2*sysctl_perf_counter_sample_rate/HZ; + perf_log_throttle(event, 1); + event->pmu->unthrottle(event); + interrupts = 2*sysctl_perf_event_sample_rate/HZ; } - if (!counter->attr.freq || !counter->attr.sample_freq) + if (!event->attr.freq || !event->attr.sample_freq) continue; /* * if the specified freq < HZ then we need to skip ticks */ - if (counter->attr.sample_freq < HZ) { - freq = counter->attr.sample_freq; + if (event->attr.sample_freq < HZ) { + freq = event->attr.sample_freq; hwc->freq_count += freq; hwc->freq_interrupts += interrupts; @@ -1408,7 +1407,7 @@ static void perf_ctx_adjust_freq(struct perf_counter_context *ctx) } else freq = HZ; - perf_adjust_period(counter, freq * interrupts); + perf_adjust_period(event, freq * interrupts); /* * In order to avoid being stalled by an (accidental) huge @@ -1417,9 +1416,9 @@ static void perf_ctx_adjust_freq(struct perf_counter_context *ctx) */ if (!interrupts) { perf_disable(); - counter->pmu->disable(counter); + event->pmu->disable(event); atomic64_set(&hwc->period_left, 0); - counter->pmu->enable(counter); + event->pmu->enable(event); perf_enable(); } } @@ -1427,22 +1426,22 @@ static void perf_ctx_adjust_freq(struct perf_counter_context *ctx) } /* - * Round-robin a context's counters: + * Round-robin a context's events: */ -static void rotate_ctx(struct perf_counter_context *ctx) +static void rotate_ctx(struct perf_event_context *ctx) { - struct perf_counter *counter; + struct perf_event *event; - if (!ctx->nr_counters) + if (!ctx->nr_events) return; spin_lock(&ctx->lock); /* - * Rotate the first entry last (works just fine for group counters too): + * Rotate the first entry last (works just fine for group events too): */ perf_disable(); - list_for_each_entry(counter, &ctx->counter_list, list_entry) { - list_move_tail(&counter->list_entry, &ctx->counter_list); + list_for_each_entry(event, &ctx->group_list, group_entry) { + list_move_tail(&event->group_entry, &ctx->group_list); break; } perf_enable(); @@ -1450,93 +1449,93 @@ static void rotate_ctx(struct perf_counter_context *ctx) spin_unlock(&ctx->lock); } -void perf_counter_task_tick(struct task_struct *curr, int cpu) +void perf_event_task_tick(struct task_struct *curr, int cpu) { struct perf_cpu_context *cpuctx; - struct perf_counter_context *ctx; + struct perf_event_context *ctx; - if (!atomic_read(&nr_counters)) + if (!atomic_read(&nr_events)) return; cpuctx = &per_cpu(perf_cpu_context, cpu); - ctx = curr->perf_counter_ctxp; + ctx = curr->perf_event_ctxp; perf_ctx_adjust_freq(&cpuctx->ctx); if (ctx) perf_ctx_adjust_freq(ctx); - perf_counter_cpu_sched_out(cpuctx); + perf_event_cpu_sched_out(cpuctx); if (ctx) - __perf_counter_task_sched_out(ctx); + __perf_event_task_sched_out(ctx); rotate_ctx(&cpuctx->ctx); if (ctx) rotate_ctx(ctx); - perf_counter_cpu_sched_in(cpuctx, cpu); + perf_event_cpu_sched_in(cpuctx, cpu); if (ctx) - perf_counter_task_sched_in(curr, cpu); + perf_event_task_sched_in(curr, cpu); } /* - * Enable all of a task's counters that have been marked enable-on-exec. + * Enable all of a task's events that have been marked enable-on-exec. * This expects task == current. */ -static void perf_counter_enable_on_exec(struct task_struct *task) +static void perf_event_enable_on_exec(struct task_struct *task) { - struct perf_counter_context *ctx; - struct perf_counter *counter; + struct perf_event_context *ctx; + struct perf_event *event; unsigned long flags; int enabled = 0; local_irq_save(flags); - ctx = task->perf_counter_ctxp; - if (!ctx || !ctx->nr_counters) + ctx = task->perf_event_ctxp; + if (!ctx || !ctx->nr_events) goto out; - __perf_counter_task_sched_out(ctx); + __perf_event_task_sched_out(ctx); spin_lock(&ctx->lock); - list_for_each_entry(counter, &ctx->counter_list, list_entry) { - if (!counter->attr.enable_on_exec) + list_for_each_entry(event, &ctx->group_list, group_entry) { + if (!event->attr.enable_on_exec) continue; - counter->attr.enable_on_exec = 0; - if (counter->state >= PERF_COUNTER_STATE_INACTIVE) + event->attr.enable_on_exec = 0; + if (event->state >= PERF_EVENT_STATE_INACTIVE) continue; - __perf_counter_mark_enabled(counter, ctx); + __perf_event_mark_enabled(event, ctx); enabled = 1; } /* - * Unclone this context if we enabled any counter. + * Unclone this context if we enabled any event. */ if (enabled) unclone_ctx(ctx); spin_unlock(&ctx->lock); - perf_counter_task_sched_in(task, smp_processor_id()); + perf_event_task_sched_in(task, smp_processor_id()); out: local_irq_restore(flags); } /* - * Cross CPU call to read the hardware counter + * Cross CPU call to read the hardware event */ -static void __perf_counter_read(void *info) +static void __perf_event_read(void *info) { struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); - struct perf_counter *counter = info; - struct perf_counter_context *ctx = counter->ctx; + struct perf_event *event = info; + struct perf_event_context *ctx = event->ctx; unsigned long flags; /* * If this is a task context, we need to check whether it is * the current task context of this cpu. If not it has been * scheduled out before the smp call arrived. In that case - * counter->count would have been updated to a recent sample - * when the counter was scheduled out. + * event->count would have been updated to a recent sample + * when the event was scheduled out. */ if (ctx->task && cpuctx->task_ctx != ctx) return; @@ -1544,56 +1543,56 @@ static void __perf_counter_read(void *info) local_irq_save(flags); if (ctx->is_active) update_context_time(ctx); - counter->pmu->read(counter); - update_counter_times(counter); + event->pmu->read(event); + update_event_times(event); local_irq_restore(flags); } -static u64 perf_counter_read(struct perf_counter *counter) +static u64 perf_event_read(struct perf_event *event) { /* - * If counter is enabled and currently active on a CPU, update the - * value in the counter structure: + * If event is enabled and currently active on a CPU, update the + * value in the event structure: */ - if (counter->state == PERF_COUNTER_STATE_ACTIVE) { - smp_call_function_single(counter->oncpu, - __perf_counter_read, counter, 1); - } else if (counter->state == PERF_COUNTER_STATE_INACTIVE) { - update_counter_times(counter); + if (event->state == PERF_EVENT_STATE_ACTIVE) { + smp_call_function_single(event->oncpu, + __perf_event_read, event, 1); + } else if (event->state == PERF_EVENT_STATE_INACTIVE) { + update_event_times(event); } - return atomic64_read(&counter->count); + return atomic64_read(&event->count); } /* - * Initialize the perf_counter context in a task_struct: + * Initialize the perf_event context in a task_struct: */ static void -__perf_counter_init_context(struct perf_counter_context *ctx, +__perf_event_init_context(struct perf_event_context *ctx, struct task_struct *task) { memset(ctx, 0, sizeof(*ctx)); spin_lock_init(&ctx->lock); mutex_init(&ctx->mutex); - INIT_LIST_HEAD(&ctx->counter_list); + INIT_LIST_HEAD(&ctx->group_list); INIT_LIST_HEAD(&ctx->event_list); atomic_set(&ctx->refcount, 1); ctx->task = task; } -static struct perf_counter_context *find_get_context(pid_t pid, int cpu) +static struct perf_event_context *find_get_context(pid_t pid, int cpu) { - struct perf_counter_context *ctx; + struct perf_event_context *ctx; struct perf_cpu_context *cpuctx; struct task_struct *task; unsigned long flags; int err; /* - * If cpu is not a wildcard then this is a percpu counter: + * If cpu is not a wildcard then this is a percpu event: */ if (cpu != -1) { - /* Must be root to operate on a CPU counter: */ + /* Must be root to operate on a CPU event: */ if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) return ERR_PTR(-EACCES); @@ -1601,7 +1600,7 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu) return ERR_PTR(-EINVAL); /* - * We could be clever and allow to attach a counter to an + * We could be clever and allow to attach a event to an * offline CPU and activate it when the CPU comes up, but * that's for later. */ @@ -1628,7 +1627,7 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu) return ERR_PTR(-ESRCH); /* - * Can't attach counters to a dying task. + * Can't attach events to a dying task. */ err = -ESRCH; if (task->flags & PF_EXITING) @@ -1647,13 +1646,13 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu) } if (!ctx) { - ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL); + ctx = kmalloc(sizeof(struct perf_event_context), GFP_KERNEL); err = -ENOMEM; if (!ctx) goto errout; - __perf_counter_init_context(ctx, task); + __perf_event_init_context(ctx, task); get_ctx(ctx); - if (cmpxchg(&task->perf_counter_ctxp, NULL, ctx)) { + if (cmpxchg(&task->perf_event_ctxp, NULL, ctx)) { /* * We raced with some other task; use * the context they set. @@ -1672,42 +1671,42 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu) return ERR_PTR(err); } -static void free_counter_rcu(struct rcu_head *head) +static void free_event_rcu(struct rcu_head *head) { - struct perf_counter *counter; + struct perf_event *event; - counter = container_of(head, struct perf_counter, rcu_head); - if (counter->ns) - put_pid_ns(counter->ns); - kfree(counter); + event = container_of(head, struct perf_event, rcu_head); + if (event->ns) + put_pid_ns(event->ns); + kfree(event); } -static void perf_pending_sync(struct perf_counter *counter); +static void perf_pending_sync(struct perf_event *event); -static void free_counter(struct perf_counter *counter) +static void free_event(struct perf_event *event) { - perf_pending_sync(counter); + perf_pending_sync(event); - if (!counter->parent) { - atomic_dec(&nr_counters); - if (counter->attr.mmap) - atomic_dec(&nr_mmap_counters); - if (counter->attr.comm) - atomic_dec(&nr_comm_counters); - if (counter->attr.task) - atomic_dec(&nr_task_counters); + if (!event->parent) { + atomic_dec(&nr_events); + if (event->attr.mmap) + atomic_dec(&nr_mmap_events); + if (event->attr.comm) + atomic_dec(&nr_comm_events); + if (event->attr.task) + atomic_dec(&nr_task_events); } - if (counter->output) { - fput(counter->output->filp); - counter->output = NULL; + if (event->output) { + fput(event->output->filp); + event->output = NULL; } - if (counter->destroy) - counter->destroy(counter); + if (event->destroy) + event->destroy(event); - put_ctx(counter->ctx); - call_rcu(&counter->rcu_head, free_counter_rcu); + put_ctx(event->ctx); + call_rcu(&event->rcu_head, free_event_rcu); } /* @@ -1715,43 +1714,43 @@ static void free_counter(struct perf_counter *counter) */ static int perf_release(struct inode *inode, struct file *file) { - struct perf_counter *counter = file->private_data; - struct perf_counter_context *ctx = counter->ctx; + struct perf_event *event = file->private_data; + struct perf_event_context *ctx = event->ctx; file->private_data = NULL; WARN_ON_ONCE(ctx->parent_ctx); mutex_lock(&ctx->mutex); - perf_counter_remove_from_context(counter); + perf_event_remove_from_context(event); mutex_unlock(&ctx->mutex); - mutex_lock(&counter->owner->perf_counter_mutex); - list_del_init(&counter->owner_entry); - mutex_unlock(&counter->owner->perf_counter_mutex); - put_task_struct(counter->owner); + mutex_lock(&event->owner->perf_event_mutex); + list_del_init(&event->owner_entry); + mutex_unlock(&event->owner->perf_event_mutex); + put_task_struct(event->owner); - free_counter(counter); + free_event(event); return 0; } -static int perf_counter_read_size(struct perf_counter *counter) +static int perf_event_read_size(struct perf_event *event) { int entry = sizeof(u64); /* value */ int size = 0; int nr = 1; - if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) + if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) size += sizeof(u64); - if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) + if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) size += sizeof(u64); - if (counter->attr.read_format & PERF_FORMAT_ID) + if (event->attr.read_format & PERF_FORMAT_ID) entry += sizeof(u64); - if (counter->attr.read_format & PERF_FORMAT_GROUP) { - nr += counter->group_leader->nr_siblings; + if (event->attr.read_format & PERF_FORMAT_GROUP) { + nr += event->group_leader->nr_siblings; size += sizeof(u64); } @@ -1760,27 +1759,27 @@ static int perf_counter_read_size(struct perf_counter *counter) return size; } -static u64 perf_counter_read_value(struct perf_counter *counter) +static u64 perf_event_read_value(struct perf_event *event) { - struct perf_counter *child; + struct perf_event *child; u64 total = 0; - total += perf_counter_read(counter); - list_for_each_entry(child, &counter->child_list, child_list) - total += perf_counter_read(child); + total += perf_event_read(event); + list_for_each_entry(child, &event->child_list, child_list) + total += perf_event_read(child); return total; } -static int perf_counter_read_entry(struct perf_counter *counter, +static int perf_event_read_entry(struct perf_event *event, u64 read_format, char __user *buf) { int n = 0, count = 0; u64 values[2]; - values[n++] = perf_counter_read_value(counter); + values[n++] = perf_event_read_value(event); if (read_format & PERF_FORMAT_ID) - values[n++] = primary_counter_id(counter); + values[n++] = primary_event_id(event); count = n * sizeof(u64); @@ -1790,10 +1789,10 @@ static int perf_counter_read_entry(struct perf_counter *counter, return count; } -static int perf_counter_read_group(struct perf_counter *counter, +static int perf_event_read_group(struct perf_event *event, u64 read_format, char __user *buf) { - struct perf_counter *leader = counter->group_leader, *sub; + struct perf_event *leader = event->group_leader, *sub; int n = 0, size = 0, err = -EFAULT; u64 values[3]; @@ -1812,14 +1811,14 @@ static int perf_counter_read_group(struct perf_counter *counter, if (copy_to_user(buf, values, size)) return -EFAULT; - err = perf_counter_read_entry(leader, read_format, buf + size); + err = perf_event_read_entry(leader, read_format, buf + size); if (err < 0) return err; size += err; - list_for_each_entry(sub, &leader->sibling_list, list_entry) { - err = perf_counter_read_entry(sub, read_format, + list_for_each_entry(sub, &leader->sibling_list, group_entry) { + err = perf_event_read_entry(sub, read_format, buf + size); if (err < 0) return err; @@ -1830,23 +1829,23 @@ static int perf_counter_read_group(struct perf_counter *counter, return size; } -static int perf_counter_read_one(struct perf_counter *counter, +static int perf_event_read_one(struct perf_event *event, u64 read_format, char __user *buf) { u64 values[4]; int n = 0; - values[n++] = perf_counter_read_value(counter); + values[n++] = perf_event_read_value(event); if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { - values[n++] = counter->total_time_enabled + - atomic64_read(&counter->child_total_time_enabled); + values[n++] = event->total_time_enabled + + atomic64_read(&event->child_total_time_enabled); } if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { - values[n++] = counter->total_time_running + - atomic64_read(&counter->child_total_time_running); + values[n++] = event->total_time_running + + atomic64_read(&event->child_total_time_running); } if (read_format & PERF_FORMAT_ID) - values[n++] = primary_counter_id(counter); + values[n++] = primary_event_id(event); if (copy_to_user(buf, values, n * sizeof(u64))) return -EFAULT; @@ -1855,32 +1854,32 @@ static int perf_counter_read_one(struct perf_counter *counter, } /* - * Read the performance counter - simple non blocking version for now + * Read the performance event - simple non blocking version for now */ static ssize_t -perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) +perf_read_hw(struct perf_event *event, char __user *buf, size_t count) { - u64 read_format = counter->attr.read_format; + u64 read_format = event->attr.read_format; int ret; /* - * Return end-of-file for a read on a counter that is in + * Return end-of-file for a read on a event that is in * error state (i.e. because it was pinned but it couldn't be * scheduled on to the CPU at some point). */ - if (counter->state == PERF_COUNTER_STATE_ERROR) + if (event->state == PERF_EVENT_STATE_ERROR) return 0; - if (count < perf_counter_read_size(counter)) + if (count < perf_event_read_size(event)) return -ENOSPC; - WARN_ON_ONCE(counter->ctx->parent_ctx); - mutex_lock(&counter->child_mutex); + WARN_ON_ONCE(event->ctx->parent_ctx); + mutex_lock(&event->child_mutex); if (read_format & PERF_FORMAT_GROUP) - ret = perf_counter_read_group(counter, read_format, buf); + ret = perf_event_read_group(event, read_format, buf); else - ret = perf_counter_read_one(counter, read_format, buf); - mutex_unlock(&counter->child_mutex); + ret = perf_event_read_one(event, read_format, buf); + mutex_unlock(&event->child_mutex); return ret; } @@ -1888,79 +1887,79 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) static ssize_t perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { - struct perf_counter *counter = file->private_data; + struct perf_event *event = file->private_data; - return perf_read_hw(counter, buf, count); + return perf_read_hw(event, buf, count); } static unsigned int perf_poll(struct file *file, poll_table *wait) { - struct perf_counter *counter = file->private_data; + struct perf_event *event = file->private_data; struct perf_mmap_data *data; unsigned int events = POLL_HUP; rcu_read_lock(); - data = rcu_dereference(counter->data); + data = rcu_dereference(event->data); if (data) events = atomic_xchg(&data->poll, 0); rcu_read_unlock(); - poll_wait(file, &counter->waitq, wait); + poll_wait(file, &event->waitq, wait); return events; } -static void perf_counter_reset(struct perf_counter *counter) +static void perf_event_reset(struct perf_event *event) { - (void)perf_counter_read(counter); - atomic64_set(&counter->count, 0); - perf_counter_update_userpage(counter); + (void)perf_event_read(event); + atomic64_set(&event->count, 0); + perf_event_update_userpage(event); } /* - * Holding the top-level counter's child_mutex means that any - * descendant process that has inherited this counter will block - * in sync_child_counter if it goes to exit, thus satisfying the - * task existence requirements of perf_counter_enable/disable. + * Holding the top-level event's child_mutex means that any + * descendant process that has inherited this event will block + * in sync_child_event if it goes to exit, thus satisfying the + * task existence requirements of perf_event_enable/disable. */ -static void perf_counter_for_each_child(struct perf_counter *counter, - void (*func)(struct perf_counter *)) +static void perf_event_for_each_child(struct perf_event *event, + void (*func)(struct perf_event *)) { - struct perf_counter *child; + struct perf_event *child; - WARN_ON_ONCE(counter->ctx->parent_ctx); - mutex_lock(&counter->child_mutex); - func(counter); - list_for_each_entry(child, &counter->child_list, child_list) + WARN_ON_ONCE(event->ctx->parent_ctx); + mutex_lock(&event->child_mutex); + func(event); + list_for_each_entry(child, &event->child_list, child_list) func(child); - mutex_unlock(&counter->child_mutex); + mutex_unlock(&event->child_mutex); } -static void perf_counter_for_each(struct perf_counter *counter, - void (*func)(struct perf_counter *)) +static void perf_event_for_each(struct perf_event *event, + void (*func)(struct perf_event *)) { - struct perf_counter_context *ctx = counter->ctx; - struct perf_counter *sibling; + struct perf_event_context *ctx = event->ctx; + struct perf_event *sibling; WARN_ON_ONCE(ctx->parent_ctx); mutex_lock(&ctx->mutex); - counter = counter->group_leader; + event = event->group_leader; - perf_counter_for_each_child(counter, func); - func(counter); - list_for_each_entry(sibling, &counter->sibling_list, list_entry) - perf_counter_for_each_child(counter, func); + perf_event_for_each_child(event, func); + func(event); + list_for_each_entry(sibling, &event->sibling_list, group_entry) + perf_event_for_each_child(event, func); mutex_unlock(&ctx->mutex); } -static int perf_counter_period(struct perf_counter *counter, u64 __user *arg) +static int perf_event_period(struct perf_event *event, u64 __user *arg) { - struct perf_counter_context *ctx = counter->ctx; + struct perf_event_context *ctx = event->ctx; unsigned long size; int ret = 0; u64 value; - if (!counter->attr.sample_period) + if (!event->attr.sample_period) return -EINVAL; size = copy_from_user(&value, arg, sizeof(value)); @@ -1971,16 +1970,16 @@ static int perf_counter_period(struct perf_counter *counter, u64 __user *arg) return -EINVAL; spin_lock_irq(&ctx->lock); - if (counter->attr.freq) { - if (value > sysctl_perf_counter_sample_rate) { + if (event->attr.freq) { + if (value > sysctl_perf_event_sample_rate) { ret = -EINVAL; goto unlock; } - counter->attr.sample_freq = value; + event->attr.sample_freq = value; } else { - counter->attr.sample_period = value; - counter->hw.sample_period = value; + event->attr.sample_period = value; + event->hw.sample_period = value; } unlock: spin_unlock_irq(&ctx->lock); @@ -1988,80 +1987,80 @@ unlock: return ret; } -int perf_counter_set_output(struct perf_counter *counter, int output_fd); +int perf_event_set_output(struct perf_event *event, int output_fd); static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { - struct perf_counter *counter = file->private_data; - void (*func)(struct perf_counter *); + struct perf_event *event = file->private_data; + void (*func)(struct perf_event *); u32 flags = arg; switch (cmd) { - case PERF_COUNTER_IOC_ENABLE: - func = perf_counter_enable; + case PERF_EVENT_IOC_ENABLE: + func = perf_event_enable; break; - case PERF_COUNTER_IOC_DISABLE: - func = perf_counter_disable; + case PERF_EVENT_IOC_DISABLE: + func = perf_event_disable; break; - case PERF_COUNTER_IOC_RESET: - func = perf_counter_reset; + case PERF_EVENT_IOC_RESET: + func = perf_event_reset; break; - case PERF_COUNTER_IOC_REFRESH: - return perf_counter_refresh(counter, arg); + case PERF_EVENT_IOC_REFRESH: + return perf_event_refresh(event, arg); - case PERF_COUNTER_IOC_PERIOD: - return perf_counter_period(counter, (u64 __user *)arg); + case PERF_EVENT_IOC_PERIOD: + return perf_event_period(event, (u64 __user *)arg); - case PERF_COUNTER_IOC_SET_OUTPUT: - return perf_counter_set_output(counter, arg); + case PERF_EVENT_IOC_SET_OUTPUT: + return perf_event_set_output(event, arg); default: return -ENOTTY; } if (flags & PERF_IOC_FLAG_GROUP) - perf_counter_for_each(counter, func); + perf_event_for_each(event, func); else - perf_counter_for_each_child(counter, func); + perf_event_for_each_child(event, func); return 0; } -int perf_counter_task_enable(void) +int perf_event_task_enable(void) { - struct perf_counter *counter; + struct perf_event *event; - mutex_lock(¤t->perf_counter_mutex); - list_for_each_entry(counter, ¤t->perf_counter_list, owner_entry) - perf_counter_for_each_child(counter, perf_counter_enable); - mutex_unlock(¤t->perf_counter_mutex); + mutex_lock(¤t->perf_event_mutex); + list_for_each_entry(event, ¤t->perf_event_list, owner_entry) + perf_event_for_each_child(event, perf_event_enable); + mutex_unlock(¤t->perf_event_mutex); return 0; } -int perf_counter_task_disable(void) +int perf_event_task_disable(void) { - struct perf_counter *counter; + struct perf_event *event; - mutex_lock(¤t->perf_counter_mutex); - list_for_each_entry(counter, ¤t->perf_counter_list, owner_entry) - perf_counter_for_each_child(counter, perf_counter_disable); - mutex_unlock(¤t->perf_counter_mutex); + mutex_lock(¤t->perf_event_mutex); + list_for_each_entry(event, ¤t->perf_event_list, owner_entry) + perf_event_for_each_child(event, perf_event_disable); + mutex_unlock(¤t->perf_event_mutex); return 0; } -#ifndef PERF_COUNTER_INDEX_OFFSET -# define PERF_COUNTER_INDEX_OFFSET 0 +#ifndef PERF_EVENT_INDEX_OFFSET +# define PERF_EVENT_INDEX_OFFSET 0 #endif -static int perf_counter_index(struct perf_counter *counter) +static int perf_event_index(struct perf_event *event) { - if (counter->state != PERF_COUNTER_STATE_ACTIVE) + if (event->state != PERF_EVENT_STATE_ACTIVE) return 0; - return counter->hw.idx + 1 - PERF_COUNTER_INDEX_OFFSET; + return event->hw.idx + 1 - PERF_EVENT_INDEX_OFFSET; } /* @@ -2069,13 +2068,13 @@ static int perf_counter_index(struct perf_counter *counter) * the seqlock logic goes bad. We can not serialize this because the arch * code calls this from NMI context. */ -void perf_counter_update_userpage(struct perf_counter *counter) +void perf_event_update_userpage(struct perf_event *event) { - struct perf_counter_mmap_page *userpg; + struct perf_event_mmap_page *userpg; struct perf_mmap_data *data; rcu_read_lock(); - data = rcu_dereference(counter->data); + data = rcu_dereference(event->data); if (!data) goto unlock; @@ -2088,16 +2087,16 @@ void perf_counter_update_userpage(struct perf_counter *counter) preempt_disable(); ++userpg->lock; barrier(); - userpg->index = perf_counter_index(counter); - userpg->offset = atomic64_read(&counter->count); - if (counter->state == PERF_COUNTER_STATE_ACTIVE) - userpg->offset -= atomic64_read(&counter->hw.prev_count); + userpg->index = perf_event_index(event); + userpg->offset = atomic64_read(&event->count); + if (event->state == PERF_EVENT_STATE_ACTIVE) + userpg->offset -= atomic64_read(&event->hw.prev_count); - userpg->time_enabled = counter->total_time_enabled + - atomic64_read(&counter->child_total_time_enabled); + userpg->time_enabled = event->total_time_enabled + + atomic64_read(&event->child_total_time_enabled); - userpg->time_running = counter->total_time_running + - atomic64_read(&counter->child_total_time_running); + userpg->time_running = event->total_time_running + + atomic64_read(&event->child_total_time_running); barrier(); ++userpg->lock; @@ -2108,7 +2107,7 @@ unlock: static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { - struct perf_counter *counter = vma->vm_file->private_data; + struct perf_event *event = vma->vm_file->private_data; struct perf_mmap_data *data; int ret = VM_FAULT_SIGBUS; @@ -2119,7 +2118,7 @@ static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) } rcu_read_lock(); - data = rcu_dereference(counter->data); + data = rcu_dereference(event->data); if (!data) goto unlock; @@ -2148,13 +2147,13 @@ unlock: return ret; } -static int perf_mmap_data_alloc(struct perf_counter *counter, int nr_pages) +static int perf_mmap_data_alloc(struct perf_event *event, int nr_pages) { struct perf_mmap_data *data; unsigned long size; int i; - WARN_ON(atomic_read(&counter->mmap_count)); + WARN_ON(atomic_read(&event->mmap_count)); size = sizeof(struct perf_mmap_data); size += nr_pages * sizeof(void *); @@ -2176,14 +2175,14 @@ static int perf_mmap_data_alloc(struct perf_counter *counter, int nr_pages) data->nr_pages = nr_pages; atomic_set(&data->lock, -1); - if (counter->attr.watermark) { + if (event->attr.watermark) { data->watermark = min_t(long, PAGE_SIZE * nr_pages, - counter->attr.wakeup_watermark); + event->attr.wakeup_watermark); } if (!data->watermark) data->watermark = max(PAGE_SIZE, PAGE_SIZE * nr_pages / 4); - rcu_assign_pointer(counter->data, data); + rcu_assign_pointer(event->data, data); return 0; @@ -2222,35 +2221,35 @@ static void __perf_mmap_data_free(struct rcu_head *rcu_head) kfree(data); } -static void perf_mmap_data_free(struct perf_counter *counter) +static void perf_mmap_data_free(struct perf_event *event) { - struct perf_mmap_data *data = counter->data; + struct perf_mmap_data *data = event->data; - WARN_ON(atomic_read(&counter->mmap_count)); + WARN_ON(atomic_read(&event->mmap_count)); - rcu_assign_pointer(counter->data, NULL); + rcu_assign_pointer(event->data, NULL); call_rcu(&data->rcu_head, __perf_mmap_data_free); } static void perf_mmap_open(struct vm_area_struct *vma) { - struct perf_counter *counter = vma->vm_file->private_data; + struct perf_event *event = vma->vm_file->private_data; - atomic_inc(&counter->mmap_count); + atomic_inc(&event->mmap_count); } static void perf_mmap_close(struct vm_area_struct *vma) { - struct perf_counter *counter = vma->vm_file->private_data; + struct perf_event *event = vma->vm_file->private_data; - WARN_ON_ONCE(counter->ctx->parent_ctx); - if (atomic_dec_and_mutex_lock(&counter->mmap_count, &counter->mmap_mutex)) { + WARN_ON_ONCE(event->ctx->parent_ctx); + if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) { struct user_struct *user = current_user(); - atomic_long_sub(counter->data->nr_pages + 1, &user->locked_vm); - vma->vm_mm->locked_vm -= counter->data->nr_locked; - perf_mmap_data_free(counter); - mutex_unlock(&counter->mmap_mutex); + atomic_long_sub(event->data->nr_pages + 1, &user->locked_vm); + vma->vm_mm->locked_vm -= event->data->nr_locked; + perf_mmap_data_free(event); + mutex_unlock(&event->mmap_mutex); } } @@ -2263,7 +2262,7 @@ static struct vm_operations_struct perf_mmap_vmops = { static int perf_mmap(struct file *file, struct vm_area_struct *vma) { - struct perf_counter *counter = file->private_data; + struct perf_event *event = file->private_data; unsigned long user_locked, user_lock_limit; struct user_struct *user = current_user(); unsigned long locked, lock_limit; @@ -2291,21 +2290,21 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) if (vma->vm_pgoff != 0) return -EINVAL; - WARN_ON_ONCE(counter->ctx->parent_ctx); - mutex_lock(&counter->mmap_mutex); - if (counter->output) { + WARN_ON_ONCE(event->ctx->parent_ctx); + mutex_lock(&event->mmap_mutex); + if (event->output) { ret = -EINVAL; goto unlock; } - if (atomic_inc_not_zero(&counter->mmap_count)) { - if (nr_pages != counter->data->nr_pages) + if (atomic_inc_not_zero(&event->mmap_count)) { + if (nr_pages != event->data->nr_pages) ret = -EINVAL; goto unlock; } user_extra = nr_pages + 1; - user_lock_limit = sysctl_perf_counter_mlock >> (PAGE_SHIFT - 10); + user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10); /* * Increase the limit linearly with more CPUs: @@ -2328,20 +2327,20 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) goto unlock; } - WARN_ON(counter->data); - ret = perf_mmap_data_alloc(counter, nr_pages); + WARN_ON(event->data); + ret = perf_mmap_data_alloc(event, nr_pages); if (ret) goto unlock; - atomic_set(&counter->mmap_count, 1); + atomic_set(&event->mmap_count, 1); atomic_long_add(user_extra, &user->locked_vm); vma->vm_mm->locked_vm += extra; - counter->data->nr_locked = extra; + event->data->nr_locked = extra; if (vma->vm_flags & VM_WRITE) - counter->data->writable = 1; + event->data->writable = 1; unlock: - mutex_unlock(&counter->mmap_mutex); + mutex_unlock(&event->mmap_mutex); vma->vm_flags |= VM_RESERVED; vma->vm_ops = &perf_mmap_vmops; @@ -2352,11 +2351,11 @@ unlock: static int perf_fasync(int fd, struct file *filp, int on) { struct inode *inode = filp->f_path.dentry->d_inode; - struct perf_counter *counter = filp->private_data; + struct perf_event *event = filp->private_data; int retval; mutex_lock(&inode->i_mutex); - retval = fasync_helper(fd, filp, on, &counter->fasync); + retval = fasync_helper(fd, filp, on, &event->fasync); mutex_unlock(&inode->i_mutex); if (retval < 0) @@ -2376,19 +2375,19 @@ static const struct file_operations perf_fops = { }; /* - * Perf counter wakeup + * Perf event wakeup * * If there's data, ensure we set the poll() state and publish everything * to user-space before waking everybody up. */ -void perf_counter_wakeup(struct perf_counter *counter) +void perf_event_wakeup(struct perf_event *event) { - wake_up_all(&counter->waitq); + wake_up_all(&event->waitq); - if (counter->pending_kill) { - kill_fasync(&counter->fasync, SIGIO, counter->pending_kill); - counter->pending_kill = 0; + if (event->pending_kill) { + kill_fasync(&event->fasync, SIGIO, event->pending_kill); + event->pending_kill = 0; } } @@ -2401,19 +2400,19 @@ void perf_counter_wakeup(struct perf_counter *counter) * single linked list and use cmpxchg() to add entries lockless. */ -static void perf_pending_counter(struct perf_pending_entry *entry) +static void perf_pending_event(struct perf_pending_entry *entry) { - struct perf_counter *counter = container_of(entry, - struct perf_counter, pending); + struct perf_event *event = container_of(entry, + struct perf_event, pending); - if (counter->pending_disable) { - counter->pending_disable = 0; - __perf_counter_disable(counter); + if (event->pending_disable) { + event->pending_disable = 0; + __perf_event_disable(event); } - if (counter->pending_wakeup) { - counter->pending_wakeup = 0; - perf_counter_wakeup(counter); + if (event->pending_wakeup) { + event->pending_wakeup = 0; + perf_event_wakeup(event); } } @@ -2439,7 +2438,7 @@ static void perf_pending_queue(struct perf_pending_entry *entry, entry->next = *head; } while (cmpxchg(head, entry->next, entry) != entry->next); - set_perf_counter_pending(); + set_perf_event_pending(); put_cpu_var(perf_pending_head); } @@ -2472,7 +2471,7 @@ static int __perf_pending_run(void) return nr; } -static inline int perf_not_pending(struct perf_counter *counter) +static inline int perf_not_pending(struct perf_event *event) { /* * If we flush on whatever cpu we run, there is a chance we don't @@ -2487,15 +2486,15 @@ static inline int perf_not_pending(struct perf_counter *counter) * so that we do not miss the wakeup. -- see perf_pending_handle() */ smp_rmb(); - return counter->pending.next == NULL; + return event->pending.next == NULL; } -static void perf_pending_sync(struct perf_counter *counter) +static void perf_pending_sync(struct perf_event *event) { - wait_event(counter->waitq, perf_not_pending(counter)); + wait_event(event->waitq, perf_not_pending(event)); } -void perf_counter_do_pending(void) +void perf_event_do_pending(void) { __perf_pending_run(); } @@ -2536,25 +2535,25 @@ static void perf_output_wakeup(struct perf_output_handle *handle) atomic_set(&handle->data->poll, POLL_IN); if (handle->nmi) { - handle->counter->pending_wakeup = 1; - perf_pending_queue(&handle->counter->pending, - perf_pending_counter); + handle->event->pending_wakeup = 1; + perf_pending_queue(&handle->event->pending, + perf_pending_event); } else - perf_counter_wakeup(handle->counter); + perf_event_wakeup(handle->event); } /* * Curious locking construct. * - * We need to ensure a later event doesn't publish a head when a former - * event isn't done writing. However since we need to deal with NMIs we + * We need to ensure a later event_id doesn't publish a head when a former + * event_id isn't done writing. However since we need to deal with NMIs we * cannot fully serialize things. * * What we do is serialize between CPUs so we only have to deal with NMI * nesting on a single CPU. * * We only publish the head (and generate a wakeup) when the outer-most - * event completes. + * event_id completes. */ static void perf_output_lock(struct perf_output_handle *handle) { @@ -2658,10 +2657,10 @@ void perf_output_copy(struct perf_output_handle *handle, } int perf_output_begin(struct perf_output_handle *handle, - struct perf_counter *counter, unsigned int size, + struct perf_event *event, unsigned int size, int nmi, int sample) { - struct perf_counter *output_counter; + struct perf_event *output_event; struct perf_mmap_data *data; unsigned long tail, offset, head; int have_lost; @@ -2673,21 +2672,21 @@ int perf_output_begin(struct perf_output_handle *handle, rcu_read_lock(); /* - * For inherited counters we send all the output towards the parent. + * For inherited events we send all the output towards the parent. */ - if (counter->parent) - counter = counter->parent; + if (event->parent) + event = event->parent; - output_counter = rcu_dereference(counter->output); - if (output_counter) - counter = output_counter; + output_event = rcu_dereference(event->output); + if (output_event) + event = output_event; - data = rcu_dereference(counter->data); + data = rcu_dereference(event->data); if (!data) goto out; handle->data = data; - handle->counter = counter; + handle->event = event; handle->nmi = nmi; handle->sample = sample; @@ -2721,10 +2720,10 @@ int perf_output_begin(struct perf_output_handle *handle, atomic_set(&data->wakeup, 1); if (have_lost) { - lost_event.header.type = PERF_EVENT_LOST; + lost_event.header.type = PERF_RECORD_LOST; lost_event.header.misc = 0; lost_event.header.size = sizeof(lost_event); - lost_event.id = counter->id; + lost_event.id = event->id; lost_event.lost = atomic_xchg(&data->lost, 0); perf_output_put(handle, lost_event); @@ -2743,10 +2742,10 @@ out: void perf_output_end(struct perf_output_handle *handle) { - struct perf_counter *counter = handle->counter; + struct perf_event *event = handle->event; struct perf_mmap_data *data = handle->data; - int wakeup_events = counter->attr.wakeup_events; + int wakeup_events = event->attr.wakeup_events; if (handle->sample && wakeup_events) { int events = atomic_inc_return(&data->events); @@ -2760,58 +2759,58 @@ void perf_output_end(struct perf_output_handle *handle) rcu_read_unlock(); } -static u32 perf_counter_pid(struct perf_counter *counter, struct task_struct *p) +static u32 perf_event_pid(struct perf_event *event, struct task_struct *p) { /* - * only top level counters have the pid namespace they were created in + * only top level events have the pid namespace they were created in */ - if (counter->parent) - counter = counter->parent; + if (event->parent) + event = event->parent; - return task_tgid_nr_ns(p, counter->ns); + return task_tgid_nr_ns(p, event->ns); } -static u32 perf_counter_tid(struct perf_counter *counter, struct task_struct *p) +static u32 perf_event_tid(struct perf_event *event, struct task_struct *p) { /* - * only top level counters have the pid namespace they were created in + * only top level events have the pid namespace they were created in */ - if (counter->parent) - counter = counter->parent; + if (event->parent) + event = event->parent; - return task_pid_nr_ns(p, counter->ns); + return task_pid_nr_ns(p, event->ns); } static void perf_output_read_one(struct perf_output_handle *handle, - struct perf_counter *counter) + struct perf_event *event) { - u64 read_format = counter->attr.read_format; + u64 read_format = event->attr.read_format; u64 values[4]; int n = 0; - values[n++] = atomic64_read(&counter->count); + values[n++] = atomic64_read(&event->count); if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { - values[n++] = counter->total_time_enabled + - atomic64_read(&counter->child_total_time_enabled); + values[n++] = event->total_time_enabled + + atomic64_read(&event->child_total_time_enabled); } if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { - values[n++] = counter->total_time_running + - atomic64_read(&counter->child_total_time_running); + values[n++] = event->total_time_running + + atomic64_read(&event->child_total_time_running); } if (read_format & PERF_FORMAT_ID) - values[n++] = primary_counter_id(counter); + values[n++] = primary_event_id(event); perf_output_copy(handle, values, n * sizeof(u64)); } /* - * XXX PERF_FORMAT_GROUP vs inherited counters seems difficult. + * XXX PERF_FORMAT_GROUP vs inherited events seems difficult. */ static void perf_output_read_group(struct perf_output_handle *handle, - struct perf_counter *counter) + struct perf_event *event) { - struct perf_counter *leader = counter->group_leader, *sub; - u64 read_format = counter->attr.read_format; + struct perf_event *leader = event->group_leader, *sub; + u64 read_format = event->attr.read_format; u64 values[5]; int n = 0; @@ -2823,42 +2822,42 @@ static void perf_output_read_group(struct perf_output_handle *handle, if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) values[n++] = leader->total_time_running; - if (leader != counter) + if (leader != event) leader->pmu->read(leader); values[n++] = atomic64_read(&leader->count); if (read_format & PERF_FORMAT_ID) - values[n++] = primary_counter_id(leader); + values[n++] = primary_event_id(leader); perf_output_copy(handle, values, n * sizeof(u64)); - list_for_each_entry(sub, &leader->sibling_list, list_entry) { + list_for_each_entry(sub, &leader->sibling_list, group_entry) { n = 0; - if (sub != counter) + if (sub != event) sub->pmu->read(sub); values[n++] = atomic64_read(&sub->count); if (read_format & PERF_FORMAT_ID) - values[n++] = primary_counter_id(sub); + values[n++] = primary_event_id(sub); perf_output_copy(handle, values, n * sizeof(u64)); } } static void perf_output_read(struct perf_output_handle *handle, - struct perf_counter *counter) + struct perf_event *event) { - if (counter->attr.read_format & PERF_FORMAT_GROUP) - perf_output_read_group(handle, counter); + if (event->attr.read_format & PERF_FORMAT_GROUP) + perf_output_read_group(handle, event); else - perf_output_read_one(handle, counter); + perf_output_read_one(handle, event); } void perf_output_sample(struct perf_output_handle *handle, struct perf_event_header *header, struct perf_sample_data *data, - struct perf_counter *counter) + struct perf_event *event) { u64 sample_type = data->type; @@ -2889,7 +2888,7 @@ void perf_output_sample(struct perf_output_handle *handle, perf_output_put(handle, data->period); if (sample_type & PERF_SAMPLE_READ) - perf_output_read(handle, counter); + perf_output_read(handle, event); if (sample_type & PERF_SAMPLE_CALLCHAIN) { if (data->callchain) { @@ -2927,14 +2926,14 @@ void perf_output_sample(struct perf_output_handle *handle, void perf_prepare_sample(struct perf_event_header *header, struct perf_sample_data *data, - struct perf_counter *counter, + struct perf_event *event, struct pt_regs *regs) { - u64 sample_type = counter->attr.sample_type; + u64 sample_type = event->attr.sample_type; data->type = sample_type; - header->type = PERF_EVENT_SAMPLE; + header->type = PERF_RECORD_SAMPLE; header->size = sizeof(*header); header->misc = 0; @@ -2948,8 +2947,8 @@ void perf_prepare_sample(struct perf_event_header *header, if (sample_type & PERF_SAMPLE_TID) { /* namespace issues */ - data->tid_entry.pid = perf_counter_pid(counter, current); - data->tid_entry.tid = perf_counter_tid(counter, current); + data->tid_entry.pid = perf_event_pid(event, current); + data->tid_entry.tid = perf_event_tid(event, current); header->size += sizeof(data->tid_entry); } @@ -2964,13 +2963,13 @@ void perf_prepare_sample(struct perf_event_header *header, header->size += sizeof(data->addr); if (sample_type & PERF_SAMPLE_ID) { - data->id = primary_counter_id(counter); + data->id = primary_event_id(event); header->size += sizeof(data->id); } if (sample_type & PERF_SAMPLE_STREAM_ID) { - data->stream_id = counter->id; + data->stream_id = event->id; header->size += sizeof(data->stream_id); } @@ -2986,7 +2985,7 @@ void perf_prepare_sample(struct perf_event_header *header, header->size += sizeof(data->period); if (sample_type & PERF_SAMPLE_READ) - header->size += perf_counter_read_size(counter); + header->size += perf_event_read_size(event); if (sample_type & PERF_SAMPLE_CALLCHAIN) { int size = 1; @@ -3012,25 +3011,25 @@ void perf_prepare_sample(struct perf_event_header *header, } } -static void perf_counter_output(struct perf_counter *counter, int nmi, +static void perf_event_output(struct perf_event *event, int nmi, struct perf_sample_data *data, struct pt_regs *regs) { struct perf_output_handle handle; struct perf_event_header header; - perf_prepare_sample(&header, data, counter, regs); + perf_prepare_sample(&header, data, event, regs); - if (perf_output_begin(&handle, counter, header.size, nmi, 1)) + if (perf_output_begin(&handle, event, header.size, nmi, 1)) return; - perf_output_sample(&handle, &header, data, counter); + perf_output_sample(&handle, &header, data, event); perf_output_end(&handle); } /* - * read event + * read event_id */ struct perf_read_event { @@ -3041,27 +3040,27 @@ struct perf_read_event { }; static void -perf_counter_read_event(struct perf_counter *counter, +perf_event_read_event(struct perf_event *event, struct task_struct *task) { struct perf_output_handle handle; - struct perf_read_event event = { + struct perf_read_event read_event = { .header = { - .type = PERF_EVENT_READ, + .type = PERF_RECORD_READ, .misc = 0, - .size = sizeof(event) + perf_counter_read_size(counter), + .size = sizeof(read_event) + perf_event_read_size(event), }, - .pid = perf_counter_pid(counter, task), - .tid = perf_counter_tid(counter, task), + .pid = perf_event_pid(event, task), + .tid = perf_event_tid(event, task), }; int ret; - ret = perf_output_begin(&handle, counter, event.header.size, 0, 0); + ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0); if (ret) return; - perf_output_put(&handle, event); - perf_output_read(&handle, counter); + perf_output_put(&handle, read_event); + perf_output_read(&handle, event); perf_output_end(&handle); } @@ -3074,7 +3073,7 @@ perf_counter_read_event(struct perf_counter *counter, struct perf_task_event { struct task_struct *task; - struct perf_counter_context *task_ctx; + struct perf_event_context *task_ctx; struct { struct perf_event_header header; @@ -3084,10 +3083,10 @@ struct perf_task_event { u32 tid; u32 ptid; u64 time; - } event; + } event_id; }; -static void perf_counter_task_output(struct perf_counter *counter, +static void perf_event_task_output(struct perf_event *event, struct perf_task_event *task_event) { struct perf_output_handle handle; @@ -3095,85 +3094,85 @@ static void perf_counter_task_output(struct perf_counter *counter, struct task_struct *task = task_event->task; int ret; - size = task_event->event.header.size; - ret = perf_output_begin(&handle, counter, size, 0, 0); + size = task_event->event_id.header.size; + ret = perf_output_begin(&handle, event, size, 0, 0); if (ret) return; - task_event->event.pid = perf_counter_pid(counter, task); - task_event->event.ppid = perf_counter_pid(counter, current); + task_event->event_id.pid = perf_event_pid(event, task); + task_event->event_id.ppid = perf_event_pid(event, current); - task_event->event.tid = perf_counter_tid(counter, task); - task_event->event.ptid = perf_counter_tid(counter, current); + task_event->event_id.tid = perf_event_tid(event, task); + task_event->event_id.ptid = perf_event_tid(event, current); - task_event->event.time = perf_clock(); + task_event->event_id.time = perf_clock(); - perf_output_put(&handle, task_event->event); + perf_output_put(&handle, task_event->event_id); perf_output_end(&handle); } -static int perf_counter_task_match(struct perf_counter *counter) +static int perf_event_task_match(struct perf_event *event) { - if (counter->attr.comm || counter->attr.mmap || counter->attr.task) + if (event->attr.comm || event->attr.mmap || event->attr.task) return 1; return 0; } -static void perf_counter_task_ctx(struct perf_counter_context *ctx, +static void perf_event_task_ctx(struct perf_event_context *ctx, struct perf_task_event *task_event) { - struct perf_counter *counter; + struct perf_event *event; if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) return; rcu_read_lock(); - list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { - if (perf_counter_task_match(counter)) - perf_counter_task_output(counter, task_event); + list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { + if (perf_event_task_match(event)) + perf_event_task_output(event, task_event); } rcu_read_unlock(); } -static void perf_counter_task_event(struct perf_task_event *task_event) +static void perf_event_task_event(struct perf_task_event *task_event) { struct perf_cpu_context *cpuctx; - struct perf_counter_context *ctx = task_event->task_ctx; + struct perf_event_context *ctx = task_event->task_ctx; cpuctx = &get_cpu_var(perf_cpu_context); - perf_counter_task_ctx(&cpuctx->ctx, task_event); + perf_event_task_ctx(&cpuctx->ctx, task_event); put_cpu_var(perf_cpu_context); rcu_read_lock(); if (!ctx) - ctx = rcu_dereference(task_event->task->perf_counter_ctxp); + ctx = rcu_dereference(task_event->task->perf_event_ctxp); if (ctx) - perf_counter_task_ctx(ctx, task_event); + perf_event_task_ctx(ctx, task_event); rcu_read_unlock(); } -static void perf_counter_task(struct task_struct *task, - struct perf_counter_context *task_ctx, +static void perf_event_task(struct task_struct *task, + struct perf_event_context *task_ctx, int new) { struct perf_task_event task_event; - if (!atomic_read(&nr_comm_counters) && - !atomic_read(&nr_mmap_counters) && - !atomic_read(&nr_task_counters)) + if (!atomic_read(&nr_comm_events) && + !atomic_read(&nr_mmap_events) && + !atomic_read(&nr_task_events)) return; task_event = (struct perf_task_event){ .task = task, .task_ctx = task_ctx, - .event = { + .event_id = { .header = { - .type = new ? PERF_EVENT_FORK : PERF_EVENT_EXIT, + .type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT, .misc = 0, - .size = sizeof(task_event.event), + .size = sizeof(task_event.event_id), }, /* .pid */ /* .ppid */ @@ -3182,12 +3181,12 @@ static void perf_counter_task(struct task_struct *task, }, }; - perf_counter_task_event(&task_event); + perf_event_task_event(&task_event); } -void perf_counter_fork(struct task_struct *task) +void perf_event_fork(struct task_struct *task) { - perf_counter_task(task, NULL, 1); + perf_event_task(task, NULL, 1); } /* @@ -3204,56 +3203,56 @@ struct perf_comm_event { u32 pid; u32 tid; - } event; + } event_id; }; -static void perf_counter_comm_output(struct perf_counter *counter, +static void perf_event_comm_output(struct perf_event *event, struct perf_comm_event *comm_event) { struct perf_output_handle handle; - int size = comm_event->event.header.size; - int ret = perf_output_begin(&handle, counter, size, 0, 0); + int size = comm_event->event_id.header.size; + int ret = perf_output_begin(&handle, event, size, 0, 0); if (ret) return; - comm_event->event.pid = perf_counter_pid(counter, comm_event->task); - comm_event->event.tid = perf_counter_tid(counter, comm_event->task); + comm_event->event_id.pid = perf_event_pid(event, comm_event->task); + comm_event->event_id.tid = perf_event_tid(event, comm_event->task); - perf_output_put(&handle, comm_event->event); + perf_output_put(&handle, comm_event->event_id); perf_output_copy(&handle, comm_event->comm, comm_event->comm_size); perf_output_end(&handle); } -static int perf_counter_comm_match(struct perf_counter *counter) +static int perf_event_comm_match(struct perf_event *event) { - if (counter->attr.comm) + if (event->attr.comm) return 1; return 0; } -static void perf_counter_comm_ctx(struct perf_counter_context *ctx, +static void perf_event_comm_ctx(struct perf_event_context *ctx, struct perf_comm_event *comm_event) { - struct perf_counter *counter; + struct perf_event *event; if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) return; rcu_read_lock(); - list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { - if (perf_counter_comm_match(counter)) - perf_counter_comm_output(counter, comm_event); + list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { + if (perf_event_comm_match(event)) + perf_event_comm_output(event, comm_event); } rcu_read_unlock(); } -static void perf_counter_comm_event(struct perf_comm_event *comm_event) +static void perf_event_comm_event(struct perf_comm_event *comm_event) { struct perf_cpu_context *cpuctx; - struct perf_counter_context *ctx; + struct perf_event_context *ctx; unsigned int size; char comm[TASK_COMM_LEN]; @@ -3264,10 +3263,10 @@ static void perf_counter_comm_event(struct perf_comm_event *comm_event) comm_event->comm = comm; comm_event->comm_size = size; - comm_event->event.header.size = sizeof(comm_event->event) + size; + comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; cpuctx = &get_cpu_var(perf_cpu_context); - perf_counter_comm_ctx(&cpuctx->ctx, comm_event); + perf_event_comm_ctx(&cpuctx->ctx, comm_event); put_cpu_var(perf_cpu_context); rcu_read_lock(); @@ -3275,29 +3274,29 @@ static void perf_counter_comm_event(struct perf_comm_event *comm_event) * doesn't really matter which of the child contexts the * events ends up in. */ - ctx = rcu_dereference(current->perf_counter_ctxp); + ctx = rcu_dereference(current->perf_event_ctxp); if (ctx) - perf_counter_comm_ctx(ctx, comm_event); + perf_event_comm_ctx(ctx, comm_event); rcu_read_unlock(); } -void perf_counter_comm(struct task_struct *task) +void perf_event_comm(struct task_struct *task) { struct perf_comm_event comm_event; - if (task->perf_counter_ctxp) - perf_counter_enable_on_exec(task); + if (task->perf_event_ctxp) + perf_event_enable_on_exec(task); - if (!atomic_read(&nr_comm_counters)) + if (!atomic_read(&nr_comm_events)) return; comm_event = (struct perf_comm_event){ .task = task, /* .comm */ /* .comm_size */ - .event = { + .event_id = { .header = { - .type = PERF_EVENT_COMM, + .type = PERF_RECORD_COMM, .misc = 0, /* .size */ }, @@ -3306,7 +3305,7 @@ void perf_counter_comm(struct task_struct *task) }, }; - perf_counter_comm_event(&comm_event); + perf_event_comm_event(&comm_event); } /* @@ -3327,57 +3326,57 @@ struct perf_mmap_event { u64 start; u64 len; u64 pgoff; - } event; + } event_id; }; -static void perf_counter_mmap_output(struct perf_counter *counter, +static void perf_event_mmap_output(struct perf_event *event, struct perf_mmap_event *mmap_event) { struct perf_output_handle handle; - int size = mmap_event->event.header.size; - int ret = perf_output_begin(&handle, counter, size, 0, 0); + int size = mmap_event->event_id.header.size; + int ret = perf_output_begin(&handle, event, size, 0, 0); if (ret) return; - mmap_event->event.pid = perf_counter_pid(counter, current); - mmap_event->event.tid = perf_counter_tid(counter, current); + mmap_event->event_id.pid = perf_event_pid(event, current); + mmap_event->event_id.tid = perf_event_tid(event, current); - perf_output_put(&handle, mmap_event->event); + perf_output_put(&handle, mmap_event->event_id); perf_output_copy(&handle, mmap_event->file_name, mmap_event->file_size); perf_output_end(&handle); } -static int perf_counter_mmap_match(struct perf_counter *counter, +static int perf_event_mmap_match(struct perf_event *event, struct perf_mmap_event *mmap_event) { - if (counter->attr.mmap) + if (event->attr.mmap) return 1; return 0; } -static void perf_counter_mmap_ctx(struct perf_counter_context *ctx, +static void perf_event_mmap_ctx(struct perf_event_context *ctx, struct perf_mmap_event *mmap_event) { - struct perf_counter *counter; + struct perf_event *event; if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) return; rcu_read_lock(); - list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { - if (perf_counter_mmap_match(counter, mmap_event)) - perf_counter_mmap_output(counter, mmap_event); + list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { + if (perf_event_mmap_match(event, mmap_event)) + perf_event_mmap_output(event, mmap_event); } rcu_read_unlock(); } -static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event) +static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) { struct perf_cpu_context *cpuctx; - struct perf_counter_context *ctx; + struct perf_event_context *ctx; struct vm_area_struct *vma = mmap_event->vma; struct file *file = vma->vm_file; unsigned int size; @@ -3425,10 +3424,10 @@ got_name: mmap_event->file_name = name; mmap_event->file_size = size; - mmap_event->event.header.size = sizeof(mmap_event->event) + size; + mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; cpuctx = &get_cpu_var(perf_cpu_context); - perf_counter_mmap_ctx(&cpuctx->ctx, mmap_event); + perf_event_mmap_ctx(&cpuctx->ctx, mmap_event); put_cpu_var(perf_cpu_context); rcu_read_lock(); @@ -3436,28 +3435,28 @@ got_name: * doesn't really matter which of the child contexts the * events ends up in. */ - ctx = rcu_dereference(current->perf_counter_ctxp); + ctx = rcu_dereference(current->perf_event_ctxp); if (ctx) - perf_counter_mmap_ctx(ctx, mmap_event); + perf_event_mmap_ctx(ctx, mmap_event); rcu_read_unlock(); kfree(buf); } -void __perf_counter_mmap(struct vm_area_struct *vma) +void __perf_event_mmap(struct vm_area_struct *vma) { struct perf_mmap_event mmap_event; - if (!atomic_read(&nr_mmap_counters)) + if (!atomic_read(&nr_mmap_events)) return; mmap_event = (struct perf_mmap_event){ .vma = vma, /* .file_name */ /* .file_size */ - .event = { + .event_id = { .header = { - .type = PERF_EVENT_MMAP, + .type = PERF_RECORD_MMAP, .misc = 0, /* .size */ }, @@ -3469,14 +3468,14 @@ void __perf_counter_mmap(struct vm_area_struct *vma) }, }; - perf_counter_mmap_event(&mmap_event); + perf_event_mmap_event(&mmap_event); } /* * IRQ throttle logging */ -static void perf_log_throttle(struct perf_counter *counter, int enable) +static void perf_log_throttle(struct perf_event *event, int enable) { struct perf_output_handle handle; int ret; @@ -3488,19 +3487,19 @@ static void perf_log_throttle(struct perf_counter *counter, int enable) u64 stream_id; } throttle_event = { .header = { - .type = PERF_EVENT_THROTTLE, + .type = PERF_RECORD_THROTTLE, .misc = 0, .size = sizeof(throttle_event), }, .time = perf_clock(), - .id = primary_counter_id(counter), - .stream_id = counter->id, + .id = primary_event_id(event), + .stream_id = event->id, }; if (enable) - throttle_event.header.type = PERF_EVENT_UNTHROTTLE; + throttle_event.header.type = PERF_RECORD_UNTHROTTLE; - ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 1, 0); + ret = perf_output_begin(&handle, event, sizeof(throttle_event), 1, 0); if (ret) return; @@ -3509,18 +3508,18 @@ static void perf_log_throttle(struct perf_counter *counter, int enable) } /* - * Generic counter overflow handling, sampling. + * Generic event overflow handling, sampling. */ -static int __perf_counter_overflow(struct perf_counter *counter, int nmi, +static int __perf_event_overflow(struct perf_event *event, int nmi, int throttle, struct perf_sample_data *data, struct pt_regs *regs) { - int events = atomic_read(&counter->event_limit); - struct hw_perf_counter *hwc = &counter->hw; + int events = atomic_read(&event->event_limit); + struct hw_perf_event *hwc = &event->hw; int ret = 0; - throttle = (throttle && counter->pmu->unthrottle != NULL); + throttle = (throttle && event->pmu->unthrottle != NULL); if (!throttle) { hwc->interrupts++; @@ -3528,73 +3527,73 @@ static int __perf_counter_overflow(struct perf_counter *counter, int nmi, if (hwc->interrupts != MAX_INTERRUPTS) { hwc->interrupts++; if (HZ * hwc->interrupts > - (u64)sysctl_perf_counter_sample_rate) { + (u64)sysctl_perf_event_sample_rate) { hwc->interrupts = MAX_INTERRUPTS; - perf_log_throttle(counter, 0); + perf_log_throttle(event, 0); ret = 1; } } else { /* - * Keep re-disabling counters even though on the previous + * Keep re-disabling events even though on the previous * pass we disabled it - just in case we raced with a - * sched-in and the counter got enabled again: + * sched-in and the event got enabled again: */ ret = 1; } } - if (counter->attr.freq) { + if (event->attr.freq) { u64 now = perf_clock(); s64 delta = now - hwc->freq_stamp; hwc->freq_stamp = now; if (delta > 0 && delta < TICK_NSEC) - perf_adjust_period(counter, NSEC_PER_SEC / (int)delta); + perf_adjust_period(event, NSEC_PER_SEC / (int)delta); } /* * XXX event_limit might not quite work as expected on inherited - * counters + * events */ - counter->pending_kill = POLL_IN; - if (events && atomic_dec_and_test(&counter->event_limit)) { + event->pending_kill = POLL_IN; + if (events && atomic_dec_and_test(&event->event_limit)) { ret = 1; - counter->pending_kill = POLL_HUP; + event->pending_kill = POLL_HUP; if (nmi) { - counter->pending_disable = 1; - perf_pending_queue(&counter->pending, - perf_pending_counter); + event->pending_disable = 1; + perf_pending_queue(&event->pending, + perf_pending_event); } else - perf_counter_disable(counter); + perf_event_disable(event); } - perf_counter_output(counter, nmi, data, regs); + perf_event_output(event, nmi, data, regs); return ret; } -int perf_counter_overflow(struct perf_counter *counter, int nmi, +int perf_event_overflow(struct perf_event *event, int nmi, struct perf_sample_data *data, struct pt_regs *regs) { - return __perf_counter_overflow(counter, nmi, 1, data, regs); + return __perf_event_overflow(event, nmi, 1, data, regs); } /* - * Generic software counter infrastructure + * Generic software event infrastructure */ /* - * We directly increment counter->count and keep a second value in - * counter->hw.period_left to count intervals. This period counter + * We directly increment event->count and keep a second value in + * event->hw.period_left to count intervals. This period event * is kept in the range [-sample_period, 0] so that we can use the * sign as trigger. */ -static u64 perf_swcounter_set_period(struct perf_counter *counter) +static u64 perf_swevent_set_period(struct perf_event *event) { - struct hw_perf_counter *hwc = &counter->hw; + struct hw_perf_event *hwc = &event->hw; u64 period = hwc->last_period; u64 nr, offset; s64 old, val; @@ -3615,22 +3614,22 @@ again: return nr; } -static void perf_swcounter_overflow(struct perf_counter *counter, +static void perf_swevent_overflow(struct perf_event *event, int nmi, struct perf_sample_data *data, struct pt_regs *regs) { - struct hw_perf_counter *hwc = &counter->hw; + struct hw_perf_event *hwc = &event->hw; int throttle = 0; u64 overflow; - data->period = counter->hw.last_period; - overflow = perf_swcounter_set_period(counter); + data->period = event->hw.last_period; + overflow = perf_swevent_set_period(event); if (hwc->interrupts == MAX_INTERRUPTS) return; for (; overflow; overflow--) { - if (__perf_counter_overflow(counter, nmi, throttle, + if (__perf_event_overflow(event, nmi, throttle, data, regs)) { /* * We inhibit the overflow from happening when @@ -3642,20 +3641,20 @@ static void perf_swcounter_overflow(struct perf_counter *counter, } } -static void perf_swcounter_unthrottle(struct perf_counter *counter) +static void perf_swevent_unthrottle(struct perf_event *event) { /* * Nothing to do, we already reset hwc->interrupts. */ } -static void perf_swcounter_add(struct perf_counter *counter, u64 nr, +static void perf_swevent_add(struct perf_event *event, u64 nr, int nmi, struct perf_sample_data *data, struct pt_regs *regs) { - struct hw_perf_counter *hwc = &counter->hw; + struct hw_perf_event *hwc = &event->hw; - atomic64_add(nr, &counter->count); + atomic64_add(nr, &event->count); if (!hwc->sample_period) return; @@ -3664,29 +3663,29 @@ static void perf_swcounter_add(struct perf_counter *counter, u64 nr, return; if (!atomic64_add_negative(nr, &hwc->period_left)) - perf_swcounter_overflow(counter, nmi, data, regs); + perf_swevent_overflow(event, nmi, data, regs); } -static int perf_swcounter_is_counting(struct perf_counter *counter) +static int perf_swevent_is_counting(struct perf_event *event) { /* - * The counter is active, we're good! + * The event is active, we're good! */ - if (counter->state == PERF_COUNTER_STATE_ACTIVE) + if (event->state == PERF_EVENT_STATE_ACTIVE) return 1; /* - * The counter is off/error, not counting. + * The event is off/error, not counting. */ - if (counter->state != PERF_COUNTER_STATE_INACTIVE) + if (event->state != PERF_EVENT_STATE_INACTIVE) return 0; /* - * The counter is inactive, if the context is active + * The event is inactive, if the context is active * we're part of a group that didn't make it on the 'pmu', * not counting. */ - if (counter->ctx->is_active) + if (event->ctx->is_active) return 0; /* @@ -3697,49 +3696,49 @@ static int perf_swcounter_is_counting(struct perf_counter *counter) return 1; } -static int perf_swcounter_match(struct perf_counter *counter, +static int perf_swevent_match(struct perf_event *event, enum perf_type_id type, - u32 event, struct pt_regs *regs) + u32 event_id, struct pt_regs *regs) { - if (!perf_swcounter_is_counting(counter)) + if (!perf_swevent_is_counting(event)) return 0; - if (counter->attr.type != type) + if (event->attr.type != type) return 0; - if (counter->attr.config != event) + if (event->attr.config != event_id) return 0; if (regs) { - if (counter->attr.exclude_user && user_mode(regs)) + if (event->attr.exclude_user && user_mode(regs)) return 0; - if (counter->attr.exclude_kernel && !user_mode(regs)) + if (event->attr.exclude_kernel && !user_mode(regs)) return 0; } return 1; } -static void perf_swcounter_ctx_event(struct perf_counter_context *ctx, +static void perf_swevent_ctx_event(struct perf_event_context *ctx, enum perf_type_id type, - u32 event, u64 nr, int nmi, + u32 event_id, u64 nr, int nmi, struct perf_sample_data *data, struct pt_regs *regs) { - struct perf_counter *counter; + struct perf_event *event; if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) return; rcu_read_lock(); - list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { - if (perf_swcounter_match(counter, type, event, regs)) - perf_swcounter_add(counter, nr, nmi, data, regs); + list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { + if (perf_swevent_match(event, type, event_id, regs)) + perf_swevent_add(event, nr, nmi, data, regs); } rcu_read_unlock(); } -static int *perf_swcounter_recursion_context(struct perf_cpu_context *cpuctx) +static int *perf_swevent_recursion_context(struct perf_cpu_context *cpuctx) { if (in_nmi()) return &cpuctx->recursion[3]; @@ -3753,14 +3752,14 @@ static int *perf_swcounter_recursion_context(struct perf_cpu_context *cpuctx) return &cpuctx->recursion[0]; } -static void do_perf_swcounter_event(enum perf_type_id type, u32 event, +static void do_perf_sw_event(enum perf_type_id type, u32 event_id, u64 nr, int nmi, struct perf_sample_data *data, struct pt_regs *regs) { struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context); - int *recursion = perf_swcounter_recursion_context(cpuctx); - struct perf_counter_context *ctx; + int *recursion = perf_swevent_recursion_context(cpuctx); + struct perf_event_context *ctx; if (*recursion) goto out; @@ -3768,16 +3767,16 @@ static void do_perf_swcounter_event(enum perf_type_id type, u32 event, (*recursion)++; barrier(); - perf_swcounter_ctx_event(&cpuctx->ctx, type, event, + perf_swevent_ctx_event(&cpuctx->ctx, type, event_id, nr, nmi, data, regs); rcu_read_lock(); /* * doesn't really matter which of the child contexts the * events ends up in. */ - ctx = rcu_dereference(current->perf_counter_ctxp); + ctx = rcu_dereference(current->perf_event_ctxp); if (ctx) - perf_swcounter_ctx_event(ctx, type, event, nr, nmi, data, regs); + perf_swevent_ctx_event(ctx, type, event_id, nr, nmi, data, regs); rcu_read_unlock(); barrier(); @@ -3787,57 +3786,57 @@ out: put_cpu_var(perf_cpu_context); } -void __perf_swcounter_event(u32 event, u64 nr, int nmi, +void __perf_sw_event(u32 event_id, u64 nr, int nmi, struct pt_regs *regs, u64 addr) { struct perf_sample_data data = { .addr = addr, }; - do_perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, + do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs); } -static void perf_swcounter_read(struct perf_counter *counter) +static void perf_swevent_read(struct perf_event *event) { } -static int perf_swcounter_enable(struct perf_counter *counter) +static int perf_swevent_enable(struct perf_event *event) { - struct hw_perf_counter *hwc = &counter->hw; + struct hw_perf_event *hwc = &event->hw; if (hwc->sample_period) { hwc->last_period = hwc->sample_period; - perf_swcounter_set_period(counter); + perf_swevent_set_period(event); } return 0; } -static void perf_swcounter_disable(struct perf_counter *counter) +static void perf_swevent_disable(struct perf_event *event) { } static const struct pmu perf_ops_generic = { - .enable = perf_swcounter_enable, - .disable = perf_swcounter_disable, - .read = perf_swcounter_read, - .unthrottle = perf_swcounter_unthrottle, + .enable = perf_swevent_enable, + .disable = perf_swevent_disable, + .read = perf_swevent_read, + .unthrottle = perf_swevent_unthrottle, }; /* - * hrtimer based swcounter callback + * hrtimer based swevent callback */ -static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) +static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) { enum hrtimer_restart ret = HRTIMER_RESTART; struct perf_sample_data data; struct pt_regs *regs; - struct perf_counter *counter; + struct perf_event *event; u64 period; - counter = container_of(hrtimer, struct perf_counter, hw.hrtimer); - counter->pmu->read(counter); + event = container_of(hrtimer, struct perf_event, hw.hrtimer); + event->pmu->read(event); data.addr = 0; regs = get_irq_regs(); @@ -3845,45 +3844,45 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) * In case we exclude kernel IPs or are somehow not in interrupt * context, provide the next best thing, the user IP. */ - if ((counter->attr.exclude_kernel || !regs) && - !counter->attr.exclude_user) + if ((event->attr.exclude_kernel || !regs) && + !event->attr.exclude_user) regs = task_pt_regs(current); if (regs) { - if (perf_counter_overflow(counter, 0, &data, regs)) + if (perf_event_overflow(event, 0, &data, regs)) ret = HRTIMER_NORESTART; } - period = max_t(u64, 10000, counter->hw.sample_period); + period = max_t(u64, 10000, event->hw.sample_period); hrtimer_forward_now(hrtimer, ns_to_ktime(period)); return ret; } /* - * Software counter: cpu wall time clock + * Software event: cpu wall time clock */ -static void cpu_clock_perf_counter_update(struct perf_counter *counter) +static void cpu_clock_perf_event_update(struct perf_event *event) { int cpu = raw_smp_processor_id(); s64 prev; u64 now; now = cpu_clock(cpu); - prev = atomic64_read(&counter->hw.prev_count); - atomic64_set(&counter->hw.prev_count, now); - atomic64_add(now - prev, &counter->count); + prev = atomic64_read(&event->hw.prev_count); + atomic64_set(&event->hw.prev_count, now); + atomic64_add(now - prev, &event->count); } -static int cpu_clock_perf_counter_enable(struct perf_counter *counter) +static int cpu_clock_perf_event_enable(struct perf_event *event) { - struct hw_perf_counter *hwc = &counter->hw; + struct hw_perf_event *hwc = &event->hw; int cpu = raw_smp_processor_id(); atomic64_set(&hwc->prev_count, cpu_clock(cpu)); hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - hwc->hrtimer.function = perf_swcounter_hrtimer; + hwc->hrtimer.function = perf_swevent_hrtimer; if (hwc->sample_period) { u64 period = max_t(u64, 10000, hwc->sample_period); __hrtimer_start_range_ns(&hwc->hrtimer, @@ -3894,48 +3893,48 @@ static int cpu_clock_perf_counter_enable(struct perf_counter *counter) return 0; } -static void cpu_clock_perf_counter_disable(struct perf_counter *counter) +static void cpu_clock_perf_event_disable(struct perf_event *event) { - if (counter->hw.sample_period) - hrtimer_cancel(&counter->hw.hrtimer); - cpu_clock_perf_counter_update(counter); + if (event->hw.sample_period) + hrtimer_cancel(&event->hw.hrtimer); + cpu_clock_perf_event_update(event); } -static void cpu_clock_perf_counter_read(struct perf_counter *counter) +static void cpu_clock_perf_event_read(struct perf_event *event) { - cpu_clock_perf_counter_update(counter); + cpu_clock_perf_event_update(event); } static const struct pmu perf_ops_cpu_clock = { - .enable = cpu_clock_perf_counter_enable, - .disable = cpu_clock_perf_counter_disable, - .read = cpu_clock_perf_counter_read, + .enable = cpu_clock_perf_event_enable, + .disable = cpu_clock_perf_event_disable, + .read = cpu_clock_perf_event_read, }; /* - * Software counter: task time clock + * Software event: task time clock */ -static void task_clock_perf_counter_update(struct perf_counter *counter, u64 now) +static void task_clock_perf_event_update(struct perf_event *event, u64 now) { u64 prev; s64 delta; - prev = atomic64_xchg(&counter->hw.prev_count, now); + prev = atomic64_xchg(&event->hw.prev_count, now); delta = now - prev; - atomic64_add(delta, &counter->count); + atomic64_add(delta, &event->count); } -static int task_clock_perf_counter_enable(struct perf_counter *counter) +static int task_clock_perf_event_enable(struct perf_event *event) { - struct hw_perf_counter *hwc = &counter->hw; + struct hw_perf_event *hwc = &event->hw; u64 now; - now = counter->ctx->time; + now = event->ctx->time; atomic64_set(&hwc->prev_count, now); hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - hwc->hrtimer.function = perf_swcounter_hrtimer; + hwc->hrtimer.function = perf_swevent_hrtimer; if (hwc->sample_period) { u64 period = max_t(u64, 10000, hwc->sample_period); __hrtimer_start_range_ns(&hwc->hrtimer, @@ -3946,38 +3945,38 @@ static int task_clock_perf_counter_enable(struct perf_counter *counter) return 0; } -static void task_clock_perf_counter_disable(struct perf_counter *counter) +static void task_clock_perf_event_disable(struct perf_event *event) { - if (counter->hw.sample_period) - hrtimer_cancel(&counter->hw.hrtimer); - task_clock_perf_counter_update(counter, counter->ctx->time); + if (event->hw.sample_period) + hrtimer_cancel(&event->hw.hrtimer); + task_clock_perf_event_update(event, event->ctx->time); } -static void task_clock_perf_counter_read(struct perf_counter *counter) +static void task_clock_perf_event_read(struct perf_event *event) { u64 time; if (!in_nmi()) { - update_context_time(counter->ctx); - time = counter->ctx->time; + update_context_time(event->ctx); + time = event->ctx->time; } else { u64 now = perf_clock(); - u64 delta = now - counter->ctx->timestamp; - time = counter->ctx->time + delta; + u64 delta = now - event->ctx->timestamp; + time = event->ctx->time + delta; } - task_clock_perf_counter_update(counter, time); + task_clock_perf_event_update(event, time); } static const struct pmu perf_ops_task_clock = { - .enable = task_clock_perf_counter_enable, - .disable = task_clock_perf_counter_disable, - .read = task_clock_perf_counter_read, + .enable = task_clock_perf_event_enable, + .disable = task_clock_perf_event_disable, + .read = task_clock_perf_event_read, }; #ifdef CONFIG_EVENT_PROFILE -void perf_tpcounter_event(int event_id, u64 addr, u64 count, void *record, +void perf_tp_event(int event_id, u64 addr, u64 count, void *record, int entry_size) { struct perf_raw_record raw = { @@ -3995,78 +3994,78 @@ void perf_tpcounter_event(int event_id, u64 addr, u64 count, void *record, if (!regs) regs = task_pt_regs(current); - do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, + do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, &data, regs); } -EXPORT_SYMBOL_GPL(perf_tpcounter_event); +EXPORT_SYMBOL_GPL(perf_tp_event); extern int ftrace_profile_enable(int); extern void ftrace_profile_disable(int); -static void tp_perf_counter_destroy(struct perf_counter *counter) +static void tp_perf_event_destroy(struct perf_event *event) { - ftrace_profile_disable(counter->attr.config); + ftrace_profile_disable(event->attr.config); } -static const struct pmu *tp_perf_counter_init(struct perf_counter *counter) +static const struct pmu *tp_perf_event_init(struct perf_event *event) { /* * Raw tracepoint data is a severe data leak, only allow root to * have these. */ - if ((counter->attr.sample_type & PERF_SAMPLE_RAW) && + if ((event->attr.sample_type & PERF_SAMPLE_RAW) && perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); - if (ftrace_profile_enable(counter->attr.config)) + if (ftrace_profile_enable(event->attr.config)) return NULL; - counter->destroy = tp_perf_counter_destroy; + event->destroy = tp_perf_event_destroy; return &perf_ops_generic; } #else -static const struct pmu *tp_perf_counter_init(struct perf_counter *counter) +static const struct pmu *tp_perf_event_init(struct perf_event *event) { return NULL; } #endif -atomic_t perf_swcounter_enabled[PERF_COUNT_SW_MAX]; +atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX]; -static void sw_perf_counter_destroy(struct perf_counter *counter) +static void sw_perf_event_destroy(struct perf_event *event) { - u64 event = counter->attr.config; + u64 event_id = event->attr.config; - WARN_ON(counter->parent); + WARN_ON(event->parent); - atomic_dec(&perf_swcounter_enabled[event]); + atomic_dec(&perf_swevent_enabled[event_id]); } -static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) +static const struct pmu *sw_perf_event_init(struct perf_event *event) { const struct pmu *pmu = NULL; - u64 event = counter->attr.config; + u64 event_id = event->attr.config; /* - * Software counters (currently) can't in general distinguish + * Software events (currently) can't in general distinguish * between user, kernel and hypervisor events. * However, context switches and cpu migrations are considered * to be kernel events, and page faults are never hypervisor * events. */ - switch (event) { + switch (event_id) { case PERF_COUNT_SW_CPU_CLOCK: pmu = &perf_ops_cpu_clock; break; case PERF_COUNT_SW_TASK_CLOCK: /* - * If the user instantiates this as a per-cpu counter, - * use the cpu_clock counter instead. + * If the user instantiates this as a per-cpu event, + * use the cpu_clock event instead. */ - if (counter->ctx->task) + if (event->ctx->task) pmu = &perf_ops_task_clock; else pmu = &perf_ops_cpu_clock; @@ -4077,9 +4076,9 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) case PERF_COUNT_SW_PAGE_FAULTS_MAJ: case PERF_COUNT_SW_CONTEXT_SWITCHES: case PERF_COUNT_SW_CPU_MIGRATIONS: - if (!counter->parent) { - atomic_inc(&perf_swcounter_enabled[event]); - counter->destroy = sw_perf_counter_destroy; + if (!event->parent) { + atomic_inc(&perf_swevent_enabled[event_id]); + event->destroy = sw_perf_event_destroy; } pmu = &perf_ops_generic; break; @@ -4089,62 +4088,62 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) } /* - * Allocate and initialize a counter structure + * Allocate and initialize a event structure */ -static struct perf_counter * -perf_counter_alloc(struct perf_counter_attr *attr, +static struct perf_event * +perf_event_alloc(struct perf_event_attr *attr, int cpu, - struct perf_counter_context *ctx, - struct perf_counter *group_leader, - struct perf_counter *parent_counter, + struct perf_event_context *ctx, + struct perf_event *group_leader, + struct perf_event *parent_event, gfp_t gfpflags) { const struct pmu *pmu; - struct perf_counter *counter; - struct hw_perf_counter *hwc; + struct perf_event *event; + struct hw_perf_event *hwc; long err; - counter = kzalloc(sizeof(*counter), gfpflags); - if (!counter) + event = kzalloc(sizeof(*event), gfpflags); + if (!event) return ERR_PTR(-ENOMEM); /* - * Single counters are their own group leaders, with an + * Single events are their own group leaders, with an * empty sibling list: */ if (!group_leader) - group_leader = counter; + group_leader = event; - mutex_init(&counter->child_mutex); - INIT_LIST_HEAD(&counter->child_list); + mutex_init(&event->child_mutex); + INIT_LIST_HEAD(&event->child_list); - INIT_LIST_HEAD(&counter->list_entry); - INIT_LIST_HEAD(&counter->event_entry); - INIT_LIST_HEAD(&counter->sibling_list); - init_waitqueue_head(&counter->waitq); + INIT_LIST_HEAD(&event->group_entry); + INIT_LIST_HEAD(&event->event_entry); + INIT_LIST_HEAD(&event->sibling_list); + init_waitqueue_head(&event->waitq); - mutex_init(&counter->mmap_mutex); + mutex_init(&event->mmap_mutex); - counter->cpu = cpu; - counter->attr = *attr; - counter->group_leader = group_leader; - counter->pmu = NULL; - counter->ctx = ctx; - counter->oncpu = -1; + event->cpu = cpu; + event->attr = *attr; + event->group_leader = group_leader; + event->pmu = NULL; + event->ctx = ctx; + event->oncpu = -1; - counter->parent = parent_counter; + event->parent = parent_event; - counter->ns = get_pid_ns(current->nsproxy->pid_ns); - counter->id = atomic64_inc_return(&perf_counter_id); + event->ns = get_pid_ns(current->nsproxy->pid_ns); + event->id = atomic64_inc_return(&perf_event_id); - counter->state = PERF_COUNTER_STATE_INACTIVE; + event->state = PERF_EVENT_STATE_INACTIVE; if (attr->disabled) - counter->state = PERF_COUNTER_STATE_OFF; + event->state = PERF_EVENT_STATE_OFF; pmu = NULL; - hwc = &counter->hw; + hwc = &event->hw; hwc->sample_period = attr->sample_period; if (attr->freq && attr->sample_freq) hwc->sample_period = 1; @@ -4153,7 +4152,7 @@ perf_counter_alloc(struct perf_counter_attr *attr, atomic64_set(&hwc->period_left, hwc->sample_period); /* - * we currently do not support PERF_FORMAT_GROUP on inherited counters + * we currently do not support PERF_FORMAT_GROUP on inherited events */ if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) goto done; @@ -4162,15 +4161,15 @@ perf_counter_alloc(struct perf_counter_attr *attr, case PERF_TYPE_RAW: case PERF_TYPE_HARDWARE: case PERF_TYPE_HW_CACHE: - pmu = hw_perf_counter_init(counter); + pmu = hw_perf_event_init(event); break; case PERF_TYPE_SOFTWARE: - pmu = sw_perf_counter_init(counter); + pmu = sw_perf_event_init(event); break; case PERF_TYPE_TRACEPOINT: - pmu = tp_perf_counter_init(counter); + pmu = tp_perf_event_init(event); break; default: @@ -4184,29 +4183,29 @@ done: err = PTR_ERR(pmu); if (err) { - if (counter->ns) - put_pid_ns(counter->ns); - kfree(counter); + if (event->ns) + put_pid_ns(event->ns); + kfree(event); return ERR_PTR(err); } - counter->pmu = pmu; + event->pmu = pmu; - if (!counter->parent) { - atomic_inc(&nr_counters); - if (counter->attr.mmap) - atomic_inc(&nr_mmap_counters); - if (counter->attr.comm) - atomic_inc(&nr_comm_counters); - if (counter->attr.task) - atomic_inc(&nr_task_counters); + if (!event->parent) { + atomic_inc(&nr_events); + if (event->attr.mmap) + atomic_inc(&nr_mmap_events); + if (event->attr.comm) + atomic_inc(&nr_comm_events); + if (event->attr.task) + atomic_inc(&nr_task_events); } - return counter; + return event; } -static int perf_copy_attr(struct perf_counter_attr __user *uattr, - struct perf_counter_attr *attr) +static int perf_copy_attr(struct perf_event_attr __user *uattr, + struct perf_event_attr *attr) { u32 size; int ret; @@ -4285,11 +4284,11 @@ err_size: goto out; } -int perf_counter_set_output(struct perf_counter *counter, int output_fd) +int perf_event_set_output(struct perf_event *event, int output_fd) { - struct perf_counter *output_counter = NULL; + struct perf_event *output_event = NULL; struct file *output_file = NULL; - struct perf_counter *old_output; + struct perf_event *old_output; int fput_needed = 0; int ret = -EINVAL; @@ -4303,28 +4302,28 @@ int perf_counter_set_output(struct perf_counter *counter, int output_fd) if (output_file->f_op != &perf_fops) goto out; - output_counter = output_file->private_data; + output_event = output_file->private_data; /* Don't chain output fds */ - if (output_counter->output) + if (output_event->output) goto out; /* Don't set an output fd when we already have an output channel */ - if (counter->data) + if (event->data) goto out; atomic_long_inc(&output_file->f_count); set: - mutex_lock(&counter->mmap_mutex); - old_output = counter->output; - rcu_assign_pointer(counter->output, output_counter); - mutex_unlock(&counter->mmap_mutex); + mutex_lock(&event->mmap_mutex); + old_output = event->output; + rcu_assign_pointer(event->output, output_event); + mutex_unlock(&event->mmap_mutex); if (old_output) { /* * we need to make sure no existing perf_output_*() - * is still referencing this counter. + * is still referencing this event. */ synchronize_rcu(); fput(old_output->filp); @@ -4337,21 +4336,21 @@ out: } /** - * sys_perf_counter_open - open a performance counter, associate it to a task/cpu + * sys_perf_event_open - open a performance event, associate it to a task/cpu * - * @attr_uptr: event type attributes for monitoring/sampling + * @attr_uptr: event_id type attributes for monitoring/sampling * @pid: target pid * @cpu: target cpu - * @group_fd: group leader counter fd + * @group_fd: group leader event fd */ -SYSCALL_DEFINE5(perf_counter_open, - struct perf_counter_attr __user *, attr_uptr, +SYSCALL_DEFINE5(perf_event_open, + struct perf_event_attr __user *, attr_uptr, pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) { - struct perf_counter *counter, *group_leader; - struct perf_counter_attr attr; - struct perf_counter_context *ctx; - struct file *counter_file = NULL; + struct perf_event *event, *group_leader; + struct perf_event_attr attr; + struct perf_event_context *ctx; + struct file *event_file = NULL; struct file *group_file = NULL; int fput_needed = 0; int fput_needed2 = 0; @@ -4371,7 +4370,7 @@ SYSCALL_DEFINE5(perf_counter_open, } if (attr.freq) { - if (attr.sample_freq > sysctl_perf_counter_sample_rate) + if (attr.sample_freq > sysctl_perf_event_sample_rate) return -EINVAL; } @@ -4383,7 +4382,7 @@ SYSCALL_DEFINE5(perf_counter_open, return PTR_ERR(ctx); /* - * Look up the group leader (we will attach this counter to it): + * Look up the group leader (we will attach this event to it): */ group_leader = NULL; if (group_fd != -1 && !(flags & PERF_FLAG_FD_NO_GROUP)) { @@ -4414,45 +4413,45 @@ SYSCALL_DEFINE5(perf_counter_open, goto err_put_context; } - counter = perf_counter_alloc(&attr, cpu, ctx, group_leader, + event = perf_event_alloc(&attr, cpu, ctx, group_leader, NULL, GFP_KERNEL); - err = PTR_ERR(counter); - if (IS_ERR(counter)) + err = PTR_ERR(event); + if (IS_ERR(event)) goto err_put_context; - err = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0); + err = anon_inode_getfd("[perf_event]", &perf_fops, event, 0); if (err < 0) goto err_free_put_context; - counter_file = fget_light(err, &fput_needed2); - if (!counter_file) + event_file = fget_light(err, &fput_needed2); + if (!event_file) goto err_free_put_context; if (flags & PERF_FLAG_FD_OUTPUT) { - err = perf_counter_set_output(counter, group_fd); + err = perf_event_set_output(event, group_fd); if (err) goto err_fput_free_put_context; } - counter->filp = counter_file; + event->filp = event_file; WARN_ON_ONCE(ctx->parent_ctx); mutex_lock(&ctx->mutex); - perf_install_in_context(ctx, counter, cpu); + perf_install_in_context(ctx, event, cpu); ++ctx->generation; mutex_unlock(&ctx->mutex); - counter->owner = current; + event->owner = current; get_task_struct(current); - mutex_lock(¤t->perf_counter_mutex); - list_add_tail(&counter->owner_entry, ¤t->perf_counter_list); - mutex_unlock(¤t->perf_counter_mutex); + mutex_lock(¤t->perf_event_mutex); + list_add_tail(&event->owner_entry, ¤t->perf_event_list); + mutex_unlock(¤t->perf_event_mutex); err_fput_free_put_context: - fput_light(counter_file, fput_needed2); + fput_light(event_file, fput_needed2); err_free_put_context: if (err < 0) - kfree(counter); + kfree(event); err_put_context: if (err < 0) @@ -4464,88 +4463,88 @@ err_put_context: } /* - * inherit a counter from parent task to child task: + * inherit a event from parent task to child task: */ -static struct perf_counter * -inherit_counter(struct perf_counter *parent_counter, +static struct perf_event * +inherit_event(struct perf_event *parent_event, struct task_struct *parent, - struct perf_counter_context *parent_ctx, + struct perf_event_context *parent_ctx, struct task_struct *child, - struct perf_counter *group_leader, - struct perf_counter_context *child_ctx) + struct perf_event *group_leader, + struct perf_event_context *child_ctx) { - struct perf_counter *child_counter; + struct perf_event *child_event; /* - * Instead of creating recursive hierarchies of counters, - * we link inherited counters back to the original parent, + * Instead of creating recursive hierarchies of events, + * we link inherited events back to the original parent, * which has a filp for sure, which we use as the reference * count: */ - if (parent_counter->parent) - parent_counter = parent_counter->parent; + if (parent_event->parent) + parent_event = parent_event->parent; - child_counter = perf_counter_alloc(&parent_counter->attr, - parent_counter->cpu, child_ctx, - group_leader, parent_counter, + child_event = perf_event_alloc(&parent_event->attr, + parent_event->cpu, child_ctx, + group_leader, parent_event, GFP_KERNEL); - if (IS_ERR(child_counter)) - return child_counter; + if (IS_ERR(child_event)) + return child_event; get_ctx(child_ctx); /* - * Make the child state follow the state of the parent counter, + * Make the child state follow the state of the parent event, * not its attr.disabled bit. We hold the parent's mutex, - * so we won't race with perf_counter_{en, dis}able_family. + * so we won't race with perf_event_{en, dis}able_family. */ - if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE) - child_counter->state = PERF_COUNTER_STATE_INACTIVE; + if (parent_event->state >= PERF_EVENT_STATE_INACTIVE) + child_event->state = PERF_EVENT_STATE_INACTIVE; else - child_counter->state = PERF_COUNTER_STATE_OFF; + child_event->state = PERF_EVENT_STATE_OFF; - if (parent_counter->attr.freq) - child_counter->hw.sample_period = parent_counter->hw.sample_period; + if (parent_event->attr.freq) + child_event->hw.sample_period = parent_event->hw.sample_period; /* * Link it up in the child's context: */ - add_counter_to_ctx(child_counter, child_ctx); + add_event_to_ctx(child_event, child_ctx); /* * Get a reference to the parent filp - we will fput it - * when the child counter exits. This is safe to do because + * when the child event exits. This is safe to do because * we are in the parent and we know that the filp still * exists and has a nonzero count: */ - atomic_long_inc(&parent_counter->filp->f_count); + atomic_long_inc(&parent_event->filp->f_count); /* - * Link this into the parent counter's child list + * Link this into the parent event's child list */ - WARN_ON_ONCE(parent_counter->ctx->parent_ctx); - mutex_lock(&parent_counter->child_mutex); - list_add_tail(&child_counter->child_list, &parent_counter->child_list); - mutex_unlock(&parent_counter->child_mutex); + WARN_ON_ONCE(parent_event->ctx->parent_ctx); + mutex_lock(&parent_event->child_mutex); + list_add_tail(&child_event->child_list, &parent_event->child_list); + mutex_unlock(&parent_event->child_mutex); - return child_counter; + return child_event; } -static int inherit_group(struct perf_counter *parent_counter, +static int inherit_group(struct perf_event *parent_event, struct task_struct *parent, - struct perf_counter_context *parent_ctx, + struct perf_event_context *parent_ctx, struct task_struct *child, - struct perf_counter_context *child_ctx) + struct perf_event_context *child_ctx) { - struct perf_counter *leader; - struct perf_counter *sub; - struct perf_counter *child_ctr; + struct perf_event *leader; + struct perf_event *sub; + struct perf_event *child_ctr; - leader = inherit_counter(parent_counter, parent, parent_ctx, + leader = inherit_event(parent_event, parent, parent_ctx, child, NULL, child_ctx); if (IS_ERR(leader)) return PTR_ERR(leader); - list_for_each_entry(sub, &parent_counter->sibling_list, list_entry) { - child_ctr = inherit_counter(sub, parent, parent_ctx, + list_for_each_entry(sub, &parent_event->sibling_list, group_entry) { + child_ctr = inherit_event(sub, parent, parent_ctx, child, leader, child_ctx); if (IS_ERR(child_ctr)) return PTR_ERR(child_ctr); @@ -4553,74 +4552,74 @@ static int inherit_group(struct perf_counter *parent_counter, return 0; } -static void sync_child_counter(struct perf_counter *child_counter, +static void sync_child_event(struct perf_event *child_event, struct task_struct *child) { - struct perf_counter *parent_counter = child_counter->parent; + struct perf_event *parent_event = child_event->parent; u64 child_val; - if (child_counter->attr.inherit_stat) - perf_counter_read_event(child_counter, child); + if (child_event->attr.inherit_stat) + perf_event_read_event(child_event, child); - child_val = atomic64_read(&child_counter->count); + child_val = atomic64_read(&child_event->count); /* * Add back the child's count to the parent's count: */ - atomic64_add(child_val, &parent_counter->count); - atomic64_add(child_counter->total_time_enabled, - &parent_counter->child_total_time_enabled); - atomic64_add(child_counter->total_time_running, - &parent_counter->child_total_time_running); + atomic64_add(child_val, &parent_event->count); + atomic64_add(child_event->total_time_enabled, + &parent_event->child_total_time_enabled); + atomic64_add(child_event->total_time_running, + &parent_event->child_total_time_running); /* - * Remove this counter from the parent's list + * Remove this event from the parent's list */ - WARN_ON_ONCE(parent_counter->ctx->parent_ctx); - mutex_lock(&parent_counter->child_mutex); - list_del_init(&child_counter->child_list); - mutex_unlock(&parent_counter->child_mutex); + WARN_ON_ONCE(parent_event->ctx->parent_ctx); + mutex_lock(&parent_event->child_mutex); + list_del_init(&child_event->child_list); + mutex_unlock(&parent_event->child_mutex); /* - * Release the parent counter, if this was the last + * Release the parent event, if this was the last * reference to it. */ - fput(parent_counter->filp); + fput(parent_event->filp); } static void -__perf_counter_exit_task(struct perf_counter *child_counter, - struct perf_counter_context *child_ctx, +__perf_event_exit_task(struct perf_event *child_event, + struct perf_event_context *child_ctx, struct task_struct *child) { - struct perf_counter *parent_counter; + struct perf_event *parent_event; - update_counter_times(child_counter); - perf_counter_remove_from_context(child_counter); + update_event_times(child_event); + perf_event_remove_from_context(child_event); - parent_counter = child_counter->parent; + parent_event = child_event->parent; /* - * It can happen that parent exits first, and has counters + * It can happen that parent exits first, and has events * that are still around due to the child reference. These - * counters need to be zapped - but otherwise linger. + * events need to be zapped - but otherwise linger. */ - if (parent_counter) { - sync_child_counter(child_counter, child); - free_counter(child_counter); + if (parent_event) { + sync_child_event(child_event, child); + free_event(child_event); } } /* - * When a child task exits, feed back counter values to parent counters. + * When a child task exits, feed back event values to parent events. */ -void perf_counter_exit_task(struct task_struct *child) +void perf_event_exit_task(struct task_struct *child) { - struct perf_counter *child_counter, *tmp; - struct perf_counter_context *child_ctx; + struct perf_event *child_event, *tmp; + struct perf_event_context *child_ctx; unsigned long flags; - if (likely(!child->perf_counter_ctxp)) { - perf_counter_task(child, NULL, 0); + if (likely(!child->perf_event_ctxp)) { + perf_event_task(child, NULL, 0); return; } @@ -4631,37 +4630,37 @@ void perf_counter_exit_task(struct task_struct *child) * scheduled, so we are now safe from rescheduling changing * our context. */ - child_ctx = child->perf_counter_ctxp; - __perf_counter_task_sched_out(child_ctx); + child_ctx = child->perf_event_ctxp; + __perf_event_task_sched_out(child_ctx); /* * Take the context lock here so that if find_get_context is - * reading child->perf_counter_ctxp, we wait until it has + * reading child->perf_event_ctxp, we wait until it has * incremented the context's refcount before we do put_ctx below. */ spin_lock(&child_ctx->lock); - child->perf_counter_ctxp = NULL; + child->perf_event_ctxp = NULL; /* * If this context is a clone; unclone it so it can't get * swapped to another process while we're removing all - * the counters from it. + * the events from it. */ unclone_ctx(child_ctx); spin_unlock_irqrestore(&child_ctx->lock, flags); /* - * Report the task dead after unscheduling the counters so that we - * won't get any samples after PERF_EVENT_EXIT. We can however still - * get a few PERF_EVENT_READ events. + * Report the task dead after unscheduling the events so that we + * won't get any samples after PERF_RECORD_EXIT. We can however still + * get a few PERF_RECORD_READ events. */ - perf_counter_task(child, child_ctx, 0); + perf_event_task(child, child_ctx, 0); /* * We can recurse on the same lock type through: * - * __perf_counter_exit_task() - * sync_child_counter() - * fput(parent_counter->filp) + * __perf_event_exit_task() + * sync_child_event() + * fput(parent_event->filp) * perf_release() * mutex_lock(&ctx->mutex) * @@ -4670,16 +4669,16 @@ void perf_counter_exit_task(struct task_struct *child) mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING); again: - list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list, - list_entry) - __perf_counter_exit_task(child_counter, child_ctx, child); + list_for_each_entry_safe(child_event, tmp, &child_ctx->group_list, + group_entry) + __perf_event_exit_task(child_event, child_ctx, child); /* - * If the last counter was a group counter, it will have appended all + * If the last event was a group event, it will have appended all * its siblings to the list, but we obtained 'tmp' before that which * will still point to the list head terminating the iteration. */ - if (!list_empty(&child_ctx->counter_list)) + if (!list_empty(&child_ctx->group_list)) goto again; mutex_unlock(&child_ctx->mutex); @@ -4691,33 +4690,33 @@ again: * free an unexposed, unused context as created by inheritance by * init_task below, used by fork() in case of fail. */ -void perf_counter_free_task(struct task_struct *task) +void perf_event_free_task(struct task_struct *task) { - struct perf_counter_context *ctx = task->perf_counter_ctxp; - struct perf_counter *counter, *tmp; + struct perf_event_context *ctx = task->perf_event_ctxp; + struct perf_event *event, *tmp; if (!ctx) return; mutex_lock(&ctx->mutex); again: - list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry) { - struct perf_counter *parent = counter->parent; + list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry) { + struct perf_event *parent = event->parent; if (WARN_ON_ONCE(!parent)) continue; mutex_lock(&parent->child_mutex); - list_del_init(&counter->child_list); + list_del_init(&event->child_list); mutex_unlock(&parent->child_mutex); fput(parent->filp); - list_del_counter(counter, ctx); - free_counter(counter); + list_del_event(event, ctx); + free_event(event); } - if (!list_empty(&ctx->counter_list)) + if (!list_empty(&ctx->group_list)) goto again; mutex_unlock(&ctx->mutex); @@ -4726,37 +4725,37 @@ again: } /* - * Initialize the perf_counter context in task_struct + * Initialize the perf_event context in task_struct */ -int perf_counter_init_task(struct task_struct *child) +int perf_event_init_task(struct task_struct *child) { - struct perf_counter_context *child_ctx, *parent_ctx; - struct perf_counter_context *cloned_ctx; - struct perf_counter *counter; + struct perf_event_context *child_ctx, *parent_ctx; + struct perf_event_context *cloned_ctx; + struct perf_event *event; struct task_struct *parent = current; int inherited_all = 1; int ret = 0; - child->perf_counter_ctxp = NULL; + child->perf_event_ctxp = NULL; - mutex_init(&child->perf_counter_mutex); - INIT_LIST_HEAD(&child->perf_counter_list); + mutex_init(&child->perf_event_mutex); + INIT_LIST_HEAD(&child->perf_event_list); - if (likely(!parent->perf_counter_ctxp)) + if (likely(!parent->perf_event_ctxp)) return 0; /* * This is executed from the parent task context, so inherit - * counters that have been marked for cloning. + * events that have been marked for cloning. * First allocate and initialize a context for the child. */ - child_ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL); + child_ctx = kmalloc(sizeof(struct perf_event_context), GFP_KERNEL); if (!child_ctx) return -ENOMEM; - __perf_counter_init_context(child_ctx, child); - child->perf_counter_ctxp = child_ctx; + __perf_event_init_context(child_ctx, child); + child->perf_event_ctxp = child_ctx; get_task_struct(child); /* @@ -4782,16 +4781,16 @@ int perf_counter_init_task(struct task_struct *child) * We dont have to disable NMIs - we are only looking at * the list, not manipulating it: */ - list_for_each_entry_rcu(counter, &parent_ctx->event_list, event_entry) { - if (counter != counter->group_leader) + list_for_each_entry_rcu(event, &parent_ctx->event_list, event_entry) { + if (event != event->group_leader) continue; - if (!counter->attr.inherit) { + if (!event->attr.inherit) { inherited_all = 0; continue; } - ret = inherit_group(counter, parent, parent_ctx, + ret = inherit_group(event, parent, parent_ctx, child, child_ctx); if (ret) { inherited_all = 0; @@ -4805,7 +4804,7 @@ int perf_counter_init_task(struct task_struct *child) * context, or of whatever the parent is a clone of. * Note that if the parent is a clone, it could get * uncloned at any point, but that doesn't matter - * because the list of counters and the generation + * because the list of events and the generation * count can't have changed since we took the mutex. */ cloned_ctx = rcu_dereference(parent_ctx->parent_ctx); @@ -4826,41 +4825,41 @@ int perf_counter_init_task(struct task_struct *child) return ret; } -static void __cpuinit perf_counter_init_cpu(int cpu) +static void __cpuinit perf_event_init_cpu(int cpu) { struct perf_cpu_context *cpuctx; cpuctx = &per_cpu(perf_cpu_context, cpu); - __perf_counter_init_context(&cpuctx->ctx, NULL); + __perf_event_init_context(&cpuctx->ctx, NULL); spin_lock(&perf_resource_lock); - cpuctx->max_pertask = perf_max_counters - perf_reserved_percpu; + cpuctx->max_pertask = perf_max_events - perf_reserved_percpu; spin_unlock(&perf_resource_lock); - hw_perf_counter_setup(cpu); + hw_perf_event_setup(cpu); } #ifdef CONFIG_HOTPLUG_CPU -static void __perf_counter_exit_cpu(void *info) +static void __perf_event_exit_cpu(void *info) { struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); - struct perf_counter_context *ctx = &cpuctx->ctx; - struct perf_counter *counter, *tmp; + struct perf_event_context *ctx = &cpuctx->ctx; + struct perf_event *event, *tmp; - list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry) - __perf_counter_remove_from_context(counter); + list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry) + __perf_event_remove_from_context(event); } -static void perf_counter_exit_cpu(int cpu) +static void perf_event_exit_cpu(int cpu) { struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); - struct perf_counter_context *ctx = &cpuctx->ctx; + struct perf_event_context *ctx = &cpuctx->ctx; mutex_lock(&ctx->mutex); - smp_call_function_single(cpu, __perf_counter_exit_cpu, NULL, 1); + smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1); mutex_unlock(&ctx->mutex); } #else -static inline void perf_counter_exit_cpu(int cpu) { } +static inline void perf_event_exit_cpu(int cpu) { } #endif static int __cpuinit @@ -4872,17 +4871,17 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: - perf_counter_init_cpu(cpu); + perf_event_init_cpu(cpu); break; case CPU_ONLINE: case CPU_ONLINE_FROZEN: - hw_perf_counter_setup_online(cpu); + hw_perf_event_setup_online(cpu); break; case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: - perf_counter_exit_cpu(cpu); + perf_event_exit_cpu(cpu); break; default: @@ -4900,7 +4899,7 @@ static struct notifier_block __cpuinitdata perf_cpu_nb = { .priority = 20, }; -void __init perf_counter_init(void) +void __init perf_event_init(void) { perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE, (void *)(long)smp_processor_id()); @@ -4926,7 +4925,7 @@ perf_set_reserve_percpu(struct sysdev_class *class, err = strict_strtoul(buf, 10, &val); if (err) return err; - if (val > perf_max_counters) + if (val > perf_max_events) return -EINVAL; spin_lock(&perf_resource_lock); @@ -4934,8 +4933,8 @@ perf_set_reserve_percpu(struct sysdev_class *class, for_each_online_cpu(cpu) { cpuctx = &per_cpu(perf_cpu_context, cpu); spin_lock_irq(&cpuctx->ctx.lock); - mpt = min(perf_max_counters - cpuctx->ctx.nr_counters, - perf_max_counters - perf_reserved_percpu); + mpt = min(perf_max_events - cpuctx->ctx.nr_events, + perf_max_events - perf_reserved_percpu); cpuctx->max_pertask = mpt; spin_unlock_irq(&cpuctx->ctx.lock); } @@ -4990,12 +4989,12 @@ static struct attribute *perfclass_attrs[] = { static struct attribute_group perfclass_attr_group = { .attrs = perfclass_attrs, - .name = "perf_counters", + .name = "perf_events", }; -static int __init perf_counter_sysfs_init(void) +static int __init perf_event_sysfs_init(void) { return sysfs_create_group(&cpu_sysdev_class.kset.kobj, &perfclass_attr_group); } -device_initcall(perf_counter_sysfs_init); +device_initcall(perf_event_sysfs_init); diff --git a/kernel/pid.c b/kernel/pid.c index 31310b5..d3f722d 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -40,7 +40,7 @@ #define pid_hashfn(nr, ns) \ hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift) static struct hlist_head *pid_hash; -static int pidhash_shift; +static unsigned int pidhash_shift = 4; struct pid init_struct_pid = INIT_STRUCT_PID; int pid_max = PID_MAX_DEFAULT; @@ -499,19 +499,12 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns) void __init pidhash_init(void) { int i, pidhash_size; - unsigned long megabytes = nr_kernel_pages >> (20 - PAGE_SHIFT); - pidhash_shift = max(4, fls(megabytes * 4)); - pidhash_shift = min(12, pidhash_shift); + pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18, + HASH_EARLY | HASH_SMALL, + &pidhash_shift, NULL, 4096); pidhash_size = 1 << pidhash_shift; - printk("PID hash table entries: %d (order: %d, %Zd bytes)\n", - pidhash_size, pidhash_shift, - pidhash_size * sizeof(struct hlist_head)); - - pid_hash = alloc_bootmem(pidhash_size * sizeof(*(pid_hash))); - if (!pid_hash) - panic("Could not alloc pidhash!\n"); for (i = 0; i < pidhash_size; i++) INIT_HLIST_HEAD(&pid_hash[i]); } diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 821722a..86b3796 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -118,7 +118,7 @@ struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old { if (!(flags & CLONE_NEWPID)) return get_pid_ns(old_ns); - if (flags & CLONE_THREAD) + if (flags & (CLONE_THREAD|CLONE_PARENT)) return ERR_PTR(-EINVAL); return create_pid_namespace(old_ns); } diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index e33a21c..5c9dc22 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -8,17 +8,18 @@ #include <linux/math64.h> #include <asm/uaccess.h> #include <linux/kernel_stat.h> +#include <trace/events/timer.h> /* * Called after updating RLIMIT_CPU to set timer expiration if necessary. */ void update_rlimit_cpu(unsigned long rlim_new) { - cputime_t cputime; + cputime_t cputime = secs_to_cputime(rlim_new); + struct signal_struct *const sig = current->signal; - cputime = secs_to_cputime(rlim_new); - if (cputime_eq(current->signal->it_prof_expires, cputime_zero) || - cputime_gt(current->signal->it_prof_expires, cputime)) { + if (cputime_eq(sig->it[CPUCLOCK_PROF].expires, cputime_zero) || + cputime_gt(sig->it[CPUCLOCK_PROF].expires, cputime)) { spin_lock_irq(¤t->sighand->siglock); set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL); spin_unlock_irq(¤t->sighand->siglock); @@ -542,6 +543,17 @@ static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now) now); } +static inline int expires_gt(cputime_t expires, cputime_t new_exp) +{ + return cputime_eq(expires, cputime_zero) || + cputime_gt(expires, new_exp); +} + +static inline int expires_le(cputime_t expires, cputime_t new_exp) +{ + return !cputime_eq(expires, cputime_zero) && + cputime_le(expires, new_exp); +} /* * Insert the timer on the appropriate list before any timers that * expire later. This must be called with the tasklist_lock held @@ -586,34 +598,32 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now) */ if (CPUCLOCK_PERTHREAD(timer->it_clock)) { + union cpu_time_count *exp = &nt->expires; + switch (CPUCLOCK_WHICH(timer->it_clock)) { default: BUG(); case CPUCLOCK_PROF: - if (cputime_eq(p->cputime_expires.prof_exp, - cputime_zero) || - cputime_gt(p->cputime_expires.prof_exp, - nt->expires.cpu)) - p->cputime_expires.prof_exp = - nt->expires.cpu; + if (expires_gt(p->cputime_expires.prof_exp, + exp->cpu)) + p->cputime_expires.prof_exp = exp->cpu; break; case CPUCLOCK_VIRT: - if (cputime_eq(p->cputime_expires.virt_exp, - cputime_zero) || - cputime_gt(p->cputime_expires.virt_exp, - nt->expires.cpu)) - p->cputime_expires.virt_exp = - nt->expires.cpu; + if (expires_gt(p->cputime_expires.virt_exp, + exp->cpu)) + p->cputime_expires.virt_exp = exp->cpu; break; case CPUCLOCK_SCHED: if (p->cputime_expires.sched_exp == 0 || - p->cputime_expires.sched_exp > - nt->expires.sched) + p->cputime_expires.sched_exp > exp->sched) p->cputime_expires.sched_exp = - nt->expires.sched; + exp->sched; break; } } else { + struct signal_struct *const sig = p->signal; + union cpu_time_count *exp = &timer->it.cpu.expires; + /* * For a process timer, set the cached expiration time. */ @@ -621,30 +631,23 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now) default: BUG(); case CPUCLOCK_VIRT: - if (!cputime_eq(p->signal->it_virt_expires, - cputime_zero) && - cputime_lt(p->signal->it_virt_expires, - timer->it.cpu.expires.cpu)) + if (expires_le(sig->it[CPUCLOCK_VIRT].expires, + exp->cpu)) break; - p->signal->cputime_expires.virt_exp = - timer->it.cpu.expires.cpu; + sig->cputime_expires.virt_exp = exp->cpu; break; case CPUCLOCK_PROF: - if (!cputime_eq(p->signal->it_prof_expires, - cputime_zero) && - cputime_lt(p->signal->it_prof_expires, - timer->it.cpu.expires.cpu)) + if (expires_le(sig->it[CPUCLOCK_PROF].expires, + exp->cpu)) break; - i = p->signal->rlim[RLIMIT_CPU].rlim_cur; + i = sig->rlim[RLIMIT_CPU].rlim_cur; if (i != RLIM_INFINITY && - i <= cputime_to_secs(timer->it.cpu.expires.cpu)) + i <= cputime_to_secs(exp->cpu)) break; - p->signal->cputime_expires.prof_exp = - timer->it.cpu.expires.cpu; + sig->cputime_expires.prof_exp = exp->cpu; break; case CPUCLOCK_SCHED: - p->signal->cputime_expires.sched_exp = - timer->it.cpu.expires.sched; + sig->cputime_expires.sched_exp = exp->sched; break; } } @@ -1071,6 +1074,40 @@ static void stop_process_timers(struct task_struct *tsk) spin_unlock_irqrestore(&cputimer->lock, flags); } +static u32 onecputick; + +static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, + cputime_t *expires, cputime_t cur_time, int signo) +{ + if (cputime_eq(it->expires, cputime_zero)) + return; + + if (cputime_ge(cur_time, it->expires)) { + if (!cputime_eq(it->incr, cputime_zero)) { + it->expires = cputime_add(it->expires, it->incr); + it->error += it->incr_error; + if (it->error >= onecputick) { + it->expires = cputime_sub(it->expires, + cputime_one_jiffy); + it->error -= onecputick; + } + } else { + it->expires = cputime_zero; + } + + trace_itimer_expire(signo == SIGPROF ? + ITIMER_PROF : ITIMER_VIRTUAL, + tsk->signal->leader_pid, cur_time); + __group_send_sig_info(signo, SEND_SIG_PRIV, tsk); + } + + if (!cputime_eq(it->expires, cputime_zero) && + (cputime_eq(*expires, cputime_zero) || + cputime_lt(it->expires, *expires))) { + *expires = it->expires; + } +} + /* * Check for any per-thread CPU timers that have fired and move them * off the tsk->*_timers list onto the firing list. Per-thread timers @@ -1090,10 +1127,10 @@ static void check_process_timers(struct task_struct *tsk, * Don't sample the current process CPU clocks if there are no timers. */ if (list_empty(&timers[CPUCLOCK_PROF]) && - cputime_eq(sig->it_prof_expires, cputime_zero) && + cputime_eq(sig->it[CPUCLOCK_PROF].expires, cputime_zero) && sig->rlim[RLIMIT_CPU].rlim_cur == RLIM_INFINITY && list_empty(&timers[CPUCLOCK_VIRT]) && - cputime_eq(sig->it_virt_expires, cputime_zero) && + cputime_eq(sig->it[CPUCLOCK_VIRT].expires, cputime_zero) && list_empty(&timers[CPUCLOCK_SCHED])) { stop_process_timers(tsk); return; @@ -1153,38 +1190,11 @@ static void check_process_timers(struct task_struct *tsk, /* * Check for the special case process timers. */ - if (!cputime_eq(sig->it_prof_expires, cputime_zero)) { - if (cputime_ge(ptime, sig->it_prof_expires)) { - /* ITIMER_PROF fires and reloads. */ - sig->it_prof_expires = sig->it_prof_incr; - if (!cputime_eq(sig->it_prof_expires, cputime_zero)) { - sig->it_prof_expires = cputime_add( - sig->it_prof_expires, ptime); - } - __group_send_sig_info(SIGPROF, SEND_SIG_PRIV, tsk); - } - if (!cputime_eq(sig->it_prof_expires, cputime_zero) && - (cputime_eq(prof_expires, cputime_zero) || - cputime_lt(sig->it_prof_expires, prof_expires))) { - prof_expires = sig->it_prof_expires; - } - } - if (!cputime_eq(sig->it_virt_expires, cputime_zero)) { - if (cputime_ge(utime, sig->it_virt_expires)) { - /* ITIMER_VIRTUAL fires and reloads. */ - sig->it_virt_expires = sig->it_virt_incr; - if (!cputime_eq(sig->it_virt_expires, cputime_zero)) { - sig->it_virt_expires = cputime_add( - sig->it_virt_expires, utime); - } - __group_send_sig_info(SIGVTALRM, SEND_SIG_PRIV, tsk); - } - if (!cputime_eq(sig->it_virt_expires, cputime_zero) && - (cputime_eq(virt_expires, cputime_zero) || - cputime_lt(sig->it_virt_expires, virt_expires))) { - virt_expires = sig->it_virt_expires; - } - } + check_cpu_itimer(tsk, &sig->it[CPUCLOCK_PROF], &prof_expires, ptime, + SIGPROF); + check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime, + SIGVTALRM); + if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) { unsigned long psecs = cputime_to_secs(ptime); cputime_t x; @@ -1457,7 +1467,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, if (!cputime_eq(*oldval, cputime_zero)) { if (cputime_le(*oldval, now.cpu)) { /* Just about to fire. */ - *oldval = jiffies_to_cputime(1); + *oldval = cputime_one_jiffy; } else { *oldval = cputime_sub(*oldval, now.cpu); } @@ -1703,10 +1713,15 @@ static __init int init_posix_cpu_timers(void) .nsleep = thread_cpu_nsleep, .nsleep_restart = thread_cpu_nsleep_restart, }; + struct timespec ts; register_posix_clock(CLOCK_PROCESS_CPUTIME_ID, &process); register_posix_clock(CLOCK_THREAD_CPUTIME_ID, &thread); + cputime_to_timespec(cputime_one_jiffy, &ts); + onecputick = ts.tv_nsec; + WARN_ON(ts.tv_sec != 0); + return 0; } __initcall(init_posix_cpu_timers); diff --git a/kernel/power/process.c b/kernel/power/process.c index da2072d..cc2e553 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -9,6 +9,7 @@ #undef DEBUG #include <linux/interrupt.h> +#include <linux/oom.h> #include <linux/suspend.h> #include <linux/module.h> #include <linux/syscalls.h> diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 97955b0..36cb168 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -619,7 +619,7 @@ __register_nosave_region(unsigned long start_pfn, unsigned long end_pfn, BUG_ON(!region); } else /* This allocation cannot fail */ - region = alloc_bootmem_low(sizeof(struct nosave_region)); + region = alloc_bootmem(sizeof(struct nosave_region)); region->start_pfn = start_pfn; region->end_pfn = end_pfn; list_add_tail(®ion->list, &nosave_regions); diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 8ba052c..b101cdc 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -13,7 +13,6 @@ #include <linux/module.h> #include <linux/file.h> -#include <linux/utsname.h> #include <linux/delay.h> #include <linux/bitops.h> #include <linux/genhd.h> diff --git a/kernel/printk.c b/kernel/printk.c index 602033a..f38b07f 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -206,12 +206,11 @@ __setup("log_buf_len=", log_buf_len_setup); #ifdef CONFIG_BOOT_PRINTK_DELAY static unsigned int boot_delay; /* msecs delay after each printk during bootup */ -static unsigned long long printk_delay_msec; /* per msec, based on boot_delay */ +static unsigned long long loops_per_msec; /* based on boot_delay */ static int __init boot_delay_setup(char *str) { unsigned long lpj; - unsigned long long loops_per_msec; lpj = preset_lpj ? preset_lpj : 1000000; /* some guess */ loops_per_msec = (unsigned long long)lpj / 1000 * HZ; @@ -220,10 +219,9 @@ static int __init boot_delay_setup(char *str) if (boot_delay > 10 * 1000) boot_delay = 0; - printk_delay_msec = loops_per_msec; - printk(KERN_DEBUG "boot_delay: %u, preset_lpj: %ld, lpj: %lu, " - "HZ: %d, printk_delay_msec: %llu\n", - boot_delay, preset_lpj, lpj, HZ, printk_delay_msec); + pr_debug("boot_delay: %u, preset_lpj: %ld, lpj: %lu, " + "HZ: %d, loops_per_msec: %llu\n", + boot_delay, preset_lpj, lpj, HZ, loops_per_msec); return 1; } __setup("boot_delay=", boot_delay_setup); @@ -236,7 +234,7 @@ static void boot_delay_msec(void) if (boot_delay == 0 || system_state != SYSTEM_BOOTING) return; - k = (unsigned long long)printk_delay_msec * boot_delay; + k = (unsigned long long)loops_per_msec * boot_delay; timeout = jiffies + msecs_to_jiffies(boot_delay); while (k) { @@ -655,6 +653,20 @@ static int recursion_bug; static int new_text_line = 1; static char printk_buf[1024]; +int printk_delay_msec __read_mostly; + +static inline void printk_delay(void) +{ + if (unlikely(printk_delay_msec)) { + int m = printk_delay_msec; + + while (m--) { + mdelay(1); + touch_nmi_watchdog(); + } + } +} + asmlinkage int vprintk(const char *fmt, va_list args) { int printed_len = 0; @@ -664,6 +676,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) char *p; boot_delay_msec(); + printk_delay(); preempt_disable(); /* This stops the holder of console_sem just where we want him */ diff --git a/kernel/profile.c b/kernel/profile.c index 419250e..a55d3a3 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -442,48 +442,51 @@ void profile_tick(int type) #ifdef CONFIG_PROC_FS #include <linux/proc_fs.h> +#include <linux/seq_file.h> #include <asm/uaccess.h> -static int prof_cpu_mask_read_proc(char *page, char **start, off_t off, - int count, int *eof, void *data) +static int prof_cpu_mask_proc_show(struct seq_file *m, void *v) { - int len = cpumask_scnprintf(page, count, data); - if (count - len < 2) - return -EINVAL; - len += sprintf(page + len, "\n"); - return len; + seq_cpumask(m, prof_cpu_mask); + seq_putc(m, '\n'); + return 0; } -static int prof_cpu_mask_write_proc(struct file *file, - const char __user *buffer, unsigned long count, void *data) +static int prof_cpu_mask_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, prof_cpu_mask_proc_show, NULL); +} + +static ssize_t prof_cpu_mask_proc_write(struct file *file, + const char __user *buffer, size_t count, loff_t *pos) { - struct cpumask *mask = data; - unsigned long full_count = count, err; cpumask_var_t new_value; + int err; if (!alloc_cpumask_var(&new_value, GFP_KERNEL)) return -ENOMEM; err = cpumask_parse_user(buffer, count, new_value); if (!err) { - cpumask_copy(mask, new_value); - err = full_count; + cpumask_copy(prof_cpu_mask, new_value); + err = count; } free_cpumask_var(new_value); return err; } +static const struct file_operations prof_cpu_mask_proc_fops = { + .open = prof_cpu_mask_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .write = prof_cpu_mask_proc_write, +}; + void create_prof_cpu_mask(struct proc_dir_entry *root_irq_dir) { - struct proc_dir_entry *entry; - /* create /proc/irq/prof_cpu_mask */ - entry = create_proc_entry("prof_cpu_mask", 0600, root_irq_dir); - if (!entry) - return; - entry->data = prof_cpu_mask; - entry->read_proc = prof_cpu_mask_read_proc; - entry->write_proc = prof_cpu_mask_write_proc; + proc_create("prof_cpu_mask", 0600, root_irq_dir, &prof_cpu_mask_proc_fops); } /* diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 307c285..23bd09c 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -266,9 +266,10 @@ static int ignoring_children(struct sighand_struct *sigh) * or self-reaping. Do notification now if it would have happened earlier. * If it should reap itself, return true. * - * If it's our own child, there is no notification to do. - * But if our normal children self-reap, then this child - * was prevented by ptrace and we must reap it now. + * If it's our own child, there is no notification to do. But if our normal + * children self-reap, then this child was prevented by ptrace and we must + * reap it now, in that case we must also wake up sub-threads sleeping in + * do_wait(). */ static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p) { @@ -278,8 +279,10 @@ static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p) if (!task_detached(p) && thread_group_empty(p)) { if (!same_thread_group(p->real_parent, tracer)) do_notify_parent(p, p->exit_signal); - else if (ignoring_children(tracer->sighand)) + else if (ignoring_children(tracer->sighand)) { + __wake_up_parent(p, tracer); p->exit_signal = -1; + } } if (task_detached(p)) { /* Mark it as in the process of being reaped. */ diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index bd5d5c8..37ac454 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -19,7 +19,7 @@ * * Authors: Dipankar Sarma <dipankar@in.ibm.com> * Manfred Spraul <manfred@colorfullife.com> - * + * * Based on the original work by Paul McKenney <paulmck@us.ibm.com> * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. * Papers: @@ -27,7 +27,7 @@ * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001) * * For detailed explanation of Read-Copy Update mechanism see - - * http://lse.sourceforge.net/locking/rcupdate.html + * http://lse.sourceforge.net/locking/rcupdate.html * */ #include <linux/types.h> @@ -74,6 +74,8 @@ void wakeme_after_rcu(struct rcu_head *head) complete(&rcu->completion); } +#ifdef CONFIG_TREE_PREEMPT_RCU + /** * synchronize_rcu - wait until a grace period has elapsed. * @@ -87,7 +89,7 @@ void synchronize_rcu(void) { struct rcu_synchronize rcu; - if (rcu_blocking_is_gp()) + if (!rcu_scheduler_active) return; init_completion(&rcu.completion); @@ -98,6 +100,46 @@ void synchronize_rcu(void) } EXPORT_SYMBOL_GPL(synchronize_rcu); +#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ + +/** + * synchronize_sched - wait until an rcu-sched grace period has elapsed. + * + * Control will return to the caller some time after a full rcu-sched + * grace period has elapsed, in other words after all currently executing + * rcu-sched read-side critical sections have completed. These read-side + * critical sections are delimited by rcu_read_lock_sched() and + * rcu_read_unlock_sched(), and may be nested. Note that preempt_disable(), + * local_irq_disable(), and so on may be used in place of + * rcu_read_lock_sched(). + * + * This means that all preempt_disable code sequences, including NMI and + * hardware-interrupt handlers, in progress on entry will have completed + * before this primitive returns. However, this does not guarantee that + * softirq handlers will have completed, since in some kernels, these + * handlers can run in process context, and can block. + * + * This primitive provides the guarantees made by the (now removed) + * synchronize_kernel() API. In contrast, synchronize_rcu() only + * guarantees that rcu_read_lock() sections will have completed. + * In "classic RCU", these two guarantees happen to be one and + * the same, but can differ in realtime RCU implementations. + */ +void synchronize_sched(void) +{ + struct rcu_synchronize rcu; + + if (rcu_blocking_is_gp()) + return; + + init_completion(&rcu.completion); + /* Will wake me after RCU finished. */ + call_rcu_sched(&rcu.head, wakeme_after_rcu); + /* Wait for it. */ + wait_for_completion(&rcu.completion); +} +EXPORT_SYMBOL_GPL(synchronize_sched); + /** * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed. * diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index b33db53..233768f 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -18,7 +18,7 @@ * Copyright (C) IBM Corporation, 2005, 2006 * * Authors: Paul E. McKenney <paulmck@us.ibm.com> - * Josh Triplett <josh@freedesktop.org> + * Josh Triplett <josh@freedesktop.org> * * See also: Documentation/RCU/torture.txt */ @@ -50,7 +50,7 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and " - "Josh Triplett <josh@freedesktop.org>"); + "Josh Triplett <josh@freedesktop.org>"); static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */ static int nfakewriters = 4; /* # fake writer threads */ @@ -110,8 +110,8 @@ struct rcu_torture { }; static LIST_HEAD(rcu_torture_freelist); -static struct rcu_torture *rcu_torture_current = NULL; -static long rcu_torture_current_version = 0; +static struct rcu_torture *rcu_torture_current; +static long rcu_torture_current_version; static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; static DEFINE_SPINLOCK(rcu_torture_lock); static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) = @@ -124,11 +124,11 @@ static atomic_t n_rcu_torture_alloc_fail; static atomic_t n_rcu_torture_free; static atomic_t n_rcu_torture_mberror; static atomic_t n_rcu_torture_error; -static long n_rcu_torture_timers = 0; +static long n_rcu_torture_timers; static struct list_head rcu_torture_removed; static cpumask_var_t shuffle_tmp_mask; -static int stutter_pause_test = 0; +static int stutter_pause_test; #if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE) #define RCUTORTURE_RUNNABLE_INIT 1 @@ -267,7 +267,8 @@ struct rcu_torture_ops { int irq_capable; char *name; }; -static struct rcu_torture_ops *cur_ops = NULL; + +static struct rcu_torture_ops *cur_ops; /* * Definitions for rcu torture testing. @@ -281,14 +282,17 @@ static int rcu_torture_read_lock(void) __acquires(RCU) static void rcu_read_delay(struct rcu_random_state *rrsp) { - long delay; - const long longdelay = 200; + const unsigned long shortdelay_us = 200; + const unsigned long longdelay_ms = 50; - /* We want there to be long-running readers, but not all the time. */ + /* We want a short delay sometimes to make a reader delay the grace + * period, and we want a long delay occasionally to trigger + * force_quiescent_state. */ - delay = rcu_random(rrsp) % (nrealreaders * 2 * longdelay); - if (!delay) - udelay(longdelay); + if (!(rcu_random(rrsp) % (nrealreaders * 2000 * longdelay_ms))) + mdelay(longdelay_ms); + if (!(rcu_random(rrsp) % (nrealreaders * 2 * shortdelay_us))) + udelay(shortdelay_us); } static void rcu_torture_read_unlock(int idx) __releases(RCU) @@ -339,8 +343,8 @@ static struct rcu_torture_ops rcu_ops = { .sync = synchronize_rcu, .cb_barrier = rcu_barrier, .stats = NULL, - .irq_capable = 1, - .name = "rcu" + .irq_capable = 1, + .name = "rcu" }; static void rcu_sync_torture_deferred_free(struct rcu_torture *p) @@ -638,7 +642,8 @@ rcu_torture_writer(void *arg) do { schedule_timeout_uninterruptible(1); - if ((rp = rcu_torture_alloc()) == NULL) + rp = rcu_torture_alloc(); + if (rp == NULL) continue; rp->rtort_pipe_count = 0; udelay(rcu_random(&rand) & 0x3ff); @@ -1110,7 +1115,7 @@ rcu_torture_init(void) printk(KERN_ALERT "rcutorture: invalid torture type: \"%s\"\n", torture_type); mutex_unlock(&fullstop_mutex); - return (-EINVAL); + return -EINVAL; } if (cur_ops->init) cur_ops->init(); /* no "goto unwind" prior to this point!!! */ @@ -1161,7 +1166,7 @@ rcu_torture_init(void) goto unwind; } fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]), - GFP_KERNEL); + GFP_KERNEL); if (fakewriter_tasks == NULL) { VERBOSE_PRINTK_ERRSTRING("out of memory"); firsterr = -ENOMEM; @@ -1170,7 +1175,7 @@ rcu_torture_init(void) for (i = 0; i < nfakewriters; i++) { VERBOSE_PRINTK_STRING("Creating rcu_torture_fakewriter task"); fakewriter_tasks[i] = kthread_run(rcu_torture_fakewriter, NULL, - "rcu_torture_fakewriter"); + "rcu_torture_fakewriter"); if (IS_ERR(fakewriter_tasks[i])) { firsterr = PTR_ERR(fakewriter_tasks[i]); VERBOSE_PRINTK_ERRSTRING("Failed to create fakewriter"); diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 6b11b07..52b06f6 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -25,7 +25,7 @@ * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. * * For detailed explanation of Read-Copy Update mechanism see - - * Documentation/RCU + * Documentation/RCU */ #include <linux/types.h> #include <linux/kernel.h> @@ -107,27 +107,23 @@ static void __cpuinit rcu_init_percpu_data(int cpu, struct rcu_state *rsp, */ void rcu_sched_qs(int cpu) { - unsigned long flags; struct rcu_data *rdp; - local_irq_save(flags); rdp = &per_cpu(rcu_sched_data, cpu); - rdp->passed_quiesc = 1; rdp->passed_quiesc_completed = rdp->completed; - rcu_preempt_qs(cpu); - local_irq_restore(flags); + barrier(); + rdp->passed_quiesc = 1; + rcu_preempt_note_context_switch(cpu); } void rcu_bh_qs(int cpu) { - unsigned long flags; struct rcu_data *rdp; - local_irq_save(flags); rdp = &per_cpu(rcu_bh_data, cpu); - rdp->passed_quiesc = 1; rdp->passed_quiesc_completed = rdp->completed; - local_irq_restore(flags); + barrier(); + rdp->passed_quiesc = 1; } #ifdef CONFIG_NO_HZ @@ -605,8 +601,6 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) { struct rcu_data *rdp = rsp->rda[smp_processor_id()]; struct rcu_node *rnp = rcu_get_root(rsp); - struct rcu_node *rnp_cur; - struct rcu_node *rnp_end; if (!cpu_needs_another_gp(rsp, rdp)) { spin_unlock_irqrestore(&rnp->lock, flags); @@ -615,6 +609,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) /* Advance to a new grace period and initialize state. */ rsp->gpnum++; + WARN_ON_ONCE(rsp->signaled == RCU_GP_INIT); rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */ rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; record_gp_stall_check_time(rsp); @@ -631,7 +626,9 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) /* Special-case the common single-level case. */ if (NUM_RCU_NODES == 1) { + rcu_preempt_check_blocked_tasks(rnp); rnp->qsmask = rnp->qsmaskinit; + rnp->gpnum = rsp->gpnum; rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */ spin_unlock_irqrestore(&rnp->lock, flags); return; @@ -644,42 +641,28 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) spin_lock(&rsp->onofflock); /* irqs already disabled. */ /* - * Set the quiescent-state-needed bits in all the non-leaf RCU - * nodes for all currently online CPUs. This operation relies - * on the layout of the hierarchy within the rsp->node[] array. - * Note that other CPUs will access only the leaves of the - * hierarchy, which still indicate that no grace period is in - * progress. In addition, we have excluded CPU-hotplug operations. - * - * We therefore do not need to hold any locks. Any required - * memory barriers will be supplied by the locks guarding the - * leaf rcu_nodes in the hierarchy. - */ - - rnp_end = rsp->level[NUM_RCU_LVLS - 1]; - for (rnp_cur = &rsp->node[0]; rnp_cur < rnp_end; rnp_cur++) - rnp_cur->qsmask = rnp_cur->qsmaskinit; - - /* - * Now set up the leaf nodes. Here we must be careful. First, - * we need to hold the lock in order to exclude other CPUs, which - * might be contending for the leaf nodes' locks. Second, as - * soon as we initialize a given leaf node, its CPUs might run - * up the rest of the hierarchy. We must therefore acquire locks - * for each node that we touch during this stage. (But we still - * are excluding CPU-hotplug operations.) + * Set the quiescent-state-needed bits in all the rcu_node + * structures for all currently online CPUs in breadth-first + * order, starting from the root rcu_node structure. This + * operation relies on the layout of the hierarchy within the + * rsp->node[] array. Note that other CPUs will access only + * the leaves of the hierarchy, which still indicate that no + * grace period is in progress, at least until the corresponding + * leaf node has been initialized. In addition, we have excluded + * CPU-hotplug operations. * * Note that the grace period cannot complete until we finish * the initialization process, as there will be at least one * qsmask bit set in the root node until that time, namely the - * one corresponding to this CPU. + * one corresponding to this CPU, due to the fact that we have + * irqs disabled. */ - rnp_end = &rsp->node[NUM_RCU_NODES]; - rnp_cur = rsp->level[NUM_RCU_LVLS - 1]; - for (; rnp_cur < rnp_end; rnp_cur++) { - spin_lock(&rnp_cur->lock); /* irqs already disabled. */ - rnp_cur->qsmask = rnp_cur->qsmaskinit; - spin_unlock(&rnp_cur->lock); /* irqs already disabled. */ + for (rnp = &rsp->node[0]; rnp < &rsp->node[NUM_RCU_NODES]; rnp++) { + spin_lock(&rnp->lock); /* irqs already disabled. */ + rcu_preempt_check_blocked_tasks(rnp); + rnp->qsmask = rnp->qsmaskinit; + rnp->gpnum = rsp->gpnum; + spin_unlock(&rnp->lock); /* irqs already disabled. */ } rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */ @@ -722,6 +705,7 @@ rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp) static void cpu_quiet_msk_finish(struct rcu_state *rsp, unsigned long flags) __releases(rnp->lock) { + WARN_ON_ONCE(rsp->completed == rsp->gpnum); rsp->completed = rsp->gpnum; rcu_process_gp_end(rsp, rsp->rda[smp_processor_id()]); rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */ @@ -739,6 +723,8 @@ cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp, unsigned long flags) __releases(rnp->lock) { + struct rcu_node *rnp_c; + /* Walk up the rcu_node hierarchy. */ for (;;) { if (!(rnp->qsmask & mask)) { @@ -762,8 +748,10 @@ cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp, break; } spin_unlock_irqrestore(&rnp->lock, flags); + rnp_c = rnp; rnp = rnp->parent; spin_lock_irqsave(&rnp->lock, flags); + WARN_ON_ONCE(rnp_c->qsmask); } /* @@ -776,10 +764,10 @@ cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp, /* * Record a quiescent state for the specified CPU, which must either be - * the current CPU or an offline CPU. The lastcomp argument is used to - * make sure we are still in the grace period of interest. We don't want - * to end the current grace period based on quiescent states detected in - * an earlier grace period! + * the current CPU. The lastcomp argument is used to make sure we are + * still in the grace period of interest. We don't want to end the current + * grace period based on quiescent states detected in an earlier grace + * period! */ static void cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp) @@ -814,7 +802,6 @@ cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp) * This GP can't end until cpu checks in, so all of our * callbacks can be processed during the next GP. */ - rdp = rsp->rda[smp_processor_id()]; rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; cpu_quiet_msk(mask, rsp, rnp, flags); /* releases rnp->lock */ @@ -872,7 +859,7 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) spin_lock_irqsave(&rsp->onofflock, flags); /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ - rnp = rdp->mynode; + rnp = rdp->mynode; /* this is the outgoing CPU's rnp. */ mask = rdp->grpmask; /* rnp->grplo is constant. */ do { spin_lock(&rnp->lock); /* irqs already disabled. */ @@ -881,7 +868,7 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) spin_unlock(&rnp->lock); /* irqs remain disabled. */ break; } - rcu_preempt_offline_tasks(rsp, rnp); + rcu_preempt_offline_tasks(rsp, rnp, rdp); mask = rnp->grpmask; spin_unlock(&rnp->lock); /* irqs remain disabled. */ rnp = rnp->parent; @@ -890,9 +877,6 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ - /* Being offline is a quiescent state, so go record it. */ - cpu_quiet(cpu, rsp, rdp, lastcomp); - /* * Move callbacks from the outgoing CPU to the running CPU. * Note that the outgoing CPU is now quiscent, so it is now @@ -1457,20 +1441,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable) rnp = rnp->parent; } while (rnp != NULL && !(rnp->qsmaskinit & mask)); - spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ - - /* - * A new grace period might start here. If so, we will be part of - * it, and its gpnum will be greater than ours, so we will - * participate. It is also possible for the gpnum to have been - * incremented before this function was called, and the bitmasks - * to not be filled out until now, in which case we will also - * participate due to our gpnum being behind. - */ - - /* Since it is coming online, the CPU is in a quiescent state. */ - cpu_quiet(cpu, rsp, rdp, lastcomp); - local_irq_restore(flags); + spin_unlock_irqrestore(&rsp->onofflock, flags); } static void __cpuinit rcu_online_cpu(int cpu) diff --git a/kernel/rcutree.h b/kernel/rcutree.h index bf8a6f9..8e8287a 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -142,7 +142,7 @@ struct rcu_data { */ struct rcu_head *nxtlist; struct rcu_head **nxttail[RCU_NEXT_SIZE]; - long qlen; /* # of queued callbacks */ + long qlen; /* # of queued callbacks */ long blimit; /* Upper limit on a processed batch */ #ifdef CONFIG_NO_HZ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 4778936..1cee04f 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -64,22 +64,31 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed); * not in a quiescent state. There might be any number of tasks blocked * while in an RCU read-side critical section. */ -static void rcu_preempt_qs_record(int cpu) +static void rcu_preempt_qs(int cpu) { struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); - rdp->passed_quiesc = 1; rdp->passed_quiesc_completed = rdp->completed; + barrier(); + rdp->passed_quiesc = 1; } /* - * We have entered the scheduler or are between softirqs in ksoftirqd. - * If we are in an RCU read-side critical section, we need to reflect - * that in the state of the rcu_node structure corresponding to this CPU. - * Caller must disable hardirqs. + * We have entered the scheduler, and the current task might soon be + * context-switched away from. If this task is in an RCU read-side + * critical section, we will no longer be able to rely on the CPU to + * record that fact, so we enqueue the task on the appropriate entry + * of the blocked_tasks[] array. The task will dequeue itself when + * it exits the outermost enclosing RCU read-side critical section. + * Therefore, the current grace period cannot be permitted to complete + * until the blocked_tasks[] entry indexed by the low-order bit of + * rnp->gpnum empties. + * + * Caller must disable preemption. */ -static void rcu_preempt_qs(int cpu) +static void rcu_preempt_note_context_switch(int cpu) { struct task_struct *t = current; + unsigned long flags; int phase; struct rcu_data *rdp; struct rcu_node *rnp; @@ -90,7 +99,7 @@ static void rcu_preempt_qs(int cpu) /* Possibly blocking in an RCU read-side critical section. */ rdp = rcu_preempt_state.rda[cpu]; rnp = rdp->mynode; - spin_lock(&rnp->lock); + spin_lock_irqsave(&rnp->lock, flags); t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; t->rcu_blocked_node = rnp; @@ -103,11 +112,15 @@ static void rcu_preempt_qs(int cpu) * state for the current grace period), then as long * as that task remains queued, the current grace period * cannot end. + * + * But first, note that the current CPU must still be + * on line! */ - phase = !(rnp->qsmask & rdp->grpmask) ^ (rnp->gpnum & 0x1); + WARN_ON_ONCE((rdp->grpmask & rnp->qsmaskinit) == 0); + WARN_ON_ONCE(!list_empty(&t->rcu_node_entry)); + phase = (rnp->gpnum + !(rnp->qsmask & rdp->grpmask)) & 0x1; list_add(&t->rcu_node_entry, &rnp->blocked_tasks[phase]); - smp_mb(); /* Ensure later ctxt swtch seen after above. */ - spin_unlock(&rnp->lock); + spin_unlock_irqrestore(&rnp->lock, flags); } /* @@ -119,9 +132,10 @@ static void rcu_preempt_qs(int cpu) * grace period, then the fact that the task has been enqueued * means that we continue to block the current grace period. */ - rcu_preempt_qs_record(cpu); - t->rcu_read_unlock_special &= ~(RCU_READ_UNLOCK_NEED_QS | - RCU_READ_UNLOCK_GOT_QS); + rcu_preempt_qs(cpu); + local_irq_save(flags); + t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; + local_irq_restore(flags); } /* @@ -157,7 +171,7 @@ static void rcu_read_unlock_special(struct task_struct *t) special = t->rcu_read_unlock_special; if (special & RCU_READ_UNLOCK_NEED_QS) { t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; - t->rcu_read_unlock_special |= RCU_READ_UNLOCK_GOT_QS; + rcu_preempt_qs(smp_processor_id()); } /* Hardware IRQ handlers cannot block. */ @@ -177,10 +191,10 @@ static void rcu_read_unlock_special(struct task_struct *t) */ for (;;) { rnp = t->rcu_blocked_node; - spin_lock(&rnp->lock); + spin_lock(&rnp->lock); /* irqs already disabled. */ if (rnp == t->rcu_blocked_node) break; - spin_unlock(&rnp->lock); + spin_unlock(&rnp->lock); /* irqs remain disabled. */ } empty = list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1]); list_del_init(&t->rcu_node_entry); @@ -194,9 +208,8 @@ static void rcu_read_unlock_special(struct task_struct *t) */ if (!empty && rnp->qsmask == 0 && list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1])) { - t->rcu_read_unlock_special &= - ~(RCU_READ_UNLOCK_NEED_QS | - RCU_READ_UNLOCK_GOT_QS); + struct rcu_node *rnp_p; + if (rnp->parent == NULL) { /* Only one rcu_node in the tree. */ cpu_quiet_msk_finish(&rcu_preempt_state, flags); @@ -205,9 +218,10 @@ static void rcu_read_unlock_special(struct task_struct *t) /* Report up the rest of the hierarchy. */ mask = rnp->grpmask; spin_unlock_irqrestore(&rnp->lock, flags); - rnp = rnp->parent; - spin_lock_irqsave(&rnp->lock, flags); - cpu_quiet_msk(mask, &rcu_preempt_state, rnp, flags); + rnp_p = rnp->parent; + spin_lock_irqsave(&rnp_p->lock, flags); + WARN_ON_ONCE(rnp->qsmask); + cpu_quiet_msk(mask, &rcu_preempt_state, rnp_p, flags); return; } spin_unlock(&rnp->lock); @@ -259,6 +273,19 @@ static void rcu_print_task_stall(struct rcu_node *rnp) #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ /* + * Check that the list of blocked tasks for the newly completed grace + * period is in fact empty. It is a serious bug to complete a grace + * period that still has RCU readers blocked! This function must be + * invoked -before- updating this rnp's ->gpnum, and the rnp's ->lock + * must be held by the caller. + */ +static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) +{ + WARN_ON_ONCE(!list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1])); + WARN_ON_ONCE(rnp->qsmask); +} + +/* * Check for preempted RCU readers for the specified rcu_node structure. * If the caller needs a reliable answer, it must hold the rcu_node's * >lock. @@ -280,7 +307,8 @@ static int rcu_preempted_readers(struct rcu_node *rnp) * The caller must hold rnp->lock with irqs disabled. */ static void rcu_preempt_offline_tasks(struct rcu_state *rsp, - struct rcu_node *rnp) + struct rcu_node *rnp, + struct rcu_data *rdp) { int i; struct list_head *lp; @@ -292,6 +320,9 @@ static void rcu_preempt_offline_tasks(struct rcu_state *rsp, WARN_ONCE(1, "Last CPU thought to be offlined?"); return; /* Shouldn't happen: at least one CPU online. */ } + WARN_ON_ONCE(rnp != rdp->mynode && + (!list_empty(&rnp->blocked_tasks[0]) || + !list_empty(&rnp->blocked_tasks[1]))); /* * Move tasks up to root rcu_node. Rely on the fact that the @@ -335,20 +366,12 @@ static void rcu_preempt_check_callbacks(int cpu) struct task_struct *t = current; if (t->rcu_read_lock_nesting == 0) { - t->rcu_read_unlock_special &= - ~(RCU_READ_UNLOCK_NEED_QS | RCU_READ_UNLOCK_GOT_QS); - rcu_preempt_qs_record(cpu); + t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; + rcu_preempt_qs(cpu); return; } - if (per_cpu(rcu_preempt_data, cpu).qs_pending) { - if (t->rcu_read_unlock_special & RCU_READ_UNLOCK_GOT_QS) { - rcu_preempt_qs_record(cpu); - t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_GOT_QS; - } else if (!(t->rcu_read_unlock_special & - RCU_READ_UNLOCK_NEED_QS)) { - t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS; - } - } + if (per_cpu(rcu_preempt_data, cpu).qs_pending) + t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS; } /* @@ -434,7 +457,7 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed); * Because preemptable RCU does not exist, we never have to check for * CPUs being in quiescent states. */ -static void rcu_preempt_qs(int cpu) +static void rcu_preempt_note_context_switch(int cpu) { } @@ -451,6 +474,16 @@ static void rcu_print_task_stall(struct rcu_node *rnp) #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ /* + * Because there is no preemptable RCU, there can be no readers blocked, + * so there is no need to check for blocked tasks. So check only for + * bogus qsmask values. + */ +static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) +{ + WARN_ON_ONCE(rnp->qsmask); +} + +/* * Because preemptable RCU does not exist, there are never any preempted * RCU readers. */ @@ -466,7 +499,8 @@ static int rcu_preempted_readers(struct rcu_node *rnp) * tasks that were blocked within RCU read-side critical sections. */ static void rcu_preempt_offline_tasks(struct rcu_state *rsp, - struct rcu_node *rnp) + struct rcu_node *rnp, + struct rcu_data *rdp) { } diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 0ea1bff..c89f5e9 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -20,7 +20,7 @@ * Papers: http://www.rdrop.com/users/paulmck/RCU * * For detailed explanation of Read-Copy Update mechanism see - - * Documentation/RCU + * Documentation/RCU * */ #include <linux/types.h> diff --git a/kernel/res_counter.c b/kernel/res_counter.c index e1338f0..88faec2 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c @@ -19,6 +19,7 @@ void res_counter_init(struct res_counter *counter, struct res_counter *parent) { spin_lock_init(&counter->lock); counter->limit = RESOURCE_MAX; + counter->soft_limit = RESOURCE_MAX; counter->parent = parent; } @@ -36,17 +37,27 @@ int res_counter_charge_locked(struct res_counter *counter, unsigned long val) } int res_counter_charge(struct res_counter *counter, unsigned long val, - struct res_counter **limit_fail_at) + struct res_counter **limit_fail_at, + struct res_counter **soft_limit_fail_at) { int ret; unsigned long flags; struct res_counter *c, *u; *limit_fail_at = NULL; + if (soft_limit_fail_at) + *soft_limit_fail_at = NULL; local_irq_save(flags); for (c = counter; c != NULL; c = c->parent) { spin_lock(&c->lock); ret = res_counter_charge_locked(c, val); + /* + * With soft limits, we return the highest ancestor + * that exceeds its soft limit + */ + if (soft_limit_fail_at && + !res_counter_soft_limit_check_locked(c)) + *soft_limit_fail_at = c; spin_unlock(&c->lock); if (ret < 0) { *limit_fail_at = c; @@ -74,7 +85,8 @@ void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val) counter->usage -= val; } -void res_counter_uncharge(struct res_counter *counter, unsigned long val) +void res_counter_uncharge(struct res_counter *counter, unsigned long val, + bool *was_soft_limit_excess) { unsigned long flags; struct res_counter *c; @@ -82,6 +94,9 @@ void res_counter_uncharge(struct res_counter *counter, unsigned long val) local_irq_save(flags); for (c = counter; c != NULL; c = c->parent) { spin_lock(&c->lock); + if (was_soft_limit_excess) + *was_soft_limit_excess = + !res_counter_soft_limit_check_locked(c); res_counter_uncharge_locked(c, val); spin_unlock(&c->lock); } @@ -101,6 +116,8 @@ res_counter_member(struct res_counter *counter, int member) return &counter->limit; case RES_FAILCNT: return &counter->failcnt; + case RES_SOFT_LIMIT: + return &counter->soft_limit; }; BUG(); diff --git a/kernel/resource.c b/kernel/resource.c index 78b0872..fb11a58 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -223,13 +223,13 @@ int release_resource(struct resource *old) EXPORT_SYMBOL(release_resource); -#if defined(CONFIG_MEMORY_HOTPLUG) && !defined(CONFIG_ARCH_HAS_WALK_MEMORY) +#if !defined(CONFIG_ARCH_HAS_WALK_MEMORY) /* * Finds the lowest memory reosurce exists within [res->start.res->end) - * the caller must specify res->start, res->end, res->flags. + * the caller must specify res->start, res->end, res->flags and "name". * If found, returns 0, res is overwritten, if not found, returns -1. */ -static int find_next_system_ram(struct resource *res) +static int find_next_system_ram(struct resource *res, char *name) { resource_size_t start, end; struct resource *p; @@ -245,6 +245,8 @@ static int find_next_system_ram(struct resource *res) /* system ram is just marked as IORESOURCE_MEM */ if (p->flags != res->flags) continue; + if (name && strcmp(p->name, name)) + continue; if (p->start > end) { p = NULL; break; @@ -262,19 +264,26 @@ static int find_next_system_ram(struct resource *res) res->end = p->end; return 0; } -int -walk_memory_resource(unsigned long start_pfn, unsigned long nr_pages, void *arg, - int (*func)(unsigned long, unsigned long, void *)) + +/* + * This function calls callback against all memory range of "System RAM" + * which are marked as IORESOURCE_MEM and IORESOUCE_BUSY. + * Now, this function is only for "System RAM". + */ +int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, + void *arg, int (*func)(unsigned long, unsigned long, void *)) { struct resource res; unsigned long pfn, len; u64 orig_end; int ret = -1; + res.start = (u64) start_pfn << PAGE_SHIFT; res.end = ((u64)(start_pfn + nr_pages) << PAGE_SHIFT) - 1; res.flags = IORESOURCE_MEM | IORESOURCE_BUSY; orig_end = res.end; - while ((res.start < res.end) && (find_next_system_ram(&res) >= 0)) { + while ((res.start < res.end) && + (find_next_system_ram(&res, "System RAM") >= 0)) { pfn = (unsigned long)(res.start >> PAGE_SHIFT); len = (unsigned long)((res.end + 1 - res.start) >> PAGE_SHIFT); ret = (*func)(pfn, len, arg); diff --git a/kernel/sched.c b/kernel/sched.c index faf4d46..ee61f45 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -39,7 +39,7 @@ #include <linux/completion.h> #include <linux/kernel_stat.h> #include <linux/debug_locks.h> -#include <linux/perf_counter.h> +#include <linux/perf_event.h> #include <linux/security.h> #include <linux/notifier.h> #include <linux/profile.h> @@ -681,15 +681,9 @@ inline void update_rq_clock(struct rq *rq) * This interface allows printk to be called with the runqueue lock * held and know whether or not it is OK to wake up the klogd. */ -int runqueue_is_locked(void) +int runqueue_is_locked(int cpu) { - int cpu = get_cpu(); - struct rq *rq = cpu_rq(cpu); - int ret; - - ret = spin_is_locked(&rq->lock); - put_cpu(); - return ret; + return spin_is_locked(&cpu_rq(cpu)->lock); } /* @@ -2059,7 +2053,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) if (task_hot(p, old_rq->clock, NULL)) schedstat_inc(p, se.nr_forced2_migrations); #endif - perf_swcounter_event(PERF_COUNT_SW_CPU_MIGRATIONS, + perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0); } p->se.vruntime -= old_cfsrq->min_vruntime - @@ -2724,7 +2718,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) */ prev_state = prev->state; finish_arch_switch(prev); - perf_counter_task_sched_in(current, cpu_of(rq)); + perf_event_task_sched_in(current, cpu_of(rq)); finish_lock_switch(rq, prev); fire_sched_in_preempt_notifiers(current); @@ -2910,6 +2904,19 @@ unsigned long nr_iowait(void) return sum; } +unsigned long nr_iowait_cpu(void) +{ + struct rq *this = this_rq(); + return atomic_read(&this->nr_iowait); +} + +unsigned long this_cpu_load(void) +{ + struct rq *this = this_rq(); + return this->cpu_load[0]; +} + + /* Variables and functions for calc_load */ static atomic_long_t calc_load_tasks; static unsigned long calc_load_update; @@ -5085,17 +5092,16 @@ void account_idle_time(cputime_t cputime) */ void account_process_tick(struct task_struct *p, int user_tick) { - cputime_t one_jiffy = jiffies_to_cputime(1); - cputime_t one_jiffy_scaled = cputime_to_scaled(one_jiffy); + cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); struct rq *rq = this_rq(); if (user_tick) - account_user_time(p, one_jiffy, one_jiffy_scaled); + account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) - account_system_time(p, HARDIRQ_OFFSET, one_jiffy, + account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy, one_jiffy_scaled); else - account_idle_time(one_jiffy); + account_idle_time(cputime_one_jiffy); } /* @@ -5199,7 +5205,7 @@ void scheduler_tick(void) curr->sched_class->task_tick(rq, curr, 0); spin_unlock(&rq->lock); - perf_counter_task_tick(curr, cpu); + perf_event_task_tick(curr, cpu); #ifdef CONFIG_SMP rq->idle_at_tick = idle_cpu(cpu); @@ -5415,7 +5421,7 @@ need_resched_nonpreemptible: if (likely(prev != next)) { sched_info_switch(prev, next); - perf_counter_task_sched_out(prev, next, cpu); + perf_event_task_sched_out(prev, next, cpu); rq->nr_switches++; rq->curr = next; @@ -6825,23 +6831,8 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, if (retval) goto out_unlock; - /* - * Time slice is 0 for SCHED_FIFO tasks and for SCHED_OTHER - * tasks that are on an otherwise idle runqueue: - */ - time_slice = 0; - if (p->policy == SCHED_RR) { - time_slice = DEF_TIMESLICE; - } else if (p->policy != SCHED_FIFO) { - struct sched_entity *se = &p->se; - unsigned long flags; - struct rq *rq; + time_slice = p->sched_class->get_rr_interval(p); - rq = task_rq_lock(p, &flags); - if (rq->cfs.load.weight) - time_slice = NS_TO_JIFFIES(sched_slice(&rq->cfs, se)); - task_rq_unlock(rq, &flags); - } read_unlock(&tasklist_lock); jiffies_to_timespec(time_slice, &t); retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; @@ -7692,7 +7683,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) /* * Register at high priority so that task migration (migrate_all_tasks) * happens before everything else. This has to be lower priority than - * the notifier in the perf_counter subsystem, though. + * the notifier in the perf_event subsystem, though. */ static struct notifier_block __cpuinitdata migration_notifier = { .notifier_call = migration_call, @@ -9171,6 +9162,7 @@ void __init sched_init_smp(void) cpumask_var_t non_isolated_cpus; alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); + alloc_cpumask_var(&fallback_doms, GFP_KERNEL); #if defined(CONFIG_NUMA) sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **), @@ -9202,7 +9194,6 @@ void __init sched_init_smp(void) sched_init_granularity(); free_cpumask_var(non_isolated_cpus); - alloc_cpumask_var(&fallback_doms, GFP_KERNEL); init_sched_rt_class(); } #else @@ -9549,7 +9540,7 @@ void __init sched_init(void) alloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); #endif /* SMP */ - perf_counter_init(); + perf_event_init(); scheduler_running = 1; } @@ -10321,7 +10312,7 @@ static int sched_rt_global_constraints(void) #endif /* CONFIG_RT_GROUP_SCHED */ int sched_rt_handler(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, size_t *lenp, + void __user *buffer, size_t *lenp, loff_t *ppos) { int ret; @@ -10332,7 +10323,7 @@ int sched_rt_handler(struct ctl_table *table, int write, old_period = sysctl_sched_rt_period; old_runtime = sysctl_sched_rt_runtime; - ret = proc_dointvec(table, write, filp, buffer, lenp, ppos); + ret = proc_dointvec(table, write, buffer, lenp, ppos); if (!ret && write) { ret = sched_rt_global_constraints(); @@ -10386,8 +10377,7 @@ cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) } static int -cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct task_struct *tsk) +cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk) { #ifdef CONFIG_RT_GROUP_SCHED if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk)) @@ -10397,15 +10387,45 @@ cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, if (tsk->sched_class != &fair_sched_class) return -EINVAL; #endif + return 0; +} +static int +cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, + struct task_struct *tsk, bool threadgroup) +{ + int retval = cpu_cgroup_can_attach_task(cgrp, tsk); + if (retval) + return retval; + if (threadgroup) { + struct task_struct *c; + rcu_read_lock(); + list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { + retval = cpu_cgroup_can_attach_task(cgrp, c); + if (retval) { + rcu_read_unlock(); + return retval; + } + } + rcu_read_unlock(); + } return 0; } static void cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup *old_cont, struct task_struct *tsk) + struct cgroup *old_cont, struct task_struct *tsk, + bool threadgroup) { sched_move_task(tsk); + if (threadgroup) { + struct task_struct *c; + rcu_read_lock(); + list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { + sched_move_task(c); + } + rcu_read_unlock(); + } } #ifdef CONFIG_FAIR_GROUP_SCHED diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 990b188..4e777b4 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -384,10 +384,10 @@ static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) #ifdef CONFIG_SCHED_DEBUG int sched_nr_latency_handler(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, size_t *lenp, + void __user *buffer, size_t *lenp, loff_t *ppos) { - int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); + int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (ret || !write) return ret; @@ -710,31 +710,28 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) if (initial && sched_feat(START_DEBIT)) vruntime += sched_vslice(cfs_rq, se); - if (!initial) { - /* sleeps upto a single latency don't count. */ - if (sched_feat(FAIR_SLEEPERS)) { - unsigned long thresh = sysctl_sched_latency; + /* sleeps up to a single latency don't count. */ + if (!initial && sched_feat(FAIR_SLEEPERS)) { + unsigned long thresh = sysctl_sched_latency; - /* - * Convert the sleeper threshold into virtual time. - * SCHED_IDLE is a special sub-class. We care about - * fairness only relative to other SCHED_IDLE tasks, - * all of which have the same weight. - */ - if (sched_feat(NORMALIZED_SLEEPER) && - (!entity_is_task(se) || - task_of(se)->policy != SCHED_IDLE)) - thresh = calc_delta_fair(thresh, se); + /* + * Convert the sleeper threshold into virtual time. + * SCHED_IDLE is a special sub-class. We care about + * fairness only relative to other SCHED_IDLE tasks, + * all of which have the same weight. + */ + if (sched_feat(NORMALIZED_SLEEPER) && (!entity_is_task(se) || + task_of(se)->policy != SCHED_IDLE)) + thresh = calc_delta_fair(thresh, se); - /* - * Halve their sleep time's effect, to allow - * for a gentler effect of sleepers: - */ - if (sched_feat(GENTLE_FAIR_SLEEPERS)) - thresh >>= 1; + /* + * Halve their sleep time's effect, to allow + * for a gentler effect of sleepers: + */ + if (sched_feat(GENTLE_FAIR_SLEEPERS)) + thresh >>= 1; - vruntime -= thresh; - } + vruntime -= thresh; } /* ensure we never gain time by being placed backwards. */ @@ -1343,7 +1340,8 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag int sync = wake_flags & WF_SYNC; if (sd_flag & SD_BALANCE_WAKE) { - if (sched_feat(AFFINE_WAKEUPS)) + if (sched_feat(AFFINE_WAKEUPS) && + cpumask_test_cpu(cpu, &p->cpus_allowed)) want_affine = 1; new_cpu = prev_cpu; } @@ -1941,6 +1939,25 @@ static void moved_group_fair(struct task_struct *p) } #endif +unsigned int get_rr_interval_fair(struct task_struct *task) +{ + struct sched_entity *se = &task->se; + unsigned long flags; + struct rq *rq; + unsigned int rr_interval = 0; + + /* + * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise + * idle runqueue: + */ + rq = task_rq_lock(task, &flags); + if (rq->cfs.load.weight) + rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se)); + task_rq_unlock(rq, &flags); + + return rr_interval; +} + /* * All the scheduling class methods: */ @@ -1969,6 +1986,8 @@ static const struct sched_class fair_sched_class = { .prio_changed = prio_changed_fair, .switched_to = switched_to_fair, + .get_rr_interval = get_rr_interval_fair, + #ifdef CONFIG_FAIR_GROUP_SCHED .moved_group = moved_group_fair, #endif diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index a8b448a..b133a28 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c @@ -97,6 +97,11 @@ static void prio_changed_idle(struct rq *rq, struct task_struct *p, check_preempt_curr(rq, p, 0); } +unsigned int get_rr_interval_idle(struct task_struct *task) +{ + return 0; +} + /* * Simple, special scheduling class for the per-CPU idle tasks: */ @@ -122,6 +127,8 @@ static const struct sched_class idle_sched_class = { .set_curr_task = set_curr_task_idle, .task_tick = task_tick_idle, + .get_rr_interval = get_rr_interval_idle, + .prio_changed = prio_changed_idle, .switched_to = switched_to_idle, diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 13de712..a4d790c 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -1734,6 +1734,17 @@ static void set_curr_task_rt(struct rq *rq) dequeue_pushable_task(rq, p); } +unsigned int get_rr_interval_rt(struct task_struct *task) +{ + /* + * Time slice is 0 for SCHED_FIFO tasks + */ + if (task->policy == SCHED_RR) + return DEF_TIMESLICE; + else + return 0; +} + static const struct sched_class rt_sched_class = { .next = &fair_sched_class, .enqueue_task = enqueue_task_rt, @@ -1762,6 +1773,8 @@ static const struct sched_class rt_sched_class = { .set_curr_task = set_curr_task_rt, .task_tick = task_tick_rt, + .get_rr_interval = get_rr_interval_rt, + .prio_changed = prio_changed_rt, .switched_to = switched_to_rt, }; diff --git a/kernel/signal.c b/kernel/signal.c index 64c5dee..6705320 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -705,7 +705,7 @@ static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns) if (why) { /* - * The first thread which returns from finish_stop() + * The first thread which returns from do_signal_stop() * will take ->siglock, notice SIGNAL_CLD_MASK, and * notify its parent. See get_signal_to_deliver(). */ @@ -971,6 +971,20 @@ specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t) return send_signal(sig, info, t, 0); } +int do_send_sig_info(int sig, struct siginfo *info, struct task_struct *p, + bool group) +{ + unsigned long flags; + int ret = -ESRCH; + + if (lock_task_sighand(p, &flags)) { + ret = send_signal(sig, info, p, group); + unlock_task_sighand(p, &flags); + } + + return ret; +} + /* * Force a signal that the process can't ignore: if necessary * we unblock the signal and change any SIG_IGN to SIG_DFL. @@ -1036,12 +1050,6 @@ void zap_other_threads(struct task_struct *p) } } -int __fatal_signal_pending(struct task_struct *tsk) -{ - return sigismember(&tsk->pending.signal, SIGKILL); -} -EXPORT_SYMBOL(__fatal_signal_pending); - struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long *flags) { struct sighand_struct *sighand; @@ -1068,18 +1076,10 @@ struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long */ int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) { - unsigned long flags; - int ret; + int ret = check_kill_permission(sig, info, p); - ret = check_kill_permission(sig, info, p); - - if (!ret && sig) { - ret = -ESRCH; - if (lock_task_sighand(p, &flags)) { - ret = __group_send_sig_info(sig, info, p); - unlock_task_sighand(p, &flags); - } - } + if (!ret && sig) + ret = do_send_sig_info(sig, info, p, true); return ret; } @@ -1224,15 +1224,9 @@ static int kill_something_info(int sig, struct siginfo *info, pid_t pid) * These are for backward compatibility with the rest of the kernel source. */ -/* - * The caller must ensure the task can't exit. - */ int send_sig_info(int sig, struct siginfo *info, struct task_struct *p) { - int ret; - unsigned long flags; - /* * Make sure legacy kernel users don't send in bad values * (normal paths check this in check_kill_permission). @@ -1240,10 +1234,7 @@ send_sig_info(int sig, struct siginfo *info, struct task_struct *p) if (!valid_signal(sig)) return -EINVAL; - spin_lock_irqsave(&p->sighand->siglock, flags); - ret = specific_send_sig_info(sig, info, p); - spin_unlock_irqrestore(&p->sighand->siglock, flags); - return ret; + return do_send_sig_info(sig, info, p, false); } #define __si_special(priv) \ @@ -1383,15 +1374,6 @@ ret: } /* - * Wake up any threads in the parent blocked in wait* syscalls. - */ -static inline void __wake_up_parent(struct task_struct *p, - struct task_struct *parent) -{ - wake_up_interruptible_sync(&parent->signal->wait_chldexit); -} - -/* * Let a parent know about the death of a child. * For a stopped/continued status change, use do_notify_parent_cldstop instead. * @@ -1673,29 +1655,6 @@ void ptrace_notify(int exit_code) spin_unlock_irq(¤t->sighand->siglock); } -static void -finish_stop(int stop_count) -{ - /* - * If there are no other threads in the group, or if there is - * a group stop in progress and we are the last to stop, - * report to the parent. When ptraced, every thread reports itself. - */ - if (tracehook_notify_jctl(stop_count == 0, CLD_STOPPED)) { - read_lock(&tasklist_lock); - do_notify_parent_cldstop(current, CLD_STOPPED); - read_unlock(&tasklist_lock); - } - - do { - schedule(); - } while (try_to_freeze()); - /* - * Now we don't run again until continued. - */ - current->exit_code = 0; -} - /* * This performs the stopping for SIGSTOP and other stop signals. * We have to stop all threads in the thread group. @@ -1705,15 +1664,9 @@ finish_stop(int stop_count) static int do_signal_stop(int signr) { struct signal_struct *sig = current->signal; - int stop_count; + int notify; - if (sig->group_stop_count > 0) { - /* - * There is a group stop in progress. We don't need to - * start another one. - */ - stop_count = --sig->group_stop_count; - } else { + if (!sig->group_stop_count) { struct task_struct *t; if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED) || @@ -1725,7 +1678,7 @@ static int do_signal_stop(int signr) */ sig->group_exit_code = signr; - stop_count = 0; + sig->group_stop_count = 1; for (t = next_thread(current); t != current; t = next_thread(t)) /* * Setting state to TASK_STOPPED for a group @@ -1734,19 +1687,44 @@ static int do_signal_stop(int signr) */ if (!(t->flags & PF_EXITING) && !task_is_stopped_or_traced(t)) { - stop_count++; + sig->group_stop_count++; signal_wake_up(t, 0); } - sig->group_stop_count = stop_count; } + /* + * If there are no other threads in the group, or if there is + * a group stop in progress and we are the last to stop, report + * to the parent. When ptraced, every thread reports itself. + */ + notify = sig->group_stop_count == 1 ? CLD_STOPPED : 0; + notify = tracehook_notify_jctl(notify, CLD_STOPPED); + /* + * tracehook_notify_jctl() can drop and reacquire siglock, so + * we keep ->group_stop_count != 0 before the call. If SIGCONT + * or SIGKILL comes in between ->group_stop_count == 0. + */ + if (sig->group_stop_count) { + if (!--sig->group_stop_count) + sig->flags = SIGNAL_STOP_STOPPED; + current->exit_code = sig->group_exit_code; + __set_current_state(TASK_STOPPED); + } + spin_unlock_irq(¤t->sighand->siglock); - if (stop_count == 0) - sig->flags = SIGNAL_STOP_STOPPED; - current->exit_code = sig->group_exit_code; - __set_current_state(TASK_STOPPED); + if (notify) { + read_lock(&tasklist_lock); + do_notify_parent_cldstop(current, notify); + read_unlock(&tasklist_lock); + } + + /* Now we don't run again until woken by SIGCONT or SIGKILL */ + do { + schedule(); + } while (try_to_freeze()); + + tracehook_finish_jctl(); + current->exit_code = 0; - spin_unlock_irq(¤t->sighand->siglock); - finish_stop(stop_count); return 1; } @@ -1815,14 +1793,15 @@ relock: int why = (signal->flags & SIGNAL_STOP_CONTINUED) ? CLD_CONTINUED : CLD_STOPPED; signal->flags &= ~SIGNAL_CLD_MASK; - spin_unlock_irq(&sighand->siglock); - if (unlikely(!tracehook_notify_jctl(1, why))) - goto relock; + why = tracehook_notify_jctl(why, CLD_CONTINUED); + spin_unlock_irq(&sighand->siglock); - read_lock(&tasklist_lock); - do_notify_parent_cldstop(current->group_leader, why); - read_unlock(&tasklist_lock); + if (why) { + read_lock(&tasklist_lock); + do_notify_parent_cldstop(current->group_leader, why); + read_unlock(&tasklist_lock); + } goto relock; } @@ -1987,14 +1966,14 @@ void exit_signals(struct task_struct *tsk) if (unlikely(tsk->signal->group_stop_count) && !--tsk->signal->group_stop_count) { tsk->signal->flags = SIGNAL_STOP_STOPPED; - group_stop = 1; + group_stop = tracehook_notify_jctl(CLD_STOPPED, CLD_STOPPED); } out: spin_unlock_irq(&tsk->sighand->siglock); - if (unlikely(group_stop) && tracehook_notify_jctl(1, CLD_STOPPED)) { + if (unlikely(group_stop)) { read_lock(&tasklist_lock); - do_notify_parent_cldstop(tsk, CLD_STOPPED); + do_notify_parent_cldstop(tsk, group_stop); read_unlock(&tasklist_lock); } } @@ -2290,7 +2269,6 @@ static int do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info) { struct task_struct *p; - unsigned long flags; int error = -ESRCH; rcu_read_lock(); @@ -2300,14 +2278,16 @@ do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info) /* * The null signal is a permissions and process existence * probe. No signal is actually delivered. - * - * If lock_task_sighand() fails we pretend the task dies - * after receiving the signal. The window is tiny, and the - * signal is private anyway. */ - if (!error && sig && lock_task_sighand(p, &flags)) { - error = specific_send_sig_info(sig, info, p); - unlock_task_sighand(p, &flags); + if (!error && sig) { + error = do_send_sig_info(sig, info, p, false); + /* + * If lock_task_sighand() failed we pretend the task + * dies after receiving the signal. The window is tiny, + * and the signal is private anyway. + */ + if (unlikely(error == -ESRCH)) + error = 0; } } rcu_read_unlock(); diff --git a/kernel/slow-work.c b/kernel/slow-work.c index 09d7519..0d31135 100644 --- a/kernel/slow-work.c +++ b/kernel/slow-work.c @@ -26,10 +26,10 @@ static void slow_work_cull_timeout(unsigned long); static void slow_work_oom_timeout(unsigned long); #ifdef CONFIG_SYSCTL -static int slow_work_min_threads_sysctl(struct ctl_table *, int, struct file *, +static int slow_work_min_threads_sysctl(struct ctl_table *, int, void __user *, size_t *, loff_t *); -static int slow_work_max_threads_sysctl(struct ctl_table *, int , struct file *, +static int slow_work_max_threads_sysctl(struct ctl_table *, int , void __user *, size_t *, loff_t *); #endif @@ -493,10 +493,10 @@ static void slow_work_oom_timeout(unsigned long data) * Handle adjustment of the minimum number of threads */ static int slow_work_min_threads_sysctl(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, + void __user *buffer, size_t *lenp, loff_t *ppos) { - int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); + int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); int n; if (ret == 0) { @@ -521,10 +521,10 @@ static int slow_work_min_threads_sysctl(struct ctl_table *table, int write, * Handle adjustment of the maximum number of threads */ static int slow_work_max_threads_sysctl(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, + void __user *buffer, size_t *lenp, loff_t *ppos) { - int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); + int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); int n; if (ret == 0) { diff --git a/kernel/smp.c b/kernel/smp.c index 8e21850..c9d1c78 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -29,8 +29,7 @@ enum { struct call_function_data { struct call_single_data csd; - spinlock_t lock; - unsigned int refs; + atomic_t refs; cpumask_var_t cpumask; }; @@ -39,9 +38,7 @@ struct call_single_queue { spinlock_t lock; }; -static DEFINE_PER_CPU(struct call_function_data, cfd_data) = { - .lock = __SPIN_LOCK_UNLOCKED(cfd_data.lock), -}; +static DEFINE_PER_CPU(struct call_function_data, cfd_data); static int hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) @@ -196,25 +193,18 @@ void generic_smp_call_function_interrupt(void) list_for_each_entry_rcu(data, &call_function.queue, csd.list) { int refs; - spin_lock(&data->lock); - if (!cpumask_test_cpu(cpu, data->cpumask)) { - spin_unlock(&data->lock); + if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) continue; - } - cpumask_clear_cpu(cpu, data->cpumask); - spin_unlock(&data->lock); data->csd.func(data->csd.info); - spin_lock(&data->lock); - WARN_ON(data->refs == 0); - refs = --data->refs; + refs = atomic_dec_return(&data->refs); + WARN_ON(refs < 0); if (!refs) { spin_lock(&call_function.lock); list_del_rcu(&data->csd.list); spin_unlock(&call_function.lock); } - spin_unlock(&data->lock); if (refs) continue; @@ -357,13 +347,6 @@ void __smp_call_function_single(int cpu, struct call_single_data *data, generic_exec_single(cpu, data, wait); } -/* Deprecated: shim for archs using old arch_send_call_function_ipi API. */ - -#ifndef arch_send_call_function_ipi_mask -# define arch_send_call_function_ipi_mask(maskp) \ - arch_send_call_function_ipi(*(maskp)) -#endif - /** * smp_call_function_many(): Run a function on a set of other CPUs. * @mask: The set of cpus to run on (only runs on online subset). @@ -419,23 +402,20 @@ void smp_call_function_many(const struct cpumask *mask, data = &__get_cpu_var(cfd_data); csd_lock(&data->csd); - spin_lock_irqsave(&data->lock, flags); data->csd.func = func; data->csd.info = info; cpumask_and(data->cpumask, mask, cpu_online_mask); cpumask_clear_cpu(this_cpu, data->cpumask); - data->refs = cpumask_weight(data->cpumask); + atomic_set(&data->refs, cpumask_weight(data->cpumask)); - spin_lock(&call_function.lock); + spin_lock_irqsave(&call_function.lock, flags); /* * Place entry at the _HEAD_ of the list, so that any cpu still * observing the entry in generic_smp_call_function_interrupt() * will not miss any other list entries: */ list_add_rcu(&data->csd.list, &call_function.queue); - spin_unlock(&call_function.lock); - - spin_unlock_irqrestore(&data->lock, flags); + spin_unlock_irqrestore(&call_function.lock, flags); /* * Make the list addition visible before sending the ipi. diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 88796c3..81324d1 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -90,11 +90,11 @@ void touch_all_softlockup_watchdogs(void) EXPORT_SYMBOL(touch_all_softlockup_watchdogs); int proc_dosoftlockup_thresh(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, + void __user *buffer, size_t *lenp, loff_t *ppos) { touch_all_softlockup_watchdogs(); - return proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); + return proc_dointvec_minmax(table, write, buffer, lenp, ppos); } /* diff --git a/kernel/sys.c b/kernel/sys.c index b3f1097..255475d 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -14,7 +14,7 @@ #include <linux/prctl.h> #include <linux/highuid.h> #include <linux/fs.h> -#include <linux/perf_counter.h> +#include <linux/perf_event.h> #include <linux/resource.h> #include <linux/kernel.h> #include <linux/kexec.h> @@ -1338,6 +1338,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) unsigned long flags; cputime_t utime, stime; struct task_cputime cputime; + unsigned long maxrss = 0; memset((char *) r, 0, sizeof *r); utime = stime = cputime_zero; @@ -1346,6 +1347,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) utime = task_utime(current); stime = task_stime(current); accumulate_thread_rusage(p, r); + maxrss = p->signal->maxrss; goto out; } @@ -1363,6 +1365,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) r->ru_majflt = p->signal->cmaj_flt; r->ru_inblock = p->signal->cinblock; r->ru_oublock = p->signal->coublock; + maxrss = p->signal->cmaxrss; if (who == RUSAGE_CHILDREN) break; @@ -1377,6 +1380,8 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) r->ru_majflt += p->signal->maj_flt; r->ru_inblock += p->signal->inblock; r->ru_oublock += p->signal->oublock; + if (maxrss < p->signal->maxrss) + maxrss = p->signal->maxrss; t = p; do { accumulate_thread_rusage(t, r); @@ -1392,6 +1397,15 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) out: cputime_to_timeval(utime, &r->ru_utime); cputime_to_timeval(stime, &r->ru_stime); + + if (who != RUSAGE_CHILDREN) { + struct mm_struct *mm = get_task_mm(p); + if (mm) { + setmax_mm_hiwater_rss(&maxrss, mm); + mmput(mm); + } + } + r->ru_maxrss = maxrss * (PAGE_SIZE / 1024); /* convert pages to KBs */ } int getrusage(struct task_struct *p, int who, struct rusage __user *ru) @@ -1511,11 +1525,11 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, case PR_SET_TSC: error = SET_TSC_CTL(arg2); break; - case PR_TASK_PERF_COUNTERS_DISABLE: - error = perf_counter_task_disable(); + case PR_TASK_PERF_EVENTS_DISABLE: + error = perf_event_task_disable(); break; - case PR_TASK_PERF_COUNTERS_ENABLE: - error = perf_counter_task_enable(); + case PR_TASK_PERF_EVENTS_ENABLE: + error = perf_event_task_enable(); break; case PR_GET_TIMERSLACK: error = current->timer_slack_ns; @@ -1528,6 +1542,28 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, current->timer_slack_ns = arg2; error = 0; break; + case PR_MCE_KILL: + if (arg4 | arg5) + return -EINVAL; + switch (arg2) { + case 0: + if (arg3 != 0) + return -EINVAL; + current->flags &= ~PF_MCE_PROCESS; + break; + case 1: + current->flags |= PF_MCE_PROCESS; + if (arg3 != 0) + current->flags |= PF_MCE_EARLY; + else + current->flags &= ~PF_MCE_EARLY; + break; + default: + return -EINVAL; + } + error = 0; + break; + default: error = -EINVAL; break; diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 68320f6..515bc23 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -177,4 +177,4 @@ cond_syscall(sys_eventfd); cond_syscall(sys_eventfd2); /* performance counters: */ -cond_syscall(sys_perf_counter_open); +cond_syscall(sys_perf_event_open); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 1a631ba..0d949c5 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -26,7 +26,6 @@ #include <linux/proc_fs.h> #include <linux/security.h> #include <linux/ctype.h> -#include <linux/utsname.h> #include <linux/kmemcheck.h> #include <linux/smp_lock.h> #include <linux/fs.h> @@ -50,7 +49,7 @@ #include <linux/reboot.h> #include <linux/ftrace.h> #include <linux/slow-work.h> -#include <linux/perf_counter.h> +#include <linux/perf_event.h> #include <asm/uaccess.h> #include <asm/processor.h> @@ -77,6 +76,7 @@ extern int max_threads; extern int core_uses_pid; extern int suid_dumpable; extern char core_pattern[]; +extern unsigned int core_pipe_limit; extern int pid_max; extern int min_free_kbytes; extern int pid_max_min, pid_max_max; @@ -106,6 +106,9 @@ static int __maybe_unused one = 1; static int __maybe_unused two = 2; static unsigned long one_ul = 1; static int one_hundred = 100; +#ifdef CONFIG_PRINTK +static int ten_thousand = 10000; +#endif /* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */ static unsigned long dirty_bytes_min = 2 * PAGE_SIZE; @@ -160,9 +163,9 @@ extern int max_lock_depth; #endif #ifdef CONFIG_PROC_SYSCTL -static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp, +static int proc_do_cad_pid(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); -static int proc_taint(struct ctl_table *table, int write, struct file *filp, +static int proc_taint(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); #endif @@ -421,6 +424,14 @@ static struct ctl_table kern_table[] = { .proc_handler = &proc_dostring, .strategy = &sysctl_string, }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "core_pipe_limit", + .data = &core_pipe_limit, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, #ifdef CONFIG_PROC_SYSCTL { .procname = "tainted", @@ -722,6 +733,17 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "printk_delay", + .data = &printk_delay_msec, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &zero, + .extra2 = &ten_thousand, + }, #endif { .ctl_name = KERN_NGROUPS_MAX, @@ -964,28 +986,28 @@ static struct ctl_table kern_table[] = { .child = slow_work_sysctls, }, #endif -#ifdef CONFIG_PERF_COUNTERS +#ifdef CONFIG_PERF_EVENTS { .ctl_name = CTL_UNNUMBERED, - .procname = "perf_counter_paranoid", - .data = &sysctl_perf_counter_paranoid, - .maxlen = sizeof(sysctl_perf_counter_paranoid), + .procname = "perf_event_paranoid", + .data = &sysctl_perf_event_paranoid, + .maxlen = sizeof(sysctl_perf_event_paranoid), .mode = 0644, .proc_handler = &proc_dointvec, }, { .ctl_name = CTL_UNNUMBERED, - .procname = "perf_counter_mlock_kb", - .data = &sysctl_perf_counter_mlock, - .maxlen = sizeof(sysctl_perf_counter_mlock), + .procname = "perf_event_mlock_kb", + .data = &sysctl_perf_event_mlock, + .maxlen = sizeof(sysctl_perf_event_mlock), .mode = 0644, .proc_handler = &proc_dointvec, }, { .ctl_name = CTL_UNNUMBERED, - .procname = "perf_counter_max_sample_rate", - .data = &sysctl_perf_counter_sample_rate, - .maxlen = sizeof(sysctl_perf_counter_sample_rate), + .procname = "perf_event_max_sample_rate", + .data = &sysctl_perf_event_sample_rate, + .maxlen = sizeof(sysctl_perf_event_sample_rate), .mode = 0644, .proc_handler = &proc_dointvec, }, @@ -1376,6 +1398,31 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = &scan_unevictable_handler, }, +#ifdef CONFIG_MEMORY_FAILURE + { + .ctl_name = CTL_UNNUMBERED, + .procname = "memory_failure_early_kill", + .data = &sysctl_memory_failure_early_kill, + .maxlen = sizeof(sysctl_memory_failure_early_kill), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &zero, + .extra2 = &one, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "memory_failure_recovery", + .data = &sysctl_memory_failure_recovery, + .maxlen = sizeof(sysctl_memory_failure_recovery), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &zero, + .extra2 = &one, + }, +#endif + /* * NOTE: do not add new entries to this table unless you have read * Documentation/sysctl/ctl_unnumbered.txt @@ -2204,7 +2251,7 @@ void sysctl_head_put(struct ctl_table_header *head) #ifdef CONFIG_PROC_SYSCTL static int _proc_do_string(void* data, int maxlen, int write, - struct file *filp, void __user *buffer, + void __user *buffer, size_t *lenp, loff_t *ppos) { size_t len; @@ -2265,7 +2312,6 @@ static int _proc_do_string(void* data, int maxlen, int write, * proc_dostring - read a string sysctl * @table: the sysctl table * @write: %TRUE if this is a write to the sysctl file - * @filp: the file structure * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position @@ -2279,10 +2325,10 @@ static int _proc_do_string(void* data, int maxlen, int write, * * Returns 0 on success. */ -int proc_dostring(struct ctl_table *table, int write, struct file *filp, +int proc_dostring(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - return _proc_do_string(table->data, table->maxlen, write, filp, + return _proc_do_string(table->data, table->maxlen, write, buffer, lenp, ppos); } @@ -2307,7 +2353,7 @@ static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp, } static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, - int write, struct file *filp, void __user *buffer, + int write, void __user *buffer, size_t *lenp, loff_t *ppos, int (*conv)(int *negp, unsigned long *lvalp, int *valp, int write, void *data), @@ -2414,13 +2460,13 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, #undef TMPBUFLEN } -static int do_proc_dointvec(struct ctl_table *table, int write, struct file *filp, +static int do_proc_dointvec(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos, int (*conv)(int *negp, unsigned long *lvalp, int *valp, int write, void *data), void *data) { - return __do_proc_dointvec(table->data, table, write, filp, + return __do_proc_dointvec(table->data, table, write, buffer, lenp, ppos, conv, data); } @@ -2428,7 +2474,6 @@ static int do_proc_dointvec(struct ctl_table *table, int write, struct file *fil * proc_dointvec - read a vector of integers * @table: the sysctl table * @write: %TRUE if this is a write to the sysctl file - * @filp: the file structure * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position @@ -2438,10 +2483,10 @@ static int do_proc_dointvec(struct ctl_table *table, int write, struct file *fil * * Returns 0 on success. */ -int proc_dointvec(struct ctl_table *table, int write, struct file *filp, +int proc_dointvec(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, + return do_proc_dointvec(table,write,buffer,lenp,ppos, NULL,NULL); } @@ -2449,7 +2494,7 @@ int proc_dointvec(struct ctl_table *table, int write, struct file *filp, * Taint values can only be increased * This means we can safely use a temporary. */ -static int proc_taint(struct ctl_table *table, int write, struct file *filp, +static int proc_taint(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table t; @@ -2461,7 +2506,7 @@ static int proc_taint(struct ctl_table *table, int write, struct file *filp, t = *table; t.data = &tmptaint; - err = proc_doulongvec_minmax(&t, write, filp, buffer, lenp, ppos); + err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos); if (err < 0) return err; @@ -2513,7 +2558,6 @@ static int do_proc_dointvec_minmax_conv(int *negp, unsigned long *lvalp, * proc_dointvec_minmax - read a vector of integers with min/max values * @table: the sysctl table * @write: %TRUE if this is a write to the sysctl file - * @filp: the file structure * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position @@ -2526,19 +2570,18 @@ static int do_proc_dointvec_minmax_conv(int *negp, unsigned long *lvalp, * * Returns 0 on success. */ -int proc_dointvec_minmax(struct ctl_table *table, int write, struct file *filp, +int proc_dointvec_minmax(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { struct do_proc_dointvec_minmax_conv_param param = { .min = (int *) table->extra1, .max = (int *) table->extra2, }; - return do_proc_dointvec(table, write, filp, buffer, lenp, ppos, + return do_proc_dointvec(table, write, buffer, lenp, ppos, do_proc_dointvec_minmax_conv, ¶m); } static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write, - struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos, unsigned long convmul, @@ -2643,21 +2686,19 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int } static int do_proc_doulongvec_minmax(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos, unsigned long convmul, unsigned long convdiv) { return __do_proc_doulongvec_minmax(table->data, table, write, - filp, buffer, lenp, ppos, convmul, convdiv); + buffer, lenp, ppos, convmul, convdiv); } /** * proc_doulongvec_minmax - read a vector of long integers with min/max values * @table: the sysctl table * @write: %TRUE if this is a write to the sysctl file - * @filp: the file structure * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position @@ -2670,17 +2711,16 @@ static int do_proc_doulongvec_minmax(struct ctl_table *table, int write, * * Returns 0 on success. */ -int proc_doulongvec_minmax(struct ctl_table *table, int write, struct file *filp, +int proc_doulongvec_minmax(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - return do_proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos, 1l, 1l); + return do_proc_doulongvec_minmax(table, write, buffer, lenp, ppos, 1l, 1l); } /** * proc_doulongvec_ms_jiffies_minmax - read a vector of millisecond values with min/max values * @table: the sysctl table * @write: %TRUE if this is a write to the sysctl file - * @filp: the file structure * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position @@ -2695,11 +2735,10 @@ int proc_doulongvec_minmax(struct ctl_table *table, int write, struct file *filp * Returns 0 on success. */ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { - return do_proc_doulongvec_minmax(table, write, filp, buffer, + return do_proc_doulongvec_minmax(table, write, buffer, lenp, ppos, HZ, 1000l); } @@ -2775,7 +2814,6 @@ static int do_proc_dointvec_ms_jiffies_conv(int *negp, unsigned long *lvalp, * proc_dointvec_jiffies - read a vector of integers as seconds * @table: the sysctl table * @write: %TRUE if this is a write to the sysctl file - * @filp: the file structure * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position @@ -2787,10 +2825,10 @@ static int do_proc_dointvec_ms_jiffies_conv(int *negp, unsigned long *lvalp, * * Returns 0 on success. */ -int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp, +int proc_dointvec_jiffies(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, + return do_proc_dointvec(table,write,buffer,lenp,ppos, do_proc_dointvec_jiffies_conv,NULL); } @@ -2798,7 +2836,6 @@ int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp, * proc_dointvec_userhz_jiffies - read a vector of integers as 1/USER_HZ seconds * @table: the sysctl table * @write: %TRUE if this is a write to the sysctl file - * @filp: the file structure * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: pointer to the file position @@ -2810,10 +2847,10 @@ int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp, * * Returns 0 on success. */ -int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file *filp, +int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, + return do_proc_dointvec(table,write,buffer,lenp,ppos, do_proc_dointvec_userhz_jiffies_conv,NULL); } @@ -2821,7 +2858,6 @@ int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file * proc_dointvec_ms_jiffies - read a vector of integers as 1 milliseconds * @table: the sysctl table * @write: %TRUE if this is a write to the sysctl file - * @filp: the file structure * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position @@ -2834,14 +2870,14 @@ int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file * * Returns 0 on success. */ -int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, struct file *filp, +int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - return do_proc_dointvec(table, write, filp, buffer, lenp, ppos, + return do_proc_dointvec(table, write, buffer, lenp, ppos, do_proc_dointvec_ms_jiffies_conv, NULL); } -static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp, +static int proc_do_cad_pid(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { struct pid *new_pid; @@ -2850,7 +2886,7 @@ static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp tmp = pid_vnr(cad_pid); - r = __do_proc_dointvec(&tmp, table, write, filp, buffer, + r = __do_proc_dointvec(&tmp, table, write, buffer, lenp, ppos, NULL, NULL); if (r || !write) return r; @@ -2865,50 +2901,49 @@ static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp #else /* CONFIG_PROC_FS */ -int proc_dostring(struct ctl_table *table, int write, struct file *filp, +int proc_dostring(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } -int proc_dointvec(struct ctl_table *table, int write, struct file *filp, +int proc_dointvec(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } -int proc_dointvec_minmax(struct ctl_table *table, int write, struct file *filp, +int proc_dointvec_minmax(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } -int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp, +int proc_dointvec_jiffies(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } -int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file *filp, +int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } -int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, struct file *filp, +int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } -int proc_doulongvec_minmax(struct ctl_table *table, int write, struct file *filp, +int proc_doulongvec_minmax(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { diff --git a/kernel/time/Makefile b/kernel/time/Makefile index 0b0a636..ee26662 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -1,4 +1,4 @@ -obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o +obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o timeconv.o obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o diff --git a/kernel/time/timeconv.c b/kernel/time/timeconv.c new file mode 100644 index 0000000..86628e7 --- /dev/null +++ b/kernel/time/timeconv.c @@ -0,0 +1,127 @@ +/* + * Copyright (C) 1993, 1994, 1995, 1996, 1997 Free Software Foundation, Inc. + * This file is part of the GNU C Library. + * Contributed by Paul Eggert (eggert@twinsun.com). + * + * The GNU C Library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * The GNU C Library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public + * License along with the GNU C Library; see the file COPYING.LIB. If not, + * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + */ + +/* + * Converts the calendar time to broken-down time representation + * Based on code from glibc-2.6 + * + * 2009-7-14: + * Moved from glibc-2.6 to kernel by Zhaolei<zhaolei@cn.fujitsu.com> + */ + +#include <linux/time.h> +#include <linux/module.h> + +/* + * Nonzero if YEAR is a leap year (every 4 years, + * except every 100th isn't, and every 400th is). + */ +static int __isleap(long year) +{ + return (year) % 4 == 0 && ((year) % 100 != 0 || (year) % 400 == 0); +} + +/* do a mathdiv for long type */ +static long math_div(long a, long b) +{ + return a / b - (a % b < 0); +} + +/* How many leap years between y1 and y2, y1 must less or equal to y2 */ +static long leaps_between(long y1, long y2) +{ + long leaps1 = math_div(y1 - 1, 4) - math_div(y1 - 1, 100) + + math_div(y1 - 1, 400); + long leaps2 = math_div(y2 - 1, 4) - math_div(y2 - 1, 100) + + math_div(y2 - 1, 400); + return leaps2 - leaps1; +} + +/* How many days come before each month (0-12). */ +static const unsigned short __mon_yday[2][13] = { + /* Normal years. */ + {0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 365}, + /* Leap years. */ + {0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335, 366} +}; + +#define SECS_PER_HOUR (60 * 60) +#define SECS_PER_DAY (SECS_PER_HOUR * 24) + +/** + * time_to_tm - converts the calendar time to local broken-down time + * + * @totalsecs the number of seconds elapsed since 00:00:00 on January 1, 1970, + * Coordinated Universal Time (UTC). + * @offset offset seconds adding to totalsecs. + * @result pointer to struct tm variable to receive broken-down time + */ +void time_to_tm(time_t totalsecs, int offset, struct tm *result) +{ + long days, rem, y; + const unsigned short *ip; + + days = totalsecs / SECS_PER_DAY; + rem = totalsecs % SECS_PER_DAY; + rem += offset; + while (rem < 0) { + rem += SECS_PER_DAY; + --days; + } + while (rem >= SECS_PER_DAY) { + rem -= SECS_PER_DAY; + ++days; + } + + result->tm_hour = rem / SECS_PER_HOUR; + rem %= SECS_PER_HOUR; + result->tm_min = rem / 60; + result->tm_sec = rem % 60; + + /* January 1, 1970 was a Thursday. */ + result->tm_wday = (4 + days) % 7; + if (result->tm_wday < 0) + result->tm_wday += 7; + + y = 1970; + + while (days < 0 || days >= (__isleap(y) ? 366 : 365)) { + /* Guess a corrected year, assuming 365 days per year. */ + long yg = y + math_div(days, 365); + + /* Adjust DAYS and Y to match the guessed year. */ + days -= (yg - y) * 365 + leaps_between(y, yg); + y = yg; + } + + result->tm_year = y - 1900; + + result->tm_yday = days; + + ip = __mon_yday[__isleap(y)]; + for (y = 11; days < ip[y]; y--) + continue; + days -= ip[y]; + + result->tm_mon = y; + result->tm_mday = days + 1; +} +EXPORT_SYMBOL(time_to_tm); diff --git a/kernel/timer.c b/kernel/timer.c index bbb5107..5db5a8d 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -37,7 +37,7 @@ #include <linux/delay.h> #include <linux/tick.h> #include <linux/kallsyms.h> -#include <linux/perf_counter.h> +#include <linux/perf_event.h> #include <linux/sched.h> #include <asm/uaccess.h> @@ -46,6 +46,9 @@ #include <asm/timex.h> #include <asm/io.h> +#define CREATE_TRACE_POINTS +#include <trace/events/timer.h> + u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES; EXPORT_SYMBOL(jiffies_64); @@ -521,6 +524,25 @@ static inline void debug_timer_activate(struct timer_list *timer) { } static inline void debug_timer_deactivate(struct timer_list *timer) { } #endif +static inline void debug_init(struct timer_list *timer) +{ + debug_timer_init(timer); + trace_timer_init(timer); +} + +static inline void +debug_activate(struct timer_list *timer, unsigned long expires) +{ + debug_timer_activate(timer); + trace_timer_start(timer, expires); +} + +static inline void debug_deactivate(struct timer_list *timer) +{ + debug_timer_deactivate(timer); + trace_timer_cancel(timer); +} + static void __init_timer(struct timer_list *timer, const char *name, struct lock_class_key *key) @@ -549,7 +571,7 @@ void init_timer_key(struct timer_list *timer, const char *name, struct lock_class_key *key) { - debug_timer_init(timer); + debug_init(timer); __init_timer(timer, name, key); } EXPORT_SYMBOL(init_timer_key); @@ -568,7 +590,7 @@ static inline void detach_timer(struct timer_list *timer, { struct list_head *entry = &timer->entry; - debug_timer_deactivate(timer); + debug_deactivate(timer); __list_del(entry->prev, entry->next); if (clear_pending) @@ -632,7 +654,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, goto out_unlock; } - debug_timer_activate(timer); + debug_activate(timer, expires); new_base = __get_cpu_var(tvec_bases); @@ -787,7 +809,7 @@ void add_timer_on(struct timer_list *timer, int cpu) BUG_ON(timer_pending(timer) || !timer->function); spin_lock_irqsave(&base->lock, flags); timer_set_base(timer, base); - debug_timer_activate(timer); + debug_activate(timer, timer->expires); if (time_before(timer->expires, base->next_timer) && !tbase_get_deferrable(timer->base)) base->next_timer = timer->expires; @@ -1000,7 +1022,9 @@ static inline void __run_timers(struct tvec_base *base) */ lock_map_acquire(&lockdep_map); + trace_timer_expire_entry(timer); fn(data); + trace_timer_expire_exit(timer); lock_map_release(&lockdep_map); @@ -1187,7 +1211,7 @@ static void run_timer_softirq(struct softirq_action *h) { struct tvec_base *base = __get_cpu_var(tvec_bases); - perf_counter_do_pending(); + perf_event_do_pending(); hrtimer_run_pending(); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index e716346..b416512 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -83,7 +83,7 @@ config RING_BUFFER_ALLOW_SWAP # This allows those options to appear when no other tracer is selected. But the # options do not appear when something else selects it. We need the two options # GENERIC_TRACER and TRACING to avoid circular dependencies to accomplish the -# hidding of the automatic options options. +# hidding of the automatic options. config TRACING bool diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index cc615f8..a142579 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1520,7 +1520,7 @@ static int t_show(struct seq_file *m, void *v) return 0; } -static struct seq_operations show_ftrace_seq_ops = { +static const struct seq_operations show_ftrace_seq_ops = { .start = t_start, .next = t_next, .stop = t_stop, @@ -2414,11 +2414,9 @@ unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly; static void * __g_next(struct seq_file *m, loff_t *pos) { - unsigned long *array = m->private; - if (*pos >= ftrace_graph_count) return NULL; - return &array[*pos]; + return &ftrace_graph_funcs[*pos]; } static void * @@ -2461,7 +2459,7 @@ static int g_show(struct seq_file *m, void *v) return 0; } -static struct seq_operations ftrace_graph_seq_ops = { +static const struct seq_operations ftrace_graph_seq_ops = { .start = g_start, .next = g_next, .stop = g_stop, @@ -2482,16 +2480,10 @@ ftrace_graph_open(struct inode *inode, struct file *file) ftrace_graph_count = 0; memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs)); } + mutex_unlock(&graph_lock); - if (file->f_mode & FMODE_READ) { + if (file->f_mode & FMODE_READ) ret = seq_open(file, &ftrace_graph_seq_ops); - if (!ret) { - struct seq_file *m = file->private_data; - m->private = ftrace_graph_funcs; - } - } else - file->private_data = ftrace_graph_funcs; - mutex_unlock(&graph_lock); return ret; } @@ -2560,7 +2552,6 @@ ftrace_graph_write(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_parser parser; - unsigned long *array; size_t read = 0; ssize_t ret; @@ -2574,12 +2565,6 @@ ftrace_graph_write(struct file *file, const char __user *ubuf, goto out; } - if (file->f_mode & FMODE_READ) { - struct seq_file *m = file->private_data; - array = m->private; - } else - array = file->private_data; - if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) { ret = -ENOMEM; goto out; @@ -2591,7 +2576,7 @@ ftrace_graph_write(struct file *file, const char __user *ubuf, parser.buffer[parser.idx] = 0; /* we allow only one expression at a time */ - ret = ftrace_set_func(array, &ftrace_graph_count, + ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count, parser.buffer); if (ret) goto out; @@ -3030,7 +3015,7 @@ int unregister_ftrace_function(struct ftrace_ops *ops) int ftrace_enable_sysctl(struct ctl_table *table, int write, - struct file *file, void __user *buffer, size_t *lenp, + void __user *buffer, size_t *lenp, loff_t *ppos) { int ret; @@ -3040,7 +3025,7 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, mutex_lock(&ftrace_lock); - ret = proc_dointvec(table, write, file, buffer, lenp, ppos); + ret = proc_dointvec(table, write, buffer, lenp, ppos); if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled)) goto out; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index fd52a19..411af37 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -125,13 +125,13 @@ int ftrace_dump_on_oops; static int tracing_set_tracer(const char *buf); -#define BOOTUP_TRACER_SIZE 100 -static char bootup_tracer_buf[BOOTUP_TRACER_SIZE] __initdata; +#define MAX_TRACER_SIZE 100 +static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata; static char *default_bootup_tracer; static int __init set_ftrace(char *str) { - strncpy(bootup_tracer_buf, str, BOOTUP_TRACER_SIZE); + strncpy(bootup_tracer_buf, str, MAX_TRACER_SIZE); default_bootup_tracer = bootup_tracer_buf; /* We are using ftrace early, expand it */ ring_buffer_expanded = 1; @@ -242,13 +242,6 @@ static struct tracer *trace_types __read_mostly; static struct tracer *current_trace __read_mostly; /* - * max_tracer_type_len is used to simplify the allocating of - * buffers to read userspace tracer names. We keep track of - * the longest tracer name registered. - */ -static int max_tracer_type_len; - -/* * trace_types_lock is used to protect the trace_types list. * This lock is also used to keep user access serialized. * Accesses from userspace will grab this lock while userspace @@ -275,12 +268,18 @@ static DEFINE_SPINLOCK(tracing_start_lock); */ void trace_wake_up(void) { + int cpu; + + if (trace_flags & TRACE_ITER_BLOCK) + return; /* * The runqueue_is_locked() can fail, but this is the best we * have for now: */ - if (!(trace_flags & TRACE_ITER_BLOCK) && !runqueue_is_locked()) + cpu = get_cpu(); + if (!runqueue_is_locked(cpu)) wake_up(&trace_wait); + put_cpu(); } static int __init set_buf_size(char *str) @@ -619,7 +618,6 @@ __releases(kernel_lock) __acquires(kernel_lock) { struct tracer *t; - int len; int ret = 0; if (!type->name) { @@ -627,6 +625,11 @@ __acquires(kernel_lock) return -1; } + if (strlen(type->name) > MAX_TRACER_SIZE) { + pr_info("Tracer has a name longer than %d\n", MAX_TRACER_SIZE); + return -1; + } + /* * When this gets called we hold the BKL which means that * preemption is disabled. Various trace selftests however @@ -641,7 +644,7 @@ __acquires(kernel_lock) for (t = trace_types; t; t = t->next) { if (strcmp(type->name, t->name) == 0) { /* already found */ - pr_info("Trace %s already registered\n", + pr_info("Tracer %s already registered\n", type->name); ret = -1; goto out; @@ -692,9 +695,6 @@ __acquires(kernel_lock) type->next = trace_types; trace_types = type; - len = strlen(type->name); - if (len > max_tracer_type_len) - max_tracer_type_len = len; out: tracing_selftest_running = false; @@ -703,7 +703,7 @@ __acquires(kernel_lock) if (ret || !default_bootup_tracer) goto out_unlock; - if (strncmp(default_bootup_tracer, type->name, BOOTUP_TRACER_SIZE)) + if (strncmp(default_bootup_tracer, type->name, MAX_TRACER_SIZE)) goto out_unlock; printk(KERN_INFO "Starting tracer '%s'\n", type->name); @@ -725,14 +725,13 @@ __acquires(kernel_lock) void unregister_tracer(struct tracer *type) { struct tracer **t; - int len; mutex_lock(&trace_types_lock); for (t = &trace_types; *t; t = &(*t)->next) { if (*t == type) goto found; } - pr_info("Trace %s not registered\n", type->name); + pr_info("Tracer %s not registered\n", type->name); goto out; found: @@ -745,17 +744,7 @@ void unregister_tracer(struct tracer *type) current_trace->stop(&global_trace); current_trace = &nop_trace; } - - if (strlen(type->name) != max_tracer_type_len) - goto out; - - max_tracer_type_len = 0; - for (t = &trace_types; *t; t = &(*t)->next) { - len = strlen((*t)->name); - if (len > max_tracer_type_len) - max_tracer_type_len = len; - } - out: +out: mutex_unlock(&trace_types_lock); } @@ -1960,7 +1949,7 @@ static int s_show(struct seq_file *m, void *v) return 0; } -static struct seq_operations tracer_seq_ops = { +static const struct seq_operations tracer_seq_ops = { .start = s_start, .next = s_next, .stop = s_stop, @@ -1995,11 +1984,9 @@ __tracing_open(struct inode *inode, struct file *file) if (current_trace) *iter->trace = *current_trace; - if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) + if (!zalloc_cpumask_var(&iter->started, GFP_KERNEL)) goto fail; - cpumask_clear(iter->started); - if (current_trace && current_trace->print_max) iter->tr = &max_tr; else @@ -2174,7 +2161,7 @@ static int t_show(struct seq_file *m, void *v) return 0; } -static struct seq_operations show_traces_seq_ops = { +static const struct seq_operations show_traces_seq_ops = { .start = t_start, .next = t_next, .stop = t_stop, @@ -2604,7 +2591,7 @@ static ssize_t tracing_set_trace_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { - char buf[max_tracer_type_len+2]; + char buf[MAX_TRACER_SIZE+2]; int r; mutex_lock(&trace_types_lock); @@ -2754,15 +2741,15 @@ static ssize_t tracing_set_trace_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - char buf[max_tracer_type_len+1]; + char buf[MAX_TRACER_SIZE+1]; int i; size_t ret; int err; ret = cnt; - if (cnt > max_tracer_type_len) - cnt = max_tracer_type_len; + if (cnt > MAX_TRACER_SIZE) + cnt = MAX_TRACER_SIZE; if (copy_from_user(&buf, ubuf, cnt)) return -EFAULT; @@ -4400,7 +4387,7 @@ __init static int tracer_alloc_buffers(void) if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL)) goto out_free_buffer_mask; - if (!alloc_cpumask_var(&tracing_reader_cpumask, GFP_KERNEL)) + if (!zalloc_cpumask_var(&tracing_reader_cpumask, GFP_KERNEL)) goto out_free_tracing_cpumask; /* To save memory, keep the ring buffer size to its minimum */ @@ -4411,7 +4398,6 @@ __init static int tracer_alloc_buffers(void) cpumask_copy(tracing_buffer_mask, cpu_possible_mask); cpumask_copy(tracing_cpumask, cpu_all_mask); - cpumask_clear(tracing_reader_cpumask); /* TODO: make the number of buffers hot pluggable with CPUS */ global_trace.buffer = ring_buffer_alloc(ring_buf_size, diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c index 55a25c9..dd44b87 100644 --- a/kernel/trace/trace_event_profile.c +++ b/kernel/trace/trace_event_profile.c @@ -8,6 +8,57 @@ #include <linux/module.h> #include "trace.h" +/* + * We can't use a size but a type in alloc_percpu() + * So let's create a dummy type that matches the desired size + */ +typedef struct {char buf[FTRACE_MAX_PROFILE_SIZE];} profile_buf_t; + +char *trace_profile_buf; +EXPORT_SYMBOL_GPL(trace_profile_buf); + +char *trace_profile_buf_nmi; +EXPORT_SYMBOL_GPL(trace_profile_buf_nmi); + +/* Count the events in use (per event id, not per instance) */ +static int total_profile_count; + +static int ftrace_profile_enable_event(struct ftrace_event_call *event) +{ + char *buf; + int ret = -ENOMEM; + + if (atomic_inc_return(&event->profile_count)) + return 0; + + if (!total_profile_count++) { + buf = (char *)alloc_percpu(profile_buf_t); + if (!buf) + goto fail_buf; + + rcu_assign_pointer(trace_profile_buf, buf); + + buf = (char *)alloc_percpu(profile_buf_t); + if (!buf) + goto fail_buf_nmi; + + rcu_assign_pointer(trace_profile_buf_nmi, buf); + } + + ret = event->profile_enable(); + if (!ret) + return 0; + + kfree(trace_profile_buf_nmi); +fail_buf_nmi: + kfree(trace_profile_buf); +fail_buf: + total_profile_count--; + atomic_dec(&event->profile_count); + + return ret; +} + int ftrace_profile_enable(int event_id) { struct ftrace_event_call *event; @@ -17,7 +68,7 @@ int ftrace_profile_enable(int event_id) list_for_each_entry(event, &ftrace_events, list) { if (event->id == event_id && event->profile_enable && try_module_get(event->mod)) { - ret = event->profile_enable(event); + ret = ftrace_profile_enable_event(event); break; } } @@ -26,6 +77,33 @@ int ftrace_profile_enable(int event_id) return ret; } +static void ftrace_profile_disable_event(struct ftrace_event_call *event) +{ + char *buf, *nmi_buf; + + if (!atomic_add_negative(-1, &event->profile_count)) + return; + + event->profile_disable(); + + if (!--total_profile_count) { + buf = trace_profile_buf; + rcu_assign_pointer(trace_profile_buf, NULL); + + nmi_buf = trace_profile_buf_nmi; + rcu_assign_pointer(trace_profile_buf_nmi, NULL); + + /* + * Ensure every events in profiling have finished before + * releasing the buffers + */ + synchronize_sched(); + + free_percpu(buf); + free_percpu(nmi_buf); + } +} + void ftrace_profile_disable(int event_id) { struct ftrace_event_call *event; @@ -33,7 +111,7 @@ void ftrace_profile_disable(int event_id) mutex_lock(&event_mutex); list_for_each_entry(event, &ftrace_events, list) { if (event->id == event_id) { - event->profile_disable(event); + ftrace_profile_disable_event(event); module_put(event->mod); break; } diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 56c260b..6f03c8a 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -271,42 +271,32 @@ ftrace_event_write(struct file *file, const char __user *ubuf, static void * t_next(struct seq_file *m, void *v, loff_t *pos) { - struct list_head *list = m->private; - struct ftrace_event_call *call; + struct ftrace_event_call *call = v; (*pos)++; - for (;;) { - if (list == &ftrace_events) - return NULL; - - call = list_entry(list, struct ftrace_event_call, list); - + list_for_each_entry_continue(call, &ftrace_events, list) { /* * The ftrace subsystem is for showing formats only. * They can not be enabled or disabled via the event files. */ if (call->regfunc) - break; - - list = list->next; + return call; } - m->private = list->next; - - return call; + return NULL; } static void *t_start(struct seq_file *m, loff_t *pos) { - struct ftrace_event_call *call = NULL; + struct ftrace_event_call *call; loff_t l; mutex_lock(&event_mutex); - m->private = ftrace_events.next; + call = list_entry(&ftrace_events, struct ftrace_event_call, list); for (l = 0; l <= *pos; ) { - call = t_next(m, NULL, &l); + call = t_next(m, call, &l); if (!call) break; } @@ -316,37 +306,28 @@ static void *t_start(struct seq_file *m, loff_t *pos) static void * s_next(struct seq_file *m, void *v, loff_t *pos) { - struct list_head *list = m->private; - struct ftrace_event_call *call; + struct ftrace_event_call *call = v; (*pos)++; - retry: - if (list == &ftrace_events) - return NULL; - - call = list_entry(list, struct ftrace_event_call, list); - - if (!call->enabled) { - list = list->next; - goto retry; + list_for_each_entry_continue(call, &ftrace_events, list) { + if (call->enabled) + return call; } - m->private = list->next; - - return call; + return NULL; } static void *s_start(struct seq_file *m, loff_t *pos) { - struct ftrace_event_call *call = NULL; + struct ftrace_event_call *call; loff_t l; mutex_lock(&event_mutex); - m->private = ftrace_events.next; + call = list_entry(&ftrace_events, struct ftrace_event_call, list); for (l = 0; l <= *pos; ) { - call = s_next(m, NULL, &l); + call = s_next(m, call, &l); if (!call) break; } diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c index ca7d7c4..23b6385 100644 --- a/kernel/trace/trace_hw_branches.c +++ b/kernel/trace/trace_hw_branches.c @@ -155,7 +155,7 @@ static enum print_line_t bts_trace_print_line(struct trace_iterator *iter) seq_print_ip_sym(seq, it->from, symflags) && trace_seq_printf(seq, "\n")) return TRACE_TYPE_HANDLED; - return TRACE_TYPE_PARTIAL_LINE;; + return TRACE_TYPE_PARTIAL_LINE; } return TRACE_TYPE_UNHANDLED; } diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index 687699d..2547d88 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -11,7 +11,6 @@ #include <linux/ftrace.h> #include <linux/string.h> #include <linux/module.h> -#include <linux/marker.h> #include <linux/mutex.h> #include <linux/ctype.h> #include <linux/list.h> diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 0f6facb..8504ac7 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -296,14 +296,14 @@ static const struct file_operations stack_trace_fops = { int stack_trace_sysctl(struct ctl_table *table, int write, - struct file *file, void __user *buffer, size_t *lenp, + void __user *buffer, size_t *lenp, loff_t *ppos) { int ret; mutex_lock(&stack_sysctl_mutex); - ret = proc_dointvec(table, write, file, buffer, lenp, ppos); + ret = proc_dointvec(table, write, buffer, lenp, ppos); if (ret || !write || (last_stack_tracer_enabled == !!stack_tracer_enabled)) diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 8712ce3..9fbce6c 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -2,7 +2,7 @@ #include <trace/events/syscalls.h> #include <linux/kernel.h> #include <linux/ftrace.h> -#include <linux/perf_counter.h> +#include <linux/perf_event.h> #include <asm/syscall.h> #include "trace_output.h" @@ -384,10 +384,13 @@ static int sys_prof_refcount_exit; static void prof_syscall_enter(struct pt_regs *regs, long id) { - struct syscall_trace_enter *rec; struct syscall_metadata *sys_data; + struct syscall_trace_enter *rec; + unsigned long flags; + char *raw_data; int syscall_nr; int size; + int cpu; syscall_nr = syscall_get_nr(current, regs); if (!test_bit(syscall_nr, enabled_prof_enter_syscalls)) @@ -402,20 +405,38 @@ static void prof_syscall_enter(struct pt_regs *regs, long id) size = ALIGN(size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); - do { - char raw_data[size]; + if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE, + "profile buffer not large enough")) + return; + + /* Protect the per cpu buffer, begin the rcu read side */ + local_irq_save(flags); - /* zero the dead bytes from align to not leak stack to user */ - *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; + cpu = smp_processor_id(); + + if (in_nmi()) + raw_data = rcu_dereference(trace_profile_buf_nmi); + else + raw_data = rcu_dereference(trace_profile_buf); + + if (!raw_data) + goto end; - rec = (struct syscall_trace_enter *) raw_data; - tracing_generic_entry_update(&rec->ent, 0, 0); - rec->ent.type = sys_data->enter_id; - rec->nr = syscall_nr; - syscall_get_arguments(current, regs, 0, sys_data->nb_args, - (unsigned long *)&rec->args); - perf_tpcounter_event(sys_data->enter_id, 0, 1, rec, size); - } while(0); + raw_data = per_cpu_ptr(raw_data, cpu); + + /* zero the dead bytes from align to not leak stack to user */ + *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; + + rec = (struct syscall_trace_enter *) raw_data; + tracing_generic_entry_update(&rec->ent, 0, 0); + rec->ent.type = sys_data->enter_id; + rec->nr = syscall_nr; + syscall_get_arguments(current, regs, 0, sys_data->nb_args, + (unsigned long *)&rec->args); + perf_tp_event(sys_data->enter_id, 0, 1, rec, size); + +end: + local_irq_restore(flags); } int reg_prof_syscall_enter(char *name) @@ -460,8 +481,12 @@ void unreg_prof_syscall_enter(char *name) static void prof_syscall_exit(struct pt_regs *regs, long ret) { struct syscall_metadata *sys_data; - struct syscall_trace_exit rec; + struct syscall_trace_exit *rec; + unsigned long flags; int syscall_nr; + char *raw_data; + int size; + int cpu; syscall_nr = syscall_get_nr(current, regs); if (!test_bit(syscall_nr, enabled_prof_exit_syscalls)) @@ -471,12 +496,46 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret) if (!sys_data) return; - tracing_generic_entry_update(&rec.ent, 0, 0); - rec.ent.type = sys_data->exit_id; - rec.nr = syscall_nr; - rec.ret = syscall_get_return_value(current, regs); + /* We can probably do that at build time */ + size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64)); + size -= sizeof(u32); - perf_tpcounter_event(sys_data->exit_id, 0, 1, &rec, sizeof(rec)); + /* + * Impossible, but be paranoid with the future + * How to put this check outside runtime? + */ + if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE, + "exit event has grown above profile buffer size")) + return; + + /* Protect the per cpu buffer, begin the rcu read side */ + local_irq_save(flags); + cpu = smp_processor_id(); + + if (in_nmi()) + raw_data = rcu_dereference(trace_profile_buf_nmi); + else + raw_data = rcu_dereference(trace_profile_buf); + + if (!raw_data) + goto end; + + raw_data = per_cpu_ptr(raw_data, cpu); + + /* zero the dead bytes from align to not leak stack to user */ + *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; + + rec = (struct syscall_trace_exit *)raw_data; + + tracing_generic_entry_update(&rec->ent, 0, 0); + rec->ent.type = sys_data->exit_id; + rec->nr = syscall_nr; + rec->ret = syscall_get_return_value(current, regs); + + perf_tp_event(sys_data->exit_id, 0, 1, rec, size); + +end: + local_irq_restore(flags); } int reg_prof_syscall_exit(char *name) diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 9489a0a..cc89be5 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -48,7 +48,7 @@ static struct hlist_head tracepoint_table[TRACEPOINT_TABLE_SIZE]; /* * Note about RCU : - * It is used to to delay the free of multiple probes array until a quiescent + * It is used to delay the free of multiple probes array until a quiescent * state is reached. * Tracepoint entries modifications are protected by the tracepoints_mutex. */ diff --git a/kernel/uid16.c b/kernel/uid16.c index 0314501..4192098 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c @@ -4,7 +4,6 @@ */ #include <linux/mm.h> -#include <linux/utsname.h> #include <linux/mman.h> #include <linux/notifier.h> #include <linux/reboot.h> diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c index 92359cc..69eae35 100644 --- a/kernel/utsname_sysctl.c +++ b/kernel/utsname_sysctl.c @@ -42,14 +42,14 @@ static void put_uts(ctl_table *table, int write, void *which) * Special case of dostring for the UTS structure. This has locks * to observe. Should this be in kernel/sys.c ???? */ -static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, +static int proc_do_uts_string(ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table uts_table; int r; memcpy(&uts_table, table, sizeof(uts_table)); uts_table.data = get_uts(table, write); - r = proc_dostring(&uts_table,write,filp,buffer,lenp, ppos); + r = proc_dostring(&uts_table,write,buffer,lenp, ppos); put_uts(table, write, uts_table.data); return r; } |