diff options
author | Russell King <rmk+kernel@arm.linux.org.uk> | 2010-08-06 18:13:54 +0100 |
---|---|---|
committer | Russell King <rmk+kernel@arm.linux.org.uk> | 2010-08-06 18:13:54 +0100 |
commit | 11e4afb49b7fa1fc8e1ffd850c1806dd86a08204 (patch) | |
tree | 9e57efcb106ae912f7bec718feb3f8ec607559bb /kernel | |
parent | 162500b3a3ff39d941d29db49b41a16667ae44f0 (diff) | |
parent | 9b2a606d3898fcb2eedb6faded3bb37549590ac4 (diff) | |
download | op-kernel-dev-11e4afb49b7fa1fc8e1ffd850c1806dd86a08204.zip op-kernel-dev-11e4afb49b7fa1fc8e1ffd850c1806dd86a08204.tar.gz |
Merge branches 'gemini' and 'misc' into devel
Diffstat (limited to 'kernel')
157 files changed, 16005 insertions, 6887 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index a987aa1..057472f 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -68,14 +68,14 @@ obj-$(CONFIG_USER_NS) += user_namespace.o obj-$(CONFIG_PID_NS) += pid_namespace.o obj-$(CONFIG_IKCONFIG) += configs.o obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o -obj-$(CONFIG_STOP_MACHINE) += stop_machine.o +obj-$(CONFIG_SMP) += stop_machine.o obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o obj-$(CONFIG_AUDIT) += audit.o auditfilter.o audit_watch.o obj-$(CONFIG_AUDITSYSCALL) += auditsc.o obj-$(CONFIG_GCOV_KERNEL) += gcov/ obj-$(CONFIG_AUDIT_TREE) += audit_tree.o obj-$(CONFIG_KPROBES) += kprobes.o -obj-$(CONFIG_KGDB) += kgdb.o +obj-$(CONFIG_KGDB) += debug/ obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ diff --git a/kernel/acct.c b/kernel/acct.c index 24f8c81..385b884 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -216,7 +216,6 @@ static int acct_on(char *name) { struct file *file; struct vfsmount *mnt; - int error; struct pid_namespace *ns; struct bsd_acct_struct *acct = NULL; @@ -244,13 +243,6 @@ static int acct_on(char *name) } } - error = security_acct(file); - if (error) { - kfree(acct); - filp_close(file, NULL); - return error; - } - spin_lock(&acct_lock); if (ns->bacct == NULL) { ns->bacct = acct; @@ -281,7 +273,7 @@ static int acct_on(char *name) */ SYSCALL_DEFINE1(acct, const char __user *, name) { - int error; + int error = 0; if (!capable(CAP_SYS_PACCT)) return -EPERM; @@ -299,13 +291,11 @@ SYSCALL_DEFINE1(acct, const char __user *, name) if (acct == NULL) return 0; - error = security_acct(NULL); - if (!error) { - spin_lock(&acct_lock); - acct_file_reopen(acct, NULL, NULL); - spin_unlock(&acct_lock); - } + spin_lock(&acct_lock); + acct_file_reopen(acct, NULL, NULL); + spin_unlock(&acct_lock); } + return error; } @@ -353,17 +343,18 @@ restart: void acct_exit_ns(struct pid_namespace *ns) { - struct bsd_acct_struct *acct; + struct bsd_acct_struct *acct = ns->bacct; - spin_lock(&acct_lock); - acct = ns->bacct; - if (acct != NULL) { - if (acct->file != NULL) - acct_file_reopen(acct, NULL, NULL); + if (acct == NULL) + return; - kfree(acct); - } + del_timer_sync(&acct->timer); + spin_lock(&acct_lock); + if (acct->file != NULL) + acct_file_reopen(acct, NULL, NULL); spin_unlock(&acct_lock); + + kfree(acct); } /* diff --git a/kernel/async.c b/kernel/async.c index 27235f5..15319d6 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -56,6 +56,7 @@ asynchronous and synchronous parts of the kernel. #include <linux/init.h> #include <linux/kthread.h> #include <linux/delay.h> +#include <linux/slab.h> #include <asm/atomic.h> static async_cookie_t next_cookie = 1; diff --git a/kernel/audit.c b/kernel/audit.c index 78f7f86..8296aa5 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -46,6 +46,7 @@ #include <asm/atomic.h> #include <linux/mm.h> #include <linux/module.h> +#include <linux/slab.h> #include <linux/err.h> #include <linux/kthread.h> @@ -406,7 +407,7 @@ static void kauditd_send_skb(struct sk_buff *skb) audit_hold_skb(skb); } else /* drop the extra reference if sent ok */ - kfree_skb(skb); + consume_skb(skb); } static int kauditd_thread(void *dummy) diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 028e856..46a57b5 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -3,6 +3,7 @@ #include <linux/namei.h> #include <linux/mount.h> #include <linux/kthread.h> +#include <linux/slab.h> struct audit_tree; struct audit_chunk; diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index cc7e879..8df4369 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -27,6 +27,7 @@ #include <linux/namei.h> #include <linux/netlink.h> #include <linux/sched.h> +#include <linux/slab.h> #include <linux/inotify.h> #include <linux/security.h> #include "audit.h" diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index a706040..ce08041 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -27,6 +27,7 @@ #include <linux/namei.h> #include <linux/netlink.h> #include <linux/sched.h> +#include <linux/slab.h> #include <linux/security.h> #include "audit.h" diff --git a/kernel/auditsc.c b/kernel/auditsc.c index f3a461c..3828ad5 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -49,6 +49,7 @@ #include <linux/namei.h> #include <linux/mm.h> #include <linux/module.h> +#include <linux/slab.h> #include <linux/mount.h> #include <linux/socket.h> #include <linux/mqueue.h> @@ -1893,7 +1894,7 @@ static int audit_inc_name_count(struct audit_context *context, { if (context->name_count >= AUDIT_NAMES) { if (inode) - printk(KERN_DEBUG "name_count maxed, losing inode data: " + printk(KERN_DEBUG "audit: name_count maxed, losing inode data: " "dev=%02x:%02x, inode=%lu\n", MAJOR(inode->i_sb->s_dev), MINOR(inode->i_sb->s_dev), diff --git a/kernel/capability.c b/kernel/capability.c index 9e4697e..2f05303 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -15,7 +15,6 @@ #include <linux/syscalls.h> #include <linux/pid_namespace.h> #include <asm/uaccess.h> -#include "cred-internals.h" /* * Leveraged for setting/resetting capabilities diff --git a/kernel/cgroup.c b/kernel/cgroup.c index ef909a3..a8ce099 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -27,7 +27,6 @@ */ #include <linux/cgroup.h> -#include <linux/module.h> #include <linux/ctype.h> #include <linux/errno.h> #include <linux/fs.h> @@ -1647,7 +1646,9 @@ static inline struct cftype *__d_cft(struct dentry *dentry) int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) { char *start; - struct dentry *dentry = rcu_dereference(cgrp->dentry); + struct dentry *dentry = rcu_dereference_check(cgrp->dentry, + rcu_read_lock_held() || + cgroup_lock_is_held()); if (!dentry || cgrp == dummytop) { /* @@ -1663,13 +1664,17 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) *--start = '\0'; for (;;) { int len = dentry->d_name.len; + if ((start -= len) < buf) return -ENAMETOOLONG; - memcpy(start, cgrp->dentry->d_name.name, len); + memcpy(start, dentry->d_name.name, len); cgrp = cgrp->parent; if (!cgrp) break; - dentry = rcu_dereference(cgrp->dentry); + + dentry = rcu_dereference_check(cgrp->dentry, + rcu_read_lock_held() || + cgroup_lock_is_held()); if (!cgrp->parent) continue; if (--start < buf) @@ -1783,6 +1788,29 @@ out: return retval; } +/** + * cgroup_attach_task_current_cg - attach task 'tsk' to current task's cgroup + * @tsk: the task to be attached + */ +int cgroup_attach_task_current_cg(struct task_struct *tsk) +{ + struct cgroupfs_root *root; + struct cgroup *cur_cg; + int retval = 0; + + cgroup_lock(); + for_each_active_root(root) { + cur_cg = task_cgroup_from_root(current, root); + retval = cgroup_attach_task(cur_cg, tsk); + if (retval) + break; + } + cgroup_unlock(); + + return retval; +} +EXPORT_SYMBOL_GPL(cgroup_attach_task_current_cg); + /* * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex * held. May take task_lock of task @@ -2989,7 +3017,6 @@ static void cgroup_event_remove(struct work_struct *work) remove); struct cgroup *cgrp = event->cgrp; - /* TODO: check return code */ event->cft->unregister_event(cgrp, event->cft, event->eventfd); eventfd_ctx_put(event->eventfd); @@ -3011,7 +3038,7 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode, unsigned long flags = (unsigned long)key; if (flags & POLLHUP) { - remove_wait_queue_locked(event->wqh, &event->wait); + __remove_wait_queue(event->wqh, &event->wait); spin_lock(&cgrp->event_list_lock); list_del(&event->list); spin_unlock(&cgrp->event_list_lock); @@ -3610,7 +3637,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) * @ss: the subsystem to load * * This function should be called in a modular subsystem's initcall. If the - * subsytem is built as a module, it will be assigned a new subsys_id and set + * subsystem is built as a module, it will be assigned a new subsys_id and set * up for use. If the subsystem is built-in anyway, work is delegated to the * simpler cgroup_init_subsys. */ @@ -4430,7 +4457,15 @@ __setup("cgroup_disable=", cgroup_disable); */ unsigned short css_id(struct cgroup_subsys_state *css) { - struct css_id *cssid = rcu_dereference(css->id); + struct css_id *cssid; + + /* + * This css_id() can return correct value when somone has refcnt + * on this or this is under rcu_read_lock(). Once css->id is allocated, + * it's unchanged until freed. + */ + cssid = rcu_dereference_check(css->id, + rcu_read_lock_held() || atomic_read(&css->refcnt)); if (cssid) return cssid->id; @@ -4440,7 +4475,10 @@ EXPORT_SYMBOL_GPL(css_id); unsigned short css_depth(struct cgroup_subsys_state *css) { - struct css_id *cssid = rcu_dereference(css->id); + struct css_id *cssid; + + cssid = rcu_dereference_check(css->id, + rcu_read_lock_held() || atomic_read(&css->refcnt)); if (cssid) return cssid->depth; @@ -4448,15 +4486,36 @@ unsigned short css_depth(struct cgroup_subsys_state *css) } EXPORT_SYMBOL_GPL(css_depth); +/** + * css_is_ancestor - test "root" css is an ancestor of "child" + * @child: the css to be tested. + * @root: the css supporsed to be an ancestor of the child. + * + * Returns true if "root" is an ancestor of "child" in its hierarchy. Because + * this function reads css->id, this use rcu_dereference() and rcu_read_lock(). + * But, considering usual usage, the csses should be valid objects after test. + * Assuming that the caller will do some action to the child if this returns + * returns true, the caller must take "child";s reference count. + * If "child" is valid object and this returns true, "root" is valid, too. + */ + bool css_is_ancestor(struct cgroup_subsys_state *child, const struct cgroup_subsys_state *root) { - struct css_id *child_id = rcu_dereference(child->id); - struct css_id *root_id = rcu_dereference(root->id); + struct css_id *child_id; + struct css_id *root_id; + bool ret = true; - if (!child_id || !root_id || (child_id->depth < root_id->depth)) - return false; - return child_id->stack[root_id->depth] == root_id->id; + rcu_read_lock(); + child_id = rcu_dereference(child->id); + root_id = rcu_dereference(root->id); + if (!child_id + || !root_id + || (child_id->depth < root_id->depth) + || (child_id->stack[root_id->depth] != root_id->id)) + ret = false; + rcu_read_unlock(); + return ret; } static void __free_css_id_cb(struct rcu_head *head) @@ -4556,13 +4615,13 @@ static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent, { int subsys_id, i, depth = 0; struct cgroup_subsys_state *parent_css, *child_css; - struct css_id *child_id, *parent_id = NULL; + struct css_id *child_id, *parent_id; subsys_id = ss->subsys_id; parent_css = parent->subsys[subsys_id]; child_css = child->subsys[subsys_id]; - depth = css_depth(parent_css) + 1; parent_id = parent_css->id; + depth = parent_id->depth + 1; child_id = get_new_cssid(ss, depth); if (IS_ERR(child_id)) diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 59e9ef6..ce71ed5 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -15,6 +15,7 @@ */ #include <linux/module.h> +#include <linux/slab.h> #include <linux/cgroup.h> #include <linux/fs.h> #include <linux/uaccess.h> @@ -47,17 +48,20 @@ static inline struct freezer *task_freezer(struct task_struct *task) struct freezer, css); } -int cgroup_frozen(struct task_struct *task) +int cgroup_freezing_or_frozen(struct task_struct *task) { struct freezer *freezer; enum freezer_state state; task_lock(task); freezer = task_freezer(task); - state = freezer->state; + if (!freezer->css.cgroup->parent) + state = CGROUP_THAWED; /* root cgroup can't be frozen */ + else + state = freezer->state; task_unlock(task); - return state == CGROUP_FROZEN; + return (state == CGROUP_FREEZING) || (state == CGROUP_FROZEN); } /* @@ -85,10 +89,10 @@ struct cgroup_subsys freezer_subsys; /* Locks taken and their ordering * ------------------------------ - * css_set_lock * cgroup_mutex (AKA cgroup_lock) - * task->alloc_lock (AKA task_lock) * freezer->lock + * css_set_lock + * task->alloc_lock (AKA task_lock) * task->sighand->siglock * * cgroup code forces css_set_lock to be taken before task->alloc_lock @@ -96,33 +100,38 @@ struct cgroup_subsys freezer_subsys; * freezer_create(), freezer_destroy(): * cgroup_mutex [ by cgroup core ] * - * can_attach(): - * cgroup_mutex + * freezer_can_attach(): + * cgroup_mutex (held by caller of can_attach) * - * cgroup_frozen(): + * cgroup_freezing_or_frozen(): * task->alloc_lock (to get task's cgroup) * * freezer_fork() (preserving fork() performance means can't take cgroup_mutex): - * task->alloc_lock (to get task's cgroup) * freezer->lock * sighand->siglock (if the cgroup is freezing) * * freezer_read(): * cgroup_mutex * freezer->lock + * write_lock css_set_lock (cgroup iterator start) + * task->alloc_lock * read_lock css_set_lock (cgroup iterator start) * * freezer_write() (freeze): * cgroup_mutex * freezer->lock + * write_lock css_set_lock (cgroup iterator start) + * task->alloc_lock * read_lock css_set_lock (cgroup iterator start) - * sighand->siglock + * sighand->siglock (fake signal delivery inside freeze_task()) * * freezer_write() (unfreeze): * cgroup_mutex * freezer->lock + * write_lock css_set_lock (cgroup iterator start) + * task->alloc_lock * read_lock css_set_lock (cgroup iterator start) - * task->alloc_lock (to prevent races with freeze_task()) + * task->alloc_lock (inside thaw_process(), prevents race with refrigerator()) * sighand->siglock */ static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss, @@ -201,9 +210,12 @@ static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task) * No lock is needed, since the task isn't on tasklist yet, * so it can't be moved to another cgroup, which means the * freezer won't be removed and will be valid during this - * function call. + * function call. Nevertheless, apply RCU read-side critical + * section to suppress RCU lockdep false positives. */ + rcu_read_lock(); freezer = task_freezer(task); + rcu_read_unlock(); /* * The root cgroup is non-freezable, so we can skip the diff --git a/kernel/compat.c b/kernel/compat.c index f6c204f0..5adab05 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -25,6 +25,7 @@ #include <linux/posix-timers.h> #include <linux/times.h> #include <linux/ptrace.h> +#include <linux/gfp.h> #include <asm/uaccess.h> @@ -494,29 +495,26 @@ asmlinkage long compat_sys_sched_getaffinity(compat_pid_t pid, unsigned int len, { int ret; cpumask_var_t mask; - unsigned long *k; - unsigned int min_length = cpumask_size(); - - if (nr_cpu_ids <= BITS_PER_COMPAT_LONG) - min_length = sizeof(compat_ulong_t); - if (len < min_length) + if ((len * BITS_PER_BYTE) < nr_cpu_ids) + return -EINVAL; + if (len & (sizeof(compat_ulong_t)-1)) return -EINVAL; if (!alloc_cpumask_var(&mask, GFP_KERNEL)) return -ENOMEM; ret = sched_getaffinity(pid, mask); - if (ret < 0) - goto out; + if (ret == 0) { + size_t retlen = min_t(size_t, len, cpumask_size()); - k = cpumask_bits(mask); - ret = compat_put_bitmap(user_mask_ptr, k, min_length * 8); - if (ret == 0) - ret = min_length; - -out: + if (compat_put_bitmap(user_mask_ptr, cpumask_bits(mask), retlen * 8)) + ret = -EFAULT; + else + ret = retlen; + } free_cpumask_var(mask); + return ret; } diff --git a/kernel/cpu.c b/kernel/cpu.c index f8cced2..97d1b42 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -14,18 +14,35 @@ #include <linux/kthread.h> #include <linux/stop_machine.h> #include <linux/mutex.h> +#include <linux/gfp.h> #ifdef CONFIG_SMP /* Serializes the updates to cpu_online_mask, cpu_present_mask */ static DEFINE_MUTEX(cpu_add_remove_lock); -static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain); +/* + * The following two API's must be used when attempting + * to serialize the updates to cpu_online_mask, cpu_present_mask. + */ +void cpu_maps_update_begin(void) +{ + mutex_lock(&cpu_add_remove_lock); +} + +void cpu_maps_update_done(void) +{ + mutex_unlock(&cpu_add_remove_lock); +} + +static RAW_NOTIFIER_HEAD(cpu_chain); /* If set, cpu_up and cpu_down will return -EBUSY and do nothing. * Should always be manipulated under cpu_add_remove_lock */ static int cpu_hotplug_disabled; +#ifdef CONFIG_HOTPLUG_CPU + static struct { struct task_struct *active_writer; struct mutex lock; /* Synchronizes accesses to refcount, */ @@ -40,8 +57,6 @@ static struct { .refcount = 0, }; -#ifdef CONFIG_HOTPLUG_CPU - void get_online_cpus(void) { might_sleep(); @@ -66,22 +81,6 @@ void put_online_cpus(void) } EXPORT_SYMBOL_GPL(put_online_cpus); -#endif /* CONFIG_HOTPLUG_CPU */ - -/* - * The following two API's must be used when attempting - * to serialize the updates to cpu_online_mask, cpu_present_mask. - */ -void cpu_maps_update_begin(void) -{ - mutex_lock(&cpu_add_remove_lock); -} - -void cpu_maps_update_done(void) -{ - mutex_unlock(&cpu_add_remove_lock); -} - /* * This ensures that the hotplug operation can begin only when the * refcount goes to zero. @@ -123,6 +122,12 @@ static void cpu_hotplug_done(void) cpu_hotplug.active_writer = NULL; mutex_unlock(&cpu_hotplug.lock); } + +#else /* #if CONFIG_HOTPLUG_CPU */ +static void cpu_hotplug_begin(void) {} +static void cpu_hotplug_done(void) {} +#endif /* #esle #if CONFIG_HOTPLUG_CPU */ + /* Need to know about CPUs going up/down? */ int __ref register_cpu_notifier(struct notifier_block *nb) { @@ -133,8 +138,29 @@ int __ref register_cpu_notifier(struct notifier_block *nb) return ret; } +static int __cpu_notify(unsigned long val, void *v, int nr_to_call, + int *nr_calls) +{ + int ret; + + ret = __raw_notifier_call_chain(&cpu_chain, val, v, nr_to_call, + nr_calls); + + return notifier_to_errno(ret); +} + +static int cpu_notify(unsigned long val, void *v) +{ + return __cpu_notify(val, v, -1, NULL); +} + #ifdef CONFIG_HOTPLUG_CPU +static void cpu_notify_nofail(unsigned long val, void *v) +{ + BUG_ON(cpu_notify(val, v)); +} + EXPORT_SYMBOL(register_cpu_notifier); void __ref unregister_cpu_notifier(struct notifier_block *nb) @@ -163,6 +189,7 @@ static inline void check_for_tasks(int cpu) } struct take_cpu_down_param { + struct task_struct *caller; unsigned long mod; void *hcpu; }; @@ -171,6 +198,7 @@ struct take_cpu_down_param { static int __ref take_cpu_down(void *_param) { struct take_cpu_down_param *param = _param; + unsigned int cpu = (unsigned long)param->hcpu; int err; /* Ensure this CPU doesn't handle any more interrupts. */ @@ -178,9 +206,10 @@ static int __ref take_cpu_down(void *_param) if (err < 0) return err; - raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod, - param->hcpu); + cpu_notify(CPU_DYING | param->mod, param->hcpu); + if (task_cpu(param->caller) == cpu) + move_task_off_dead_cpu(cpu, param->caller); /* Force idle task to run as soon as we yield: it should immediately notice cpu is offline and die quickly. */ sched_idle_next(); @@ -191,10 +220,10 @@ static int __ref take_cpu_down(void *_param) static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) { int err, nr_calls = 0; - cpumask_var_t old_allowed; void *hcpu = (void *)(long)cpu; unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; struct take_cpu_down_param tcd_param = { + .caller = current, .mod = mod, .hcpu = hcpu, }; @@ -205,38 +234,26 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) if (!cpu_online(cpu)) return -EINVAL; - if (!alloc_cpumask_var(&old_allowed, GFP_KERNEL)) - return -ENOMEM; - cpu_hotplug_begin(); set_cpu_active(cpu, false); - err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod, - hcpu, -1, &nr_calls); - if (err == NOTIFY_BAD) { + err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls); + if (err) { set_cpu_active(cpu, true); nr_calls--; - __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, - hcpu, nr_calls, NULL); + __cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL); printk("%s: attempt to take down CPU %u failed\n", __func__, cpu); - err = -EINVAL; goto out_release; } - /* Ensure that we are not runnable on dying cpu */ - cpumask_copy(old_allowed, ¤t->cpus_allowed); - set_cpus_allowed_ptr(current, cpu_active_mask); - err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); if (err) { set_cpu_active(cpu, true); /* CPU didn't die: tell everyone. Can't complain. */ - if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, - hcpu) == NOTIFY_BAD) - BUG(); + cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu); - goto out_allowed; + goto out_release; } BUG_ON(cpu_online(cpu)); @@ -248,22 +265,14 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) __cpu_die(cpu); /* CPU is completely dead: tell everyone. Too late to complain. */ - if (raw_notifier_call_chain(&cpu_chain, CPU_DEAD | mod, - hcpu) == NOTIFY_BAD) - BUG(); + cpu_notify_nofail(CPU_DEAD | mod, hcpu); check_for_tasks(cpu); -out_allowed: - set_cpus_allowed_ptr(current, old_allowed); out_release: cpu_hotplug_done(); - if (!err) { - if (raw_notifier_call_chain(&cpu_chain, CPU_POST_DEAD | mod, - hcpu) == NOTIFY_BAD) - BUG(); - } - free_cpumask_var(old_allowed); + if (!err) + cpu_notify_nofail(CPU_POST_DEAD | mod, hcpu); return err; } @@ -271,9 +280,6 @@ int __ref cpu_down(unsigned int cpu) { int err; - err = stop_machine_create(); - if (err) - return err; cpu_maps_update_begin(); if (cpu_hotplug_disabled) { @@ -285,7 +291,6 @@ int __ref cpu_down(unsigned int cpu) out: cpu_maps_update_done(); - stop_machine_destroy(); return err; } EXPORT_SYMBOL(cpu_down); @@ -302,13 +307,11 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) return -EINVAL; cpu_hotplug_begin(); - ret = __raw_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE | mod, hcpu, - -1, &nr_calls); - if (ret == NOTIFY_BAD) { + ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls); + if (ret) { nr_calls--; printk("%s: attempt to bring up CPU %u failed\n", __func__, cpu); - ret = -EINVAL; goto out_notify; } @@ -321,12 +324,11 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) set_cpu_active(cpu, true); /* Now call notifier in preparation. */ - raw_notifier_call_chain(&cpu_chain, CPU_ONLINE | mod, hcpu); + cpu_notify(CPU_ONLINE | mod, hcpu); out_notify: if (ret != 0) - __raw_notifier_call_chain(&cpu_chain, - CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL); + __cpu_notify(CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL); cpu_hotplug_done(); return ret; @@ -335,6 +337,12 @@ out_notify: int __cpuinit cpu_up(unsigned int cpu) { int err = 0; + +#ifdef CONFIG_MEMORY_HOTPLUG + int nid; + pg_data_t *pgdat; +#endif + if (!cpu_possible(cpu)) { printk(KERN_ERR "can't online cpu %d because it is not " "configured as may-hotadd at boot time\n", cpu); @@ -345,6 +353,28 @@ int __cpuinit cpu_up(unsigned int cpu) return -EINVAL; } +#ifdef CONFIG_MEMORY_HOTPLUG + nid = cpu_to_node(cpu); + if (!node_online(nid)) { + err = mem_online_node(nid); + if (err) + return err; + } + + pgdat = NODE_DATA(nid); + if (!pgdat) { + printk(KERN_ERR + "Can't online cpu %d due to NULL pgdat\n", cpu); + return -ENOMEM; + } + + if (pgdat->node_zonelists->_zonerefs->zone == NULL) { + mutex_lock(&zonelists_mutex); + build_all_zonelists(NULL); + mutex_unlock(&zonelists_mutex); + } +#endif + cpu_maps_update_begin(); if (cpu_hotplug_disabled) { @@ -364,11 +394,8 @@ static cpumask_var_t frozen_cpus; int disable_nonboot_cpus(void) { - int cpu, first_cpu, error; + int cpu, first_cpu, error = 0; - error = stop_machine_create(); - if (error) - return error; cpu_maps_update_begin(); first_cpu = cpumask_first(cpu_online_mask); /* @@ -399,7 +426,6 @@ int disable_nonboot_cpus(void) printk(KERN_ERR "Non-boot CPUs are not disabled\n"); } cpu_maps_update_done(); - stop_machine_destroy(); return error; } @@ -466,7 +492,7 @@ void __cpuinit notify_cpu_starting(unsigned int cpu) if (frozen_cpus != NULL && cpumask_test_cpu(cpu, frozen_cpus)) val = CPU_STARTING_FROZEN; #endif /* CONFIG_PM_SLEEP_SMP */ - raw_notifier_call_chain(&cpu_chain, val, (void *)(long)cpu); + cpu_notify(val, (void *)(long)cpu); } #endif /* CONFIG_SMP */ diff --git a/kernel/cpuset.c b/kernel/cpuset.c index ba401fa..7cb37d8 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -105,7 +105,7 @@ struct cpuset { /* for custom sched domain */ int relax_domain_level; - /* used for walking a cpuset heirarchy */ + /* used for walking a cpuset hierarchy */ struct list_head stack_list; }; @@ -920,9 +920,6 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, * call to guarantee_online_mems(), as we know no one is changing * our task's cpuset. * - * Hold callback_mutex around the two modifications of our tasks - * mems_allowed to synchronize with cpuset_mems_allowed(). - * * While the mm_struct we are migrating is typically from some * other task, the task_struct mems_allowed that we are hacking * is for our current task, which must allocate new pages for that @@ -949,16 +946,62 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, * In order to avoid seeing no nodes if the old and new nodes are disjoint, * we structure updates as setting all new allowed nodes, then clearing newly * disallowed ones. - * - * Called with task's alloc_lock held */ static void cpuset_change_task_nodemask(struct task_struct *tsk, nodemask_t *newmems) { +repeat: + /* + * Allow tasks that have access to memory reserves because they have + * been OOM killed to get memory anywhere. + */ + if (unlikely(test_thread_flag(TIF_MEMDIE))) + return; + if (current->flags & PF_EXITING) /* Let dying task have memory */ + return; + + task_lock(tsk); nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); - mpol_rebind_task(tsk, &tsk->mems_allowed); - mpol_rebind_task(tsk, newmems); + mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1); + + + /* + * ensure checking ->mems_allowed_change_disable after setting all new + * allowed nodes. + * + * the read-side task can see an nodemask with new allowed nodes and + * old allowed nodes. and if it allocates page when cpuset clears newly + * disallowed ones continuous, it can see the new allowed bits. + * + * And if setting all new allowed nodes is after the checking, setting + * all new allowed nodes and clearing newly disallowed ones will be done + * continuous, and the read-side task may find no node to alloc page. + */ + smp_mb(); + + /* + * Allocation of memory is very fast, we needn't sleep when waiting + * for the read-side. + */ + while (ACCESS_ONCE(tsk->mems_allowed_change_disable)) { + task_unlock(tsk); + if (!task_curr(tsk)) + yield(); + goto repeat; + } + + /* + * ensure checking ->mems_allowed_change_disable before clearing all new + * disallowed nodes. + * + * if clearing newly disallowed bits before the checking, the read-side + * task may find no node to alloc page. + */ + smp_mb(); + + mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2); tsk->mems_allowed = *newmems; + task_unlock(tsk); } /* @@ -973,14 +1016,17 @@ static void cpuset_change_nodemask(struct task_struct *p, struct cpuset *cs; int migrate; const nodemask_t *oldmem = scan->data; - nodemask_t newmems; + NODEMASK_ALLOC(nodemask_t, newmems, GFP_KERNEL); + + if (!newmems) + return; cs = cgroup_cs(scan->cg); - guarantee_online_mems(cs, &newmems); + guarantee_online_mems(cs, newmems); - task_lock(p); - cpuset_change_task_nodemask(p, &newmems); - task_unlock(p); + cpuset_change_task_nodemask(p, newmems); + + NODEMASK_FREE(newmems); mm = get_task_mm(p); if (!mm) @@ -1051,16 +1097,21 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem, static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, const char *buf) { - nodemask_t oldmem; + NODEMASK_ALLOC(nodemask_t, oldmem, GFP_KERNEL); int retval; struct ptr_heap heap; + if (!oldmem) + return -ENOMEM; + /* * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY]; * it's read-only */ - if (cs == &top_cpuset) - return -EACCES; + if (cs == &top_cpuset) { + retval = -EACCES; + goto done; + } /* * An empty mems_allowed is ok iff there are no tasks in the cpuset. @@ -1076,11 +1127,13 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, goto done; if (!nodes_subset(trialcs->mems_allowed, - node_states[N_HIGH_MEMORY])) - return -EINVAL; + node_states[N_HIGH_MEMORY])) { + retval = -EINVAL; + goto done; + } } - oldmem = cs->mems_allowed; - if (nodes_equal(oldmem, trialcs->mems_allowed)) { + *oldmem = cs->mems_allowed; + if (nodes_equal(*oldmem, trialcs->mems_allowed)) { retval = 0; /* Too easy - nothing to do */ goto done; } @@ -1096,10 +1149,11 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, cs->mems_allowed = trialcs->mems_allowed; mutex_unlock(&callback_mutex); - update_tasks_nodemask(cs, &oldmem, &heap); + update_tasks_nodemask(cs, oldmem, &heap); heap_free(&heap); done: + NODEMASK_FREE(oldmem); return retval; } @@ -1373,9 +1427,7 @@ static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to, err = set_cpus_allowed_ptr(tsk, cpus_attach); WARN_ON_ONCE(err); - task_lock(tsk); cpuset_change_task_nodemask(tsk, to); - task_unlock(tsk); cpuset_update_task_spread_flag(cs, tsk); } @@ -1384,40 +1436,47 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont, struct cgroup *oldcont, struct task_struct *tsk, bool threadgroup) { - nodemask_t from, to; struct mm_struct *mm; struct cpuset *cs = cgroup_cs(cont); struct cpuset *oldcs = cgroup_cs(oldcont); + NODEMASK_ALLOC(nodemask_t, from, GFP_KERNEL); + NODEMASK_ALLOC(nodemask_t, to, GFP_KERNEL); + + if (from == NULL || to == NULL) + goto alloc_fail; if (cs == &top_cpuset) { cpumask_copy(cpus_attach, cpu_possible_mask); - to = node_possible_map; } else { guarantee_online_cpus(cs, cpus_attach); - guarantee_online_mems(cs, &to); } + guarantee_online_mems(cs, to); /* do per-task migration stuff possibly for each in the threadgroup */ - cpuset_attach_task(tsk, &to, cs); + cpuset_attach_task(tsk, to, cs); if (threadgroup) { struct task_struct *c; rcu_read_lock(); list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { - cpuset_attach_task(c, &to, cs); + cpuset_attach_task(c, to, cs); } rcu_read_unlock(); } /* change mm; only needs to be done once even if threadgroup */ - from = oldcs->mems_allowed; - to = cs->mems_allowed; + *from = oldcs->mems_allowed; + *to = cs->mems_allowed; mm = get_task_mm(tsk); if (mm) { - mpol_rebind_mm(mm, &to); + mpol_rebind_mm(mm, to); if (is_memory_migrate(cs)) - cpuset_migrate_mm(mm, &from, &to); + cpuset_migrate_mm(mm, from, to); mmput(mm); } + +alloc_fail: + NODEMASK_FREE(from); + NODEMASK_FREE(to); } /* The various types of files and directories in a cpuset file system */ @@ -1562,13 +1621,21 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs) static int cpuset_sprintf_memlist(char *page, struct cpuset *cs) { - nodemask_t mask; + NODEMASK_ALLOC(nodemask_t, mask, GFP_KERNEL); + int retval; + + if (mask == NULL) + return -ENOMEM; mutex_lock(&callback_mutex); - mask = cs->mems_allowed; + *mask = cs->mems_allowed; mutex_unlock(&callback_mutex); - return nodelist_scnprintf(page, PAGE_SIZE, mask); + retval = nodelist_scnprintf(page, PAGE_SIZE, *mask); + + NODEMASK_FREE(mask); + + return retval; } static ssize_t cpuset_common_file_read(struct cgroup *cont, @@ -1997,7 +2064,10 @@ static void scan_for_empty_cpusets(struct cpuset *root) struct cpuset *cp; /* scans cpusets being updated */ struct cpuset *child; /* scans child cpusets of cp */ struct cgroup *cont; - nodemask_t oldmems; + NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL); + + if (oldmems == NULL) + return; list_add_tail((struct list_head *)&root->stack_list, &queue); @@ -2014,7 +2084,7 @@ static void scan_for_empty_cpusets(struct cpuset *root) nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY])) continue; - oldmems = cp->mems_allowed; + *oldmems = cp->mems_allowed; /* Remove offline cpus and mems from this cpuset. */ mutex_lock(&callback_mutex); @@ -2030,9 +2100,10 @@ static void scan_for_empty_cpusets(struct cpuset *root) remove_tasks_in_empty_cpuset(cp); else { update_tasks_cpumask(cp, NULL); - update_tasks_nodemask(cp, &oldmems, NULL); + update_tasks_nodemask(cp, oldmems, NULL); } } + NODEMASK_FREE(oldmems); } /* @@ -2090,20 +2161,33 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb, static int cpuset_track_online_nodes(struct notifier_block *self, unsigned long action, void *arg) { + NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL); + + if (oldmems == NULL) + return NOTIFY_DONE; + cgroup_lock(); switch (action) { case MEM_ONLINE: - case MEM_OFFLINE: + *oldmems = top_cpuset.mems_allowed; mutex_lock(&callback_mutex); top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; mutex_unlock(&callback_mutex); - if (action == MEM_OFFLINE) - scan_for_empty_cpusets(&top_cpuset); + update_tasks_nodemask(&top_cpuset, oldmems, NULL); + break; + case MEM_OFFLINE: + /* + * needn't update top_cpuset.mems_allowed explicitly because + * scan_for_empty_cpusets() will update it. + */ + scan_for_empty_cpusets(&top_cpuset); break; default: break; } cgroup_unlock(); + + NODEMASK_FREE(oldmems); return NOTIFY_OK; } #endif @@ -2140,19 +2224,52 @@ void __init cpuset_init_smp(void) void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) { mutex_lock(&callback_mutex); - cpuset_cpus_allowed_locked(tsk, pmask); + task_lock(tsk); + guarantee_online_cpus(task_cs(tsk), pmask); + task_unlock(tsk); mutex_unlock(&callback_mutex); } -/** - * cpuset_cpus_allowed_locked - return cpus_allowed mask from a tasks cpuset. - * Must be called with callback_mutex held. - **/ -void cpuset_cpus_allowed_locked(struct task_struct *tsk, struct cpumask *pmask) +int cpuset_cpus_allowed_fallback(struct task_struct *tsk) { - task_lock(tsk); - guarantee_online_cpus(task_cs(tsk), pmask); - task_unlock(tsk); + const struct cpuset *cs; + int cpu; + + rcu_read_lock(); + cs = task_cs(tsk); + if (cs) + cpumask_copy(&tsk->cpus_allowed, cs->cpus_allowed); + rcu_read_unlock(); + + /* + * We own tsk->cpus_allowed, nobody can change it under us. + * + * But we used cs && cs->cpus_allowed lockless and thus can + * race with cgroup_attach_task() or update_cpumask() and get + * the wrong tsk->cpus_allowed. However, both cases imply the + * subsequent cpuset_change_cpumask()->set_cpus_allowed_ptr() + * which takes task_rq_lock(). + * + * If we are called after it dropped the lock we must see all + * changes in tsk_cs()->cpus_allowed. Otherwise we can temporary + * set any mask even if it is not right from task_cs() pov, + * the pending set_cpus_allowed_ptr() will fix things. + */ + + cpu = cpumask_any_and(&tsk->cpus_allowed, cpu_active_mask); + if (cpu >= nr_cpu_ids) { + /* + * Either tsk->cpus_allowed is wrong (see above) or it + * is actually empty. The latter case is only possible + * if we are racing with remove_tasks_in_empty_cpuset(). + * Like above we can temporary set any mask and rely on + * set_cpus_allowed_ptr() as synchronization point. + */ + cpumask_copy(&tsk->cpus_allowed, cpu_possible_mask); + cpu = cpumask_any(cpu_active_mask); + } + + return cpu; } void cpuset_init_current_mems_allowed(void) @@ -2341,22 +2458,6 @@ int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask) } /** - * cpuset_lock - lock out any changes to cpuset structures - * - * The out of memory (oom) code needs to mutex_lock cpusets - * from being changed while it scans the tasklist looking for a - * task in an overlapping cpuset. Expose callback_mutex via this - * cpuset_lock() routine, so the oom code can lock it, before - * locking the task list. The tasklist_lock is a spinlock, so - * must be taken inside callback_mutex. - */ - -void cpuset_lock(void) -{ - mutex_lock(&callback_mutex); -} - -/** * cpuset_unlock - release lock on cpuset changes * * Undo the lock taken in a previous cpuset_lock() call. @@ -2368,7 +2469,8 @@ void cpuset_unlock(void) } /** - * cpuset_mem_spread_node() - On which node to begin search for a page + * cpuset_mem_spread_node() - On which node to begin search for a file page + * cpuset_slab_spread_node() - On which node to begin search for a slab page * * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for * tasks in a cpuset with is_spread_page or is_spread_slab set), @@ -2393,16 +2495,27 @@ void cpuset_unlock(void) * See kmem_cache_alloc_node(). */ -int cpuset_mem_spread_node(void) +static int cpuset_spread_node(int *rotor) { int node; - node = next_node(current->cpuset_mem_spread_rotor, current->mems_allowed); + node = next_node(*rotor, current->mems_allowed); if (node == MAX_NUMNODES) node = first_node(current->mems_allowed); - current->cpuset_mem_spread_rotor = node; + *rotor = node; return node; } + +int cpuset_mem_spread_node(void) +{ + return cpuset_spread_node(¤t->cpuset_mem_spread_rotor); +} + +int cpuset_slab_spread_node(void) +{ + return cpuset_spread_node(¤t->cpuset_slab_spread_rotor); +} + EXPORT_SYMBOL_GPL(cpuset_mem_spread_node); /** diff --git a/kernel/cred-internals.h b/kernel/cred-internals.h deleted file mode 100644 index 2dc4fc2..0000000 --- a/kernel/cred-internals.h +++ /dev/null @@ -1,21 +0,0 @@ -/* Internal credentials stuff - * - * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public Licence - * as published by the Free Software Foundation; either version - * 2 of the Licence, or (at your option) any later version. - */ - -/* - * user.c - */ -static inline void sched_switch_user(struct task_struct *p) -{ -#ifdef CONFIG_USER_SCHED - sched_move_task(p); -#endif /* CONFIG_USER_SCHED */ -} - diff --git a/kernel/cred.c b/kernel/cred.c index 1ed8ca1..60bc8b1 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -10,13 +10,13 @@ */ #include <linux/module.h> #include <linux/cred.h> +#include <linux/slab.h> #include <linux/sched.h> #include <linux/key.h> #include <linux/keyctl.h> #include <linux/init_task.h> #include <linux/security.h> #include <linux/cn_proc.h> -#include "cred-internals.h" #if 0 #define kdebug(FMT, ...) \ @@ -209,6 +209,31 @@ void exit_creds(struct task_struct *tsk) } } +/** + * get_task_cred - Get another task's objective credentials + * @task: The task to query + * + * Get the objective credentials of a task, pinning them so that they can't go + * away. Accessing a task's credentials directly is not permitted. + * + * The caller must also make sure task doesn't get deleted, either by holding a + * ref on task or by holding tasklist_lock to prevent it from being unlinked. + */ +const struct cred *get_task_cred(struct task_struct *task) +{ + const struct cred *cred; + + rcu_read_lock(); + + do { + cred = __task_cred((task)); + BUG_ON(!cred); + } while (!atomic_inc_not_zero(&((struct cred *)cred)->usage)); + + rcu_read_unlock(); + return cred; +} + /* * Allocate blank credentials, such that the credentials can be filled in at a * later date without risk of ENOMEM. @@ -347,60 +372,6 @@ struct cred *prepare_exec_creds(void) } /* - * prepare new credentials for the usermode helper dispatcher - */ -struct cred *prepare_usermodehelper_creds(void) -{ -#ifdef CONFIG_KEYS - struct thread_group_cred *tgcred = NULL; -#endif - struct cred *new; - -#ifdef CONFIG_KEYS - tgcred = kzalloc(sizeof(*new->tgcred), GFP_ATOMIC); - if (!tgcred) - return NULL; -#endif - - new = kmem_cache_alloc(cred_jar, GFP_ATOMIC); - if (!new) - return NULL; - - kdebug("prepare_usermodehelper_creds() alloc %p", new); - - memcpy(new, &init_cred, sizeof(struct cred)); - - atomic_set(&new->usage, 1); - set_cred_subscribers(new, 0); - get_group_info(new->group_info); - get_uid(new->user); - -#ifdef CONFIG_KEYS - new->thread_keyring = NULL; - new->request_key_auth = NULL; - new->jit_keyring = KEY_REQKEY_DEFL_DEFAULT; - - atomic_set(&tgcred->usage, 1); - spin_lock_init(&tgcred->lock); - new->tgcred = tgcred; -#endif - -#ifdef CONFIG_SECURITY - new->security = NULL; -#endif - if (security_prepare_creds(new, &init_cred, GFP_ATOMIC) < 0) - goto error; - validate_creds(new); - - BUG_ON(atomic_read(&new->usage) != 1); - return new; - -error: - put_cred(new); - return NULL; -} - -/* * Copy credentials for the new process created by fork() * * We share if we can, but under some circumstances we have to generate a new @@ -516,8 +487,6 @@ int commit_creds(struct cred *new) #endif BUG_ON(atomic_read(&new->usage) < 1); - security_commit_creds(new, old); - get_cred(new); /* we will require a ref for the subj creds too */ /* dumpability changes */ @@ -553,8 +522,6 @@ int commit_creds(struct cred *new) atomic_dec(&old->user->processes); alter_cred_subscribers(old, -2); - sched_switch_user(task); - /* send notifications */ if (new->uid != old->uid || new->euid != old->euid || @@ -786,8 +753,6 @@ bool creds_are_invalid(const struct cred *cred) { if (cred->magic != CRED_MAGIC) return true; - if (atomic_read(&cred->usage) < atomic_read(&cred->subscribers)) - return true; #ifdef CONFIG_SECURITY_SELINUX if (selinux_is_enabled()) { if ((unsigned long) cred->security < PAGE_SIZE) diff --git a/kernel/debug/Makefile b/kernel/debug/Makefile new file mode 100644 index 0000000..a85edc3 --- /dev/null +++ b/kernel/debug/Makefile @@ -0,0 +1,6 @@ +# +# Makefile for the linux kernel debugger +# + +obj-$(CONFIG_KGDB) += debug_core.o gdbstub.o +obj-$(CONFIG_KGDB_KDB) += kdb/ diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c new file mode 100644 index 0000000..51d14fe8 --- /dev/null +++ b/kernel/debug/debug_core.c @@ -0,0 +1,983 @@ +/* + * Kernel Debug Core + * + * Maintainer: Jason Wessel <jason.wessel@windriver.com> + * + * Copyright (C) 2000-2001 VERITAS Software Corporation. + * Copyright (C) 2002-2004 Timesys Corporation + * Copyright (C) 2003-2004 Amit S. Kale <amitkale@linsyssoft.com> + * Copyright (C) 2004 Pavel Machek <pavel@ucw.cz> + * Copyright (C) 2004-2006 Tom Rini <trini@kernel.crashing.org> + * Copyright (C) 2004-2006 LinSysSoft Technologies Pvt. Ltd. + * Copyright (C) 2005-2009 Wind River Systems, Inc. + * Copyright (C) 2007 MontaVista Software, Inc. + * Copyright (C) 2008 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> + * + * Contributors at various stages not listed above: + * Jason Wessel ( jason.wessel@windriver.com ) + * George Anzinger <george@mvista.com> + * Anurekh Saxena (anurekh.saxena@timesys.com) + * Lake Stevens Instrument Division (Glenn Engel) + * Jim Kingdon, Cygnus Support. + * + * Original KGDB stub: David Grothe <dave@gcom.com>, + * Tigran Aivazian <tigran@sco.com> + * + * This file is licensed under the terms of the GNU General Public License + * version 2. This program is licensed "as is" without any warranty of any + * kind, whether express or implied. + */ +#include <linux/pid_namespace.h> +#include <linux/clocksource.h> +#include <linux/interrupt.h> +#include <linux/spinlock.h> +#include <linux/console.h> +#include <linux/threads.h> +#include <linux/uaccess.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/ptrace.h> +#include <linux/string.h> +#include <linux/delay.h> +#include <linux/sched.h> +#include <linux/sysrq.h> +#include <linux/init.h> +#include <linux/kgdb.h> +#include <linux/kdb.h> +#include <linux/pid.h> +#include <linux/smp.h> +#include <linux/mm.h> + +#include <asm/cacheflush.h> +#include <asm/byteorder.h> +#include <asm/atomic.h> +#include <asm/system.h> + +#include "debug_core.h" + +static int kgdb_break_asap; + +struct debuggerinfo_struct kgdb_info[NR_CPUS]; + +/** + * kgdb_connected - Is a host GDB connected to us? + */ +int kgdb_connected; +EXPORT_SYMBOL_GPL(kgdb_connected); + +/* All the KGDB handlers are installed */ +int kgdb_io_module_registered; + +/* Guard for recursive entry */ +static int exception_level; + +struct kgdb_io *dbg_io_ops; +static DEFINE_SPINLOCK(kgdb_registration_lock); + +/* kgdb console driver is loaded */ +static int kgdb_con_registered; +/* determine if kgdb console output should be used */ +static int kgdb_use_con; +/* Flag for alternate operations for early debugging */ +bool dbg_is_early = true; +/* Next cpu to become the master debug core */ +int dbg_switch_cpu; + +/* Use kdb or gdbserver mode */ +int dbg_kdb_mode = 1; + +static int __init opt_kgdb_con(char *str) +{ + kgdb_use_con = 1; + return 0; +} + +early_param("kgdbcon", opt_kgdb_con); + +module_param(kgdb_use_con, int, 0644); + +/* + * Holds information about breakpoints in a kernel. These breakpoints are + * added and removed by gdb. + */ +static struct kgdb_bkpt kgdb_break[KGDB_MAX_BREAKPOINTS] = { + [0 ... KGDB_MAX_BREAKPOINTS-1] = { .state = BP_UNDEFINED } +}; + +/* + * The CPU# of the active CPU, or -1 if none: + */ +atomic_t kgdb_active = ATOMIC_INIT(-1); +EXPORT_SYMBOL_GPL(kgdb_active); + +/* + * We use NR_CPUs not PERCPU, in case kgdb is used to debug early + * bootup code (which might not have percpu set up yet): + */ +static atomic_t passive_cpu_wait[NR_CPUS]; +static atomic_t cpu_in_kgdb[NR_CPUS]; +static atomic_t kgdb_break_tasklet_var; +atomic_t kgdb_setting_breakpoint; + +struct task_struct *kgdb_usethread; +struct task_struct *kgdb_contthread; + +int kgdb_single_step; +static pid_t kgdb_sstep_pid; + +/* to keep track of the CPU which is doing the single stepping*/ +atomic_t kgdb_cpu_doing_single_step = ATOMIC_INIT(-1); + +/* + * If you are debugging a problem where roundup (the collection of + * all other CPUs) is a problem [this should be extremely rare], + * then use the nokgdbroundup option to avoid roundup. In that case + * the other CPUs might interfere with your debugging context, so + * use this with care: + */ +static int kgdb_do_roundup = 1; + +static int __init opt_nokgdbroundup(char *str) +{ + kgdb_do_roundup = 0; + + return 0; +} + +early_param("nokgdbroundup", opt_nokgdbroundup); + +/* + * Finally, some KGDB code :-) + */ + +/* + * Weak aliases for breakpoint management, + * can be overriden by architectures when needed: + */ +int __weak kgdb_arch_set_breakpoint(unsigned long addr, char *saved_instr) +{ + int err; + + err = probe_kernel_read(saved_instr, (char *)addr, BREAK_INSTR_SIZE); + if (err) + return err; + + return probe_kernel_write((char *)addr, arch_kgdb_ops.gdb_bpt_instr, + BREAK_INSTR_SIZE); +} + +int __weak kgdb_arch_remove_breakpoint(unsigned long addr, char *bundle) +{ + return probe_kernel_write((char *)addr, + (char *)bundle, BREAK_INSTR_SIZE); +} + +int __weak kgdb_validate_break_address(unsigned long addr) +{ + char tmp_variable[BREAK_INSTR_SIZE]; + int err; + /* Validate setting the breakpoint and then removing it. In the + * remove fails, the kernel needs to emit a bad message because we + * are deep trouble not being able to put things back the way we + * found them. + */ + err = kgdb_arch_set_breakpoint(addr, tmp_variable); + if (err) + return err; + err = kgdb_arch_remove_breakpoint(addr, tmp_variable); + if (err) + printk(KERN_ERR "KGDB: Critical breakpoint error, kernel " + "memory destroyed at: %lx", addr); + return err; +} + +unsigned long __weak kgdb_arch_pc(int exception, struct pt_regs *regs) +{ + return instruction_pointer(regs); +} + +int __weak kgdb_arch_init(void) +{ + return 0; +} + +int __weak kgdb_skipexception(int exception, struct pt_regs *regs) +{ + return 0; +} + +/** + * kgdb_disable_hw_debug - Disable hardware debugging while we in kgdb. + * @regs: Current &struct pt_regs. + * + * This function will be called if the particular architecture must + * disable hardware debugging while it is processing gdb packets or + * handling exception. + */ +void __weak kgdb_disable_hw_debug(struct pt_regs *regs) +{ +} + +/* + * Some architectures need cache flushes when we set/clear a + * breakpoint: + */ +static void kgdb_flush_swbreak_addr(unsigned long addr) +{ + if (!CACHE_FLUSH_IS_SAFE) + return; + + if (current->mm && current->mm->mmap_cache) { + flush_cache_range(current->mm->mmap_cache, + addr, addr + BREAK_INSTR_SIZE); + } + /* Force flush instruction cache if it was outside the mm */ + flush_icache_range(addr, addr + BREAK_INSTR_SIZE); +} + +/* + * SW breakpoint management: + */ +int dbg_activate_sw_breakpoints(void) +{ + unsigned long addr; + int error; + int ret = 0; + int i; + + for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { + if (kgdb_break[i].state != BP_SET) + continue; + + addr = kgdb_break[i].bpt_addr; + error = kgdb_arch_set_breakpoint(addr, + kgdb_break[i].saved_instr); + if (error) { + ret = error; + printk(KERN_INFO "KGDB: BP install failed: %lx", addr); + continue; + } + + kgdb_flush_swbreak_addr(addr); + kgdb_break[i].state = BP_ACTIVE; + } + return ret; +} + +int dbg_set_sw_break(unsigned long addr) +{ + int err = kgdb_validate_break_address(addr); + int breakno = -1; + int i; + + if (err) + return err; + + for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { + if ((kgdb_break[i].state == BP_SET) && + (kgdb_break[i].bpt_addr == addr)) + return -EEXIST; + } + for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { + if (kgdb_break[i].state == BP_REMOVED && + kgdb_break[i].bpt_addr == addr) { + breakno = i; + break; + } + } + + if (breakno == -1) { + for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { + if (kgdb_break[i].state == BP_UNDEFINED) { + breakno = i; + break; + } + } + } + + if (breakno == -1) + return -E2BIG; + + kgdb_break[breakno].state = BP_SET; + kgdb_break[breakno].type = BP_BREAKPOINT; + kgdb_break[breakno].bpt_addr = addr; + + return 0; +} + +int dbg_deactivate_sw_breakpoints(void) +{ + unsigned long addr; + int error; + int ret = 0; + int i; + + for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { + if (kgdb_break[i].state != BP_ACTIVE) + continue; + addr = kgdb_break[i].bpt_addr; + error = kgdb_arch_remove_breakpoint(addr, + kgdb_break[i].saved_instr); + if (error) { + printk(KERN_INFO "KGDB: BP remove failed: %lx\n", addr); + ret = error; + } + + kgdb_flush_swbreak_addr(addr); + kgdb_break[i].state = BP_SET; + } + return ret; +} + +int dbg_remove_sw_break(unsigned long addr) +{ + int i; + + for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { + if ((kgdb_break[i].state == BP_SET) && + (kgdb_break[i].bpt_addr == addr)) { + kgdb_break[i].state = BP_REMOVED; + return 0; + } + } + return -ENOENT; +} + +int kgdb_isremovedbreak(unsigned long addr) +{ + int i; + + for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { + if ((kgdb_break[i].state == BP_REMOVED) && + (kgdb_break[i].bpt_addr == addr)) + return 1; + } + return 0; +} + +int dbg_remove_all_break(void) +{ + unsigned long addr; + int error; + int i; + + /* Clear memory breakpoints. */ + for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { + if (kgdb_break[i].state != BP_ACTIVE) + goto setundefined; + addr = kgdb_break[i].bpt_addr; + error = kgdb_arch_remove_breakpoint(addr, + kgdb_break[i].saved_instr); + if (error) + printk(KERN_ERR "KGDB: breakpoint remove failed: %lx\n", + addr); +setundefined: + kgdb_break[i].state = BP_UNDEFINED; + } + + /* Clear hardware breakpoints. */ + if (arch_kgdb_ops.remove_all_hw_break) + arch_kgdb_ops.remove_all_hw_break(); + + return 0; +} + +/* + * Return true if there is a valid kgdb I/O module. Also if no + * debugger is attached a message can be printed to the console about + * waiting for the debugger to attach. + * + * The print_wait argument is only to be true when called from inside + * the core kgdb_handle_exception, because it will wait for the + * debugger to attach. + */ +static int kgdb_io_ready(int print_wait) +{ + if (!dbg_io_ops) + return 0; + if (kgdb_connected) + return 1; + if (atomic_read(&kgdb_setting_breakpoint)) + return 1; + if (print_wait) { +#ifdef CONFIG_KGDB_KDB + if (!dbg_kdb_mode) + printk(KERN_CRIT "KGDB: waiting... or $3#33 for KDB\n"); +#else + printk(KERN_CRIT "KGDB: Waiting for remote debugger\n"); +#endif + } + return 1; +} + +static int kgdb_reenter_check(struct kgdb_state *ks) +{ + unsigned long addr; + + if (atomic_read(&kgdb_active) != raw_smp_processor_id()) + return 0; + + /* Panic on recursive debugger calls: */ + exception_level++; + addr = kgdb_arch_pc(ks->ex_vector, ks->linux_regs); + dbg_deactivate_sw_breakpoints(); + + /* + * If the break point removed ok at the place exception + * occurred, try to recover and print a warning to the end + * user because the user planted a breakpoint in a place that + * KGDB needs in order to function. + */ + if (dbg_remove_sw_break(addr) == 0) { + exception_level = 0; + kgdb_skipexception(ks->ex_vector, ks->linux_regs); + dbg_activate_sw_breakpoints(); + printk(KERN_CRIT "KGDB: re-enter error: breakpoint removed %lx\n", + addr); + WARN_ON_ONCE(1); + + return 1; + } + dbg_remove_all_break(); + kgdb_skipexception(ks->ex_vector, ks->linux_regs); + + if (exception_level > 1) { + dump_stack(); + panic("Recursive entry to debugger"); + } + + printk(KERN_CRIT "KGDB: re-enter exception: ALL breakpoints killed\n"); +#ifdef CONFIG_KGDB_KDB + /* Allow kdb to debug itself one level */ + return 0; +#endif + dump_stack(); + panic("Recursive entry to debugger"); + + return 1; +} + +static void dbg_cpu_switch(int cpu, int next_cpu) +{ + /* Mark the cpu we are switching away from as a slave when it + * holds the kgdb_active token. This must be done so that the + * that all the cpus wait in for the debug core will not enter + * again as the master. */ + if (cpu == atomic_read(&kgdb_active)) { + kgdb_info[cpu].exception_state |= DCPU_IS_SLAVE; + kgdb_info[cpu].exception_state &= ~DCPU_WANT_MASTER; + } + kgdb_info[next_cpu].exception_state |= DCPU_NEXT_MASTER; +} + +static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs) +{ + unsigned long flags; + int sstep_tries = 100; + int error; + int i, cpu; + int trace_on = 0; +acquirelock: + /* + * Interrupts will be restored by the 'trap return' code, except when + * single stepping. + */ + local_irq_save(flags); + + cpu = ks->cpu; + kgdb_info[cpu].debuggerinfo = regs; + kgdb_info[cpu].task = current; + kgdb_info[cpu].ret_state = 0; + kgdb_info[cpu].irq_depth = hardirq_count() >> HARDIRQ_SHIFT; + /* + * Make sure the above info reaches the primary CPU before + * our cpu_in_kgdb[] flag setting does: + */ + atomic_inc(&cpu_in_kgdb[cpu]); + + if (exception_level == 1) + goto cpu_master_loop; + + /* + * CPU will loop if it is a slave or request to become a kgdb + * master cpu and acquire the kgdb_active lock: + */ + while (1) { +cpu_loop: + if (kgdb_info[cpu].exception_state & DCPU_NEXT_MASTER) { + kgdb_info[cpu].exception_state &= ~DCPU_NEXT_MASTER; + goto cpu_master_loop; + } else if (kgdb_info[cpu].exception_state & DCPU_WANT_MASTER) { + if (atomic_cmpxchg(&kgdb_active, -1, cpu) == cpu) + break; + } else if (kgdb_info[cpu].exception_state & DCPU_IS_SLAVE) { + if (!atomic_read(&passive_cpu_wait[cpu])) + goto return_normal; + } else { +return_normal: + /* Return to normal operation by executing any + * hw breakpoint fixup. + */ + if (arch_kgdb_ops.correct_hw_break) + arch_kgdb_ops.correct_hw_break(); + if (trace_on) + tracing_on(); + atomic_dec(&cpu_in_kgdb[cpu]); + touch_softlockup_watchdog_sync(); + clocksource_touch_watchdog(); + local_irq_restore(flags); + return 0; + } + cpu_relax(); + } + + /* + * For single stepping, try to only enter on the processor + * that was single stepping. To gaurd against a deadlock, the + * kernel will only try for the value of sstep_tries before + * giving up and continuing on. + */ + if (atomic_read(&kgdb_cpu_doing_single_step) != -1 && + (kgdb_info[cpu].task && + kgdb_info[cpu].task->pid != kgdb_sstep_pid) && --sstep_tries) { + atomic_set(&kgdb_active, -1); + touch_softlockup_watchdog_sync(); + clocksource_touch_watchdog(); + local_irq_restore(flags); + + goto acquirelock; + } + + if (!kgdb_io_ready(1)) { + kgdb_info[cpu].ret_state = 1; + goto kgdb_restore; /* No I/O connection, resume the system */ + } + + /* + * Don't enter if we have hit a removed breakpoint. + */ + if (kgdb_skipexception(ks->ex_vector, ks->linux_regs)) + goto kgdb_restore; + + /* Call the I/O driver's pre_exception routine */ + if (dbg_io_ops->pre_exception) + dbg_io_ops->pre_exception(); + + kgdb_disable_hw_debug(ks->linux_regs); + + /* + * Get the passive CPU lock which will hold all the non-primary + * CPU in a spin state while the debugger is active + */ + if (!kgdb_single_step) { + for (i = 0; i < NR_CPUS; i++) + atomic_inc(&passive_cpu_wait[i]); + } + +#ifdef CONFIG_SMP + /* Signal the other CPUs to enter kgdb_wait() */ + if ((!kgdb_single_step) && kgdb_do_roundup) + kgdb_roundup_cpus(flags); +#endif + + /* + * Wait for the other CPUs to be notified and be waiting for us: + */ + for_each_online_cpu(i) { + while (kgdb_do_roundup && !atomic_read(&cpu_in_kgdb[i])) + cpu_relax(); + } + + /* + * At this point the primary processor is completely + * in the debugger and all secondary CPUs are quiescent + */ + dbg_deactivate_sw_breakpoints(); + kgdb_single_step = 0; + kgdb_contthread = current; + exception_level = 0; + trace_on = tracing_is_on(); + if (trace_on) + tracing_off(); + + while (1) { +cpu_master_loop: + if (dbg_kdb_mode) { + kgdb_connected = 1; + error = kdb_stub(ks); + kgdb_connected = 0; + } else { + error = gdb_serial_stub(ks); + } + + if (error == DBG_PASS_EVENT) { + dbg_kdb_mode = !dbg_kdb_mode; + } else if (error == DBG_SWITCH_CPU_EVENT) { + dbg_cpu_switch(cpu, dbg_switch_cpu); + goto cpu_loop; + } else { + kgdb_info[cpu].ret_state = error; + break; + } + } + + /* Call the I/O driver's post_exception routine */ + if (dbg_io_ops->post_exception) + dbg_io_ops->post_exception(); + + atomic_dec(&cpu_in_kgdb[ks->cpu]); + + if (!kgdb_single_step) { + for (i = NR_CPUS-1; i >= 0; i--) + atomic_dec(&passive_cpu_wait[i]); + /* + * Wait till all the CPUs have quit from the debugger, + * but allow a CPU that hit an exception and is + * waiting to become the master to remain in the debug + * core. + */ + for_each_online_cpu(i) { + while (kgdb_do_roundup && + atomic_read(&cpu_in_kgdb[i]) && + !(kgdb_info[i].exception_state & + DCPU_WANT_MASTER)) + cpu_relax(); + } + } + +kgdb_restore: + if (atomic_read(&kgdb_cpu_doing_single_step) != -1) { + int sstep_cpu = atomic_read(&kgdb_cpu_doing_single_step); + if (kgdb_info[sstep_cpu].task) + kgdb_sstep_pid = kgdb_info[sstep_cpu].task->pid; + else + kgdb_sstep_pid = 0; + } + if (trace_on) + tracing_on(); + /* Free kgdb_active */ + atomic_set(&kgdb_active, -1); + touch_softlockup_watchdog_sync(); + clocksource_touch_watchdog(); + local_irq_restore(flags); + + return kgdb_info[cpu].ret_state; +} + +/* + * kgdb_handle_exception() - main entry point from a kernel exception + * + * Locking hierarchy: + * interface locks, if any (begin_session) + * kgdb lock (kgdb_active) + */ +int +kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs) +{ + struct kgdb_state kgdb_var; + struct kgdb_state *ks = &kgdb_var; + int ret; + + ks->cpu = raw_smp_processor_id(); + ks->ex_vector = evector; + ks->signo = signo; + ks->err_code = ecode; + ks->kgdb_usethreadid = 0; + ks->linux_regs = regs; + + if (kgdb_reenter_check(ks)) + return 0; /* Ouch, double exception ! */ + kgdb_info[ks->cpu].exception_state |= DCPU_WANT_MASTER; + ret = kgdb_cpu_enter(ks, regs); + kgdb_info[ks->cpu].exception_state &= ~(DCPU_WANT_MASTER | + DCPU_IS_SLAVE); + return ret; +} + +int kgdb_nmicallback(int cpu, void *regs) +{ +#ifdef CONFIG_SMP + struct kgdb_state kgdb_var; + struct kgdb_state *ks = &kgdb_var; + + memset(ks, 0, sizeof(struct kgdb_state)); + ks->cpu = cpu; + ks->linux_regs = regs; + + if (!atomic_read(&cpu_in_kgdb[cpu]) && + atomic_read(&kgdb_active) != -1 && + atomic_read(&kgdb_active) != cpu) { + kgdb_info[cpu].exception_state |= DCPU_IS_SLAVE; + kgdb_cpu_enter(ks, regs); + kgdb_info[cpu].exception_state &= ~DCPU_IS_SLAVE; + return 0; + } +#endif + return 1; +} + +static void kgdb_console_write(struct console *co, const char *s, + unsigned count) +{ + unsigned long flags; + + /* If we're debugging, or KGDB has not connected, don't try + * and print. */ + if (!kgdb_connected || atomic_read(&kgdb_active) != -1 || dbg_kdb_mode) + return; + + local_irq_save(flags); + gdbstub_msg_write(s, count); + local_irq_restore(flags); +} + +static struct console kgdbcons = { + .name = "kgdb", + .write = kgdb_console_write, + .flags = CON_PRINTBUFFER | CON_ENABLED, + .index = -1, +}; + +#ifdef CONFIG_MAGIC_SYSRQ +static void sysrq_handle_dbg(int key, struct tty_struct *tty) +{ + if (!dbg_io_ops) { + printk(KERN_CRIT "ERROR: No KGDB I/O module available\n"); + return; + } + if (!kgdb_connected) { +#ifdef CONFIG_KGDB_KDB + if (!dbg_kdb_mode) + printk(KERN_CRIT "KGDB or $3#33 for KDB\n"); +#else + printk(KERN_CRIT "Entering KGDB\n"); +#endif + } + + kgdb_breakpoint(); +} + +static struct sysrq_key_op sysrq_dbg_op = { + .handler = sysrq_handle_dbg, + .help_msg = "debug(G)", + .action_msg = "DEBUG", +}; +#endif + +static int kgdb_panic_event(struct notifier_block *self, + unsigned long val, + void *data) +{ + if (dbg_kdb_mode) + kdb_printf("PANIC: %s\n", (char *)data); + kgdb_breakpoint(); + return NOTIFY_DONE; +} + +static struct notifier_block kgdb_panic_event_nb = { + .notifier_call = kgdb_panic_event, + .priority = INT_MAX, +}; + +void __weak kgdb_arch_late(void) +{ +} + +void __init dbg_late_init(void) +{ + dbg_is_early = false; + if (kgdb_io_module_registered) + kgdb_arch_late(); + kdb_init(KDB_INIT_FULL); +} + +static void kgdb_register_callbacks(void) +{ + if (!kgdb_io_module_registered) { + kgdb_io_module_registered = 1; + kgdb_arch_init(); + if (!dbg_is_early) + kgdb_arch_late(); + atomic_notifier_chain_register(&panic_notifier_list, + &kgdb_panic_event_nb); +#ifdef CONFIG_MAGIC_SYSRQ + register_sysrq_key('g', &sysrq_dbg_op); +#endif + if (kgdb_use_con && !kgdb_con_registered) { + register_console(&kgdbcons); + kgdb_con_registered = 1; + } + } +} + +static void kgdb_unregister_callbacks(void) +{ + /* + * When this routine is called KGDB should unregister from the + * panic handler and clean up, making sure it is not handling any + * break exceptions at the time. + */ + if (kgdb_io_module_registered) { + kgdb_io_module_registered = 0; + atomic_notifier_chain_unregister(&panic_notifier_list, + &kgdb_panic_event_nb); + kgdb_arch_exit(); +#ifdef CONFIG_MAGIC_SYSRQ + unregister_sysrq_key('g', &sysrq_dbg_op); +#endif + if (kgdb_con_registered) { + unregister_console(&kgdbcons); + kgdb_con_registered = 0; + } + } +} + +/* + * There are times a tasklet needs to be used vs a compiled in + * break point so as to cause an exception outside a kgdb I/O module, + * such as is the case with kgdboe, where calling a breakpoint in the + * I/O driver itself would be fatal. + */ +static void kgdb_tasklet_bpt(unsigned long ing) +{ + kgdb_breakpoint(); + atomic_set(&kgdb_break_tasklet_var, 0); +} + +static DECLARE_TASKLET(kgdb_tasklet_breakpoint, kgdb_tasklet_bpt, 0); + +void kgdb_schedule_breakpoint(void) +{ + if (atomic_read(&kgdb_break_tasklet_var) || + atomic_read(&kgdb_active) != -1 || + atomic_read(&kgdb_setting_breakpoint)) + return; + atomic_inc(&kgdb_break_tasklet_var); + tasklet_schedule(&kgdb_tasklet_breakpoint); +} +EXPORT_SYMBOL_GPL(kgdb_schedule_breakpoint); + +static void kgdb_initial_breakpoint(void) +{ + kgdb_break_asap = 0; + + printk(KERN_CRIT "kgdb: Waiting for connection from remote gdb...\n"); + kgdb_breakpoint(); +} + +/** + * kgdb_register_io_module - register KGDB IO module + * @new_dbg_io_ops: the io ops vector + * + * Register it with the KGDB core. + */ +int kgdb_register_io_module(struct kgdb_io *new_dbg_io_ops) +{ + int err; + + spin_lock(&kgdb_registration_lock); + + if (dbg_io_ops) { + spin_unlock(&kgdb_registration_lock); + + printk(KERN_ERR "kgdb: Another I/O driver is already " + "registered with KGDB.\n"); + return -EBUSY; + } + + if (new_dbg_io_ops->init) { + err = new_dbg_io_ops->init(); + if (err) { + spin_unlock(&kgdb_registration_lock); + return err; + } + } + + dbg_io_ops = new_dbg_io_ops; + + spin_unlock(&kgdb_registration_lock); + + printk(KERN_INFO "kgdb: Registered I/O driver %s.\n", + new_dbg_io_ops->name); + + /* Arm KGDB now. */ + kgdb_register_callbacks(); + + if (kgdb_break_asap) + kgdb_initial_breakpoint(); + + return 0; +} +EXPORT_SYMBOL_GPL(kgdb_register_io_module); + +/** + * kkgdb_unregister_io_module - unregister KGDB IO module + * @old_dbg_io_ops: the io ops vector + * + * Unregister it with the KGDB core. + */ +void kgdb_unregister_io_module(struct kgdb_io *old_dbg_io_ops) +{ + BUG_ON(kgdb_connected); + + /* + * KGDB is no longer able to communicate out, so + * unregister our callbacks and reset state. + */ + kgdb_unregister_callbacks(); + + spin_lock(&kgdb_registration_lock); + + WARN_ON_ONCE(dbg_io_ops != old_dbg_io_ops); + dbg_io_ops = NULL; + + spin_unlock(&kgdb_registration_lock); + + printk(KERN_INFO + "kgdb: Unregistered I/O driver %s, debugger disabled.\n", + old_dbg_io_ops->name); +} +EXPORT_SYMBOL_GPL(kgdb_unregister_io_module); + +int dbg_io_get_char(void) +{ + int ret = dbg_io_ops->read_char(); + if (ret == NO_POLL_CHAR) + return -1; + if (!dbg_kdb_mode) + return ret; + if (ret == 127) + return 8; + return ret; +} + +/** + * kgdb_breakpoint - generate breakpoint exception + * + * This function will generate a breakpoint exception. It is used at the + * beginning of a program to sync up with a debugger and can be used + * otherwise as a quick means to stop program execution and "break" into + * the debugger. + */ +void kgdb_breakpoint(void) +{ + atomic_inc(&kgdb_setting_breakpoint); + wmb(); /* Sync point before breakpoint */ + arch_kgdb_breakpoint(); + wmb(); /* Sync point after breakpoint */ + atomic_dec(&kgdb_setting_breakpoint); +} +EXPORT_SYMBOL_GPL(kgdb_breakpoint); + +static int __init opt_kgdb_wait(char *str) +{ + kgdb_break_asap = 1; + + kdb_init(KDB_INIT_EARLY); + if (kgdb_io_module_registered) + kgdb_initial_breakpoint(); + + return 0; +} + +early_param("kgdbwait", opt_kgdb_wait); diff --git a/kernel/debug/debug_core.h b/kernel/debug/debug_core.h new file mode 100644 index 0000000..c5d753d --- /dev/null +++ b/kernel/debug/debug_core.h @@ -0,0 +1,81 @@ +/* + * Created by: Jason Wessel <jason.wessel@windriver.com> + * + * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved. + * + * This file is licensed under the terms of the GNU General Public + * License version 2. This program is licensed "as is" without any + * warranty of any kind, whether express or implied. + */ + +#ifndef _DEBUG_CORE_H_ +#define _DEBUG_CORE_H_ +/* + * These are the private implementation headers between the kernel + * debugger core and the debugger front end code. + */ + +/* kernel debug core data structures */ +struct kgdb_state { + int ex_vector; + int signo; + int err_code; + int cpu; + int pass_exception; + unsigned long thr_query; + unsigned long threadid; + long kgdb_usethreadid; + struct pt_regs *linux_regs; +}; + +/* Exception state values */ +#define DCPU_WANT_MASTER 0x1 /* Waiting to become a master kgdb cpu */ +#define DCPU_NEXT_MASTER 0x2 /* Transition from one master cpu to another */ +#define DCPU_IS_SLAVE 0x4 /* Slave cpu enter exception */ +#define DCPU_SSTEP 0x8 /* CPU is single stepping */ + +struct debuggerinfo_struct { + void *debuggerinfo; + struct task_struct *task; + int exception_state; + int ret_state; + int irq_depth; +}; + +extern struct debuggerinfo_struct kgdb_info[]; + +/* kernel debug core break point routines */ +extern int dbg_remove_all_break(void); +extern int dbg_set_sw_break(unsigned long addr); +extern int dbg_remove_sw_break(unsigned long addr); +extern int dbg_activate_sw_breakpoints(void); +extern int dbg_deactivate_sw_breakpoints(void); + +/* polled character access to i/o module */ +extern int dbg_io_get_char(void); + +/* stub return value for switching between the gdbstub and kdb */ +#define DBG_PASS_EVENT -12345 +/* Switch from one cpu to another */ +#define DBG_SWITCH_CPU_EVENT -123456 +extern int dbg_switch_cpu; + +/* gdbstub interface functions */ +extern int gdb_serial_stub(struct kgdb_state *ks); +extern void gdbstub_msg_write(const char *s, int len); + +/* gdbstub functions used for kdb <-> gdbstub transition */ +extern int gdbstub_state(struct kgdb_state *ks, char *cmd); +extern int dbg_kdb_mode; + +#ifdef CONFIG_KGDB_KDB +extern int kdb_stub(struct kgdb_state *ks); +extern int kdb_parse(const char *cmdstr); +#else /* ! CONFIG_KGDB_KDB */ +static inline int kdb_stub(struct kgdb_state *ks) +{ + return DBG_PASS_EVENT; +} +#endif /* CONFIG_KGDB_KDB */ + +#endif /* _DEBUG_CORE_H_ */ diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c new file mode 100644 index 0000000..6e81fd5 --- /dev/null +++ b/kernel/debug/gdbstub.c @@ -0,0 +1,1014 @@ +/* + * Kernel Debug Core + * + * Maintainer: Jason Wessel <jason.wessel@windriver.com> + * + * Copyright (C) 2000-2001 VERITAS Software Corporation. + * Copyright (C) 2002-2004 Timesys Corporation + * Copyright (C) 2003-2004 Amit S. Kale <amitkale@linsyssoft.com> + * Copyright (C) 2004 Pavel Machek <pavel@ucw.cz> + * Copyright (C) 2004-2006 Tom Rini <trini@kernel.crashing.org> + * Copyright (C) 2004-2006 LinSysSoft Technologies Pvt. Ltd. + * Copyright (C) 2005-2009 Wind River Systems, Inc. + * Copyright (C) 2007 MontaVista Software, Inc. + * Copyright (C) 2008 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> + * + * Contributors at various stages not listed above: + * Jason Wessel ( jason.wessel@windriver.com ) + * George Anzinger <george@mvista.com> + * Anurekh Saxena (anurekh.saxena@timesys.com) + * Lake Stevens Instrument Division (Glenn Engel) + * Jim Kingdon, Cygnus Support. + * + * Original KGDB stub: David Grothe <dave@gcom.com>, + * Tigran Aivazian <tigran@sco.com> + * + * This file is licensed under the terms of the GNU General Public License + * version 2. This program is licensed "as is" without any warranty of any + * kind, whether express or implied. + */ + +#include <linux/kernel.h> +#include <linux/kgdb.h> +#include <linux/kdb.h> +#include <linux/reboot.h> +#include <linux/uaccess.h> +#include <asm/cacheflush.h> +#include <asm/unaligned.h> +#include "debug_core.h" + +#define KGDB_MAX_THREAD_QUERY 17 + +/* Our I/O buffers. */ +static char remcom_in_buffer[BUFMAX]; +static char remcom_out_buffer[BUFMAX]; + +/* Storage for the registers, in GDB format. */ +static unsigned long gdb_regs[(NUMREGBYTES + + sizeof(unsigned long) - 1) / + sizeof(unsigned long)]; + +/* + * GDB remote protocol parser: + */ + +static int hex(char ch) +{ + if ((ch >= 'a') && (ch <= 'f')) + return ch - 'a' + 10; + if ((ch >= '0') && (ch <= '9')) + return ch - '0'; + if ((ch >= 'A') && (ch <= 'F')) + return ch - 'A' + 10; + return -1; +} + +#ifdef CONFIG_KGDB_KDB +static int gdbstub_read_wait(void) +{ + int ret = -1; + int i; + + /* poll any additional I/O interfaces that are defined */ + while (ret < 0) + for (i = 0; kdb_poll_funcs[i] != NULL; i++) { + ret = kdb_poll_funcs[i](); + if (ret > 0) + break; + } + return ret; +} +#else +static int gdbstub_read_wait(void) +{ + int ret = dbg_io_ops->read_char(); + while (ret == NO_POLL_CHAR) + ret = dbg_io_ops->read_char(); + return ret; +} +#endif +/* scan for the sequence $<data>#<checksum> */ +static void get_packet(char *buffer) +{ + unsigned char checksum; + unsigned char xmitcsum; + int count; + char ch; + + do { + /* + * Spin and wait around for the start character, ignore all + * other characters: + */ + while ((ch = (gdbstub_read_wait())) != '$') + /* nothing */; + + kgdb_connected = 1; + checksum = 0; + xmitcsum = -1; + + count = 0; + + /* + * now, read until a # or end of buffer is found: + */ + while (count < (BUFMAX - 1)) { + ch = gdbstub_read_wait(); + if (ch == '#') + break; + checksum = checksum + ch; + buffer[count] = ch; + count = count + 1; + } + buffer[count] = 0; + + if (ch == '#') { + xmitcsum = hex(gdbstub_read_wait()) << 4; + xmitcsum += hex(gdbstub_read_wait()); + + if (checksum != xmitcsum) + /* failed checksum */ + dbg_io_ops->write_char('-'); + else + /* successful transfer */ + dbg_io_ops->write_char('+'); + if (dbg_io_ops->flush) + dbg_io_ops->flush(); + } + } while (checksum != xmitcsum); +} + +/* + * Send the packet in buffer. + * Check for gdb connection if asked for. + */ +static void put_packet(char *buffer) +{ + unsigned char checksum; + int count; + char ch; + + /* + * $<packet info>#<checksum>. + */ + while (1) { + dbg_io_ops->write_char('$'); + checksum = 0; + count = 0; + + while ((ch = buffer[count])) { + dbg_io_ops->write_char(ch); + checksum += ch; + count++; + } + + dbg_io_ops->write_char('#'); + dbg_io_ops->write_char(hex_asc_hi(checksum)); + dbg_io_ops->write_char(hex_asc_lo(checksum)); + if (dbg_io_ops->flush) + dbg_io_ops->flush(); + + /* Now see what we get in reply. */ + ch = gdbstub_read_wait(); + + if (ch == 3) + ch = gdbstub_read_wait(); + + /* If we get an ACK, we are done. */ + if (ch == '+') + return; + + /* + * If we get the start of another packet, this means + * that GDB is attempting to reconnect. We will NAK + * the packet being sent, and stop trying to send this + * packet. + */ + if (ch == '$') { + dbg_io_ops->write_char('-'); + if (dbg_io_ops->flush) + dbg_io_ops->flush(); + return; + } + } +} + +static char gdbmsgbuf[BUFMAX + 1]; + +void gdbstub_msg_write(const char *s, int len) +{ + char *bufptr; + int wcount; + int i; + + if (len == 0) + len = strlen(s); + + /* 'O'utput */ + gdbmsgbuf[0] = 'O'; + + /* Fill and send buffers... */ + while (len > 0) { + bufptr = gdbmsgbuf + 1; + + /* Calculate how many this time */ + if ((len << 1) > (BUFMAX - 2)) + wcount = (BUFMAX - 2) >> 1; + else + wcount = len; + + /* Pack in hex chars */ + for (i = 0; i < wcount; i++) + bufptr = pack_hex_byte(bufptr, s[i]); + *bufptr = '\0'; + + /* Move up */ + s += wcount; + len -= wcount; + + /* Write packet */ + put_packet(gdbmsgbuf); + } +} + +/* + * Convert the memory pointed to by mem into hex, placing result in + * buf. Return a pointer to the last char put in buf (null). May + * return an error. + */ +int kgdb_mem2hex(char *mem, char *buf, int count) +{ + char *tmp; + int err; + + /* + * We use the upper half of buf as an intermediate buffer for the + * raw memory copy. Hex conversion will work against this one. + */ + tmp = buf + count; + + err = probe_kernel_read(tmp, mem, count); + if (!err) { + while (count > 0) { + buf = pack_hex_byte(buf, *tmp); + tmp++; + count--; + } + + *buf = 0; + } + + return err; +} + +/* + * Convert the hex array pointed to by buf into binary to be placed in + * mem. Return a pointer to the character AFTER the last byte + * written. May return an error. + */ +int kgdb_hex2mem(char *buf, char *mem, int count) +{ + char *tmp_raw; + char *tmp_hex; + + /* + * We use the upper half of buf as an intermediate buffer for the + * raw memory that is converted from hex. + */ + tmp_raw = buf + count * 2; + + tmp_hex = tmp_raw - 1; + while (tmp_hex >= buf) { + tmp_raw--; + *tmp_raw = hex(*tmp_hex--); + *tmp_raw |= hex(*tmp_hex--) << 4; + } + + return probe_kernel_write(mem, tmp_raw, count); +} + +/* + * While we find nice hex chars, build a long_val. + * Return number of chars processed. + */ +int kgdb_hex2long(char **ptr, unsigned long *long_val) +{ + int hex_val; + int num = 0; + int negate = 0; + + *long_val = 0; + + if (**ptr == '-') { + negate = 1; + (*ptr)++; + } + while (**ptr) { + hex_val = hex(**ptr); + if (hex_val < 0) + break; + + *long_val = (*long_val << 4) | hex_val; + num++; + (*ptr)++; + } + + if (negate) + *long_val = -*long_val; + + return num; +} + +/* + * Copy the binary array pointed to by buf into mem. Fix $, #, and + * 0x7d escaped with 0x7d. Return -EFAULT on failure or 0 on success. + * The input buf is overwitten with the result to write to mem. + */ +static int kgdb_ebin2mem(char *buf, char *mem, int count) +{ + int size = 0; + char *c = buf; + + while (count-- > 0) { + c[size] = *buf++; + if (c[size] == 0x7d) + c[size] = *buf++ ^ 0x20; + size++; + } + + return probe_kernel_write(mem, c, size); +} + +/* Write memory due to an 'M' or 'X' packet. */ +static int write_mem_msg(int binary) +{ + char *ptr = &remcom_in_buffer[1]; + unsigned long addr; + unsigned long length; + int err; + + if (kgdb_hex2long(&ptr, &addr) > 0 && *(ptr++) == ',' && + kgdb_hex2long(&ptr, &length) > 0 && *(ptr++) == ':') { + if (binary) + err = kgdb_ebin2mem(ptr, (char *)addr, length); + else + err = kgdb_hex2mem(ptr, (char *)addr, length); + if (err) + return err; + if (CACHE_FLUSH_IS_SAFE) + flush_icache_range(addr, addr + length); + return 0; + } + + return -EINVAL; +} + +static void error_packet(char *pkt, int error) +{ + error = -error; + pkt[0] = 'E'; + pkt[1] = hex_asc[(error / 10)]; + pkt[2] = hex_asc[(error % 10)]; + pkt[3] = '\0'; +} + +/* + * Thread ID accessors. We represent a flat TID space to GDB, where + * the per CPU idle threads (which under Linux all have PID 0) are + * remapped to negative TIDs. + */ + +#define BUF_THREAD_ID_SIZE 16 + +static char *pack_threadid(char *pkt, unsigned char *id) +{ + char *limit; + + limit = pkt + BUF_THREAD_ID_SIZE; + while (pkt < limit) + pkt = pack_hex_byte(pkt, *id++); + + return pkt; +} + +static void int_to_threadref(unsigned char *id, int value) +{ + unsigned char *scan; + int i = 4; + + scan = (unsigned char *)id; + while (i--) + *scan++ = 0; + put_unaligned_be32(value, scan); +} + +static struct task_struct *getthread(struct pt_regs *regs, int tid) +{ + /* + * Non-positive TIDs are remapped to the cpu shadow information + */ + if (tid == 0 || tid == -1) + tid = -atomic_read(&kgdb_active) - 2; + if (tid < -1 && tid > -NR_CPUS - 2) { + if (kgdb_info[-tid - 2].task) + return kgdb_info[-tid - 2].task; + else + return idle_task(-tid - 2); + } + if (tid <= 0) { + printk(KERN_ERR "KGDB: Internal thread select error\n"); + dump_stack(); + return NULL; + } + + /* + * find_task_by_pid_ns() does not take the tasklist lock anymore + * but is nicely RCU locked - hence is a pretty resilient + * thing to use: + */ + return find_task_by_pid_ns(tid, &init_pid_ns); +} + + +/* + * Remap normal tasks to their real PID, + * CPU shadow threads are mapped to -CPU - 2 + */ +static inline int shadow_pid(int realpid) +{ + if (realpid) + return realpid; + + return -raw_smp_processor_id() - 2; +} + +/* + * All the functions that start with gdb_cmd are the various + * operations to implement the handlers for the gdbserial protocol + * where KGDB is communicating with an external debugger + */ + +/* Handle the '?' status packets */ +static void gdb_cmd_status(struct kgdb_state *ks) +{ + /* + * We know that this packet is only sent + * during initial connect. So to be safe, + * we clear out our breakpoints now in case + * GDB is reconnecting. + */ + dbg_remove_all_break(); + + remcom_out_buffer[0] = 'S'; + pack_hex_byte(&remcom_out_buffer[1], ks->signo); +} + +/* Handle the 'g' get registers request */ +static void gdb_cmd_getregs(struct kgdb_state *ks) +{ + struct task_struct *thread; + void *local_debuggerinfo; + int i; + + thread = kgdb_usethread; + if (!thread) { + thread = kgdb_info[ks->cpu].task; + local_debuggerinfo = kgdb_info[ks->cpu].debuggerinfo; + } else { + local_debuggerinfo = NULL; + for_each_online_cpu(i) { + /* + * Try to find the task on some other + * or possibly this node if we do not + * find the matching task then we try + * to approximate the results. + */ + if (thread == kgdb_info[i].task) + local_debuggerinfo = kgdb_info[i].debuggerinfo; + } + } + + /* + * All threads that don't have debuggerinfo should be + * in schedule() sleeping, since all other CPUs + * are in kgdb_wait, and thus have debuggerinfo. + */ + if (local_debuggerinfo) { + pt_regs_to_gdb_regs(gdb_regs, local_debuggerinfo); + } else { + /* + * Pull stuff saved during switch_to; nothing + * else is accessible (or even particularly + * relevant). + * + * This should be enough for a stack trace. + */ + sleeping_thread_to_gdb_regs(gdb_regs, thread); + } + kgdb_mem2hex((char *)gdb_regs, remcom_out_buffer, NUMREGBYTES); +} + +/* Handle the 'G' set registers request */ +static void gdb_cmd_setregs(struct kgdb_state *ks) +{ + kgdb_hex2mem(&remcom_in_buffer[1], (char *)gdb_regs, NUMREGBYTES); + + if (kgdb_usethread && kgdb_usethread != current) { + error_packet(remcom_out_buffer, -EINVAL); + } else { + gdb_regs_to_pt_regs(gdb_regs, ks->linux_regs); + strcpy(remcom_out_buffer, "OK"); + } +} + +/* Handle the 'm' memory read bytes */ +static void gdb_cmd_memread(struct kgdb_state *ks) +{ + char *ptr = &remcom_in_buffer[1]; + unsigned long length; + unsigned long addr; + int err; + + if (kgdb_hex2long(&ptr, &addr) > 0 && *ptr++ == ',' && + kgdb_hex2long(&ptr, &length) > 0) { + err = kgdb_mem2hex((char *)addr, remcom_out_buffer, length); + if (err) + error_packet(remcom_out_buffer, err); + } else { + error_packet(remcom_out_buffer, -EINVAL); + } +} + +/* Handle the 'M' memory write bytes */ +static void gdb_cmd_memwrite(struct kgdb_state *ks) +{ + int err = write_mem_msg(0); + + if (err) + error_packet(remcom_out_buffer, err); + else + strcpy(remcom_out_buffer, "OK"); +} + +/* Handle the 'X' memory binary write bytes */ +static void gdb_cmd_binwrite(struct kgdb_state *ks) +{ + int err = write_mem_msg(1); + + if (err) + error_packet(remcom_out_buffer, err); + else + strcpy(remcom_out_buffer, "OK"); +} + +/* Handle the 'D' or 'k', detach or kill packets */ +static void gdb_cmd_detachkill(struct kgdb_state *ks) +{ + int error; + + /* The detach case */ + if (remcom_in_buffer[0] == 'D') { + error = dbg_remove_all_break(); + if (error < 0) { + error_packet(remcom_out_buffer, error); + } else { + strcpy(remcom_out_buffer, "OK"); + kgdb_connected = 0; + } + put_packet(remcom_out_buffer); + } else { + /* + * Assume the kill case, with no exit code checking, + * trying to force detach the debugger: + */ + dbg_remove_all_break(); + kgdb_connected = 0; + } +} + +/* Handle the 'R' reboot packets */ +static int gdb_cmd_reboot(struct kgdb_state *ks) +{ + /* For now, only honor R0 */ + if (strcmp(remcom_in_buffer, "R0") == 0) { + printk(KERN_CRIT "Executing emergency reboot\n"); + strcpy(remcom_out_buffer, "OK"); + put_packet(remcom_out_buffer); + + /* + * Execution should not return from + * machine_emergency_restart() + */ + machine_emergency_restart(); + kgdb_connected = 0; + + return 1; + } + return 0; +} + +/* Handle the 'q' query packets */ +static void gdb_cmd_query(struct kgdb_state *ks) +{ + struct task_struct *g; + struct task_struct *p; + unsigned char thref[8]; + char *ptr; + int i; + int cpu; + int finished = 0; + + switch (remcom_in_buffer[1]) { + case 's': + case 'f': + if (memcmp(remcom_in_buffer + 2, "ThreadInfo", 10)) + break; + + i = 0; + remcom_out_buffer[0] = 'm'; + ptr = remcom_out_buffer + 1; + if (remcom_in_buffer[1] == 'f') { + /* Each cpu is a shadow thread */ + for_each_online_cpu(cpu) { + ks->thr_query = 0; + int_to_threadref(thref, -cpu - 2); + pack_threadid(ptr, thref); + ptr += BUF_THREAD_ID_SIZE; + *(ptr++) = ','; + i++; + } + } + + do_each_thread(g, p) { + if (i >= ks->thr_query && !finished) { + int_to_threadref(thref, p->pid); + pack_threadid(ptr, thref); + ptr += BUF_THREAD_ID_SIZE; + *(ptr++) = ','; + ks->thr_query++; + if (ks->thr_query % KGDB_MAX_THREAD_QUERY == 0) + finished = 1; + } + i++; + } while_each_thread(g, p); + + *(--ptr) = '\0'; + break; + + case 'C': + /* Current thread id */ + strcpy(remcom_out_buffer, "QC"); + ks->threadid = shadow_pid(current->pid); + int_to_threadref(thref, ks->threadid); + pack_threadid(remcom_out_buffer + 2, thref); + break; + case 'T': + if (memcmp(remcom_in_buffer + 1, "ThreadExtraInfo,", 16)) + break; + + ks->threadid = 0; + ptr = remcom_in_buffer + 17; + kgdb_hex2long(&ptr, &ks->threadid); + if (!getthread(ks->linux_regs, ks->threadid)) { + error_packet(remcom_out_buffer, -EINVAL); + break; + } + if ((int)ks->threadid > 0) { + kgdb_mem2hex(getthread(ks->linux_regs, + ks->threadid)->comm, + remcom_out_buffer, 16); + } else { + static char tmpstr[23 + BUF_THREAD_ID_SIZE]; + + sprintf(tmpstr, "shadowCPU%d", + (int)(-ks->threadid - 2)); + kgdb_mem2hex(tmpstr, remcom_out_buffer, strlen(tmpstr)); + } + break; +#ifdef CONFIG_KGDB_KDB + case 'R': + if (strncmp(remcom_in_buffer, "qRcmd,", 6) == 0) { + int len = strlen(remcom_in_buffer + 6); + + if ((len % 2) != 0) { + strcpy(remcom_out_buffer, "E01"); + break; + } + kgdb_hex2mem(remcom_in_buffer + 6, + remcom_out_buffer, len); + len = len / 2; + remcom_out_buffer[len++] = 0; + + kdb_parse(remcom_out_buffer); + strcpy(remcom_out_buffer, "OK"); + } + break; +#endif + } +} + +/* Handle the 'H' task query packets */ +static void gdb_cmd_task(struct kgdb_state *ks) +{ + struct task_struct *thread; + char *ptr; + + switch (remcom_in_buffer[1]) { + case 'g': + ptr = &remcom_in_buffer[2]; + kgdb_hex2long(&ptr, &ks->threadid); + thread = getthread(ks->linux_regs, ks->threadid); + if (!thread && ks->threadid > 0) { + error_packet(remcom_out_buffer, -EINVAL); + break; + } + kgdb_usethread = thread; + ks->kgdb_usethreadid = ks->threadid; + strcpy(remcom_out_buffer, "OK"); + break; + case 'c': + ptr = &remcom_in_buffer[2]; + kgdb_hex2long(&ptr, &ks->threadid); + if (!ks->threadid) { + kgdb_contthread = NULL; + } else { + thread = getthread(ks->linux_regs, ks->threadid); + if (!thread && ks->threadid > 0) { + error_packet(remcom_out_buffer, -EINVAL); + break; + } + kgdb_contthread = thread; + } + strcpy(remcom_out_buffer, "OK"); + break; + } +} + +/* Handle the 'T' thread query packets */ +static void gdb_cmd_thread(struct kgdb_state *ks) +{ + char *ptr = &remcom_in_buffer[1]; + struct task_struct *thread; + + kgdb_hex2long(&ptr, &ks->threadid); + thread = getthread(ks->linux_regs, ks->threadid); + if (thread) + strcpy(remcom_out_buffer, "OK"); + else + error_packet(remcom_out_buffer, -EINVAL); +} + +/* Handle the 'z' or 'Z' breakpoint remove or set packets */ +static void gdb_cmd_break(struct kgdb_state *ks) +{ + /* + * Since GDB-5.3, it's been drafted that '0' is a software + * breakpoint, '1' is a hardware breakpoint, so let's do that. + */ + char *bpt_type = &remcom_in_buffer[1]; + char *ptr = &remcom_in_buffer[2]; + unsigned long addr; + unsigned long length; + int error = 0; + + if (arch_kgdb_ops.set_hw_breakpoint && *bpt_type >= '1') { + /* Unsupported */ + if (*bpt_type > '4') + return; + } else { + if (*bpt_type != '0' && *bpt_type != '1') + /* Unsupported. */ + return; + } + + /* + * Test if this is a hardware breakpoint, and + * if we support it: + */ + if (*bpt_type == '1' && !(arch_kgdb_ops.flags & KGDB_HW_BREAKPOINT)) + /* Unsupported. */ + return; + + if (*(ptr++) != ',') { + error_packet(remcom_out_buffer, -EINVAL); + return; + } + if (!kgdb_hex2long(&ptr, &addr)) { + error_packet(remcom_out_buffer, -EINVAL); + return; + } + if (*(ptr++) != ',' || + !kgdb_hex2long(&ptr, &length)) { + error_packet(remcom_out_buffer, -EINVAL); + return; + } + + if (remcom_in_buffer[0] == 'Z' && *bpt_type == '0') + error = dbg_set_sw_break(addr); + else if (remcom_in_buffer[0] == 'z' && *bpt_type == '0') + error = dbg_remove_sw_break(addr); + else if (remcom_in_buffer[0] == 'Z') + error = arch_kgdb_ops.set_hw_breakpoint(addr, + (int)length, *bpt_type - '0'); + else if (remcom_in_buffer[0] == 'z') + error = arch_kgdb_ops.remove_hw_breakpoint(addr, + (int) length, *bpt_type - '0'); + + if (error == 0) + strcpy(remcom_out_buffer, "OK"); + else + error_packet(remcom_out_buffer, error); +} + +/* Handle the 'C' signal / exception passing packets */ +static int gdb_cmd_exception_pass(struct kgdb_state *ks) +{ + /* C09 == pass exception + * C15 == detach kgdb, pass exception + */ + if (remcom_in_buffer[1] == '0' && remcom_in_buffer[2] == '9') { + + ks->pass_exception = 1; + remcom_in_buffer[0] = 'c'; + + } else if (remcom_in_buffer[1] == '1' && remcom_in_buffer[2] == '5') { + + ks->pass_exception = 1; + remcom_in_buffer[0] = 'D'; + dbg_remove_all_break(); + kgdb_connected = 0; + return 1; + + } else { + gdbstub_msg_write("KGDB only knows signal 9 (pass)" + " and 15 (pass and disconnect)\n" + "Executing a continue without signal passing\n", 0); + remcom_in_buffer[0] = 'c'; + } + + /* Indicate fall through */ + return -1; +} + +/* + * This function performs all gdbserial command procesing + */ +int gdb_serial_stub(struct kgdb_state *ks) +{ + int error = 0; + int tmp; + + /* Clear the out buffer. */ + memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer)); + + if (kgdb_connected) { + unsigned char thref[8]; + char *ptr; + + /* Reply to host that an exception has occurred */ + ptr = remcom_out_buffer; + *ptr++ = 'T'; + ptr = pack_hex_byte(ptr, ks->signo); + ptr += strlen(strcpy(ptr, "thread:")); + int_to_threadref(thref, shadow_pid(current->pid)); + ptr = pack_threadid(ptr, thref); + *ptr++ = ';'; + put_packet(remcom_out_buffer); + } + + kgdb_usethread = kgdb_info[ks->cpu].task; + ks->kgdb_usethreadid = shadow_pid(kgdb_info[ks->cpu].task->pid); + ks->pass_exception = 0; + + while (1) { + error = 0; + + /* Clear the out buffer. */ + memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer)); + + get_packet(remcom_in_buffer); + + switch (remcom_in_buffer[0]) { + case '?': /* gdbserial status */ + gdb_cmd_status(ks); + break; + case 'g': /* return the value of the CPU registers */ + gdb_cmd_getregs(ks); + break; + case 'G': /* set the value of the CPU registers - return OK */ + gdb_cmd_setregs(ks); + break; + case 'm': /* mAA..AA,LLLL Read LLLL bytes at address AA..AA */ + gdb_cmd_memread(ks); + break; + case 'M': /* MAA..AA,LLLL: Write LLLL bytes at address AA..AA */ + gdb_cmd_memwrite(ks); + break; + case 'X': /* XAA..AA,LLLL: Write LLLL bytes at address AA..AA */ + gdb_cmd_binwrite(ks); + break; + /* kill or detach. KGDB should treat this like a + * continue. + */ + case 'D': /* Debugger detach */ + case 'k': /* Debugger detach via kill */ + gdb_cmd_detachkill(ks); + goto default_handle; + case 'R': /* Reboot */ + if (gdb_cmd_reboot(ks)) + goto default_handle; + break; + case 'q': /* query command */ + gdb_cmd_query(ks); + break; + case 'H': /* task related */ + gdb_cmd_task(ks); + break; + case 'T': /* Query thread status */ + gdb_cmd_thread(ks); + break; + case 'z': /* Break point remove */ + case 'Z': /* Break point set */ + gdb_cmd_break(ks); + break; +#ifdef CONFIG_KGDB_KDB + case '3': /* Escape into back into kdb */ + if (remcom_in_buffer[1] == '\0') { + gdb_cmd_detachkill(ks); + return DBG_PASS_EVENT; + } +#endif + case 'C': /* Exception passing */ + tmp = gdb_cmd_exception_pass(ks); + if (tmp > 0) + goto default_handle; + if (tmp == 0) + break; + /* Fall through on tmp < 0 */ + case 'c': /* Continue packet */ + case 's': /* Single step packet */ + if (kgdb_contthread && kgdb_contthread != current) { + /* Can't switch threads in kgdb */ + error_packet(remcom_out_buffer, -EINVAL); + break; + } + dbg_activate_sw_breakpoints(); + /* Fall through to default processing */ + default: +default_handle: + error = kgdb_arch_handle_exception(ks->ex_vector, + ks->signo, + ks->err_code, + remcom_in_buffer, + remcom_out_buffer, + ks->linux_regs); + /* + * Leave cmd processing on error, detach, + * kill, continue, or single step. + */ + if (error >= 0 || remcom_in_buffer[0] == 'D' || + remcom_in_buffer[0] == 'k') { + error = 0; + goto kgdb_exit; + } + + } + + /* reply to the request */ + put_packet(remcom_out_buffer); + } + +kgdb_exit: + if (ks->pass_exception) + error = 1; + return error; +} + +int gdbstub_state(struct kgdb_state *ks, char *cmd) +{ + int error; + + switch (cmd[0]) { + case 'e': + error = kgdb_arch_handle_exception(ks->ex_vector, + ks->signo, + ks->err_code, + remcom_in_buffer, + remcom_out_buffer, + ks->linux_regs); + return error; + case 's': + case 'c': + strcpy(remcom_in_buffer, cmd); + return 0; + case '?': + gdb_cmd_status(ks); + break; + case '\0': + strcpy(remcom_out_buffer, ""); + break; + } + dbg_io_ops->write_char('+'); + put_packet(remcom_out_buffer); + return 0; +} diff --git a/kernel/debug/kdb/.gitignore b/kernel/debug/kdb/.gitignore new file mode 100644 index 0000000..396d12e --- /dev/null +++ b/kernel/debug/kdb/.gitignore @@ -0,0 +1 @@ +gen-kdb_cmds.c diff --git a/kernel/debug/kdb/Makefile b/kernel/debug/kdb/Makefile new file mode 100644 index 0000000..d4fc58f --- /dev/null +++ b/kernel/debug/kdb/Makefile @@ -0,0 +1,25 @@ +# This file is subject to the terms and conditions of the GNU General Public +# License. See the file "COPYING" in the main directory of this archive +# for more details. +# +# Copyright (c) 1999-2004 Silicon Graphics, Inc. All Rights Reserved. +# Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved. +# + +CCVERSION := $(shell $(CC) -v 2>&1 | sed -ne '$$p') +obj-y := kdb_io.o kdb_main.o kdb_support.o kdb_bt.o gen-kdb_cmds.o kdb_bp.o kdb_debugger.o +obj-$(CONFIG_KDB_KEYBOARD) += kdb_keyboard.o + +clean-files := gen-kdb_cmds.c + +quiet_cmd_gen-kdb = GENKDB $@ + cmd_gen-kdb = $(AWK) 'BEGIN {print "\#include <linux/stddef.h>"; print "\#include <linux/init.h>"} \ + /^\#/{next} \ + /^[ \t]*$$/{next} \ + {gsub(/"/, "\\\"", $$0); \ + print "static __initdata char kdb_cmd" cmds++ "[] = \"" $$0 "\\n\";"} \ + END {print "extern char *kdb_cmds[]; char __initdata *kdb_cmds[] = {"; for (i = 0; i < cmds; ++i) {print " kdb_cmd" i ","}; print(" NULL\n};");}' \ + $(filter-out %/Makefile,$^) > $@# + +$(obj)/gen-kdb_cmds.c: $(src)/kdb_cmds $(src)/Makefile + $(call cmd,gen-kdb) diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c new file mode 100644 index 0000000..75bd9b3 --- /dev/null +++ b/kernel/debug/kdb/kdb_bp.c @@ -0,0 +1,564 @@ +/* + * Kernel Debugger Architecture Independent Breakpoint Handler + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Copyright (c) 1999-2004 Silicon Graphics, Inc. All Rights Reserved. + * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved. + */ + +#include <linux/string.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/kdb.h> +#include <linux/kgdb.h> +#include <linux/smp.h> +#include <linux/sched.h> +#include <linux/interrupt.h> +#include "kdb_private.h" + +/* + * Table of kdb_breakpoints + */ +kdb_bp_t kdb_breakpoints[KDB_MAXBPT]; + +static void kdb_setsinglestep(struct pt_regs *regs) +{ + KDB_STATE_SET(DOING_SS); +} + +static char *kdb_rwtypes[] = { + "Instruction(i)", + "Instruction(Register)", + "Data Write", + "I/O", + "Data Access" +}; + +static char *kdb_bptype(kdb_bp_t *bp) +{ + if (bp->bp_type < 0 || bp->bp_type > 4) + return ""; + + return kdb_rwtypes[bp->bp_type]; +} + +static int kdb_parsebp(int argc, const char **argv, int *nextargp, kdb_bp_t *bp) +{ + int nextarg = *nextargp; + int diag; + + bp->bph_length = 1; + if ((argc + 1) != nextarg) { + if (strnicmp(argv[nextarg], "datar", sizeof("datar")) == 0) + bp->bp_type = BP_ACCESS_WATCHPOINT; + else if (strnicmp(argv[nextarg], "dataw", sizeof("dataw")) == 0) + bp->bp_type = BP_WRITE_WATCHPOINT; + else if (strnicmp(argv[nextarg], "inst", sizeof("inst")) == 0) + bp->bp_type = BP_HARDWARE_BREAKPOINT; + else + return KDB_ARGCOUNT; + + bp->bph_length = 1; + + nextarg++; + + if ((argc + 1) != nextarg) { + unsigned long len; + + diag = kdbgetularg((char *)argv[nextarg], + &len); + if (diag) + return diag; + + + if (len > 8) + return KDB_BADLENGTH; + + bp->bph_length = len; + nextarg++; + } + + if ((argc + 1) != nextarg) + return KDB_ARGCOUNT; + } + + *nextargp = nextarg; + return 0; +} + +static int _kdb_bp_remove(kdb_bp_t *bp) +{ + int ret = 1; + if (!bp->bp_installed) + return ret; + if (!bp->bp_type) + ret = dbg_remove_sw_break(bp->bp_addr); + else + ret = arch_kgdb_ops.remove_hw_breakpoint(bp->bp_addr, + bp->bph_length, + bp->bp_type); + if (ret == 0) + bp->bp_installed = 0; + return ret; +} + +static void kdb_handle_bp(struct pt_regs *regs, kdb_bp_t *bp) +{ + if (KDB_DEBUG(BP)) + kdb_printf("regs->ip = 0x%lx\n", instruction_pointer(regs)); + + /* + * Setup single step + */ + kdb_setsinglestep(regs); + + /* + * Reset delay attribute + */ + bp->bp_delay = 0; + bp->bp_delayed = 1; +} + +static int _kdb_bp_install(struct pt_regs *regs, kdb_bp_t *bp) +{ + int ret; + /* + * Install the breakpoint, if it is not already installed. + */ + + if (KDB_DEBUG(BP)) + kdb_printf("%s: bp_installed %d\n", + __func__, bp->bp_installed); + if (!KDB_STATE(SSBPT)) + bp->bp_delay = 0; + if (bp->bp_installed) + return 1; + if (bp->bp_delay || (bp->bp_delayed && KDB_STATE(DOING_SS))) { + if (KDB_DEBUG(BP)) + kdb_printf("%s: delayed bp\n", __func__); + kdb_handle_bp(regs, bp); + return 0; + } + if (!bp->bp_type) + ret = dbg_set_sw_break(bp->bp_addr); + else + ret = arch_kgdb_ops.set_hw_breakpoint(bp->bp_addr, + bp->bph_length, + bp->bp_type); + if (ret == 0) { + bp->bp_installed = 1; + } else { + kdb_printf("%s: failed to set breakpoint at 0x%lx\n", + __func__, bp->bp_addr); + return 1; + } + return 0; +} + +/* + * kdb_bp_install + * + * Install kdb_breakpoints prior to returning from the + * kernel debugger. This allows the kdb_breakpoints to be set + * upon functions that are used internally by kdb, such as + * printk(). This function is only called once per kdb session. + */ +void kdb_bp_install(struct pt_regs *regs) +{ + int i; + + for (i = 0; i < KDB_MAXBPT; i++) { + kdb_bp_t *bp = &kdb_breakpoints[i]; + + if (KDB_DEBUG(BP)) { + kdb_printf("%s: bp %d bp_enabled %d\n", + __func__, i, bp->bp_enabled); + } + if (bp->bp_enabled) + _kdb_bp_install(regs, bp); + } +} + +/* + * kdb_bp_remove + * + * Remove kdb_breakpoints upon entry to the kernel debugger. + * + * Parameters: + * None. + * Outputs: + * None. + * Returns: + * None. + * Locking: + * None. + * Remarks: + */ +void kdb_bp_remove(void) +{ + int i; + + for (i = KDB_MAXBPT - 1; i >= 0; i--) { + kdb_bp_t *bp = &kdb_breakpoints[i]; + + if (KDB_DEBUG(BP)) { + kdb_printf("%s: bp %d bp_enabled %d\n", + __func__, i, bp->bp_enabled); + } + if (bp->bp_enabled) + _kdb_bp_remove(bp); + } +} + + +/* + * kdb_printbp + * + * Internal function to format and print a breakpoint entry. + * + * Parameters: + * None. + * Outputs: + * None. + * Returns: + * None. + * Locking: + * None. + * Remarks: + */ + +static void kdb_printbp(kdb_bp_t *bp, int i) +{ + kdb_printf("%s ", kdb_bptype(bp)); + kdb_printf("BP #%d at ", i); + kdb_symbol_print(bp->bp_addr, NULL, KDB_SP_DEFAULT); + + if (bp->bp_enabled) + kdb_printf("\n is enabled"); + else + kdb_printf("\n is disabled"); + + kdb_printf("\taddr at %016lx, hardtype=%d installed=%d\n", + bp->bp_addr, bp->bp_type, bp->bp_installed); + + kdb_printf("\n"); +} + +/* + * kdb_bp + * + * Handle the bp commands. + * + * [bp|bph] <addr-expression> [DATAR|DATAW] + * + * Parameters: + * argc Count of arguments in argv + * argv Space delimited command line arguments + * Outputs: + * None. + * Returns: + * Zero for success, a kdb diagnostic if failure. + * Locking: + * None. + * Remarks: + * + * bp Set breakpoint on all cpus. Only use hardware assist if need. + * bph Set breakpoint on all cpus. Force hardware register + */ + +static int kdb_bp(int argc, const char **argv) +{ + int i, bpno; + kdb_bp_t *bp, *bp_check; + int diag; + int free; + char *symname = NULL; + long offset = 0ul; + int nextarg; + kdb_bp_t template = {0}; + + if (argc == 0) { + /* + * Display breakpoint table + */ + for (bpno = 0, bp = kdb_breakpoints; bpno < KDB_MAXBPT; + bpno++, bp++) { + if (bp->bp_free) + continue; + kdb_printbp(bp, bpno); + } + + return 0; + } + + nextarg = 1; + diag = kdbgetaddrarg(argc, argv, &nextarg, &template.bp_addr, + &offset, &symname); + if (diag) + return diag; + if (!template.bp_addr) + return KDB_BADINT; + + /* + * Find an empty bp structure to allocate + */ + free = KDB_MAXBPT; + for (bpno = 0, bp = kdb_breakpoints; bpno < KDB_MAXBPT; bpno++, bp++) { + if (bp->bp_free) + break; + } + + if (bpno == KDB_MAXBPT) + return KDB_TOOMANYBPT; + + if (strcmp(argv[0], "bph") == 0) { + template.bp_type = BP_HARDWARE_BREAKPOINT; + diag = kdb_parsebp(argc, argv, &nextarg, &template); + if (diag) + return diag; + } else { + template.bp_type = BP_BREAKPOINT; + } + + /* + * Check for clashing breakpoints. + * + * Note, in this design we can't have hardware breakpoints + * enabled for both read and write on the same address. + */ + for (i = 0, bp_check = kdb_breakpoints; i < KDB_MAXBPT; + i++, bp_check++) { + if (!bp_check->bp_free && + bp_check->bp_addr == template.bp_addr) { + kdb_printf("You already have a breakpoint at " + kdb_bfd_vma_fmt0 "\n", template.bp_addr); + return KDB_DUPBPT; + } + } + + template.bp_enabled = 1; + + /* + * Actually allocate the breakpoint found earlier + */ + *bp = template; + bp->bp_free = 0; + + kdb_printbp(bp, bpno); + + return 0; +} + +/* + * kdb_bc + * + * Handles the 'bc', 'be', and 'bd' commands + * + * [bd|bc|be] <breakpoint-number> + * [bd|bc|be] * + * + * Parameters: + * argc Count of arguments in argv + * argv Space delimited command line arguments + * Outputs: + * None. + * Returns: + * Zero for success, a kdb diagnostic for failure + * Locking: + * None. + * Remarks: + */ +static int kdb_bc(int argc, const char **argv) +{ + unsigned long addr; + kdb_bp_t *bp = NULL; + int lowbp = KDB_MAXBPT; + int highbp = 0; + int done = 0; + int i; + int diag = 0; + + int cmd; /* KDBCMD_B? */ +#define KDBCMD_BC 0 +#define KDBCMD_BE 1 +#define KDBCMD_BD 2 + + if (strcmp(argv[0], "be") == 0) + cmd = KDBCMD_BE; + else if (strcmp(argv[0], "bd") == 0) + cmd = KDBCMD_BD; + else + cmd = KDBCMD_BC; + + if (argc != 1) + return KDB_ARGCOUNT; + + if (strcmp(argv[1], "*") == 0) { + lowbp = 0; + highbp = KDB_MAXBPT; + } else { + diag = kdbgetularg(argv[1], &addr); + if (diag) + return diag; + + /* + * For addresses less than the maximum breakpoint number, + * assume that the breakpoint number is desired. + */ + if (addr < KDB_MAXBPT) { + bp = &kdb_breakpoints[addr]; + lowbp = highbp = addr; + highbp++; + } else { + for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; + i++, bp++) { + if (bp->bp_addr == addr) { + lowbp = highbp = i; + highbp++; + break; + } + } + } + } + + /* + * Now operate on the set of breakpoints matching the input + * criteria (either '*' for all, or an individual breakpoint). + */ + for (bp = &kdb_breakpoints[lowbp], i = lowbp; + i < highbp; + i++, bp++) { + if (bp->bp_free) + continue; + + done++; + + switch (cmd) { + case KDBCMD_BC: + bp->bp_enabled = 0; + + kdb_printf("Breakpoint %d at " + kdb_bfd_vma_fmt " cleared\n", + i, bp->bp_addr); + + bp->bp_addr = 0; + bp->bp_free = 1; + + break; + case KDBCMD_BE: + bp->bp_enabled = 1; + + kdb_printf("Breakpoint %d at " + kdb_bfd_vma_fmt " enabled", + i, bp->bp_addr); + + kdb_printf("\n"); + break; + case KDBCMD_BD: + if (!bp->bp_enabled) + break; + + bp->bp_enabled = 0; + + kdb_printf("Breakpoint %d at " + kdb_bfd_vma_fmt " disabled\n", + i, bp->bp_addr); + + break; + } + if (bp->bp_delay && (cmd == KDBCMD_BC || cmd == KDBCMD_BD)) { + bp->bp_delay = 0; + KDB_STATE_CLEAR(SSBPT); + } + } + + return (!done) ? KDB_BPTNOTFOUND : 0; +} + +/* + * kdb_ss + * + * Process the 'ss' (Single Step) and 'ssb' (Single Step to Branch) + * commands. + * + * ss + * ssb + * + * Parameters: + * argc Argument count + * argv Argument vector + * Outputs: + * None. + * Returns: + * KDB_CMD_SS[B] for success, a kdb error if failure. + * Locking: + * None. + * Remarks: + * + * Set the arch specific option to trigger a debug trap after the next + * instruction. + * + * For 'ssb', set the trace flag in the debug trap handler + * after printing the current insn and return directly without + * invoking the kdb command processor, until a branch instruction + * is encountered. + */ + +static int kdb_ss(int argc, const char **argv) +{ + int ssb = 0; + + ssb = (strcmp(argv[0], "ssb") == 0); + if (argc != 0) + return KDB_ARGCOUNT; + /* + * Set trace flag and go. + */ + KDB_STATE_SET(DOING_SS); + if (ssb) { + KDB_STATE_SET(DOING_SSB); + return KDB_CMD_SSB; + } + return KDB_CMD_SS; +} + +/* Initialize the breakpoint table and register breakpoint commands. */ + +void __init kdb_initbptab(void) +{ + int i; + kdb_bp_t *bp; + + /* + * First time initialization. + */ + memset(&kdb_breakpoints, '\0', sizeof(kdb_breakpoints)); + + for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++) + bp->bp_free = 1; + + kdb_register_repeat("bp", kdb_bp, "[<vaddr>]", + "Set/Display breakpoints", 0, KDB_REPEAT_NO_ARGS); + kdb_register_repeat("bl", kdb_bp, "[<vaddr>]", + "Display breakpoints", 0, KDB_REPEAT_NO_ARGS); + if (arch_kgdb_ops.flags & KGDB_HW_BREAKPOINT) + kdb_register_repeat("bph", kdb_bp, "[<vaddr>]", + "[datar [length]|dataw [length]] Set hw brk", 0, KDB_REPEAT_NO_ARGS); + kdb_register_repeat("bc", kdb_bc, "<bpnum>", + "Clear Breakpoint", 0, KDB_REPEAT_NONE); + kdb_register_repeat("be", kdb_bc, "<bpnum>", + "Enable Breakpoint", 0, KDB_REPEAT_NONE); + kdb_register_repeat("bd", kdb_bc, "<bpnum>", + "Disable Breakpoint", 0, KDB_REPEAT_NONE); + + kdb_register_repeat("ss", kdb_ss, "", + "Single Step", 1, KDB_REPEAT_NO_ARGS); + kdb_register_repeat("ssb", kdb_ss, "", + "Single step to branch/call", 0, KDB_REPEAT_NO_ARGS); + /* + * Architecture dependent initialization. + */ +} diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c new file mode 100644 index 0000000..2f62fe8 --- /dev/null +++ b/kernel/debug/kdb/kdb_bt.c @@ -0,0 +1,210 @@ +/* + * Kernel Debugger Architecture Independent Stack Traceback + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Copyright (c) 1999-2004 Silicon Graphics, Inc. All Rights Reserved. + * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved. + */ + +#include <linux/ctype.h> +#include <linux/string.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/kdb.h> +#include <linux/nmi.h> +#include <asm/system.h> +#include "kdb_private.h" + + +static void kdb_show_stack(struct task_struct *p, void *addr) +{ + int old_lvl = console_loglevel; + console_loglevel = 15; + kdb_trap_printk++; + kdb_set_current_task(p); + if (addr) { + show_stack((struct task_struct *)p, addr); + } else if (kdb_current_regs) { +#ifdef CONFIG_X86 + show_stack(p, &kdb_current_regs->sp); +#else + show_stack(p, NULL); +#endif + } else { + show_stack(p, NULL); + } + console_loglevel = old_lvl; + kdb_trap_printk--; +} + +/* + * kdb_bt + * + * This function implements the 'bt' command. Print a stack + * traceback. + * + * bt [<address-expression>] (addr-exp is for alternate stacks) + * btp <pid> Kernel stack for <pid> + * btt <address-expression> Kernel stack for task structure at + * <address-expression> + * bta [DRSTCZEUIMA] All useful processes, optionally + * filtered by state + * btc [<cpu>] The current process on one cpu, + * default is all cpus + * + * bt <address-expression> refers to a address on the stack, that location + * is assumed to contain a return address. + * + * btt <address-expression> refers to the address of a struct task. + * + * Inputs: + * argc argument count + * argv argument vector + * Outputs: + * None. + * Returns: + * zero for success, a kdb diagnostic if error + * Locking: + * none. + * Remarks: + * Backtrack works best when the code uses frame pointers. But even + * without frame pointers we should get a reasonable trace. + * + * mds comes in handy when examining the stack to do a manual traceback or + * to get a starting point for bt <address-expression>. + */ + +static int +kdb_bt1(struct task_struct *p, unsigned long mask, + int argcount, int btaprompt) +{ + char buffer[2]; + if (kdb_getarea(buffer[0], (unsigned long)p) || + kdb_getarea(buffer[0], (unsigned long)(p+1)-1)) + return KDB_BADADDR; + if (!kdb_task_state(p, mask)) + return 0; + kdb_printf("Stack traceback for pid %d\n", p->pid); + kdb_ps1(p); + kdb_show_stack(p, NULL); + if (btaprompt) { + kdb_getstr(buffer, sizeof(buffer), + "Enter <q> to end, <cr> to continue:"); + if (buffer[0] == 'q') { + kdb_printf("\n"); + return 1; + } + } + touch_nmi_watchdog(); + return 0; +} + +int +kdb_bt(int argc, const char **argv) +{ + int diag; + int argcount = 5; + int btaprompt = 1; + int nextarg; + unsigned long addr; + long offset; + + kdbgetintenv("BTARGS", &argcount); /* Arguments to print */ + kdbgetintenv("BTAPROMPT", &btaprompt); /* Prompt after each + * proc in bta */ + + if (strcmp(argv[0], "bta") == 0) { + struct task_struct *g, *p; + unsigned long cpu; + unsigned long mask = kdb_task_state_string(argc ? argv[1] : + NULL); + if (argc == 0) + kdb_ps_suppressed(); + /* Run the active tasks first */ + for_each_online_cpu(cpu) { + p = kdb_curr_task(cpu); + if (kdb_bt1(p, mask, argcount, btaprompt)) + return 0; + } + /* Now the inactive tasks */ + kdb_do_each_thread(g, p) { + if (task_curr(p)) + continue; + if (kdb_bt1(p, mask, argcount, btaprompt)) + return 0; + } kdb_while_each_thread(g, p); + } else if (strcmp(argv[0], "btp") == 0) { + struct task_struct *p; + unsigned long pid; + if (argc != 1) + return KDB_ARGCOUNT; + diag = kdbgetularg((char *)argv[1], &pid); + if (diag) + return diag; + p = find_task_by_pid_ns(pid, &init_pid_ns); + if (p) { + kdb_set_current_task(p); + return kdb_bt1(p, ~0UL, argcount, 0); + } + kdb_printf("No process with pid == %ld found\n", pid); + return 0; + } else if (strcmp(argv[0], "btt") == 0) { + if (argc != 1) + return KDB_ARGCOUNT; + diag = kdbgetularg((char *)argv[1], &addr); + if (diag) + return diag; + kdb_set_current_task((struct task_struct *)addr); + return kdb_bt1((struct task_struct *)addr, ~0UL, argcount, 0); + } else if (strcmp(argv[0], "btc") == 0) { + unsigned long cpu = ~0; + struct task_struct *save_current_task = kdb_current_task; + char buf[80]; + if (argc > 1) + return KDB_ARGCOUNT; + if (argc == 1) { + diag = kdbgetularg((char *)argv[1], &cpu); + if (diag) + return diag; + } + /* Recursive use of kdb_parse, do not use argv after + * this point */ + argv = NULL; + if (cpu != ~0) { + if (cpu >= num_possible_cpus() || !cpu_online(cpu)) { + kdb_printf("no process for cpu %ld\n", cpu); + return 0; + } + sprintf(buf, "btt 0x%p\n", KDB_TSK(cpu)); + kdb_parse(buf); + return 0; + } + kdb_printf("btc: cpu status: "); + kdb_parse("cpu\n"); + for_each_online_cpu(cpu) { + sprintf(buf, "btt 0x%p\n", KDB_TSK(cpu)); + kdb_parse(buf); + touch_nmi_watchdog(); + } + kdb_set_current_task(save_current_task); + return 0; + } else { + if (argc) { + nextarg = 1; + diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, + &offset, NULL); + if (diag) + return diag; + kdb_show_stack(kdb_current_task, (void *)addr); + return 0; + } else { + return kdb_bt1(kdb_current_task, ~0UL, argcount, 0); + } + } + + /* NOTREACHED */ + return 0; +} diff --git a/kernel/debug/kdb/kdb_cmds b/kernel/debug/kdb/kdb_cmds new file mode 100644 index 0000000..56c88e4 --- /dev/null +++ b/kernel/debug/kdb/kdb_cmds @@ -0,0 +1,35 @@ +# Initial commands for kdb, alter to suit your needs. +# These commands are executed in kdb_init() context, no SMP, no +# processes. Commands that require process data (including stack or +# registers) are not reliable this early. set and bp commands should +# be safe. Global breakpoint commands affect each cpu as it is booted. + +# Standard debugging information for first level support, just type archkdb +# or archkdbcpu or archkdbshort at the kdb prompt. + +defcmd dumpcommon "" "Common kdb debugging" + set BTAPROMPT 0 + set LINES 10000 + -summary + -cpu + -ps + -dmesg 600 + -bt +endefcmd + +defcmd dumpall "" "First line debugging" + set BTSYMARG 1 + set BTARGS 9 + pid R + -dumpcommon + -bta +endefcmd + +defcmd dumpcpu "" "Same as dumpall but only tasks on cpus" + set BTSYMARG 1 + set BTARGS 9 + pid R + -dumpcommon + -btc +endefcmd + diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c new file mode 100644 index 0000000..bf6e827 --- /dev/null +++ b/kernel/debug/kdb/kdb_debugger.c @@ -0,0 +1,169 @@ +/* + * Created by: Jason Wessel <jason.wessel@windriver.com> + * + * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved. + * + * This file is licensed under the terms of the GNU General Public + * License version 2. This program is licensed "as is" without any + * warranty of any kind, whether express or implied. + */ + +#include <linux/kgdb.h> +#include <linux/kdb.h> +#include <linux/kdebug.h> +#include "kdb_private.h" +#include "../debug_core.h" + +/* + * KDB interface to KGDB internals + */ +get_char_func kdb_poll_funcs[] = { + dbg_io_get_char, + NULL, + NULL, + NULL, + NULL, + NULL, +}; +EXPORT_SYMBOL_GPL(kdb_poll_funcs); + +int kdb_poll_idx = 1; +EXPORT_SYMBOL_GPL(kdb_poll_idx); + +int kdb_stub(struct kgdb_state *ks) +{ + int error = 0; + kdb_bp_t *bp; + unsigned long addr = kgdb_arch_pc(ks->ex_vector, ks->linux_regs); + kdb_reason_t reason = KDB_REASON_OOPS; + kdb_dbtrap_t db_result = KDB_DB_NOBPT; + int i; + + if (KDB_STATE(REENTRY)) { + reason = KDB_REASON_SWITCH; + KDB_STATE_CLEAR(REENTRY); + addr = instruction_pointer(ks->linux_regs); + } + ks->pass_exception = 0; + if (atomic_read(&kgdb_setting_breakpoint)) + reason = KDB_REASON_KEYBOARD; + + for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++) { + if ((bp->bp_enabled) && (bp->bp_addr == addr)) { + reason = KDB_REASON_BREAK; + db_result = KDB_DB_BPT; + if (addr != instruction_pointer(ks->linux_regs)) + kgdb_arch_set_pc(ks->linux_regs, addr); + break; + } + } + if (reason == KDB_REASON_BREAK || reason == KDB_REASON_SWITCH) { + for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++) { + if (bp->bp_free) + continue; + if (bp->bp_addr == addr) { + bp->bp_delay = 1; + bp->bp_delayed = 1; + /* + * SSBPT is set when the kernel debugger must single step a + * task in order to re-establish an instruction breakpoint + * which uses the instruction replacement mechanism. It is + * cleared by any action that removes the need to single-step + * the breakpoint. + */ + reason = KDB_REASON_BREAK; + db_result = KDB_DB_BPT; + KDB_STATE_SET(SSBPT); + break; + } + } + } + + if (reason != KDB_REASON_BREAK && ks->ex_vector == 0 && + ks->signo == SIGTRAP) { + reason = KDB_REASON_SSTEP; + db_result = KDB_DB_BPT; + } + /* Set initial kdb state variables */ + KDB_STATE_CLEAR(KGDB_TRANS); + kdb_initial_cpu = ks->cpu; + kdb_current_task = kgdb_info[ks->cpu].task; + kdb_current_regs = kgdb_info[ks->cpu].debuggerinfo; + /* Remove any breakpoints as needed by kdb and clear single step */ + kdb_bp_remove(); + KDB_STATE_CLEAR(DOING_SS); + KDB_STATE_CLEAR(DOING_SSB); + KDB_STATE_SET(PAGER); + /* zero out any offline cpu data */ + for_each_present_cpu(i) { + if (!cpu_online(i)) { + kgdb_info[i].debuggerinfo = NULL; + kgdb_info[i].task = NULL; + } + } + if (ks->err_code == DIE_OOPS || reason == KDB_REASON_OOPS) { + ks->pass_exception = 1; + KDB_FLAG_SET(CATASTROPHIC); + } + kdb_initial_cpu = ks->cpu; + if (KDB_STATE(SSBPT) && reason == KDB_REASON_SSTEP) { + KDB_STATE_CLEAR(SSBPT); + KDB_STATE_CLEAR(DOING_SS); + } else { + /* Start kdb main loop */ + error = kdb_main_loop(KDB_REASON_ENTER, reason, + ks->err_code, db_result, ks->linux_regs); + } + /* + * Upon exit from the kdb main loop setup break points and restart + * the system based on the requested continue state + */ + kdb_initial_cpu = -1; + kdb_current_task = NULL; + kdb_current_regs = NULL; + KDB_STATE_CLEAR(PAGER); + kdbnearsym_cleanup(); + if (error == KDB_CMD_KGDB) { + if (KDB_STATE(DOING_KGDB) || KDB_STATE(DOING_KGDB2)) { + /* + * This inteface glue which allows kdb to transition in into + * the gdb stub. In order to do this the '?' or '' gdb serial + * packet response is processed here. And then control is + * passed to the gdbstub. + */ + if (KDB_STATE(DOING_KGDB)) + gdbstub_state(ks, "?"); + else + gdbstub_state(ks, ""); + KDB_STATE_CLEAR(DOING_KGDB); + KDB_STATE_CLEAR(DOING_KGDB2); + } + return DBG_PASS_EVENT; + } + kdb_bp_install(ks->linux_regs); + dbg_activate_sw_breakpoints(); + /* Set the exit state to a single step or a continue */ + if (KDB_STATE(DOING_SS)) + gdbstub_state(ks, "s"); + else + gdbstub_state(ks, "c"); + + KDB_FLAG_CLEAR(CATASTROPHIC); + + /* Invoke arch specific exception handling prior to system resume */ + kgdb_info[ks->cpu].ret_state = gdbstub_state(ks, "e"); + if (ks->pass_exception) + kgdb_info[ks->cpu].ret_state = 1; + if (error == KDB_CMD_CPU) { + KDB_STATE_SET(REENTRY); + /* + * Force clear the single step bit because kdb emulates this + * differently vs the gdbstub + */ + kgdb_single_step = 0; + dbg_deactivate_sw_breakpoints(); + return DBG_SWITCH_CPU_EVENT; + } + return kgdb_info[ks->cpu].ret_state; +} + diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c new file mode 100644 index 0000000..c9b7f4f9 --- /dev/null +++ b/kernel/debug/kdb/kdb_io.c @@ -0,0 +1,826 @@ +/* + * Kernel Debugger Architecture Independent Console I/O handler + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Copyright (c) 1999-2006 Silicon Graphics, Inc. All Rights Reserved. + * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved. + */ + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/ctype.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/kdev_t.h> +#include <linux/console.h> +#include <linux/string.h> +#include <linux/sched.h> +#include <linux/smp.h> +#include <linux/nmi.h> +#include <linux/delay.h> +#include <linux/kgdb.h> +#include <linux/kdb.h> +#include <linux/kallsyms.h> +#include "kdb_private.h" + +#define CMD_BUFLEN 256 +char kdb_prompt_str[CMD_BUFLEN]; + +int kdb_trap_printk; + +static void kgdb_transition_check(char *buffer) +{ + int slen = strlen(buffer); + if (strncmp(buffer, "$?#3f", slen) != 0 && + strncmp(buffer, "$qSupported#37", slen) != 0 && + strncmp(buffer, "+$qSupported#37", slen) != 0) { + KDB_STATE_SET(KGDB_TRANS); + kdb_printf("%s", buffer); + } +} + +static int kdb_read_get_key(char *buffer, size_t bufsize) +{ +#define ESCAPE_UDELAY 1000 +#define ESCAPE_DELAY (2*1000000/ESCAPE_UDELAY) /* 2 seconds worth of udelays */ + char escape_data[5]; /* longest vt100 escape sequence is 4 bytes */ + char *ped = escape_data; + int escape_delay = 0; + get_char_func *f, *f_escape = NULL; + int key; + + for (f = &kdb_poll_funcs[0]; ; ++f) { + if (*f == NULL) { + /* Reset NMI watchdog once per poll loop */ + touch_nmi_watchdog(); + f = &kdb_poll_funcs[0]; + } + if (escape_delay == 2) { + *ped = '\0'; + ped = escape_data; + --escape_delay; + } + if (escape_delay == 1) { + key = *ped++; + if (!*ped) + --escape_delay; + break; + } + key = (*f)(); + if (key == -1) { + if (escape_delay) { + udelay(ESCAPE_UDELAY); + --escape_delay; + } + continue; + } + if (bufsize <= 2) { + if (key == '\r') + key = '\n'; + *buffer++ = key; + *buffer = '\0'; + return -1; + } + if (escape_delay == 0 && key == '\e') { + escape_delay = ESCAPE_DELAY; + ped = escape_data; + f_escape = f; + } + if (escape_delay) { + *ped++ = key; + if (f_escape != f) { + escape_delay = 2; + continue; + } + if (ped - escape_data == 1) { + /* \e */ + continue; + } else if (ped - escape_data == 2) { + /* \e<something> */ + if (key != '[') + escape_delay = 2; + continue; + } else if (ped - escape_data == 3) { + /* \e[<something> */ + int mapkey = 0; + switch (key) { + case 'A': /* \e[A, up arrow */ + mapkey = 16; + break; + case 'B': /* \e[B, down arrow */ + mapkey = 14; + break; + case 'C': /* \e[C, right arrow */ + mapkey = 6; + break; + case 'D': /* \e[D, left arrow */ + mapkey = 2; + break; + case '1': /* dropthrough */ + case '3': /* dropthrough */ + /* \e[<1,3,4>], may be home, del, end */ + case '4': + mapkey = -1; + break; + } + if (mapkey != -1) { + if (mapkey > 0) { + escape_data[0] = mapkey; + escape_data[1] = '\0'; + } + escape_delay = 2; + } + continue; + } else if (ped - escape_data == 4) { + /* \e[<1,3,4><something> */ + int mapkey = 0; + if (key == '~') { + switch (escape_data[2]) { + case '1': /* \e[1~, home */ + mapkey = 1; + break; + case '3': /* \e[3~, del */ + mapkey = 4; + break; + case '4': /* \e[4~, end */ + mapkey = 5; + break; + } + } + if (mapkey > 0) { + escape_data[0] = mapkey; + escape_data[1] = '\0'; + } + escape_delay = 2; + continue; + } + } + break; /* A key to process */ + } + return key; +} + +/* + * kdb_read + * + * This function reads a string of characters, terminated by + * a newline, or by reaching the end of the supplied buffer, + * from the current kernel debugger console device. + * Parameters: + * buffer - Address of character buffer to receive input characters. + * bufsize - size, in bytes, of the character buffer + * Returns: + * Returns a pointer to the buffer containing the received + * character string. This string will be terminated by a + * newline character. + * Locking: + * No locks are required to be held upon entry to this + * function. It is not reentrant - it relies on the fact + * that while kdb is running on only one "master debug" cpu. + * Remarks: + * + * The buffer size must be >= 2. A buffer size of 2 means that the caller only + * wants a single key. + * + * An escape key could be the start of a vt100 control sequence such as \e[D + * (left arrow) or it could be a character in its own right. The standard + * method for detecting the difference is to wait for 2 seconds to see if there + * are any other characters. kdb is complicated by the lack of a timer service + * (interrupts are off), by multiple input sources and by the need to sometimes + * return after just one key. Escape sequence processing has to be done as + * states in the polling loop. + */ + +static char *kdb_read(char *buffer, size_t bufsize) +{ + char *cp = buffer; + char *bufend = buffer+bufsize-2; /* Reserve space for newline + * and null byte */ + char *lastchar; + char *p_tmp; + char tmp; + static char tmpbuffer[CMD_BUFLEN]; + int len = strlen(buffer); + int len_tmp; + int tab = 0; + int count; + int i; + int diag, dtab_count; + int key; + + + diag = kdbgetintenv("DTABCOUNT", &dtab_count); + if (diag) + dtab_count = 30; + + if (len > 0) { + cp += len; + if (*(buffer+len-1) == '\n') + cp--; + } + + lastchar = cp; + *cp = '\0'; + kdb_printf("%s", buffer); +poll_again: + key = kdb_read_get_key(buffer, bufsize); + if (key == -1) + return buffer; + if (key != 9) + tab = 0; + switch (key) { + case 8: /* backspace */ + if (cp > buffer) { + if (cp < lastchar) { + memcpy(tmpbuffer, cp, lastchar - cp); + memcpy(cp-1, tmpbuffer, lastchar - cp); + } + *(--lastchar) = '\0'; + --cp; + kdb_printf("\b%s \r", cp); + tmp = *cp; + *cp = '\0'; + kdb_printf(kdb_prompt_str); + kdb_printf("%s", buffer); + *cp = tmp; + } + break; + case 13: /* enter */ + *lastchar++ = '\n'; + *lastchar++ = '\0'; + kdb_printf("\n"); + return buffer; + case 4: /* Del */ + if (cp < lastchar) { + memcpy(tmpbuffer, cp+1, lastchar - cp - 1); + memcpy(cp, tmpbuffer, lastchar - cp - 1); + *(--lastchar) = '\0'; + kdb_printf("%s \r", cp); + tmp = *cp; + *cp = '\0'; + kdb_printf(kdb_prompt_str); + kdb_printf("%s", buffer); + *cp = tmp; + } + break; + case 1: /* Home */ + if (cp > buffer) { + kdb_printf("\r"); + kdb_printf(kdb_prompt_str); + cp = buffer; + } + break; + case 5: /* End */ + if (cp < lastchar) { + kdb_printf("%s", cp); + cp = lastchar; + } + break; + case 2: /* Left */ + if (cp > buffer) { + kdb_printf("\b"); + --cp; + } + break; + case 14: /* Down */ + memset(tmpbuffer, ' ', + strlen(kdb_prompt_str) + (lastchar-buffer)); + *(tmpbuffer+strlen(kdb_prompt_str) + + (lastchar-buffer)) = '\0'; + kdb_printf("\r%s\r", tmpbuffer); + *lastchar = (char)key; + *(lastchar+1) = '\0'; + return lastchar; + case 6: /* Right */ + if (cp < lastchar) { + kdb_printf("%c", *cp); + ++cp; + } + break; + case 16: /* Up */ + memset(tmpbuffer, ' ', + strlen(kdb_prompt_str) + (lastchar-buffer)); + *(tmpbuffer+strlen(kdb_prompt_str) + + (lastchar-buffer)) = '\0'; + kdb_printf("\r%s\r", tmpbuffer); + *lastchar = (char)key; + *(lastchar+1) = '\0'; + return lastchar; + case 9: /* Tab */ + if (tab < 2) + ++tab; + p_tmp = buffer; + while (*p_tmp == ' ') + p_tmp++; + if (p_tmp > cp) + break; + memcpy(tmpbuffer, p_tmp, cp-p_tmp); + *(tmpbuffer + (cp-p_tmp)) = '\0'; + p_tmp = strrchr(tmpbuffer, ' '); + if (p_tmp) + ++p_tmp; + else + p_tmp = tmpbuffer; + len = strlen(p_tmp); + count = kallsyms_symbol_complete(p_tmp, + sizeof(tmpbuffer) - + (p_tmp - tmpbuffer)); + if (tab == 2 && count > 0) { + kdb_printf("\n%d symbols are found.", count); + if (count > dtab_count) { + count = dtab_count; + kdb_printf(" But only first %d symbols will" + " be printed.\nYou can change the" + " environment variable DTABCOUNT.", + count); + } + kdb_printf("\n"); + for (i = 0; i < count; i++) { + if (kallsyms_symbol_next(p_tmp, i) < 0) + break; + kdb_printf("%s ", p_tmp); + *(p_tmp + len) = '\0'; + } + if (i >= dtab_count) + kdb_printf("..."); + kdb_printf("\n"); + kdb_printf(kdb_prompt_str); + kdb_printf("%s", buffer); + } else if (tab != 2 && count > 0) { + len_tmp = strlen(p_tmp); + strncpy(p_tmp+len_tmp, cp, lastchar-cp+1); + len_tmp = strlen(p_tmp); + strncpy(cp, p_tmp+len, len_tmp-len + 1); + len = len_tmp - len; + kdb_printf("%s", cp); + cp += len; + lastchar += len; + } + kdb_nextline = 1; /* reset output line number */ + break; + default: + if (key >= 32 && lastchar < bufend) { + if (cp < lastchar) { + memcpy(tmpbuffer, cp, lastchar - cp); + memcpy(cp+1, tmpbuffer, lastchar - cp); + *++lastchar = '\0'; + *cp = key; + kdb_printf("%s\r", cp); + ++cp; + tmp = *cp; + *cp = '\0'; + kdb_printf(kdb_prompt_str); + kdb_printf("%s", buffer); + *cp = tmp; + } else { + *++lastchar = '\0'; + *cp++ = key; + /* The kgdb transition check will hide + * printed characters if we think that + * kgdb is connecting, until the check + * fails */ + if (!KDB_STATE(KGDB_TRANS)) + kgdb_transition_check(buffer); + else + kdb_printf("%c", key); + } + /* Special escape to kgdb */ + if (lastchar - buffer >= 5 && + strcmp(lastchar - 5, "$?#3f") == 0) { + strcpy(buffer, "kgdb"); + KDB_STATE_SET(DOING_KGDB); + return buffer; + } + if (lastchar - buffer >= 14 && + strcmp(lastchar - 14, "$qSupported#37") == 0) { + strcpy(buffer, "kgdb"); + KDB_STATE_SET(DOING_KGDB2); + return buffer; + } + } + break; + } + goto poll_again; +} + +/* + * kdb_getstr + * + * Print the prompt string and read a command from the + * input device. + * + * Parameters: + * buffer Address of buffer to receive command + * bufsize Size of buffer in bytes + * prompt Pointer to string to use as prompt string + * Returns: + * Pointer to command buffer. + * Locking: + * None. + * Remarks: + * For SMP kernels, the processor number will be + * substituted for %d, %x or %o in the prompt. + */ + +char *kdb_getstr(char *buffer, size_t bufsize, char *prompt) +{ + if (prompt && kdb_prompt_str != prompt) + strncpy(kdb_prompt_str, prompt, CMD_BUFLEN); + kdb_printf(kdb_prompt_str); + kdb_nextline = 1; /* Prompt and input resets line number */ + return kdb_read(buffer, bufsize); +} + +/* + * kdb_input_flush + * + * Get rid of any buffered console input. + * + * Parameters: + * none + * Returns: + * nothing + * Locking: + * none + * Remarks: + * Call this function whenever you want to flush input. If there is any + * outstanding input, it ignores all characters until there has been no + * data for approximately 1ms. + */ + +static void kdb_input_flush(void) +{ + get_char_func *f; + int res; + int flush_delay = 1; + while (flush_delay) { + flush_delay--; +empty: + touch_nmi_watchdog(); + for (f = &kdb_poll_funcs[0]; *f; ++f) { + res = (*f)(); + if (res != -1) { + flush_delay = 1; + goto empty; + } + } + if (flush_delay) + mdelay(1); + } +} + +/* + * kdb_printf + * + * Print a string to the output device(s). + * + * Parameters: + * printf-like format and optional args. + * Returns: + * 0 + * Locking: + * None. + * Remarks: + * use 'kdbcons->write()' to avoid polluting 'log_buf' with + * kdb output. + * + * If the user is doing a cmd args | grep srch + * then kdb_grepping_flag is set. + * In that case we need to accumulate full lines (ending in \n) before + * searching for the pattern. + */ + +static char kdb_buffer[256]; /* A bit too big to go on stack */ +static char *next_avail = kdb_buffer; +static int size_avail; +static int suspend_grep; + +/* + * search arg1 to see if it contains arg2 + * (kdmain.c provides flags for ^pat and pat$) + * + * return 1 for found, 0 for not found + */ +static int kdb_search_string(char *searched, char *searchfor) +{ + char firstchar, *cp; + int len1, len2; + + /* not counting the newline at the end of "searched" */ + len1 = strlen(searched)-1; + len2 = strlen(searchfor); + if (len1 < len2) + return 0; + if (kdb_grep_leading && kdb_grep_trailing && len1 != len2) + return 0; + if (kdb_grep_leading) { + if (!strncmp(searched, searchfor, len2)) + return 1; + } else if (kdb_grep_trailing) { + if (!strncmp(searched+len1-len2, searchfor, len2)) + return 1; + } else { + firstchar = *searchfor; + cp = searched; + while ((cp = strchr(cp, firstchar))) { + if (!strncmp(cp, searchfor, len2)) + return 1; + cp++; + } + } + return 0; +} + +int vkdb_printf(const char *fmt, va_list ap) +{ + int diag; + int linecount; + int logging, saved_loglevel = 0; + int saved_trap_printk; + int got_printf_lock = 0; + int retlen = 0; + int fnd, len; + char *cp, *cp2, *cphold = NULL, replaced_byte = ' '; + char *moreprompt = "more> "; + struct console *c = console_drivers; + static DEFINE_SPINLOCK(kdb_printf_lock); + unsigned long uninitialized_var(flags); + + preempt_disable(); + saved_trap_printk = kdb_trap_printk; + kdb_trap_printk = 0; + + /* Serialize kdb_printf if multiple cpus try to write at once. + * But if any cpu goes recursive in kdb, just print the output, + * even if it is interleaved with any other text. + */ + if (!KDB_STATE(PRINTF_LOCK)) { + KDB_STATE_SET(PRINTF_LOCK); + spin_lock_irqsave(&kdb_printf_lock, flags); + got_printf_lock = 1; + atomic_inc(&kdb_event); + } else { + __acquire(kdb_printf_lock); + } + + diag = kdbgetintenv("LINES", &linecount); + if (diag || linecount <= 1) + linecount = 24; + + diag = kdbgetintenv("LOGGING", &logging); + if (diag) + logging = 0; + + if (!kdb_grepping_flag || suspend_grep) { + /* normally, every vsnprintf starts a new buffer */ + next_avail = kdb_buffer; + size_avail = sizeof(kdb_buffer); + } + vsnprintf(next_avail, size_avail, fmt, ap); + + /* + * If kdb_parse() found that the command was cmd xxx | grep yyy + * then kdb_grepping_flag is set, and kdb_grep_string contains yyy + * + * Accumulate the print data up to a newline before searching it. + * (vsnprintf does null-terminate the string that it generates) + */ + + /* skip the search if prints are temporarily unconditional */ + if (!suspend_grep && kdb_grepping_flag) { + cp = strchr(kdb_buffer, '\n'); + if (!cp) { + /* + * Special cases that don't end with newlines + * but should be written without one: + * The "[nn]kdb> " prompt should + * appear at the front of the buffer. + * + * The "[nn]more " prompt should also be + * (MOREPROMPT -> moreprompt) + * written * but we print that ourselves, + * we set the suspend_grep flag to make + * it unconditional. + * + */ + if (next_avail == kdb_buffer) { + /* + * these should occur after a newline, + * so they will be at the front of the + * buffer + */ + cp2 = kdb_buffer; + len = strlen(kdb_prompt_str); + if (!strncmp(cp2, kdb_prompt_str, len)) { + /* + * We're about to start a new + * command, so we can go back + * to normal mode. + */ + kdb_grepping_flag = 0; + goto kdb_printit; + } + } + /* no newline; don't search/write the buffer + until one is there */ + len = strlen(kdb_buffer); + next_avail = kdb_buffer + len; + size_avail = sizeof(kdb_buffer) - len; + goto kdb_print_out; + } + + /* + * The newline is present; print through it or discard + * it, depending on the results of the search. + */ + cp++; /* to byte after the newline */ + replaced_byte = *cp; /* remember what/where it was */ + cphold = cp; + *cp = '\0'; /* end the string for our search */ + + /* + * We now have a newline at the end of the string + * Only continue with this output if it contains the + * search string. + */ + fnd = kdb_search_string(kdb_buffer, kdb_grep_string); + if (!fnd) { + /* + * At this point the complete line at the start + * of kdb_buffer can be discarded, as it does + * not contain what the user is looking for. + * Shift the buffer left. + */ + *cphold = replaced_byte; + strcpy(kdb_buffer, cphold); + len = strlen(kdb_buffer); + next_avail = kdb_buffer + len; + size_avail = sizeof(kdb_buffer) - len; + goto kdb_print_out; + } + /* + * at this point the string is a full line and + * should be printed, up to the null. + */ + } +kdb_printit: + + /* + * Write to all consoles. + */ + retlen = strlen(kdb_buffer); + if (!dbg_kdb_mode && kgdb_connected) { + gdbstub_msg_write(kdb_buffer, retlen); + } else { + if (!dbg_io_ops->is_console) { + len = strlen(kdb_buffer); + cp = kdb_buffer; + while (len--) { + dbg_io_ops->write_char(*cp); + cp++; + } + } + while (c) { + c->write(c, kdb_buffer, retlen); + touch_nmi_watchdog(); + c = c->next; + } + } + if (logging) { + saved_loglevel = console_loglevel; + console_loglevel = 0; + printk(KERN_INFO "%s", kdb_buffer); + } + + if (KDB_STATE(PAGER) && strchr(kdb_buffer, '\n')) + kdb_nextline++; + + /* check for having reached the LINES number of printed lines */ + if (kdb_nextline == linecount) { + char buf1[16] = ""; +#if defined(CONFIG_SMP) + char buf2[32]; +#endif + + /* Watch out for recursion here. Any routine that calls + * kdb_printf will come back through here. And kdb_read + * uses kdb_printf to echo on serial consoles ... + */ + kdb_nextline = 1; /* In case of recursion */ + + /* + * Pause until cr. + */ + moreprompt = kdbgetenv("MOREPROMPT"); + if (moreprompt == NULL) + moreprompt = "more> "; + +#if defined(CONFIG_SMP) + if (strchr(moreprompt, '%')) { + sprintf(buf2, moreprompt, get_cpu()); + put_cpu(); + moreprompt = buf2; + } +#endif + + kdb_input_flush(); + c = console_drivers; + + if (!dbg_io_ops->is_console) { + len = strlen(moreprompt); + cp = moreprompt; + while (len--) { + dbg_io_ops->write_char(*cp); + cp++; + } + } + while (c) { + c->write(c, moreprompt, strlen(moreprompt)); + touch_nmi_watchdog(); + c = c->next; + } + + if (logging) + printk("%s", moreprompt); + + kdb_read(buf1, 2); /* '2' indicates to return + * immediately after getting one key. */ + kdb_nextline = 1; /* Really set output line 1 */ + + /* empty and reset the buffer: */ + kdb_buffer[0] = '\0'; + next_avail = kdb_buffer; + size_avail = sizeof(kdb_buffer); + if ((buf1[0] == 'q') || (buf1[0] == 'Q')) { + /* user hit q or Q */ + KDB_FLAG_SET(CMD_INTERRUPT); /* command interrupted */ + KDB_STATE_CLEAR(PAGER); + /* end of command output; back to normal mode */ + kdb_grepping_flag = 0; + kdb_printf("\n"); + } else if (buf1[0] == ' ') { + kdb_printf("\n"); + suspend_grep = 1; /* for this recursion */ + } else if (buf1[0] == '\n') { + kdb_nextline = linecount - 1; + kdb_printf("\r"); + suspend_grep = 1; /* for this recursion */ + } else if (buf1[0] && buf1[0] != '\n') { + /* user hit something other than enter */ + suspend_grep = 1; /* for this recursion */ + kdb_printf("\nOnly 'q' or 'Q' are processed at more " + "prompt, input ignored\n"); + } else if (kdb_grepping_flag) { + /* user hit enter */ + suspend_grep = 1; /* for this recursion */ + kdb_printf("\n"); + } + kdb_input_flush(); + } + + /* + * For grep searches, shift the printed string left. + * replaced_byte contains the character that was overwritten with + * the terminating null, and cphold points to the null. + * Then adjust the notion of available space in the buffer. + */ + if (kdb_grepping_flag && !suspend_grep) { + *cphold = replaced_byte; + strcpy(kdb_buffer, cphold); + len = strlen(kdb_buffer); + next_avail = kdb_buffer + len; + size_avail = sizeof(kdb_buffer) - len; + } + +kdb_print_out: + suspend_grep = 0; /* end of what may have been a recursive call */ + if (logging) + console_loglevel = saved_loglevel; + if (KDB_STATE(PRINTF_LOCK) && got_printf_lock) { + got_printf_lock = 0; + spin_unlock_irqrestore(&kdb_printf_lock, flags); + KDB_STATE_CLEAR(PRINTF_LOCK); + atomic_dec(&kdb_event); + } else { + __release(kdb_printf_lock); + } + kdb_trap_printk = saved_trap_printk; + preempt_enable(); + return retlen; +} + +int kdb_printf(const char *fmt, ...) +{ + va_list ap; + int r; + + va_start(ap, fmt); + r = vkdb_printf(fmt, ap); + va_end(ap); + + return r; +} + diff --git a/kernel/debug/kdb/kdb_keyboard.c b/kernel/debug/kdb/kdb_keyboard.c new file mode 100644 index 0000000..4bca634 --- /dev/null +++ b/kernel/debug/kdb/kdb_keyboard.c @@ -0,0 +1,212 @@ +/* + * Kernel Debugger Architecture Dependent Console I/O handler + * + * This file is subject to the terms and conditions of the GNU General Public + * License. + * + * Copyright (c) 1999-2006 Silicon Graphics, Inc. All Rights Reserved. + * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved. + */ + +#include <linux/kdb.h> +#include <linux/keyboard.h> +#include <linux/ctype.h> +#include <linux/module.h> +#include <linux/io.h> + +/* Keyboard Controller Registers on normal PCs. */ + +#define KBD_STATUS_REG 0x64 /* Status register (R) */ +#define KBD_DATA_REG 0x60 /* Keyboard data register (R/W) */ + +/* Status Register Bits */ + +#define KBD_STAT_OBF 0x01 /* Keyboard output buffer full */ +#define KBD_STAT_MOUSE_OBF 0x20 /* Mouse output buffer full */ + +static int kbd_exists; + +/* + * Check if the keyboard controller has a keypress for us. + * Some parts (Enter Release, LED change) are still blocking polled here, + * but hopefully they are all short. + */ +int kdb_get_kbd_char(void) +{ + int scancode, scanstatus; + static int shift_lock; /* CAPS LOCK state (0-off, 1-on) */ + static int shift_key; /* Shift next keypress */ + static int ctrl_key; + u_short keychar; + + if (KDB_FLAG(NO_I8042) || KDB_FLAG(NO_VT_CONSOLE) || + (inb(KBD_STATUS_REG) == 0xff && inb(KBD_DATA_REG) == 0xff)) { + kbd_exists = 0; + return -1; + } + kbd_exists = 1; + + if ((inb(KBD_STATUS_REG) & KBD_STAT_OBF) == 0) + return -1; + + /* + * Fetch the scancode + */ + scancode = inb(KBD_DATA_REG); + scanstatus = inb(KBD_STATUS_REG); + + /* + * Ignore mouse events. + */ + if (scanstatus & KBD_STAT_MOUSE_OBF) + return -1; + + /* + * Ignore release, trigger on make + * (except for shift keys, where we want to + * keep the shift state so long as the key is + * held down). + */ + + if (((scancode&0x7f) == 0x2a) || ((scancode&0x7f) == 0x36)) { + /* + * Next key may use shift table + */ + if ((scancode & 0x80) == 0) + shift_key = 1; + else + shift_key = 0; + return -1; + } + + if ((scancode&0x7f) == 0x1d) { + /* + * Left ctrl key + */ + if ((scancode & 0x80) == 0) + ctrl_key = 1; + else + ctrl_key = 0; + return -1; + } + + if ((scancode & 0x80) != 0) + return -1; + + scancode &= 0x7f; + + /* + * Translate scancode + */ + + if (scancode == 0x3a) { + /* + * Toggle caps lock + */ + shift_lock ^= 1; + +#ifdef KDB_BLINK_LED + kdb_toggleled(0x4); +#endif + return -1; + } + + if (scancode == 0x0e) { + /* + * Backspace + */ + return 8; + } + + /* Special Key */ + switch (scancode) { + case 0xF: /* Tab */ + return 9; + case 0x53: /* Del */ + return 4; + case 0x47: /* Home */ + return 1; + case 0x4F: /* End */ + return 5; + case 0x4B: /* Left */ + return 2; + case 0x48: /* Up */ + return 16; + case 0x50: /* Down */ + return 14; + case 0x4D: /* Right */ + return 6; + } + + if (scancode == 0xe0) + return -1; + + /* + * For Japanese 86/106 keyboards + * See comment in drivers/char/pc_keyb.c. + * - Masahiro Adegawa + */ + if (scancode == 0x73) + scancode = 0x59; + else if (scancode == 0x7d) + scancode = 0x7c; + + if (!shift_lock && !shift_key && !ctrl_key) { + keychar = plain_map[scancode]; + } else if ((shift_lock || shift_key) && key_maps[1]) { + keychar = key_maps[1][scancode]; + } else if (ctrl_key && key_maps[4]) { + keychar = key_maps[4][scancode]; + } else { + keychar = 0x0020; + kdb_printf("Unknown state/scancode (%d)\n", scancode); + } + keychar &= 0x0fff; + if (keychar == '\t') + keychar = ' '; + switch (KTYP(keychar)) { + case KT_LETTER: + case KT_LATIN: + if (isprint(keychar)) + break; /* printable characters */ + /* drop through */ + case KT_SPEC: + if (keychar == K_ENTER) + break; + /* drop through */ + default: + return -1; /* ignore unprintables */ + } + + if ((scancode & 0x7f) == 0x1c) { + /* + * enter key. All done. Absorb the release scancode. + */ + while ((inb(KBD_STATUS_REG) & KBD_STAT_OBF) == 0) + ; + + /* + * Fetch the scancode + */ + scancode = inb(KBD_DATA_REG); + scanstatus = inb(KBD_STATUS_REG); + + while (scanstatus & KBD_STAT_MOUSE_OBF) { + scancode = inb(KBD_DATA_REG); + scanstatus = inb(KBD_STATUS_REG); + } + + if (scancode != 0x9c) { + /* + * Wasn't an enter-release, why not? + */ + kdb_printf("kdb: expected enter got 0x%x status 0x%x\n", + scancode, scanstatus); + } + + return 13; + } + + return keychar & 0xff; +} +EXPORT_SYMBOL_GPL(kdb_get_kbd_char); diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c new file mode 100644 index 0000000..ebe4a28 --- /dev/null +++ b/kernel/debug/kdb/kdb_main.c @@ -0,0 +1,2846 @@ +/* + * Kernel Debugger Architecture Independent Main Code + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Copyright (C) 1999-2004 Silicon Graphics, Inc. All Rights Reserved. + * Copyright (C) 2000 Stephane Eranian <eranian@hpl.hp.com> + * Xscale (R) modifications copyright (C) 2003 Intel Corporation. + * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved. + */ + +#include <linux/ctype.h> +#include <linux/string.h> +#include <linux/kernel.h> +#include <linux/reboot.h> +#include <linux/sched.h> +#include <linux/sysrq.h> +#include <linux/smp.h> +#include <linux/utsname.h> +#include <linux/vmalloc.h> +#include <linux/module.h> +#include <linux/mm.h> +#include <linux/init.h> +#include <linux/kallsyms.h> +#include <linux/kgdb.h> +#include <linux/kdb.h> +#include <linux/notifier.h> +#include <linux/interrupt.h> +#include <linux/delay.h> +#include <linux/nmi.h> +#include <linux/time.h> +#include <linux/ptrace.h> +#include <linux/sysctl.h> +#include <linux/cpu.h> +#include <linux/kdebug.h> +#include <linux/proc_fs.h> +#include <linux/uaccess.h> +#include <linux/slab.h> +#include "kdb_private.h" + +#define GREP_LEN 256 +char kdb_grep_string[GREP_LEN]; +int kdb_grepping_flag; +EXPORT_SYMBOL(kdb_grepping_flag); +int kdb_grep_leading; +int kdb_grep_trailing; + +/* + * Kernel debugger state flags + */ +int kdb_flags; +atomic_t kdb_event; + +/* + * kdb_lock protects updates to kdb_initial_cpu. Used to + * single thread processors through the kernel debugger. + */ +int kdb_initial_cpu = -1; /* cpu number that owns kdb */ +int kdb_nextline = 1; +int kdb_state; /* General KDB state */ + +struct task_struct *kdb_current_task; +EXPORT_SYMBOL(kdb_current_task); +struct pt_regs *kdb_current_regs; + +const char *kdb_diemsg; +static int kdb_go_count; +#ifdef CONFIG_KDB_CONTINUE_CATASTROPHIC +static unsigned int kdb_continue_catastrophic = + CONFIG_KDB_CONTINUE_CATASTROPHIC; +#else +static unsigned int kdb_continue_catastrophic; +#endif + +/* kdb_commands describes the available commands. */ +static kdbtab_t *kdb_commands; +#define KDB_BASE_CMD_MAX 50 +static int kdb_max_commands = KDB_BASE_CMD_MAX; +static kdbtab_t kdb_base_commands[50]; +#define for_each_kdbcmd(cmd, num) \ + for ((cmd) = kdb_base_commands, (num) = 0; \ + num < kdb_max_commands; \ + num == KDB_BASE_CMD_MAX ? cmd = kdb_commands : cmd++, num++) + +typedef struct _kdbmsg { + int km_diag; /* kdb diagnostic */ + char *km_msg; /* Corresponding message text */ +} kdbmsg_t; + +#define KDBMSG(msgnum, text) \ + { KDB_##msgnum, text } + +static kdbmsg_t kdbmsgs[] = { + KDBMSG(NOTFOUND, "Command Not Found"), + KDBMSG(ARGCOUNT, "Improper argument count, see usage."), + KDBMSG(BADWIDTH, "Illegal value for BYTESPERWORD use 1, 2, 4 or 8, " + "8 is only allowed on 64 bit systems"), + KDBMSG(BADRADIX, "Illegal value for RADIX use 8, 10 or 16"), + KDBMSG(NOTENV, "Cannot find environment variable"), + KDBMSG(NOENVVALUE, "Environment variable should have value"), + KDBMSG(NOTIMP, "Command not implemented"), + KDBMSG(ENVFULL, "Environment full"), + KDBMSG(ENVBUFFULL, "Environment buffer full"), + KDBMSG(TOOMANYBPT, "Too many breakpoints defined"), +#ifdef CONFIG_CPU_XSCALE + KDBMSG(TOOMANYDBREGS, "More breakpoints than ibcr registers defined"), +#else + KDBMSG(TOOMANYDBREGS, "More breakpoints than db registers defined"), +#endif + KDBMSG(DUPBPT, "Duplicate breakpoint address"), + KDBMSG(BPTNOTFOUND, "Breakpoint not found"), + KDBMSG(BADMODE, "Invalid IDMODE"), + KDBMSG(BADINT, "Illegal numeric value"), + KDBMSG(INVADDRFMT, "Invalid symbolic address format"), + KDBMSG(BADREG, "Invalid register name"), + KDBMSG(BADCPUNUM, "Invalid cpu number"), + KDBMSG(BADLENGTH, "Invalid length field"), + KDBMSG(NOBP, "No Breakpoint exists"), + KDBMSG(BADADDR, "Invalid address"), +}; +#undef KDBMSG + +static const int __nkdb_err = sizeof(kdbmsgs) / sizeof(kdbmsg_t); + + +/* + * Initial environment. This is all kept static and local to + * this file. We don't want to rely on the memory allocation + * mechanisms in the kernel, so we use a very limited allocate-only + * heap for new and altered environment variables. The entire + * environment is limited to a fixed number of entries (add more + * to __env[] if required) and a fixed amount of heap (add more to + * KDB_ENVBUFSIZE if required). + */ + +static char *__env[] = { +#if defined(CONFIG_SMP) + "PROMPT=[%d]kdb> ", + "MOREPROMPT=[%d]more> ", +#else + "PROMPT=kdb> ", + "MOREPROMPT=more> ", +#endif + "RADIX=16", + "MDCOUNT=8", /* lines of md output */ + "BTARGS=9", /* 9 possible args in bt */ + KDB_PLATFORM_ENV, + "DTABCOUNT=30", + "NOSECT=1", + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, +}; + +static const int __nenv = (sizeof(__env) / sizeof(char *)); + +struct task_struct *kdb_curr_task(int cpu) +{ + struct task_struct *p = curr_task(cpu); +#ifdef _TIF_MCA_INIT + if ((task_thread_info(p)->flags & _TIF_MCA_INIT) && KDB_TSK(cpu)) + p = krp->p; +#endif + return p; +} + +/* + * kdbgetenv - This function will return the character string value of + * an environment variable. + * Parameters: + * match A character string representing an environment variable. + * Returns: + * NULL No environment variable matches 'match' + * char* Pointer to string value of environment variable. + */ +char *kdbgetenv(const char *match) +{ + char **ep = __env; + int matchlen = strlen(match); + int i; + + for (i = 0; i < __nenv; i++) { + char *e = *ep++; + + if (!e) + continue; + + if ((strncmp(match, e, matchlen) == 0) + && ((e[matchlen] == '\0') + || (e[matchlen] == '='))) { + char *cp = strchr(e, '='); + return cp ? ++cp : ""; + } + } + return NULL; +} + +/* + * kdballocenv - This function is used to allocate bytes for + * environment entries. + * Parameters: + * match A character string representing a numeric value + * Outputs: + * *value the unsigned long representation of the env variable 'match' + * Returns: + * Zero on success, a kdb diagnostic on failure. + * Remarks: + * We use a static environment buffer (envbuffer) to hold the values + * of dynamically generated environment variables (see kdb_set). Buffer + * space once allocated is never free'd, so over time, the amount of space + * (currently 512 bytes) will be exhausted if env variables are changed + * frequently. + */ +static char *kdballocenv(size_t bytes) +{ +#define KDB_ENVBUFSIZE 512 + static char envbuffer[KDB_ENVBUFSIZE]; + static int envbufsize; + char *ep = NULL; + + if ((KDB_ENVBUFSIZE - envbufsize) >= bytes) { + ep = &envbuffer[envbufsize]; + envbufsize += bytes; + } + return ep; +} + +/* + * kdbgetulenv - This function will return the value of an unsigned + * long-valued environment variable. + * Parameters: + * match A character string representing a numeric value + * Outputs: + * *value the unsigned long represntation of the env variable 'match' + * Returns: + * Zero on success, a kdb diagnostic on failure. + */ +static int kdbgetulenv(const char *match, unsigned long *value) +{ + char *ep; + + ep = kdbgetenv(match); + if (!ep) + return KDB_NOTENV; + if (strlen(ep) == 0) + return KDB_NOENVVALUE; + + *value = simple_strtoul(ep, NULL, 0); + + return 0; +} + +/* + * kdbgetintenv - This function will return the value of an + * integer-valued environment variable. + * Parameters: + * match A character string representing an integer-valued env variable + * Outputs: + * *value the integer representation of the environment variable 'match' + * Returns: + * Zero on success, a kdb diagnostic on failure. + */ +int kdbgetintenv(const char *match, int *value) +{ + unsigned long val; + int diag; + + diag = kdbgetulenv(match, &val); + if (!diag) + *value = (int) val; + return diag; +} + +/* + * kdbgetularg - This function will convert a numeric string into an + * unsigned long value. + * Parameters: + * arg A character string representing a numeric value + * Outputs: + * *value the unsigned long represntation of arg. + * Returns: + * Zero on success, a kdb diagnostic on failure. + */ +int kdbgetularg(const char *arg, unsigned long *value) +{ + char *endp; + unsigned long val; + + val = simple_strtoul(arg, &endp, 0); + + if (endp == arg) { + /* + * Try base 16, for us folks too lazy to type the + * leading 0x... + */ + val = simple_strtoul(arg, &endp, 16); + if (endp == arg) + return KDB_BADINT; + } + + *value = val; + + return 0; +} + +/* + * kdb_set - This function implements the 'set' command. Alter an + * existing environment variable or create a new one. + */ +int kdb_set(int argc, const char **argv) +{ + int i; + char *ep; + size_t varlen, vallen; + + /* + * we can be invoked two ways: + * set var=value argv[1]="var", argv[2]="value" + * set var = value argv[1]="var", argv[2]="=", argv[3]="value" + * - if the latter, shift 'em down. + */ + if (argc == 3) { + argv[2] = argv[3]; + argc--; + } + + if (argc != 2) + return KDB_ARGCOUNT; + + /* + * Check for internal variables + */ + if (strcmp(argv[1], "KDBDEBUG") == 0) { + unsigned int debugflags; + char *cp; + + debugflags = simple_strtoul(argv[2], &cp, 0); + if (cp == argv[2] || debugflags & ~KDB_DEBUG_FLAG_MASK) { + kdb_printf("kdb: illegal debug flags '%s'\n", + argv[2]); + return 0; + } + kdb_flags = (kdb_flags & + ~(KDB_DEBUG_FLAG_MASK << KDB_DEBUG_FLAG_SHIFT)) + | (debugflags << KDB_DEBUG_FLAG_SHIFT); + + return 0; + } + + /* + * Tokenizer squashed the '=' sign. argv[1] is variable + * name, argv[2] = value. + */ + varlen = strlen(argv[1]); + vallen = strlen(argv[2]); + ep = kdballocenv(varlen + vallen + 2); + if (ep == (char *)0) + return KDB_ENVBUFFULL; + + sprintf(ep, "%s=%s", argv[1], argv[2]); + + ep[varlen+vallen+1] = '\0'; + + for (i = 0; i < __nenv; i++) { + if (__env[i] + && ((strncmp(__env[i], argv[1], varlen) == 0) + && ((__env[i][varlen] == '\0') + || (__env[i][varlen] == '=')))) { + __env[i] = ep; + return 0; + } + } + + /* + * Wasn't existing variable. Fit into slot. + */ + for (i = 0; i < __nenv-1; i++) { + if (__env[i] == (char *)0) { + __env[i] = ep; + return 0; + } + } + + return KDB_ENVFULL; +} + +static int kdb_check_regs(void) +{ + if (!kdb_current_regs) { + kdb_printf("No current kdb registers." + " You may need to select another task\n"); + return KDB_BADREG; + } + return 0; +} + +/* + * kdbgetaddrarg - This function is responsible for parsing an + * address-expression and returning the value of the expression, + * symbol name, and offset to the caller. + * + * The argument may consist of a numeric value (decimal or + * hexidecimal), a symbol name, a register name (preceeded by the + * percent sign), an environment variable with a numeric value + * (preceeded by a dollar sign) or a simple arithmetic expression + * consisting of a symbol name, +/-, and a numeric constant value + * (offset). + * Parameters: + * argc - count of arguments in argv + * argv - argument vector + * *nextarg - index to next unparsed argument in argv[] + * regs - Register state at time of KDB entry + * Outputs: + * *value - receives the value of the address-expression + * *offset - receives the offset specified, if any + * *name - receives the symbol name, if any + * *nextarg - index to next unparsed argument in argv[] + * Returns: + * zero is returned on success, a kdb diagnostic code is + * returned on error. + */ +int kdbgetaddrarg(int argc, const char **argv, int *nextarg, + unsigned long *value, long *offset, + char **name) +{ + unsigned long addr; + unsigned long off = 0; + int positive; + int diag; + int found = 0; + char *symname; + char symbol = '\0'; + char *cp; + kdb_symtab_t symtab; + + /* + * Process arguments which follow the following syntax: + * + * symbol | numeric-address [+/- numeric-offset] + * %register + * $environment-variable + */ + + if (*nextarg > argc) + return KDB_ARGCOUNT; + + symname = (char *)argv[*nextarg]; + + /* + * If there is no whitespace between the symbol + * or address and the '+' or '-' symbols, we + * remember the character and replace it with a + * null so the symbol/value can be properly parsed + */ + cp = strpbrk(symname, "+-"); + if (cp != NULL) { + symbol = *cp; + *cp++ = '\0'; + } + + if (symname[0] == '$') { + diag = kdbgetulenv(&symname[1], &addr); + if (diag) + return diag; + } else if (symname[0] == '%') { + diag = kdb_check_regs(); + if (diag) + return diag; + /* Implement register values with % at a later time as it is + * arch optional. + */ + return KDB_NOTIMP; + } else { + found = kdbgetsymval(symname, &symtab); + if (found) { + addr = symtab.sym_start; + } else { + diag = kdbgetularg(argv[*nextarg], &addr); + if (diag) + return diag; + } + } + + if (!found) + found = kdbnearsym(addr, &symtab); + + (*nextarg)++; + + if (name) + *name = symname; + if (value) + *value = addr; + if (offset && name && *name) + *offset = addr - symtab.sym_start; + + if ((*nextarg > argc) + && (symbol == '\0')) + return 0; + + /* + * check for +/- and offset + */ + + if (symbol == '\0') { + if ((argv[*nextarg][0] != '+') + && (argv[*nextarg][0] != '-')) { + /* + * Not our argument. Return. + */ + return 0; + } else { + positive = (argv[*nextarg][0] == '+'); + (*nextarg)++; + } + } else + positive = (symbol == '+'); + + /* + * Now there must be an offset! + */ + if ((*nextarg > argc) + && (symbol == '\0')) { + return KDB_INVADDRFMT; + } + + if (!symbol) { + cp = (char *)argv[*nextarg]; + (*nextarg)++; + } + + diag = kdbgetularg(cp, &off); + if (diag) + return diag; + + if (!positive) + off = -off; + + if (offset) + *offset += off; + + if (value) + *value += off; + + return 0; +} + +static void kdb_cmderror(int diag) +{ + int i; + + if (diag >= 0) { + kdb_printf("no error detected (diagnostic is %d)\n", diag); + return; + } + + for (i = 0; i < __nkdb_err; i++) { + if (kdbmsgs[i].km_diag == diag) { + kdb_printf("diag: %d: %s\n", diag, kdbmsgs[i].km_msg); + return; + } + } + + kdb_printf("Unknown diag %d\n", -diag); +} + +/* + * kdb_defcmd, kdb_defcmd2 - This function implements the 'defcmd' + * command which defines one command as a set of other commands, + * terminated by endefcmd. kdb_defcmd processes the initial + * 'defcmd' command, kdb_defcmd2 is invoked from kdb_parse for + * the following commands until 'endefcmd'. + * Inputs: + * argc argument count + * argv argument vector + * Returns: + * zero for success, a kdb diagnostic if error + */ +struct defcmd_set { + int count; + int usable; + char *name; + char *usage; + char *help; + char **command; +}; +static struct defcmd_set *defcmd_set; +static int defcmd_set_count; +static int defcmd_in_progress; + +/* Forward references */ +static int kdb_exec_defcmd(int argc, const char **argv); + +static int kdb_defcmd2(const char *cmdstr, const char *argv0) +{ + struct defcmd_set *s = defcmd_set + defcmd_set_count - 1; + char **save_command = s->command; + if (strcmp(argv0, "endefcmd") == 0) { + defcmd_in_progress = 0; + if (!s->count) + s->usable = 0; + if (s->usable) + kdb_register(s->name, kdb_exec_defcmd, + s->usage, s->help, 0); + return 0; + } + if (!s->usable) + return KDB_NOTIMP; + s->command = kmalloc((s->count + 1) * sizeof(*(s->command)), GFP_KDB); + if (!s->command) { + kdb_printf("Could not allocate new kdb_defcmd table for %s\n", + cmdstr); + s->usable = 0; + return KDB_NOTIMP; + } + memcpy(s->command, save_command, s->count * sizeof(*(s->command))); + s->command[s->count++] = kdb_strdup(cmdstr, GFP_KDB); + kfree(save_command); + return 0; +} + +static int kdb_defcmd(int argc, const char **argv) +{ + struct defcmd_set *save_defcmd_set = defcmd_set, *s; + if (defcmd_in_progress) { + kdb_printf("kdb: nested defcmd detected, assuming missing " + "endefcmd\n"); + kdb_defcmd2("endefcmd", "endefcmd"); + } + if (argc == 0) { + int i; + for (s = defcmd_set; s < defcmd_set + defcmd_set_count; ++s) { + kdb_printf("defcmd %s \"%s\" \"%s\"\n", s->name, + s->usage, s->help); + for (i = 0; i < s->count; ++i) + kdb_printf("%s", s->command[i]); + kdb_printf("endefcmd\n"); + } + return 0; + } + if (argc != 3) + return KDB_ARGCOUNT; + defcmd_set = kmalloc((defcmd_set_count + 1) * sizeof(*defcmd_set), + GFP_KDB); + if (!defcmd_set) { + kdb_printf("Could not allocate new defcmd_set entry for %s\n", + argv[1]); + defcmd_set = save_defcmd_set; + return KDB_NOTIMP; + } + memcpy(defcmd_set, save_defcmd_set, + defcmd_set_count * sizeof(*defcmd_set)); + kfree(save_defcmd_set); + s = defcmd_set + defcmd_set_count; + memset(s, 0, sizeof(*s)); + s->usable = 1; + s->name = kdb_strdup(argv[1], GFP_KDB); + s->usage = kdb_strdup(argv[2], GFP_KDB); + s->help = kdb_strdup(argv[3], GFP_KDB); + if (s->usage[0] == '"') { + strcpy(s->usage, s->usage+1); + s->usage[strlen(s->usage)-1] = '\0'; + } + if (s->help[0] == '"') { + strcpy(s->help, s->help+1); + s->help[strlen(s->help)-1] = '\0'; + } + ++defcmd_set_count; + defcmd_in_progress = 1; + return 0; +} + +/* + * kdb_exec_defcmd - Execute the set of commands associated with this + * defcmd name. + * Inputs: + * argc argument count + * argv argument vector + * Returns: + * zero for success, a kdb diagnostic if error + */ +static int kdb_exec_defcmd(int argc, const char **argv) +{ + int i, ret; + struct defcmd_set *s; + if (argc != 0) + return KDB_ARGCOUNT; + for (s = defcmd_set, i = 0; i < defcmd_set_count; ++i, ++s) { + if (strcmp(s->name, argv[0]) == 0) + break; + } + if (i == defcmd_set_count) { + kdb_printf("kdb_exec_defcmd: could not find commands for %s\n", + argv[0]); + return KDB_NOTIMP; + } + for (i = 0; i < s->count; ++i) { + /* Recursive use of kdb_parse, do not use argv after + * this point */ + argv = NULL; + kdb_printf("[%s]kdb> %s\n", s->name, s->command[i]); + ret = kdb_parse(s->command[i]); + if (ret) + return ret; + } + return 0; +} + +/* Command history */ +#define KDB_CMD_HISTORY_COUNT 32 +#define CMD_BUFLEN 200 /* kdb_printf: max printline + * size == 256 */ +static unsigned int cmd_head, cmd_tail; +static unsigned int cmdptr; +static char cmd_hist[KDB_CMD_HISTORY_COUNT][CMD_BUFLEN]; +static char cmd_cur[CMD_BUFLEN]; + +/* + * The "str" argument may point to something like | grep xyz + */ +static void parse_grep(const char *str) +{ + int len; + char *cp = (char *)str, *cp2; + + /* sanity check: we should have been called with the \ first */ + if (*cp != '|') + return; + cp++; + while (isspace(*cp)) + cp++; + if (strncmp(cp, "grep ", 5)) { + kdb_printf("invalid 'pipe', see grephelp\n"); + return; + } + cp += 5; + while (isspace(*cp)) + cp++; + cp2 = strchr(cp, '\n'); + if (cp2) + *cp2 = '\0'; /* remove the trailing newline */ + len = strlen(cp); + if (len == 0) { + kdb_printf("invalid 'pipe', see grephelp\n"); + return; + } + /* now cp points to a nonzero length search string */ + if (*cp == '"') { + /* allow it be "x y z" by removing the "'s - there must + be two of them */ + cp++; + cp2 = strchr(cp, '"'); + if (!cp2) { + kdb_printf("invalid quoted string, see grephelp\n"); + return; + } + *cp2 = '\0'; /* end the string where the 2nd " was */ + } + kdb_grep_leading = 0; + if (*cp == '^') { + kdb_grep_leading = 1; + cp++; + } + len = strlen(cp); + kdb_grep_trailing = 0; + if (*(cp+len-1) == '$') { + kdb_grep_trailing = 1; + *(cp+len-1) = '\0'; + } + len = strlen(cp); + if (!len) + return; + if (len >= GREP_LEN) { + kdb_printf("search string too long\n"); + return; + } + strcpy(kdb_grep_string, cp); + kdb_grepping_flag++; + return; +} + +/* + * kdb_parse - Parse the command line, search the command table for a + * matching command and invoke the command function. This + * function may be called recursively, if it is, the second call + * will overwrite argv and cbuf. It is the caller's + * responsibility to save their argv if they recursively call + * kdb_parse(). + * Parameters: + * cmdstr The input command line to be parsed. + * regs The registers at the time kdb was entered. + * Returns: + * Zero for success, a kdb diagnostic if failure. + * Remarks: + * Limited to 20 tokens. + * + * Real rudimentary tokenization. Basically only whitespace + * is considered a token delimeter (but special consideration + * is taken of the '=' sign as used by the 'set' command). + * + * The algorithm used to tokenize the input string relies on + * there being at least one whitespace (or otherwise useless) + * character between tokens as the character immediately following + * the token is altered in-place to a null-byte to terminate the + * token string. + */ + +#define MAXARGC 20 + +int kdb_parse(const char *cmdstr) +{ + static char *argv[MAXARGC]; + static int argc; + static char cbuf[CMD_BUFLEN+2]; + char *cp; + char *cpp, quoted; + kdbtab_t *tp; + int i, escaped, ignore_errors = 0, check_grep; + + /* + * First tokenize the command string. + */ + cp = (char *)cmdstr; + kdb_grepping_flag = check_grep = 0; + + if (KDB_FLAG(CMD_INTERRUPT)) { + /* Previous command was interrupted, newline must not + * repeat the command */ + KDB_FLAG_CLEAR(CMD_INTERRUPT); + KDB_STATE_SET(PAGER); + argc = 0; /* no repeat */ + } + + if (*cp != '\n' && *cp != '\0') { + argc = 0; + cpp = cbuf; + while (*cp) { + /* skip whitespace */ + while (isspace(*cp)) + cp++; + if ((*cp == '\0') || (*cp == '\n') || + (*cp == '#' && !defcmd_in_progress)) + break; + /* special case: check for | grep pattern */ + if (*cp == '|') { + check_grep++; + break; + } + if (cpp >= cbuf + CMD_BUFLEN) { + kdb_printf("kdb_parse: command buffer " + "overflow, command ignored\n%s\n", + cmdstr); + return KDB_NOTFOUND; + } + if (argc >= MAXARGC - 1) { + kdb_printf("kdb_parse: too many arguments, " + "command ignored\n%s\n", cmdstr); + return KDB_NOTFOUND; + } + argv[argc++] = cpp; + escaped = 0; + quoted = '\0'; + /* Copy to next unquoted and unescaped + * whitespace or '=' */ + while (*cp && *cp != '\n' && + (escaped || quoted || !isspace(*cp))) { + if (cpp >= cbuf + CMD_BUFLEN) + break; + if (escaped) { + escaped = 0; + *cpp++ = *cp++; + continue; + } + if (*cp == '\\') { + escaped = 1; + ++cp; + continue; + } + if (*cp == quoted) + quoted = '\0'; + else if (*cp == '\'' || *cp == '"') + quoted = *cp; + *cpp = *cp++; + if (*cpp == '=' && !quoted) + break; + ++cpp; + } + *cpp++ = '\0'; /* Squash a ws or '=' character */ + } + } + if (!argc) + return 0; + if (check_grep) + parse_grep(cp); + if (defcmd_in_progress) { + int result = kdb_defcmd2(cmdstr, argv[0]); + if (!defcmd_in_progress) { + argc = 0; /* avoid repeat on endefcmd */ + *(argv[0]) = '\0'; + } + return result; + } + if (argv[0][0] == '-' && argv[0][1] && + (argv[0][1] < '0' || argv[0][1] > '9')) { + ignore_errors = 1; + ++argv[0]; + } + + for_each_kdbcmd(tp, i) { + if (tp->cmd_name) { + /* + * If this command is allowed to be abbreviated, + * check to see if this is it. + */ + + if (tp->cmd_minlen + && (strlen(argv[0]) <= tp->cmd_minlen)) { + if (strncmp(argv[0], + tp->cmd_name, + tp->cmd_minlen) == 0) { + break; + } + } + + if (strcmp(argv[0], tp->cmd_name) == 0) + break; + } + } + + /* + * If we don't find a command by this name, see if the first + * few characters of this match any of the known commands. + * e.g., md1c20 should match md. + */ + if (i == kdb_max_commands) { + for_each_kdbcmd(tp, i) { + if (tp->cmd_name) { + if (strncmp(argv[0], + tp->cmd_name, + strlen(tp->cmd_name)) == 0) { + break; + } + } + } + } + + if (i < kdb_max_commands) { + int result; + KDB_STATE_SET(CMD); + result = (*tp->cmd_func)(argc-1, (const char **)argv); + if (result && ignore_errors && result > KDB_CMD_GO) + result = 0; + KDB_STATE_CLEAR(CMD); + switch (tp->cmd_repeat) { + case KDB_REPEAT_NONE: + argc = 0; + if (argv[0]) + *(argv[0]) = '\0'; + break; + case KDB_REPEAT_NO_ARGS: + argc = 1; + if (argv[1]) + *(argv[1]) = '\0'; + break; + case KDB_REPEAT_WITH_ARGS: + break; + } + return result; + } + + /* + * If the input with which we were presented does not + * map to an existing command, attempt to parse it as an + * address argument and display the result. Useful for + * obtaining the address of a variable, or the nearest symbol + * to an address contained in a register. + */ + { + unsigned long value; + char *name = NULL; + long offset; + int nextarg = 0; + + if (kdbgetaddrarg(0, (const char **)argv, &nextarg, + &value, &offset, &name)) { + return KDB_NOTFOUND; + } + + kdb_printf("%s = ", argv[0]); + kdb_symbol_print(value, NULL, KDB_SP_DEFAULT); + kdb_printf("\n"); + return 0; + } +} + + +static int handle_ctrl_cmd(char *cmd) +{ +#define CTRL_P 16 +#define CTRL_N 14 + + /* initial situation */ + if (cmd_head == cmd_tail) + return 0; + switch (*cmd) { + case CTRL_P: + if (cmdptr != cmd_tail) + cmdptr = (cmdptr-1) % KDB_CMD_HISTORY_COUNT; + strncpy(cmd_cur, cmd_hist[cmdptr], CMD_BUFLEN); + return 1; + case CTRL_N: + if (cmdptr != cmd_head) + cmdptr = (cmdptr+1) % KDB_CMD_HISTORY_COUNT; + strncpy(cmd_cur, cmd_hist[cmdptr], CMD_BUFLEN); + return 1; + } + return 0; +} + +/* + * kdb_reboot - This function implements the 'reboot' command. Reboot + * the system immediately, or loop for ever on failure. + */ +static int kdb_reboot(int argc, const char **argv) +{ + emergency_restart(); + kdb_printf("Hmm, kdb_reboot did not reboot, spinning here\n"); + while (1) + cpu_relax(); + /* NOTREACHED */ + return 0; +} + +static void kdb_dumpregs(struct pt_regs *regs) +{ + int old_lvl = console_loglevel; + console_loglevel = 15; + kdb_trap_printk++; + show_regs(regs); + kdb_trap_printk--; + kdb_printf("\n"); + console_loglevel = old_lvl; +} + +void kdb_set_current_task(struct task_struct *p) +{ + kdb_current_task = p; + + if (kdb_task_has_cpu(p)) { + kdb_current_regs = KDB_TSKREGS(kdb_process_cpu(p)); + return; + } + kdb_current_regs = NULL; +} + +/* + * kdb_local - The main code for kdb. This routine is invoked on a + * specific processor, it is not global. The main kdb() routine + * ensures that only one processor at a time is in this routine. + * This code is called with the real reason code on the first + * entry to a kdb session, thereafter it is called with reason + * SWITCH, even if the user goes back to the original cpu. + * Inputs: + * reason The reason KDB was invoked + * error The hardware-defined error code + * regs The exception frame at time of fault/breakpoint. + * db_result Result code from the break or debug point. + * Returns: + * 0 KDB was invoked for an event which it wasn't responsible + * 1 KDB handled the event for which it was invoked. + * KDB_CMD_GO User typed 'go'. + * KDB_CMD_CPU User switched to another cpu. + * KDB_CMD_SS Single step. + * KDB_CMD_SSB Single step until branch. + */ +static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs, + kdb_dbtrap_t db_result) +{ + char *cmdbuf; + int diag; + struct task_struct *kdb_current = + kdb_curr_task(raw_smp_processor_id()); + + KDB_DEBUG_STATE("kdb_local 1", reason); + kdb_go_count = 0; + if (reason == KDB_REASON_DEBUG) { + /* special case below */ + } else { + kdb_printf("\nEntering kdb (current=0x%p, pid %d) ", + kdb_current, kdb_current->pid); +#if defined(CONFIG_SMP) + kdb_printf("on processor %d ", raw_smp_processor_id()); +#endif + } + + switch (reason) { + case KDB_REASON_DEBUG: + { + /* + * If re-entering kdb after a single step + * command, don't print the message. + */ + switch (db_result) { + case KDB_DB_BPT: + kdb_printf("\nEntering kdb (0x%p, pid %d) ", + kdb_current, kdb_current->pid); +#if defined(CONFIG_SMP) + kdb_printf("on processor %d ", raw_smp_processor_id()); +#endif + kdb_printf("due to Debug @ " kdb_machreg_fmt "\n", + instruction_pointer(regs)); + break; + case KDB_DB_SSB: + /* + * In the midst of ssb command. Just return. + */ + KDB_DEBUG_STATE("kdb_local 3", reason); + return KDB_CMD_SSB; /* Continue with SSB command */ + + break; + case KDB_DB_SS: + break; + case KDB_DB_SSBPT: + KDB_DEBUG_STATE("kdb_local 4", reason); + return 1; /* kdba_db_trap did the work */ + default: + kdb_printf("kdb: Bad result from kdba_db_trap: %d\n", + db_result); + break; + } + + } + break; + case KDB_REASON_ENTER: + if (KDB_STATE(KEYBOARD)) + kdb_printf("due to Keyboard Entry\n"); + else + kdb_printf("due to KDB_ENTER()\n"); + break; + case KDB_REASON_KEYBOARD: + KDB_STATE_SET(KEYBOARD); + kdb_printf("due to Keyboard Entry\n"); + break; + case KDB_REASON_ENTER_SLAVE: + /* drop through, slaves only get released via cpu switch */ + case KDB_REASON_SWITCH: + kdb_printf("due to cpu switch\n"); + break; + case KDB_REASON_OOPS: + kdb_printf("Oops: %s\n", kdb_diemsg); + kdb_printf("due to oops @ " kdb_machreg_fmt "\n", + instruction_pointer(regs)); + kdb_dumpregs(regs); + break; + case KDB_REASON_NMI: + kdb_printf("due to NonMaskable Interrupt @ " + kdb_machreg_fmt "\n", + instruction_pointer(regs)); + kdb_dumpregs(regs); + break; + case KDB_REASON_SSTEP: + case KDB_REASON_BREAK: + kdb_printf("due to %s @ " kdb_machreg_fmt "\n", + reason == KDB_REASON_BREAK ? + "Breakpoint" : "SS trap", instruction_pointer(regs)); + /* + * Determine if this breakpoint is one that we + * are interested in. + */ + if (db_result != KDB_DB_BPT) { + kdb_printf("kdb: error return from kdba_bp_trap: %d\n", + db_result); + KDB_DEBUG_STATE("kdb_local 6", reason); + return 0; /* Not for us, dismiss it */ + } + break; + case KDB_REASON_RECURSE: + kdb_printf("due to Recursion @ " kdb_machreg_fmt "\n", + instruction_pointer(regs)); + break; + default: + kdb_printf("kdb: unexpected reason code: %d\n", reason); + KDB_DEBUG_STATE("kdb_local 8", reason); + return 0; /* Not for us, dismiss it */ + } + + while (1) { + /* + * Initialize pager context. + */ + kdb_nextline = 1; + KDB_STATE_CLEAR(SUPPRESS); + + cmdbuf = cmd_cur; + *cmdbuf = '\0'; + *(cmd_hist[cmd_head]) = '\0'; + + if (KDB_FLAG(ONLY_DO_DUMP)) { + /* kdb is off but a catastrophic error requires a dump. + * Take the dump and reboot. + * Turn on logging so the kdb output appears in the log + * buffer in the dump. + */ + const char *setargs[] = { "set", "LOGGING", "1" }; + kdb_set(2, setargs); + kdb_reboot(0, NULL); + /*NOTREACHED*/ + } + +do_full_getstr: +#if defined(CONFIG_SMP) + snprintf(kdb_prompt_str, CMD_BUFLEN, kdbgetenv("PROMPT"), + raw_smp_processor_id()); +#else + snprintf(kdb_prompt_str, CMD_BUFLEN, kdbgetenv("PROMPT")); +#endif + if (defcmd_in_progress) + strncat(kdb_prompt_str, "[defcmd]", CMD_BUFLEN); + + /* + * Fetch command from keyboard + */ + cmdbuf = kdb_getstr(cmdbuf, CMD_BUFLEN, kdb_prompt_str); + if (*cmdbuf != '\n') { + if (*cmdbuf < 32) { + if (cmdptr == cmd_head) { + strncpy(cmd_hist[cmd_head], cmd_cur, + CMD_BUFLEN); + *(cmd_hist[cmd_head] + + strlen(cmd_hist[cmd_head])-1) = '\0'; + } + if (!handle_ctrl_cmd(cmdbuf)) + *(cmd_cur+strlen(cmd_cur)-1) = '\0'; + cmdbuf = cmd_cur; + goto do_full_getstr; + } else { + strncpy(cmd_hist[cmd_head], cmd_cur, + CMD_BUFLEN); + } + + cmd_head = (cmd_head+1) % KDB_CMD_HISTORY_COUNT; + if (cmd_head == cmd_tail) + cmd_tail = (cmd_tail+1) % KDB_CMD_HISTORY_COUNT; + } + + cmdptr = cmd_head; + diag = kdb_parse(cmdbuf); + if (diag == KDB_NOTFOUND) { + kdb_printf("Unknown kdb command: '%s'\n", cmdbuf); + diag = 0; + } + if (diag == KDB_CMD_GO + || diag == KDB_CMD_CPU + || diag == KDB_CMD_SS + || diag == KDB_CMD_SSB + || diag == KDB_CMD_KGDB) + break; + + if (diag) + kdb_cmderror(diag); + } + KDB_DEBUG_STATE("kdb_local 9", diag); + return diag; +} + + +/* + * kdb_print_state - Print the state data for the current processor + * for debugging. + * Inputs: + * text Identifies the debug point + * value Any integer value to be printed, e.g. reason code. + */ +void kdb_print_state(const char *text, int value) +{ + kdb_printf("state: %s cpu %d value %d initial %d state %x\n", + text, raw_smp_processor_id(), value, kdb_initial_cpu, + kdb_state); +} + +/* + * kdb_main_loop - After initial setup and assignment of the + * controlling cpu, all cpus are in this loop. One cpu is in + * control and will issue the kdb prompt, the others will spin + * until 'go' or cpu switch. + * + * To get a consistent view of the kernel stacks for all + * processes, this routine is invoked from the main kdb code via + * an architecture specific routine. kdba_main_loop is + * responsible for making the kernel stacks consistent for all + * processes, there should be no difference between a blocked + * process and a running process as far as kdb is concerned. + * Inputs: + * reason The reason KDB was invoked + * error The hardware-defined error code + * reason2 kdb's current reason code. + * Initially error but can change + * acording to kdb state. + * db_result Result code from break or debug point. + * regs The exception frame at time of fault/breakpoint. + * should always be valid. + * Returns: + * 0 KDB was invoked for an event which it wasn't responsible + * 1 KDB handled the event for which it was invoked. + */ +int kdb_main_loop(kdb_reason_t reason, kdb_reason_t reason2, int error, + kdb_dbtrap_t db_result, struct pt_regs *regs) +{ + int result = 1; + /* Stay in kdb() until 'go', 'ss[b]' or an error */ + while (1) { + /* + * All processors except the one that is in control + * will spin here. + */ + KDB_DEBUG_STATE("kdb_main_loop 1", reason); + while (KDB_STATE(HOLD_CPU)) { + /* state KDB is turned off by kdb_cpu to see if the + * other cpus are still live, each cpu in this loop + * turns it back on. + */ + if (!KDB_STATE(KDB)) + KDB_STATE_SET(KDB); + } + + KDB_STATE_CLEAR(SUPPRESS); + KDB_DEBUG_STATE("kdb_main_loop 2", reason); + if (KDB_STATE(LEAVING)) + break; /* Another cpu said 'go' */ + /* Still using kdb, this processor is in control */ + result = kdb_local(reason2, error, regs, db_result); + KDB_DEBUG_STATE("kdb_main_loop 3", result); + + if (result == KDB_CMD_CPU) + break; + + if (result == KDB_CMD_SS) { + KDB_STATE_SET(DOING_SS); + break; + } + + if (result == KDB_CMD_SSB) { + KDB_STATE_SET(DOING_SS); + KDB_STATE_SET(DOING_SSB); + break; + } + + if (result == KDB_CMD_KGDB) { + if (!(KDB_STATE(DOING_KGDB) || KDB_STATE(DOING_KGDB2))) + kdb_printf("Entering please attach debugger " + "or use $D#44+ or $3#33\n"); + break; + } + if (result && result != 1 && result != KDB_CMD_GO) + kdb_printf("\nUnexpected kdb_local return code %d\n", + result); + KDB_DEBUG_STATE("kdb_main_loop 4", reason); + break; + } + if (KDB_STATE(DOING_SS)) + KDB_STATE_CLEAR(SSBPT); + + return result; +} + +/* + * kdb_mdr - This function implements the guts of the 'mdr', memory + * read command. + * mdr <addr arg>,<byte count> + * Inputs: + * addr Start address + * count Number of bytes + * Returns: + * Always 0. Any errors are detected and printed by kdb_getarea. + */ +static int kdb_mdr(unsigned long addr, unsigned int count) +{ + unsigned char c; + while (count--) { + if (kdb_getarea(c, addr)) + return 0; + kdb_printf("%02x", c); + addr++; + } + kdb_printf("\n"); + return 0; +} + +/* + * kdb_md - This function implements the 'md', 'md1', 'md2', 'md4', + * 'md8' 'mdr' and 'mds' commands. + * + * md|mds [<addr arg> [<line count> [<radix>]]] + * mdWcN [<addr arg> [<line count> [<radix>]]] + * where W = is the width (1, 2, 4 or 8) and N is the count. + * for eg., md1c20 reads 20 bytes, 1 at a time. + * mdr <addr arg>,<byte count> + */ +static void kdb_md_line(const char *fmtstr, unsigned long addr, + int symbolic, int nosect, int bytesperword, + int num, int repeat, int phys) +{ + /* print just one line of data */ + kdb_symtab_t symtab; + char cbuf[32]; + char *c = cbuf; + int i; + unsigned long word; + + memset(cbuf, '\0', sizeof(cbuf)); + if (phys) + kdb_printf("phys " kdb_machreg_fmt0 " ", addr); + else + kdb_printf(kdb_machreg_fmt0 " ", addr); + + for (i = 0; i < num && repeat--; i++) { + if (phys) { + if (kdb_getphysword(&word, addr, bytesperword)) + break; + } else if (kdb_getword(&word, addr, bytesperword)) + break; + kdb_printf(fmtstr, word); + if (symbolic) + kdbnearsym(word, &symtab); + else + memset(&symtab, 0, sizeof(symtab)); + if (symtab.sym_name) { + kdb_symbol_print(word, &symtab, 0); + if (!nosect) { + kdb_printf("\n"); + kdb_printf(" %s %s " + kdb_machreg_fmt " " + kdb_machreg_fmt " " + kdb_machreg_fmt, symtab.mod_name, + symtab.sec_name, symtab.sec_start, + symtab.sym_start, symtab.sym_end); + } + addr += bytesperword; + } else { + union { + u64 word; + unsigned char c[8]; + } wc; + unsigned char *cp; +#ifdef __BIG_ENDIAN + cp = wc.c + 8 - bytesperword; +#else + cp = wc.c; +#endif + wc.word = word; +#define printable_char(c) \ + ({unsigned char __c = c; isascii(__c) && isprint(__c) ? __c : '.'; }) + switch (bytesperword) { + case 8: + *c++ = printable_char(*cp++); + *c++ = printable_char(*cp++); + *c++ = printable_char(*cp++); + *c++ = printable_char(*cp++); + addr += 4; + case 4: + *c++ = printable_char(*cp++); + *c++ = printable_char(*cp++); + addr += 2; + case 2: + *c++ = printable_char(*cp++); + addr++; + case 1: + *c++ = printable_char(*cp++); + addr++; + break; + } +#undef printable_char + } + } + kdb_printf("%*s %s\n", (int)((num-i)*(2*bytesperword + 1)+1), + " ", cbuf); +} + +static int kdb_md(int argc, const char **argv) +{ + static unsigned long last_addr; + static int last_radix, last_bytesperword, last_repeat; + int radix = 16, mdcount = 8, bytesperword = KDB_WORD_SIZE, repeat; + int nosect = 0; + char fmtchar, fmtstr[64]; + unsigned long addr; + unsigned long word; + long offset = 0; + int symbolic = 0; + int valid = 0; + int phys = 0; + + kdbgetintenv("MDCOUNT", &mdcount); + kdbgetintenv("RADIX", &radix); + kdbgetintenv("BYTESPERWORD", &bytesperword); + + /* Assume 'md <addr>' and start with environment values */ + repeat = mdcount * 16 / bytesperword; + + if (strcmp(argv[0], "mdr") == 0) { + if (argc != 2) + return KDB_ARGCOUNT; + valid = 1; + } else if (isdigit(argv[0][2])) { + bytesperword = (int)(argv[0][2] - '0'); + if (bytesperword == 0) { + bytesperword = last_bytesperword; + if (bytesperword == 0) + bytesperword = 4; + } + last_bytesperword = bytesperword; + repeat = mdcount * 16 / bytesperword; + if (!argv[0][3]) + valid = 1; + else if (argv[0][3] == 'c' && argv[0][4]) { + char *p; + repeat = simple_strtoul(argv[0] + 4, &p, 10); + mdcount = ((repeat * bytesperword) + 15) / 16; + valid = !*p; + } + last_repeat = repeat; + } else if (strcmp(argv[0], "md") == 0) + valid = 1; + else if (strcmp(argv[0], "mds") == 0) + valid = 1; + else if (strcmp(argv[0], "mdp") == 0) { + phys = valid = 1; + } + if (!valid) + return KDB_NOTFOUND; + + if (argc == 0) { + if (last_addr == 0) + return KDB_ARGCOUNT; + addr = last_addr; + radix = last_radix; + bytesperword = last_bytesperword; + repeat = last_repeat; + mdcount = ((repeat * bytesperword) + 15) / 16; + } + + if (argc) { + unsigned long val; + int diag, nextarg = 1; + diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, + &offset, NULL); + if (diag) + return diag; + if (argc > nextarg+2) + return KDB_ARGCOUNT; + + if (argc >= nextarg) { + diag = kdbgetularg(argv[nextarg], &val); + if (!diag) { + mdcount = (int) val; + repeat = mdcount * 16 / bytesperword; + } + } + if (argc >= nextarg+1) { + diag = kdbgetularg(argv[nextarg+1], &val); + if (!diag) + radix = (int) val; + } + } + + if (strcmp(argv[0], "mdr") == 0) + return kdb_mdr(addr, mdcount); + + switch (radix) { + case 10: + fmtchar = 'd'; + break; + case 16: + fmtchar = 'x'; + break; + case 8: + fmtchar = 'o'; + break; + default: + return KDB_BADRADIX; + } + + last_radix = radix; + + if (bytesperword > KDB_WORD_SIZE) + return KDB_BADWIDTH; + + switch (bytesperword) { + case 8: + sprintf(fmtstr, "%%16.16l%c ", fmtchar); + break; + case 4: + sprintf(fmtstr, "%%8.8l%c ", fmtchar); + break; + case 2: + sprintf(fmtstr, "%%4.4l%c ", fmtchar); + break; + case 1: + sprintf(fmtstr, "%%2.2l%c ", fmtchar); + break; + default: + return KDB_BADWIDTH; + } + + last_repeat = repeat; + last_bytesperword = bytesperword; + + if (strcmp(argv[0], "mds") == 0) { + symbolic = 1; + /* Do not save these changes as last_*, they are temporary mds + * overrides. + */ + bytesperword = KDB_WORD_SIZE; + repeat = mdcount; + kdbgetintenv("NOSECT", &nosect); + } + + /* Round address down modulo BYTESPERWORD */ + + addr &= ~(bytesperword-1); + + while (repeat > 0) { + unsigned long a; + int n, z, num = (symbolic ? 1 : (16 / bytesperword)); + + if (KDB_FLAG(CMD_INTERRUPT)) + return 0; + for (a = addr, z = 0; z < repeat; a += bytesperword, ++z) { + if (phys) { + if (kdb_getphysword(&word, a, bytesperword) + || word) + break; + } else if (kdb_getword(&word, a, bytesperword) || word) + break; + } + n = min(num, repeat); + kdb_md_line(fmtstr, addr, symbolic, nosect, bytesperword, + num, repeat, phys); + addr += bytesperword * n; + repeat -= n; + z = (z + num - 1) / num; + if (z > 2) { + int s = num * (z-2); + kdb_printf(kdb_machreg_fmt0 "-" kdb_machreg_fmt0 + " zero suppressed\n", + addr, addr + bytesperword * s - 1); + addr += bytesperword * s; + repeat -= s; + } + } + last_addr = addr; + + return 0; +} + +/* + * kdb_mm - This function implements the 'mm' command. + * mm address-expression new-value + * Remarks: + * mm works on machine words, mmW works on bytes. + */ +static int kdb_mm(int argc, const char **argv) +{ + int diag; + unsigned long addr; + long offset = 0; + unsigned long contents; + int nextarg; + int width; + + if (argv[0][2] && !isdigit(argv[0][2])) + return KDB_NOTFOUND; + + if (argc < 2) + return KDB_ARGCOUNT; + + nextarg = 1; + diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL); + if (diag) + return diag; + + if (nextarg > argc) + return KDB_ARGCOUNT; + diag = kdbgetaddrarg(argc, argv, &nextarg, &contents, NULL, NULL); + if (diag) + return diag; + + if (nextarg != argc + 1) + return KDB_ARGCOUNT; + + width = argv[0][2] ? (argv[0][2] - '0') : (KDB_WORD_SIZE); + diag = kdb_putword(addr, contents, width); + if (diag) + return diag; + + kdb_printf(kdb_machreg_fmt " = " kdb_machreg_fmt "\n", addr, contents); + + return 0; +} + +/* + * kdb_go - This function implements the 'go' command. + * go [address-expression] + */ +static int kdb_go(int argc, const char **argv) +{ + unsigned long addr; + int diag; + int nextarg; + long offset; + + if (argc == 1) { + if (raw_smp_processor_id() != kdb_initial_cpu) { + kdb_printf("go <address> must be issued from the " + "initial cpu, do cpu %d first\n", + kdb_initial_cpu); + return KDB_ARGCOUNT; + } + nextarg = 1; + diag = kdbgetaddrarg(argc, argv, &nextarg, + &addr, &offset, NULL); + if (diag) + return diag; + } else if (argc) { + return KDB_ARGCOUNT; + } + + diag = KDB_CMD_GO; + if (KDB_FLAG(CATASTROPHIC)) { + kdb_printf("Catastrophic error detected\n"); + kdb_printf("kdb_continue_catastrophic=%d, ", + kdb_continue_catastrophic); + if (kdb_continue_catastrophic == 0 && kdb_go_count++ == 0) { + kdb_printf("type go a second time if you really want " + "to continue\n"); + return 0; + } + if (kdb_continue_catastrophic == 2) { + kdb_printf("forcing reboot\n"); + kdb_reboot(0, NULL); + } + kdb_printf("attempting to continue\n"); + } + return diag; +} + +/* + * kdb_rd - This function implements the 'rd' command. + */ +static int kdb_rd(int argc, const char **argv) +{ + int diag = kdb_check_regs(); + if (diag) + return diag; + + kdb_dumpregs(kdb_current_regs); + return 0; +} + +/* + * kdb_rm - This function implements the 'rm' (register modify) command. + * rm register-name new-contents + * Remarks: + * Currently doesn't allow modification of control or + * debug registers. + */ +static int kdb_rm(int argc, const char **argv) +{ + int diag; + int ind = 0; + unsigned long contents; + + if (argc != 2) + return KDB_ARGCOUNT; + /* + * Allow presence or absence of leading '%' symbol. + */ + if (argv[1][0] == '%') + ind = 1; + + diag = kdbgetularg(argv[2], &contents); + if (diag) + return diag; + + diag = kdb_check_regs(); + if (diag) + return diag; + kdb_printf("ERROR: Register set currently not implemented\n"); + return 0; +} + +#if defined(CONFIG_MAGIC_SYSRQ) +/* + * kdb_sr - This function implements the 'sr' (SYSRQ key) command + * which interfaces to the soi-disant MAGIC SYSRQ functionality. + * sr <magic-sysrq-code> + */ +static int kdb_sr(int argc, const char **argv) +{ + if (argc != 1) + return KDB_ARGCOUNT; + kdb_trap_printk++; + __handle_sysrq(*argv[1], NULL, 0); + kdb_trap_printk--; + + return 0; +} +#endif /* CONFIG_MAGIC_SYSRQ */ + +/* + * kdb_ef - This function implements the 'regs' (display exception + * frame) command. This command takes an address and expects to + * find an exception frame at that address, formats and prints + * it. + * regs address-expression + * Remarks: + * Not done yet. + */ +static int kdb_ef(int argc, const char **argv) +{ + int diag; + unsigned long addr; + long offset; + int nextarg; + + if (argc != 1) + return KDB_ARGCOUNT; + + nextarg = 1; + diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL); + if (diag) + return diag; + show_regs((struct pt_regs *)addr); + return 0; +} + +#if defined(CONFIG_MODULES) +/* + * kdb_lsmod - This function implements the 'lsmod' command. Lists + * currently loaded kernel modules. + * Mostly taken from userland lsmod. + */ +static int kdb_lsmod(int argc, const char **argv) +{ + struct module *mod; + + if (argc != 0) + return KDB_ARGCOUNT; + + kdb_printf("Module Size modstruct Used by\n"); + list_for_each_entry(mod, kdb_modules, list) { + + kdb_printf("%-20s%8u 0x%p ", mod->name, + mod->core_size, (void *)mod); +#ifdef CONFIG_MODULE_UNLOAD + kdb_printf("%4d ", module_refcount(mod)); +#endif + if (mod->state == MODULE_STATE_GOING) + kdb_printf(" (Unloading)"); + else if (mod->state == MODULE_STATE_COMING) + kdb_printf(" (Loading)"); + else + kdb_printf(" (Live)"); + kdb_printf(" 0x%p", mod->module_core); + +#ifdef CONFIG_MODULE_UNLOAD + { + struct module_use *use; + kdb_printf(" [ "); + list_for_each_entry(use, &mod->source_list, + source_list) + kdb_printf("%s ", use->target->name); + kdb_printf("]\n"); + } +#endif + } + + return 0; +} + +#endif /* CONFIG_MODULES */ + +/* + * kdb_env - This function implements the 'env' command. Display the + * current environment variables. + */ + +static int kdb_env(int argc, const char **argv) +{ + int i; + + for (i = 0; i < __nenv; i++) { + if (__env[i]) + kdb_printf("%s\n", __env[i]); + } + + if (KDB_DEBUG(MASK)) + kdb_printf("KDBFLAGS=0x%x\n", kdb_flags); + + return 0; +} + +#ifdef CONFIG_PRINTK +/* + * kdb_dmesg - This function implements the 'dmesg' command to display + * the contents of the syslog buffer. + * dmesg [lines] [adjust] + */ +static int kdb_dmesg(int argc, const char **argv) +{ + char *syslog_data[4], *start, *end, c = '\0', *p; + int diag, logging, logsize, lines = 0, adjust = 0, n; + + if (argc > 2) + return KDB_ARGCOUNT; + if (argc) { + char *cp; + lines = simple_strtol(argv[1], &cp, 0); + if (*cp) + lines = 0; + if (argc > 1) { + adjust = simple_strtoul(argv[2], &cp, 0); + if (*cp || adjust < 0) + adjust = 0; + } + } + + /* disable LOGGING if set */ + diag = kdbgetintenv("LOGGING", &logging); + if (!diag && logging) { + const char *setargs[] = { "set", "LOGGING", "0" }; + kdb_set(2, setargs); + } + + /* syslog_data[0,1] physical start, end+1. syslog_data[2,3] + * logical start, end+1. */ + kdb_syslog_data(syslog_data); + if (syslog_data[2] == syslog_data[3]) + return 0; + logsize = syslog_data[1] - syslog_data[0]; + start = syslog_data[2]; + end = syslog_data[3]; +#define KDB_WRAP(p) (((p - syslog_data[0]) % logsize) + syslog_data[0]) + for (n = 0, p = start; p < end; ++p) { + c = *KDB_WRAP(p); + if (c == '\n') + ++n; + } + if (c != '\n') + ++n; + if (lines < 0) { + if (adjust >= n) + kdb_printf("buffer only contains %d lines, nothing " + "printed\n", n); + else if (adjust - lines >= n) + kdb_printf("buffer only contains %d lines, last %d " + "lines printed\n", n, n - adjust); + if (adjust) { + for (; start < end && adjust; ++start) { + if (*KDB_WRAP(start) == '\n') + --adjust; + } + if (start < end) + ++start; + } + for (p = start; p < end && lines; ++p) { + if (*KDB_WRAP(p) == '\n') + ++lines; + } + end = p; + } else if (lines > 0) { + int skip = n - (adjust + lines); + if (adjust >= n) { + kdb_printf("buffer only contains %d lines, " + "nothing printed\n", n); + skip = n; + } else if (skip < 0) { + lines += skip; + skip = 0; + kdb_printf("buffer only contains %d lines, first " + "%d lines printed\n", n, lines); + } + for (; start < end && skip; ++start) { + if (*KDB_WRAP(start) == '\n') + --skip; + } + for (p = start; p < end && lines; ++p) { + if (*KDB_WRAP(p) == '\n') + --lines; + } + end = p; + } + /* Do a line at a time (max 200 chars) to reduce protocol overhead */ + c = '\n'; + while (start != end) { + char buf[201]; + p = buf; + if (KDB_FLAG(CMD_INTERRUPT)) + return 0; + while (start < end && (c = *KDB_WRAP(start)) && + (p - buf) < sizeof(buf)-1) { + ++start; + *p++ = c; + if (c == '\n') + break; + } + *p = '\0'; + kdb_printf("%s", buf); + } + if (c != '\n') + kdb_printf("\n"); + + return 0; +} +#endif /* CONFIG_PRINTK */ +/* + * kdb_cpu - This function implements the 'cpu' command. + * cpu [<cpunum>] + * Returns: + * KDB_CMD_CPU for success, a kdb diagnostic if error + */ +static void kdb_cpu_status(void) +{ + int i, start_cpu, first_print = 1; + char state, prev_state = '?'; + + kdb_printf("Currently on cpu %d\n", raw_smp_processor_id()); + kdb_printf("Available cpus: "); + for (start_cpu = -1, i = 0; i < NR_CPUS; i++) { + if (!cpu_online(i)) { + state = 'F'; /* cpu is offline */ + } else { + state = ' '; /* cpu is responding to kdb */ + if (kdb_task_state_char(KDB_TSK(i)) == 'I') + state = 'I'; /* idle task */ + } + if (state != prev_state) { + if (prev_state != '?') { + if (!first_print) + kdb_printf(", "); + first_print = 0; + kdb_printf("%d", start_cpu); + if (start_cpu < i-1) + kdb_printf("-%d", i-1); + if (prev_state != ' ') + kdb_printf("(%c)", prev_state); + } + prev_state = state; + start_cpu = i; + } + } + /* print the trailing cpus, ignoring them if they are all offline */ + if (prev_state != 'F') { + if (!first_print) + kdb_printf(", "); + kdb_printf("%d", start_cpu); + if (start_cpu < i-1) + kdb_printf("-%d", i-1); + if (prev_state != ' ') + kdb_printf("(%c)", prev_state); + } + kdb_printf("\n"); +} + +static int kdb_cpu(int argc, const char **argv) +{ + unsigned long cpunum; + int diag; + + if (argc == 0) { + kdb_cpu_status(); + return 0; + } + + if (argc != 1) + return KDB_ARGCOUNT; + + diag = kdbgetularg(argv[1], &cpunum); + if (diag) + return diag; + + /* + * Validate cpunum + */ + if ((cpunum > NR_CPUS) || !cpu_online(cpunum)) + return KDB_BADCPUNUM; + + dbg_switch_cpu = cpunum; + + /* + * Switch to other cpu + */ + return KDB_CMD_CPU; +} + +/* The user may not realize that ps/bta with no parameters does not print idle + * or sleeping system daemon processes, so tell them how many were suppressed. + */ +void kdb_ps_suppressed(void) +{ + int idle = 0, daemon = 0; + unsigned long mask_I = kdb_task_state_string("I"), + mask_M = kdb_task_state_string("M"); + unsigned long cpu; + const struct task_struct *p, *g; + for_each_online_cpu(cpu) { + p = kdb_curr_task(cpu); + if (kdb_task_state(p, mask_I)) + ++idle; + } + kdb_do_each_thread(g, p) { + if (kdb_task_state(p, mask_M)) + ++daemon; + } kdb_while_each_thread(g, p); + if (idle || daemon) { + if (idle) + kdb_printf("%d idle process%s (state I)%s\n", + idle, idle == 1 ? "" : "es", + daemon ? " and " : ""); + if (daemon) + kdb_printf("%d sleeping system daemon (state M) " + "process%s", daemon, + daemon == 1 ? "" : "es"); + kdb_printf(" suppressed,\nuse 'ps A' to see all.\n"); + } +} + +/* + * kdb_ps - This function implements the 'ps' command which shows a + * list of the active processes. + * ps [DRSTCZEUIMA] All processes, optionally filtered by state + */ +void kdb_ps1(const struct task_struct *p) +{ + int cpu; + unsigned long tmp; + + if (!p || probe_kernel_read(&tmp, (char *)p, sizeof(unsigned long))) + return; + + cpu = kdb_process_cpu(p); + kdb_printf("0x%p %8d %8d %d %4d %c 0x%p %c%s\n", + (void *)p, p->pid, p->parent->pid, + kdb_task_has_cpu(p), kdb_process_cpu(p), + kdb_task_state_char(p), + (void *)(&p->thread), + p == kdb_curr_task(raw_smp_processor_id()) ? '*' : ' ', + p->comm); + if (kdb_task_has_cpu(p)) { + if (!KDB_TSK(cpu)) { + kdb_printf(" Error: no saved data for this cpu\n"); + } else { + if (KDB_TSK(cpu) != p) + kdb_printf(" Error: does not match running " + "process table (0x%p)\n", KDB_TSK(cpu)); + } + } +} + +static int kdb_ps(int argc, const char **argv) +{ + struct task_struct *g, *p; + unsigned long mask, cpu; + + if (argc == 0) + kdb_ps_suppressed(); + kdb_printf("%-*s Pid Parent [*] cpu State %-*s Command\n", + (int)(2*sizeof(void *))+2, "Task Addr", + (int)(2*sizeof(void *))+2, "Thread"); + mask = kdb_task_state_string(argc ? argv[1] : NULL); + /* Run the active tasks first */ + for_each_online_cpu(cpu) { + if (KDB_FLAG(CMD_INTERRUPT)) + return 0; + p = kdb_curr_task(cpu); + if (kdb_task_state(p, mask)) + kdb_ps1(p); + } + kdb_printf("\n"); + /* Now the real tasks */ + kdb_do_each_thread(g, p) { + if (KDB_FLAG(CMD_INTERRUPT)) + return 0; + if (kdb_task_state(p, mask)) + kdb_ps1(p); + } kdb_while_each_thread(g, p); + + return 0; +} + +/* + * kdb_pid - This function implements the 'pid' command which switches + * the currently active process. + * pid [<pid> | R] + */ +static int kdb_pid(int argc, const char **argv) +{ + struct task_struct *p; + unsigned long val; + int diag; + + if (argc > 1) + return KDB_ARGCOUNT; + + if (argc) { + if (strcmp(argv[1], "R") == 0) { + p = KDB_TSK(kdb_initial_cpu); + } else { + diag = kdbgetularg(argv[1], &val); + if (diag) + return KDB_BADINT; + + p = find_task_by_pid_ns((pid_t)val, &init_pid_ns); + if (!p) { + kdb_printf("No task with pid=%d\n", (pid_t)val); + return 0; + } + } + kdb_set_current_task(p); + } + kdb_printf("KDB current process is %s(pid=%d)\n", + kdb_current_task->comm, + kdb_current_task->pid); + + return 0; +} + +/* + * kdb_ll - This function implements the 'll' command which follows a + * linked list and executes an arbitrary command for each + * element. + */ +static int kdb_ll(int argc, const char **argv) +{ + int diag; + unsigned long addr; + long offset = 0; + unsigned long va; + unsigned long linkoffset; + int nextarg; + const char *command; + + if (argc != 3) + return KDB_ARGCOUNT; + + nextarg = 1; + diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL); + if (diag) + return diag; + + diag = kdbgetularg(argv[2], &linkoffset); + if (diag) + return diag; + + /* + * Using the starting address as + * the first element in the list, and assuming that + * the list ends with a null pointer. + */ + + va = addr; + command = kdb_strdup(argv[3], GFP_KDB); + if (!command) { + kdb_printf("%s: cannot duplicate command\n", __func__); + return 0; + } + /* Recursive use of kdb_parse, do not use argv after this point */ + argv = NULL; + + while (va) { + char buf[80]; + + if (KDB_FLAG(CMD_INTERRUPT)) + return 0; + + sprintf(buf, "%s " kdb_machreg_fmt "\n", command, va); + diag = kdb_parse(buf); + if (diag) + return diag; + + addr = va + linkoffset; + if (kdb_getword(&va, addr, sizeof(va))) + return 0; + } + kfree(command); + + return 0; +} + +static int kdb_kgdb(int argc, const char **argv) +{ + return KDB_CMD_KGDB; +} + +/* + * kdb_help - This function implements the 'help' and '?' commands. + */ +static int kdb_help(int argc, const char **argv) +{ + kdbtab_t *kt; + int i; + + kdb_printf("%-15.15s %-20.20s %s\n", "Command", "Usage", "Description"); + kdb_printf("-----------------------------" + "-----------------------------\n"); + for_each_kdbcmd(kt, i) { + if (kt->cmd_name) + kdb_printf("%-15.15s %-20.20s %s\n", kt->cmd_name, + kt->cmd_usage, kt->cmd_help); + if (KDB_FLAG(CMD_INTERRUPT)) + return 0; + } + return 0; +} + +/* + * kdb_kill - This function implements the 'kill' commands. + */ +static int kdb_kill(int argc, const char **argv) +{ + long sig, pid; + char *endp; + struct task_struct *p; + struct siginfo info; + + if (argc != 2) + return KDB_ARGCOUNT; + + sig = simple_strtol(argv[1], &endp, 0); + if (*endp) + return KDB_BADINT; + if (sig >= 0) { + kdb_printf("Invalid signal parameter.<-signal>\n"); + return 0; + } + sig = -sig; + + pid = simple_strtol(argv[2], &endp, 0); + if (*endp) + return KDB_BADINT; + if (pid <= 0) { + kdb_printf("Process ID must be large than 0.\n"); + return 0; + } + + /* Find the process. */ + p = find_task_by_pid_ns(pid, &init_pid_ns); + if (!p) { + kdb_printf("The specified process isn't found.\n"); + return 0; + } + p = p->group_leader; + info.si_signo = sig; + info.si_errno = 0; + info.si_code = SI_USER; + info.si_pid = pid; /* same capabilities as process being signalled */ + info.si_uid = 0; /* kdb has root authority */ + kdb_send_sig_info(p, &info); + return 0; +} + +struct kdb_tm { + int tm_sec; /* seconds */ + int tm_min; /* minutes */ + int tm_hour; /* hours */ + int tm_mday; /* day of the month */ + int tm_mon; /* month */ + int tm_year; /* year */ +}; + +static void kdb_gmtime(struct timespec *tv, struct kdb_tm *tm) +{ + /* This will work from 1970-2099, 2100 is not a leap year */ + static int mon_day[] = { 31, 29, 31, 30, 31, 30, 31, + 31, 30, 31, 30, 31 }; + memset(tm, 0, sizeof(*tm)); + tm->tm_sec = tv->tv_sec % (24 * 60 * 60); + tm->tm_mday = tv->tv_sec / (24 * 60 * 60) + + (2 * 365 + 1); /* shift base from 1970 to 1968 */ + tm->tm_min = tm->tm_sec / 60 % 60; + tm->tm_hour = tm->tm_sec / 60 / 60; + tm->tm_sec = tm->tm_sec % 60; + tm->tm_year = 68 + 4*(tm->tm_mday / (4*365+1)); + tm->tm_mday %= (4*365+1); + mon_day[1] = 29; + while (tm->tm_mday >= mon_day[tm->tm_mon]) { + tm->tm_mday -= mon_day[tm->tm_mon]; + if (++tm->tm_mon == 12) { + tm->tm_mon = 0; + ++tm->tm_year; + mon_day[1] = 28; + } + } + ++tm->tm_mday; +} + +/* + * Most of this code has been lifted from kernel/timer.c::sys_sysinfo(). + * I cannot call that code directly from kdb, it has an unconditional + * cli()/sti() and calls routines that take locks which can stop the debugger. + */ +static void kdb_sysinfo(struct sysinfo *val) +{ + struct timespec uptime; + do_posix_clock_monotonic_gettime(&uptime); + memset(val, 0, sizeof(*val)); + val->uptime = uptime.tv_sec; + val->loads[0] = avenrun[0]; + val->loads[1] = avenrun[1]; + val->loads[2] = avenrun[2]; + val->procs = nr_threads-1; + si_meminfo(val); + + return; +} + +/* + * kdb_summary - This function implements the 'summary' command. + */ +static int kdb_summary(int argc, const char **argv) +{ + struct kdb_tm tm; + struct sysinfo val; + + if (argc) + return KDB_ARGCOUNT; + + kdb_printf("sysname %s\n", init_uts_ns.name.sysname); + kdb_printf("release %s\n", init_uts_ns.name.release); + kdb_printf("version %s\n", init_uts_ns.name.version); + kdb_printf("machine %s\n", init_uts_ns.name.machine); + kdb_printf("nodename %s\n", init_uts_ns.name.nodename); + kdb_printf("domainname %s\n", init_uts_ns.name.domainname); + kdb_printf("ccversion %s\n", __stringify(CCVERSION)); + + kdb_gmtime(&xtime, &tm); + kdb_printf("date %04d-%02d-%02d %02d:%02d:%02d " + "tz_minuteswest %d\n", + 1900+tm.tm_year, tm.tm_mon+1, tm.tm_mday, + tm.tm_hour, tm.tm_min, tm.tm_sec, + sys_tz.tz_minuteswest); + + kdb_sysinfo(&val); + kdb_printf("uptime "); + if (val.uptime > (24*60*60)) { + int days = val.uptime / (24*60*60); + val.uptime %= (24*60*60); + kdb_printf("%d day%s ", days, days == 1 ? "" : "s"); + } + kdb_printf("%02ld:%02ld\n", val.uptime/(60*60), (val.uptime/60)%60); + + /* lifted from fs/proc/proc_misc.c::loadavg_read_proc() */ + +#define LOAD_INT(x) ((x) >> FSHIFT) +#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100) + kdb_printf("load avg %ld.%02ld %ld.%02ld %ld.%02ld\n", + LOAD_INT(val.loads[0]), LOAD_FRAC(val.loads[0]), + LOAD_INT(val.loads[1]), LOAD_FRAC(val.loads[1]), + LOAD_INT(val.loads[2]), LOAD_FRAC(val.loads[2])); +#undef LOAD_INT +#undef LOAD_FRAC + /* Display in kilobytes */ +#define K(x) ((x) << (PAGE_SHIFT - 10)) + kdb_printf("\nMemTotal: %8lu kB\nMemFree: %8lu kB\n" + "Buffers: %8lu kB\n", + val.totalram, val.freeram, val.bufferram); + return 0; +} + +/* + * kdb_per_cpu - This function implements the 'per_cpu' command. + */ +static int kdb_per_cpu(int argc, const char **argv) +{ + char buf[256], fmtstr[64]; + kdb_symtab_t symtab; + cpumask_t suppress = CPU_MASK_NONE; + int cpu, diag; + unsigned long addr, val, bytesperword = 0, whichcpu = ~0UL; + + if (argc < 1 || argc > 3) + return KDB_ARGCOUNT; + + snprintf(buf, sizeof(buf), "per_cpu__%s", argv[1]); + if (!kdbgetsymval(buf, &symtab)) { + kdb_printf("%s is not a per_cpu variable\n", argv[1]); + return KDB_BADADDR; + } + if (argc >= 2) { + diag = kdbgetularg(argv[2], &bytesperword); + if (diag) + return diag; + } + if (!bytesperword) + bytesperword = KDB_WORD_SIZE; + else if (bytesperword > KDB_WORD_SIZE) + return KDB_BADWIDTH; + sprintf(fmtstr, "%%0%dlx ", (int)(2*bytesperword)); + if (argc >= 3) { + diag = kdbgetularg(argv[3], &whichcpu); + if (diag) + return diag; + if (!cpu_online(whichcpu)) { + kdb_printf("cpu %ld is not online\n", whichcpu); + return KDB_BADCPUNUM; + } + } + + /* Most architectures use __per_cpu_offset[cpu], some use + * __per_cpu_offset(cpu), smp has no __per_cpu_offset. + */ +#ifdef __per_cpu_offset +#define KDB_PCU(cpu) __per_cpu_offset(cpu) +#else +#ifdef CONFIG_SMP +#define KDB_PCU(cpu) __per_cpu_offset[cpu] +#else +#define KDB_PCU(cpu) 0 +#endif +#endif + + for_each_online_cpu(cpu) { + if (whichcpu != ~0UL && whichcpu != cpu) + continue; + addr = symtab.sym_start + KDB_PCU(cpu); + diag = kdb_getword(&val, addr, bytesperword); + if (diag) { + kdb_printf("%5d " kdb_bfd_vma_fmt0 " - unable to " + "read, diag=%d\n", cpu, addr, diag); + continue; + } +#ifdef CONFIG_SMP + if (!val) { + cpu_set(cpu, suppress); + continue; + } +#endif /* CONFIG_SMP */ + kdb_printf("%5d ", cpu); + kdb_md_line(fmtstr, addr, + bytesperword == KDB_WORD_SIZE, + 1, bytesperword, 1, 1, 0); + } + if (cpus_weight(suppress) == 0) + return 0; + kdb_printf("Zero suppressed cpu(s):"); + for (cpu = first_cpu(suppress); cpu < num_possible_cpus(); + cpu = next_cpu(cpu, suppress)) { + kdb_printf(" %d", cpu); + if (cpu == num_possible_cpus() - 1 || + next_cpu(cpu, suppress) != cpu + 1) + continue; + while (cpu < num_possible_cpus() && + next_cpu(cpu, suppress) == cpu + 1) + ++cpu; + kdb_printf("-%d", cpu); + } + kdb_printf("\n"); + +#undef KDB_PCU + + return 0; +} + +/* + * display help for the use of cmd | grep pattern + */ +static int kdb_grep_help(int argc, const char **argv) +{ + kdb_printf("Usage of cmd args | grep pattern:\n"); + kdb_printf(" Any command's output may be filtered through an "); + kdb_printf("emulated 'pipe'.\n"); + kdb_printf(" 'grep' is just a key word.\n"); + kdb_printf(" The pattern may include a very limited set of " + "metacharacters:\n"); + kdb_printf(" pattern or ^pattern or pattern$ or ^pattern$\n"); + kdb_printf(" And if there are spaces in the pattern, you may " + "quote it:\n"); + kdb_printf(" \"pat tern\" or \"^pat tern\" or \"pat tern$\"" + " or \"^pat tern$\"\n"); + return 0; +} + +/* + * kdb_register_repeat - This function is used to register a kernel + * debugger command. + * Inputs: + * cmd Command name + * func Function to execute the command + * usage A simple usage string showing arguments + * help A simple help string describing command + * repeat Does the command auto repeat on enter? + * Returns: + * zero for success, one if a duplicate command. + */ +#define kdb_command_extend 50 /* arbitrary */ +int kdb_register_repeat(char *cmd, + kdb_func_t func, + char *usage, + char *help, + short minlen, + kdb_repeat_t repeat) +{ + int i; + kdbtab_t *kp; + + /* + * Brute force method to determine duplicates + */ + for_each_kdbcmd(kp, i) { + if (kp->cmd_name && (strcmp(kp->cmd_name, cmd) == 0)) { + kdb_printf("Duplicate kdb command registered: " + "%s, func %p help %s\n", cmd, func, help); + return 1; + } + } + + /* + * Insert command into first available location in table + */ + for_each_kdbcmd(kp, i) { + if (kp->cmd_name == NULL) + break; + } + + if (i >= kdb_max_commands) { + kdbtab_t *new = kmalloc((kdb_max_commands - KDB_BASE_CMD_MAX + + kdb_command_extend) * sizeof(*new), GFP_KDB); + if (!new) { + kdb_printf("Could not allocate new kdb_command " + "table\n"); + return 1; + } + if (kdb_commands) { + memcpy(new, kdb_commands, + kdb_max_commands * sizeof(*new)); + kfree(kdb_commands); + } + memset(new + kdb_max_commands, 0, + kdb_command_extend * sizeof(*new)); + kdb_commands = new; + kp = kdb_commands + kdb_max_commands; + kdb_max_commands += kdb_command_extend; + } + + kp->cmd_name = cmd; + kp->cmd_func = func; + kp->cmd_usage = usage; + kp->cmd_help = help; + kp->cmd_flags = 0; + kp->cmd_minlen = minlen; + kp->cmd_repeat = repeat; + + return 0; +} + +/* + * kdb_register - Compatibility register function for commands that do + * not need to specify a repeat state. Equivalent to + * kdb_register_repeat with KDB_REPEAT_NONE. + * Inputs: + * cmd Command name + * func Function to execute the command + * usage A simple usage string showing arguments + * help A simple help string describing command + * Returns: + * zero for success, one if a duplicate command. + */ +int kdb_register(char *cmd, + kdb_func_t func, + char *usage, + char *help, + short minlen) +{ + return kdb_register_repeat(cmd, func, usage, help, minlen, + KDB_REPEAT_NONE); +} + +/* + * kdb_unregister - This function is used to unregister a kernel + * debugger command. It is generally called when a module which + * implements kdb commands is unloaded. + * Inputs: + * cmd Command name + * Returns: + * zero for success, one command not registered. + */ +int kdb_unregister(char *cmd) +{ + int i; + kdbtab_t *kp; + + /* + * find the command. + */ + for (i = 0, kp = kdb_commands; i < kdb_max_commands; i++, kp++) { + if (kp->cmd_name && (strcmp(kp->cmd_name, cmd) == 0)) { + kp->cmd_name = NULL; + return 0; + } + } + + /* Couldn't find it. */ + return 1; +} + +/* Initialize the kdb command table. */ +static void __init kdb_inittab(void) +{ + int i; + kdbtab_t *kp; + + for_each_kdbcmd(kp, i) + kp->cmd_name = NULL; + + kdb_register_repeat("md", kdb_md, "<vaddr>", + "Display Memory Contents, also mdWcN, e.g. md8c1", 1, + KDB_REPEAT_NO_ARGS); + kdb_register_repeat("mdr", kdb_md, "<vaddr> <bytes>", + "Display Raw Memory", 0, KDB_REPEAT_NO_ARGS); + kdb_register_repeat("mdp", kdb_md, "<paddr> <bytes>", + "Display Physical Memory", 0, KDB_REPEAT_NO_ARGS); + kdb_register_repeat("mds", kdb_md, "<vaddr>", + "Display Memory Symbolically", 0, KDB_REPEAT_NO_ARGS); + kdb_register_repeat("mm", kdb_mm, "<vaddr> <contents>", + "Modify Memory Contents", 0, KDB_REPEAT_NO_ARGS); + kdb_register_repeat("go", kdb_go, "[<vaddr>]", + "Continue Execution", 1, KDB_REPEAT_NONE); + kdb_register_repeat("rd", kdb_rd, "", + "Display Registers", 0, KDB_REPEAT_NONE); + kdb_register_repeat("rm", kdb_rm, "<reg> <contents>", + "Modify Registers", 0, KDB_REPEAT_NONE); + kdb_register_repeat("ef", kdb_ef, "<vaddr>", + "Display exception frame", 0, KDB_REPEAT_NONE); + kdb_register_repeat("bt", kdb_bt, "[<vaddr>]", + "Stack traceback", 1, KDB_REPEAT_NONE); + kdb_register_repeat("btp", kdb_bt, "<pid>", + "Display stack for process <pid>", 0, KDB_REPEAT_NONE); + kdb_register_repeat("bta", kdb_bt, "[DRSTCZEUIMA]", + "Display stack all processes", 0, KDB_REPEAT_NONE); + kdb_register_repeat("btc", kdb_bt, "", + "Backtrace current process on each cpu", 0, KDB_REPEAT_NONE); + kdb_register_repeat("btt", kdb_bt, "<vaddr>", + "Backtrace process given its struct task address", 0, + KDB_REPEAT_NONE); + kdb_register_repeat("ll", kdb_ll, "<first-element> <linkoffset> <cmd>", + "Execute cmd for each element in linked list", 0, KDB_REPEAT_NONE); + kdb_register_repeat("env", kdb_env, "", + "Show environment variables", 0, KDB_REPEAT_NONE); + kdb_register_repeat("set", kdb_set, "", + "Set environment variables", 0, KDB_REPEAT_NONE); + kdb_register_repeat("help", kdb_help, "", + "Display Help Message", 1, KDB_REPEAT_NONE); + kdb_register_repeat("?", kdb_help, "", + "Display Help Message", 0, KDB_REPEAT_NONE); + kdb_register_repeat("cpu", kdb_cpu, "<cpunum>", + "Switch to new cpu", 0, KDB_REPEAT_NONE); + kdb_register_repeat("kgdb", kdb_kgdb, "", + "Enter kgdb mode", 0, KDB_REPEAT_NONE); + kdb_register_repeat("ps", kdb_ps, "[<flags>|A]", + "Display active task list", 0, KDB_REPEAT_NONE); + kdb_register_repeat("pid", kdb_pid, "<pidnum>", + "Switch to another task", 0, KDB_REPEAT_NONE); + kdb_register_repeat("reboot", kdb_reboot, "", + "Reboot the machine immediately", 0, KDB_REPEAT_NONE); +#if defined(CONFIG_MODULES) + kdb_register_repeat("lsmod", kdb_lsmod, "", + "List loaded kernel modules", 0, KDB_REPEAT_NONE); +#endif +#if defined(CONFIG_MAGIC_SYSRQ) + kdb_register_repeat("sr", kdb_sr, "<key>", + "Magic SysRq key", 0, KDB_REPEAT_NONE); +#endif +#if defined(CONFIG_PRINTK) + kdb_register_repeat("dmesg", kdb_dmesg, "[lines]", + "Display syslog buffer", 0, KDB_REPEAT_NONE); +#endif + kdb_register_repeat("defcmd", kdb_defcmd, "name \"usage\" \"help\"", + "Define a set of commands, down to endefcmd", 0, KDB_REPEAT_NONE); + kdb_register_repeat("kill", kdb_kill, "<-signal> <pid>", + "Send a signal to a process", 0, KDB_REPEAT_NONE); + kdb_register_repeat("summary", kdb_summary, "", + "Summarize the system", 4, KDB_REPEAT_NONE); + kdb_register_repeat("per_cpu", kdb_per_cpu, "", + "Display per_cpu variables", 3, KDB_REPEAT_NONE); + kdb_register_repeat("grephelp", kdb_grep_help, "", + "Display help on | grep", 0, KDB_REPEAT_NONE); +} + +/* Execute any commands defined in kdb_cmds. */ +static void __init kdb_cmd_init(void) +{ + int i, diag; + for (i = 0; kdb_cmds[i]; ++i) { + diag = kdb_parse(kdb_cmds[i]); + if (diag) + kdb_printf("kdb command %s failed, kdb diag %d\n", + kdb_cmds[i], diag); + } + if (defcmd_in_progress) { + kdb_printf("Incomplete 'defcmd' set, forcing endefcmd\n"); + kdb_parse("endefcmd"); + } +} + +/* Intialize kdb_printf, breakpoint tables and kdb state */ +void __init kdb_init(int lvl) +{ + static int kdb_init_lvl = KDB_NOT_INITIALIZED; + int i; + + if (kdb_init_lvl == KDB_INIT_FULL || lvl <= kdb_init_lvl) + return; + for (i = kdb_init_lvl; i < lvl; i++) { + switch (i) { + case KDB_NOT_INITIALIZED: + kdb_inittab(); /* Initialize Command Table */ + kdb_initbptab(); /* Initialize Breakpoints */ + break; + case KDB_INIT_EARLY: + kdb_cmd_init(); /* Build kdb_cmds tables */ + break; + } + } + kdb_init_lvl = lvl; +} diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h new file mode 100644 index 0000000..97d3ba6 --- /dev/null +++ b/kernel/debug/kdb/kdb_private.h @@ -0,0 +1,300 @@ +#ifndef _KDBPRIVATE_H +#define _KDBPRIVATE_H + +/* + * Kernel Debugger Architecture Independent Private Headers + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved. + * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved. + */ + +#include <linux/kgdb.h> +#include "../debug_core.h" + +/* Kernel Debugger Error codes. Must not overlap with command codes. */ +#define KDB_NOTFOUND (-1) +#define KDB_ARGCOUNT (-2) +#define KDB_BADWIDTH (-3) +#define KDB_BADRADIX (-4) +#define KDB_NOTENV (-5) +#define KDB_NOENVVALUE (-6) +#define KDB_NOTIMP (-7) +#define KDB_ENVFULL (-8) +#define KDB_ENVBUFFULL (-9) +#define KDB_TOOMANYBPT (-10) +#define KDB_TOOMANYDBREGS (-11) +#define KDB_DUPBPT (-12) +#define KDB_BPTNOTFOUND (-13) +#define KDB_BADMODE (-14) +#define KDB_BADINT (-15) +#define KDB_INVADDRFMT (-16) +#define KDB_BADREG (-17) +#define KDB_BADCPUNUM (-18) +#define KDB_BADLENGTH (-19) +#define KDB_NOBP (-20) +#define KDB_BADADDR (-21) + +/* Kernel Debugger Command codes. Must not overlap with error codes. */ +#define KDB_CMD_GO (-1001) +#define KDB_CMD_CPU (-1002) +#define KDB_CMD_SS (-1003) +#define KDB_CMD_SSB (-1004) +#define KDB_CMD_KGDB (-1005) +#define KDB_CMD_KGDB2 (-1006) + +/* Internal debug flags */ +#define KDB_DEBUG_FLAG_BP 0x0002 /* Breakpoint subsystem debug */ +#define KDB_DEBUG_FLAG_BB_SUMM 0x0004 /* Basic block analysis, summary only */ +#define KDB_DEBUG_FLAG_AR 0x0008 /* Activation record, generic */ +#define KDB_DEBUG_FLAG_ARA 0x0010 /* Activation record, arch specific */ +#define KDB_DEBUG_FLAG_BB 0x0020 /* All basic block analysis */ +#define KDB_DEBUG_FLAG_STATE 0x0040 /* State flags */ +#define KDB_DEBUG_FLAG_MASK 0xffff /* All debug flags */ +#define KDB_DEBUG_FLAG_SHIFT 16 /* Shift factor for dbflags */ + +#define KDB_DEBUG(flag) (kdb_flags & \ + (KDB_DEBUG_FLAG_##flag << KDB_DEBUG_FLAG_SHIFT)) +#define KDB_DEBUG_STATE(text, value) if (KDB_DEBUG(STATE)) \ + kdb_print_state(text, value) + +#if BITS_PER_LONG == 32 + +#define KDB_PLATFORM_ENV "BYTESPERWORD=4" + +#define kdb_machreg_fmt "0x%lx" +#define kdb_machreg_fmt0 "0x%08lx" +#define kdb_bfd_vma_fmt "0x%lx" +#define kdb_bfd_vma_fmt0 "0x%08lx" +#define kdb_elfw_addr_fmt "0x%x" +#define kdb_elfw_addr_fmt0 "0x%08x" +#define kdb_f_count_fmt "%d" + +#elif BITS_PER_LONG == 64 + +#define KDB_PLATFORM_ENV "BYTESPERWORD=8" + +#define kdb_machreg_fmt "0x%lx" +#define kdb_machreg_fmt0 "0x%016lx" +#define kdb_bfd_vma_fmt "0x%lx" +#define kdb_bfd_vma_fmt0 "0x%016lx" +#define kdb_elfw_addr_fmt "0x%x" +#define kdb_elfw_addr_fmt0 "0x%016x" +#define kdb_f_count_fmt "%ld" + +#endif + +/* + * KDB_MAXBPT describes the total number of breakpoints + * supported by this architecure. + */ +#define KDB_MAXBPT 16 + +/* Maximum number of arguments to a function */ +#define KDB_MAXARGS 16 + +typedef enum { + KDB_REPEAT_NONE = 0, /* Do not repeat this command */ + KDB_REPEAT_NO_ARGS, /* Repeat the command without arguments */ + KDB_REPEAT_WITH_ARGS, /* Repeat the command including its arguments */ +} kdb_repeat_t; + +typedef int (*kdb_func_t)(int, const char **); + +/* Symbol table format returned by kallsyms. */ +typedef struct __ksymtab { + unsigned long value; /* Address of symbol */ + const char *mod_name; /* Module containing symbol or + * "kernel" */ + unsigned long mod_start; + unsigned long mod_end; + const char *sec_name; /* Section containing symbol */ + unsigned long sec_start; + unsigned long sec_end; + const char *sym_name; /* Full symbol name, including + * any version */ + unsigned long sym_start; + unsigned long sym_end; + } kdb_symtab_t; +extern int kallsyms_symbol_next(char *prefix_name, int flag); +extern int kallsyms_symbol_complete(char *prefix_name, int max_len); + +/* Exported Symbols for kernel loadable modules to use. */ +extern int kdb_register(char *, kdb_func_t, char *, char *, short); +extern int kdb_register_repeat(char *, kdb_func_t, char *, char *, + short, kdb_repeat_t); +extern int kdb_unregister(char *); + +extern int kdb_getarea_size(void *, unsigned long, size_t); +extern int kdb_putarea_size(unsigned long, void *, size_t); + +/* + * Like get_user and put_user, kdb_getarea and kdb_putarea take variable + * names, not pointers. The underlying *_size functions take pointers. + */ +#define kdb_getarea(x, addr) kdb_getarea_size(&(x), addr, sizeof((x))) +#define kdb_putarea(addr, x) kdb_putarea_size(addr, &(x), sizeof((x))) + +extern int kdb_getphysword(unsigned long *word, + unsigned long addr, size_t size); +extern int kdb_getword(unsigned long *, unsigned long, size_t); +extern int kdb_putword(unsigned long, unsigned long, size_t); + +extern int kdbgetularg(const char *, unsigned long *); +extern int kdb_set(int, const char **); +extern char *kdbgetenv(const char *); +extern int kdbgetintenv(const char *, int *); +extern int kdbgetaddrarg(int, const char **, int*, unsigned long *, + long *, char **); +extern int kdbgetsymval(const char *, kdb_symtab_t *); +extern int kdbnearsym(unsigned long, kdb_symtab_t *); +extern void kdbnearsym_cleanup(void); +extern char *kdb_strdup(const char *str, gfp_t type); +extern void kdb_symbol_print(unsigned long, const kdb_symtab_t *, unsigned int); + +/* Routine for debugging the debugger state. */ +extern void kdb_print_state(const char *, int); + +extern int kdb_state; +#define KDB_STATE_KDB 0x00000001 /* Cpu is inside kdb */ +#define KDB_STATE_LEAVING 0x00000002 /* Cpu is leaving kdb */ +#define KDB_STATE_CMD 0x00000004 /* Running a kdb command */ +#define KDB_STATE_KDB_CONTROL 0x00000008 /* This cpu is under + * kdb control */ +#define KDB_STATE_HOLD_CPU 0x00000010 /* Hold this cpu inside kdb */ +#define KDB_STATE_DOING_SS 0x00000020 /* Doing ss command */ +#define KDB_STATE_DOING_SSB 0x00000040 /* Doing ssb command, + * DOING_SS is also set */ +#define KDB_STATE_SSBPT 0x00000080 /* Install breakpoint + * after one ss, independent of + * DOING_SS */ +#define KDB_STATE_REENTRY 0x00000100 /* Valid re-entry into kdb */ +#define KDB_STATE_SUPPRESS 0x00000200 /* Suppress error messages */ +#define KDB_STATE_PAGER 0x00000400 /* pager is available */ +#define KDB_STATE_GO_SWITCH 0x00000800 /* go is switching + * back to initial cpu */ +#define KDB_STATE_PRINTF_LOCK 0x00001000 /* Holds kdb_printf lock */ +#define KDB_STATE_WAIT_IPI 0x00002000 /* Waiting for kdb_ipi() NMI */ +#define KDB_STATE_RECURSE 0x00004000 /* Recursive entry to kdb */ +#define KDB_STATE_IP_ADJUSTED 0x00008000 /* Restart IP has been + * adjusted */ +#define KDB_STATE_GO1 0x00010000 /* go only releases one cpu */ +#define KDB_STATE_KEYBOARD 0x00020000 /* kdb entered via + * keyboard on this cpu */ +#define KDB_STATE_KEXEC 0x00040000 /* kexec issued */ +#define KDB_STATE_DOING_KGDB 0x00080000 /* kgdb enter now issued */ +#define KDB_STATE_DOING_KGDB2 0x00100000 /* kgdb enter now issued */ +#define KDB_STATE_KGDB_TRANS 0x00200000 /* Transition to kgdb */ +#define KDB_STATE_ARCH 0xff000000 /* Reserved for arch + * specific use */ + +#define KDB_STATE(flag) (kdb_state & KDB_STATE_##flag) +#define KDB_STATE_SET(flag) ((void)(kdb_state |= KDB_STATE_##flag)) +#define KDB_STATE_CLEAR(flag) ((void)(kdb_state &= ~KDB_STATE_##flag)) + +extern int kdb_nextline; /* Current number of lines displayed */ + +typedef struct _kdb_bp { + unsigned long bp_addr; /* Address breakpoint is present at */ + unsigned int bp_free:1; /* This entry is available */ + unsigned int bp_enabled:1; /* Breakpoint is active in register */ + unsigned int bp_type:4; /* Uses hardware register */ + unsigned int bp_installed:1; /* Breakpoint is installed */ + unsigned int bp_delay:1; /* Do delayed bp handling */ + unsigned int bp_delayed:1; /* Delayed breakpoint */ + unsigned int bph_length; /* HW break length */ +} kdb_bp_t; + +#ifdef CONFIG_KGDB_KDB +extern kdb_bp_t kdb_breakpoints[/* KDB_MAXBPT */]; + +/* The KDB shell command table */ +typedef struct _kdbtab { + char *cmd_name; /* Command name */ + kdb_func_t cmd_func; /* Function to execute command */ + char *cmd_usage; /* Usage String for this command */ + char *cmd_help; /* Help message for this command */ + short cmd_flags; /* Parsing flags */ + short cmd_minlen; /* Minimum legal # command + * chars required */ + kdb_repeat_t cmd_repeat; /* Does command auto repeat on enter? */ +} kdbtab_t; + +extern int kdb_bt(int, const char **); /* KDB display back trace */ + +/* KDB breakpoint management functions */ +extern void kdb_initbptab(void); +extern void kdb_bp_install(struct pt_regs *); +extern void kdb_bp_remove(void); + +typedef enum { + KDB_DB_BPT, /* Breakpoint */ + KDB_DB_SS, /* Single-step trap */ + KDB_DB_SSB, /* Single step to branch */ + KDB_DB_SSBPT, /* Single step over breakpoint */ + KDB_DB_NOBPT /* Spurious breakpoint */ +} kdb_dbtrap_t; + +extern int kdb_main_loop(kdb_reason_t, kdb_reason_t, + int, kdb_dbtrap_t, struct pt_regs *); + +/* Miscellaneous functions and data areas */ +extern int kdb_grepping_flag; +extern char kdb_grep_string[]; +extern int kdb_grep_leading; +extern int kdb_grep_trailing; +extern char *kdb_cmds[]; +extern void kdb_syslog_data(char *syslog_data[]); +extern unsigned long kdb_task_state_string(const char *); +extern char kdb_task_state_char (const struct task_struct *); +extern unsigned long kdb_task_state(const struct task_struct *p, + unsigned long mask); +extern void kdb_ps_suppressed(void); +extern void kdb_ps1(const struct task_struct *p); +extern void kdb_print_nameval(const char *name, unsigned long val); +extern void kdb_send_sig_info(struct task_struct *p, struct siginfo *info); +extern void kdb_meminfo_proc_show(void); +extern const char *kdb_walk_kallsyms(loff_t *pos); +extern char *kdb_getstr(char *, size_t, char *); + +/* Defines for kdb_symbol_print */ +#define KDB_SP_SPACEB 0x0001 /* Space before string */ +#define KDB_SP_SPACEA 0x0002 /* Space after string */ +#define KDB_SP_PAREN 0x0004 /* Parenthesis around string */ +#define KDB_SP_VALUE 0x0008 /* Print the value of the address */ +#define KDB_SP_SYMSIZE 0x0010 /* Print the size of the symbol */ +#define KDB_SP_NEWLINE 0x0020 /* Newline after string */ +#define KDB_SP_DEFAULT (KDB_SP_VALUE|KDB_SP_PAREN) + +#define KDB_TSK(cpu) kgdb_info[cpu].task +#define KDB_TSKREGS(cpu) kgdb_info[cpu].debuggerinfo + +extern struct task_struct *kdb_curr_task(int); + +#define kdb_task_has_cpu(p) (task_curr(p)) + +/* Simplify coexistence with NPTL */ +#define kdb_do_each_thread(g, p) do_each_thread(g, p) +#define kdb_while_each_thread(g, p) while_each_thread(g, p) + +#define GFP_KDB (in_interrupt() ? GFP_ATOMIC : GFP_KERNEL) + +extern void *debug_kmalloc(size_t size, gfp_t flags); +extern void debug_kfree(void *); +extern void debug_kusage(void); + +extern void kdb_set_current_task(struct task_struct *); +extern struct task_struct *kdb_current_task; +#ifdef CONFIG_MODULES +extern struct list_head *kdb_modules; +#endif /* CONFIG_MODULES */ + +extern char kdb_prompt_str[]; + +#define KDB_WORD_SIZE ((int)sizeof(unsigned long)) + +#endif /* CONFIG_KGDB_KDB */ +#endif /* !_KDBPRIVATE_H */ diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c new file mode 100644 index 0000000..45344d5c --- /dev/null +++ b/kernel/debug/kdb/kdb_support.c @@ -0,0 +1,927 @@ +/* + * Kernel Debugger Architecture Independent Support Functions + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Copyright (c) 1999-2004 Silicon Graphics, Inc. All Rights Reserved. + * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved. + * 03/02/13 added new 2.5 kallsyms <xavier.bru@bull.net> + */ + +#include <stdarg.h> +#include <linux/types.h> +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/kallsyms.h> +#include <linux/stddef.h> +#include <linux/vmalloc.h> +#include <linux/ptrace.h> +#include <linux/module.h> +#include <linux/highmem.h> +#include <linux/hardirq.h> +#include <linux/delay.h> +#include <linux/uaccess.h> +#include <linux/kdb.h> +#include <linux/slab.h> +#include "kdb_private.h" + +/* + * kdbgetsymval - Return the address of the given symbol. + * + * Parameters: + * symname Character string containing symbol name + * symtab Structure to receive results + * Returns: + * 0 Symbol not found, symtab zero filled + * 1 Symbol mapped to module/symbol/section, data in symtab + */ +int kdbgetsymval(const char *symname, kdb_symtab_t *symtab) +{ + if (KDB_DEBUG(AR)) + kdb_printf("kdbgetsymval: symname=%s, symtab=%p\n", symname, + symtab); + memset(symtab, 0, sizeof(*symtab)); + symtab->sym_start = kallsyms_lookup_name(symname); + if (symtab->sym_start) { + if (KDB_DEBUG(AR)) + kdb_printf("kdbgetsymval: returns 1, " + "symtab->sym_start=0x%lx\n", + symtab->sym_start); + return 1; + } + if (KDB_DEBUG(AR)) + kdb_printf("kdbgetsymval: returns 0\n"); + return 0; +} +EXPORT_SYMBOL(kdbgetsymval); + +static char *kdb_name_table[100]; /* arbitrary size */ + +/* + * kdbnearsym - Return the name of the symbol with the nearest address + * less than 'addr'. + * + * Parameters: + * addr Address to check for symbol near + * symtab Structure to receive results + * Returns: + * 0 No sections contain this address, symtab zero filled + * 1 Address mapped to module/symbol/section, data in symtab + * Remarks: + * 2.6 kallsyms has a "feature" where it unpacks the name into a + * string. If that string is reused before the caller expects it + * then the caller sees its string change without warning. To + * avoid cluttering up the main kdb code with lots of kdb_strdup, + * tests and kfree calls, kdbnearsym maintains an LRU list of the + * last few unique strings. The list is sized large enough to + * hold active strings, no kdb caller of kdbnearsym makes more + * than ~20 later calls before using a saved value. + */ +int kdbnearsym(unsigned long addr, kdb_symtab_t *symtab) +{ + int ret = 0; + unsigned long symbolsize; + unsigned long offset; +#define knt1_size 128 /* must be >= kallsyms table size */ + char *knt1 = NULL; + + if (KDB_DEBUG(AR)) + kdb_printf("kdbnearsym: addr=0x%lx, symtab=%p\n", addr, symtab); + memset(symtab, 0, sizeof(*symtab)); + + if (addr < 4096) + goto out; + knt1 = debug_kmalloc(knt1_size, GFP_ATOMIC); + if (!knt1) { + kdb_printf("kdbnearsym: addr=0x%lx cannot kmalloc knt1\n", + addr); + goto out; + } + symtab->sym_name = kallsyms_lookup(addr, &symbolsize , &offset, + (char **)(&symtab->mod_name), knt1); + if (offset > 8*1024*1024) { + symtab->sym_name = NULL; + addr = offset = symbolsize = 0; + } + symtab->sym_start = addr - offset; + symtab->sym_end = symtab->sym_start + symbolsize; + ret = symtab->sym_name != NULL && *(symtab->sym_name) != '\0'; + + if (ret) { + int i; + /* Another 2.6 kallsyms "feature". Sometimes the sym_name is + * set but the buffer passed into kallsyms_lookup is not used, + * so it contains garbage. The caller has to work out which + * buffer needs to be saved. + * + * What was Rusty smoking when he wrote that code? + */ + if (symtab->sym_name != knt1) { + strncpy(knt1, symtab->sym_name, knt1_size); + knt1[knt1_size-1] = '\0'; + } + for (i = 0; i < ARRAY_SIZE(kdb_name_table); ++i) { + if (kdb_name_table[i] && + strcmp(kdb_name_table[i], knt1) == 0) + break; + } + if (i >= ARRAY_SIZE(kdb_name_table)) { + debug_kfree(kdb_name_table[0]); + memcpy(kdb_name_table, kdb_name_table+1, + sizeof(kdb_name_table[0]) * + (ARRAY_SIZE(kdb_name_table)-1)); + } else { + debug_kfree(knt1); + knt1 = kdb_name_table[i]; + memcpy(kdb_name_table+i, kdb_name_table+i+1, + sizeof(kdb_name_table[0]) * + (ARRAY_SIZE(kdb_name_table)-i-1)); + } + i = ARRAY_SIZE(kdb_name_table) - 1; + kdb_name_table[i] = knt1; + symtab->sym_name = kdb_name_table[i]; + knt1 = NULL; + } + + if (symtab->mod_name == NULL) + symtab->mod_name = "kernel"; + if (KDB_DEBUG(AR)) + kdb_printf("kdbnearsym: returns %d symtab->sym_start=0x%lx, " + "symtab->mod_name=%p, symtab->sym_name=%p (%s)\n", ret, + symtab->sym_start, symtab->mod_name, symtab->sym_name, + symtab->sym_name); + +out: + debug_kfree(knt1); + return ret; +} + +void kdbnearsym_cleanup(void) +{ + int i; + for (i = 0; i < ARRAY_SIZE(kdb_name_table); ++i) { + if (kdb_name_table[i]) { + debug_kfree(kdb_name_table[i]); + kdb_name_table[i] = NULL; + } + } +} + +static char ks_namebuf[KSYM_NAME_LEN+1], ks_namebuf_prev[KSYM_NAME_LEN+1]; + +/* + * kallsyms_symbol_complete + * + * Parameters: + * prefix_name prefix of a symbol name to lookup + * max_len maximum length that can be returned + * Returns: + * Number of symbols which match the given prefix. + * Notes: + * prefix_name is changed to contain the longest unique prefix that + * starts with this prefix (tab completion). + */ +int kallsyms_symbol_complete(char *prefix_name, int max_len) +{ + loff_t pos = 0; + int prefix_len = strlen(prefix_name), prev_len = 0; + int i, number = 0; + const char *name; + + while ((name = kdb_walk_kallsyms(&pos))) { + if (strncmp(name, prefix_name, prefix_len) == 0) { + strcpy(ks_namebuf, name); + /* Work out the longest name that matches the prefix */ + if (++number == 1) { + prev_len = min_t(int, max_len-1, + strlen(ks_namebuf)); + memcpy(ks_namebuf_prev, ks_namebuf, prev_len); + ks_namebuf_prev[prev_len] = '\0'; + continue; + } + for (i = 0; i < prev_len; i++) { + if (ks_namebuf[i] != ks_namebuf_prev[i]) { + prev_len = i; + ks_namebuf_prev[i] = '\0'; + break; + } + } + } + } + if (prev_len > prefix_len) + memcpy(prefix_name, ks_namebuf_prev, prev_len+1); + return number; +} + +/* + * kallsyms_symbol_next + * + * Parameters: + * prefix_name prefix of a symbol name to lookup + * flag 0 means search from the head, 1 means continue search. + * Returns: + * 1 if a symbol matches the given prefix. + * 0 if no string found + */ +int kallsyms_symbol_next(char *prefix_name, int flag) +{ + int prefix_len = strlen(prefix_name); + static loff_t pos; + const char *name; + + if (!flag) + pos = 0; + + while ((name = kdb_walk_kallsyms(&pos))) { + if (strncmp(name, prefix_name, prefix_len) == 0) { + strncpy(prefix_name, name, strlen(name)+1); + return 1; + } + } + return 0; +} + +/* + * kdb_symbol_print - Standard method for printing a symbol name and offset. + * Inputs: + * addr Address to be printed. + * symtab Address of symbol data, if NULL this routine does its + * own lookup. + * punc Punctuation for string, bit field. + * Remarks: + * The string and its punctuation is only printed if the address + * is inside the kernel, except that the value is always printed + * when requested. + */ +void kdb_symbol_print(unsigned long addr, const kdb_symtab_t *symtab_p, + unsigned int punc) +{ + kdb_symtab_t symtab, *symtab_p2; + if (symtab_p) { + symtab_p2 = (kdb_symtab_t *)symtab_p; + } else { + symtab_p2 = &symtab; + kdbnearsym(addr, symtab_p2); + } + if (!(symtab_p2->sym_name || (punc & KDB_SP_VALUE))) + return; + if (punc & KDB_SP_SPACEB) + kdb_printf(" "); + if (punc & KDB_SP_VALUE) + kdb_printf(kdb_machreg_fmt0, addr); + if (symtab_p2->sym_name) { + if (punc & KDB_SP_VALUE) + kdb_printf(" "); + if (punc & KDB_SP_PAREN) + kdb_printf("("); + if (strcmp(symtab_p2->mod_name, "kernel")) + kdb_printf("[%s]", symtab_p2->mod_name); + kdb_printf("%s", symtab_p2->sym_name); + if (addr != symtab_p2->sym_start) + kdb_printf("+0x%lx", addr - symtab_p2->sym_start); + if (punc & KDB_SP_SYMSIZE) + kdb_printf("/0x%lx", + symtab_p2->sym_end - symtab_p2->sym_start); + if (punc & KDB_SP_PAREN) + kdb_printf(")"); + } + if (punc & KDB_SP_SPACEA) + kdb_printf(" "); + if (punc & KDB_SP_NEWLINE) + kdb_printf("\n"); +} + +/* + * kdb_strdup - kdb equivalent of strdup, for disasm code. + * Inputs: + * str The string to duplicate. + * type Flags to kmalloc for the new string. + * Returns: + * Address of the new string, NULL if storage could not be allocated. + * Remarks: + * This is not in lib/string.c because it uses kmalloc which is not + * available when string.o is used in boot loaders. + */ +char *kdb_strdup(const char *str, gfp_t type) +{ + int n = strlen(str)+1; + char *s = kmalloc(n, type); + if (!s) + return NULL; + return strcpy(s, str); +} + +/* + * kdb_getarea_size - Read an area of data. The kdb equivalent of + * copy_from_user, with kdb messages for invalid addresses. + * Inputs: + * res Pointer to the area to receive the result. + * addr Address of the area to copy. + * size Size of the area. + * Returns: + * 0 for success, < 0 for error. + */ +int kdb_getarea_size(void *res, unsigned long addr, size_t size) +{ + int ret = probe_kernel_read((char *)res, (char *)addr, size); + if (ret) { + if (!KDB_STATE(SUPPRESS)) { + kdb_printf("kdb_getarea: Bad address 0x%lx\n", addr); + KDB_STATE_SET(SUPPRESS); + } + ret = KDB_BADADDR; + } else { + KDB_STATE_CLEAR(SUPPRESS); + } + return ret; +} + +/* + * kdb_putarea_size - Write an area of data. The kdb equivalent of + * copy_to_user, with kdb messages for invalid addresses. + * Inputs: + * addr Address of the area to write to. + * res Pointer to the area holding the data. + * size Size of the area. + * Returns: + * 0 for success, < 0 for error. + */ +int kdb_putarea_size(unsigned long addr, void *res, size_t size) +{ + int ret = probe_kernel_read((char *)addr, (char *)res, size); + if (ret) { + if (!KDB_STATE(SUPPRESS)) { + kdb_printf("kdb_putarea: Bad address 0x%lx\n", addr); + KDB_STATE_SET(SUPPRESS); + } + ret = KDB_BADADDR; + } else { + KDB_STATE_CLEAR(SUPPRESS); + } + return ret; +} + +/* + * kdb_getphys - Read data from a physical address. Validate the + * address is in range, use kmap_atomic() to get data + * similar to kdb_getarea() - but for phys addresses + * Inputs: + * res Pointer to the word to receive the result + * addr Physical address of the area to copy + * size Size of the area + * Returns: + * 0 for success, < 0 for error. + */ +static int kdb_getphys(void *res, unsigned long addr, size_t size) +{ + unsigned long pfn; + void *vaddr; + struct page *page; + + pfn = (addr >> PAGE_SHIFT); + if (!pfn_valid(pfn)) + return 1; + page = pfn_to_page(pfn); + vaddr = kmap_atomic(page, KM_KDB); + memcpy(res, vaddr + (addr & (PAGE_SIZE - 1)), size); + kunmap_atomic(vaddr, KM_KDB); + + return 0; +} + +/* + * kdb_getphysword + * Inputs: + * word Pointer to the word to receive the result. + * addr Address of the area to copy. + * size Size of the area. + * Returns: + * 0 for success, < 0 for error. + */ +int kdb_getphysword(unsigned long *word, unsigned long addr, size_t size) +{ + int diag; + __u8 w1; + __u16 w2; + __u32 w4; + __u64 w8; + *word = 0; /* Default value if addr or size is invalid */ + + switch (size) { + case 1: + diag = kdb_getphys(&w1, addr, sizeof(w1)); + if (!diag) + *word = w1; + break; + case 2: + diag = kdb_getphys(&w2, addr, sizeof(w2)); + if (!diag) + *word = w2; + break; + case 4: + diag = kdb_getphys(&w4, addr, sizeof(w4)); + if (!diag) + *word = w4; + break; + case 8: + if (size <= sizeof(*word)) { + diag = kdb_getphys(&w8, addr, sizeof(w8)); + if (!diag) + *word = w8; + break; + } + /* drop through */ + default: + diag = KDB_BADWIDTH; + kdb_printf("kdb_getphysword: bad width %ld\n", (long) size); + } + return diag; +} + +/* + * kdb_getword - Read a binary value. Unlike kdb_getarea, this treats + * data as numbers. + * Inputs: + * word Pointer to the word to receive the result. + * addr Address of the area to copy. + * size Size of the area. + * Returns: + * 0 for success, < 0 for error. + */ +int kdb_getword(unsigned long *word, unsigned long addr, size_t size) +{ + int diag; + __u8 w1; + __u16 w2; + __u32 w4; + __u64 w8; + *word = 0; /* Default value if addr or size is invalid */ + switch (size) { + case 1: + diag = kdb_getarea(w1, addr); + if (!diag) + *word = w1; + break; + case 2: + diag = kdb_getarea(w2, addr); + if (!diag) + *word = w2; + break; + case 4: + diag = kdb_getarea(w4, addr); + if (!diag) + *word = w4; + break; + case 8: + if (size <= sizeof(*word)) { + diag = kdb_getarea(w8, addr); + if (!diag) + *word = w8; + break; + } + /* drop through */ + default: + diag = KDB_BADWIDTH; + kdb_printf("kdb_getword: bad width %ld\n", (long) size); + } + return diag; +} + +/* + * kdb_putword - Write a binary value. Unlike kdb_putarea, this + * treats data as numbers. + * Inputs: + * addr Address of the area to write to.. + * word The value to set. + * size Size of the area. + * Returns: + * 0 for success, < 0 for error. + */ +int kdb_putword(unsigned long addr, unsigned long word, size_t size) +{ + int diag; + __u8 w1; + __u16 w2; + __u32 w4; + __u64 w8; + switch (size) { + case 1: + w1 = word; + diag = kdb_putarea(addr, w1); + break; + case 2: + w2 = word; + diag = kdb_putarea(addr, w2); + break; + case 4: + w4 = word; + diag = kdb_putarea(addr, w4); + break; + case 8: + if (size <= sizeof(word)) { + w8 = word; + diag = kdb_putarea(addr, w8); + break; + } + /* drop through */ + default: + diag = KDB_BADWIDTH; + kdb_printf("kdb_putword: bad width %ld\n", (long) size); + } + return diag; +} + +/* + * kdb_task_state_string - Convert a string containing any of the + * letters DRSTCZEUIMA to a mask for the process state field and + * return the value. If no argument is supplied, return the mask + * that corresponds to environment variable PS, DRSTCZEU by + * default. + * Inputs: + * s String to convert + * Returns: + * Mask for process state. + * Notes: + * The mask folds data from several sources into a single long value, so + * be carefull not to overlap the bits. TASK_* bits are in the LSB, + * special cases like UNRUNNABLE are in the MSB. As of 2.6.10-rc1 there + * is no overlap between TASK_* and EXIT_* but that may not always be + * true, so EXIT_* bits are shifted left 16 bits before being stored in + * the mask. + */ + +/* unrunnable is < 0 */ +#define UNRUNNABLE (1UL << (8*sizeof(unsigned long) - 1)) +#define RUNNING (1UL << (8*sizeof(unsigned long) - 2)) +#define IDLE (1UL << (8*sizeof(unsigned long) - 3)) +#define DAEMON (1UL << (8*sizeof(unsigned long) - 4)) + +unsigned long kdb_task_state_string(const char *s) +{ + long res = 0; + if (!s) { + s = kdbgetenv("PS"); + if (!s) + s = "DRSTCZEU"; /* default value for ps */ + } + while (*s) { + switch (*s) { + case 'D': + res |= TASK_UNINTERRUPTIBLE; + break; + case 'R': + res |= RUNNING; + break; + case 'S': + res |= TASK_INTERRUPTIBLE; + break; + case 'T': + res |= TASK_STOPPED; + break; + case 'C': + res |= TASK_TRACED; + break; + case 'Z': + res |= EXIT_ZOMBIE << 16; + break; + case 'E': + res |= EXIT_DEAD << 16; + break; + case 'U': + res |= UNRUNNABLE; + break; + case 'I': + res |= IDLE; + break; + case 'M': + res |= DAEMON; + break; + case 'A': + res = ~0UL; + break; + default: + kdb_printf("%s: unknown flag '%c' ignored\n", + __func__, *s); + break; + } + ++s; + } + return res; +} + +/* + * kdb_task_state_char - Return the character that represents the task state. + * Inputs: + * p struct task for the process + * Returns: + * One character to represent the task state. + */ +char kdb_task_state_char (const struct task_struct *p) +{ + int cpu; + char state; + unsigned long tmp; + + if (!p || probe_kernel_read(&tmp, (char *)p, sizeof(unsigned long))) + return 'E'; + + cpu = kdb_process_cpu(p); + state = (p->state == 0) ? 'R' : + (p->state < 0) ? 'U' : + (p->state & TASK_UNINTERRUPTIBLE) ? 'D' : + (p->state & TASK_STOPPED) ? 'T' : + (p->state & TASK_TRACED) ? 'C' : + (p->exit_state & EXIT_ZOMBIE) ? 'Z' : + (p->exit_state & EXIT_DEAD) ? 'E' : + (p->state & TASK_INTERRUPTIBLE) ? 'S' : '?'; + if (p->pid == 0) { + /* Idle task. Is it really idle, apart from the kdb + * interrupt? */ + if (!kdb_task_has_cpu(p) || kgdb_info[cpu].irq_depth == 1) { + if (cpu != kdb_initial_cpu) + state = 'I'; /* idle task */ + } + } else if (!p->mm && state == 'S') { + state = 'M'; /* sleeping system daemon */ + } + return state; +} + +/* + * kdb_task_state - Return true if a process has the desired state + * given by the mask. + * Inputs: + * p struct task for the process + * mask mask from kdb_task_state_string to select processes + * Returns: + * True if the process matches at least one criteria defined by the mask. + */ +unsigned long kdb_task_state(const struct task_struct *p, unsigned long mask) +{ + char state[] = { kdb_task_state_char(p), '\0' }; + return (mask & kdb_task_state_string(state)) != 0; +} + +/* + * kdb_print_nameval - Print a name and its value, converting the + * value to a symbol lookup if possible. + * Inputs: + * name field name to print + * val value of field + */ +void kdb_print_nameval(const char *name, unsigned long val) +{ + kdb_symtab_t symtab; + kdb_printf(" %-11.11s ", name); + if (kdbnearsym(val, &symtab)) + kdb_symbol_print(val, &symtab, + KDB_SP_VALUE|KDB_SP_SYMSIZE|KDB_SP_NEWLINE); + else + kdb_printf("0x%lx\n", val); +} + +/* Last ditch allocator for debugging, so we can still debug even when + * the GFP_ATOMIC pool has been exhausted. The algorithms are tuned + * for space usage, not for speed. One smallish memory pool, the free + * chain is always in ascending address order to allow coalescing, + * allocations are done in brute force best fit. + */ + +struct debug_alloc_header { + u32 next; /* offset of next header from start of pool */ + u32 size; + void *caller; +}; + +/* The memory returned by this allocator must be aligned, which means + * so must the header size. Do not assume that sizeof(struct + * debug_alloc_header) is a multiple of the alignment, explicitly + * calculate the overhead of this header, including the alignment. + * The rest of this code must not use sizeof() on any header or + * pointer to a header. + */ +#define dah_align 8 +#define dah_overhead ALIGN(sizeof(struct debug_alloc_header), dah_align) + +static u64 debug_alloc_pool_aligned[256*1024/dah_align]; /* 256K pool */ +static char *debug_alloc_pool = (char *)debug_alloc_pool_aligned; +static u32 dah_first, dah_first_call = 1, dah_used, dah_used_max; + +/* Locking is awkward. The debug code is called from all contexts, + * including non maskable interrupts. A normal spinlock is not safe + * in NMI context. Try to get the debug allocator lock, if it cannot + * be obtained after a second then give up. If the lock could not be + * previously obtained on this cpu then only try once. + * + * sparse has no annotation for "this function _sometimes_ acquires a + * lock", so fudge the acquire/release notation. + */ +static DEFINE_SPINLOCK(dap_lock); +static int get_dap_lock(void) + __acquires(dap_lock) +{ + static int dap_locked = -1; + int count; + if (dap_locked == smp_processor_id()) + count = 1; + else + count = 1000; + while (1) { + if (spin_trylock(&dap_lock)) { + dap_locked = -1; + return 1; + } + if (!count--) + break; + udelay(1000); + } + dap_locked = smp_processor_id(); + __acquire(dap_lock); + return 0; +} + +void *debug_kmalloc(size_t size, gfp_t flags) +{ + unsigned int rem, h_offset; + struct debug_alloc_header *best, *bestprev, *prev, *h; + void *p = NULL; + if (!get_dap_lock()) { + __release(dap_lock); /* we never actually got it */ + return NULL; + } + h = (struct debug_alloc_header *)(debug_alloc_pool + dah_first); + if (dah_first_call) { + h->size = sizeof(debug_alloc_pool_aligned) - dah_overhead; + dah_first_call = 0; + } + size = ALIGN(size, dah_align); + prev = best = bestprev = NULL; + while (1) { + if (h->size >= size && (!best || h->size < best->size)) { + best = h; + bestprev = prev; + if (h->size == size) + break; + } + if (!h->next) + break; + prev = h; + h = (struct debug_alloc_header *)(debug_alloc_pool + h->next); + } + if (!best) + goto out; + rem = best->size - size; + /* The pool must always contain at least one header */ + if (best->next == 0 && bestprev == NULL && rem < dah_overhead) + goto out; + if (rem >= dah_overhead) { + best->size = size; + h_offset = ((char *)best - debug_alloc_pool) + + dah_overhead + best->size; + h = (struct debug_alloc_header *)(debug_alloc_pool + h_offset); + h->size = rem - dah_overhead; + h->next = best->next; + } else + h_offset = best->next; + best->caller = __builtin_return_address(0); + dah_used += best->size; + dah_used_max = max(dah_used, dah_used_max); + if (bestprev) + bestprev->next = h_offset; + else + dah_first = h_offset; + p = (char *)best + dah_overhead; + memset(p, POISON_INUSE, best->size - 1); + *((char *)p + best->size - 1) = POISON_END; +out: + spin_unlock(&dap_lock); + return p; +} + +void debug_kfree(void *p) +{ + struct debug_alloc_header *h; + unsigned int h_offset; + if (!p) + return; + if ((char *)p < debug_alloc_pool || + (char *)p >= debug_alloc_pool + sizeof(debug_alloc_pool_aligned)) { + kfree(p); + return; + } + if (!get_dap_lock()) { + __release(dap_lock); /* we never actually got it */ + return; /* memory leak, cannot be helped */ + } + h = (struct debug_alloc_header *)((char *)p - dah_overhead); + memset(p, POISON_FREE, h->size - 1); + *((char *)p + h->size - 1) = POISON_END; + h->caller = NULL; + dah_used -= h->size; + h_offset = (char *)h - debug_alloc_pool; + if (h_offset < dah_first) { + h->next = dah_first; + dah_first = h_offset; + } else { + struct debug_alloc_header *prev; + unsigned int prev_offset; + prev = (struct debug_alloc_header *)(debug_alloc_pool + + dah_first); + while (1) { + if (!prev->next || prev->next > h_offset) + break; + prev = (struct debug_alloc_header *) + (debug_alloc_pool + prev->next); + } + prev_offset = (char *)prev - debug_alloc_pool; + if (prev_offset + dah_overhead + prev->size == h_offset) { + prev->size += dah_overhead + h->size; + memset(h, POISON_FREE, dah_overhead - 1); + *((char *)h + dah_overhead - 1) = POISON_END; + h = prev; + h_offset = prev_offset; + } else { + h->next = prev->next; + prev->next = h_offset; + } + } + if (h_offset + dah_overhead + h->size == h->next) { + struct debug_alloc_header *next; + next = (struct debug_alloc_header *) + (debug_alloc_pool + h->next); + h->size += dah_overhead + next->size; + h->next = next->next; + memset(next, POISON_FREE, dah_overhead - 1); + *((char *)next + dah_overhead - 1) = POISON_END; + } + spin_unlock(&dap_lock); +} + +void debug_kusage(void) +{ + struct debug_alloc_header *h_free, *h_used; +#ifdef CONFIG_IA64 + /* FIXME: using dah for ia64 unwind always results in a memory leak. + * Fix that memory leak first, then set debug_kusage_one_time = 1 for + * all architectures. + */ + static int debug_kusage_one_time; +#else + static int debug_kusage_one_time = 1; +#endif + if (!get_dap_lock()) { + __release(dap_lock); /* we never actually got it */ + return; + } + h_free = (struct debug_alloc_header *)(debug_alloc_pool + dah_first); + if (dah_first == 0 && + (h_free->size == sizeof(debug_alloc_pool_aligned) - dah_overhead || + dah_first_call)) + goto out; + if (!debug_kusage_one_time) + goto out; + debug_kusage_one_time = 0; + kdb_printf("%s: debug_kmalloc memory leak dah_first %d\n", + __func__, dah_first); + if (dah_first) { + h_used = (struct debug_alloc_header *)debug_alloc_pool; + kdb_printf("%s: h_used %p size %d\n", __func__, h_used, + h_used->size); + } + do { + h_used = (struct debug_alloc_header *) + ((char *)h_free + dah_overhead + h_free->size); + kdb_printf("%s: h_used %p size %d caller %p\n", + __func__, h_used, h_used->size, h_used->caller); + h_free = (struct debug_alloc_header *) + (debug_alloc_pool + h_free->next); + } while (h_free->next); + h_used = (struct debug_alloc_header *) + ((char *)h_free + dah_overhead + h_free->size); + if ((char *)h_used - debug_alloc_pool != + sizeof(debug_alloc_pool_aligned)) + kdb_printf("%s: h_used %p size %d caller %p\n", + __func__, h_used, h_used->size, h_used->caller); +out: + spin_unlock(&dap_lock); +} + +/* Maintain a small stack of kdb_flags to allow recursion without disturbing + * the global kdb state. + */ + +static int kdb_flags_stack[4], kdb_flags_index; + +void kdb_save_flags(void) +{ + BUG_ON(kdb_flags_index >= ARRAY_SIZE(kdb_flags_stack)); + kdb_flags_stack[kdb_flags_index++] = kdb_flags; +} + +void kdb_restore_flags(void) +{ + BUG_ON(kdb_flags_index <= 0); + kdb_flags = kdb_flags_stack[--kdb_flags_index]; +} diff --git a/kernel/early_res.c b/kernel/early_res.c index 3cb2c66..7bfae88 100644 --- a/kernel/early_res.c +++ b/kernel/early_res.c @@ -7,6 +7,8 @@ #include <linux/bootmem.h> #include <linux/mm.h> #include <linux/early_res.h> +#include <linux/slab.h> +#include <linux/kmemleak.h> /* * Early reserved memory areas. @@ -319,6 +321,8 @@ void __init free_early(u64 start, u64 end) struct early_res *r; int i; + kmemleak_free_part(__va(start), end - start); + i = find_overlapped_early(start, end); r = &early_res[i]; if (i >= max_early_res || r->end != end || r->start != start) @@ -333,6 +337,14 @@ void __init free_early_partial(u64 start, u64 end) struct early_res *r; int i; + kmemleak_free_part(__va(start), end - start); + + if (start == end) + return; + + if (WARN_ONCE(start > end, " wrong range [%#llx, %#llx]\n", start, end)) + return; + try_next: i = find_overlapped_early(start, end); if (i >= max_early_res) diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c index c35452c..dd62f8e 100644 --- a/kernel/exec_domain.c +++ b/kernel/exec_domain.c @@ -27,7 +27,7 @@ static struct exec_domain *exec_domains = &default_exec_domain; static DEFINE_RWLOCK(exec_domains_lock); -static u_long ident_map[32] = { +static unsigned long ident_map[32] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, @@ -56,10 +56,10 @@ default_handler(int segment, struct pt_regs *regp) } static struct exec_domain * -lookup_exec_domain(u_long personality) +lookup_exec_domain(unsigned int personality) { - struct exec_domain * ep; - u_long pers = personality(personality); + unsigned int pers = personality(personality); + struct exec_domain *ep; read_lock(&exec_domains_lock); for (ep = exec_domains; ep; ep = ep->next) { @@ -70,7 +70,7 @@ lookup_exec_domain(u_long personality) #ifdef CONFIG_MODULES read_unlock(&exec_domains_lock); - request_module("personality-%ld", pers); + request_module("personality-%d", pers); read_lock(&exec_domains_lock); for (ep = exec_domains; ep; ep = ep->next) { @@ -135,7 +135,7 @@ unregister: } int -__set_personality(u_long personality) +__set_personality(unsigned int personality) { struct exec_domain *ep, *oep; @@ -188,9 +188,9 @@ static int __init proc_execdomains_init(void) module_init(proc_execdomains_init); #endif -SYSCALL_DEFINE1(personality, u_long, personality) +SYSCALL_DEFINE1(personality, unsigned int, personality) { - u_long old = current->personality; + unsigned int old = current->personality; if (personality != 0xffffffff) { set_personality(personality); @@ -198,7 +198,7 @@ SYSCALL_DEFINE1(personality, u_long, personality) return -EINVAL; } - return (long)old; + return old; } diff --git a/kernel/exit.c b/kernel/exit.c index cce59cb..ceffc67 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -55,15 +55,14 @@ #include <asm/unistd.h> #include <asm/pgtable.h> #include <asm/mmu_context.h> -#include "cred-internals.h" static void exit_mm(struct task_struct * tsk); -static void __unhash_process(struct task_struct *p) +static void __unhash_process(struct task_struct *p, bool group_dead) { nr_threads--; detach_pid(p, PIDTYPE_PID); - if (thread_group_leader(p)) { + if (group_dead) { detach_pid(p, PIDTYPE_PGID); detach_pid(p, PIDTYPE_SID); @@ -80,10 +79,9 @@ static void __unhash_process(struct task_struct *p) static void __exit_signal(struct task_struct *tsk) { struct signal_struct *sig = tsk->signal; + bool group_dead = thread_group_leader(tsk); struct sighand_struct *sighand; - - BUG_ON(!sig); - BUG_ON(!atomic_read(&sig->count)); + struct tty_struct *uninitialized_var(tty); sighand = rcu_dereference_check(tsk->sighand, rcu_read_lock_held() || @@ -91,14 +89,16 @@ static void __exit_signal(struct task_struct *tsk) spin_lock(&sighand->siglock); posix_cpu_timers_exit(tsk); - if (atomic_dec_and_test(&sig->count)) + if (group_dead) { posix_cpu_timers_exit_group(tsk); - else { + tty = sig->tty; + sig->tty = NULL; + } else { /* * If there is any task waiting for the group exit * then notify it: */ - if (sig->group_exit_task && atomic_read(&sig->count) == sig->notify_count) + if (sig->notify_count > 0 && !--sig->notify_count) wake_up_process(sig->group_exit_task); if (tsk == sig->curr_target) @@ -124,32 +124,24 @@ static void __exit_signal(struct task_struct *tsk) sig->oublock += task_io_get_oublock(tsk); task_io_accounting_add(&sig->ioac, &tsk->ioac); sig->sum_sched_runtime += tsk->se.sum_exec_runtime; - sig = NULL; /* Marker for below. */ } - __unhash_process(tsk); + sig->nr_threads--; + __unhash_process(tsk, group_dead); /* * Do this under ->siglock, we can race with another thread * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals. */ flush_sigqueue(&tsk->pending); - - tsk->signal = NULL; tsk->sighand = NULL; spin_unlock(&sighand->siglock); __cleanup_sighand(sighand); clear_tsk_thread_flag(tsk,TIF_SIGPENDING); - if (sig) { + if (group_dead) { flush_sigqueue(&sig->shared_pending); - taskstats_tgid_free(sig); - /* - * Make sure ->signal can't go away under rq->lock, - * see account_group_exec_runtime(). - */ - task_rq_unlock_wait(tsk); - __cleanup_signal(sig); + tty_kref_put(tty); } } @@ -857,12 +849,9 @@ static void exit_notify(struct task_struct *tsk, int group_dead) tsk->exit_state = signal == DEATH_REAP ? EXIT_DEAD : EXIT_ZOMBIE; - /* mt-exec, de_thread() is waiting for us */ - if (thread_group_leader(tsk) && - tsk->signal->group_exit_task && - tsk->signal->notify_count < 0) + /* mt-exec, de_thread() is waiting for group leader */ + if (unlikely(tsk->signal->notify_count < 0)) wake_up_process(tsk->signal->group_exit_task); - write_unlock_irq(&tasklist_lock); tracehook_report_death(tsk, signal, cookie, group_dead); @@ -953,7 +942,8 @@ NORET_TYPE void do_exit(long code) acct_update_integrals(tsk); /* sync mm's RSS info before statistics gathering */ - sync_mm_rss(tsk, tsk->mm); + if (tsk->mm) + sync_mm_rss(tsk, tsk->mm); group_dead = atomic_dec_and_test(&tsk->signal->live); if (group_dead) { hrtimer_cancel(&tsk->signal->real_timer); @@ -1002,8 +992,10 @@ NORET_TYPE void do_exit(long code) exit_notify(tsk, group_dead); #ifdef CONFIG_NUMA + task_lock(tsk); mpol_put(tsk->mempolicy); tsk->mempolicy = NULL; + task_unlock(tsk); #endif #ifdef CONFIG_FUTEX if (unlikely(current->pi_state_cache)) diff --git a/kernel/fork.c b/kernel/fork.c index 4799c5f..b6cce14 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -165,6 +165,18 @@ void free_task(struct task_struct *tsk) } EXPORT_SYMBOL(free_task); +static inline void free_signal_struct(struct signal_struct *sig) +{ + taskstats_tgid_free(sig); + kmem_cache_free(signal_cachep, sig); +} + +static inline void put_signal_struct(struct signal_struct *sig) +{ + if (atomic_dec_and_test(&sig->sigcnt)) + free_signal_struct(sig); +} + void __put_task_struct(struct task_struct *tsk) { WARN_ON(!tsk->exit_state); @@ -173,6 +185,7 @@ void __put_task_struct(struct task_struct *tsk) exit_creds(tsk); delayacct_tsk_free(tsk); + put_signal_struct(tsk->signal); if (!profile_handoff_task(tsk)) free_task(tsk); @@ -864,8 +877,9 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) if (!sig) return -ENOMEM; - atomic_set(&sig->count, 1); + sig->nr_threads = 1; atomic_set(&sig->live, 1); + atomic_set(&sig->sigcnt, 1); init_waitqueue_head(&sig->wait_chldexit); if (clone_flags & CLONE_NEWPID) sig->flags |= SIGNAL_UNKILLABLE; @@ -889,13 +903,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) return 0; } -void __cleanup_signal(struct signal_struct *sig) -{ - thread_group_cputime_free(sig); - tty_kref_put(sig->tty); - kmem_cache_free(signal_cachep, sig); -} - static void copy_flags(unsigned long clone_flags, struct task_struct *p) { unsigned long new_flags = p->flags; @@ -1052,6 +1059,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->prev_utime = cputime_zero; p->prev_stime = cputime_zero; #endif +#if defined(SPLIT_RSS_COUNTING) + memset(&p->rss_stat, 0, sizeof(p->rss_stat)); +#endif p->default_timer_slack_ns = current->timer_slack_ns; @@ -1109,10 +1119,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->memcg_batch.memcg = NULL; #endif - p->bts = NULL; - - p->stack_start = stack_start; - /* Perform scheduler related setup. Assign this task to a CPU. */ sched_fork(p, clone_flags); @@ -1246,8 +1252,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, } if (clone_flags & CLONE_THREAD) { - atomic_inc(¤t->signal->count); + current->signal->nr_threads++; atomic_inc(¤t->signal->live); + atomic_inc(¤t->signal->sigcnt); p->group_leader = current->group_leader; list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group); } @@ -1260,7 +1267,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->nsproxy->pid_ns->child_reaper = p; p->signal->leader_pid = pid; - tty_kref_put(p->signal->tty); p->signal->tty = tty_kref_get(current->signal->tty); attach_pid(p, PIDTYPE_PGID, task_pgrp(current)); attach_pid(p, PIDTYPE_SID, task_session(current)); @@ -1293,7 +1299,7 @@ bad_fork_cleanup_mm: mmput(p->mm); bad_fork_cleanup_signal: if (!(clone_flags & CLONE_THREAD)) - __cleanup_signal(p->signal); + free_signal_struct(p->signal); bad_fork_cleanup_sighand: __cleanup_sighand(p->sighand); bad_fork_cleanup_fs: @@ -1328,6 +1334,16 @@ noinline struct pt_regs * __cpuinit __attribute__((weak)) idle_regs(struct pt_re return regs; } +static inline void init_idle_pids(struct pid_link *links) +{ + enum pid_type type; + + for (type = PIDTYPE_PID; type < PIDTYPE_MAX; ++type) { + INIT_HLIST_NODE(&links[type].node); /* not really needed */ + links[type].pid = &init_struct_pid; + } +} + struct task_struct * __cpuinit fork_idle(int cpu) { struct task_struct *task; @@ -1335,8 +1351,10 @@ struct task_struct * __cpuinit fork_idle(int cpu) task = copy_process(CLONE_VM, 0, idle_regs(®s), 0, NULL, &init_struct_pid, 0); - if (!IS_ERR(task)) + if (!IS_ERR(task)) { + init_idle_pids(task->pids); init_idle(task, cpu); + } return task; } @@ -1508,14 +1526,6 @@ static void check_unshare_flags(unsigned long *flags_ptr) *flags_ptr |= CLONE_SIGHAND; /* - * If unsharing signal handlers and the task was created - * using CLONE_THREAD, then must unshare the thread - */ - if ((*flags_ptr & CLONE_SIGHAND) && - (atomic_read(¤t->signal->count) > 1)) - *flags_ptr |= CLONE_THREAD; - - /* * If unsharing namespace, must also unshare filesystem information. */ if (*flags_ptr & CLONE_NEWNS) diff --git a/kernel/futex.c b/kernel/futex.c index e7a35f10..6a3a5fa 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -429,20 +429,11 @@ static void free_pi_state(struct futex_pi_state *pi_state) static struct task_struct * futex_find_get_task(pid_t pid) { struct task_struct *p; - const struct cred *cred = current_cred(), *pcred; rcu_read_lock(); p = find_task_by_vpid(pid); - if (!p) { - p = ERR_PTR(-ESRCH); - } else { - pcred = __task_cred(p); - if (cred->euid != pcred->euid && - cred->euid != pcred->uid) - p = ERR_PTR(-ESRCH); - else - get_task_struct(p); - } + if (p) + get_task_struct(p); rcu_read_unlock(); @@ -564,8 +555,8 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, if (!pid) return -ESRCH; p = futex_find_get_task(pid); - if (IS_ERR(p)) - return PTR_ERR(p); + if (!p) + return -ESRCH; /* * We need to look at the task state flags to figure out, diff --git a/kernel/groups.c b/kernel/groups.c index 2b45b2e..53b1916 100644 --- a/kernel/groups.c +++ b/kernel/groups.c @@ -164,12 +164,6 @@ int groups_search(const struct group_info *group_info, gid_t grp) */ int set_groups(struct cred *new, struct group_info *group_info) { - int retval; - - retval = security_task_setgroups(group_info); - if (retval) - return retval; - put_group_info(new->group_info); groups_sort(group_info); get_group_info(group_info); diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 0086628..5c69e99 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -89,7 +89,7 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) do { seq = read_seqbegin(&xtime_lock); - xts = current_kernel_time(); + xts = __current_kernel_time(); tom = wall_to_monotonic; } while (read_seqretry(&xtime_lock, seq)); @@ -1749,35 +1749,15 @@ void __init hrtimers_init(void) } /** - * schedule_hrtimeout_range - sleep until timeout + * schedule_hrtimeout_range_clock - sleep until timeout * @expires: timeout value (ktime_t) * @delta: slack in expires timeout (ktime_t) * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL - * - * Make the current task sleep until the given expiry time has - * elapsed. The routine will return immediately unless - * the current task state has been set (see set_current_state()). - * - * The @delta argument gives the kernel the freedom to schedule the - * actual wakeup to a time that is both power and performance friendly. - * The kernel give the normal best effort behavior for "@expires+@delta", - * but may decide to fire the timer earlier, but no earlier than @expires. - * - * You can set the task state as follows - - * - * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to - * pass before the routine returns. - * - * %TASK_INTERRUPTIBLE - the routine may return early if a signal is - * delivered to the current task. - * - * The current task state is guaranteed to be TASK_RUNNING when this - * routine returns. - * - * Returns 0 when the timer has expired otherwise -EINTR + * @clock: timer clock, CLOCK_MONOTONIC or CLOCK_REALTIME */ -int __sched schedule_hrtimeout_range(ktime_t *expires, unsigned long delta, - const enum hrtimer_mode mode) +int __sched +schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta, + const enum hrtimer_mode mode, int clock) { struct hrtimer_sleeper t; @@ -1799,7 +1779,7 @@ int __sched schedule_hrtimeout_range(ktime_t *expires, unsigned long delta, return -EINTR; } - hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, mode); + hrtimer_init_on_stack(&t.timer, clock, mode); hrtimer_set_expires_range_ns(&t.timer, *expires, delta); hrtimer_init_sleeper(&t, current); @@ -1818,6 +1798,41 @@ int __sched schedule_hrtimeout_range(ktime_t *expires, unsigned long delta, return !t.task ? 0 : -EINTR; } + +/** + * schedule_hrtimeout_range - sleep until timeout + * @expires: timeout value (ktime_t) + * @delta: slack in expires timeout (ktime_t) + * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL + * + * Make the current task sleep until the given expiry time has + * elapsed. The routine will return immediately unless + * the current task state has been set (see set_current_state()). + * + * The @delta argument gives the kernel the freedom to schedule the + * actual wakeup to a time that is both power and performance friendly. + * The kernel give the normal best effort behavior for "@expires+@delta", + * but may decide to fire the timer earlier, but no earlier than @expires. + * + * You can set the task state as follows - + * + * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to + * pass before the routine returns. + * + * %TASK_INTERRUPTIBLE - the routine may return early if a signal is + * delivered to the current task. + * + * The current task state is guaranteed to be TASK_RUNNING when this + * routine returns. + * + * Returns 0 when the timer has expired otherwise -EINTR + */ +int __sched schedule_hrtimeout_range(ktime_t *expires, unsigned long delta, + const enum hrtimer_mode mode) +{ + return schedule_hrtimeout_range_clock(expires, delta, mode, + CLOCK_MONOTONIC); +} EXPORT_SYMBOL_GPL(schedule_hrtimeout_range); /** diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index 03808ed..7a56b22 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c @@ -40,23 +40,29 @@ #include <linux/percpu.h> #include <linux/sched.h> #include <linux/init.h> +#include <linux/slab.h> #include <linux/cpu.h> #include <linux/smp.h> #include <linux/hw_breakpoint.h> + /* * Constraints data */ /* Number of pinned cpu breakpoints in a cpu */ -static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned); +static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned[TYPE_MAX]); /* Number of pinned task breakpoints in a cpu */ -static DEFINE_PER_CPU(unsigned int, nr_task_bp_pinned[HBP_NUM]); +static DEFINE_PER_CPU(unsigned int *, nr_task_bp_pinned[TYPE_MAX]); /* Number of non-pinned cpu/task breakpoints in a cpu */ -static DEFINE_PER_CPU(unsigned int, nr_bp_flexible); +static DEFINE_PER_CPU(unsigned int, nr_bp_flexible[TYPE_MAX]); + +static int nr_slots[TYPE_MAX]; + +static int constraints_initialized; /* Gather the number of total pinned and un-pinned bp in a cpuset */ struct bp_busy_slots { @@ -67,16 +73,29 @@ struct bp_busy_slots { /* Serialize accesses to the above constraints */ static DEFINE_MUTEX(nr_bp_mutex); +__weak int hw_breakpoint_weight(struct perf_event *bp) +{ + return 1; +} + +static inline enum bp_type_idx find_slot_idx(struct perf_event *bp) +{ + if (bp->attr.bp_type & HW_BREAKPOINT_RW) + return TYPE_DATA; + + return TYPE_INST; +} + /* * Report the maximum number of pinned breakpoints a task * have in this cpu */ -static unsigned int max_task_bp_pinned(int cpu) +static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type) { int i; - unsigned int *tsk_pinned = per_cpu(nr_task_bp_pinned, cpu); + unsigned int *tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu); - for (i = HBP_NUM -1; i >= 0; i--) { + for (i = nr_slots[type] - 1; i >= 0; i--) { if (tsk_pinned[i] > 0) return i + 1; } @@ -84,7 +103,7 @@ static unsigned int max_task_bp_pinned(int cpu) return 0; } -static int task_bp_pinned(struct task_struct *tsk) +static int task_bp_pinned(struct task_struct *tsk, enum bp_type_idx type) { struct perf_event_context *ctx = tsk->perf_event_ctxp; struct list_head *list; @@ -105,7 +124,8 @@ static int task_bp_pinned(struct task_struct *tsk) */ list_for_each_entry(bp, list, event_entry) { if (bp->attr.type == PERF_TYPE_BREAKPOINT) - count++; + if (find_slot_idx(bp) == type) + count += hw_breakpoint_weight(bp); } raw_spin_unlock_irqrestore(&ctx->lock, flags); @@ -118,18 +138,19 @@ static int task_bp_pinned(struct task_struct *tsk) * a given cpu (cpu > -1) or in all of them (cpu = -1). */ static void -fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp) +fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp, + enum bp_type_idx type) { int cpu = bp->cpu; struct task_struct *tsk = bp->ctx->task; if (cpu >= 0) { - slots->pinned = per_cpu(nr_cpu_bp_pinned, cpu); + slots->pinned = per_cpu(nr_cpu_bp_pinned[type], cpu); if (!tsk) - slots->pinned += max_task_bp_pinned(cpu); + slots->pinned += max_task_bp_pinned(cpu, type); else - slots->pinned += task_bp_pinned(tsk); - slots->flexible = per_cpu(nr_bp_flexible, cpu); + slots->pinned += task_bp_pinned(tsk, type); + slots->flexible = per_cpu(nr_bp_flexible[type], cpu); return; } @@ -137,16 +158,16 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp) for_each_online_cpu(cpu) { unsigned int nr; - nr = per_cpu(nr_cpu_bp_pinned, cpu); + nr = per_cpu(nr_cpu_bp_pinned[type], cpu); if (!tsk) - nr += max_task_bp_pinned(cpu); + nr += max_task_bp_pinned(cpu, type); else - nr += task_bp_pinned(tsk); + nr += task_bp_pinned(tsk, type); if (nr > slots->pinned) slots->pinned = nr; - nr = per_cpu(nr_bp_flexible, cpu); + nr = per_cpu(nr_bp_flexible[type], cpu); if (nr > slots->flexible) slots->flexible = nr; @@ -154,31 +175,49 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp) } /* + * For now, continue to consider flexible as pinned, until we can + * ensure no flexible event can ever be scheduled before a pinned event + * in a same cpu. + */ +static void +fetch_this_slot(struct bp_busy_slots *slots, int weight) +{ + slots->pinned += weight; +} + +/* * Add a pinned breakpoint for the given task in our constraint table */ -static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable) +static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable, + enum bp_type_idx type, int weight) { unsigned int *tsk_pinned; - int count = 0; + int old_count = 0; + int old_idx = 0; + int idx = 0; - count = task_bp_pinned(tsk); + old_count = task_bp_pinned(tsk, type); + old_idx = old_count - 1; + idx = old_idx + weight; - tsk_pinned = per_cpu(nr_task_bp_pinned, cpu); + tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu); if (enable) { - tsk_pinned[count]++; - if (count > 0) - tsk_pinned[count-1]--; + tsk_pinned[idx]++; + if (old_count > 0) + tsk_pinned[old_idx]--; } else { - tsk_pinned[count]--; - if (count > 0) - tsk_pinned[count-1]++; + tsk_pinned[idx]--; + if (old_count > 0) + tsk_pinned[old_idx]++; } } /* * Add/remove the given breakpoint in our constraint table */ -static void toggle_bp_slot(struct perf_event *bp, bool enable) +static void +toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, + int weight) { int cpu = bp->cpu; struct task_struct *tsk = bp->ctx->task; @@ -186,20 +225,20 @@ static void toggle_bp_slot(struct perf_event *bp, bool enable) /* Pinned counter task profiling */ if (tsk) { if (cpu >= 0) { - toggle_bp_task_slot(tsk, cpu, enable); + toggle_bp_task_slot(tsk, cpu, enable, type, weight); return; } for_each_online_cpu(cpu) - toggle_bp_task_slot(tsk, cpu, enable); + toggle_bp_task_slot(tsk, cpu, enable, type, weight); return; } /* Pinned counter cpu profiling */ if (enable) - per_cpu(nr_cpu_bp_pinned, bp->cpu)++; + per_cpu(nr_cpu_bp_pinned[type], bp->cpu) += weight; else - per_cpu(nr_cpu_bp_pinned, bp->cpu)--; + per_cpu(nr_cpu_bp_pinned[type], bp->cpu) -= weight; } /* @@ -246,14 +285,29 @@ static void toggle_bp_slot(struct perf_event *bp, bool enable) static int __reserve_bp_slot(struct perf_event *bp) { struct bp_busy_slots slots = {0}; + enum bp_type_idx type; + int weight; - fetch_bp_busy_slots(&slots, bp); + /* We couldn't initialize breakpoint constraints on boot */ + if (!constraints_initialized) + return -ENOMEM; + + /* Basic checks */ + if (bp->attr.bp_type == HW_BREAKPOINT_EMPTY || + bp->attr.bp_type == HW_BREAKPOINT_INVALID) + return -EINVAL; + + type = find_slot_idx(bp); + weight = hw_breakpoint_weight(bp); + + fetch_bp_busy_slots(&slots, bp, type); + fetch_this_slot(&slots, weight); /* Flexible counters need to keep at least one slot */ - if (slots.pinned + (!!slots.flexible) == HBP_NUM) + if (slots.pinned + (!!slots.flexible) > nr_slots[type]) return -ENOSPC; - toggle_bp_slot(bp, true); + toggle_bp_slot(bp, true, type, weight); return 0; } @@ -273,7 +327,12 @@ int reserve_bp_slot(struct perf_event *bp) static void __release_bp_slot(struct perf_event *bp) { - toggle_bp_slot(bp, false); + enum bp_type_idx type; + int weight; + + type = find_slot_idx(bp); + weight = hw_breakpoint_weight(bp); + toggle_bp_slot(bp, false, type, weight); } void release_bp_slot(struct perf_event *bp) @@ -308,6 +367,28 @@ int dbg_release_bp_slot(struct perf_event *bp) return 0; } +static int validate_hw_breakpoint(struct perf_event *bp) +{ + int ret; + + ret = arch_validate_hwbkpt_settings(bp); + if (ret) + return ret; + + if (arch_check_bp_in_kernelspace(bp)) { + if (bp->attr.exclude_kernel) + return -EINVAL; + /* + * Don't let unprivileged users set a breakpoint in the trap + * path to avoid trap recursion attacks. + */ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + } + + return 0; +} + int register_perf_hw_breakpoint(struct perf_event *bp) { int ret; @@ -316,17 +397,7 @@ int register_perf_hw_breakpoint(struct perf_event *bp) if (ret) return ret; - /* - * Ptrace breakpoints can be temporary perf events only - * meant to reserve a slot. In this case, it is created disabled and - * we don't want to check the params right now (as we put a null addr) - * But perf tools create events as disabled and we want to check - * the params for them. - * This is a quick hack that will be removed soon, once we remove - * the tmp breakpoints from ptrace - */ - if (!bp->attr.disabled || !bp->overflow_handler) - ret = arch_validate_hwbkpt_settings(bp, bp->ctx->task); + ret = validate_hw_breakpoint(bp); /* if arch_validate_hwbkpt_settings() fails then release bp slot */ if (ret) @@ -373,7 +444,7 @@ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *att if (attr->disabled) goto end; - err = arch_validate_hwbkpt_settings(bp, bp->ctx->task); + err = validate_hw_breakpoint(bp); if (!err) perf_event_enable(bp); @@ -480,7 +551,36 @@ static struct notifier_block hw_breakpoint_exceptions_nb = { static int __init init_hw_breakpoint(void) { + unsigned int **task_bp_pinned; + int cpu, err_cpu; + int i; + + for (i = 0; i < TYPE_MAX; i++) + nr_slots[i] = hw_breakpoint_slots(i); + + for_each_possible_cpu(cpu) { + for (i = 0; i < TYPE_MAX; i++) { + task_bp_pinned = &per_cpu(nr_task_bp_pinned[i], cpu); + *task_bp_pinned = kzalloc(sizeof(int) * nr_slots[i], + GFP_KERNEL); + if (!*task_bp_pinned) + goto err_alloc; + } + } + + constraints_initialized = 1; + return register_die_notifier(&hw_breakpoint_exceptions_nb); + + err_alloc: + for_each_possible_cpu(err_cpu) { + if (err_cpu == cpu) + break; + for (i = 0; i < TYPE_MAX; i++) + kfree(per_cpu(nr_task_bp_pinned[i], cpu)); + } + + return -ENOMEM; } core_initcall(init_hw_breakpoint); diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 42ec11b..b7091d5 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -359,6 +359,23 @@ static inline void mask_ack_irq(struct irq_desc *desc, int irq) if (desc->chip->ack) desc->chip->ack(irq); } + desc->status |= IRQ_MASKED; +} + +static inline void mask_irq(struct irq_desc *desc, int irq) +{ + if (desc->chip->mask) { + desc->chip->mask(irq); + desc->status |= IRQ_MASKED; + } +} + +static inline void unmask_irq(struct irq_desc *desc, int irq) +{ + if (desc->chip->unmask) { + desc->chip->unmask(irq); + desc->status &= ~IRQ_MASKED; + } } /* @@ -484,10 +501,8 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) raw_spin_lock(&desc->lock); desc->status &= ~IRQ_INPROGRESS; - if (unlikely(desc->status & IRQ_ONESHOT)) - desc->status |= IRQ_MASKED; - else if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask) - desc->chip->unmask(irq); + if (!(desc->status & (IRQ_DISABLED | IRQ_ONESHOT))) + unmask_irq(desc, irq); out_unlock: raw_spin_unlock(&desc->lock); } @@ -524,8 +539,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) action = desc->action; if (unlikely(!action || (desc->status & IRQ_DISABLED))) { desc->status |= IRQ_PENDING; - if (desc->chip->mask) - desc->chip->mask(irq); + mask_irq(desc, irq); goto out; } @@ -593,7 +607,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) irqreturn_t action_ret; if (unlikely(!action)) { - desc->chip->mask(irq); + mask_irq(desc, irq); goto out_unlock; } @@ -605,8 +619,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) if (unlikely((desc->status & (IRQ_PENDING | IRQ_MASKED | IRQ_DISABLED)) == (IRQ_PENDING | IRQ_MASKED))) { - desc->chip->unmask(irq); - desc->status &= ~IRQ_MASKED; + unmask_irq(desc, irq); } desc->status &= ~IRQ_PENDING; @@ -716,7 +729,7 @@ set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip, __set_irq_handler(irq, handle, 0, name); } -void __init set_irq_noprobe(unsigned int irq) +void set_irq_noprobe(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; @@ -731,7 +744,7 @@ void __init set_irq_noprobe(unsigned int irq) raw_spin_unlock_irqrestore(&desc->lock, flags); } -void __init set_irq_probe(unsigned int irq) +void set_irq_probe(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 76d5a67..27e5c69 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -370,9 +370,6 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) irqreturn_t ret, retval = IRQ_NONE; unsigned int status = 0; - if (!(action->flags & IRQF_DISABLED)) - local_irq_enable_in_hardirq(); - do { trace_irq_handler_entry(irq, action); ret = action->handler(irq, action->dev_id); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index eb6078c..e149748 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -138,6 +138,22 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) return 0; } +int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m) +{ + struct irq_desc *desc = irq_to_desc(irq); + unsigned long flags; + + if (!desc) + return -EINVAL; + + raw_spin_lock_irqsave(&desc->lock, flags); + desc->affinity_hint = m; + raw_spin_unlock_irqrestore(&desc->lock, flags); + + return 0; +} +EXPORT_SYMBOL_GPL(irq_set_affinity_hint); + #ifndef CONFIG_AUTO_IRQ_AFFINITY /* * Generic version of the affinity autoselector. @@ -382,6 +398,7 @@ int can_request_irq(unsigned int irq, unsigned long irqflags) { struct irq_desc *desc = irq_to_desc(irq); struct irqaction *action; + unsigned long flags; if (!desc) return 0; @@ -389,11 +406,14 @@ int can_request_irq(unsigned int irq, unsigned long irqflags) if (desc->status & IRQ_NOREQUEST) return 0; + raw_spin_lock_irqsave(&desc->lock, flags); action = desc->action; if (action) if (irqflags & action->flags & IRQF_SHARED) action = NULL; + raw_spin_unlock_irqrestore(&desc->lock, flags); + return !action; } @@ -436,6 +456,9 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, /* note that IRQF_TRIGGER_MASK == IRQ_TYPE_SENSE_MASK */ desc->status &= ~(IRQ_LEVEL | IRQ_TYPE_SENSE_MASK); desc->status |= flags; + + if (chip != desc->chip) + irq_chip_set_defaults(desc->chip); } return ret; @@ -483,8 +506,26 @@ static int irq_wait_for_interrupt(struct irqaction *action) */ static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc) { +again: chip_bus_lock(irq, desc); raw_spin_lock_irq(&desc->lock); + + /* + * Implausible though it may be we need to protect us against + * the following scenario: + * + * The thread is faster done than the hard interrupt handler + * on the other CPU. If we unmask the irq line then the + * interrupt can come in again and masks the line, leaves due + * to IRQ_INPROGRESS and the irq line is masked forever. + */ + if (unlikely(desc->status & IRQ_INPROGRESS)) { + raw_spin_unlock_irq(&desc->lock); + chip_bus_sync_unlock(irq, desc); + cpu_relax(); + goto again; + } + if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) { desc->status &= ~IRQ_MASKED; desc->chip->unmask(irq); @@ -884,6 +925,12 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) desc->chip->disable(irq); } +#ifdef CONFIG_SMP + /* make sure affinity_hint is cleaned up */ + if (WARN_ON_ONCE(desc->affinity_hint)) + desc->affinity_hint = NULL; +#endif + raw_spin_unlock_irqrestore(&desc->lock, flags); unregister_handler_proc(irq, action); @@ -995,7 +1042,6 @@ EXPORT_SYMBOL(free_irq); * Flags: * * IRQF_SHARED Interrupt is shared - * IRQF_DISABLED Disable local interrupts while processing * IRQF_SAMPLE_RANDOM The interrupt can be used for entropy * IRQF_TRIGGER_* Specify active edge(s) or level * @@ -1009,25 +1055,6 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler, int retval; /* - * handle_IRQ_event() always ignores IRQF_DISABLED except for - * the _first_ irqaction (sigh). That can cause oopsing, but - * the behavior is classified as "will not fix" so we need to - * start nudging drivers away from using that idiom. - */ - if ((irqflags & (IRQF_SHARED|IRQF_DISABLED)) == - (IRQF_SHARED|IRQF_DISABLED)) { - pr_warning( - "IRQ %d/%s: IRQF_DISABLED is not guaranteed on shared IRQs\n", - irq, devname); - } - -#ifdef CONFIG_LOCKDEP - /* - * Lockdep wants atomic interrupt handlers: - */ - irqflags |= IRQF_DISABLED; -#endif - /* * Sanity-check: shared interrupts must pass in a real dev-ID, * otherwise we'll have trouble later trying to figure out * which interrupt is which (messes up the interrupt freeing @@ -1088,3 +1115,40 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler, return retval; } EXPORT_SYMBOL(request_threaded_irq); + +/** + * request_any_context_irq - allocate an interrupt line + * @irq: Interrupt line to allocate + * @handler: Function to be called when the IRQ occurs. + * Threaded handler for threaded interrupts. + * @flags: Interrupt type flags + * @name: An ascii name for the claiming device + * @dev_id: A cookie passed back to the handler function + * + * This call allocates interrupt resources and enables the + * interrupt line and IRQ handling. It selects either a + * hardirq or threaded handling method depending on the + * context. + * + * On failure, it returns a negative value. On success, + * it returns either IRQC_IS_HARDIRQ or IRQC_IS_NESTED. + */ +int request_any_context_irq(unsigned int irq, irq_handler_t handler, + unsigned long flags, const char *name, void *dev_id) +{ + struct irq_desc *desc = irq_to_desc(irq); + int ret; + + if (!desc) + return -EINVAL; + + if (desc->status & IRQ_NESTED_THREAD) { + ret = request_threaded_irq(irq, NULL, handler, + flags, name, dev_id); + return !ret ? IRQC_IS_NESTED : ret; + } + + ret = request_irq(irq, handler, flags, name, dev_id); + return !ret ? IRQC_IS_HARDIRQ : ret; +} +EXPORT_SYMBOL_GPL(request_any_context_irq); diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c index 963559d..65d3845 100644 --- a/kernel/irq/numa_migrate.c +++ b/kernel/irq/numa_migrate.c @@ -6,6 +6,7 @@ */ #include <linux/irq.h> +#include <linux/slab.h> #include <linux/module.h> #include <linux/random.h> #include <linux/interrupt.h> diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 6f50ecc..09a2ee5 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -7,6 +7,7 @@ */ #include <linux/irq.h> +#include <linux/gfp.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/interrupt.h> @@ -31,6 +32,27 @@ static int irq_affinity_proc_show(struct seq_file *m, void *v) return 0; } +static int irq_affinity_hint_proc_show(struct seq_file *m, void *v) +{ + struct irq_desc *desc = irq_to_desc((long)m->private); + unsigned long flags; + cpumask_var_t mask; + + if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + + raw_spin_lock_irqsave(&desc->lock, flags); + if (desc->affinity_hint) + cpumask_copy(mask, desc->affinity_hint); + raw_spin_unlock_irqrestore(&desc->lock, flags); + + seq_cpumask(m, mask); + seq_putc(m, '\n'); + free_cpumask_var(mask); + + return 0; +} + #ifndef is_affinity_mask_valid #define is_affinity_mask_valid(val) 1 #endif @@ -83,6 +105,11 @@ static int irq_affinity_proc_open(struct inode *inode, struct file *file) return single_open(file, irq_affinity_proc_show, PDE(inode)->data); } +static int irq_affinity_hint_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, irq_affinity_hint_proc_show, PDE(inode)->data); +} + static const struct file_operations irq_affinity_proc_fops = { .open = irq_affinity_proc_open, .read = seq_read, @@ -91,6 +118,13 @@ static const struct file_operations irq_affinity_proc_fops = { .write = irq_affinity_proc_write, }; +static const struct file_operations irq_affinity_hint_proc_fops = { + .open = irq_affinity_hint_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + static int default_affinity_show(struct seq_file *m, void *v) { seq_cpumask(m, irq_default_affinity); @@ -146,6 +180,26 @@ static const struct file_operations default_affinity_proc_fops = { .release = single_release, .write = default_affinity_write, }; + +static int irq_node_proc_show(struct seq_file *m, void *v) +{ + struct irq_desc *desc = irq_to_desc((long) m->private); + + seq_printf(m, "%d\n", desc->node); + return 0; +} + +static int irq_node_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, irq_node_proc_show, PDE(inode)->data); +} + +static const struct file_operations irq_node_proc_fops = { + .open = irq_node_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; #endif static int irq_spurious_proc_show(struct seq_file *m, void *v) @@ -230,6 +284,13 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc) /* create /proc/irq/<irq>/smp_affinity */ proc_create_data("smp_affinity", 0600, desc->dir, &irq_affinity_proc_fops, (void *)(long)irq); + + /* create /proc/irq/<irq>/affinity_hint */ + proc_create_data("affinity_hint", 0400, desc->dir, + &irq_affinity_hint_proc_fops, (void *)(long)irq); + + proc_create_data("node", 0444, desc->dir, + &irq_node_proc_fops, (void *)(long)irq); #endif proc_create_data("spurious", 0444, desc->dir, diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 8e5288a..6f6d091 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -16,11 +16,13 @@ #include <linux/init.h> #include <linux/seq_file.h> #include <linux/fs.h> +#include <linux/kdb.h> #include <linux/err.h> #include <linux/proc_fs.h> #include <linux/sched.h> /* for cond_resched */ #include <linux/mm.h> #include <linux/ctype.h> +#include <linux/slab.h> #include <asm/sections.h> @@ -515,6 +517,26 @@ static int kallsyms_open(struct inode *inode, struct file *file) return ret; } +#ifdef CONFIG_KGDB_KDB +const char *kdb_walk_kallsyms(loff_t *pos) +{ + static struct kallsym_iter kdb_walk_kallsyms_iter; + if (*pos == 0) { + memset(&kdb_walk_kallsyms_iter, 0, + sizeof(kdb_walk_kallsyms_iter)); + reset_iter(&kdb_walk_kallsyms_iter, 0); + } + while (1) { + if (!update_iter(&kdb_walk_kallsyms_iter, *pos)) + return NULL; + ++*pos; + /* Some debugging symbols have no name. Ignore them. */ + if (kdb_walk_kallsyms_iter.name[0]) + return kdb_walk_kallsyms_iter.name; + } +} +#endif /* CONFIG_KGDB_KDB */ + static const struct file_operations kallsyms_operations = { .open = kallsyms_open, .read = seq_read, diff --git a/kernel/kexec.c b/kernel/kexec.c index 87ebe8a..131b170 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1089,9 +1089,10 @@ void crash_kexec(struct pt_regs *regs) size_t crash_get_memory_size(void) { - size_t size; + size_t size = 0; mutex_lock(&kexec_mutex); - size = crashk_res.end - crashk_res.start + 1; + if (crashk_res.end != crashk_res.start) + size = crashk_res.end - crashk_res.start + 1; mutex_unlock(&kexec_mutex); return size; } @@ -1134,11 +1135,9 @@ int crash_shrink_memory(unsigned long new_size) free_reserved_phys_range(end, crashk_res.end); - if (start == end) { - crashk_res.end = end; + if ((start == end) && (crashk_res.parent != NULL)) release_resource(&crashk_res); - } else - crashk_res.end = end - 1; + crashk_res.end = end - 1; unlock: mutex_unlock(&kexec_mutex); diff --git a/kernel/kgdb.c b/kernel/kgdb.c deleted file mode 100644 index 761fdd2..0000000 --- a/kernel/kgdb.c +++ /dev/null @@ -1,1763 +0,0 @@ -/* - * KGDB stub. - * - * Maintainer: Jason Wessel <jason.wessel@windriver.com> - * - * Copyright (C) 2000-2001 VERITAS Software Corporation. - * Copyright (C) 2002-2004 Timesys Corporation - * Copyright (C) 2003-2004 Amit S. Kale <amitkale@linsyssoft.com> - * Copyright (C) 2004 Pavel Machek <pavel@suse.cz> - * Copyright (C) 2004-2006 Tom Rini <trini@kernel.crashing.org> - * Copyright (C) 2004-2006 LinSysSoft Technologies Pvt. Ltd. - * Copyright (C) 2005-2008 Wind River Systems, Inc. - * Copyright (C) 2007 MontaVista Software, Inc. - * Copyright (C) 2008 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> - * - * Contributors at various stages not listed above: - * Jason Wessel ( jason.wessel@windriver.com ) - * George Anzinger <george@mvista.com> - * Anurekh Saxena (anurekh.saxena@timesys.com) - * Lake Stevens Instrument Division (Glenn Engel) - * Jim Kingdon, Cygnus Support. - * - * Original KGDB stub: David Grothe <dave@gcom.com>, - * Tigran Aivazian <tigran@sco.com> - * - * This file is licensed under the terms of the GNU General Public License - * version 2. This program is licensed "as is" without any warranty of any - * kind, whether express or implied. - */ -#include <linux/pid_namespace.h> -#include <linux/clocksource.h> -#include <linux/interrupt.h> -#include <linux/spinlock.h> -#include <linux/console.h> -#include <linux/threads.h> -#include <linux/uaccess.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/ptrace.h> -#include <linux/reboot.h> -#include <linux/string.h> -#include <linux/delay.h> -#include <linux/sched.h> -#include <linux/sysrq.h> -#include <linux/init.h> -#include <linux/kgdb.h> -#include <linux/pid.h> -#include <linux/smp.h> -#include <linux/mm.h> - -#include <asm/cacheflush.h> -#include <asm/byteorder.h> -#include <asm/atomic.h> -#include <asm/system.h> -#include <asm/unaligned.h> - -static int kgdb_break_asap; - -#define KGDB_MAX_THREAD_QUERY 17 -struct kgdb_state { - int ex_vector; - int signo; - int err_code; - int cpu; - int pass_exception; - unsigned long thr_query; - unsigned long threadid; - long kgdb_usethreadid; - struct pt_regs *linux_regs; -}; - -static struct debuggerinfo_struct { - void *debuggerinfo; - struct task_struct *task; -} kgdb_info[NR_CPUS]; - -/** - * kgdb_connected - Is a host GDB connected to us? - */ -int kgdb_connected; -EXPORT_SYMBOL_GPL(kgdb_connected); - -/* All the KGDB handlers are installed */ -static int kgdb_io_module_registered; - -/* Guard for recursive entry */ -static int exception_level; - -static struct kgdb_io *kgdb_io_ops; -static DEFINE_SPINLOCK(kgdb_registration_lock); - -/* kgdb console driver is loaded */ -static int kgdb_con_registered; -/* determine if kgdb console output should be used */ -static int kgdb_use_con; - -static int __init opt_kgdb_con(char *str) -{ - kgdb_use_con = 1; - return 0; -} - -early_param("kgdbcon", opt_kgdb_con); - -module_param(kgdb_use_con, int, 0644); - -/* - * Holds information about breakpoints in a kernel. These breakpoints are - * added and removed by gdb. - */ -static struct kgdb_bkpt kgdb_break[KGDB_MAX_BREAKPOINTS] = { - [0 ... KGDB_MAX_BREAKPOINTS-1] = { .state = BP_UNDEFINED } -}; - -/* - * The CPU# of the active CPU, or -1 if none: - */ -atomic_t kgdb_active = ATOMIC_INIT(-1); - -/* - * We use NR_CPUs not PERCPU, in case kgdb is used to debug early - * bootup code (which might not have percpu set up yet): - */ -static atomic_t passive_cpu_wait[NR_CPUS]; -static atomic_t cpu_in_kgdb[NR_CPUS]; -atomic_t kgdb_setting_breakpoint; - -struct task_struct *kgdb_usethread; -struct task_struct *kgdb_contthread; - -int kgdb_single_step; -pid_t kgdb_sstep_pid; - -/* Our I/O buffers. */ -static char remcom_in_buffer[BUFMAX]; -static char remcom_out_buffer[BUFMAX]; - -/* Storage for the registers, in GDB format. */ -static unsigned long gdb_regs[(NUMREGBYTES + - sizeof(unsigned long) - 1) / - sizeof(unsigned long)]; - -/* to keep track of the CPU which is doing the single stepping*/ -atomic_t kgdb_cpu_doing_single_step = ATOMIC_INIT(-1); - -/* - * If you are debugging a problem where roundup (the collection of - * all other CPUs) is a problem [this should be extremely rare], - * then use the nokgdbroundup option to avoid roundup. In that case - * the other CPUs might interfere with your debugging context, so - * use this with care: - */ -static int kgdb_do_roundup = 1; - -static int __init opt_nokgdbroundup(char *str) -{ - kgdb_do_roundup = 0; - - return 0; -} - -early_param("nokgdbroundup", opt_nokgdbroundup); - -/* - * Finally, some KGDB code :-) - */ - -/* - * Weak aliases for breakpoint management, - * can be overriden by architectures when needed: - */ -int __weak kgdb_arch_set_breakpoint(unsigned long addr, char *saved_instr) -{ - int err; - - err = probe_kernel_read(saved_instr, (char *)addr, BREAK_INSTR_SIZE); - if (err) - return err; - - return probe_kernel_write((char *)addr, arch_kgdb_ops.gdb_bpt_instr, - BREAK_INSTR_SIZE); -} - -int __weak kgdb_arch_remove_breakpoint(unsigned long addr, char *bundle) -{ - return probe_kernel_write((char *)addr, - (char *)bundle, BREAK_INSTR_SIZE); -} - -int __weak kgdb_validate_break_address(unsigned long addr) -{ - char tmp_variable[BREAK_INSTR_SIZE]; - int err; - /* Validate setting the breakpoint and then removing it. In the - * remove fails, the kernel needs to emit a bad message because we - * are deep trouble not being able to put things back the way we - * found them. - */ - err = kgdb_arch_set_breakpoint(addr, tmp_variable); - if (err) - return err; - err = kgdb_arch_remove_breakpoint(addr, tmp_variable); - if (err) - printk(KERN_ERR "KGDB: Critical breakpoint error, kernel " - "memory destroyed at: %lx", addr); - return err; -} - -unsigned long __weak kgdb_arch_pc(int exception, struct pt_regs *regs) -{ - return instruction_pointer(regs); -} - -int __weak kgdb_arch_init(void) -{ - return 0; -} - -int __weak kgdb_skipexception(int exception, struct pt_regs *regs) -{ - return 0; -} - -void __weak -kgdb_post_primary_code(struct pt_regs *regs, int e_vector, int err_code) -{ - return; -} - -/** - * kgdb_disable_hw_debug - Disable hardware debugging while we in kgdb. - * @regs: Current &struct pt_regs. - * - * This function will be called if the particular architecture must - * disable hardware debugging while it is processing gdb packets or - * handling exception. - */ -void __weak kgdb_disable_hw_debug(struct pt_regs *regs) -{ -} - -/* - * GDB remote protocol parser: - */ - -static int hex(char ch) -{ - if ((ch >= 'a') && (ch <= 'f')) - return ch - 'a' + 10; - if ((ch >= '0') && (ch <= '9')) - return ch - '0'; - if ((ch >= 'A') && (ch <= 'F')) - return ch - 'A' + 10; - return -1; -} - -/* scan for the sequence $<data>#<checksum> */ -static void get_packet(char *buffer) -{ - unsigned char checksum; - unsigned char xmitcsum; - int count; - char ch; - - do { - /* - * Spin and wait around for the start character, ignore all - * other characters: - */ - while ((ch = (kgdb_io_ops->read_char())) != '$') - /* nothing */; - - kgdb_connected = 1; - checksum = 0; - xmitcsum = -1; - - count = 0; - - /* - * now, read until a # or end of buffer is found: - */ - while (count < (BUFMAX - 1)) { - ch = kgdb_io_ops->read_char(); - if (ch == '#') - break; - checksum = checksum + ch; - buffer[count] = ch; - count = count + 1; - } - buffer[count] = 0; - - if (ch == '#') { - xmitcsum = hex(kgdb_io_ops->read_char()) << 4; - xmitcsum += hex(kgdb_io_ops->read_char()); - - if (checksum != xmitcsum) - /* failed checksum */ - kgdb_io_ops->write_char('-'); - else - /* successful transfer */ - kgdb_io_ops->write_char('+'); - if (kgdb_io_ops->flush) - kgdb_io_ops->flush(); - } - } while (checksum != xmitcsum); -} - -/* - * Send the packet in buffer. - * Check for gdb connection if asked for. - */ -static void put_packet(char *buffer) -{ - unsigned char checksum; - int count; - char ch; - - /* - * $<packet info>#<checksum>. - */ - while (1) { - kgdb_io_ops->write_char('$'); - checksum = 0; - count = 0; - - while ((ch = buffer[count])) { - kgdb_io_ops->write_char(ch); - checksum += ch; - count++; - } - - kgdb_io_ops->write_char('#'); - kgdb_io_ops->write_char(hex_asc_hi(checksum)); - kgdb_io_ops->write_char(hex_asc_lo(checksum)); - if (kgdb_io_ops->flush) - kgdb_io_ops->flush(); - - /* Now see what we get in reply. */ - ch = kgdb_io_ops->read_char(); - - if (ch == 3) - ch = kgdb_io_ops->read_char(); - - /* If we get an ACK, we are done. */ - if (ch == '+') - return; - - /* - * If we get the start of another packet, this means - * that GDB is attempting to reconnect. We will NAK - * the packet being sent, and stop trying to send this - * packet. - */ - if (ch == '$') { - kgdb_io_ops->write_char('-'); - if (kgdb_io_ops->flush) - kgdb_io_ops->flush(); - return; - } - } -} - -/* - * Convert the memory pointed to by mem into hex, placing result in buf. - * Return a pointer to the last char put in buf (null). May return an error. - */ -int kgdb_mem2hex(char *mem, char *buf, int count) -{ - char *tmp; - int err; - - /* - * We use the upper half of buf as an intermediate buffer for the - * raw memory copy. Hex conversion will work against this one. - */ - tmp = buf + count; - - err = probe_kernel_read(tmp, mem, count); - if (!err) { - while (count > 0) { - buf = pack_hex_byte(buf, *tmp); - tmp++; - count--; - } - - *buf = 0; - } - - return err; -} - -/* - * Copy the binary array pointed to by buf into mem. Fix $, #, and - * 0x7d escaped with 0x7d. Return a pointer to the character after - * the last byte written. - */ -static int kgdb_ebin2mem(char *buf, char *mem, int count) -{ - int err = 0; - char c; - - while (count-- > 0) { - c = *buf++; - if (c == 0x7d) - c = *buf++ ^ 0x20; - - err = probe_kernel_write(mem, &c, 1); - if (err) - break; - - mem++; - } - - return err; -} - -/* - * Convert the hex array pointed to by buf into binary to be placed in mem. - * Return a pointer to the character AFTER the last byte written. - * May return an error. - */ -int kgdb_hex2mem(char *buf, char *mem, int count) -{ - char *tmp_raw; - char *tmp_hex; - - /* - * We use the upper half of buf as an intermediate buffer for the - * raw memory that is converted from hex. - */ - tmp_raw = buf + count * 2; - - tmp_hex = tmp_raw - 1; - while (tmp_hex >= buf) { - tmp_raw--; - *tmp_raw = hex(*tmp_hex--); - *tmp_raw |= hex(*tmp_hex--) << 4; - } - - return probe_kernel_write(mem, tmp_raw, count); -} - -/* - * While we find nice hex chars, build a long_val. - * Return number of chars processed. - */ -int kgdb_hex2long(char **ptr, unsigned long *long_val) -{ - int hex_val; - int num = 0; - int negate = 0; - - *long_val = 0; - - if (**ptr == '-') { - negate = 1; - (*ptr)++; - } - while (**ptr) { - hex_val = hex(**ptr); - if (hex_val < 0) - break; - - *long_val = (*long_val << 4) | hex_val; - num++; - (*ptr)++; - } - - if (negate) - *long_val = -*long_val; - - return num; -} - -/* Write memory due to an 'M' or 'X' packet. */ -static int write_mem_msg(int binary) -{ - char *ptr = &remcom_in_buffer[1]; - unsigned long addr; - unsigned long length; - int err; - - if (kgdb_hex2long(&ptr, &addr) > 0 && *(ptr++) == ',' && - kgdb_hex2long(&ptr, &length) > 0 && *(ptr++) == ':') { - if (binary) - err = kgdb_ebin2mem(ptr, (char *)addr, length); - else - err = kgdb_hex2mem(ptr, (char *)addr, length); - if (err) - return err; - if (CACHE_FLUSH_IS_SAFE) - flush_icache_range(addr, addr + length); - return 0; - } - - return -EINVAL; -} - -static void error_packet(char *pkt, int error) -{ - error = -error; - pkt[0] = 'E'; - pkt[1] = hex_asc[(error / 10)]; - pkt[2] = hex_asc[(error % 10)]; - pkt[3] = '\0'; -} - -/* - * Thread ID accessors. We represent a flat TID space to GDB, where - * the per CPU idle threads (which under Linux all have PID 0) are - * remapped to negative TIDs. - */ - -#define BUF_THREAD_ID_SIZE 16 - -static char *pack_threadid(char *pkt, unsigned char *id) -{ - char *limit; - - limit = pkt + BUF_THREAD_ID_SIZE; - while (pkt < limit) - pkt = pack_hex_byte(pkt, *id++); - - return pkt; -} - -static void int_to_threadref(unsigned char *id, int value) -{ - unsigned char *scan; - int i = 4; - - scan = (unsigned char *)id; - while (i--) - *scan++ = 0; - put_unaligned_be32(value, scan); -} - -static struct task_struct *getthread(struct pt_regs *regs, int tid) -{ - /* - * Non-positive TIDs are remapped to the cpu shadow information - */ - if (tid == 0 || tid == -1) - tid = -atomic_read(&kgdb_active) - 2; - if (tid < -1 && tid > -NR_CPUS - 2) { - if (kgdb_info[-tid - 2].task) - return kgdb_info[-tid - 2].task; - else - return idle_task(-tid - 2); - } - if (tid <= 0) { - printk(KERN_ERR "KGDB: Internal thread select error\n"); - dump_stack(); - return NULL; - } - - /* - * find_task_by_pid_ns() does not take the tasklist lock anymore - * but is nicely RCU locked - hence is a pretty resilient - * thing to use: - */ - return find_task_by_pid_ns(tid, &init_pid_ns); -} - -/* - * CPU debug state control: - */ - -#ifdef CONFIG_SMP -static void kgdb_wait(struct pt_regs *regs) -{ - unsigned long flags; - int cpu; - - local_irq_save(flags); - cpu = raw_smp_processor_id(); - kgdb_info[cpu].debuggerinfo = regs; - kgdb_info[cpu].task = current; - /* - * Make sure the above info reaches the primary CPU before - * our cpu_in_kgdb[] flag setting does: - */ - smp_wmb(); - atomic_set(&cpu_in_kgdb[cpu], 1); - - /* Disable any cpu specific hw breakpoints */ - kgdb_disable_hw_debug(regs); - - /* Wait till primary CPU is done with debugging */ - while (atomic_read(&passive_cpu_wait[cpu])) - cpu_relax(); - - kgdb_info[cpu].debuggerinfo = NULL; - kgdb_info[cpu].task = NULL; - - /* fix up hardware debug registers on local cpu */ - if (arch_kgdb_ops.correct_hw_break) - arch_kgdb_ops.correct_hw_break(); - - /* Signal the primary CPU that we are done: */ - atomic_set(&cpu_in_kgdb[cpu], 0); - touch_softlockup_watchdog_sync(); - clocksource_touch_watchdog(); - local_irq_restore(flags); -} -#endif - -/* - * Some architectures need cache flushes when we set/clear a - * breakpoint: - */ -static void kgdb_flush_swbreak_addr(unsigned long addr) -{ - if (!CACHE_FLUSH_IS_SAFE) - return; - - if (current->mm && current->mm->mmap_cache) { - flush_cache_range(current->mm->mmap_cache, - addr, addr + BREAK_INSTR_SIZE); - } - /* Force flush instruction cache if it was outside the mm */ - flush_icache_range(addr, addr + BREAK_INSTR_SIZE); -} - -/* - * SW breakpoint management: - */ -static int kgdb_activate_sw_breakpoints(void) -{ - unsigned long addr; - int error; - int ret = 0; - int i; - - for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { - if (kgdb_break[i].state != BP_SET) - continue; - - addr = kgdb_break[i].bpt_addr; - error = kgdb_arch_set_breakpoint(addr, - kgdb_break[i].saved_instr); - if (error) { - ret = error; - printk(KERN_INFO "KGDB: BP install failed: %lx", addr); - continue; - } - - kgdb_flush_swbreak_addr(addr); - kgdb_break[i].state = BP_ACTIVE; - } - return ret; -} - -static int kgdb_set_sw_break(unsigned long addr) -{ - int err = kgdb_validate_break_address(addr); - int breakno = -1; - int i; - - if (err) - return err; - - for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { - if ((kgdb_break[i].state == BP_SET) && - (kgdb_break[i].bpt_addr == addr)) - return -EEXIST; - } - for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { - if (kgdb_break[i].state == BP_REMOVED && - kgdb_break[i].bpt_addr == addr) { - breakno = i; - break; - } - } - - if (breakno == -1) { - for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { - if (kgdb_break[i].state == BP_UNDEFINED) { - breakno = i; - break; - } - } - } - - if (breakno == -1) - return -E2BIG; - - kgdb_break[breakno].state = BP_SET; - kgdb_break[breakno].type = BP_BREAKPOINT; - kgdb_break[breakno].bpt_addr = addr; - - return 0; -} - -static int kgdb_deactivate_sw_breakpoints(void) -{ - unsigned long addr; - int error; - int ret = 0; - int i; - - for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { - if (kgdb_break[i].state != BP_ACTIVE) - continue; - addr = kgdb_break[i].bpt_addr; - error = kgdb_arch_remove_breakpoint(addr, - kgdb_break[i].saved_instr); - if (error) { - printk(KERN_INFO "KGDB: BP remove failed: %lx\n", addr); - ret = error; - } - - kgdb_flush_swbreak_addr(addr); - kgdb_break[i].state = BP_SET; - } - return ret; -} - -static int kgdb_remove_sw_break(unsigned long addr) -{ - int i; - - for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { - if ((kgdb_break[i].state == BP_SET) && - (kgdb_break[i].bpt_addr == addr)) { - kgdb_break[i].state = BP_REMOVED; - return 0; - } - } - return -ENOENT; -} - -int kgdb_isremovedbreak(unsigned long addr) -{ - int i; - - for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { - if ((kgdb_break[i].state == BP_REMOVED) && - (kgdb_break[i].bpt_addr == addr)) - return 1; - } - return 0; -} - -static int remove_all_break(void) -{ - unsigned long addr; - int error; - int i; - - /* Clear memory breakpoints. */ - for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { - if (kgdb_break[i].state != BP_ACTIVE) - goto setundefined; - addr = kgdb_break[i].bpt_addr; - error = kgdb_arch_remove_breakpoint(addr, - kgdb_break[i].saved_instr); - if (error) - printk(KERN_ERR "KGDB: breakpoint remove failed: %lx\n", - addr); -setundefined: - kgdb_break[i].state = BP_UNDEFINED; - } - - /* Clear hardware breakpoints. */ - if (arch_kgdb_ops.remove_all_hw_break) - arch_kgdb_ops.remove_all_hw_break(); - - return 0; -} - -/* - * Remap normal tasks to their real PID, - * CPU shadow threads are mapped to -CPU - 2 - */ -static inline int shadow_pid(int realpid) -{ - if (realpid) - return realpid; - - return -raw_smp_processor_id() - 2; -} - -static char gdbmsgbuf[BUFMAX + 1]; - -static void kgdb_msg_write(const char *s, int len) -{ - char *bufptr; - int wcount; - int i; - - /* 'O'utput */ - gdbmsgbuf[0] = 'O'; - - /* Fill and send buffers... */ - while (len > 0) { - bufptr = gdbmsgbuf + 1; - - /* Calculate how many this time */ - if ((len << 1) > (BUFMAX - 2)) - wcount = (BUFMAX - 2) >> 1; - else - wcount = len; - - /* Pack in hex chars */ - for (i = 0; i < wcount; i++) - bufptr = pack_hex_byte(bufptr, s[i]); - *bufptr = '\0'; - - /* Move up */ - s += wcount; - len -= wcount; - - /* Write packet */ - put_packet(gdbmsgbuf); - } -} - -/* - * Return true if there is a valid kgdb I/O module. Also if no - * debugger is attached a message can be printed to the console about - * waiting for the debugger to attach. - * - * The print_wait argument is only to be true when called from inside - * the core kgdb_handle_exception, because it will wait for the - * debugger to attach. - */ -static int kgdb_io_ready(int print_wait) -{ - if (!kgdb_io_ops) - return 0; - if (kgdb_connected) - return 1; - if (atomic_read(&kgdb_setting_breakpoint)) - return 1; - if (print_wait) - printk(KERN_CRIT "KGDB: Waiting for remote debugger\n"); - return 1; -} - -/* - * All the functions that start with gdb_cmd are the various - * operations to implement the handlers for the gdbserial protocol - * where KGDB is communicating with an external debugger - */ - -/* Handle the '?' status packets */ -static void gdb_cmd_status(struct kgdb_state *ks) -{ - /* - * We know that this packet is only sent - * during initial connect. So to be safe, - * we clear out our breakpoints now in case - * GDB is reconnecting. - */ - remove_all_break(); - - remcom_out_buffer[0] = 'S'; - pack_hex_byte(&remcom_out_buffer[1], ks->signo); -} - -/* Handle the 'g' get registers request */ -static void gdb_cmd_getregs(struct kgdb_state *ks) -{ - struct task_struct *thread; - void *local_debuggerinfo; - int i; - - thread = kgdb_usethread; - if (!thread) { - thread = kgdb_info[ks->cpu].task; - local_debuggerinfo = kgdb_info[ks->cpu].debuggerinfo; - } else { - local_debuggerinfo = NULL; - for_each_online_cpu(i) { - /* - * Try to find the task on some other - * or possibly this node if we do not - * find the matching task then we try - * to approximate the results. - */ - if (thread == kgdb_info[i].task) - local_debuggerinfo = kgdb_info[i].debuggerinfo; - } - } - - /* - * All threads that don't have debuggerinfo should be - * in schedule() sleeping, since all other CPUs - * are in kgdb_wait, and thus have debuggerinfo. - */ - if (local_debuggerinfo) { - pt_regs_to_gdb_regs(gdb_regs, local_debuggerinfo); - } else { - /* - * Pull stuff saved during switch_to; nothing - * else is accessible (or even particularly - * relevant). - * - * This should be enough for a stack trace. - */ - sleeping_thread_to_gdb_regs(gdb_regs, thread); - } - kgdb_mem2hex((char *)gdb_regs, remcom_out_buffer, NUMREGBYTES); -} - -/* Handle the 'G' set registers request */ -static void gdb_cmd_setregs(struct kgdb_state *ks) -{ - kgdb_hex2mem(&remcom_in_buffer[1], (char *)gdb_regs, NUMREGBYTES); - - if (kgdb_usethread && kgdb_usethread != current) { - error_packet(remcom_out_buffer, -EINVAL); - } else { - gdb_regs_to_pt_regs(gdb_regs, ks->linux_regs); - strcpy(remcom_out_buffer, "OK"); - } -} - -/* Handle the 'm' memory read bytes */ -static void gdb_cmd_memread(struct kgdb_state *ks) -{ - char *ptr = &remcom_in_buffer[1]; - unsigned long length; - unsigned long addr; - int err; - - if (kgdb_hex2long(&ptr, &addr) > 0 && *ptr++ == ',' && - kgdb_hex2long(&ptr, &length) > 0) { - err = kgdb_mem2hex((char *)addr, remcom_out_buffer, length); - if (err) - error_packet(remcom_out_buffer, err); - } else { - error_packet(remcom_out_buffer, -EINVAL); - } -} - -/* Handle the 'M' memory write bytes */ -static void gdb_cmd_memwrite(struct kgdb_state *ks) -{ - int err = write_mem_msg(0); - - if (err) - error_packet(remcom_out_buffer, err); - else - strcpy(remcom_out_buffer, "OK"); -} - -/* Handle the 'X' memory binary write bytes */ -static void gdb_cmd_binwrite(struct kgdb_state *ks) -{ - int err = write_mem_msg(1); - - if (err) - error_packet(remcom_out_buffer, err); - else - strcpy(remcom_out_buffer, "OK"); -} - -/* Handle the 'D' or 'k', detach or kill packets */ -static void gdb_cmd_detachkill(struct kgdb_state *ks) -{ - int error; - - /* The detach case */ - if (remcom_in_buffer[0] == 'D') { - error = remove_all_break(); - if (error < 0) { - error_packet(remcom_out_buffer, error); - } else { - strcpy(remcom_out_buffer, "OK"); - kgdb_connected = 0; - } - put_packet(remcom_out_buffer); - } else { - /* - * Assume the kill case, with no exit code checking, - * trying to force detach the debugger: - */ - remove_all_break(); - kgdb_connected = 0; - } -} - -/* Handle the 'R' reboot packets */ -static int gdb_cmd_reboot(struct kgdb_state *ks) -{ - /* For now, only honor R0 */ - if (strcmp(remcom_in_buffer, "R0") == 0) { - printk(KERN_CRIT "Executing emergency reboot\n"); - strcpy(remcom_out_buffer, "OK"); - put_packet(remcom_out_buffer); - - /* - * Execution should not return from - * machine_emergency_restart() - */ - machine_emergency_restart(); - kgdb_connected = 0; - - return 1; - } - return 0; -} - -/* Handle the 'q' query packets */ -static void gdb_cmd_query(struct kgdb_state *ks) -{ - struct task_struct *g; - struct task_struct *p; - unsigned char thref[8]; - char *ptr; - int i; - int cpu; - int finished = 0; - - switch (remcom_in_buffer[1]) { - case 's': - case 'f': - if (memcmp(remcom_in_buffer + 2, "ThreadInfo", 10)) { - error_packet(remcom_out_buffer, -EINVAL); - break; - } - - i = 0; - remcom_out_buffer[0] = 'm'; - ptr = remcom_out_buffer + 1; - if (remcom_in_buffer[1] == 'f') { - /* Each cpu is a shadow thread */ - for_each_online_cpu(cpu) { - ks->thr_query = 0; - int_to_threadref(thref, -cpu - 2); - pack_threadid(ptr, thref); - ptr += BUF_THREAD_ID_SIZE; - *(ptr++) = ','; - i++; - } - } - - do_each_thread(g, p) { - if (i >= ks->thr_query && !finished) { - int_to_threadref(thref, p->pid); - pack_threadid(ptr, thref); - ptr += BUF_THREAD_ID_SIZE; - *(ptr++) = ','; - ks->thr_query++; - if (ks->thr_query % KGDB_MAX_THREAD_QUERY == 0) - finished = 1; - } - i++; - } while_each_thread(g, p); - - *(--ptr) = '\0'; - break; - - case 'C': - /* Current thread id */ - strcpy(remcom_out_buffer, "QC"); - ks->threadid = shadow_pid(current->pid); - int_to_threadref(thref, ks->threadid); - pack_threadid(remcom_out_buffer + 2, thref); - break; - case 'T': - if (memcmp(remcom_in_buffer + 1, "ThreadExtraInfo,", 16)) { - error_packet(remcom_out_buffer, -EINVAL); - break; - } - ks->threadid = 0; - ptr = remcom_in_buffer + 17; - kgdb_hex2long(&ptr, &ks->threadid); - if (!getthread(ks->linux_regs, ks->threadid)) { - error_packet(remcom_out_buffer, -EINVAL); - break; - } - if ((int)ks->threadid > 0) { - kgdb_mem2hex(getthread(ks->linux_regs, - ks->threadid)->comm, - remcom_out_buffer, 16); - } else { - static char tmpstr[23 + BUF_THREAD_ID_SIZE]; - - sprintf(tmpstr, "shadowCPU%d", - (int)(-ks->threadid - 2)); - kgdb_mem2hex(tmpstr, remcom_out_buffer, strlen(tmpstr)); - } - break; - } -} - -/* Handle the 'H' task query packets */ -static void gdb_cmd_task(struct kgdb_state *ks) -{ - struct task_struct *thread; - char *ptr; - - switch (remcom_in_buffer[1]) { - case 'g': - ptr = &remcom_in_buffer[2]; - kgdb_hex2long(&ptr, &ks->threadid); - thread = getthread(ks->linux_regs, ks->threadid); - if (!thread && ks->threadid > 0) { - error_packet(remcom_out_buffer, -EINVAL); - break; - } - kgdb_usethread = thread; - ks->kgdb_usethreadid = ks->threadid; - strcpy(remcom_out_buffer, "OK"); - break; - case 'c': - ptr = &remcom_in_buffer[2]; - kgdb_hex2long(&ptr, &ks->threadid); - if (!ks->threadid) { - kgdb_contthread = NULL; - } else { - thread = getthread(ks->linux_regs, ks->threadid); - if (!thread && ks->threadid > 0) { - error_packet(remcom_out_buffer, -EINVAL); - break; - } - kgdb_contthread = thread; - } - strcpy(remcom_out_buffer, "OK"); - break; - } -} - -/* Handle the 'T' thread query packets */ -static void gdb_cmd_thread(struct kgdb_state *ks) -{ - char *ptr = &remcom_in_buffer[1]; - struct task_struct *thread; - - kgdb_hex2long(&ptr, &ks->threadid); - thread = getthread(ks->linux_regs, ks->threadid); - if (thread) - strcpy(remcom_out_buffer, "OK"); - else - error_packet(remcom_out_buffer, -EINVAL); -} - -/* Handle the 'z' or 'Z' breakpoint remove or set packets */ -static void gdb_cmd_break(struct kgdb_state *ks) -{ - /* - * Since GDB-5.3, it's been drafted that '0' is a software - * breakpoint, '1' is a hardware breakpoint, so let's do that. - */ - char *bpt_type = &remcom_in_buffer[1]; - char *ptr = &remcom_in_buffer[2]; - unsigned long addr; - unsigned long length; - int error = 0; - - if (arch_kgdb_ops.set_hw_breakpoint && *bpt_type >= '1') { - /* Unsupported */ - if (*bpt_type > '4') - return; - } else { - if (*bpt_type != '0' && *bpt_type != '1') - /* Unsupported. */ - return; - } - - /* - * Test if this is a hardware breakpoint, and - * if we support it: - */ - if (*bpt_type == '1' && !(arch_kgdb_ops.flags & KGDB_HW_BREAKPOINT)) - /* Unsupported. */ - return; - - if (*(ptr++) != ',') { - error_packet(remcom_out_buffer, -EINVAL); - return; - } - if (!kgdb_hex2long(&ptr, &addr)) { - error_packet(remcom_out_buffer, -EINVAL); - return; - } - if (*(ptr++) != ',' || - !kgdb_hex2long(&ptr, &length)) { - error_packet(remcom_out_buffer, -EINVAL); - return; - } - - if (remcom_in_buffer[0] == 'Z' && *bpt_type == '0') - error = kgdb_set_sw_break(addr); - else if (remcom_in_buffer[0] == 'z' && *bpt_type == '0') - error = kgdb_remove_sw_break(addr); - else if (remcom_in_buffer[0] == 'Z') - error = arch_kgdb_ops.set_hw_breakpoint(addr, - (int)length, *bpt_type - '0'); - else if (remcom_in_buffer[0] == 'z') - error = arch_kgdb_ops.remove_hw_breakpoint(addr, - (int) length, *bpt_type - '0'); - - if (error == 0) - strcpy(remcom_out_buffer, "OK"); - else - error_packet(remcom_out_buffer, error); -} - -/* Handle the 'C' signal / exception passing packets */ -static int gdb_cmd_exception_pass(struct kgdb_state *ks) -{ - /* C09 == pass exception - * C15 == detach kgdb, pass exception - */ - if (remcom_in_buffer[1] == '0' && remcom_in_buffer[2] == '9') { - - ks->pass_exception = 1; - remcom_in_buffer[0] = 'c'; - - } else if (remcom_in_buffer[1] == '1' && remcom_in_buffer[2] == '5') { - - ks->pass_exception = 1; - remcom_in_buffer[0] = 'D'; - remove_all_break(); - kgdb_connected = 0; - return 1; - - } else { - kgdb_msg_write("KGDB only knows signal 9 (pass)" - " and 15 (pass and disconnect)\n" - "Executing a continue without signal passing\n", 0); - remcom_in_buffer[0] = 'c'; - } - - /* Indicate fall through */ - return -1; -} - -/* - * This function performs all gdbserial command procesing - */ -static int gdb_serial_stub(struct kgdb_state *ks) -{ - int error = 0; - int tmp; - - /* Clear the out buffer. */ - memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer)); - - if (kgdb_connected) { - unsigned char thref[8]; - char *ptr; - - /* Reply to host that an exception has occurred */ - ptr = remcom_out_buffer; - *ptr++ = 'T'; - ptr = pack_hex_byte(ptr, ks->signo); - ptr += strlen(strcpy(ptr, "thread:")); - int_to_threadref(thref, shadow_pid(current->pid)); - ptr = pack_threadid(ptr, thref); - *ptr++ = ';'; - put_packet(remcom_out_buffer); - } - - kgdb_usethread = kgdb_info[ks->cpu].task; - ks->kgdb_usethreadid = shadow_pid(kgdb_info[ks->cpu].task->pid); - ks->pass_exception = 0; - - while (1) { - error = 0; - - /* Clear the out buffer. */ - memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer)); - - get_packet(remcom_in_buffer); - - switch (remcom_in_buffer[0]) { - case '?': /* gdbserial status */ - gdb_cmd_status(ks); - break; - case 'g': /* return the value of the CPU registers */ - gdb_cmd_getregs(ks); - break; - case 'G': /* set the value of the CPU registers - return OK */ - gdb_cmd_setregs(ks); - break; - case 'm': /* mAA..AA,LLLL Read LLLL bytes at address AA..AA */ - gdb_cmd_memread(ks); - break; - case 'M': /* MAA..AA,LLLL: Write LLLL bytes at address AA..AA */ - gdb_cmd_memwrite(ks); - break; - case 'X': /* XAA..AA,LLLL: Write LLLL bytes at address AA..AA */ - gdb_cmd_binwrite(ks); - break; - /* kill or detach. KGDB should treat this like a - * continue. - */ - case 'D': /* Debugger detach */ - case 'k': /* Debugger detach via kill */ - gdb_cmd_detachkill(ks); - goto default_handle; - case 'R': /* Reboot */ - if (gdb_cmd_reboot(ks)) - goto default_handle; - break; - case 'q': /* query command */ - gdb_cmd_query(ks); - break; - case 'H': /* task related */ - gdb_cmd_task(ks); - break; - case 'T': /* Query thread status */ - gdb_cmd_thread(ks); - break; - case 'z': /* Break point remove */ - case 'Z': /* Break point set */ - gdb_cmd_break(ks); - break; - case 'C': /* Exception passing */ - tmp = gdb_cmd_exception_pass(ks); - if (tmp > 0) - goto default_handle; - if (tmp == 0) - break; - /* Fall through on tmp < 0 */ - case 'c': /* Continue packet */ - case 's': /* Single step packet */ - if (kgdb_contthread && kgdb_contthread != current) { - /* Can't switch threads in kgdb */ - error_packet(remcom_out_buffer, -EINVAL); - break; - } - kgdb_activate_sw_breakpoints(); - /* Fall through to default processing */ - default: -default_handle: - error = kgdb_arch_handle_exception(ks->ex_vector, - ks->signo, - ks->err_code, - remcom_in_buffer, - remcom_out_buffer, - ks->linux_regs); - /* - * Leave cmd processing on error, detach, - * kill, continue, or single step. - */ - if (error >= 0 || remcom_in_buffer[0] == 'D' || - remcom_in_buffer[0] == 'k') { - error = 0; - goto kgdb_exit; - } - - } - - /* reply to the request */ - put_packet(remcom_out_buffer); - } - -kgdb_exit: - if (ks->pass_exception) - error = 1; - return error; -} - -static int kgdb_reenter_check(struct kgdb_state *ks) -{ - unsigned long addr; - - if (atomic_read(&kgdb_active) != raw_smp_processor_id()) - return 0; - - /* Panic on recursive debugger calls: */ - exception_level++; - addr = kgdb_arch_pc(ks->ex_vector, ks->linux_regs); - kgdb_deactivate_sw_breakpoints(); - - /* - * If the break point removed ok at the place exception - * occurred, try to recover and print a warning to the end - * user because the user planted a breakpoint in a place that - * KGDB needs in order to function. - */ - if (kgdb_remove_sw_break(addr) == 0) { - exception_level = 0; - kgdb_skipexception(ks->ex_vector, ks->linux_regs); - kgdb_activate_sw_breakpoints(); - printk(KERN_CRIT "KGDB: re-enter error: breakpoint removed %lx\n", - addr); - WARN_ON_ONCE(1); - - return 1; - } - remove_all_break(); - kgdb_skipexception(ks->ex_vector, ks->linux_regs); - - if (exception_level > 1) { - dump_stack(); - panic("Recursive entry to debugger"); - } - - printk(KERN_CRIT "KGDB: re-enter exception: ALL breakpoints killed\n"); - dump_stack(); - panic("Recursive entry to debugger"); - - return 1; -} - -/* - * kgdb_handle_exception() - main entry point from a kernel exception - * - * Locking hierarchy: - * interface locks, if any (begin_session) - * kgdb lock (kgdb_active) - */ -int -kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs) -{ - struct kgdb_state kgdb_var; - struct kgdb_state *ks = &kgdb_var; - unsigned long flags; - int sstep_tries = 100; - int error = 0; - int i, cpu; - - ks->cpu = raw_smp_processor_id(); - ks->ex_vector = evector; - ks->signo = signo; - ks->ex_vector = evector; - ks->err_code = ecode; - ks->kgdb_usethreadid = 0; - ks->linux_regs = regs; - - if (kgdb_reenter_check(ks)) - return 0; /* Ouch, double exception ! */ - -acquirelock: - /* - * Interrupts will be restored by the 'trap return' code, except when - * single stepping. - */ - local_irq_save(flags); - - cpu = raw_smp_processor_id(); - - /* - * Acquire the kgdb_active lock: - */ - while (atomic_cmpxchg(&kgdb_active, -1, cpu) != -1) - cpu_relax(); - - /* - * For single stepping, try to only enter on the processor - * that was single stepping. To gaurd against a deadlock, the - * kernel will only try for the value of sstep_tries before - * giving up and continuing on. - */ - if (atomic_read(&kgdb_cpu_doing_single_step) != -1 && - (kgdb_info[cpu].task && - kgdb_info[cpu].task->pid != kgdb_sstep_pid) && --sstep_tries) { - atomic_set(&kgdb_active, -1); - touch_softlockup_watchdog_sync(); - clocksource_touch_watchdog(); - local_irq_restore(flags); - - goto acquirelock; - } - - if (!kgdb_io_ready(1)) { - error = 1; - goto kgdb_restore; /* No I/O connection, so resume the system */ - } - - /* - * Don't enter if we have hit a removed breakpoint. - */ - if (kgdb_skipexception(ks->ex_vector, ks->linux_regs)) - goto kgdb_restore; - - /* Call the I/O driver's pre_exception routine */ - if (kgdb_io_ops->pre_exception) - kgdb_io_ops->pre_exception(); - - kgdb_info[ks->cpu].debuggerinfo = ks->linux_regs; - kgdb_info[ks->cpu].task = current; - - kgdb_disable_hw_debug(ks->linux_regs); - - /* - * Get the passive CPU lock which will hold all the non-primary - * CPU in a spin state while the debugger is active - */ - if (!kgdb_single_step) { - for (i = 0; i < NR_CPUS; i++) - atomic_set(&passive_cpu_wait[i], 1); - } - - /* - * spin_lock code is good enough as a barrier so we don't - * need one here: - */ - atomic_set(&cpu_in_kgdb[ks->cpu], 1); - -#ifdef CONFIG_SMP - /* Signal the other CPUs to enter kgdb_wait() */ - if ((!kgdb_single_step) && kgdb_do_roundup) - kgdb_roundup_cpus(flags); -#endif - - /* - * Wait for the other CPUs to be notified and be waiting for us: - */ - for_each_online_cpu(i) { - while (!atomic_read(&cpu_in_kgdb[i])) - cpu_relax(); - } - - /* - * At this point the primary processor is completely - * in the debugger and all secondary CPUs are quiescent - */ - kgdb_post_primary_code(ks->linux_regs, ks->ex_vector, ks->err_code); - kgdb_deactivate_sw_breakpoints(); - kgdb_single_step = 0; - kgdb_contthread = current; - exception_level = 0; - - /* Talk to debugger with gdbserial protocol */ - error = gdb_serial_stub(ks); - - /* Call the I/O driver's post_exception routine */ - if (kgdb_io_ops->post_exception) - kgdb_io_ops->post_exception(); - - kgdb_info[ks->cpu].debuggerinfo = NULL; - kgdb_info[ks->cpu].task = NULL; - atomic_set(&cpu_in_kgdb[ks->cpu], 0); - - if (!kgdb_single_step) { - for (i = NR_CPUS-1; i >= 0; i--) - atomic_set(&passive_cpu_wait[i], 0); - /* - * Wait till all the CPUs have quit - * from the debugger. - */ - for_each_online_cpu(i) { - while (atomic_read(&cpu_in_kgdb[i])) - cpu_relax(); - } - } - -kgdb_restore: - if (atomic_read(&kgdb_cpu_doing_single_step) != -1) { - int sstep_cpu = atomic_read(&kgdb_cpu_doing_single_step); - if (kgdb_info[sstep_cpu].task) - kgdb_sstep_pid = kgdb_info[sstep_cpu].task->pid; - else - kgdb_sstep_pid = 0; - } - /* Free kgdb_active */ - atomic_set(&kgdb_active, -1); - touch_softlockup_watchdog_sync(); - clocksource_touch_watchdog(); - local_irq_restore(flags); - - return error; -} - -int kgdb_nmicallback(int cpu, void *regs) -{ -#ifdef CONFIG_SMP - if (!atomic_read(&cpu_in_kgdb[cpu]) && - atomic_read(&kgdb_active) != cpu && - atomic_read(&cpu_in_kgdb[atomic_read(&kgdb_active)])) { - kgdb_wait((struct pt_regs *)regs); - return 0; - } -#endif - return 1; -} - -static void kgdb_console_write(struct console *co, const char *s, - unsigned count) -{ - unsigned long flags; - - /* If we're debugging, or KGDB has not connected, don't try - * and print. */ - if (!kgdb_connected || atomic_read(&kgdb_active) != -1) - return; - - local_irq_save(flags); - kgdb_msg_write(s, count); - local_irq_restore(flags); -} - -static struct console kgdbcons = { - .name = "kgdb", - .write = kgdb_console_write, - .flags = CON_PRINTBUFFER | CON_ENABLED, - .index = -1, -}; - -#ifdef CONFIG_MAGIC_SYSRQ -static void sysrq_handle_gdb(int key, struct tty_struct *tty) -{ - if (!kgdb_io_ops) { - printk(KERN_CRIT "ERROR: No KGDB I/O module available\n"); - return; - } - if (!kgdb_connected) - printk(KERN_CRIT "Entering KGDB\n"); - - kgdb_breakpoint(); -} - -static struct sysrq_key_op sysrq_gdb_op = { - .handler = sysrq_handle_gdb, - .help_msg = "debug(G)", - .action_msg = "DEBUG", -}; -#endif - -static void kgdb_register_callbacks(void) -{ - if (!kgdb_io_module_registered) { - kgdb_io_module_registered = 1; - kgdb_arch_init(); -#ifdef CONFIG_MAGIC_SYSRQ - register_sysrq_key('g', &sysrq_gdb_op); -#endif - if (kgdb_use_con && !kgdb_con_registered) { - register_console(&kgdbcons); - kgdb_con_registered = 1; - } - } -} - -static void kgdb_unregister_callbacks(void) -{ - /* - * When this routine is called KGDB should unregister from the - * panic handler and clean up, making sure it is not handling any - * break exceptions at the time. - */ - if (kgdb_io_module_registered) { - kgdb_io_module_registered = 0; - kgdb_arch_exit(); -#ifdef CONFIG_MAGIC_SYSRQ - unregister_sysrq_key('g', &sysrq_gdb_op); -#endif - if (kgdb_con_registered) { - unregister_console(&kgdbcons); - kgdb_con_registered = 0; - } - } -} - -static void kgdb_initial_breakpoint(void) -{ - kgdb_break_asap = 0; - - printk(KERN_CRIT "kgdb: Waiting for connection from remote gdb...\n"); - kgdb_breakpoint(); -} - -/** - * kgdb_register_io_module - register KGDB IO module - * @new_kgdb_io_ops: the io ops vector - * - * Register it with the KGDB core. - */ -int kgdb_register_io_module(struct kgdb_io *new_kgdb_io_ops) -{ - int err; - - spin_lock(&kgdb_registration_lock); - - if (kgdb_io_ops) { - spin_unlock(&kgdb_registration_lock); - - printk(KERN_ERR "kgdb: Another I/O driver is already " - "registered with KGDB.\n"); - return -EBUSY; - } - - if (new_kgdb_io_ops->init) { - err = new_kgdb_io_ops->init(); - if (err) { - spin_unlock(&kgdb_registration_lock); - return err; - } - } - - kgdb_io_ops = new_kgdb_io_ops; - - spin_unlock(&kgdb_registration_lock); - - printk(KERN_INFO "kgdb: Registered I/O driver %s.\n", - new_kgdb_io_ops->name); - - /* Arm KGDB now. */ - kgdb_register_callbacks(); - - if (kgdb_break_asap) - kgdb_initial_breakpoint(); - - return 0; -} -EXPORT_SYMBOL_GPL(kgdb_register_io_module); - -/** - * kkgdb_unregister_io_module - unregister KGDB IO module - * @old_kgdb_io_ops: the io ops vector - * - * Unregister it with the KGDB core. - */ -void kgdb_unregister_io_module(struct kgdb_io *old_kgdb_io_ops) -{ - BUG_ON(kgdb_connected); - - /* - * KGDB is no longer able to communicate out, so - * unregister our callbacks and reset state. - */ - kgdb_unregister_callbacks(); - - spin_lock(&kgdb_registration_lock); - - WARN_ON_ONCE(kgdb_io_ops != old_kgdb_io_ops); - kgdb_io_ops = NULL; - - spin_unlock(&kgdb_registration_lock); - - printk(KERN_INFO - "kgdb: Unregistered I/O driver %s, debugger disabled.\n", - old_kgdb_io_ops->name); -} -EXPORT_SYMBOL_GPL(kgdb_unregister_io_module); - -/** - * kgdb_breakpoint - generate breakpoint exception - * - * This function will generate a breakpoint exception. It is used at the - * beginning of a program to sync up with a debugger and can be used - * otherwise as a quick means to stop program execution and "break" into - * the debugger. - */ -void kgdb_breakpoint(void) -{ - atomic_set(&kgdb_setting_breakpoint, 1); - wmb(); /* Sync point before breakpoint */ - arch_kgdb_breakpoint(); - wmb(); /* Sync point after breakpoint */ - atomic_set(&kgdb_setting_breakpoint, 0); -} -EXPORT_SYMBOL_GPL(kgdb_breakpoint); - -static int __init opt_kgdb_wait(char *str) -{ - kgdb_break_asap = 1; - - if (kgdb_io_module_registered) - kgdb_initial_breakpoint(); - - return 0; -} - -early_param("kgdbwait", opt_kgdb_wait); diff --git a/kernel/kmod.c b/kernel/kmod.c index bf0e231..6e9b196 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -116,27 +116,16 @@ int __request_module(bool wait, const char *fmt, ...) trace_module_request(module_name, wait, _RET_IP_); - ret = call_usermodehelper(modprobe_path, argv, envp, - wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC); + ret = call_usermodehelper_fns(modprobe_path, argv, envp, + wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC, + NULL, NULL, NULL); + atomic_dec(&kmod_concurrent); return ret; } EXPORT_SYMBOL(__request_module); #endif /* CONFIG_MODULES */ -struct subprocess_info { - struct work_struct work; - struct completion *complete; - struct cred *cred; - char *path; - char **argv; - char **envp; - enum umh_wait wait; - int retval; - struct file *stdin; - void (*cleanup)(char **argv, char **envp); -}; - /* * This is the task which runs the usermode application */ @@ -145,36 +134,10 @@ static int ____call_usermodehelper(void *data) struct subprocess_info *sub_info = data; int retval; - BUG_ON(atomic_read(&sub_info->cred->usage) != 1); - - /* Unblock all signals */ spin_lock_irq(¤t->sighand->siglock); flush_signal_handlers(current, 1); - sigemptyset(¤t->blocked); - recalc_sigpending(); spin_unlock_irq(¤t->sighand->siglock); - /* Install the credentials */ - commit_creds(sub_info->cred); - sub_info->cred = NULL; - - /* Install input pipe when needed */ - if (sub_info->stdin) { - struct files_struct *f = current->files; - struct fdtable *fdt; - /* no races because files should be private here */ - sys_close(0); - fd_install(0, sub_info->stdin); - spin_lock(&f->file_lock); - fdt = files_fdtable(f); - FD_SET(0, fdt->open_fds); - FD_CLR(0, fdt->close_on_exec); - spin_unlock(&f->file_lock); - - /* and disallow core files too */ - current->signal->rlim[RLIMIT_CORE] = (struct rlimit){0, 0}; - } - /* We can run anywhere, unlike our parent keventd(). */ set_cpus_allowed_ptr(current, cpu_all_mask); @@ -184,9 +147,16 @@ static int ____call_usermodehelper(void *data) */ set_user_nice(current, 0); + if (sub_info->init) { + retval = sub_info->init(sub_info); + if (retval) + goto fail; + } + retval = kernel_execve(sub_info->path, sub_info->argv, sub_info->envp); /* Exec failed? */ +fail: sub_info->retval = retval; do_exit(0); } @@ -194,9 +164,7 @@ static int ____call_usermodehelper(void *data) void call_usermodehelper_freeinfo(struct subprocess_info *info) { if (info->cleanup) - (*info->cleanup)(info->argv, info->envp); - if (info->cred) - put_cred(info->cred); + (*info->cleanup)(info); kfree(info); } EXPORT_SYMBOL(call_usermodehelper_freeinfo); @@ -207,16 +175,16 @@ static int wait_for_helper(void *data) struct subprocess_info *sub_info = data; pid_t pid; - /* Install a handler: if SIGCLD isn't handled sys_wait4 won't - * populate the status, but will return -ECHILD. */ - allow_signal(SIGCHLD); + /* If SIGCLD is ignored sys_wait4 won't populate the status. */ + spin_lock_irq(¤t->sighand->siglock); + current->sighand->action[SIGCHLD-1].sa.sa_handler = SIG_DFL; + spin_unlock_irq(¤t->sighand->siglock); pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD); if (pid < 0) { sub_info->retval = pid; } else { - int ret; - + int ret = -ECHILD; /* * Normally it is bogus to call wait4() from in-kernel because * wait4() wants to write the exit code to a userspace address. @@ -237,10 +205,7 @@ static int wait_for_helper(void *data) sub_info->retval = ret; } - if (sub_info->wait == UMH_NO_WAIT) - call_usermodehelper_freeinfo(sub_info); - else - complete(sub_info->complete); + complete(sub_info->complete); return 0; } @@ -249,15 +214,13 @@ static void __call_usermodehelper(struct work_struct *work) { struct subprocess_info *sub_info = container_of(work, struct subprocess_info, work); - pid_t pid; enum umh_wait wait = sub_info->wait; - - BUG_ON(atomic_read(&sub_info->cred->usage) != 1); + pid_t pid; /* CLONE_VFORK: wait until the usermode helper has execve'd * successfully We need the data structures to stay around * until that is done. */ - if (wait == UMH_WAIT_PROC || wait == UMH_NO_WAIT) + if (wait == UMH_WAIT_PROC) pid = kernel_thread(wait_for_helper, sub_info, CLONE_FS | CLONE_FILES | SIGCHLD); else @@ -266,15 +229,16 @@ static void __call_usermodehelper(struct work_struct *work) switch (wait) { case UMH_NO_WAIT: + call_usermodehelper_freeinfo(sub_info); break; case UMH_WAIT_PROC: if (pid > 0) break; - sub_info->retval = pid; /* FALLTHROUGH */ - case UMH_WAIT_EXEC: + if (pid < 0) + sub_info->retval = pid; complete(sub_info->complete); } } @@ -376,80 +340,37 @@ struct subprocess_info *call_usermodehelper_setup(char *path, char **argv, sub_info->path = path; sub_info->argv = argv; sub_info->envp = envp; - sub_info->cred = prepare_usermodehelper_creds(); - if (!sub_info->cred) { - kfree(sub_info); - return NULL; - } - out: return sub_info; } EXPORT_SYMBOL(call_usermodehelper_setup); /** - * call_usermodehelper_setkeys - set the session keys for usermode helper - * @info: a subprocess_info returned by call_usermodehelper_setup - * @session_keyring: the session keyring for the process - */ -void call_usermodehelper_setkeys(struct subprocess_info *info, - struct key *session_keyring) -{ -#ifdef CONFIG_KEYS - struct thread_group_cred *tgcred = info->cred->tgcred; - key_put(tgcred->session_keyring); - tgcred->session_keyring = key_get(session_keyring); -#else - BUG(); -#endif -} -EXPORT_SYMBOL(call_usermodehelper_setkeys); - -/** - * call_usermodehelper_setcleanup - set a cleanup function + * call_usermodehelper_setfns - set a cleanup/init function * @info: a subprocess_info returned by call_usermodehelper_setup * @cleanup: a cleanup function + * @init: an init function + * @data: arbitrary context sensitive data * - * The cleanup function is just befor ethe subprocess_info is about to + * The init function is used to customize the helper process prior to + * exec. A non-zero return code causes the process to error out, exit, + * and return the failure to the calling process + * + * The cleanup function is just before ethe subprocess_info is about to * be freed. This can be used for freeing the argv and envp. The * Function must be runnable in either a process context or the * context in which call_usermodehelper_exec is called. */ -void call_usermodehelper_setcleanup(struct subprocess_info *info, - void (*cleanup)(char **argv, char **envp)) +void call_usermodehelper_setfns(struct subprocess_info *info, + int (*init)(struct subprocess_info *info), + void (*cleanup)(struct subprocess_info *info), + void *data) { info->cleanup = cleanup; + info->init = init; + info->data = data; } -EXPORT_SYMBOL(call_usermodehelper_setcleanup); - -/** - * call_usermodehelper_stdinpipe - set up a pipe to be used for stdin - * @sub_info: a subprocess_info returned by call_usermodehelper_setup - * @filp: set to the write-end of a pipe - * - * This constructs a pipe, and sets the read end to be the stdin of the - * subprocess, and returns the write-end in *@filp. - */ -int call_usermodehelper_stdinpipe(struct subprocess_info *sub_info, - struct file **filp) -{ - struct file *f; - - f = create_write_pipe(0); - if (IS_ERR(f)) - return PTR_ERR(f); - *filp = f; - - f = create_read_pipe(f, 0); - if (IS_ERR(f)) { - free_write_pipe(*filp); - return PTR_ERR(f); - } - sub_info->stdin = f; - - return 0; -} -EXPORT_SYMBOL(call_usermodehelper_stdinpipe); +EXPORT_SYMBOL(call_usermodehelper_setfns); /** * call_usermodehelper_exec - start a usermode application @@ -469,9 +390,6 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, DECLARE_COMPLETION_ONSTACK(done); int retval = 0; - BUG_ON(atomic_read(&sub_info->cred->usage) != 1); - validate_creds(sub_info->cred); - helper_lock(); if (sub_info->path[0] == '\0') goto out; @@ -498,41 +416,6 @@ unlock: } EXPORT_SYMBOL(call_usermodehelper_exec); -/** - * call_usermodehelper_pipe - call a usermode helper process with a pipe stdin - * @path: path to usermode executable - * @argv: arg vector for process - * @envp: environment for process - * @filp: set to the write-end of a pipe - * - * This is a simple wrapper which executes a usermode-helper function - * with a pipe as stdin. It is implemented entirely in terms of - * lower-level call_usermodehelper_* functions. - */ -int call_usermodehelper_pipe(char *path, char **argv, char **envp, - struct file **filp) -{ - struct subprocess_info *sub_info; - int ret; - - sub_info = call_usermodehelper_setup(path, argv, envp, GFP_KERNEL); - if (sub_info == NULL) - return -ENOMEM; - - ret = call_usermodehelper_stdinpipe(sub_info, filp); - if (ret < 0) { - call_usermodehelper_freeinfo(sub_info); - return ret; - } - - ret = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC); - if (ret < 0) /* Failed to execute helper, close pipe */ - filp_close(*filp, NULL); - - return ret; -} -EXPORT_SYMBOL(call_usermodehelper_pipe); - void __init usermodehelper_init(void) { khelper_wq = create_singlethread_workqueue("khelper"); diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 0ed46f3..282035f 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1588,6 +1588,72 @@ static void __kprobes kill_kprobe(struct kprobe *p) arch_remove_kprobe(p); } +/* Disable one kprobe */ +int __kprobes disable_kprobe(struct kprobe *kp) +{ + int ret = 0; + struct kprobe *p; + + mutex_lock(&kprobe_mutex); + + /* Check whether specified probe is valid. */ + p = __get_valid_kprobe(kp); + if (unlikely(p == NULL)) { + ret = -EINVAL; + goto out; + } + + /* If the probe is already disabled (or gone), just return */ + if (kprobe_disabled(kp)) + goto out; + + kp->flags |= KPROBE_FLAG_DISABLED; + if (p != kp) + /* When kp != p, p is always enabled. */ + try_to_disable_aggr_kprobe(p); + + if (!kprobes_all_disarmed && kprobe_disabled(p)) + disarm_kprobe(p); +out: + mutex_unlock(&kprobe_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(disable_kprobe); + +/* Enable one kprobe */ +int __kprobes enable_kprobe(struct kprobe *kp) +{ + int ret = 0; + struct kprobe *p; + + mutex_lock(&kprobe_mutex); + + /* Check whether specified probe is valid. */ + p = __get_valid_kprobe(kp); + if (unlikely(p == NULL)) { + ret = -EINVAL; + goto out; + } + + if (kprobe_gone(kp)) { + /* This kprobe has gone, we couldn't enable it. */ + ret = -EINVAL; + goto out; + } + + if (p != kp) + kp->flags &= ~KPROBE_FLAG_DISABLED; + + if (!kprobes_all_disarmed && kprobe_disabled(p)) { + p->flags &= ~KPROBE_FLAG_DISABLED; + arm_kprobe(p); + } +out: + mutex_unlock(&kprobe_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(enable_kprobe); + void __kprobes dump_kprobe(struct kprobe *kp) { printk(KERN_WARNING "Dumping kprobe:\n"); @@ -1805,72 +1871,6 @@ static const struct file_operations debugfs_kprobes_operations = { .release = seq_release, }; -/* Disable one kprobe */ -int __kprobes disable_kprobe(struct kprobe *kp) -{ - int ret = 0; - struct kprobe *p; - - mutex_lock(&kprobe_mutex); - - /* Check whether specified probe is valid. */ - p = __get_valid_kprobe(kp); - if (unlikely(p == NULL)) { - ret = -EINVAL; - goto out; - } - - /* If the probe is already disabled (or gone), just return */ - if (kprobe_disabled(kp)) - goto out; - - kp->flags |= KPROBE_FLAG_DISABLED; - if (p != kp) - /* When kp != p, p is always enabled. */ - try_to_disable_aggr_kprobe(p); - - if (!kprobes_all_disarmed && kprobe_disabled(p)) - disarm_kprobe(p); -out: - mutex_unlock(&kprobe_mutex); - return ret; -} -EXPORT_SYMBOL_GPL(disable_kprobe); - -/* Enable one kprobe */ -int __kprobes enable_kprobe(struct kprobe *kp) -{ - int ret = 0; - struct kprobe *p; - - mutex_lock(&kprobe_mutex); - - /* Check whether specified probe is valid. */ - p = __get_valid_kprobe(kp); - if (unlikely(p == NULL)) { - ret = -EINVAL; - goto out; - } - - if (kprobe_gone(kp)) { - /* This kprobe has gone, we couldn't enable it. */ - ret = -EINVAL; - goto out; - } - - if (p != kp) - kp->flags &= ~KPROBE_FLAG_DISABLED; - - if (!kprobes_all_disarmed && kprobe_disabled(p)) { - p->flags &= ~KPROBE_FLAG_DISABLED; - arm_kprobe(p); - } -out: - mutex_unlock(&kprobe_mutex); - return ret; -} -EXPORT_SYMBOL_GPL(enable_kprobe); - static void __kprobes arm_all_kprobes(void) { struct hlist_head *head; diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 21fe3c4..0b624e7 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -138,7 +138,8 @@ extern const void __start_notes __attribute__((weak)); extern const void __stop_notes __attribute__((weak)); #define notes_size (&__stop_notes - &__start_notes) -static ssize_t notes_read(struct kobject *kobj, struct bin_attribute *bin_attr, +static ssize_t notes_read(struct file *filp, struct kobject *kobj, + struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) { memcpy(buf, &__start_notes + off, count); diff --git a/kernel/kthread.c b/kernel/kthread.c index 82ed0ea..83911c78 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -219,7 +219,7 @@ int kthreadd(void *unused) set_task_comm(tsk, "kthreadd"); ignore_signals(tsk); set_cpus_allowed_ptr(tsk, cpu_all_mask); - set_mems_allowed(node_possible_map); + set_mems_allowed(node_states[N_HIGH_MEMORY]); current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG; diff --git a/kernel/latencytop.c b/kernel/latencytop.c index ca07c5c..877fb30 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -56,7 +56,6 @@ #include <linux/module.h> #include <linux/sched.h> #include <linux/list.h> -#include <linux/slab.h> #include <linux/stacktrace.h> static DEFINE_SPINLOCK(latency_lock); diff --git a/kernel/lockdep.c b/kernel/lockdep.c index c927a549..5428679 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -43,6 +43,7 @@ #include <linux/ftrace.h> #include <linux/stringify.h> #include <linux/bitops.h> +#include <linux/gfp.h> #include <asm/sections.h> @@ -430,20 +431,7 @@ static struct stack_trace lockdep_init_trace = { /* * Various lockdep statistics: */ -atomic_t chain_lookup_hits; -atomic_t chain_lookup_misses; -atomic_t hardirqs_on_events; -atomic_t hardirqs_off_events; -atomic_t redundant_hardirqs_on; -atomic_t redundant_hardirqs_off; -atomic_t softirqs_on_events; -atomic_t softirqs_off_events; -atomic_t redundant_softirqs_on; -atomic_t redundant_softirqs_off; -atomic_t nr_unused_locks; -atomic_t nr_cyclic_checks; -atomic_t nr_find_usage_forwards_checks; -atomic_t nr_find_usage_backwards_checks; +DEFINE_PER_CPU(struct lockdep_stats, lockdep_stats); #endif /* @@ -582,9 +570,6 @@ static int static_obj(void *obj) unsigned long start = (unsigned long) &_stext, end = (unsigned long) &_end, addr = (unsigned long) obj; -#ifdef CONFIG_SMP - int i; -#endif /* * static variable? @@ -595,24 +580,16 @@ static int static_obj(void *obj) if (arch_is_kernel_data(addr)) return 1; -#ifdef CONFIG_SMP /* - * percpu var? + * in-kernel percpu var? */ - for_each_possible_cpu(i) { - start = (unsigned long) &__per_cpu_start + per_cpu_offset(i); - end = (unsigned long) &__per_cpu_start + PERCPU_ENOUGH_ROOM - + per_cpu_offset(i); - - if ((addr >= start) && (addr < end)) - return 1; - } -#endif + if (is_kernel_percpu_address(addr)) + return 1; /* - * module var? + * module static or percpu var? */ - return is_module_address(addr); + return is_module_address(addr) || is_module_percpu_address(addr); } /* @@ -758,7 +735,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) return NULL; } class = lock_classes + nr_lock_classes++; - debug_atomic_inc(&nr_unused_locks); + debug_atomic_inc(nr_unused_locks); class->key = key; class->name = lock->name; class->subclass = subclass; @@ -828,7 +805,8 @@ static struct lock_list *alloc_list_entry(void) * Add a new dependency to the head of the list: */ static int add_lock_to_list(struct lock_class *class, struct lock_class *this, - struct list_head *head, unsigned long ip, int distance) + struct list_head *head, unsigned long ip, + int distance, struct stack_trace *trace) { struct lock_list *entry; /* @@ -839,11 +817,9 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this, if (!entry) return 0; - if (!save_trace(&entry->trace)) - return 0; - entry->class = this; entry->distance = distance; + entry->trace = *trace; /* * Since we never remove from the dependency list, the list can * be walked lockless by other CPUs, it's only allocation @@ -1215,7 +1191,7 @@ check_noncircular(struct lock_list *root, struct lock_class *target, { int result; - debug_atomic_inc(&nr_cyclic_checks); + debug_atomic_inc(nr_cyclic_checks); result = __bfs_forwards(root, target, class_equal, target_entry); @@ -1252,7 +1228,7 @@ find_usage_forwards(struct lock_list *root, enum lock_usage_bit bit, { int result; - debug_atomic_inc(&nr_find_usage_forwards_checks); + debug_atomic_inc(nr_find_usage_forwards_checks); result = __bfs_forwards(root, (void *)bit, usage_match, target_entry); @@ -1275,7 +1251,7 @@ find_usage_backwards(struct lock_list *root, enum lock_usage_bit bit, { int result; - debug_atomic_inc(&nr_find_usage_backwards_checks); + debug_atomic_inc(nr_find_usage_backwards_checks); result = __bfs_backwards(root, (void *)bit, usage_match, target_entry); @@ -1645,12 +1621,20 @@ check_deadlock(struct task_struct *curr, struct held_lock *next, */ static int check_prev_add(struct task_struct *curr, struct held_lock *prev, - struct held_lock *next, int distance) + struct held_lock *next, int distance, int trylock_loop) { struct lock_list *entry; int ret; struct lock_list this; struct lock_list *uninitialized_var(target_entry); + /* + * Static variable, serialized by the graph_lock(). + * + * We use this static variable to save the stack trace in case + * we call into this function multiple times due to encountering + * trylocks in the held lock stack. + */ + static struct stack_trace trace; /* * Prove that the new <prev> -> <next> dependency would not @@ -1698,20 +1682,23 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, } } + if (!trylock_loop && !save_trace(&trace)) + return 0; + /* * Ok, all validations passed, add the new lock * to the previous lock's dependency list: */ ret = add_lock_to_list(hlock_class(prev), hlock_class(next), &hlock_class(prev)->locks_after, - next->acquire_ip, distance); + next->acquire_ip, distance, &trace); if (!ret) return 0; ret = add_lock_to_list(hlock_class(next), hlock_class(prev), &hlock_class(next)->locks_before, - next->acquire_ip, distance); + next->acquire_ip, distance, &trace); if (!ret) return 0; @@ -1741,6 +1728,7 @@ static int check_prevs_add(struct task_struct *curr, struct held_lock *next) { int depth = curr->lockdep_depth; + int trylock_loop = 0; struct held_lock *hlock; /* @@ -1766,7 +1754,8 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next) * added: */ if (hlock->read != 2) { - if (!check_prev_add(curr, hlock, next, distance)) + if (!check_prev_add(curr, hlock, next, + distance, trylock_loop)) return 0; /* * Stop after the first non-trylock entry, @@ -1789,6 +1778,7 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next) if (curr->held_locks[depth].irq_context != curr->held_locks[depth-1].irq_context) break; + trylock_loop = 1; } return 1; out_bug: @@ -1835,7 +1825,7 @@ static inline int lookup_chain_cache(struct task_struct *curr, list_for_each_entry(chain, hash_head, entry) { if (chain->chain_key == chain_key) { cache_hit: - debug_atomic_inc(&chain_lookup_hits); + debug_atomic_inc(chain_lookup_hits); if (very_verbose(class)) printk("\nhash chain already cached, key: " "%016Lx tail class: [%p] %s\n", @@ -1900,7 +1890,7 @@ cache_hit: chain_hlocks[chain->base + j] = class - lock_classes; } list_add_tail_rcu(&chain->entry, hash_head); - debug_atomic_inc(&chain_lookup_misses); + debug_atomic_inc(chain_lookup_misses); inc_chains(); return 1; @@ -2321,7 +2311,12 @@ void trace_hardirqs_on_caller(unsigned long ip) return; if (unlikely(curr->hardirqs_enabled)) { - debug_atomic_inc(&redundant_hardirqs_on); + /* + * Neither irq nor preemption are disabled here + * so this is racy by nature but loosing one hit + * in a stat is not a big deal. + */ + __debug_atomic_inc(redundant_hardirqs_on); return; } /* we'll do an OFF -> ON transition: */ @@ -2348,7 +2343,7 @@ void trace_hardirqs_on_caller(unsigned long ip) curr->hardirq_enable_ip = ip; curr->hardirq_enable_event = ++curr->irq_events; - debug_atomic_inc(&hardirqs_on_events); + debug_atomic_inc(hardirqs_on_events); } EXPORT_SYMBOL(trace_hardirqs_on_caller); @@ -2380,9 +2375,9 @@ void trace_hardirqs_off_caller(unsigned long ip) curr->hardirqs_enabled = 0; curr->hardirq_disable_ip = ip; curr->hardirq_disable_event = ++curr->irq_events; - debug_atomic_inc(&hardirqs_off_events); + debug_atomic_inc(hardirqs_off_events); } else - debug_atomic_inc(&redundant_hardirqs_off); + debug_atomic_inc(redundant_hardirqs_off); } EXPORT_SYMBOL(trace_hardirqs_off_caller); @@ -2406,7 +2401,7 @@ void trace_softirqs_on(unsigned long ip) return; if (curr->softirqs_enabled) { - debug_atomic_inc(&redundant_softirqs_on); + debug_atomic_inc(redundant_softirqs_on); return; } @@ -2416,7 +2411,7 @@ void trace_softirqs_on(unsigned long ip) curr->softirqs_enabled = 1; curr->softirq_enable_ip = ip; curr->softirq_enable_event = ++curr->irq_events; - debug_atomic_inc(&softirqs_on_events); + debug_atomic_inc(softirqs_on_events); /* * We are going to turn softirqs on, so set the * usage bit for all held locks, if hardirqs are @@ -2446,10 +2441,10 @@ void trace_softirqs_off(unsigned long ip) curr->softirqs_enabled = 0; curr->softirq_disable_ip = ip; curr->softirq_disable_event = ++curr->irq_events; - debug_atomic_inc(&softirqs_off_events); + debug_atomic_inc(softirqs_off_events); DEBUG_LOCKS_WARN_ON(!softirq_count()); } else - debug_atomic_inc(&redundant_softirqs_off); + debug_atomic_inc(redundant_softirqs_off); } static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags) @@ -2654,7 +2649,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, return 0; break; case LOCK_USED: - debug_atomic_dec(&nr_unused_locks); + debug_atomic_dec(nr_unused_locks); break; default: if (!debug_locks_off_graph_unlock()) @@ -2716,6 +2711,8 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name, } EXPORT_SYMBOL_GPL(lockdep_init_map); +struct lock_class_key __lockdep_no_validate__; + /* * This gets called for every mutex_lock*()/spin_lock*() operation. * We maintain the dependency maps and validate the locking attempt: @@ -2750,6 +2747,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, return 0; } + if (lock->key == &__lockdep_no_validate__) + check = 1; + if (!subclass) class = lock->class_cache; /* @@ -2760,7 +2760,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, if (!class) return 0; } - debug_atomic_inc((atomic_t *)&class->ops); + atomic_inc((atomic_t *)&class->ops); if (very_verbose(class)) { printk("\nacquire class [%p] %s", class->key, class->name); if (class->name_version > 1) @@ -3237,7 +3237,7 @@ void lock_release(struct lockdep_map *lock, int nested, raw_local_irq_save(flags); check_flags(flags); current->lockdep_recursion = 1; - trace_lock_release(lock, nested, ip); + trace_lock_release(lock, ip); __lock_release(lock, nested, ip); current->lockdep_recursion = 0; raw_local_irq_restore(flags); @@ -3390,7 +3390,7 @@ found_it: hlock->holdtime_stamp = now; } - trace_lock_acquired(lock, ip, waittime); + trace_lock_acquired(lock, ip); stats = get_lock_stats(hlock_class(hlock)); if (waittime) { @@ -3811,8 +3811,11 @@ void lockdep_rcu_dereference(const char *file, const int line) { struct task_struct *curr = current; +#ifndef CONFIG_PROVE_RCU_REPEATEDLY if (!debug_locks_off()) return; +#endif /* #ifdef CONFIG_PROVE_RCU_REPEATEDLY */ + /* Note: the following can be executed concurrently, so be careful. */ printk("\n===================================================\n"); printk( "[ INFO: suspicious rcu_dereference_check() usage. ]\n"); printk( "---------------------------------------------------\n"); diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h index a2ee95a..4f560cf 100644 --- a/kernel/lockdep_internals.h +++ b/kernel/lockdep_internals.h @@ -110,30 +110,60 @@ lockdep_count_backward_deps(struct lock_class *class) #endif #ifdef CONFIG_DEBUG_LOCKDEP + +#include <asm/local.h> /* - * Various lockdep statistics: + * Various lockdep statistics. + * We want them per cpu as they are often accessed in fast path + * and we want to avoid too much cache bouncing. */ -extern atomic_t chain_lookup_hits; -extern atomic_t chain_lookup_misses; -extern atomic_t hardirqs_on_events; -extern atomic_t hardirqs_off_events; -extern atomic_t redundant_hardirqs_on; -extern atomic_t redundant_hardirqs_off; -extern atomic_t softirqs_on_events; -extern atomic_t softirqs_off_events; -extern atomic_t redundant_softirqs_on; -extern atomic_t redundant_softirqs_off; -extern atomic_t nr_unused_locks; -extern atomic_t nr_cyclic_checks; -extern atomic_t nr_cyclic_check_recursions; -extern atomic_t nr_find_usage_forwards_checks; -extern atomic_t nr_find_usage_forwards_recursions; -extern atomic_t nr_find_usage_backwards_checks; -extern atomic_t nr_find_usage_backwards_recursions; -# define debug_atomic_inc(ptr) atomic_inc(ptr) -# define debug_atomic_dec(ptr) atomic_dec(ptr) -# define debug_atomic_read(ptr) atomic_read(ptr) +struct lockdep_stats { + int chain_lookup_hits; + int chain_lookup_misses; + int hardirqs_on_events; + int hardirqs_off_events; + int redundant_hardirqs_on; + int redundant_hardirqs_off; + int softirqs_on_events; + int softirqs_off_events; + int redundant_softirqs_on; + int redundant_softirqs_off; + int nr_unused_locks; + int nr_cyclic_checks; + int nr_cyclic_check_recursions; + int nr_find_usage_forwards_checks; + int nr_find_usage_forwards_recursions; + int nr_find_usage_backwards_checks; + int nr_find_usage_backwards_recursions; +}; + +DECLARE_PER_CPU(struct lockdep_stats, lockdep_stats); + +#define __debug_atomic_inc(ptr) \ + this_cpu_inc(lockdep_stats.ptr); + +#define debug_atomic_inc(ptr) { \ + WARN_ON_ONCE(!irqs_disabled()); \ + __this_cpu_inc(lockdep_stats.ptr); \ +} + +#define debug_atomic_dec(ptr) { \ + WARN_ON_ONCE(!irqs_disabled()); \ + __this_cpu_dec(lockdep_stats.ptr); \ +} + +#define debug_atomic_read(ptr) ({ \ + struct lockdep_stats *__cpu_lockdep_stats; \ + unsigned long long __total = 0; \ + int __cpu; \ + for_each_possible_cpu(__cpu) { \ + __cpu_lockdep_stats = &per_cpu(lockdep_stats, __cpu); \ + __total += __cpu_lockdep_stats->ptr; \ + } \ + __total; \ +}) #else +# define __debug_atomic_inc(ptr) do { } while (0) # define debug_atomic_inc(ptr) do { } while (0) # define debug_atomic_dec(ptr) do { } while (0) # define debug_atomic_read(ptr) 0 diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index d4aba4f..59b76c8 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c @@ -184,34 +184,34 @@ static const struct file_operations proc_lockdep_chains_operations = { static void lockdep_stats_debug_show(struct seq_file *m) { #ifdef CONFIG_DEBUG_LOCKDEP - unsigned int hi1 = debug_atomic_read(&hardirqs_on_events), - hi2 = debug_atomic_read(&hardirqs_off_events), - hr1 = debug_atomic_read(&redundant_hardirqs_on), - hr2 = debug_atomic_read(&redundant_hardirqs_off), - si1 = debug_atomic_read(&softirqs_on_events), - si2 = debug_atomic_read(&softirqs_off_events), - sr1 = debug_atomic_read(&redundant_softirqs_on), - sr2 = debug_atomic_read(&redundant_softirqs_off); - - seq_printf(m, " chain lookup misses: %11u\n", - debug_atomic_read(&chain_lookup_misses)); - seq_printf(m, " chain lookup hits: %11u\n", - debug_atomic_read(&chain_lookup_hits)); - seq_printf(m, " cyclic checks: %11u\n", - debug_atomic_read(&nr_cyclic_checks)); - seq_printf(m, " find-mask forwards checks: %11u\n", - debug_atomic_read(&nr_find_usage_forwards_checks)); - seq_printf(m, " find-mask backwards checks: %11u\n", - debug_atomic_read(&nr_find_usage_backwards_checks)); - - seq_printf(m, " hardirq on events: %11u\n", hi1); - seq_printf(m, " hardirq off events: %11u\n", hi2); - seq_printf(m, " redundant hardirq ons: %11u\n", hr1); - seq_printf(m, " redundant hardirq offs: %11u\n", hr2); - seq_printf(m, " softirq on events: %11u\n", si1); - seq_printf(m, " softirq off events: %11u\n", si2); - seq_printf(m, " redundant softirq ons: %11u\n", sr1); - seq_printf(m, " redundant softirq offs: %11u\n", sr2); + unsigned long long hi1 = debug_atomic_read(hardirqs_on_events), + hi2 = debug_atomic_read(hardirqs_off_events), + hr1 = debug_atomic_read(redundant_hardirqs_on), + hr2 = debug_atomic_read(redundant_hardirqs_off), + si1 = debug_atomic_read(softirqs_on_events), + si2 = debug_atomic_read(softirqs_off_events), + sr1 = debug_atomic_read(redundant_softirqs_on), + sr2 = debug_atomic_read(redundant_softirqs_off); + + seq_printf(m, " chain lookup misses: %11llu\n", + debug_atomic_read(chain_lookup_misses)); + seq_printf(m, " chain lookup hits: %11llu\n", + debug_atomic_read(chain_lookup_hits)); + seq_printf(m, " cyclic checks: %11llu\n", + debug_atomic_read(nr_cyclic_checks)); + seq_printf(m, " find-mask forwards checks: %11llu\n", + debug_atomic_read(nr_find_usage_forwards_checks)); + seq_printf(m, " find-mask backwards checks: %11llu\n", + debug_atomic_read(nr_find_usage_backwards_checks)); + + seq_printf(m, " hardirq on events: %11llu\n", hi1); + seq_printf(m, " hardirq off events: %11llu\n", hi2); + seq_printf(m, " redundant hardirq ons: %11llu\n", hr1); + seq_printf(m, " redundant hardirq offs: %11llu\n", hr2); + seq_printf(m, " softirq on events: %11llu\n", si1); + seq_printf(m, " softirq off events: %11llu\n", si2); + seq_printf(m, " redundant softirq ons: %11llu\n", sr1); + seq_printf(m, " redundant softirq offs: %11llu\n", sr2); #endif } @@ -263,7 +263,7 @@ static int lockdep_stats_show(struct seq_file *m, void *v) #endif } #ifdef CONFIG_DEBUG_LOCKDEP - DEBUG_LOCKS_WARN_ON(debug_atomic_read(&nr_unused_locks) != nr_unused); + DEBUG_LOCKS_WARN_ON(debug_atomic_read(nr_unused_locks) != nr_unused); #endif seq_printf(m, " lock-classes: %11lu [max: %lu]\n", nr_lock_classes, MAX_LOCKDEP_KEYS); diff --git a/kernel/module.c b/kernel/module.c index c968d36..6c56282 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -59,8 +59,6 @@ #define CREATE_TRACE_POINTS #include <trace/events/module.h> -EXPORT_TRACEPOINT_SYMBOL(module_get); - #if 0 #define DEBUGP printk #else @@ -74,11 +72,19 @@ EXPORT_TRACEPOINT_SYMBOL(module_get); /* If this is set, the section belongs in the init part of the module */ #define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1)) -/* List of modules, protected by module_mutex or preempt_disable +/* + * Mutex protects: + * 1) List of modules (also safely readable with preempt_disable), + * 2) module_use links, + * 3) module_addr_min/module_addr_max. * (delete uses stop_machine/add uses RCU list operations). */ DEFINE_MUTEX(module_mutex); EXPORT_SYMBOL_GPL(module_mutex); static LIST_HEAD(modules); +#ifdef CONFIG_KGDB_KDB +struct list_head *kdb_modules = &modules; /* kdb needs the list of modules */ +#endif /* CONFIG_KGDB_KDB */ + /* Block module loading/unloading? */ int modules_disabled = 0; @@ -88,7 +94,8 @@ static DECLARE_WAIT_QUEUE_HEAD(module_wq); static BLOCKING_NOTIFIER_HEAD(module_notify_list); -/* Bounds of module allocation, for speeding __module_address */ +/* Bounds of module allocation, for speeding __module_address. + * Protected by module_mutex. */ static unsigned long module_addr_min = -1UL, module_addr_max = 0; int register_module_notifier(struct notifier_block * nb) @@ -178,8 +185,6 @@ extern const struct kernel_symbol __start___ksymtab_gpl[]; extern const struct kernel_symbol __stop___ksymtab_gpl[]; extern const struct kernel_symbol __start___ksymtab_gpl_future[]; extern const struct kernel_symbol __stop___ksymtab_gpl_future[]; -extern const struct kernel_symbol __start___ksymtab_gpl_future[]; -extern const struct kernel_symbol __stop___ksymtab_gpl_future[]; extern const unsigned long __start___kcrctab[]; extern const unsigned long __start___kcrctab_gpl[]; extern const unsigned long __start___kcrctab_gpl_future[]; @@ -329,7 +334,7 @@ static bool find_symbol_in_section(const struct symsearch *syms, } /* Find a symbol and return it, along with, (optional) crc and - * (optional) module which owns it */ + * (optional) module which owns it. Needs preempt disabled or module_mutex. */ const struct kernel_symbol *find_symbol(const char *name, struct module **owner, const unsigned long **crc, @@ -370,54 +375,98 @@ EXPORT_SYMBOL_GPL(find_module); #ifdef CONFIG_SMP -static void *percpu_modalloc(unsigned long size, unsigned long align, - const char *name) +static inline void __percpu *mod_percpu(struct module *mod) { - void *ptr; + return mod->percpu; +} +static int percpu_modalloc(struct module *mod, + unsigned long size, unsigned long align) +{ if (align > PAGE_SIZE) { printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n", - name, align, PAGE_SIZE); + mod->name, align, PAGE_SIZE); align = PAGE_SIZE; } - ptr = __alloc_reserved_percpu(size, align); - if (!ptr) + mod->percpu = __alloc_reserved_percpu(size, align); + if (!mod->percpu) { printk(KERN_WARNING "Could not allocate %lu bytes percpu data\n", size); - return ptr; + return -ENOMEM; + } + mod->percpu_size = size; + return 0; } -static void percpu_modfree(void *freeme) +static void percpu_modfree(struct module *mod) { - free_percpu(freeme); + free_percpu(mod->percpu); } static unsigned int find_pcpusec(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, const char *secstrings) { - return find_sec(hdr, sechdrs, secstrings, ".data.percpu"); + return find_sec(hdr, sechdrs, secstrings, ".data..percpu"); } -static void percpu_modcopy(void *pcpudest, const void *from, unsigned long size) +static void percpu_modcopy(struct module *mod, + const void *from, unsigned long size) { int cpu; for_each_possible_cpu(cpu) - memcpy(pcpudest + per_cpu_offset(cpu), from, size); + memcpy(per_cpu_ptr(mod->percpu, cpu), from, size); +} + +/** + * is_module_percpu_address - test whether address is from module static percpu + * @addr: address to test + * + * Test whether @addr belongs to module static percpu area. + * + * RETURNS: + * %true if @addr is from module static percpu area + */ +bool is_module_percpu_address(unsigned long addr) +{ + struct module *mod; + unsigned int cpu; + + preempt_disable(); + + list_for_each_entry_rcu(mod, &modules, list) { + if (!mod->percpu_size) + continue; + for_each_possible_cpu(cpu) { + void *start = per_cpu_ptr(mod->percpu, cpu); + + if ((void *)addr >= start && + (void *)addr < start + mod->percpu_size) { + preempt_enable(); + return true; + } + } + } + + preempt_enable(); + return false; } #else /* ... !CONFIG_SMP */ -static inline void *percpu_modalloc(unsigned long size, unsigned long align, - const char *name) +static inline void __percpu *mod_percpu(struct module *mod) { return NULL; } -static inline void percpu_modfree(void *pcpuptr) +static inline int percpu_modalloc(struct module *mod, + unsigned long size, unsigned long align) +{ + return -ENOMEM; +} +static inline void percpu_modfree(struct module *mod) { - BUG(); } static inline unsigned int find_pcpusec(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, @@ -425,12 +474,16 @@ static inline unsigned int find_pcpusec(Elf_Ehdr *hdr, { return 0; } -static inline void percpu_modcopy(void *pcpudst, const void *src, - unsigned long size) +static inline void percpu_modcopy(struct module *mod, + const void *from, unsigned long size) { /* pcpusec should be 0, and size of that section should be 0. */ BUG_ON(size != 0); } +bool is_module_percpu_address(unsigned long addr) +{ + return false; +} #endif /* CONFIG_SMP */ @@ -467,35 +520,34 @@ MODINFO_ATTR(srcversion); static char last_unloaded_module[MODULE_NAME_LEN+1]; #ifdef CONFIG_MODULE_UNLOAD + +EXPORT_TRACEPOINT_SYMBOL(module_get); + /* Init the unload section of the module. */ static void module_unload_init(struct module *mod) { int cpu; - INIT_LIST_HEAD(&mod->modules_which_use_me); - for_each_possible_cpu(cpu) - per_cpu_ptr(mod->refptr, cpu)->count = 0; + INIT_LIST_HEAD(&mod->source_list); + INIT_LIST_HEAD(&mod->target_list); + for_each_possible_cpu(cpu) { + per_cpu_ptr(mod->refptr, cpu)->incs = 0; + per_cpu_ptr(mod->refptr, cpu)->decs = 0; + } /* Hold reference count during initialization. */ - __this_cpu_write(mod->refptr->count, 1); + __this_cpu_write(mod->refptr->incs, 1); /* Backwards compatibility macros put refcount during init. */ mod->waiter = current; } -/* modules using other modules */ -struct module_use -{ - struct list_head list; - struct module *module_which_uses; -}; - /* Does a already use b? */ static int already_uses(struct module *a, struct module *b) { struct module_use *use; - list_for_each_entry(use, &b->modules_which_use_me, list) { - if (use->module_which_uses == a) { + list_for_each_entry(use, &b->source_list, source_list) { + if (use->source == a) { DEBUGP("%s uses %s!\n", a->name, b->name); return 1; } @@ -504,62 +556,68 @@ static int already_uses(struct module *a, struct module *b) return 0; } -/* Module a uses b */ -int use_module(struct module *a, struct module *b) +/* + * Module a uses b + * - we add 'a' as a "source", 'b' as a "target" of module use + * - the module_use is added to the list of 'b' sources (so + * 'b' can walk the list to see who sourced them), and of 'a' + * targets (so 'a' can see what modules it targets). + */ +static int add_module_usage(struct module *a, struct module *b) { struct module_use *use; - int no_warn, err; - if (b == NULL || already_uses(a, b)) return 1; + DEBUGP("Allocating new usage for %s.\n", a->name); + use = kmalloc(sizeof(*use), GFP_ATOMIC); + if (!use) { + printk(KERN_WARNING "%s: out of memory loading\n", a->name); + return -ENOMEM; + } + + use->source = a; + use->target = b; + list_add(&use->source_list, &b->source_list); + list_add(&use->target_list, &a->target_list); + return 0; +} - /* If we're interrupted or time out, we fail. */ - if (wait_event_interruptible_timeout( - module_wq, (err = strong_try_module_get(b)) != -EBUSY, - 30 * HZ) <= 0) { - printk("%s: gave up waiting for init of module %s.\n", - a->name, b->name); +/* Module a uses b: caller needs module_mutex() */ +int ref_module(struct module *a, struct module *b) +{ + int err; + + if (b == NULL || already_uses(a, b)) return 0; - } - /* If strong_try_module_get() returned a different error, we fail. */ + /* If module isn't available, we fail. */ + err = strong_try_module_get(b); if (err) - return 0; + return err; - DEBUGP("Allocating new usage for %s.\n", a->name); - use = kmalloc(sizeof(*use), GFP_ATOMIC); - if (!use) { - printk("%s: out of memory loading\n", a->name); + err = add_module_usage(a, b); + if (err) { module_put(b); - return 0; + return err; } - - use->module_which_uses = a; - list_add(&use->list, &b->modules_which_use_me); - no_warn = sysfs_create_link(b->holders_dir, &a->mkobj.kobj, a->name); - return 1; + return 0; } -EXPORT_SYMBOL_GPL(use_module); +EXPORT_SYMBOL_GPL(ref_module); /* Clear the unload stuff of the module. */ static void module_unload_free(struct module *mod) { - struct module *i; - - list_for_each_entry(i, &modules, list) { - struct module_use *use; + struct module_use *use, *tmp; - list_for_each_entry(use, &i->modules_which_use_me, list) { - if (use->module_which_uses == mod) { - DEBUGP("%s unusing %s\n", mod->name, i->name); - module_put(i); - list_del(&use->list); - kfree(use); - sysfs_remove_link(i->holders_dir, mod->name); - /* There can be at most one match. */ - break; - } - } + mutex_lock(&module_mutex); + list_for_each_entry_safe(use, tmp, &mod->target_list, target_list) { + struct module *i = use->target; + DEBUGP("%s unusing %s\n", mod->name, i->name); + module_put(i); + list_del(&use->source_list); + list_del(&use->target_list); + kfree(use); } + mutex_unlock(&module_mutex); } #ifdef CONFIG_MODULE_FORCE_UNLOAD @@ -616,12 +674,28 @@ static int try_stop_module(struct module *mod, int flags, int *forced) unsigned int module_refcount(struct module *mod) { - unsigned int total = 0; + unsigned int incs = 0, decs = 0; int cpu; for_each_possible_cpu(cpu) - total += per_cpu_ptr(mod->refptr, cpu)->count; - return total; + decs += per_cpu_ptr(mod->refptr, cpu)->decs; + /* + * ensure the incs are added up after the decs. + * module_put ensures incs are visible before decs with smp_wmb. + * + * This 2-count scheme avoids the situation where the refcount + * for CPU0 is read, then CPU0 increments the module refcount, + * then CPU1 drops that refcount, then the refcount for CPU1 is + * read. We would record a decrement but not its corresponding + * increment so we would see a low count (disaster). + * + * Rare situation? But module_refcount can be preempted, and we + * might be tallying up 4096+ CPUs. So it is not impossible. + */ + smp_rmb(); + for_each_possible_cpu(cpu) + incs += per_cpu_ptr(mod->refptr, cpu)->incs; + return incs - decs; } EXPORT_SYMBOL(module_refcount); @@ -657,16 +731,8 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, return -EFAULT; name[MODULE_NAME_LEN-1] = '\0'; - /* Create stop_machine threads since free_module relies on - * a non-failing stop_machine call. */ - ret = stop_machine_create(); - if (ret) - return ret; - - if (mutex_lock_interruptible(&module_mutex) != 0) { - ret = -EINTR; - goto out_stop; - } + if (mutex_lock_interruptible(&module_mutex) != 0) + return -EINTR; mod = find_module(name); if (!mod) { @@ -674,7 +740,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, goto out; } - if (!list_empty(&mod->modules_which_use_me)) { + if (!list_empty(&mod->source_list)) { /* Other modules depend on us: get rid of them first. */ ret = -EWOULDBLOCK; goto out; @@ -718,16 +784,14 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_GOING, mod); async_synchronize_full(); - mutex_lock(&module_mutex); + /* Store the name of the last unloaded module for diagnostic purposes */ strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module)); - ddebug_remove_module(mod->name); - free_module(mod); - out: + free_module(mod); + return 0; +out: mutex_unlock(&module_mutex); -out_stop: - stop_machine_destroy(); return ret; } @@ -740,9 +804,9 @@ static inline void print_unload_info(struct seq_file *m, struct module *mod) /* Always include a trailing , so userspace can differentiate between this and the old multi-field proc format. */ - list_for_each_entry(use, &mod->modules_which_use_me, list) { + list_for_each_entry(use, &mod->source_list, source_list) { printed_something = 1; - seq_printf(m, "%s,", use->module_which_uses->name); + seq_printf(m, "%s,", use->source->name); } if (mod->init != NULL && mod->exit == NULL) { @@ -798,10 +862,10 @@ void module_put(struct module *module) { if (module) { preempt_disable(); - __this_cpu_dec(module->refptr->count); + smp_wmb(); /* see comment in module_refcount */ + __this_cpu_inc(module->refptr->decs); - trace_module_put(module, _RET_IP_, - __this_cpu_read(module->refptr->count)); + trace_module_put(module, _RET_IP_); /* Maybe they're waiting for us to drop reference? */ if (unlikely(!module_is_live(module))) wake_up_process(module->waiter); @@ -821,11 +885,11 @@ static inline void module_unload_free(struct module *mod) { } -int use_module(struct module *a, struct module *b) +int ref_module(struct module *a, struct module *b) { - return strong_try_module_get(b) == 0; + return strong_try_module_get(b); } -EXPORT_SYMBOL_GPL(use_module); +EXPORT_SYMBOL_GPL(ref_module); static inline void module_unload_init(struct module *mod) { @@ -942,6 +1006,8 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs, { const unsigned long *crc; + /* Since this should be found in kernel (which can't be removed), + * no locking is necessary. */ if (!find_symbol(MODULE_SYMBOL_PREFIX "module_layout", NULL, &crc, true, false)) BUG(); @@ -984,29 +1050,62 @@ static inline int same_magic(const char *amagic, const char *bmagic, } #endif /* CONFIG_MODVERSIONS */ -/* Resolve a symbol for this module. I.e. if we find one, record usage. - Must be holding module_mutex. */ +/* Resolve a symbol for this module. I.e. if we find one, record usage. */ static const struct kernel_symbol *resolve_symbol(Elf_Shdr *sechdrs, unsigned int versindex, const char *name, - struct module *mod) + struct module *mod, + char ownername[]) { struct module *owner; const struct kernel_symbol *sym; const unsigned long *crc; + int err; + mutex_lock(&module_mutex); sym = find_symbol(name, &owner, &crc, !(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)), true); - /* use_module can fail due to OOM, - or module initialization or unloading */ - if (sym) { - if (!check_version(sechdrs, versindex, name, mod, crc, owner) - || !use_module(mod, owner)) - sym = NULL; + if (!sym) + goto unlock; + + if (!check_version(sechdrs, versindex, name, mod, crc, owner)) { + sym = ERR_PTR(-EINVAL); + goto getname; } + + err = ref_module(mod, owner); + if (err) { + sym = ERR_PTR(err); + goto getname; + } + +getname: + /* We must make copy under the lock if we failed to get ref. */ + strncpy(ownername, module_name(owner), MODULE_NAME_LEN); +unlock: + mutex_unlock(&module_mutex); return sym; } +static const struct kernel_symbol *resolve_symbol_wait(Elf_Shdr *sechdrs, + unsigned int versindex, + const char *name, + struct module *mod) +{ + const struct kernel_symbol *ksym; + char ownername[MODULE_NAME_LEN]; + + if (wait_event_interruptible_timeout(module_wq, + !IS_ERR(ksym = resolve_symbol(sechdrs, versindex, name, + mod, ownername)) || + PTR_ERR(ksym) != -EBUSY, + 30 * HZ) <= 0) { + printk(KERN_WARNING "%s: gave up waiting for init of module %s.\n", + mod->name, ownername); + } + return ksym; +} + /* * /sys/module/foo/sections stuff * J. Corbet <corbet@lwn.net> @@ -1125,7 +1224,7 @@ struct module_notes_attrs { struct bin_attribute attrs[0]; }; -static ssize_t module_notes_read(struct kobject *kobj, +static ssize_t module_notes_read(struct file *filp, struct kobject *kobj, struct bin_attribute *bin_attr, char *buf, loff_t pos, size_t count) { @@ -1236,7 +1335,34 @@ static inline void remove_notes_attrs(struct module *mod) #endif #ifdef CONFIG_SYSFS -int module_add_modinfo_attrs(struct module *mod) +static void add_usage_links(struct module *mod) +{ +#ifdef CONFIG_MODULE_UNLOAD + struct module_use *use; + int nowarn; + + mutex_lock(&module_mutex); + list_for_each_entry(use, &mod->target_list, target_list) { + nowarn = sysfs_create_link(use->target->holders_dir, + &mod->mkobj.kobj, mod->name); + } + mutex_unlock(&module_mutex); +#endif +} + +static void del_usage_links(struct module *mod) +{ +#ifdef CONFIG_MODULE_UNLOAD + struct module_use *use; + + mutex_lock(&module_mutex); + list_for_each_entry(use, &mod->target_list, target_list) + sysfs_remove_link(use->target->holders_dir, mod->name); + mutex_unlock(&module_mutex); +#endif +} + +static int module_add_modinfo_attrs(struct module *mod) { struct module_attribute *attr; struct module_attribute *temp_attr; @@ -1262,7 +1388,7 @@ int module_add_modinfo_attrs(struct module *mod) return error; } -void module_remove_modinfo_attrs(struct module *mod) +static void module_remove_modinfo_attrs(struct module *mod) { struct module_attribute *attr; int i; @@ -1278,7 +1404,7 @@ void module_remove_modinfo_attrs(struct module *mod) kfree(mod->modinfo_attrs); } -int mod_sysfs_init(struct module *mod) +static int mod_sysfs_init(struct module *mod) { int err; struct kobject *kobj; @@ -1312,12 +1438,16 @@ out: return err; } -int mod_sysfs_setup(struct module *mod, +static int mod_sysfs_setup(struct module *mod, struct kernel_param *kparam, unsigned int num_params) { int err; + err = mod_sysfs_init(mod); + if (err) + goto out; + mod->holders_dir = kobject_create_and_add("holders", &mod->mkobj.kobj); if (!mod->holders_dir) { err = -ENOMEM; @@ -1332,6 +1462,8 @@ int mod_sysfs_setup(struct module *mod, if (err) goto out_unreg_param; + add_usage_links(mod); + kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD); return 0; @@ -1341,6 +1473,7 @@ out_unreg_holders: kobject_put(mod->holders_dir); out_unreg: kobject_put(&mod->mkobj.kobj); +out: return err; } @@ -1351,14 +1484,40 @@ static void mod_sysfs_fini(struct module *mod) #else /* CONFIG_SYSFS */ +static inline int mod_sysfs_init(struct module *mod) +{ + return 0; +} + +static inline int mod_sysfs_setup(struct module *mod, + struct kernel_param *kparam, + unsigned int num_params) +{ + return 0; +} + +static inline int module_add_modinfo_attrs(struct module *mod) +{ + return 0; +} + +static inline void module_remove_modinfo_attrs(struct module *mod) +{ +} + static void mod_sysfs_fini(struct module *mod) { } +static void del_usage_links(struct module *mod) +{ +} + #endif /* CONFIG_SYSFS */ static void mod_kobject_remove(struct module *mod) { + del_usage_links(mod); module_remove_modinfo_attrs(mod); module_param_sysfs_remove(mod); kobject_put(mod->mkobj.drivers_dir); @@ -1377,17 +1536,22 @@ static int __unlink_module(void *_mod) return 0; } -/* Free a module, remove from lists, etc (must hold module_mutex). */ +/* Free a module, remove from lists, etc. */ static void free_module(struct module *mod) { trace_module_free(mod); /* Delete from various lists */ + mutex_lock(&module_mutex); stop_machine(__unlink_module, mod, NULL); + mutex_unlock(&module_mutex); remove_notes_attrs(mod); remove_sect_attrs(mod); mod_kobject_remove(mod); + /* Remove dynamic debug info */ + ddebug_remove_module(mod->name); + /* Arch-specific cleanup. */ module_arch_cleanup(mod); @@ -1400,8 +1564,7 @@ static void free_module(struct module *mod) /* This may be NULL, but that's OK */ module_free(mod, mod->module_init); kfree(mod->args); - if (mod->percpu) - percpu_modfree(mod->percpu); + percpu_modfree(mod); #if defined(CONFIG_MODULE_UNLOAD) if (mod->refptr) free_percpu(mod->refptr); @@ -1435,6 +1598,8 @@ EXPORT_SYMBOL_GPL(__symbol_get); /* * Ensure that an exported symbol [global namespace] does not already exist * in the kernel or in some other module's exported symbol table. + * + * You must hold the module_mutex. */ static int verify_export_symbols(struct module *mod) { @@ -1500,27 +1665,29 @@ static int simplify_symbols(Elf_Shdr *sechdrs, break; case SHN_UNDEF: - ksym = resolve_symbol(sechdrs, versindex, - strtab + sym[i].st_name, mod); + ksym = resolve_symbol_wait(sechdrs, versindex, + strtab + sym[i].st_name, + mod); /* Ok if resolved. */ - if (ksym) { + if (ksym && !IS_ERR(ksym)) { sym[i].st_value = ksym->value; break; } /* Ok if weak. */ - if (ELF_ST_BIND(sym[i].st_info) == STB_WEAK) + if (!ksym && ELF_ST_BIND(sym[i].st_info) == STB_WEAK) break; - printk(KERN_WARNING "%s: Unknown symbol %s\n", - mod->name, strtab + sym[i].st_name); - ret = -ENOENT; + printk(KERN_WARNING "%s: Unknown symbol %s (err %li)\n", + mod->name, strtab + sym[i].st_name, + PTR_ERR(ksym)); + ret = PTR_ERR(ksym) ?: -ENOENT; break; default: /* Divert to percpu allocation if a percpu var. */ if (sym[i].st_shndx == pcpuindex) - secbase = (unsigned long)mod->percpu; + secbase = (unsigned long)mod_percpu(mod); else secbase = sechdrs[sym[i].st_shndx].sh_addr; sym[i].st_value += secbase; @@ -1897,16 +2064,24 @@ static void dynamic_debug_setup(struct _ddebug *debug, unsigned int num) #endif } +static void dynamic_debug_remove(struct _ddebug *debug) +{ + if (debug) + ddebug_remove_module(debug->modname); +} + static void *module_alloc_update_bounds(unsigned long size) { void *ret = module_alloc(size); if (ret) { + mutex_lock(&module_mutex); /* Update module bounds. */ if ((unsigned long)ret < module_addr_min) module_addr_min = (unsigned long)ret; if ((unsigned long)ret + size > module_addr_max) module_addr_max = (unsigned long)ret + size; + mutex_unlock(&module_mutex); } return ret; } @@ -1954,8 +2129,11 @@ static noinline struct module *load_module(void __user *umod, unsigned int modindex, versindex, infoindex, pcpuindex; struct module *mod; long err = 0; - void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ + void *ptr = NULL; /* Stops spurious gcc warning */ unsigned long symoffs, stroffs, *strmap; + void __percpu *percpu; + struct _ddebug *debug = NULL; + unsigned int num_debug = 0; mm_segment_t old_fs; @@ -2080,11 +2258,6 @@ static noinline struct module *load_module(void __user *umod, goto free_mod; } - if (find_module(mod->name)) { - err = -EEXIST; - goto free_mod; - } - mod->state = MODULE_STATE_COMING; /* Allow arches to frob section contents and sizes. */ @@ -2094,16 +2267,14 @@ static noinline struct module *load_module(void __user *umod, if (pcpuindex) { /* We have a special allocation for this section. */ - percpu = percpu_modalloc(sechdrs[pcpuindex].sh_size, - sechdrs[pcpuindex].sh_addralign, - mod->name); - if (!percpu) { - err = -ENOMEM; + err = percpu_modalloc(mod, sechdrs[pcpuindex].sh_size, + sechdrs[pcpuindex].sh_addralign); + if (err) goto free_mod; - } sechdrs[pcpuindex].sh_flags &= ~(unsigned long)SHF_ALLOC; - mod->percpu = percpu; } + /* Keep this around for failure path. */ + percpu = mod_percpu(mod); /* Determine total sizes, and put offsets in sh_entsize. For now this is done generically; there doesn't appear to be any @@ -2177,11 +2348,6 @@ static noinline struct module *load_module(void __user *umod, /* Now we've moved module, initialize linked lists, etc. */ module_unload_init(mod); - /* add kobject, so we can reference it. */ - err = mod_sysfs_init(mod); - if (err) - goto free_unload; - /* Set up license info based on the info section */ set_license(mod, get_modinfo(sechdrs, infoindex, "license")); @@ -2306,18 +2472,13 @@ static noinline struct module *load_module(void __user *umod, goto cleanup; } - /* Find duplicate symbols */ - err = verify_export_symbols(mod); - if (err < 0) - goto cleanup; - /* Set up and sort exception table */ mod->extable = section_objs(hdr, sechdrs, secstrings, "__ex_table", sizeof(*mod->extable), &mod->num_exentries); sort_extable(mod->extable, mod->extable + mod->num_exentries); /* Finally, copy percpu area over. */ - percpu_modcopy(mod->percpu, (void *)sechdrs[pcpuindex].sh_addr, + percpu_modcopy(mod, (void *)sechdrs[pcpuindex].sh_addr, sechdrs[pcpuindex].sh_size); add_kallsyms(mod, sechdrs, hdr->e_shnum, symindex, strindex, @@ -2325,15 +2486,9 @@ static noinline struct module *load_module(void __user *umod, kfree(strmap); strmap = NULL; - if (!mod->taints) { - struct _ddebug *debug; - unsigned int num_debug; - + if (!mod->taints) debug = section_objs(hdr, sechdrs, secstrings, "__verbose", sizeof(*debug), &num_debug); - if (debug) - dynamic_debug_setup(debug, num_debug); - } err = module_finalize(hdr, sechdrs, mod); if (err < 0) @@ -2369,7 +2524,22 @@ static noinline struct module *load_module(void __user *umod, * function to insert in a way safe to concurrent readers. * The mutex protects against concurrent writers. */ + mutex_lock(&module_mutex); + if (find_module(mod->name)) { + err = -EEXIST; + goto unlock; + } + + if (debug) + dynamic_debug_setup(debug, num_debug); + + /* Find duplicate symbols */ + err = verify_export_symbols(mod); + if (err < 0) + goto ddebug; + list_add_rcu(&mod->list, &modules); + mutex_unlock(&module_mutex); err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, NULL); if (err < 0) @@ -2378,6 +2548,7 @@ static noinline struct module *load_module(void __user *umod, err = mod_sysfs_setup(mod, mod->kp, mod->num_kp); if (err < 0) goto unlink; + add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs); add_notes_attrs(mod, hdr->e_shnum, secstrings, sechdrs); @@ -2390,15 +2561,17 @@ static noinline struct module *load_module(void __user *umod, return mod; unlink: + mutex_lock(&module_mutex); /* Unlink carefully: kallsyms could be walking list. */ list_del_rcu(&mod->list); + ddebug: + dynamic_debug_remove(debug); + unlock: + mutex_unlock(&module_mutex); synchronize_sched(); module_arch_cleanup(mod); cleanup: free_modinfo(mod); - kobject_del(&mod->mkobj.kobj); - kobject_put(&mod->mkobj.kobj); - free_unload: module_unload_free(mod); #if defined(CONFIG_MODULE_UNLOAD) free_percpu(mod->refptr); @@ -2409,8 +2582,7 @@ static noinline struct module *load_module(void __user *umod, module_free(mod, mod->module_core); /* mod will be freed with core. Don't access it beyond this line! */ free_percpu: - if (percpu) - percpu_modfree(percpu); + free_percpu(percpu); free_mod: kfree(args); kfree(strmap); @@ -2446,19 +2618,10 @@ SYSCALL_DEFINE3(init_module, void __user *, umod, if (!capable(CAP_SYS_MODULE) || modules_disabled) return -EPERM; - /* Only one module load at a time, please */ - if (mutex_lock_interruptible(&module_mutex) != 0) - return -EINTR; - /* Do all the hard work */ mod = load_module(umod, len, uargs); - if (IS_ERR(mod)) { - mutex_unlock(&module_mutex); + if (IS_ERR(mod)) return PTR_ERR(mod); - } - - /* Drop lock so they can recurse */ - mutex_unlock(&module_mutex); blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_COMING, mod); @@ -2475,9 +2638,7 @@ SYSCALL_DEFINE3(init_module, void __user *, umod, module_put(mod); blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_GOING, mod); - mutex_lock(&module_mutex); free_module(mod); - mutex_unlock(&module_mutex); wake_up(&module_wq); return ret; } diff --git a/kernel/mutex.c b/kernel/mutex.c index 632f04c..4c0b7b3 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -172,6 +172,13 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, struct thread_info *owner; /* + * If we own the BKL, then don't spin. The owner of + * the mutex might be waiting on us to release the BKL. + */ + if (unlikely(current->lock_depth >= 0)) + break; + + /* * If there's an owner, wait for it to either * release the lock or go to sleep. */ diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 2ab6723..f74e6c0 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -13,6 +13,7 @@ * Pavel Emelianov <xemul@openvz.org> */ +#include <linux/slab.h> #include <linux/module.h> #include <linux/nsproxy.h> #include <linux/init_task.h> diff --git a/kernel/padata.c b/kernel/padata.c index 93caf65..7510194 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -25,18 +25,20 @@ #include <linux/padata.h> #include <linux/mutex.h> #include <linux/sched.h> +#include <linux/slab.h> +#include <linux/sysfs.h> #include <linux/rcupdate.h> -#define MAX_SEQ_NR INT_MAX - NR_CPUS -#define MAX_OBJ_NUM 10000 * NR_CPUS +#define MAX_SEQ_NR (INT_MAX - NR_CPUS) +#define MAX_OBJ_NUM 1000 static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index) { int cpu, target_cpu; - target_cpu = cpumask_first(pd->cpumask); + target_cpu = cpumask_first(pd->cpumask.pcpu); for (cpu = 0; cpu < cpu_index; cpu++) - target_cpu = cpumask_next(target_cpu, pd->cpumask); + target_cpu = cpumask_next(target_cpu, pd->cpumask.pcpu); return target_cpu; } @@ -52,26 +54,27 @@ static int padata_cpu_hash(struct padata_priv *padata) * Hash the sequence numbers to the cpus by taking * seq_nr mod. number of cpus in use. */ - cpu_index = padata->seq_nr % cpumask_weight(pd->cpumask); + cpu_index = padata->seq_nr % cpumask_weight(pd->cpumask.pcpu); return padata_index_to_cpu(pd, cpu_index); } -static void padata_parallel_worker(struct work_struct *work) +static void padata_parallel_worker(struct work_struct *parallel_work) { - struct padata_queue *queue; + struct padata_parallel_queue *pqueue; struct parallel_data *pd; struct padata_instance *pinst; LIST_HEAD(local_list); local_bh_disable(); - queue = container_of(work, struct padata_queue, pwork); - pd = queue->pd; + pqueue = container_of(parallel_work, + struct padata_parallel_queue, work); + pd = pqueue->pd; pinst = pd->pinst; - spin_lock(&queue->parallel.lock); - list_replace_init(&queue->parallel.list, &local_list); - spin_unlock(&queue->parallel.lock); + spin_lock(&pqueue->parallel.lock); + list_replace_init(&pqueue->parallel.list, &local_list); + spin_unlock(&pqueue->parallel.lock); while (!list_empty(&local_list)) { struct padata_priv *padata; @@ -87,13 +90,13 @@ static void padata_parallel_worker(struct work_struct *work) local_bh_enable(); } -/* +/** * padata_do_parallel - padata parallelization function * * @pinst: padata instance * @padata: object to be parallelized * @cb_cpu: cpu the serialization callback function will run on, - * must be in the cpumask of padata. + * must be in the serial cpumask of padata(i.e. cpumask.cbcpu). * * The parallelization callback function will run with BHs off. * Note: Every object which is parallelized by padata_do_parallel @@ -103,15 +106,18 @@ int padata_do_parallel(struct padata_instance *pinst, struct padata_priv *padata, int cb_cpu) { int target_cpu, err; - struct padata_queue *queue; + struct padata_parallel_queue *queue; struct parallel_data *pd; rcu_read_lock_bh(); pd = rcu_dereference(pinst->pd); - err = 0; - if (!(pinst->flags & PADATA_INIT)) + err = -EINVAL; + if (!(pinst->flags & PADATA_INIT) || pinst->flags & PADATA_INVALID) + goto out; + + if (!cpumask_test_cpu(cb_cpu, pd->cpumask.cbcpu)) goto out; err = -EBUSY; @@ -121,11 +127,7 @@ int padata_do_parallel(struct padata_instance *pinst, if (atomic_read(&pd->refcnt) >= MAX_OBJ_NUM) goto out; - err = -EINVAL; - if (!cpumask_test_cpu(cb_cpu, pd->cpumask)) - goto out; - - err = -EINPROGRESS; + err = 0; atomic_inc(&pd->refcnt); padata->pd = pd; padata->cb_cpu = cb_cpu; @@ -136,13 +138,13 @@ int padata_do_parallel(struct padata_instance *pinst, padata->seq_nr = atomic_inc_return(&pd->seq_nr); target_cpu = padata_cpu_hash(padata); - queue = per_cpu_ptr(pd->queue, target_cpu); + queue = per_cpu_ptr(pd->pqueue, target_cpu); spin_lock(&queue->parallel.lock); list_add_tail(&padata->list, &queue->parallel.list); spin_unlock(&queue->parallel.lock); - queue_work_on(target_cpu, pinst->wq, &queue->pwork); + queue_work_on(target_cpu, pinst->wq, &queue->work); out: rcu_read_unlock_bh(); @@ -151,86 +153,72 @@ out: } EXPORT_SYMBOL(padata_do_parallel); +/* + * padata_get_next - Get the next object that needs serialization. + * + * Return values are: + * + * A pointer to the control struct of the next object that needs + * serialization, if present in one of the percpu reorder queues. + * + * NULL, if all percpu reorder queues are empty. + * + * -EINPROGRESS, if the next object that needs serialization will + * be parallel processed by another cpu and is not yet present in + * the cpu's reorder queue. + * + * -ENODATA, if this cpu has to do the parallel processing for + * the next object. + */ static struct padata_priv *padata_get_next(struct parallel_data *pd) { - int cpu, num_cpus, empty, calc_seq_nr; - int seq_nr, next_nr, overrun, next_overrun; - struct padata_queue *queue, *next_queue; + int cpu, num_cpus; + int next_nr, next_index; + struct padata_parallel_queue *queue, *next_queue; struct padata_priv *padata; struct padata_list *reorder; - empty = 0; - next_nr = -1; - next_overrun = 0; - next_queue = NULL; - - num_cpus = cpumask_weight(pd->cpumask); - - for_each_cpu(cpu, pd->cpumask) { - queue = per_cpu_ptr(pd->queue, cpu); - reorder = &queue->reorder; - - /* - * Calculate the seq_nr of the object that should be - * next in this queue. - */ - overrun = 0; - calc_seq_nr = (atomic_read(&queue->num_obj) * num_cpus) - + queue->cpu_index; - - if (unlikely(calc_seq_nr > pd->max_seq_nr)) { - calc_seq_nr = calc_seq_nr - pd->max_seq_nr - 1; - overrun = 1; - } - - if (!list_empty(&reorder->list)) { - padata = list_entry(reorder->list.next, - struct padata_priv, list); - - seq_nr = padata->seq_nr; - BUG_ON(calc_seq_nr != seq_nr); - } else { - seq_nr = calc_seq_nr; - empty++; - } + num_cpus = cpumask_weight(pd->cpumask.pcpu); - if (next_nr < 0 || seq_nr < next_nr - || (next_overrun && !overrun)) { - next_nr = seq_nr; - next_overrun = overrun; - next_queue = queue; - } + /* + * Calculate the percpu reorder queue and the sequence + * number of the next object. + */ + next_nr = pd->processed; + next_index = next_nr % num_cpus; + cpu = padata_index_to_cpu(pd, next_index); + next_queue = per_cpu_ptr(pd->pqueue, cpu); + + if (unlikely(next_nr > pd->max_seq_nr)) { + next_nr = next_nr - pd->max_seq_nr - 1; + next_index = next_nr % num_cpus; + cpu = padata_index_to_cpu(pd, next_index); + next_queue = per_cpu_ptr(pd->pqueue, cpu); + pd->processed = 0; } padata = NULL; - if (empty == num_cpus) - goto out; - reorder = &next_queue->reorder; if (!list_empty(&reorder->list)) { padata = list_entry(reorder->list.next, struct padata_priv, list); - if (unlikely(next_overrun)) { - for_each_cpu(cpu, pd->cpumask) { - queue = per_cpu_ptr(pd->queue, cpu); - atomic_set(&queue->num_obj, 0); - } - } + BUG_ON(next_nr != padata->seq_nr); spin_lock(&reorder->lock); list_del_init(&padata->list); atomic_dec(&pd->reorder_objects); spin_unlock(&reorder->lock); - atomic_inc(&next_queue->num_obj); + pd->processed++; goto out; } - if (next_nr % num_cpus == next_queue->cpu_index) { + queue = per_cpu_ptr(pd->pqueue, smp_processor_id()); + if (queue->cpu_index == next_queue->cpu_index) { padata = ERR_PTR(-ENODATA); goto out; } @@ -243,55 +231,90 @@ out: static void padata_reorder(struct parallel_data *pd) { struct padata_priv *padata; - struct padata_queue *queue; + struct padata_serial_queue *squeue; struct padata_instance *pinst = pd->pinst; -try_again: + /* + * We need to ensure that only one cpu can work on dequeueing of + * the reorder queue the time. Calculating in which percpu reorder + * queue the next object will arrive takes some time. A spinlock + * would be highly contended. Also it is not clear in which order + * the objects arrive to the reorder queues. So a cpu could wait to + * get the lock just to notice that there is nothing to do at the + * moment. Therefore we use a trylock and let the holder of the lock + * care for all the objects enqueued during the holdtime of the lock. + */ if (!spin_trylock_bh(&pd->lock)) - goto out; + return; while (1) { padata = padata_get_next(pd); + /* + * All reorder queues are empty, or the next object that needs + * serialization is parallel processed by another cpu and is + * still on it's way to the cpu's reorder queue, nothing to + * do for now. + */ if (!padata || PTR_ERR(padata) == -EINPROGRESS) break; + /* + * This cpu has to do the parallel processing of the next + * object. It's waiting in the cpu's parallelization queue, + * so exit imediately. + */ if (PTR_ERR(padata) == -ENODATA) { + del_timer(&pd->timer); spin_unlock_bh(&pd->lock); - goto out; + return; } - queue = per_cpu_ptr(pd->queue, padata->cb_cpu); + squeue = per_cpu_ptr(pd->squeue, padata->cb_cpu); - spin_lock(&queue->serial.lock); - list_add_tail(&padata->list, &queue->serial.list); - spin_unlock(&queue->serial.lock); + spin_lock(&squeue->serial.lock); + list_add_tail(&padata->list, &squeue->serial.list); + spin_unlock(&squeue->serial.lock); - queue_work_on(padata->cb_cpu, pinst->wq, &queue->swork); + queue_work_on(padata->cb_cpu, pinst->wq, &squeue->work); } spin_unlock_bh(&pd->lock); - if (atomic_read(&pd->reorder_objects)) - goto try_again; + /* + * The next object that needs serialization might have arrived to + * the reorder queues in the meantime, we will be called again + * from the timer function if noone else cares for it. + */ + if (atomic_read(&pd->reorder_objects) + && !(pinst->flags & PADATA_RESET)) + mod_timer(&pd->timer, jiffies + HZ); + else + del_timer(&pd->timer); -out: return; } -static void padata_serial_worker(struct work_struct *work) +static void padata_reorder_timer(unsigned long arg) +{ + struct parallel_data *pd = (struct parallel_data *)arg; + + padata_reorder(pd); +} + +static void padata_serial_worker(struct work_struct *serial_work) { - struct padata_queue *queue; + struct padata_serial_queue *squeue; struct parallel_data *pd; LIST_HEAD(local_list); local_bh_disable(); - queue = container_of(work, struct padata_queue, swork); - pd = queue->pd; + squeue = container_of(serial_work, struct padata_serial_queue, work); + pd = squeue->pd; - spin_lock(&queue->serial.lock); - list_replace_init(&queue->serial.list, &local_list); - spin_unlock(&queue->serial.lock); + spin_lock(&squeue->serial.lock); + list_replace_init(&squeue->serial.list, &local_list); + spin_unlock(&squeue->serial.lock); while (!list_empty(&local_list)) { struct padata_priv *padata; @@ -307,7 +330,7 @@ static void padata_serial_worker(struct work_struct *work) local_bh_enable(); } -/* +/** * padata_do_serial - padata serialization function * * @padata: object to be serialized. @@ -318,18 +341,18 @@ static void padata_serial_worker(struct work_struct *work) void padata_do_serial(struct padata_priv *padata) { int cpu; - struct padata_queue *queue; + struct padata_parallel_queue *pqueue; struct parallel_data *pd; pd = padata->pd; cpu = get_cpu(); - queue = per_cpu_ptr(pd->queue, cpu); + pqueue = per_cpu_ptr(pd->pqueue, cpu); - spin_lock(&queue->reorder.lock); + spin_lock(&pqueue->reorder.lock); atomic_inc(&pd->reorder_objects); - list_add_tail(&padata->list, &queue->reorder.list); - spin_unlock(&queue->reorder.lock); + list_add_tail(&padata->list, &pqueue->reorder.list); + spin_unlock(&pqueue->reorder.lock); put_cpu(); @@ -337,55 +360,90 @@ void padata_do_serial(struct padata_priv *padata) } EXPORT_SYMBOL(padata_do_serial); -static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst, - const struct cpumask *cpumask) +static int padata_setup_cpumasks(struct parallel_data *pd, + const struct cpumask *pcpumask, + const struct cpumask *cbcpumask) { - int cpu, cpu_index, num_cpus; - struct padata_queue *queue; - struct parallel_data *pd; + if (!alloc_cpumask_var(&pd->cpumask.pcpu, GFP_KERNEL)) + return -ENOMEM; - cpu_index = 0; + cpumask_and(pd->cpumask.pcpu, pcpumask, cpu_active_mask); + if (!alloc_cpumask_var(&pd->cpumask.cbcpu, GFP_KERNEL)) { + free_cpumask_var(pd->cpumask.cbcpu); + return -ENOMEM; + } - pd = kzalloc(sizeof(struct parallel_data), GFP_KERNEL); - if (!pd) - goto err; + cpumask_and(pd->cpumask.cbcpu, cbcpumask, cpu_active_mask); + return 0; +} - pd->queue = alloc_percpu(struct padata_queue); - if (!pd->queue) - goto err_free_pd; +static void __padata_list_init(struct padata_list *pd_list) +{ + INIT_LIST_HEAD(&pd_list->list); + spin_lock_init(&pd_list->lock); +} - if (!alloc_cpumask_var(&pd->cpumask, GFP_KERNEL)) - goto err_free_queue; +/* Initialize all percpu queues used by serial workers */ +static void padata_init_squeues(struct parallel_data *pd) +{ + int cpu; + struct padata_serial_queue *squeue; + + for_each_cpu(cpu, pd->cpumask.cbcpu) { + squeue = per_cpu_ptr(pd->squeue, cpu); + squeue->pd = pd; + __padata_list_init(&squeue->serial); + INIT_WORK(&squeue->work, padata_serial_worker); + } +} - for_each_possible_cpu(cpu) { - queue = per_cpu_ptr(pd->queue, cpu); +/* Initialize all percpu queues used by parallel workers */ +static void padata_init_pqueues(struct parallel_data *pd) +{ + int cpu_index, num_cpus, cpu; + struct padata_parallel_queue *pqueue; - queue->pd = pd; + cpu_index = 0; + for_each_cpu(cpu, pd->cpumask.pcpu) { + pqueue = per_cpu_ptr(pd->pqueue, cpu); + pqueue->pd = pd; + pqueue->cpu_index = cpu_index; + cpu_index++; + + __padata_list_init(&pqueue->reorder); + __padata_list_init(&pqueue->parallel); + INIT_WORK(&pqueue->work, padata_parallel_worker); + atomic_set(&pqueue->num_obj, 0); + } - if (cpumask_test_cpu(cpu, cpumask) - && cpumask_test_cpu(cpu, cpu_active_mask)) { - queue->cpu_index = cpu_index; - cpu_index++; - } else - queue->cpu_index = -1; + num_cpus = cpumask_weight(pd->cpumask.pcpu); + pd->max_seq_nr = num_cpus ? (MAX_SEQ_NR / num_cpus) * num_cpus - 1 : 0; +} - INIT_LIST_HEAD(&queue->reorder.list); - INIT_LIST_HEAD(&queue->parallel.list); - INIT_LIST_HEAD(&queue->serial.list); - spin_lock_init(&queue->reorder.lock); - spin_lock_init(&queue->parallel.lock); - spin_lock_init(&queue->serial.lock); +/* Allocate and initialize the internal cpumask dependend resources. */ +static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst, + const struct cpumask *pcpumask, + const struct cpumask *cbcpumask) +{ + struct parallel_data *pd; - INIT_WORK(&queue->pwork, padata_parallel_worker); - INIT_WORK(&queue->swork, padata_serial_worker); - atomic_set(&queue->num_obj, 0); - } + pd = kzalloc(sizeof(struct parallel_data), GFP_KERNEL); + if (!pd) + goto err; - cpumask_and(pd->cpumask, cpumask, cpu_active_mask); + pd->pqueue = alloc_percpu(struct padata_parallel_queue); + if (!pd->pqueue) + goto err_free_pd; - num_cpus = cpumask_weight(pd->cpumask); - pd->max_seq_nr = (MAX_SEQ_NR / num_cpus) * num_cpus - 1; + pd->squeue = alloc_percpu(struct padata_serial_queue); + if (!pd->squeue) + goto err_free_pqueue; + if (padata_setup_cpumasks(pd, pcpumask, cbcpumask) < 0) + goto err_free_squeue; + padata_init_pqueues(pd); + padata_init_squeues(pd); + setup_timer(&pd->timer, padata_reorder_timer, (unsigned long)pd); atomic_set(&pd->seq_nr, -1); atomic_set(&pd->reorder_objects, 0); atomic_set(&pd->refcnt, 0); @@ -394,8 +452,10 @@ static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst, return pd; -err_free_queue: - free_percpu(pd->queue); +err_free_squeue: + free_percpu(pd->squeue); +err_free_pqueue: + free_percpu(pd->pqueue); err_free_pd: kfree(pd); err: @@ -404,15 +464,63 @@ err: static void padata_free_pd(struct parallel_data *pd) { - free_cpumask_var(pd->cpumask); - free_percpu(pd->queue); + free_cpumask_var(pd->cpumask.pcpu); + free_cpumask_var(pd->cpumask.cbcpu); + free_percpu(pd->pqueue); + free_percpu(pd->squeue); kfree(pd); } +/* Flush all objects out of the padata queues. */ +static void padata_flush_queues(struct parallel_data *pd) +{ + int cpu; + struct padata_parallel_queue *pqueue; + struct padata_serial_queue *squeue; + + for_each_cpu(cpu, pd->cpumask.pcpu) { + pqueue = per_cpu_ptr(pd->pqueue, cpu); + flush_work(&pqueue->work); + } + + del_timer_sync(&pd->timer); + + if (atomic_read(&pd->reorder_objects)) + padata_reorder(pd); + + for_each_cpu(cpu, pd->cpumask.cbcpu) { + squeue = per_cpu_ptr(pd->squeue, cpu); + flush_work(&squeue->work); + } + + BUG_ON(atomic_read(&pd->refcnt) != 0); +} + +static void __padata_start(struct padata_instance *pinst) +{ + pinst->flags |= PADATA_INIT; +} + +static void __padata_stop(struct padata_instance *pinst) +{ + if (!(pinst->flags & PADATA_INIT)) + return; + + pinst->flags &= ~PADATA_INIT; + + synchronize_rcu(); + + get_online_cpus(); + padata_flush_queues(pinst->pd); + put_online_cpus(); +} + +/* Replace the internal control stucture with a new one. */ static void padata_replace(struct padata_instance *pinst, struct parallel_data *pd_new) { struct parallel_data *pd_old = pinst->pd; + int notification_mask = 0; pinst->flags |= PADATA_RESET; @@ -420,43 +528,162 @@ static void padata_replace(struct padata_instance *pinst, synchronize_rcu(); - while (atomic_read(&pd_old->refcnt) != 0) - yield(); - - flush_workqueue(pinst->wq); + if (!cpumask_equal(pd_old->cpumask.pcpu, pd_new->cpumask.pcpu)) + notification_mask |= PADATA_CPU_PARALLEL; + if (!cpumask_equal(pd_old->cpumask.cbcpu, pd_new->cpumask.cbcpu)) + notification_mask |= PADATA_CPU_SERIAL; + padata_flush_queues(pd_old); padata_free_pd(pd_old); + if (notification_mask) + blocking_notifier_call_chain(&pinst->cpumask_change_notifier, + notification_mask, + &pd_new->cpumask); + pinst->flags &= ~PADATA_RESET; } -/* - * padata_set_cpumask - set the cpumask that padata should use +/** + * padata_register_cpumask_notifier - Registers a notifier that will be called + * if either pcpu or cbcpu or both cpumasks change. * - * @pinst: padata instance - * @cpumask: the cpumask to use + * @pinst: A poineter to padata instance + * @nblock: A pointer to notifier block. */ -int padata_set_cpumask(struct padata_instance *pinst, - cpumask_var_t cpumask) +int padata_register_cpumask_notifier(struct padata_instance *pinst, + struct notifier_block *nblock) { - struct parallel_data *pd; - int err = 0; + return blocking_notifier_chain_register(&pinst->cpumask_change_notifier, + nblock); +} +EXPORT_SYMBOL(padata_register_cpumask_notifier); - might_sleep(); +/** + * padata_unregister_cpumask_notifier - Unregisters cpumask notifier + * registered earlier using padata_register_cpumask_notifier + * + * @pinst: A pointer to data instance. + * @nlock: A pointer to notifier block. + */ +int padata_unregister_cpumask_notifier(struct padata_instance *pinst, + struct notifier_block *nblock) +{ + return blocking_notifier_chain_unregister( + &pinst->cpumask_change_notifier, + nblock); +} +EXPORT_SYMBOL(padata_unregister_cpumask_notifier); - mutex_lock(&pinst->lock); - pd = padata_alloc_pd(pinst, cpumask); - if (!pd) { - err = -ENOMEM; - goto out; +/* If cpumask contains no active cpu, we mark the instance as invalid. */ +static bool padata_validate_cpumask(struct padata_instance *pinst, + const struct cpumask *cpumask) +{ + if (!cpumask_intersects(cpumask, cpu_active_mask)) { + pinst->flags |= PADATA_INVALID; + return false; + } + + pinst->flags &= ~PADATA_INVALID; + return true; +} + +static int __padata_set_cpumasks(struct padata_instance *pinst, + cpumask_var_t pcpumask, + cpumask_var_t cbcpumask) +{ + int valid; + struct parallel_data *pd; + + valid = padata_validate_cpumask(pinst, pcpumask); + if (!valid) { + __padata_stop(pinst); + goto out_replace; } - cpumask_copy(pinst->cpumask, cpumask); + valid = padata_validate_cpumask(pinst, cbcpumask); + if (!valid) + __padata_stop(pinst); + +out_replace: + pd = padata_alloc_pd(pinst, pcpumask, cbcpumask); + if (!pd) + return -ENOMEM; + + cpumask_copy(pinst->cpumask.pcpu, pcpumask); + cpumask_copy(pinst->cpumask.cbcpu, cbcpumask); padata_replace(pinst, pd); + if (valid) + __padata_start(pinst); + + return 0; +} + +/** + * padata_set_cpumasks - Set both parallel and serial cpumasks. The first + * one is used by parallel workers and the second one + * by the wokers doing serialization. + * + * @pinst: padata instance + * @pcpumask: the cpumask to use for parallel workers + * @cbcpumask: the cpumsak to use for serial workers + */ +int padata_set_cpumasks(struct padata_instance *pinst, cpumask_var_t pcpumask, + cpumask_var_t cbcpumask) +{ + int err; + + mutex_lock(&pinst->lock); + get_online_cpus(); + + err = __padata_set_cpumasks(pinst, pcpumask, cbcpumask); + + put_online_cpus(); + mutex_unlock(&pinst->lock); + + return err; + +} +EXPORT_SYMBOL(padata_set_cpumasks); + +/** + * padata_set_cpumask: Sets specified by @cpumask_type cpumask to the value + * equivalent to @cpumask. + * + * @pinst: padata instance + * @cpumask_type: PADATA_CPU_SERIAL or PADATA_CPU_PARALLEL corresponding + * to parallel and serial cpumasks respectively. + * @cpumask: the cpumask to use + */ +int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type, + cpumask_var_t cpumask) +{ + struct cpumask *serial_mask, *parallel_mask; + int err = -EINVAL; + + mutex_lock(&pinst->lock); + get_online_cpus(); + + switch (cpumask_type) { + case PADATA_CPU_PARALLEL: + serial_mask = pinst->cpumask.cbcpu; + parallel_mask = cpumask; + break; + case PADATA_CPU_SERIAL: + parallel_mask = pinst->cpumask.pcpu; + serial_mask = cpumask; + break; + default: + goto out; + } + + err = __padata_set_cpumasks(pinst, parallel_mask, serial_mask); + out: + put_online_cpus(); mutex_unlock(&pinst->lock); return err; @@ -468,32 +695,50 @@ static int __padata_add_cpu(struct padata_instance *pinst, int cpu) struct parallel_data *pd; if (cpumask_test_cpu(cpu, cpu_active_mask)) { - pd = padata_alloc_pd(pinst, pinst->cpumask); + pd = padata_alloc_pd(pinst, pinst->cpumask.pcpu, + pinst->cpumask.cbcpu); if (!pd) return -ENOMEM; padata_replace(pinst, pd); + + if (padata_validate_cpumask(pinst, pinst->cpumask.pcpu) && + padata_validate_cpumask(pinst, pinst->cpumask.cbcpu)) + __padata_start(pinst); } return 0; } -/* - * padata_add_cpu - add a cpu to the padata cpumask + /** + * padata_add_cpu - add a cpu to one or both(parallel and serial) + * padata cpumasks. * * @pinst: padata instance * @cpu: cpu to add + * @mask: bitmask of flags specifying to which cpumask @cpu shuld be added. + * The @mask may be any combination of the following flags: + * PADATA_CPU_SERIAL - serial cpumask + * PADATA_CPU_PARALLEL - parallel cpumask */ -int padata_add_cpu(struct padata_instance *pinst, int cpu) + +int padata_add_cpu(struct padata_instance *pinst, int cpu, int mask) { int err; - might_sleep(); + if (!(mask & (PADATA_CPU_SERIAL | PADATA_CPU_PARALLEL))) + return -EINVAL; mutex_lock(&pinst->lock); - cpumask_set_cpu(cpu, pinst->cpumask); + get_online_cpus(); + if (mask & PADATA_CPU_SERIAL) + cpumask_set_cpu(cpu, pinst->cpumask.cbcpu); + if (mask & PADATA_CPU_PARALLEL) + cpumask_set_cpu(cpu, pinst->cpumask.pcpu); + err = __padata_add_cpu(pinst, cpu); + put_online_cpus(); mutex_unlock(&pinst->lock); @@ -503,10 +748,16 @@ EXPORT_SYMBOL(padata_add_cpu); static int __padata_remove_cpu(struct padata_instance *pinst, int cpu) { - struct parallel_data *pd; + struct parallel_data *pd = NULL; if (cpumask_test_cpu(cpu, cpu_online_mask)) { - pd = padata_alloc_pd(pinst, pinst->cpumask); + + if (!padata_validate_cpumask(pinst, pinst->cpumask.pcpu) || + !padata_validate_cpumask(pinst, pinst->cpumask.cbcpu)) + __padata_stop(pinst); + + pd = padata_alloc_pd(pinst, pinst->cpumask.pcpu, + pinst->cpumask.cbcpu); if (!pd) return -ENOMEM; @@ -516,22 +767,34 @@ static int __padata_remove_cpu(struct padata_instance *pinst, int cpu) return 0; } -/* - * padata_remove_cpu - remove a cpu from the padata cpumask + /** + * padata_remove_cpu - remove a cpu from the one or both(serial and paralell) + * padata cpumasks. * * @pinst: padata instance * @cpu: cpu to remove + * @mask: bitmask specifying from which cpumask @cpu should be removed + * The @mask may be any combination of the following flags: + * PADATA_CPU_SERIAL - serial cpumask + * PADATA_CPU_PARALLEL - parallel cpumask */ -int padata_remove_cpu(struct padata_instance *pinst, int cpu) +int padata_remove_cpu(struct padata_instance *pinst, int cpu, int mask) { int err; - might_sleep(); + if (!(mask & (PADATA_CPU_SERIAL | PADATA_CPU_PARALLEL))) + return -EINVAL; mutex_lock(&pinst->lock); - cpumask_clear_cpu(cpu, pinst->cpumask); + get_online_cpus(); + if (mask & PADATA_CPU_SERIAL) + cpumask_clear_cpu(cpu, pinst->cpumask.cbcpu); + if (mask & PADATA_CPU_PARALLEL) + cpumask_clear_cpu(cpu, pinst->cpumask.pcpu); + err = __padata_remove_cpu(pinst, cpu); + put_online_cpus(); mutex_unlock(&pinst->lock); @@ -539,38 +802,52 @@ int padata_remove_cpu(struct padata_instance *pinst, int cpu) } EXPORT_SYMBOL(padata_remove_cpu); -/* +/** * padata_start - start the parallel processing * * @pinst: padata instance to start */ -void padata_start(struct padata_instance *pinst) +int padata_start(struct padata_instance *pinst) { - might_sleep(); + int err = 0; mutex_lock(&pinst->lock); - pinst->flags |= PADATA_INIT; + + if (pinst->flags & PADATA_INVALID) + err =-EINVAL; + + __padata_start(pinst); + mutex_unlock(&pinst->lock); + + return err; } EXPORT_SYMBOL(padata_start); -/* +/** * padata_stop - stop the parallel processing * * @pinst: padata instance to stop */ void padata_stop(struct padata_instance *pinst) { - might_sleep(); - mutex_lock(&pinst->lock); - pinst->flags &= ~PADATA_INIT; + __padata_stop(pinst); mutex_unlock(&pinst->lock); } EXPORT_SYMBOL(padata_stop); -static int __cpuinit padata_cpu_callback(struct notifier_block *nfb, - unsigned long action, void *hcpu) +#ifdef CONFIG_HOTPLUG_CPU + +static inline int pinst_has_cpu(struct padata_instance *pinst, int cpu) +{ + return cpumask_test_cpu(cpu, pinst->cpumask.pcpu) || + cpumask_test_cpu(cpu, pinst->cpumask.cbcpu); +} + + +static int padata_cpu_callback(struct notifier_block *nfb, + unsigned long action, void *hcpu) { int err; struct padata_instance *pinst; @@ -581,29 +858,29 @@ static int __cpuinit padata_cpu_callback(struct notifier_block *nfb, switch (action) { case CPU_ONLINE: case CPU_ONLINE_FROZEN: - if (!cpumask_test_cpu(cpu, pinst->cpumask)) + if (!pinst_has_cpu(pinst, cpu)) break; mutex_lock(&pinst->lock); err = __padata_add_cpu(pinst, cpu); mutex_unlock(&pinst->lock); if (err) - return NOTIFY_BAD; + return notifier_from_errno(err); break; case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: - if (!cpumask_test_cpu(cpu, pinst->cpumask)) + if (!pinst_has_cpu(pinst, cpu)) break; mutex_lock(&pinst->lock); err = __padata_remove_cpu(pinst, cpu); mutex_unlock(&pinst->lock); if (err) - return NOTIFY_BAD; + return notifier_from_errno(err); break; case CPU_UP_CANCELED: case CPU_UP_CANCELED_FROZEN: - if (!cpumask_test_cpu(cpu, pinst->cpumask)) + if (!pinst_has_cpu(pinst, cpu)) break; mutex_lock(&pinst->lock); __padata_remove_cpu(pinst, cpu); @@ -611,7 +888,7 @@ static int __cpuinit padata_cpu_callback(struct notifier_block *nfb, case CPU_DOWN_FAILED: case CPU_DOWN_FAILED_FROZEN: - if (!cpumask_test_cpu(cpu, pinst->cpumask)) + if (!pinst_has_cpu(pinst, cpu)) break; mutex_lock(&pinst->lock); __padata_add_cpu(pinst, cpu); @@ -620,77 +897,239 @@ static int __cpuinit padata_cpu_callback(struct notifier_block *nfb, return NOTIFY_OK; } +#endif + +static void __padata_free(struct padata_instance *pinst) +{ +#ifdef CONFIG_HOTPLUG_CPU + unregister_hotcpu_notifier(&pinst->cpu_notifier); +#endif + + padata_stop(pinst); + padata_free_pd(pinst->pd); + free_cpumask_var(pinst->cpumask.pcpu); + free_cpumask_var(pinst->cpumask.cbcpu); + kfree(pinst); +} + +#define kobj2pinst(_kobj) \ + container_of(_kobj, struct padata_instance, kobj) +#define attr2pentry(_attr) \ + container_of(_attr, struct padata_sysfs_entry, attr) + +static void padata_sysfs_release(struct kobject *kobj) +{ + struct padata_instance *pinst = kobj2pinst(kobj); + __padata_free(pinst); +} + +struct padata_sysfs_entry { + struct attribute attr; + ssize_t (*show)(struct padata_instance *, struct attribute *, char *); + ssize_t (*store)(struct padata_instance *, struct attribute *, + const char *, size_t); +}; + +static ssize_t show_cpumask(struct padata_instance *pinst, + struct attribute *attr, char *buf) +{ + struct cpumask *cpumask; + ssize_t len; + + mutex_lock(&pinst->lock); + if (!strcmp(attr->name, "serial_cpumask")) + cpumask = pinst->cpumask.cbcpu; + else + cpumask = pinst->cpumask.pcpu; + + len = bitmap_scnprintf(buf, PAGE_SIZE, cpumask_bits(cpumask), + nr_cpu_ids); + if (PAGE_SIZE - len < 2) + len = -EINVAL; + else + len += sprintf(buf + len, "\n"); + + mutex_unlock(&pinst->lock); + return len; +} + +static ssize_t store_cpumask(struct padata_instance *pinst, + struct attribute *attr, + const char *buf, size_t count) +{ + cpumask_var_t new_cpumask; + ssize_t ret; + int mask_type; + + if (!alloc_cpumask_var(&new_cpumask, GFP_KERNEL)) + return -ENOMEM; + + ret = bitmap_parse(buf, count, cpumask_bits(new_cpumask), + nr_cpumask_bits); + if (ret < 0) + goto out; + + mask_type = !strcmp(attr->name, "serial_cpumask") ? + PADATA_CPU_SERIAL : PADATA_CPU_PARALLEL; + ret = padata_set_cpumask(pinst, mask_type, new_cpumask); + if (!ret) + ret = count; + +out: + free_cpumask_var(new_cpumask); + return ret; +} + +#define PADATA_ATTR_RW(_name, _show_name, _store_name) \ + static struct padata_sysfs_entry _name##_attr = \ + __ATTR(_name, 0644, _show_name, _store_name) +#define PADATA_ATTR_RO(_name, _show_name) \ + static struct padata_sysfs_entry _name##_attr = \ + __ATTR(_name, 0400, _show_name, NULL) + +PADATA_ATTR_RW(serial_cpumask, show_cpumask, store_cpumask); +PADATA_ATTR_RW(parallel_cpumask, show_cpumask, store_cpumask); /* - * padata_alloc - allocate and initialize a padata instance + * Padata sysfs provides the following objects: + * serial_cpumask [RW] - cpumask for serial workers + * parallel_cpumask [RW] - cpumask for parallel workers + */ +static struct attribute *padata_default_attrs[] = { + &serial_cpumask_attr.attr, + ¶llel_cpumask_attr.attr, + NULL, +}; + +static ssize_t padata_sysfs_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct padata_instance *pinst; + struct padata_sysfs_entry *pentry; + ssize_t ret = -EIO; + + pinst = kobj2pinst(kobj); + pentry = attr2pentry(attr); + if (pentry->show) + ret = pentry->show(pinst, attr, buf); + + return ret; +} + +static ssize_t padata_sysfs_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t count) +{ + struct padata_instance *pinst; + struct padata_sysfs_entry *pentry; + ssize_t ret = -EIO; + + pinst = kobj2pinst(kobj); + pentry = attr2pentry(attr); + if (pentry->show) + ret = pentry->store(pinst, attr, buf, count); + + return ret; +} + +static const struct sysfs_ops padata_sysfs_ops = { + .show = padata_sysfs_show, + .store = padata_sysfs_store, +}; + +static struct kobj_type padata_attr_type = { + .sysfs_ops = &padata_sysfs_ops, + .default_attrs = padata_default_attrs, + .release = padata_sysfs_release, +}; + +/** + * padata_alloc_possible - Allocate and initialize padata instance. + * Use the cpu_possible_mask for serial and + * parallel workers. * - * @cpumask: cpumask that padata uses for parallelization * @wq: workqueue to use for the allocated padata instance */ -struct padata_instance *padata_alloc(const struct cpumask *cpumask, - struct workqueue_struct *wq) +struct padata_instance *padata_alloc_possible(struct workqueue_struct *wq) +{ + return padata_alloc(wq, cpu_possible_mask, cpu_possible_mask); +} +EXPORT_SYMBOL(padata_alloc_possible); + +/** + * padata_alloc - allocate and initialize a padata instance and specify + * cpumasks for serial and parallel workers. + * + * @wq: workqueue to use for the allocated padata instance + * @pcpumask: cpumask that will be used for padata parallelization + * @cbcpumask: cpumask that will be used for padata serialization + */ +struct padata_instance *padata_alloc(struct workqueue_struct *wq, + const struct cpumask *pcpumask, + const struct cpumask *cbcpumask) { - int err; struct padata_instance *pinst; - struct parallel_data *pd; + struct parallel_data *pd = NULL; pinst = kzalloc(sizeof(struct padata_instance), GFP_KERNEL); if (!pinst) goto err; - pd = padata_alloc_pd(pinst, cpumask); - if (!pd) + get_online_cpus(); + if (!alloc_cpumask_var(&pinst->cpumask.pcpu, GFP_KERNEL)) + goto err_free_inst; + if (!alloc_cpumask_var(&pinst->cpumask.cbcpu, GFP_KERNEL)) { + free_cpumask_var(pinst->cpumask.pcpu); goto err_free_inst; + } + if (!padata_validate_cpumask(pinst, pcpumask) || + !padata_validate_cpumask(pinst, cbcpumask)) + goto err_free_masks; - if (!alloc_cpumask_var(&pinst->cpumask, GFP_KERNEL)) - goto err_free_pd; + pd = padata_alloc_pd(pinst, pcpumask, cbcpumask); + if (!pd) + goto err_free_masks; rcu_assign_pointer(pinst->pd, pd); pinst->wq = wq; - cpumask_copy(pinst->cpumask, cpumask); + cpumask_copy(pinst->cpumask.pcpu, pcpumask); + cpumask_copy(pinst->cpumask.cbcpu, cbcpumask); pinst->flags = 0; +#ifdef CONFIG_HOTPLUG_CPU pinst->cpu_notifier.notifier_call = padata_cpu_callback; pinst->cpu_notifier.priority = 0; - err = register_hotcpu_notifier(&pinst->cpu_notifier); - if (err) - goto err_free_cpumask; + register_hotcpu_notifier(&pinst->cpu_notifier); +#endif + + put_online_cpus(); + BLOCKING_INIT_NOTIFIER_HEAD(&pinst->cpumask_change_notifier); + kobject_init(&pinst->kobj, &padata_attr_type); mutex_init(&pinst->lock); return pinst; -err_free_cpumask: - free_cpumask_var(pinst->cpumask); -err_free_pd: - padata_free_pd(pd); +err_free_masks: + free_cpumask_var(pinst->cpumask.pcpu); + free_cpumask_var(pinst->cpumask.cbcpu); err_free_inst: kfree(pinst); + put_online_cpus(); err: return NULL; } EXPORT_SYMBOL(padata_alloc); -/* +/** * padata_free - free a padata instance * - * @ padata_inst: padata instance to free + * @padata_inst: padata instance to free */ void padata_free(struct padata_instance *pinst) { - padata_stop(pinst); - - synchronize_rcu(); - - while (atomic_read(&pinst->pd->refcnt) != 0) - yield(); - - unregister_hotcpu_notifier(&pinst->cpu_notifier); - padata_free_pd(pinst->pd); - free_cpumask_var(pinst->cpumask); - kfree(pinst); + kobject_put(&pinst->kobj); } EXPORT_SYMBOL(padata_free); diff --git a/kernel/panic.c b/kernel/panic.c index 13d966b..3b16cd9 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -87,6 +87,7 @@ NORET_TYPE void panic(const char * fmt, ...) */ preempt_disable(); + console_verbose(); bust_spinlocks(1); va_start(args, fmt); vsnprintf(buf, sizeof(buf), fmt, args); @@ -178,6 +179,7 @@ static const struct tnt tnts[] = { { TAINT_OVERRIDDEN_ACPI_TABLE, 'A', ' ' }, { TAINT_WARN, 'W', ' ' }, { TAINT_CRAP, 'C', ' ' }, + { TAINT_FIRMWARE_WORKAROUND, 'I', ' ' }, }; /** @@ -194,6 +196,7 @@ static const struct tnt tnts[] = { * 'A' - ACPI table overridden. * 'W' - Taint on warning. * 'C' - modules from drivers/staging are loaded. + * 'I' - Working around severe firmware bug. * * The string is overwritten by the next call to print_tainted(). */ @@ -365,7 +368,8 @@ struct slowpath_args { va_list args; }; -static void warn_slowpath_common(const char *file, int line, void *caller, struct slowpath_args *args) +static void warn_slowpath_common(const char *file, int line, void *caller, + unsigned taint, struct slowpath_args *args) { const char *board; @@ -381,7 +385,7 @@ static void warn_slowpath_common(const char *file, int line, void *caller, struc print_modules(); dump_stack(); print_oops_end_marker(); - add_taint(TAINT_WARN); + add_taint(taint); } void warn_slowpath_fmt(const char *file, int line, const char *fmt, ...) @@ -390,14 +394,29 @@ void warn_slowpath_fmt(const char *file, int line, const char *fmt, ...) args.fmt = fmt; va_start(args.args, fmt); - warn_slowpath_common(file, line, __builtin_return_address(0), &args); + warn_slowpath_common(file, line, __builtin_return_address(0), + TAINT_WARN, &args); va_end(args.args); } EXPORT_SYMBOL(warn_slowpath_fmt); +void warn_slowpath_fmt_taint(const char *file, int line, + unsigned taint, const char *fmt, ...) +{ + struct slowpath_args args; + + args.fmt = fmt; + va_start(args.args, fmt); + warn_slowpath_common(file, line, __builtin_return_address(0), + taint, &args); + va_end(args.args); +} +EXPORT_SYMBOL(warn_slowpath_fmt_taint); + void warn_slowpath_null(const char *file, int line) { - warn_slowpath_common(file, line, __builtin_return_address(0), NULL); + warn_slowpath_common(file, line, __builtin_return_address(0), + TAINT_WARN, NULL); } EXPORT_SYMBOL(warn_slowpath_null); #endif diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 574ee58..ff86c55 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -15,6 +15,8 @@ #include <linux/smp.h> #include <linux/file.h> #include <linux/poll.h> +#include <linux/slab.h> +#include <linux/hash.h> #include <linux/sysfs.h> #include <linux/dcache.h> #include <linux/percpu.h> @@ -81,14 +83,6 @@ extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event) void __weak hw_perf_disable(void) { barrier(); } void __weak hw_perf_enable(void) { barrier(); } -int __weak -hw_perf_group_sched_in(struct perf_event *group_leader, - struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx) -{ - return 0; -} - void __weak perf_event_print_debug(void) { } static DEFINE_PER_CPU(int, perf_disable_count); @@ -261,6 +255,18 @@ static void update_event_times(struct perf_event *event) event->total_time_running = run_end - event->tstamp_running; } +/* + * Update total_time_enabled and total_time_running for all events in a group. + */ +static void update_group_times(struct perf_event *leader) +{ + struct perf_event *event; + + update_event_times(leader); + list_for_each_entry(event, &leader->sibling_list, group_entry) + update_event_times(event); +} + static struct list_head * ctx_group_list(struct perf_event *event, struct perf_event_context *ctx) { @@ -277,14 +283,15 @@ ctx_group_list(struct perf_event *event, struct perf_event_context *ctx) static void list_add_event(struct perf_event *event, struct perf_event_context *ctx) { - struct perf_event *group_leader = event->group_leader; + WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT); + event->attach_state |= PERF_ATTACH_CONTEXT; /* - * Depending on whether it is a standalone or sibling event, - * add it straight to the context's event list, or to the group - * leader's sibling list: + * If we're a stand alone event or group leader, we go to the context + * list, group events are kept attached to the group so that + * perf_group_detach can, at all times, locate all siblings. */ - if (group_leader == event) { + if (event->group_leader == event) { struct list_head *list; if (is_software_event(event)) @@ -292,13 +299,6 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) list = ctx_group_list(event, ctx); list_add_tail(&event->group_entry, list); - } else { - if (group_leader->group_flags & PERF_GROUP_SOFTWARE && - !is_software_event(event)) - group_leader->group_flags &= ~PERF_GROUP_SOFTWARE; - - list_add_tail(&event->group_entry, &group_leader->sibling_list); - group_leader->nr_siblings++; } list_add_rcu(&event->event_entry, &ctx->event_list); @@ -307,6 +307,24 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) ctx->nr_stat++; } +static void perf_group_attach(struct perf_event *event) +{ + struct perf_event *group_leader = event->group_leader; + + WARN_ON_ONCE(event->attach_state & PERF_ATTACH_GROUP); + event->attach_state |= PERF_ATTACH_GROUP; + + if (group_leader == event) + return; + + if (group_leader->group_flags & PERF_GROUP_SOFTWARE && + !is_software_event(event)) + group_leader->group_flags &= ~PERF_GROUP_SOFTWARE; + + list_add_tail(&event->group_entry, &group_leader->sibling_list); + group_leader->nr_siblings++; +} + /* * Remove a event from the lists for its context. * Must be called with ctx->mutex and ctx->lock held. @@ -314,21 +332,24 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) static void list_del_event(struct perf_event *event, struct perf_event_context *ctx) { - struct perf_event *sibling, *tmp; - - if (list_empty(&event->group_entry)) + /* + * We can have double detach due to exit/hot-unplug + close. + */ + if (!(event->attach_state & PERF_ATTACH_CONTEXT)) return; + + event->attach_state &= ~PERF_ATTACH_CONTEXT; + ctx->nr_events--; if (event->attr.inherit_stat) ctx->nr_stat--; - list_del_init(&event->group_entry); list_del_rcu(&event->event_entry); - if (event->group_leader != event) - event->group_leader->nr_siblings--; + if (event->group_leader == event) + list_del_init(&event->group_entry); - update_event_times(event); + update_group_times(event); /* * If event was in error state, then keep it @@ -339,17 +360,41 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) */ if (event->state > PERF_EVENT_STATE_OFF) event->state = PERF_EVENT_STATE_OFF; +} + +static void perf_group_detach(struct perf_event *event) +{ + struct perf_event *sibling, *tmp; + struct list_head *list = NULL; + + /* + * We can have double detach due to exit/hot-unplug + close. + */ + if (!(event->attach_state & PERF_ATTACH_GROUP)) + return; + + event->attach_state &= ~PERF_ATTACH_GROUP; + + /* + * If this is a sibling, remove it from its group. + */ + if (event->group_leader != event) { + list_del_init(&event->group_entry); + event->group_leader->nr_siblings--; + return; + } + + if (!list_empty(&event->group_entry)) + list = &event->group_entry; /* * If this was a group event with sibling events then * upgrade the siblings to singleton events by adding them - * to the context list directly: + * to whatever list we are on. */ list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) { - struct list_head *list; - - list = ctx_group_list(event, ctx); - list_move_tail(&sibling->group_entry, list); + if (list) + list_move_tail(&sibling->group_entry, list); sibling->group_leader = sibling; /* Inherit group flags from the previous leader */ @@ -504,18 +549,6 @@ retry: } /* - * Update total_time_enabled and total_time_running for all events in a group. - */ -static void update_group_times(struct perf_event *leader) -{ - struct perf_event *event; - - update_event_times(leader); - list_for_each_entry(event, &leader->sibling_list, group_entry) - update_event_times(event); -} - -/* * Cross CPU call to disable a performance event */ static void __perf_event_disable(void *info) @@ -639,18 +672,26 @@ group_sched_in(struct perf_event *group_event, struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) { - struct perf_event *event, *partial_group; + struct perf_event *event, *partial_group = NULL; + const struct pmu *pmu = group_event->pmu; + bool txn = false; int ret; if (group_event->state == PERF_EVENT_STATE_OFF) return 0; - ret = hw_perf_group_sched_in(group_event, cpuctx, ctx); - if (ret) - return ret < 0 ? ret : 0; + /* Check if group transaction availabe */ + if (pmu->start_txn) + txn = true; - if (event_sched_in(group_event, cpuctx, ctx)) + if (txn) + pmu->start_txn(pmu); + + if (event_sched_in(group_event, cpuctx, ctx)) { + if (txn) + pmu->cancel_txn(pmu); return -EAGAIN; + } /* * Schedule in siblings as one group (if any): @@ -662,7 +703,14 @@ group_sched_in(struct perf_event *group_event, } } - return 0; + if (!txn) + return 0; + + ret = pmu->commit_txn(pmu); + if (!ret) { + pmu->cancel_txn(pmu); + return 0; + } group_error: /* @@ -676,6 +724,9 @@ group_error: } event_sched_out(group_event, cpuctx, ctx); + if (txn) + pmu->cancel_txn(pmu); + return -EAGAIN; } @@ -714,6 +765,7 @@ static void add_event_to_ctx(struct perf_event *event, struct perf_event_context *ctx) { list_add_event(event, ctx); + perf_group_attach(event); event->tstamp_enabled = ctx->time; event->tstamp_running = ctx->time; event->tstamp_stopped = ctx->time; @@ -1164,11 +1216,9 @@ void perf_event_task_sched_out(struct task_struct *task, struct perf_event_context *ctx = task->perf_event_ctxp; struct perf_event_context *next_ctx; struct perf_event_context *parent; - struct pt_regs *regs; int do_switch = 1; - regs = task_pt_regs(task); - perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0); + perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0); if (likely(!ctx || !cpuctx->task_ctx)) return; @@ -1368,6 +1418,8 @@ void perf_event_task_sched_in(struct task_struct *task) if (cpuctx->task_ctx == ctx) return; + perf_disable(); + /* * We want to keep the following priority order: * cpu pinned (that don't need to move), task pinned, @@ -1380,6 +1432,8 @@ void perf_event_task_sched_in(struct task_struct *task) ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE); cpuctx->task_ctx = ctx; + + perf_enable(); } #define MAX_INTERRUPTS (~0ULL) @@ -1453,6 +1507,9 @@ do { \ divisor = nsec * frequency; } + if (!divisor) + return dividend; + return div64_u64(dividend, divisor); } @@ -1475,7 +1532,7 @@ static int perf_event_start(struct perf_event *event) static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) { struct hw_perf_event *hwc = &event->hw; - u64 period, sample_period; + s64 period, sample_period; s64 delta; period = perf_calculate_period(event, nsec, count); @@ -1826,6 +1883,7 @@ static void free_event_rcu(struct rcu_head *head) } static void perf_pending_sync(struct perf_event *event); +static void perf_mmap_data_put(struct perf_mmap_data *data); static void free_event(struct perf_event *event) { @@ -1841,9 +1899,9 @@ static void free_event(struct perf_event *event) atomic_dec(&nr_task_events); } - if (event->output) { - fput(event->output->filp); - event->output = NULL; + if (event->data) { + perf_mmap_data_put(event->data); + event->data = NULL; } if (event->destroy) @@ -1857,9 +1915,30 @@ int perf_event_release_kernel(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; + /* + * Remove from the PMU, can't get re-enabled since we got + * here because the last ref went. + */ + perf_event_disable(event); + WARN_ON_ONCE(ctx->parent_ctx); - mutex_lock(&ctx->mutex); - perf_event_remove_from_context(event); + /* + * There are two ways this annotation is useful: + * + * 1) there is a lock recursion from perf_event_exit_task + * see the comment there. + * + * 2) there is a lock-inversion with mmap_sem through + * perf_event_read_group(), which takes faults while + * holding ctx->mutex, however this is called after + * the last filedesc died, so there is no possibility + * to trigger the AB-BA case. + */ + mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING); + raw_spin_lock_irq(&ctx->lock); + perf_group_detach(event); + list_del_event(event, ctx); + raw_spin_unlock_irq(&ctx->lock); mutex_unlock(&ctx->mutex); mutex_lock(&event->owner->perf_event_mutex); @@ -2139,7 +2218,27 @@ unlock: return ret; } -static int perf_event_set_output(struct perf_event *event, int output_fd); +static const struct file_operations perf_fops; + +static struct perf_event *perf_fget_light(int fd, int *fput_needed) +{ + struct file *file; + + file = fget_light(fd, fput_needed); + if (!file) + return ERR_PTR(-EBADF); + + if (file->f_op != &perf_fops) { + fput_light(file, *fput_needed); + *fput_needed = 0; + return ERR_PTR(-EBADF); + } + + return file->private_data; +} + +static int perf_event_set_output(struct perf_event *event, + struct perf_event *output_event); static int perf_event_set_filter(struct perf_event *event, void __user *arg); static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) @@ -2166,7 +2265,23 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return perf_event_period(event, (u64 __user *)arg); case PERF_EVENT_IOC_SET_OUTPUT: - return perf_event_set_output(event, arg); + { + struct perf_event *output_event = NULL; + int fput_needed = 0; + int ret; + + if (arg != -1) { + output_event = perf_fget_light(arg, &fput_needed); + if (IS_ERR(output_event)) + return PTR_ERR(output_event); + } + + ret = perf_event_set_output(event, output_event); + if (output_event) + fput_light(output_event->filp, fput_needed); + + return ret; + } case PERF_EVENT_IOC_SET_FILTER: return perf_event_set_filter(event, (void __user *)arg); @@ -2261,11 +2376,6 @@ unlock: rcu_read_unlock(); } -static unsigned long perf_data_size(struct perf_mmap_data *data) -{ - return data->nr_pages << (PAGE_SHIFT + data->data_order); -} - #ifndef CONFIG_PERF_USE_VMALLOC /* @@ -2284,6 +2394,19 @@ perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff) return virt_to_page(data->data_pages[pgoff - 1]); } +static void *perf_mmap_alloc_page(int cpu) +{ + struct page *page; + int node; + + node = (cpu == -1) ? cpu : cpu_to_node(cpu); + page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); + if (!page) + return NULL; + + return page_address(page); +} + static struct perf_mmap_data * perf_mmap_data_alloc(struct perf_event *event, int nr_pages) { @@ -2291,8 +2414,6 @@ perf_mmap_data_alloc(struct perf_event *event, int nr_pages) unsigned long size; int i; - WARN_ON(atomic_read(&event->mmap_count)); - size = sizeof(struct perf_mmap_data); size += nr_pages * sizeof(void *); @@ -2300,17 +2421,16 @@ perf_mmap_data_alloc(struct perf_event *event, int nr_pages) if (!data) goto fail; - data->user_page = (void *)get_zeroed_page(GFP_KERNEL); + data->user_page = perf_mmap_alloc_page(event->cpu); if (!data->user_page) goto fail_user_page; for (i = 0; i < nr_pages; i++) { - data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL); + data->data_pages[i] = perf_mmap_alloc_page(event->cpu); if (!data->data_pages[i]) goto fail_data_pages; } - data->data_order = 0; data->nr_pages = nr_pages; return data; @@ -2346,6 +2466,11 @@ static void perf_mmap_data_free(struct perf_mmap_data *data) kfree(data); } +static inline int page_order(struct perf_mmap_data *data) +{ + return 0; +} + #else /* @@ -2354,10 +2479,15 @@ static void perf_mmap_data_free(struct perf_mmap_data *data) * Required for architectures that have d-cache aliasing issues. */ +static inline int page_order(struct perf_mmap_data *data) +{ + return data->page_order; +} + static struct page * perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff) { - if (pgoff > (1UL << data->data_order)) + if (pgoff > (1UL << page_order(data))) return NULL; return vmalloc_to_page((void *)data->user_page + pgoff * PAGE_SIZE); @@ -2377,7 +2507,7 @@ static void perf_mmap_data_free_work(struct work_struct *work) int i, nr; data = container_of(work, struct perf_mmap_data, work); - nr = 1 << data->data_order; + nr = 1 << page_order(data); base = data->user_page; for (i = 0; i < nr + 1; i++) @@ -2399,8 +2529,6 @@ perf_mmap_data_alloc(struct perf_event *event, int nr_pages) unsigned long size; void *all_buf; - WARN_ON(atomic_read(&event->mmap_count)); - size = sizeof(struct perf_mmap_data); size += sizeof(void *); @@ -2416,7 +2544,7 @@ perf_mmap_data_alloc(struct perf_event *event, int nr_pages) data->user_page = all_buf; data->data_pages[0] = all_buf + PAGE_SIZE; - data->data_order = ilog2(nr_pages); + data->page_order = ilog2(nr_pages); data->nr_pages = 1; return data; @@ -2430,6 +2558,11 @@ fail: #endif +static unsigned long perf_data_size(struct perf_mmap_data *data) +{ + return data->nr_pages << (PAGE_SHIFT + page_order(data)); +} + static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { struct perf_event *event = vma->vm_file->private_data; @@ -2470,8 +2603,6 @@ perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data) { long max_size = perf_data_size(data); - atomic_set(&data->lock, -1); - if (event->attr.watermark) { data->watermark = min_t(long, max_size, event->attr.wakeup_watermark); @@ -2480,7 +2611,7 @@ perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data) if (!data->watermark) data->watermark = max_size / 2; - + atomic_set(&data->refcount, 1); rcu_assign_pointer(event->data, data); } @@ -2492,13 +2623,26 @@ static void perf_mmap_data_free_rcu(struct rcu_head *rcu_head) perf_mmap_data_free(data); } -static void perf_mmap_data_release(struct perf_event *event) +static struct perf_mmap_data *perf_mmap_data_get(struct perf_event *event) { - struct perf_mmap_data *data = event->data; + struct perf_mmap_data *data; - WARN_ON(atomic_read(&event->mmap_count)); + rcu_read_lock(); + data = rcu_dereference(event->data); + if (data) { + if (!atomic_inc_not_zero(&data->refcount)) + data = NULL; + } + rcu_read_unlock(); + + return data; +} + +static void perf_mmap_data_put(struct perf_mmap_data *data) +{ + if (!atomic_dec_and_test(&data->refcount)) + return; - rcu_assign_pointer(event->data, NULL); call_rcu(&data->rcu_head, perf_mmap_data_free_rcu); } @@ -2513,15 +2657,18 @@ static void perf_mmap_close(struct vm_area_struct *vma) { struct perf_event *event = vma->vm_file->private_data; - WARN_ON_ONCE(event->ctx->parent_ctx); if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) { unsigned long size = perf_data_size(event->data); - struct user_struct *user = current_user(); + struct user_struct *user = event->mmap_user; + struct perf_mmap_data *data = event->data; atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm); - vma->vm_mm->locked_vm -= event->data->nr_locked; - perf_mmap_data_release(event); + vma->vm_mm->locked_vm -= event->mmap_locked; + rcu_assign_pointer(event->data, NULL); mutex_unlock(&event->mmap_mutex); + + perf_mmap_data_put(data); + free_uid(user); } } @@ -2544,6 +2691,14 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) long user_extra, extra; int ret = 0; + /* + * Don't allow mmap() of inherited per-task counters. This would + * create a performance issue due to all children writing to the + * same buffer. + */ + if (event->cpu == -1 && event->attr.inherit) + return -EINVAL; + if (!(vma->vm_flags & VM_SHARED)) return -EINVAL; @@ -2565,13 +2720,10 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) WARN_ON_ONCE(event->ctx->parent_ctx); mutex_lock(&event->mmap_mutex); - if (event->output) { - ret = -EINVAL; - goto unlock; - } - - if (atomic_inc_not_zero(&event->mmap_count)) { - if (nr_pages != event->data->nr_pages) + if (event->data) { + if (event->data->nr_pages == nr_pages) + atomic_inc(&event->data->refcount); + else ret = -EINVAL; goto unlock; } @@ -2603,21 +2755,23 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) WARN_ON(event->data); data = perf_mmap_data_alloc(event, nr_pages); - ret = -ENOMEM; - if (!data) + if (!data) { + ret = -ENOMEM; goto unlock; + } - ret = 0; perf_mmap_data_init(event, data); - - atomic_set(&event->mmap_count, 1); - atomic_long_add(user_extra, &user->locked_vm); - vma->vm_mm->locked_vm += extra; - event->data->nr_locked = extra; if (vma->vm_flags & VM_WRITE) event->data->writable = 1; + atomic_long_add(user_extra, &user->locked_vm); + event->mmap_locked = extra; + event->mmap_user = get_current_user(); + vma->vm_mm->locked_vm += event->mmap_locked; + unlock: + if (!ret) + atomic_inc(&event->mmap_count); mutex_unlock(&event->mmap_mutex); vma->vm_flags |= VM_RESERVED; @@ -2643,6 +2797,7 @@ static int perf_fasync(int fd, struct file *filp, int on) } static const struct file_operations perf_fops = { + .llseek = no_llseek, .release = perf_release, .read = perf_read, .poll = perf_poll, @@ -2786,12 +2941,32 @@ __weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) return NULL; } -#ifdef CONFIG_EVENT_TRACING __weak void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip) { } -#endif + + +/* + * We assume there is only KVM supporting the callbacks. + * Later on, we might change it to a list if there is + * another virtualization implementation supporting the callbacks. + */ +struct perf_guest_info_callbacks *perf_guest_cbs; + +int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) +{ + perf_guest_cbs = cbs; + return 0; +} +EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks); + +int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) +{ + perf_guest_cbs = NULL; + return 0; +} +EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks); /* * Output @@ -2828,127 +3003,87 @@ static void perf_output_wakeup(struct perf_output_handle *handle) } /* - * Curious locking construct. - * * We need to ensure a later event_id doesn't publish a head when a former - * event_id isn't done writing. However since we need to deal with NMIs we + * event isn't done writing. However since we need to deal with NMIs we * cannot fully serialize things. * - * What we do is serialize between CPUs so we only have to deal with NMI - * nesting on a single CPU. - * * We only publish the head (and generate a wakeup) when the outer-most - * event_id completes. + * event completes. */ -static void perf_output_lock(struct perf_output_handle *handle) +static void perf_output_get_handle(struct perf_output_handle *handle) { struct perf_mmap_data *data = handle->data; - int cur, cpu = get_cpu(); - handle->locked = 0; - - for (;;) { - cur = atomic_cmpxchg(&data->lock, -1, cpu); - if (cur == -1) { - handle->locked = 1; - break; - } - if (cur == cpu) - break; - - cpu_relax(); - } + preempt_disable(); + local_inc(&data->nest); + handle->wakeup = local_read(&data->wakeup); } -static void perf_output_unlock(struct perf_output_handle *handle) +static void perf_output_put_handle(struct perf_output_handle *handle) { struct perf_mmap_data *data = handle->data; unsigned long head; - int cpu; - - data->done_head = data->head; - - if (!handle->locked) - goto out; again: - /* - * The xchg implies a full barrier that ensures all writes are done - * before we publish the new head, matched by a rmb() in userspace when - * reading this position. - */ - while ((head = atomic_long_xchg(&data->done_head, 0))) - data->user_page->data_head = head; + head = local_read(&data->head); /* - * NMI can happen here, which means we can miss a done_head update. + * IRQ/NMI can happen here, which means we can miss a head update. */ - cpu = atomic_xchg(&data->lock, -1); - WARN_ON_ONCE(cpu != smp_processor_id()); + if (!local_dec_and_test(&data->nest)) + goto out; /* - * Therefore we have to validate we did not indeed do so. + * Publish the known good head. Rely on the full barrier implied + * by atomic_dec_and_test() order the data->head read and this + * write. */ - if (unlikely(atomic_long_read(&data->done_head))) { - /* - * Since we had it locked, we can lock it again. - */ - while (atomic_cmpxchg(&data->lock, -1, cpu) != -1) - cpu_relax(); + data->user_page->data_head = head; + /* + * Now check if we missed an update, rely on the (compiler) + * barrier in atomic_dec_and_test() to re-read data->head. + */ + if (unlikely(head != local_read(&data->head))) { + local_inc(&data->nest); goto again; } - if (atomic_xchg(&data->wakeup, 0)) + if (handle->wakeup != local_read(&data->wakeup)) perf_output_wakeup(handle); -out: - put_cpu(); + + out: + preempt_enable(); } -void perf_output_copy(struct perf_output_handle *handle, +__always_inline void perf_output_copy(struct perf_output_handle *handle, const void *buf, unsigned int len) { - unsigned int pages_mask; - unsigned long offset; - unsigned int size; - void **pages; - - offset = handle->offset; - pages_mask = handle->data->nr_pages - 1; - pages = handle->data->data_pages; - do { - unsigned long page_offset; - unsigned long page_size; - int nr; + unsigned long size = min_t(unsigned long, handle->size, len); - nr = (offset >> PAGE_SHIFT) & pages_mask; - page_size = 1UL << (handle->data->data_order + PAGE_SHIFT); - page_offset = offset & (page_size - 1); - size = min_t(unsigned int, page_size - page_offset, len); + memcpy(handle->addr, buf, size); - memcpy(pages[nr] + page_offset, buf, size); + len -= size; + handle->addr += size; + buf += size; + handle->size -= size; + if (!handle->size) { + struct perf_mmap_data *data = handle->data; - len -= size; - buf += size; - offset += size; + handle->page++; + handle->page &= data->nr_pages - 1; + handle->addr = data->data_pages[handle->page]; + handle->size = PAGE_SIZE << page_order(data); + } } while (len); - - handle->offset = offset; - - /* - * Check we didn't copy past our reservation window, taking the - * possible unsigned int wrap into account. - */ - WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0); } int perf_output_begin(struct perf_output_handle *handle, struct perf_event *event, unsigned int size, int nmi, int sample) { - struct perf_event *output_event; struct perf_mmap_data *data; unsigned long tail, offset, head; int have_lost; @@ -2965,10 +3100,6 @@ int perf_output_begin(struct perf_output_handle *handle, if (event->parent) event = event->parent; - output_event = rcu_dereference(event->output); - if (output_event) - event = output_event; - data = rcu_dereference(event->data); if (!data) goto out; @@ -2979,13 +3110,13 @@ int perf_output_begin(struct perf_output_handle *handle, handle->sample = sample; if (!data->nr_pages) - goto fail; + goto out; - have_lost = atomic_read(&data->lost); + have_lost = local_read(&data->lost); if (have_lost) size += sizeof(lost_event); - perf_output_lock(handle); + perf_output_get_handle(handle); do { /* @@ -2995,24 +3126,28 @@ int perf_output_begin(struct perf_output_handle *handle, */ tail = ACCESS_ONCE(data->user_page->data_tail); smp_rmb(); - offset = head = atomic_long_read(&data->head); + offset = head = local_read(&data->head); head += size; if (unlikely(!perf_output_space(data, tail, offset, head))) goto fail; - } while (atomic_long_cmpxchg(&data->head, offset, head) != offset); + } while (local_cmpxchg(&data->head, offset, head) != offset); - handle->offset = offset; - handle->head = head; + if (head - local_read(&data->wakeup) > data->watermark) + local_add(data->watermark, &data->wakeup); - if (head - tail > data->watermark) - atomic_set(&data->wakeup, 1); + handle->page = offset >> (PAGE_SHIFT + page_order(data)); + handle->page &= data->nr_pages - 1; + handle->size = offset & ((PAGE_SIZE << page_order(data)) - 1); + handle->addr = data->data_pages[handle->page]; + handle->addr += handle->size; + handle->size = (PAGE_SIZE << page_order(data)) - handle->size; if (have_lost) { lost_event.header.type = PERF_RECORD_LOST; lost_event.header.misc = 0; lost_event.header.size = sizeof(lost_event); lost_event.id = event->id; - lost_event.lost = atomic_xchg(&data->lost, 0); + lost_event.lost = local_xchg(&data->lost, 0); perf_output_put(handle, lost_event); } @@ -3020,8 +3155,8 @@ int perf_output_begin(struct perf_output_handle *handle, return 0; fail: - atomic_inc(&data->lost); - perf_output_unlock(handle); + local_inc(&data->lost); + perf_output_put_handle(handle); out: rcu_read_unlock(); @@ -3036,14 +3171,14 @@ void perf_output_end(struct perf_output_handle *handle) int wakeup_events = event->attr.wakeup_events; if (handle->sample && wakeup_events) { - int events = atomic_inc_return(&data->events); + int events = local_inc_return(&data->events); if (events >= wakeup_events) { - atomic_sub(wakeup_events, &data->events); - atomic_set(&data->wakeup, 1); + local_sub(wakeup_events, &data->events); + local_inc(&data->wakeup); } } - perf_output_unlock(handle); + perf_output_put_handle(handle); rcu_read_unlock(); } @@ -3378,9 +3513,8 @@ static void perf_event_task_output(struct perf_event *event, struct perf_task_event *task_event) { struct perf_output_handle handle; - int size; struct task_struct *task = task_event->task; - int ret; + int size, ret; size = task_event->event_id.header.size; ret = perf_output_begin(&handle, event, size, 0, 0); @@ -3736,7 +3870,7 @@ void __perf_event_mmap(struct vm_area_struct *vma) .event_id = { .header = { .type = PERF_RECORD_MMAP, - .misc = 0, + .misc = PERF_RECORD_MISC_USER, /* .size */ }, /* .pid */ @@ -3924,13 +4058,6 @@ static void perf_swevent_overflow(struct perf_event *event, u64 overflow, } } -static void perf_swevent_unthrottle(struct perf_event *event) -{ - /* - * Nothing to do, we already reset hwc->interrupts. - */ -} - static void perf_swevent_add(struct perf_event *event, u64 nr, int nmi, struct perf_sample_data *data, struct pt_regs *regs) @@ -3954,39 +4081,6 @@ static void perf_swevent_add(struct perf_event *event, u64 nr, perf_swevent_overflow(event, 0, nmi, data, regs); } -static int perf_swevent_is_counting(struct perf_event *event) -{ - /* - * The event is active, we're good! - */ - if (event->state == PERF_EVENT_STATE_ACTIVE) - return 1; - - /* - * The event is off/error, not counting. - */ - if (event->state != PERF_EVENT_STATE_INACTIVE) - return 0; - - /* - * The event is inactive, if the context is active - * we're part of a group that didn't make it on the 'pmu', - * not counting. - */ - if (event->ctx->is_active) - return 0; - - /* - * We're inactive and the context is too, this means the - * task is scheduled out, we're counting events that happen - * to us, like migration events. - */ - return 1; -} - -static int perf_tp_event_match(struct perf_event *event, - struct perf_sample_data *data); - static int perf_exclude_event(struct perf_event *event, struct pt_regs *regs) { @@ -4007,12 +4101,6 @@ static int perf_swevent_match(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs) { - if (event->cpu != -1 && event->cpu != smp_processor_id()) - return 0; - - if (!perf_swevent_is_counting(event)) - return 0; - if (event->attr.type != type) return 0; @@ -4022,30 +4110,88 @@ static int perf_swevent_match(struct perf_event *event, if (perf_exclude_event(event, regs)) return 0; - if (event->attr.type == PERF_TYPE_TRACEPOINT && - !perf_tp_event_match(event, data)) - return 0; - return 1; } -static void perf_swevent_ctx_event(struct perf_event_context *ctx, - enum perf_type_id type, - u32 event_id, u64 nr, int nmi, - struct perf_sample_data *data, - struct pt_regs *regs) +static inline u64 swevent_hash(u64 type, u32 event_id) +{ + u64 val = event_id | (type << 32); + + return hash_64(val, SWEVENT_HLIST_BITS); +} + +static inline struct hlist_head * +__find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id) +{ + u64 hash = swevent_hash(type, event_id); + + return &hlist->heads[hash]; +} + +/* For the read side: events when they trigger */ +static inline struct hlist_head * +find_swevent_head_rcu(struct perf_cpu_context *ctx, u64 type, u32 event_id) +{ + struct swevent_hlist *hlist; + + hlist = rcu_dereference(ctx->swevent_hlist); + if (!hlist) + return NULL; + + return __find_swevent_head(hlist, type, event_id); +} + +/* For the event head insertion and removal in the hlist */ +static inline struct hlist_head * +find_swevent_head(struct perf_cpu_context *ctx, struct perf_event *event) +{ + struct swevent_hlist *hlist; + u32 event_id = event->attr.config; + u64 type = event->attr.type; + + /* + * Event scheduling is always serialized against hlist allocation + * and release. Which makes the protected version suitable here. + * The context lock guarantees that. + */ + hlist = rcu_dereference_protected(ctx->swevent_hlist, + lockdep_is_held(&event->ctx->lock)); + if (!hlist) + return NULL; + + return __find_swevent_head(hlist, type, event_id); +} + +static void do_perf_sw_event(enum perf_type_id type, u32 event_id, + u64 nr, int nmi, + struct perf_sample_data *data, + struct pt_regs *regs) { + struct perf_cpu_context *cpuctx; struct perf_event *event; + struct hlist_node *node; + struct hlist_head *head; - list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { + cpuctx = &__get_cpu_var(perf_cpu_context); + + rcu_read_lock(); + + head = find_swevent_head_rcu(cpuctx, type, event_id); + + if (!head) + goto end; + + hlist_for_each_entry_rcu(event, node, head, hlist_entry) { if (perf_swevent_match(event, type, event_id, data, regs)) perf_swevent_add(event, nr, nmi, data, regs); } +end: + rcu_read_unlock(); } int perf_swevent_get_recursion_context(void) { - struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context); + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); int rctx; if (in_nmi()) @@ -4057,10 +4203,8 @@ int perf_swevent_get_recursion_context(void) else rctx = 0; - if (cpuctx->recursion[rctx]) { - put_cpu_var(perf_cpu_context); + if (cpuctx->recursion[rctx]) return -1; - } cpuctx->recursion[rctx]++; barrier(); @@ -4074,31 +4218,9 @@ void perf_swevent_put_recursion_context(int rctx) struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); barrier(); cpuctx->recursion[rctx]--; - put_cpu_var(perf_cpu_context); } EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context); -static void do_perf_sw_event(enum perf_type_id type, u32 event_id, - u64 nr, int nmi, - struct perf_sample_data *data, - struct pt_regs *regs) -{ - struct perf_cpu_context *cpuctx; - struct perf_event_context *ctx; - - cpuctx = &__get_cpu_var(perf_cpu_context); - rcu_read_lock(); - perf_swevent_ctx_event(&cpuctx->ctx, type, event_id, - nr, nmi, data, regs); - /* - * doesn't really matter which of the child contexts the - * events ends up in. - */ - ctx = rcu_dereference(current->perf_event_ctxp); - if (ctx) - perf_swevent_ctx_event(ctx, type, event_id, nr, nmi, data, regs); - rcu_read_unlock(); -} void __perf_sw_event(u32 event_id, u64 nr, int nmi, struct pt_regs *regs, u64 addr) @@ -4106,6 +4228,7 @@ void __perf_sw_event(u32 event_id, u64 nr, int nmi, struct perf_sample_data data; int rctx; + preempt_disable_notrace(); rctx = perf_swevent_get_recursion_context(); if (rctx < 0) return; @@ -4115,6 +4238,7 @@ void __perf_sw_event(u32 event_id, u64 nr, int nmi, do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs); perf_swevent_put_recursion_context(rctx); + preempt_enable_notrace(); } static void perf_swevent_read(struct perf_event *event) @@ -4124,23 +4248,46 @@ static void perf_swevent_read(struct perf_event *event) static int perf_swevent_enable(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; + struct perf_cpu_context *cpuctx; + struct hlist_head *head; + + cpuctx = &__get_cpu_var(perf_cpu_context); if (hwc->sample_period) { hwc->last_period = hwc->sample_period; perf_swevent_set_period(event); } + + head = find_swevent_head(cpuctx, event); + if (WARN_ON_ONCE(!head)) + return -EINVAL; + + hlist_add_head_rcu(&event->hlist_entry, head); + return 0; } static void perf_swevent_disable(struct perf_event *event) { + hlist_del_rcu(&event->hlist_entry); +} + +static void perf_swevent_void(struct perf_event *event) +{ +} + +static int perf_swevent_int(struct perf_event *event) +{ + return 0; } static const struct pmu perf_ops_generic = { .enable = perf_swevent_enable, .disable = perf_swevent_disable, + .start = perf_swevent_int, + .stop = perf_swevent_void, .read = perf_swevent_read, - .unthrottle = perf_swevent_unthrottle, + .unthrottle = perf_swevent_void, /* hwc->interrupts already reset */ }; /* @@ -4161,15 +4308,8 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) perf_sample_data_init(&data, 0); data.period = event->hw.last_period; regs = get_irq_regs(); - /* - * In case we exclude kernel IPs or are somehow not in interrupt - * context, provide the next best thing, the user IP. - */ - if ((event->attr.exclude_kernel || !regs) && - !event->attr.exclude_user) - regs = task_pt_regs(current); - if (regs) { + if (regs && !perf_exclude_event(event, regs)) { if (!(event->attr.exclude_idle && current->pid == 0)) if (perf_event_overflow(event, 0, &data, regs)) ret = HRTIMER_NORESTART; @@ -4317,27 +4457,124 @@ static const struct pmu perf_ops_task_clock = { .read = task_clock_perf_event_read, }; -#ifdef CONFIG_EVENT_TRACING +/* Deref the hlist from the update side */ +static inline struct swevent_hlist * +swevent_hlist_deref(struct perf_cpu_context *cpuctx) +{ + return rcu_dereference_protected(cpuctx->swevent_hlist, + lockdep_is_held(&cpuctx->hlist_mutex)); +} -void perf_tp_event(int event_id, u64 addr, u64 count, void *record, - int entry_size, struct pt_regs *regs) +static void swevent_hlist_release_rcu(struct rcu_head *rcu_head) { - struct perf_sample_data data; - struct perf_raw_record raw = { - .size = entry_size, - .data = record, - }; + struct swevent_hlist *hlist; - perf_sample_data_init(&data, addr); - data.raw = &raw; + hlist = container_of(rcu_head, struct swevent_hlist, rcu_head); + kfree(hlist); +} + +static void swevent_hlist_release(struct perf_cpu_context *cpuctx) +{ + struct swevent_hlist *hlist = swevent_hlist_deref(cpuctx); - /* Trace events already protected against recursion */ - do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, - &data, regs); + if (!hlist) + return; + + rcu_assign_pointer(cpuctx->swevent_hlist, NULL); + call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu); } -EXPORT_SYMBOL_GPL(perf_tp_event); -static int perf_tp_event_match(struct perf_event *event, +static void swevent_hlist_put_cpu(struct perf_event *event, int cpu) +{ + struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); + + mutex_lock(&cpuctx->hlist_mutex); + + if (!--cpuctx->hlist_refcount) + swevent_hlist_release(cpuctx); + + mutex_unlock(&cpuctx->hlist_mutex); +} + +static void swevent_hlist_put(struct perf_event *event) +{ + int cpu; + + if (event->cpu != -1) { + swevent_hlist_put_cpu(event, event->cpu); + return; + } + + for_each_possible_cpu(cpu) + swevent_hlist_put_cpu(event, cpu); +} + +static int swevent_hlist_get_cpu(struct perf_event *event, int cpu) +{ + struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); + int err = 0; + + mutex_lock(&cpuctx->hlist_mutex); + + if (!swevent_hlist_deref(cpuctx) && cpu_online(cpu)) { + struct swevent_hlist *hlist; + + hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); + if (!hlist) { + err = -ENOMEM; + goto exit; + } + rcu_assign_pointer(cpuctx->swevent_hlist, hlist); + } + cpuctx->hlist_refcount++; + exit: + mutex_unlock(&cpuctx->hlist_mutex); + + return err; +} + +static int swevent_hlist_get(struct perf_event *event) +{ + int err; + int cpu, failed_cpu; + + if (event->cpu != -1) + return swevent_hlist_get_cpu(event, event->cpu); + + get_online_cpus(); + for_each_possible_cpu(cpu) { + err = swevent_hlist_get_cpu(event, cpu); + if (err) { + failed_cpu = cpu; + goto fail; + } + } + put_online_cpus(); + + return 0; + fail: + for_each_possible_cpu(cpu) { + if (cpu == failed_cpu) + break; + swevent_hlist_put_cpu(event, cpu); + } + + put_online_cpus(); + return err; +} + +#ifdef CONFIG_EVENT_TRACING + +static const struct pmu perf_ops_tracepoint = { + .enable = perf_trace_enable, + .disable = perf_trace_disable, + .start = perf_swevent_int, + .stop = perf_swevent_void, + .read = perf_swevent_read, + .unthrottle = perf_swevent_void, +}; + +static int perf_tp_filter_match(struct perf_event *event, struct perf_sample_data *data) { void *record = data->raw->data; @@ -4347,13 +4584,55 @@ static int perf_tp_event_match(struct perf_event *event, return 0; } +static int perf_tp_event_match(struct perf_event *event, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + /* + * All tracepoints are from kernel-space. + */ + if (event->attr.exclude_kernel) + return 0; + + if (!perf_tp_filter_match(event, data)) + return 0; + + return 1; +} + +void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, + struct pt_regs *regs, struct hlist_head *head) +{ + struct perf_sample_data data; + struct perf_event *event; + struct hlist_node *node; + + struct perf_raw_record raw = { + .size = entry_size, + .data = record, + }; + + perf_sample_data_init(&data, addr); + data.raw = &raw; + + rcu_read_lock(); + hlist_for_each_entry_rcu(event, node, head, hlist_entry) { + if (perf_tp_event_match(event, &data, regs)) + perf_swevent_add(event, count, 1, &data, regs); + } + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(perf_tp_event); + static void tp_perf_event_destroy(struct perf_event *event) { - perf_trace_disable(event->attr.config); + perf_trace_destroy(event); } static const struct pmu *tp_perf_event_init(struct perf_event *event) { + int err; + /* * Raw tracepoint data is a severe data leak, only allow root to * have these. @@ -4363,12 +4642,13 @@ static const struct pmu *tp_perf_event_init(struct perf_event *event) !capable(CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); - if (perf_trace_enable(event->attr.config)) + err = perf_trace_init(event); + if (err) return NULL; event->destroy = tp_perf_event_destroy; - return &perf_ops_generic; + return &perf_ops_tracepoint; } static int perf_event_set_filter(struct perf_event *event, void __user *arg) @@ -4396,12 +4676,6 @@ static void perf_event_free_filter(struct perf_event *event) #else -static int perf_tp_event_match(struct perf_event *event, - struct perf_sample_data *data) -{ - return 1; -} - static const struct pmu *tp_perf_event_init(struct perf_event *event) { return NULL; @@ -4467,6 +4741,7 @@ static void sw_perf_event_destroy(struct perf_event *event) WARN_ON(event->parent); atomic_dec(&perf_swevent_enabled[event_id]); + swevent_hlist_put(event); } static const struct pmu *sw_perf_event_init(struct perf_event *event) @@ -4505,6 +4780,12 @@ static const struct pmu *sw_perf_event_init(struct perf_event *event) case PERF_COUNT_SW_ALIGNMENT_FAULTS: case PERF_COUNT_SW_EMULATION_FAULTS: if (!event->parent) { + int err; + + err = swevent_hlist_get(event); + if (err) + return ERR_PTR(err); + atomic_inc(&perf_swevent_enabled[event_id]); event->destroy = sw_perf_event_destroy; } @@ -4723,54 +5004,53 @@ err_size: goto out; } -static int perf_event_set_output(struct perf_event *event, int output_fd) +static int +perf_event_set_output(struct perf_event *event, struct perf_event *output_event) { - struct perf_event *output_event = NULL; - struct file *output_file = NULL; - struct perf_event *old_output; - int fput_needed = 0; + struct perf_mmap_data *data = NULL, *old_data = NULL; int ret = -EINVAL; - if (!output_fd) + if (!output_event) goto set; - output_file = fget_light(output_fd, &fput_needed); - if (!output_file) - return -EBADF; - - if (output_file->f_op != &perf_fops) + /* don't allow circular references */ + if (event == output_event) goto out; - output_event = output_file->private_data; - - /* Don't chain output fds */ - if (output_event->output) + /* + * Don't allow cross-cpu buffers + */ + if (output_event->cpu != event->cpu) goto out; - /* Don't set an output fd when we already have an output channel */ - if (event->data) + /* + * If its not a per-cpu buffer, it must be the same task. + */ + if (output_event->cpu == -1 && output_event->ctx != event->ctx) goto out; - atomic_long_inc(&output_file->f_count); - set: mutex_lock(&event->mmap_mutex); - old_output = event->output; - rcu_assign_pointer(event->output, output_event); - mutex_unlock(&event->mmap_mutex); + /* Can't redirect output if we've got an active mmap() */ + if (atomic_read(&event->mmap_count)) + goto unlock; - if (old_output) { - /* - * we need to make sure no existing perf_output_*() - * is still referencing this event. - */ - synchronize_rcu(); - fput(old_output->filp); + if (output_event) { + /* get the buffer we want to redirect to */ + data = perf_mmap_data_get(output_event); + if (!data) + goto unlock; } + old_data = event->data; + rcu_assign_pointer(event->data, data); ret = 0; +unlock: + mutex_unlock(&event->mmap_mutex); + + if (old_data) + perf_mmap_data_put(old_data); out: - fput_light(output_file, fput_needed); return ret; } @@ -4786,13 +5066,13 @@ SYSCALL_DEFINE5(perf_event_open, struct perf_event_attr __user *, attr_uptr, pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) { - struct perf_event *event, *group_leader; + struct perf_event *event, *group_leader = NULL, *output_event = NULL; struct perf_event_attr attr; struct perf_event_context *ctx; struct file *event_file = NULL; struct file *group_file = NULL; + int event_fd; int fput_needed = 0; - int fput_needed2 = 0; int err; /* for future expandability... */ @@ -4813,26 +5093,38 @@ SYSCALL_DEFINE5(perf_event_open, return -EINVAL; } + event_fd = get_unused_fd_flags(O_RDWR); + if (event_fd < 0) + return event_fd; + /* * Get the target context (task or percpu): */ ctx = find_get_context(pid, cpu); - if (IS_ERR(ctx)) - return PTR_ERR(ctx); + if (IS_ERR(ctx)) { + err = PTR_ERR(ctx); + goto err_fd; + } + + if (group_fd != -1) { + group_leader = perf_fget_light(group_fd, &fput_needed); + if (IS_ERR(group_leader)) { + err = PTR_ERR(group_leader); + goto err_put_context; + } + group_file = group_leader->filp; + if (flags & PERF_FLAG_FD_OUTPUT) + output_event = group_leader; + if (flags & PERF_FLAG_FD_NO_GROUP) + group_leader = NULL; + } /* * Look up the group leader (we will attach this event to it): */ - group_leader = NULL; - if (group_fd != -1 && !(flags & PERF_FLAG_FD_NO_GROUP)) { + if (group_leader) { err = -EINVAL; - group_file = fget_light(group_fd, &fput_needed); - if (!group_file) - goto err_put_context; - if (group_file->f_op != &perf_fops) - goto err_put_context; - group_leader = group_file->private_data; /* * Do not allow a recursive hierarchy (this new sibling * becoming part of another group-sibling): @@ -4854,22 +5146,21 @@ SYSCALL_DEFINE5(perf_event_open, event = perf_event_alloc(&attr, cpu, ctx, group_leader, NULL, NULL, GFP_KERNEL); - err = PTR_ERR(event); - if (IS_ERR(event)) + if (IS_ERR(event)) { + err = PTR_ERR(event); goto err_put_context; + } - err = anon_inode_getfd("[perf_event]", &perf_fops, event, O_RDWR); - if (err < 0) - goto err_free_put_context; + if (output_event) { + err = perf_event_set_output(event, output_event); + if (err) + goto err_free_put_context; + } - event_file = fget_light(err, &fput_needed2); - if (!event_file) + event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR); + if (IS_ERR(event_file)) { + err = PTR_ERR(event_file); goto err_free_put_context; - - if (flags & PERF_FLAG_FD_OUTPUT) { - err = perf_event_set_output(event, group_fd); - if (err) - goto err_fput_free_put_context; } event->filp = event_file; @@ -4885,19 +5176,23 @@ SYSCALL_DEFINE5(perf_event_open, list_add_tail(&event->owner_entry, ¤t->perf_event_list); mutex_unlock(¤t->perf_event_mutex); -err_fput_free_put_context: - fput_light(event_file, fput_needed2); + /* + * Drop the reference on the group_event after placing the + * new event on the sibling_list. This ensures destruction + * of the group leader will find the pointer to itself in + * perf_group_detach(). + */ + fput_light(group_file, fput_needed); + fd_install(event_fd, event_file); + return event_fd; err_free_put_context: - if (err < 0) - kfree(event); - + free_event(event); err_put_context: - if (err < 0) - put_ctx(ctx); - fput_light(group_file, fput_needed); - + put_ctx(ctx); +err_fd: + put_unused_fd(event_fd); return err; } @@ -5169,7 +5464,7 @@ void perf_event_exit_task(struct task_struct *child) * * But since its the parent context it won't be the same instance. */ - mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING); + mutex_lock(&child_ctx->mutex); again: list_for_each_entry_safe(child_event, tmp, &child_ctx->pinned_groups, @@ -5208,6 +5503,7 @@ static void perf_free_event(struct perf_event *event, fput(parent->filp); + perf_group_detach(event); list_del_event(event, ctx); free_event(event); } @@ -5377,6 +5673,7 @@ static void __init perf_event_init_all_cpus(void) for_each_possible_cpu(cpu) { cpuctx = &per_cpu(perf_cpu_context, cpu); + mutex_init(&cpuctx->hlist_mutex); __perf_event_init_context(&cpuctx->ctx, NULL); } } @@ -5390,6 +5687,16 @@ static void __cpuinit perf_event_init_cpu(int cpu) spin_lock(&perf_resource_lock); cpuctx->max_pertask = perf_max_events - perf_reserved_percpu; spin_unlock(&perf_resource_lock); + + mutex_lock(&cpuctx->hlist_mutex); + if (cpuctx->hlist_refcount > 0) { + struct swevent_hlist *hlist; + + hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); + WARN_ON_ONCE(!hlist); + rcu_assign_pointer(cpuctx->swevent_hlist, hlist); + } + mutex_unlock(&cpuctx->hlist_mutex); } #ifdef CONFIG_HOTPLUG_CPU @@ -5409,6 +5716,10 @@ static void perf_event_exit_cpu(int cpu) struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); struct perf_event_context *ctx = &cpuctx->ctx; + mutex_lock(&cpuctx->hlist_mutex); + swevent_hlist_release(cpuctx); + mutex_unlock(&cpuctx->hlist_mutex); + mutex_lock(&ctx->mutex); smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1); mutex_unlock(&ctx->mutex); diff --git a/kernel/pid.c b/kernel/pid.c index aebb30d..e9fd8c1 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -513,6 +513,13 @@ void __init pidhash_init(void) void __init pidmap_init(void) { + /* bump default and minimum pid_max based on number of cpus */ + pid_max = min(pid_max_max, max_t(int, pid_max, + PIDS_PER_CPU_DEFAULT * num_possible_cpus())); + pid_max_min = max_t(int, pid_max_min, + PIDS_PER_CPU_MIN * num_possible_cpus()); + pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min); + init_pid_ns.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL); /* Reserve PID 0. We never call free_pidmap(0) */ set_bit(0, init_pid_ns.pidmap[0].page); diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 79aac93..a5aff94 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -13,6 +13,7 @@ #include <linux/syscalls.h> #include <linux/err.h> #include <linux/acct.h> +#include <linux/slab.h> #define BITS_PER_PAGE (PAGE_SIZE*8) diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c index 3db49b9..996a4de 100644 --- a/kernel/pm_qos_params.c +++ b/kernel/pm_qos_params.c @@ -2,7 +2,7 @@ * This module exposes the interface to kernel space for specifying * QoS dependencies. It provides infrastructure for registration of: * - * Dependents on a QoS value : register requirements + * Dependents on a QoS value : register requests * Watchers of QoS value : get notified when target QoS value changes * * This QoS design is best effort based. Dependents register their QoS needs. @@ -14,19 +14,21 @@ * timeout: usec <-- currently not used. * throughput: kbs (kilo byte / sec) * - * There are lists of pm_qos_objects each one wrapping requirements, notifiers + * There are lists of pm_qos_objects each one wrapping requests, notifiers * - * User mode requirements on a QOS parameter register themselves to the + * User mode requests on a QOS parameter register themselves to the * subsystem by opening the device node /dev/... and writing there request to * the node. As long as the process holds a file handle open to the node the * client continues to be accounted for. Upon file release the usermode - * requirement is removed and a new qos target is computed. This way when the - * requirement that the application has is cleaned up when closes the file + * request is removed and a new qos target is computed. This way when the + * request that the application has is cleaned up when closes the file * pointer or exits the pm_qos_object will get an opportunity to clean up. * * Mark Gross <mgross@linux.intel.com> */ +/*#define DEBUG*/ + #include <linux/pm_qos_params.h> #include <linux/sched.h> #include <linux/spinlock.h> @@ -42,64 +44,53 @@ #include <linux/uaccess.h> /* - * locking rule: all changes to requirements or notifiers lists + * locking rule: all changes to requests or notifiers lists * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock * held, taken with _irqsave. One lock to rule them all */ -struct requirement_list { - struct list_head list; - union { - s32 value; - s32 usec; - s32 kbps; - }; - char *name; +enum pm_qos_type { + PM_QOS_MAX, /* return the largest value */ + PM_QOS_MIN /* return the smallest value */ }; -static s32 max_compare(s32 v1, s32 v2); -static s32 min_compare(s32 v1, s32 v2); - struct pm_qos_object { - struct requirement_list requirements; + struct plist_head requests; struct blocking_notifier_head *notifiers; struct miscdevice pm_qos_power_miscdev; char *name; s32 default_value; - atomic_t target_value; - s32 (*comparitor)(s32, s32); + enum pm_qos_type type; }; +static DEFINE_SPINLOCK(pm_qos_lock); + static struct pm_qos_object null_pm_qos; static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier); static struct pm_qos_object cpu_dma_pm_qos = { - .requirements = {LIST_HEAD_INIT(cpu_dma_pm_qos.requirements.list)}, + .requests = PLIST_HEAD_INIT(cpu_dma_pm_qos.requests, pm_qos_lock), .notifiers = &cpu_dma_lat_notifier, .name = "cpu_dma_latency", .default_value = 2000 * USEC_PER_SEC, - .target_value = ATOMIC_INIT(2000 * USEC_PER_SEC), - .comparitor = min_compare + .type = PM_QOS_MIN, }; static BLOCKING_NOTIFIER_HEAD(network_lat_notifier); static struct pm_qos_object network_lat_pm_qos = { - .requirements = {LIST_HEAD_INIT(network_lat_pm_qos.requirements.list)}, + .requests = PLIST_HEAD_INIT(network_lat_pm_qos.requests, pm_qos_lock), .notifiers = &network_lat_notifier, .name = "network_latency", .default_value = 2000 * USEC_PER_SEC, - .target_value = ATOMIC_INIT(2000 * USEC_PER_SEC), - .comparitor = min_compare + .type = PM_QOS_MIN }; static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier); static struct pm_qos_object network_throughput_pm_qos = { - .requirements = - {LIST_HEAD_INIT(network_throughput_pm_qos.requirements.list)}, + .requests = PLIST_HEAD_INIT(network_throughput_pm_qos.requests, pm_qos_lock), .notifiers = &network_throughput_notifier, .name = "network_throughput", .default_value = 0, - .target_value = ATOMIC_INIT(0), - .comparitor = max_compare + .type = PM_QOS_MAX, }; @@ -110,8 +101,6 @@ static struct pm_qos_object *pm_qos_array[] = { &network_throughput_pm_qos }; -static DEFINE_SPINLOCK(pm_qos_lock); - static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, size_t count, loff_t *f_pos); static int pm_qos_power_open(struct inode *inode, struct file *filp); @@ -123,43 +112,55 @@ static const struct file_operations pm_qos_power_fops = { .release = pm_qos_power_release, }; -/* static helper functions */ -static s32 max_compare(s32 v1, s32 v2) +/* unlocked internal variant */ +static inline int pm_qos_get_value(struct pm_qos_object *o) { - return max(v1, v2); -} + if (plist_head_empty(&o->requests)) + return o->default_value; -static s32 min_compare(s32 v1, s32 v2) -{ - return min(v1, v2); -} + switch (o->type) { + case PM_QOS_MIN: + return plist_last(&o->requests)->prio; + case PM_QOS_MAX: + return plist_first(&o->requests)->prio; -static void update_target(int target) + default: + /* runtime check for not using enum */ + BUG(); + } +} + +static void update_target(struct pm_qos_object *o, struct plist_node *node, + int del, int value) { - s32 extreme_value; - struct requirement_list *node; unsigned long flags; - int call_notifier = 0; + int prev_value, curr_value; spin_lock_irqsave(&pm_qos_lock, flags); - extreme_value = pm_qos_array[target]->default_value; - list_for_each_entry(node, - &pm_qos_array[target]->requirements.list, list) { - extreme_value = pm_qos_array[target]->comparitor( - extreme_value, node->value); - } - if (atomic_read(&pm_qos_array[target]->target_value) != extreme_value) { - call_notifier = 1; - atomic_set(&pm_qos_array[target]->target_value, extreme_value); - pr_debug(KERN_ERR "new target for qos %d is %d\n", target, - atomic_read(&pm_qos_array[target]->target_value)); + prev_value = pm_qos_get_value(o); + /* PM_QOS_DEFAULT_VALUE is a signal that the value is unchanged */ + if (value != PM_QOS_DEFAULT_VALUE) { + /* + * to change the list, we atomically remove, reinit + * with new value and add, then see if the extremal + * changed + */ + plist_del(node, &o->requests); + plist_node_init(node, value); + plist_add(node, &o->requests); + } else if (del) { + plist_del(node, &o->requests); + } else { + plist_add(node, &o->requests); } + curr_value = pm_qos_get_value(o); spin_unlock_irqrestore(&pm_qos_lock, flags); - if (call_notifier) - blocking_notifier_call_chain(pm_qos_array[target]->notifiers, - (unsigned long) extreme_value, NULL); + if (prev_value != curr_value) + blocking_notifier_call_chain(o->notifiers, + (unsigned long)curr_value, + NULL); } static int register_pm_qos_misc(struct pm_qos_object *qos) @@ -185,125 +186,123 @@ static int find_pm_qos_object_by_minor(int minor) } /** - * pm_qos_requirement - returns current system wide qos expectation + * pm_qos_request - returns current system wide qos expectation * @pm_qos_class: identification of which qos value is requested * * This function returns the current target value in an atomic manner. */ -int pm_qos_requirement(int pm_qos_class) +int pm_qos_request(int pm_qos_class) { - return atomic_read(&pm_qos_array[pm_qos_class]->target_value); + unsigned long flags; + int value; + + spin_lock_irqsave(&pm_qos_lock, flags); + value = pm_qos_get_value(pm_qos_array[pm_qos_class]); + spin_unlock_irqrestore(&pm_qos_lock, flags); + + return value; } -EXPORT_SYMBOL_GPL(pm_qos_requirement); +EXPORT_SYMBOL_GPL(pm_qos_request); + +int pm_qos_request_active(struct pm_qos_request_list *req) +{ + return req->pm_qos_class != 0; +} +EXPORT_SYMBOL_GPL(pm_qos_request_active); /** - * pm_qos_add_requirement - inserts new qos request into the list + * pm_qos_add_request - inserts new qos request into the list * @pm_qos_class: identifies which list of qos request to us - * @name: identifies the request * @value: defines the qos request * * This function inserts a new entry in the pm_qos_class list of requested qos * performance characteristics. It recomputes the aggregate QoS expectations - * for the pm_qos_class of parameters. + * for the pm_qos_class of parameters, and returns the pm_qos_request list + * element as a handle for use in updating and removal. Call needs to save + * this handle for later use. */ -int pm_qos_add_requirement(int pm_qos_class, char *name, s32 value) +void pm_qos_add_request(struct pm_qos_request_list *dep, + int pm_qos_class, s32 value) { - struct requirement_list *dep; - unsigned long flags; + struct pm_qos_object *o = pm_qos_array[pm_qos_class]; + int new_value; - dep = kzalloc(sizeof(struct requirement_list), GFP_KERNEL); - if (dep) { - if (value == PM_QOS_DEFAULT_VALUE) - dep->value = pm_qos_array[pm_qos_class]->default_value; - else - dep->value = value; - dep->name = kstrdup(name, GFP_KERNEL); - if (!dep->name) - goto cleanup; - - spin_lock_irqsave(&pm_qos_lock, flags); - list_add(&dep->list, - &pm_qos_array[pm_qos_class]->requirements.list); - spin_unlock_irqrestore(&pm_qos_lock, flags); - update_target(pm_qos_class); - - return 0; + if (pm_qos_request_active(dep)) { + WARN(1, KERN_ERR "pm_qos_add_request() called for already added request\n"); + return; } - -cleanup: - kfree(dep); - return -ENOMEM; + if (value == PM_QOS_DEFAULT_VALUE) + new_value = o->default_value; + else + new_value = value; + plist_node_init(&dep->list, new_value); + dep->pm_qos_class = pm_qos_class; + update_target(o, &dep->list, 0, PM_QOS_DEFAULT_VALUE); } -EXPORT_SYMBOL_GPL(pm_qos_add_requirement); +EXPORT_SYMBOL_GPL(pm_qos_add_request); /** - * pm_qos_update_requirement - modifies an existing qos request - * @pm_qos_class: identifies which list of qos request to us - * @name: identifies the request + * pm_qos_update_request - modifies an existing qos request + * @pm_qos_req : handle to list element holding a pm_qos request to use * @value: defines the qos request * - * Updates an existing qos requirement for the pm_qos_class of parameters along + * Updates an existing qos request for the pm_qos_class of parameters along * with updating the target pm_qos_class value. * - * If the named request isn't in the list then no change is made. + * Attempts are made to make this code callable on hot code paths. */ -int pm_qos_update_requirement(int pm_qos_class, char *name, s32 new_value) +void pm_qos_update_request(struct pm_qos_request_list *pm_qos_req, + s32 new_value) { - unsigned long flags; - struct requirement_list *node; - int pending_update = 0; + s32 temp; + struct pm_qos_object *o; - spin_lock_irqsave(&pm_qos_lock, flags); - list_for_each_entry(node, - &pm_qos_array[pm_qos_class]->requirements.list, list) { - if (strcmp(node->name, name) == 0) { - if (new_value == PM_QOS_DEFAULT_VALUE) - node->value = - pm_qos_array[pm_qos_class]->default_value; - else - node->value = new_value; - pending_update = 1; - break; - } + if (!pm_qos_req) /*guard against callers passing in null */ + return; + + if (!pm_qos_request_active(pm_qos_req)) { + WARN(1, KERN_ERR "pm_qos_update_request() called for unknown object\n"); + return; } - spin_unlock_irqrestore(&pm_qos_lock, flags); - if (pending_update) - update_target(pm_qos_class); - return 0; + o = pm_qos_array[pm_qos_req->pm_qos_class]; + + if (new_value == PM_QOS_DEFAULT_VALUE) + temp = o->default_value; + else + temp = new_value; + + if (temp != pm_qos_req->list.prio) + update_target(o, &pm_qos_req->list, 0, temp); } -EXPORT_SYMBOL_GPL(pm_qos_update_requirement); +EXPORT_SYMBOL_GPL(pm_qos_update_request); /** - * pm_qos_remove_requirement - modifies an existing qos request - * @pm_qos_class: identifies which list of qos request to us - * @name: identifies the request + * pm_qos_remove_request - modifies an existing qos request + * @pm_qos_req: handle to request list element * - * Will remove named qos request from pm_qos_class list of parameters and - * recompute the current target value for the pm_qos_class. + * Will remove pm qos request from the list of requests and + * recompute the current target value for the pm_qos_class. Call this + * on slow code paths. */ -void pm_qos_remove_requirement(int pm_qos_class, char *name) +void pm_qos_remove_request(struct pm_qos_request_list *pm_qos_req) { - unsigned long flags; - struct requirement_list *node; - int pending_update = 0; + struct pm_qos_object *o; - spin_lock_irqsave(&pm_qos_lock, flags); - list_for_each_entry(node, - &pm_qos_array[pm_qos_class]->requirements.list, list) { - if (strcmp(node->name, name) == 0) { - kfree(node->name); - list_del(&node->list); - kfree(node); - pending_update = 1; - break; - } + if (pm_qos_req == NULL) + return; + /* silent return to keep pcm code cleaner */ + + if (!pm_qos_request_active(pm_qos_req)) { + WARN(1, KERN_ERR "pm_qos_remove_request() called for unknown object\n"); + return; } - spin_unlock_irqrestore(&pm_qos_lock, flags); - if (pending_update) - update_target(pm_qos_class); + + o = pm_qos_array[pm_qos_req->pm_qos_class]; + update_target(o, &pm_qos_req->list, 1, PM_QOS_DEFAULT_VALUE); + memset(pm_qos_req, 0, sizeof(*pm_qos_req)); } -EXPORT_SYMBOL_GPL(pm_qos_remove_requirement); +EXPORT_SYMBOL_GPL(pm_qos_remove_request); /** * pm_qos_add_notifier - sets notification entry for changes to target value @@ -313,7 +312,7 @@ EXPORT_SYMBOL_GPL(pm_qos_remove_requirement); * will register the notifier into a notification chain that gets called * upon changes to the pm_qos_class target value. */ - int pm_qos_add_notifier(int pm_qos_class, struct notifier_block *notifier) +int pm_qos_add_notifier(int pm_qos_class, struct notifier_block *notifier) { int retval; @@ -343,21 +342,20 @@ int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier) } EXPORT_SYMBOL_GPL(pm_qos_remove_notifier); -#define PID_NAME_LEN 32 - static int pm_qos_power_open(struct inode *inode, struct file *filp) { - int ret; long pm_qos_class; - char name[PID_NAME_LEN]; pm_qos_class = find_pm_qos_object_by_minor(iminor(inode)); if (pm_qos_class >= 0) { - filp->private_data = (void *)pm_qos_class; - snprintf(name, PID_NAME_LEN, "process_%d", current->pid); - ret = pm_qos_add_requirement(pm_qos_class, name, - PM_QOS_DEFAULT_VALUE); - if (ret >= 0) + struct pm_qos_request_list *req = kzalloc(GFP_KERNEL, sizeof(*req)); + if (!req) + return -ENOMEM; + + pm_qos_add_request(req, pm_qos_class, PM_QOS_DEFAULT_VALUE); + filp->private_data = req; + + if (filp->private_data) return 0; } return -EPERM; @@ -365,32 +363,41 @@ static int pm_qos_power_open(struct inode *inode, struct file *filp) static int pm_qos_power_release(struct inode *inode, struct file *filp) { - int pm_qos_class; - char name[PID_NAME_LEN]; + struct pm_qos_request_list *req; - pm_qos_class = (long)filp->private_data; - snprintf(name, PID_NAME_LEN, "process_%d", current->pid); - pm_qos_remove_requirement(pm_qos_class, name); + req = filp->private_data; + pm_qos_remove_request(req); + kfree(req); return 0; } + static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, size_t count, loff_t *f_pos) { s32 value; - int pm_qos_class; - char name[PID_NAME_LEN]; - - pm_qos_class = (long)filp->private_data; - if (count != sizeof(s32)) + int x; + char ascii_value[11]; + struct pm_qos_request_list *pm_qos_req; + + if (count == sizeof(s32)) { + if (copy_from_user(&value, buf, sizeof(s32))) + return -EFAULT; + } else if (count == 11) { /* len('0x12345678/0') */ + if (copy_from_user(ascii_value, buf, 11)) + return -EFAULT; + x = sscanf(ascii_value, "%x", &value); + if (x != 1) + return -EINVAL; + pr_debug(KERN_ERR "%s, %d, 0x%x\n", ascii_value, x, value); + } else return -EINVAL; - if (copy_from_user(&value, buf, sizeof(s32))) - return -EFAULT; - snprintf(name, PID_NAME_LEN, "process_%d", current->pid); - pm_qos_update_requirement(pm_qos_class, name, value); - return sizeof(s32); + pm_qos_req = (struct pm_qos_request_list *)filp->private_data; + pm_qos_update_request(pm_qos_req, value); + + return count; } diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 1a22dfd..9829646 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -11,19 +11,18 @@ #include <trace/events/timer.h> /* - * Called after updating RLIMIT_CPU to set timer expiration if necessary. + * Called after updating RLIMIT_CPU to run cpu timer and update + * tsk->signal->cputime_expires expiration cache if necessary. Needs + * siglock protection since other code may update expiration cache as + * well. */ void update_rlimit_cpu(unsigned long rlim_new) { cputime_t cputime = secs_to_cputime(rlim_new); - struct signal_struct *const sig = current->signal; - if (cputime_eq(sig->it[CPUCLOCK_PROF].expires, cputime_zero) || - cputime_gt(sig->it[CPUCLOCK_PROF].expires, cputime)) { - spin_lock_irq(¤t->sighand->siglock); - set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL); - spin_unlock_irq(¤t->sighand->siglock); - } + spin_lock_irq(¤t->sighand->siglock); + set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL); + spin_unlock_irq(¤t->sighand->siglock); } static int check_clock(const clockid_t which_clock) @@ -364,7 +363,7 @@ int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) } } else { read_lock(&tasklist_lock); - if (thread_group_leader(p) && p->signal) { + if (thread_group_leader(p) && p->sighand) { error = cpu_clock_sample_group(which_clock, p, &rtn); @@ -440,7 +439,7 @@ int posix_cpu_timer_del(struct k_itimer *timer) if (likely(p != NULL)) { read_lock(&tasklist_lock); - if (unlikely(p->signal == NULL)) { + if (unlikely(p->sighand == NULL)) { /* * We raced with the reaping of the task. * The deletion should have cleared us off the list. @@ -548,111 +547,62 @@ static inline int expires_gt(cputime_t expires, cputime_t new_exp) cputime_gt(expires, new_exp); } -static inline int expires_le(cputime_t expires, cputime_t new_exp) -{ - return !cputime_eq(expires, cputime_zero) && - cputime_le(expires, new_exp); -} /* * Insert the timer on the appropriate list before any timers that * expire later. This must be called with the tasklist_lock held - * for reading, and interrupts disabled. + * for reading, interrupts disabled and p->sighand->siglock taken. */ -static void arm_timer(struct k_itimer *timer, union cpu_time_count now) +static void arm_timer(struct k_itimer *timer) { struct task_struct *p = timer->it.cpu.task; struct list_head *head, *listpos; + struct task_cputime *cputime_expires; struct cpu_timer_list *const nt = &timer->it.cpu; struct cpu_timer_list *next; - unsigned long i; - head = (CPUCLOCK_PERTHREAD(timer->it_clock) ? - p->cpu_timers : p->signal->cpu_timers); + if (CPUCLOCK_PERTHREAD(timer->it_clock)) { + head = p->cpu_timers; + cputime_expires = &p->cputime_expires; + } else { + head = p->signal->cpu_timers; + cputime_expires = &p->signal->cputime_expires; + } head += CPUCLOCK_WHICH(timer->it_clock); - BUG_ON(!irqs_disabled()); - spin_lock(&p->sighand->siglock); - listpos = head; - if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) { - list_for_each_entry(next, head, entry) { - if (next->expires.sched > nt->expires.sched) - break; - listpos = &next->entry; - } - } else { - list_for_each_entry(next, head, entry) { - if (cputime_gt(next->expires.cpu, nt->expires.cpu)) - break; - listpos = &next->entry; - } + list_for_each_entry(next, head, entry) { + if (cpu_time_before(timer->it_clock, nt->expires, next->expires)) + break; + listpos = &next->entry; } list_add(&nt->entry, listpos); if (listpos == head) { + union cpu_time_count *exp = &nt->expires; + /* - * We are the new earliest-expiring timer. - * If we are a thread timer, there can always - * be a process timer telling us to stop earlier. + * We are the new earliest-expiring POSIX 1.b timer, hence + * need to update expiration cache. Take into account that + * for process timers we share expiration cache with itimers + * and RLIMIT_CPU and for thread timers with RLIMIT_RTTIME. */ - if (CPUCLOCK_PERTHREAD(timer->it_clock)) { - union cpu_time_count *exp = &nt->expires; - - switch (CPUCLOCK_WHICH(timer->it_clock)) { - default: - BUG(); - case CPUCLOCK_PROF: - if (expires_gt(p->cputime_expires.prof_exp, - exp->cpu)) - p->cputime_expires.prof_exp = exp->cpu; - break; - case CPUCLOCK_VIRT: - if (expires_gt(p->cputime_expires.virt_exp, - exp->cpu)) - p->cputime_expires.virt_exp = exp->cpu; - break; - case CPUCLOCK_SCHED: - if (p->cputime_expires.sched_exp == 0 || - p->cputime_expires.sched_exp > exp->sched) - p->cputime_expires.sched_exp = - exp->sched; - break; - } - } else { - struct signal_struct *const sig = p->signal; - union cpu_time_count *exp = &timer->it.cpu.expires; - - /* - * For a process timer, set the cached expiration time. - */ - switch (CPUCLOCK_WHICH(timer->it_clock)) { - default: - BUG(); - case CPUCLOCK_VIRT: - if (expires_le(sig->it[CPUCLOCK_VIRT].expires, - exp->cpu)) - break; - sig->cputime_expires.virt_exp = exp->cpu; - break; - case CPUCLOCK_PROF: - if (expires_le(sig->it[CPUCLOCK_PROF].expires, - exp->cpu)) - break; - i = sig->rlim[RLIMIT_CPU].rlim_cur; - if (i != RLIM_INFINITY && - i <= cputime_to_secs(exp->cpu)) - break; - sig->cputime_expires.prof_exp = exp->cpu; - break; - case CPUCLOCK_SCHED: - sig->cputime_expires.sched_exp = exp->sched; - break; - } + switch (CPUCLOCK_WHICH(timer->it_clock)) { + case CPUCLOCK_PROF: + if (expires_gt(cputime_expires->prof_exp, exp->cpu)) + cputime_expires->prof_exp = exp->cpu; + break; + case CPUCLOCK_VIRT: + if (expires_gt(cputime_expires->virt_exp, exp->cpu)) + cputime_expires->virt_exp = exp->cpu; + break; + case CPUCLOCK_SCHED: + if (cputime_expires->sched_exp == 0 || + cputime_expires->sched_exp > exp->sched) + cputime_expires->sched_exp = exp->sched; + break; } } - - spin_unlock(&p->sighand->siglock); } /* @@ -660,7 +610,12 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now) */ static void cpu_timer_fire(struct k_itimer *timer) { - if (unlikely(timer->sigq == NULL)) { + if ((timer->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) { + /* + * User don't want any signal. + */ + timer->it.cpu.expires.sched = 0; + } else if (unlikely(timer->sigq == NULL)) { /* * This a special case for clock_nanosleep, * not a normal timer from sys_timer_create. @@ -721,7 +676,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags, struct itimerspec *new, struct itimerspec *old) { struct task_struct *p = timer->it.cpu.task; - union cpu_time_count old_expires, new_expires, val; + union cpu_time_count old_expires, new_expires, old_incr, val; int ret; if (unlikely(p == NULL)) { @@ -736,10 +691,10 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags, read_lock(&tasklist_lock); /* * We need the tasklist_lock to protect against reaping that - * clears p->signal. If p has just been reaped, we can no + * clears p->sighand. If p has just been reaped, we can no * longer get any information about it at all. */ - if (unlikely(p->signal == NULL)) { + if (unlikely(p->sighand == NULL)) { read_unlock(&tasklist_lock); put_task_struct(p); timer->it.cpu.task = NULL; @@ -752,6 +707,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags, BUG_ON(!irqs_disabled()); ret = 0; + old_incr = timer->it.cpu.incr; spin_lock(&p->sighand->siglock); old_expires = timer->it.cpu.expires; if (unlikely(timer->it.cpu.firing)) { @@ -759,7 +715,6 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags, ret = TIMER_RETRY; } else list_del_init(&timer->it.cpu.entry); - spin_unlock(&p->sighand->siglock); /* * We need to sample the current value to convert the new @@ -813,6 +768,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags, * disable this firing since we are already reporting * it as an overrun (thanks to bump_cpu_timer above). */ + spin_unlock(&p->sighand->siglock); read_unlock(&tasklist_lock); goto out; } @@ -828,11 +784,11 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags, */ timer->it.cpu.expires = new_expires; if (new_expires.sched != 0 && - (timer->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE && cpu_time_before(timer->it_clock, val, new_expires)) { - arm_timer(timer, val); + arm_timer(timer); } + spin_unlock(&p->sighand->siglock); read_unlock(&tasklist_lock); /* @@ -853,7 +809,6 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags, timer->it_overrun = -1; if (new_expires.sched != 0 && - (timer->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE && !cpu_time_before(timer->it_clock, val, new_expires)) { /* * The designated time already passed, so we notify @@ -867,7 +822,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags, out: if (old) { sample_to_timespec(timer->it_clock, - timer->it.cpu.incr, &old->it_interval); + old_incr, &old->it_interval); } return ret; } @@ -908,7 +863,7 @@ void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) clear_dead = p->exit_state; } else { read_lock(&tasklist_lock); - if (unlikely(p->signal == NULL)) { + if (unlikely(p->sighand == NULL)) { /* * The process has been reaped. * We can't even collect a sample any more. @@ -927,25 +882,6 @@ void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) read_unlock(&tasklist_lock); } - if ((timer->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) { - if (timer->it.cpu.incr.sched == 0 && - cpu_time_before(timer->it_clock, - timer->it.cpu.expires, now)) { - /* - * Do-nothing timer expired and has no reload, - * so it's as if it was never set. - */ - timer->it.cpu.expires.sched = 0; - itp->it_value.tv_sec = itp->it_value.tv_nsec = 0; - return; - } - /* - * Account for any expirations and reloads that should - * have happened. - */ - bump_cpu_timer(timer, now); - } - if (unlikely(clear_dead)) { /* * We've noticed that the thread is dead, but @@ -1061,14 +997,11 @@ static void check_thread_timers(struct task_struct *tsk, } } -static void stop_process_timers(struct task_struct *tsk) +static void stop_process_timers(struct signal_struct *sig) { - struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; + struct thread_group_cputimer *cputimer = &sig->cputimer; unsigned long flags; - if (!cputimer->running) - return; - spin_lock_irqsave(&cputimer->lock, flags); cputimer->running = 0; spin_unlock_irqrestore(&cputimer->lock, flags); @@ -1108,6 +1041,23 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, } } +/** + * task_cputime_zero - Check a task_cputime struct for all zero fields. + * + * @cputime: The struct to compare. + * + * Checks @cputime to see if all fields are zero. Returns true if all fields + * are zero, false if any field is nonzero. + */ +static inline int task_cputime_zero(const struct task_cputime *cputime) +{ + if (cputime_eq(cputime->utime, cputime_zero) && + cputime_eq(cputime->stime, cputime_zero) && + cputime->sum_exec_runtime == 0) + return 1; + return 0; +} + /* * Check for any per-thread CPU timers that have fired and move them * off the tsk->*_timers list onto the firing list. Per-thread timers @@ -1125,19 +1075,6 @@ static void check_process_timers(struct task_struct *tsk, unsigned long soft; /* - * Don't sample the current process CPU clocks if there are no timers. - */ - if (list_empty(&timers[CPUCLOCK_PROF]) && - cputime_eq(sig->it[CPUCLOCK_PROF].expires, cputime_zero) && - sig->rlim[RLIMIT_CPU].rlim_cur == RLIM_INFINITY && - list_empty(&timers[CPUCLOCK_VIRT]) && - cputime_eq(sig->it[CPUCLOCK_VIRT].expires, cputime_zero) && - list_empty(&timers[CPUCLOCK_SCHED])) { - stop_process_timers(tsk); - return; - } - - /* * Collect the current process totals. */ thread_group_cputimer(tsk, &cputime); @@ -1226,18 +1163,11 @@ static void check_process_timers(struct task_struct *tsk, } } - if (!cputime_eq(prof_expires, cputime_zero) && - (cputime_eq(sig->cputime_expires.prof_exp, cputime_zero) || - cputime_gt(sig->cputime_expires.prof_exp, prof_expires))) - sig->cputime_expires.prof_exp = prof_expires; - if (!cputime_eq(virt_expires, cputime_zero) && - (cputime_eq(sig->cputime_expires.virt_exp, cputime_zero) || - cputime_gt(sig->cputime_expires.virt_exp, virt_expires))) - sig->cputime_expires.virt_exp = virt_expires; - if (sched_expires != 0 && - (sig->cputime_expires.sched_exp == 0 || - sig->cputime_expires.sched_exp > sched_expires)) - sig->cputime_expires.sched_exp = sched_expires; + sig->cputime_expires.prof_exp = prof_expires; + sig->cputime_expires.virt_exp = virt_expires; + sig->cputime_expires.sched_exp = sched_expires; + if (task_cputime_zero(&sig->cputime_expires)) + stop_process_timers(sig); } /* @@ -1266,9 +1196,10 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) goto out; } read_lock(&tasklist_lock); /* arm_timer needs it. */ + spin_lock(&p->sighand->siglock); } else { read_lock(&tasklist_lock); - if (unlikely(p->signal == NULL)) { + if (unlikely(p->sighand == NULL)) { /* * The process has been reaped. * We can't even collect a sample any more. @@ -1286,6 +1217,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) clear_dead_task(timer, now); goto out_unlock; } + spin_lock(&p->sighand->siglock); cpu_timer_sample_group(timer->it_clock, p, &now); bump_cpu_timer(timer, now); /* Leave the tasklist_lock locked for the call below. */ @@ -1294,7 +1226,9 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) /* * Now re-arm for the new expiry time. */ - arm_timer(timer, now); + BUG_ON(!irqs_disabled()); + arm_timer(timer); + spin_unlock(&p->sighand->siglock); out_unlock: read_unlock(&tasklist_lock); @@ -1306,23 +1240,6 @@ out: } /** - * task_cputime_zero - Check a task_cputime struct for all zero fields. - * - * @cputime: The struct to compare. - * - * Checks @cputime to see if all fields are zero. Returns true if all fields - * are zero, false if any field is nonzero. - */ -static inline int task_cputime_zero(const struct task_cputime *cputime) -{ - if (cputime_eq(cputime->utime, cputime_zero) && - cputime_eq(cputime->stime, cputime_zero) && - cputime->sum_exec_runtime == 0) - return 1; - return 0; -} - -/** * task_cputime_expired - Compare two task_cputime entities. * * @sample: The task_cputime structure to be checked for expiration. @@ -1378,7 +1295,7 @@ static inline int fastpath_timer_check(struct task_struct *tsk) } sig = tsk->signal; - if (!task_cputime_zero(&sig->cputime_expires)) { + if (sig->cputimer.running) { struct task_cputime group_sample; thread_group_cputimer(tsk, &group_sample); @@ -1386,7 +1303,7 @@ static inline int fastpath_timer_check(struct task_struct *tsk) return 1; } - return sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY; + return 0; } /* @@ -1415,7 +1332,12 @@ void run_posix_cpu_timers(struct task_struct *tsk) * put them on the firing list. */ check_thread_timers(tsk, &firing); - check_process_timers(tsk, &firing); + /* + * If there are any active process wide timers (POSIX 1.b, itimers, + * RLIMIT_CPU) cputimer must be running. + */ + if (tsk->signal->cputimer.running) + check_process_timers(tsk, &firing); /* * We must release these locks before taking any timer's lock. @@ -1452,21 +1374,23 @@ void run_posix_cpu_timers(struct task_struct *tsk) } /* - * Set one of the process-wide special case CPU timers. + * Set one of the process-wide special case CPU timers or RLIMIT_CPU. * The tsk->sighand->siglock must be held by the caller. - * The *newval argument is relative and we update it to be absolute, *oldval - * is absolute and we update it to be relative. */ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, cputime_t *newval, cputime_t *oldval) { union cpu_time_count now; - struct list_head *head; BUG_ON(clock_idx == CPUCLOCK_SCHED); cpu_timer_sample_group(clock_idx, tsk, &now); if (oldval) { + /* + * We are setting itimer. The *oldval is absolute and we update + * it to be relative, *newval argument is relative and we update + * it to be absolute. + */ if (!cputime_eq(*oldval, cputime_zero)) { if (cputime_le(*oldval, now.cpu)) { /* Just about to fire. */ @@ -1479,33 +1403,21 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, if (cputime_eq(*newval, cputime_zero)) return; *newval = cputime_add(*newval, now.cpu); - - /* - * If the RLIMIT_CPU timer will expire before the - * ITIMER_PROF timer, we have nothing else to do. - */ - if (tsk->signal->rlim[RLIMIT_CPU].rlim_cur - < cputime_to_secs(*newval)) - return; } /* - * Check whether there are any process timers already set to fire - * before this one. If so, we don't have anything more to do. + * Update expiration cache if we are the earliest timer, or eventually + * RLIMIT_CPU limit is earlier than prof_exp cpu timer expire. */ - head = &tsk->signal->cpu_timers[clock_idx]; - if (list_empty(head) || - cputime_ge(list_first_entry(head, - struct cpu_timer_list, entry)->expires.cpu, - *newval)) { - switch (clock_idx) { - case CPUCLOCK_PROF: + switch (clock_idx) { + case CPUCLOCK_PROF: + if (expires_gt(tsk->signal->cputime_expires.prof_exp, *newval)) tsk->signal->cputime_expires.prof_exp = *newval; - break; - case CPUCLOCK_VIRT: + break; + case CPUCLOCK_VIRT: + if (expires_gt(tsk->signal->cputime_expires.virt_exp, *newval)) tsk->signal->cputime_expires.virt_exp = *newval; - break; - } + break; } } diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 00d1fda..ad72342 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -559,14 +559,7 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, new_timer->it_id = (timer_t) new_timer_id; new_timer->it_clock = which_clock; new_timer->it_overrun = -1; - error = CLOCK_DISPATCH(which_clock, timer_create, (new_timer)); - if (error) - goto out; - /* - * return the timer_id now. The next step is hard to - * back out if there is an error. - */ if (copy_to_user(created_timer_id, &new_timer_id, sizeof (new_timer_id))) { error = -EFAULT; @@ -597,6 +590,10 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, new_timer->sigq->info.si_tid = new_timer->it_id; new_timer->sigq->info.si_code = SI_TIMER; + error = CLOCK_DISPATCH(which_clock, timer_create, (new_timer)); + if (error) + goto out; + spin_lock_irq(¤t->sighand->siglock); new_timer->it_signal = current->signal; list_add(&new_timer->list, ¤t->signal->posix_timers); diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 5c36ea9..ca6066a 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -99,9 +99,13 @@ config PM_SLEEP_ADVANCED_DEBUG depends on PM_ADVANCED_DEBUG default n +config SUSPEND_NVS + bool + config SUSPEND bool "Suspend to RAM and standby" depends on PM && ARCH_SUSPEND_POSSIBLE + select SUSPEND_NVS if HAS_IOMEM default y ---help--- Allow the system to enter sleep states in which main memory is @@ -130,13 +134,10 @@ config SUSPEND_FREEZER Turning OFF this setting is NOT recommended! If in doubt, say Y. -config HIBERNATION_NVS - bool - config HIBERNATION bool "Hibernation (aka 'suspend to disk')" depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE - select HIBERNATION_NVS if HAS_IOMEM + select SUSPEND_NVS if HAS_IOMEM ---help--- Enable the suspend to disk (STD) functionality, which is usually called "hibernation" in user interfaces. STD checkpoints the diff --git a/kernel/power/Makefile b/kernel/power/Makefile index 4319181..f9063c6 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -8,7 +8,8 @@ obj-$(CONFIG_PM_SLEEP) += console.o obj-$(CONFIG_FREEZER) += process.o obj-$(CONFIG_SUSPEND) += suspend.o obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o -obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o -obj-$(CONFIG_HIBERNATION_NVS) += hibernate_nvs.o +obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \ + block_io.o +obj-$(CONFIG_SUSPEND_NVS) += nvs.o obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o diff --git a/kernel/power/block_io.c b/kernel/power/block_io.c new file mode 100644 index 0000000..97024fd --- /dev/null +++ b/kernel/power/block_io.c @@ -0,0 +1,103 @@ +/* + * This file provides functions for block I/O operations on swap/file. + * + * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz> + * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> + * + * This file is released under the GPLv2. + */ + +#include <linux/bio.h> +#include <linux/kernel.h> +#include <linux/pagemap.h> +#include <linux/swap.h> + +#include "power.h" + +/** + * submit - submit BIO request. + * @rw: READ or WRITE. + * @off physical offset of page. + * @page: page we're reading or writing. + * @bio_chain: list of pending biod (for async reading) + * + * Straight from the textbook - allocate and initialize the bio. + * If we're reading, make sure the page is marked as dirty. + * Then submit it and, if @bio_chain == NULL, wait. + */ +static int submit(int rw, struct block_device *bdev, sector_t sector, + struct page *page, struct bio **bio_chain) +{ + const int bio_rw = rw | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG); + struct bio *bio; + + bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1); + bio->bi_sector = sector; + bio->bi_bdev = bdev; + bio->bi_end_io = end_swap_bio_read; + + if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { + printk(KERN_ERR "PM: Adding page to bio failed at %llu\n", + (unsigned long long)sector); + bio_put(bio); + return -EFAULT; + } + + lock_page(page); + bio_get(bio); + + if (bio_chain == NULL) { + submit_bio(bio_rw, bio); + wait_on_page_locked(page); + if (rw == READ) + bio_set_pages_dirty(bio); + bio_put(bio); + } else { + if (rw == READ) + get_page(page); /* These pages are freed later */ + bio->bi_private = *bio_chain; + *bio_chain = bio; + submit_bio(bio_rw, bio); + } + return 0; +} + +int hib_bio_read_page(pgoff_t page_off, void *addr, struct bio **bio_chain) +{ + return submit(READ, hib_resume_bdev, page_off * (PAGE_SIZE >> 9), + virt_to_page(addr), bio_chain); +} + +int hib_bio_write_page(pgoff_t page_off, void *addr, struct bio **bio_chain) +{ + return submit(WRITE, hib_resume_bdev, page_off * (PAGE_SIZE >> 9), + virt_to_page(addr), bio_chain); +} + +int hib_wait_on_bio_chain(struct bio **bio_chain) +{ + struct bio *bio; + struct bio *next_bio; + int ret = 0; + + if (bio_chain == NULL) + return 0; + + bio = *bio_chain; + if (bio == NULL) + return 0; + while (bio) { + struct page *page; + + next_bio = bio->bi_private; + page = bio->bi_io_vec[0].bv_page; + wait_on_page_locked(page); + if (!PageUptodate(page) || PageError(page)) + ret = -EIO; + put_page(page); + bio_put(bio); + bio = next_bio; + } + *bio_chain = NULL; + return ret; +} diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index da5288e..8dc31e0 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -3,7 +3,7 @@ * * Copyright (c) 2003 Patrick Mochel * Copyright (c) 2003 Open Source Development Lab - * Copyright (c) 2004 Pavel Machek <pavel@suse.cz> + * Copyright (c) 2004 Pavel Machek <pavel@ucw.cz> * Copyright (c) 2009 Rafael J. Wysocki, Novell Inc. * * This file is released under the GPLv2. @@ -22,6 +22,7 @@ #include <linux/console.h> #include <linux/cpu.h> #include <linux/freezer.h> +#include <linux/gfp.h> #include <scsi/scsi_scan.h> #include <asm/suspend.h> @@ -276,7 +277,7 @@ static int create_image(int platform_mode) goto Enable_irqs; } - if (hibernation_test(TEST_CORE)) + if (hibernation_test(TEST_CORE) || !pm_check_wakeup_events()) goto Power_up; in_suspend = 1; @@ -287,8 +288,10 @@ static int create_image(int platform_mode) error); /* Restore control flow magically appears here */ restore_processor_state(); - if (!in_suspend) + if (!in_suspend) { + events_check_enabled = false; platform_leave(platform_mode); + } Power_up: sysdev_resume(); @@ -327,7 +330,7 @@ int hibernation_snapshot(int platform_mode) error = platform_begin(platform_mode); if (error) - return error; + goto Close; /* Preallocate image memory before shutting down devices. */ error = hibernate_preallocate_memory(); @@ -510,18 +513,24 @@ int hibernation_platform_enter(void) local_irq_disable(); sysdev_suspend(PMSG_HIBERNATE); + if (!pm_check_wakeup_events()) { + error = -EAGAIN; + goto Power_up; + } + hibernation_ops->enter(); /* We should never get here */ while (1); - /* - * We don't need to reenable the nonboot CPUs or resume consoles, since - * the system is going to be halted anyway. - */ + Power_up: + sysdev_resume(); + local_irq_enable(); + enable_nonboot_cpus(); + Platform_finish: hibernation_ops->finish(); - dpm_suspend_noirq(PMSG_RESTORE); + dpm_resume_noirq(PMSG_RESTORE); Resume_devices: entering_platform_hibernation = false; diff --git a/kernel/power/main.c b/kernel/power/main.c index b58800b..62b0bc6 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -204,6 +204,60 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, power_attr(state); +#ifdef CONFIG_PM_SLEEP +/* + * The 'wakeup_count' attribute, along with the functions defined in + * drivers/base/power/wakeup.c, provides a means by which wakeup events can be + * handled in a non-racy way. + * + * If a wakeup event occurs when the system is in a sleep state, it simply is + * woken up. In turn, if an event that would wake the system up from a sleep + * state occurs when it is undergoing a transition to that sleep state, the + * transition should be aborted. Moreover, if such an event occurs when the + * system is in the working state, an attempt to start a transition to the + * given sleep state should fail during certain period after the detection of + * the event. Using the 'state' attribute alone is not sufficient to satisfy + * these requirements, because a wakeup event may occur exactly when 'state' + * is being written to and may be delivered to user space right before it is + * frozen, so the event will remain only partially processed until the system is + * woken up by another event. In particular, it won't cause the transition to + * a sleep state to be aborted. + * + * This difficulty may be overcome if user space uses 'wakeup_count' before + * writing to 'state'. It first should read from 'wakeup_count' and store + * the read value. Then, after carrying out its own preparations for the system + * transition to a sleep state, it should write the stored value to + * 'wakeup_count'. If that fails, at least one wakeup event has occured since + * 'wakeup_count' was read and 'state' should not be written to. Otherwise, it + * is allowed to write to 'state', but the transition will be aborted if there + * are any wakeup events detected after 'wakeup_count' was written to. + */ + +static ssize_t wakeup_count_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + unsigned long val; + + return pm_get_wakeup_count(&val) ? sprintf(buf, "%lu\n", val) : -EINTR; +} + +static ssize_t wakeup_count_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t n) +{ + unsigned long val; + + if (sscanf(buf, "%lu", &val) == 1) { + if (pm_save_wakeup_count(val)) + return n; + } + return -EINVAL; +} + +power_attr(wakeup_count); +#endif /* CONFIG_PM_SLEEP */ + #ifdef CONFIG_PM_TRACE int pm_trace_enabled; @@ -236,6 +290,7 @@ static struct attribute * g[] = { #endif #ifdef CONFIG_PM_SLEEP &pm_async_attr.attr, + &wakeup_count_attr.attr, #ifdef CONFIG_PM_DEBUG &pm_test_attr.attr, #endif diff --git a/kernel/power/hibernate_nvs.c b/kernel/power/nvs.c index 39ac698..1836db6 100644 --- a/kernel/power/hibernate_nvs.c +++ b/kernel/power/nvs.c @@ -10,11 +10,12 @@ #include <linux/kernel.h> #include <linux/list.h> #include <linux/mm.h> +#include <linux/slab.h> #include <linux/suspend.h> /* * Platforms, like ACPI, may want us to save some memory used by them during - * hibernation and to restore the contents of this memory during the subsequent + * suspend and to restore the contents of this memory during the subsequent * resume. The code below implements a mechanism allowing us to do that. */ @@ -29,7 +30,7 @@ struct nvs_page { static LIST_HEAD(nvs_list); /** - * hibernate_nvs_register - register platform NVS memory region to save + * suspend_nvs_register - register platform NVS memory region to save * @start - physical address of the region * @size - size of the region * @@ -37,7 +38,7 @@ static LIST_HEAD(nvs_list); * things so that the data from page-aligned addresses in this region will * be copied into separate RAM pages. */ -int hibernate_nvs_register(unsigned long start, unsigned long size) +int suspend_nvs_register(unsigned long start, unsigned long size) { struct nvs_page *entry, *next; @@ -67,9 +68,9 @@ int hibernate_nvs_register(unsigned long start, unsigned long size) } /** - * hibernate_nvs_free - free data pages allocated for saving NVS regions + * suspend_nvs_free - free data pages allocated for saving NVS regions */ -void hibernate_nvs_free(void) +void suspend_nvs_free(void) { struct nvs_page *entry; @@ -85,16 +86,16 @@ void hibernate_nvs_free(void) } /** - * hibernate_nvs_alloc - allocate memory necessary for saving NVS regions + * suspend_nvs_alloc - allocate memory necessary for saving NVS regions */ -int hibernate_nvs_alloc(void) +int suspend_nvs_alloc(void) { struct nvs_page *entry; list_for_each_entry(entry, &nvs_list, node) { entry->data = (void *)__get_free_page(GFP_KERNEL); if (!entry->data) { - hibernate_nvs_free(); + suspend_nvs_free(); return -ENOMEM; } } @@ -102,9 +103,9 @@ int hibernate_nvs_alloc(void) } /** - * hibernate_nvs_save - save NVS memory regions + * suspend_nvs_save - save NVS memory regions */ -void hibernate_nvs_save(void) +void suspend_nvs_save(void) { struct nvs_page *entry; @@ -118,12 +119,12 @@ void hibernate_nvs_save(void) } /** - * hibernate_nvs_restore - restore NVS memory regions + * suspend_nvs_restore - restore NVS memory regions * * This function is going to be called with interrupts disabled, so it * cannot iounmap the virtual addresses used to access the NVS region. */ -void hibernate_nvs_restore(void) +void suspend_nvs_restore(void) { struct nvs_page *entry; diff --git a/kernel/power/power.h b/kernel/power/power.h index 46c5a26..006270f 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -97,24 +97,12 @@ extern int hibernate_preallocate_memory(void); */ struct snapshot_handle { - loff_t offset; /* number of the last byte ready for reading - * or writing in the sequence - */ unsigned int cur; /* number of the block of PAGE_SIZE bytes the * next operation will refer to (ie. current) */ - unsigned int cur_offset; /* offset with respect to the current - * block (for the next operation) - */ - unsigned int prev; /* number of the block of PAGE_SIZE bytes that - * was the current one previously - */ void *buffer; /* address of the block to read from * or write to */ - unsigned int buf_offset; /* location to read from or write to, - * given as a displacement from 'buffer' - */ int sync_read; /* Set to one to notify the caller of * snapshot_write_next() that it may * need to call wait_on_bio_chain() @@ -125,12 +113,12 @@ struct snapshot_handle { * snapshot_read_next()/snapshot_write_next() is allowed to * read/write data after the function returns */ -#define data_of(handle) ((handle).buffer + (handle).buf_offset) +#define data_of(handle) ((handle).buffer) extern unsigned int snapshot_additional_pages(struct zone *zone); extern unsigned long snapshot_get_image_size(void); -extern int snapshot_read_next(struct snapshot_handle *handle, size_t count); -extern int snapshot_write_next(struct snapshot_handle *handle, size_t count); +extern int snapshot_read_next(struct snapshot_handle *handle); +extern int snapshot_write_next(struct snapshot_handle *handle); extern void snapshot_write_finalize(struct snapshot_handle *handle); extern int snapshot_image_loaded(struct snapshot_handle *handle); @@ -154,6 +142,15 @@ extern int swsusp_read(unsigned int *flags_p); extern int swsusp_write(unsigned int flags); extern void swsusp_close(fmode_t); +/* kernel/power/block_io.c */ +extern struct block_device *hib_resume_bdev; + +extern int hib_bio_read_page(pgoff_t page_off, void *addr, + struct bio **bio_chain); +extern int hib_bio_write_page(pgoff_t page_off, void *addr, + struct bio **bio_chain); +extern int hib_wait_on_bio_chain(struct bio **bio_chain); + struct timeval; /* kernel/power/swsusp.c */ extern void swsusp_show_speed(struct timeval *, struct timeval *, diff --git a/kernel/power/process.c b/kernel/power/process.c index 5ade1bd..71ae290 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -88,12 +88,11 @@ static int try_to_freeze_tasks(bool sig_only) printk(KERN_ERR "Freezing of tasks failed after %d.%02d seconds " "(%d tasks refusing to freeze):\n", elapsed_csecs / 100, elapsed_csecs % 100, todo); - show_state(); read_lock(&tasklist_lock); do_each_thread(g, p) { task_lock(p); if (freezing(p) && !freezer_should_skip(p)) - printk(KERN_ERR " %s\n", p->comm); + sched_show_task(p); cancel_freezing(p); task_unlock(p); } while_each_thread(g, p); @@ -145,7 +144,7 @@ static void thaw_tasks(bool nosig_only) if (nosig_only && should_send_signal(p)) continue; - if (cgroup_frozen(p)) + if (cgroup_freezing_or_frozen(p)) continue; thaw_process(p); diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 830cade..f6cd6fa 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -3,7 +3,7 @@ * * This file provides system snapshot/restore functionality for swsusp. * - * Copyright (C) 1998-2005 Pavel Machek <pavel@suse.cz> + * Copyright (C) 1998-2005 Pavel Machek <pavel@ucw.cz> * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> * * This file is released under the GPLv2. @@ -26,6 +26,7 @@ #include <linux/console.h> #include <linux/highmem.h> #include <linux/list.h> +#include <linux/slab.h> #include <asm/uaccess.h> #include <asm/mmu_context.h> @@ -1603,14 +1604,9 @@ pack_pfns(unsigned long *buf, struct memory_bitmap *bm) * snapshot_handle structure. The structure gets updated and a pointer * to it should be passed to this function every next time. * - * The @count parameter should contain the number of bytes the caller - * wants to read from the snapshot. It must not be zero. - * * On success the function returns a positive number. Then, the caller * is allowed to read up to the returned number of bytes from the memory - * location computed by the data_of() macro. The number returned - * may be smaller than @count, but this only happens if the read would - * cross a page boundary otherwise. + * location computed by the data_of() macro. * * The function returns 0 to indicate the end of data stream condition, * and a negative number is returned on error. In such cases the @@ -1618,7 +1614,7 @@ pack_pfns(unsigned long *buf, struct memory_bitmap *bm) * any more. */ -int snapshot_read_next(struct snapshot_handle *handle, size_t count) +int snapshot_read_next(struct snapshot_handle *handle) { if (handle->cur > nr_meta_pages + nr_copy_pages) return 0; @@ -1629,7 +1625,7 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count) if (!buffer) return -ENOMEM; } - if (!handle->offset) { + if (!handle->cur) { int error; error = init_header((struct swsusp_info *)buffer); @@ -1638,42 +1634,30 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count) handle->buffer = buffer; memory_bm_position_reset(&orig_bm); memory_bm_position_reset(©_bm); - } - if (handle->prev < handle->cur) { - if (handle->cur <= nr_meta_pages) { - memset(buffer, 0, PAGE_SIZE); - pack_pfns(buffer, &orig_bm); - } else { - struct page *page; + } else if (handle->cur <= nr_meta_pages) { + memset(buffer, 0, PAGE_SIZE); + pack_pfns(buffer, &orig_bm); + } else { + struct page *page; - page = pfn_to_page(memory_bm_next_pfn(©_bm)); - if (PageHighMem(page)) { - /* Highmem pages are copied to the buffer, - * because we can't return with a kmapped - * highmem page (we may not be called again). - */ - void *kaddr; + page = pfn_to_page(memory_bm_next_pfn(©_bm)); + if (PageHighMem(page)) { + /* Highmem pages are copied to the buffer, + * because we can't return with a kmapped + * highmem page (we may not be called again). + */ + void *kaddr; - kaddr = kmap_atomic(page, KM_USER0); - memcpy(buffer, kaddr, PAGE_SIZE); - kunmap_atomic(kaddr, KM_USER0); - handle->buffer = buffer; - } else { - handle->buffer = page_address(page); - } + kaddr = kmap_atomic(page, KM_USER0); + memcpy(buffer, kaddr, PAGE_SIZE); + kunmap_atomic(kaddr, KM_USER0); + handle->buffer = buffer; + } else { + handle->buffer = page_address(page); } - handle->prev = handle->cur; - } - handle->buf_offset = handle->cur_offset; - if (handle->cur_offset + count >= PAGE_SIZE) { - count = PAGE_SIZE - handle->cur_offset; - handle->cur_offset = 0; - handle->cur++; - } else { - handle->cur_offset += count; } - handle->offset += count; - return count; + handle->cur++; + return PAGE_SIZE; } /** @@ -2132,14 +2116,9 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca) * snapshot_handle structure. The structure gets updated and a pointer * to it should be passed to this function every next time. * - * The @count parameter should contain the number of bytes the caller - * wants to write to the image. It must not be zero. - * * On success the function returns a positive number. Then, the caller * is allowed to write up to the returned number of bytes to the memory - * location computed by the data_of() macro. The number returned - * may be smaller than @count, but this only happens if the write would - * cross a page boundary otherwise. + * location computed by the data_of() macro. * * The function returns 0 to indicate the "end of file" condition, * and a negative number is returned on error. In such cases the @@ -2147,16 +2126,18 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca) * any more. */ -int snapshot_write_next(struct snapshot_handle *handle, size_t count) +int snapshot_write_next(struct snapshot_handle *handle) { static struct chain_allocator ca; int error = 0; /* Check if we have already loaded the entire image */ - if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages) + if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages) return 0; - if (handle->offset == 0) { + handle->sync_read = 1; + + if (!handle->cur) { if (!buffer) /* This makes the buffer be freed by swsusp_free() */ buffer = get_image_page(GFP_ATOMIC, PG_ANY); @@ -2165,56 +2146,43 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count) return -ENOMEM; handle->buffer = buffer; - } - handle->sync_read = 1; - if (handle->prev < handle->cur) { - if (handle->prev == 0) { - error = load_header(buffer); - if (error) - return error; + } else if (handle->cur == 1) { + error = load_header(buffer); + if (error) + return error; - error = memory_bm_create(©_bm, GFP_ATOMIC, PG_ANY); - if (error) - return error; + error = memory_bm_create(©_bm, GFP_ATOMIC, PG_ANY); + if (error) + return error; + + } else if (handle->cur <= nr_meta_pages + 1) { + error = unpack_orig_pfns(buffer, ©_bm); + if (error) + return error; - } else if (handle->prev <= nr_meta_pages) { - error = unpack_orig_pfns(buffer, ©_bm); + if (handle->cur == nr_meta_pages + 1) { + error = prepare_image(&orig_bm, ©_bm); if (error) return error; - if (handle->prev == nr_meta_pages) { - error = prepare_image(&orig_bm, ©_bm); - if (error) - return error; - - chain_init(&ca, GFP_ATOMIC, PG_SAFE); - memory_bm_position_reset(&orig_bm); - restore_pblist = NULL; - handle->buffer = get_buffer(&orig_bm, &ca); - handle->sync_read = 0; - if (IS_ERR(handle->buffer)) - return PTR_ERR(handle->buffer); - } - } else { - copy_last_highmem_page(); + chain_init(&ca, GFP_ATOMIC, PG_SAFE); + memory_bm_position_reset(&orig_bm); + restore_pblist = NULL; handle->buffer = get_buffer(&orig_bm, &ca); + handle->sync_read = 0; if (IS_ERR(handle->buffer)) return PTR_ERR(handle->buffer); - if (handle->buffer != buffer) - handle->sync_read = 0; } - handle->prev = handle->cur; - } - handle->buf_offset = handle->cur_offset; - if (handle->cur_offset + count >= PAGE_SIZE) { - count = PAGE_SIZE - handle->cur_offset; - handle->cur_offset = 0; - handle->cur++; } else { - handle->cur_offset += count; + copy_last_highmem_page(); + handle->buffer = get_buffer(&orig_bm, &ca); + if (IS_ERR(handle->buffer)) + return PTR_ERR(handle->buffer); + if (handle->buffer != buffer) + handle->sync_read = 0; } - handle->offset += count; - return count; + handle->cur++; + return PAGE_SIZE; } /** @@ -2229,7 +2197,7 @@ void snapshot_write_finalize(struct snapshot_handle *handle) { copy_last_highmem_page(); /* Free only if we have loaded the image entirely */ - if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages) { + if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages) { memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR); free_highmem_data(); } diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 44cce10..7335952 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -15,6 +15,13 @@ #include <linux/console.h> #include <linux/cpu.h> #include <linux/syscalls.h> +#include <linux/gfp.h> +#include <linux/io.h> +#include <linux/kernel.h> +#include <linux/list.h> +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/suspend.h> #include "power.h" @@ -129,19 +136,19 @@ static int suspend_enter(suspend_state_t state) if (suspend_ops->prepare) { error = suspend_ops->prepare(); if (error) - return error; + goto Platform_finish; } error = dpm_suspend_noirq(PMSG_SUSPEND); if (error) { printk(KERN_ERR "PM: Some devices failed to power down\n"); - goto Platfrom_finish; + goto Platform_finish; } if (suspend_ops->prepare_late) { error = suspend_ops->prepare_late(); if (error) - goto Power_up_devices; + goto Platform_wake; } if (suspend_test(TEST_PLATFORM)) @@ -156,8 +163,10 @@ static int suspend_enter(suspend_state_t state) error = sysdev_suspend(PMSG_SUSPEND); if (!error) { - if (!suspend_test(TEST_CORE)) + if (!suspend_test(TEST_CORE) && pm_check_wakeup_events()) { error = suspend_ops->enter(state); + events_check_enabled = false; + } sysdev_resume(); } @@ -171,10 +180,9 @@ static int suspend_enter(suspend_state_t state) if (suspend_ops->wake) suspend_ops->wake(); - Power_up_devices: dpm_resume_noirq(PMSG_RESUME); - Platfrom_finish: + Platform_finish: if (suspend_ops->finish) suspend_ops->finish(); diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 1d57573..e6a5bdf 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -4,7 +4,7 @@ * This file provides functions for reading the suspend image from * and writing it to a swap partition. * - * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@suse.cz> + * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz> * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> * * This file is released under the GPLv2. @@ -23,11 +23,46 @@ #include <linux/swap.h> #include <linux/swapops.h> #include <linux/pm.h> +#include <linux/slab.h> #include "power.h" #define SWSUSP_SIG "S1SUSPEND" +/* + * The swap map is a data structure used for keeping track of each page + * written to a swap partition. It consists of many swap_map_page + * structures that contain each an array of MAP_PAGE_ENTRIES swap entries. + * These structures are stored on the swap and linked together with the + * help of the .next_swap member. + * + * The swap map is created during suspend. The swap map pages are + * allocated and populated one at a time, so we only need one memory + * page to set up the entire structure. + * + * During resume we also only need to use one swap_map_page structure + * at a time. + */ + +#define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1) + +struct swap_map_page { + sector_t entries[MAP_PAGE_ENTRIES]; + sector_t next_swap; +}; + +/** + * The swap_map_handle structure is used for handling swap in + * a file-alike way + */ + +struct swap_map_handle { + struct swap_map_page *cur; + sector_t cur_swap; + sector_t first_sector; + unsigned int k; +}; + struct swsusp_header { char reserved[PAGE_SIZE - 20 - sizeof(sector_t) - sizeof(int)]; sector_t image; @@ -113,7 +148,7 @@ sector_t alloc_swapdev_block(int swap) /** * free_all_swap_pages - free swap pages allocated for saving image data. - * It also frees the extents used to register which swap entres had been + * It also frees the extents used to register which swap entries had been * allocated. */ @@ -144,110 +179,24 @@ int swsusp_swap_in_use(void) */ static unsigned short root_swap = 0xffff; -static struct block_device *resume_bdev; - -/** - * submit - submit BIO request. - * @rw: READ or WRITE. - * @off physical offset of page. - * @page: page we're reading or writing. - * @bio_chain: list of pending biod (for async reading) - * - * Straight from the textbook - allocate and initialize the bio. - * If we're reading, make sure the page is marked as dirty. - * Then submit it and, if @bio_chain == NULL, wait. - */ -static int submit(int rw, pgoff_t page_off, struct page *page, - struct bio **bio_chain) -{ - const int bio_rw = rw | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG); - struct bio *bio; - - bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1); - bio->bi_sector = page_off * (PAGE_SIZE >> 9); - bio->bi_bdev = resume_bdev; - bio->bi_end_io = end_swap_bio_read; - - if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { - printk(KERN_ERR "PM: Adding page to bio failed at %ld\n", - page_off); - bio_put(bio); - return -EFAULT; - } - - lock_page(page); - bio_get(bio); - - if (bio_chain == NULL) { - submit_bio(bio_rw, bio); - wait_on_page_locked(page); - if (rw == READ) - bio_set_pages_dirty(bio); - bio_put(bio); - } else { - if (rw == READ) - get_page(page); /* These pages are freed later */ - bio->bi_private = *bio_chain; - *bio_chain = bio; - submit_bio(bio_rw, bio); - } - return 0; -} - -static int bio_read_page(pgoff_t page_off, void *addr, struct bio **bio_chain) -{ - return submit(READ, page_off, virt_to_page(addr), bio_chain); -} - -static int bio_write_page(pgoff_t page_off, void *addr, struct bio **bio_chain) -{ - return submit(WRITE, page_off, virt_to_page(addr), bio_chain); -} - -static int wait_on_bio_chain(struct bio **bio_chain) -{ - struct bio *bio; - struct bio *next_bio; - int ret = 0; - - if (bio_chain == NULL) - return 0; - - bio = *bio_chain; - if (bio == NULL) - return 0; - while (bio) { - struct page *page; - - next_bio = bio->bi_private; - page = bio->bi_io_vec[0].bv_page; - wait_on_page_locked(page); - if (!PageUptodate(page) || PageError(page)) - ret = -EIO; - put_page(page); - bio_put(bio); - bio = next_bio; - } - *bio_chain = NULL; - return ret; -} +struct block_device *hib_resume_bdev; /* * Saving part */ -static int mark_swapfiles(sector_t start, unsigned int flags) +static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags) { int error; - bio_read_page(swsusp_resume_block, swsusp_header, NULL); + hib_bio_read_page(swsusp_resume_block, swsusp_header, NULL); if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) || !memcmp("SWAPSPACE2",swsusp_header->sig, 10)) { memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10); memcpy(swsusp_header->sig,SWSUSP_SIG, 10); - swsusp_header->image = start; + swsusp_header->image = handle->first_sector; swsusp_header->flags = flags; - error = bio_write_page(swsusp_resume_block, + error = hib_bio_write_page(swsusp_resume_block, swsusp_header, NULL); } else { printk(KERN_ERR "PM: Swap header not found!\n"); @@ -259,25 +208,26 @@ static int mark_swapfiles(sector_t start, unsigned int flags) /** * swsusp_swap_check - check if the resume device is a swap device * and get its index (if so) + * + * This is called before saving image */ - -static int swsusp_swap_check(void) /* This is called before saving image */ +static int swsusp_swap_check(void) { int res; res = swap_type_of(swsusp_resume_device, swsusp_resume_block, - &resume_bdev); + &hib_resume_bdev); if (res < 0) return res; root_swap = res; - res = blkdev_get(resume_bdev, FMODE_WRITE); + res = blkdev_get(hib_resume_bdev, FMODE_WRITE); if (res) return res; - res = set_blocksize(resume_bdev, PAGE_SIZE); + res = set_blocksize(hib_resume_bdev, PAGE_SIZE); if (res < 0) - blkdev_put(resume_bdev, FMODE_WRITE); + blkdev_put(hib_resume_bdev, FMODE_WRITE); return res; } @@ -308,42 +258,9 @@ static int write_page(void *buf, sector_t offset, struct bio **bio_chain) } else { src = buf; } - return bio_write_page(offset, src, bio_chain); + return hib_bio_write_page(offset, src, bio_chain); } -/* - * The swap map is a data structure used for keeping track of each page - * written to a swap partition. It consists of many swap_map_page - * structures that contain each an array of MAP_PAGE_SIZE swap entries. - * These structures are stored on the swap and linked together with the - * help of the .next_swap member. - * - * The swap map is created during suspend. The swap map pages are - * allocated and populated one at a time, so we only need one memory - * page to set up the entire structure. - * - * During resume we also only need to use one swap_map_page structure - * at a time. - */ - -#define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1) - -struct swap_map_page { - sector_t entries[MAP_PAGE_ENTRIES]; - sector_t next_swap; -}; - -/** - * The swap_map_handle structure is used for handling swap in - * a file-alike way - */ - -struct swap_map_handle { - struct swap_map_page *cur; - sector_t cur_swap; - unsigned int k; -}; - static void release_swap_writer(struct swap_map_handle *handle) { if (handle->cur) @@ -353,16 +270,33 @@ static void release_swap_writer(struct swap_map_handle *handle) static int get_swap_writer(struct swap_map_handle *handle) { + int ret; + + ret = swsusp_swap_check(); + if (ret) { + if (ret != -ENOSPC) + printk(KERN_ERR "PM: Cannot find swap device, try " + "swapon -a.\n"); + return ret; + } handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL); - if (!handle->cur) - return -ENOMEM; + if (!handle->cur) { + ret = -ENOMEM; + goto err_close; + } handle->cur_swap = alloc_swapdev_block(root_swap); if (!handle->cur_swap) { - release_swap_writer(handle); - return -ENOSPC; + ret = -ENOSPC; + goto err_rel; } handle->k = 0; + handle->first_sector = handle->cur_swap; return 0; +err_rel: + release_swap_writer(handle); +err_close: + swsusp_close(FMODE_WRITE); + return ret; } static int swap_write_page(struct swap_map_handle *handle, void *buf, @@ -379,7 +313,7 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf, return error; handle->cur->entries[handle->k++] = offset; if (handle->k >= MAP_PAGE_ENTRIES) { - error = wait_on_bio_chain(bio_chain); + error = hib_wait_on_bio_chain(bio_chain); if (error) goto out; offset = alloc_swapdev_block(root_swap); @@ -405,6 +339,24 @@ static int flush_swap_writer(struct swap_map_handle *handle) return -EINVAL; } +static int swap_writer_finish(struct swap_map_handle *handle, + unsigned int flags, int error) +{ + if (!error) { + flush_swap_writer(handle); + printk(KERN_INFO "PM: S"); + error = mark_swapfiles(handle, flags); + printk("|\n"); + } + + if (error) + free_all_swap_pages(root_swap); + release_swap_writer(handle); + swsusp_close(FMODE_WRITE); + + return error; +} + /** * save_image - save the suspend image data */ @@ -430,7 +382,7 @@ static int save_image(struct swap_map_handle *handle, bio = NULL; do_gettimeofday(&start); while (1) { - ret = snapshot_read_next(snapshot, PAGE_SIZE); + ret = snapshot_read_next(snapshot); if (ret <= 0) break; ret = swap_write_page(handle, data_of(*snapshot), &bio); @@ -440,7 +392,7 @@ static int save_image(struct swap_map_handle *handle, printk(KERN_CONT "\b\b\b\b%3d%%", nr_pages / m); nr_pages++; } - err2 = wait_on_bio_chain(&bio); + err2 = hib_wait_on_bio_chain(&bio); do_gettimeofday(&stop); if (!ret) ret = err2; @@ -482,50 +434,34 @@ int swsusp_write(unsigned int flags) struct swap_map_handle handle; struct snapshot_handle snapshot; struct swsusp_info *header; + unsigned long pages; int error; - error = swsusp_swap_check(); + pages = snapshot_get_image_size(); + error = get_swap_writer(&handle); if (error) { - printk(KERN_ERR "PM: Cannot find swap device, try " - "swapon -a.\n"); + printk(KERN_ERR "PM: Cannot get swap writer\n"); return error; } + if (!enough_swap(pages)) { + printk(KERN_ERR "PM: Not enough free swap\n"); + error = -ENOSPC; + goto out_finish; + } memset(&snapshot, 0, sizeof(struct snapshot_handle)); - error = snapshot_read_next(&snapshot, PAGE_SIZE); + error = snapshot_read_next(&snapshot); if (error < PAGE_SIZE) { if (error >= 0) error = -EFAULT; - goto out; + goto out_finish; } header = (struct swsusp_info *)data_of(snapshot); - if (!enough_swap(header->pages)) { - printk(KERN_ERR "PM: Not enough free swap\n"); - error = -ENOSPC; - goto out; - } - error = get_swap_writer(&handle); - if (!error) { - sector_t start = handle.cur_swap; - - error = swap_write_page(&handle, header, NULL); - if (!error) - error = save_image(&handle, &snapshot, - header->pages - 1); - - if (!error) { - flush_swap_writer(&handle); - printk(KERN_INFO "PM: S"); - error = mark_swapfiles(start, flags); - printk("|\n"); - } - } - if (error) - free_all_swap_pages(root_swap); - - release_swap_writer(&handle); - out: - swsusp_close(FMODE_WRITE); + error = swap_write_page(&handle, header, NULL); + if (!error) + error = save_image(&handle, &snapshot, pages - 1); +out_finish: + error = swap_writer_finish(&handle, flags, error); return error; } @@ -541,18 +477,21 @@ static void release_swap_reader(struct swap_map_handle *handle) handle->cur = NULL; } -static int get_swap_reader(struct swap_map_handle *handle, sector_t start) +static int get_swap_reader(struct swap_map_handle *handle, + unsigned int *flags_p) { int error; - if (!start) + *flags_p = swsusp_header->flags; + + if (!swsusp_header->image) /* how can this happen? */ return -EINVAL; handle->cur = (struct swap_map_page *)get_zeroed_page(__GFP_WAIT | __GFP_HIGH); if (!handle->cur) return -ENOMEM; - error = bio_read_page(start, handle->cur, NULL); + error = hib_bio_read_page(swsusp_header->image, handle->cur, NULL); if (error) { release_swap_reader(handle); return error; @@ -572,21 +511,28 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf, offset = handle->cur->entries[handle->k]; if (!offset) return -EFAULT; - error = bio_read_page(offset, buf, bio_chain); + error = hib_bio_read_page(offset, buf, bio_chain); if (error) return error; if (++handle->k >= MAP_PAGE_ENTRIES) { - error = wait_on_bio_chain(bio_chain); + error = hib_wait_on_bio_chain(bio_chain); handle->k = 0; offset = handle->cur->next_swap; if (!offset) release_swap_reader(handle); else if (!error) - error = bio_read_page(offset, handle->cur, NULL); + error = hib_bio_read_page(offset, handle->cur, NULL); } return error; } +static int swap_reader_finish(struct swap_map_handle *handle) +{ + release_swap_reader(handle); + + return 0; +} + /** * load_image - load the image using the swap map handle * @handle and the snapshot handle @snapshot @@ -614,21 +560,21 @@ static int load_image(struct swap_map_handle *handle, bio = NULL; do_gettimeofday(&start); for ( ; ; ) { - error = snapshot_write_next(snapshot, PAGE_SIZE); + error = snapshot_write_next(snapshot); if (error <= 0) break; error = swap_read_page(handle, data_of(*snapshot), &bio); if (error) break; if (snapshot->sync_read) - error = wait_on_bio_chain(&bio); + error = hib_wait_on_bio_chain(&bio); if (error) break; if (!(nr_pages % m)) printk("\b\b\b\b%3d%%", nr_pages / m); nr_pages++; } - err2 = wait_on_bio_chain(&bio); + err2 = hib_wait_on_bio_chain(&bio); do_gettimeofday(&stop); if (!error) error = err2; @@ -656,20 +602,20 @@ int swsusp_read(unsigned int *flags_p) struct snapshot_handle snapshot; struct swsusp_info *header; - *flags_p = swsusp_header->flags; - memset(&snapshot, 0, sizeof(struct snapshot_handle)); - error = snapshot_write_next(&snapshot, PAGE_SIZE); + error = snapshot_write_next(&snapshot); if (error < PAGE_SIZE) return error < 0 ? error : -EFAULT; header = (struct swsusp_info *)data_of(snapshot); - error = get_swap_reader(&handle, swsusp_header->image); + error = get_swap_reader(&handle, flags_p); + if (error) + goto end; if (!error) error = swap_read_page(&handle, header, NULL); if (!error) error = load_image(&handle, &snapshot, header->pages - 1); - release_swap_reader(&handle); - + swap_reader_finish(&handle); +end: if (!error) pr_debug("PM: Image successfully loaded\n"); else @@ -685,11 +631,11 @@ int swsusp_check(void) { int error; - resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ); - if (!IS_ERR(resume_bdev)) { - set_blocksize(resume_bdev, PAGE_SIZE); + hib_resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ); + if (!IS_ERR(hib_resume_bdev)) { + set_blocksize(hib_resume_bdev, PAGE_SIZE); memset(swsusp_header, 0, PAGE_SIZE); - error = bio_read_page(swsusp_resume_block, + error = hib_bio_read_page(swsusp_resume_block, swsusp_header, NULL); if (error) goto put; @@ -697,7 +643,7 @@ int swsusp_check(void) if (!memcmp(SWSUSP_SIG, swsusp_header->sig, 10)) { memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10); /* Reset swap signature now */ - error = bio_write_page(swsusp_resume_block, + error = hib_bio_write_page(swsusp_resume_block, swsusp_header, NULL); } else { error = -EINVAL; @@ -705,11 +651,11 @@ int swsusp_check(void) put: if (error) - blkdev_put(resume_bdev, FMODE_READ); + blkdev_put(hib_resume_bdev, FMODE_READ); else pr_debug("PM: Signature found, resuming\n"); } else { - error = PTR_ERR(resume_bdev); + error = PTR_ERR(hib_resume_bdev); } if (error) @@ -724,12 +670,12 @@ put: void swsusp_close(fmode_t mode) { - if (IS_ERR(resume_bdev)) { + if (IS_ERR(hib_resume_bdev)) { pr_debug("PM: Image device not initialised\n"); return; } - blkdev_put(resume_bdev, mode); + blkdev_put(hib_resume_bdev, mode); } static int swsusp_header_init(void) diff --git a/kernel/power/user.c b/kernel/power/user.c index 4d22896..e819e17 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -151,6 +151,7 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf, { struct snapshot_data *data; ssize_t res; + loff_t pg_offp = *offp & ~PAGE_MASK; mutex_lock(&pm_mutex); @@ -159,14 +160,19 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf, res = -ENODATA; goto Unlock; } - res = snapshot_read_next(&data->handle, count); - if (res > 0) { - if (copy_to_user(buf, data_of(data->handle), res)) - res = -EFAULT; - else - *offp = data->handle.offset; + if (!pg_offp) { /* on page boundary? */ + res = snapshot_read_next(&data->handle); + if (res <= 0) + goto Unlock; + } else { + res = PAGE_SIZE - pg_offp; } + res = simple_read_from_buffer(buf, count, &pg_offp, + data_of(data->handle), res); + if (res > 0) + *offp += res; + Unlock: mutex_unlock(&pm_mutex); @@ -178,18 +184,25 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf, { struct snapshot_data *data; ssize_t res; + loff_t pg_offp = *offp & ~PAGE_MASK; mutex_lock(&pm_mutex); data = filp->private_data; - res = snapshot_write_next(&data->handle, count); - if (res > 0) { - if (copy_from_user(data_of(data->handle), buf, res)) - res = -EFAULT; - else - *offp = data->handle.offset; + + if (!pg_offp) { + res = snapshot_write_next(&data->handle); + if (res <= 0) + goto unlock; + } else { + res = PAGE_SIZE - pg_offp; } + res = simple_write_to_buffer(data_of(data->handle), res, &pg_offp, + buf, count); + if (res > 0) + *offp += res; +unlock: mutex_unlock(&pm_mutex); return res; @@ -420,7 +433,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, * User space encodes device types as two-byte values, * so we need to recode them */ - swdev = old_decode_dev(swap_area.dev); + swdev = new_decode_dev(swap_area.dev); if (swdev) { offset = swap_area.offset; data->swap = swap_type_of(swdev, offset, NULL); diff --git a/kernel/printk.c b/kernel/printk.c index 75077ad..444b770 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -33,6 +33,7 @@ #include <linux/bootmem.h> #include <linux/syscalls.h> #include <linux/kexec.h> +#include <linux/kdb.h> #include <linux/ratelimit.h> #include <linux/kmsg_dump.h> #include <linux/syslog.h> @@ -413,6 +414,22 @@ SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) return do_syslog(type, buf, len, SYSLOG_FROM_CALL); } +#ifdef CONFIG_KGDB_KDB +/* kdb dmesg command needs access to the syslog buffer. do_syslog() + * uses locks so it cannot be used during debugging. Just tell kdb + * where the start and end of the physical and logical logs are. This + * is equivalent to do_syslog(3). + */ +void kdb_syslog_data(char *syslog_data[4]) +{ + syslog_data[0] = log_buf; + syslog_data[1] = log_buf + log_buf_len; + syslog_data[2] = log_buf + log_end - + (logged_chars < log_buf_len ? logged_chars : log_buf_len); + syslog_data[3] = log_buf + log_end; +} +#endif /* CONFIG_KGDB_KDB */ + /* * Call the console drivers on a range of log_buf */ @@ -586,6 +603,14 @@ asmlinkage int printk(const char *fmt, ...) va_list args; int r; +#ifdef CONFIG_KGDB_KDB + if (unlikely(kdb_trap_printk)) { + va_start(args, fmt); + r = vkdb_printf(fmt, args); + va_end(args); + return r; + } +#endif va_start(args, fmt); r = vprintk(fmt, args); va_end(args); diff --git a/kernel/profile.c b/kernel/profile.c index a55d3a3..b22a899 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -127,8 +127,10 @@ int __ref profile_init(void) return 0; prof_buffer = vmalloc(buffer_bytes); - if (prof_buffer) + if (prof_buffer) { + memset(prof_buffer, 0, buffer_bytes); return 0; + } free_cpumask_var(prof_cpu_mask); return -ENOMEM; @@ -363,14 +365,14 @@ static int __cpuinit profile_cpu_callback(struct notifier_block *info, switch (action) { case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: - node = cpu_to_node(cpu); + node = cpu_to_mem(cpu); per_cpu(cpu_profile_flip, cpu) = 0; if (!per_cpu(cpu_profile_hits, cpu)[1]) { page = alloc_pages_exact_node(node, GFP_KERNEL | __GFP_ZERO, 0); if (!page) - return NOTIFY_BAD; + return notifier_from_errno(-ENOMEM); per_cpu(cpu_profile_hits, cpu)[1] = page_address(page); } if (!per_cpu(cpu_profile_hits, cpu)[0]) { @@ -386,7 +388,7 @@ out_free: page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]); per_cpu(cpu_profile_hits, cpu)[1] = NULL; __free_page(page); - return NOTIFY_BAD; + return notifier_from_errno(-ENOMEM); case CPU_ONLINE: case CPU_ONLINE_FROZEN: if (prof_cpu_mask != NULL) @@ -565,7 +567,7 @@ static int create_hash_tables(void) int cpu; for_each_online_cpu(cpu) { - int node = cpu_to_node(cpu); + int node = cpu_to_mem(cpu); struct page *page; page = alloc_pages_exact_node(node, diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 42ad8ae..74a3d69 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -14,7 +14,6 @@ #include <linux/mm.h> #include <linux/highmem.h> #include <linux/pagemap.h> -#include <linux/smp_lock.h> #include <linux/ptrace.h> #include <linux/security.h> #include <linux/signal.h> @@ -76,7 +75,6 @@ void __ptrace_unlink(struct task_struct *child) child->parent = child->real_parent; list_del_init(&child->ptrace_entry); - arch_ptrace_untrace(child); if (task_is_traced(child)) ptrace_untrace(child); } @@ -596,6 +594,32 @@ int ptrace_request(struct task_struct *child, long request, ret = ptrace_detach(child, data); break; +#ifdef CONFIG_BINFMT_ELF_FDPIC + case PTRACE_GETFDPIC: { + struct mm_struct *mm = get_task_mm(child); + unsigned long tmp = 0; + + ret = -ESRCH; + if (!mm) + break; + + switch (addr) { + case PTRACE_GETFDPIC_EXEC: + tmp = mm->context.exec_fdpic_loadmap; + break; + case PTRACE_GETFDPIC_INTERP: + tmp = mm->context.interp_fdpic_loadmap; + break; + default: + break; + } + mmput(mm); + + ret = put_user(tmp, (unsigned long __user *) data); + break; + } +#endif + #ifdef PTRACE_SINGLESTEP case PTRACE_SINGLESTEP: #endif @@ -666,10 +690,6 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, long, addr, long, data) struct task_struct *child; long ret; - /* - * This lock_kernel fixes a subtle race with suid exec - */ - lock_kernel(); if (request == PTRACE_TRACEME) { ret = ptrace_traceme(); if (!ret) @@ -703,7 +723,6 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, long, addr, long, data) out_put_task_struct: put_task_struct(child); out: - unlock_kernel(); return ret; } @@ -813,10 +832,6 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, struct task_struct *child; long ret; - /* - * This lock_kernel fixes a subtle race with suid exec - */ - lock_kernel(); if (request == PTRACE_TRACEME) { ret = ptrace_traceme(); goto out; @@ -846,7 +861,6 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, out_put_task_struct: put_task_struct(child); out: - unlock_kernel(); return ret; } #endif /* CONFIG_COMPAT */ diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index f1125c1..72a8dc9 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -44,7 +44,7 @@ #include <linux/cpu.h> #include <linux/mutex.h> #include <linux/module.h> -#include <linux/kernel_stat.h> +#include <linux/hardirq.h> #ifdef CONFIG_DEBUG_LOCK_ALLOC static struct lock_class_key rcu_lock_key; @@ -63,23 +63,34 @@ struct lockdep_map rcu_sched_lock_map = EXPORT_SYMBOL_GPL(rcu_sched_lock_map); #endif -int rcu_scheduler_active __read_mostly; -EXPORT_SYMBOL_GPL(rcu_scheduler_active); +#ifdef CONFIG_DEBUG_LOCK_ALLOC -/* - * This function is invoked towards the end of the scheduler's initialization - * process. Before this is called, the idle task might contain - * RCU read-side critical sections (during which time, this idle - * task is booting the system). After this function is called, the - * idle tasks are prohibited from containing RCU read-side critical - * sections. +int debug_lockdep_rcu_enabled(void) +{ + return rcu_scheduler_active && debug_locks && + current->lockdep_recursion == 0; +} +EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled); + +/** + * rcu_read_lock_bh_held - might we be in RCU-bh read-side critical section? + * + * Check for bottom half being disabled, which covers both the + * CONFIG_PROVE_RCU and not cases. Note that if someone uses + * rcu_read_lock_bh(), but then later enables BH, lockdep (if enabled) + * will show the situation. + * + * Check debug_lockdep_rcu_enabled() to prevent false positives during boot. */ -void rcu_scheduler_starting(void) +int rcu_read_lock_bh_held(void) { - WARN_ON(num_online_cpus() != 1); - WARN_ON(nr_context_switches() > 0); - rcu_scheduler_active = 1; + if (!debug_lockdep_rcu_enabled()) + return 1; + return in_softirq(); } +EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held); + +#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ /* * Awaken the corresponding synchronize_rcu() instance now that a @@ -92,3 +103,14 @@ void wakeme_after_rcu(struct rcu_head *head) rcu = container_of(head, struct rcu_synchronize, head); complete(&rcu->completion); } + +#ifdef CONFIG_PROVE_RCU +/* + * wrapper function to avoid #include problems. + */ +int rcu_my_thread_group_empty(void) +{ + return thread_group_empty(current); +} +EXPORT_SYMBOL_GPL(rcu_my_thread_group_empty); +#endif /* #ifdef CONFIG_PROVE_RCU */ diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index 9f6d9ff..38729d3 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -44,9 +44,9 @@ struct rcu_ctrlblk { }; /* Definition for rcupdate control block. */ -static struct rcu_ctrlblk rcu_ctrlblk = { - .donetail = &rcu_ctrlblk.rcucblist, - .curtail = &rcu_ctrlblk.rcucblist, +static struct rcu_ctrlblk rcu_sched_ctrlblk = { + .donetail = &rcu_sched_ctrlblk.rcucblist, + .curtail = &rcu_sched_ctrlblk.rcucblist, }; static struct rcu_ctrlblk rcu_bh_ctrlblk = { @@ -54,6 +54,11 @@ static struct rcu_ctrlblk rcu_bh_ctrlblk = { .curtail = &rcu_bh_ctrlblk.rcucblist, }; +#ifdef CONFIG_DEBUG_LOCK_ALLOC +int rcu_scheduler_active __read_mostly; +EXPORT_SYMBOL_GPL(rcu_scheduler_active); +#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ + #ifdef CONFIG_NO_HZ static long rcu_dynticks_nesting = 1; @@ -108,7 +113,8 @@ static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) */ void rcu_sched_qs(int cpu) { - if (rcu_qsctr_help(&rcu_ctrlblk) + rcu_qsctr_help(&rcu_bh_ctrlblk)) + if (rcu_qsctr_help(&rcu_sched_ctrlblk) + + rcu_qsctr_help(&rcu_bh_ctrlblk)) raise_softirq(RCU_SOFTIRQ); } @@ -173,7 +179,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) */ static void rcu_process_callbacks(struct softirq_action *unused) { - __rcu_process_callbacks(&rcu_ctrlblk); + __rcu_process_callbacks(&rcu_sched_ctrlblk); __rcu_process_callbacks(&rcu_bh_ctrlblk); } @@ -187,7 +193,8 @@ static void rcu_process_callbacks(struct softirq_action *unused) * * Cool, huh? (Due to Josh Triplett.) * - * But we want to make this a static inline later. + * But we want to make this a static inline later. The cond_resched() + * currently makes this problematic. */ void synchronize_sched(void) { @@ -195,12 +202,6 @@ void synchronize_sched(void) } EXPORT_SYMBOL_GPL(synchronize_sched); -void synchronize_rcu_bh(void) -{ - synchronize_sched(); -} -EXPORT_SYMBOL_GPL(synchronize_rcu_bh); - /* * Helper function for call_rcu() and call_rcu_bh(). */ @@ -226,7 +227,7 @@ static void __call_rcu(struct rcu_head *head, */ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) { - __call_rcu(head, func, &rcu_ctrlblk); + __call_rcu(head, func, &rcu_sched_ctrlblk); } EXPORT_SYMBOL_GPL(call_rcu); @@ -244,11 +245,13 @@ void rcu_barrier(void) { struct rcu_synchronize rcu; + init_rcu_head_on_stack(&rcu.head); init_completion(&rcu.completion); /* Will wake me after RCU finished. */ call_rcu(&rcu.head, wakeme_after_rcu); /* Wait for it. */ wait_for_completion(&rcu.completion); + destroy_rcu_head_on_stack(&rcu.head); } EXPORT_SYMBOL_GPL(rcu_barrier); @@ -256,11 +259,13 @@ void rcu_barrier_bh(void) { struct rcu_synchronize rcu; + init_rcu_head_on_stack(&rcu.head); init_completion(&rcu.completion); /* Will wake me after RCU finished. */ call_rcu_bh(&rcu.head, wakeme_after_rcu); /* Wait for it. */ wait_for_completion(&rcu.completion); + destroy_rcu_head_on_stack(&rcu.head); } EXPORT_SYMBOL_GPL(rcu_barrier_bh); @@ -268,11 +273,13 @@ void rcu_barrier_sched(void) { struct rcu_synchronize rcu; + init_rcu_head_on_stack(&rcu.head); init_completion(&rcu.completion); /* Will wake me after RCU finished. */ call_rcu_sched(&rcu.head, wakeme_after_rcu); /* Wait for it. */ wait_for_completion(&rcu.completion); + destroy_rcu_head_on_stack(&rcu.head); } EXPORT_SYMBOL_GPL(rcu_barrier_sched); @@ -280,3 +287,5 @@ void __init rcu_init(void) { open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); } + +#include "rcutiny_plugin.h" diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h new file mode 100644 index 0000000..d223a92bc --- /dev/null +++ b/kernel/rcutiny_plugin.h @@ -0,0 +1,39 @@ +/* + * Read-Copy Update mechanism for mutual exclusion (tree-based version) + * Internal non-public definitions that provide either classic + * or preemptable semantics. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright IBM Corporation, 2009 + * + * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> + */ + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + +#include <linux/kernel_stat.h> + +/* + * During boot, we forgive RCU lockdep issues. After this function is + * invoked, we start taking RCU lockdep issues seriously. + */ +void rcu_scheduler_starting(void) +{ + WARN_ON(nr_context_switches() > 0); + rcu_scheduler_active = 1; +} + +#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 58df55b..6535ac8 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -464,9 +464,11 @@ static void rcu_bh_torture_synchronize(void) { struct rcu_bh_torture_synchronize rcu; + init_rcu_head_on_stack(&rcu.head); init_completion(&rcu.completion); call_rcu_bh(&rcu.head, rcu_bh_torture_wakeme_after_cb); wait_for_completion(&rcu.completion); + destroy_rcu_head_on_stack(&rcu.head); } static struct rcu_torture_ops rcu_bh_ops = { @@ -669,7 +671,7 @@ static struct rcu_torture_ops sched_expedited_ops = { .sync = synchronize_sched_expedited, .cb_barrier = NULL, .fqs = rcu_sched_force_quiescent_state, - .stats = rcu_expedited_torture_stats, + .stats = NULL, .irq_capable = 1, .name = "sched_expedited" }; diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 3ec8160..d443734 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -46,6 +46,7 @@ #include <linux/cpu.h> #include <linux/mutex.h> #include <linux/time.h> +#include <linux/kernel_stat.h> #include "rcutree.h" @@ -53,8 +54,8 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS]; -#define RCU_STATE_INITIALIZER(name) { \ - .level = { &name.node[0] }, \ +#define RCU_STATE_INITIALIZER(structname) { \ + .level = { &structname.node[0] }, \ .levelcnt = { \ NUM_RCU_LVL_0, /* root of hierarchy. */ \ NUM_RCU_LVL_1, \ @@ -65,13 +66,14 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS]; .signaled = RCU_GP_IDLE, \ .gpnum = -300, \ .completed = -300, \ - .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&name.onofflock), \ + .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname.onofflock), \ .orphan_cbs_list = NULL, \ - .orphan_cbs_tail = &name.orphan_cbs_list, \ + .orphan_cbs_tail = &structname.orphan_cbs_list, \ .orphan_qlen = 0, \ - .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&name.fqslock), \ + .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname.fqslock), \ .n_force_qs = 0, \ .n_force_qs_ngp = 0, \ + .name = #structname, \ } struct rcu_state rcu_sched_state = RCU_STATE_INITIALIZER(rcu_sched_state); @@ -80,6 +82,9 @@ DEFINE_PER_CPU(struct rcu_data, rcu_sched_data); struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); +int rcu_scheduler_active __read_mostly; +EXPORT_SYMBOL_GPL(rcu_scheduler_active); + /* * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s * permit this function to be invoked without holding the root rcu_node @@ -97,25 +102,32 @@ static int rcu_gp_in_progress(struct rcu_state *rsp) */ void rcu_sched_qs(int cpu) { - struct rcu_data *rdp; + struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu); - rdp = &per_cpu(rcu_sched_data, cpu); rdp->passed_quiesc_completed = rdp->gpnum - 1; barrier(); rdp->passed_quiesc = 1; - rcu_preempt_note_context_switch(cpu); } void rcu_bh_qs(int cpu) { - struct rcu_data *rdp; + struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); - rdp = &per_cpu(rcu_bh_data, cpu); rdp->passed_quiesc_completed = rdp->gpnum - 1; barrier(); rdp->passed_quiesc = 1; } +/* + * Note a context switch. This is a quiescent state for RCU-sched, + * and requires special handling for preemptible RCU. + */ +void rcu_note_context_switch(int cpu) +{ + rcu_sched_qs(cpu); + rcu_preempt_note_context_switch(cpu); +} + #ifdef CONFIG_NO_HZ DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { .dynticks_nesting = 1, @@ -438,6 +450,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) #ifdef CONFIG_RCU_CPU_STALL_DETECTOR +int rcu_cpu_stall_panicking __read_mostly; + static void record_gp_stall_check_time(struct rcu_state *rsp) { rsp->gp_start = jiffies; @@ -470,7 +484,8 @@ static void print_other_cpu_stall(struct rcu_state *rsp) /* OK, time to rat on our buddy... */ - printk(KERN_ERR "INFO: RCU detected CPU stalls:"); + printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks: {", + rsp->name); rcu_for_each_leaf_node(rsp, rnp) { raw_spin_lock_irqsave(&rnp->lock, flags); rcu_print_task_stall(rnp); @@ -481,7 +496,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp) if (rnp->qsmask & (1UL << cpu)) printk(" %d", rnp->grplo + cpu); } - printk(" (detected by %d, t=%ld jiffies)\n", + printk("} (detected by %d, t=%ld jiffies)\n", smp_processor_id(), (long)(jiffies - rsp->gp_start)); trigger_all_cpu_backtrace(); @@ -497,8 +512,8 @@ static void print_cpu_stall(struct rcu_state *rsp) unsigned long flags; struct rcu_node *rnp = rcu_get_root(rsp); - printk(KERN_ERR "INFO: RCU detected CPU %d stall (t=%lu jiffies)\n", - smp_processor_id(), jiffies - rsp->gp_start); + printk(KERN_ERR "INFO: %s detected stall on CPU %d (t=%lu jiffies)\n", + rsp->name, smp_processor_id(), jiffies - rsp->gp_start); trigger_all_cpu_backtrace(); raw_spin_lock_irqsave(&rnp->lock, flags); @@ -515,6 +530,8 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) long delta; struct rcu_node *rnp; + if (rcu_cpu_stall_panicking) + return; delta = jiffies - rsp->jiffies_stall; rnp = rdp->mynode; if ((rnp->qsmask & rdp->grpmask) && delta >= 0) { @@ -529,6 +546,21 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) } } +static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr) +{ + rcu_cpu_stall_panicking = 1; + return NOTIFY_DONE; +} + +static struct notifier_block rcu_panic_block = { + .notifier_call = rcu_panic, +}; + +static void __init check_cpu_stall_init(void) +{ + atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block); +} + #else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ static void record_gp_stall_check_time(struct rcu_state *rsp) @@ -539,6 +571,10 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) { } +static void __init check_cpu_stall_init(void) +{ +} + #endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ /* @@ -1125,8 +1161,6 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) */ void rcu_check_callbacks(int cpu, int user) { - if (!rcu_pending(cpu)) - return; /* if nothing for RCU to do. */ if (user || (idle_cpu(cpu) && rcu_scheduler_active && !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) { @@ -1158,7 +1192,8 @@ void rcu_check_callbacks(int cpu, int user) rcu_bh_qs(cpu); } rcu_preempt_check_callbacks(cpu); - raise_softirq(RCU_SOFTIRQ); + if (rcu_pending(cpu)) + raise_softirq(RCU_SOFTIRQ); } #ifdef CONFIG_SMP @@ -1236,11 +1271,11 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed) break; /* grace period idle or initializing, ignore. */ case RCU_SAVE_DYNTICK: - - raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ if (RCU_SIGNAL_INIT != RCU_SAVE_DYNTICK) break; /* So gcc recognizes the dead code. */ + raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ + /* Record dyntick-idle state. */ force_qs_rnp(rsp, dyntick_save_progress_counter); raw_spin_lock(&rnp->lock); /* irqs already disabled */ @@ -1449,11 +1484,13 @@ void synchronize_sched(void) if (rcu_blocking_is_gp()) return; + init_rcu_head_on_stack(&rcu.head); init_completion(&rcu.completion); /* Will wake me after RCU finished. */ call_rcu_sched(&rcu.head, wakeme_after_rcu); /* Wait for it. */ wait_for_completion(&rcu.completion); + destroy_rcu_head_on_stack(&rcu.head); } EXPORT_SYMBOL_GPL(synchronize_sched); @@ -1473,11 +1510,13 @@ void synchronize_rcu_bh(void) if (rcu_blocking_is_gp()) return; + init_rcu_head_on_stack(&rcu.head); init_completion(&rcu.completion); /* Will wake me after RCU finished. */ call_rcu_bh(&rcu.head, wakeme_after_rcu); /* Wait for it. */ wait_for_completion(&rcu.completion); + destroy_rcu_head_on_stack(&rcu.head); } EXPORT_SYMBOL_GPL(synchronize_rcu_bh); @@ -1498,8 +1537,20 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) check_cpu_stall(rsp, rdp); /* Is the RCU core waiting for a quiescent state from this CPU? */ - if (rdp->qs_pending) { + if (rdp->qs_pending && !rdp->passed_quiesc) { + + /* + * If force_quiescent_state() coming soon and this CPU + * needs a quiescent state, and this is either RCU-sched + * or RCU-bh, force a local reschedule. + */ rdp->n_rp_qs_pending++; + if (!rdp->preemptable && + ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs) - 1, + jiffies)) + set_need_resched(); + } else if (rdp->qs_pending && rdp->passed_quiesc) { + rdp->n_rp_report_qs++; return 1; } @@ -1767,6 +1818,21 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, } /* + * This function is invoked towards the end of the scheduler's initialization + * process. Before this is called, the idle task might contain + * RCU read-side critical sections (during which time, this idle + * task is booting the system). After this function is called, the + * idle tasks are prohibited from containing RCU read-side critical + * sections. This function also enables RCU lockdep checking. + */ +void rcu_scheduler_starting(void) +{ + WARN_ON(num_online_cpus() != 1); + WARN_ON(nr_context_switches() > 0); + rcu_scheduler_active = 1; +} + +/* * Compute the per-level fanout, either using the exact fanout specified * or balancing the tree, depending on CONFIG_RCU_FANOUT_EXACT. */ @@ -1849,6 +1915,14 @@ static void __init rcu_init_one(struct rcu_state *rsp) INIT_LIST_HEAD(&rnp->blocked_tasks[3]); } } + + rnp = rsp->level[NUM_RCU_LVLS - 1]; + for_each_possible_cpu(i) { + while (i > rnp->grphi) + rnp++; + rsp->rda[i]->mynode = rnp; + rcu_boot_init_percpu_data(i, rsp); + } } /* @@ -1859,19 +1933,11 @@ static void __init rcu_init_one(struct rcu_state *rsp) #define RCU_INIT_FLAVOR(rsp, rcu_data) \ do { \ int i; \ - int j; \ - struct rcu_node *rnp; \ \ - rcu_init_one(rsp); \ - rnp = (rsp)->level[NUM_RCU_LVLS - 1]; \ - j = 0; \ for_each_possible_cpu(i) { \ - if (i > rnp[j].grphi) \ - j++; \ - per_cpu(rcu_data, i).mynode = &rnp[j]; \ (rsp)->rda[i] = &per_cpu(rcu_data, i); \ - rcu_boot_init_percpu_data(i, rsp); \ } \ + rcu_init_one(rsp); \ } while (0) void __init rcu_init(void) @@ -1879,12 +1945,6 @@ void __init rcu_init(void) int cpu; rcu_bootup_announce(); -#ifdef CONFIG_RCU_CPU_STALL_DETECTOR - printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n"); -#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ -#if NUM_RCU_LVL_4 != 0 - printk(KERN_INFO "Experimental four-level hierarchy is enabled.\n"); -#endif /* #if NUM_RCU_LVL_4 != 0 */ RCU_INIT_FLAVOR(&rcu_sched_state, rcu_sched_data); RCU_INIT_FLAVOR(&rcu_bh_state, rcu_bh_data); __rcu_init_preempt(); @@ -1898,6 +1958,7 @@ void __init rcu_init(void) cpu_notifier(rcu_cpu_notify, 0); for_each_online_cpu(cpu) rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); + check_cpu_stall_init(); } #include "rcutree_plugin.h" diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 4a525a3..14c040b 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -223,6 +223,7 @@ struct rcu_data { /* 5) __rcu_pending() statistics. */ unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */ unsigned long n_rp_qs_pending; + unsigned long n_rp_report_qs; unsigned long n_rp_cb_ready; unsigned long n_rp_cpu_needs_gp; unsigned long n_rp_gp_completed; @@ -326,6 +327,7 @@ struct rcu_state { unsigned long jiffies_stall; /* Time at which to check */ /* for CPU stalls. */ #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ + char *name; /* Name of structure. */ }; /* Return values for rcu_preempt_offline_tasks(). */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 79b53bd..0e4f420 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -26,6 +26,45 @@ #include <linux/delay.h> +/* + * Check the RCU kernel configuration parameters and print informative + * messages about anything out of the ordinary. If you like #ifdef, you + * will love this function. + */ +static void __init rcu_bootup_announce_oddness(void) +{ +#ifdef CONFIG_RCU_TRACE + printk(KERN_INFO "\tRCU debugfs-based tracing is enabled.\n"); +#endif +#if (defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 64) || (!defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 32) + printk(KERN_INFO "\tCONFIG_RCU_FANOUT set to non-default value of %d\n", + CONFIG_RCU_FANOUT); +#endif +#ifdef CONFIG_RCU_FANOUT_EXACT + printk(KERN_INFO "\tHierarchical RCU autobalancing is disabled.\n"); +#endif +#ifdef CONFIG_RCU_FAST_NO_HZ + printk(KERN_INFO + "\tRCU dyntick-idle grace-period acceleration is enabled.\n"); +#endif +#ifdef CONFIG_PROVE_RCU + printk(KERN_INFO "\tRCU lockdep checking is enabled.\n"); +#endif +#ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE + printk(KERN_INFO "\tRCU torture testing starts during boot.\n"); +#endif +#ifndef CONFIG_RCU_CPU_STALL_DETECTOR + printk(KERN_INFO + "\tRCU-based detection of stalled CPUs is disabled.\n"); +#endif +#ifndef CONFIG_RCU_CPU_STALL_VERBOSE + printk(KERN_INFO "\tVerbose stalled-CPUs detection is disabled.\n"); +#endif +#if NUM_RCU_LVL_4 != 0 + printk(KERN_INFO "\tExperimental four-level hierarchy is enabled.\n"); +#endif +} + #ifdef CONFIG_TREE_PREEMPT_RCU struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state); @@ -38,8 +77,8 @@ static int rcu_preempted_readers_exp(struct rcu_node *rnp); */ static void __init rcu_bootup_announce(void) { - printk(KERN_INFO - "Experimental preemptable hierarchical RCU implementation.\n"); + printk(KERN_INFO "Preemptable hierarchical RCU implementation.\n"); + rcu_bootup_announce_oddness(); } /* @@ -75,13 +114,19 @@ EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); * that this just means that the task currently running on the CPU is * not in a quiescent state. There might be any number of tasks blocked * while in an RCU read-side critical section. + * + * Unlike the other rcu_*_qs() functions, callers to this function + * must disable irqs in order to protect the assignment to + * ->rcu_read_unlock_special. */ static void rcu_preempt_qs(int cpu) { struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); + rdp->passed_quiesc_completed = rdp->gpnum - 1; barrier(); rdp->passed_quiesc = 1; + current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; } /* @@ -144,9 +189,8 @@ static void rcu_preempt_note_context_switch(int cpu) * grace period, then the fact that the task has been enqueued * means that we continue to block the current grace period. */ - rcu_preempt_qs(cpu); local_irq_save(flags); - t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; + rcu_preempt_qs(cpu); local_irq_restore(flags); } @@ -236,7 +280,6 @@ static void rcu_read_unlock_special(struct task_struct *t) */ special = t->rcu_read_unlock_special; if (special & RCU_READ_UNLOCK_NEED_QS) { - t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; rcu_preempt_qs(smp_processor_id()); } @@ -473,7 +516,6 @@ static void rcu_preempt_check_callbacks(int cpu) struct task_struct *t = current; if (t->rcu_read_lock_nesting == 0) { - t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; rcu_preempt_qs(cpu); return; } @@ -515,11 +557,13 @@ void synchronize_rcu(void) if (!rcu_scheduler_active) return; + init_rcu_head_on_stack(&rcu.head); init_completion(&rcu.completion); /* Will wake me after RCU finished. */ call_rcu(&rcu.head, wakeme_after_rcu); /* Wait for it. */ wait_for_completion(&rcu.completion); + destroy_rcu_head_on_stack(&rcu.head); } EXPORT_SYMBOL_GPL(synchronize_rcu); @@ -754,6 +798,7 @@ void exit_rcu(void) static void __init rcu_bootup_announce(void) { printk(KERN_INFO "Hierarchical RCU implementation.\n"); + rcu_bootup_announce_oddness(); } /* @@ -1008,6 +1053,8 @@ static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff); int rcu_needs_cpu(int cpu) { int c = 0; + int snap; + int snap_nmi; int thatcpu; /* Check for being in the holdoff period. */ @@ -1015,12 +1062,18 @@ int rcu_needs_cpu(int cpu) return rcu_needs_cpu_quick_check(cpu); /* Don't bother unless we are the last non-dyntick-idle CPU. */ - for_each_cpu_not(thatcpu, nohz_cpu_mask) - if (thatcpu != cpu) { + for_each_online_cpu(thatcpu) { + if (thatcpu == cpu) + continue; + snap = per_cpu(rcu_dynticks, thatcpu).dynticks; + snap_nmi = per_cpu(rcu_dynticks, thatcpu).dynticks_nmi; + smp_mb(); /* Order sampling of snap with end of grace period. */ + if (((snap & 0x1) != 0) || ((snap_nmi & 0x1) != 0)) { per_cpu(rcu_dyntick_drain, cpu) = 0; per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; return rcu_needs_cpu_quick_check(cpu); } + } /* Check and update the rcu_dyntick_drain sequencing. */ if (per_cpu(rcu_dyntick_drain, cpu) <= 0) { diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index d45db2e..36c95b4 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -241,11 +241,13 @@ static const struct file_operations rcugp_fops = { static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) { seq_printf(m, "%3d%cnp=%ld " - "qsp=%ld cbr=%ld cng=%ld gpc=%ld gps=%ld nf=%ld nn=%ld\n", + "qsp=%ld rpq=%ld cbr=%ld cng=%ld " + "gpc=%ld gps=%ld nf=%ld nn=%ld\n", rdp->cpu, cpu_is_offline(rdp->cpu) ? '!' : ' ', rdp->n_rcu_pending, rdp->n_rp_qs_pending, + rdp->n_rp_report_qs, rdp->n_rp_cb_ready, rdp->n_rp_cpu_needs_gp, rdp->n_rp_gp_completed, diff --git a/kernel/relay.c b/kernel/relay.c index 3d97f28..c7cf397 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -539,7 +539,7 @@ static int __cpuinit relay_hotcpu_callback(struct notifier_block *nb, "relay_hotcpu_callback: cpu %d buffer " "creation failed\n", hotcpu); mutex_unlock(&relay_channels_mutex); - return NOTIFY_BAD; + return notifier_from_errno(-ENOMEM); } } mutex_unlock(&relay_channels_mutex); @@ -1231,8 +1231,8 @@ static ssize_t subbuf_splice_actor(struct file *in, size_t read_subbuf = read_start / subbuf_size; size_t padding = rbuf->padding[read_subbuf]; size_t nonpad_end = read_subbuf * subbuf_size + subbuf_size - padding; - struct page *pages[PIPE_BUFFERS]; - struct partial_page partial[PIPE_BUFFERS]; + struct page *pages[PIPE_DEF_BUFFERS]; + struct partial_page partial[PIPE_DEF_BUFFERS]; struct splice_pipe_desc spd = { .pages = pages, .nr_pages = 0, @@ -1245,6 +1245,8 @@ static ssize_t subbuf_splice_actor(struct file *in, if (rbuf->subbufs_produced == rbuf->subbufs_consumed) return 0; + if (splice_grow_spd(pipe, &spd)) + return -ENOMEM; /* * Adjust read len, if longer than what is available @@ -1255,7 +1257,7 @@ static ssize_t subbuf_splice_actor(struct file *in, subbuf_pages = rbuf->chan->alloc_size >> PAGE_SHIFT; pidx = (read_start / PAGE_SIZE) % subbuf_pages; poff = read_start & ~PAGE_MASK; - nr_pages = min_t(unsigned int, subbuf_pages, PIPE_BUFFERS); + nr_pages = min_t(unsigned int, subbuf_pages, pipe->buffers); for (total_len = 0; spd.nr_pages < nr_pages; spd.nr_pages++) { unsigned int this_len, this_end, private; @@ -1289,16 +1291,19 @@ static ssize_t subbuf_splice_actor(struct file *in, } } + ret = 0; if (!spd.nr_pages) - return 0; + goto out; ret = *nonpad_ret = splice_to_pipe(pipe, &spd); if (ret < 0 || ret < total_len) - return ret; + goto out; if (read_start + ret == nonpad_end) ret += padding; +out: + splice_shrink_spd(pipe, &spd); return ret; } diff --git a/kernel/res_counter.c b/kernel/res_counter.c index bcdabf3..c7eaa37 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c @@ -10,7 +10,6 @@ #include <linux/types.h> #include <linux/parser.h> #include <linux/fs.h> -#include <linux/slab.h> #include <linux/res_counter.h> #include <linux/uaccess.h> #include <linux/mm.h> diff --git a/kernel/resource.c b/kernel/resource.c index 2d5be5d..7b36976 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -15,6 +15,7 @@ #include <linux/spinlock.h> #include <linux/fs.h> #include <linux/proc_fs.h> +#include <linux/sched.h> #include <linux/seq_file.h> #include <linux/device.h> #include <linux/pfn.h> @@ -219,19 +220,34 @@ void release_child_resources(struct resource *r) } /** - * request_resource - request and reserve an I/O or memory resource + * request_resource_conflict - request and reserve an I/O or memory resource * @root: root resource descriptor * @new: resource descriptor desired by caller * - * Returns 0 for success, negative error code on error. + * Returns 0 for success, conflict resource on error. */ -int request_resource(struct resource *root, struct resource *new) +struct resource *request_resource_conflict(struct resource *root, struct resource *new) { struct resource *conflict; write_lock(&resource_lock); conflict = __request_resource(root, new); write_unlock(&resource_lock); + return conflict; +} + +/** + * request_resource - request and reserve an I/O or memory resource + * @root: root resource descriptor + * @new: resource descriptor desired by caller + * + * Returns 0 for success, negative error code on error. + */ +int request_resource(struct resource *root, struct resource *new) +{ + struct resource *conflict; + + conflict = request_resource_conflict(root, new); return conflict ? -EBUSY : 0; } @@ -474,25 +490,40 @@ static struct resource * __insert_resource(struct resource *parent, struct resou } /** - * insert_resource - Inserts a resource in the resource tree + * insert_resource_conflict - Inserts resource in the resource tree * @parent: parent of the new resource * @new: new resource to insert * - * Returns 0 on success, -EBUSY if the resource can't be inserted. + * Returns 0 on success, conflict resource if the resource can't be inserted. * - * This function is equivalent to request_resource when no conflict + * This function is equivalent to request_resource_conflict when no conflict * happens. If a conflict happens, and the conflicting resources * entirely fit within the range of the new resource, then the new * resource is inserted and the conflicting resources become children of * the new resource. */ -int insert_resource(struct resource *parent, struct resource *new) +struct resource *insert_resource_conflict(struct resource *parent, struct resource *new) { struct resource *conflict; write_lock(&resource_lock); conflict = __insert_resource(parent, new); write_unlock(&resource_lock); + return conflict; +} + +/** + * insert_resource - Inserts a resource in the resource tree + * @parent: parent of the new resource + * @new: new resource to insert + * + * Returns 0 on success, -EBUSY if the resource can't be inserted. + */ +int insert_resource(struct resource *parent, struct resource *new) +{ + struct resource *conflict; + + conflict = insert_resource_conflict(parent, new); return conflict ? -EBUSY : 0; } @@ -651,6 +682,8 @@ resource_size_t resource_alignment(struct resource *res) * release_region releases a matching busy region. */ +static DECLARE_WAIT_QUEUE_HEAD(muxed_resource_wait); + /** * __request_region - create a new busy resource region * @parent: parent resource descriptor @@ -663,6 +696,7 @@ struct resource * __request_region(struct resource *parent, resource_size_t start, resource_size_t n, const char *name, int flags) { + DECLARE_WAITQUEUE(wait, current); struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL); if (!res) @@ -687,7 +721,15 @@ struct resource * __request_region(struct resource *parent, if (!(conflict->flags & IORESOURCE_BUSY)) continue; } - + if (conflict->flags & flags & IORESOURCE_MUXED) { + add_wait_queue(&muxed_resource_wait, &wait); + write_unlock(&resource_lock); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule(); + remove_wait_queue(&muxed_resource_wait, &wait); + write_lock(&resource_lock); + continue; + } /* Uhhuh, that didn't work out.. */ kfree(res); res = NULL; @@ -761,6 +803,8 @@ void __release_region(struct resource *parent, resource_size_t start, break; *p = res->sibling; write_unlock(&resource_lock); + if (res->flags & IORESOURCE_MUXED) + wake_up(&muxed_resource_wait); kfree(res); return; } diff --git a/kernel/sched.c b/kernel/sched.c index 9ab3cd7..f52a880 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -55,9 +55,9 @@ #include <linux/cpu.h> #include <linux/cpuset.h> #include <linux/percpu.h> -#include <linux/kthread.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> +#include <linux/stop_machine.h> #include <linux/sysctl.h> #include <linux/syscalls.h> #include <linux/times.h> @@ -71,6 +71,7 @@ #include <linux/debugfs.h> #include <linux/ctype.h> #include <linux/ftrace.h> +#include <linux/slab.h> #include <asm/tlb.h> #include <asm/irq_regs.h> @@ -305,42 +306,6 @@ static int init_task_group_load = INIT_TASK_GROUP_LOAD; */ struct task_group init_task_group; -/* return group to which a task belongs */ -static inline struct task_group *task_group(struct task_struct *p) -{ - struct task_group *tg; - -#ifdef CONFIG_CGROUP_SCHED - tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id), - struct task_group, css); -#else - tg = &init_task_group; -#endif - return tg; -} - -/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ -static inline void set_task_rq(struct task_struct *p, unsigned int cpu) -{ -#ifdef CONFIG_FAIR_GROUP_SCHED - p->se.cfs_rq = task_group(p)->cfs_rq[cpu]; - p->se.parent = task_group(p)->se[cpu]; -#endif - -#ifdef CONFIG_RT_GROUP_SCHED - p->rt.rt_rq = task_group(p)->rt_rq[cpu]; - p->rt.parent = task_group(p)->rt_se[cpu]; -#endif -} - -#else - -static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } -static inline struct task_group *task_group(struct task_struct *p) -{ - return NULL; -} - #endif /* CONFIG_CGROUP_SCHED */ /* CFS-related fields in a runqueue */ @@ -492,8 +457,11 @@ struct rq { #define CPU_LOAD_IDX_MAX 5 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; #ifdef CONFIG_NO_HZ + u64 nohz_stamp; unsigned char in_nohz_recently; #endif + unsigned int skip_clock_update; + /* capture load from *all* tasks on this cpu: */ struct load_weight load; unsigned long nr_load_updates; @@ -530,20 +498,20 @@ struct rq { struct root_domain *rd; struct sched_domain *sd; + unsigned long cpu_power; + unsigned char idle_at_tick; /* For active balancing */ int post_schedule; int active_balance; int push_cpu; + struct cpu_stop_work active_balance_work; /* cpu of this runqueue: */ int cpu; int online; unsigned long avg_load_per_task; - struct task_struct *migration_thread; - struct list_head migration_queue; - u64 rt_avg; u64 age_stamp; u64 idle_stamp; @@ -591,6 +559,13 @@ static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) { rq->curr->sched_class->check_preempt_curr(rq, p, flags); + + /* + * A queue event has occurred, and we're going to schedule. In + * this case, we can save a useless back to back clock update. + */ + if (test_tsk_need_resched(p)) + rq->skip_clock_update = 1; } static inline int cpu_of(struct rq *rq) @@ -623,9 +598,53 @@ static inline int cpu_of(struct rq *rq) #define cpu_curr(cpu) (cpu_rq(cpu)->curr) #define raw_rq() (&__raw_get_cpu_var(runqueues)) +#ifdef CONFIG_CGROUP_SCHED + +/* + * Return the group to which this tasks belongs. + * + * We use task_subsys_state_check() and extend the RCU verification + * with lockdep_is_held(&task_rq(p)->lock) because cpu_cgroup_attach() + * holds that lock for each task it moves into the cgroup. Therefore + * by holding that lock, we pin the task to the current cgroup. + */ +static inline struct task_group *task_group(struct task_struct *p) +{ + struct cgroup_subsys_state *css; + + css = task_subsys_state_check(p, cpu_cgroup_subsys_id, + lockdep_is_held(&task_rq(p)->lock)); + return container_of(css, struct task_group, css); +} + +/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ +static inline void set_task_rq(struct task_struct *p, unsigned int cpu) +{ +#ifdef CONFIG_FAIR_GROUP_SCHED + p->se.cfs_rq = task_group(p)->cfs_rq[cpu]; + p->se.parent = task_group(p)->se[cpu]; +#endif + +#ifdef CONFIG_RT_GROUP_SCHED + p->rt.rt_rq = task_group(p)->rt_rq[cpu]; + p->rt.parent = task_group(p)->rt_se[cpu]; +#endif +} + +#else /* CONFIG_CGROUP_SCHED */ + +static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } +static inline struct task_group *task_group(struct task_struct *p) +{ + return NULL; +} + +#endif /* CONFIG_CGROUP_SCHED */ + inline void update_rq_clock(struct rq *rq) { - rq->clock = sched_clock_cpu(cpu_of(rq)); + if (!rq->skip_clock_update) + rq->clock = sched_clock_cpu(cpu_of(rq)); } /* @@ -903,16 +922,12 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ /* - * Check whether the task is waking, we use this to synchronize against - * ttwu() so that task_cpu() reports a stable number. - * - * We need to make an exception for PF_STARTING tasks because the fork - * path might require task_rq_lock() to work, eg. it can call - * set_cpus_allowed_ptr() from the cpuset clone_ns code. + * Check whether the task is waking, we use this to synchronize ->cpus_allowed + * against ttwu(). */ static inline int task_is_waking(struct task_struct *p) { - return unlikely((p->state == TASK_WAKING) && !(p->flags & PF_STARTING)); + return unlikely(p->state == TASK_WAKING); } /* @@ -925,11 +940,9 @@ static inline struct rq *__task_rq_lock(struct task_struct *p) struct rq *rq; for (;;) { - while (task_is_waking(p)) - cpu_relax(); rq = task_rq(p); raw_spin_lock(&rq->lock); - if (likely(rq == task_rq(p) && !task_is_waking(p))) + if (likely(rq == task_rq(p))) return rq; raw_spin_unlock(&rq->lock); } @@ -946,25 +959,15 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) struct rq *rq; for (;;) { - while (task_is_waking(p)) - cpu_relax(); local_irq_save(*flags); rq = task_rq(p); raw_spin_lock(&rq->lock); - if (likely(rq == task_rq(p) && !task_is_waking(p))) + if (likely(rq == task_rq(p))) return rq; raw_spin_unlock_irqrestore(&rq->lock, *flags); } } -void task_rq_unlock_wait(struct task_struct *p) -{ - struct rq *rq = task_rq(p); - - smp_mb(); /* spin-unlock-wait is not a full memory barrier */ - raw_spin_unlock_wait(&rq->lock); -} - static void __task_rq_unlock(struct rq *rq) __releases(rq->lock) { @@ -1228,6 +1231,17 @@ void wake_up_idle_cpu(int cpu) if (!tsk_is_polling(rq->idle)) smp_send_reschedule(cpu); } + +int nohz_ratelimit(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + u64 diff = rq->clock - rq->nohz_stamp; + + rq->nohz_stamp = rq->clock; + + return diff < (NSEC_PER_SEC / HZ) >> 1; +} + #endif /* CONFIG_NO_HZ */ static u64 sched_avg_period(void) @@ -1240,6 +1254,12 @@ static void sched_avg_update(struct rq *rq) s64 period = sched_avg_period(); while ((s64)(rq->clock - rq->age_stamp) > period) { + /* + * Inline assembly required to prevent the compiler + * optimising this loop into a divmod call. + * See __iter_div_u64_rem() for another example of this. + */ + asm("" : "+rm" (rq->age_stamp)); rq->age_stamp += period; rq->rt_avg /= 2; } @@ -1484,24 +1504,9 @@ static unsigned long target_load(int cpu, int type) return max(rq->cpu_load[type-1], total); } -static struct sched_group *group_of(int cpu) -{ - struct sched_domain *sd = rcu_dereference_sched(cpu_rq(cpu)->sd); - - if (!sd) - return NULL; - - return sd->groups; -} - static unsigned long power_of(int cpu) { - struct sched_group *group = group_of(cpu); - - if (!group) - return SCHED_LOAD_SCALE; - - return group->cpu_power; + return cpu_rq(cpu)->cpu_power; } static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); @@ -1658,9 +1663,6 @@ static void update_shares(struct sched_domain *sd) static void update_h_load(long cpu) { - if (root_task_group_empty()) - return; - walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); } @@ -1770,8 +1772,6 @@ static void double_rq_lock(struct rq *rq1, struct rq *rq2) raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); } } - update_rq_clock(rq1); - update_rq_clock(rq2); } /* @@ -1802,7 +1802,7 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) } #endif -static void calc_load_account_active(struct rq *this_rq); +static void calc_load_account_idle(struct rq *this_rq); static void update_sysctl(void); static int get_update_sysctl_factor(void); @@ -1841,8 +1841,8 @@ static void dec_nr_running(struct rq *rq) static void set_load_weight(struct task_struct *p) { if (task_has_rt_policy(p)) { - p->se.load.weight = prio_to_weight[0] * 2; - p->se.load.inv_weight = prio_to_wmult[0] >> 1; + p->se.load.weight = 0; + p->se.load.inv_weight = WMULT_CONST; return; } @@ -1859,62 +1859,43 @@ static void set_load_weight(struct task_struct *p) p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; } -static void update_avg(u64 *avg, u64 sample) -{ - s64 diff = sample - *avg; - *avg += diff >> 3; -} - -static void -enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, bool head) +static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) { - if (wakeup) - p->se.start_runtime = p->se.sum_exec_runtime; - + update_rq_clock(rq); sched_info_queued(p); - p->sched_class->enqueue_task(rq, p, wakeup, head); + p->sched_class->enqueue_task(rq, p, flags); p->se.on_rq = 1; } -static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep) +static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) { - if (sleep) { - if (p->se.last_wakeup) { - update_avg(&p->se.avg_overlap, - p->se.sum_exec_runtime - p->se.last_wakeup); - p->se.last_wakeup = 0; - } else { - update_avg(&p->se.avg_wakeup, - sysctl_sched_wakeup_granularity); - } - } - + update_rq_clock(rq); sched_info_dequeued(p); - p->sched_class->dequeue_task(rq, p, sleep); + p->sched_class->dequeue_task(rq, p, flags); p->se.on_rq = 0; } /* * activate_task - move a task to the runqueue. */ -static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) +static void activate_task(struct rq *rq, struct task_struct *p, int flags) { if (task_contributes_to_load(p)) rq->nr_uninterruptible--; - enqueue_task(rq, p, wakeup, false); + enqueue_task(rq, p, flags); inc_nr_running(rq); } /* * deactivate_task - remove a task from the runqueue. */ -static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) +static void deactivate_task(struct rq *rq, struct task_struct *p, int flags) { if (task_contributes_to_load(p)) rq->nr_uninterruptible++; - dequeue_task(rq, p, sleep); + dequeue_task(rq, p, flags); dec_nr_running(rq); } @@ -2043,21 +2024,18 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) __set_task_cpu(p, new_cpu); } -struct migration_req { - struct list_head list; - +struct migration_arg { struct task_struct *task; int dest_cpu; - - struct completion done; }; +static int migration_cpu_stop(void *data); + /* * The task's runqueue lock must be held. * Returns true if you have to wait for migration thread. */ -static int -migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req) +static bool migrate_task(struct task_struct *p, int dest_cpu) { struct rq *rq = task_rq(p); @@ -2065,58 +2043,7 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req) * If the task is not on a runqueue (and not running), then * the next wake-up will properly place the task. */ - if (!p->se.on_rq && !task_running(rq, p)) - return 0; - - init_completion(&req->done); - req->task = p; - req->dest_cpu = dest_cpu; - list_add(&req->list, &rq->migration_queue); - - return 1; -} - -/* - * wait_task_context_switch - wait for a thread to complete at least one - * context switch. - * - * @p must not be current. - */ -void wait_task_context_switch(struct task_struct *p) -{ - unsigned long nvcsw, nivcsw, flags; - int running; - struct rq *rq; - - nvcsw = p->nvcsw; - nivcsw = p->nivcsw; - for (;;) { - /* - * The runqueue is assigned before the actual context - * switch. We need to take the runqueue lock. - * - * We could check initially without the lock but it is - * very likely that we need to take the lock in every - * iteration. - */ - rq = task_rq_lock(p, &flags); - running = task_running(rq, p); - task_rq_unlock(rq, &flags); - - if (likely(!running)) - break; - /* - * The switch count is incremented before the actual - * context switch. We thus wait for two switches to be - * sure at least one completed. - */ - if ((p->nvcsw - nvcsw) > 1) - break; - if ((p->nivcsw - nivcsw) > 1) - break; - - cpu_relax(); - } + return p->se.on_rq || task_running(rq, p); } /* @@ -2174,7 +2101,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) * just go back and repeat. */ rq = task_rq_lock(p, &flags); - trace_sched_wait_task(rq, p); + trace_sched_wait_task(p); running = task_running(rq, p); on_rq = p->se.on_rq; ncsw = 0; @@ -2272,6 +2199,9 @@ void task_oncpu_function_call(struct task_struct *p, } #ifdef CONFIG_SMP +/* + * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held. + */ static int select_fallback_rq(int cpu, struct task_struct *p) { int dest_cpu; @@ -2288,12 +2218,8 @@ static int select_fallback_rq(int cpu, struct task_struct *p) return dest_cpu; /* No more Mr. Nice Guy. */ - if (dest_cpu >= nr_cpu_ids) { - rcu_read_lock(); - cpuset_cpus_allowed_locked(p, &p->cpus_allowed); - rcu_read_unlock(); - dest_cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed); - + if (unlikely(dest_cpu >= nr_cpu_ids)) { + dest_cpu = cpuset_cpus_allowed_fallback(p); /* * Don't tell them about moving exiting tasks or * kernel threads (both mm NULL), since they never @@ -2310,17 +2236,12 @@ static int select_fallback_rq(int cpu, struct task_struct *p) } /* - * Gets called from 3 sites (exec, fork, wakeup), since it is called without - * holding rq->lock we need to ensure ->cpus_allowed is stable, this is done - * by: - * - * exec: is unstable, retry loop - * fork & wake-up: serialize ->cpus_allowed against TASK_WAKING + * The caller (fork, wakeup) owns TASK_WAKING, ->cpus_allowed is stable. */ static inline -int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) +int select_task_rq(struct rq *rq, struct task_struct *p, int sd_flags, int wake_flags) { - int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags); + int cpu = p->sched_class->select_task_rq(rq, p, sd_flags, wake_flags); /* * In order not to call set_task_cpu() on a blocking task we need @@ -2338,6 +2259,12 @@ int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) return cpu; } + +static void update_avg(u64 *avg, u64 sample) +{ + s64 diff = sample - *avg; + *avg += diff >> 3; +} #endif /*** @@ -2359,16 +2286,13 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, { int cpu, orig_cpu, this_cpu, success = 0; unsigned long flags; + unsigned long en_flags = ENQUEUE_WAKEUP; struct rq *rq; - if (!sched_feat(SYNC_WAKEUPS)) - wake_flags &= ~WF_SYNC; - this_cpu = get_cpu(); smp_wmb(); rq = task_rq_lock(p, &flags); - update_rq_clock(rq); if (!(p->state & state)) goto out; @@ -2388,28 +2312,26 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, * * First fix up the nr_uninterruptible count: */ - if (task_contributes_to_load(p)) - rq->nr_uninterruptible--; + if (task_contributes_to_load(p)) { + if (likely(cpu_online(orig_cpu))) + rq->nr_uninterruptible--; + else + this_rq()->nr_uninterruptible--; + } p->state = TASK_WAKING; - if (p->sched_class->task_waking) + if (p->sched_class->task_waking) { p->sched_class->task_waking(rq, p); + en_flags |= ENQUEUE_WAKING; + } - __task_rq_unlock(rq); - - cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); - if (cpu != orig_cpu) { - /* - * Since we migrate the task without holding any rq->lock, - * we need to be careful with task_rq_lock(), since that - * might end up locking an invalid rq. - */ + cpu = select_task_rq(rq, p, SD_BALANCE_WAKE, wake_flags); + if (cpu != orig_cpu) set_task_cpu(p, cpu); - } + __task_rq_unlock(rq); rq = cpu_rq(cpu); raw_spin_lock(&rq->lock); - update_rq_clock(rq); /* * We migrated the task without holding either rq->lock, however @@ -2437,36 +2359,20 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, out_activate: #endif /* CONFIG_SMP */ - schedstat_inc(p, se.nr_wakeups); + schedstat_inc(p, se.statistics.nr_wakeups); if (wake_flags & WF_SYNC) - schedstat_inc(p, se.nr_wakeups_sync); + schedstat_inc(p, se.statistics.nr_wakeups_sync); if (orig_cpu != cpu) - schedstat_inc(p, se.nr_wakeups_migrate); + schedstat_inc(p, se.statistics.nr_wakeups_migrate); if (cpu == this_cpu) - schedstat_inc(p, se.nr_wakeups_local); + schedstat_inc(p, se.statistics.nr_wakeups_local); else - schedstat_inc(p, se.nr_wakeups_remote); - activate_task(rq, p, 1); + schedstat_inc(p, se.statistics.nr_wakeups_remote); + activate_task(rq, p, en_flags); success = 1; - /* - * Only attribute actual wakeups done by this task. - */ - if (!in_interrupt()) { - struct sched_entity *se = ¤t->se; - u64 sample = se->sum_exec_runtime; - - if (se->last_wakeup) - sample -= se->last_wakeup; - else - sample -= se->start_runtime; - update_avg(&se->avg_wakeup, sample); - - se->last_wakeup = se->sum_exec_runtime; - } - out_running: - trace_sched_wakeup(rq, p, success); + trace_sched_wakeup(p, success); check_preempt_curr(rq, p, wake_flags); p->state = TASK_RUNNING; @@ -2526,42 +2432,9 @@ static void __sched_fork(struct task_struct *p) p->se.sum_exec_runtime = 0; p->se.prev_sum_exec_runtime = 0; p->se.nr_migrations = 0; - p->se.last_wakeup = 0; - p->se.avg_overlap = 0; - p->se.start_runtime = 0; - p->se.avg_wakeup = sysctl_sched_wakeup_granularity; #ifdef CONFIG_SCHEDSTATS - p->se.wait_start = 0; - p->se.wait_max = 0; - p->se.wait_count = 0; - p->se.wait_sum = 0; - - p->se.sleep_start = 0; - p->se.sleep_max = 0; - p->se.sum_sleep_runtime = 0; - - p->se.block_start = 0; - p->se.block_max = 0; - p->se.exec_max = 0; - p->se.slice_max = 0; - - p->se.nr_migrations_cold = 0; - p->se.nr_failed_migrations_affine = 0; - p->se.nr_failed_migrations_running = 0; - p->se.nr_failed_migrations_hot = 0; - p->se.nr_forced_migrations = 0; - - p->se.nr_wakeups = 0; - p->se.nr_wakeups_sync = 0; - p->se.nr_wakeups_migrate = 0; - p->se.nr_wakeups_local = 0; - p->se.nr_wakeups_remote = 0; - p->se.nr_wakeups_affine = 0; - p->se.nr_wakeups_affine_attempts = 0; - p->se.nr_wakeups_passive = 0; - p->se.nr_wakeups_idle = 0; - + memset(&p->se.statistics, 0, sizeof(p->se.statistics)); #endif INIT_LIST_HEAD(&p->rt.run_list); @@ -2582,11 +2455,11 @@ void sched_fork(struct task_struct *p, int clone_flags) __sched_fork(p); /* - * We mark the process as waking here. This guarantees that + * We mark the process as running here. This guarantees that * nobody will actually run it, and a signal or other external * event cannot wake it up and insert it on the runqueue either. */ - p->state = TASK_WAKING; + p->state = TASK_RUNNING; /* * Revert to default priority/policy on fork if requested. @@ -2621,7 +2494,16 @@ void sched_fork(struct task_struct *p, int clone_flags) if (p->sched_class->task_fork) p->sched_class->task_fork(p); + /* + * The child is not yet in the pid-hash so no cgroup attach races, + * and the cgroup is pinned to this child due to cgroup_fork() + * is ran before sched_fork(). + * + * Silence PROVE_RCU. + */ + rcu_read_lock(); set_task_cpu(p, cpu); + rcu_read_unlock(); #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) if (likely(sched_info_on())) @@ -2650,34 +2532,30 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) { unsigned long flags; struct rq *rq; - int cpu = get_cpu(); + int cpu __maybe_unused = get_cpu(); #ifdef CONFIG_SMP + rq = task_rq_lock(p, &flags); + p->state = TASK_WAKING; + /* * Fork balancing, do it here and not earlier because: * - cpus_allowed can change in the fork path * - any previously selected cpu might disappear through hotplug * - * We still have TASK_WAKING but PF_STARTING is gone now, meaning - * ->cpus_allowed is stable, we have preemption disabled, meaning - * cpu_online_mask is stable. + * We set TASK_WAKING so that select_task_rq() can drop rq->lock + * without people poking at ->cpus_allowed. */ - cpu = select_task_rq(p, SD_BALANCE_FORK, 0); + cpu = select_task_rq(rq, p, SD_BALANCE_FORK, 0); set_task_cpu(p, cpu); -#endif - /* - * Since the task is not on the rq and we still have TASK_WAKING set - * nobody else will migrate this task. - */ - rq = cpu_rq(cpu); - raw_spin_lock_irqsave(&rq->lock, flags); - - BUG_ON(p->state != TASK_WAKING); p->state = TASK_RUNNING; - update_rq_clock(rq); + task_rq_unlock(rq, &flags); +#endif + + rq = task_rq_lock(p, &flags); activate_task(rq, p, 0); - trace_sched_wakeup_new(rq, p, 1); + trace_sched_wakeup_new(p, 1); check_preempt_curr(rq, p, WF_FORK); #ifdef CONFIG_SMP if (p->sched_class->task_woken) @@ -2897,7 +2775,7 @@ context_switch(struct rq *rq, struct task_struct *prev, struct mm_struct *mm, *oldmm; prepare_task_switch(rq, prev, next); - trace_sched_switch(rq, prev, next); + trace_sched_switch(prev, next); mm = next->mm; oldmm = prev->active_mm; /* @@ -2995,9 +2873,9 @@ unsigned long nr_iowait(void) return sum; } -unsigned long nr_iowait_cpu(void) +unsigned long nr_iowait_cpu(int cpu) { - struct rq *this = this_rq(); + struct rq *this = cpu_rq(cpu); return atomic_read(&this->nr_iowait); } @@ -3014,6 +2892,61 @@ static unsigned long calc_load_update; unsigned long avenrun[3]; EXPORT_SYMBOL(avenrun); +static long calc_load_fold_active(struct rq *this_rq) +{ + long nr_active, delta = 0; + + nr_active = this_rq->nr_running; + nr_active += (long) this_rq->nr_uninterruptible; + + if (nr_active != this_rq->calc_load_active) { + delta = nr_active - this_rq->calc_load_active; + this_rq->calc_load_active = nr_active; + } + + return delta; +} + +#ifdef CONFIG_NO_HZ +/* + * For NO_HZ we delay the active fold to the next LOAD_FREQ update. + * + * When making the ILB scale, we should try to pull this in as well. + */ +static atomic_long_t calc_load_tasks_idle; + +static void calc_load_account_idle(struct rq *this_rq) +{ + long delta; + + delta = calc_load_fold_active(this_rq); + if (delta) + atomic_long_add(delta, &calc_load_tasks_idle); +} + +static long calc_load_fold_idle(void) +{ + long delta = 0; + + /* + * Its got a race, we don't care... + */ + if (atomic_long_read(&calc_load_tasks_idle)) + delta = atomic_long_xchg(&calc_load_tasks_idle, 0); + + return delta; +} +#else +static void calc_load_account_idle(struct rq *this_rq) +{ +} + +static inline long calc_load_fold_idle(void) +{ + return 0; +} +#endif + /** * get_avenrun - get the load average array * @loads: pointer to dest load array @@ -3060,20 +2993,22 @@ void calc_global_load(void) } /* - * Either called from update_cpu_load() or from a cpu going idle + * Called from update_cpu_load() to periodically update this CPU's + * active count. */ static void calc_load_account_active(struct rq *this_rq) { - long nr_active, delta; + long delta; - nr_active = this_rq->nr_running; - nr_active += (long) this_rq->nr_uninterruptible; + if (time_before(jiffies, this_rq->calc_load_update)) + return; - if (nr_active != this_rq->calc_load_active) { - delta = nr_active - this_rq->calc_load_active; - this_rq->calc_load_active = nr_active; + delta = calc_load_fold_active(this_rq); + delta += calc_load_fold_idle(); + if (delta) atomic_long_add(delta, &calc_load_tasks); - } + + this_rq->calc_load_update += LOAD_FREQ; } /* @@ -3105,10 +3040,7 @@ static void update_cpu_load(struct rq *this_rq) this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; } - if (time_after_eq(jiffies, this_rq->calc_load_update)) { - this_rq->calc_load_update += LOAD_FREQ; - calc_load_account_active(this_rq); - } + calc_load_account_active(this_rq); } #ifdef CONFIG_SMP @@ -3120,44 +3052,27 @@ static void update_cpu_load(struct rq *this_rq) void sched_exec(void) { struct task_struct *p = current; - struct migration_req req; - int dest_cpu, this_cpu; unsigned long flags; struct rq *rq; - -again: - this_cpu = get_cpu(); - dest_cpu = select_task_rq(p, SD_BALANCE_EXEC, 0); - if (dest_cpu == this_cpu) { - put_cpu(); - return; - } + int dest_cpu; rq = task_rq_lock(p, &flags); - put_cpu(); + dest_cpu = p->sched_class->select_task_rq(rq, p, SD_BALANCE_EXEC, 0); + if (dest_cpu == smp_processor_id()) + goto unlock; /* * select_task_rq() can race against ->cpus_allowed */ - if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed) - || unlikely(!cpu_active(dest_cpu))) { - task_rq_unlock(rq, &flags); - goto again; - } - - /* force the process onto the specified CPU */ - if (migrate_task(p, dest_cpu, &req)) { - /* Need to wait for migration thread (might exit: take ref). */ - struct task_struct *mt = rq->migration_thread; + if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) && + likely(cpu_active(dest_cpu)) && migrate_task(p, dest_cpu)) { + struct migration_arg arg = { p, dest_cpu }; - get_task_struct(mt); task_rq_unlock(rq, &flags); - wake_up_process(mt); - put_task_struct(mt); - wait_for_completion(&req.done); - + stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); return; } +unlock: task_rq_unlock(rq, &flags); } @@ -3629,23 +3544,9 @@ static inline void schedule_debug(struct task_struct *prev) static void put_prev_task(struct rq *rq, struct task_struct *prev) { - if (prev->state == TASK_RUNNING) { - u64 runtime = prev->se.sum_exec_runtime; - - runtime -= prev->se.prev_sum_exec_runtime; - runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost); - - /* - * In order to avoid avg_overlap growing stale when we are - * indeed overlapping and hence not getting put to sleep, grow - * the avg_overlap on preemption. - * - * We use the average preemption runtime because that - * correlates to the amount of cache footprint a task can - * build up. - */ - update_avg(&prev->se.avg_overlap, runtime); - } + if (prev->se.on_rq) + update_rq_clock(rq); + rq->skip_clock_update = 0; prev->sched_class->put_prev_task(rq, prev); } @@ -3695,7 +3596,7 @@ need_resched: preempt_disable(); cpu = smp_processor_id(); rq = cpu_rq(cpu); - rcu_sched_qs(cpu); + rcu_note_context_switch(cpu); prev = rq->curr; switch_count = &prev->nivcsw; @@ -3708,14 +3609,13 @@ need_resched_nonpreemptible: hrtick_clear(rq); raw_spin_lock_irq(&rq->lock); - update_rq_clock(rq); clear_tsk_need_resched(prev); if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { if (unlikely(signal_pending_state(prev->state, prev))) prev->state = TASK_RUNNING; else - deactivate_task(rq, prev, 1); + deactivate_task(rq, prev, DEQUEUE_SLEEP); switch_count = &prev->nvcsw; } @@ -3779,7 +3679,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) * the mutex owner just released it and exited. */ if (probe_kernel_address(&owner->cpu, cpu)) - goto out; + return 0; #else cpu = owner->cpu; #endif @@ -3789,14 +3689,14 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) * the cpu field may no longer be valid. */ if (cpu >= nr_cpumask_bits) - goto out; + return 0; /* * We need to validate that we can do a * get_cpu() and that we have the percpu area. */ if (!cpu_online(cpu)) - goto out; + return 0; rq = cpu_rq(cpu); @@ -3815,7 +3715,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) cpu_relax(); } -out: + return 1; } #endif @@ -3939,6 +3839,7 @@ void __wake_up_locked(wait_queue_head_t *q, unsigned int mode) { __wake_up_common(q, mode, 1, 0, NULL); } +EXPORT_SYMBOL_GPL(__wake_up_locked); void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) { @@ -4038,8 +3939,7 @@ do_wait_for_common(struct completion *x, long timeout, int state) if (!x->done) { DECLARE_WAITQUEUE(wait, current); - wait.flags |= WQ_FLAG_EXCLUSIVE; - __add_wait_queue_tail(&x->wait, &wait); + __add_wait_queue_tail_exclusive(&x->wait, &wait); do { if (signal_pending_state(state, current)) { timeout = -ERESTARTSYS; @@ -4150,6 +4050,23 @@ int __sched wait_for_completion_killable(struct completion *x) EXPORT_SYMBOL(wait_for_completion_killable); /** + * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable)) + * @x: holds the state of this particular completion + * @timeout: timeout value in jiffies + * + * This waits for either a completion of a specific task to be + * signaled or for a specified timeout to expire. It can be + * interrupted by a kill signal. The timeout is in jiffies. + */ +unsigned long __sched +wait_for_completion_killable_timeout(struct completion *x, + unsigned long timeout) +{ + return wait_for_common(x, timeout, TASK_KILLABLE); +} +EXPORT_SYMBOL(wait_for_completion_killable_timeout); + +/** * try_wait_for_completion - try to decrement a completion without blocking * @x: completion structure * @@ -4265,7 +4182,6 @@ void rt_mutex_setprio(struct task_struct *p, int prio) BUG_ON(prio < 0 || prio > MAX_PRIO); rq = task_rq_lock(p, &flags); - update_rq_clock(rq); oldprio = p->prio; prev_class = p->sched_class; @@ -4286,7 +4202,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) if (running) p->sched_class->set_curr_task(rq); if (on_rq) { - enqueue_task(rq, p, 0, oldprio < prio); + enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); check_class_changed(rq, p, prev_class, oldprio, running); } @@ -4308,7 +4224,6 @@ void set_user_nice(struct task_struct *p, long nice) * the task might be in the middle of scheduling on another CPU. */ rq = task_rq_lock(p, &flags); - update_rq_clock(rq); /* * The RT priorities are set via sched_setscheduler(), but we still * allow the 'normal' nice value to be set - but as expected @@ -4330,7 +4245,7 @@ void set_user_nice(struct task_struct *p, long nice) delta = p->prio - old_prio; if (on_rq) { - enqueue_task(rq, p, 0, false); + enqueue_task(rq, p, 0); /* * If the task increased its priority or is running and * lowered its priority, then reschedule its CPU: @@ -4559,16 +4474,6 @@ recheck: } if (user) { -#ifdef CONFIG_RT_GROUP_SCHED - /* - * Do not allow realtime tasks into groups that have no runtime - * assigned. - */ - if (rt_bandwidth_enabled() && rt_policy(policy) && - task_group(p)->rt_bandwidth.rt_runtime == 0) - return -EPERM; -#endif - retval = security_task_setscheduler(p, policy, param); if (retval) return retval; @@ -4584,6 +4489,22 @@ recheck: * runqueue lock must be held. */ rq = __task_rq_lock(p); + +#ifdef CONFIG_RT_GROUP_SCHED + if (user) { + /* + * Do not allow realtime tasks into groups that have no runtime + * assigned. + */ + if (rt_bandwidth_enabled() && rt_policy(policy) && + task_group(p)->rt_bandwidth.rt_runtime == 0) { + __task_rq_unlock(rq); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); + return -EPERM; + } + } +#endif + /* recheck policy now with rq lock held */ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { policy = oldpolicy = -1; @@ -4591,7 +4512,6 @@ recheck: raw_spin_unlock_irqrestore(&p->pi_lock, flags); goto recheck; } - update_rq_clock(rq); on_rq = p->se.on_rq; running = task_current(rq, p); if (on_rq) @@ -4902,7 +4822,9 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, int ret; cpumask_var_t mask; - if (len < cpumask_size()) + if ((len * BITS_PER_BYTE) < nr_cpu_ids) + return -EINVAL; + if (len & (sizeof(unsigned long)-1)) return -EINVAL; if (!alloc_cpumask_var(&mask, GFP_KERNEL)) @@ -4910,10 +4832,12 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, ret = sched_getaffinity(pid, mask); if (ret == 0) { - if (copy_to_user(user_mask_ptr, mask, cpumask_size())) + size_t retlen = min_t(size_t, len, cpumask_size()); + + if (copy_to_user(user_mask_ptr, mask, retlen)) ret = -EFAULT; else - ret = cpumask_size(); + ret = retlen; } free_cpumask_var(mask); @@ -5324,17 +5248,15 @@ static inline void sched_init_granularity(void) /* * This is how migration works: * - * 1) we queue a struct migration_req structure in the source CPU's - * runqueue and wake up that CPU's migration thread. - * 2) we down() the locked semaphore => thread blocks. - * 3) migration thread wakes up (implicitly it forces the migrated - * thread off the CPU) - * 4) it gets the migration request and checks whether the migrated - * task is still in the wrong runqueue. - * 5) if it's in the wrong runqueue then the migration thread removes + * 1) we invoke migration_cpu_stop() on the target CPU using + * stop_one_cpu(). + * 2) stopper starts to run (implicitly forcing the migrated thread + * off the CPU) + * 3) it checks whether the migrated task is still in the wrong runqueue. + * 4) if it's in the wrong runqueue then the migration thread removes * it and puts it into the right queue. - * 6) migration thread up()s the semaphore. - * 7) we wake up and the migration is done. + * 5) stopper completes and stop_one_cpu() returns and the migration + * is done. */ /* @@ -5348,12 +5270,23 @@ static inline void sched_init_granularity(void) */ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) { - struct migration_req req; unsigned long flags; struct rq *rq; + unsigned int dest_cpu; int ret = 0; + /* + * Serialize against TASK_WAKING so that ttwu() and wunt() can + * drop the rq->lock and still rely on ->cpus_allowed. + */ +again: + while (task_is_waking(p)) + cpu_relax(); rq = task_rq_lock(p, &flags); + if (task_is_waking(p)) { + task_rq_unlock(rq, &flags); + goto again; + } if (!cpumask_intersects(new_mask, cpu_active_mask)) { ret = -EINVAL; @@ -5377,15 +5310,12 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) if (cpumask_test_cpu(task_cpu(p), new_mask)) goto out; - if (migrate_task(p, cpumask_any_and(cpu_active_mask, new_mask), &req)) { + dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); + if (migrate_task(p, dest_cpu)) { + struct migration_arg arg = { p, dest_cpu }; /* Need help from migration thread: drop lock and wait. */ - struct task_struct *mt = rq->migration_thread; - - get_task_struct(mt); task_rq_unlock(rq, &flags); - wake_up_process(rq->migration_thread); - put_task_struct(mt); - wait_for_completion(&req.done); + stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); tlb_migrate_finish(p->mm); return 0; } @@ -5443,98 +5373,49 @@ fail: return ret; } -#define RCU_MIGRATION_IDLE 0 -#define RCU_MIGRATION_NEED_QS 1 -#define RCU_MIGRATION_GOT_QS 2 -#define RCU_MIGRATION_MUST_SYNC 3 - /* - * migration_thread - this is a highprio system thread that performs - * thread migration by bumping thread off CPU then 'pushing' onto - * another runqueue. + * migration_cpu_stop - this will be executed by a highprio stopper thread + * and performs thread migration by bumping thread off CPU then + * 'pushing' onto another runqueue. */ -static int migration_thread(void *data) +static int migration_cpu_stop(void *data) { - int badcpu; - int cpu = (long)data; - struct rq *rq; - - rq = cpu_rq(cpu); - BUG_ON(rq->migration_thread != current); - - set_current_state(TASK_INTERRUPTIBLE); - while (!kthread_should_stop()) { - struct migration_req *req; - struct list_head *head; - - raw_spin_lock_irq(&rq->lock); - - if (cpu_is_offline(cpu)) { - raw_spin_unlock_irq(&rq->lock); - break; - } - - if (rq->active_balance) { - active_load_balance(rq, cpu); - rq->active_balance = 0; - } - - head = &rq->migration_queue; - - if (list_empty(head)) { - raw_spin_unlock_irq(&rq->lock); - schedule(); - set_current_state(TASK_INTERRUPTIBLE); - continue; - } - req = list_entry(head->next, struct migration_req, list); - list_del_init(head->next); - - if (req->task != NULL) { - raw_spin_unlock(&rq->lock); - __migrate_task(req->task, cpu, req->dest_cpu); - } else if (likely(cpu == (badcpu = smp_processor_id()))) { - req->dest_cpu = RCU_MIGRATION_GOT_QS; - raw_spin_unlock(&rq->lock); - } else { - req->dest_cpu = RCU_MIGRATION_MUST_SYNC; - raw_spin_unlock(&rq->lock); - WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n", badcpu, cpu); - } - local_irq_enable(); - - complete(&req->done); - } - __set_current_state(TASK_RUNNING); - - return 0; -} - -#ifdef CONFIG_HOTPLUG_CPU - -static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu) -{ - int ret; + struct migration_arg *arg = data; + /* + * The original target cpu might have gone down and we might + * be on another cpu but it doesn't matter. + */ local_irq_disable(); - ret = __migrate_task(p, src_cpu, dest_cpu); + __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu); local_irq_enable(); - return ret; + return 0; } +#ifdef CONFIG_HOTPLUG_CPU /* * Figure out where task on dead CPU should go, use force if necessary. */ -static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) +void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) { - int dest_cpu; + struct rq *rq = cpu_rq(dead_cpu); + int needs_cpu, uninitialized_var(dest_cpu); + unsigned long flags; -again: - dest_cpu = select_fallback_rq(dead_cpu, p); + local_irq_save(flags); - /* It can have affinity changed while we were choosing. */ - if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu))) - goto again; + raw_spin_lock(&rq->lock); + needs_cpu = (task_cpu(p) == dead_cpu) && (p->state != TASK_WAKING); + if (needs_cpu) + dest_cpu = select_fallback_rq(dead_cpu, p); + raw_spin_unlock(&rq->lock); + /* + * It can only fail if we race with set_cpus_allowed(), + * in the racer should migrate the task anyway. + */ + if (needs_cpu) + __migrate_task(p, dead_cpu, dest_cpu); + local_irq_restore(flags); } /* @@ -5598,7 +5479,6 @@ void sched_idle_next(void) __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); - update_rq_clock(rq); activate_task(rq, p, 0); raw_spin_unlock_irqrestore(&rq->lock, flags); @@ -5653,7 +5533,6 @@ static void migrate_dead_tasks(unsigned int dead_cpu) for ( ; ; ) { if (!rq->nr_running) break; - update_rq_clock(rq); next = pick_next_task(rq); if (!next) break; @@ -5876,35 +5755,20 @@ static void set_rq_offline(struct rq *rq) static int __cpuinit migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) { - struct task_struct *p; int cpu = (long)hcpu; unsigned long flags; - struct rq *rq; + struct rq *rq = cpu_rq(cpu); switch (action) { case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: - p = kthread_create(migration_thread, hcpu, "migration/%d", cpu); - if (IS_ERR(p)) - return NOTIFY_BAD; - kthread_bind(p, cpu); - /* Must be high prio: stop_machine expects to yield to it. */ - rq = task_rq_lock(p, &flags); - __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); - task_rq_unlock(rq, &flags); - get_task_struct(p); - cpu_rq(cpu)->migration_thread = p; rq->calc_load_update = calc_load_update; break; case CPU_ONLINE: case CPU_ONLINE_FROZEN: - /* Strictly unnecessary, as first user will wake it. */ - wake_up_process(cpu_rq(cpu)->migration_thread); - /* Update our root-domain */ - rq = cpu_rq(cpu); raw_spin_lock_irqsave(&rq->lock, flags); if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); @@ -5915,61 +5779,24 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) break; #ifdef CONFIG_HOTPLUG_CPU - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - if (!cpu_rq(cpu)->migration_thread) - break; - /* Unbind it from offline cpu so it can run. Fall thru. */ - kthread_bind(cpu_rq(cpu)->migration_thread, - cpumask_any(cpu_online_mask)); - kthread_stop(cpu_rq(cpu)->migration_thread); - put_task_struct(cpu_rq(cpu)->migration_thread); - cpu_rq(cpu)->migration_thread = NULL; - break; - case CPU_DEAD: case CPU_DEAD_FROZEN: - cpuset_lock(); /* around calls to cpuset_cpus_allowed_lock() */ migrate_live_tasks(cpu); - rq = cpu_rq(cpu); - kthread_stop(rq->migration_thread); - put_task_struct(rq->migration_thread); - rq->migration_thread = NULL; /* Idle task back to normal (off runqueue, low prio) */ raw_spin_lock_irq(&rq->lock); - update_rq_clock(rq); deactivate_task(rq, rq->idle, 0); __setscheduler(rq, rq->idle, SCHED_NORMAL, 0); rq->idle->sched_class = &idle_sched_class; migrate_dead_tasks(cpu); raw_spin_unlock_irq(&rq->lock); - cpuset_unlock(); migrate_nr_uninterruptible(rq); BUG_ON(rq->nr_running != 0); calc_global_load_remove(rq); - /* - * No need to migrate the tasks: it was best-effort if - * they didn't take sched_hotcpu_mutex. Just wake up - * the requestors. - */ - raw_spin_lock_irq(&rq->lock); - while (!list_empty(&rq->migration_queue)) { - struct migration_req *req; - - req = list_entry(rq->migration_queue.next, - struct migration_req, list); - list_del_init(&req->list); - raw_spin_unlock_irq(&rq->lock); - complete(&req->done); - raw_spin_lock_irq(&rq->lock); - } - raw_spin_unlock_irq(&rq->lock); break; case CPU_DYING: case CPU_DYING_FROZEN: /* Update our root-domain */ - rq = cpu_rq(cpu); raw_spin_lock_irqsave(&rq->lock, flags); if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); @@ -6300,6 +6127,9 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) struct rq *rq = cpu_rq(cpu); struct sched_domain *tmp; + for (tmp = sd; tmp; tmp = tmp->parent) + tmp->span_weight = cpumask_weight(sched_domain_span(tmp)); + /* Remove the sched domains which do not contribute to scheduling. */ for (tmp = sd; tmp; ) { struct sched_domain *parent = tmp->parent; @@ -7777,16 +7607,15 @@ void __init sched_init(void) #ifdef CONFIG_SMP rq->sd = NULL; rq->rd = NULL; + rq->cpu_power = SCHED_LOAD_SCALE; rq->post_schedule = 0; rq->active_balance = 0; rq->next_balance = jiffies; rq->push_cpu = 0; rq->cpu = i; rq->online = 0; - rq->migration_thread = NULL; rq->idle_stamp = 0; rq->avg_idle = 2*sysctl_sched_migration_cost; - INIT_LIST_HEAD(&rq->migration_queue); rq_attach_root(rq, &def_root_domain); #endif init_rq_hrtick(rq); @@ -7887,7 +7716,6 @@ static void normalize_task(struct rq *rq, struct task_struct *p) { int on_rq; - update_rq_clock(rq); on_rq = p->se.on_rq; if (on_rq) deactivate_task(rq, p, 0); @@ -7914,9 +7742,9 @@ void normalize_rt_tasks(void) p->se.exec_start = 0; #ifdef CONFIG_SCHEDSTATS - p->se.wait_start = 0; - p->se.sleep_start = 0; - p->se.block_start = 0; + p->se.statistics.wait_start = 0; + p->se.statistics.sleep_start = 0; + p->se.statistics.block_start = 0; #endif if (!rt_task(p)) { @@ -7943,9 +7771,9 @@ void normalize_rt_tasks(void) #endif /* CONFIG_MAGIC_SYSRQ */ -#ifdef CONFIG_IA64 +#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) /* - * These functions are only useful for the IA64 MCA handling. + * These functions are only useful for the IA64 MCA handling, or kdb. * * They can only be called when the whole system has been * stopped - every CPU needs to be quiescent, and no scheduling @@ -7965,6 +7793,9 @@ struct task_struct *curr_task(int cpu) return cpu_curr(cpu); } +#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */ + +#ifdef CONFIG_IA64 /** * set_curr_task - set the current task for a given cpu. * @cpu: the processor in question. @@ -8249,8 +8080,6 @@ void sched_move_task(struct task_struct *tsk) rq = task_rq_lock(tsk, &flags); - update_rq_clock(rq); - running = task_current(rq, tsk); on_rq = tsk->se.on_rq; @@ -8269,7 +8098,7 @@ void sched_move_task(struct task_struct *tsk) if (unlikely(running)) tsk->sched_class->set_curr_task(rq); if (on_rq) - enqueue_task(rq, tsk, 0, false); + enqueue_task(rq, tsk, 0); task_rq_unlock(rq, &flags); } @@ -9083,43 +8912,32 @@ struct cgroup_subsys cpuacct_subsys = { #ifndef CONFIG_SMP -int rcu_expedited_torture_stats(char *page) -{ - return 0; -} -EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats); - void synchronize_sched_expedited(void) { + barrier(); } EXPORT_SYMBOL_GPL(synchronize_sched_expedited); #else /* #ifndef CONFIG_SMP */ -static DEFINE_PER_CPU(struct migration_req, rcu_migration_req); -static DEFINE_MUTEX(rcu_sched_expedited_mutex); +static atomic_t synchronize_sched_expedited_count = ATOMIC_INIT(0); -#define RCU_EXPEDITED_STATE_POST -2 -#define RCU_EXPEDITED_STATE_IDLE -1 - -static int rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE; - -int rcu_expedited_torture_stats(char *page) +static int synchronize_sched_expedited_cpu_stop(void *data) { - int cnt = 0; - int cpu; - - cnt += sprintf(&page[cnt], "state: %d /", rcu_expedited_state); - for_each_online_cpu(cpu) { - cnt += sprintf(&page[cnt], " %d:%d", - cpu, per_cpu(rcu_migration_req, cpu).dest_cpu); - } - cnt += sprintf(&page[cnt], "\n"); - return cnt; + /* + * There must be a full memory barrier on each affected CPU + * between the time that try_stop_cpus() is called and the + * time that it returns. + * + * In the current initial implementation of cpu_stop, the + * above condition is already met when the control reaches + * this point and the following smp_mb() is not strictly + * necessary. Do smp_mb() anyway for documentation and + * robustness against future implementation changes. + */ + smp_mb(); /* See above comment block. */ + return 0; } -EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats); - -static long synchronize_sched_expedited_count; /* * Wait for an rcu-sched grace period to elapse, but use "big hammer" @@ -9133,18 +8951,14 @@ static long synchronize_sched_expedited_count; */ void synchronize_sched_expedited(void) { - int cpu; - unsigned long flags; - bool need_full_sync = 0; - struct rq *rq; - struct migration_req *req; - long snap; - int trycount = 0; + int snap, trycount = 0; smp_mb(); /* ensure prior mod happens before capturing snap. */ - snap = ACCESS_ONCE(synchronize_sched_expedited_count) + 1; + snap = atomic_read(&synchronize_sched_expedited_count) + 1; get_online_cpus(); - while (!mutex_trylock(&rcu_sched_expedited_mutex)) { + while (try_stop_cpus(cpu_online_mask, + synchronize_sched_expedited_cpu_stop, + NULL) == -EAGAIN) { put_online_cpus(); if (trycount++ < 10) udelay(trycount * num_online_cpus()); @@ -9152,41 +8966,15 @@ void synchronize_sched_expedited(void) synchronize_sched(); return; } - if (ACCESS_ONCE(synchronize_sched_expedited_count) - snap > 0) { + if (atomic_read(&synchronize_sched_expedited_count) - snap > 0) { smp_mb(); /* ensure test happens before caller kfree */ return; } get_online_cpus(); } - rcu_expedited_state = RCU_EXPEDITED_STATE_POST; - for_each_online_cpu(cpu) { - rq = cpu_rq(cpu); - req = &per_cpu(rcu_migration_req, cpu); - init_completion(&req->done); - req->task = NULL; - req->dest_cpu = RCU_MIGRATION_NEED_QS; - raw_spin_lock_irqsave(&rq->lock, flags); - list_add(&req->list, &rq->migration_queue); - raw_spin_unlock_irqrestore(&rq->lock, flags); - wake_up_process(rq->migration_thread); - } - for_each_online_cpu(cpu) { - rcu_expedited_state = cpu; - req = &per_cpu(rcu_migration_req, cpu); - rq = cpu_rq(cpu); - wait_for_completion(&req->done); - raw_spin_lock_irqsave(&rq->lock, flags); - if (unlikely(req->dest_cpu == RCU_MIGRATION_MUST_SYNC)) - need_full_sync = 1; - req->dest_cpu = RCU_MIGRATION_IDLE; - raw_spin_unlock_irqrestore(&rq->lock, flags); - } - rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE; - synchronize_sched_expedited_count++; - mutex_unlock(&rcu_sched_expedited_mutex); + atomic_inc(&synchronize_sched_expedited_count); + smp_mb__after_atomic_inc(); /* ensure post-GP actions seen after GP. */ put_online_cpus(); - if (need_full_sync) - synchronize_sched(); } EXPORT_SYMBOL_GPL(synchronize_sched_expedited); diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index 5b49613..906a0f7 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -41,6 +41,7 @@ unsigned long long __attribute__((weak)) sched_clock(void) return (unsigned long long)(jiffies - INITIAL_JIFFIES) * (NSEC_PER_SEC / HZ); } +EXPORT_SYMBOL_GPL(sched_clock); static __read_mostly int sched_clock_running; diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c index fccf9fb..e6871cb 100644 --- a/kernel/sched_cpupri.c +++ b/kernel/sched_cpupri.c @@ -27,6 +27,7 @@ * of the License. */ +#include <linux/gfp.h> #include "sched_cpupri.h" /* Convert between a 140 based task->prio, and our 102 based cpupri */ diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 67f95aa..3556539 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -70,16 +70,16 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, PN(se->vruntime); PN(se->sum_exec_runtime); #ifdef CONFIG_SCHEDSTATS - PN(se->wait_start); - PN(se->sleep_start); - PN(se->block_start); - PN(se->sleep_max); - PN(se->block_max); - PN(se->exec_max); - PN(se->slice_max); - PN(se->wait_max); - PN(se->wait_sum); - P(se->wait_count); + PN(se->statistics.wait_start); + PN(se->statistics.sleep_start); + PN(se->statistics.block_start); + PN(se->statistics.sleep_max); + PN(se->statistics.block_max); + PN(se->statistics.exec_max); + PN(se->statistics.slice_max); + PN(se->statistics.wait_max); + PN(se->statistics.wait_sum); + P(se->statistics.wait_count); #endif P(se->load.weight); #undef PN @@ -104,7 +104,7 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", SPLIT_NS(p->se.vruntime), SPLIT_NS(p->se.sum_exec_runtime), - SPLIT_NS(p->se.sum_sleep_runtime)); + SPLIT_NS(p->se.statistics.sum_sleep_runtime)); #else SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld", 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); @@ -114,7 +114,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) { char path[64]; + rcu_read_lock(); cgroup_path(task_group(p)->css.cgroup, path, sizeof(path)); + rcu_read_unlock(); SEQ_printf(m, " %s", path); } #endif @@ -173,11 +175,6 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) task_group_path(tg, path, sizeof(path)); SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path); -#elif defined(CONFIG_USER_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED) - { - uid_t uid = cfs_rq->tg->uid; - SEQ_printf(m, "\ncfs_rq[%d] for UID: %u\n", cpu, uid); - } #else SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu); #endif @@ -384,15 +381,9 @@ __initcall(init_sched_debug_procfs); void proc_sched_show_task(struct task_struct *p, struct seq_file *m) { unsigned long nr_switches; - unsigned long flags; - int num_threads = 1; - - if (lock_task_sighand(p, &flags)) { - num_threads = atomic_read(&p->signal->count); - unlock_task_sighand(p, &flags); - } - SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, num_threads); + SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, + get_nr_threads(p)); SEQ_printf(m, "---------------------------------------------------------\n"); #define __P(F) \ @@ -407,40 +398,38 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) PN(se.exec_start); PN(se.vruntime); PN(se.sum_exec_runtime); - PN(se.avg_overlap); - PN(se.avg_wakeup); nr_switches = p->nvcsw + p->nivcsw; #ifdef CONFIG_SCHEDSTATS - PN(se.wait_start); - PN(se.sleep_start); - PN(se.block_start); - PN(se.sleep_max); - PN(se.block_max); - PN(se.exec_max); - PN(se.slice_max); - PN(se.wait_max); - PN(se.wait_sum); - P(se.wait_count); - PN(se.iowait_sum); - P(se.iowait_count); + PN(se.statistics.wait_start); + PN(se.statistics.sleep_start); + PN(se.statistics.block_start); + PN(se.statistics.sleep_max); + PN(se.statistics.block_max); + PN(se.statistics.exec_max); + PN(se.statistics.slice_max); + PN(se.statistics.wait_max); + PN(se.statistics.wait_sum); + P(se.statistics.wait_count); + PN(se.statistics.iowait_sum); + P(se.statistics.iowait_count); P(sched_info.bkl_count); P(se.nr_migrations); - P(se.nr_migrations_cold); - P(se.nr_failed_migrations_affine); - P(se.nr_failed_migrations_running); - P(se.nr_failed_migrations_hot); - P(se.nr_forced_migrations); - P(se.nr_wakeups); - P(se.nr_wakeups_sync); - P(se.nr_wakeups_migrate); - P(se.nr_wakeups_local); - P(se.nr_wakeups_remote); - P(se.nr_wakeups_affine); - P(se.nr_wakeups_affine_attempts); - P(se.nr_wakeups_passive); - P(se.nr_wakeups_idle); + P(se.statistics.nr_migrations_cold); + P(se.statistics.nr_failed_migrations_affine); + P(se.statistics.nr_failed_migrations_running); + P(se.statistics.nr_failed_migrations_hot); + P(se.statistics.nr_forced_migrations); + P(se.statistics.nr_wakeups); + P(se.statistics.nr_wakeups_sync); + P(se.statistics.nr_wakeups_migrate); + P(se.statistics.nr_wakeups_local); + P(se.statistics.nr_wakeups_remote); + P(se.statistics.nr_wakeups_affine); + P(se.statistics.nr_wakeups_affine_attempts); + P(se.statistics.nr_wakeups_passive); + P(se.statistics.nr_wakeups_idle); { u64 avg_atom, avg_per_cpu; @@ -491,35 +480,6 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) void proc_sched_set_task(struct task_struct *p) { #ifdef CONFIG_SCHEDSTATS - p->se.wait_max = 0; - p->se.wait_sum = 0; - p->se.wait_count = 0; - p->se.iowait_sum = 0; - p->se.iowait_count = 0; - p->se.sleep_max = 0; - p->se.sum_sleep_runtime = 0; - p->se.block_max = 0; - p->se.exec_max = 0; - p->se.slice_max = 0; - p->se.nr_migrations = 0; - p->se.nr_migrations_cold = 0; - p->se.nr_failed_migrations_affine = 0; - p->se.nr_failed_migrations_running = 0; - p->se.nr_failed_migrations_hot = 0; - p->se.nr_forced_migrations = 0; - p->se.nr_wakeups = 0; - p->se.nr_wakeups_sync = 0; - p->se.nr_wakeups_migrate = 0; - p->se.nr_wakeups_local = 0; - p->se.nr_wakeups_remote = 0; - p->se.nr_wakeups_affine = 0; - p->se.nr_wakeups_affine_attempts = 0; - p->se.nr_wakeups_passive = 0; - p->se.nr_wakeups_idle = 0; - p->sched_info.bkl_count = 0; + memset(&p->se.statistics, 0, sizeof(p->se.statistics)); #endif - p->se.sum_exec_runtime = 0; - p->se.prev_sum_exec_runtime = 0; - p->nvcsw = 0; - p->nivcsw = 0; } diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 5a5ea2c..a878b53 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -35,8 +35,8 @@ * (to see the precise effective timeslice length of your workload, * run vmstat and monitor the context-switches (cs) field) */ -unsigned int sysctl_sched_latency = 5000000ULL; -unsigned int normalized_sysctl_sched_latency = 5000000ULL; +unsigned int sysctl_sched_latency = 6000000ULL; +unsigned int normalized_sysctl_sched_latency = 6000000ULL; /* * The initial- and re-scaling of tunables is configurable @@ -52,15 +52,15 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling /* * Minimal preemption granularity for CPU-bound tasks: - * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) + * (default: 2 msec * (1 + ilog(ncpus)), units: nanoseconds) */ -unsigned int sysctl_sched_min_granularity = 1000000ULL; -unsigned int normalized_sysctl_sched_min_granularity = 1000000ULL; +unsigned int sysctl_sched_min_granularity = 2000000ULL; +unsigned int normalized_sysctl_sched_min_granularity = 2000000ULL; /* * is kept at sysctl_sched_latency / sysctl_sched_min_granularity */ -static unsigned int sched_nr_latency = 5; +static unsigned int sched_nr_latency = 3; /* * After fork, child runs first. If set to 0 (default) then @@ -505,7 +505,8 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, { unsigned long delta_exec_weighted; - schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max)); + schedstat_set(curr->statistics.exec_max, + max((u64)delta_exec, curr->statistics.exec_max)); curr->sum_exec_runtime += delta_exec; schedstat_add(cfs_rq, exec_clock, delta_exec); @@ -548,7 +549,7 @@ static void update_curr(struct cfs_rq *cfs_rq) static inline void update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) { - schedstat_set(se->wait_start, rq_of(cfs_rq)->clock); + schedstat_set(se->statistics.wait_start, rq_of(cfs_rq)->clock); } /* @@ -567,18 +568,18 @@ static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) static void update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) { - schedstat_set(se->wait_max, max(se->wait_max, - rq_of(cfs_rq)->clock - se->wait_start)); - schedstat_set(se->wait_count, se->wait_count + 1); - schedstat_set(se->wait_sum, se->wait_sum + - rq_of(cfs_rq)->clock - se->wait_start); + schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max, + rq_of(cfs_rq)->clock - se->statistics.wait_start)); + schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1); + schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum + + rq_of(cfs_rq)->clock - se->statistics.wait_start); #ifdef CONFIG_SCHEDSTATS if (entity_is_task(se)) { trace_sched_stat_wait(task_of(se), - rq_of(cfs_rq)->clock - se->wait_start); + rq_of(cfs_rq)->clock - se->statistics.wait_start); } #endif - schedstat_set(se->wait_start, 0); + schedstat_set(se->statistics.wait_start, 0); } static inline void @@ -657,39 +658,39 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) if (entity_is_task(se)) tsk = task_of(se); - if (se->sleep_start) { - u64 delta = rq_of(cfs_rq)->clock - se->sleep_start; + if (se->statistics.sleep_start) { + u64 delta = rq_of(cfs_rq)->clock - se->statistics.sleep_start; if ((s64)delta < 0) delta = 0; - if (unlikely(delta > se->sleep_max)) - se->sleep_max = delta; + if (unlikely(delta > se->statistics.sleep_max)) + se->statistics.sleep_max = delta; - se->sleep_start = 0; - se->sum_sleep_runtime += delta; + se->statistics.sleep_start = 0; + se->statistics.sum_sleep_runtime += delta; if (tsk) { account_scheduler_latency(tsk, delta >> 10, 1); trace_sched_stat_sleep(tsk, delta); } } - if (se->block_start) { - u64 delta = rq_of(cfs_rq)->clock - se->block_start; + if (se->statistics.block_start) { + u64 delta = rq_of(cfs_rq)->clock - se->statistics.block_start; if ((s64)delta < 0) delta = 0; - if (unlikely(delta > se->block_max)) - se->block_max = delta; + if (unlikely(delta > se->statistics.block_max)) + se->statistics.block_max = delta; - se->block_start = 0; - se->sum_sleep_runtime += delta; + se->statistics.block_start = 0; + se->statistics.sum_sleep_runtime += delta; if (tsk) { if (tsk->in_iowait) { - se->iowait_sum += delta; - se->iowait_count++; + se->statistics.iowait_sum += delta; + se->statistics.iowait_count++; trace_sched_stat_iowait(tsk, delta); } @@ -737,20 +738,10 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) vruntime += sched_vslice(cfs_rq, se); /* sleeps up to a single latency don't count. */ - if (!initial && sched_feat(FAIR_SLEEPERS)) { + if (!initial) { unsigned long thresh = sysctl_sched_latency; /* - * Convert the sleeper threshold into virtual time. - * SCHED_IDLE is a special sub-class. We care about - * fairness only relative to other SCHED_IDLE tasks, - * all of which have the same weight. - */ - if (sched_feat(NORMALIZED_SLEEPER) && (!entity_is_task(se) || - task_of(se)->policy != SCHED_IDLE)) - thresh = calc_delta_fair(thresh, se); - - /* * Halve their sleep time's effect, to allow * for a gentler effect of sleepers: */ @@ -766,9 +757,6 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) se->vruntime = vruntime; } -#define ENQUEUE_WAKEUP 1 -#define ENQUEUE_MIGRATE 2 - static void enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { @@ -776,7 +764,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * Update the normalized vruntime before updating min_vruntime * through callig update_curr(). */ - if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATE)) + if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING)) se->vruntime += cfs_rq->min_vruntime; /* @@ -812,7 +800,7 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) } static void -dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) +dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { /* * Update run-time statistics of the 'current'. @@ -820,15 +808,15 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) update_curr(cfs_rq); update_stats_dequeue(cfs_rq, se); - if (sleep) { + if (flags & DEQUEUE_SLEEP) { #ifdef CONFIG_SCHEDSTATS if (entity_is_task(se)) { struct task_struct *tsk = task_of(se); if (tsk->state & TASK_INTERRUPTIBLE) - se->sleep_start = rq_of(cfs_rq)->clock; + se->statistics.sleep_start = rq_of(cfs_rq)->clock; if (tsk->state & TASK_UNINTERRUPTIBLE) - se->block_start = rq_of(cfs_rq)->clock; + se->statistics.block_start = rq_of(cfs_rq)->clock; } #endif } @@ -845,7 +833,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) * update can refer to the ->curr item and we need to reflect this * movement in our normalized position. */ - if (!sleep) + if (!(flags & DEQUEUE_SLEEP)) se->vruntime -= cfs_rq->min_vruntime; } @@ -912,7 +900,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) * when there are only lesser-weight tasks around): */ if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { - se->slice_max = max(se->slice_max, + se->statistics.slice_max = max(se->statistics.slice_max, se->sum_exec_runtime - se->prev_sum_exec_runtime); } #endif @@ -1054,16 +1042,10 @@ static inline void hrtick_update(struct rq *rq) * then put the task into the rbtree: */ static void -enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, bool head) +enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; - int flags = 0; - - if (wakeup) - flags |= ENQUEUE_WAKEUP; - if (p->state == TASK_WAKING) - flags |= ENQUEUE_MIGRATE; for_each_sched_entity(se) { if (se->on_rq) @@ -1081,18 +1063,18 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, bool head) * decreased. We remove the task from the rbtree and * update the fair scheduling stats: */ -static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep) +static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); - dequeue_entity(cfs_rq, se, sleep); + dequeue_entity(cfs_rq, se, flags); /* Don't dequeue parent if it has other entities besides us */ if (cfs_rq->load.weight) break; - sleep = 1; + flags |= DEQUEUE_SLEEP; } hrtick_update(rq); @@ -1240,11 +1222,9 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu, static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) { - struct task_struct *curr = current; unsigned long this_load, load; int idx, this_cpu, prev_cpu; unsigned long tl_per_task; - unsigned int imbalance; struct task_group *tg; unsigned long weight; int balanced; @@ -1255,23 +1235,12 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) load = source_load(prev_cpu, idx); this_load = target_load(this_cpu, idx); - if (sync) { - if (sched_feat(SYNC_LESS) && - (curr->se.avg_overlap > sysctl_sched_migration_cost || - p->se.avg_overlap > sysctl_sched_migration_cost)) - sync = 0; - } else { - if (sched_feat(SYNC_MORE) && - (curr->se.avg_overlap < sysctl_sched_migration_cost && - p->se.avg_overlap < sysctl_sched_migration_cost)) - sync = 1; - } - /* * If sync wakeup then subtract the (maximum possible) * effect of the currently running task from the load * of the current CPU: */ + rcu_read_lock(); if (sync) { tg = task_group(current); weight = current->se.load.weight; @@ -1283,8 +1252,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) tg = task_group(p); weight = p->se.load.weight; - imbalance = 100 + (sd->imbalance_pct - 100) / 2; - /* * In low-load situations, where prev_cpu is idle and this_cpu is idle * due to the sync cause above having dropped this_load to 0, we'll @@ -1294,9 +1261,22 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) * Otherwise check if either cpus are near enough in load to allow this * task to be woken on this_cpu. */ - balanced = !this_load || - 100*(this_load + effective_load(tg, this_cpu, weight, weight)) <= - imbalance*(load + effective_load(tg, prev_cpu, 0, weight)); + if (this_load) { + unsigned long this_eff_load, prev_eff_load; + + this_eff_load = 100; + this_eff_load *= power_of(prev_cpu); + this_eff_load *= this_load + + effective_load(tg, this_cpu, weight, weight); + + prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2; + prev_eff_load *= power_of(this_cpu); + prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight); + + balanced = this_eff_load <= prev_eff_load; + } else + balanced = true; + rcu_read_unlock(); /* * If the currently running task will sleep within @@ -1306,7 +1286,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) if (sync && balanced) return 1; - schedstat_inc(p, se.nr_wakeups_affine_attempts); + schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts); tl_per_task = cpu_avg_load_per_task(this_cpu); if (balanced || @@ -1318,7 +1298,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) * there is no bad imbalance. */ schedstat_inc(sd, ttwu_move_affine); - schedstat_inc(p, se.nr_wakeups_affine); + schedstat_inc(p, se.statistics.nr_wakeups_affine); return 1; } @@ -1406,29 +1386,48 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) /* * Try and locate an idle CPU in the sched_domain. */ -static int -select_idle_sibling(struct task_struct *p, struct sched_domain *sd, int target) +static int select_idle_sibling(struct task_struct *p, int target) { int cpu = smp_processor_id(); int prev_cpu = task_cpu(p); + struct sched_domain *sd; int i; /* - * If this domain spans both cpu and prev_cpu (see the SD_WAKE_AFFINE - * test in select_task_rq_fair) and the prev_cpu is idle then that's - * always a better target than the current cpu. + * If the task is going to be woken-up on this cpu and if it is + * already idle, then it is the right target. + */ + if (target == cpu && idle_cpu(cpu)) + return cpu; + + /* + * If the task is going to be woken-up on the cpu where it previously + * ran and if it is currently idle, then it the right target. */ - if (target == cpu && !cpu_rq(prev_cpu)->cfs.nr_running) + if (target == prev_cpu && idle_cpu(prev_cpu)) return prev_cpu; /* - * Otherwise, iterate the domain and find an elegible idle cpu. + * Otherwise, iterate the domains and find an elegible idle cpu. */ - for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) { - if (!cpu_rq(i)->cfs.nr_running) { - target = i; + for_each_domain(target, sd) { + if (!(sd->flags & SD_SHARE_PKG_RESOURCES)) break; + + for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) { + if (idle_cpu(i)) { + target = i; + break; + } } + + /* + * Lets stop looking for an idle sibling when we reached + * the domain that spans the current cpu and prev_cpu. + */ + if (cpumask_test_cpu(cpu, sched_domain_span(sd)) && + cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) + break; } return target; @@ -1445,7 +1444,8 @@ select_idle_sibling(struct task_struct *p, struct sched_domain *sd, int target) * * preempt must be disabled. */ -static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) +static int +select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_flags) { struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; int cpu = smp_processor_id(); @@ -1456,8 +1456,7 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag int sync = wake_flags & WF_SYNC; if (sd_flag & SD_BALANCE_WAKE) { - if (sched_feat(AFFINE_WAKEUPS) && - cpumask_test_cpu(cpu, &p->cpus_allowed)) + if (cpumask_test_cpu(cpu, &p->cpus_allowed)) want_affine = 1; new_cpu = prev_cpu; } @@ -1491,34 +1490,13 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag } /* - * While iterating the domains looking for a spanning - * WAKE_AFFINE domain, adjust the affine target to any idle cpu - * in cache sharing domains along the way. + * If both cpu and prev_cpu are part of this domain, + * cpu is a valid SD_WAKE_AFFINE target. */ - if (want_affine) { - int target = -1; - - /* - * If both cpu and prev_cpu are part of this domain, - * cpu is a valid SD_WAKE_AFFINE target. - */ - if (cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) - target = cpu; - - /* - * If there's an idle sibling in this domain, make that - * the wake_affine target instead of the current cpu. - */ - if (tmp->flags & SD_SHARE_PKG_RESOURCES) - target = select_idle_sibling(p, tmp, target); - - if (target >= 0) { - if (tmp->flags & SD_WAKE_AFFINE) { - affine_sd = tmp; - want_affine = 0; - } - cpu = target; - } + if (want_affine && (tmp->flags & SD_WAKE_AFFINE) && + cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) { + affine_sd = tmp; + want_affine = 0; } if (!want_sd && !want_affine) @@ -1531,22 +1509,29 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag sd = tmp; } +#ifdef CONFIG_FAIR_GROUP_SCHED if (sched_feat(LB_SHARES_UPDATE)) { /* * Pick the largest domain to update shares over */ tmp = sd; - if (affine_sd && (!tmp || - cpumask_weight(sched_domain_span(affine_sd)) > - cpumask_weight(sched_domain_span(sd)))) + if (affine_sd && (!tmp || affine_sd->span_weight > sd->span_weight)) tmp = affine_sd; - if (tmp) + if (tmp) { + raw_spin_unlock(&rq->lock); update_shares(tmp); + raw_spin_lock(&rq->lock); + } } +#endif - if (affine_sd && wake_affine(affine_sd, p, sync)) - return cpu; + if (affine_sd) { + if (cpu == prev_cpu || wake_affine(affine_sd, p, sync)) + return select_idle_sibling(p, cpu); + else + return select_idle_sibling(p, prev_cpu); + } while (sd) { int load_idx = sd->forkexec_idx; @@ -1576,10 +1561,10 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag /* Now try balancing at a lower domain level of new_cpu */ cpu = new_cpu; - weight = cpumask_weight(sched_domain_span(sd)); + weight = sd->span_weight; sd = NULL; for_each_domain(cpu, tmp) { - if (weight <= cpumask_weight(sched_domain_span(tmp))) + if (weight <= tmp->span_weight) break; if (tmp->flags & sd_flag) sd = tmp; @@ -1591,63 +1576,26 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag } #endif /* CONFIG_SMP */ -/* - * Adaptive granularity - * - * se->avg_wakeup gives the average time a task runs until it does a wakeup, - * with the limit of wakeup_gran -- when it never does a wakeup. - * - * So the smaller avg_wakeup is the faster we want this task to preempt, - * but we don't want to treat the preemptee unfairly and therefore allow it - * to run for at least the amount of time we'd like to run. - * - * NOTE: we use 2*avg_wakeup to increase the probability of actually doing one - * - * NOTE: we use *nr_running to scale with load, this nicely matches the - * degrading latency on load. - */ -static unsigned long -adaptive_gran(struct sched_entity *curr, struct sched_entity *se) -{ - u64 this_run = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; - u64 expected_wakeup = 2*se->avg_wakeup * cfs_rq_of(se)->nr_running; - u64 gran = 0; - - if (this_run < expected_wakeup) - gran = expected_wakeup - this_run; - - return min_t(s64, gran, sysctl_sched_wakeup_granularity); -} - static unsigned long wakeup_gran(struct sched_entity *curr, struct sched_entity *se) { unsigned long gran = sysctl_sched_wakeup_granularity; - if (cfs_rq_of(curr)->curr && sched_feat(ADAPTIVE_GRAN)) - gran = adaptive_gran(curr, se); - /* * Since its curr running now, convert the gran from real-time * to virtual-time in his units. + * + * By using 'se' instead of 'curr' we penalize light tasks, so + * they get preempted easier. That is, if 'se' < 'curr' then + * the resulting gran will be larger, therefore penalizing the + * lighter, if otoh 'se' > 'curr' then the resulting gran will + * be smaller, again penalizing the lighter task. + * + * This is especially important for buddies when the leftmost + * task is higher priority than the buddy. */ - if (sched_feat(ASYM_GRAN)) { - /* - * By using 'se' instead of 'curr' we penalize light tasks, so - * they get preempted easier. That is, if 'se' < 'curr' then - * the resulting gran will be larger, therefore penalizing the - * lighter, if otoh 'se' > 'curr' then the resulting gran will - * be smaller, again penalizing the lighter task. - * - * This is especially important for buddies when the leftmost - * task is higher priority than the buddy. - */ - if (unlikely(se->load.weight != NICE_0_LOAD)) - gran = calc_delta_fair(gran, se); - } else { - if (unlikely(curr->load.weight != NICE_0_LOAD)) - gran = calc_delta_fair(gran, curr); - } + if (unlikely(se->load.weight != NICE_0_LOAD)) + gran = calc_delta_fair(gran, se); return gran; } @@ -1705,7 +1653,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ struct task_struct *curr = rq->curr; struct sched_entity *se = &curr->se, *pse = &p->se; struct cfs_rq *cfs_rq = task_cfs_rq(curr); - int sync = wake_flags & WF_SYNC; int scale = cfs_rq->nr_running >= sched_nr_latency; if (unlikely(rt_prio(p->prio))) @@ -1738,14 +1685,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ if (unlikely(curr->policy == SCHED_IDLE)) goto preempt; - if (sched_feat(WAKEUP_SYNC) && sync) - goto preempt; - - if (sched_feat(WAKEUP_OVERLAP) && - se->avg_overlap < sysctl_sched_migration_cost && - pse->avg_overlap < sysctl_sched_migration_cost) - goto preempt; - if (!sched_feat(WAKEUP_PREEMPT)) return; @@ -1844,13 +1783,13 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, * 3) are cache-hot on their current CPU. */ if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) { - schedstat_inc(p, se.nr_failed_migrations_affine); + schedstat_inc(p, se.statistics.nr_failed_migrations_affine); return 0; } *all_pinned = 0; if (task_running(rq, p)) { - schedstat_inc(p, se.nr_failed_migrations_running); + schedstat_inc(p, se.statistics.nr_failed_migrations_running); return 0; } @@ -1866,14 +1805,14 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, #ifdef CONFIG_SCHEDSTATS if (tsk_cache_hot) { schedstat_inc(sd, lb_hot_gained[idle]); - schedstat_inc(p, se.nr_forced_migrations); + schedstat_inc(p, se.statistics.nr_forced_migrations); } #endif return 1; } if (tsk_cache_hot) { - schedstat_inc(p, se.nr_failed_migrations_hot); + schedstat_inc(p, se.statistics.nr_failed_migrations_hot); return 0; } return 1; @@ -2311,7 +2250,7 @@ unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu) unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) { - unsigned long weight = cpumask_weight(sched_domain_span(sd)); + unsigned long weight = sd->span_weight; unsigned long smt_gain = sd->smt_gain; smt_gain /= weight; @@ -2344,7 +2283,7 @@ unsigned long scale_rt_power(int cpu) static void update_cpu_power(struct sched_domain *sd, int cpu) { - unsigned long weight = cpumask_weight(sched_domain_span(sd)); + unsigned long weight = sd->span_weight; unsigned long power = SCHED_LOAD_SCALE; struct sched_group *sdg = sd->groups; @@ -2370,6 +2309,7 @@ static void update_cpu_power(struct sched_domain *sd, int cpu) if (!power) power = 1; + cpu_rq(cpu)->cpu_power = power; sdg->cpu_power = power; } @@ -2870,6 +2810,8 @@ static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle) return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); } +static int active_load_balance_cpu_stop(void *data); + /* * Check this_cpu to ensure it is balanced within domain. Attempt to move * tasks if there is an imbalance. @@ -2959,8 +2901,9 @@ redo: if (need_active_balance(sd, sd_idle, idle)) { raw_spin_lock_irqsave(&busiest->lock, flags); - /* don't kick the migration_thread, if the curr - * task on busiest cpu can't be moved to this_cpu + /* don't kick the active_load_balance_cpu_stop, + * if the curr task on busiest cpu can't be + * moved to this_cpu */ if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) { @@ -2970,14 +2913,22 @@ redo: goto out_one_pinned; } + /* + * ->active_balance synchronizes accesses to + * ->active_balance_work. Once set, it's cleared + * only after active load balance is finished. + */ if (!busiest->active_balance) { busiest->active_balance = 1; busiest->push_cpu = this_cpu; active_balance = 1; } raw_spin_unlock_irqrestore(&busiest->lock, flags); + if (active_balance) - wake_up_process(busiest->migration_thread); + stop_one_cpu_nowait(cpu_of(busiest), + active_load_balance_cpu_stop, busiest, + &busiest->active_balance_work); /* * We've kicked active balancing, reset the failure @@ -3084,24 +3035,29 @@ static void idle_balance(int this_cpu, struct rq *this_rq) } /* - * active_load_balance is run by migration threads. It pushes running tasks - * off the busiest CPU onto idle CPUs. It requires at least 1 task to be - * running on each physical CPU where possible, and avoids physical / - * logical imbalances. - * - * Called with busiest_rq locked. + * active_load_balance_cpu_stop is run by cpu stopper. It pushes + * running tasks off the busiest CPU onto idle CPUs. It requires at + * least 1 task to be running on each physical CPU where possible, and + * avoids physical / logical imbalances. */ -static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) +static int active_load_balance_cpu_stop(void *data) { + struct rq *busiest_rq = data; + int busiest_cpu = cpu_of(busiest_rq); int target_cpu = busiest_rq->push_cpu; + struct rq *target_rq = cpu_rq(target_cpu); struct sched_domain *sd; - struct rq *target_rq; + + raw_spin_lock_irq(&busiest_rq->lock); + + /* make sure the requested cpu hasn't gone down in the meantime */ + if (unlikely(busiest_cpu != smp_processor_id() || + !busiest_rq->active_balance)) + goto out_unlock; /* Is there any task to move? */ if (busiest_rq->nr_running <= 1) - return; - - target_rq = cpu_rq(target_cpu); + goto out_unlock; /* * This condition is "impossible", if it occurs @@ -3112,8 +3068,6 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) /* move a task from busiest_rq to target_rq */ double_lock_balance(busiest_rq, target_rq); - update_rq_clock(busiest_rq); - update_rq_clock(target_rq); /* Search for an sd spanning us and the target CPU. */ for_each_domain(target_cpu, sd) { @@ -3132,6 +3086,10 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) schedstat_inc(sd, alb_failed); } double_unlock_balance(busiest_rq, target_rq); +out_unlock: + busiest_rq->active_balance = 0; + raw_spin_unlock_irq(&busiest_rq->lock); + return 0; } #ifdef CONFIG_NO_HZ diff --git a/kernel/sched_features.h b/kernel/sched_features.h index d5059fd..83c66e8 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -1,11 +1,4 @@ /* - * Disregards a certain amount of sleep time (sched_latency_ns) and - * considers the task to be running during that period. This gives it - * a service deficit on wakeup, allowing it to run sooner. - */ -SCHED_FEAT(FAIR_SLEEPERS, 1) - -/* * Only give sleepers 50% of their service deficit. This allows * them to run sooner, but does not allow tons of sleepers to * rip the spread apart. @@ -13,13 +6,6 @@ SCHED_FEAT(FAIR_SLEEPERS, 1) SCHED_FEAT(GENTLE_FAIR_SLEEPERS, 1) /* - * By not normalizing the sleep time, heavy tasks get an effective - * longer period, and lighter task an effective shorter period they - * are considered running. - */ -SCHED_FEAT(NORMALIZED_SLEEPER, 0) - -/* * Place new tasks ahead so that they do not starve already running * tasks */ @@ -31,37 +17,6 @@ SCHED_FEAT(START_DEBIT, 1) SCHED_FEAT(WAKEUP_PREEMPT, 1) /* - * Compute wakeup_gran based on task behaviour, clipped to - * [0, sched_wakeup_gran_ns] - */ -SCHED_FEAT(ADAPTIVE_GRAN, 1) - -/* - * When converting the wakeup granularity to virtual time, do it such - * that heavier tasks preempting a lighter task have an edge. - */ -SCHED_FEAT(ASYM_GRAN, 1) - -/* - * Always wakeup-preempt SYNC wakeups, see SYNC_WAKEUPS. - */ -SCHED_FEAT(WAKEUP_SYNC, 0) - -/* - * Wakeup preempt based on task behaviour. Tasks that do not overlap - * don't get preempted. - */ -SCHED_FEAT(WAKEUP_OVERLAP, 0) - -/* - * Use the SYNC wakeup hint, pipes and the likes use this to indicate - * the remote end is likely to consume the data we just wrote, and - * therefore has cache benefit from being placed on the same cpu, see - * also AFFINE_WAKEUPS. - */ -SCHED_FEAT(SYNC_WAKEUPS, 1) - -/* * Based on load and program behaviour, see if it makes sense to place * a newly woken task on the same cpu as the task that woke it -- * improve cache locality. Typically used with SYNC wakeups as @@ -70,16 +25,6 @@ SCHED_FEAT(SYNC_WAKEUPS, 1) SCHED_FEAT(AFFINE_WAKEUPS, 1) /* - * Weaken SYNC hint based on overlap - */ -SCHED_FEAT(SYNC_LESS, 1) - -/* - * Add SYNC hint based on overlap - */ -SCHED_FEAT(SYNC_MORE, 0) - -/* * Prefer to schedule the task we woke last (assuming it failed * wakeup-preemption), since its likely going to consume data we * touched, increases cache locality. diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index a8a6d8a..9fa0f40 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c @@ -6,7 +6,8 @@ */ #ifdef CONFIG_SMP -static int select_task_rq_idle(struct task_struct *p, int sd_flag, int flags) +static int +select_task_rq_idle(struct rq *rq, struct task_struct *p, int sd_flag, int flags) { return task_cpu(p); /* IDLE tasks as never migrated */ } @@ -22,8 +23,7 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl static struct task_struct *pick_next_task_idle(struct rq *rq) { schedstat_inc(rq, sched_goidle); - /* adjust the active tasks as we might go into a long sleep */ - calc_load_account_active(rq); + calc_load_account_idle(rq); return rq->idle; } @@ -32,7 +32,7 @@ static struct task_struct *pick_next_task_idle(struct rq *rq) * message if some code attempts to do it: */ static void -dequeue_task_idle(struct rq *rq, struct task_struct *p, int sleep) +dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags) { raw_spin_unlock_irq(&rq->lock); printk(KERN_ERR "bad: scheduling from the idle thread!\n"); diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index b5b920a..8afb953 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -613,7 +613,7 @@ static void update_curr_rt(struct rq *rq) if (unlikely((s64)delta_exec < 0)) delta_exec = 0; - schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec)); + schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec)); curr->se.sum_exec_runtime += delta_exec; account_group_exec_runtime(curr, delta_exec); @@ -888,20 +888,20 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se) * Adding/removing a task to/from a priority array: */ static void -enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup, bool head) +enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) { struct sched_rt_entity *rt_se = &p->rt; - if (wakeup) + if (flags & ENQUEUE_WAKEUP) rt_se->timeout = 0; - enqueue_rt_entity(rt_se, head); + enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD); if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) enqueue_pushable_task(rq, p); } -static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) +static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) { struct sched_rt_entity *rt_se = &p->rt; @@ -948,10 +948,9 @@ static void yield_task_rt(struct rq *rq) #ifdef CONFIG_SMP static int find_lowest_rq(struct task_struct *task); -static int select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) +static int +select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags) { - struct rq *rq = task_rq(p); - if (sd_flag != SD_BALANCE_WAKE) return smp_processor_id(); diff --git a/kernel/signal.c b/kernel/signal.c index dbd7fe0..bded651 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -637,12 +637,12 @@ static inline bool si_fromuser(const struct siginfo *info) /* * Bad permissions for sending the signal - * - the caller must hold at least the RCU read lock + * - the caller must hold the RCU read lock */ static int check_kill_permission(int sig, struct siginfo *info, struct task_struct *t) { - const struct cred *cred = current_cred(), *tcred; + const struct cred *cred, *tcred; struct pid *sid; int error; @@ -656,8 +656,10 @@ static int check_kill_permission(int sig, struct siginfo *info, if (error) return error; + cred = current_cred(); tcred = __task_cred(t); - if ((cred->euid ^ tcred->suid) && + if (!same_thread_group(current, t) && + (cred->euid ^ tcred->suid) && (cred->euid ^ tcred->uid) && (cred->uid ^ tcred->suid) && (cred->uid ^ tcred->uid) && @@ -1083,23 +1085,24 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t) /* * Nuke all other threads in the group. */ -void zap_other_threads(struct task_struct *p) +int zap_other_threads(struct task_struct *p) { - struct task_struct *t; + struct task_struct *t = p; + int count = 0; p->signal->group_stop_count = 0; - for (t = next_thread(p); t != p; t = next_thread(t)) { - /* - * Don't bother with already dead threads - */ + while_each_thread(p, t) { + count++; + + /* Don't bother with already dead threads */ if (t->exit_state) continue; - - /* SIGKILL will be handled before any pending SIGSTOP */ sigaddset(&t->pending.signal, SIGKILL); signal_wake_up(t, 1); } + + return count; } struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long *flags) @@ -1124,11 +1127,14 @@ struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long /* * send signal info to all the members of a group - * - the caller must hold the RCU read lock at least */ int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) { - int ret = check_kill_permission(sig, info, p); + int ret; + + rcu_read_lock(); + ret = check_kill_permission(sig, info, p); + rcu_read_unlock(); if (!ret && sig) ret = do_send_sig_info(sig, info, p, true); @@ -2735,3 +2741,43 @@ void __init signals_init(void) { sigqueue_cachep = KMEM_CACHE(sigqueue, SLAB_PANIC); } + +#ifdef CONFIG_KGDB_KDB +#include <linux/kdb.h> +/* + * kdb_send_sig_info - Allows kdb to send signals without exposing + * signal internals. This function checks if the required locks are + * available before calling the main signal code, to avoid kdb + * deadlocks. + */ +void +kdb_send_sig_info(struct task_struct *t, struct siginfo *info) +{ + static struct task_struct *kdb_prev_t; + int sig, new_t; + if (!spin_trylock(&t->sighand->siglock)) { + kdb_printf("Can't do kill command now.\n" + "The sigmask lock is held somewhere else in " + "kernel, try again later\n"); + return; + } + spin_unlock(&t->sighand->siglock); + new_t = kdb_prev_t != t; + kdb_prev_t = t; + if (t->state != TASK_RUNNING && new_t) { + kdb_printf("Process is not RUNNING, sending a signal from " + "kdb risks deadlock\n" + "on the run queue locks. " + "The signal has _not_ been sent.\n" + "Reissue the kill command if you want to risk " + "the deadlock.\n"); + return; + } + sig = info->si_signo; + if (send_sig_info(sig, info, t)) + kdb_printf("Fail to deliver Signal %d to process %d.\n", + sig, t->pid); + else + kdb_printf("Signal %d is sent to process %d.\n", sig, t->pid); +} +#endif /* CONFIG_KGDB_KDB */ diff --git a/kernel/slow-work.c b/kernel/slow-work.c index 7494bbf..7d3f4fa 100644 --- a/kernel/slow-work.c +++ b/kernel/slow-work.c @@ -637,7 +637,7 @@ int delayed_slow_work_enqueue(struct delayed_slow_work *dwork, goto cancelled; /* the timer holds a reference whilst it is pending */ - ret = work->ops->get_ref(work); + ret = slow_work_get_ref(work); if (ret < 0) goto cant_get_ref; diff --git a/kernel/slow-work.h b/kernel/slow-work.h index 321f3c5..a29ebd1 100644 --- a/kernel/slow-work.h +++ b/kernel/slow-work.h @@ -43,28 +43,28 @@ extern void slow_work_new_thread_desc(struct slow_work *, struct seq_file *); */ static inline void slow_work_set_thread_pid(int id, pid_t pid) { -#ifdef CONFIG_SLOW_WORK_PROC +#ifdef CONFIG_SLOW_WORK_DEBUG slow_work_pids[id] = pid; #endif } static inline void slow_work_mark_time(struct slow_work *work) { -#ifdef CONFIG_SLOW_WORK_PROC +#ifdef CONFIG_SLOW_WORK_DEBUG work->mark = CURRENT_TIME; #endif } static inline void slow_work_begin_exec(int id, struct slow_work *work) { -#ifdef CONFIG_SLOW_WORK_PROC +#ifdef CONFIG_SLOW_WORK_DEBUG slow_work_execs[id] = work; #endif } static inline void slow_work_end_exec(int id, struct slow_work *work) { -#ifdef CONFIG_SLOW_WORK_PROC +#ifdef CONFIG_SLOW_WORK_DEBUG write_lock(&slow_work_execs_lock); slow_work_execs[id] = NULL; write_unlock(&slow_work_execs_lock); diff --git a/kernel/smp.c b/kernel/smp.c index 9867b6b..75c970c 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -9,6 +9,7 @@ #include <linux/module.h> #include <linux/percpu.h> #include <linux/init.h> +#include <linux/gfp.h> #include <linux/smp.h> #include <linux/cpu.h> @@ -51,7 +52,7 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) case CPU_UP_PREPARE_FROZEN: if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL, cpu_to_node(cpu))) - return NOTIFY_BAD; + return notifier_from_errno(-ENOMEM); break; #ifdef CONFIG_HOTPLUG_CPU diff --git a/kernel/softirq.c b/kernel/softirq.c index 7c1a67e..07b4f1b 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -716,7 +716,7 @@ static int run_ksoftirqd(void * __bind_cpu) preempt_enable_no_resched(); cond_resched(); preempt_disable(); - rcu_sched_qs((long)__bind_cpu); + rcu_note_context_switch((long)__bind_cpu); } preempt_enable(); set_current_state(TASK_INTERRUPTIBLE); @@ -808,7 +808,7 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb, p = kthread_create(run_ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu); if (IS_ERR(p)) { printk("ksoftirqd for %i failed\n", hotcpu); - return NOTIFY_BAD; + return notifier_from_errno(PTR_ERR(p)); } kthread_bind(p, hotcpu); per_cpu(ksoftirqd, hotcpu) = p; @@ -850,7 +850,7 @@ static __init int spawn_ksoftirqd(void) void *cpu = (void *)(long)smp_processor_id(); int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); - BUG_ON(err == NOTIFY_BAD); + BUG_ON(err != NOTIFY_OK); cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); register_cpu_notifier(&cpu_nfb); return 0; diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 0d4c789..4b493f6 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -155,11 +155,11 @@ void softlockup_tick(void) * Wake up the high-prio watchdog task twice per * threshold timespan. */ - if (now > touch_ts + softlockup_thresh/2) + if (time_after(now - softlockup_thresh/2, touch_ts)) wake_up_process(per_cpu(softlockup_watchdog, this_cpu)); /* Warn about unreasonable delays: */ - if (now <= (touch_ts + softlockup_thresh)) + if (time_before_eq(now - softlockup_thresh, touch_ts)) return; per_cpu(softlockup_print_ts, this_cpu) = touch_ts; diff --git a/kernel/srcu.c b/kernel/srcu.c index bde4295..2980da3 100644 --- a/kernel/srcu.c +++ b/kernel/srcu.c @@ -30,7 +30,6 @@ #include <linux/preempt.h> #include <linux/rcupdate.h> #include <linux/sched.h> -#include <linux/slab.h> #include <linux/smp.h> #include <linux/srcu.h> diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 9bb9fb1..70f8d90 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -1,17 +1,384 @@ -/* Copyright 2008, 2005 Rusty Russell rusty@rustcorp.com.au IBM Corporation. - * GPL v2 and any later version. +/* + * kernel/stop_machine.c + * + * Copyright (C) 2008, 2005 IBM Corporation. + * Copyright (C) 2008, 2005 Rusty Russell rusty@rustcorp.com.au + * Copyright (C) 2010 SUSE Linux Products GmbH + * Copyright (C) 2010 Tejun Heo <tj@kernel.org> + * + * This file is released under the GPLv2 and any later version. */ +#include <linux/completion.h> #include <linux/cpu.h> -#include <linux/err.h> +#include <linux/init.h> #include <linux/kthread.h> #include <linux/module.h> +#include <linux/percpu.h> #include <linux/sched.h> #include <linux/stop_machine.h> -#include <linux/syscalls.h> #include <linux/interrupt.h> +#include <linux/kallsyms.h> #include <asm/atomic.h> -#include <asm/uaccess.h> + +/* + * Structure to determine completion condition and record errors. May + * be shared by works on different cpus. + */ +struct cpu_stop_done { + atomic_t nr_todo; /* nr left to execute */ + bool executed; /* actually executed? */ + int ret; /* collected return value */ + struct completion completion; /* fired if nr_todo reaches 0 */ +}; + +/* the actual stopper, one per every possible cpu, enabled on online cpus */ +struct cpu_stopper { + spinlock_t lock; + struct list_head works; /* list of pending works */ + struct task_struct *thread; /* stopper thread */ + bool enabled; /* is this stopper enabled? */ +}; + +static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper); + +static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo) +{ + memset(done, 0, sizeof(*done)); + atomic_set(&done->nr_todo, nr_todo); + init_completion(&done->completion); +} + +/* signal completion unless @done is NULL */ +static void cpu_stop_signal_done(struct cpu_stop_done *done, bool executed) +{ + if (done) { + if (executed) + done->executed = true; + if (atomic_dec_and_test(&done->nr_todo)) + complete(&done->completion); + } +} + +/* queue @work to @stopper. if offline, @work is completed immediately */ +static void cpu_stop_queue_work(struct cpu_stopper *stopper, + struct cpu_stop_work *work) +{ + unsigned long flags; + + spin_lock_irqsave(&stopper->lock, flags); + + if (stopper->enabled) { + list_add_tail(&work->list, &stopper->works); + wake_up_process(stopper->thread); + } else + cpu_stop_signal_done(work->done, false); + + spin_unlock_irqrestore(&stopper->lock, flags); +} + +/** + * stop_one_cpu - stop a cpu + * @cpu: cpu to stop + * @fn: function to execute + * @arg: argument to @fn + * + * Execute @fn(@arg) on @cpu. @fn is run in a process context with + * the highest priority preempting any task on the cpu and + * monopolizing it. This function returns after the execution is + * complete. + * + * This function doesn't guarantee @cpu stays online till @fn + * completes. If @cpu goes down in the middle, execution may happen + * partially or fully on different cpus. @fn should either be ready + * for that or the caller should ensure that @cpu stays online until + * this function completes. + * + * CONTEXT: + * Might sleep. + * + * RETURNS: + * -ENOENT if @fn(@arg) was not executed because @cpu was offline; + * otherwise, the return value of @fn. + */ +int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg) +{ + struct cpu_stop_done done; + struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done }; + + cpu_stop_init_done(&done, 1); + cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), &work); + wait_for_completion(&done.completion); + return done.executed ? done.ret : -ENOENT; +} + +/** + * stop_one_cpu_nowait - stop a cpu but don't wait for completion + * @cpu: cpu to stop + * @fn: function to execute + * @arg: argument to @fn + * + * Similar to stop_one_cpu() but doesn't wait for completion. The + * caller is responsible for ensuring @work_buf is currently unused + * and will remain untouched until stopper starts executing @fn. + * + * CONTEXT: + * Don't care. + */ +void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg, + struct cpu_stop_work *work_buf) +{ + *work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, }; + cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), work_buf); +} + +/* static data for stop_cpus */ +static DEFINE_MUTEX(stop_cpus_mutex); +static DEFINE_PER_CPU(struct cpu_stop_work, stop_cpus_work); + +int __stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg) +{ + struct cpu_stop_work *work; + struct cpu_stop_done done; + unsigned int cpu; + + /* initialize works and done */ + for_each_cpu(cpu, cpumask) { + work = &per_cpu(stop_cpus_work, cpu); + work->fn = fn; + work->arg = arg; + work->done = &done; + } + cpu_stop_init_done(&done, cpumask_weight(cpumask)); + + /* + * Disable preemption while queueing to avoid getting + * preempted by a stopper which might wait for other stoppers + * to enter @fn which can lead to deadlock. + */ + preempt_disable(); + for_each_cpu(cpu, cpumask) + cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), + &per_cpu(stop_cpus_work, cpu)); + preempt_enable(); + + wait_for_completion(&done.completion); + return done.executed ? done.ret : -ENOENT; +} + +/** + * stop_cpus - stop multiple cpus + * @cpumask: cpus to stop + * @fn: function to execute + * @arg: argument to @fn + * + * Execute @fn(@arg) on online cpus in @cpumask. On each target cpu, + * @fn is run in a process context with the highest priority + * preempting any task on the cpu and monopolizing it. This function + * returns after all executions are complete. + * + * This function doesn't guarantee the cpus in @cpumask stay online + * till @fn completes. If some cpus go down in the middle, execution + * on the cpu may happen partially or fully on different cpus. @fn + * should either be ready for that or the caller should ensure that + * the cpus stay online until this function completes. + * + * All stop_cpus() calls are serialized making it safe for @fn to wait + * for all cpus to start executing it. + * + * CONTEXT: + * Might sleep. + * + * RETURNS: + * -ENOENT if @fn(@arg) was not executed at all because all cpus in + * @cpumask were offline; otherwise, 0 if all executions of @fn + * returned 0, any non zero return value if any returned non zero. + */ +int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg) +{ + int ret; + + /* static works are used, process one request at a time */ + mutex_lock(&stop_cpus_mutex); + ret = __stop_cpus(cpumask, fn, arg); + mutex_unlock(&stop_cpus_mutex); + return ret; +} + +/** + * try_stop_cpus - try to stop multiple cpus + * @cpumask: cpus to stop + * @fn: function to execute + * @arg: argument to @fn + * + * Identical to stop_cpus() except that it fails with -EAGAIN if + * someone else is already using the facility. + * + * CONTEXT: + * Might sleep. + * + * RETURNS: + * -EAGAIN if someone else is already stopping cpus, -ENOENT if + * @fn(@arg) was not executed at all because all cpus in @cpumask were + * offline; otherwise, 0 if all executions of @fn returned 0, any non + * zero return value if any returned non zero. + */ +int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg) +{ + int ret; + + /* static works are used, process one request at a time */ + if (!mutex_trylock(&stop_cpus_mutex)) + return -EAGAIN; + ret = __stop_cpus(cpumask, fn, arg); + mutex_unlock(&stop_cpus_mutex); + return ret; +} + +static int cpu_stopper_thread(void *data) +{ + struct cpu_stopper *stopper = data; + struct cpu_stop_work *work; + int ret; + +repeat: + set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */ + + if (kthread_should_stop()) { + __set_current_state(TASK_RUNNING); + return 0; + } + + work = NULL; + spin_lock_irq(&stopper->lock); + if (!list_empty(&stopper->works)) { + work = list_first_entry(&stopper->works, + struct cpu_stop_work, list); + list_del_init(&work->list); + } + spin_unlock_irq(&stopper->lock); + + if (work) { + cpu_stop_fn_t fn = work->fn; + void *arg = work->arg; + struct cpu_stop_done *done = work->done; + char ksym_buf[KSYM_NAME_LEN]; + + __set_current_state(TASK_RUNNING); + + /* cpu stop callbacks are not allowed to sleep */ + preempt_disable(); + + ret = fn(arg); + if (ret) + done->ret = ret; + + /* restore preemption and check it's still balanced */ + preempt_enable(); + WARN_ONCE(preempt_count(), + "cpu_stop: %s(%p) leaked preempt count\n", + kallsyms_lookup((unsigned long)fn, NULL, NULL, NULL, + ksym_buf), arg); + + cpu_stop_signal_done(done, true); + } else + schedule(); + + goto repeat; +} + +/* manage stopper for a cpu, mostly lifted from sched migration thread mgmt */ +static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; + unsigned int cpu = (unsigned long)hcpu; + struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); + struct task_struct *p; + + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_UP_PREPARE: + BUG_ON(stopper->thread || stopper->enabled || + !list_empty(&stopper->works)); + p = kthread_create(cpu_stopper_thread, stopper, "migration/%d", + cpu); + if (IS_ERR(p)) + return NOTIFY_BAD; + sched_setscheduler_nocheck(p, SCHED_FIFO, ¶m); + get_task_struct(p); + stopper->thread = p; + break; + + case CPU_ONLINE: + kthread_bind(stopper->thread, cpu); + /* strictly unnecessary, as first user will wake it */ + wake_up_process(stopper->thread); + /* mark enabled */ + spin_lock_irq(&stopper->lock); + stopper->enabled = true; + spin_unlock_irq(&stopper->lock); + break; + +#ifdef CONFIG_HOTPLUG_CPU + case CPU_UP_CANCELED: + case CPU_POST_DEAD: + { + struct cpu_stop_work *work; + + /* kill the stopper */ + kthread_stop(stopper->thread); + /* drain remaining works */ + spin_lock_irq(&stopper->lock); + list_for_each_entry(work, &stopper->works, list) + cpu_stop_signal_done(work->done, false); + stopper->enabled = false; + spin_unlock_irq(&stopper->lock); + /* release the stopper */ + put_task_struct(stopper->thread); + stopper->thread = NULL; + break; + } +#endif + } + + return NOTIFY_OK; +} + +/* + * Give it a higher priority so that cpu stopper is available to other + * cpu notifiers. It currently shares the same priority as sched + * migration_notifier. + */ +static struct notifier_block __cpuinitdata cpu_stop_cpu_notifier = { + .notifier_call = cpu_stop_cpu_callback, + .priority = 10, +}; + +static int __init cpu_stop_init(void) +{ + void *bcpu = (void *)(long)smp_processor_id(); + unsigned int cpu; + int err; + + for_each_possible_cpu(cpu) { + struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); + + spin_lock_init(&stopper->lock); + INIT_LIST_HEAD(&stopper->works); + } + + /* start one for the boot cpu */ + err = cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_UP_PREPARE, + bcpu); + BUG_ON(err == NOTIFY_BAD); + cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_ONLINE, bcpu); + register_cpu_notifier(&cpu_stop_cpu_notifier); + + return 0; +} +early_initcall(cpu_stop_init); + +#ifdef CONFIG_STOP_MACHINE /* This controls the threads on each CPU. */ enum stopmachine_state { @@ -26,174 +393,94 @@ enum stopmachine_state { /* Exit */ STOPMACHINE_EXIT, }; -static enum stopmachine_state state; struct stop_machine_data { - int (*fn)(void *); - void *data; - int fnret; + int (*fn)(void *); + void *data; + /* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */ + unsigned int num_threads; + const struct cpumask *active_cpus; + + enum stopmachine_state state; + atomic_t thread_ack; }; -/* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */ -static unsigned int num_threads; -static atomic_t thread_ack; -static DEFINE_MUTEX(lock); -/* setup_lock protects refcount, stop_machine_wq and stop_machine_work. */ -static DEFINE_MUTEX(setup_lock); -/* Users of stop_machine. */ -static int refcount; -static struct workqueue_struct *stop_machine_wq; -static struct stop_machine_data active, idle; -static const struct cpumask *active_cpus; -static void __percpu *stop_machine_work; - -static void set_state(enum stopmachine_state newstate) +static void set_state(struct stop_machine_data *smdata, + enum stopmachine_state newstate) { /* Reset ack counter. */ - atomic_set(&thread_ack, num_threads); + atomic_set(&smdata->thread_ack, smdata->num_threads); smp_wmb(); - state = newstate; + smdata->state = newstate; } /* Last one to ack a state moves to the next state. */ -static void ack_state(void) +static void ack_state(struct stop_machine_data *smdata) { - if (atomic_dec_and_test(&thread_ack)) - set_state(state + 1); + if (atomic_dec_and_test(&smdata->thread_ack)) + set_state(smdata, smdata->state + 1); } -/* This is the actual function which stops the CPU. It runs - * in the context of a dedicated stopmachine workqueue. */ -static void stop_cpu(struct work_struct *unused) +/* This is the cpu_stop function which stops the CPU. */ +static int stop_machine_cpu_stop(void *data) { + struct stop_machine_data *smdata = data; enum stopmachine_state curstate = STOPMACHINE_NONE; - struct stop_machine_data *smdata = &idle; - int cpu = smp_processor_id(); - int err; + int cpu = smp_processor_id(), err = 0; + bool is_active; + + if (!smdata->active_cpus) + is_active = cpu == cpumask_first(cpu_online_mask); + else + is_active = cpumask_test_cpu(cpu, smdata->active_cpus); - if (!active_cpus) { - if (cpu == cpumask_first(cpu_online_mask)) - smdata = &active; - } else { - if (cpumask_test_cpu(cpu, active_cpus)) - smdata = &active; - } /* Simple state machine */ do { /* Chill out and ensure we re-read stopmachine_state. */ cpu_relax(); - if (state != curstate) { - curstate = state; + if (smdata->state != curstate) { + curstate = smdata->state; switch (curstate) { case STOPMACHINE_DISABLE_IRQ: local_irq_disable(); hard_irq_disable(); break; case STOPMACHINE_RUN: - /* On multiple CPUs only a single error code - * is needed to tell that something failed. */ - err = smdata->fn(smdata->data); - if (err) - smdata->fnret = err; + if (is_active) + err = smdata->fn(smdata->data); break; default: break; } - ack_state(); + ack_state(smdata); } } while (curstate != STOPMACHINE_EXIT); local_irq_enable(); + return err; } -/* Callback for CPUs which aren't supposed to do anything. */ -static int chill(void *unused) -{ - return 0; -} - -int stop_machine_create(void) -{ - mutex_lock(&setup_lock); - if (refcount) - goto done; - stop_machine_wq = create_rt_workqueue("kstop"); - if (!stop_machine_wq) - goto err_out; - stop_machine_work = alloc_percpu(struct work_struct); - if (!stop_machine_work) - goto err_out; -done: - refcount++; - mutex_unlock(&setup_lock); - return 0; - -err_out: - if (stop_machine_wq) - destroy_workqueue(stop_machine_wq); - mutex_unlock(&setup_lock); - return -ENOMEM; -} -EXPORT_SYMBOL_GPL(stop_machine_create); - -void stop_machine_destroy(void) -{ - mutex_lock(&setup_lock); - refcount--; - if (refcount) - goto done; - destroy_workqueue(stop_machine_wq); - free_percpu(stop_machine_work); -done: - mutex_unlock(&setup_lock); -} -EXPORT_SYMBOL_GPL(stop_machine_destroy); - int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) { - struct work_struct *sm_work; - int i, ret; - - /* Set up initial state. */ - mutex_lock(&lock); - num_threads = num_online_cpus(); - active_cpus = cpus; - active.fn = fn; - active.data = data; - active.fnret = 0; - idle.fn = chill; - idle.data = NULL; - - set_state(STOPMACHINE_PREPARE); - - /* Schedule the stop_cpu work on all cpus: hold this CPU so one - * doesn't hit this CPU until we're ready. */ - get_cpu(); - for_each_online_cpu(i) { - sm_work = per_cpu_ptr(stop_machine_work, i); - INIT_WORK(sm_work, stop_cpu); - queue_work_on(i, stop_machine_wq, sm_work); - } - /* This will release the thread on our CPU. */ - put_cpu(); - flush_workqueue(stop_machine_wq); - ret = active.fnret; - mutex_unlock(&lock); - return ret; + struct stop_machine_data smdata = { .fn = fn, .data = data, + .num_threads = num_online_cpus(), + .active_cpus = cpus }; + + /* Set the initial state and stop all online cpus. */ + set_state(&smdata, STOPMACHINE_PREPARE); + return stop_cpus(cpu_online_mask, stop_machine_cpu_stop, &smdata); } int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) { int ret; - ret = stop_machine_create(); - if (ret) - return ret; /* No CPUs can come up or down during this. */ get_online_cpus(); ret = __stop_machine(fn, data, cpus); put_online_cpus(); - stop_machine_destroy(); return ret; } EXPORT_SYMBOL_GPL(stop_machine); + +#endif /* CONFIG_STOP_MACHINE */ diff --git a/kernel/sys.c b/kernel/sys.c index 8298878f..e83ddbb 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -36,6 +36,7 @@ #include <linux/personality.h> #include <linux/ptrace.h> #include <linux/fs_struct.h> +#include <linux/gfp.h> #include <linux/compat.h> #include <linux/syscalls.h> @@ -491,10 +492,6 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid) return -ENOMEM; old = current_cred(); - retval = security_task_setgid(rgid, egid, (gid_t)-1, LSM_SETID_RE); - if (retval) - goto error; - retval = -EPERM; if (rgid != (gid_t) -1) { if (old->gid == rgid || @@ -542,10 +539,6 @@ SYSCALL_DEFINE1(setgid, gid_t, gid) return -ENOMEM; old = current_cred(); - retval = security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_ID); - if (retval) - goto error; - retval = -EPERM; if (capable(CAP_SETGID)) new->gid = new->egid = new->sgid = new->fsgid = gid; @@ -609,10 +602,6 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) return -ENOMEM; old = current_cred(); - retval = security_task_setuid(ruid, euid, (uid_t)-1, LSM_SETID_RE); - if (retval) - goto error; - retval = -EPERM; if (ruid != (uid_t) -1) { new->uid = ruid; @@ -674,10 +663,6 @@ SYSCALL_DEFINE1(setuid, uid_t, uid) return -ENOMEM; old = current_cred(); - retval = security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_ID); - if (retval) - goto error; - retval = -EPERM; if (capable(CAP_SETUID)) { new->suid = new->uid = uid; @@ -718,9 +703,6 @@ SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid) if (!new) return -ENOMEM; - retval = security_task_setuid(ruid, euid, suid, LSM_SETID_RES); - if (retval) - goto error; old = current_cred(); retval = -EPERM; @@ -787,10 +769,6 @@ SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid) return -ENOMEM; old = current_cred(); - retval = security_task_setgid(rgid, egid, sgid, LSM_SETID_RES); - if (retval) - goto error; - retval = -EPERM; if (!capable(CAP_SETGID)) { if (rgid != (gid_t) -1 && rgid != old->gid && @@ -850,9 +828,6 @@ SYSCALL_DEFINE1(setfsuid, uid_t, uid) old = current_cred(); old_fsuid = old->fsuid; - if (security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS) < 0) - goto error; - if (uid == old->uid || uid == old->euid || uid == old->suid || uid == old->fsuid || capable(CAP_SETUID)) { @@ -863,7 +838,6 @@ SYSCALL_DEFINE1(setfsuid, uid_t, uid) } } -error: abort_creds(new); return old_fsuid; @@ -887,9 +861,6 @@ SYSCALL_DEFINE1(setfsgid, gid_t, gid) old = current_cred(); old_fsgid = old->fsgid; - if (security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_FS)) - goto error; - if (gid == old->gid || gid == old->egid || gid == old->sgid || gid == old->fsgid || capable(CAP_SETGID)) { @@ -899,7 +870,6 @@ SYSCALL_DEFINE1(setfsgid, gid_t, gid) } } -error: abort_creds(new); return old_fsgid; @@ -1117,7 +1087,7 @@ DECLARE_RWSEM(uts_sem); #ifdef COMPAT_UTS_MACHINE #define override_architecture(name) \ - (current->personality == PER_LINUX32 && \ + (personality(current->personality) == PER_LINUX32 && \ copy_to_user(name->machine, COMPAT_UTS_MACHINE, \ sizeof(COMPAT_UTS_MACHINE))) #else @@ -1662,9 +1632,9 @@ SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep, char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff"; -static void argv_cleanup(char **argv, char **envp) +static void argv_cleanup(struct subprocess_info *info) { - argv_free(argv); + argv_free(info->argv); } /** @@ -1698,7 +1668,7 @@ int orderly_poweroff(bool force) goto out; } - call_usermodehelper_setcleanup(info, argv_cleanup); + call_usermodehelper_setfns(info, NULL, argv_cleanup, NULL); ret = call_usermodehelper_exec(info, UMH_NO_WAIT); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8686b0f..d24f761f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -37,6 +37,7 @@ #include <linux/highuid.h> #include <linux/writeback.h> #include <linux/ratelimit.h> +#include <linux/compaction.h> #include <linux/hugetlb.h> #include <linux/initrd.h> #include <linux/key.h> @@ -52,6 +53,7 @@ #include <linux/slow-work.h> #include <linux/perf_event.h> #include <linux/kprobes.h> +#include <linux/pipe_fs_i.h> #include <asm/uaccess.h> #include <asm/processor.h> @@ -163,6 +165,27 @@ static int proc_taint(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); #endif +#ifdef CONFIG_MAGIC_SYSRQ +static int __sysrq_enabled; /* Note: sysrq code ises it's own private copy */ + +static int sysrq_sysctl_handler(ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int error; + + error = proc_dointvec(table, write, buffer, lenp, ppos); + if (error) + return error; + + if (write) + sysrq_toggle_support(__sysrq_enabled); + + return 0; +} + +#endif + static struct ctl_table root_table[]; static struct ctl_table_root sysctl_table_root; static struct ctl_table_header root_table_header = { @@ -240,6 +263,11 @@ static int min_sched_shares_ratelimit = 100000; /* 100 usec */ static int max_sched_shares_ratelimit = NSEC_PER_SEC; /* 1 second */ #endif +#ifdef CONFIG_COMPACTION +static int min_extfrag_threshold; +static int max_extfrag_threshold = 1000; +#endif + static struct ctl_table kern_table[] = { { .procname = "sched_child_runs_first", @@ -567,7 +595,7 @@ static struct ctl_table kern_table[] = { .data = &__sysrq_enabled, .maxlen = sizeof (int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = sysrq_sysctl_handler, }, #endif #ifdef CONFIG_PROC_SYSCTL @@ -621,7 +649,7 @@ static struct ctl_table kern_table[] = { #endif { .procname = "userprocess_debug", - .data = &sysctl_userprocess_debug, + .data = &show_unhandled_signals, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, @@ -1099,6 +1127,25 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = drop_caches_sysctl_handler, }, +#ifdef CONFIG_COMPACTION + { + .procname = "compact_memory", + .data = &sysctl_compact_memory, + .maxlen = sizeof(int), + .mode = 0200, + .proc_handler = sysctl_compaction_handler, + }, + { + .procname = "extfrag_threshold", + .data = &sysctl_extfrag_threshold, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = sysctl_extfrag_handler, + .extra1 = &min_extfrag_threshold, + .extra2 = &max_extfrag_threshold, + }, + +#endif /* CONFIG_COMPACTION */ { .procname = "min_free_kbytes", .data = &min_free_kbytes, @@ -1423,6 +1470,14 @@ static struct ctl_table fs_table[] = { .child = binfmt_misc_table, }, #endif + { + .procname = "pipe-max-size", + .data = &pipe_max_size, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &pipe_proc_fn, + .extra1 = &pipe_min_size, + }, /* * NOTE: do not add new entries to this table unless you have read * Documentation/sysctl/ctl_unnumbered.txt @@ -1431,7 +1486,8 @@ static struct ctl_table fs_table[] = { }; static struct ctl_table debug_table[] = { -#if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_SPARC) +#if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_SPARC) || \ + defined(CONFIG_S390) { .procname = "exception-trace", .data = &show_unhandled_signals, @@ -2040,8 +2096,132 @@ int proc_dostring(struct ctl_table *table, int write, buffer, lenp, ppos); } +static size_t proc_skip_spaces(char **buf) +{ + size_t ret; + char *tmp = skip_spaces(*buf); + ret = tmp - *buf; + *buf = tmp; + return ret; +} + +static void proc_skip_char(char **buf, size_t *size, const char v) +{ + while (*size) { + if (**buf != v) + break; + (*size)--; + (*buf)++; + } +} + +#define TMPBUFLEN 22 +/** + * proc_get_long - reads an ASCII formatted integer from a user buffer + * + * @buf: a kernel buffer + * @size: size of the kernel buffer + * @val: this is where the number will be stored + * @neg: set to %TRUE if number is negative + * @perm_tr: a vector which contains the allowed trailers + * @perm_tr_len: size of the perm_tr vector + * @tr: pointer to store the trailer character + * + * In case of success %0 is returned and @buf and @size are updated with + * the amount of bytes read. If @tr is non-NULL and a trailing + * character exists (size is non-zero after returning from this + * function), @tr is updated with the trailing character. + */ +static int proc_get_long(char **buf, size_t *size, + unsigned long *val, bool *neg, + const char *perm_tr, unsigned perm_tr_len, char *tr) +{ + int len; + char *p, tmp[TMPBUFLEN]; + + if (!*size) + return -EINVAL; + + len = *size; + if (len > TMPBUFLEN - 1) + len = TMPBUFLEN - 1; + + memcpy(tmp, *buf, len); + + tmp[len] = 0; + p = tmp; + if (*p == '-' && *size > 1) { + *neg = true; + p++; + } else + *neg = false; + if (!isdigit(*p)) + return -EINVAL; + + *val = simple_strtoul(p, &p, 0); + + len = p - tmp; + + /* We don't know if the next char is whitespace thus we may accept + * invalid integers (e.g. 1234...a) or two integers instead of one + * (e.g. 123...1). So lets not allow such large numbers. */ + if (len == TMPBUFLEN - 1) + return -EINVAL; + + if (len < *size && perm_tr_len && !memchr(perm_tr, *p, perm_tr_len)) + return -EINVAL; + + if (tr && (len < *size)) + *tr = *p; + + *buf += len; + *size -= len; + + return 0; +} + +/** + * proc_put_long - converts an integer to a decimal ASCII formatted string + * + * @buf: the user buffer + * @size: the size of the user buffer + * @val: the integer to be converted + * @neg: sign of the number, %TRUE for negative + * + * In case of success %0 is returned and @buf and @size are updated with + * the amount of bytes written. + */ +static int proc_put_long(void __user **buf, size_t *size, unsigned long val, + bool neg) +{ + int len; + char tmp[TMPBUFLEN], *p = tmp; + + sprintf(p, "%s%lu", neg ? "-" : "", val); + len = strlen(tmp); + if (len > *size) + len = *size; + if (copy_to_user(*buf, tmp, len)) + return -EFAULT; + *size -= len; + *buf += len; + return 0; +} +#undef TMPBUFLEN + +static int proc_put_char(void __user **buf, size_t *size, char c) +{ + if (*size) { + char __user **buffer = (char __user **)buf; + if (put_user(c, *buffer)) + return -EFAULT; + (*size)--, (*buffer)++; + *buf = *buffer; + } + return 0; +} -static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp, +static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp, int *valp, int write, void *data) { @@ -2050,33 +2230,31 @@ static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp, } else { int val = *valp; if (val < 0) { - *negp = -1; + *negp = true; *lvalp = (unsigned long)-val; } else { - *negp = 0; + *negp = false; *lvalp = (unsigned long)val; } } return 0; } +static const char proc_wspace_sep[] = { ' ', '\t', '\n' }; + static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos, - int (*conv)(int *negp, unsigned long *lvalp, int *valp, + int (*conv)(bool *negp, unsigned long *lvalp, int *valp, int write, void *data), void *data) { -#define TMPBUFLEN 21 - int *i, vleft, first = 1, neg; - unsigned long lval; - size_t left, len; + int *i, vleft, first = 1, err = 0; + unsigned long page = 0; + size_t left; + char *kbuf; - char buf[TMPBUFLEN], *p; - char __user *s = buffer; - - if (!tbl_data || !table->maxlen || !*lenp || - (*ppos && !write)) { + if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) { *lenp = 0; return 0; } @@ -2088,89 +2266,71 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, if (!conv) conv = do_proc_dointvec_conv; + if (write) { + if (left > PAGE_SIZE - 1) + left = PAGE_SIZE - 1; + page = __get_free_page(GFP_TEMPORARY); + kbuf = (char *) page; + if (!kbuf) + return -ENOMEM; + if (copy_from_user(kbuf, buffer, left)) { + err = -EFAULT; + goto free; + } + kbuf[left] = 0; + } + for (; left && vleft--; i++, first=0) { + unsigned long lval; + bool neg; + if (write) { - while (left) { - char c; - if (get_user(c, s)) - return -EFAULT; - if (!isspace(c)) - break; - left--; - s++; - } + left -= proc_skip_spaces(&kbuf); + if (!left) break; - neg = 0; - len = left; - if (len > sizeof(buf) - 1) - len = sizeof(buf) - 1; - if (copy_from_user(buf, s, len)) - return -EFAULT; - buf[len] = 0; - p = buf; - if (*p == '-' && left > 1) { - neg = 1; - p++; - } - if (*p < '0' || *p > '9') + err = proc_get_long(&kbuf, &left, &lval, &neg, + proc_wspace_sep, + sizeof(proc_wspace_sep), NULL); + if (err) break; - - lval = simple_strtoul(p, &p, 0); - - len = p-buf; - if ((len < left) && *p && !isspace(*p)) - break; - s += len; - left -= len; - - if (conv(&neg, &lval, i, 1, data)) + if (conv(&neg, &lval, i, 1, data)) { + err = -EINVAL; break; + } } else { - p = buf; + if (conv(&neg, &lval, i, 0, data)) { + err = -EINVAL; + break; + } if (!first) - *p++ = '\t'; - - if (conv(&neg, &lval, i, 0, data)) + err = proc_put_char(&buffer, &left, '\t'); + if (err) + break; + err = proc_put_long(&buffer, &left, lval, neg); + if (err) break; - - sprintf(p, "%s%lu", neg ? "-" : "", lval); - len = strlen(buf); - if (len > left) - len = left; - if(copy_to_user(s, buf, len)) - return -EFAULT; - left -= len; - s += len; } } - if (!write && !first && left) { - if(put_user('\n', s)) - return -EFAULT; - left--, s++; - } + if (!write && !first && left && !err) + err = proc_put_char(&buffer, &left, '\n'); + if (write && !err && left) + left -= proc_skip_spaces(&kbuf); +free: if (write) { - while (left) { - char c; - if (get_user(c, s++)) - return -EFAULT; - if (!isspace(c)) - break; - left--; - } + free_page(page); + if (first) + return err ? : -EINVAL; } - if (write && first) - return -EINVAL; *lenp -= left; *ppos += *lenp; - return 0; -#undef TMPBUFLEN + return err; } static int do_proc_dointvec(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos, - int (*conv)(int *negp, unsigned long *lvalp, int *valp, + int (*conv)(bool *negp, unsigned long *lvalp, int *valp, int write, void *data), void *data) { @@ -2238,8 +2398,8 @@ struct do_proc_dointvec_minmax_conv_param { int *max; }; -static int do_proc_dointvec_minmax_conv(int *negp, unsigned long *lvalp, - int *valp, +static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp, + int *valp, int write, void *data) { struct do_proc_dointvec_minmax_conv_param *param = data; @@ -2252,10 +2412,10 @@ static int do_proc_dointvec_minmax_conv(int *negp, unsigned long *lvalp, } else { int val = *valp; if (val < 0) { - *negp = -1; + *negp = true; *lvalp = (unsigned long)-val; } else { - *negp = 0; + *negp = false; *lvalp = (unsigned long)val; } } @@ -2295,102 +2455,78 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int unsigned long convmul, unsigned long convdiv) { -#define TMPBUFLEN 21 - unsigned long *i, *min, *max, val; - int vleft, first=1, neg; - size_t len, left; - char buf[TMPBUFLEN], *p; - char __user *s = buffer; - - if (!data || !table->maxlen || !*lenp || - (*ppos && !write)) { + unsigned long *i, *min, *max; + int vleft, first = 1, err = 0; + unsigned long page = 0; + size_t left; + char *kbuf; + + if (!data || !table->maxlen || !*lenp || (*ppos && !write)) { *lenp = 0; return 0; } - + i = (unsigned long *) data; min = (unsigned long *) table->extra1; max = (unsigned long *) table->extra2; vleft = table->maxlen / sizeof(unsigned long); left = *lenp; - + + if (write) { + if (left > PAGE_SIZE - 1) + left = PAGE_SIZE - 1; + page = __get_free_page(GFP_TEMPORARY); + kbuf = (char *) page; + if (!kbuf) + return -ENOMEM; + if (copy_from_user(kbuf, buffer, left)) { + err = -EFAULT; + goto free; + } + kbuf[left] = 0; + } + for (; left && vleft--; i++, min++, max++, first=0) { + unsigned long val; + if (write) { - while (left) { - char c; - if (get_user(c, s)) - return -EFAULT; - if (!isspace(c)) - break; - left--; - s++; - } - if (!left) - break; - neg = 0; - len = left; - if (len > TMPBUFLEN-1) - len = TMPBUFLEN-1; - if (copy_from_user(buf, s, len)) - return -EFAULT; - buf[len] = 0; - p = buf; - if (*p == '-' && left > 1) { - neg = 1; - p++; - } - if (*p < '0' || *p > '9') - break; - val = simple_strtoul(p, &p, 0) * convmul / convdiv ; - len = p-buf; - if ((len < left) && *p && !isspace(*p)) + bool neg; + + left -= proc_skip_spaces(&kbuf); + + err = proc_get_long(&kbuf, &left, &val, &neg, + proc_wspace_sep, + sizeof(proc_wspace_sep), NULL); + if (err) break; if (neg) - val = -val; - s += len; - left -= len; - - if(neg) continue; if ((min && val < *min) || (max && val > *max)) continue; *i = val; } else { - p = buf; + val = convdiv * (*i) / convmul; if (!first) - *p++ = '\t'; - sprintf(p, "%lu", convdiv * (*i) / convmul); - len = strlen(buf); - if (len > left) - len = left; - if(copy_to_user(s, buf, len)) - return -EFAULT; - left -= len; - s += len; + err = proc_put_char(&buffer, &left, '\t'); + err = proc_put_long(&buffer, &left, val, false); + if (err) + break; } } - if (!write && !first && left) { - if(put_user('\n', s)) - return -EFAULT; - left--, s++; - } + if (!write && !first && left && !err) + err = proc_put_char(&buffer, &left, '\n'); + if (write && !err) + left -= proc_skip_spaces(&kbuf); +free: if (write) { - while (left) { - char c; - if (get_user(c, s++)) - return -EFAULT; - if (!isspace(c)) - break; - left--; - } + free_page(page); + if (first) + return err ? : -EINVAL; } - if (write && first) - return -EINVAL; *lenp -= left; *ppos += *lenp; - return 0; -#undef TMPBUFLEN + return err; } static int do_proc_doulongvec_minmax(struct ctl_table *table, int write, @@ -2451,7 +2587,7 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, } -static int do_proc_dointvec_jiffies_conv(int *negp, unsigned long *lvalp, +static int do_proc_dointvec_jiffies_conv(bool *negp, unsigned long *lvalp, int *valp, int write, void *data) { @@ -2463,10 +2599,10 @@ static int do_proc_dointvec_jiffies_conv(int *negp, unsigned long *lvalp, int val = *valp; unsigned long lval; if (val < 0) { - *negp = -1; + *negp = true; lval = (unsigned long)-val; } else { - *negp = 0; + *negp = false; lval = (unsigned long)val; } *lvalp = lval / HZ; @@ -2474,7 +2610,7 @@ static int do_proc_dointvec_jiffies_conv(int *negp, unsigned long *lvalp, return 0; } -static int do_proc_dointvec_userhz_jiffies_conv(int *negp, unsigned long *lvalp, +static int do_proc_dointvec_userhz_jiffies_conv(bool *negp, unsigned long *lvalp, int *valp, int write, void *data) { @@ -2486,10 +2622,10 @@ static int do_proc_dointvec_userhz_jiffies_conv(int *negp, unsigned long *lvalp, int val = *valp; unsigned long lval; if (val < 0) { - *negp = -1; + *negp = true; lval = (unsigned long)-val; } else { - *negp = 0; + *negp = false; lval = (unsigned long)val; } *lvalp = jiffies_to_clock_t(lval); @@ -2497,7 +2633,7 @@ static int do_proc_dointvec_userhz_jiffies_conv(int *negp, unsigned long *lvalp, return 0; } -static int do_proc_dointvec_ms_jiffies_conv(int *negp, unsigned long *lvalp, +static int do_proc_dointvec_ms_jiffies_conv(bool *negp, unsigned long *lvalp, int *valp, int write, void *data) { @@ -2507,10 +2643,10 @@ static int do_proc_dointvec_ms_jiffies_conv(int *negp, unsigned long *lvalp, int val = *valp; unsigned long lval; if (val < 0) { - *negp = -1; + *negp = true; lval = (unsigned long)-val; } else { - *negp = 0; + *negp = false; lval = (unsigned long)val; } *lvalp = jiffies_to_msecs(lval); @@ -2607,6 +2743,157 @@ static int proc_do_cad_pid(struct ctl_table *table, int write, return 0; } +/** + * proc_do_large_bitmap - read/write from/to a large bitmap + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: file position + * + * The bitmap is stored at table->data and the bitmap length (in bits) + * in table->maxlen. + * + * We use a range comma separated format (e.g. 1,3-4,10-10) so that + * large bitmaps may be represented in a compact manner. Writing into + * the file will clear the bitmap then update it with the given input. + * + * Returns 0 on success. + */ +int proc_do_large_bitmap(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int err = 0; + bool first = 1; + size_t left = *lenp; + unsigned long bitmap_len = table->maxlen; + unsigned long *bitmap = (unsigned long *) table->data; + unsigned long *tmp_bitmap = NULL; + char tr_a[] = { '-', ',', '\n' }, tr_b[] = { ',', '\n', 0 }, c; + + if (!bitmap_len || !left || (*ppos && !write)) { + *lenp = 0; + return 0; + } + + if (write) { + unsigned long page = 0; + char *kbuf; + + if (left > PAGE_SIZE - 1) + left = PAGE_SIZE - 1; + + page = __get_free_page(GFP_TEMPORARY); + kbuf = (char *) page; + if (!kbuf) + return -ENOMEM; + if (copy_from_user(kbuf, buffer, left)) { + free_page(page); + return -EFAULT; + } + kbuf[left] = 0; + + tmp_bitmap = kzalloc(BITS_TO_LONGS(bitmap_len) * sizeof(unsigned long), + GFP_KERNEL); + if (!tmp_bitmap) { + free_page(page); + return -ENOMEM; + } + proc_skip_char(&kbuf, &left, '\n'); + while (!err && left) { + unsigned long val_a, val_b; + bool neg; + + err = proc_get_long(&kbuf, &left, &val_a, &neg, tr_a, + sizeof(tr_a), &c); + if (err) + break; + if (val_a >= bitmap_len || neg) { + err = -EINVAL; + break; + } + + val_b = val_a; + if (left) { + kbuf++; + left--; + } + + if (c == '-') { + err = proc_get_long(&kbuf, &left, &val_b, + &neg, tr_b, sizeof(tr_b), + &c); + if (err) + break; + if (val_b >= bitmap_len || neg || + val_a > val_b) { + err = -EINVAL; + break; + } + if (left) { + kbuf++; + left--; + } + } + + while (val_a <= val_b) + set_bit(val_a++, tmp_bitmap); + + first = 0; + proc_skip_char(&kbuf, &left, '\n'); + } + free_page(page); + } else { + unsigned long bit_a, bit_b = 0; + + while (left) { + bit_a = find_next_bit(bitmap, bitmap_len, bit_b); + if (bit_a >= bitmap_len) + break; + bit_b = find_next_zero_bit(bitmap, bitmap_len, + bit_a + 1) - 1; + + if (!first) { + err = proc_put_char(&buffer, &left, ','); + if (err) + break; + } + err = proc_put_long(&buffer, &left, bit_a, false); + if (err) + break; + if (bit_a != bit_b) { + err = proc_put_char(&buffer, &left, '-'); + if (err) + break; + err = proc_put_long(&buffer, &left, bit_b, false); + if (err) + break; + } + + first = 0; bit_b++; + } + if (!err) + err = proc_put_char(&buffer, &left, '\n'); + } + + if (!err) { + if (write) { + if (*ppos) + bitmap_or(bitmap, bitmap, tmp_bitmap, bitmap_len); + else + memcpy(bitmap, tmp_bitmap, + BITS_TO_LONGS(bitmap_len) * sizeof(unsigned long)); + } + kfree(tmp_bitmap); + *lenp -= left; + *ppos += *lenp; + return 0; + } else { + kfree(tmp_bitmap); + return err; + } +} + #else /* CONFIG_PROC_FS */ int proc_dostring(struct ctl_table *table, int write, diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 8cd50d8..1357c57 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -13,6 +13,8 @@ #include <linux/file.h> #include <linux/ctype.h> #include <linux/netdevice.h> +#include <linux/kernel.h> +#include <linux/slab.h> #ifdef CONFIG_SYSCTL_SYSCALL @@ -223,7 +225,6 @@ static const struct bin_table bin_net_ipv4_route_table[] = { { CTL_INT, NET_IPV4_ROUTE_MTU_EXPIRES, "mtu_expires" }, { CTL_INT, NET_IPV4_ROUTE_MIN_PMTU, "min_pmtu" }, { CTL_INT, NET_IPV4_ROUTE_MIN_ADVMSS, "min_adv_mss" }, - { CTL_INT, NET_IPV4_ROUTE_SECRET_INTERVAL, "secret_interval" }, {} }; @@ -1124,11 +1125,6 @@ out: return result; } -static unsigned hex_value(int ch) -{ - return isdigit(ch) ? ch - '0' : ((ch | 0x20) - 'a') + 10; -} - static ssize_t bin_uuid(struct file *file, void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) { @@ -1156,7 +1152,8 @@ static ssize_t bin_uuid(struct file *file, if (!isxdigit(str[0]) || !isxdigit(str[1])) goto out; - uuid[i] = (hex_value(str[0]) << 4) | hex_value(str[1]); + uuid[i] = (hex_to_bin(str[0]) << 4) | + hex_to_bin(str[1]); str += 2; if (*str == '-') str++; diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 899ca51..11281d5 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -22,6 +22,7 @@ #include <linux/delayacct.h> #include <linux/cpumask.h> #include <linux/percpu.h> +#include <linux/slab.h> #include <linux/cgroupstats.h> #include <linux/cgroup.h> #include <linux/fs.h> diff --git a/kernel/time.c b/kernel/time.c index 8047980..848b1c2 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -35,7 +35,6 @@ #include <linux/syscalls.h> #include <linux/security.h> #include <linux/fs.h> -#include <linux/slab.h> #include <linux/math64.h> #include <linux/ptrace.h> @@ -133,12 +132,11 @@ SYSCALL_DEFINE2(gettimeofday, struct timeval __user *, tv, */ static inline void warp_clock(void) { - write_seqlock_irq(&xtime_lock); - wall_to_monotonic.tv_sec -= sys_tz.tz_minuteswest * 60; - xtime.tv_sec += sys_tz.tz_minuteswest * 60; - update_xtime_cache(0); - write_sequnlock_irq(&xtime_lock); - clock_was_set(); + struct timespec adjust; + + adjust = current_kernel_time(); + adjust.tv_sec += sys_tz.tz_minuteswest * 60; + do_settimeofday(&adjust); } /* diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 1f5dde6..f08e99c 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -625,6 +625,54 @@ static void clocksource_enqueue(struct clocksource *cs) list_add(&cs->list, entry); } + +/* + * Maximum time we expect to go between ticks. This includes idle + * tickless time. It provides the trade off between selecting a + * mult/shift pair that is very precise but can only handle a short + * period of time, vs. a mult/shift pair that can handle long periods + * of time but isn't as precise. + * + * This is a subsystem constant, and actual hardware limitations + * may override it (ie: clocksources that wrap every 3 seconds). + */ +#define MAX_UPDATE_LENGTH 5 /* Seconds */ + +/** + * __clocksource_register_scale - Used to install new clocksources + * @t: clocksource to be registered + * @scale: Scale factor multiplied against freq to get clocksource hz + * @freq: clocksource frequency (cycles per second) divided by scale + * + * Returns -EBUSY if registration fails, zero otherwise. + * + * This *SHOULD NOT* be called directly! Please use the + * clocksource_register_hz() or clocksource_register_khz helper functions. + */ +int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) +{ + + /* + * Ideally we want to use some of the limits used in + * clocksource_max_deferment, to provide a more informed + * MAX_UPDATE_LENGTH. But for now this just gets the + * register interface working properly. + */ + clocks_calc_mult_shift(&cs->mult, &cs->shift, freq, + NSEC_PER_SEC/scale, + MAX_UPDATE_LENGTH*scale); + cs->max_idle_ns = clocksource_max_deferment(cs); + + mutex_lock(&clocksource_mutex); + clocksource_enqueue(cs); + clocksource_select(); + clocksource_enqueue_watchdog(cs); + mutex_unlock(&clocksource_mutex); + return 0; +} +EXPORT_SYMBOL_GPL(__clocksource_register_scale); + + /** * clocksource_register - Used to install new clocksources * @t: clocksource to be registered diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 7c0f180..c631168 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -69,7 +69,7 @@ static s64 time_freq; /* time at last adjustment (secs): */ static long time_reftime; -long time_adjust; +static long time_adjust; /* constant (boot-param configurable) NTP tick adjustment (upscaled) */ static s64 ntp_tick_adj; diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index b3bafd5..48b2761 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -188,7 +188,7 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev) /* * Setup the next period for devices, which do not have * periodic mode. We read dev->next_event first and add to it - * when the event alrady expired. clockevents_program_event() + * when the event already expired. clockevents_program_event() * sets dev->next_event only when the event is really * programmed to the device. */ diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index 0a8a213..aada0e5 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c @@ -22,6 +22,29 @@ #include "tick-internal.h" +/* Limit min_delta to a jiffie */ +#define MIN_DELTA_LIMIT (NSEC_PER_SEC / HZ) + +static int tick_increase_min_delta(struct clock_event_device *dev) +{ + /* Nothing to do if we already reached the limit */ + if (dev->min_delta_ns >= MIN_DELTA_LIMIT) + return -ETIME; + + if (dev->min_delta_ns < 5000) + dev->min_delta_ns = 5000; + else + dev->min_delta_ns += dev->min_delta_ns >> 1; + + if (dev->min_delta_ns > MIN_DELTA_LIMIT) + dev->min_delta_ns = MIN_DELTA_LIMIT; + + printk(KERN_WARNING "CE: %s increased min_delta_ns to %llu nsec\n", + dev->name ? dev->name : "?", + (unsigned long long) dev->min_delta_ns); + return 0; +} + /** * tick_program_event internal worker function */ @@ -37,23 +60,28 @@ int tick_dev_program_event(struct clock_event_device *dev, ktime_t expires, if (!ret || !force) return ret; + dev->retries++; /* - * We tried 2 times to program the device with the given - * min_delta_ns. If that's not working then we double it + * We tried 3 times to program the device with the given + * min_delta_ns. If that's not working then we increase it * and emit a warning. */ if (++i > 2) { /* Increase the min. delta and try again */ - if (!dev->min_delta_ns) - dev->min_delta_ns = 5000; - else - dev->min_delta_ns += dev->min_delta_ns >> 1; - - printk(KERN_WARNING - "CE: %s increasing min_delta_ns to %llu nsec\n", - dev->name ? dev->name : "?", - (unsigned long long) dev->min_delta_ns << 1); - + if (tick_increase_min_delta(dev)) { + /* + * Get out of the loop if min_delta_ns + * hit the limit already. That's + * better than staying here forever. + * + * We clear next_event so we have a + * chance that the box survives. + */ + printk(KERN_WARNING + "CE: Reprogramming failure. Giving up\n"); + dev->next_event.tv64 = KTIME_MAX; + return -ETIME; + } i = 0; } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index f992762..813993b 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -150,35 +150,65 @@ static void tick_nohz_update_jiffies(ktime_t now) touch_softlockup_watchdog(); } +/* + * Updates the per cpu time idle statistics counters + */ +static void +update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_update_time) +{ + ktime_t delta; + + if (ts->idle_active) { + delta = ktime_sub(now, ts->idle_entrytime); + ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); + if (nr_iowait_cpu(cpu) > 0) + ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta); + ts->idle_entrytime = now; + } + + if (last_update_time) + *last_update_time = ktime_to_us(now); + +} + static void tick_nohz_stop_idle(int cpu, ktime_t now) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); - ktime_t delta; - delta = ktime_sub(now, ts->idle_entrytime); - ts->idle_lastupdate = now; - ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); + update_ts_time_stats(cpu, ts, now, NULL); ts->idle_active = 0; sched_clock_idle_wakeup_event(0); } -static ktime_t tick_nohz_start_idle(struct tick_sched *ts) +static ktime_t tick_nohz_start_idle(int cpu, struct tick_sched *ts) { - ktime_t now, delta; + ktime_t now; now = ktime_get(); - if (ts->idle_active) { - delta = ktime_sub(now, ts->idle_entrytime); - ts->idle_lastupdate = now; - ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); - } + + update_ts_time_stats(cpu, ts, now, NULL); + ts->idle_entrytime = now; ts->idle_active = 1; sched_clock_idle_sleep_event(); return now; } +/** + * get_cpu_idle_time_us - get the total idle time of a cpu + * @cpu: CPU number to query + * @last_update_time: variable to store update time in + * + * Return the cummulative idle time (since boot) for a given + * CPU, in microseconds. The idle time returned includes + * the iowait time (unlike what "top" and co report). + * + * This time is measured via accounting rather than sampling, + * and is as accurate as ktime_get() is. + * + * This function returns -1 if NOHZ is not enabled. + */ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); @@ -186,15 +216,38 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) if (!tick_nohz_enabled) return -1; - if (ts->idle_active) - *last_update_time = ktime_to_us(ts->idle_lastupdate); - else - *last_update_time = ktime_to_us(ktime_get()); + update_ts_time_stats(cpu, ts, ktime_get(), last_update_time); return ktime_to_us(ts->idle_sleeptime); } EXPORT_SYMBOL_GPL(get_cpu_idle_time_us); +/* + * get_cpu_iowait_time_us - get the total iowait time of a cpu + * @cpu: CPU number to query + * @last_update_time: variable to store update time in + * + * Return the cummulative iowait time (since boot) for a given + * CPU, in microseconds. + * + * This time is measured via accounting rather than sampling, + * and is as accurate as ktime_get() is. + * + * This function returns -1 if NOHZ is not enabled. + */ +u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time) +{ + struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); + + if (!tick_nohz_enabled) + return -1; + + update_ts_time_stats(cpu, ts, ktime_get(), last_update_time); + + return ktime_to_us(ts->iowait_sleeptime); +} +EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); + /** * tick_nohz_stop_sched_tick - stop the idle tick from the idle task * @@ -231,7 +284,7 @@ void tick_nohz_stop_sched_tick(int inidle) */ ts->inidle = 1; - now = tick_nohz_start_idle(ts); + now = tick_nohz_start_idle(cpu, ts); /* * If this cpu is offline and it is the one which updates @@ -272,7 +325,7 @@ void tick_nohz_stop_sched_tick(int inidle) } while (read_seqretry(&xtime_lock, seq)); if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) || - arch_needs_cpu(cpu)) { + arch_needs_cpu(cpu) || nohz_ratelimit(cpu)) { next_jiffies = last_jiffies + 1; delta_jiffies = 1; } else { diff --git a/kernel/time/timecompare.c b/kernel/time/timecompare.c index 12f5c55..ac38fbb 100644 --- a/kernel/time/timecompare.c +++ b/kernel/time/timecompare.c @@ -19,6 +19,7 @@ #include <linux/timecompare.h> #include <linux/module.h> +#include <linux/slab.h> #include <linux/math64.h> /* diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 1673637..caf8d4d 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -165,13 +165,6 @@ struct timespec raw_time; /* flag for if timekeeping is suspended */ int __read_mostly timekeeping_suspended; -static struct timespec xtime_cache __attribute__ ((aligned (16))); -void update_xtime_cache(u64 nsec) -{ - xtime_cache = xtime; - timespec_add_ns(&xtime_cache, nsec); -} - /* must hold xtime_lock */ void timekeeping_leap_insert(int leapsecond) { @@ -332,8 +325,6 @@ int do_settimeofday(struct timespec *tv) xtime = *tv; - update_xtime_cache(0); - timekeeper.ntp_error = 0; ntp_clear(); @@ -559,7 +550,6 @@ void __init timekeeping_init(void) } set_normalized_timespec(&wall_to_monotonic, -boot.tv_sec, -boot.tv_nsec); - update_xtime_cache(0); total_sleep_time.tv_sec = 0; total_sleep_time.tv_nsec = 0; write_sequnlock_irqrestore(&xtime_lock, flags); @@ -593,7 +583,6 @@ static int timekeeping_resume(struct sys_device *dev) wall_to_monotonic = timespec_sub(wall_to_monotonic, ts); total_sleep_time = timespec_add_safe(total_sleep_time, ts); } - update_xtime_cache(0); /* re-base the last cycle value */ timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); timekeeper.ntp_error = 0; @@ -788,7 +777,6 @@ void update_wall_time(void) { struct clocksource *clock; cycle_t offset; - u64 nsecs; int shift = 0, maxshift; /* Make sure we're fully resumed: */ @@ -818,7 +806,8 @@ void update_wall_time(void) shift = min(shift, maxshift); while (offset >= timekeeper.cycle_interval) { offset = logarithmic_accumulation(offset, shift); - shift--; + if(offset < timekeeper.cycle_interval<<shift) + shift--; } /* correct the clock when NTP error is too big */ @@ -846,7 +835,9 @@ void update_wall_time(void) timekeeper.ntp_error += neg << timekeeper.ntp_error_shift; } - /* store full nanoseconds into xtime after rounding it up and + + /* + * Store full nanoseconds into xtime after rounding it up and * add the remainder to the error difference. */ xtime.tv_nsec = ((s64) timekeeper.xtime_nsec >> timekeeper.shift) + 1; @@ -854,8 +845,15 @@ void update_wall_time(void) timekeeper.ntp_error += timekeeper.xtime_nsec << timekeeper.ntp_error_shift; - nsecs = clocksource_cyc2ns(offset, timekeeper.mult, timekeeper.shift); - update_xtime_cache(nsecs); + /* + * Finally, make sure that after the rounding + * xtime.tv_nsec isn't larger then NSEC_PER_SEC + */ + if (unlikely(xtime.tv_nsec >= NSEC_PER_SEC)) { + xtime.tv_nsec -= NSEC_PER_SEC; + xtime.tv_sec++; + second_overflow(); + } /* check to see if there is a new clocksource to use */ update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult); @@ -895,13 +893,13 @@ EXPORT_SYMBOL_GPL(monotonic_to_bootbased); unsigned long get_seconds(void) { - return xtime_cache.tv_sec; + return xtime.tv_sec; } EXPORT_SYMBOL(get_seconds); struct timespec __current_kernel_time(void) { - return xtime_cache; + return xtime; } struct timespec current_kernel_time(void) @@ -912,7 +910,7 @@ struct timespec current_kernel_time(void) do { seq = read_seqbegin(&xtime_lock); - now = xtime_cache; + now = xtime; } while (read_seqretry(&xtime_lock, seq)); return now; @@ -927,7 +925,7 @@ struct timespec get_monotonic_coarse(void) do { seq = read_seqbegin(&xtime_lock); - now = xtime_cache; + now = xtime; mono = wall_to_monotonic; } while (read_seqretry(&xtime_lock, seq)); diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index bdfb8dd..ab8f5e3 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -176,6 +176,7 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now) P_ns(idle_waketime); P_ns(idle_exittime); P_ns(idle_sleeptime); + P_ns(iowait_sleeptime); P(last_jiffies); P(next_jiffies); P_ns(idle_expires); @@ -228,6 +229,7 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu) SEQ_printf(m, " event_handler: "); print_name_offset(m, dev->event_handler); SEQ_printf(m, "\n"); + SEQ_printf(m, " retries: %lu\n", dev->retries); } static void timer_list_show_tickdevices(struct seq_file *m) @@ -257,7 +259,7 @@ static int timer_list_show(struct seq_file *m, void *v) u64 now = ktime_to_ns(ktime_get()); int cpu; - SEQ_printf(m, "Timer List Version: v0.5\n"); + SEQ_printf(m, "Timer List Version: v0.6\n"); SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); diff --git a/kernel/timer.c b/kernel/timer.c index c61a794..efde11e 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -39,6 +39,7 @@ #include <linux/kallsyms.h> #include <linux/perf_event.h> #include <linux/sched.h> +#include <linux/slab.h> #include <asm/uaccess.h> #include <asm/unistd.h> @@ -318,6 +319,24 @@ unsigned long round_jiffies_up_relative(unsigned long j) } EXPORT_SYMBOL_GPL(round_jiffies_up_relative); +/** + * set_timer_slack - set the allowed slack for a timer + * @slack_hz: the amount of time (in jiffies) allowed for rounding + * + * Set the amount of time, in jiffies, that a certain timer has + * in terms of slack. By setting this value, the timer subsystem + * will schedule the actual timer somewhere between + * the time mod_timer() asks for, and that time plus the slack. + * + * By setting the slack to -1, a percentage of the delay is used + * instead. + */ +void set_timer_slack(struct timer_list *timer, int slack_hz) +{ + timer->slack = slack_hz; +} +EXPORT_SYMBOL_GPL(set_timer_slack); + static inline void set_running_timer(struct tvec_base *base, struct timer_list *timer) @@ -549,6 +568,7 @@ static void __init_timer(struct timer_list *timer, { timer->entry.next = NULL; timer->base = __raw_get_cpu_var(tvec_bases); + timer->slack = -1; #ifdef CONFIG_TIMER_STATS timer->start_site = NULL; timer->start_pid = -1; @@ -557,6 +577,19 @@ static void __init_timer(struct timer_list *timer, lockdep_init_map(&timer->lockdep_map, name, key, 0); } +void setup_deferrable_timer_on_stack_key(struct timer_list *timer, + const char *name, + struct lock_class_key *key, + void (*function)(unsigned long), + unsigned long data) +{ + timer->function = function; + timer->data = data; + init_timer_on_stack_key(timer, name, key); + timer_set_deferrable(timer); +} +EXPORT_SYMBOL_GPL(setup_deferrable_timer_on_stack_key); + /** * init_timer_key - initialize a timer * @timer: the timer to be initialized @@ -714,6 +747,46 @@ int mod_timer_pending(struct timer_list *timer, unsigned long expires) } EXPORT_SYMBOL(mod_timer_pending); +/* + * Decide where to put the timer while taking the slack into account + * + * Algorithm: + * 1) calculate the maximum (absolute) time + * 2) calculate the highest bit where the expires and new max are different + * 3) use this bit to make a mask + * 4) use the bitmask to round down the maximum time, so that all last + * bits are zeros + */ +static inline +unsigned long apply_slack(struct timer_list *timer, unsigned long expires) +{ + unsigned long expires_limit, mask; + int bit; + + expires_limit = expires; + + if (timer->slack >= 0) { + expires_limit = expires + timer->slack; + } else { + unsigned long now = jiffies; + + /* No slack, if already expired else auto slack 0.4% */ + if (time_after(expires, now)) + expires_limit = expires + (expires - now)/256; + } + mask = expires ^ expires_limit; + if (mask == 0) + return expires; + + bit = find_last_bit(&mask, BITS_PER_LONG); + + mask = (1 << bit) - 1; + + expires_limit = expires_limit & ~(mask); + + return expires_limit; +} + /** * mod_timer - modify a timer's timeout * @timer: the timer to be modified @@ -744,6 +817,8 @@ int mod_timer(struct timer_list *timer, unsigned long expires) if (timer_pending(timer) && timer->expires == expires) return 1; + expires = apply_slack(timer, expires); + return __mod_timer(timer, expires, false, TIMER_NOT_PINNED); } EXPORT_SYMBOL(mod_timer); @@ -880,6 +955,7 @@ int try_to_del_timer_sync(struct timer_list *timer) if (base->running_timer == timer) goto out; + timer_stats_timer_clear_start_info(timer); ret = 0; if (timer_pending(timer)) { detach_timer(timer, 1); @@ -953,6 +1029,47 @@ static int cascade(struct tvec_base *base, struct tvec *tv, int index) return index; } +static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long), + unsigned long data) +{ + int preempt_count = preempt_count(); + +#ifdef CONFIG_LOCKDEP + /* + * It is permissible to free the timer from inside the + * function that is called from it, this we need to take into + * account for lockdep too. To avoid bogus "held lock freed" + * warnings as well as problems when looking into + * timer->lockdep_map, make a copy and use that here. + */ + struct lockdep_map lockdep_map = timer->lockdep_map; +#endif + /* + * Couple the lock chain with the lock chain at + * del_timer_sync() by acquiring the lock_map around the fn() + * call here and in del_timer_sync(). + */ + lock_map_acquire(&lockdep_map); + + trace_timer_expire_entry(timer); + fn(data); + trace_timer_expire_exit(timer); + + lock_map_release(&lockdep_map); + + if (preempt_count != preempt_count()) { + WARN_ONCE(1, "timer: %pF preempt leak: %08x -> %08x\n", + fn, preempt_count, preempt_count()); + /* + * Restore the preempt count. That gives us a decent + * chance to survive and extract information. If the + * callback kept a lock held, bad luck, but not worse + * than the BUG() we had. + */ + preempt_count() = preempt_count; + } +} + #define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK) /** @@ -996,45 +1113,7 @@ static inline void __run_timers(struct tvec_base *base) detach_timer(timer, 1); spin_unlock_irq(&base->lock); - { - int preempt_count = preempt_count(); - -#ifdef CONFIG_LOCKDEP - /* - * It is permissible to free the timer from - * inside the function that is called from - * it, this we need to take into account for - * lockdep too. To avoid bogus "held lock - * freed" warnings as well as problems when - * looking into timer->lockdep_map, make a - * copy and use that here. - */ - struct lockdep_map lockdep_map = - timer->lockdep_map; -#endif - /* - * Couple the lock chain with the lock chain at - * del_timer_sync() by acquiring the lock_map - * around the fn() call here and in - * del_timer_sync(). - */ - lock_map_acquire(&lockdep_map); - - trace_timer_expire_entry(timer); - fn(data); - trace_timer_expire_exit(timer); - - lock_map_release(&lockdep_map); - - if (preempt_count != preempt_count()) { - printk(KERN_ERR "huh, entered %p " - "with preempt_count %08x, exited" - " with %08x?\n", - fn, preempt_count, - preempt_count()); - BUG(); - } - } + call_timer_fn(timer, fn, data); spin_lock_irq(&base->lock); } } @@ -1618,11 +1697,14 @@ static int __cpuinit timer_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { long cpu = (long)hcpu; + int err; + switch(action) { case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: - if (init_timers_cpu(cpu) < 0) - return NOTIFY_BAD; + err = init_timers_cpu(cpu); + if (err < 0) + return notifier_from_errno(err); break; #ifdef CONFIG_HOTPLUG_CPU case CPU_DEAD: @@ -1648,7 +1730,7 @@ void __init init_timers(void) init_timer_stats(); - BUG_ON(err == NOTIFY_BAD); + BUG_ON(err != NOTIFY_OK); register_cpu_notifier(&timers_nb); open_softirq(TIMER_SOFTIRQ, run_timer_softirq); } diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 13e13d4..8b1797c 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -44,9 +44,6 @@ config HAVE_FTRACE_MCOUNT_RECORD help See Documentation/trace/ftrace-design.txt -config HAVE_HW_BRANCH_TRACER - bool - config HAVE_SYSCALL_TRACEPOINTS bool help @@ -374,14 +371,6 @@ config STACK_TRACER Say N if unsure. -config HW_BRANCH_TRACER - depends on HAVE_HW_BRANCH_TRACER - bool "Trace hw branches" - select GENERIC_TRACER - help - This tracer records all branches on the system in a circular - buffer, giving access to the last N branches for each cpu. - config KMEMTRACE bool "Trace SLAB allocations" select GENERIC_TRACER diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 78edc64..ffb1a5b 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -41,7 +41,6 @@ obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o obj-$(CONFIG_BOOT_TRACER) += trace_boot.o obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o -obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o obj-$(CONFIG_KMEMTRACE) += kmemtrace.o obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 07f945a9..638711c 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -21,6 +21,7 @@ #include <linux/percpu.h> #include <linux/init.h> #include <linux/mutex.h> +#include <linux/slab.h> #include <linux/debugfs.h> #include <linux/smp_lock.h> #include <linux/time.h> @@ -674,28 +675,33 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq, } } -static void blk_add_trace_rq_abort(struct request_queue *q, struct request *rq) +static void blk_add_trace_rq_abort(void *ignore, + struct request_queue *q, struct request *rq) { blk_add_trace_rq(q, rq, BLK_TA_ABORT); } -static void blk_add_trace_rq_insert(struct request_queue *q, struct request *rq) +static void blk_add_trace_rq_insert(void *ignore, + struct request_queue *q, struct request *rq) { blk_add_trace_rq(q, rq, BLK_TA_INSERT); } -static void blk_add_trace_rq_issue(struct request_queue *q, struct request *rq) +static void blk_add_trace_rq_issue(void *ignore, + struct request_queue *q, struct request *rq) { blk_add_trace_rq(q, rq, BLK_TA_ISSUE); } -static void blk_add_trace_rq_requeue(struct request_queue *q, +static void blk_add_trace_rq_requeue(void *ignore, + struct request_queue *q, struct request *rq) { blk_add_trace_rq(q, rq, BLK_TA_REQUEUE); } -static void blk_add_trace_rq_complete(struct request_queue *q, +static void blk_add_trace_rq_complete(void *ignore, + struct request_queue *q, struct request *rq) { blk_add_trace_rq(q, rq, BLK_TA_COMPLETE); @@ -723,34 +729,40 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, !bio_flagged(bio, BIO_UPTODATE), 0, NULL); } -static void blk_add_trace_bio_bounce(struct request_queue *q, struct bio *bio) +static void blk_add_trace_bio_bounce(void *ignore, + struct request_queue *q, struct bio *bio) { blk_add_trace_bio(q, bio, BLK_TA_BOUNCE); } -static void blk_add_trace_bio_complete(struct request_queue *q, struct bio *bio) +static void blk_add_trace_bio_complete(void *ignore, + struct request_queue *q, struct bio *bio) { blk_add_trace_bio(q, bio, BLK_TA_COMPLETE); } -static void blk_add_trace_bio_backmerge(struct request_queue *q, +static void blk_add_trace_bio_backmerge(void *ignore, + struct request_queue *q, struct bio *bio) { blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE); } -static void blk_add_trace_bio_frontmerge(struct request_queue *q, +static void blk_add_trace_bio_frontmerge(void *ignore, + struct request_queue *q, struct bio *bio) { blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE); } -static void blk_add_trace_bio_queue(struct request_queue *q, struct bio *bio) +static void blk_add_trace_bio_queue(void *ignore, + struct request_queue *q, struct bio *bio) { blk_add_trace_bio(q, bio, BLK_TA_QUEUE); } -static void blk_add_trace_getrq(struct request_queue *q, +static void blk_add_trace_getrq(void *ignore, + struct request_queue *q, struct bio *bio, int rw) { if (bio) @@ -764,7 +776,8 @@ static void blk_add_trace_getrq(struct request_queue *q, } -static void blk_add_trace_sleeprq(struct request_queue *q, +static void blk_add_trace_sleeprq(void *ignore, + struct request_queue *q, struct bio *bio, int rw) { if (bio) @@ -778,7 +791,7 @@ static void blk_add_trace_sleeprq(struct request_queue *q, } } -static void blk_add_trace_plug(struct request_queue *q) +static void blk_add_trace_plug(void *ignore, struct request_queue *q) { struct blk_trace *bt = q->blk_trace; @@ -786,7 +799,7 @@ static void blk_add_trace_plug(struct request_queue *q) __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL); } -static void blk_add_trace_unplug_io(struct request_queue *q) +static void blk_add_trace_unplug_io(void *ignore, struct request_queue *q) { struct blk_trace *bt = q->blk_trace; @@ -799,7 +812,7 @@ static void blk_add_trace_unplug_io(struct request_queue *q) } } -static void blk_add_trace_unplug_timer(struct request_queue *q) +static void blk_add_trace_unplug_timer(void *ignore, struct request_queue *q) { struct blk_trace *bt = q->blk_trace; @@ -812,7 +825,8 @@ static void blk_add_trace_unplug_timer(struct request_queue *q) } } -static void blk_add_trace_split(struct request_queue *q, struct bio *bio, +static void blk_add_trace_split(void *ignore, + struct request_queue *q, struct bio *bio, unsigned int pdu) { struct blk_trace *bt = q->blk_trace; @@ -828,6 +842,7 @@ static void blk_add_trace_split(struct request_queue *q, struct bio *bio, /** * blk_add_trace_remap - Add a trace for a remap operation + * @ignore: trace callback data parameter (not used) * @q: queue the io is for * @bio: the source bio * @dev: target device @@ -838,8 +853,9 @@ static void blk_add_trace_split(struct request_queue *q, struct bio *bio, * it spans a stripe (or similar). Add a trace for that action. * **/ -static void blk_add_trace_remap(struct request_queue *q, struct bio *bio, - dev_t dev, sector_t from) +static void blk_add_trace_remap(void *ignore, + struct request_queue *q, struct bio *bio, + dev_t dev, sector_t from) { struct blk_trace *bt = q->blk_trace; struct blk_io_trace_remap r; @@ -858,6 +874,7 @@ static void blk_add_trace_remap(struct request_queue *q, struct bio *bio, /** * blk_add_trace_rq_remap - Add a trace for a request-remap operation + * @ignore: trace callback data parameter (not used) * @q: queue the io is for * @rq: the source request * @dev: target device @@ -868,7 +885,8 @@ static void blk_add_trace_remap(struct request_queue *q, struct bio *bio, * Add a trace for that action. * **/ -static void blk_add_trace_rq_remap(struct request_queue *q, +static void blk_add_trace_rq_remap(void *ignore, + struct request_queue *q, struct request *rq, dev_t dev, sector_t from) { @@ -920,64 +938,64 @@ static void blk_register_tracepoints(void) { int ret; - ret = register_trace_block_rq_abort(blk_add_trace_rq_abort); + ret = register_trace_block_rq_abort(blk_add_trace_rq_abort, NULL); WARN_ON(ret); - ret = register_trace_block_rq_insert(blk_add_trace_rq_insert); + ret = register_trace_block_rq_insert(blk_add_trace_rq_insert, NULL); WARN_ON(ret); - ret = register_trace_block_rq_issue(blk_add_trace_rq_issue); + ret = register_trace_block_rq_issue(blk_add_trace_rq_issue, NULL); WARN_ON(ret); - ret = register_trace_block_rq_requeue(blk_add_trace_rq_requeue); + ret = register_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL); WARN_ON(ret); - ret = register_trace_block_rq_complete(blk_add_trace_rq_complete); + ret = register_trace_block_rq_complete(blk_add_trace_rq_complete, NULL); WARN_ON(ret); - ret = register_trace_block_bio_bounce(blk_add_trace_bio_bounce); + ret = register_trace_block_bio_bounce(blk_add_trace_bio_bounce, NULL); WARN_ON(ret); - ret = register_trace_block_bio_complete(blk_add_trace_bio_complete); + ret = register_trace_block_bio_complete(blk_add_trace_bio_complete, NULL); WARN_ON(ret); - ret = register_trace_block_bio_backmerge(blk_add_trace_bio_backmerge); + ret = register_trace_block_bio_backmerge(blk_add_trace_bio_backmerge, NULL); WARN_ON(ret); - ret = register_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge); + ret = register_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge, NULL); WARN_ON(ret); - ret = register_trace_block_bio_queue(blk_add_trace_bio_queue); + ret = register_trace_block_bio_queue(blk_add_trace_bio_queue, NULL); WARN_ON(ret); - ret = register_trace_block_getrq(blk_add_trace_getrq); + ret = register_trace_block_getrq(blk_add_trace_getrq, NULL); WARN_ON(ret); - ret = register_trace_block_sleeprq(blk_add_trace_sleeprq); + ret = register_trace_block_sleeprq(blk_add_trace_sleeprq, NULL); WARN_ON(ret); - ret = register_trace_block_plug(blk_add_trace_plug); + ret = register_trace_block_plug(blk_add_trace_plug, NULL); WARN_ON(ret); - ret = register_trace_block_unplug_timer(blk_add_trace_unplug_timer); + ret = register_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL); WARN_ON(ret); - ret = register_trace_block_unplug_io(blk_add_trace_unplug_io); + ret = register_trace_block_unplug_io(blk_add_trace_unplug_io, NULL); WARN_ON(ret); - ret = register_trace_block_split(blk_add_trace_split); + ret = register_trace_block_split(blk_add_trace_split, NULL); WARN_ON(ret); - ret = register_trace_block_remap(blk_add_trace_remap); + ret = register_trace_block_remap(blk_add_trace_remap, NULL); WARN_ON(ret); - ret = register_trace_block_rq_remap(blk_add_trace_rq_remap); + ret = register_trace_block_rq_remap(blk_add_trace_rq_remap, NULL); WARN_ON(ret); } static void blk_unregister_tracepoints(void) { - unregister_trace_block_rq_remap(blk_add_trace_rq_remap); - unregister_trace_block_remap(blk_add_trace_remap); - unregister_trace_block_split(blk_add_trace_split); - unregister_trace_block_unplug_io(blk_add_trace_unplug_io); - unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer); - unregister_trace_block_plug(blk_add_trace_plug); - unregister_trace_block_sleeprq(blk_add_trace_sleeprq); - unregister_trace_block_getrq(blk_add_trace_getrq); - unregister_trace_block_bio_queue(blk_add_trace_bio_queue); - unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge); - unregister_trace_block_bio_backmerge(blk_add_trace_bio_backmerge); - unregister_trace_block_bio_complete(blk_add_trace_bio_complete); - unregister_trace_block_bio_bounce(blk_add_trace_bio_bounce); - unregister_trace_block_rq_complete(blk_add_trace_rq_complete); - unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue); - unregister_trace_block_rq_issue(blk_add_trace_rq_issue); - unregister_trace_block_rq_insert(blk_add_trace_rq_insert); - unregister_trace_block_rq_abort(blk_add_trace_rq_abort); + unregister_trace_block_rq_remap(blk_add_trace_rq_remap, NULL); + unregister_trace_block_remap(blk_add_trace_remap, NULL); + unregister_trace_block_split(blk_add_trace_split, NULL); + unregister_trace_block_unplug_io(blk_add_trace_unplug_io, NULL); + unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL); + unregister_trace_block_plug(blk_add_trace_plug, NULL); + unregister_trace_block_sleeprq(blk_add_trace_sleeprq, NULL); + unregister_trace_block_getrq(blk_add_trace_getrq, NULL); + unregister_trace_block_bio_queue(blk_add_trace_bio_queue, NULL); + unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge, NULL); + unregister_trace_block_bio_backmerge(blk_add_trace_bio_backmerge, NULL); + unregister_trace_block_bio_complete(blk_add_trace_bio_complete, NULL); + unregister_trace_block_bio_bounce(blk_add_trace_bio_bounce, NULL); + unregister_trace_block_rq_complete(blk_add_trace_rq_complete, NULL); + unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL); + unregister_trace_block_rq_issue(blk_add_trace_rq_issue, NULL); + unregister_trace_block_rq_insert(blk_add_trace_rq_insert, NULL); + unregister_trace_block_rq_abort(blk_add_trace_rq_abort, NULL); tracepoint_synchronize_unregister(); } @@ -1320,7 +1338,7 @@ out: } static enum print_line_t blk_trace_event_print(struct trace_iterator *iter, - int flags) + int flags, struct trace_event *event) { return print_one_line(iter, false); } @@ -1342,7 +1360,8 @@ static int blk_trace_synthesize_old_trace(struct trace_iterator *iter) } static enum print_line_t -blk_trace_event_print_binary(struct trace_iterator *iter, int flags) +blk_trace_event_print_binary(struct trace_iterator *iter, int flags, + struct trace_event *event) { return blk_trace_synthesize_old_trace(iter) ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; @@ -1380,12 +1399,16 @@ static struct tracer blk_tracer __read_mostly = { .set_flag = blk_tracer_set_flag, }; -static struct trace_event trace_blk_event = { - .type = TRACE_BLK, +static struct trace_event_functions trace_blk_event_funcs = { .trace = blk_trace_event_print, .binary = blk_trace_event_print_binary, }; +static struct trace_event trace_blk_event = { + .type = TRACE_BLK, + .funcs = &trace_blk_event_funcs, +}; + static int __init init_blk_tracer(void) { if (!register_ftrace_event(&trace_blk_event)) { diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index d9062f5..6d2cb14 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -24,6 +24,7 @@ #include <linux/uaccess.h> #include <linux/ftrace.h> #include <linux/sysctl.h> +#include <linux/slab.h> #include <linux/ctype.h> #include <linux/list.h> #include <linux/hash.h> @@ -263,6 +264,7 @@ struct ftrace_profile { unsigned long counter; #ifdef CONFIG_FUNCTION_GRAPH_TRACER unsigned long long time; + unsigned long long time_squared; #endif }; @@ -365,9 +367,9 @@ static int function_stat_headers(struct seq_file *m) { #ifdef CONFIG_FUNCTION_GRAPH_TRACER seq_printf(m, " Function " - "Hit Time Avg\n" + "Hit Time Avg s^2\n" " -------- " - "--- ---- ---\n"); + "--- ---- --- ---\n"); #else seq_printf(m, " Function Hit\n" " -------- ---\n"); @@ -383,6 +385,7 @@ static int function_stat_show(struct seq_file *m, void *v) static DEFINE_MUTEX(mutex); static struct trace_seq s; unsigned long long avg; + unsigned long long stddev; #endif kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); @@ -393,11 +396,25 @@ static int function_stat_show(struct seq_file *m, void *v) avg = rec->time; do_div(avg, rec->counter); + /* Sample standard deviation (s^2) */ + if (rec->counter <= 1) + stddev = 0; + else { + stddev = rec->time_squared - rec->counter * avg * avg; + /* + * Divide only 1000 for ns^2 -> us^2 conversion. + * trace_print_graph_duration will divide 1000 again. + */ + do_div(stddev, (rec->counter - 1) * 1000); + } + mutex_lock(&mutex); trace_seq_init(&s); trace_print_graph_duration(rec->time, &s); trace_seq_puts(&s, " "); trace_print_graph_duration(avg, &s); + trace_seq_puts(&s, " "); + trace_print_graph_duration(stddev, &s); trace_print_seq(m, &s); mutex_unlock(&mutex); #endif @@ -649,6 +666,10 @@ static void profile_graph_return(struct ftrace_graph_ret *trace) if (!stat->hash || !ftrace_profile_enabled) goto out; + /* If the calltime was zero'd ignore it */ + if (!trace->calltime) + goto out; + calltime = trace->rettime - trace->calltime; if (!(trace_flags & TRACE_ITER_GRAPH_TIME)) { @@ -667,8 +688,10 @@ static void profile_graph_return(struct ftrace_graph_ret *trace) } rec = ftrace_find_profiled_func(stat, trace->func); - if (rec) + if (rec) { rec->time += calltime; + rec->time_squared += calltime * calltime; + } out: local_irq_restore(flags); @@ -3211,8 +3234,8 @@ free: } static void -ftrace_graph_probe_sched_switch(struct rq *__rq, struct task_struct *prev, - struct task_struct *next) +ftrace_graph_probe_sched_switch(void *ignore, + struct task_struct *prev, struct task_struct *next) { unsigned long long timestamp; int index; @@ -3266,7 +3289,7 @@ static int start_graph_tracing(void) } while (ret == -EAGAIN); if (!ret) { - ret = register_trace_sched_switch(ftrace_graph_probe_sched_switch); + ret = register_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); if (ret) pr_info("ftrace_graph: Couldn't activate tracepoint" " probe to kernel_sched_switch\n"); @@ -3338,11 +3361,11 @@ void unregister_ftrace_graph(void) goto out; ftrace_graph_active--; - unregister_trace_sched_switch(ftrace_graph_probe_sched_switch); ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; ftrace_graph_entry = ftrace_graph_entry_stub; ftrace_shutdown(FTRACE_STOP_FUNC_RET); unregister_pm_notifier(&ftrace_suspend_notifier); + unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); out: mutex_unlock(&ftrace_lock); diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c index a91da69..bbfc1bb 100644 --- a/kernel/trace/kmemtrace.c +++ b/kernel/trace/kmemtrace.c @@ -95,7 +95,8 @@ static inline void kmemtrace_free(enum kmemtrace_type_id type_id, trace_wake_up(); } -static void kmemtrace_kmalloc(unsigned long call_site, +static void kmemtrace_kmalloc(void *ignore, + unsigned long call_site, const void *ptr, size_t bytes_req, size_t bytes_alloc, @@ -105,7 +106,8 @@ static void kmemtrace_kmalloc(unsigned long call_site, bytes_req, bytes_alloc, gfp_flags, -1); } -static void kmemtrace_kmem_cache_alloc(unsigned long call_site, +static void kmemtrace_kmem_cache_alloc(void *ignore, + unsigned long call_site, const void *ptr, size_t bytes_req, size_t bytes_alloc, @@ -115,7 +117,8 @@ static void kmemtrace_kmem_cache_alloc(unsigned long call_site, bytes_req, bytes_alloc, gfp_flags, -1); } -static void kmemtrace_kmalloc_node(unsigned long call_site, +static void kmemtrace_kmalloc_node(void *ignore, + unsigned long call_site, const void *ptr, size_t bytes_req, size_t bytes_alloc, @@ -126,7 +129,8 @@ static void kmemtrace_kmalloc_node(unsigned long call_site, bytes_req, bytes_alloc, gfp_flags, node); } -static void kmemtrace_kmem_cache_alloc_node(unsigned long call_site, +static void kmemtrace_kmem_cache_alloc_node(void *ignore, + unsigned long call_site, const void *ptr, size_t bytes_req, size_t bytes_alloc, @@ -137,12 +141,14 @@ static void kmemtrace_kmem_cache_alloc_node(unsigned long call_site, bytes_req, bytes_alloc, gfp_flags, node); } -static void kmemtrace_kfree(unsigned long call_site, const void *ptr) +static void +kmemtrace_kfree(void *ignore, unsigned long call_site, const void *ptr) { kmemtrace_free(KMEMTRACE_TYPE_KMALLOC, call_site, ptr); } -static void kmemtrace_kmem_cache_free(unsigned long call_site, const void *ptr) +static void kmemtrace_kmem_cache_free(void *ignore, + unsigned long call_site, const void *ptr) { kmemtrace_free(KMEMTRACE_TYPE_CACHE, call_site, ptr); } @@ -151,34 +157,34 @@ static int kmemtrace_start_probes(void) { int err; - err = register_trace_kmalloc(kmemtrace_kmalloc); + err = register_trace_kmalloc(kmemtrace_kmalloc, NULL); if (err) return err; - err = register_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc); + err = register_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc, NULL); if (err) return err; - err = register_trace_kmalloc_node(kmemtrace_kmalloc_node); + err = register_trace_kmalloc_node(kmemtrace_kmalloc_node, NULL); if (err) return err; - err = register_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node); + err = register_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node, NULL); if (err) return err; - err = register_trace_kfree(kmemtrace_kfree); + err = register_trace_kfree(kmemtrace_kfree, NULL); if (err) return err; - err = register_trace_kmem_cache_free(kmemtrace_kmem_cache_free); + err = register_trace_kmem_cache_free(kmemtrace_kmem_cache_free, NULL); return err; } static void kmemtrace_stop_probes(void) { - unregister_trace_kmalloc(kmemtrace_kmalloc); - unregister_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc); - unregister_trace_kmalloc_node(kmemtrace_kmalloc_node); - unregister_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node); - unregister_trace_kfree(kmemtrace_kfree); - unregister_trace_kmem_cache_free(kmemtrace_kmem_cache_free); + unregister_trace_kmalloc(kmemtrace_kmalloc, NULL); + unregister_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc, NULL); + unregister_trace_kmalloc_node(kmemtrace_kmalloc_node, NULL); + unregister_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node, NULL); + unregister_trace_kfree(kmemtrace_kfree, NULL); + unregister_trace_kmem_cache_free(kmemtrace_kmem_cache_free, NULL); } static int kmem_trace_init(struct trace_array *tr) @@ -237,7 +243,8 @@ struct kmemtrace_user_event_alloc { }; static enum print_line_t -kmemtrace_print_alloc(struct trace_iterator *iter, int flags) +kmemtrace_print_alloc(struct trace_iterator *iter, int flags, + struct trace_event *event) { struct trace_seq *s = &iter->seq; struct kmemtrace_alloc_entry *entry; @@ -257,7 +264,8 @@ kmemtrace_print_alloc(struct trace_iterator *iter, int flags) } static enum print_line_t -kmemtrace_print_free(struct trace_iterator *iter, int flags) +kmemtrace_print_free(struct trace_iterator *iter, int flags, + struct trace_event *event) { struct trace_seq *s = &iter->seq; struct kmemtrace_free_entry *entry; @@ -275,7 +283,8 @@ kmemtrace_print_free(struct trace_iterator *iter, int flags) } static enum print_line_t -kmemtrace_print_alloc_user(struct trace_iterator *iter, int flags) +kmemtrace_print_alloc_user(struct trace_iterator *iter, int flags, + struct trace_event *event) { struct trace_seq *s = &iter->seq; struct kmemtrace_alloc_entry *entry; @@ -309,7 +318,8 @@ kmemtrace_print_alloc_user(struct trace_iterator *iter, int flags) } static enum print_line_t -kmemtrace_print_free_user(struct trace_iterator *iter, int flags) +kmemtrace_print_free_user(struct trace_iterator *iter, int flags, + struct trace_event *event) { struct trace_seq *s = &iter->seq; struct kmemtrace_free_entry *entry; @@ -463,18 +473,26 @@ static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter) } } -static struct trace_event kmem_trace_alloc = { - .type = TRACE_KMEM_ALLOC, +static struct trace_event_functions kmem_trace_alloc_funcs = { .trace = kmemtrace_print_alloc, .binary = kmemtrace_print_alloc_user, }; -static struct trace_event kmem_trace_free = { - .type = TRACE_KMEM_FREE, +static struct trace_event kmem_trace_alloc = { + .type = TRACE_KMEM_ALLOC, + .funcs = &kmem_trace_alloc_funcs, +}; + +static struct trace_event_functions kmem_trace_free_funcs = { .trace = kmemtrace_print_free, .binary = kmemtrace_print_free_user, }; +static struct trace_event kmem_trace_free = { + .type = TRACE_KMEM_FREE, + .funcs = &kmem_trace_free_funcs, +}; + static struct tracer kmem_tracer __read_mostly = { .name = "kmemtrace", .init = kmem_trace_init, diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c index 9f4f565..a22582a 100644 --- a/kernel/trace/power-traces.c +++ b/kernel/trace/power-traces.c @@ -9,7 +9,6 @@ #include <linux/workqueue.h> #include <linux/sched.h> #include <linux/module.h> -#include <linux/slab.h> #define CREATE_TRACE_POINTS #include <trace/events/power.h> diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 05a9f83..1da7b6e 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -14,6 +14,7 @@ #include <linux/module.h> #include <linux/percpu.h> #include <linux/mutex.h> +#include <linux/slab.h> #include <linux/init.h> #include <linux/hash.h> #include <linux/list.h> @@ -207,6 +208,14 @@ EXPORT_SYMBOL_GPL(tracing_is_on); #define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) #define RB_EVNT_MIN_SIZE 8U /* two 32bit words */ +#if !defined(CONFIG_64BIT) || defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) +# define RB_FORCE_8BYTE_ALIGNMENT 0 +# define RB_ARCH_ALIGNMENT RB_ALIGNMENT +#else +# define RB_FORCE_8BYTE_ALIGNMENT 1 +# define RB_ARCH_ALIGNMENT 8U +#endif + /* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */ #define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX @@ -310,6 +319,11 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data); #define TS_MASK ((1ULL << TS_SHIFT) - 1) #define TS_DELTA_TEST (~TS_MASK) +/* Flag when events were overwritten */ +#define RB_MISSED_EVENTS (1 << 31) +/* Missed count stored at end */ +#define RB_MISSED_STORED (1 << 30) + struct buffer_data_page { u64 time_stamp; /* page time stamp */ local_t commit; /* write committed index */ @@ -329,6 +343,7 @@ struct buffer_page { local_t write; /* index for next write */ unsigned read; /* index for next read */ local_t entries; /* entries on this page */ + unsigned long real_end; /* real end of data */ struct buffer_data_page *page; /* Actual data page */ }; @@ -408,6 +423,12 @@ int ring_buffer_print_page_header(struct trace_seq *s) (unsigned int)sizeof(field.commit), (unsigned int)is_signed_type(long)); + ret = trace_seq_printf(s, "\tfield: int overwrite;\t" + "offset:%u;\tsize:%u;\tsigned:%u;\n", + (unsigned int)offsetof(typeof(field), commit), + 1, + (unsigned int)is_signed_type(long)); + ret = trace_seq_printf(s, "\tfield: char data;\t" "offset:%u;\tsize:%u;\tsigned:%u;\n", (unsigned int)offsetof(typeof(field), data), @@ -431,6 +452,8 @@ struct ring_buffer_per_cpu { struct buffer_page *tail_page; /* write to tail */ struct buffer_page *commit_page; /* committed pages */ struct buffer_page *reader_page; + unsigned long lost_events; + unsigned long last_overrun; local_t commit_overrun; local_t overrun; local_t entries; @@ -1201,18 +1224,19 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) for (i = 0; i < nr_pages; i++) { if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages))) - return; + goto out; p = cpu_buffer->pages->next; bpage = list_entry(p, struct buffer_page, list); list_del_init(&bpage->list); free_buffer_page(bpage); } if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages))) - return; + goto out; rb_reset_cpu(cpu_buffer); rb_check_pages(cpu_buffer); +out: spin_unlock_irq(&cpu_buffer->reader_lock); } @@ -1229,7 +1253,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer, for (i = 0; i < nr_pages; i++) { if (RB_WARN_ON(cpu_buffer, list_empty(pages))) - return; + goto out; p = pages->next; bpage = list_entry(p, struct buffer_page, list); list_del_init(&bpage->list); @@ -1238,6 +1262,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer, rb_reset_cpu(cpu_buffer); rb_check_pages(cpu_buffer); +out: spin_unlock_irq(&cpu_buffer->reader_lock); } @@ -1547,7 +1572,7 @@ rb_update_event(struct ring_buffer_event *event, case 0: length -= RB_EVNT_HDR_SIZE; - if (length > RB_MAX_SMALL_DATA) + if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) event->array[0] = length; else event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT); @@ -1722,11 +1747,11 @@ static unsigned rb_calculate_event_length(unsigned length) if (!length) length = 1; - if (length > RB_MAX_SMALL_DATA) + if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) length += sizeof(event.array[0]); length += RB_EVNT_HDR_SIZE; - length = ALIGN(length, RB_ALIGNMENT); + length = ALIGN(length, RB_ARCH_ALIGNMENT); return length; } @@ -1743,6 +1768,14 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, * must fill the old tail_page with padding. */ if (tail >= BUF_PAGE_SIZE) { + /* + * If the page was filled, then we still need + * to update the real_end. Reset it to zero + * and the reader will ignore it. + */ + if (tail == BUF_PAGE_SIZE) + tail_page->real_end = 0; + local_sub(length, &tail_page->write); return; } @@ -1751,6 +1784,13 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, kmemcheck_annotate_bitfield(event, bitfield); /* + * Save the original length to the meta data. + * This will be used by the reader to add lost event + * counter. + */ + tail_page->real_end = tail; + + /* * If this event is bigger than the minimum size, then * we need to be careful that we don't subtract the * write counter enough to allow another writer to slip @@ -1968,17 +2008,13 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts, u64 *delta) { struct ring_buffer_event *event; - static int once; int ret; - if (unlikely(*delta > (1ULL << 59) && !once++)) { - printk(KERN_WARNING "Delta way too big! %llu" - " ts=%llu write stamp = %llu\n", - (unsigned long long)*delta, - (unsigned long long)*ts, - (unsigned long long)cpu_buffer->write_stamp); - WARN_ON(1); - } + WARN_ONCE(*delta > (1ULL << 59), + KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n", + (unsigned long long)*delta, + (unsigned long long)*ts, + (unsigned long long)cpu_buffer->write_stamp); /* * The delta is too big, we to add a @@ -2827,6 +2863,7 @@ static struct buffer_page * rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) { struct buffer_page *reader = NULL; + unsigned long overwrite; unsigned long flags; int nr_loops = 0; int ret; @@ -2868,6 +2905,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) local_set(&cpu_buffer->reader_page->write, 0); local_set(&cpu_buffer->reader_page->entries, 0); local_set(&cpu_buffer->reader_page->page->commit, 0); + cpu_buffer->reader_page->real_end = 0; spin: /* @@ -2888,6 +2926,18 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) rb_set_list_to_head(cpu_buffer, &cpu_buffer->reader_page->list); /* + * We want to make sure we read the overruns after we set up our + * pointers to the next object. The writer side does a + * cmpxchg to cross pages which acts as the mb on the writer + * side. Note, the reader will constantly fail the swap + * while the writer is updating the pointers, so this + * guarantees that the overwrite recorded here is the one we + * want to compare with the last_overrun. + */ + smp_mb(); + overwrite = local_read(&(cpu_buffer->overrun)); + + /* * Here's the tricky part. * * We need to move the pointer past the header page. @@ -2918,6 +2968,11 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->reader_page = reader; rb_reset_reader_page(cpu_buffer); + if (overwrite != cpu_buffer->last_overrun) { + cpu_buffer->lost_events = overwrite - cpu_buffer->last_overrun; + cpu_buffer->last_overrun = overwrite; + } + goto again; out: @@ -2994,8 +3049,14 @@ static void rb_advance_iter(struct ring_buffer_iter *iter) rb_advance_iter(iter); } +static int rb_lost_events(struct ring_buffer_per_cpu *cpu_buffer) +{ + return cpu_buffer->lost_events; +} + static struct ring_buffer_event * -rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts) +rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts, + unsigned long *lost_events) { struct ring_buffer_event *event; struct buffer_page *reader; @@ -3047,6 +3108,8 @@ rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts) ring_buffer_normalize_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu, ts); } + if (lost_events) + *lost_events = rb_lost_events(cpu_buffer); return event; default: @@ -3157,12 +3220,14 @@ static inline int rb_ok_to_lock(void) * @buffer: The ring buffer to read * @cpu: The cpu to peak at * @ts: The timestamp counter of this event. + * @lost_events: a variable to store if events were lost (may be NULL) * * This will return the event that will be read next, but does * not consume the data. */ struct ring_buffer_event * -ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) +ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts, + unsigned long *lost_events) { struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; struct ring_buffer_event *event; @@ -3177,7 +3242,7 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) local_irq_save(flags); if (dolock) spin_lock(&cpu_buffer->reader_lock); - event = rb_buffer_peek(cpu_buffer, ts); + event = rb_buffer_peek(cpu_buffer, ts, lost_events); if (event && event->type_len == RINGBUF_TYPE_PADDING) rb_advance_reader(cpu_buffer); if (dolock) @@ -3219,13 +3284,17 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts) /** * ring_buffer_consume - return an event and consume it * @buffer: The ring buffer to get the next event from + * @cpu: the cpu to read the buffer from + * @ts: a variable to store the timestamp (may be NULL) + * @lost_events: a variable to store if events were lost (may be NULL) * * Returns the next event in the ring buffer, and that event is consumed. * Meaning, that sequential reads will keep returning a different event, * and eventually empty the ring buffer if the producer is slower. */ struct ring_buffer_event * -ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts) +ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts, + unsigned long *lost_events) { struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_event *event = NULL; @@ -3246,9 +3315,11 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts) if (dolock) spin_lock(&cpu_buffer->reader_lock); - event = rb_buffer_peek(cpu_buffer, ts); - if (event) + event = rb_buffer_peek(cpu_buffer, ts, lost_events); + if (event) { + cpu_buffer->lost_events = 0; rb_advance_reader(cpu_buffer); + } if (dolock) spin_unlock(&cpu_buffer->reader_lock); @@ -3265,23 +3336,30 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts) EXPORT_SYMBOL_GPL(ring_buffer_consume); /** - * ring_buffer_read_start - start a non consuming read of the buffer + * ring_buffer_read_prepare - Prepare for a non consuming read of the buffer * @buffer: The ring buffer to read from * @cpu: The cpu buffer to iterate over * - * This starts up an iteration through the buffer. It also disables - * the recording to the buffer until the reading is finished. - * This prevents the reading from being corrupted. This is not - * a consuming read, so a producer is not expected. + * This performs the initial preparations necessary to iterate + * through the buffer. Memory is allocated, buffer recording + * is disabled, and the iterator pointer is returned to the caller. * - * Must be paired with ring_buffer_finish. + * Disabling buffer recordng prevents the reading from being + * corrupted. This is not a consuming read, so a producer is not + * expected. + * + * After a sequence of ring_buffer_read_prepare calls, the user is + * expected to make at least one call to ring_buffer_prepare_sync. + * Afterwards, ring_buffer_read_start is invoked to get things going + * for real. + * + * This overall must be paired with ring_buffer_finish. */ struct ring_buffer_iter * -ring_buffer_read_start(struct ring_buffer *buffer, int cpu) +ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_iter *iter; - unsigned long flags; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return NULL; @@ -3295,15 +3373,52 @@ ring_buffer_read_start(struct ring_buffer *buffer, int cpu) iter->cpu_buffer = cpu_buffer; atomic_inc(&cpu_buffer->record_disabled); + + return iter; +} +EXPORT_SYMBOL_GPL(ring_buffer_read_prepare); + +/** + * ring_buffer_read_prepare_sync - Synchronize a set of prepare calls + * + * All previously invoked ring_buffer_read_prepare calls to prepare + * iterators will be synchronized. Afterwards, read_buffer_read_start + * calls on those iterators are allowed. + */ +void +ring_buffer_read_prepare_sync(void) +{ synchronize_sched(); +} +EXPORT_SYMBOL_GPL(ring_buffer_read_prepare_sync); + +/** + * ring_buffer_read_start - start a non consuming read of the buffer + * @iter: The iterator returned by ring_buffer_read_prepare + * + * This finalizes the startup of an iteration through the buffer. + * The iterator comes from a call to ring_buffer_read_prepare and + * an intervening ring_buffer_read_prepare_sync must have been + * performed. + * + * Must be paired with ring_buffer_finish. + */ +void +ring_buffer_read_start(struct ring_buffer_iter *iter) +{ + struct ring_buffer_per_cpu *cpu_buffer; + unsigned long flags; + + if (!iter) + return; + + cpu_buffer = iter->cpu_buffer; spin_lock_irqsave(&cpu_buffer->reader_lock, flags); arch_spin_lock(&cpu_buffer->lock); rb_iter_reset(iter); arch_spin_unlock(&cpu_buffer->lock); spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); - - return iter; } EXPORT_SYMBOL_GPL(ring_buffer_read_start); @@ -3397,6 +3512,9 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->write_stamp = 0; cpu_buffer->read_stamp = 0; + cpu_buffer->lost_events = 0; + cpu_buffer->last_overrun = 0; + rb_head_page_activate(cpu_buffer); } @@ -3672,6 +3790,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer, struct ring_buffer_event *event; struct buffer_data_page *bpage; struct buffer_page *reader; + unsigned long missed_events; unsigned long flags; unsigned int commit; unsigned int read; @@ -3708,6 +3827,9 @@ int ring_buffer_read_page(struct ring_buffer *buffer, read = reader->read; commit = rb_page_commit(reader); + /* Check if any events were dropped */ + missed_events = cpu_buffer->lost_events; + /* * If this page has been partially read or * if len is not big enough to read the rest of the page or @@ -3768,9 +3890,42 @@ int ring_buffer_read_page(struct ring_buffer *buffer, local_set(&reader->entries, 0); reader->read = 0; *data_page = bpage; + + /* + * Use the real_end for the data size, + * This gives us a chance to store the lost events + * on the page. + */ + if (reader->real_end) + local_set(&bpage->commit, reader->real_end); } ret = read; + cpu_buffer->lost_events = 0; + + commit = local_read(&bpage->commit); + /* + * Set a flag in the commit field if we lost events + */ + if (missed_events) { + /* If there is room at the end of the page to save the + * missed events, then record it there. + */ + if (BUF_PAGE_SIZE - commit >= sizeof(missed_events)) { + memcpy(&bpage->data[commit], &missed_events, + sizeof(missed_events)); + local_add(RB_MISSED_STORED, &bpage->commit); + commit += sizeof(missed_events); + } + local_add(RB_MISSED_EVENTS, &bpage->commit); + } + + /* + * This page may be off to user land. Zero it out here. + */ + if (commit < BUF_PAGE_SIZE) + memset(&bpage->data[commit], 0, BUF_PAGE_SIZE - commit); + out_unlock: spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index df74c79..302f8a6 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -81,7 +81,7 @@ static enum event_status read_event(int cpu) int *entry; u64 ts; - event = ring_buffer_consume(buffer, cpu, &ts); + event = ring_buffer_consume(buffer, cpu, &ts, NULL); if (!event) return EVENT_DROPPED; @@ -113,7 +113,8 @@ static enum event_status read_page(int cpu) ret = ring_buffer_read_page(buffer, &bpage, PAGE_SIZE, cpu, 1); if (ret >= 0) { rpage = bpage; - commit = local_read(&rpage->commit); + /* The commit may have missed event flags set, clear them */ + commit = local_read(&rpage->commit) & 0xfffff; for (i = 0; i < commit && !kill_test; i += inc) { if (i >= (PAGE_SIZE - offsetof(struct rb_page, data))) { diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3ec2ee6..086d363 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -33,10 +33,10 @@ #include <linux/kdebug.h> #include <linux/string.h> #include <linux/rwsem.h> +#include <linux/slab.h> #include <linux/ctype.h> #include <linux/init.h> #include <linux/poll.h> -#include <linux/gfp.h> #include <linux/fs.h> #include "trace.h" @@ -117,9 +117,12 @@ static cpumask_var_t __read_mostly tracing_buffer_mask; * * It is default off, but you can enable it with either specifying * "ftrace_dump_on_oops" in the kernel command line, or setting - * /proc/sys/kernel/ftrace_dump_on_oops to true. + * /proc/sys/kernel/ftrace_dump_on_oops + * Set 1 if you want to dump buffers of all CPUs + * Set 2 if you want to dump the buffer of the CPU that triggered oops */ -int ftrace_dump_on_oops; + +enum ftrace_dump_mode ftrace_dump_on_oops; static int tracing_set_tracer(const char *buf); @@ -139,8 +142,17 @@ __setup("ftrace=", set_cmdline_ftrace); static int __init set_ftrace_dump_on_oops(char *str) { - ftrace_dump_on_oops = 1; - return 1; + if (*str++ != '=' || !*str) { + ftrace_dump_on_oops = DUMP_ALL; + return 1; + } + + if (!strcmp("orig_cpu", str)) { + ftrace_dump_on_oops = DUMP_ORIG; + return 1; + } + + return 0; } __setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops); @@ -1545,7 +1557,8 @@ static void trace_iterator_increment(struct trace_iterator *iter) } static struct trace_entry * -peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts) +peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts, + unsigned long *lost_events) { struct ring_buffer_event *event; struct ring_buffer_iter *buf_iter = iter->buffer_iter[cpu]; @@ -1556,7 +1569,8 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts) if (buf_iter) event = ring_buffer_iter_peek(buf_iter, ts); else - event = ring_buffer_peek(iter->tr->buffer, cpu, ts); + event = ring_buffer_peek(iter->tr->buffer, cpu, ts, + lost_events); ftrace_enable_cpu(); @@ -1564,10 +1578,12 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts) } static struct trace_entry * -__find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts) +__find_next_entry(struct trace_iterator *iter, int *ent_cpu, + unsigned long *missing_events, u64 *ent_ts) { struct ring_buffer *buffer = iter->tr->buffer; struct trace_entry *ent, *next = NULL; + unsigned long lost_events = 0, next_lost = 0; int cpu_file = iter->cpu_file; u64 next_ts = 0, ts; int next_cpu = -1; @@ -1580,7 +1596,7 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts) if (cpu_file > TRACE_PIPE_ALL_CPU) { if (ring_buffer_empty_cpu(buffer, cpu_file)) return NULL; - ent = peek_next_entry(iter, cpu_file, ent_ts); + ent = peek_next_entry(iter, cpu_file, ent_ts, missing_events); if (ent_cpu) *ent_cpu = cpu_file; @@ -1592,7 +1608,7 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts) if (ring_buffer_empty_cpu(buffer, cpu)) continue; - ent = peek_next_entry(iter, cpu, &ts); + ent = peek_next_entry(iter, cpu, &ts, &lost_events); /* * Pick the entry with the smallest timestamp: @@ -1601,6 +1617,7 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts) next = ent; next_cpu = cpu; next_ts = ts; + next_lost = lost_events; } } @@ -1610,6 +1627,9 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts) if (ent_ts) *ent_ts = next_ts; + if (missing_events) + *missing_events = next_lost; + return next; } @@ -1617,13 +1637,14 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts) struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts) { - return __find_next_entry(iter, ent_cpu, ent_ts); + return __find_next_entry(iter, ent_cpu, NULL, ent_ts); } /* Find the next real entry, and increment the iterator to the next entry */ static void *find_next_entry_inc(struct trace_iterator *iter) { - iter->ent = __find_next_entry(iter, &iter->cpu, &iter->ts); + iter->ent = __find_next_entry(iter, &iter->cpu, + &iter->lost_events, &iter->ts); if (iter->ent) trace_iterator_increment(iter); @@ -1635,7 +1656,8 @@ static void trace_consume(struct trace_iterator *iter) { /* Don't allow ftrace to trace into the ring buffers */ ftrace_disable_cpu(); - ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts); + ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts, + &iter->lost_events); ftrace_enable_cpu(); } @@ -1786,7 +1808,7 @@ static void print_func_help_header(struct seq_file *m) } -static void +void print_trace_header(struct seq_file *m, struct trace_iterator *iter) { unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); @@ -1914,7 +1936,7 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter) } if (event) - return event->trace(iter, sym_flags); + return event->funcs->trace(iter, sym_flags, event); if (!trace_seq_printf(s, "Unknown type %d\n", entry->type)) goto partial; @@ -1940,7 +1962,7 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter) event = ftrace_find_event(entry->type); if (event) - return event->raw(iter, 0); + return event->funcs->raw(iter, 0, event); if (!trace_seq_printf(s, "%d ?\n", entry->type)) goto partial; @@ -1967,7 +1989,7 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter) event = ftrace_find_event(entry->type); if (event) { - enum print_line_t ret = event->hex(iter, 0); + enum print_line_t ret = event->funcs->hex(iter, 0, event); if (ret != TRACE_TYPE_HANDLED) return ret; } @@ -1992,10 +2014,11 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter) } event = ftrace_find_event(entry->type); - return event ? event->binary(iter, 0) : TRACE_TYPE_HANDLED; + return event ? event->funcs->binary(iter, 0, event) : + TRACE_TYPE_HANDLED; } -static int trace_empty(struct trace_iterator *iter) +int trace_empty(struct trace_iterator *iter) { int cpu; @@ -2030,6 +2053,10 @@ static enum print_line_t print_trace_line(struct trace_iterator *iter) { enum print_line_t ret; + if (iter->lost_events) + trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n", + iter->cpu, iter->lost_events); + if (iter->trace && iter->trace->print_line) { ret = iter->trace->print_line(iter); if (ret != TRACE_TYPE_UNHANDLED) @@ -2058,6 +2085,23 @@ static enum print_line_t print_trace_line(struct trace_iterator *iter) return print_trace_fmt(iter); } +void trace_default_header(struct seq_file *m) +{ + struct trace_iterator *iter = m->private; + + if (iter->iter_flags & TRACE_FILE_LAT_FMT) { + /* print nothing if the buffers are empty */ + if (trace_empty(iter)) + return; + print_trace_header(m, iter); + if (!(trace_flags & TRACE_ITER_VERBOSE)) + print_lat_help_header(m); + } else { + if (!(trace_flags & TRACE_ITER_VERBOSE)) + print_func_help_header(m); + } +} + static int s_show(struct seq_file *m, void *v) { struct trace_iterator *iter = v; @@ -2070,17 +2114,9 @@ static int s_show(struct seq_file *m, void *v) } if (iter->trace && iter->trace->print_header) iter->trace->print_header(m); - else if (iter->iter_flags & TRACE_FILE_LAT_FMT) { - /* print nothing if the buffers are empty */ - if (trace_empty(iter)) - return 0; - print_trace_header(m, iter); - if (!(trace_flags & TRACE_ITER_VERBOSE)) - print_lat_help_header(m); - } else { - if (!(trace_flags & TRACE_ITER_VERBOSE)) - print_func_help_header(m); - } + else + trace_default_header(m); + } else if (iter->leftover) { /* * If we filled the seq_file buffer earlier, we @@ -2166,15 +2202,20 @@ __tracing_open(struct inode *inode, struct file *file) if (iter->cpu_file == TRACE_PIPE_ALL_CPU) { for_each_tracing_cpu(cpu) { - iter->buffer_iter[cpu] = - ring_buffer_read_start(iter->tr->buffer, cpu); + ring_buffer_read_prepare(iter->tr->buffer, cpu); + } + ring_buffer_read_prepare_sync(); + for_each_tracing_cpu(cpu) { + ring_buffer_read_start(iter->buffer_iter[cpu]); tracing_iter_reset(iter, cpu); } } else { cpu = iter->cpu_file; iter->buffer_iter[cpu] = - ring_buffer_read_start(iter->tr->buffer, cpu); + ring_buffer_read_prepare(iter->tr->buffer, cpu); + ring_buffer_read_prepare_sync(); + ring_buffer_read_start(iter->buffer_iter[cpu]); tracing_iter_reset(iter, cpu); } @@ -3269,12 +3310,12 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, size_t len, unsigned int flags) { - struct page *pages[PIPE_BUFFERS]; - struct partial_page partial[PIPE_BUFFERS]; + struct page *pages_def[PIPE_DEF_BUFFERS]; + struct partial_page partial_def[PIPE_DEF_BUFFERS]; struct trace_iterator *iter = filp->private_data; struct splice_pipe_desc spd = { - .pages = pages, - .partial = partial, + .pages = pages_def, + .partial = partial_def, .nr_pages = 0, /* This gets updated below. */ .flags = flags, .ops = &tracing_pipe_buf_ops, @@ -3285,6 +3326,9 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, size_t rem; unsigned int i; + if (splice_grow_spd(pipe, &spd)) + return -ENOMEM; + /* copy the tracer to avoid using a global lock all around */ mutex_lock(&trace_types_lock); if (unlikely(old_tracer != current_trace && current_trace)) { @@ -3315,23 +3359,23 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, trace_access_lock(iter->cpu_file); /* Fill as many pages as possible. */ - for (i = 0, rem = len; i < PIPE_BUFFERS && rem; i++) { - pages[i] = alloc_page(GFP_KERNEL); - if (!pages[i]) + for (i = 0, rem = len; i < pipe->buffers && rem; i++) { + spd.pages[i] = alloc_page(GFP_KERNEL); + if (!spd.pages[i]) break; rem = tracing_fill_pipe_page(rem, iter); /* Copy the data into the page, so we can start over. */ ret = trace_seq_to_buffer(&iter->seq, - page_address(pages[i]), + page_address(spd.pages[i]), iter->seq.len); if (ret < 0) { - __free_page(pages[i]); + __free_page(spd.pages[i]); break; } - partial[i].offset = 0; - partial[i].len = iter->seq.len; + spd.partial[i].offset = 0; + spd.partial[i].len = iter->seq.len; trace_seq_init(&iter->seq); } @@ -3342,12 +3386,14 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, spd.nr_pages = i; - return splice_to_pipe(pipe, &spd); + ret = splice_to_pipe(pipe, &spd); +out: + splice_shrink_spd(pipe, &spd); + return ret; out_err: mutex_unlock(&iter->mutex); - - return ret; + goto out; } static ssize_t @@ -3620,7 +3666,6 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, size_t count, loff_t *ppos) { struct ftrace_buffer_info *info = filp->private_data; - unsigned int pos; ssize_t ret; size_t size; @@ -3647,11 +3692,6 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, if (ret < 0) return 0; - pos = ring_buffer_page_len(info->spare); - - if (pos < PAGE_SIZE) - memset(info->spare + pos, 0, PAGE_SIZE - pos); - read: size = PAGE_SIZE - info->read; if (size > count) @@ -3746,11 +3786,11 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, unsigned int flags) { struct ftrace_buffer_info *info = file->private_data; - struct partial_page partial[PIPE_BUFFERS]; - struct page *pages[PIPE_BUFFERS]; + struct partial_page partial_def[PIPE_DEF_BUFFERS]; + struct page *pages_def[PIPE_DEF_BUFFERS]; struct splice_pipe_desc spd = { - .pages = pages, - .partial = partial, + .pages = pages_def, + .partial = partial_def, .flags = flags, .ops = &buffer_pipe_buf_ops, .spd_release = buffer_spd_release, @@ -3759,22 +3799,28 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, int entries, size, i; size_t ret; + if (splice_grow_spd(pipe, &spd)) + return -ENOMEM; + if (*ppos & (PAGE_SIZE - 1)) { WARN_ONCE(1, "Ftrace: previous read must page-align\n"); - return -EINVAL; + ret = -EINVAL; + goto out; } if (len & (PAGE_SIZE - 1)) { WARN_ONCE(1, "Ftrace: splice_read should page-align\n"); - if (len < PAGE_SIZE) - return -EINVAL; + if (len < PAGE_SIZE) { + ret = -EINVAL; + goto out; + } len &= PAGE_MASK; } trace_access_lock(info->cpu); entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); - for (i = 0; i < PIPE_BUFFERS && len && entries; i++, len -= PAGE_SIZE) { + for (i = 0; i < pipe->buffers && len && entries; i++, len -= PAGE_SIZE) { struct page *page; int r; @@ -3829,11 +3875,12 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, else ret = 0; /* TODO: block */ - return ret; + goto out; } ret = splice_to_pipe(pipe, &spd); - + splice_shrink_spd(pipe, &spd); +out: return ret; } @@ -4324,7 +4371,7 @@ static int trace_panic_handler(struct notifier_block *this, unsigned long event, void *unused) { if (ftrace_dump_on_oops) - ftrace_dump(); + ftrace_dump(ftrace_dump_on_oops); return NOTIFY_OK; } @@ -4341,7 +4388,7 @@ static int trace_die_handler(struct notifier_block *self, switch (val) { case DIE_OOPS: if (ftrace_dump_on_oops) - ftrace_dump(); + ftrace_dump(ftrace_dump_on_oops); break; default: break; @@ -4382,7 +4429,8 @@ trace_printk_seq(struct trace_seq *s) trace_seq_init(s); } -static void __ftrace_dump(bool disable_tracing) +static void +__ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode) { static arch_spinlock_t ftrace_dump_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; @@ -4415,12 +4463,25 @@ static void __ftrace_dump(bool disable_tracing) /* don't look at user memory in panic mode */ trace_flags &= ~TRACE_ITER_SYM_USEROBJ; - printk(KERN_TRACE "Dumping ftrace buffer:\n"); - /* Simulate the iterator */ iter.tr = &global_trace; iter.trace = current_trace; - iter.cpu_file = TRACE_PIPE_ALL_CPU; + + switch (oops_dump_mode) { + case DUMP_ALL: + iter.cpu_file = TRACE_PIPE_ALL_CPU; + break; + case DUMP_ORIG: + iter.cpu_file = raw_smp_processor_id(); + break; + case DUMP_NONE: + goto out_enable; + default: + printk(KERN_TRACE "Bad dumping mode, switching to all CPUs dump\n"); + iter.cpu_file = TRACE_PIPE_ALL_CPU; + } + + printk(KERN_TRACE "Dumping ftrace buffer:\n"); /* * We need to stop all tracing on all CPUS to read the @@ -4459,6 +4520,7 @@ static void __ftrace_dump(bool disable_tracing) else printk(KERN_TRACE "---------------------------------\n"); + out_enable: /* Re-enable tracing if requested */ if (!disable_tracing) { trace_flags |= old_userobj; @@ -4475,9 +4537,9 @@ static void __ftrace_dump(bool disable_tracing) } /* By default: disable tracing after the dump */ -void ftrace_dump(void) +void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { - __ftrace_dump(true); + __ftrace_dump(true, oops_dump_mode); } __init static int tracer_alloc_buffers(void) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 2825ef2..2cd9639 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -34,7 +34,6 @@ enum trace_type { TRACE_GRAPH_RET, TRACE_GRAPH_ENT, TRACE_USER_STACK, - TRACE_HW_BRANCHES, TRACE_KMEM_ALLOC, TRACE_KMEM_FREE, TRACE_BLK, @@ -103,29 +102,17 @@ struct syscall_trace_exit { long ret; }; -struct kprobe_trace_entry { +struct kprobe_trace_entry_head { struct trace_entry ent; unsigned long ip; - int nargs; - unsigned long args[]; }; -#define SIZEOF_KPROBE_TRACE_ENTRY(n) \ - (offsetof(struct kprobe_trace_entry, args) + \ - (sizeof(unsigned long) * (n))) - -struct kretprobe_trace_entry { +struct kretprobe_trace_entry_head { struct trace_entry ent; unsigned long func; unsigned long ret_ip; - int nargs; - unsigned long args[]; }; -#define SIZEOF_KRETPROBE_TRACE_ENTRY(n) \ - (offsetof(struct kretprobe_trace_entry, args) + \ - (sizeof(unsigned long) * (n))) - /* * trace_flag_type is an enumeration that holds different * states when a trace occurs. These are: @@ -229,7 +216,6 @@ extern void __ftrace_bad_type(void); TRACE_GRAPH_ENT); \ IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \ TRACE_GRAPH_RET); \ - IF_ASSIGN(var, ent, struct hw_branch_entry, TRACE_HW_BRANCHES);\ IF_ASSIGN(var, ent, struct kmemtrace_alloc_entry, \ TRACE_KMEM_ALLOC); \ IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \ @@ -378,6 +364,9 @@ void trace_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, unsigned long flags, int pc); +void trace_default_header(struct seq_file *m); +void print_trace_header(struct seq_file *m, struct trace_iterator *iter); +int trace_empty(struct trace_iterator *iter); void trace_graph_return(struct ftrace_graph_ret *trace); int trace_graph_entry(struct ftrace_graph_ent *trace); @@ -416,12 +405,12 @@ void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, int pc); #else -static inline void ftrace_trace_stack(struct trace_array *tr, +static inline void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags, int skip, int pc) { } -static inline void ftrace_trace_userstack(struct trace_array *tr, +static inline void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) { } @@ -467,8 +456,6 @@ extern int trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr); extern int trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr); -extern int trace_selftest_startup_hw_branches(struct tracer *trace, - struct trace_array *tr); extern int trace_selftest_startup_ksym(struct tracer *trace, struct trace_array *tr); #endif /* CONFIG_FTRACE_STARTUP_TEST */ @@ -491,9 +478,29 @@ extern int trace_clock_id; /* Standard output formatting function used for function return traces */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER -extern enum print_line_t print_graph_function(struct trace_iterator *iter); + +/* Flag options */ +#define TRACE_GRAPH_PRINT_OVERRUN 0x1 +#define TRACE_GRAPH_PRINT_CPU 0x2 +#define TRACE_GRAPH_PRINT_OVERHEAD 0x4 +#define TRACE_GRAPH_PRINT_PROC 0x8 +#define TRACE_GRAPH_PRINT_DURATION 0x10 +#define TRACE_GRAPH_PRINT_ABS_TIME 0x20 + +extern enum print_line_t +print_graph_function_flags(struct trace_iterator *iter, u32 flags); +extern void print_graph_headers_flags(struct seq_file *s, u32 flags); extern enum print_line_t trace_print_graph_duration(unsigned long long duration, struct trace_seq *s); +extern void graph_trace_open(struct trace_iterator *iter); +extern void graph_trace_close(struct trace_iterator *iter); +extern int __trace_graph_entry(struct trace_array *tr, + struct ftrace_graph_ent *trace, + unsigned long flags, int pc); +extern void __trace_graph_return(struct trace_array *tr, + struct ftrace_graph_ret *trace, + unsigned long flags, int pc); + #ifdef CONFIG_DYNAMIC_FTRACE /* TODO: make this variable */ @@ -524,7 +531,7 @@ static inline int ftrace_graph_addr(unsigned long addr) #endif /* CONFIG_DYNAMIC_FTRACE */ #else /* CONFIG_FUNCTION_GRAPH_TRACER */ static inline enum print_line_t -print_graph_function(struct trace_iterator *iter) +print_graph_function_flags(struct trace_iterator *iter, u32 flags) { return TRACE_TYPE_UNHANDLED; } @@ -771,12 +778,15 @@ extern void print_subsystem_event_filter(struct event_subsystem *system, struct trace_seq *s); extern int filter_assign_type(const char *type); +struct list_head * +trace_get_fields(struct ftrace_event_call *event_call); + static inline int filter_check_discard(struct ftrace_event_call *call, void *rec, struct ring_buffer *buffer, struct ring_buffer_event *event) { - if (unlikely(call->filter_active) && + if (unlikely(call->flags & TRACE_EVENT_FL_FILTERED) && !filter_match_preds(call->filter, rec)) { ring_buffer_discard_commit(buffer, event); return 1; diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index b9bc4d4..8d3538b 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -143,7 +143,7 @@ static void branch_trace_reset(struct trace_array *tr) } static enum print_line_t trace_branch_print(struct trace_iterator *iter, - int flags) + int flags, struct trace_event *event) { struct trace_branch *field; @@ -167,9 +167,13 @@ static void branch_print_header(struct seq_file *s) " |\n"); } +static struct trace_event_functions trace_branch_funcs = { + .trace = trace_branch_print, +}; + static struct trace_event trace_branch_event = { .type = TRACE_BRANCH, - .trace = trace_branch_print, + .funcs = &trace_branch_funcs, }; static struct tracer branch_trace __read_mostly = diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index 6fbfb8f..9d589d8 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c @@ -84,7 +84,7 @@ u64 notrace trace_clock_global(void) int this_cpu; u64 now; - raw_local_irq_save(flags); + local_irq_save(flags); this_cpu = raw_smp_processor_id(); now = cpu_clock(this_cpu); @@ -110,7 +110,7 @@ u64 notrace trace_clock_global(void) arch_spin_unlock(&trace_clock_struct.lock); out: - raw_local_irq_restore(flags); + local_irq_restore(flags); return now; } diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index c16a08f..dc008c1 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -318,18 +318,6 @@ FTRACE_ENTRY(branch, trace_branch, __entry->func, __entry->file, __entry->correct) ); -FTRACE_ENTRY(hw_branch, hw_branch_entry, - - TRACE_HW_BRANCHES, - - F_STRUCT( - __field( u64, from ) - __field( u64, to ) - ), - - F_printk("from: %llx to: %llx", __entry->from, __entry->to) -); - FTRACE_ENTRY(kmem_alloc, kmemtrace_alloc_entry, TRACE_KMEM_ALLOC, diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 81f691e..8a2b73f 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -9,70 +9,98 @@ #include <linux/kprobes.h> #include "trace.h" -DEFINE_PER_CPU(struct pt_regs, perf_trace_regs); -EXPORT_PER_CPU_SYMBOL_GPL(perf_trace_regs); - EXPORT_SYMBOL_GPL(perf_arch_fetch_caller_regs); -static char *perf_trace_buf; -static char *perf_trace_buf_nmi; +static char *perf_trace_buf[4]; -typedef typeof(char [PERF_MAX_TRACE_SIZE]) perf_trace_t ; +/* + * Force it to be aligned to unsigned long to avoid misaligned accesses + * suprises + */ +typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)]) + perf_trace_t; /* Count the events in use (per event id, not per instance) */ static int total_ref_count; -static int perf_trace_event_enable(struct ftrace_event_call *event) +static int perf_trace_event_init(struct ftrace_event_call *tp_event, + struct perf_event *p_event) { - char *buf; + struct hlist_head *list; int ret = -ENOMEM; + int cpu; - if (event->perf_refcount++ > 0) + p_event->tp_event = tp_event; + if (tp_event->perf_refcount++ > 0) return 0; - if (!total_ref_count) { - buf = (char *)alloc_percpu(perf_trace_t); - if (!buf) - goto fail_buf; + list = alloc_percpu(struct hlist_head); + if (!list) + goto fail; - rcu_assign_pointer(perf_trace_buf, buf); + for_each_possible_cpu(cpu) + INIT_HLIST_HEAD(per_cpu_ptr(list, cpu)); - buf = (char *)alloc_percpu(perf_trace_t); - if (!buf) - goto fail_buf_nmi; + tp_event->perf_events = list; - rcu_assign_pointer(perf_trace_buf_nmi, buf); - } + if (!total_ref_count) { + char *buf; + int i; - ret = event->perf_event_enable(event); - if (!ret) { - total_ref_count++; - return 0; + for (i = 0; i < 4; i++) { + buf = (char *)alloc_percpu(perf_trace_t); + if (!buf) + goto fail; + + perf_trace_buf[i] = buf; + } } -fail_buf_nmi: + if (tp_event->class->reg) + ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER); + else + ret = tracepoint_probe_register(tp_event->name, + tp_event->class->perf_probe, + tp_event); + + if (ret) + goto fail; + + total_ref_count++; + return 0; + +fail: if (!total_ref_count) { - free_percpu(perf_trace_buf_nmi); - free_percpu(perf_trace_buf); - perf_trace_buf_nmi = NULL; - perf_trace_buf = NULL; + int i; + + for (i = 0; i < 4; i++) { + free_percpu(perf_trace_buf[i]); + perf_trace_buf[i] = NULL; + } + } + + if (!--tp_event->perf_refcount) { + free_percpu(tp_event->perf_events); + tp_event->perf_events = NULL; } -fail_buf: - event->perf_refcount--; return ret; } -int perf_trace_enable(int event_id) +int perf_trace_init(struct perf_event *p_event) { - struct ftrace_event_call *event; + struct ftrace_event_call *tp_event; + int event_id = p_event->attr.config; int ret = -EINVAL; mutex_lock(&event_mutex); - list_for_each_entry(event, &ftrace_events, list) { - if (event->id == event_id && event->perf_event_enable && - try_module_get(event->mod)) { - ret = perf_trace_event_enable(event); + list_for_each_entry(tp_event, &ftrace_events, list) { + if (tp_event->event.type == event_id && + tp_event->class && + (tp_event->class->perf_probe || + tp_event->class->reg) && + try_module_get(tp_event->mod)) { + ret = perf_trace_event_init(tp_event, p_event); break; } } @@ -81,88 +109,87 @@ int perf_trace_enable(int event_id) return ret; } -static void perf_trace_event_disable(struct ftrace_event_call *event) +int perf_trace_enable(struct perf_event *p_event) { - char *buf, *nmi_buf; - - if (--event->perf_refcount > 0) - return; + struct ftrace_event_call *tp_event = p_event->tp_event; + struct hlist_head *list; - event->perf_event_disable(event); + list = tp_event->perf_events; + if (WARN_ON_ONCE(!list)) + return -EINVAL; - if (!--total_ref_count) { - buf = perf_trace_buf; - rcu_assign_pointer(perf_trace_buf, NULL); - - nmi_buf = perf_trace_buf_nmi; - rcu_assign_pointer(perf_trace_buf_nmi, NULL); + list = this_cpu_ptr(list); + hlist_add_head_rcu(&p_event->hlist_entry, list); - /* - * Ensure every events in profiling have finished before - * releasing the buffers - */ - synchronize_sched(); + return 0; +} - free_percpu(buf); - free_percpu(nmi_buf); - } +void perf_trace_disable(struct perf_event *p_event) +{ + hlist_del_rcu(&p_event->hlist_entry); } -void perf_trace_disable(int event_id) +void perf_trace_destroy(struct perf_event *p_event) { - struct ftrace_event_call *event; + struct ftrace_event_call *tp_event = p_event->tp_event; + int i; mutex_lock(&event_mutex); - list_for_each_entry(event, &ftrace_events, list) { - if (event->id == event_id) { - perf_trace_event_disable(event); - module_put(event->mod); - break; + if (--tp_event->perf_refcount > 0) + goto out; + + if (tp_event->class->reg) + tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER); + else + tracepoint_probe_unregister(tp_event->name, + tp_event->class->perf_probe, + tp_event); + + /* + * Ensure our callback won't be called anymore. See + * tracepoint_probe_unregister() and __DO_TRACE(). + */ + synchronize_sched(); + + free_percpu(tp_event->perf_events); + tp_event->perf_events = NULL; + + if (!--total_ref_count) { + for (i = 0; i < 4; i++) { + free_percpu(perf_trace_buf[i]); + perf_trace_buf[i] = NULL; } } +out: mutex_unlock(&event_mutex); } __kprobes void *perf_trace_buf_prepare(int size, unsigned short type, - int *rctxp, unsigned long *irq_flags) + struct pt_regs *regs, int *rctxp) { struct trace_entry *entry; - char *trace_buf, *raw_data; - int pc, cpu; + unsigned long flags; + char *raw_data; + int pc; - pc = preempt_count(); + BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long)); - /* Protect the per cpu buffer, begin the rcu read side */ - local_irq_save(*irq_flags); + pc = preempt_count(); *rctxp = perf_swevent_get_recursion_context(); if (*rctxp < 0) - goto err_recursion; - - cpu = smp_processor_id(); - - if (in_nmi()) - trace_buf = rcu_dereference_sched(perf_trace_buf_nmi); - else - trace_buf = rcu_dereference_sched(perf_trace_buf); - - if (!trace_buf) - goto err; + return NULL; - raw_data = per_cpu_ptr(trace_buf, cpu); + raw_data = this_cpu_ptr(perf_trace_buf[*rctxp]); /* zero the dead bytes from align to not leak stack to user */ - *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; + memset(&raw_data[size - sizeof(u64)], 0, sizeof(u64)); entry = (struct trace_entry *)raw_data; - tracing_generic_entry_update(entry, *irq_flags, pc); + local_save_flags(flags); + tracing_generic_entry_update(entry, flags, pc); entry->type = type; return raw_data; -err: - perf_swevent_put_recursion_context(*rctxp); -err_recursion: - local_irq_restore(*irq_flags); - return NULL; } EXPORT_SYMBOL_GPL(perf_trace_buf_prepare); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index beab8bf..53cffc0 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -15,6 +15,7 @@ #include <linux/uaccess.h> #include <linux/module.h> #include <linux/ctype.h> +#include <linux/slab.h> #include <linux/delay.h> #include <asm/setup.h> @@ -28,11 +29,23 @@ DEFINE_MUTEX(event_mutex); LIST_HEAD(ftrace_events); +struct list_head * +trace_get_fields(struct ftrace_event_call *event_call) +{ + if (!event_call->class->get_fields) + return &event_call->class->fields; + return event_call->class->get_fields(event_call); +} + int trace_define_field(struct ftrace_event_call *call, const char *type, const char *name, int offset, int size, int is_signed, int filter_type) { struct ftrace_event_field *field; + struct list_head *head; + + if (WARN_ON(!call->class)) + return 0; field = kzalloc(sizeof(*field), GFP_KERNEL); if (!field) @@ -55,7 +68,8 @@ int trace_define_field(struct ftrace_event_call *call, const char *type, field->size = size; field->is_signed = is_signed; - list_add(&field->link, &call->fields); + head = trace_get_fields(call); + list_add(&field->link, head); return 0; @@ -93,8 +107,10 @@ static int trace_define_common_fields(struct ftrace_event_call *call) void trace_destroy_fields(struct ftrace_event_call *call) { struct ftrace_event_field *field, *next; + struct list_head *head; - list_for_each_entry_safe(field, next, &call->fields, link) { + head = trace_get_fields(call); + list_for_each_entry_safe(field, next, head, link) { list_del(&field->link); kfree(field->type); kfree(field->name); @@ -106,11 +122,9 @@ int trace_event_raw_init(struct ftrace_event_call *call) { int id; - id = register_ftrace_event(call->event); + id = register_ftrace_event(&call->event); if (!id) return -ENODEV; - call->id = id; - INIT_LIST_HEAD(&call->fields); return 0; } @@ -123,23 +137,33 @@ static int ftrace_event_enable_disable(struct ftrace_event_call *call, switch (enable) { case 0: - if (call->enabled) { - call->enabled = 0; + if (call->flags & TRACE_EVENT_FL_ENABLED) { + call->flags &= ~TRACE_EVENT_FL_ENABLED; tracing_stop_cmdline_record(); - call->unregfunc(call); + if (call->class->reg) + call->class->reg(call, TRACE_REG_UNREGISTER); + else + tracepoint_probe_unregister(call->name, + call->class->probe, + call); } break; case 1: - if (!call->enabled) { + if (!(call->flags & TRACE_EVENT_FL_ENABLED)) { tracing_start_cmdline_record(); - ret = call->regfunc(call); + if (call->class->reg) + ret = call->class->reg(call, TRACE_REG_REGISTER); + else + ret = tracepoint_probe_register(call->name, + call->class->probe, + call); if (ret) { tracing_stop_cmdline_record(); pr_info("event trace: Could not enable event " "%s\n", call->name); break; } - call->enabled = 1; + call->flags |= TRACE_EVENT_FL_ENABLED; } break; } @@ -170,15 +194,16 @@ static int __ftrace_set_clr_event(const char *match, const char *sub, mutex_lock(&event_mutex); list_for_each_entry(call, &ftrace_events, list) { - if (!call->name || !call->regfunc) + if (!call->name || !call->class || + (!call->class->probe && !call->class->reg)) continue; if (match && strcmp(match, call->name) != 0 && - strcmp(match, call->system) != 0) + strcmp(match, call->class->system) != 0) continue; - if (sub && strcmp(sub, call->system) != 0) + if (sub && strcmp(sub, call->class->system) != 0) continue; if (event && strcmp(event, call->name) != 0) @@ -296,7 +321,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos) * The ftrace subsystem is for showing formats only. * They can not be enabled or disabled via the event files. */ - if (call->regfunc) + if (call->class && (call->class->probe || call->class->reg)) return call; } @@ -327,7 +352,7 @@ s_next(struct seq_file *m, void *v, loff_t *pos) (*pos)++; list_for_each_entry_continue(call, &ftrace_events, list) { - if (call->enabled) + if (call->flags & TRACE_EVENT_FL_ENABLED) return call; } @@ -354,8 +379,8 @@ static int t_show(struct seq_file *m, void *v) { struct ftrace_event_call *call = v; - if (strcmp(call->system, TRACE_SYSTEM) != 0) - seq_printf(m, "%s:", call->system); + if (strcmp(call->class->system, TRACE_SYSTEM) != 0) + seq_printf(m, "%s:", call->class->system); seq_printf(m, "%s\n", call->name); return 0; @@ -386,7 +411,7 @@ event_enable_read(struct file *filp, char __user *ubuf, size_t cnt, struct ftrace_event_call *call = filp->private_data; char *buf; - if (call->enabled) + if (call->flags & TRACE_EVENT_FL_ENABLED) buf = "1\n"; else buf = "0\n"; @@ -449,10 +474,11 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, mutex_lock(&event_mutex); list_for_each_entry(call, &ftrace_events, list) { - if (!call->name || !call->regfunc) + if (!call->name || !call->class || + (!call->class->probe && !call->class->reg)) continue; - if (system && strcmp(call->system, system) != 0) + if (system && strcmp(call->class->system, system) != 0) continue; /* @@ -460,7 +486,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, * or if all events or cleared, or if we have * a mixture. */ - set |= (1 << !!call->enabled); + set |= (1 << !!(call->flags & TRACE_EVENT_FL_ENABLED)); /* * If we have a mixture, no need to look further. @@ -524,6 +550,7 @@ event_format_read(struct file *filp, char __user *ubuf, size_t cnt, { struct ftrace_event_call *call = filp->private_data; struct ftrace_event_field *field; + struct list_head *head; struct trace_seq *s; int common_field_count = 5; char *buf; @@ -539,10 +566,11 @@ event_format_read(struct file *filp, char __user *ubuf, size_t cnt, trace_seq_init(s); trace_seq_printf(s, "name: %s\n", call->name); - trace_seq_printf(s, "ID: %d\n", call->id); + trace_seq_printf(s, "ID: %d\n", call->event.type); trace_seq_printf(s, "format:\n"); - list_for_each_entry_reverse(field, &call->fields, link) { + head = trace_get_fields(call); + list_for_each_entry_reverse(field, head, link) { /* * Smartly shows the array type(except dynamic array). * Normal: @@ -612,7 +640,7 @@ event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) return -ENOMEM; trace_seq_init(s); - trace_seq_printf(s, "%d\n", call->id); + trace_seq_printf(s, "%d\n", call->event.type); r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); @@ -918,14 +946,15 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, const struct file_operations *filter, const struct file_operations *format) { + struct list_head *head; int ret; /* * If the trace point header did not define TRACE_SYSTEM * then the system would be called "TRACE_SYSTEM". */ - if (strcmp(call->system, TRACE_SYSTEM) != 0) - d_events = event_subsystem_dir(call->system, d_events); + if (strcmp(call->class->system, TRACE_SYSTEM) != 0) + d_events = event_subsystem_dir(call->class->system, d_events); call->dir = debugfs_create_dir(call->name, d_events); if (!call->dir) { @@ -934,22 +963,31 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, return -1; } - if (call->regfunc) + if (call->class->probe || call->class->reg) trace_create_file("enable", 0644, call->dir, call, enable); - if (call->id && call->perf_event_enable) +#ifdef CONFIG_PERF_EVENTS + if (call->event.type && (call->class->perf_probe || call->class->reg)) trace_create_file("id", 0444, call->dir, call, id); +#endif - if (call->define_fields) { - ret = trace_define_common_fields(call); - if (!ret) - ret = call->define_fields(call); - if (ret < 0) { - pr_warning("Could not initialize trace point" - " events/%s\n", call->name); - return ret; + if (call->class->define_fields) { + /* + * Other events may have the same class. Only update + * the fields if they are not already defined. + */ + head = trace_get_fields(call); + if (list_empty(head)) { + ret = trace_define_common_fields(call); + if (!ret) + ret = call->class->define_fields(call); + if (ret < 0) { + pr_warning("Could not initialize trace point" + " events/%s\n", call->name); + return ret; + } } trace_create_file("filter", 0644, call->dir, call, filter); @@ -969,8 +1007,8 @@ static int __trace_add_event_call(struct ftrace_event_call *call) if (!call->name) return -EINVAL; - if (call->raw_init) { - ret = call->raw_init(call); + if (call->class->raw_init) { + ret = call->class->raw_init(call); if (ret < 0) { if (ret != -ENOSYS) pr_warning("Could not initialize trace " @@ -1034,13 +1072,13 @@ static void remove_subsystem_dir(const char *name) static void __trace_remove_event_call(struct ftrace_event_call *call) { ftrace_event_enable_disable(call, 0); - if (call->event) - __unregister_ftrace_event(call->event); + if (call->event.funcs) + __unregister_ftrace_event(&call->event); debugfs_remove_recursive(call->dir); list_del(&call->list); trace_destroy_fields(call); destroy_preds(call); - remove_subsystem_dir(call->system); + remove_subsystem_dir(call->class->system); } /* Remove an event_call */ @@ -1131,8 +1169,8 @@ static void trace_module_add_events(struct module *mod) /* The linker may leave blanks */ if (!call->name) continue; - if (call->raw_init) { - ret = call->raw_init(call); + if (call->class->raw_init) { + ret = call->class->raw_init(call); if (ret < 0) { if (ret != -ENOSYS) pr_warning("Could not initialize trace " @@ -1285,8 +1323,8 @@ static __init int event_trace_init(void) /* The linker may leave blanks */ if (!call->name) continue; - if (call->raw_init) { - ret = call->raw_init(call); + if (call->class->raw_init) { + ret = call->class->raw_init(call); if (ret < 0) { if (ret != -ENOSYS) pr_warning("Could not initialize trace " @@ -1387,8 +1425,8 @@ static __init void event_trace_self_tests(void) list_for_each_entry(call, &ftrace_events, list) { - /* Only test those that have a regfunc */ - if (!call->regfunc) + /* Only test those that have a probe */ + if (!call->class || !call->class->probe) continue; /* @@ -1398,8 +1436,8 @@ static __init void event_trace_self_tests(void) * syscalls as we test. */ #ifndef CONFIG_EVENT_TRACE_TEST_SYSCALLS - if (call->system && - strcmp(call->system, "syscalls") == 0) + if (call->class->system && + strcmp(call->class->system, "syscalls") == 0) continue; #endif @@ -1409,7 +1447,7 @@ static __init void event_trace_self_tests(void) * If an event is already enabled, someone is using * it and the self test should not be on. */ - if (call->enabled) { + if (call->flags & TRACE_EVENT_FL_ENABLED) { pr_warning("Enabled event during self test!\n"); WARN_ON_ONCE(1); continue; diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 4615f62..57bb1bb 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -22,6 +22,7 @@ #include <linux/ctype.h> #include <linux/mutex.h> #include <linux/perf_event.h> +#include <linux/slab.h> #include "trace.h" #include "trace_output.h" @@ -499,8 +500,10 @@ static struct ftrace_event_field * find_event_field(struct ftrace_event_call *call, char *name) { struct ftrace_event_field *field; + struct list_head *head; - list_for_each_entry(field, &call->fields, link) { + head = trace_get_fields(call); + list_for_each_entry(field, head, link) { if (!strcmp(field->name, name)) return field; } @@ -544,7 +547,7 @@ static void filter_disable_preds(struct ftrace_event_call *call) struct event_filter *filter = call->filter; int i; - call->filter_active = 0; + call->flags &= ~TRACE_EVENT_FL_FILTERED; filter->n_preds = 0; for (i = 0; i < MAX_FILTER_PRED; i++) @@ -571,7 +574,7 @@ void destroy_preds(struct ftrace_event_call *call) { __free_preds(call->filter); call->filter = NULL; - call->filter_active = 0; + call->flags &= ~TRACE_EVENT_FL_FILTERED; } static struct event_filter *__alloc_preds(void) @@ -610,7 +613,7 @@ static int init_preds(struct ftrace_event_call *call) if (call->filter) return 0; - call->filter_active = 0; + call->flags &= ~TRACE_EVENT_FL_FILTERED; call->filter = __alloc_preds(); if (IS_ERR(call->filter)) return PTR_ERR(call->filter); @@ -624,10 +627,10 @@ static int init_subsystem_preds(struct event_subsystem *system) int err; list_for_each_entry(call, &ftrace_events, list) { - if (!call->define_fields) + if (!call->class || !call->class->define_fields) continue; - if (strcmp(call->system, system->name) != 0) + if (strcmp(call->class->system, system->name) != 0) continue; err = init_preds(call); @@ -643,10 +646,10 @@ static void filter_free_subsystem_preds(struct event_subsystem *system) struct ftrace_event_call *call; list_for_each_entry(call, &ftrace_events, list) { - if (!call->define_fields) + if (!call->class || !call->class->define_fields) continue; - if (strcmp(call->system, system->name) != 0) + if (strcmp(call->class->system, system->name) != 0) continue; filter_disable_preds(call); @@ -1248,10 +1251,10 @@ static int replace_system_preds(struct event_subsystem *system, list_for_each_entry(call, &ftrace_events, list) { struct event_filter *filter = call->filter; - if (!call->define_fields) + if (!call->class || !call->class->define_fields) continue; - if (strcmp(call->system, system->name) != 0) + if (strcmp(call->class->system, system->name) != 0) continue; /* try to see if the filter can be applied */ @@ -1265,7 +1268,7 @@ static int replace_system_preds(struct event_subsystem *system, if (err) filter_disable_preds(call); else { - call->filter_active = 1; + call->flags |= TRACE_EVENT_FL_FILTERED; replace_filter_string(filter, filter_string); } fail = false; @@ -1314,7 +1317,7 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string) if (err) append_filter_err(ps, call->filter); else - call->filter_active = 1; + call->flags |= TRACE_EVENT_FL_FILTERED; out: filter_opstack_clear(ps); postfix_clear(ps); @@ -1392,12 +1395,12 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id, mutex_lock(&event_mutex); list_for_each_entry(call, &ftrace_events, list) { - if (call->id == event_id) + if (call->event.type == event_id) break; } err = -EINVAL; - if (!call) + if (&call->list == &ftrace_events) goto out_unlock; err = -EEXIST; diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index e091f64..8536e2a 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -127,7 +127,7 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \ static int ftrace_raw_init_event(struct ftrace_event_call *call) { - INIT_LIST_HEAD(&call->fields); + INIT_LIST_HEAD(&call->class->fields); return 0; } @@ -153,17 +153,21 @@ static int ftrace_raw_init_event(struct ftrace_event_call *call) #define F_printk(fmt, args...) #fmt ", " __stringify(args) #undef FTRACE_ENTRY -#define FTRACE_ENTRY(call, struct_name, type, tstruct, print) \ +#define FTRACE_ENTRY(call, struct_name, etype, tstruct, print) \ + \ +struct ftrace_event_class event_class_ftrace_##call = { \ + .system = __stringify(TRACE_SYSTEM), \ + .define_fields = ftrace_define_fields_##call, \ + .raw_init = ftrace_raw_init_event, \ +}; \ \ struct ftrace_event_call __used \ __attribute__((__aligned__(4))) \ __attribute__((section("_ftrace_events"))) event_##call = { \ .name = #call, \ - .id = type, \ - .system = __stringify(TRACE_SYSTEM), \ - .raw_init = ftrace_raw_init_event, \ + .event.type = etype, \ + .class = &event_class_ftrace_##call, \ .print_fmt = print, \ - .define_fields = ftrace_define_fields_##call, \ }; \ #include "trace_entries.h" diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index e6989d9..79f4bac 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -9,6 +9,7 @@ #include <linux/debugfs.h> #include <linux/uaccess.h> #include <linux/ftrace.h> +#include <linux/slab.h> #include <linux/fs.h> #include "trace.h" @@ -39,7 +40,7 @@ struct fgraph_data { #define TRACE_GRAPH_PRINT_OVERHEAD 0x4 #define TRACE_GRAPH_PRINT_PROC 0x8 #define TRACE_GRAPH_PRINT_DURATION 0x10 -#define TRACE_GRAPH_PRINT_ABS_TIME 0X20 +#define TRACE_GRAPH_PRINT_ABS_TIME 0x20 static struct tracer_opt trace_opts[] = { /* Display overruns? (for self-debug purpose) */ @@ -178,7 +179,7 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer) return ret; } -static int __trace_graph_entry(struct trace_array *tr, +int __trace_graph_entry(struct trace_array *tr, struct ftrace_graph_ent *trace, unsigned long flags, int pc) @@ -245,7 +246,7 @@ int trace_graph_thresh_entry(struct ftrace_graph_ent *trace) return trace_graph_entry(trace); } -static void __trace_graph_return(struct trace_array *tr, +void __trace_graph_return(struct trace_array *tr, struct ftrace_graph_ret *trace, unsigned long flags, int pc) @@ -489,9 +490,10 @@ get_return_for_leaf(struct trace_iterator *iter, * We need to consume the current entry to see * the next one. */ - ring_buffer_consume(iter->tr->buffer, iter->cpu, NULL); + ring_buffer_consume(iter->tr->buffer, iter->cpu, + NULL, NULL); event = ring_buffer_peek(iter->tr->buffer, iter->cpu, - NULL); + NULL, NULL); } if (!event) @@ -525,17 +527,18 @@ get_return_for_leaf(struct trace_iterator *iter, /* Signal a overhead of time execution to the output */ static int -print_graph_overhead(unsigned long long duration, struct trace_seq *s) +print_graph_overhead(unsigned long long duration, struct trace_seq *s, + u32 flags) { /* If duration disappear, we don't need anything */ - if (!(tracer_flags.val & TRACE_GRAPH_PRINT_DURATION)) + if (!(flags & TRACE_GRAPH_PRINT_DURATION)) return 1; /* Non nested entry or return */ if (duration == -1) return trace_seq_printf(s, " "); - if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) { + if (flags & TRACE_GRAPH_PRINT_OVERHEAD) { /* Duration exceeded 100 msecs */ if (duration > 100000ULL) return trace_seq_printf(s, "! "); @@ -561,7 +564,7 @@ static int print_graph_abs_time(u64 t, struct trace_seq *s) static enum print_line_t print_graph_irq(struct trace_iterator *iter, unsigned long addr, - enum trace_type type, int cpu, pid_t pid) + enum trace_type type, int cpu, pid_t pid, u32 flags) { int ret; struct trace_seq *s = &iter->seq; @@ -571,21 +574,21 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr, return TRACE_TYPE_UNHANDLED; /* Absolute time */ - if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) { + if (flags & TRACE_GRAPH_PRINT_ABS_TIME) { ret = print_graph_abs_time(iter->ts, s); if (!ret) return TRACE_TYPE_PARTIAL_LINE; } /* Cpu */ - if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) { + if (flags & TRACE_GRAPH_PRINT_CPU) { ret = print_graph_cpu(s, cpu); if (ret == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; } /* Proc */ - if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) { + if (flags & TRACE_GRAPH_PRINT_PROC) { ret = print_graph_proc(s, pid); if (ret == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; @@ -595,7 +598,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr, } /* No overhead */ - ret = print_graph_overhead(-1, s); + ret = print_graph_overhead(-1, s, flags); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -608,7 +611,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr, return TRACE_TYPE_PARTIAL_LINE; /* Don't close the duration column if haven't one */ - if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) + if (flags & TRACE_GRAPH_PRINT_DURATION) trace_seq_printf(s, " |"); ret = trace_seq_printf(s, "\n"); @@ -678,7 +681,8 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s) static enum print_line_t print_graph_entry_leaf(struct trace_iterator *iter, struct ftrace_graph_ent_entry *entry, - struct ftrace_graph_ret_entry *ret_entry, struct trace_seq *s) + struct ftrace_graph_ret_entry *ret_entry, + struct trace_seq *s, u32 flags) { struct fgraph_data *data = iter->private; struct ftrace_graph_ret *graph_ret; @@ -710,12 +714,12 @@ print_graph_entry_leaf(struct trace_iterator *iter, } /* Overhead */ - ret = print_graph_overhead(duration, s); + ret = print_graph_overhead(duration, s, flags); if (!ret) return TRACE_TYPE_PARTIAL_LINE; /* Duration */ - if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) { + if (flags & TRACE_GRAPH_PRINT_DURATION) { ret = print_graph_duration(duration, s); if (ret == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; @@ -738,7 +742,7 @@ print_graph_entry_leaf(struct trace_iterator *iter, static enum print_line_t print_graph_entry_nested(struct trace_iterator *iter, struct ftrace_graph_ent_entry *entry, - struct trace_seq *s, int cpu) + struct trace_seq *s, int cpu, u32 flags) { struct ftrace_graph_ent *call = &entry->graph_ent; struct fgraph_data *data = iter->private; @@ -758,12 +762,12 @@ print_graph_entry_nested(struct trace_iterator *iter, } /* No overhead */ - ret = print_graph_overhead(-1, s); + ret = print_graph_overhead(-1, s, flags); if (!ret) return TRACE_TYPE_PARTIAL_LINE; /* No time */ - if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) { + if (flags & TRACE_GRAPH_PRINT_DURATION) { ret = trace_seq_printf(s, " | "); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -789,7 +793,7 @@ print_graph_entry_nested(struct trace_iterator *iter, static enum print_line_t print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, - int type, unsigned long addr) + int type, unsigned long addr, u32 flags) { struct fgraph_data *data = iter->private; struct trace_entry *ent = iter->ent; @@ -802,27 +806,27 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, if (type) { /* Interrupt */ - ret = print_graph_irq(iter, addr, type, cpu, ent->pid); + ret = print_graph_irq(iter, addr, type, cpu, ent->pid, flags); if (ret == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; } /* Absolute time */ - if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) { + if (flags & TRACE_GRAPH_PRINT_ABS_TIME) { ret = print_graph_abs_time(iter->ts, s); if (!ret) return TRACE_TYPE_PARTIAL_LINE; } /* Cpu */ - if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) { + if (flags & TRACE_GRAPH_PRINT_CPU) { ret = print_graph_cpu(s, cpu); if (ret == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; } /* Proc */ - if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) { + if (flags & TRACE_GRAPH_PRINT_PROC) { ret = print_graph_proc(s, ent->pid); if (ret == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; @@ -844,7 +848,7 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, static enum print_line_t print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, - struct trace_iterator *iter) + struct trace_iterator *iter, u32 flags) { struct fgraph_data *data = iter->private; struct ftrace_graph_ent *call = &field->graph_ent; @@ -852,14 +856,14 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, static enum print_line_t ret; int cpu = iter->cpu; - if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func)) + if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags)) return TRACE_TYPE_PARTIAL_LINE; leaf_ret = get_return_for_leaf(iter, field); if (leaf_ret) - ret = print_graph_entry_leaf(iter, field, leaf_ret, s); + ret = print_graph_entry_leaf(iter, field, leaf_ret, s, flags); else - ret = print_graph_entry_nested(iter, field, s, cpu); + ret = print_graph_entry_nested(iter, field, s, cpu, flags); if (data) { /* @@ -878,7 +882,8 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, static enum print_line_t print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, - struct trace_entry *ent, struct trace_iterator *iter) + struct trace_entry *ent, struct trace_iterator *iter, + u32 flags) { unsigned long long duration = trace->rettime - trace->calltime; struct fgraph_data *data = iter->private; @@ -908,16 +913,16 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, } } - if (print_graph_prologue(iter, s, 0, 0)) + if (print_graph_prologue(iter, s, 0, 0, flags)) return TRACE_TYPE_PARTIAL_LINE; /* Overhead */ - ret = print_graph_overhead(duration, s); + ret = print_graph_overhead(duration, s, flags); if (!ret) return TRACE_TYPE_PARTIAL_LINE; /* Duration */ - if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) { + if (flags & TRACE_GRAPH_PRINT_DURATION) { ret = print_graph_duration(duration, s); if (ret == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; @@ -947,14 +952,15 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, } /* Overrun */ - if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERRUN) { + if (flags & TRACE_GRAPH_PRINT_OVERRUN) { ret = trace_seq_printf(s, " (Overruns: %lu)\n", trace->overrun); if (!ret) return TRACE_TYPE_PARTIAL_LINE; } - ret = print_graph_irq(iter, trace->func, TRACE_GRAPH_RET, cpu, pid); + ret = print_graph_irq(iter, trace->func, TRACE_GRAPH_RET, + cpu, pid, flags); if (ret == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; @@ -962,8 +968,8 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, } static enum print_line_t -print_graph_comment(struct trace_seq *s, struct trace_entry *ent, - struct trace_iterator *iter) +print_graph_comment(struct trace_seq *s, struct trace_entry *ent, + struct trace_iterator *iter, u32 flags) { unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); struct fgraph_data *data = iter->private; @@ -975,16 +981,16 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent, if (data) depth = per_cpu_ptr(data->cpu_data, iter->cpu)->depth; - if (print_graph_prologue(iter, s, 0, 0)) + if (print_graph_prologue(iter, s, 0, 0, flags)) return TRACE_TYPE_PARTIAL_LINE; /* No overhead */ - ret = print_graph_overhead(-1, s); + ret = print_graph_overhead(-1, s, flags); if (!ret) return TRACE_TYPE_PARTIAL_LINE; /* No time */ - if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) { + if (flags & TRACE_GRAPH_PRINT_DURATION) { ret = trace_seq_printf(s, " | "); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -1019,7 +1025,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent, if (!event) return TRACE_TYPE_UNHANDLED; - ret = event->trace(iter, sym_flags); + ret = event->funcs->trace(iter, sym_flags, event); if (ret != TRACE_TYPE_HANDLED) return ret; } @@ -1039,7 +1045,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent, enum print_line_t -print_graph_function(struct trace_iterator *iter) +print_graph_function_flags(struct trace_iterator *iter, u32 flags) { struct ftrace_graph_ent_entry *field; struct fgraph_data *data = iter->private; @@ -1060,7 +1066,7 @@ print_graph_function(struct trace_iterator *iter) if (data && data->failed) { field = &data->ent; iter->cpu = data->cpu; - ret = print_graph_entry(field, s, iter); + ret = print_graph_entry(field, s, iter, flags); if (ret == TRACE_TYPE_HANDLED && iter->cpu != cpu) { per_cpu_ptr(data->cpu_data, iter->cpu)->ignore = 1; ret = TRACE_TYPE_NO_CONSUME; @@ -1080,32 +1086,50 @@ print_graph_function(struct trace_iterator *iter) struct ftrace_graph_ent_entry saved; trace_assign_type(field, entry); saved = *field; - return print_graph_entry(&saved, s, iter); + return print_graph_entry(&saved, s, iter, flags); } case TRACE_GRAPH_RET: { struct ftrace_graph_ret_entry *field; trace_assign_type(field, entry); - return print_graph_return(&field->ret, s, entry, iter); + return print_graph_return(&field->ret, s, entry, iter, flags); } + case TRACE_STACK: + case TRACE_FN: + /* dont trace stack and functions as comments */ + return TRACE_TYPE_UNHANDLED; + default: - return print_graph_comment(s, entry, iter); + return print_graph_comment(s, entry, iter, flags); } return TRACE_TYPE_HANDLED; } -static void print_lat_header(struct seq_file *s) +static enum print_line_t +print_graph_function(struct trace_iterator *iter) +{ + return print_graph_function_flags(iter, tracer_flags.val); +} + +static enum print_line_t +print_graph_function_event(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + return print_graph_function(iter); +} + +static void print_lat_header(struct seq_file *s, u32 flags) { static const char spaces[] = " " /* 16 spaces */ " " /* 4 spaces */ " "; /* 17 spaces */ int size = 0; - if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) + if (flags & TRACE_GRAPH_PRINT_ABS_TIME) size += 16; - if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) + if (flags & TRACE_GRAPH_PRINT_CPU) size += 4; - if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) + if (flags & TRACE_GRAPH_PRINT_PROC) size += 17; seq_printf(s, "#%.*s _-----=> irqs-off \n", size, spaces); @@ -1116,43 +1140,48 @@ static void print_lat_header(struct seq_file *s) seq_printf(s, "#%.*s|||| / \n", size, spaces); } -static void print_graph_headers(struct seq_file *s) +void print_graph_headers_flags(struct seq_file *s, u32 flags) { int lat = trace_flags & TRACE_ITER_LATENCY_FMT; if (lat) - print_lat_header(s); + print_lat_header(s, flags); /* 1st line */ seq_printf(s, "#"); - if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) + if (flags & TRACE_GRAPH_PRINT_ABS_TIME) seq_printf(s, " TIME "); - if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) + if (flags & TRACE_GRAPH_PRINT_CPU) seq_printf(s, " CPU"); - if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) + if (flags & TRACE_GRAPH_PRINT_PROC) seq_printf(s, " TASK/PID "); if (lat) seq_printf(s, "|||||"); - if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) + if (flags & TRACE_GRAPH_PRINT_DURATION) seq_printf(s, " DURATION "); seq_printf(s, " FUNCTION CALLS\n"); /* 2nd line */ seq_printf(s, "#"); - if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) + if (flags & TRACE_GRAPH_PRINT_ABS_TIME) seq_printf(s, " | "); - if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) + if (flags & TRACE_GRAPH_PRINT_CPU) seq_printf(s, " | "); - if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) + if (flags & TRACE_GRAPH_PRINT_PROC) seq_printf(s, " | | "); if (lat) seq_printf(s, "|||||"); - if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) + if (flags & TRACE_GRAPH_PRINT_DURATION) seq_printf(s, " | | "); seq_printf(s, " | | | |\n"); } -static void graph_trace_open(struct trace_iterator *iter) +void print_graph_headers(struct seq_file *s) +{ + print_graph_headers_flags(s, tracer_flags.val); +} + +void graph_trace_open(struct trace_iterator *iter) { /* pid and depth on the last trace processed */ struct fgraph_data *data; @@ -1187,7 +1216,7 @@ static void graph_trace_open(struct trace_iterator *iter) pr_warning("function graph tracer: not enough memory\n"); } -static void graph_trace_close(struct trace_iterator *iter) +void graph_trace_close(struct trace_iterator *iter) { struct fgraph_data *data = iter->private; @@ -1197,6 +1226,20 @@ static void graph_trace_close(struct trace_iterator *iter) } } +static struct trace_event_functions graph_functions = { + .trace = print_graph_function_event, +}; + +static struct trace_event graph_trace_entry_event = { + .type = TRACE_GRAPH_ENT, + .funcs = &graph_functions, +}; + +static struct trace_event graph_trace_ret_event = { + .type = TRACE_GRAPH_RET, + .funcs = &graph_functions +}; + static struct tracer graph_trace __read_mostly = { .name = "function_graph", .open = graph_trace_open, @@ -1218,6 +1261,16 @@ static __init int init_graph_trace(void) { max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1); + if (!register_ftrace_event(&graph_trace_entry_event)) { + pr_warning("Warning: could not register graph trace events\n"); + return 1; + } + + if (!register_ftrace_event(&graph_trace_ret_event)) { + pr_warning("Warning: could not register graph trace events\n"); + return 1; + } + return register_tracer(&graph_trace); } diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c deleted file mode 100644 index 7b97000..0000000 --- a/kernel/trace/trace_hw_branches.c +++ /dev/null @@ -1,312 +0,0 @@ -/* - * h/w branch tracer for x86 based on BTS - * - * Copyright (C) 2008-2009 Intel Corporation. - * Markus Metzger <markus.t.metzger@gmail.com>, 2008-2009 - */ -#include <linux/kallsyms.h> -#include <linux/debugfs.h> -#include <linux/ftrace.h> -#include <linux/module.h> -#include <linux/cpu.h> -#include <linux/smp.h> -#include <linux/fs.h> - -#include <asm/ds.h> - -#include "trace_output.h" -#include "trace.h" - - -#define BTS_BUFFER_SIZE (1 << 13) - -static DEFINE_PER_CPU(struct bts_tracer *, hwb_tracer); -static DEFINE_PER_CPU(unsigned char[BTS_BUFFER_SIZE], hwb_buffer); - -#define this_tracer per_cpu(hwb_tracer, smp_processor_id()) - -static int trace_hw_branches_enabled __read_mostly; -static int trace_hw_branches_suspended __read_mostly; -static struct trace_array *hw_branch_trace __read_mostly; - - -static void bts_trace_init_cpu(int cpu) -{ - per_cpu(hwb_tracer, cpu) = - ds_request_bts_cpu(cpu, per_cpu(hwb_buffer, cpu), - BTS_BUFFER_SIZE, NULL, (size_t)-1, - BTS_KERNEL); - - if (IS_ERR(per_cpu(hwb_tracer, cpu))) - per_cpu(hwb_tracer, cpu) = NULL; -} - -static int bts_trace_init(struct trace_array *tr) -{ - int cpu; - - hw_branch_trace = tr; - trace_hw_branches_enabled = 0; - - get_online_cpus(); - for_each_online_cpu(cpu) { - bts_trace_init_cpu(cpu); - - if (likely(per_cpu(hwb_tracer, cpu))) - trace_hw_branches_enabled = 1; - } - trace_hw_branches_suspended = 0; - put_online_cpus(); - - /* If we could not enable tracing on a single cpu, we fail. */ - return trace_hw_branches_enabled ? 0 : -EOPNOTSUPP; -} - -static void bts_trace_reset(struct trace_array *tr) -{ - int cpu; - - get_online_cpus(); - for_each_online_cpu(cpu) { - if (likely(per_cpu(hwb_tracer, cpu))) { - ds_release_bts(per_cpu(hwb_tracer, cpu)); - per_cpu(hwb_tracer, cpu) = NULL; - } - } - trace_hw_branches_enabled = 0; - trace_hw_branches_suspended = 0; - put_online_cpus(); -} - -static void bts_trace_start(struct trace_array *tr) -{ - int cpu; - - get_online_cpus(); - for_each_online_cpu(cpu) - if (likely(per_cpu(hwb_tracer, cpu))) - ds_resume_bts(per_cpu(hwb_tracer, cpu)); - trace_hw_branches_suspended = 0; - put_online_cpus(); -} - -static void bts_trace_stop(struct trace_array *tr) -{ - int cpu; - - get_online_cpus(); - for_each_online_cpu(cpu) - if (likely(per_cpu(hwb_tracer, cpu))) - ds_suspend_bts(per_cpu(hwb_tracer, cpu)); - trace_hw_branches_suspended = 1; - put_online_cpus(); -} - -static int __cpuinit bts_hotcpu_handler(struct notifier_block *nfb, - unsigned long action, void *hcpu) -{ - int cpu = (long)hcpu; - - switch (action) { - case CPU_ONLINE: - case CPU_DOWN_FAILED: - /* The notification is sent with interrupts enabled. */ - if (trace_hw_branches_enabled) { - bts_trace_init_cpu(cpu); - - if (trace_hw_branches_suspended && - likely(per_cpu(hwb_tracer, cpu))) - ds_suspend_bts(per_cpu(hwb_tracer, cpu)); - } - break; - - case CPU_DOWN_PREPARE: - /* The notification is sent with interrupts enabled. */ - if (likely(per_cpu(hwb_tracer, cpu))) { - ds_release_bts(per_cpu(hwb_tracer, cpu)); - per_cpu(hwb_tracer, cpu) = NULL; - } - } - - return NOTIFY_DONE; -} - -static struct notifier_block bts_hotcpu_notifier __cpuinitdata = { - .notifier_call = bts_hotcpu_handler -}; - -static void bts_trace_print_header(struct seq_file *m) -{ - seq_puts(m, "# CPU# TO <- FROM\n"); -} - -static enum print_line_t bts_trace_print_line(struct trace_iterator *iter) -{ - unsigned long symflags = TRACE_ITER_SYM_OFFSET; - struct trace_entry *entry = iter->ent; - struct trace_seq *seq = &iter->seq; - struct hw_branch_entry *it; - - trace_assign_type(it, entry); - - if (entry->type == TRACE_HW_BRANCHES) { - if (trace_seq_printf(seq, "%4d ", iter->cpu) && - seq_print_ip_sym(seq, it->to, symflags) && - trace_seq_printf(seq, "\t <- ") && - seq_print_ip_sym(seq, it->from, symflags) && - trace_seq_printf(seq, "\n")) - return TRACE_TYPE_HANDLED; - return TRACE_TYPE_PARTIAL_LINE; - } - return TRACE_TYPE_UNHANDLED; -} - -void trace_hw_branch(u64 from, u64 to) -{ - struct ftrace_event_call *call = &event_hw_branch; - struct trace_array *tr = hw_branch_trace; - struct ring_buffer_event *event; - struct ring_buffer *buf; - struct hw_branch_entry *entry; - unsigned long irq1; - int cpu; - - if (unlikely(!tr)) - return; - - if (unlikely(!trace_hw_branches_enabled)) - return; - - local_irq_save(irq1); - cpu = raw_smp_processor_id(); - if (atomic_inc_return(&tr->data[cpu]->disabled) != 1) - goto out; - - buf = tr->buffer; - event = trace_buffer_lock_reserve(buf, TRACE_HW_BRANCHES, - sizeof(*entry), 0, 0); - if (!event) - goto out; - entry = ring_buffer_event_data(event); - tracing_generic_entry_update(&entry->ent, 0, from); - entry->ent.type = TRACE_HW_BRANCHES; - entry->from = from; - entry->to = to; - if (!filter_check_discard(call, entry, buf, event)) - trace_buffer_unlock_commit(buf, event, 0, 0); - - out: - atomic_dec(&tr->data[cpu]->disabled); - local_irq_restore(irq1); -} - -static void trace_bts_at(const struct bts_trace *trace, void *at) -{ - struct bts_struct bts; - int err = 0; - - WARN_ON_ONCE(!trace->read); - if (!trace->read) - return; - - err = trace->read(this_tracer, at, &bts); - if (err < 0) - return; - - switch (bts.qualifier) { - case BTS_BRANCH: - trace_hw_branch(bts.variant.lbr.from, bts.variant.lbr.to); - break; - } -} - -/* - * Collect the trace on the current cpu and write it into the ftrace buffer. - * - * pre: tracing must be suspended on the current cpu - */ -static void trace_bts_cpu(void *arg) -{ - struct trace_array *tr = (struct trace_array *)arg; - const struct bts_trace *trace; - unsigned char *at; - - if (unlikely(!tr)) - return; - - if (unlikely(atomic_read(&tr->data[raw_smp_processor_id()]->disabled))) - return; - - if (unlikely(!this_tracer)) - return; - - trace = ds_read_bts(this_tracer); - if (!trace) - return; - - for (at = trace->ds.top; (void *)at < trace->ds.end; - at += trace->ds.size) - trace_bts_at(trace, at); - - for (at = trace->ds.begin; (void *)at < trace->ds.top; - at += trace->ds.size) - trace_bts_at(trace, at); -} - -static void trace_bts_prepare(struct trace_iterator *iter) -{ - int cpu; - - get_online_cpus(); - for_each_online_cpu(cpu) - if (likely(per_cpu(hwb_tracer, cpu))) - ds_suspend_bts(per_cpu(hwb_tracer, cpu)); - /* - * We need to collect the trace on the respective cpu since ftrace - * implicitly adds the record for the current cpu. - * Once that is more flexible, we could collect the data from any cpu. - */ - on_each_cpu(trace_bts_cpu, iter->tr, 1); - - for_each_online_cpu(cpu) - if (likely(per_cpu(hwb_tracer, cpu))) - ds_resume_bts(per_cpu(hwb_tracer, cpu)); - put_online_cpus(); -} - -static void trace_bts_close(struct trace_iterator *iter) -{ - tracing_reset_online_cpus(iter->tr); -} - -void trace_hw_branch_oops(void) -{ - if (this_tracer) { - ds_suspend_bts_noirq(this_tracer); - trace_bts_cpu(hw_branch_trace); - ds_resume_bts_noirq(this_tracer); - } -} - -struct tracer bts_tracer __read_mostly = -{ - .name = "hw-branch-tracer", - .init = bts_trace_init, - .reset = bts_trace_reset, - .print_header = bts_trace_print_header, - .print_line = bts_trace_print_line, - .start = bts_trace_start, - .stop = bts_trace_stop, - .open = trace_bts_prepare, - .close = trace_bts_close, -#ifdef CONFIG_FTRACE_SELFTEST - .selftest = trace_selftest_startup_hw_branches, -#endif /* CONFIG_FTRACE_SELFTEST */ -}; - -__init static int init_bts_trace(void) -{ - register_hotcpu_notifier(&bts_hotcpu_notifier); - return register_tracer(&bts_tracer); -} -device_initcall(init_bts_trace); diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 2974bc7..6fd486e 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -34,6 +34,9 @@ static int trace_type __read_mostly; static int save_lat_flag; +static void stop_irqsoff_tracer(struct trace_array *tr, int graph); +static int start_irqsoff_tracer(struct trace_array *tr, int graph); + #ifdef CONFIG_PREEMPT_TRACER static inline int preempt_trace(void) @@ -55,6 +58,23 @@ irq_trace(void) # define irq_trace() (0) #endif +#define TRACE_DISPLAY_GRAPH 1 + +static struct tracer_opt trace_opts[] = { +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + /* display latency trace as call graph */ + { TRACER_OPT(display-graph, TRACE_DISPLAY_GRAPH) }, +#endif + { } /* Empty entry */ +}; + +static struct tracer_flags tracer_flags = { + .val = 0, + .opts = trace_opts, +}; + +#define is_graph() (tracer_flags.val & TRACE_DISPLAY_GRAPH) + /* * Sequence count - we record it when starting a measurement and * skip the latency if the sequence has changed - some other section @@ -108,6 +128,202 @@ static struct ftrace_ops trace_ops __read_mostly = }; #endif /* CONFIG_FUNCTION_TRACER */ +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +static int irqsoff_set_flag(u32 old_flags, u32 bit, int set) +{ + int cpu; + + if (!(bit & TRACE_DISPLAY_GRAPH)) + return -EINVAL; + + if (!(is_graph() ^ set)) + return 0; + + stop_irqsoff_tracer(irqsoff_trace, !set); + + for_each_possible_cpu(cpu) + per_cpu(tracing_cpu, cpu) = 0; + + tracing_max_latency = 0; + tracing_reset_online_cpus(irqsoff_trace); + + return start_irqsoff_tracer(irqsoff_trace, set); +} + +static int irqsoff_graph_entry(struct ftrace_graph_ent *trace) +{ + struct trace_array *tr = irqsoff_trace; + struct trace_array_cpu *data; + unsigned long flags; + long disabled; + int ret; + int cpu; + int pc; + + cpu = raw_smp_processor_id(); + if (likely(!per_cpu(tracing_cpu, cpu))) + return 0; + + local_save_flags(flags); + /* slight chance to get a false positive on tracing_cpu */ + if (!irqs_disabled_flags(flags)) + return 0; + + data = tr->data[cpu]; + disabled = atomic_inc_return(&data->disabled); + + if (likely(disabled == 1)) { + pc = preempt_count(); + ret = __trace_graph_entry(tr, trace, flags, pc); + } else + ret = 0; + + atomic_dec(&data->disabled); + return ret; +} + +static void irqsoff_graph_return(struct ftrace_graph_ret *trace) +{ + struct trace_array *tr = irqsoff_trace; + struct trace_array_cpu *data; + unsigned long flags; + long disabled; + int cpu; + int pc; + + cpu = raw_smp_processor_id(); + if (likely(!per_cpu(tracing_cpu, cpu))) + return; + + local_save_flags(flags); + /* slight chance to get a false positive on tracing_cpu */ + if (!irqs_disabled_flags(flags)) + return; + + data = tr->data[cpu]; + disabled = atomic_inc_return(&data->disabled); + + if (likely(disabled == 1)) { + pc = preempt_count(); + __trace_graph_return(tr, trace, flags, pc); + } + + atomic_dec(&data->disabled); +} + +static void irqsoff_trace_open(struct trace_iterator *iter) +{ + if (is_graph()) + graph_trace_open(iter); + +} + +static void irqsoff_trace_close(struct trace_iterator *iter) +{ + if (iter->private) + graph_trace_close(iter); +} + +#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_CPU | \ + TRACE_GRAPH_PRINT_PROC) + +static enum print_line_t irqsoff_print_line(struct trace_iterator *iter) +{ + u32 flags = GRAPH_TRACER_FLAGS; + + if (trace_flags & TRACE_ITER_LATENCY_FMT) + flags |= TRACE_GRAPH_PRINT_DURATION; + else + flags |= TRACE_GRAPH_PRINT_ABS_TIME; + + /* + * In graph mode call the graph tracer output function, + * otherwise go with the TRACE_FN event handler + */ + if (is_graph()) + return print_graph_function_flags(iter, flags); + + return TRACE_TYPE_UNHANDLED; +} + +static void irqsoff_print_header(struct seq_file *s) +{ + if (is_graph()) { + struct trace_iterator *iter = s->private; + u32 flags = GRAPH_TRACER_FLAGS; + + if (trace_flags & TRACE_ITER_LATENCY_FMT) { + /* print nothing if the buffers are empty */ + if (trace_empty(iter)) + return; + + print_trace_header(s, iter); + flags |= TRACE_GRAPH_PRINT_DURATION; + } else + flags |= TRACE_GRAPH_PRINT_ABS_TIME; + + print_graph_headers_flags(s, flags); + } else + trace_default_header(s); +} + +static void +trace_graph_function(struct trace_array *tr, + unsigned long ip, unsigned long flags, int pc) +{ + u64 time = trace_clock_local(); + struct ftrace_graph_ent ent = { + .func = ip, + .depth = 0, + }; + struct ftrace_graph_ret ret = { + .func = ip, + .depth = 0, + .calltime = time, + .rettime = time, + }; + + __trace_graph_entry(tr, &ent, flags, pc); + __trace_graph_return(tr, &ret, flags, pc); +} + +static void +__trace_function(struct trace_array *tr, + unsigned long ip, unsigned long parent_ip, + unsigned long flags, int pc) +{ + if (!is_graph()) + trace_function(tr, ip, parent_ip, flags, pc); + else { + trace_graph_function(tr, parent_ip, flags, pc); + trace_graph_function(tr, ip, flags, pc); + } +} + +#else +#define __trace_function trace_function + +static int irqsoff_set_flag(u32 old_flags, u32 bit, int set) +{ + return -EINVAL; +} + +static int irqsoff_graph_entry(struct ftrace_graph_ent *trace) +{ + return -1; +} + +static enum print_line_t irqsoff_print_line(struct trace_iterator *iter) +{ + return TRACE_TYPE_UNHANDLED; +} + +static void irqsoff_graph_return(struct ftrace_graph_ret *trace) { } +static void irqsoff_print_header(struct seq_file *s) { } +static void irqsoff_trace_open(struct trace_iterator *iter) { } +static void irqsoff_trace_close(struct trace_iterator *iter) { } +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + /* * Should this new latency be reported/recorded? */ @@ -150,7 +366,7 @@ check_critical_timing(struct trace_array *tr, if (!report_latency(delta)) goto out_unlock; - trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); + __trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); /* Skip 5 functions to get to the irq/preempt enable function */ __trace_stack(tr, flags, 5, pc); @@ -172,7 +388,7 @@ out_unlock: out: data->critical_sequence = max_sequence; data->preempt_timestamp = ftrace_now(cpu); - trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); + __trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); } static inline void @@ -204,7 +420,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip) local_save_flags(flags); - trace_function(tr, ip, parent_ip, flags, preempt_count()); + __trace_function(tr, ip, parent_ip, flags, preempt_count()); per_cpu(tracing_cpu, cpu) = 1; @@ -238,7 +454,7 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip) atomic_inc(&data->disabled); local_save_flags(flags); - trace_function(tr, ip, parent_ip, flags, preempt_count()); + __trace_function(tr, ip, parent_ip, flags, preempt_count()); check_critical_timing(tr, data, parent_ip ? : ip, cpu); data->critical_start = 0; atomic_dec(&data->disabled); @@ -347,19 +563,32 @@ void trace_preempt_off(unsigned long a0, unsigned long a1) } #endif /* CONFIG_PREEMPT_TRACER */ -static void start_irqsoff_tracer(struct trace_array *tr) +static int start_irqsoff_tracer(struct trace_array *tr, int graph) { - register_ftrace_function(&trace_ops); - if (tracing_is_enabled()) + int ret = 0; + + if (!graph) + ret = register_ftrace_function(&trace_ops); + else + ret = register_ftrace_graph(&irqsoff_graph_return, + &irqsoff_graph_entry); + + if (!ret && tracing_is_enabled()) tracer_enabled = 1; else tracer_enabled = 0; + + return ret; } -static void stop_irqsoff_tracer(struct trace_array *tr) +static void stop_irqsoff_tracer(struct trace_array *tr, int graph) { tracer_enabled = 0; - unregister_ftrace_function(&trace_ops); + + if (!graph) + unregister_ftrace_function(&trace_ops); + else + unregister_ftrace_graph(); } static void __irqsoff_tracer_init(struct trace_array *tr) @@ -372,12 +601,14 @@ static void __irqsoff_tracer_init(struct trace_array *tr) /* make sure that the tracer is visible */ smp_wmb(); tracing_reset_online_cpus(tr); - start_irqsoff_tracer(tr); + + if (start_irqsoff_tracer(tr, is_graph())) + printk(KERN_ERR "failed to start irqsoff tracer\n"); } static void irqsoff_tracer_reset(struct trace_array *tr) { - stop_irqsoff_tracer(tr); + stop_irqsoff_tracer(tr, is_graph()); if (!save_lat_flag) trace_flags &= ~TRACE_ITER_LATENCY_FMT; @@ -409,9 +640,15 @@ static struct tracer irqsoff_tracer __read_mostly = .start = irqsoff_tracer_start, .stop = irqsoff_tracer_stop, .print_max = 1, + .print_header = irqsoff_print_header, + .print_line = irqsoff_print_line, + .flags = &tracer_flags, + .set_flag = irqsoff_set_flag, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_irqsoff, #endif + .open = irqsoff_trace_open, + .close = irqsoff_trace_close, }; # define register_irqsoff(trace) register_tracer(&trace) #else @@ -435,9 +672,15 @@ static struct tracer preemptoff_tracer __read_mostly = .start = irqsoff_tracer_start, .stop = irqsoff_tracer_stop, .print_max = 1, + .print_header = irqsoff_print_header, + .print_line = irqsoff_print_line, + .flags = &tracer_flags, + .set_flag = irqsoff_set_flag, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_preemptoff, #endif + .open = irqsoff_trace_open, + .close = irqsoff_trace_close, }; # define register_preemptoff(trace) register_tracer(&trace) #else @@ -463,9 +706,15 @@ static struct tracer preemptirqsoff_tracer __read_mostly = .start = irqsoff_tracer_start, .stop = irqsoff_tracer_stop, .print_max = 1, + .print_header = irqsoff_print_header, + .print_line = irqsoff_print_line, + .flags = &tracer_flags, + .set_flag = irqsoff_set_flag, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_preemptirqsoff, #endif + .open = irqsoff_trace_open, + .close = irqsoff_trace_close, }; # define register_preemptirqsoff(trace) register_tracer(&trace) diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 1251e36..f52b5f5 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -29,6 +29,8 @@ #include <linux/ctype.h> #include <linux/ptrace.h> #include <linux/perf_event.h> +#include <linux/stringify.h> +#include <asm/bitsperlong.h> #include "trace.h" #include "trace_output.h" @@ -40,7 +42,6 @@ /* Reserved field names */ #define FIELD_STRING_IP "__probe_ip" -#define FIELD_STRING_NARGS "__probe_nargs" #define FIELD_STRING_RETIP "__probe_ret_ip" #define FIELD_STRING_FUNC "__probe_func" @@ -52,56 +53,102 @@ const char *reserved_field_names[] = { "common_tgid", "common_lock_depth", FIELD_STRING_IP, - FIELD_STRING_NARGS, FIELD_STRING_RETIP, FIELD_STRING_FUNC, }; -struct fetch_func { - unsigned long (*func)(struct pt_regs *, void *); +/* Printing function type */ +typedef int (*print_type_func_t)(struct trace_seq *, const char *, void *); +#define PRINT_TYPE_FUNC_NAME(type) print_type_##type +#define PRINT_TYPE_FMT_NAME(type) print_type_format_##type + +/* Printing in basic type function template */ +#define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt, cast) \ +static __kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \ + const char *name, void *data)\ +{ \ + return trace_seq_printf(s, " %s=" fmt, name, (cast)*(type *)data);\ +} \ +static const char PRINT_TYPE_FMT_NAME(type)[] = fmt; + +DEFINE_BASIC_PRINT_TYPE_FUNC(u8, "%x", unsigned int) +DEFINE_BASIC_PRINT_TYPE_FUNC(u16, "%x", unsigned int) +DEFINE_BASIC_PRINT_TYPE_FUNC(u32, "%lx", unsigned long) +DEFINE_BASIC_PRINT_TYPE_FUNC(u64, "%llx", unsigned long long) +DEFINE_BASIC_PRINT_TYPE_FUNC(s8, "%d", int) +DEFINE_BASIC_PRINT_TYPE_FUNC(s16, "%d", int) +DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%ld", long) +DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%lld", long long) + +/* Data fetch function type */ +typedef void (*fetch_func_t)(struct pt_regs *, void *, void *); + +struct fetch_param { + fetch_func_t fn; void *data; }; -static __kprobes unsigned long call_fetch(struct fetch_func *f, - struct pt_regs *regs) +static __kprobes void call_fetch(struct fetch_param *fprm, + struct pt_regs *regs, void *dest) { - return f->func(regs, f->data); + return fprm->fn(regs, fprm->data, dest); } -/* fetch handlers */ -static __kprobes unsigned long fetch_register(struct pt_regs *regs, - void *offset) -{ - return regs_get_register(regs, (unsigned int)((unsigned long)offset)); +#define FETCH_FUNC_NAME(kind, type) fetch_##kind##_##type +/* + * Define macro for basic types - we don't need to define s* types, because + * we have to care only about bitwidth at recording time. + */ +#define DEFINE_BASIC_FETCH_FUNCS(kind) \ +DEFINE_FETCH_##kind(u8) \ +DEFINE_FETCH_##kind(u16) \ +DEFINE_FETCH_##kind(u32) \ +DEFINE_FETCH_##kind(u64) + +#define CHECK_BASIC_FETCH_FUNCS(kind, fn) \ + ((FETCH_FUNC_NAME(kind, u8) == fn) || \ + (FETCH_FUNC_NAME(kind, u16) == fn) || \ + (FETCH_FUNC_NAME(kind, u32) == fn) || \ + (FETCH_FUNC_NAME(kind, u64) == fn)) + +/* Data fetch function templates */ +#define DEFINE_FETCH_reg(type) \ +static __kprobes void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, \ + void *offset, void *dest) \ +{ \ + *(type *)dest = (type)regs_get_register(regs, \ + (unsigned int)((unsigned long)offset)); \ } - -static __kprobes unsigned long fetch_stack(struct pt_regs *regs, - void *num) -{ - return regs_get_kernel_stack_nth(regs, - (unsigned int)((unsigned long)num)); +DEFINE_BASIC_FETCH_FUNCS(reg) + +#define DEFINE_FETCH_stack(type) \ +static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\ + void *offset, void *dest) \ +{ \ + *(type *)dest = (type)regs_get_kernel_stack_nth(regs, \ + (unsigned int)((unsigned long)offset)); \ } +DEFINE_BASIC_FETCH_FUNCS(stack) -static __kprobes unsigned long fetch_memory(struct pt_regs *regs, void *addr) -{ - unsigned long retval; - - if (probe_kernel_address(addr, retval)) - return 0; - return retval; +#define DEFINE_FETCH_retval(type) \ +static __kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs,\ + void *dummy, void *dest) \ +{ \ + *(type *)dest = (type)regs_return_value(regs); \ } - -static __kprobes unsigned long fetch_retvalue(struct pt_regs *regs, - void *dummy) -{ - return regs_return_value(regs); -} - -static __kprobes unsigned long fetch_stack_address(struct pt_regs *regs, - void *dummy) -{ - return kernel_stack_pointer(regs); +DEFINE_BASIC_FETCH_FUNCS(retval) + +#define DEFINE_FETCH_memory(type) \ +static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\ + void *addr, void *dest) \ +{ \ + type retval; \ + if (probe_kernel_address(addr, retval)) \ + *(type *)dest = 0; \ + else \ + *(type *)dest = retval; \ } +DEFINE_BASIC_FETCH_FUNCS(memory) /* Memory fetching by symbol */ struct symbol_cache { @@ -145,51 +192,126 @@ static struct symbol_cache *alloc_symbol_cache(const char *sym, long offset) return sc; } -static __kprobes unsigned long fetch_symbol(struct pt_regs *regs, void *data) -{ - struct symbol_cache *sc = data; - - if (sc->addr) - return fetch_memory(regs, (void *)sc->addr); - else - return 0; +#define DEFINE_FETCH_symbol(type) \ +static __kprobes void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs,\ + void *data, void *dest) \ +{ \ + struct symbol_cache *sc = data; \ + if (sc->addr) \ + fetch_memory_##type(regs, (void *)sc->addr, dest); \ + else \ + *(type *)dest = 0; \ } +DEFINE_BASIC_FETCH_FUNCS(symbol) -/* Special indirect memory access interface */ -struct indirect_fetch_data { - struct fetch_func orig; +/* Dereference memory access function */ +struct deref_fetch_param { + struct fetch_param orig; long offset; }; -static __kprobes unsigned long fetch_indirect(struct pt_regs *regs, void *data) -{ - struct indirect_fetch_data *ind = data; - unsigned long addr; - - addr = call_fetch(&ind->orig, regs); - if (addr) { - addr += ind->offset; - return fetch_memory(regs, (void *)addr); - } else - return 0; +#define DEFINE_FETCH_deref(type) \ +static __kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs,\ + void *data, void *dest) \ +{ \ + struct deref_fetch_param *dprm = data; \ + unsigned long addr; \ + call_fetch(&dprm->orig, regs, &addr); \ + if (addr) { \ + addr += dprm->offset; \ + fetch_memory_##type(regs, (void *)addr, dest); \ + } else \ + *(type *)dest = 0; \ } +DEFINE_BASIC_FETCH_FUNCS(deref) -static __kprobes void free_indirect_fetch_data(struct indirect_fetch_data *data) +static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data) { - if (data->orig.func == fetch_indirect) - free_indirect_fetch_data(data->orig.data); - else if (data->orig.func == fetch_symbol) + if (CHECK_BASIC_FETCH_FUNCS(deref, data->orig.fn)) + free_deref_fetch_param(data->orig.data); + else if (CHECK_BASIC_FETCH_FUNCS(symbol, data->orig.fn)) free_symbol_cache(data->orig.data); kfree(data); } +/* Default (unsigned long) fetch type */ +#define __DEFAULT_FETCH_TYPE(t) u##t +#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t) +#define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG) +#define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE) + +#define ASSIGN_FETCH_FUNC(kind, type) \ + .kind = FETCH_FUNC_NAME(kind, type) + +#define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \ + {.name = #ptype, \ + .size = sizeof(ftype), \ + .is_signed = sign, \ + .print = PRINT_TYPE_FUNC_NAME(ptype), \ + .fmt = PRINT_TYPE_FMT_NAME(ptype), \ +ASSIGN_FETCH_FUNC(reg, ftype), \ +ASSIGN_FETCH_FUNC(stack, ftype), \ +ASSIGN_FETCH_FUNC(retval, ftype), \ +ASSIGN_FETCH_FUNC(memory, ftype), \ +ASSIGN_FETCH_FUNC(symbol, ftype), \ +ASSIGN_FETCH_FUNC(deref, ftype), \ + } + +/* Fetch type information table */ +static const struct fetch_type { + const char *name; /* Name of type */ + size_t size; /* Byte size of type */ + int is_signed; /* Signed flag */ + print_type_func_t print; /* Print functions */ + const char *fmt; /* Fromat string */ + /* Fetch functions */ + fetch_func_t reg; + fetch_func_t stack; + fetch_func_t retval; + fetch_func_t memory; + fetch_func_t symbol; + fetch_func_t deref; +} fetch_type_table[] = { + ASSIGN_FETCH_TYPE(u8, u8, 0), + ASSIGN_FETCH_TYPE(u16, u16, 0), + ASSIGN_FETCH_TYPE(u32, u32, 0), + ASSIGN_FETCH_TYPE(u64, u64, 0), + ASSIGN_FETCH_TYPE(s8, u8, 1), + ASSIGN_FETCH_TYPE(s16, u16, 1), + ASSIGN_FETCH_TYPE(s32, u32, 1), + ASSIGN_FETCH_TYPE(s64, u64, 1), +}; + +static const struct fetch_type *find_fetch_type(const char *type) +{ + int i; + + if (!type) + type = DEFAULT_FETCH_TYPE_STR; + + for (i = 0; i < ARRAY_SIZE(fetch_type_table); i++) + if (strcmp(type, fetch_type_table[i].name) == 0) + return &fetch_type_table[i]; + return NULL; +} + +/* Special function : only accept unsigned long */ +static __kprobes void fetch_stack_address(struct pt_regs *regs, + void *dummy, void *dest) +{ + *(unsigned long *)dest = kernel_stack_pointer(regs); +} + /** * Kprobe event core functions */ struct probe_arg { - struct fetch_func fetch; - const char *name; + struct fetch_param fetch; + unsigned int offset; /* Offset from argument entry */ + const char *name; /* Name of this argument */ + const char *comm; /* Command of this argument */ + const struct fetch_type *type; /* Type of this argument */ }; /* Flags for trace_probe */ @@ -202,8 +324,9 @@ struct trace_probe { unsigned long nhit; unsigned int flags; /* For TP_FLAG_* */ const char *symbol; /* symbol name */ + struct ftrace_event_class class; struct ftrace_event_call call; - struct trace_event event; + ssize_t size; /* trace entry size */ unsigned int nr_args; struct probe_arg args[]; }; @@ -212,6 +335,7 @@ struct trace_probe { (offsetof(struct trace_probe, args) + \ (sizeof(struct probe_arg) * (n))) + static __kprobes int probe_is_return(struct trace_probe *tp) { return tp->rp.handler != NULL; @@ -222,49 +346,6 @@ static __kprobes const char *probe_symbol(struct trace_probe *tp) return tp->symbol ? tp->symbol : "unknown"; } -static int probe_arg_string(char *buf, size_t n, struct fetch_func *ff) -{ - int ret = -EINVAL; - - if (ff->func == fetch_register) { - const char *name; - name = regs_query_register_name((unsigned int)((long)ff->data)); - ret = snprintf(buf, n, "%%%s", name); - } else if (ff->func == fetch_stack) - ret = snprintf(buf, n, "$stack%lu", (unsigned long)ff->data); - else if (ff->func == fetch_memory) - ret = snprintf(buf, n, "@0x%p", ff->data); - else if (ff->func == fetch_symbol) { - struct symbol_cache *sc = ff->data; - if (sc->offset) - ret = snprintf(buf, n, "@%s%+ld", sc->symbol, - sc->offset); - else - ret = snprintf(buf, n, "@%s", sc->symbol); - } else if (ff->func == fetch_retvalue) - ret = snprintf(buf, n, "$retval"); - else if (ff->func == fetch_stack_address) - ret = snprintf(buf, n, "$stack"); - else if (ff->func == fetch_indirect) { - struct indirect_fetch_data *id = ff->data; - size_t l = 0; - ret = snprintf(buf, n, "%+ld(", id->offset); - if (ret >= n) - goto end; - l += ret; - ret = probe_arg_string(buf + l, n - l, &id->orig); - if (ret < 0) - goto end; - l += ret; - ret = snprintf(buf + l, n - l, ")"); - ret += l; - } -end: - if (ret >= n) - return -ENOSPC; - return ret; -} - static int register_probe_event(struct trace_probe *tp); static void unregister_probe_event(struct trace_probe *tp); @@ -323,6 +404,7 @@ static struct trace_probe *alloc_trace_probe(const char *group, goto error; } + tp->call.class = &tp->class; tp->call.name = kstrdup(event, GFP_KERNEL); if (!tp->call.name) goto error; @@ -332,8 +414,8 @@ static struct trace_probe *alloc_trace_probe(const char *group, goto error; } - tp->call.system = kstrdup(group, GFP_KERNEL); - if (!tp->call.system) + tp->class.system = kstrdup(group, GFP_KERNEL); + if (!tp->class.system) goto error; INIT_LIST_HEAD(&tp->list); @@ -347,11 +429,12 @@ error: static void free_probe_arg(struct probe_arg *arg) { - if (arg->fetch.func == fetch_symbol) + if (CHECK_BASIC_FETCH_FUNCS(deref, arg->fetch.fn)) + free_deref_fetch_param(arg->fetch.data); + else if (CHECK_BASIC_FETCH_FUNCS(symbol, arg->fetch.fn)) free_symbol_cache(arg->fetch.data); - else if (arg->fetch.func == fetch_indirect) - free_indirect_fetch_data(arg->fetch.data); kfree(arg->name); + kfree(arg->comm); } static void free_trace_probe(struct trace_probe *tp) @@ -361,7 +444,7 @@ static void free_trace_probe(struct trace_probe *tp) for (i = 0; i < tp->nr_args; i++) free_probe_arg(&tp->args[i]); - kfree(tp->call.system); + kfree(tp->call.class->system); kfree(tp->call.name); kfree(tp->symbol); kfree(tp); @@ -374,7 +457,7 @@ static struct trace_probe *find_probe_event(const char *event, list_for_each_entry(tp, &probe_list, list) if (strcmp(tp->call.name, event) == 0 && - strcmp(tp->call.system, group) == 0) + strcmp(tp->call.class->system, group) == 0) return tp; return NULL; } @@ -399,7 +482,7 @@ static int register_trace_probe(struct trace_probe *tp) mutex_lock(&probe_lock); /* register as an event */ - old_tp = find_probe_event(tp->call.name, tp->call.system); + old_tp = find_probe_event(tp->call.name, tp->call.class->system); if (old_tp) { /* delete old event */ unregister_trace_probe(old_tp); @@ -457,28 +540,30 @@ static int split_symbol_offset(char *symbol, unsigned long *offset) #define PARAM_MAX_ARGS 16 #define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long)) -static int parse_probe_vars(char *arg, struct fetch_func *ff, int is_return) +static int parse_probe_vars(char *arg, const struct fetch_type *t, + struct fetch_param *f, int is_return) { int ret = 0; unsigned long param; if (strcmp(arg, "retval") == 0) { - if (is_return) { - ff->func = fetch_retvalue; - ff->data = NULL; - } else + if (is_return) + f->fn = t->retval; + else ret = -EINVAL; } else if (strncmp(arg, "stack", 5) == 0) { if (arg[5] == '\0') { - ff->func = fetch_stack_address; - ff->data = NULL; + if (strcmp(t->name, DEFAULT_FETCH_TYPE_STR) == 0) + f->fn = fetch_stack_address; + else + ret = -EINVAL; } else if (isdigit(arg[5])) { ret = strict_strtoul(arg + 5, 10, ¶m); if (ret || param > PARAM_MAX_STACK) ret = -EINVAL; else { - ff->func = fetch_stack; - ff->data = (void *)param; + f->fn = t->stack; + f->data = (void *)param; } } else ret = -EINVAL; @@ -488,7 +573,8 @@ static int parse_probe_vars(char *arg, struct fetch_func *ff, int is_return) } /* Recursive argument parser */ -static int __parse_probe_arg(char *arg, struct fetch_func *ff, int is_return) +static int __parse_probe_arg(char *arg, const struct fetch_type *t, + struct fetch_param *f, int is_return) { int ret = 0; unsigned long param; @@ -497,13 +583,13 @@ static int __parse_probe_arg(char *arg, struct fetch_func *ff, int is_return) switch (arg[0]) { case '$': - ret = parse_probe_vars(arg + 1, ff, is_return); + ret = parse_probe_vars(arg + 1, t, f, is_return); break; case '%': /* named register */ ret = regs_query_register_offset(arg + 1); if (ret >= 0) { - ff->func = fetch_register; - ff->data = (void *)(unsigned long)ret; + f->fn = t->reg; + f->data = (void *)(unsigned long)ret; ret = 0; } break; @@ -512,26 +598,22 @@ static int __parse_probe_arg(char *arg, struct fetch_func *ff, int is_return) ret = strict_strtoul(arg + 1, 0, ¶m); if (ret) break; - ff->func = fetch_memory; - ff->data = (void *)param; + f->fn = t->memory; + f->data = (void *)param; } else { ret = split_symbol_offset(arg + 1, &offset); if (ret) break; - ff->data = alloc_symbol_cache(arg + 1, offset); - if (ff->data) - ff->func = fetch_symbol; - else - ret = -EINVAL; + f->data = alloc_symbol_cache(arg + 1, offset); + if (f->data) + f->fn = t->symbol; } break; - case '+': /* indirect memory */ + case '+': /* deref memory */ case '-': tmp = strchr(arg, '('); - if (!tmp) { - ret = -EINVAL; + if (!tmp) break; - } *tmp = '\0'; ret = strict_strtol(arg + 1, 0, &offset); if (ret) @@ -541,38 +623,58 @@ static int __parse_probe_arg(char *arg, struct fetch_func *ff, int is_return) arg = tmp + 1; tmp = strrchr(arg, ')'); if (tmp) { - struct indirect_fetch_data *id; + struct deref_fetch_param *dprm; + const struct fetch_type *t2 = find_fetch_type(NULL); *tmp = '\0'; - id = kzalloc(sizeof(struct indirect_fetch_data), - GFP_KERNEL); - if (!id) + dprm = kzalloc(sizeof(struct deref_fetch_param), + GFP_KERNEL); + if (!dprm) return -ENOMEM; - id->offset = offset; - ret = __parse_probe_arg(arg, &id->orig, is_return); + dprm->offset = offset; + ret = __parse_probe_arg(arg, t2, &dprm->orig, + is_return); if (ret) - kfree(id); + kfree(dprm); else { - ff->func = fetch_indirect; - ff->data = (void *)id; + f->fn = t->deref; + f->data = (void *)dprm; } - } else - ret = -EINVAL; + } break; - default: - /* TODO: support custom handler */ - ret = -EINVAL; } + if (!ret && !f->fn) + ret = -EINVAL; return ret; } /* String length checking wrapper */ -static int parse_probe_arg(char *arg, struct fetch_func *ff, int is_return) +static int parse_probe_arg(char *arg, struct trace_probe *tp, + struct probe_arg *parg, int is_return) { + const char *t; + if (strlen(arg) > MAX_ARGSTR_LEN) { pr_info("Argument is too long.: %s\n", arg); return -ENOSPC; } - return __parse_probe_arg(arg, ff, is_return); + parg->comm = kstrdup(arg, GFP_KERNEL); + if (!parg->comm) { + pr_info("Failed to allocate memory for command '%s'.\n", arg); + return -ENOMEM; + } + t = strchr(parg->comm, ':'); + if (t) { + arg[t - parg->comm] = '\0'; + t++; + } + parg->type = find_fetch_type(t); + if (!parg->type) { + pr_info("Unsupported type: %s\n", t); + return -EINVAL; + } + parg->offset = tp->size; + tp->size += parg->type->size; + return __parse_probe_arg(arg, parg->type, &parg->fetch, is_return); } /* Return 1 if name is reserved or already used by another argument */ @@ -602,15 +704,18 @@ static int create_trace_probe(int argc, char **argv) * @ADDR : fetch memory at ADDR (ADDR should be in kernel) * @SYM[+|-offs] : fetch memory at SYM +|- offs (SYM is a data symbol) * %REG : fetch register REG - * Indirect memory fetch: + * Dereferencing memory fetch: * +|-offs(ARG) : fetch memory at ARG +|- offs address. * Alias name of args: * NAME=FETCHARG : set NAME as alias of FETCHARG. + * Type of args: + * FETCHARG:TYPE : use TYPE instead of unsigned long. */ struct trace_probe *tp; int i, ret = 0; int is_return = 0, is_delete = 0; - char *symbol = NULL, *event = NULL, *arg = NULL, *group = NULL; + char *symbol = NULL, *event = NULL, *group = NULL; + char *arg, *tmp; unsigned long offset = 0; void *addr = NULL; char buf[MAX_EVENT_NAME_LEN]; @@ -723,13 +828,6 @@ static int create_trace_probe(int argc, char **argv) else arg = argv[i]; - if (conflict_field_name(argv[i], tp->args, i)) { - pr_info("Argument%d name '%s' conflicts with " - "another field.\n", i, argv[i]); - ret = -EINVAL; - goto error; - } - tp->args[i].name = kstrdup(argv[i], GFP_KERNEL); if (!tp->args[i].name) { pr_info("Failed to allocate argument%d name '%s'.\n", @@ -737,9 +835,19 @@ static int create_trace_probe(int argc, char **argv) ret = -ENOMEM; goto error; } + tmp = strchr(tp->args[i].name, ':'); + if (tmp) + *tmp = '_'; /* convert : to _ */ + + if (conflict_field_name(tp->args[i].name, tp->args, i)) { + pr_info("Argument%d name '%s' conflicts with " + "another field.\n", i, argv[i]); + ret = -EINVAL; + goto error; + } /* Parse fetch argument */ - ret = parse_probe_arg(arg, &tp->args[i].fetch, is_return); + ret = parse_probe_arg(arg, tp, &tp->args[i], is_return); if (ret) { pr_info("Parse error at argument%d. (%d)\n", i, ret); kfree(tp->args[i].name); @@ -794,11 +902,10 @@ static void probes_seq_stop(struct seq_file *m, void *v) static int probes_seq_show(struct seq_file *m, void *v) { struct trace_probe *tp = v; - int i, ret; - char buf[MAX_ARGSTR_LEN + 1]; + int i; seq_printf(m, "%c", probe_is_return(tp) ? 'r' : 'p'); - seq_printf(m, ":%s/%s", tp->call.system, tp->call.name); + seq_printf(m, ":%s/%s", tp->call.class->system, tp->call.name); if (!tp->symbol) seq_printf(m, " 0x%p", tp->rp.kp.addr); @@ -807,15 +914,10 @@ static int probes_seq_show(struct seq_file *m, void *v) else seq_printf(m, " %s", probe_symbol(tp)); - for (i = 0; i < tp->nr_args; i++) { - ret = probe_arg_string(buf, MAX_ARGSTR_LEN, &tp->args[i].fetch); - if (ret < 0) { - pr_warning("Argument%d decoding error(%d).\n", i, ret); - return ret; - } - seq_printf(m, " %s=%s", tp->args[i].name, buf); - } + for (i = 0; i < tp->nr_args; i++) + seq_printf(m, " %s=%s", tp->args[i].name, tp->args[i].comm); seq_printf(m, "\n"); + return 0; } @@ -945,9 +1047,10 @@ static const struct file_operations kprobe_profile_ops = { static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) { struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); - struct kprobe_trace_entry *entry; + struct kprobe_trace_entry_head *entry; struct ring_buffer_event *event; struct ring_buffer *buffer; + u8 *data; int size, i, pc; unsigned long irq_flags; struct ftrace_event_call *call = &tp->call; @@ -957,18 +1060,18 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) local_save_flags(irq_flags); pc = preempt_count(); - size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args); + size = sizeof(*entry) + tp->size; - event = trace_current_buffer_lock_reserve(&buffer, call->id, size, - irq_flags, pc); + event = trace_current_buffer_lock_reserve(&buffer, call->event.type, + size, irq_flags, pc); if (!event) return; entry = ring_buffer_event_data(event); - entry->nargs = tp->nr_args; entry->ip = (unsigned long)kp->addr; + data = (u8 *)&entry[1]; for (i = 0; i < tp->nr_args; i++) - entry->args[i] = call_fetch(&tp->args[i].fetch, regs); + call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset); if (!filter_current_check_discard(buffer, call, entry, event)) trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); @@ -979,9 +1082,10 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri, struct pt_regs *regs) { struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); - struct kretprobe_trace_entry *entry; + struct kretprobe_trace_entry_head *entry; struct ring_buffer_event *event; struct ring_buffer *buffer; + u8 *data; int size, i, pc; unsigned long irq_flags; struct ftrace_event_call *call = &tp->call; @@ -989,19 +1093,19 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri, local_save_flags(irq_flags); pc = preempt_count(); - size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args); + size = sizeof(*entry) + tp->size; - event = trace_current_buffer_lock_reserve(&buffer, call->id, size, - irq_flags, pc); + event = trace_current_buffer_lock_reserve(&buffer, call->event.type, + size, irq_flags, pc); if (!event) return; entry = ring_buffer_event_data(event); - entry->nargs = tp->nr_args; entry->func = (unsigned long)tp->rp.kp.addr; entry->ret_ip = (unsigned long)ri->ret_addr; + data = (u8 *)&entry[1]; for (i = 0; i < tp->nr_args; i++) - entry->args[i] = call_fetch(&tp->args[i].fetch, regs); + call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset); if (!filter_current_check_discard(buffer, call, entry, event)) trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); @@ -1009,17 +1113,17 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri, /* Event entry printers */ enum print_line_t -print_kprobe_event(struct trace_iterator *iter, int flags) +print_kprobe_event(struct trace_iterator *iter, int flags, + struct trace_event *event) { - struct kprobe_trace_entry *field; + struct kprobe_trace_entry_head *field; struct trace_seq *s = &iter->seq; - struct trace_event *event; struct trace_probe *tp; + u8 *data; int i; - field = (struct kprobe_trace_entry *)iter->ent; - event = ftrace_find_event(field->ent.type); - tp = container_of(event, struct trace_probe, event); + field = (struct kprobe_trace_entry_head *)iter->ent; + tp = container_of(event, struct trace_probe, call.event); if (!trace_seq_printf(s, "%s: (", tp->call.name)) goto partial; @@ -1030,9 +1134,10 @@ print_kprobe_event(struct trace_iterator *iter, int flags) if (!trace_seq_puts(s, ")")) goto partial; - for (i = 0; i < field->nargs; i++) - if (!trace_seq_printf(s, " %s=%lx", - tp->args[i].name, field->args[i])) + data = (u8 *)&field[1]; + for (i = 0; i < tp->nr_args; i++) + if (!tp->args[i].type->print(s, tp->args[i].name, + data + tp->args[i].offset)) goto partial; if (!trace_seq_puts(s, "\n")) @@ -1044,17 +1149,17 @@ partial: } enum print_line_t -print_kretprobe_event(struct trace_iterator *iter, int flags) +print_kretprobe_event(struct trace_iterator *iter, int flags, + struct trace_event *event) { - struct kretprobe_trace_entry *field; + struct kretprobe_trace_entry_head *field; struct trace_seq *s = &iter->seq; - struct trace_event *event; struct trace_probe *tp; + u8 *data; int i; - field = (struct kretprobe_trace_entry *)iter->ent; - event = ftrace_find_event(field->ent.type); - tp = container_of(event, struct trace_probe, event); + field = (struct kretprobe_trace_entry_head *)iter->ent; + tp = container_of(event, struct trace_probe, call.event); if (!trace_seq_printf(s, "%s: (", tp->call.name)) goto partial; @@ -1071,9 +1176,10 @@ print_kretprobe_event(struct trace_iterator *iter, int flags) if (!trace_seq_puts(s, ")")) goto partial; - for (i = 0; i < field->nargs; i++) - if (!trace_seq_printf(s, " %s=%lx", - tp->args[i].name, field->args[i])) + data = (u8 *)&field[1]; + for (i = 0; i < tp->nr_args; i++) + if (!tp->args[i].type->print(s, tp->args[i].name, + data + tp->args[i].offset)) goto partial; if (!trace_seq_puts(s, "\n")) @@ -1110,8 +1216,6 @@ static void probe_event_disable(struct ftrace_event_call *call) static int probe_event_raw_init(struct ftrace_event_call *event_call) { - INIT_LIST_HEAD(&event_call->fields); - return 0; } @@ -1129,29 +1233,43 @@ static int probe_event_raw_init(struct ftrace_event_call *event_call) static int kprobe_event_define_fields(struct ftrace_event_call *event_call) { int ret, i; - struct kprobe_trace_entry field; + struct kprobe_trace_entry_head field; struct trace_probe *tp = (struct trace_probe *)event_call->data; DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0); - DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1); /* Set argument names as fields */ - for (i = 0; i < tp->nr_args; i++) - DEFINE_FIELD(unsigned long, args[i], tp->args[i].name, 0); + for (i = 0; i < tp->nr_args; i++) { + ret = trace_define_field(event_call, tp->args[i].type->name, + tp->args[i].name, + sizeof(field) + tp->args[i].offset, + tp->args[i].type->size, + tp->args[i].type->is_signed, + FILTER_OTHER); + if (ret) + return ret; + } return 0; } static int kretprobe_event_define_fields(struct ftrace_event_call *event_call) { int ret, i; - struct kretprobe_trace_entry field; + struct kretprobe_trace_entry_head field; struct trace_probe *tp = (struct trace_probe *)event_call->data; DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0); DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0); - DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1); /* Set argument names as fields */ - for (i = 0; i < tp->nr_args; i++) - DEFINE_FIELD(unsigned long, args[i], tp->args[i].name, 0); + for (i = 0; i < tp->nr_args; i++) { + ret = trace_define_field(event_call, tp->args[i].type->name, + tp->args[i].name, + sizeof(field) + tp->args[i].offset, + tp->args[i].type->size, + tp->args[i].type->is_signed, + FILTER_OTHER); + if (ret) + return ret; + } return 0; } @@ -1176,8 +1294,8 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len) pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt); for (i = 0; i < tp->nr_args; i++) { - pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%%lx", - tp->args[i].name); + pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%s", + tp->args[i].name, tp->args[i].type->fmt); } pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg); @@ -1219,28 +1337,30 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp, { struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); struct ftrace_event_call *call = &tp->call; - struct kprobe_trace_entry *entry; + struct kprobe_trace_entry_head *entry; + struct hlist_head *head; + u8 *data; int size, __size, i; - unsigned long irq_flags; int rctx; - __size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args); + __size = sizeof(*entry) + tp->size; size = ALIGN(__size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough")) return; - entry = perf_trace_buf_prepare(size, call->id, &rctx, &irq_flags); + entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); if (!entry) return; - entry->nargs = tp->nr_args; entry->ip = (unsigned long)kp->addr; + data = (u8 *)&entry[1]; for (i = 0; i < tp->nr_args; i++) - entry->args[i] = call_fetch(&tp->args[i].fetch, regs); + call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset); - perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, irq_flags, regs); + head = this_cpu_ptr(call->perf_events); + perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head); } /* Kretprobe profile handler */ @@ -1249,30 +1369,31 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri, { struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); struct ftrace_event_call *call = &tp->call; - struct kretprobe_trace_entry *entry; + struct kretprobe_trace_entry_head *entry; + struct hlist_head *head; + u8 *data; int size, __size, i; - unsigned long irq_flags; int rctx; - __size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args); + __size = sizeof(*entry) + tp->size; size = ALIGN(__size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough")) return; - entry = perf_trace_buf_prepare(size, call->id, &rctx, &irq_flags); + entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); if (!entry) return; - entry->nargs = tp->nr_args; entry->func = (unsigned long)tp->rp.kp.addr; entry->ret_ip = (unsigned long)ri->ret_addr; + data = (u8 *)&entry[1]; for (i = 0; i < tp->nr_args; i++) - entry->args[i] = call_fetch(&tp->args[i].fetch, regs); + call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset); - perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, - irq_flags, regs); + head = this_cpu_ptr(call->perf_events); + perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head); } static int probe_perf_enable(struct ftrace_event_call *call) @@ -1302,6 +1423,26 @@ static void probe_perf_disable(struct ftrace_event_call *call) } #endif /* CONFIG_PERF_EVENTS */ +static __kprobes +int kprobe_register(struct ftrace_event_call *event, enum trace_reg type) +{ + switch (type) { + case TRACE_REG_REGISTER: + return probe_event_enable(event); + case TRACE_REG_UNREGISTER: + probe_event_disable(event); + return 0; + +#ifdef CONFIG_PERF_EVENTS + case TRACE_REG_PERF_REGISTER: + return probe_perf_enable(event); + case TRACE_REG_PERF_UNREGISTER: + probe_perf_disable(event); + return 0; +#endif + } + return 0; +} static __kprobes int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) @@ -1331,6 +1472,14 @@ int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs) return 0; /* We don't tweek kernel, so just return 0 */ } +static struct trace_event_functions kretprobe_funcs = { + .trace = print_kretprobe_event +}; + +static struct trace_event_functions kprobe_funcs = { + .trace = print_kprobe_event +}; + static int register_probe_event(struct trace_probe *tp) { struct ftrace_event_call *call = &tp->call; @@ -1338,36 +1487,31 @@ static int register_probe_event(struct trace_probe *tp) /* Initialize ftrace_event_call */ if (probe_is_return(tp)) { - tp->event.trace = print_kretprobe_event; - call->raw_init = probe_event_raw_init; - call->define_fields = kretprobe_event_define_fields; + INIT_LIST_HEAD(&call->class->fields); + call->event.funcs = &kretprobe_funcs; + call->class->raw_init = probe_event_raw_init; + call->class->define_fields = kretprobe_event_define_fields; } else { - tp->event.trace = print_kprobe_event; - call->raw_init = probe_event_raw_init; - call->define_fields = kprobe_event_define_fields; + INIT_LIST_HEAD(&call->class->fields); + call->event.funcs = &kprobe_funcs; + call->class->raw_init = probe_event_raw_init; + call->class->define_fields = kprobe_event_define_fields; } if (set_print_fmt(tp) < 0) return -ENOMEM; - call->event = &tp->event; - call->id = register_ftrace_event(&tp->event); - if (!call->id) { + ret = register_ftrace_event(&call->event); + if (!ret) { kfree(call->print_fmt); return -ENODEV; } - call->enabled = 0; - call->regfunc = probe_event_enable; - call->unregfunc = probe_event_disable; - -#ifdef CONFIG_PERF_EVENTS - call->perf_event_enable = probe_perf_enable; - call->perf_event_disable = probe_perf_disable; -#endif + call->flags = 0; + call->class->reg = kprobe_register; call->data = tp; ret = trace_add_event_call(call); if (ret) { pr_info("Failed to register kprobe event: %s\n", call->name); kfree(call->print_fmt); - unregister_ftrace_event(&tp->event); + unregister_ftrace_event(&call->event); } return ret; } diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c index 94103cd..8eaf007 100644 --- a/kernel/trace/trace_ksym.c +++ b/kernel/trace/trace_ksym.c @@ -23,6 +23,7 @@ #include <linux/debugfs.h> #include <linux/ftrace.h> #include <linux/module.h> +#include <linux/slab.h> #include <linux/fs.h> #include "trace_output.h" @@ -33,12 +34,6 @@ #include <asm/atomic.h> -/* - * For now, let us restrict the no. of symbols traced simultaneously to number - * of available hardware breakpoint registers. - */ -#define KSYM_TRACER_MAX HBP_NUM - #define KSYM_TRACER_OP_LEN 3 /* rw- */ struct trace_ksym { @@ -52,7 +47,6 @@ struct trace_ksym { static struct trace_array *ksym_trace_array; -static unsigned int ksym_filter_entry_count; static unsigned int ksym_tracing_enabled; static HLIST_HEAD(ksym_filter_head); @@ -180,13 +174,6 @@ int process_new_ksym_entry(char *ksymname, int op, unsigned long addr) struct trace_ksym *entry; int ret = -ENOMEM; - if (ksym_filter_entry_count >= KSYM_TRACER_MAX) { - printk(KERN_ERR "ksym_tracer: Maximum limit:(%d) reached. No" - " new requests for tracing can be accepted now.\n", - KSYM_TRACER_MAX); - return -ENOSPC; - } - entry = kzalloc(sizeof(struct trace_ksym), GFP_KERNEL); if (!entry) return -ENOMEM; @@ -202,13 +189,17 @@ int process_new_ksym_entry(char *ksymname, int op, unsigned long addr) if (IS_ERR(entry->ksym_hbp)) { ret = PTR_ERR(entry->ksym_hbp); - printk(KERN_INFO "ksym_tracer request failed. Try again" - " later!!\n"); + if (ret == -ENOSPC) { + printk(KERN_ERR "ksym_tracer: Maximum limit reached." + " No new requests for tracing can be accepted now.\n"); + } else { + printk(KERN_INFO "ksym_tracer request failed. Try again" + " later!!\n"); + } goto err; } hlist_add_head_rcu(&(entry->ksym_hlist), &ksym_filter_head); - ksym_filter_entry_count++; return 0; @@ -264,7 +255,6 @@ static void __ksym_trace_reset(void) hlist_for_each_entry_safe(entry, node, node1, &ksym_filter_head, ksym_hlist) { unregister_wide_hw_breakpoint(entry->ksym_hbp); - ksym_filter_entry_count--; hlist_del_rcu(&(entry->ksym_hlist)); synchronize_rcu(); kfree(entry); @@ -337,7 +327,6 @@ static ssize_t ksym_trace_filter_write(struct file *file, goto out_unlock; } /* Error or "symbol:---" case: drop it */ - ksym_filter_entry_count--; hlist_del_rcu(&(entry->ksym_hlist)); synchronize_rcu(); kfree(entry); diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index 0acd834..017fa37 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -9,6 +9,7 @@ #include <linux/kernel.h> #include <linux/mmiotrace.h> #include <linux/pci.h> +#include <linux/slab.h> #include <linux/time.h> #include <asm/atomic.h> diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 8e46b33..57c1b45 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -209,6 +209,7 @@ int trace_seq_putc(struct trace_seq *s, unsigned char c) return 1; } +EXPORT_SYMBOL(trace_seq_putc); int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len) { @@ -253,7 +254,7 @@ void *trace_seq_reserve(struct trace_seq *s, size_t len) void *ret; if (s->full) - return 0; + return NULL; if (len > ((PAGE_SIZE - 1) - s->len)) { s->full = 1; @@ -355,6 +356,21 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val, } EXPORT_SYMBOL(ftrace_print_symbols_seq); +const char * +ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len) +{ + int i; + const char *ret = p->buffer + p->len; + + for (i = 0; i < buf_len; i++) + trace_seq_printf(p, "%s%2.2x", i == 0 ? "" : " ", buf[i]); + + trace_seq_putc(p, 0); + + return ret; +} +EXPORT_SYMBOL(ftrace_print_hex_seq); + #ifdef CONFIG_KRETPROBES static inline const char *kretprobed(const char *name) { @@ -726,6 +742,9 @@ int register_ftrace_event(struct trace_event *event) if (WARN_ON(!event)) goto out; + if (WARN_ON(!event->funcs)) + goto out; + INIT_LIST_HEAD(&event->list); if (!event->type) { @@ -758,14 +777,14 @@ int register_ftrace_event(struct trace_event *event) goto out; } - if (event->trace == NULL) - event->trace = trace_nop_print; - if (event->raw == NULL) - event->raw = trace_nop_print; - if (event->hex == NULL) - event->hex = trace_nop_print; - if (event->binary == NULL) - event->binary = trace_nop_print; + if (event->funcs->trace == NULL) + event->funcs->trace = trace_nop_print; + if (event->funcs->raw == NULL) + event->funcs->raw = trace_nop_print; + if (event->funcs->hex == NULL) + event->funcs->hex = trace_nop_print; + if (event->funcs->binary == NULL) + event->funcs->binary = trace_nop_print; key = event->type & (EVENT_HASHSIZE - 1); @@ -807,13 +826,15 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_event); * Standard events */ -enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags) +enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags, + struct trace_event *event) { return TRACE_TYPE_HANDLED; } /* TRACE_FN */ -static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags) +static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags, + struct trace_event *event) { struct ftrace_entry *field; struct trace_seq *s = &iter->seq; @@ -840,7 +861,8 @@ static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags) return TRACE_TYPE_PARTIAL_LINE; } -static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags) +static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags, + struct trace_event *event) { struct ftrace_entry *field; @@ -854,7 +876,8 @@ static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags) return TRACE_TYPE_HANDLED; } -static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags) +static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags, + struct trace_event *event) { struct ftrace_entry *field; struct trace_seq *s = &iter->seq; @@ -867,7 +890,8 @@ static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags) return TRACE_TYPE_HANDLED; } -static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags) +static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags, + struct trace_event *event) { struct ftrace_entry *field; struct trace_seq *s = &iter->seq; @@ -880,14 +904,18 @@ static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags) return TRACE_TYPE_HANDLED; } -static struct trace_event trace_fn_event = { - .type = TRACE_FN, +static struct trace_event_functions trace_fn_funcs = { .trace = trace_fn_trace, .raw = trace_fn_raw, .hex = trace_fn_hex, .binary = trace_fn_bin, }; +static struct trace_event trace_fn_event = { + .type = TRACE_FN, + .funcs = &trace_fn_funcs, +}; + /* TRACE_CTX an TRACE_WAKE */ static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter, char *delim) @@ -916,13 +944,14 @@ static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter, return TRACE_TYPE_HANDLED; } -static enum print_line_t trace_ctx_print(struct trace_iterator *iter, int flags) +static enum print_line_t trace_ctx_print(struct trace_iterator *iter, int flags, + struct trace_event *event) { return trace_ctxwake_print(iter, "==>"); } static enum print_line_t trace_wake_print(struct trace_iterator *iter, - int flags) + int flags, struct trace_event *event) { return trace_ctxwake_print(iter, " +"); } @@ -950,12 +979,14 @@ static int trace_ctxwake_raw(struct trace_iterator *iter, char S) return TRACE_TYPE_HANDLED; } -static enum print_line_t trace_ctx_raw(struct trace_iterator *iter, int flags) +static enum print_line_t trace_ctx_raw(struct trace_iterator *iter, int flags, + struct trace_event *event) { return trace_ctxwake_raw(iter, 0); } -static enum print_line_t trace_wake_raw(struct trace_iterator *iter, int flags) +static enum print_line_t trace_wake_raw(struct trace_iterator *iter, int flags, + struct trace_event *event) { return trace_ctxwake_raw(iter, '+'); } @@ -984,18 +1015,20 @@ static int trace_ctxwake_hex(struct trace_iterator *iter, char S) return TRACE_TYPE_HANDLED; } -static enum print_line_t trace_ctx_hex(struct trace_iterator *iter, int flags) +static enum print_line_t trace_ctx_hex(struct trace_iterator *iter, int flags, + struct trace_event *event) { return trace_ctxwake_hex(iter, 0); } -static enum print_line_t trace_wake_hex(struct trace_iterator *iter, int flags) +static enum print_line_t trace_wake_hex(struct trace_iterator *iter, int flags, + struct trace_event *event) { return trace_ctxwake_hex(iter, '+'); } static enum print_line_t trace_ctxwake_bin(struct trace_iterator *iter, - int flags) + int flags, struct trace_event *event) { struct ctx_switch_entry *field; struct trace_seq *s = &iter->seq; @@ -1012,25 +1045,33 @@ static enum print_line_t trace_ctxwake_bin(struct trace_iterator *iter, return TRACE_TYPE_HANDLED; } -static struct trace_event trace_ctx_event = { - .type = TRACE_CTX, +static struct trace_event_functions trace_ctx_funcs = { .trace = trace_ctx_print, .raw = trace_ctx_raw, .hex = trace_ctx_hex, .binary = trace_ctxwake_bin, }; -static struct trace_event trace_wake_event = { - .type = TRACE_WAKE, +static struct trace_event trace_ctx_event = { + .type = TRACE_CTX, + .funcs = &trace_ctx_funcs, +}; + +static struct trace_event_functions trace_wake_funcs = { .trace = trace_wake_print, .raw = trace_wake_raw, .hex = trace_wake_hex, .binary = trace_ctxwake_bin, }; +static struct trace_event trace_wake_event = { + .type = TRACE_WAKE, + .funcs = &trace_wake_funcs, +}; + /* TRACE_SPECIAL */ static enum print_line_t trace_special_print(struct trace_iterator *iter, - int flags) + int flags, struct trace_event *event) { struct special_entry *field; @@ -1046,7 +1087,7 @@ static enum print_line_t trace_special_print(struct trace_iterator *iter, } static enum print_line_t trace_special_hex(struct trace_iterator *iter, - int flags) + int flags, struct trace_event *event) { struct special_entry *field; struct trace_seq *s = &iter->seq; @@ -1061,7 +1102,7 @@ static enum print_line_t trace_special_hex(struct trace_iterator *iter, } static enum print_line_t trace_special_bin(struct trace_iterator *iter, - int flags) + int flags, struct trace_event *event) { struct special_entry *field; struct trace_seq *s = &iter->seq; @@ -1075,18 +1116,22 @@ static enum print_line_t trace_special_bin(struct trace_iterator *iter, return TRACE_TYPE_HANDLED; } -static struct trace_event trace_special_event = { - .type = TRACE_SPECIAL, +static struct trace_event_functions trace_special_funcs = { .trace = trace_special_print, .raw = trace_special_print, .hex = trace_special_hex, .binary = trace_special_bin, }; +static struct trace_event trace_special_event = { + .type = TRACE_SPECIAL, + .funcs = &trace_special_funcs, +}; + /* TRACE_STACK */ static enum print_line_t trace_stack_print(struct trace_iterator *iter, - int flags) + int flags, struct trace_event *event) { struct stack_entry *field; struct trace_seq *s = &iter->seq; @@ -1114,17 +1159,21 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter, return TRACE_TYPE_PARTIAL_LINE; } -static struct trace_event trace_stack_event = { - .type = TRACE_STACK, +static struct trace_event_functions trace_stack_funcs = { .trace = trace_stack_print, .raw = trace_special_print, .hex = trace_special_hex, .binary = trace_special_bin, }; +static struct trace_event trace_stack_event = { + .type = TRACE_STACK, + .funcs = &trace_stack_funcs, +}; + /* TRACE_USER_STACK */ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter, - int flags) + int flags, struct trace_event *event) { struct userstack_entry *field; struct trace_seq *s = &iter->seq; @@ -1143,17 +1192,22 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter, return TRACE_TYPE_PARTIAL_LINE; } -static struct trace_event trace_user_stack_event = { - .type = TRACE_USER_STACK, +static struct trace_event_functions trace_user_stack_funcs = { .trace = trace_user_stack_print, .raw = trace_special_print, .hex = trace_special_hex, .binary = trace_special_bin, }; +static struct trace_event trace_user_stack_event = { + .type = TRACE_USER_STACK, + .funcs = &trace_user_stack_funcs, +}; + /* TRACE_BPRINT */ static enum print_line_t -trace_bprint_print(struct trace_iterator *iter, int flags) +trace_bprint_print(struct trace_iterator *iter, int flags, + struct trace_event *event) { struct trace_entry *entry = iter->ent; struct trace_seq *s = &iter->seq; @@ -1178,7 +1232,8 @@ trace_bprint_print(struct trace_iterator *iter, int flags) static enum print_line_t -trace_bprint_raw(struct trace_iterator *iter, int flags) +trace_bprint_raw(struct trace_iterator *iter, int flags, + struct trace_event *event) { struct bprint_entry *field; struct trace_seq *s = &iter->seq; @@ -1197,16 +1252,19 @@ trace_bprint_raw(struct trace_iterator *iter, int flags) return TRACE_TYPE_PARTIAL_LINE; } +static struct trace_event_functions trace_bprint_funcs = { + .trace = trace_bprint_print, + .raw = trace_bprint_raw, +}; static struct trace_event trace_bprint_event = { .type = TRACE_BPRINT, - .trace = trace_bprint_print, - .raw = trace_bprint_raw, + .funcs = &trace_bprint_funcs, }; /* TRACE_PRINT */ static enum print_line_t trace_print_print(struct trace_iterator *iter, - int flags) + int flags, struct trace_event *event) { struct print_entry *field; struct trace_seq *s = &iter->seq; @@ -1225,7 +1283,8 @@ static enum print_line_t trace_print_print(struct trace_iterator *iter, return TRACE_TYPE_PARTIAL_LINE; } -static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags) +static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags, + struct trace_event *event) { struct print_entry *field; @@ -1240,12 +1299,16 @@ static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags) return TRACE_TYPE_PARTIAL_LINE; } -static struct trace_event trace_print_event = { - .type = TRACE_PRINT, +static struct trace_event_functions trace_print_funcs = { .trace = trace_print_print, .raw = trace_print_raw, }; +static struct trace_event trace_print_event = { + .type = TRACE_PRINT, + .funcs = &trace_print_funcs, +}; + static struct trace_event *events[] __initdata = { &trace_fn_event, diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index 9d91c72..c038eba 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -25,7 +25,7 @@ extern void trace_event_read_unlock(void); extern struct trace_event *ftrace_find_event(int type); extern enum print_line_t trace_nop_print(struct trace_iterator *iter, - int flags); + int flags, struct trace_event *event); extern int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry); diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 5fca0f5..8f758d0 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -50,8 +50,7 @@ tracing_sched_switch_trace(struct trace_array *tr, } static void -probe_sched_switch(struct rq *__rq, struct task_struct *prev, - struct task_struct *next) +probe_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *next) { struct trace_array_cpu *data; unsigned long flags; @@ -109,7 +108,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr, } static void -probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee, int success) +probe_sched_wakeup(void *ignore, struct task_struct *wakee, int success) { struct trace_array_cpu *data; unsigned long flags; @@ -139,21 +138,21 @@ static int tracing_sched_register(void) { int ret; - ret = register_trace_sched_wakeup(probe_sched_wakeup); + ret = register_trace_sched_wakeup(probe_sched_wakeup, NULL); if (ret) { pr_info("wakeup trace: Couldn't activate tracepoint" " probe to kernel_sched_wakeup\n"); return ret; } - ret = register_trace_sched_wakeup_new(probe_sched_wakeup); + ret = register_trace_sched_wakeup_new(probe_sched_wakeup, NULL); if (ret) { pr_info("wakeup trace: Couldn't activate tracepoint" " probe to kernel_sched_wakeup_new\n"); goto fail_deprobe; } - ret = register_trace_sched_switch(probe_sched_switch); + ret = register_trace_sched_switch(probe_sched_switch, NULL); if (ret) { pr_info("sched trace: Couldn't activate tracepoint" " probe to kernel_sched_switch\n"); @@ -162,17 +161,17 @@ static int tracing_sched_register(void) return ret; fail_deprobe_wake_new: - unregister_trace_sched_wakeup_new(probe_sched_wakeup); + unregister_trace_sched_wakeup_new(probe_sched_wakeup, NULL); fail_deprobe: - unregister_trace_sched_wakeup(probe_sched_wakeup); + unregister_trace_sched_wakeup(probe_sched_wakeup, NULL); return ret; } static void tracing_sched_unregister(void) { - unregister_trace_sched_switch(probe_sched_switch); - unregister_trace_sched_wakeup_new(probe_sched_wakeup); - unregister_trace_sched_wakeup(probe_sched_wakeup); + unregister_trace_sched_switch(probe_sched_switch, NULL); + unregister_trace_sched_wakeup_new(probe_sched_wakeup, NULL); + unregister_trace_sched_wakeup(probe_sched_wakeup, NULL); } static void tracing_start_sched_switch(void) diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 0271742..0e73bc2 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -98,7 +98,8 @@ static int report_latency(cycle_t delta) return 1; } -static void probe_wakeup_migrate_task(struct task_struct *task, int cpu) +static void +probe_wakeup_migrate_task(void *ignore, struct task_struct *task, int cpu) { if (task != wakeup_task) return; @@ -107,8 +108,8 @@ static void probe_wakeup_migrate_task(struct task_struct *task, int cpu) } static void notrace -probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev, - struct task_struct *next) +probe_wakeup_sched_switch(void *ignore, + struct task_struct *prev, struct task_struct *next) { struct trace_array_cpu *data; cycle_t T0, T1, delta; @@ -200,7 +201,7 @@ static void wakeup_reset(struct trace_array *tr) } static void -probe_wakeup(struct rq *rq, struct task_struct *p, int success) +probe_wakeup(void *ignore, struct task_struct *p, int success) { struct trace_array_cpu *data; int cpu = smp_processor_id(); @@ -264,28 +265,28 @@ static void start_wakeup_tracer(struct trace_array *tr) { int ret; - ret = register_trace_sched_wakeup(probe_wakeup); + ret = register_trace_sched_wakeup(probe_wakeup, NULL); if (ret) { pr_info("wakeup trace: Couldn't activate tracepoint" " probe to kernel_sched_wakeup\n"); return; } - ret = register_trace_sched_wakeup_new(probe_wakeup); + ret = register_trace_sched_wakeup_new(probe_wakeup, NULL); if (ret) { pr_info("wakeup trace: Couldn't activate tracepoint" " probe to kernel_sched_wakeup_new\n"); goto fail_deprobe; } - ret = register_trace_sched_switch(probe_wakeup_sched_switch); + ret = register_trace_sched_switch(probe_wakeup_sched_switch, NULL); if (ret) { pr_info("sched trace: Couldn't activate tracepoint" " probe to kernel_sched_switch\n"); goto fail_deprobe_wake_new; } - ret = register_trace_sched_migrate_task(probe_wakeup_migrate_task); + ret = register_trace_sched_migrate_task(probe_wakeup_migrate_task, NULL); if (ret) { pr_info("wakeup trace: Couldn't activate tracepoint" " probe to kernel_sched_migrate_task\n"); @@ -312,19 +313,19 @@ static void start_wakeup_tracer(struct trace_array *tr) return; fail_deprobe_wake_new: - unregister_trace_sched_wakeup_new(probe_wakeup); + unregister_trace_sched_wakeup_new(probe_wakeup, NULL); fail_deprobe: - unregister_trace_sched_wakeup(probe_wakeup); + unregister_trace_sched_wakeup(probe_wakeup, NULL); } static void stop_wakeup_tracer(struct trace_array *tr) { tracer_enabled = 0; unregister_ftrace_function(&trace_ops); - unregister_trace_sched_switch(probe_wakeup_sched_switch); - unregister_trace_sched_wakeup_new(probe_wakeup); - unregister_trace_sched_wakeup(probe_wakeup); - unregister_trace_sched_migrate_task(probe_wakeup_migrate_task); + unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL); + unregister_trace_sched_wakeup_new(probe_wakeup, NULL); + unregister_trace_sched_wakeup(probe_wakeup, NULL); + unregister_trace_sched_migrate_task(probe_wakeup_migrate_task, NULL); } static int __wakeup_tracer_init(struct trace_array *tr) diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 280fea4..250e7f9 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -3,6 +3,7 @@ #include <linux/stringify.h> #include <linux/kthread.h> #include <linux/delay.h> +#include <linux/slab.h> static inline int trace_valid_entry(struct trace_entry *entry) { @@ -16,7 +17,6 @@ static inline int trace_valid_entry(struct trace_entry *entry) case TRACE_BRANCH: case TRACE_GRAPH_ENT: case TRACE_GRAPH_RET: - case TRACE_HW_BRANCHES: case TRACE_KSYM: return 1; } @@ -29,7 +29,7 @@ static int trace_test_buffer_cpu(struct trace_array *tr, int cpu) struct trace_entry *entry; unsigned int loops = 0; - while ((event = ring_buffer_consume(tr->buffer, cpu, NULL))) { + while ((event = ring_buffer_consume(tr->buffer, cpu, NULL, NULL))) { entry = ring_buffer_event_data(event); /* @@ -255,7 +255,8 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) /* Maximum number of functions to trace before diagnosing a hang */ #define GRAPH_MAX_FUNC_TEST 100000000 -static void __ftrace_dump(bool disable_tracing); +static void +__ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode); static unsigned int graph_hang_thresh; /* Wrap the real function entry probe to avoid possible hanging */ @@ -266,7 +267,7 @@ static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace) ftrace_graph_stop(); printk(KERN_WARNING "BUG: Function graph tracer hang!\n"); if (ftrace_dump_on_oops) - __ftrace_dump(false); + __ftrace_dump(false, DUMP_ALL); return 0; } @@ -754,62 +755,6 @@ trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr) } #endif /* CONFIG_BRANCH_TRACER */ -#ifdef CONFIG_HW_BRANCH_TRACER -int -trace_selftest_startup_hw_branches(struct tracer *trace, - struct trace_array *tr) -{ - struct trace_iterator *iter; - struct tracer tracer; - unsigned long count; - int ret; - - if (!trace->open) { - printk(KERN_CONT "missing open function..."); - return -1; - } - - ret = tracer_init(trace, tr); - if (ret) { - warn_failed_init_tracer(trace, ret); - return ret; - } - - /* - * The hw-branch tracer needs to collect the trace from the various - * cpu trace buffers - before tracing is stopped. - */ - iter = kzalloc(sizeof(*iter), GFP_KERNEL); - if (!iter) - return -ENOMEM; - - memcpy(&tracer, trace, sizeof(tracer)); - - iter->trace = &tracer; - iter->tr = tr; - iter->pos = -1; - mutex_init(&iter->mutex); - - trace->open(iter); - - mutex_destroy(&iter->mutex); - kfree(iter); - - tracing_stop(); - - ret = trace_test_buffer(tr, &count); - trace->reset(tr); - tracing_start(); - - if (!ret && !count) { - printk(KERN_CONT "no entries found.."); - ret = -1; - } - - return ret; -} -#endif /* CONFIG_HW_BRANCH_TRACER */ - #ifdef CONFIG_KSYM_TRACER static int ksym_selftest_dummy; diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index a4bb239..96cffb2 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -10,6 +10,7 @@ #include <linux/list.h> +#include <linux/slab.h> #include <linux/rbtree.h> #include <linux/debugfs.h> #include "trace_stat.h" diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 33c2a5b..34e3580 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -1,5 +1,6 @@ #include <trace/syscall.h> #include <trace/events/syscalls.h> +#include <linux/slab.h> #include <linux/kernel.h> #include <linux/ftrace.h> #include <linux/perf_event.h> @@ -14,6 +15,54 @@ static int sys_refcount_exit; static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls); static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls); +static int syscall_enter_register(struct ftrace_event_call *event, + enum trace_reg type); +static int syscall_exit_register(struct ftrace_event_call *event, + enum trace_reg type); + +static int syscall_enter_define_fields(struct ftrace_event_call *call); +static int syscall_exit_define_fields(struct ftrace_event_call *call); + +static struct list_head * +syscall_get_enter_fields(struct ftrace_event_call *call) +{ + struct syscall_metadata *entry = call->data; + + return &entry->enter_fields; +} + +static struct list_head * +syscall_get_exit_fields(struct ftrace_event_call *call) +{ + struct syscall_metadata *entry = call->data; + + return &entry->exit_fields; +} + +struct trace_event_functions enter_syscall_print_funcs = { + .trace = print_syscall_enter, +}; + +struct trace_event_functions exit_syscall_print_funcs = { + .trace = print_syscall_exit, +}; + +struct ftrace_event_class event_class_syscall_enter = { + .system = "syscalls", + .reg = syscall_enter_register, + .define_fields = syscall_enter_define_fields, + .get_fields = syscall_get_enter_fields, + .raw_init = init_syscall_trace, +}; + +struct ftrace_event_class event_class_syscall_exit = { + .system = "syscalls", + .reg = syscall_exit_register, + .define_fields = syscall_exit_define_fields, + .get_fields = syscall_get_exit_fields, + .raw_init = init_syscall_trace, +}; + extern unsigned long __start_syscalls_metadata[]; extern unsigned long __stop_syscalls_metadata[]; @@ -52,7 +101,8 @@ static struct syscall_metadata *syscall_nr_to_meta(int nr) } enum print_line_t -print_syscall_enter(struct trace_iterator *iter, int flags) +print_syscall_enter(struct trace_iterator *iter, int flags, + struct trace_event *event) { struct trace_seq *s = &iter->seq; struct trace_entry *ent = iter->ent; @@ -67,7 +117,7 @@ print_syscall_enter(struct trace_iterator *iter, int flags) if (!entry) goto end; - if (entry->enter_event->id != ent->type) { + if (entry->enter_event->event.type != ent->type) { WARN_ON_ONCE(1); goto end; } @@ -104,7 +154,8 @@ end: } enum print_line_t -print_syscall_exit(struct trace_iterator *iter, int flags) +print_syscall_exit(struct trace_iterator *iter, int flags, + struct trace_event *event) { struct trace_seq *s = &iter->seq; struct trace_entry *ent = iter->ent; @@ -122,7 +173,7 @@ print_syscall_exit(struct trace_iterator *iter, int flags) return TRACE_TYPE_HANDLED; } - if (entry->exit_event->id != ent->type) { + if (entry->exit_event->event.type != ent->type) { WARN_ON_ONCE(1); return TRACE_TYPE_UNHANDLED; } @@ -204,7 +255,7 @@ static void free_syscall_print_fmt(struct ftrace_event_call *call) kfree(call->print_fmt); } -int syscall_enter_define_fields(struct ftrace_event_call *call) +static int syscall_enter_define_fields(struct ftrace_event_call *call) { struct syscall_trace_enter trace; struct syscall_metadata *meta = call->data; @@ -227,7 +278,7 @@ int syscall_enter_define_fields(struct ftrace_event_call *call) return ret; } -int syscall_exit_define_fields(struct ftrace_event_call *call) +static int syscall_exit_define_fields(struct ftrace_event_call *call) { struct syscall_trace_exit trace; int ret; @@ -242,7 +293,7 @@ int syscall_exit_define_fields(struct ftrace_event_call *call) return ret; } -void ftrace_syscall_enter(struct pt_regs *regs, long id) +void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id) { struct syscall_trace_enter *entry; struct syscall_metadata *sys_data; @@ -264,7 +315,7 @@ void ftrace_syscall_enter(struct pt_regs *regs, long id) size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; event = trace_current_buffer_lock_reserve(&buffer, - sys_data->enter_event->id, size, 0, 0); + sys_data->enter_event->event.type, size, 0, 0); if (!event) return; @@ -277,7 +328,7 @@ void ftrace_syscall_enter(struct pt_regs *regs, long id) trace_current_buffer_unlock_commit(buffer, event, 0, 0); } -void ftrace_syscall_exit(struct pt_regs *regs, long ret) +void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret) { struct syscall_trace_exit *entry; struct syscall_metadata *sys_data; @@ -296,7 +347,7 @@ void ftrace_syscall_exit(struct pt_regs *regs, long ret) return; event = trace_current_buffer_lock_reserve(&buffer, - sys_data->exit_event->id, sizeof(*entry), 0, 0); + sys_data->exit_event->event.type, sizeof(*entry), 0, 0); if (!event) return; @@ -319,7 +370,7 @@ int reg_event_syscall_enter(struct ftrace_event_call *call) return -ENOSYS; mutex_lock(&syscall_trace_lock); if (!sys_refcount_enter) - ret = register_trace_sys_enter(ftrace_syscall_enter); + ret = register_trace_sys_enter(ftrace_syscall_enter, NULL); if (!ret) { set_bit(num, enabled_enter_syscalls); sys_refcount_enter++; @@ -339,7 +390,7 @@ void unreg_event_syscall_enter(struct ftrace_event_call *call) sys_refcount_enter--; clear_bit(num, enabled_enter_syscalls); if (!sys_refcount_enter) - unregister_trace_sys_enter(ftrace_syscall_enter); + unregister_trace_sys_enter(ftrace_syscall_enter, NULL); mutex_unlock(&syscall_trace_lock); } @@ -353,7 +404,7 @@ int reg_event_syscall_exit(struct ftrace_event_call *call) return -ENOSYS; mutex_lock(&syscall_trace_lock); if (!sys_refcount_exit) - ret = register_trace_sys_exit(ftrace_syscall_exit); + ret = register_trace_sys_exit(ftrace_syscall_exit, NULL); if (!ret) { set_bit(num, enabled_exit_syscalls); sys_refcount_exit++; @@ -373,7 +424,7 @@ void unreg_event_syscall_exit(struct ftrace_event_call *call) sys_refcount_exit--; clear_bit(num, enabled_exit_syscalls); if (!sys_refcount_exit) - unregister_trace_sys_exit(ftrace_syscall_exit); + unregister_trace_sys_exit(ftrace_syscall_exit, NULL); mutex_unlock(&syscall_trace_lock); } @@ -433,11 +484,11 @@ static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls); static int sys_perf_refcount_enter; static int sys_perf_refcount_exit; -static void perf_syscall_enter(struct pt_regs *regs, long id) +static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) { struct syscall_metadata *sys_data; struct syscall_trace_enter *rec; - unsigned long flags; + struct hlist_head *head; int syscall_nr; int rctx; int size; @@ -460,14 +511,16 @@ static void perf_syscall_enter(struct pt_regs *regs, long id) return; rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size, - sys_data->enter_event->id, &rctx, &flags); + sys_data->enter_event->event.type, regs, &rctx); if (!rec) return; rec->nr = syscall_nr; syscall_get_arguments(current, regs, 0, sys_data->nb_args, (unsigned long *)&rec->args); - perf_trace_buf_submit(rec, size, rctx, 0, 1, flags, regs); + + head = this_cpu_ptr(sys_data->enter_event->perf_events); + perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head); } int perf_sysenter_enable(struct ftrace_event_call *call) @@ -479,7 +532,7 @@ int perf_sysenter_enable(struct ftrace_event_call *call) mutex_lock(&syscall_trace_lock); if (!sys_perf_refcount_enter) - ret = register_trace_sys_enter(perf_syscall_enter); + ret = register_trace_sys_enter(perf_syscall_enter, NULL); if (ret) { pr_info("event trace: Could not activate" "syscall entry trace point"); @@ -501,15 +554,15 @@ void perf_sysenter_disable(struct ftrace_event_call *call) sys_perf_refcount_enter--; clear_bit(num, enabled_perf_enter_syscalls); if (!sys_perf_refcount_enter) - unregister_trace_sys_enter(perf_syscall_enter); + unregister_trace_sys_enter(perf_syscall_enter, NULL); mutex_unlock(&syscall_trace_lock); } -static void perf_syscall_exit(struct pt_regs *regs, long ret) +static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) { struct syscall_metadata *sys_data; struct syscall_trace_exit *rec; - unsigned long flags; + struct hlist_head *head; int syscall_nr; int rctx; int size; @@ -535,14 +588,15 @@ static void perf_syscall_exit(struct pt_regs *regs, long ret) return; rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size, - sys_data->exit_event->id, &rctx, &flags); + sys_data->exit_event->event.type, regs, &rctx); if (!rec) return; rec->nr = syscall_nr; rec->ret = syscall_get_return_value(current, regs); - perf_trace_buf_submit(rec, size, rctx, 0, 1, flags, regs); + head = this_cpu_ptr(sys_data->exit_event->perf_events); + perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head); } int perf_sysexit_enable(struct ftrace_event_call *call) @@ -554,7 +608,7 @@ int perf_sysexit_enable(struct ftrace_event_call *call) mutex_lock(&syscall_trace_lock); if (!sys_perf_refcount_exit) - ret = register_trace_sys_exit(perf_syscall_exit); + ret = register_trace_sys_exit(perf_syscall_exit, NULL); if (ret) { pr_info("event trace: Could not activate" "syscall exit trace point"); @@ -576,9 +630,50 @@ void perf_sysexit_disable(struct ftrace_event_call *call) sys_perf_refcount_exit--; clear_bit(num, enabled_perf_exit_syscalls); if (!sys_perf_refcount_exit) - unregister_trace_sys_exit(perf_syscall_exit); + unregister_trace_sys_exit(perf_syscall_exit, NULL); mutex_unlock(&syscall_trace_lock); } #endif /* CONFIG_PERF_EVENTS */ +static int syscall_enter_register(struct ftrace_event_call *event, + enum trace_reg type) +{ + switch (type) { + case TRACE_REG_REGISTER: + return reg_event_syscall_enter(event); + case TRACE_REG_UNREGISTER: + unreg_event_syscall_enter(event); + return 0; + +#ifdef CONFIG_PERF_EVENTS + case TRACE_REG_PERF_REGISTER: + return perf_sysenter_enable(event); + case TRACE_REG_PERF_UNREGISTER: + perf_sysenter_disable(event); + return 0; +#endif + } + return 0; +} + +static int syscall_exit_register(struct ftrace_event_call *event, + enum trace_reg type) +{ + switch (type) { + case TRACE_REG_REGISTER: + return reg_event_syscall_exit(event); + case TRACE_REG_UNREGISTER: + unreg_event_syscall_exit(event); + return 0; + +#ifdef CONFIG_PERF_EVENTS + case TRACE_REG_PERF_REGISTER: + return perf_sysexit_enable(event); + case TRACE_REG_PERF_UNREGISTER: + perf_sysexit_disable(event); + return 0; +#endif + } + return 0; +} diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c index 40cafb0..a7cc379 100644 --- a/kernel/trace/trace_workqueue.c +++ b/kernel/trace/trace_workqueue.c @@ -9,6 +9,7 @@ #include <trace/events/workqueue.h> #include <linux/list.h> #include <linux/percpu.h> +#include <linux/slab.h> #include <linux/kref.h> #include "trace_stat.h" #include "trace.h" @@ -48,7 +49,8 @@ static void cpu_workqueue_stat_free(struct kref *kref) /* Insertion of a work */ static void -probe_workqueue_insertion(struct task_struct *wq_thread, +probe_workqueue_insertion(void *ignore, + struct task_struct *wq_thread, struct work_struct *work) { int cpu = cpumask_first(&wq_thread->cpus_allowed); @@ -69,7 +71,8 @@ found: /* Execution of a work */ static void -probe_workqueue_execution(struct task_struct *wq_thread, +probe_workqueue_execution(void *ignore, + struct task_struct *wq_thread, struct work_struct *work) { int cpu = cpumask_first(&wq_thread->cpus_allowed); @@ -89,7 +92,8 @@ found: } /* Creation of a cpu workqueue thread */ -static void probe_workqueue_creation(struct task_struct *wq_thread, int cpu) +static void probe_workqueue_creation(void *ignore, + struct task_struct *wq_thread, int cpu) { struct cpu_workqueue_stats *cws; unsigned long flags; @@ -113,7 +117,8 @@ static void probe_workqueue_creation(struct task_struct *wq_thread, int cpu) } /* Destruction of a cpu workqueue thread */ -static void probe_workqueue_destruction(struct task_struct *wq_thread) +static void +probe_workqueue_destruction(void *ignore, struct task_struct *wq_thread) { /* Workqueue only execute on one cpu */ int cpu = cpumask_first(&wq_thread->cpus_allowed); @@ -258,19 +263,19 @@ int __init trace_workqueue_early_init(void) { int ret, cpu; - ret = register_trace_workqueue_insertion(probe_workqueue_insertion); + ret = register_trace_workqueue_insertion(probe_workqueue_insertion, NULL); if (ret) goto out; - ret = register_trace_workqueue_execution(probe_workqueue_execution); + ret = register_trace_workqueue_execution(probe_workqueue_execution, NULL); if (ret) goto no_insertion; - ret = register_trace_workqueue_creation(probe_workqueue_creation); + ret = register_trace_workqueue_creation(probe_workqueue_creation, NULL); if (ret) goto no_execution; - ret = register_trace_workqueue_destruction(probe_workqueue_destruction); + ret = register_trace_workqueue_destruction(probe_workqueue_destruction, NULL); if (ret) goto no_creation; @@ -282,11 +287,11 @@ int __init trace_workqueue_early_init(void) return 0; no_creation: - unregister_trace_workqueue_creation(probe_workqueue_creation); + unregister_trace_workqueue_creation(probe_workqueue_creation, NULL); no_execution: - unregister_trace_workqueue_execution(probe_workqueue_execution); + unregister_trace_workqueue_execution(probe_workqueue_execution, NULL); no_insertion: - unregister_trace_workqueue_insertion(probe_workqueue_insertion); + unregister_trace_workqueue_insertion(probe_workqueue_insertion, NULL); out: pr_warning("trace_workqueue: unable to trace workqueues\n"); diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index cc89be5..c77f3ec 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -54,7 +54,7 @@ static struct hlist_head tracepoint_table[TRACEPOINT_TABLE_SIZE]; */ struct tracepoint_entry { struct hlist_node hlist; - void **funcs; + struct tracepoint_func *funcs; int refcount; /* Number of times armed. 0 if disarmed. */ char name[0]; }; @@ -64,12 +64,12 @@ struct tp_probes { struct rcu_head rcu; struct list_head list; } u; - void *probes[0]; + struct tracepoint_func probes[0]; }; static inline void *allocate_probes(int count) { - struct tp_probes *p = kmalloc(count * sizeof(void *) + struct tp_probes *p = kmalloc(count * sizeof(struct tracepoint_func) + sizeof(struct tp_probes), GFP_KERNEL); return p == NULL ? NULL : p->probes; } @@ -79,7 +79,7 @@ static void rcu_free_old_probes(struct rcu_head *head) kfree(container_of(head, struct tp_probes, u.rcu)); } -static inline void release_probes(void *old) +static inline void release_probes(struct tracepoint_func *old) { if (old) { struct tp_probes *tp_probes = container_of(old, @@ -95,15 +95,16 @@ static void debug_print_probes(struct tracepoint_entry *entry) if (!tracepoint_debug || !entry->funcs) return; - for (i = 0; entry->funcs[i]; i++) - printk(KERN_DEBUG "Probe %d : %p\n", i, entry->funcs[i]); + for (i = 0; entry->funcs[i].func; i++) + printk(KERN_DEBUG "Probe %d : %p\n", i, entry->funcs[i].func); } -static void * -tracepoint_entry_add_probe(struct tracepoint_entry *entry, void *probe) +static struct tracepoint_func * +tracepoint_entry_add_probe(struct tracepoint_entry *entry, + void *probe, void *data) { int nr_probes = 0; - void **old, **new; + struct tracepoint_func *old, *new; WARN_ON(!probe); @@ -111,8 +112,9 @@ tracepoint_entry_add_probe(struct tracepoint_entry *entry, void *probe) old = entry->funcs; if (old) { /* (N -> N+1), (N != 0, 1) probes */ - for (nr_probes = 0; old[nr_probes]; nr_probes++) - if (old[nr_probes] == probe) + for (nr_probes = 0; old[nr_probes].func; nr_probes++) + if (old[nr_probes].func == probe && + old[nr_probes].data == data) return ERR_PTR(-EEXIST); } /* + 2 : one for new probe, one for NULL func */ @@ -120,9 +122,10 @@ tracepoint_entry_add_probe(struct tracepoint_entry *entry, void *probe) if (new == NULL) return ERR_PTR(-ENOMEM); if (old) - memcpy(new, old, nr_probes * sizeof(void *)); - new[nr_probes] = probe; - new[nr_probes + 1] = NULL; + memcpy(new, old, nr_probes * sizeof(struct tracepoint_func)); + new[nr_probes].func = probe; + new[nr_probes].data = data; + new[nr_probes + 1].func = NULL; entry->refcount = nr_probes + 1; entry->funcs = new; debug_print_probes(entry); @@ -130,10 +133,11 @@ tracepoint_entry_add_probe(struct tracepoint_entry *entry, void *probe) } static void * -tracepoint_entry_remove_probe(struct tracepoint_entry *entry, void *probe) +tracepoint_entry_remove_probe(struct tracepoint_entry *entry, + void *probe, void *data) { int nr_probes = 0, nr_del = 0, i; - void **old, **new; + struct tracepoint_func *old, *new; old = entry->funcs; @@ -142,8 +146,10 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry, void *probe) debug_print_probes(entry); /* (N -> M), (N > 1, M >= 0) probes */ - for (nr_probes = 0; old[nr_probes]; nr_probes++) { - if ((!probe || old[nr_probes] == probe)) + for (nr_probes = 0; old[nr_probes].func; nr_probes++) { + if (!probe || + (old[nr_probes].func == probe && + old[nr_probes].data == data)) nr_del++; } @@ -160,10 +166,11 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry, void *probe) new = allocate_probes(nr_probes - nr_del + 1); if (new == NULL) return ERR_PTR(-ENOMEM); - for (i = 0; old[i]; i++) - if ((probe && old[i] != probe)) + for (i = 0; old[i].func; i++) + if (probe && + (old[i].func != probe || old[i].data != data)) new[j++] = old[i]; - new[nr_probes - nr_del] = NULL; + new[nr_probes - nr_del].func = NULL; entry->refcount = nr_probes - nr_del; entry->funcs = new; } @@ -315,18 +322,19 @@ static void tracepoint_update_probes(void) module_update_tracepoints(); } -static void *tracepoint_add_probe(const char *name, void *probe) +static struct tracepoint_func * +tracepoint_add_probe(const char *name, void *probe, void *data) { struct tracepoint_entry *entry; - void *old; + struct tracepoint_func *old; entry = get_tracepoint(name); if (!entry) { entry = add_tracepoint(name); if (IS_ERR(entry)) - return entry; + return (struct tracepoint_func *)entry; } - old = tracepoint_entry_add_probe(entry, probe); + old = tracepoint_entry_add_probe(entry, probe, data); if (IS_ERR(old) && !entry->refcount) remove_tracepoint(entry); return old; @@ -340,12 +348,12 @@ static void *tracepoint_add_probe(const char *name, void *probe) * Returns 0 if ok, error value on error. * The probe address must at least be aligned on the architecture pointer size. */ -int tracepoint_probe_register(const char *name, void *probe) +int tracepoint_probe_register(const char *name, void *probe, void *data) { - void *old; + struct tracepoint_func *old; mutex_lock(&tracepoints_mutex); - old = tracepoint_add_probe(name, probe); + old = tracepoint_add_probe(name, probe, data); mutex_unlock(&tracepoints_mutex); if (IS_ERR(old)) return PTR_ERR(old); @@ -356,15 +364,16 @@ int tracepoint_probe_register(const char *name, void *probe) } EXPORT_SYMBOL_GPL(tracepoint_probe_register); -static void *tracepoint_remove_probe(const char *name, void *probe) +static struct tracepoint_func * +tracepoint_remove_probe(const char *name, void *probe, void *data) { struct tracepoint_entry *entry; - void *old; + struct tracepoint_func *old; entry = get_tracepoint(name); if (!entry) return ERR_PTR(-ENOENT); - old = tracepoint_entry_remove_probe(entry, probe); + old = tracepoint_entry_remove_probe(entry, probe, data); if (IS_ERR(old)) return old; if (!entry->refcount) @@ -382,12 +391,12 @@ static void *tracepoint_remove_probe(const char *name, void *probe) * itself uses stop_machine(), which insures that every preempt disabled section * have finished. */ -int tracepoint_probe_unregister(const char *name, void *probe) +int tracepoint_probe_unregister(const char *name, void *probe, void *data) { - void *old; + struct tracepoint_func *old; mutex_lock(&tracepoints_mutex); - old = tracepoint_remove_probe(name, probe); + old = tracepoint_remove_probe(name, probe, data); mutex_unlock(&tracepoints_mutex); if (IS_ERR(old)) return PTR_ERR(old); @@ -418,12 +427,13 @@ static void tracepoint_add_old_probes(void *old) * * caller must call tracepoint_probe_update_all() */ -int tracepoint_probe_register_noupdate(const char *name, void *probe) +int tracepoint_probe_register_noupdate(const char *name, void *probe, + void *data) { - void *old; + struct tracepoint_func *old; mutex_lock(&tracepoints_mutex); - old = tracepoint_add_probe(name, probe); + old = tracepoint_add_probe(name, probe, data); if (IS_ERR(old)) { mutex_unlock(&tracepoints_mutex); return PTR_ERR(old); @@ -441,12 +451,13 @@ EXPORT_SYMBOL_GPL(tracepoint_probe_register_noupdate); * * caller must call tracepoint_probe_update_all() */ -int tracepoint_probe_unregister_noupdate(const char *name, void *probe) +int tracepoint_probe_unregister_noupdate(const char *name, void *probe, + void *data) { - void *old; + struct tracepoint_func *old; mutex_lock(&tracepoints_mutex); - old = tracepoint_remove_probe(name, probe); + old = tracepoint_remove_probe(name, probe, data); if (IS_ERR(old)) { mutex_unlock(&tracepoints_mutex); return PTR_ERR(old); diff --git a/kernel/user.c b/kernel/user.c index 766467b..7e72614 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -16,7 +16,6 @@ #include <linux/interrupt.h> #include <linux/module.h> #include <linux/user_namespace.h> -#include "cred-internals.h" struct user_namespace init_user_ns = { .kref = { @@ -137,9 +136,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid) struct hlist_head *hashent = uidhashentry(ns, uid); struct user_struct *up, *new; - /* Make uid_hash_find() + uids_user_create() + uid_hash_insert() - * atomic. - */ spin_lock_irq(&uidhash_lock); up = uid_hash_find(uid, hashent); spin_unlock_irq(&uidhash_lock); @@ -161,11 +157,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid) spin_lock_irq(&uidhash_lock); up = uid_hash_find(uid, hashent); if (up) { - /* This case is not possible when CONFIG_USER_SCHED - * is defined, since we serialize alloc_uid() using - * uids_mutex. Hence no need to call - * sched_destroy_user() or remove_user_sysfs_dir(). - */ key_put(new->uid_keyring); key_put(new->session_keyring); kmem_cache_free(uid_cachep, new); @@ -178,8 +169,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid) return up; - put_user_ns(new->user_ns); - kmem_cache_free(uid_cachep, new); out_unlock: return NULL; } diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 076c7c8..2591583 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -9,6 +9,7 @@ #include <linux/nsproxy.h> #include <linux/slab.h> #include <linux/user_namespace.h> +#include <linux/highuid.h> #include <linux/cred.h> /* @@ -54,8 +55,8 @@ int create_user_ns(struct cred *new) #endif /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */ - /* alloc_uid() incremented the userns refcount. Just set it to 1 */ - kref_set(&ns->kref, 1); + /* root_user holds a reference to ns, our reference can be dropped */ + put_user_ns(ns); return 0; } @@ -82,3 +83,46 @@ void free_user_ns(struct kref *kref) schedule_work(&ns->destroyer); } EXPORT_SYMBOL(free_user_ns); + +uid_t user_ns_map_uid(struct user_namespace *to, const struct cred *cred, uid_t uid) +{ + struct user_namespace *tmp; + + if (likely(to == cred->user->user_ns)) + return uid; + + + /* Is cred->user the creator of the target user_ns + * or the creator of one of it's parents? + */ + for ( tmp = to; tmp != &init_user_ns; + tmp = tmp->creator->user_ns ) { + if (cred->user == tmp->creator) { + return (uid_t)0; + } + } + + /* No useful relationship so no mapping */ + return overflowuid; +} + +gid_t user_ns_map_gid(struct user_namespace *to, const struct cred *cred, gid_t gid) +{ + struct user_namespace *tmp; + + if (likely(to == cred->user->user_ns)) + return gid; + + /* Is cred->user the creator of the target user_ns + * or the creator of one of it's parents? + */ + for ( tmp = to; tmp != &init_user_ns; + tmp = tmp->creator->user_ns ) { + if (cred->user == tmp->creator) { + return (gid_t)0; + } + } + + /* No useful relationship so no mapping */ + return overflowgid; +} diff --git a/kernel/workqueue.c b/kernel/workqueue.c index dee4865..327d2de 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -229,6 +229,16 @@ static inline void set_wq_data(struct work_struct *work, atomic_long_set(&work->data, new); } +/* + * Clear WORK_STRUCT_PENDING and the workqueue on which it was queued. + */ +static inline void clear_wq_data(struct work_struct *work) +{ + unsigned long flags = *work_data_bits(work) & + (1UL << WORK_STRUCT_STATIC); + atomic_long_set(&work->data, flags); +} + static inline struct cpu_workqueue_struct *get_wq_data(struct work_struct *work) { @@ -671,7 +681,7 @@ static int __cancel_work_timer(struct work_struct *work, wait_on_work(work); } while (unlikely(ret < 0)); - work_clear_pending(work); + clear_wq_data(work); return ret; } @@ -774,7 +784,7 @@ void flush_delayed_work(struct delayed_work *dwork) { if (del_timer_sync(&dwork->timer)) { struct cpu_workqueue_struct *cwq; - cwq = wq_per_cpu(keventd_wq, get_cpu()); + cwq = wq_per_cpu(get_wq_data(&dwork->work)->wq, get_cpu()); __queue_work(cwq, &dwork->work); put_cpu(); } @@ -845,6 +855,30 @@ int schedule_on_each_cpu(work_func_t func) return 0; } +/** + * flush_scheduled_work - ensure that any scheduled work has run to completion. + * + * Forces execution of the kernel-global workqueue and blocks until its + * completion. + * + * Think twice before calling this function! It's very easy to get into + * trouble if you don't take great care. Either of the following situations + * will lead to deadlock: + * + * One of the work items currently on the workqueue needs to acquire + * a lock held by your code or its caller. + * + * Your code is running in the context of a work routine. + * + * They will be detected by lockdep when they occur, but the first might not + * occur very often. It depends on what work items are on the workqueue and + * what locks they need, which you have no control over. + * + * In most situations flushing the entire workqueue is overkill; you merely + * need to know that a particular work item isn't queued and isn't running. + * In such cases you should use cancel_delayed_work_sync() or + * cancel_work_sync() instead. + */ void flush_scheduled_work(void) { flush_workqueue(keventd_wq); @@ -1076,7 +1110,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, unsigned int cpu = (unsigned long)hcpu; struct cpu_workqueue_struct *cwq; struct workqueue_struct *wq; - int ret = NOTIFY_OK; + int err = 0; action &= ~CPU_TASKS_FROZEN; @@ -1090,12 +1124,13 @@ undo: switch (action) { case CPU_UP_PREPARE: - if (!create_workqueue_thread(cwq, cpu)) + err = create_workqueue_thread(cwq, cpu); + if (!err) break; printk(KERN_ERR "workqueue [%s] for %i failed\n", wq->name, cpu); action = CPU_UP_CANCELED; - ret = NOTIFY_BAD; + err = -ENOMEM; goto undo; case CPU_ONLINE: @@ -1116,7 +1151,7 @@ undo: cpumask_clear_cpu(cpu, cpu_populated_map); } - return ret; + return notifier_from_errno(err); } #ifdef CONFIG_SMP |